#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/spinlock.h>
#include <linux/ktime.h>
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
#include <linux/task_work.h>

#include "lve_internal.h"
#include "resource.h"
#include "lve_hooks.h"
#include "lve_debug_events.h"

static unsigned int iolimits_latency = 5000; /* 5 sec delay max */

static sigset_t block_sigsinv(unsigned long sigs)
{
	unsigned long flags;
	sigset_t old;

	spin_lock_irqsave(&current->sighand->siglock, flags);
	old = current->blocked;
	sigaddsetmask(&current->blocked, ~sigs);
	recalc_sigpending();
	spin_unlock_irqrestore(&current->sighand->siglock, flags);

	return old;
}

static void restore_sigs(sigset_t old)
{
	unsigned long flags;

	spin_lock_irqsave(&current->sighand->siglock, flags);
	current->blocked = old;
	recalc_sigpending();
	spin_unlock_irqrestore(&current->sighand->siglock, flags);
}

static void iowait(unsigned long timeout)
{
	sigset_t oldsigs;

	/* BSD process accounting can write inside do_exit() */
	if (current->flags & PF_EXITING)
		return;

	oldsigs = block_sigsinv(sigmask(SIGKILL));
	set_current_state(TASK_INTERRUPTIBLE);
	schedule_timeout(timeout);
	restore_sigs(oldsigs);
}

__u64 maxbonustime = 1000000; /* at most 1 sec of unused bandwidth credit, in usecs */

/* speed is bytes/ops per second, returns true if limit reached */
static bool account_io(__u32 tr_id, struct lve_iolimit *io, __u64 *last,
		       __u64 speed, __u64 bytes, bool can_sleep)
{
	__u64 io_ts, current_ts;
	bool rc = false;
	unsigned long flags;

	/* get monotonic ts with the microsecond precision */
	current_ts = ktime_get_ns() / 1000;

	/*
	 * The algorithm trivially serializes I/O throughout
	 * the history: XXXXYYZZZZ.....QQQWEEE with respect to
	 * the IO(PS) limits. If Z is doing I/O, it will virtually
	 * occupy the time frame after the latest I/O (YY):
	 * [last_io; last_io+bytes/speed]. If the interval end
	 * is in the past, then I/O freely goes, otherwise the
	 * I/O requester has to wait. In order to avoid I/O
	 * peaks, I/O requesters will not benefit from longer
	 * quiescent periods for more than "maxbonustime".
	 */
	spin_lock_irqsave(&io->lock, flags);
	if (*last < current_ts && (current_ts - *last) > maxbonustime)
		*last = current_ts - maxbonustime;
	io_ts = (*last += bytes * 1000000 / speed);
	spin_unlock_irqrestore(&io->lock, flags);

	if (io_ts > current_ts) {
		if (can_sleep) {
			__u64 wait_jiffies = usecs_to_jiffies(io_ts - current_ts);

			/* limit maximum latency */
			wait_jiffies = min_t(__u64, wait_jiffies,
					     msecs_to_jiffies(iolimits_latency));
			trace_iolimit_wait(tr_id, jiffies_to_msecs(wait_jiffies));
			iowait(wait_jiffies);
			atomic_long_add(wait_jiffies, &io->throttled_time);
		}
		rc = true;
	}

	return rc;
}
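
/*
 * Illustrative example (the numbers are assumptions, not taken from the
 * code): with an IO limit of 1024 KB/s, lve_io_account() below passes
 * speed = 1024 << 10 = 1048576 bytes/s.  A 512 KiB request then advances
 * *last by 524288 * 1000000 / 1048576 = 500000 us, so if *last was equal
 * to current_ts before the update, io_ts ends up 500 ms in the future and
 * a caller that may sleep waits min(500 ms, iolimits_latency) before
 * reporting "limit reached".  An idle tenant can accumulate at most
 * maxbonustime (1 s) of past quiescence before the clamp above removes
 * the extra credit.
 */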
/* returns true if limit reached */
static bool lve_io_account(size_t io, size_t iops, bool can_sleep, bool dirty)
{
	unsigned long flags;
	struct light_ve *ve;
	struct switch_data *sw_data;
	bool rc1 = false, rc2 = false;
	__u64 limit;

	sw_data = LVE_TAG_GET(current);
	if (sw_data == NULL)
		return false;

	if (sw_data->sw_from == NULL || sw_data->sw_flags & LVE_ENTER_NO_UBC) {
		LVE_TAG_PUT(sw_data);
		return false;
	}

	/* sw_data pins the lve */
	ve = sw_data->sw_from;

	if ((limit = ve->lve_limits[LIM_IOPS]) != 0)
		rc1 = account_io(ve->lve_id, &ve->lve_iolimit,
				 &ve->lve_iolimit.last_iops,
				 limit, iops, can_sleep);

	if ((limit = ve->lve_limits[LIM_IO]) != 0)
		rc2 = account_io(ve->lve_id, &ve->lve_iolimit,
				 &ve->lve_iolimit.last_io,
				 limit << 10, io, can_sleep);

	spin_lock_irqsave(&ve->lve_iolimit.lock, flags);
	ve->lve_iolimit.total_iops += iops;
	ve->lve_iolimit.total_io += io;
	spin_unlock_irqrestore(&ve->lve_iolimit.lock, flags);

	if (io) {
		if (dirty)
			trace_iolimit_dirty(ve->lve_id, io);
		else
			trace_iolimit_io_account(ve->lve_id, io);
	}

	LVE_TAG_PUT(sw_data);

	return rc1 || rc2;
}

/*
 * How do we handle everything?
 *
 * Firstly, penalty sleep is always scheduled for return to userspace.
 *
 * Secondly, different classes of I/O are handled in different ways:
 *
 * (1) DATA READ SYNC
 *     The easiest case by far, both IO and IOPS are handled from
 *     rq_issue(), context can be found from current
 * (2) DATA READ ASYNC
 *     Hopefully, disk filesystems never return -EIOCBQUEUED, so
 *     we don't handle this case
 * (3) DATA WRITE SYNC
 *     Direct I/O?
 * (4) DATA WRITE ASYNC
 *     IO can be accounted in balance_dirty_pages(), IOPS can
 *     only be counted approximately without significant efforts
 * (5) METADATA READ SYNC
 *     Same as (1)
 * (6) METADATA READ ASYNC
 *     Hopefully same as (2)
 * (7) METADATA WRITE SYNC
 *     Non-journalled fs?
 * (8) METADATA WRITE ASYNC
 *     Assuming everything goes through journal
 */
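
/*
 * Worked example of class (4), with illustrative numbers and assuming
 * 4 KiB pages: a 1 MiB buffered write dirties 256 pages, and each page
 * is charged PAGE_SIZE bytes of IO from the dirty-page/folio probe
 * below.  That probe runs in a context that must not sleep, so it passes
 * can_sleep=false and, when over limit, only schedules a penalty; the
 * actual delay is taken in lve_iolimits_penalty(), which runs via
 * task_work on return to userspace and calls
 * lve_io_account(0, 0, true, false) purely to perform the deferred sleep.
 */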
static void lve_iolimits_penalty(struct callback_head *cb)
{
	might_sleep();
	lve_io_account(0, 0, true, false);
	kfree(cb);
}

static void schedule_penalty(void)
{
	struct callback_head *cb;

	cb = lve_task_work_cancel(current, lve_iolimits_penalty);
	if (!cb) {
		cb = kmalloc(sizeof(*cb), GFP_ATOMIC);
		if (!cb)
			/* we'll try later */
			return;
	}

	init_task_work(cb, lve_iolimits_penalty);
	if (lve_task_work_add(current, cb, true) == 0)
		return; /* success */

	kfree(cb);
}

void lve_iolimits_rq_issue(void *ignore,
#ifndef HAVE_RQ_ISSUE_1ARG
			   struct request_queue *q,
#endif
			   struct request *rq)
{
	if (rq) {
		/* checks from blk_fill_rwbs() */
		bool overlimit = false;

		if (!(rq->cmd_flags & WRITE)) /* (1) and (5) */
			overlimit = lve_io_account(blk_rq_bytes(rq), 1,
						   false, false);
		if (overlimit)
			schedule_penalty();
	}
}

#ifdef HAVE_WRITEBACK_DIRTY_FOLIO
void lve_iolimits_wb_dirty(void *ignore, struct folio *folio,
			   struct address_space *mapping)
{
	bool overlimit;

	overlimit = lve_io_account(folio_nr_pages(folio) << PAGE_SHIFT,
				   0, false, true);
	if (overlimit)
		schedule_penalty();
}
#else
void lve_iolimits_wb_dirty(void *ignore, struct page *page,
			   struct address_space *mapping)
{
	bool overlimit;

	overlimit = lve_io_account(PAGE_SIZE, 0, false, true); /* (4) */
	if (overlimit)
		schedule_penalty();
}
#endif

#ifndef HAVE_IOMAP_DIO_RW
void lve_iolimits_ext4_direct_IO_enter(void *ignore, struct inode *inode,
				       loff_t offset, unsigned long len, int rw)
{
	if (rw == WRITE) {
		bool overlimit;

		overlimit = lve_io_account(len, 0, false, false); /* (3) */
		if (overlimit)
			schedule_penalty();
	}
}

void lve_iolimits_xfs_file_direct_write(void *ignore, void *inode,
					size_t count, loff_t offset)
{
	bool overlimit;

	overlimit = lve_io_account(count, 0, false, false); /* (3) */
	if (overlimit)
		schedule_penalty();
}
#else
void lve_iolimits_dio_write_account(size_t count)
{
	bool overlimit;

	overlimit = lve_io_account(count, 0, false, false); /* (3) */
	if (overlimit)
		schedule_penalty();
}
#endif

unsigned long lve_get_io_throttled_time(struct light_ve *ve)
{
	struct lve_iolimit *io = &ve->lve_iolimit;

	return atomic_long_read(&io->throttled_time);
}

module_param_named(latency, iolimits_latency, uint, 0644);
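
/*
 * Sketch (an assumption, not part of this file): the probes above are
 * expected to be attached to their tracepoints elsewhere (see
 * lve_hooks.h).  On kernels that export the block tracepoints to GPL
 * modules, use the one-argument block_rq_issue prototype
 * (HAVE_RQ_ISSUE_1ARG) and include <trace/events/block.h>, the wiring
 * could look roughly like this:
 */
#if 0	/* illustrative only */
static int lve_iolimits_hooks_example(void)
{
	/* charge reads/IOPS as requests are issued to the block layer */
	return register_trace_block_rq_issue(lve_iolimits_rq_issue, NULL);
	/* the writeback and direct-I/O probes are registered analogously */
}
#endif

/*
 * The maximum per-request penalty can be tuned at runtime through the
 * "latency" module parameter (milliseconds), e.g.
 * "echo 10000 > /sys/module/<module name>/parameters/latency",
 * where <module name> is whatever module this file is built into.
 */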