#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/spinlock.h>
#include <linux/ktime.h>
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
#include <linux/task_work.h>

#include "lve_internal.h"
#include "resource.h"
#include "lve_hooks.h"
#include "lve_debug_events.h"

static unsigned int iolimits_latency = 5000; /* 5 sec delay max */

static sigset_t block_sigsinv(unsigned long sigs)
{
	unsigned long flags;
	sigset_t old;

	spin_lock_irqsave(&current->sighand->siglock, flags);
	old = current->blocked;
	sigaddsetmask(&current->blocked, ~sigs);
	recalc_sigpending();
	spin_unlock_irqrestore(&current->sighand->siglock, flags);

	return old;
}

static void restore_sigs(sigset_t old)
{
	unsigned long flags;

	spin_lock_irqsave(&current->sighand->siglock, flags);
	current->blocked = old;
	recalc_sigpending();
	spin_unlock_irqrestore(&current->sighand->siglock, flags);
}

static void iowait(unsigned long timeout)
{
	sigset_t oldsigs;

	/* BSD process accounting can write inside do_exit() */
	if (current->flags & PF_EXITING)
		return;

	oldsigs = block_sigsinv(sigmask(SIGKILL));
	set_current_state(TASK_INTERRUPTIBLE);
	schedule_timeout(timeout);
	restore_sigs(oldsigs);
}

__u64 maxbonustime = 1000000; /* at most 1 sec of unused bandwidth credit, in usecs */

/* speed is bytes/ops per second, returns true if limit reached */
static bool account_io(__u32 tr_id, struct lve_iolimit *io, __u64 *last,
		       __u64 speed, __u64 bytes, bool can_sleep)
{
	__u64 io_ts, current_ts;
	bool rc = false;
	unsigned long flags;

	/* get monotonic ts with the microsecond precision */
	current_ts = ktime_get_ns() / 1000;

	/*
	 * The algorithm trivially serializes I/O throughout
	 * the history: XXXXYYZZZZ.....QQQWEEE with respect to
	 * the IO(PS) limits. If Z is doing I/O, it will virtually
	 * occupy the time frame after the latest I/O (YY):
	 * [last_io; last_io+bytes/speed]. If the interval end
	 * is in the past, then I/O freely goes, otherwise the
	 * I/O requester has to wait. In order to avoid I/O
	 * peaks, I/O requesters will not benefit from longer
	 * quiescent periods for more than "maxbonustime".
	 */
	spin_lock_irqsave(&io->lock, flags);
	if (*last < current_ts && (current_ts - *last) > maxbonustime)
		*last = current_ts - maxbonustime;
	io_ts = (*last += bytes * 1000000 / speed);
	spin_unlock_irqrestore(&io->lock, flags);

	if (io_ts > current_ts) {
		if (can_sleep) {
			__u64 wait_jiffies = usecs_to_jiffies(io_ts - current_ts);

			/* limit maximum latency */
			wait_jiffies = min_t(__u64, wait_jiffies,
					     msecs_to_jiffies(iolimits_latency));
			trace_iolimit_wait(tr_id, jiffies_to_msecs(wait_jiffies));
			iowait(wait_jiffies);
			atomic_long_add(wait_jiffies, &io->throttled_time);
		}
		rc = true;
	}

	return rc;
}
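
/*
 * Illustrative example (the numbers are assumptions, not taken from the
 * code): with an IO limit of 1024 KB/s, lve_io_account() below passes
 * speed = 1024 << 10 = 1048576 bytes/s.  A 512 KiB request then advances
 * *last by 524288 * 1000000 / 1048576 = 500000 us, so if *last was equal
 * to current_ts before the update, io_ts ends up 500 ms in the future and
 * a caller that may sleep waits min(500 ms, iolimits_latency) before
 * reporting "limit reached".  An idle tenant can accumulate at most
 * maxbonustime (1 s) of past quiescence before the clamp above removes
 * the extra credit.
 */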
/* returns true if limit reached */
static bool lve_io_account(size_t io, size_t iops, bool can_sleep, bool dirty)
{
	unsigned long flags;
	struct light_ve *ve;
	struct switch_data *sw_data;
	bool rc1 = false, rc2 = false;
	__u64 limit;

	sw_data = LVE_TAG_GET(current);
	if (sw_data == NULL)
		return false;

	if (sw_data->sw_from == NULL || sw_data->sw_flags & LVE_ENTER_NO_UBC) {
		LVE_TAG_PUT(sw_data);
		return false;
	}

	/* sw_data pins the lve */
	ve = sw_data->sw_from;

	if ((limit = ve->lve_limits[LIM_IOPS]) != 0)
		rc1 = account_io(ve->lve_id, &ve->lve_iolimit,
				 &ve->lve_iolimit.last_iops,
				 limit, iops, can_sleep);

	if ((limit = ve->lve_limits[LIM_IO]) != 0)
		rc2 = account_io(ve->lve_id, &ve->lve_iolimit,
				 &ve->lve_iolimit.last_io,
				 limit << 10, io, can_sleep);

	spin_lock_irqsave(&ve->lve_iolimit.lock, flags);
	ve->lve_iolimit.total_iops += iops;
	ve->lve_iolimit.total_io += io;
	spin_unlock_irqrestore(&ve->lve_iolimit.lock, flags);

	if (io) {
		if (dirty)
			trace_iolimit_dirty(ve->lve_id, io);
		else
			trace_iolimit_io_account(ve->lve_id, io);
	}

	LVE_TAG_PUT(sw_data);

	return rc1 || rc2;
}

/*
 * How do we handle everything?
 *
 * Firstly, penalty sleep is always scheduled for return to userspace.
 *
 * Secondly, different classes of I/O are handled in different ways:
 *
 * (1) DATA READ SYNC
 *     The easiest case by far, both IO and IOPS are handled from
 *     rq_issue(), context can be found from current
 * (2) DATA READ ASYNC
 *     Hopefully, disk filesystems never return -EIOCBQUEUED, so
 *     we don't handle this case
 * (3) DATA WRITE SYNC
 *     Direct I/O?
 * (4) DATA WRITE ASYNC
 *     IO can be accounted in balance_dirty_pages(), IOPS can
 *     only be counted approximately without significant efforts
 * (5) METADATA READ SYNC
 *     Same as (1)
 * (6) METADATA READ ASYNC
 *     Hopefully same as (2)
 * (7) METADATA WRITE SYNC
 *     Non-journalled fs?
 * (8) METADATA WRITE ASYNC
 *     Assuming everything goes through journal
 */
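
/*
 * Worked example of class (4), with illustrative numbers and assuming
 * 4 KiB pages: a 1 MiB buffered write dirties 256 pages, and each page
 * is charged PAGE_SIZE bytes of IO from the dirty-page/folio probe
 * below.  That probe runs in a context that must not sleep, so it passes
 * can_sleep=false and, when over limit, only schedules a penalty; the
 * actual delay is taken in lve_iolimits_penalty(), which runs via
 * task_work on return to userspace and calls
 * lve_io_account(0, 0, true, false) purely to perform the deferred sleep.
 */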
static void lve_iolimits_penalty(struct callback_head *cb)
{
	might_sleep();
	lve_io_account(0, 0, true, false);
	kfree(cb);
}

static void schedule_penalty(void)
{
	struct callback_head *cb;

	cb = lve_task_work_cancel(current, lve_iolimits_penalty);
	if (!cb) {
		cb = kmalloc(sizeof(*cb), GFP_ATOMIC);
		if (!cb)
			/* we'll try later */
			return;
	}

	init_task_work(cb, lve_iolimits_penalty);
	if (lve_task_work_add(current, cb, true) == 0)
		return; /* success */

	kfree(cb);
}

void lve_iolimits_rq_issue(void *ignore,
#ifndef HAVE_RQ_ISSUE_1ARG
			   struct request_queue *q,
#endif
			   struct request *rq)
{
	if (rq) {
		/* checks from blk_fill_rwbs() */
		bool overlimit = false;

		if (!(rq->cmd_flags & WRITE)) /* (1) and (5) */
			overlimit = lve_io_account(blk_rq_bytes(rq), 1,
						   false, false);
		if (overlimit)
			schedule_penalty();
	}
}

#ifdef HAVE_WRITEBACK_DIRTY_FOLIO
void lve_iolimits_wb_dirty(void *ignore, struct folio *folio,
			   struct address_space *mapping)
{
	bool overlimit;

	overlimit = lve_io_account(folio_nr_pages(folio) << PAGE_SHIFT,
				   0, false, true);
	if (overlimit)
		schedule_penalty();
}
#else
void lve_iolimits_wb_dirty(void *ignore, struct page *page,
			   struct address_space *mapping)
{
	bool overlimit;

	overlimit = lve_io_account(PAGE_SIZE, 0, false, true); /* (4) */
	if (overlimit)
		schedule_penalty();
}
#endif

#ifndef HAVE_IOMAP_DIO_RW
void lve_iolimits_ext4_direct_IO_enter(void *ignore, struct inode *inode,
				       loff_t offset, unsigned long len, int rw)
{
	if (rw == WRITE) {
		bool overlimit;

		overlimit = lve_io_account(len, 0, false, false); /* (3) */
		if (overlimit)
			schedule_penalty();
	}
}

void lve_iolimits_xfs_file_direct_write(void *ignore, void *inode,
					size_t count, loff_t offset)
{
	bool overlimit;

	overlimit = lve_io_account(count, 0, false, false); /* (3) */
	if (overlimit)
		schedule_penalty();
}
#else
void lve_iolimits_dio_write_account(size_t count)
{
	bool overlimit;

	overlimit = lve_io_account(count, 0, false, false); /* (3) */
	if (overlimit)
		schedule_penalty();
}
#endif

unsigned long lve_get_io_throttled_time(struct light_ve *ve)
{
	struct lve_iolimit *io = &ve->lve_iolimit;

	return atomic_long_read(&io->throttled_time);
}

module_param_named(latency, iolimits_latency, uint, 0644);
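
/*
 * Sketch (an assumption, not part of this file): the probes above are
 * expected to be attached to their tracepoints elsewhere (see
 * lve_hooks.h).  On kernels that export the block tracepoints to GPL
 * modules, use the one-argument block_rq_issue prototype
 * (HAVE_RQ_ISSUE_1ARG) and include <trace/events/block.h>, the wiring
 * could look roughly like this:
 */
#if 0	/* illustrative only */
static int lve_iolimits_hooks_example(void)
{
	/* charge reads/IOPS as requests are issued to the block layer */
	return register_trace_block_rq_issue(lve_iolimits_rq_issue, NULL);
	/* the writeback and direct-I/O probes are registered analogously */
}
#endif

/*
 * The maximum per-request penalty can be tuned at runtime through the
 * "latency" module parameter (milliseconds), e.g.
 * "echo 10000 > /sys/module/<module name>/parameters/latency",
 * where <module name> is whatever module this file is built into.
 */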