#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "lve_internal.h"
#include "lve-api.h"
#include "cgroup_lib.h"
#include "ubc_lib.h"
#include "lve_debug.h"
#include "light_ve.h"
#include "resource.h"
#include "openvz_iolimits.h"
#include "mm.h"
#include "cgroup_generic.h"
#include "openvz_connect.h"

unsigned int os_context_private_sz(void)
{
	return sizeof(struct c_private);
}

unsigned int os_lvp_private_sz(void)
{
	return sizeof(struct lvp_private);
}

/* Collect the current CPU, memory, IO and IOPS usage for a context. */
void os_resource_usage(struct c_private *private, struct lve_usage *buf)
{
	int precharge[UB_RESOURCES];
	__s64 data;

	data = cgrp_param_get(private->filps[PARAM_CPU_STAT]);
	if (data > 0) {
		LVE_DBG("cpu usage "LPU64"\n", data);
		buf->data[RES_CPU].data = data;
	}

	if (!private->lve_ub)
		return;

	lve_sync_ub_usage(private->lve_ub);
	lve_ub_precharge_snapshot(private->lve_ub, precharge);

	ubc_mem_stat(private->lve_ub, &buf->data[RES_MEM], precharge);
	ubc_phys_mem_stat(private->lve_ub, &buf->data[RES_MEM_PHY], precharge);
	ubc_nproc_stat(private->lve_ub, &buf->data[RES_NPROC], precharge);

	buf->data[RES_IO].data = ovz_get_io_usage(private->lve_ub) >> 10;
	buf->data[RES_IOPS].data = ovz_get_iops_usage(private->lve_ub);
}

int os_resource_usage_clear(struct c_private *private)
{
	return cgrp_param_set_u64(private->filps[PARAM_CPU_STAT], 0);
}

int os_resource_init(struct light_ve *ve __attribute__((unused)))
{
#ifndef LVE_PER_VE
	struct lvp_ve_private *lvp = ve->lve_lvp;
	struct c_private *lcontext = lve_private(ve);
#ifdef HAVE_SUB_UBC
	struct c_private *c = lve_private(lvp->lvp_default);
#endif
	char name[MAX_GRP_NAMESZ];
	struct cgroup_subsys_state *css;
	int rc;

	generic_lve_path(name, ve->lve_id);

#ifdef HAVE_SUB_UBC
	lcontext->lve_ub = lve_call(get_sub_beancounter_by_name(c->lve_ub,
					name, 1),
				LVE_FAIL_GET_SUB_UB_BYUID, NULL);
#else
	/* like native OpenVZ */
	lcontext->lve_ub = lve_call(lve_get_beancounter_by_name(name, 1),
				LVE_FAIL_GET_SUB_UB_BYUID, NULL);
#endif
	if (IS_ERR_OR_NULL(lcontext->lve_ub)) {
		LVE_ERR("can't create UBC for context %s, err %ld\n",
			name, PTR_ERR(lcontext->lve_ub));
		lcontext->lve_ub = NULL;
		return -ENOMEM;
	}
	LVE_DBG("ubc %p created\n", lcontext->lve_ub);

	/* no need to init the UBC with no-limits values, that is done
	 * as part of UBC creation */

	/* just for extra init */
	css = lve_ub_get_css(lcontext->lve_ub, UB_MEM_CGROUP);
	lcontext->cgrp[MEM_SUBSYS] = css->cgroup->dentry;
	css_put(css);

	rc = generic_lve_init(ve, name);
	lcontext->cgrp[MEM_SUBSYS] = NULL;
	if (rc < 0)
		return rc;

	ub_set_shortage_cb(lcontext->lve_ub, ubc_shortage);
#endif
	return 0;
}

/* Rename the UBC-related cgroup directories so the old name can be reused. */
static void os_ub_cgroup_unlink(struct c_private *lcontext, char *name)
{
	struct cgroup_subsys_state *css;

	if (lcontext->lve_ub == NULL)
		return;

	css = lve_ub_get_css(lcontext->lve_ub, UB_MEM_CGROUP);
	cgrp_obfuscate(css->cgroup->dentry, name);
	css_put(css);

	css = lve_ub_get_css(lcontext->lve_ub, UB_BLKIO_CGROUP);
	cgrp_obfuscate(css->cgroup->dentry, name);
	css_put(css);

	cgrp_obfuscate(lcontext->lve_ub->css.cgroup->dentry, name);
}

/* Drop the beancounter reference and remove its memory, blkio and UBC
 * cgroup directories. */
static void os_ub_cgroup_fini(struct c_private *lcontext)
{
	struct cgroup_subsys_state *mcss, *bcss;
	struct dentry *md, *bd, *ubd;

	mcss = lve_ub_get_css(lcontext->lve_ub, UB_MEM_CGROUP);
	bcss = lve_ub_get_css(lcontext->lve_ub, UB_BLKIO_CGROUP);

	md = dget(mcss->cgroup->dentry);
	bd = dget(bcss->cgroup->dentry);

	css_put(mcss);
	css_put(bcss);

	ubd = dget(lcontext->lve_ub->css.cgroup->dentry);
	put_beancounter(lcontext->lve_ub);

	lve_cgroup_kernel_remove(ubd);
	dput(ubd);

	lve_cgroup_kernel_remove(md);
	dput(md);
	lve_cgroup_kernel_remove(bd);
	dput(bd);
}

void os_resource_unlink(uint32_t id, struct c_private *lcontext)
{
	char name[MAX_GRP_NAMESZ] = {0};

	generic_resource_unlink(id, lcontext, name);

	/* Hide UBC-related cgroups: beancounter, memory, blkio */
	os_ub_cgroup_unlink(lcontext, name);
}

void os_resource_fini(struct light_ve *ve)
{
	uint32_t id = ve->lve_id;
	struct c_private *lcontext = lve_private(ve);

	LVE_ENTER("(id=%u, lcontext=%p)\n", id, lcontext);

#ifndef LVE_PER_VE
	if (lcontext->lve_ub) {
		ub_set_shortage_cb(lcontext->lve_ub, NULL);
		if (ve->lve_lvp->lvp_default != ve ||
		    ve->lve_lvp->lvp_id != ROOT_LVP)
			os_ub_cgroup_fini(lcontext);
		else
			put_beancounter(lcontext->lve_ub);
		lcontext->lve_ub = LVE_POISON_PTR;
	}
#endif
	generic_lve_fini(ve);
}

/* Force a physical memory limit that could not be applied directly
 * because the current usage is above the new value. */
static int os_force_pmem_limit(struct c_private *lcontext, int32_t new)
{
	struct light_ve *lve = os_lve(lcontext);
	int rc;

	if (lve != lve->lve_lvp->lvp_default) {
		/* For LVE pmem limit update we just kill all threads */
		if (!lve_kill_on_shrink)
			return -EBUSY;

		LVE_WARN("lve %u threads will be killed to reduce physmem"
			 " usage below the new limit\n", lve->lve_id);

		lve_kill_all_threads(0, lve->lve_id);
		schedule_timeout_killable(msecs_to_jiffies(10));

		rc = ubc_set_res(lcontext->lve_ub, LVE_MEM_PHY_LIMIT_RES, new);
	} else {
		/* For LVP pmem limit update things are more complicated */
		int precharge[UB_RESOURCES];
		struct one_resource res;
		int32_t lim;

		lve_ub_precharge_snapshot(lcontext->lve_ub, precharge);
		ubc_phys_mem_stat(lcontext->lve_ub, &res, precharge);

		lim = res.data;
		if (new > lim)
			lim = new;

		rc = ubc_set_res(lcontext->lve_ub, LVE_MEM_PHY_LIMIT_RES, lim);
		LVE_DBG("attempt to shrink memory to %u (target %u), rc=%d\n",
			lim, new, rc);

		if (!rc && lim == new) {
			/* OK, we've finally managed to set the limit */
			lve->lve_lvp->lvp_pmem_pending = 0;
		} else if (!rc || rc == -EBUSY) {
			/* Managed to set an intermediate value or
			 * maybe raced and failed, schedule a new
			 * update, but don't return an error to
			 * the caller */
			lve->lve_lvp->lvp_pmem_pending = 1;
			rc = 0;
		} else {
			/* Unexpected error, don't schedule an
			 * update and try to return error */
			lve->lve_lvp->lvp_pmem_pending = 0;
		}
	}

	return rc;
}

int os_resource_setup(struct c_private *lcontext, int32_t new,
		      enum lve_limits custom)
{
	int rc = 0;
#ifndef LVE_PER_VE
	struct light_ve *lve = os_lve(lcontext);
	struct light_ve *reseller = NULL;

	/* Check if we are changing limits for a reseller */
	if (lve != lve->lve_lvp->lvp_default)
		reseller = lve->lve_lvp->lvp_default;

	switch (custom) {
	case LIM_CPU:
		rc = lve_cgroup_cpu_set(lcontext,
				reseller ? reseller->lve_limits :
					   lve->lve_limits,
				lve->lve_limits, new);
		break;
	case LIM_CPUS:
		rc = lve_cgroup_cpus_set(lcontext,
				reseller ? reseller->lve_limits :
					   lve->lve_limits,
				lve->lve_limits, new);
		break;
	case LIM_CPU_WEIGHT:
		rc = lve_cgroup_cpuw_set(lcontext, new);
		break;
	case LIM_IO:
		rc = lve_call(ovz_set_io_limit(lcontext->lve_ub, new << 10, 0),
			      LVE_FAIL_SET_IO_LIM, -ENOMEM);
		break;
	case LIM_IOPS:
		rc = lve_call(ovz_set_iops_limit(lcontext->lve_ub, new, 0),
			      LVE_FAIL_SET_IOPS_LIM, -ENOMEM);
		break;
	case LIM_MEMORY:
		LVE_DBG("set mem to %u\n", new);
		rc = ubc_set_res(lcontext->lve_ub, LVE_MEM_LIMIT_RES, new);
		if (rc)
			LVE_ERR("ubc set virtual memory limit %d\n", rc);
		break;
	case LIM_MEMORY_PHY:
		LVE_DBG("set phy mem to %u\n", new);
		rc = ubc_set_res(lcontext->lve_ub, LVE_MEM_PHY_LIMIT_RES, new);
		if (rc)
			LVE_ERR("ubc set phys memory limit %d\n", rc);
		else if (lve == lve->lve_lvp->lvp_default)
			lve->lve_lvp->lvp_pmem_pending = 0;

		if (rc != -EBUSY)
			break;

		rc = os_force_pmem_limit(lcontext, new);
		break;
	case LIM_NPROC:
		LVE_DBG("set nproc to %u\n", new);
		rc = ubc_set_res(lcontext->lve_ub, LVE_NPROC_LIMIT_RES, new);
		if (rc)
			LVE_ERR("ubc set nproc limit %d\n", rc);
		break;
	case LIM_ENTER:
		/* no special handling in this layer */
		break;
	default:
		BUG();
	}
#endif
	return rc;
}

#include

/* mostly copy&paste from audio_write()/lve-kernel-el6 */
static int os_set_dac_override(void)
{
	struct cred *new;

	if (cap_raised(current_cap(), CAP_DAC_OVERRIDE))
		return -EALREADY;

	new = prepare_creds();
	/* prepare_creds() may fail under memory pressure */
	if (new == NULL)
		return -ENOMEM;
	cap_raise(new->cap_effective, CAP_DAC_OVERRIDE);
	commit_creds(new);

	return 0;
}

static void os_clear_dac_override(void)
{
	struct cred *new = prepare_creds();

	if (new == NULL)
		return;
	cap_lower(new->cap_effective, CAP_DAC_OVERRIDE);
	commit_creds(new);
}

/* Move a task into the UBC and memory cgroups of the target beancounter,
 * rolling the memory cgroup back on failure. */
static int __lve_ub_attach_task(struct user_beancounter *ub,
				struct task_struct *tsk)
{
	int ret = 0;
	struct user_beancounter *old_ub = tsk->task_bc.exec_ub;
	struct cgroup_subsys_state *css;

	if (ub == old_ub)
		goto out;

	css = lve_ub_get_css(ub, UB_MEM_CGROUP);
	ret = lve_cgroup_kernel_attach(css->cgroup, tsk);
	css_put(css);
	if (ret)
		goto out;

	ret = lve_cgroup_kernel_attach(ub->css.cgroup, tsk);
	if (ret)
		goto fail_ub;
out:
	return ret;
fail_ub:
	css = lve_ub_get_css(old_ub, UB_MEM_CGROUP);
	lve_cgroup_kernel_attach(css->cgroup, tsk);
	css_put(css);
	goto out;
}

/* enter to memory / io control usage */
int os_resource_push(struct task_struct *task, struct c_private *lcontext)
{
	struct lvp_ve_private *lvp;
	int rc = 0, rc2;

	if (lcontext->lve_ub == NULL)
		return 0;

	lvp = os_lve(lcontext)->lve_lvp;
	if (lvp->lvp_pmem_pending)
		os_force_pmem_limit(lve_private(lvp->lvp_default),
				lvp->lvp_default->lve_limits[LIM_MEMORY_PHY]);

	rc2 = os_set_dac_override();
	rc = lve_call(__lve_ub_attach_task(lcontext->lve_ub, task),
		      LVE_FAIL_RES_ATTACH_TASK, -ENOMEM);
	if (rc2 == 0)
		os_clear_dac_override();
	if (rc != 0)
		LVE_ERR("push ub failed\n");

	return rc;
}

int os_cpu_enter(struct task_struct *task, struct c_private *lcontext)
{
#ifndef LVE_PER_VE
	return generic_cgroup_enter(task, lcontext, 1 << CPU_SUBSYS);
#else
	return 0;
#endif /* LVE_PER_VE */
}

static int os_lvp_cpu_init(struct lvp_ve_private *lvp)
{
	struct c_private *c = lve_private(lvp->lvp_default);
	char name[MAX_GRP_NAMESZ];
	int rc;
	struct cgroup_subsys_state *css;

	generic_lvp_path(name, lvp->lvp_id);

	if (lvp->lvp_id != 0)
		c->lve_ub = lve_get_beancounter_by_name(name, 1);
	else
		c->lve_ub = lve_get_beancounter_by_name("0", 1);

	if (IS_ERR_OR_NULL(c->lve_ub)) {
		LVE_ERR("Can't allocate UBC for LVP %s - rc %ld\n",
			name, PTR_ERR(c->lve_ub));
		generic_lvp_fini(lvp);
		c->lve_ub = NULL;
		return -ENOMEM;
	}
	LVE_DBG("ubc %p created\n", c->lve_ub);

	/* just for extra init */
	css = lve_ub_get_css(c->lve_ub, UB_MEM_CGROUP);
	c->cgrp[MEM_SUBSYS] = css->cgroup->dentry;
	css_put(css);

	rc = generic_lvp_init(lvp, name);
	c->cgrp[MEM_SUBSYS] = NULL;
	if (rc < 0)
		return rc;

	return 0;
}

static void os_lvp_cpu_fini(struct lvp_ve_private *lvp)
{
	generic_lvp_fini(lvp);
}

static int os_global_mem_init(void)
{
#ifndef LVE_PER_VE
	int ret;

	ret = cgrp_param_open_write_string(cmnt[MEM_SUBSYS].mnt_root,
					   cmnt[MEM_SUBSYS].cgrp_root,
					   "memory.use_hierarchy", "1", 1);
	if (ret < 0)
		LVE_WARN("can't set mem cgroup hierarchy\n");
#endif
	return 0;
}

static int os_lvp_io_init(struct lvp_ve_private *lvp)
{
	if (lvp->lvp_id != ROOT_LVP) {
		struct c_private *c = lve_private(lvp->lvp_default);

		return ovz_io_limits_init(c->lve_ub);
	}

	return 0;
}

static void os_lvp_io_fini(struct lvp_ve_private *lvp)
{
}

static int os_global_io_init(void)
{
	return ovz_iolimits_init();
}

static void os_global_io_fini(void)
{
	ovz_iolimits_exit();
}

int os_freezer_enter(struct task_struct *task, struct c_private *lcontext)
{
	int rc = 0;
#ifndef LVE_PER_VE
	rc = generic_cgroup_enter(task, lcontext, 1 << FREEZER_SUBSYS);
#endif /* LVE_PER_VE */
	return rc;
}

static int freezer_change_state(struct dentry *cgrp, bool freeze)
{
	int rc = 0;
	/*
	 * We need to wait until tasks enter "refrigerator"
	 *
	 * TODO: the total amount of time should be proportional
	 * to the task count in the cgroup, leave it a const for now.
	 */
	int wait_count = 10;
	const char *str = freeze ? "FROZEN" : "THAWED";
	const char *freezer_param = "freezer.state";
	/* Should be big enough to contain "FREEZING" */
	char buf[10];
	struct vfsmount *mnt = cmnt[FREEZER_SUBSYS].mnt_root;

	rc = cgrp_param_open_write_string(mnt, cgrp, freezer_param,
					  str, strlen(str));
	if (rc < 0 || !freeze)
		goto out;

	while (wait_count--) {
		rc = cgrp_param_open_read_string(mnt, cgrp, freezer_param,
						 buf, sizeof(buf));
		if (rc < 0)
			goto out;

		if (strncmp(buf, "FROZEN", strlen("FROZEN")) == 0)
			goto out;

		schedule_timeout_killable(msecs_to_jiffies(10));
	}
	rc = -EBUSY;
out:
	return rc;
}

/* Freeze all tasks in the LVE and push its memory out to swap. */
int os_freezer_freeze(struct light_ve *ve)
{
	int rc = -ENOSYS;
#ifndef LVE_PER_VE
	struct c_private *c = lve_private(ve);
	struct dentry *cgrp = c->cgrp[FREEZER_SUBSYS];
	struct mem_cgroup *memcg = __d_cgrp(c->cgrp[MEM_SUBSYS]) ?
			lve_mem_cgroup_from_cont(__d_cgrp(c->cgrp[MEM_SUBSYS])) :
			NULL;
	unsigned long reclaim_est = 0UL, reclaim_total = 0UL;

	LVE_DBG("freezer: lve_id = %u\n", ve->lve_id);

	/* Temporarily disable the SWAP limit */
	rc = ubc_set_res(c->lve_ub, UB_SWAPPAGES, 0);
	if (rc < 0) {
		LVE_ERR("freezer: failed to update swappages limit, rc=%d\n",
			rc);
		return rc;
	}

	rc = freezer_change_state(cgrp, true);
	if (rc < 0) {
		LVE_ERR("freezer: cannot change freezer state, rc = %d\n", rc);
		return freezer_change_state(cgrp, false);
	}

	reclaim_est = c->lve_ub->ub_parms[UB_PHYSPAGES].limit;

	while (memcg && reclaim_total < reclaim_est) {
		unsigned long reclaim_iter =
			lve_try_to_free_mem_cgroup_pages(memcg,
					reclaim_est - reclaim_total,
					GFP_KERNEL, 0);

		if (reclaim_iter == 0UL) {
			LVE_DBG("freezer: the reclaiming is finished\n");
			break;
		}

		reclaim_total += reclaim_iter;
	}
	LVE_DBG("freezer: reclaimed %lu pages total\n", reclaim_total);

	/* Put the limits back */
	rc = init_beancounter_swap_limits(c->lve_ub);
	if (rc < 0)
		LVE_ERR("freezer: failed to update swappages limit, rc=%d\n",
			rc);
#endif
	return rc;
}

int os_freezer_thaw(struct light_ve *ve)
{
#ifndef LVE_PER_VE
	struct c_private *c = lve_private(ve);
	struct cgroup *cgrp = __d_cgrp(c->cgrp[FREEZER_SUBSYS]);
	struct cgroup_iter it;
	struct task_struct *tsk;

	LVE_DBG("freezer: lve_id = %u\n", ve->lve_id);

	lve_cgroup_iter_start(cgrp, &it);
	while ((tsk = lve_cgroup_iter_next(cgrp, &it)) != NULL) {
		struct mm_struct *mm = tsk->mm;
		struct vm_area_struct *vma;

		/*
		 * It's safe to unlock here, because:
		 * 1. The cgroup cannot be destroyed as we hold a reference.
		 * 2. Frozen tasks cannot leave the cgroup, so the current
		 *    "tsk" is safe.
		 * 3. In case of a concurrent "enter", new tasks are added to
		 *    the list, which is serialized with cgroup_iter_next()
		 *    by css_set_lock.
		 */
		read_unlock(lve_css_set_lock);

		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
			if (vma->vm_file != NULL) {
				LVE_DBG("skip file-backed or shared mem. "
					"VMA: [0x%lx - 0x%lx]\n",
					vma->vm_start, vma->vm_end);
				continue;
			}

			if (make_pages_present_ext(mm, vma->vm_start,
						   vma->vm_end, NULL) < 0) {
				LVE_DBG("cannot make VMA present: "
					"[0x%lx - 0x%lx]\n",
					vma->vm_start, vma->vm_end);
			}
		}
		up_read(&mm->mmap_sem);

		read_lock(lve_css_set_lock);
	}
	lve_cgroup_iter_end(cgrp, &it);

	freezer_change_state(c->cgrp[FREEZER_SUBSYS], false);

	return 0;
#else
	return -ENOSYS;
#endif
}

int os_lvp_init(struct lvp_ve_private *lvp, void *data)
{
	int rc;
#ifdef LVE_PER_VE
	struct ve_struct *env = data;
#endif

	rc = os_lvp_cpu_init(lvp);
	if (rc < 0)
		return rc;

	rc = os_lvp_io_init(lvp);
	if (rc < 0)
		goto out_io;

#ifdef LVE_PER_VE
	if (env == NULL)
		env = get_ve0();

	env->lve = lvp;
	lvp->lvp_ve = env;
#endif
	LVE_DBG("os lvp init %d\n", 0);

	return 0;

out_io:
	os_lvp_cpu_fini(lvp);
	return rc;
}

void os_lvp_fini(struct lvp_ve_private *lvp)
{
	if (lvp->lvp_id == ROOT_LVP)
		os_lvp_io_fini(lvp);

	os_lvp_cpu_fini(lvp);
}

#ifdef CONFIG_MEMCG_KMEM
static struct static_key *key = &memcg_kmem_enabled_key;

static inline void os_static_key_slow_inc(void)
{
	atomic_inc(&key->enabled);
}

static inline void os_static_key_slow_dec(void)
{
	atomic_dec(&key->enabled);
}
#else
static inline void os_static_key_slow_inc(void) { }
static inline void os_static_key_slow_dec(void) { }
#endif

#ifdef VIRTINFO_MEM_FAILCNT
static int os_memcg_phys_pages_failcnt_cb(void *arg)
{
	unsigned long failcnt = (unsigned long)arg;

	LVE_DBG("failcnt=%lu\n", failcnt);
	lve_resource_fail(current, LVE_RESOURCE_FAIL_MEM_PHY);

	return 0;
}

static int os_memcg_ncall(struct vnotifier_block *self,
			  unsigned long event, void *arg, int old_ret)
{
	int ret = 0;

	switch (event) {
	case VIRTINFO_MEM_FAILCNT:
		ret = os_memcg_phys_pages_failcnt_cb(arg);
		break;
	default:
		break;
	}

	return ret;
}

static struct vnotifier_block os_memcg_nb = {
	.notifier_call = os_memcg_ncall,
};
#endif /* VIRTINFO_MEM_FAILCNT */

static int os_memcg_nb_init(void)
{
#ifdef VIRTINFO_MEM_FAILCNT
	virtinfo_notifier_register(VITYPE_GENERAL, &os_memcg_nb);
#endif
	return 0;
}

static void os_memcg_nb_fini(void)
{
#ifdef VIRTINFO_MEM_FAILCNT
	virtinfo_notifier_unregister(VITYPE_GENERAL, &os_memcg_nb);
#endif
}

int os_global_init(void)
{
	int rc;

	LVE_ENTER("os_global_init\n");

	os_static_key_slow_inc();

	memset(cmnt, 0, sizeof(*cmnt) * NR_SUBSYS);

	/* XXX force the memory cgroup mount so the tuning can be applied */
	rc = mount_cgroup_root_fs(cmnt, CGROUPS_SUPPORTED | (1 << MEM_SUBSYS));
	if (rc)
		return rc;

#ifdef LVE_PER_VE
	mutex_lock(&ve_list_lock);
	if (nr_ve > 1) {
		mutex_unlock(&ve_list_lock);
		umount_cgroup_root_fs(cmnt);
		os_static_key_slow_dec();
		LVE_ERR("the lve module must be loaded before any container is started\n");
		return -ENOSYS;
	}
#endif
	os_global_mem_init();
	os_global_io_init();
	os_memcg_nb_init();

#ifdef LVE_PER_VE
	init_ve_init_exit_chain();
	mutex_unlock(&ve_list_lock);
#endif

	return 0;
}

void os_global_fini(void)
{
#ifdef LVE_PER_VE
	cleanup_ve_init_exit_chain();

	get_ve0()->lve = NULL;
	root_lvp->lvp_ve = NULL;
#endif
	/* XXX need to destroy all LVPs */
	os_global_io_fini();
	os_memcg_nb_fini();

	umount_cgroup_root_fs(cmnt);
	os_static_key_slow_dec();

	ub_fini_cgroup();
}