#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/cred.h>

#include "lve_internal.h"
#include "lve_debug.h"
#include "resource.h"
#include "cgroup_lib.h"
#include "cgroup_generic.h"

bool lve_kill_on_shrink = 1;
module_param(lve_kill_on_shrink, bool, 0644);

void lve_cgroup_release(struct dentry *cgrp, int flags)
{
	const char *name = cgrp->d_name.name;

	if (flags == LVE_CGRP_CREAT) {
		int ret = lve_cgroup_kernel_remove(cgrp);

		LVE_DBG("cgroup %s(%p) is removed, ret=%d\n", name, cgrp, ret);
	}
	lve_cgroup_kernel_close(cgrp);
}

void generic_resource_unlink(uint32_t id, struct c_private *lcontext,
			     char *name)
{
	int i;

	LVE_ENTER("(id=%u, lcontext=%px)\n", id, lcontext);

	lcontext->unlink_id = get_unlink_id();
	snprintf(name, MAX_GRP_NAMESZ - 1, UNLINK_FORMAT, lcontext->unlink_id);

	cgrp_param_set_s64(lcontext->filps[PARAM_CPU_LIMIT], -1);

	for (i = 0; i < NR_SUBSYS; i++) {
		if (lcontext->cgrp[i] && !IS_ERR(lcontext->cgrp[i])) {
			LVE_DBG("subsys %u\n", i);
			cgrp_obfuscate(lcontext->cgrp[i], name);
		}
	}
}

static const struct params filp_params[] = {
	[ PARAM_CPU_STAT ]	= { "cpuacct.usage", CPU_SUBSYS },
	[ PARAM_CPU_LIMIT ]	= { "cpu.cfs_quota_us", CPU_SUBSYS },
	[ PARAM_CPU_CHWT ]	= { "cpu.shares", CPU_SUBSYS },
	[ PARAM_CPU_THROTTLE ]	= { "cpu.stat", CPU_SUBSYS },
#ifdef HAVE_PIDS_CGRP
#if OPENVZ_VERSION == 0
	[ PARAM_PIDS_LIMIT ]	= { "pids.max", PIDS_SUBSYS },
	[ PARAM_PIDS_CURRENT ]	= { "pids.current", PIDS_SUBSYS },
	[ PARAM_PIDS_EVENTS ]	= { "pids.events", PIDS_SUBSYS },
#endif
#endif
#ifdef CONFIG_KERNFS
#if OPENVZ_VERSION == 0
	[ PARAM_PIDS_ENTER ]	= { "tasks", PIDS_SUBSYS },
	[ PARAM_CPU_ENTER ]	= { "tasks", CPU_SUBSYS },
	[ PARAM_MEM_ENTER ]	= { "tasks", MEM_SUBSYS },
	[ PARAM_FREEZER_ENTER ]	= { "tasks", FREEZER_SUBSYS },
	[ PARAM_BLK_ENTER ]	= { "tasks", BLK_SUBSYS },
#endif
#endif
#if OPENVZ_VERSION == 0
	[ PARAM_MEM_LIMIT ]	= { "memory.limit_in_bytes", MEM_SUBSYS },
	[ PARAM_MEM_STAT ]	= { "memory.usage_in_bytes", MEM_SUBSYS },
	[ PARAM_MEM_ANON_STAT ]	= { "memory.stat", MEM_SUBSYS },
#ifdef HAVE_PIDS_CGRP
	[ PARAM_MEM_FAILCNT ]	= { "memory.oom_control", MEM_SUBSYS },
#endif
#endif
	[ PARAM_NETMARK_ENTER ]	= { "tasks", NETMARK_SUBSYS },
	[ PARAM_NETMARK_MARK ]	= { "net_cls.classid", NETMARK_SUBSYS },
};

int generic_filps_open(struct c_private *c)
{
	return cgrp_populate_dir(c->cgrp, c->filps, filp_params,
				 ARRAY_SIZE(filp_params));
}

static void generic_filps_close(struct c_private *lcontext)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(filp_params); i++) {
		if (lcontext->filps[i] != NULL) {
			cgrp_param_release(lcontext->filps[i]);
			lcontext->filps[i] = NULL;
		}
	}
}

void generic_lvp_path(char *name, uint32_t lvp_id)
{
	/* XXX we don't need an exception for ROOT_LVP, because the
	 * exception moved to init_lvp */
	snprintf(name, MAX_GRP_NAMESZ - 1, LVP_FMT, lvp_id);
}

void generic_lve_path(char *name, uint32_t id)
{
	snprintf(name, MAX_GRP_NAMESZ - 1, LVE_FMT, id);
}

/************************* LVP *************************************/

int lve_extra_init(uint32_t id, struct c_private *lcontext)
{
	int rc;

	rc = generic_filps_open(lcontext);
	if (rc < 0)
		goto out;

	rc = generic_cgroup_disable_swappiness(lcontext->cgrp[MEM_SUBSYS]);
	if (rc < 0)
		goto out;

	/* make sure class major is 0x1 (or higher) */
	rc = cgrp_param_set_u64(lcontext->filps[PARAM_NETMARK_MARK],
				id + 0x10000);
	/* error exit */
out:
	LVE_DBG("error rc %d\n", rc);
	if (rc < 0)
		generic_filps_close(lcontext);

	return rc;
}
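/*
 * Worked example for the classid written above (a sketch, assuming the
 * conventional net_cls classid layout of (major << 16) | minor):
 * id = 5  ->  5 + 0x10000 = 0x10005, i.e. tc handle 1:5, so the major
 * class stays 0x1 and the LVE id becomes the minor class.
 */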
static int init_lvp(struct lvp_ve_private *lvp, enum subsys_id subsys,
		    char *name)
{
	struct lvp_private *lvpp = os_lvp_private(lvp);
	struct c_private *c = lve_private(lvp->lvp_default);

	/* XXX hack until we have a 2-level cpu scheduler */
	if (lvp->lvp_id == ROOT_LVP) {
		/* ubc0 name is fixed. */
		LVE_DBG("root lvp init %s - subsys %d\n", name, subsys);
		lvpp->lve_root[subsys] = cmnt[subsys].cgrp_root;
		c->cgrp[subsys] = lvpp->lve_root[subsys];
		dget(c->cgrp[subsys]);
		c->flags = 0;
		goto out_lvpp_ref;
	}

	c->flags = get_flags();
	/* We hold a reference to the parent CPU cgroup, so it exists */
	c->cgrp[subsys] = lve_cgroup_kernel_open(cmnt[subsys].cgrp_root,
						 c->flags, name);
	if (IS_ERR(c->cgrp[subsys])) {
		LVE_ERR("Can't open cgroup %s, err %ld\n", name,
			PTR_ERR(c->cgrp[subsys]));
		return PTR_ERR(c->cgrp[subsys]);
	}
	lvpp->lve_root[subsys] = c->cgrp[subsys];

out_lvpp_ref:
	dget(lvpp->lve_root[subsys]);

	return 0;
}

static void fini_lvp(struct lvp_ve_private *lvp, enum subsys_id subsys)
{
	struct lvp_private *lvpp = os_lvp_private(lvp);
	struct c_private *c = lve_private(lvp->lvp_default);

	LVE_ENTER("close %u[%d] <> %px %px\n", lvp->lvp_id, subsys, lvp,
		  lvpp->lve_root[subsys]);

	if (lvp->lvp_id == ROOT_LVP)
		lvpp->lve_root[subsys] = c->cgrp[subsys];

	if (lvpp->lve_root[subsys])
		lve_cgroup_kernel_close(lvpp->lve_root[subsys]);
}

/**
 * UBC has external requirements for the name, so pass it as a parameter.
 */
int generic_lvp_init(struct lvp_ve_private *lvp, char *name)
{
	int i;
	int rc;

	for (i = 0; i < NR_SUBSYS; i++) {
		rc = init_lvp(lvp, i, name);
		if (rc < 0) {
			LVE_ERR("init_lvp failed for lvp %u, subsys %d\n",
				lvp->lvp_id, i);
			return rc;
		}
	}

	return lve_extra_init(lvp->lvp_id, lve_private(lvp->lvp_default));
}

void generic_lvp_fini(struct lvp_ve_private *lvp)
{
	int i;

	for (i = (NR_SUBSYS - 1); i >= 0; i--)
		fini_lvp(lvp, i);
}

/************************* LVE *************************************/

int init_lve(struct light_ve *ve, enum subsys_id subsys, char *name)
{
	struct lvp_ve_private *lvp = ve->lve_lvp;
	struct c_private *lcontext = lve_private(ve);
	struct dentry *cgrp = NULL;
	struct lvp_private *lvpp = os_lvp_private(lvp);

	cgrp = lve_call(lve_cgroup_kernel_open(lvpp->lve_root[subsys],
			LVE_CGRP_CREAT | LVE_CGRP_EXCL, name),
			LVE_FAIL_CGRP_OPEN, ERR_PTR(-EEXIST));
	if (IS_ERR(cgrp)) {
		LVE_ERR("subsys %d %s err %ld\n", subsys, name, PTR_ERR(cgrp));
		return PTR_ERR(cgrp);
	}
	lcontext->cgrp[subsys] = cgrp;
	LVE_DBG("subsys %d group %px\n", subsys, cgrp);

	return 0;
}

/**
 * UBC has external requirements for the name, so pass it as a parameter.
 */
int generic_lve_init(struct light_ve *ve, char *name)
{
	struct c_private *lcontext = lve_private(ve);
	uint32_t id = ve->lve_id;
	int i;
	int rc;

	lcontext->flags = get_flags();
	for (i = 0; i < NR_SUBSYS; i++) {
		rc = init_lve(ve, i, name);
		if (rc < 0)
			return rc;
	}

	return lve_extra_init(id, lcontext);
}

void generic_lve_fini(struct light_ve *ve)
{
	uint32_t id = ve->lve_id;
	struct c_private *lcontext = lve_private(ve);
	int i;

	LVE_ENTER("(id=%u, lcontext=%px)\n", id, lcontext);

	generic_filps_close(lcontext);

	for (i = (NR_SUBSYS - 1); i >= 0; i--) {
		if (lcontext->cgrp[i] && !IS_ERR(lcontext->cgrp[i])) {
			LVE_DBG("release %d\n", i);
			lve_cgroup_release(lcontext->cgrp[i], lcontext->flags);
			lcontext->cgrp[i] = LVE_POISON_PTR;
		}
	}
}

/************************* LVE end *************************************/

static const uint32_t enter_param[] = {
	[ CPU_SUBSYS ]		= PARAM_CPU_ENTER,
	[ MEM_SUBSYS ]		= PARAM_MEM_ENTER,
	[ BLK_SUBSYS ]		= PARAM_BLK_ENTER,
	[ FREEZER_SUBSYS ]	= PARAM_FREEZER_ENTER,
	[ PIDS_SUBSYS ]		= PARAM_PIDS_ENTER,
	[ NETMARK_SUBSYS ]	= PARAM_NETMARK_ENTER,
#if OPENVZ_VERSION > 0
	[ UB_SUBSYS ]		= 0,
#endif
};

static const uint32_t enter_fail[] = {
	[ CPU_SUBSYS ]		= LVE_FAIL_CPU_ATTACH_TSK,
	[ MEM_SUBSYS ]		= LVE_FAIL_RES_ATTACH_TASK,
	[ BLK_SUBSYS ]		= LVE_FAIL_RES_ATTACH_TASK,
	[ FREEZER_SUBSYS ]	= LVE_FAIL_FREEZER_ATTACH_TSK,
	[ PIDS_SUBSYS ]		= LVE_FAIL_RES_ATTACH_TASK,
	[ NETMARK_SUBSYS ]	= 0,
#if OPENVZ_VERSION > 0
	[ UB_SUBSYS ]		= 0,
#endif
};
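/*
 * What the non-OpenVZ branch of _cgroup_enter() below amounts to, seen from
 * userspace (a sketch; the actual mount point and group path depend on the
 * host configuration):
 *
 *	echo <pid> > /sys/fs/cgroup/<subsys>/<lve group>/tasks
 *
 * i.e. the task is attached by writing its pid into the per-subsystem
 * "tasks" file opened in filp_params[], one subsystem at a time.
 */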
static int _cgroup_enter(struct task_struct *task, struct c_private *lcontext,
			 enum subsys_id subsys)
{
	int rc = 0;
	uint32_t fail;
	uint32_t filp;
#if OPENVZ_VERSION == 0
	const struct cred *old_creds;
#endif

	BUG_ON(subsys >= ARRAY_SIZE(enter_fail));
	fail = enter_fail[subsys];
	filp = enter_param[subsys];

	if (lve_fail_check(fail))
		return -ENOMEM;

#if OPENVZ_VERSION > 0
	rc = lve_cgroup_kernel_attach(__d_cgrp(lcontext->cgrp[subsys]), task);
#else
	/*
	 * Temporarily elevate creds to bypass the security checks in
	 * __cgroup1_procs_write(), so we get behaviour similar to a direct
	 * cgroup_attach_task() call.
	 * For thread-safety/security we can simply override the current
	 * "cred" pointer.
	 */
	old_creds = override_creds(lve_init_cred);
	rc = cgrp_param_set_u64(lcontext->filps[filp], task->pid);
	revert_creds(old_creds);
#endif
	if (rc != 0)
		LVE_ERR("cpu attach task failed with %d\n", rc);

	return rc;
}

int os_loadavg_count(struct light_ve *ve)
{
	struct c_private *c = lve_private(ve);
	struct cgroupstats stats = { 0 };

	lve_cgroupstats_build(&stats,
			c->filps[PARAM_CPU_STAT]->f_path.dentry->d_parent);

	return stats.nr_running + stats.nr_uninterruptible;
}

int generic_cgroup_enter(struct task_struct *task, struct c_private *lcontext,
			 unsigned long subsys_mask)
{
	int i;
	int rc = 0;

	subsys_mask &= CGROUPS_SUPPORTED;
	for (i = 0; i < NR_SUBSYS; i++) {
		if (!(subsys_mask & (1 << i)))
			continue;
		rc = _cgroup_enter(task, lcontext, i);
		if (rc != 0)
			break;
	}

	return rc;
}

int lve_cgroup_chwt_set(struct c_private *lcontext, unsigned int new)
{
	return lve_call(cgrp_param_set_u64(lcontext->filps[PARAM_CPU_CHWT],
			new * 1024 / 100), LVE_FAIL_SET_CPU_CHWT, -EINVAL);
}

/* 100% for one core */
/* cgroupfs api has low precision for the cpu limit; this needs to change
 * in case of switching to the direct setup */
#define default_cfs_period	100000ULL
#define cfs_quota_min		1000ULL

/* calculate a cpu power */
static u64 lve_cgroup_cpu_limit(int32_t n_cpus, int32_t cpu)
{
	u64 cpus_lim;
	u64 cpu_lim;

	if (n_cpus == 0)
		n_cpus = num_online_cpus();
	if (cpu == 0)
		cpu = num_online_cpus() * MAX_CPU;

	cpus_lim = MAX_CPU * n_cpus;
	cpu_lim = DIV_ROUND_UP(min_t(u64, cpus_lim, cpu) * default_cfs_period,
			       MAX_CPU);

	LVE_DBG("%u %u : %llu %llu\n", n_cpus, cpu, cpus_lim, cpu_lim);

	return cpu_lim;
}
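/*
 * Worked example for lve_cgroup_cpu_limit() above (a sketch, assuming
 * MAX_CPU represents 100% of a single core):
 * n_cpus = 2, cpu = 150  ->  cpus_lim = 200, min(200, 150) = 150,
 * cpu_lim = 150 * 100000 / 100 = 150000, i.e. a cfs quota of 150000us
 * per 100000us period, which corresponds to 1.5 cores.
 */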
/**
 * r_ncpu, r_cpu - reseller NCPU, CPU limits,
 * c_ncpu, c_cpu - customer NCPU, CPU limits,
 * returns the actual value to set as the cpu limit
 */
static int lve_cgroup_real_cpu_limit(struct c_private *lcontext,
				     uint32_t r_ncpu, uint32_t r_cpu,
				     uint32_t c_ncpu, uint32_t c_cpu)
{
	u64 reseller;
	u64 customer;
	u64 limit;
	struct light_ve *lve = os_lve(lcontext);

	reseller = lve_cgroup_cpu_limit(r_ncpu, r_cpu);
	customer = lve_cgroup_cpu_limit(c_ncpu, c_cpu);

	/* Check if we are changing limits for a reseller */
	if (lve == lve->lve_lvp->lvp_default)
		limit = customer;
	else
		limit = min(reseller, customer);

	limit = max(limit, cfs_quota_min);

	return cgrp_param_set_s64(lcontext->filps[PARAM_CPU_LIMIT],
				  limit > 0 ? limit : -1);
}

int lve_cgroup_cpus_set(struct c_private *lcontext, lve_limits_t reseller,
			lve_limits_t old, unsigned int ncpus)
{
	LVE_DBG("set cpu affinity to %d\n", ncpus);

	if (lve_fail_check(LVE_FAIL_SET_CPUS_LIM))
		return -ENOMEM;

	return lve_cgroup_real_cpu_limit(lcontext, reseller[LIM_CPUS],
					 reseller[LIM_CPU], ncpus,
					 old[LIM_CPU]);
}

/* try to assume one */
int lve_cgroup_cpu_set(struct c_private *lcontext, lve_limits_t reseller,
		       lve_limits_t old, unsigned int new_cpu)
{
	LVE_DBG("set cpu limit %u\n", new_cpu);

	if (lve_fail_check(LVE_FAIL_SET_CPU_LIM))
		return -ENOMEM;

	return lve_cgroup_real_cpu_limit(lcontext, reseller[LIM_CPUS],
					 reseller[LIM_CPU], old[LIM_CPUS],
					 new_cpu);
}

/******************** CGROUP MEM *******************/

int generic_cgroup_disable_swappiness(struct dentry *mem_cgrp)
{
	int ret = 0;

	LVE_DBG("%px %px\n", cmnt[MEM_SUBSYS].mnt_root, mem_cgrp);

	if (mem_swappiness)
		ret = cgrp_param_open_write_string(cmnt[MEM_SUBSYS].mnt_root,
						   mem_cgrp,
						   "memory.swappiness",
						   "0", 1);

	return ret;
}
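/*
 * For reference, the knob written above is the cgroup v1 memory.swappiness
 * file; writing "0" tells the memory controller to avoid swapping anonymous
 * pages for this group. A userspace sketch of the equivalent operation
 * (path depends on the host configuration):
 *
 *	echo 0 > /sys/fs/cgroup/memory/<lve group>/memory.swappiness
 */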