#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/cred.h>

#include "lve_internal.h"
#include "lve_debug.h"
#include "resource.h"
#include "cgroup_lib.h"
#include "cgroup_generic.h"

bool lve_kill_on_shrink = 1;
module_param(lve_kill_on_shrink, bool, 0644);

void lve_cgroup_release(struct dentry *cgrp, int flags)
{
	const char *name = cgrp->d_name.name;

	if (flags == LVE_CGRP_CREAT) {
		int ret = lve_cgroup_kernel_remove(cgrp);

		LVE_DBG("cgroup %s(%p) is removed, ret=%d\n", name, cgrp, ret);
	}
	lve_cgroup_kernel_close(cgrp);
}

void generic_resource_unlink(uint32_t id, struct c_private *lcontext,
			     char *name)
{
	int i;

	LVE_ENTER("(id=%u, lcontext=%px)\n", id, lcontext);

	lcontext->unlink_id = get_unlink_id();
	snprintf(name, MAX_GRP_NAMESZ - 1, UNLINK_FORMAT, lcontext->unlink_id);

	cgrp_param_set_s64(lcontext->filps[PARAM_CPU_LIMIT], -1);

	for (i = 0; i < NR_SUBSYS; i++) {
		if (lcontext->cgrp[i] && !IS_ERR(lcontext->cgrp[i])) {
			LVE_DBG("subsys %u\n", i);
			cgrp_obfuscate(lcontext->cgrp[i], name);
		}
	}
}

static const struct params filp_params[] = {
	[ PARAM_CPU_STAT ]	= { "cpuacct.usage", CPU_SUBSYS },
	[ PARAM_CPU_LIMIT ]	= { "cpu.cfs_quota_us", CPU_SUBSYS },
	[ PARAM_CPU_CHWT ]	= { "cpu.shares", CPU_SUBSYS },
	[ PARAM_CPU_THROTTLE ]	= { "cpu.stat", CPU_SUBSYS },
#ifdef HAVE_PIDS_CGRP
#if OPENVZ_VERSION == 0
	[ PARAM_PIDS_LIMIT ]	= { "pids.max", PIDS_SUBSYS },
	[ PARAM_PIDS_CURRENT ]	= { "pids.current", PIDS_SUBSYS },
	[ PARAM_PIDS_EVENTS ]	= { "pids.events", PIDS_SUBSYS },
#endif
#endif
#ifdef CONFIG_KERNFS
#if OPENVZ_VERSION == 0
	[ PARAM_PIDS_ENTER ]	= { "tasks", PIDS_SUBSYS },
	[ PARAM_CPU_ENTER ]	= { "tasks", CPU_SUBSYS },
	[ PARAM_MEM_ENTER ]	= { "tasks", MEM_SUBSYS },
	[ PARAM_FREEZER_ENTER ]	= { "tasks", FREEZER_SUBSYS },
	[ PARAM_BLK_ENTER ]	= { "tasks", BLK_SUBSYS },
#endif
#endif
#if OPENVZ_VERSION == 0
	[ PARAM_MEM_LIMIT ]	= { "memory.limit_in_bytes", MEM_SUBSYS },
	[ PARAM_MEM_STAT ]	= { "memory.usage_in_bytes", MEM_SUBSYS },
	[ PARAM_MEM_ANON_STAT ]	= { "memory.stat", MEM_SUBSYS },
#ifdef HAVE_PIDS_CGRP
	[ PARAM_MEM_FAILCNT ]	= { "memory.oom_control", MEM_SUBSYS },
#endif
#endif
	[ PARAM_NETMARK_ENTER ]	= { "tasks", NETMARK_SUBSYS },
	[ PARAM_NETMARK_MARK ]	= { "net_cls.classid", NETMARK_SUBSYS },
};

int generic_filps_open(struct c_private *c)
{
	return cgrp_populate_dir(c->cgrp, c->filps, filp_params,
				 ARRAY_SIZE(filp_params));
}

static void generic_filps_close(struct c_private *lcontext)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(filp_params); i++) {
		if (lcontext->filps[i] != NULL) {
			cgrp_param_release(lcontext->filps[i]);
			lcontext->filps[i] = NULL;
		}
	}
}

void generic_lvp_path(char *name, uint32_t lvp_id)
{
	/* XXX we don't need an exception for ROOT_LVP, because the
	 * exception moved to init_lvp */
	snprintf(name, MAX_GRP_NAMESZ - 1, LVP_FMT, lvp_id);
}

void generic_lve_path(char *name, uint32_t id)
{
	snprintf(name, MAX_GRP_NAMESZ - 1, LVE_FMT, id);
}

/************************* LVP *************************************/

int lve_extra_init(uint32_t id, struct c_private *lcontext)
{
	int rc;

	rc = generic_filps_open(lcontext);
	if (rc < 0)
		goto out;

	rc = generic_cgroup_disable_swappiness(lcontext->cgrp[MEM_SUBSYS]);
	if (rc < 0)
		goto out;

	/* make sure class major is 0x1 (or higher) */
	rc = cgrp_param_set_u64(lcontext->filps[PARAM_NETMARK_MARK],
				id + 0x10000);
	/* error exit */
out:
	LVE_DBG("error rc %d\n", rc);
	if (rc < 0)
		generic_filps_close(lcontext);

	return rc;
}
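/*
 * Worked example for the classid written above (a sketch, assuming the
 * conventional net_cls classid layout of (major << 16) | minor):
 * id = 5  ->  5 + 0x10000 = 0x10005, i.e. tc handle 1:5, so the major
 * class stays 0x1 and the LVE id becomes the minor class.
 */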
static int init_lvp(struct lvp_ve_private *lvp, enum subsys_id subsys,
		    char *name)
{
	struct lvp_private *lvpp = os_lvp_private(lvp);
	struct c_private *c = lve_private(lvp->lvp_default);

	/* XXX hack until we have a 2-level cpu scheduler */
	if (lvp->lvp_id == ROOT_LVP) {
		/* ubc0 name is fixed. */
		LVE_DBG("root lvp init %s - subsys %d\n", name, subsys);
		lvpp->lve_root[subsys] = cmnt[subsys].cgrp_root;
		c->cgrp[subsys] = lvpp->lve_root[subsys];
		dget(c->cgrp[subsys]);
		c->flags = 0;
		goto out_lvpp_ref;
	}

	c->flags = get_flags();
	/* We hold a reference to the parent CPU cgroup, so it exists */
	c->cgrp[subsys] = lve_cgroup_kernel_open(cmnt[subsys].cgrp_root,
						 c->flags, name);
	if (IS_ERR(c->cgrp[subsys])) {
		LVE_ERR("Can't open cgroup %s, err %ld\n", name,
			PTR_ERR(c->cgrp[subsys]));
		return PTR_ERR(c->cgrp[subsys]);
	}
	lvpp->lve_root[subsys] = c->cgrp[subsys];

out_lvpp_ref:
	dget(lvpp->lve_root[subsys]);

	return 0;
}

static void fini_lvp(struct lvp_ve_private *lvp, enum subsys_id subsys)
{
	struct lvp_private *lvpp = os_lvp_private(lvp);
	struct c_private *c = lve_private(lvp->lvp_default);

	LVE_ENTER("close %u[%d] <> %px %px\n", lvp->lvp_id, subsys, lvp,
		  lvpp->lve_root[subsys]);

	if (lvp->lvp_id == ROOT_LVP)
		lvpp->lve_root[subsys] = c->cgrp[subsys];

	if (lvpp->lve_root[subsys])
		lve_cgroup_kernel_close(lvpp->lve_root[subsys]);
}

/**
 * UBC has external requirements for the name, so pass it as a parameter.
 */
int generic_lvp_init(struct lvp_ve_private *lvp, char *name)
{
	int i;
	int rc;

	for (i = 0; i < NR_SUBSYS; i++) {
		rc = init_lvp(lvp, i, name);
		if (rc < 0) {
			LVE_ERR("init_lvp failed for lvp %u, subsys %d\n",
				lvp->lvp_id, i);
			return rc;
		}
	}

	return lve_extra_init(lvp->lvp_id, lve_private(lvp->lvp_default));
}

void generic_lvp_fini(struct lvp_ve_private *lvp)
{
	int i;

	for (i = (NR_SUBSYS - 1); i >= 0; i--)
		fini_lvp(lvp, i);
}

/************************* LVE *************************************/

int init_lve(struct light_ve *ve, enum subsys_id subsys, char *name)
{
	struct lvp_ve_private *lvp = ve->lve_lvp;
	struct c_private *lcontext = lve_private(ve);
	struct dentry *cgrp = NULL;
	struct lvp_private *lvpp = os_lvp_private(lvp);

	cgrp = lve_call(lve_cgroup_kernel_open(lvpp->lve_root[subsys],
			LVE_CGRP_CREAT | LVE_CGRP_EXCL, name),
			LVE_FAIL_CGRP_OPEN, ERR_PTR(-EEXIST));
	if (IS_ERR(cgrp)) {
		LVE_ERR("subsys %d %s err %ld\n", subsys, name, PTR_ERR(cgrp));
		return PTR_ERR(cgrp);
	}
	lcontext->cgrp[subsys] = cgrp;
	LVE_DBG("subsys %d group %px\n", subsys, cgrp);

	return 0;
}

/**
 * UBC has external requirements for the name, so pass it as a parameter.
 */
int generic_lve_init(struct light_ve *ve, char *name)
{
	struct c_private *lcontext = lve_private(ve);
	uint32_t id = ve->lve_id;
	int i;
	int rc;

	lcontext->flags = get_flags();
	for (i = 0; i < NR_SUBSYS; i++) {
		rc = init_lve(ve, i, name);
		if (rc < 0)
			return rc;
	}

	return lve_extra_init(id, lcontext);
}

void generic_lve_fini(struct light_ve *ve)
{
	uint32_t id = ve->lve_id;
	struct c_private *lcontext = lve_private(ve);
	int i;

	LVE_ENTER("(id=%u, lcontext=%px)\n", id, lcontext);

	generic_filps_close(lcontext);

	for (i = (NR_SUBSYS - 1); i >= 0; i--) {
		if (lcontext->cgrp[i] && !IS_ERR(lcontext->cgrp[i])) {
			LVE_DBG("release %d\n", i);
			lve_cgroup_release(lcontext->cgrp[i], lcontext->flags);
			lcontext->cgrp[i] = LVE_POISON_PTR;
		}
	}
}

/************************* LVE end *************************************/

static const uint32_t enter_param[] = {
	[ CPU_SUBSYS ]		= PARAM_CPU_ENTER,
	[ MEM_SUBSYS ]		= PARAM_MEM_ENTER,
	[ BLK_SUBSYS ]		= PARAM_BLK_ENTER,
	[ FREEZER_SUBSYS ]	= PARAM_FREEZER_ENTER,
	[ PIDS_SUBSYS ]		= PARAM_PIDS_ENTER,
	[ NETMARK_SUBSYS ]	= PARAM_NETMARK_ENTER,
#if OPENVZ_VERSION > 0
	[ UB_SUBSYS ]		= 0,
#endif
};

static const uint32_t enter_fail[] = {
	[ CPU_SUBSYS ]		= LVE_FAIL_CPU_ATTACH_TSK,
	[ MEM_SUBSYS ]		= LVE_FAIL_RES_ATTACH_TASK,
	[ BLK_SUBSYS ]		= LVE_FAIL_RES_ATTACH_TASK,
	[ FREEZER_SUBSYS ]	= LVE_FAIL_FREEZER_ATTACH_TSK,
	[ PIDS_SUBSYS ]		= LVE_FAIL_RES_ATTACH_TASK,
	[ NETMARK_SUBSYS ]	= 0,
#if OPENVZ_VERSION > 0
	[ UB_SUBSYS ]		= 0,
#endif
};
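/*
 * What the non-OpenVZ branch of _cgroup_enter() below amounts to, seen from
 * userspace (a sketch; the actual mount point and group path depend on the
 * host configuration):
 *
 *	echo <pid> > /sys/fs/cgroup/<subsys>/<lve group>/tasks
 *
 * i.e. the task is attached by writing its pid into the per-subsystem
 * "tasks" file opened in filp_params[], one subsystem at a time.
 */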
static int _cgroup_enter(struct task_struct *task, struct c_private *lcontext,
			 enum subsys_id subsys)
{
	int rc = 0;
	uint32_t fail;
	uint32_t filp;
#if OPENVZ_VERSION == 0
	const struct cred *old_creds;
#endif

	BUG_ON(subsys >= ARRAY_SIZE(enter_fail));
	fail = enter_fail[subsys];
	filp = enter_param[subsys];

	if (lve_fail_check(fail))
		return -ENOMEM;

#if OPENVZ_VERSION > 0
	rc = lve_cgroup_kernel_attach(__d_cgrp(lcontext->cgrp[subsys]), task);
#else
	/*
	 * Temporarily elevate creds to bypass the security checks in
	 * __cgroup1_procs_write(), so we get behaviour similar to a direct
	 * cgroup_attach_task() call.
	 * For thread-safety/security we can simply override the current
	 * "cred" pointer.
	 */
	old_creds = override_creds(lve_init_cred);
	rc = cgrp_param_set_u64(lcontext->filps[filp], task->pid);
	revert_creds(old_creds);
#endif
	if (rc != 0)
		LVE_ERR("cpu attach task failed with %d\n", rc);

	return rc;
}

int os_loadavg_count(struct light_ve *ve)
{
	struct c_private *c = lve_private(ve);
	struct cgroupstats stats = { 0 };

	lve_cgroupstats_build(&stats,
			c->filps[PARAM_CPU_STAT]->f_path.dentry->d_parent);

	return stats.nr_running + stats.nr_uninterruptible;
}

int generic_cgroup_enter(struct task_struct *task, struct c_private *lcontext,
			 unsigned long subsys_mask)
{
	int i;
	int rc = 0;

	subsys_mask &= CGROUPS_SUPPORTED;
	for (i = 0; i < NR_SUBSYS; i++) {
		if (!(subsys_mask & (1 << i)))
			continue;
		rc = _cgroup_enter(task, lcontext, i);
		if (rc != 0)
			break;
	}

	return rc;
}

int lve_cgroup_chwt_set(struct c_private *lcontext, unsigned int new)
{
	return lve_call(cgrp_param_set_u64(lcontext->filps[PARAM_CPU_CHWT],
			new * 1024 / 100), LVE_FAIL_SET_CPU_CHWT, -EINVAL);
}

/* 100% for one core */
/* cgroupfs api has low precision for the cpu limit; this needs to change
 * in case of switching to the direct setup */
#define default_cfs_period	100000ULL
#define cfs_quota_min		1000ULL

/* calculate a cpu power */
static u64 lve_cgroup_cpu_limit(int32_t n_cpus, int32_t cpu)
{
	u64 cpus_lim;
	u64 cpu_lim;

	if (n_cpus == 0)
		n_cpus = num_online_cpus();
	if (cpu == 0)
		cpu = num_online_cpus() * MAX_CPU;

	cpus_lim = MAX_CPU * n_cpus;
	cpu_lim = DIV_ROUND_UP(min_t(u64, cpus_lim, cpu) * default_cfs_period,
			       MAX_CPU);

	LVE_DBG("%u %u : %llu %llu\n", n_cpus, cpu, cpus_lim, cpu_lim);

	return cpu_lim;
}
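/*
 * Worked example for lve_cgroup_cpu_limit() above (a sketch, assuming
 * MAX_CPU represents 100% of a single core):
 * n_cpus = 2, cpu = 150  ->  cpus_lim = 200, min(200, 150) = 150,
 * cpu_lim = 150 * 100000 / 100 = 150000, i.e. a cfs quota of 150000us
 * per 100000us period, which corresponds to 1.5 cores.
 */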
/**
 * r_ncpu, r_cpu - reseller NCPU, CPU limits,
 * c_ncpu, c_cpu - customer NCPU, CPU limits,
 * returns the actual value to set as the cpu limit
 */
static int lve_cgroup_real_cpu_limit(struct c_private *lcontext,
				     uint32_t r_ncpu, uint32_t r_cpu,
				     uint32_t c_ncpu, uint32_t c_cpu)
{
	u64 reseller;
	u64 customer;
	u64 limit;
	struct light_ve *lve = os_lve(lcontext);

	reseller = lve_cgroup_cpu_limit(r_ncpu, r_cpu);
	customer = lve_cgroup_cpu_limit(c_ncpu, c_cpu);

	/* Check if we are changing limits for a reseller */
	if (lve == lve->lve_lvp->lvp_default)
		limit = customer;
	else
		limit = min(reseller, customer);

	limit = max(limit, cfs_quota_min);

	return cgrp_param_set_s64(lcontext->filps[PARAM_CPU_LIMIT],
				  limit > 0 ? limit : -1);
}

int lve_cgroup_cpus_set(struct c_private *lcontext, lve_limits_t reseller,
			lve_limits_t old, unsigned int ncpus)
{
	LVE_DBG("set cpu affinity to %d\n", ncpus);

	if (lve_fail_check(LVE_FAIL_SET_CPUS_LIM))
		return -ENOMEM;

	return lve_cgroup_real_cpu_limit(lcontext, reseller[LIM_CPUS],
					 reseller[LIM_CPU], ncpus,
					 old[LIM_CPU]);
}

/* try to assume one */
int lve_cgroup_cpu_set(struct c_private *lcontext, lve_limits_t reseller,
		       lve_limits_t old, unsigned int new_cpu)
{
	LVE_DBG("set cpu limit %u\n", new_cpu);

	if (lve_fail_check(LVE_FAIL_SET_CPU_LIM))
		return -ENOMEM;

	return lve_cgroup_real_cpu_limit(lcontext, reseller[LIM_CPUS],
					 reseller[LIM_CPU], old[LIM_CPUS],
					 new_cpu);
}

/******************** CGROUP MEM *******************/

int generic_cgroup_disable_swappiness(struct dentry *mem_cgrp)
{
	int ret = 0;

	LVE_DBG("%px %px\n", cmnt[MEM_SUBSYS].mnt_root, mem_cgrp);

	if (mem_swappiness)
		ret = cgrp_param_open_write_string(cmnt[MEM_SUBSYS].mnt_root,
						   mem_cgrp,
						   "memory.swappiness",
						   "0", 1);

	return ret;
}
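/*
 * For reference, the knob written above is the cgroup v1 memory.swappiness
 * file; writing "0" tells the memory controller to avoid swapping anonymous
 * pages for this group. A userspace sketch of the equivalent operation
 * (path depends on the host configuration):
 *
 *	echo 0 > /sys/fs/cgroup/memory/<lve group>/memory.swappiness
 */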