#include #include #include #include #include #include #include #include #include #include #include "lve_internal.h" #include "lve_debug.h" #include "light_ve.h" #include "resource.h" #include "tags.h" struct task_struct *lve_init_task; static LIST_HEAD(lve_init_list); static LIST_HEAD(lve_cleanup_list); static spinlock_t lve_init_lock; static spinlock_t lve_cleanup_lock; static DECLARE_WAIT_QUEUE_HEAD(lve_init_wait); struct kmem_cache *lve_struct; struct rw_semaphore global_tree_lock; struct radix_tree_root global_tree; static void lve_add_list(struct light_ve *ve_new); static int lve_submit_to_init(struct light_ve *ptr); static int lve_submit_to_cleanup(struct light_ve *ptr); /** * lve namespace callback */ char *lve_ns_callback = ""; #ifdef HAVE_UMH_OLD #define call_umh(path, argv, env, wait, init, clean, data) \ call_usermodehelper_fns(path, argv, env, wait, init, clean, data) #else static int call_umh(char *path, char **argv, char **envp, int wait, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? 
GFP_ATOMIC : GFP_KERNEL; info = call_usermodehelper_setup(path, argv, envp, gfp_mask, init, cleanup, data); if (info == NULL) return -ENOMEM; return call_usermodehelper_exec(info, wait); } #endif static int lve_namespace_setup_helper(struct subprocess_info *info, struct cred *new) { return lve_namespace_enter(current, info->data, NULL); } static void lve_namespace_fini_helper(struct subprocess_info *info) { lve_finish_init(info->data, info->retval); } static int lve_mntns_setup(struct light_ve *ve) { char ve_id[11]; /* 32 * log(2) / log(10) + 1 */ char vp_id[11]; /* 32 * log(2) / log(10) + 1 */ char *argv[] = { lve_ns_callback, "setup", vp_id, ve_id, NULL }; char *envp[] = { NULL }; sprintf(ve_id, "%u", ve->lve_id); sprintf(vp_id, "%u", ve->lve_lvp->lvp_id); return call_umh(lve_ns_callback, argv, envp, 0, lve_namespace_setup_helper, lve_namespace_fini_helper, ve); } /*******************************************************************/ /** create containers in separate thread with administrative rights to avoid problems with permissions checks (in CL6) and EBUSY (in CL5). LVE create a similar to create a inode. container locked after creation with LVE_BIT_INIT flag and waiting on that bit until creation will finished. if initialization failed, container marked by flag LVE_BIT_ERROR and unlinked from tree, so next access will start from creation new container for that id. */ static int lve_first_init(struct light_ve *ve) { struct lvp_ve_private *lvp; int rc; /* * Here we can't get lvp from current context, because all work threads * executes in container with veid == 0. 
*/ light_ve_get(ve); lve_stat_init(&ve->lve_stats); lve_net_init(ve); rc = lve_resources_init(ve); if (rc < 0) { LVE_ERR("res init %d\n", rc); goto out; } lvp = ve->lve_lvp; rc = lve_resources_setup(ve, lvp->lvp_def_limits, true); if (rc < 0) { LVE_ERR("res setup %d\n", rc); goto out; } ve->lve_custom = 0; lve_stats_dir_init(ve); if (!lve_no_namespaces && lve_ns_callback[0] != '\0') { /* once upcall set - lets wait until upcall finished * it should be last step as it call lve_finish_setup * internally*/ rc = lve_mntns_setup(ve); if (rc == 0) return 0; } out: lve_finish_init(ve, rc); return rc; } void lve_finish_init(struct light_ve *ve, int rc) { struct lvp_ve_private *lvp = ve->lve_lvp; down_write(&lvp_tree_lock(lvp)); if (rc) { set_bit(LVE_BIT_ERROR, &ve->lve_bit_flag); } else { lve_add_list(ve); } up_write(&lvp_tree_lock(lvp)); clear_bit(LVE_BIT_INIT, &ve->lve_bit_flag); lve_mb_after_clear_bit(); wake_up_bit(&ve->lve_bit_flag, LVE_BIT_INIT); if (rc) lve_unlink(lvp, LVE_UNLINK_VE, ve->lve_id); light_ve_put(ve); } void light_ve_free(struct light_ve *ptr) { LVE_ENTER("%p : %u\n", ptr, ptr->lve_id); lve_resources_free(ptr); /* SELF_LVE don't have a refs for LVP itself */ if (ptr == ptr->lve_lvp->lvp_default) lvp_free(ptr->lve_lvp); else lvp_put(ptr->lve_lvp); kmem_cache_free(lve_struct, ptr); } static int lve_add_to_first_init(struct light_ve *ve) { if (ve == NULL) { LVE_ERR("ve == NULL\n"); return -EINVAL; } spin_lock(&lve_init_lock); list_add_tail(&ve->lve_init_link, &lve_init_list); wake_up(&lve_init_wait); spin_unlock(&lve_init_lock); return 0; } static int lve_add_to_final_cleanup(struct light_ve *ve) { if (ve == NULL) { LVE_ERR("ve == NULL\n"); return -EINVAL; } spin_lock(&lve_cleanup_lock); list_add_tail(&ve->lve_init_link, &lve_cleanup_list); wake_up(&lve_init_wait); spin_unlock(&lve_cleanup_lock); return 0; } static void lve_flush_cleanup_list(void) { struct light_ve *ve; while (!list_empty(&lve_cleanup_list)) { spin_lock(&lve_cleanup_lock); ve = 
list_first_entry(&lve_cleanup_list, struct light_ve, lve_init_link); list_del(&ve->lve_init_link); spin_unlock(&lve_cleanup_lock); LVE_DBG("cleanup lve id=%d\n", ve->lve_id); light_ve_free(ve); /* make way for faster inits so their waiters should not lock up */ if (!list_empty(&lve_init_list)) break; } } static void lve_flush_init_list(void) { struct light_ve *ve; while (!list_empty(&lve_init_list)) { spin_lock(&lve_init_lock); ve = list_first_entry(&lve_init_list, struct light_ve, lve_init_link); list_del_init(&ve->lve_init_link); spin_unlock(&lve_init_lock); LVE_DBG("adding lve with id %d\n", ve->lve_id); lve_first_init(ve); } } int lve_init_thread(void *data) { while (!kthread_should_stop() || !list_empty(&lve_cleanup_list)) { wait_event_freezable(lve_init_wait, (!list_empty(&lve_init_list) || !list_empty(&lve_cleanup_list) || kthread_should_stop())); lve_flush_cleanup_list(); lve_flush_init_list(); } BUG_ON(!list_empty(&lve_init_list)); BUG_ON(!list_empty(&lve_cleanup_list)); return 0; } static int lve_init_threads_init(void) { spin_lock_init(&lve_init_lock); spin_lock_init(&lve_cleanup_lock); lve_init_task = kthread_create(lve_init_thread, NULL, "lve_init_thread"); if (IS_ERR(lve_init_task)) { LVE_ERR("Can't create lve_init_thread, err: %lu", PTR_ERR(lve_init_task)); return PTR_ERR(lve_init_task); } wake_up_process(lve_init_task); return 0; } static int lve_init_threads_fini(void) { kthread_stop(lve_init_task); return 0; } int lve_list_init() { int ret = 0; lve_struct = lve_call(kmem_cache_create("lve_struct", sizeof(struct light_ve) + os_context_private_sz(), 0, 0, NULL), LVE_FAIL_ALLOC_LVE_CACHE, NULL); if (lve_struct == NULL) { LVE_ERR("Can't create cache lve_struct!\n"); return -ENOMEM; } ret = lve_call(lve_init_threads_init(), LVE_FAIL_INIT_THRDS_INIT, -ENOMEM); if (ret) goto threads_err; return 0; threads_err: kmem_cache_destroy(lve_struct); return ret; #if 0 err: lve_list_fini(); return -ENOMEM; #endif } void lve_list_fini() { lve_init_threads_fini(); 
wait_event_freezable(lve_init_wait, list_empty(&lve_cleanup_list)); BUG_ON(!list_empty(&lve_init_list)); BUG_ON(!list_empty(&lve_cleanup_list)); kmem_cache_destroy(lve_struct); } void lve_last_put(struct light_ve *ptr) { BUG_ON(!list_empty(&ptr->lve_link)); BUG_ON(!ptr->lve_unlinked); lve_submit_to_cleanup(ptr); } #ifdef HAVE_WAIT_BIT_4ARGS static int __lve_wait_init(void *word) { schedule(); return 0; } static void lve_wait_to_init(struct light_ve *lve) { wait_on_bit(&lve->lve_bit_flag, LVE_BIT_INIT, __lve_wait_init, TASK_UNINTERRUPTIBLE); } #else static void lve_wait_to_init(struct light_ve *lve) { wait_on_bit(&lve->lve_bit_flag, LVE_BIT_INIT, TASK_UNINTERRUPTIBLE); } #endif struct light_ve * __lve_find(struct lvp_ve_private *lvp, uint32_t ve_id) { struct light_ve *lve; lve = lve_call(radix_tree_lookup(&lvp_tree(lvp), ve_id), LVE_FAIL_LVE_LOOKUP, NULL); if (lve) light_ve_get(lve); return lve; } struct light_ve * _lve_find(struct lvp_ve_private *lvp, uint32_t ve_id) { struct light_ve *ve; if (ve_id == ROOT_LVE) { WARN_ON(1); return NULL; } if (ve_id == SELF_LVE) { light_ve_get(lvp->lvp_default); return lvp->lvp_default; } down_read(&lvp_tree_lock(lvp)); ve = __lve_find(lvp, ve_id); up_read(&lvp_tree_lock(lvp)); if (ve) { lve_wait_to_init(ve); if (test_bit(LVE_BIT_ERROR, &ve->lve_bit_flag)) { light_ve_put(ve); ve = NULL; } } return ve; } struct light_ve *lve_find(uint32_t lvp_id, uint32_t ve_id) { struct lvp_ve_private *lvp; struct light_ve *lve; lvp = lvp_find(lvp_id); if (lvp == NULL) return NULL; lve = _lve_find(lvp, ve_id); lvp_put(lvp); return lve; } static void lve_add_list(struct light_ve *ve_new) { struct light_ve *ve; struct list_head *ve_prev; struct lvp_ve_private *lvp = ve_new->lve_lvp; ve_prev = &lvp->lvp_lve_list; list_for_each_entry(ve, &lvp->lvp_lve_list, lve_link) { BUG_ON(ve->lve_id == ve_new->lve_id); if (ve->lve_id > ve_new->lve_id) { list_add(&ve_new->lve_link, ve_prev); return; } ve_prev = &ve->lve_link; } list_add(&ve_new->lve_link, ve_prev); 
} static struct light_ve * _lve_add(struct lvp_ve_private *lvp, struct light_ve *ve_new) { int rc; rc = lve_call(radix_tree_insert(&lvp_tree(lvp), ve_new->lve_id, ve_new), LVE_FAIL_LVE_INSRT, -ENOMEM); /* We could race with adding something ...*/ if (rc == -EEXIST) return __lve_find(lvp, ve_new->lve_id); if (rc < 0) return ERR_PTR(rc); light_ve_get(ve_new); ve_new->lve_unlinked = 0; return ve_new; } void lve_kill_all_threads(uint32_t ve_id, uint32_t lve_id) { struct task_struct *t, *p; struct switch_data *sw_data; uint32_t id = 0; uint64_t nid = NODEID_ENCODE(ve_id, lve_id); /* XXX: should we lock cgroups? */ task_rlock(); lve_do_each_thread(t, p) { if (p == current) continue; sw_data = LVE_TAG_GET(p); if (sw_data == NULL) continue; if (sw_data->sw_from == NULL || sw_data->sw_flags & LVE_ENTER_NO_KILLABLE) { LVE_TAG_PUT(sw_data); LVE_DBG("task %s is not killable\n", p->comm); continue; } id = sw_data->sw_from->lve_id; LVE_TAG_PUT(sw_data); if (NODEID_ENCODE(task_veid(p), id) == nid) { /* exit early for zombies */ if (p->mm == NULL) continue; lve_kill(p); } } lve_while_each_thread(t, p); task_runlock(); } /* remove lve from lists and kill its threads */ static void lve_clean(struct light_ve *ve, struct list_head *list) { LVE_DBG("cleaning ve %d\n", ve->lve_id); list_del_init(&ve->lve_link); list_move_tail(&ve->lve_init_link, list); ve->lve_unlinked = 1; smp_mb(); /* Let's kill the threads before we release * the lvp_lock. This would allow us to avoid * killing threads from a fresh ve with the * same id. */ #ifdef LVE_PER_VE lve_kill_all_threads(ve->lve_lvp->lvp_ve->veid, ve->lve_id); #else lve_kill_all_threads(0, ve->lve_id); #endif } /* * lve_unlink_generic unlinks a specific ve, all ves or default ves. 
* * target = LVE_UNLINK_VE, LVE_UNLINK_ALL, LVE_UNLINK_DEFAULT * ve = ve for LVE_UNLINK_VE */ int lve_unlink(struct lvp_ve_private *lvp, enum lve_unlink_target target, uint32_t lve_id) { struct light_ve *ve, *tree_ve; int rc = 0; LIST_HEAD(list); LVE_DBG("target=%d ve_id=%u\n", target, lve_id); down_write(&lvp_tree_lock(lvp)); if (target == LVE_UNLINK_VE) { tree_ve = radix_tree_lookup(&lvp_tree(lvp), lve_id); /* we don't expect to remove from mapping where as it mapping * may replaced with new. it should be ok as lve_destroy is * single user for this usecase. */ /* * Be careful not to pick an lve which is being initialized, * it's a valid race, so we can just return ESRCH ... */ if (likely(tree_ve != NULL && !tree_ve->lve_unlinked && !test_bit(LVE_BIT_INIT, &tree_ve->lve_bit_flag))) lve_clean(tree_ve, &list); else rc = -ESRCH; } else { struct list_head *pos, *next; list_for_each_safe(pos, next, &lvp->lvp_lve_list) { ve = list_entry(pos, struct light_ve, lve_link); LVE_DBG("add ve %px - %u\n", ve, ve->lve_id); if (target == LVE_UNLINK_DEFAULT && ve->lve_custom) continue; lve_clean(ve, &list); } } up_write(&lvp_tree_lock(lvp)); while (!list_empty(&list)) { ve = list_first_entry(&list, struct light_ve, lve_init_link); list_del_init(&ve->lve_init_link); lve_stats_dir_fini(ve); #ifndef LVE_PER_VE if (!(lvp->lvp_id == ROOT_LVP && ve == lvp->lvp_default)) #endif lve_resources_unlink(ve); if (!test_bit(LVE_BIT_ERROR, &ve->lve_bit_flag)) lve_lvp_map_del(ve->lve_id, lvp->lvp_id); if (ve != lvp->lvp_default) { down_write(&lvp_tree_lock(lvp)); tree_ve = radix_tree_delete(&lvp_tree(ve->lve_lvp), ve->lve_id); BUG_ON(tree_ve != ve); up_write(&lvp_tree_lock(lvp)); } /* tasks termination barrier */ wait_event(ve->lve_tags_wq, atomic_read(&ve->lve_tags) == 0); light_ve_put(ve); } return rc; } static int lve_submit_to_init(struct light_ve *ptr) { return lve_add_to_first_init(ptr); } static int lve_submit_to_cleanup(struct light_ve *ptr) { return lve_add_to_final_cleanup(ptr); } 
/*
 * Allocate and minimally initialize a new container object.
 * Returns the ve with refcount 1 and LVE_BIT_INIT set, or ERR_PTR on
 * failure.  Ids >= 0x7fffffff (other than SELF_LVE) are rejected.
 */
struct light_ve *lve_alloc(struct lvp_ve_private *lvp, uint32_t ve_id)
{
	struct light_ve *ve;

	LVE_DBG("allocating ve=%u\n", ve_id);
	if (ve_id != SELF_LVE && ve_id >= 0x7fffffff)
		return ERR_PTR(-EOVERFLOW);

	ve = lve_call(kmem_cache_zalloc(lve_struct, GFP_KERNEL),
		      LVE_FAIL_ALLOC_VE, NULL);
	if (!ve) {
		LVE_ERR("Can't allocate memory for new VE %u\n", ve_id);
		return ERR_PTR(-ENOMEM);
	}

	ve->lve_id = ve_id;
	/* Born "initializing": lookups will sleep in lve_wait_to_init()
	 * until the worker thread finishes first-init. */
	ve->lve_bit_flag |= 1 << LVE_BIT_INIT;
	ve->lve_unlinked = 1;
	INIT_LIST_HEAD(&ve->lve_link);
	INIT_LIST_HEAD(&ve->lve_init_link);
	atomic_set(&ve->lve_refcnt, 1);
	atomic_set(&ve->lve_tags, 0);
	init_waitqueue_head(&ve->lve_tags_wq);
	ve->lve_lvp = lvp;
	/* lvp_default isn't init where lets take ref later */
	if (ve_id != SELF_LVE)
		lvp_get(lvp);

	return ve;
}

/*
 * Publish a freshly-allocated @ve in the lvp tree and wait for its init.
 * If another thread won the insertion race, @ve's reference is dropped and
 * the winner is returned instead.  Returns ERR_PTR(-EINVAL) when init
 * failed (LVE_BIT_ERROR) or, without LVE_IN_VE, when the found ve belongs
 * to a different lvp; the reference is dropped on those paths too.
 */
static struct light_ve *__lve_init(struct lvp_ve_private *lvp,
				   struct light_ve *ve)
{
	struct light_ve *old_ve;

	down_write(&lvp_tree_lock(lvp));
	old_ve = _lve_add(lvp, ve);
	up_write(&lvp_tree_lock(lvp));

	/* We inserted our ve: hand it to the init worker thread. */
	if (old_ve == ve)
		lve_submit_to_init(ve);

	if (old_ve != ve) {
		/* Lost the race (or _lve_add failed with ERR_PTR). */
		LVE_DBG("create race for %d\n", (int)ve->lve_id);
		light_ve_put(ve);
		ve = old_ve;
	}

	if (!IS_ERR(ve)) {
		lve_wait_to_init(ve);
		if (test_bit(LVE_BIT_ERROR, &ve->lve_bit_flag)) {
			old_ve = ERR_PTR(-EINVAL);
			goto out;
		}
#ifndef LVE_IN_VE
		if (ve->lve_lvp != lvp) {
			old_ve = ERR_PTR(-EINVAL);
			goto out;
		}
#endif
	}
	return ve;
out:
	light_ve_put(ve);
	return old_ve;
}

/*
 * Look up (and optionally create) container @ve_id inside lvp @lvp_id.
 * Returns a referenced ve, ERR_PTR(-ENOMEM/-ENOENT/...), or NULL when the
 * creation raced with lvp_destroy (NOTE(review): NULL here vs ERR_PTR
 * elsewhere is an inconsistency callers must handle -- verify call sites).
 */
struct light_ve *__lve_find_or_alloc(uint32_t lvp_id, uint32_t ve_id,
				     bool no_create)
{
	struct light_ve *ve;
	struct lvp_ve_private *lvp;

	lvp = lvp_find(lvp_id);
	if (lvp == NULL)
		return ERR_PTR(-ENOMEM);

	/* fault-injection race points for testing */
	LVE_FAIL_RACE(LVE_FAIL_MAP_MOVE_RACE_CREATE);
	LVE_FAIL_RACE(LVE_FAIL_LVP_DESTROY_RACE);

	ve = _lve_find(lvp, ve_id);
	if (ve)
		goto out;
	if (no_create) {
		ve = ERR_PTR(-ENOENT);
		goto out;
	}

	/* lvp_destroy_lock keeps the lvp alive across alloc + publish. */
	down_read(&lvp->lvp_destroy_lock);
	if (!lvp->lvp_destroy) {
		ve = lve_alloc(lvp, ve_id);
		if (!IS_ERR(ve))
			ve = __lve_init(lvp, ve);
	} else {
		/* raced with lvp_destroy */
		ve = NULL;
	}
	up_read(&lvp->lvp_destroy_lock);
out:
	lvp_put(lvp);
	return ve;
}

/* Convenience wrapper: find-or-create with creation enabled. */
struct light_ve *lve_find_or_alloc(uint32_t lvp_id, uint32_t ve_id)
{
	return __lve_find_or_alloc(lvp_id, ve_id, false);
}