Each VE should execute release agent notifications within its own workqueue. This way we achieve more fine-grained control over release_agent work flushing at VE destruction.
(Cherry-picked from 9fbfb5b4cfb87ba7c9dd63eec5e5e27946a38d3c) Signed-off-by: Valeriy Vdovin <[email protected]> Reviewed-by: Kirill Tkhai <[email protected]> --- include/linux/cgroup-defs.h | 10 ++- include/linux/cgroup.h | 2 + include/linux/ve.h | 10 +++ kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 109 ++++++++++++++++++++++++-------- kernel/cgroup/cgroup.c | 12 +++- kernel/ve/ve.c | 48 ++++++++++++++ 7 files changed, 159 insertions(+), 33 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index a8eb94d2f97f..22d84aa0778e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -451,6 +451,13 @@ struct cgroup { */ struct list_head cset_links; + /* + * Linked list running through all cgroups that can + * potentially be reaped by the release agent. Protected by + * release_list_lock + */ + struct list_head release_list; + /* * On the default hierarchy, a css_set for a cgroup with some * susbsys disabled will point to css's which are associated with @@ -488,9 +495,6 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; - /* used to schedule release agent */ - struct work_struct release_agent_work; - /* used to store eBPF programs */ struct cgroup_bpf bpf; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 17ee29f4071b..6693cd36fd82 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -897,6 +897,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); +void cgroup1_release_agent(struct work_struct *work); + #ifdef CONFIG_VE extern void cgroup_mark_ve_root(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); diff --git a/include/linux/ve.h b/include/linux/ve.h index d3c1ab840444..2ab39b607708 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -105,7 +105,15 @@ struct ve_struct { unsigned long 
aio_nr; unsigned long aio_max_nr; #endif + /* + * cgroups, that want to notify about becoming + * empty, are linked to this release_list. + */ + struct list_head release_list; + spinlock_t release_list_lock; + struct workqueue_struct *wq; + struct work_struct release_agent_work; }; struct ve_devmnt { @@ -127,6 +135,8 @@ extern int nr_ve; (ve_is_super(get_exec_env()) && capable(CAP_SYS_ADMIN)) #ifdef CONFIG_VE +void ve_add_to_release_list(struct cgroup *cgrp); +void ve_rm_from_release_list(struct cgroup *cgrp); extern struct ve_struct *get_ve(struct ve_struct *ve); extern void put_ve(struct ve_struct *ve); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 829997989c41..4de66630d456 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -135,6 +135,7 @@ extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; extern struct file_system_type cgroup_fs_type; +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp); /* iterate across the hierarchies */ #define for_each_root(root) \ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 21a7c36fbf44..c1891317ae3a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -784,7 +784,7 @@ void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) - schedule_work(&cgrp->release_agent_work); + ve_add_to_release_list(cgrp); } /* @@ -822,42 +822,95 @@ static inline int cgroup_path_ve_relative(struct cgroup *ve_root_cgrp, */ void cgroup1_release_agent(struct work_struct *work) { - struct cgroup *cgrp = - container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; - char *argv[3], *envp[3]; - int ret; + struct ve_struct *ve; + unsigned long flags; + char *agentbuf; + + agentbuf = kzalloc(PATH_MAX, GFP_KERNEL); + if 
(!agentbuf) { + pr_warn("failed to allocate agentbuf\n"); + return; + } + ve = container_of(work, struct ve_struct, release_agent_work); mutex_lock(&cgroup_mutex); + spin_lock_irqsave(&ve->release_list_lock, flags); + while (!list_empty(&ve->release_list)) { + char *argv[3], *envp[3]; + int i, err; + char *pathbuf = NULL; + struct cgroup *cgrp, *root_cgrp; + const char *release_agent; + + cgrp = list_entry(ve->release_list.next, + struct cgroup, + release_list); + list_del_init(&cgrp->release_list); + spin_unlock_irqrestore(&ve->release_list_lock, flags); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathbuf) + goto continue_free; + /* + * At VE destruction root cgroup looses VE_ROOT flag. + * Because of that 'cgroup_get_local_root' will not see + * VE root and return host's root cgroup instead. + * We can detect this because we have a pointer to + * original ve coming from work argument. + * We do not want to execute VE's notifications on host, + * so in this case we skip. + */ + rcu_read_lock(); + root_cgrp = cgroup_get_local_root(cgrp); - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf || !strlen(agentbuf)) - goto out; + if (rcu_access_pointer(root_cgrp->ve_owner) != ve) { + rcu_read_unlock(); + goto continue_free; + } - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); - if (ret < 0 || ret >= PATH_MAX) - goto out; + if (cgroup_path_ve_relative(root_cgrp, cgrp, pathbuf, + PAGE_SIZE) < 0) { + rcu_read_unlock(); + goto continue_free; + } - argv[0] = agentbuf; - argv[1] = pathbuf; - argv[2] = NULL; + release_agent = ve_get_release_agent_path(root_cgrp); - /* minimal command environment */ - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; + *agentbuf = 0; + if (release_agent) + strncpy(agentbuf, release_agent, PATH_MAX); + rcu_read_unlock(); + if 
(!*agentbuf) + goto continue_free; + + i = 0; + argv[i++] = agentbuf; + argv[i++] = pathbuf; + argv[i] = NULL; + + i = 0; + /* minimal command environment */ + envp[i++] = "HOME=/"; + envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[i] = NULL; + + mutex_unlock(&cgroup_mutex); + err = call_usermodehelper_ve(ve, argv[0], argv, + envp, UMH_WAIT_EXEC); + + if (err < 0 && ve == &ve0) + pr_warn_ratelimited("cgroup1_release_agent " + "%s %s failed: %d\n", + agentbuf, pathbuf, err); + mutex_lock(&cgroup_mutex); +continue_free: + kfree(pathbuf); + spin_lock_irqsave(&ve->release_list_lock, flags); + } + spin_unlock_irqrestore(&ve->release_list_lock, flags); mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); -out_free: kfree(agentbuf); - kfree(pathbuf); } /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index beb26dd7cd88..abba370eded0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1972,6 +1972,15 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve) spin_unlock_irq(&css_set_lock); /* ve_owner == NULL will be visible */ synchronize_rcu(); + + /* + * Anyone already waiting in this wq to execute + * cgroup_release_agent doesn't know that ve_owner is NULL, + * but we can wait for all of them at flush_workqueue. + * After it is complete no other cgroup can seep through + * to this ve's workqueue, so it's safe to shutdown ve. 
+ */ + flush_workqueue(ve->wq); } struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp) @@ -2026,7 +2035,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); - INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) @@ -5001,7 +5009,6 @@ static void css_free_rwork_fn(struct work_struct *work) /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); cgroup1_pidlist_destroy_all(cgrp); - cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { /* @@ -5577,6 +5584,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; + ve_rm_from_release_list(cgrp); spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 25455264b225..934a5ff1c9bb 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -62,6 +62,11 @@ struct ve_struct ve0 = { .meminfo_val = VE_MEMINFO_SYSTEM, .vdso_64 = (struct vdso_image*)&vdso_image_64, .vdso_32 = (struct vdso_image*)&vdso_image_32, + .release_list_lock = __SPIN_LOCK_UNLOCKED( + ve0.release_list_lock), + .release_list = LIST_HEAD_INIT(ve0.release_list), + .release_agent_work = __WORK_INITIALIZER(ve0.release_agent_work, + cgroup1_release_agent), }; EXPORT_SYMBOL(ve0); @@ -403,6 +408,44 @@ static void ve_workqueue_stop(struct ve_struct *ve) destroy_workqueue(ve->wq); } +void ve_add_to_release_list(struct cgroup *cgrp) +{ + struct ve_struct *ve; + unsigned long flags; + int need_schedule_work = 0; + + rcu_read_lock(); + ve = cgroup_get_ve_owner(cgrp); + + spin_lock_irqsave(&ve->release_list_lock, flags); + if (!cgroup_is_dead(cgrp) && + list_empty(&cgrp->release_list)) { + list_add(&cgrp->release_list, &ve->release_list); + need_schedule_work = 1; + } + spin_unlock_irqrestore(&ve->release_list_lock, flags); + + if 
(need_schedule_work) + queue_work(ve->wq, &ve->release_agent_work); + + rcu_read_unlock(); +} + +void ve_rm_from_release_list(struct cgroup *cgrp) +{ + struct ve_struct *ve; + unsigned long flags; + + rcu_read_lock(); + ve = cgroup_get_ve_owner(cgrp); + + spin_lock_irqsave(&ve->release_list_lock, flags); + if (!list_empty(&cgrp->release_list)) + list_del_init(&cgrp->release_list); + spin_unlock_irqrestore(&ve->release_list_lock, flags); + rcu_read_unlock(); +} + /* under ve->op_sem write-lock */ static int ve_start_container(struct ve_struct *ve) { @@ -653,6 +696,10 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_ goto err_vdso; ve->features = VE_FEATURES_DEF; + + INIT_WORK(&ve->release_agent_work, cgroup1_release_agent); + spin_lock_init(&ve->release_list_lock); + ve->_randomize_va_space = ve0._randomize_va_space; ve->odirect_enable = 2; @@ -673,6 +720,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_ strcpy(ve->core_pattern, "core"); #endif INIT_LIST_HEAD(&ve->devmnt_list); + INIT_LIST_HEAD(&ve->release_list); mutex_init(&ve->devmnt_mutex); #ifdef CONFIG_AIO -- 2.27.0 _______________________________________________ Devel mailing list [email protected] https://lists.openvz.org/mailman/listinfo/devel
