Follow-up patch to the per-cgroup release_agent property. release_agent
notifications are spawned from a special kthread that runs under ve0, but
the newly spawned tasks should run under their own VE context. An easy way
to pass this information to the spawning thread is to add a 've_owner'
field to each VE root cgroup: at notification time any cgroup can be
walked upwards to its local root and ve_owner read from there. ve_owner is
initialized at container start, which is when we know all of the VE's root
cgroups for free. For ve0 roots the ve_owner field is not needed, because
a pointer to ve0 is always available. The only place where we can detach
the ve_owner knowledge from the VE root cgroups is cgroup_exit for
ve->init_task, because that is the last time ve->init_task is related to
its VE root cgroups (see the comments in cgroup_exit).

https://jira.sw.ru/browse/PSBM-83887
Signed-off-by: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com>
---
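Review note (not part of the commit): below is a minimal sketch of the
lookup the patch performs at notification time. It reuses
cgroup_get_local_root(), ve0 and the new ve_owner field from the diff
that follows; the helper function itself is hypothetical and only
illustrates the intended flow.

/*
 * Illustration only: pick the VE that should spawn the release agent
 * for @cgrp.
 */
static struct ve_struct *release_agent_ve(struct cgroup *cgrp)
{
        struct cgroup *root_cgrp = cgroup_get_local_root(cgrp);

        /* The host's root cgroup has no parent and always belongs to ve0. */
        if (root_cgrp->parent == NULL)
                return &ve0;

        /*
         * On a VE root this was set by cgroup_mark_ve_root(); NULL means
         * the container is already exiting and the event is dropped.
         */
        return root_cgrp->ve_owner;
}
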
 include/linux/cgroup.h |  3 +++
 kernel/cgroup.c        | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5e289fc..34fc017 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -291,6 +291,9 @@ struct cgroup {
        struct simple_xattrs xattrs;
        u64 subgroups_limit;
 
+       /* VE that runs the release agent; set on VE root cgroups. */
+       struct ve_struct *ve_owner;
+
        /*
         * Per-cgroup path to release agent binary for release
         * notifications.
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1d4605..31e7db6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4352,6 +4352,7 @@ int cgroup_mark_ve_root(struct ve_struct *ve)
        mutex_lock(&cgroup_mutex);
        for_each_active_root(root) {
                cgrp = task_cgroup_from_root(ve->init_task, root);
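+               /* Remember the owning VE; release_agent will run under it. */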
+               cgrp->ve_owner = ve;
                set_bit(CGRP_VE_ROOT, &cgrp->flags);
                err = cgroup_add_file_on_mark_ve(cgrp);
                if (err)
@@ -4363,6 +4364,20 @@ int cgroup_mark_ve_root(struct ve_struct *ve)
        return err;
 }
 
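+/*
+ * Clear ve_owner on all VE root cgroups of @ve. Called from cgroup_exit
+ * when ve->init_task exits, the last point at which the VE root cgroups
+ * can still be found through it.
+ */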
+static void cgroup_unbind_roots_from_ve(struct ve_struct *ve)
+{
+       struct cgroup *cgrp;
+       struct cgroupfs_root *root;
+
+       mutex_lock(&cgroup_mutex);
+       for_each_active_root(root) {
+               cgrp = task_cgroup_from_root(ve->init_task, root);
+               BUG_ON(!cgrp->ve_owner);
+               cgrp->ve_owner = NULL;
+       }
+       mutex_unlock(&cgroup_mutex);
+}
+
 struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
 {
        struct cgroup *ve_root = NULL;
@@ -5401,6 +5416,14 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
        int i;
 
        /*
+        * If the exiting task is ve->init_task, unbind all VE root
+        * cgroups from this VE so that no further release agent is
+        * spawned on its behalf.
+        */
+       if (!ve_is_super(tsk->task_ve) && tsk == tsk->task_ve->init_task)
+               cgroup_unbind_roots_from_ve(tsk->task_ve);
+
+       /*
         * Unlink from the css_set task list if necessary.
         * Optimistically check cg_list before taking
         * css_set_lock
@@ -5493,6 +5516,7 @@ static void cgroup_release_agent(struct work_struct *work)
        raw_spin_lock(&release_list_lock);
        while (!list_empty(&release_list)) {
                char *argv[3], *envp[3];
+               struct ve_struct *ve = NULL;
                int i, err;
                const char *release_agent;
                char *pathbuf = NULL, *agentbuf = NULL;
@@ -5507,14 +5531,38 @@ static void cgroup_release_agent(struct work_struct *work)
                        goto continue_free;
                if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0)
                        goto continue_free;
+
+               /*
+                * Deduce under which VE the notifier binary will be
+                * spawned. A non-root VE has its own VE-local root
+                * cgroup, marked with the CGRP_VE_ROOT flag, whose
+                * ve_owner was set in cgroup_mark_ve_root. The ve0
+                * root cgroup does not need ve_owner, because its VE
+                * is always ve0; it is recognized by having no parent.
+                */
                root_cgrp = cgroup_get_local_root(cgrp);
+               if (root_cgrp->parent == NULL)
+                       ve = &ve0;
+               else
+                       ve = root_cgrp->ve_owner;
+               if (!ve)
+                       goto continue_free;
+
                rcu_read_lock();
                release_agent = cgroup_release_agent_path(root_cgrp);
                if (release_agent)
                        agentbuf = kstrdup(release_agent, GFP_KERNEL);
                rcu_read_unlock();
                if (!agentbuf)
                        goto continue_free;
+               get_ve(ve);
+               down_read(&ve->op_sem);
+               if (!ve->is_running) {
+                       up_read(&ve->op_sem);
+                       put_ve(ve);
+                       goto continue_free;
+               }
 
                i = 0;
                argv[i++] = agentbuf;
@@ -5530,12 +5578,20 @@ static void cgroup_release_agent(struct work_struct *work)
                /* Drop the lock while we invoke the usermode helper,
                 * since the exec could involve hitting disk and hence
                 * be a slow process */
                mutex_unlock(&cgroup_mutex);
+#ifdef CONFIG_VE
+               err = call_usermodehelper_fns_ve(ve, argv[0], argv,
+                       envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
+#else
                err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+#endif
                if (err < 0)
                        pr_warn_ratelimited("cgroup release_agent "
                                            "%s %s failed: %d\n",
                                            agentbuf, pathbuf, err);
+               up_read(&ve->op_sem);
+               put_ve(ve);
 
                mutex_lock(&cgroup_mutex);
  continue_free:
-- 
1.8.3.1
