On 25.03.2020 19:30, Valeriy Vdovin wrote: > Follow-up patch to per-cgroup release_agent property. release_agent > notifications are spawned from a special kthread, running under ve0. But > newly spawned tasks should run under their own ve context. Easy way to > pass this information to a spawning thread is by adding 've_owner' field > to a root cgroup. At notification any cgroup can be walked upwards to > its root and get ve_owner from there. ve_owner is initialized at container > start, that's when we know all cgroup ve roots for free. For ve0 roots > we don't need ve_owner field set, because we can always get pointer to ve0. > Only place where we can detach ve_owner knowledge from root ve cgroups is > at cgroup_exit for ve->init_task, because that's the last time when > ve->init_task is related to its ve root cgroups (see cgroup_exit comments). > > https://jira.sw.ru/browse/PSBM-83887 > Signed-off-by: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com> > --- > include/linux/cgroup.h | 3 +++ > kernel/cgroup.c | 56 > ++++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 59 insertions(+) > > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h > index 5e289fc..34fc017 100644 > --- a/include/linux/cgroup.h > +++ b/include/linux/cgroup.h > @@ -291,6 +291,9 @@ struct cgroup { > struct simple_xattrs xattrs; > u64 subgroups_limit; > > + /* ve_owner, responsible for running release agent. */ > + struct ve_struct *ve_owner; > + > /* > * Per-cgroup path to release agent binary for release > * notifications. 
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c > index d1d4605..31e7db6 100644 > --- a/kernel/cgroup.c > +++ b/kernel/cgroup.c > @@ -4352,6 +4352,7 @@ int cgroup_mark_ve_root(struct ve_struct *ve) > mutex_lock(&cgroup_mutex); > for_each_active_root(root) { > cgrp = task_cgroup_from_root(ve->init_task, root); > + cgrp->ve_owner = ve; > set_bit(CGRP_VE_ROOT, &cgrp->flags); > err = cgroup_add_file_on_mark_ve(cgrp); > if (err) > @@ -4363,6 +4364,20 @@ int cgroup_mark_ve_root(struct ve_struct *ve) > return err; > } > > +static void cgroup_unbind_roots_from_ve(struct ve_struct *ve) > +{ > + struct cgroup *cgrp; > + struct cgroupfs_root *root; > + > + mutex_lock(&cgroup_mutex); > + for_each_active_root(root) { > + cgrp = task_cgroup_from_root(ve->init_task, root); > + BUG_ON(!cgrp->ve_owner); > + cgrp->ve_owner = NULL; > + } > + mutex_unlock(&cgroup_mutex); > +} > + > struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp) > { > struct cgroup *ve_root = NULL; > @@ -5401,6 +5416,14 @@ void cgroup_exit(struct task_struct *tsk, int > run_callbacks) > int i; > > /* > + * Detect that the task in question is ve->init_task; > + * if so, unbind all cgroups that want to be released under > + * this ve. > + */ > + if (!ve_is_super(tsk->task_ve) && tsk == tsk->task_ve->init_task) > + cgroup_unbind_roots_from_ve(tsk->task_ve); > + > + /* > * Unlink from the css_set task list if necessary. 
> * Optimistically check cg_list before taking > css_set_lock > @@ -5493,6 +5516,7 @@ static void cgroup_release_agent(struct work_struct > *work) > raw_spin_lock(&release_list_lock); > while (!list_empty(&release_list)) { > char *argv[3], *envp[3]; > + struct ve_struct *ve = NULL; > int i, err; > const char *release_agent; > char *pathbuf = NULL, *agentbuf = NULL; > @@ -5507,14 +5531,38 @@ static void cgroup_release_agent(struct work_struct > *work) > goto continue_free; > if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0) > goto continue_free; > + > + /* > + * Deduce under which VE we are going to spawn notifier > + * binary. Non-root VE has its VE-local root cgroup, > + * marked with VE_ROOT flag. It has non-NULL ve_owner > + * set during cgroup_mark_ve_root. > + * VE0 root cgroup does not need ve_owner field, because > + * its ve is ve0. Non-VE root cgroup does not have a parent. > + */ > root_cgrp = cgroup_get_local_root(cgrp); > + if (root_cgrp->parent == NULL) > + ve = &ve0; > + else > + ve = root_cgrp->ve_owner; > + if (!ve) > + goto continue_free; > + > + down_read(&ve->op_sem); > + if (ve->is_running) { > + up_read(&ve->op_sem); > + goto continue_free; > + } > + > rcu_read_lock(); > + > release_agent = cgroup_release_agent_path(root_cgrp); > if (release_agent) > agentbuf = kstrdup(release_agent, GFP_KERNEL); > rcu_read_unlock(); > if (!agentbuf) > goto continue_free; > + get_ve(ve); > > i = 0; > argv[i++] = agentbuf; > @@ -5530,12 +5578,20 @@ static void cgroup_release_agent(struct work_struct > *work) > /* Drop the lock while we invoke the usermode helper, > * since the exec could involve hitting disk and hence > * be a slow process */ > + > mutex_unlock(&cgroup_mutex); > +#ifdef CONFIG_VE > + err = call_usermodehelper_fns_ve(ve, argv[0], argv, > + envp, UMH_WAIT_EXEC, NULL, NULL, NULL);
It is a bad idea to invoke a userspace task while holding a kernel mutex: 1) the task may hang forever, and 2) this introduces untracked circular dependencies between kernel locks. > +#else > err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); > +#endif > if (err < 0) > pr_warn_ratelimited("cgroup release_agent " > "%s %s failed: %d\n", > agentbuf, pathbuf, err); > + up_read(&ve->op_sem); > + put_ve(ve); > > mutex_lock(&cgroup_mutex); > continue_free: > _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel