This way task_ve pointer's lifetime extends to ve namespace lifetime (which holds a reference to ve) and we can drop all the tricks about unsetting task_ve to prevent use after free.
https://virtuozzo.atlassian.net/browse/VSTOR-118289 Signed-off-by: Pavel Tikhomirov <[email protected]> Feature: ve: ve generic structures --- kernel/ve/ve.c | 51 ---------------------------------------- kernel/ve/ve_namespace.c | 2 ++ 2 files changed, 2 insertions(+), 51 deletions(-) diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 75148d25b7b39..1bde9c27f8683 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -32,7 +32,6 @@ #include <linux/ctype.h> #include <linux/tty.h> #include <linux/device.h> -#include <linux/sched.h> #include <linux/sched/loadavg.h> #include <uapi/linux/vzcalluser.h> @@ -857,14 +856,6 @@ void ve_exit_ns(struct pid_namespace *pid_ns) } rcu_read_unlock(); - /* - * To prevent use-after-free (e.g. reading /proc/pid/stat) in case - * zombie container init outlives the ve_struct. - * - * Note: synchronize_rcu in ve_drop_context makes it rcu safe. - */ - rcu_assign_pointer(current->task_ve, &ve0); - /* * At this point all userspace tasks in container are dead. */ @@ -1211,50 +1202,9 @@ static void ve_attach(struct cgroup_taskset *tset) if (cpuid_override_on()) set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE); - - rcu_assign_pointer(task->task_ve, ve); } } -/* - * When a container task exits it is moved out of its css set, see cgroup_exit. - * So the task effectively puts its reference to ve_struct of the container, - * and now nothing prevents ve_struct from being freed while the task is still - * there. - * - * We need to make sure that this task's ->task_ve is not pointing to the - * container ve before putting the reference, else the use of ->task_ve from - * the task can lead to use-after-free (e.g. when reading /proc/pid/stat). - * - * This is not a problem for a "normal" task (basically any task inside - * container process tree except init), as such a "normal" task will be reaped - * in zap_pid_ns_processes by the init, before init releases it's own reference - * to ve. - * - * So we have two cases: - * - init is handled in ve_exit_ns, unless there is no ve_nsproxy; - * - "external" task (the task which does not resolve to a valid pid in - * container pid namespace, and thus there is no guaranty that it would be - * reaped in time) is handled below. - */ -static void ve_exit(struct task_struct *task) -{ - struct ve_struct *ve = task->task_ve; - - /* - * Pairs with synchronize_rcu in ve_grab_context and ve_drop_context. - */ - rcu_read_lock(); - /* - * Clear task_ve if ve has no namespaces (ve is starting, stopped or - * stopping), or in case of "external" task. - */ - if (!ve->ve_nsproxy || - !task_pid_nr_ns(task, ve->ve_nsproxy->pid_ns_for_children)) - rcu_assign_pointer(task->task_ve, &ve0); - rcu_read_unlock(); -} - static int ve_state_show(struct seq_file *sf, void *v) { struct cgroup_subsys_state *css = seq_css(sf); @@ -1962,7 +1912,6 @@ struct cgroup_subsys ve_cgrp_subsys = { .css_free = ve_destroy, .can_attach = ve_can_attach, .attach = ve_attach, - .exit = ve_exit, .legacy_cftypes = ve_cftypes, }; diff --git a/kernel/ve/ve_namespace.c b/kernel/ve/ve_namespace.c index e5e5f540205ed..6dae8487735f9 100644 --- a/kernel/ve/ve_namespace.c +++ b/kernel/ve/ve_namespace.c @@ -83,6 +83,7 @@ int copy_ve_ns(unsigned long flags, struct task_struct *p) return PTR_ERR(new_ve_ns); p->ve_ns = new_ve_ns; + p->task_ve = new_ve_ns->ve; return 0; } @@ -129,6 +130,7 @@ void switch_ve_namespace(struct task_struct *p, struct ve_namespace *new) task_lock(p); old = p->ve_ns; p->ve_ns = new; + p->task_ve = new ? new->ve : &ve0; task_unlock(p); if (old) -- 2.51.1 _______________________________________________ Devel mailing list [email protected] https://lists.openvz.org/mailman/listinfo/devel
