Vova remind me, we may sleep inside get_exec_env() section.

So, it's yet better to use task work here.

On 15.10.2015 13:02, Kirill Tkhai wrote:
> Since we allow to attach a not current task to ve cgroup, there is the race
> in the places where we use get_exec_env(). The task's ve may be changed after
> it dereferenced get_exec_env(), so a lot of problems are possible there.
> I'm sure the most places, where we use get_exec_env(), was not written in an
> assumption it may change. Also, there are a lot of nested functions and it's
> impossible to check every function to verify if it's input parameters, 
> depending
> on a caller's dereferenced ve, are not actual because of ve has been changed.
> 
> I'm suggest to use to modify get_exec_env() which will supply ve's stability.
> It pairs with put_exec_env() which marks end of area where ve modification is
> not desirable.
> 
> get_exec_env() may be used nested, so here is 
> task_struct::ve_attach_lock_depth,
> which allows nesting. The counter looks a better variant that plain 
> read_lock()
> in get_exec_env() and write_trylock() loop in ve_attach():
> 
> get_exec_env()
> {
>    ...
>    read_lock();
>    ...
> }
> 
> ve_attach()
> {
>    while(!write_trylock())
>       cpu_relax();
> }
> 
> because this case the priority of read_lock() will be absolute and we lost all
> advantages of queued rw locks fairness.
> 
> Also I considered variants with using RCU and task work, but they seems to be 
> worse.
> 
> Please, your comments.
> 
> ---
>  include/linux/init_task.h |  3 ++-
>  include/linux/sched.h     |  1 +
>  include/linux/ve.h        | 29 +++++++++++++++++++++++++++++
>  include/linux/ve_proto.h  |  1 -
>  kernel/fork.c             |  3 +++
>  kernel/ve/ve.c            |  8 +++++++-
>  6 files changed, 42 insertions(+), 3 deletions(-)
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index d2cbad0..57e0796 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -136,7 +136,8 @@ extern struct task_group root_task_group;
>  #endif
>  
>  #ifdef CONFIG_VE
> -#define      INIT_TASK_VE(tsk) .task_ve = &ve0,
> +#define      INIT_TASK_VE(tsk) .task_ve = &ve0,                              
> \
> +                       .ve_attach_lock_depth = 0
>  #else
>  #define      INIT_TASK_VE(tsk)
>  #endif
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index e1bcabe..948481f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1564,6 +1564,7 @@ struct task_struct {
>  #endif
>  #ifdef CONFIG_VE
>       struct ve_struct *task_ve;
> +     unsigned int ve_attach_lock_depth;
>  #endif
>  #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
>       struct memcg_batch_info {
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 86b95c3..3cea73d 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -33,6 +33,7 @@ struct ve_monitor;
>  struct nsproxy;
>  
>  struct ve_struct {
> +     rwlock_t                attach_lock;
>       struct cgroup_subsys_state      css;
>  
>       const char              *ve_name;
> @@ -130,6 +131,34 @@ struct ve_struct {
>  #endif
>  };
>  
> +static inline struct ve_struct *get_exec_env(void)
> +{
> +     struct ve_struct *ve;
> +
> +     if (++current->ve_attach_lock_depth > 1)
> +             return current->task_ve;
> +
> +     rcu_read_lock();
> +again:
> +     ve = current->task_ve;
> +     read_lock(&ve->attach_lock);
> +     if (unlikely(current->task_ve != ve)) {
> +             read_unlock(&ve->attach_lock);
> +             goto again;
> +     }
> +     rcu_read_unlock();
> +
> +     return ve;
> +}
> +
> +static inline void put_exec_env(void)
> +{
> +     struct ve_struct *ve = current->task_ve;
> +
> +     if (!--current->ve_attach_lock_depth)
> +             read_unlock(&ve->attach_lock);
> +}
> +
>  struct ve_devmnt {
>       struct list_head        link;
>  
> diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
> index 0f5898e..3deb09e 100644
> --- a/include/linux/ve_proto.h
> +++ b/include/linux/ve_proto.h
> @@ -30,7 +30,6 @@ static inline bool ve_is_super(struct ve_struct *ve)
>       return ve == &ve0;
>  }
>  
> -#define get_exec_env()               (current->task_ve)
>  #define get_env_init(ve)     (ve->ve_ns->pid_ns->child_reaper)
>  
>  const char *ve_name(struct ve_struct *ve);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 505fa21..3d7e452 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1439,6 +1439,9 @@ static struct task_struct *copy_process(unsigned long 
> clone_flags,
>       INIT_LIST_HEAD(&p->pi_state_list);
>       p->pi_state_cache = NULL;
>  #endif
> +#ifdef CONFIG_VE
> +     p->ve_attach_lock_depth = 0;
> +#endif
>       /*
>        * sigaltstack should be cleared when sharing the same VM
>        */
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 39a95e8..23833ed 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -640,6 +640,7 @@ static struct cgroup_subsys_state *ve_create(struct 
> cgroup *cg)
>       ve->meminfo_val = VE_MEMINFO_DEFAULT;
>  
>  do_init:
> +     ve->attach_lock = __RW_LOCK_UNLOCKED(&ve->attach_lock);
>       init_rwsem(&ve->op_sem);
>       mutex_init(&ve->sync_mutex);
>       INIT_LIST_HEAD(&ve->devices);
> @@ -738,8 +739,11 @@ static int ve_can_attach(struct cgroup *cg, struct 
> cgroup_taskset *tset)
>  
>  static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
>  {
> +     struct task_struct *task = cgroup_taskset_first(tset);
> +     struct ve_struct *old_ve = task->task_ve;
>       struct ve_struct *ve = cgroup_ve(cg);
> -     struct task_struct *task;
> +
> +     write_lock_irq(&old_ve->attach_lock);
>  
>       cgroup_taskset_for_each(task, cg, tset) {
>               /* this probihibts ptracing of task entered to VE from host 
> system */
> @@ -755,6 +759,8 @@ static void ve_attach(struct cgroup *cg, struct 
> cgroup_taskset *tset)
>  
>               task->task_ve = ve;
>       }
> +
> +     write_unlock_irq(&old_ve->attach_lock);
>  }
>  
>  static int ve_state_read(struct cgroup *cg, struct cftype *cft,
> 
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to