On 23.09.2015 17:50, Vladimir Davydov wrote:
> Currently, we reserve veid right on write to ve.veid, returning EEXIST
> if the id is already in use, and free it when the ve cgroup is removed
> (ve_offline). Before commit 33f3496e5d13 ("ms/cgroup: split cgroup
> destruction into two steps") ve_offline() was called synchronously on
> rmdir and everything worked fine, but the above mentioned commit changed
> this so that ve_offline() is now called from a workqueue. As a result,
> Setting the same ve.veid right after rmdir-mkdir sequence is likely to
> fail with EEXIST, because the work function removing the id might still
> be in progress. This leads to `vzctl start` failures if the container
> stopped by itself (e.g. called halt), because vzctl recreates the ve
> cgroup directory on start if it already exists.
> 
> To fix this issue, this patch reworks veid handling as follows. Now it's
> OK to write any allowed id to ve.veid as long as the container is
> stopped (if it's running EBUSY is returned), and no error will be
> returned even if there is a collision. Actual veid reservation happens
> only on container start (i.e. when START is written to ve.state). If
> ve_start_container() finds that the preferred id is already in use, it
> returns ENOSPC. Correspondingly, veid is freed on container stop
> (ve_stop_ns), not on cgroup removal. This ensures that once the
> container has stopped, it's OK to reuse its id, no matter if its ve
> cgroup is being destroyed or not.
> 
> https://jira.sw.ru/browse/PSBM-39338
> 
> Signed-off-by: Vladimir Davydov <[email protected]>
> ---
>  kernel/ve/ve.c | 71 
> +++++++++++++++++++++++++---------------------------------
>  1 file changed, 31 insertions(+), 40 deletions(-)
> 
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index c67ff3f8d19e..0d1913f14867 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -118,29 +118,36 @@ void put_ve(struct ve_struct *ve)
>  }
>  EXPORT_SYMBOL(put_ve);
>  
> +static int ve_alloc_id(struct ve_struct *ve)
> +{
> +     int err;
> +
> +     mutex_lock(&ve_list_lock);
> +     err = idr_alloc(&ve_idr, ve, ve->veid, ve->veid + 1, GFP_KERNEL);
> +     mutex_unlock(&ve_list_lock);
> +     return err < 0 ? err : 0;
> +}
> +
> +static void ve_free_id(struct ve_struct *ve)
> +{
> +     mutex_lock(&ve_list_lock);
> +     idr_remove(&ve_idr, ve->veid);
> +     mutex_unlock(&ve_list_lock);
> +}
> +
>  static void ve_list_add(struct ve_struct *ve)
>  {
>       mutex_lock(&ve_list_lock);
> -     if (idr_replace(&ve_idr, ve, ve->veid) != NULL)
> -             WARN_ON(1);
>       list_add(&ve->ve_list, &ve_list_head);
>       nr_ve++;
>       mutex_unlock(&ve_list_lock);
>  }
>  
> -static void ve_list_del(struct ve_struct *ve, bool free_id)
> +static void ve_list_del(struct ve_struct *ve)
>  {
>       mutex_lock(&ve_list_lock);
> -     /* Check whether ve linked in list of ve's and unlink ve from list if 
> so */
> -     if (!list_empty(&ve->ve_list)) {
> -             /* Hide ve from finding by veid */
> -             if (idr_replace(&ve_idr, NULL, ve->veid) != ve)
> -                     WARN_ON(1);
> -             list_del_init(&ve->ve_list);
> -             nr_ve--;
> -     }
> -     if (free_id && ve->veid)
> -             idr_remove(&ve_idr, ve->veid);
> +     list_del_init(&ve->ve_list);
> +     nr_ve--;
>       mutex_unlock(&ve_list_lock);
>  }
>  
> @@ -438,6 +445,10 @@ int ve_start_container(struct ve_struct *ve)
>       ve_grab_context(ve);
>       ve_list_add(ve);
>  
> +     err = ve_alloc_id(ve);
> +     if (err)
> +             goto err_id;

I'm confused in a fact we add a ve to &ve_list_head, before we know
the ve's id is unique. for_each_ve() users are able to pick it at the moment.

Furthermore, it looks confusing, we add a ve to the list, before it's completely
initialized.

Can't we move ve_list_add() down to the end of ve_start_container() function?

> +
>       err = ve_start_kthread(ve);
>       if (err)
>               goto err_kthread;
> @@ -475,7 +486,9 @@ err_legacy_pty:
>  err_umh:
>       ve_stop_kthread(ve);
>  err_kthread:
> -     ve_list_del(ve, false);
> +     ve_free_id(ve);
> +err_id:
> +     ve_list_del(ve);
>       ve_drop_context(ve);
>       return err;
>  }
> @@ -508,6 +521,7 @@ void ve_stop_ns(struct pid_namespace *pid_ns)
>        * Stop kernel thread, or zap_pid_ns_processes() would wait it forever.
>        */
>       ve_stop_kthread(ve);
> +     ve_free_id(ve);
>       up_write(&ve->op_sem);
>  }
>  
> @@ -538,7 +552,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
>       down_write(&ve->op_sem);
>       ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>  
> -     ve_list_del(ve, false);
> +     ve_list_del(ve);
>       ve_drop_context(ve);
>       up_write(&ve->op_sem);
>  
> @@ -644,13 +658,6 @@ err_ve:
>       return ERR_PTR(err);
>  }
>  
> -static void ve_offline(struct cgroup *cg)
> -{
> -     struct ve_struct *ve = cgroup_ve(cg);
> -
> -     ve_list_del(ve, true);
> -}
> -
>  static void ve_devmnt_free(struct ve_devmnt *devmnt)
>  {
>       if (!devmnt)
> @@ -839,32 +846,17 @@ static u64 ve_id_read(struct cgroup *cg, struct cftype 
> *cft)
>  static int ve_id_write(struct cgroup *cg, struct cftype *cft, u64 value)
>  {
>       struct ve_struct *ve = cgroup_ve(cg);
> -     int veid;
>       int err = 0;
>  
>       if (value <= 0 || value > INT_MAX)
>               return -EINVAL;
>  
>       down_write(&ve->op_sem);
> -     if (ve->veid) {
> +     if (ve->is_running) {
>               if (ve->veid != value)
>                       err = -EBUSY;
> -             goto out;
> -     }
> -
> -     mutex_lock(&ve_list_lock);
> -     /* we forbid to start a container without veid (see ve_start_container)
> -      * so the ve cannot be on the list */
> -     BUG_ON(!list_empty(&ve->ve_list));
> -     veid = idr_alloc(&ve_idr, NULL, value, value + 1, GFP_KERNEL);
> -     if (veid < 0) {
> -             err = veid;
> -             if (err == -ENOSPC)
> -                     err = -EEXIST;
>       } else
> -             ve->veid = veid;
> -     mutex_unlock(&ve_list_lock);
> -out:
> +             ve->veid = value;
>       up_write(&ve->op_sem);
>       return err;
>  }
> @@ -1207,7 +1199,6 @@ struct cgroup_subsys ve_subsys = {
>       .name           = "ve",
>       .subsys_id      = ve_subsys_id,
>       .css_alloc      = ve_create,
> -     .css_offline    = ve_offline,
>       .css_free       = ve_destroy,
>       .can_attach     = ve_can_attach,
>       .attach         = ve_attach,
> 
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to