On 24.09.2015 13:39, Vladimir Davydov wrote:
> Currently, we reserve veid right on write to ve.veid, returning EEXIST
> if the id is already in use, and free it when the ve cgroup is removed
> (ve_offline). Before commit 33f3496e5d13 ("ms/cgroup: split cgroup
> destruction into two steps") ve_offline() was called synchronously on
> rmdir and everything worked fine, but the above mentioned commit changed
> this so that ve_offline() is now called from a workqueue. As a result,
> Setting the same ve.veid right after rmdir-mkdir sequence is likely to
> fail with EEXIST, because the work function removing the id might still
> be in progress. This leads to `vzctl start` failures if the container
> stopped by itself (e.g. called halt), because vzctl recreates the ve
> cgroup directory on start if it already exists.
> 
> To fix this issue, this patch reworks veid handling as follows. Now it's
> OK to write any allowed id to ve.veid as long as the container is
> stopped (if it's running EBUSY is returned), and no error will be
> returned even if there is a collision. Actual veid reservation happens
> only on container start (i.e. when START is written to ve.state). If
> ve_start_container() finds that the preferred id is already in use, it
> returns EEXIST. Correspondingly, veid is freed on container stop
> (ve_exit_ns), not on cgroup removal. This ensures that once the
> container has stopped, it's OK to reuse its id, no matter if its ve
> cgroup is being destroyed or not.
> 
> https://jira.sw.ru/browse/PSBM-39338
> 
> Signed-off-by: Vladimir Davydov <[email protected]>

Acked-by: Kirill Tkhai <[email protected]>

> ---
> Changes in v2:
>  - move veid alloc/remove to ve_list_add/del so that one can't two ves
>    with the same id on the list
>  - return EEXIST instead of ENOSPC on veid collision
> 
>  kernel/ve/ve.c | 67 
> ++++++++++++++++++++++------------------------------------
>  1 file changed, 25 insertions(+), 42 deletions(-)
> 
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index c67ff3f8d19e..aff3b03fcece 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -118,29 +118,31 @@ void put_ve(struct ve_struct *ve)
>  }
>  EXPORT_SYMBOL(put_ve);
>  
> -static void ve_list_add(struct ve_struct *ve)
> +static int ve_list_add(struct ve_struct *ve)
>  {
> +     int err;
> +
>       mutex_lock(&ve_list_lock);
> -     if (idr_replace(&ve_idr, ve, ve->veid) != NULL)
> -             WARN_ON(1);
> +     err = idr_alloc(&ve_idr, ve, ve->veid, ve->veid + 1, GFP_KERNEL);
> +     if (err < 0) {
> +             if (err == -ENOSPC)
> +                     err = -EEXIST;
> +             goto out;
> +     }
>       list_add(&ve->ve_list, &ve_list_head);
>       nr_ve++;
> +     err = 0;
> +out:
>       mutex_unlock(&ve_list_lock);
> +     return err;
>  }
>  
> -static void ve_list_del(struct ve_struct *ve, bool free_id)
> +static void ve_list_del(struct ve_struct *ve)
>  {
>       mutex_lock(&ve_list_lock);
> -     /* Check whether ve linked in list of ve's and unlink ve from list if 
> so */
> -     if (!list_empty(&ve->ve_list)) {
> -             /* Hide ve from finding by veid */
> -             if (idr_replace(&ve_idr, NULL, ve->veid) != ve)
> -                     WARN_ON(1);
> -             list_del_init(&ve->ve_list);
> -             nr_ve--;
> -     }
> -     if (free_id && ve->veid)
> -             idr_remove(&ve_idr, ve->veid);
> +     idr_remove(&ve_idr, ve->veid);
> +     list_del_init(&ve->ve_list);
> +     nr_ve--;
>       mutex_unlock(&ve_list_lock);
>  }
>  
> @@ -436,7 +438,10 @@ int ve_start_container(struct ve_struct *ve)
>       ve->start_jiffies = get_jiffies_64();
>  
>       ve_grab_context(ve);
> -     ve_list_add(ve);
> +
> +     err = ve_list_add(ve);
> +     if (err)
> +             goto err_list;
>  
>       err = ve_start_kthread(ve);
>       if (err)
> @@ -475,7 +480,8 @@ err_legacy_pty:
>  err_umh:
>       ve_stop_kthread(ve);
>  err_kthread:
> -     ve_list_del(ve, false);
> +     ve_list_del(ve);
> +err_list:
>       ve_drop_context(ve);
>       return err;
>  }
> @@ -538,7 +544,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
>       down_write(&ve->op_sem);
>       ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>  
> -     ve_list_del(ve, false);
> +     ve_list_del(ve);
>       ve_drop_context(ve);
>       up_write(&ve->op_sem);
>  
> @@ -644,13 +650,6 @@ err_ve:
>       return ERR_PTR(err);
>  }
>  
> -static void ve_offline(struct cgroup *cg)
> -{
> -     struct ve_struct *ve = cgroup_ve(cg);
> -
> -     ve_list_del(ve, true);
> -}
> -
>  static void ve_devmnt_free(struct ve_devmnt *devmnt)
>  {
>       if (!devmnt)
> @@ -839,32 +838,17 @@ static u64 ve_id_read(struct cgroup *cg, struct cftype 
> *cft)
>  static int ve_id_write(struct cgroup *cg, struct cftype *cft, u64 value)
>  {
>       struct ve_struct *ve = cgroup_ve(cg);
> -     int veid;
>       int err = 0;
>  
>       if (value <= 0 || value > INT_MAX)
>               return -EINVAL;
>  
>       down_write(&ve->op_sem);
> -     if (ve->veid) {
> +     if (ve->is_running || ve->ve_ns) {
>               if (ve->veid != value)
>                       err = -EBUSY;
> -             goto out;
> -     }
> -
> -     mutex_lock(&ve_list_lock);
> -     /* we forbid to start a container without veid (see ve_start_container)
> -      * so the ve cannot be on the list */
> -     BUG_ON(!list_empty(&ve->ve_list));
> -     veid = idr_alloc(&ve_idr, NULL, value, value + 1, GFP_KERNEL);
> -     if (veid < 0) {
> -             err = veid;
> -             if (err == -ENOSPC)
> -                     err = -EEXIST;
>       } else
> -             ve->veid = veid;
> -     mutex_unlock(&ve_list_lock);
> -out:
> +             ve->veid = value;
>       up_write(&ve->op_sem);
>       return err;
>  }
> @@ -1207,7 +1191,6 @@ struct cgroup_subsys ve_subsys = {
>       .name           = "ve",
>       .subsys_id      = ve_subsys_id,
>       .css_alloc      = ve_create,
> -     .css_offline    = ve_offline,
>       .css_free       = ve_destroy,
>       .can_attach     = ve_can_attach,
>       .attach         = ve_attach,
> 
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to