The commit is pushed to "branch-rh7-3.10.0-229.7.2-ovz" and will appear at
https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.6.9
------>
commit b56bb530160090c88ded36e05099655a0e799e19
Author: Vladimir Davydov <[email protected]>
Date: Thu Sep 24 14:55:57 2015 +0400
ve: rework veid handling
Currently, we reserve veid right on write to ve.veid, returning EEXIST
if the id is already in use, and free it when the ve cgroup is removed
(ve_offline). Before commit 33f3496e5d13 ("ms/cgroup: split cgroup
destruction into two steps") ve_offline() was called synchronously on
rmdir and everything worked fine, but the above mentioned commit changed
this so that ve_offline() is now called from a workqueue. As a result,
Setting the same ve.veid right after rmdir-mkdir sequence is likely to
fail with EEXIST, because the work function removing the id might still
be in progress. This leads to `vzctl start` failures if the container
stopped by itself (e.g. called halt), because vzctl recreates the ve
cgroup directory on start if it already exists.
To fix this issue, this patch reworks veid handling as follows. Now it's
OK to write any allowed id to ve.veid as long as the container is
stopped (if it's running EBUSY is returned), and no error will be
returned even if there is a collision. Actual veid reservation happens
only on container start (i.e. when START is written to ve.state). If
ve_start_container() finds that the preferred id is already in use, it
returns EEXIST. Correspondingly, veid is freed on container stop
(ve_exit_ns), not on cgroup removal. This ensures that once the
container has stopped, it's OK to reuse its id, no matter if its ve
cgroup is being destroyed or not.
https://jira.sw.ru/browse/PSBM-39338
Signed-off-by: Vladimir Davydov <[email protected]>
Acked-by: Kirill Tkhai <[email protected]>
---
kernel/ve/ve.c | 67 ++++++++++++++++++++++------------------------------------
1 file changed, 25 insertions(+), 42 deletions(-)
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index c67ff3f8d19e..aff3b03fcece 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -118,29 +118,31 @@ void put_ve(struct ve_struct *ve)
}
EXPORT_SYMBOL(put_ve);
-static void ve_list_add(struct ve_struct *ve)
+static int ve_list_add(struct ve_struct *ve)
{
+ int err;
+
mutex_lock(&ve_list_lock);
- if (idr_replace(&ve_idr, ve, ve->veid) != NULL)
- WARN_ON(1);
+ err = idr_alloc(&ve_idr, ve, ve->veid, ve->veid + 1, GFP_KERNEL);
+ if (err < 0) {
+ if (err == -ENOSPC)
+ err = -EEXIST;
+ goto out;
+ }
list_add(&ve->ve_list, &ve_list_head);
nr_ve++;
+ err = 0;
+out:
mutex_unlock(&ve_list_lock);
+ return err;
}
-static void ve_list_del(struct ve_struct *ve, bool free_id)
+static void ve_list_del(struct ve_struct *ve)
{
mutex_lock(&ve_list_lock);
- /* Check whether ve linked in list of ve's and unlink ve from list if
so */
- if (!list_empty(&ve->ve_list)) {
- /* Hide ve from finding by veid */
- if (idr_replace(&ve_idr, NULL, ve->veid) != ve)
- WARN_ON(1);
- list_del_init(&ve->ve_list);
- nr_ve--;
- }
- if (free_id && ve->veid)
- idr_remove(&ve_idr, ve->veid);
+ idr_remove(&ve_idr, ve->veid);
+ list_del_init(&ve->ve_list);
+ nr_ve--;
mutex_unlock(&ve_list_lock);
}
@@ -436,7 +438,10 @@ int ve_start_container(struct ve_struct *ve)
ve->start_jiffies = get_jiffies_64();
ve_grab_context(ve);
- ve_list_add(ve);
+
+ err = ve_list_add(ve);
+ if (err)
+ goto err_list;
err = ve_start_kthread(ve);
if (err)
@@ -475,7 +480,8 @@ err_legacy_pty:
err_umh:
ve_stop_kthread(ve);
err_kthread:
- ve_list_del(ve, false);
+ ve_list_del(ve);
+err_list:
ve_drop_context(ve);
return err;
}
@@ -538,7 +544,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
down_write(&ve->op_sem);
ve_hook_iterate_fini(VE_SS_CHAIN, ve);
- ve_list_del(ve, false);
+ ve_list_del(ve);
ve_drop_context(ve);
up_write(&ve->op_sem);
@@ -644,13 +650,6 @@ err_ve:
return ERR_PTR(err);
}
-static void ve_offline(struct cgroup *cg)
-{
- struct ve_struct *ve = cgroup_ve(cg);
-
- ve_list_del(ve, true);
-}
-
static void ve_devmnt_free(struct ve_devmnt *devmnt)
{
if (!devmnt)
@@ -839,32 +838,17 @@ static u64 ve_id_read(struct cgroup *cg, struct cftype
*cft)
static int ve_id_write(struct cgroup *cg, struct cftype *cft, u64 value)
{
struct ve_struct *ve = cgroup_ve(cg);
- int veid;
int err = 0;
if (value <= 0 || value > INT_MAX)
return -EINVAL;
down_write(&ve->op_sem);
- if (ve->veid) {
+ if (ve->is_running || ve->ve_ns) {
if (ve->veid != value)
err = -EBUSY;
- goto out;
- }
-
- mutex_lock(&ve_list_lock);
- /* we forbid to start a container without veid (see ve_start_container)
- * so the ve cannot be on the list */
- BUG_ON(!list_empty(&ve->ve_list));
- veid = idr_alloc(&ve_idr, NULL, value, value + 1, GFP_KERNEL);
- if (veid < 0) {
- err = veid;
- if (err == -ENOSPC)
- err = -EEXIST;
} else
- ve->veid = veid;
- mutex_unlock(&ve_list_lock);
-out:
+ ve->veid = value;
up_write(&ve->op_sem);
return err;
}
@@ -1207,7 +1191,6 @@ struct cgroup_subsys ve_subsys = {
.name = "ve",
.subsys_id = ve_subsys_id,
.css_alloc = ve_create,
- .css_offline = ve_offline,
.css_free = ve_destroy,
.can_attach = ve_can_attach,
.attach = ve_attach,
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel