To achieve this:

- When hotplug makes a cpuset empty, we call update_tasks_cpumask/nodemask()
on it instead of moving its tasks out.

- When a cpuset's masks are changed by writing cpuset.cpus/mems, we also
update the tasks in its empty descendant cpusets.
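
For example (a sketch of the intended behavior with sane_behavior enabled;
the mount option spelling, CPU numbers and paths below are illustrative):

  # mount -t cgroup -o cpuset,__DEVEL__sane_behavior xxx /cpuset
  # mkdir /cpuset/A
  # echo 2-3 > /cpuset/A/cpuset.cpus
  # echo $$ > /cpuset/A/tasks
  # echo 0 > /sys/devices/system/cpu/cpu2/online
  # echo 0 > /sys/devices/system/cpu/cpu3/online
  # cat /cpuset/A/cpuset.cpus
  (now empty, but the task stays in A; its cpumask is updated to the
   nearest non-empty ancestor's cpus instead of the task being moved
   out of A)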

Signed-off-by: Li Zefan <lize...@huawei.com>
---
 kernel/cpuset.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 146 insertions(+), 25 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 741e652..95e9394 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -99,6 +99,9 @@ struct cpuset {
         */
        nodemask_t old_mems_allowed;
 
+       /* used in cpuset_update_nodemask_workfn() */
+       struct ptr_heap *heap;
+
        struct fmeter fmeter;           /* memory_pressure filter */
 
        /*
@@ -114,6 +117,7 @@ struct cpuset {
        int relax_domain_level;
 
        struct work_struct hotplug_work;
+       struct work_struct update_nodemask_work;
 };
 
 /* Retrieve the cpuset for a cgroup */
@@ -276,6 +280,8 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_MUTEX(callback_mutex);
 
+static struct workqueue_struct *cpuset_update_nodemask_wq;
+
 /*
  * CPU / memory hotplug is handled asynchronously.
  */
@@ -877,6 +883,39 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
        cgroup_scan_tasks(&scan);
 }
 
+/*
+ * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update the cpumasks of tasks in @root_cs and in its descendant
+ * cpusets whose cpus_allowed is empty, which take on the cpumask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_cpumask_hier(struct cpuset *root_cs,
+                                     bool update_root, struct ptr_heap *heap)
+{
+       struct cpuset *cp;
+       struct cgroup *pos_cgrp;
+
+       if (update_root)
+               update_tasks_cpumask(root_cs, heap);
+
+       rcu_read_lock();
+       cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+               /* skip the whole subtree if @cp has some CPUs */
+               if (!cpumask_empty(cp->cpus_allowed)) {
+                       pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+                       continue;
+               }
+
+               update_tasks_cpumask(cp, heap);
+       }
+       rcu_read_unlock();
+}
+
 /**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
@@ -928,11 +967,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
        mutex_unlock(&callback_mutex);
 
-       /*
-        * Scan tasks in the cpuset, and update the cpumasks of any
-        * that need an update.
-        */
-       update_tasks_cpumask(cs, &heap);
+       update_tasks_cpumask_hier(cs, true, &heap);
 
        heap_free(&heap);
 
@@ -1099,6 +1134,78 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
        cpuset_being_rebound = NULL;
 }
 
+static void cpuset_update_nodemask_workfn(struct work_struct *work)
+{
+       struct cpuset *cs = container_of(work, struct cpuset,
+                                       update_nodemask_work);
+
+       update_tasks_nodemask(cs, cs->heap);
+       css_put(&cs->css);
+}
+
+static void schedule_update_tasks_nodemask(struct cpuset *cs,
+                                          struct ptr_heap *heap)
+{
+       bool queued;
+
+       /* Will be released when the work item finishes executing. */
+       if (!css_tryget(&cs->css))
+               return;
+
+       /*
+        * The caller will flush the workqueue with cpuset_mutex held,
+        * so it's not possible that a work item is already queued, and
+        * we know cs->heap is valid.
+        */
+       cs->heap = heap;
+       queued = queue_work(cpuset_update_nodemask_wq,
+                           &cs->update_nodemask_work);
+       if (!queued) {
+               WARN_ON(1);
+               css_put(&cs->css);
+       }
+}
+
+/*
+ * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update the nodemasks of tasks in @root_cs and in its descendant
+ * cpusets whose mems_allowed is empty, which take on the nodemask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_nodemask_hier(struct cpuset *root_cs,
+                                      bool update_root, struct ptr_heap *heap)
+{
+       struct cpuset *cp;
+       struct cgroup *pos_cgrp;
+
+       if (update_root)
+               update_tasks_nodemask(root_cs, heap);
+
+       rcu_read_lock();
+       cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+               /* skip the whole subtree if @cp has some memory nodes */
+               if (!nodes_empty(cp->mems_allowed)) {
+                       pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+                       continue;
+               }
+
+               schedule_update_tasks_nodemask(cp, heap);
+       }
+       rcu_read_unlock();
+
+       /*
+        * The only reason we use a workqueue is that update_tasks_nodemask()
+        * can't be called under rcu_read_lock(). Flush the workqueue to make
+        * sure all the updates are done before we return.
+        */
+       flush_workqueue(cpuset_update_nodemask_wq);
+}
+
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset.  Needs to validate the request, update the
@@ -1163,7 +1270,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
        cs->mems_allowed = trialcs->mems_allowed;
        mutex_unlock(&callback_mutex);
 
-       update_tasks_nodemask(cs, &heap);
+       update_tasks_nodemask_hier(cs, true, &heap);
 
        heap_free(&heap);
 done:
@@ -1888,6 +1995,7 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
        nodes_clear(cs->mems_allowed);
        fmeter_init(&cs->fmeter);
        INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
+       INIT_WORK(&cs->update_nodemask_work, cpuset_update_nodemask_workfn);
        cs->relax_domain_level = -1;
 
        return &cs->css;
@@ -2063,31 +2171,36 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
        static nodemask_t off_mems;
        struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
        bool is_empty;
+       bool sane = cgroup_sane_behavior(cs->css.cgroup);
 
        mutex_lock(&cpuset_mutex);
 
        cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
        nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
 
-       /* remove offline cpus from @cs */
-       if (!cpumask_empty(&off_cpus)) {
-               mutex_lock(&callback_mutex);
-               cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-               mutex_unlock(&callback_mutex);
+       mutex_lock(&callback_mutex);
+       cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+       mutex_unlock(&callback_mutex);
 
-               if (!cpumask_empty(cs->cpus_allowed))
-                       update_tasks_cpumask(cs, NULL);
-       }
+       /*
+        * If the sane_behavior flag is set, we need to update the cpumasks of
+        * tasks in an empty cpuset so they take on the ancestor's cpumask.
+        */
+       if ((sane && cpumask_empty(cs->cpus_allowed)) ||
+           (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
+               update_tasks_cpumask(cs, NULL);
 
-       /* remove offline mems from @cs */
-       if (!nodes_empty(off_mems)) {
-               mutex_lock(&callback_mutex);
-               nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
-               mutex_unlock(&callback_mutex);
+       mutex_lock(&callback_mutex);
+       nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+       mutex_unlock(&callback_mutex);
 
-               if (!nodes_empty(cs->mems_allowed))
-                       update_tasks_nodemask(cs, NULL);
-       }
+       /*
+        * If the sane_behavior flag is set, we need to update the nodemasks of
+        * tasks in an empty cpuset so they take on the ancestor's nodemask.
+        */
+       if ((sane && nodes_empty(cs->mems_allowed)) ||
+           (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
+               update_tasks_nodemask(cs, NULL);
 
        is_empty = cpumask_empty(cs->cpus_allowed) ||
                nodes_empty(cs->mems_allowed);
@@ -2095,11 +2208,13 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
        mutex_unlock(&cpuset_mutex);
 
        /*
-        * If @cs became empty, move tasks to the nearest ancestor with
-        * execution resources.  This is full cgroup operation which will
+        * If the sane_behavior flag is set, we keep tasks in empty cpusets.
+        *
+        * Otherwise, move tasks to the nearest ancestor with execution
+        * resources.  This is a full cgroup operation which will
         * also call back into cpuset.  Should be done outside any lock.
         */
-       if (is_empty)
+       if (!sane && is_empty)
                remove_tasks_in_empty_cpuset(cs);
 
        /* the following may free @cs, should be the last operation */
@@ -2174,6 +2289,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
                cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
                mutex_unlock(&callback_mutex);
                /* we don't mess with cpumasks of tasks in top_cpuset */
+               update_tasks_cpumask_hier(&top_cpuset, false, NULL);
        }
 
        /* synchronize mems_allowed to N_MEMORY */
@@ -2182,6 +2298,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
                top_cpuset.mems_allowed = new_mems;
                mutex_unlock(&callback_mutex);
                update_tasks_nodemask(&top_cpuset, NULL);
+               update_tasks_nodemask_hier(&top_cpuset, false, NULL);
        }
 
        /* if cpus or mems went down, we need to propagate to descendants */
@@ -2261,6 +2378,10 @@ void __init cpuset_init_smp(void)
        cpuset_propagate_hotplug_wq =
                alloc_ordered_workqueue("cpuset_hotplug", 0);
        BUG_ON(!cpuset_propagate_hotplug_wq);
+
+       cpuset_update_nodemask_wq =
+               create_workqueue("cpuset_update_nodemask");
+       BUG_ON(!cpuset_update_nodemask_wq);
 }
 
 /**
-- 
1.8.0.2