Commit:     470fd646444c65a5d062a371f5ec8dcedee61239
Parent:     bd89aabc6761de1c35b154fe6f914a445d301510
Author:     Cliff Wickman <[EMAIL PROTECTED]>
AuthorDate: Thu Oct 18 23:40:46 2007 -0700
Committer:  Linus Torvalds <[EMAIL PROTECTED]>
CommitDate: Fri Oct 19 11:53:44 2007 -0700

    hotplug cpu: migrate a task within its cpuset
    When a cpu is disabled, move_task_off_dead_cpu() is called for tasks that
    have been running on that cpu.
    Currently, such a task is migrated:
     1) to any cpu on the same node as the disabled cpu, which is both online
        and among that task's cpus_allowed
     2) to any cpu which is both online and among that task's cpus_allowed
    It is typical of a multithreaded application running on a large NUMA system
    to have its tasks confined to a cpuset so as to cluster them near the memory
    they share.  Furthermore, it is typical to explicitly place such a task on a
    specific cpu in that cpuset.  And in that case the task's cpus_allowed
    includes only a single cpu.
    This patch would insert a preference to migrate such a task to some cpu
    within its cpuset (and set its cpus_allowed to its entire cpuset).
    With this patch, migrate the task to:
     1) to any cpu on the same node as the disabled cpu, which is both online
        and among that task's cpus_allowed
     2) to any online cpu within the task's cpuset
     3) to any cpu which is both online and among that task's cpus_allowed
    In order to do this, move_task_off_dead_cpu() must make a call to
    cpuset_cpus_allowed_locked(), a new subset of cpuset_cpus_allowed(), that
    will not block.  (name change - per Oleg's suggestion)
    Calls are made to cpuset_lock() and cpuset_unlock() in migration_call() to
    hold the cpuset mutex during the whole migrate_live_tasks() and
    migrate_dead_tasks() procedure.
    [EMAIL PROTECTED]: build fix]
    [EMAIL PROTECTED]: Fix indentation and spacing]
    Signed-off-by: Cliff Wickman <[EMAIL PROTECTED]>
    Cc: Oleg Nesterov <[EMAIL PROTECTED]>
    Cc: Christoph Lameter <[EMAIL PROTECTED]>
    Cc: Paul Jackson <[EMAIL PROTECTED]>
    Cc: Ingo Molnar <[EMAIL PROTECTED]>
    Signed-off-by: Paul Jackson <[EMAIL PROTECTED]>
    Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
    Signed-off-by: Linus Torvalds <[EMAIL PROTECTED]>
 Documentation/cpu-hotplug.txt |    4 +++-
 include/linux/cpuset.h        |    5 +++++
 kernel/cpuset.c               |   15 ++++++++++++++-
 kernel/sched.c                |   12 +++++++++++-
 4 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index b6d24c2..a741f65 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -220,7 +220,9 @@ A: The following happen, listed in no particular order :-)
   CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the
   CPU is being offlined while tasks are frozen due to a suspend operation in
-- All process is migrated away from this outgoing CPU to a new CPU
+- All processes are migrated away from this outgoing CPU to new CPUs.
+  The new CPU is chosen from each process' current cpuset, which may be
+  a subset of all online CPUs.
 - All interrupts targeted to this CPU is migrated to a new CPU
 - timers/bottom half/task lets are also migrated to a new CPU
 - Once all services are migrated, kernel calls an arch specific routine
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 31adfde..ecae585 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -21,6 +21,7 @@ extern int cpuset_init_early(void);
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
+extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -87,6 +88,10 @@ static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
        return cpu_possible_map;
+static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p)
+       return cpu_possible_map;
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fa31cb9..50f5dc4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1818,10 +1818,23 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
        cpumask_t mask;
+       mask = cpuset_cpus_allowed_locked(tsk);
+       mutex_unlock(&callback_mutex);
+       return mask;
+ * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
+ * Must be  called with callback_mutex held.
+ **/
+cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk)
+       cpumask_t mask;
        guarantee_online_cpus(task_cs(tsk), &mask);
-       mutex_unlock(&callback_mutex);
        return mask;
diff --git a/kernel/sched.c b/kernel/sched.c
index a7e3046..4071306 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5160,8 +5160,16 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
                /* No more Mr. Nice Guy. */
                if (dest_cpu == NR_CPUS) {
+                       cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p);
+                       /*
+                        * Try to stay on the same cpuset, where the
+                        * current cpuset may be a subset of all cpus.
+                        * The cpuset_cpus_allowed_locked() variant of
+                        * cpuset_cpus_allowed() will not block.  It must be
+                        * called within calls to cpuset_lock/cpuset_unlock.
+                        */
                        rq = task_rq_lock(p, &flags);
-                       cpus_setall(p->cpus_allowed);
+                       p->cpus_allowed = cpus_allowed;
                        dest_cpu = any_online_cpu(p->cpus_allowed);
                        task_rq_unlock(rq, &flags);
@@ -5527,6 +5535,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
+               cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
                rq = cpu_rq(cpu);
@@ -5540,6 +5549,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                rq->idle->sched_class = &idle_sched_class;
+               cpuset_unlock();
                BUG_ON(rq->nr_running != 0);
