[ANNOUNCE] 3.2.23-rt37

Steven Rostedt Thu, 19 Jul 2012 06:41:34 -0700

Dear RT Folks,

I'm pleased to announce the 3.2.23-rt37 stable release.



You can get this release via the git tree at:

  git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git

  Head SHA1: 32a32ba43387e6ad9dd74f38b79ad411c6c1c4e6


Or to build 3.2.23-rt37 directly, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v3.x/linux-3.2.tar.xz

  http://www.kernel.org/pub/linux/kernel/v3.x/patch-3.2.23.xz

  
http://www.kernel.org/pub/linux/kernel/projects/rt/3.2/patch-3.2.23-rt37.patch.xz


You can also build from 3.2.23-rt36 by applying the incremental patch:

  
http://www.kernel.org/pub/linux/kernel/projects/rt/3.2/incr/patch-3.2.23-rt36-rt37.patch.xz



Enjoy,

-- Steve


Changes from 3.2.23-rt36:

---

Steven Rostedt (5):
      cpu/rt: Rework cpu down for PREEMPT_RT
      cpu/rt: Fix cpu_hotplug variable initialization
      workqueue: Revert workqueue: Fix PF_THREAD_BOUND abuse
      workqueue: Revert workqueue: Fix cpuhotplug trainwreck
      Linux 3.2.23-rt37

----
 include/linux/cpu.h       |    6 +-
 include/linux/sched.h     |    7 +
 include/linux/workqueue.h |    5 +-
 kernel/cpu.c              |  240 +++++++++++++++----
 kernel/sched.c            |   82 ++++++-
 kernel/workqueue.c        |  578 ++++++++++++++++++++++++++++++++-------------
 localversion-rt           |    2 +-
 7 files changed, 704 insertions(+), 216 deletions(-)
---------------------------
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 72e90bb..c46ec3e 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -66,10 +66,8 @@ enum {
        /* migration should happen before other stuff but after perf */
        CPU_PRI_PERF            = 20,
        CPU_PRI_MIGRATION       = 10,
-
-       CPU_PRI_WORKQUEUE_ACTIVE        = 5,  /* prepare workqueues for others 
*/
-       CPU_PRI_NORMAL                  = 0,
-       CPU_PRI_WORKQUEUE_INACTIVE      = -5, /* flush workqueues after others 
*/
+       /* prepare workqueues for other notifiers */
+       CPU_PRI_WORKQUEUE       = 5,
 };
 
 #define CPU_ONLINE             0x0002 /* CPU (unsigned)v is up */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0174e3a..9ca3172 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1933,6 +1933,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
 
 extern int set_cpus_allowed_ptr(struct task_struct *p,
                                const struct cpumask *new_mask);
+int migrate_me(void);
+void tell_sched_cpu_down_begin(int cpu);
+void tell_sched_cpu_down_done(int cpu);
+
 #else
 static inline void do_set_cpus_allowed(struct task_struct *p,
                                      const struct cpumask *new_mask)
@@ -1945,6 +1949,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct 
*p,
                return -EINVAL;
        return 0;
 }
+static inline int migrate_me(void) { return 0; }
+static inline void tell_sched_cpu_down_begin(int cpu) { }
+static inline void tell_sched_cpu_down_done(int cpu) { }
 #endif
 
 #ifndef CONFIG_CPUMASK_OFFSTACK
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 3d8ac9d..e228ca9 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -254,10 +254,9 @@ enum {
        WQ_MEM_RECLAIM          = 1 << 3, /* may be used for memory reclaim */
        WQ_HIGHPRI              = 1 << 4, /* high priority */
        WQ_CPU_INTENSIVE        = 1 << 5, /* cpu instensive workqueue */
-       WQ_NON_AFFINE           = 1 << 6, /* free to move works around cpus */
 
-       WQ_DRAINING             = 1 << 7, /* internal: workqueue is draining */
-       WQ_RESCUER              = 1 << 8, /* internal: workqueue has rescuer */
+       WQ_DRAINING             = 1 << 6, /* internal: workqueue is draining */
+       WQ_RESCUER              = 1 << 7, /* internal: workqueue has rescuer */
 
        WQ_MAX_ACTIVE           = 512,    /* I like 512, better ideas? */
        WQ_MAX_UNBOUND_PER_CPU  = 4,      /* 4 * #cpus for unbound wq */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 66dfb74..8f87b72 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -46,12 +46,7 @@ static int cpu_hotplug_disabled;
 
 static struct {
        struct task_struct *active_writer;
-#ifdef CONFIG_PREEMPT_RT_FULL
-       /* Makes the lock keep the task's state */
-       spinlock_t lock;
-#else
        struct mutex lock; /* Synchronizes accesses to refcount, */
-#endif
        /*
         * Also blocks the new readers during
         * an ongoing cpu hotplug operation.
@@ -59,28 +54,46 @@ static struct {
        int refcount;
 } cpu_hotplug = {
        .active_writer = NULL,
-#ifdef CONFIG_PREEMPT_RT_FULL
-       .lock = __SPIN_LOCK_UNLOCKED(cpu_hotplug.lock),
-#else
        .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-#endif
        .refcount = 0,
 };
 
-#ifdef CONFIG_PREEMPT_RT_FULL
-# define hotplug_lock() rt_spin_lock(&cpu_hotplug.lock)
-# define hotplug_unlock() rt_spin_unlock(&cpu_hotplug.lock)
-#else
-# define hotplug_lock() mutex_lock(&cpu_hotplug.lock)
-# define hotplug_unlock() mutex_unlock(&cpu_hotplug.lock)
-#endif
-
+/**
+ * hotplug_pcp - per cpu hotplug descriptor
+ * @unplug:    set when pin_current_cpu() needs to sync tasks
+ * @sync_tsk:  the task that waits for tasks to finish pinned sections
+ * @refcount:  counter of tasks in pinned sections
+ * @grab_lock: set when the tasks entering pinned sections should wait
+ * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
+ * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
+ * @mutex_init:        zero if the mutex hasn't been initialized yet.
+ *
+ * Although @unplug and @sync_tsk may point to the same task, the @unplug
+ * is used as a flag and still exists after @sync_tsk has exited and
+ * @sync_tsk set to NULL.
+ */
 struct hotplug_pcp {
        struct task_struct *unplug;
+       struct task_struct *sync_tsk;
        int refcount;
+       int grab_lock;
        struct completion synced;
+#ifdef CONFIG_PREEMPT_RT_FULL
+       spinlock_t lock;
+#else
+       struct mutex mutex;
+#endif
+       int mutex_init;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock)
+# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock)
+#else
+# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
+# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
+#endif
+
 static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
 
 /**
@@ -94,18 +107,40 @@ static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
 void pin_current_cpu(void)
 {
        struct hotplug_pcp *hp;
+       int force = 0;
 
 retry:
        hp = &__get_cpu_var(hotplug_pcp);
 
-       if (!hp->unplug || hp->refcount || preempt_count() > 1 ||
+       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
            hp->unplug == current || (current->flags & PF_STOMPER)) {
                hp->refcount++;
                return;
        }
-       preempt_enable();
-       hotplug_lock();
-       hotplug_unlock();
+
+       if (hp->grab_lock) {
+               preempt_enable();
+               hotplug_lock(hp);
+               hotplug_unlock(hp);
+       } else {
+               preempt_enable();
+               /*
+                * Try to push this task off of this CPU.
+                */
+               if (!migrate_me()) {
+                       preempt_disable();
+                       hp = &__get_cpu_var(hotplug_pcp);
+                       if (!hp->grab_lock) {
+                               /*
+                                * Just let it continue it's already pinned
+                                * or about to sleep.
+                                */
+                               force = 1;
+                               goto retry;
+                       }
+                       preempt_enable();
+               }
+       }
        preempt_disable();
        goto retry;
 }
@@ -127,26 +162,84 @@ void unpin_current_cpu(void)
                wake_up_process(hp->unplug);
 }
 
-/*
- * FIXME: Is this really correct under all circumstances ?
- */
+static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
+{
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       while (hp->refcount) {
+               schedule_preempt_disabled();
+               set_current_state(TASK_UNINTERRUPTIBLE);
+       }
+}
+
 static int sync_unplug_thread(void *data)
 {
        struct hotplug_pcp *hp = data;
 
        preempt_disable();
        hp->unplug = current;
+       wait_for_pinned_cpus(hp);
+
+       /*
+        * This thread will synchronize the cpu_down() with threads
+        * that have pinned the CPU. When the pinned CPU count reaches
+        * zero, we inform the cpu_down code to continue to the next step.
+        */
        set_current_state(TASK_UNINTERRUPTIBLE);
-       while (hp->refcount) {
-               schedule_preempt_disabled();
+       preempt_enable();
+       complete(&hp->synced);
+
+       /*
+        * If all succeeds, the next step will need tasks to wait till
+        * the CPU is offline before continuing. To do this, the grab_lock
+        * is set and tasks going into pin_current_cpu() will block on the
+        * mutex. But we still need to wait for those that are already in
+        * pinned CPU sections. If the cpu_down() failed, the 
kthread_should_stop()
+        * will kick this thread out.
+        */
+       while (!hp->grab_lock && !kthread_should_stop()) {
+               schedule();
+               set_current_state(TASK_UNINTERRUPTIBLE);
+       }
+
+       /* Make sure grab_lock is seen before we see a stale completion */
+       smp_mb();
+
+       /*
+        * Now just before cpu_down() enters stop machine, we need to make
+        * sure all tasks that are in pinned CPU sections are out, and new
+        * tasks will now grab the lock, keeping them from entering pinned
+        * CPU sections.
+        */
+       if (!kthread_should_stop()) {
+               preempt_disable();
+               wait_for_pinned_cpus(hp);
+               preempt_enable();
+               complete(&hp->synced);
+       }
+
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       while (!kthread_should_stop()) {
+               schedule();
                set_current_state(TASK_UNINTERRUPTIBLE);
        }
        set_current_state(TASK_RUNNING);
-       preempt_enable();
-       complete(&hp->synced);
+
+       /*
+        * Force this thread off this CPU as it's going down and
+        * we don't want any more work on this CPU.
+        */
+       current->flags &= ~PF_THREAD_BOUND;
+       do_set_cpus_allowed(current, cpu_present_mask);
+       migrate_me();
        return 0;
 }
 
+static void __cpu_unplug_sync(struct hotplug_pcp *hp)
+{
+       wake_up_process(hp->sync_tsk);
+       wait_for_completion(&hp->synced);
+}
+
 /*
  * Start the sync_unplug_thread on the target cpu and wait for it to
  * complete.
@@ -154,23 +247,83 @@ static int sync_unplug_thread(void *data)
 static int cpu_unplug_begin(unsigned int cpu)
 {
        struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
-       struct task_struct *tsk;
+       int err;
+
+       /* Protected by cpu_hotplug.lock */
+       if (!hp->mutex_init) {
+#ifdef CONFIG_PREEMPT_RT_FULL
+               spin_lock_init(&hp->lock);
+#else
+               mutex_init(&hp->mutex);
+#endif
+               hp->mutex_init = 1;
+       }
+
+       /* Inform the scheduler to migrate tasks off this CPU */
+       tell_sched_cpu_down_begin(cpu);
 
        init_completion(&hp->synced);
-       tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
-       if (IS_ERR(tsk))
-               return (PTR_ERR(tsk));
-       kthread_bind(tsk, cpu);
-       wake_up_process(tsk);
-       wait_for_completion(&hp->synced);
+
+       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", 
cpu);
+       if (IS_ERR(hp->sync_tsk)) {
+               err = PTR_ERR(hp->sync_tsk);
+               hp->sync_tsk = NULL;
+               return err;
+       }
+       kthread_bind(hp->sync_tsk, cpu);
+
+       /*
+        * Wait for tasks to get out of the pinned sections,
+        * it's still OK if new tasks enter. Some CPU notifiers will
+        * wait for tasks that are going to enter these sections and
+        * we must not have them block.
+        */
+       __cpu_unplug_sync(hp);
+
        return 0;
 }
 
+static void cpu_unplug_sync(unsigned int cpu)
+{
+       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
+
+       init_completion(&hp->synced);
+       /* The completion needs to be initialzied before setting grab_lock */
+       smp_wmb();
+
+       /* Grab the mutex before setting grab_lock */
+       hotplug_lock(hp);
+       hp->grab_lock = 1;
+
+       /*
+        * The CPU notifiers have been completed.
+        * Wait for tasks to get out of pinned CPU sections and have new
+        * tasks block until the CPU is completely down.
+        */
+       __cpu_unplug_sync(hp);
+
+       /* All done with the sync thread */
+       kthread_stop(hp->sync_tsk);
+       hp->sync_tsk = NULL;
+}
+
 static void cpu_unplug_done(unsigned int cpu)
 {
        struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
 
        hp->unplug = NULL;
+       /* Let all tasks know cpu unplug is finished before cleaning up */
+       smp_wmb();
+
+       if (hp->sync_tsk)
+               kthread_stop(hp->sync_tsk);
+
+       if (hp->grab_lock) {
+               hotplug_unlock(hp);
+               /* protected by cpu_hotplug.lock */
+               hp->grab_lock = 0;
+       }
+       tell_sched_cpu_down_done(cpu);
 }
 
 void get_online_cpus(void)
@@ -178,9 +331,9 @@ void get_online_cpus(void)
        might_sleep();
        if (cpu_hotplug.active_writer == current)
                return;
-       hotplug_lock();
+       mutex_lock(&cpu_hotplug.lock);
        cpu_hotplug.refcount++;
-       hotplug_unlock();
+       mutex_unlock(&cpu_hotplug.lock);
 
 }
 EXPORT_SYMBOL_GPL(get_online_cpus);
@@ -189,10 +342,10 @@ void put_online_cpus(void)
 {
        if (cpu_hotplug.active_writer == current)
                return;
-       hotplug_lock();
+       mutex_lock(&cpu_hotplug.lock);
        if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
                wake_up_process(cpu_hotplug.active_writer);
-       hotplug_unlock();
+       mutex_unlock(&cpu_hotplug.lock);
 
 }
 EXPORT_SYMBOL_GPL(put_online_cpus);
@@ -224,11 +377,11 @@ static void cpu_hotplug_begin(void)
        cpu_hotplug.active_writer = current;
 
        for (;;) {
-               hotplug_lock();
+               mutex_lock(&cpu_hotplug.lock);
                if (likely(!cpu_hotplug.refcount))
                        break;
                __set_current_state(TASK_UNINTERRUPTIBLE);
-               hotplug_unlock();
+               mutex_unlock(&cpu_hotplug.lock);
                schedule();
        }
 }
@@ -236,7 +389,7 @@ static void cpu_hotplug_begin(void)
 static void cpu_hotplug_done(void)
 {
        cpu_hotplug.active_writer = NULL;
-       hotplug_unlock();
+       mutex_unlock(&cpu_hotplug.lock);
 }
 
 #else /* #if CONFIG_HOTPLUG_CPU */
@@ -371,6 +524,9 @@ static int __ref _cpu_down(unsigned int cpu, int 
tasks_frozen)
                goto out_release;
        }
 
+       /* Notifiers are done. Don't let any more tasks pin this CPU. */
+       cpu_unplug_sync(cpu);
+
        err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
        if (err) {
                /* CPU didn't die: tell everyone.  Can't complain. */
diff --git a/kernel/sched.c b/kernel/sched.c
index 4dd1fff..bc2375e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4403,7 +4403,7 @@ void migrate_disable(void)
 {
        struct task_struct *p = current;
 
-       if (in_atomic() || p->flags & PF_THREAD_BOUND) {
+       if (in_atomic()) {
 #ifdef CONFIG_SCHED_DEBUG
                p->migrate_disable_atomic++;
 #endif
@@ -4434,7 +4434,7 @@ void migrate_enable(void)
        unsigned long flags;
        struct rq *rq;
 
-       if (in_atomic() || p->flags & PF_THREAD_BOUND) {
+       if (in_atomic()) {
 #ifdef CONFIG_SCHED_DEBUG
                p->migrate_disable_atomic--;
 #endif
@@ -6360,6 +6360,84 @@ void do_set_cpus_allowed(struct task_struct *p, const 
struct cpumask *new_mask)
        cpumask_copy(&p->cpus_allowed, new_mask);
 }
 
+static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
+static DEFINE_MUTEX(sched_down_mutex);
+static cpumask_t sched_down_cpumask;
+
+void tell_sched_cpu_down_begin(int cpu)
+{
+       mutex_lock(&sched_down_mutex);
+       cpumask_set_cpu(cpu, &sched_down_cpumask);
+       mutex_unlock(&sched_down_mutex);
+}
+
+void tell_sched_cpu_down_done(int cpu)
+{
+       mutex_lock(&sched_down_mutex);
+       cpumask_clear_cpu(cpu, &sched_down_cpumask);
+       mutex_unlock(&sched_down_mutex);
+}
+
+/**
+ * migrate_me - try to move the current task off this cpu
+ *
+ * Used by the pin_current_cpu() code to try to get tasks
+ * to move off the current CPU as it is going down.
+ * It will only move the task if the task isn't pinned to
+ * the CPU (with migrate_disable, affinity or THREAD_BOUND)
+ * and the task has to be in a RUNNING state. Otherwise the
+ * movement of the task will wake it up (change its state
+ * to running) when the task did not expect it.
+ *
+ * Returns 1 if it succeeded in moving the current task
+ *         0 otherwise.
+ */
+int migrate_me(void)
+{
+       struct task_struct *p = current;
+       struct migration_arg arg;
+       struct cpumask *cpumask;
+       struct cpumask *mask;
+       unsigned long flags;
+       unsigned int dest_cpu;
+       struct rq *rq;
+
+       /*
+        * We can not migrate tasks bounded to a CPU or tasks not
+        * running. The movement of the task will wake it up.
+        */
+       if (p->flags & PF_THREAD_BOUND || p->state)
+               return 0;
+
+       mutex_lock(&sched_down_mutex);
+       rq = task_rq_lock(p, &flags);
+
+       cpumask = &__get_cpu_var(sched_cpumasks);
+       mask = &p->cpus_allowed;
+
+       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
+
+       if (!cpumask_weight(cpumask)) {
+               /* It's only on this CPU? */
+               task_rq_unlock(rq, p, &flags);
+               mutex_unlock(&sched_down_mutex);
+               return 0;
+       }
+
+       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
+
+       arg.task = p;
+       arg.dest_cpu = dest_cpu;
+
+       task_rq_unlock(rq, p, &flags);
+
+       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+       tlb_migrate_finish(p->mm);
+       mutex_unlock(&sched_down_mutex);
+
+       return 1;
+}
+
 /*
  * This is how migration works:
  *
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 674d783..5d23c05b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,7 +41,6 @@
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
 #include <linux/idr.h>
-#include <linux/delay.h>
 
 #include "workqueue_sched.h"
 
@@ -58,10 +57,20 @@ enum {
        WORKER_DIE              = 1 << 1,       /* die die die */
        WORKER_IDLE             = 1 << 2,       /* is idle */
        WORKER_PREP             = 1 << 3,       /* preparing to run works */
-       WORKER_CPU_INTENSIVE    = 1 << 4,       /* cpu intensive */
-       WORKER_UNBOUND          = 1 << 5,       /* worker is unbound */
+       WORKER_ROGUE            = 1 << 4,       /* not bound to any cpu */
+       WORKER_REBIND           = 1 << 5,       /* mom is home, come back */
+       WORKER_CPU_INTENSIVE    = 1 << 6,       /* cpu intensive */
+       WORKER_UNBOUND          = 1 << 7,       /* worker is unbound */
 
-       WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_CPU_INTENSIVE | 
WORKER_UNBOUND,
+       WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
+                                 WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
+
+       /* gcwq->trustee_state */
+       TRUSTEE_START           = 0,            /* start */
+       TRUSTEE_IN_CHARGE       = 1,            /* trustee in charge of gcwq */
+       TRUSTEE_BUTCHER         = 2,            /* butcher workers */
+       TRUSTEE_RELEASE         = 3,            /* release workers */
+       TRUSTEE_DONE            = 4,            /* trustee is done */
 
        BUSY_WORKER_HASH_ORDER  = 6,            /* 64 pointers */
        BUSY_WORKER_HASH_SIZE   = 1 << BUSY_WORKER_HASH_ORDER,
@@ -75,6 +84,7 @@ enum {
                                                   (min two ticks) */
        MAYDAY_INTERVAL         = HZ / 10,      /* and then every 100ms */
        CREATE_COOLDOWN         = HZ,           /* time to breath after fail */
+       TRUSTEE_COOLDOWN        = HZ / 10,      /* for trustee draining */
 
        /*
         * Rescue workers are used only on emergencies and shared by
@@ -126,6 +136,7 @@ struct worker {
        unsigned long           last_active;    /* L: last active timestamp */
        unsigned int            flags;          /* X: flags */
        int                     id;             /* I: worker id */
+       struct work_struct      rebind_work;    /* L: rebind worker to cpu */
        int                     sleeping;       /* None */
 };
 
@@ -153,8 +164,10 @@ struct global_cwq {
 
        struct ida              worker_ida;     /* L: for worker IDs */
 
+       struct task_struct      *trustee;       /* L: for gcwq shutdown */
+       unsigned int            trustee_state;  /* L: trustee state */
+       wait_queue_head_t       trustee_wait;   /* trustee wait */
        struct worker           *first_idle;    /* L: first idle worker */
-       wait_queue_head_t       idle_wait;
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -961,38 +974,13 @@ static bool is_chained_work(struct workqueue_struct *wq)
        return false;
 }
 
-static void ___queue_work(struct workqueue_struct *wq, struct global_cwq *gcwq,
-                         struct work_struct *work)
-{
-       struct cpu_workqueue_struct *cwq;
-       struct list_head *worklist;
-       unsigned int work_flags;
-
-       /* gcwq determined, get cwq and queue */
-       cwq = get_cwq(gcwq->cpu, wq);
-       trace_workqueue_queue_work(gcwq->cpu, cwq, work);
-
-       BUG_ON(!list_empty(&work->entry));
-
-       cwq->nr_in_flight[cwq->work_color]++;
-       work_flags = work_color_to_flags(cwq->work_color);
-
-       if (likely(cwq->nr_active < cwq->max_active)) {
-               trace_workqueue_activate_work(work);
-               cwq->nr_active++;
-               worklist = gcwq_determine_ins_pos(gcwq, cwq);
-       } else {
-               work_flags |= WORK_STRUCT_DELAYED;
-               worklist = &cwq->delayed_works;
-       }
-
-       insert_work(cwq, work, worklist, work_flags);
-}
-
 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
 {
        struct global_cwq *gcwq;
+       struct cpu_workqueue_struct *cwq;
+       struct list_head *worklist;
+       unsigned int work_flags;
        unsigned long flags;
 
        debug_work_activate(work);
@@ -1038,32 +1026,27 @@ static void __queue_work(unsigned int cpu, struct 
workqueue_struct *wq,
                spin_lock_irqsave(&gcwq->lock, flags);
        }
 
-       ___queue_work(wq, gcwq, work);
+       /* gcwq determined, get cwq and queue */
+       cwq = get_cwq(gcwq->cpu, wq);
+       trace_workqueue_queue_work(cpu, cwq, work);
 
-       spin_unlock_irqrestore(&gcwq->lock, flags);
-}
+       BUG_ON(!list_empty(&work->entry));
 
-/**
- * queue_work_on - queue work on specific cpu
- * @cpu: CPU number to execute work on
- * @wq: workqueue to use
- * @work: work to queue
- *
- * Returns 0 if @work was already on a queue, non-zero otherwise.
- *
- * We queue the work to a specific CPU, the caller must ensure it
- * can't go away.
- */
-static int
-__queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
-{
-       int ret = 0;
+       cwq->nr_in_flight[cwq->work_color]++;
+       work_flags = work_color_to_flags(cwq->work_color);
 
-       if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
-               __queue_work(cpu, wq, work);
-               ret = 1;
+       if (likely(cwq->nr_active < cwq->max_active)) {
+               trace_workqueue_activate_work(work);
+               cwq->nr_active++;
+               worklist = gcwq_determine_ins_pos(gcwq, cwq);
+       } else {
+               work_flags |= WORK_STRUCT_DELAYED;
+               worklist = &cwq->delayed_works;
        }
-       return ret;
+
+       insert_work(cwq, work, worklist, work_flags);
+
+       spin_unlock_irqrestore(&gcwq->lock, flags);
 }
 
 /**
@@ -1080,19 +1063,34 @@ int queue_work(struct workqueue_struct *wq, struct 
work_struct *work)
 {
        int ret;
 
-       ret = __queue_work_on(get_cpu_light(), wq, work);
+       ret = queue_work_on(get_cpu_light(), wq, work);
        put_cpu_light();
 
        return ret;
 }
 EXPORT_SYMBOL_GPL(queue_work);
 
+/**
+ * queue_work_on - queue work on specific cpu
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
+ *
+ * We queue the work to a specific CPU, the caller must ensure it
+ * can't go away.
+ */
 int
 queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
 {
-       WARN_ON(wq->flags & WQ_NON_AFFINE);
+       int ret = 0;
 
-       return __queue_work_on(cpu, wq, work);
+       if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+               __queue_work(cpu, wq, work);
+               ret = 1;
+       }
+       return ret;
 }
 EXPORT_SYMBOL_GPL(queue_work_on);
 
@@ -1138,8 +1136,6 @@ int queue_delayed_work_on(int cpu, struct 
workqueue_struct *wq,
        struct timer_list *timer = &dwork->timer;
        struct work_struct *work = &dwork->work;
 
-       WARN_ON((wq->flags & WQ_NON_AFFINE) && cpu != -1);
-
        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
                unsigned int lcpu;
 
@@ -1205,13 +1201,12 @@ static void worker_enter_idle(struct worker *worker)
        /* idle_list is LIFO */
        list_add(&worker->entry, &gcwq->idle_list);
 
-       if (gcwq->nr_idle == gcwq->nr_workers)
-               wake_up_all(&gcwq->idle_wait);
-
-       if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) {
-               mod_timer(&gcwq->idle_timer,
-                               jiffies + IDLE_WORKER_TIMEOUT);
-       }
+       if (likely(!(worker->flags & WORKER_ROGUE))) {
+               if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
+                       mod_timer(&gcwq->idle_timer,
+                                 jiffies + IDLE_WORKER_TIMEOUT);
+       } else
+               wake_up_all(&gcwq->trustee_wait);
 
        /* sanity check nr_running */
        WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
@@ -1288,14 +1283,8 @@ __acquires(&gcwq->lock)
                        return false;
                if (task_cpu(task) == gcwq->cpu &&
                    cpumask_equal(&current->cpus_allowed,
-                                 get_cpu_mask(gcwq->cpu))) {
-                       /*
-                        * Since we're binding to a particular cpu and need to
-                        * stay there for correctness, mark us PF_THREAD_BOUND.
-                        */
-                       task->flags |= PF_THREAD_BOUND;
+                                 get_cpu_mask(gcwq->cpu)))
                        return true;
-               }
                spin_unlock_irq(&gcwq->lock);
 
                /*
@@ -1309,15 +1298,20 @@ __acquires(&gcwq->lock)
        }
 }
 
-static void worker_unbind_and_unlock(struct worker *worker)
+/*
+ * Function for worker->rebind_work used to rebind rogue busy workers
+ * to the associated cpu which is coming back online.  This is
+ * scheduled by cpu up but can race with other cpu hotplug operations
+ * and may be executed twice without intervening cpu down.
+ */
+static void worker_rebind_fn(struct work_struct *work)
 {
+       struct worker *worker = container_of(work, struct worker, rebind_work);
        struct global_cwq *gcwq = worker->gcwq;
-       struct task_struct *task = worker->task;
 
-       /*
-        * Its no longer required we're PF_THREAD_BOUND, the work is done.
-        */
-       task->flags &= ~PF_THREAD_BOUND;
+       if (worker_maybe_bind_and_lock(worker))
+               worker_clr_flags(worker, WORKER_REBIND);
+
        spin_unlock_irq(&gcwq->lock);
 }
 
@@ -1329,6 +1323,7 @@ static struct worker *alloc_worker(void)
        if (worker) {
                INIT_LIST_HEAD(&worker->entry);
                INIT_LIST_HEAD(&worker->scheduled);
+               INIT_WORK(&worker->rebind_work, worker_rebind_fn);
                /* on creation a worker is in !idle && prep state */
                worker->flags = WORKER_PREP;
        }
@@ -1383,9 +1378,15 @@ static struct worker *create_worker(struct global_cwq 
*gcwq, bool bind)
        if (IS_ERR(worker->task))
                goto fail;
 
+       /*
+        * A rogue worker will become a regular one if CPU comes
+        * online later on.  Make sure every worker has
+        * PF_THREAD_BOUND set.
+        */
        if (bind && !on_unbound_cpu)
                kthread_bind(worker->task, gcwq->cpu);
        else {
+               worker->task->flags |= PF_THREAD_BOUND;
                if (on_unbound_cpu)
                        worker->flags |= WORKER_UNBOUND;
        }
@@ -1662,6 +1663,13 @@ static bool manage_workers(struct worker *worker)
 
        gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
 
+       /*
+        * The trustee might be waiting to take over the manager
+        * position, tell it we're done.
+        */
+       if (unlikely(gcwq->trustee))
+               wake_up_all(&gcwq->trustee_wait);
+
        return ret;
 }
 
@@ -2062,7 +2070,7 @@ repeat:
                if (keep_working(gcwq))
                        wake_up_worker(gcwq);
 
-               worker_unbind_and_unlock(rescuer);
+               spin_unlock_irq(&gcwq->lock);
        }
 
        schedule();
@@ -3011,6 +3019,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char 
*name,
                if (IS_ERR(rescuer->task))
                        goto err;
 
+               rescuer->task->flags |= PF_THREAD_BOUND;
                wake_up_process(rescuer->task);
        }
 
@@ -3200,76 +3209,171 @@ EXPORT_SYMBOL_GPL(work_busy);
  * gcwqs serve mix of short, long and very long running works making
  * blocked draining impractical.
  *
+ * This is solved by allowing a gcwq to be detached from CPU, running
+ * it with unbound (rogue) workers and allowing it to be reattached
+ * later if the cpu comes back online.  A separate thread is created
+ * to govern a gcwq in such state and is called the trustee of the
+ * gcwq.
+ *
+ * Trustee states and their descriptions.
+ *
+ * START       Command state used on startup.  On CPU_DOWN_PREPARE, a
+ *             new trustee is started with this state.
+ *
+ * IN_CHARGE   Once started, trustee will enter this state after
+ *             assuming the manager role and making all existing
+ *             workers rogue.  DOWN_PREPARE waits for trustee to
+ *             enter this state.  After reaching IN_CHARGE, trustee
+ *             tries to execute the pending worklist until it's empty
+ *             and the state is set to BUTCHER, or the state is set
+ *             to RELEASE.
+ *
+ * BUTCHER     Command state which is set by the cpu callback after
+ *             the cpu has went down.  Once this state is set trustee
+ *             knows that there will be no new works on the worklist
+ *             and once the worklist is empty it can proceed to
+ *             killing idle workers.
+ *
+ * RELEASE     Command state which is set by the cpu callback if the
+ *             cpu down has been canceled or it has come online
+ *             again.  After recognizing this state, trustee stops
+ *             trying to drain or butcher and clears ROGUE, rebinds
+ *             all remaining workers back to the cpu and releases
+ *             manager role.
+ *
+ * DONE                Trustee will enter this state after BUTCHER or RELEASE
+ *             is complete.
+ *
+ *          trustee                 CPU                draining
+ *         took over                down               complete
+ * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
+ *                        |                     |                  ^
+ *                        | CPU is back online  v   return workers |
+ *                         ----------------> RELEASE --------------
  */
 
-static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
-                                               unsigned long action,
-                                               void *hcpu)
-{
-       unsigned int cpu = (unsigned long)hcpu;
-       struct global_cwq *gcwq = get_gcwq(cpu);
-       struct worker *uninitialized_var(new_worker);
-       unsigned long flags;
+/**
+ * trustee_wait_event_timeout - timed event wait for trustee
+ * @cond: condition to wait for
+ * @timeout: timeout in jiffies
+ *
+ * wait_event_timeout() for trustee to use.  Handles locking and
+ * checks for RELEASE request.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  To be used by trustee.
+ *
+ * RETURNS:
+ * Positive indicating left time if @cond is satisfied, 0 if timed
+ * out, -1 if canceled.
+ */
+#define trustee_wait_event_timeout(cond, timeout) ({                   \
+       long __ret = (timeout);                                         \
+       while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
+              __ret) {                                                 \
+               spin_unlock_irq(&gcwq->lock);                           \
+               __wait_event_timeout(gcwq->trustee_wait, (cond) ||      \
+                       (gcwq->trustee_state == TRUSTEE_RELEASE),       \
+                       __ret);                                         \
+               spin_lock_irq(&gcwq->lock);                             \
+       }                                                               \
+       gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);          \
+})
 
-       action &= ~CPU_TASKS_FROZEN;
+/**
+ * trustee_wait_event - event wait for trustee
+ * @cond: condition to wait for
+ *
+ * wait_event() for trustee to use.  Automatically handles locking and
+ * checks for CANCEL request.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  To be used by trustee.
+ *
+ * RETURNS:
+ * 0 if @cond is satisfied, -1 if canceled.
+ */
+#define trustee_wait_event(cond) ({                                    \
+       long __ret1;                                                    \
+       __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
+       __ret1 < 0 ? -1 : 0;                                            \
+})
 
-       switch (action) {
-       case CPU_UP_PREPARE:
-               BUG_ON(gcwq->first_idle);
-               new_worker = create_worker(gcwq, false);
-               if (!new_worker)
-                       return NOTIFY_BAD;
-       case CPU_UP_CANCELED:
-       case CPU_ONLINE:
-               break;
-       default:
-               return notifier_from_errno(0);
-       }
+static int __cpuinit trustee_thread(void *__gcwq)
+{
+       struct global_cwq *gcwq = __gcwq;
+       struct worker *worker;
+       struct work_struct *work;
+       struct hlist_node *pos;
+       long rc;
+       int i;
 
-       /* some are called w/ irq disabled, don't disturb irq status */
-       spin_lock_irqsave(&gcwq->lock, flags);
+       BUG_ON(gcwq->cpu != smp_processor_id());
 
-       switch (action) {
-       case CPU_UP_PREPARE:
-               BUG_ON(gcwq->first_idle);
-               gcwq->first_idle = new_worker;
-               break;
+       spin_lock_irq(&gcwq->lock);
+       /*
+        * Claim the manager position and make all workers rogue.
+        * Trustee must be bound to the target cpu and can't be
+        * cancelled.
+        */
+       BUG_ON(gcwq->cpu != smp_processor_id());
+       rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
+       BUG_ON(rc < 0);
 
-       case CPU_UP_CANCELED:
-               destroy_worker(gcwq->first_idle);
-               gcwq->first_idle = NULL;
-               break;
+       gcwq->flags |= GCWQ_MANAGING_WORKERS;
 
-       case CPU_ONLINE:
-               spin_unlock_irq(&gcwq->lock);
-               kthread_bind(gcwq->first_idle->task, cpu);
-               spin_lock_irq(&gcwq->lock);
-               gcwq->flags |= GCWQ_MANAGE_WORKERS;
-               start_worker(gcwq->first_idle);
-               gcwq->first_idle = NULL;
-               break;
-       }
+       list_for_each_entry(worker, &gcwq->idle_list, entry)
+               worker->flags |= WORKER_ROGUE;
 
-       spin_unlock_irqrestore(&gcwq->lock, flags);
+       for_each_busy_worker(worker, i, pos, gcwq)
+               worker->flags |= WORKER_ROGUE;
 
-       return notifier_from_errno(0);
-}
+       /*
+        * Call schedule() so that we cross rq->lock and thus can
+        * guarantee sched callbacks see the rogue flag.  This is
+        * necessary as scheduler callbacks may be invoked from other
+        * cpus.
+        */
+       spin_unlock_irq(&gcwq->lock);
+       schedule();
+       spin_lock_irq(&gcwq->lock);
 
-static void flush_gcwq(struct global_cwq *gcwq)
-{
-       struct work_struct *work, *nw;
-       struct worker *worker, *n;
-       LIST_HEAD(non_affine_works);
+       /*
+        * Sched callbacks are disabled now.  Zap nr_running.  After
+        * this, nr_running stays zero and need_more_worker() and
+        * keep_working() are always true as long as the worklist is
+        * not empty.
+        */
+       atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
 
+       spin_unlock_irq(&gcwq->lock);
+       del_timer_sync(&gcwq->idle_timer);
        spin_lock_irq(&gcwq->lock);
-       list_for_each_entry_safe(work, nw, &gcwq->worklist, entry) {
-               struct workqueue_struct *wq = get_work_cwq(work)->wq;
 
-               if (wq->flags & WQ_NON_AFFINE)
-                       list_move(&work->entry, &non_affine_works);
-       }
+       /*
+        * We're now in charge.  Notify and proceed to drain.  We need
+        * to keep the gcwq running during the whole CPU down
+        * procedure as other cpu hotunplug callbacks may need to
+        * flush currently running tasks.
+        */
+       gcwq->trustee_state = TRUSTEE_IN_CHARGE;
+       wake_up_all(&gcwq->trustee_wait);
 
-       while (!list_empty(&gcwq->worklist)) {
+       /*
+        * The original cpu is in the process of dying and may go away
+        * anytime now.  When that happens, we and all workers would
+        * be migrated to other cpus.  Try draining any left work.  We
+        * want to get it over with ASAP - spam rescuers, wake up as
+        * many idlers as necessary and create new ones till the
+        * worklist is empty.  Note that if the gcwq is frozen, there
+        * may be frozen works in freezable cwqs.  Don't declare
+        * completion while frozen.
+        */
+       while (gcwq->nr_workers != gcwq->nr_idle ||
+              gcwq->flags & GCWQ_FREEZING ||
+              gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
                int nr_works = 0;
 
                list_for_each_entry(work, &gcwq->worklist, entry) {
@@ -3283,55 +3387,200 @@ static void flush_gcwq(struct global_cwq *gcwq)
                        wake_up_process(worker->task);
                }
 
-               spin_unlock_irq(&gcwq->lock);
-
                if (need_to_create_worker(gcwq)) {
-                       worker = create_worker(gcwq, true);
-                       if (worker)
+                       spin_unlock_irq(&gcwq->lock);
+                       worker = create_worker(gcwq, false);
+                       spin_lock_irq(&gcwq->lock);
+                       if (worker) {
+                               worker->flags |= WORKER_ROGUE;
                                start_worker(worker);
+                       }
                }
 
-               wait_event_timeout(gcwq->idle_wait,
-                               gcwq->nr_idle == gcwq->nr_workers, HZ/10);
-
-               spin_lock_irq(&gcwq->lock);
+               /* give a breather */
+               if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
+                       break;
        }
 
-       WARN_ON(gcwq->nr_workers != gcwq->nr_idle);
+       /*
+        * Either all works have been scheduled and cpu is down, or
+        * cpu down has already been canceled.  Wait for and butcher
+        * all workers till we're canceled.
+        */
+       do {
+               rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
+               while (!list_empty(&gcwq->idle_list))
+                       destroy_worker(list_first_entry(&gcwq->idle_list,
+                                                       struct worker, entry));
+       } while (gcwq->nr_workers && rc >= 0);
 
-       list_for_each_entry_safe(worker, n, &gcwq->idle_list, entry)
-               destroy_worker(worker);
+       /*
+        * At this point, either draining has completed and no worker
+        * is left, or cpu down has been canceled or the cpu is being
+        * brought back up.  There shouldn't be any idle one left.
+        * Tell the remaining busy ones to rebind once it finishes the
+        * currently scheduled works by scheduling the rebind_work.
+        */
+       WARN_ON(!list_empty(&gcwq->idle_list));
 
-       WARN_ON(gcwq->nr_workers || gcwq->nr_idle);
+       for_each_busy_worker(worker, i, pos, gcwq) {
+               struct work_struct *rebind_work = &worker->rebind_work;
 
-       spin_unlock_irq(&gcwq->lock);
+               /*
+                * Rebind_work may race with future cpu hotplug
+                * operations.  Use a separate flag to mark that
+                * rebinding is scheduled.
+                */
+               worker->flags |= WORKER_REBIND;
+               worker->flags &= ~WORKER_ROGUE;
 
-       gcwq = get_gcwq(get_cpu_light());
-       spin_lock_irq(&gcwq->lock);
-       list_for_each_entry_safe(work, nw, &non_affine_works, entry) {
-               list_del_init(&work->entry);
-               ___queue_work(get_work_cwq(work)->wq, gcwq, work);
+               /* queue rebind_work, wq doesn't matter, use the default one */
+               if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
+                                    work_data_bits(rebind_work)))
+                       continue;
+
+               debug_work_activate(rebind_work);
+               insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
+                           worker->scheduled.next,
+                           work_color_to_flags(WORK_NO_COLOR));
        }
+
+       /* relinquish manager role */
+       gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+
+       /* notify completion */
+       gcwq->trustee = NULL;
+       gcwq->trustee_state = TRUSTEE_DONE;
+       wake_up_all(&gcwq->trustee_wait);
        spin_unlock_irq(&gcwq->lock);
-       put_cpu_light();
+       return 0;
 }
 
-static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+/**
+ * wait_trustee_state - wait for trustee to enter the specified state
+ * @gcwq: gcwq the trustee of interest belongs to
+ * @state: target state to wait for
+ *
+ * Wait for the trustee to reach @state.  DONE is already matched.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.  To be used by cpu_callback.
+ */
+static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
+__releases(&gcwq->lock)
+__acquires(&gcwq->lock)
+{
+       if (!(gcwq->trustee_state == state ||
+             gcwq->trustee_state == TRUSTEE_DONE)) {
+               spin_unlock_irq(&gcwq->lock);
+               __wait_event(gcwq->trustee_wait,
+                            gcwq->trustee_state == state ||
+                            gcwq->trustee_state == TRUSTEE_DONE);
+               spin_lock_irq(&gcwq->lock);
+       }
+}
+
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
                                                unsigned long action,
                                                void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
        struct global_cwq *gcwq = get_gcwq(cpu);
+       struct task_struct *new_trustee = NULL;
+       struct worker *uninitialized_var(new_worker);
+       unsigned long flags;
 
        action &= ~CPU_TASKS_FROZEN;
 
-        switch (action) {
-        case CPU_DOWN_PREPARE:
-                flush_gcwq(gcwq);
-                break;
-        }
+       switch (action) {
+       case CPU_DOWN_PREPARE:
+               new_trustee = kthread_create(trustee_thread, gcwq,
+                                            "workqueue_trustee/%d\n", cpu);
+               if (IS_ERR(new_trustee))
+                       return notifier_from_errno(PTR_ERR(new_trustee));
+               kthread_bind(new_trustee, cpu);
+               /* fall through */
+       case CPU_UP_PREPARE:
+               BUG_ON(gcwq->first_idle);
+               new_worker = create_worker(gcwq, false);
+               if (!new_worker) {
+                       if (new_trustee)
+                               kthread_stop(new_trustee);
+                       return NOTIFY_BAD;
+               }
+               break;
+       case CPU_POST_DEAD:
+       case CPU_UP_CANCELED:
+       case CPU_DOWN_FAILED:
+       case CPU_ONLINE:
+               break;
+       case CPU_DYING:
+               /*
+                * We access this lockless. We are on the dying CPU
+                * and called from stomp machine.
+                *
+                * Before this, the trustee and all workers except for
+                * the ones which are still executing works from
+                * before the last CPU down must be on the cpu.  After
+                * this, they'll all be diasporas.
+                */
+               gcwq->flags |= GCWQ_DISASSOCIATED;
+       default:
+               goto out;
+       }
+
+       /* some are called w/ irq disabled, don't disturb irq status */
+       spin_lock_irqsave(&gcwq->lock, flags);
+
+       switch (action) {
+       case CPU_DOWN_PREPARE:
+               /* initialize trustee and tell it to acquire the gcwq */
+               BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
+               gcwq->trustee = new_trustee;
+               gcwq->trustee_state = TRUSTEE_START;
+               wake_up_process(gcwq->trustee);
+               wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
+               /* fall through */
+       case CPU_UP_PREPARE:
+               BUG_ON(gcwq->first_idle);
+               gcwq->first_idle = new_worker;
+               break;
+
+       case CPU_POST_DEAD:
+               gcwq->trustee_state = TRUSTEE_BUTCHER;
+               /* fall through */
+       case CPU_UP_CANCELED:
+               destroy_worker(gcwq->first_idle);
+               gcwq->first_idle = NULL;
+               break;
 
+       case CPU_DOWN_FAILED:
+       case CPU_ONLINE:
+               gcwq->flags &= ~GCWQ_DISASSOCIATED;
+               if (gcwq->trustee_state != TRUSTEE_DONE) {
+                       gcwq->trustee_state = TRUSTEE_RELEASE;
+                       wake_up_process(gcwq->trustee);
+                       wait_trustee_state(gcwq, TRUSTEE_DONE);
+               }
+
+               /*
+                * Trustee is done and there might be no worker left.
+                * Put the first_idle in and request a real manager to
+                * take a look.
+                */
+               spin_unlock_irq(&gcwq->lock);
+               kthread_bind(gcwq->first_idle->task, cpu);
+               spin_lock_irq(&gcwq->lock);
+               gcwq->flags |= GCWQ_MANAGE_WORKERS;
+               start_worker(gcwq->first_idle);
+               gcwq->first_idle = NULL;
+               break;
+       }
+
+       spin_unlock_irqrestore(&gcwq->lock, flags);
 
+out:
        return notifier_from_errno(0);
 }
 
@@ -3528,8 +3777,7 @@ static int __init init_workqueues(void)
        unsigned int cpu;
        int i;
 
-       cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_ACTIVE);
-       hotcpu_notifier(workqueue_cpu_down_callback, 
CPU_PRI_WORKQUEUE_INACTIVE);
+       cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
 
        /* initialize gcwqs */
        for_each_gcwq_cpu(cpu) {
@@ -3552,7 +3800,9 @@ static int __init init_workqueues(void)
                            (unsigned long)gcwq);
 
                ida_init(&gcwq->worker_ida);
-               init_waitqueue_head(&gcwq->idle_wait);
+
+               gcwq->trustee_state = TRUSTEE_DONE;
+               init_waitqueue_head(&gcwq->trustee_wait);
        }
 
        /* create the initial worker */
diff --git a/localversion-rt b/localversion-rt
index 2294034..a3b2408 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt36
+-rt37


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[ANNOUNCE] 3.2.23-rt37

Reply via email to