Instead of implementing custom locked reference counting, use lockref.

The current implementation leads to a deadlock splat on Intel SKL platforms
when lockdep debugging is enabled.

This is due to a few CPUfreq drivers (including Intel P-state) doing the
following: policy->rwsem is held locked during driver initialization, and the
functions called during init that actually apply the CPU limits use
get_online_cpus() (because they have other calling paths too), which briefly
takes cpu_hotplug.lock to increment cpu_hotplug.refcount, as sketched below.

Later, on the suspend path, cpu_hotplug_begin() is called from
disable_nonboot_cpus() and takes cpu_hotplug.lock. The CPUfreq callbacks
invoked after that point take policy->rwsem while cpu_hotplug.lock is still
held, which is the inverse ordering (abbreviated below). This is a potential
deadlock scenario reported by our CI system, though a very unlikely one. See
the Bugzilla link for more details.
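
  /* Path B: suspend, abbreviated */
  cpu_hotplug_begin();                    /* disable_nonboot_cpus() */
    mutex_lock(&cpu_hotplug.lock);        /* held until cpu_hotplug_done() */
    /* CPUfreq callbacks run from here */
      down_write(&policy->rwsem);         /* cpu_hotplug.lock -> rwsem */

lockdep flags the A/B ordering inversion as a potential deadlock even if it
never materializes in practice.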

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93294
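
For reference, a rough sketch of the lockref semantics this conversion relies
on (simplified from <linux/lockref.h>; the real struct overlays the spinlock
and count in a single 64-bit word for a cmpxchg-based fast path):

  struct lockref {
          spinlock_t lock;
          int count;              /* negative once marked dead */
  };

  lockref_get_or_lock(lr);        /* count++ and return 1 if count > 0,
                                     else return 0 with lr->lock held */
  lockref_put_or_lock(lr);        /* count-- and return 1 if count > 1,
                                     else return 0 with lr->lock held */
  lockref_mark_dead(lr);          /* lr->lock must be held; blocks further
                                     gets until the count is reset */

Readers therefore stay on the lock-free fast path; only the first/last reader
and a hotplug writer ever touch the spinlock.
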
Cc: Linux kernel development <linux-kernel@vger.kernel.org>
Cc: Ingo Molnar <mi...@kernel.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: David Hildenbrand <d...@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
Cc: Gautham R. Shenoy <e...@linux.vnet.ibm.com>
Cc: Chris Wilson <ch...@chris-wilson.co.uk>
Signed-off-by: Joonas Lahtinen <joonas.lahti...@linux.intel.com>
---
 kernel/cpu.c | 87 +++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 54 insertions(+), 33 deletions(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5b9d396..8acec83 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,7 @@
 #include <linux/kthread.h>
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
+#include <linux/lockref.h>
 #include <linux/gfp.h>
 #include <linux/suspend.h>
 #include <linux/lockdep.h>
@@ -62,13 +63,10 @@ static struct {
        struct task_struct *active_writer;
        /* wait queue to wake up the active_writer */
        wait_queue_head_t wq;
-       /* verifies that no writer will get active while readers are active */
-       struct mutex lock;
-       /*
-        * Also blocks the new readers during
-        * an ongoing cpu hotplug operation.
-        */
-       atomic_t refcount;
+       /* wait queue to wake up readers */
+       wait_queue_head_t reader_wq;
+       /* track online CPU users */
+       struct lockref lockref;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map dep_map;
@@ -76,7 +74,7 @@ static struct {
 } cpu_hotplug = {
        .active_writer = NULL,
        .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
-       .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
+       .reader_wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.reader_wq),
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
        .dep_map = {.name = "cpu_hotplug.lock" },
 #endif
@@ -92,52 +90,64 @@ static struct {
 
 void get_online_cpus(void)
 {
-       might_sleep();
+       DEFINE_WAIT(wait);
+
        if (cpu_hotplug.active_writer == current)
                return;
+
        cpuhp_lock_acquire_read();
-       mutex_lock(&cpu_hotplug.lock);
-       atomic_inc(&cpu_hotplug.refcount);
-       mutex_unlock(&cpu_hotplug.lock);
+
+       /* First to get might have to wait for an ongoing hotplug operation. */
+       while (!lockref_get_or_lock(&cpu_hotplug.lockref)) {
+               if (cpu_hotplug.lockref.count == 0) {
+                       cpu_hotplug.lockref.count++;
+                       spin_unlock(&cpu_hotplug.lockref.lock);
+                       break;
+               }
+               spin_unlock(&cpu_hotplug.lockref.lock);
+
+               prepare_to_wait(&cpu_hotplug.reader_wq, &wait, TASK_UNINTERRUPTIBLE);
+               schedule();
+               finish_wait(&cpu_hotplug.reader_wq, &wait);
+       }
 }
 EXPORT_SYMBOL_GPL(get_online_cpus);
 
 void put_online_cpus(void)
 {
-       int refcount;
-
        if (cpu_hotplug.active_writer == current)
                return;
 
-       refcount = atomic_dec_return(&cpu_hotplug.refcount);
-       if (WARN_ON(refcount < 0)) /* try to fix things up */
-               atomic_inc(&cpu_hotplug.refcount);
+       /* Last to release might have to wake a queued hotplug operation. */
+       if (!lockref_put_or_lock(&cpu_hotplug.lockref)) {
+               WARN_ON(cpu_hotplug.lockref.count <= 0);
+               cpu_hotplug.lockref.count = 0;
+               spin_unlock(&cpu_hotplug.lockref.lock);
 
-       if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
-               wake_up(&cpu_hotplug.wq);
+               if (waitqueue_active(&cpu_hotplug.wq))
+                       wake_up(&cpu_hotplug.wq);
+       }
 
        cpuhp_lock_release();
-
 }
 EXPORT_SYMBOL_GPL(put_online_cpus);
 
 /*
  * This ensures that the hotplug operation can begin only when the
- * refcount goes to zero.
+ * cpu_hotplug.lockref goes to zero.
  *
  * Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
+ * will be blocked by the cpu_hotplug.lockref being dead.
  *
  * Since cpu_hotplug_begin() is always called after invoking
  * cpu_maps_update_begin(), we can be sure that only one writer is active.
  *
  * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
+ * - Lockref goes to zero, last reader wakes up the sleeping
  *   writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- *   non zero and goes to sleep again.
+ * - A new reader arrives at this moment, bumps up the lockref.
+ * - The woken up writer finds the lockref non-zero and goes
+ *   to sleep again.
  *
  * However, this is very difficult to achieve in practice since
  * get_online_cpus() not an api which is called all that often.
@@ -151,20 +161,31 @@ void cpu_hotplug_begin(void)
        cpuhp_lock_acquire();
 
        for (;;) {
-               mutex_lock(&cpu_hotplug.lock);
+               spin_lock(&cpu_hotplug.lockref.lock);
+               if (cpu_hotplug.lockref.count <= 0)
+                       break;
+               spin_unlock(&cpu_hotplug.lockref.lock);
+
                prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
-               if (likely(!atomic_read(&cpu_hotplug.refcount)))
-                               break;
-               mutex_unlock(&cpu_hotplug.lock);
                schedule();
+               finish_wait(&cpu_hotplug.wq, &wait);
        }
-       finish_wait(&cpu_hotplug.wq, &wait);
+
+       WARN_ON(__lockref_is_dead(&cpu_hotplug.lockref));
+       lockref_mark_dead(&cpu_hotplug.lockref);
+       spin_unlock(&cpu_hotplug.lockref.lock);
 }
 
 void cpu_hotplug_done(void)
 {
+       WARN_ON(!__lockref_is_dead(&cpu_hotplug.lockref));
+       cpu_hotplug.lockref.count = 0;
+       spin_unlock(&cpu_hotplug.lockref.lock);
+
+       if (waitqueue_active(&cpu_hotplug.reader_wq))
+               wake_up(&cpu_hotplug.reader_wq);
+
        cpu_hotplug.active_writer = NULL;
-       mutex_unlock(&cpu_hotplug.lock);
        cpuhp_lock_release();
 }
 
-- 
2.5.0
