When PREEMPT_RT_FULL does the spinlock -> rt_mutex substitution, the PI
chain code will (falsely) report a deadlock and BUG.

The problem is that we hold hb->lock (now an rt_mutex) while doing
task_blocks_on_rt_mutex() on the futex's pi_state::pi_mutex. When this
interleaves just right with futex_unlock_pi(), the PI chain walk is led
to believe we have an AB-BA deadlock:

  Task1 (holds rt_mutex,        Task2 (does FUTEX_LOCK_PI)
         does FUTEX_UNLOCK_PI)

                                lock hb->lock
                                lock rt_mutex (as per start_proxy)
  lock hb->lock

Which is a trivial AB-BA.

It is not an actual deadlock, because we won't be holding hb->lock by
the time we actually block on the rt_mutex, but the chainwalk code
doesn't know that.
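
For reference, the pre-patch ordering in futex_lock_pi() is roughly the
following (a simplified sketch, assuming the rt_mutex_wait_proxy_lock()
based flow from earlier in this series; timeout setup and error
handling omitted):

  /* hb->lock (q.lock_ptr) is already held from queueing the futex_q */
  rt_mutex_init_waiter(&rt_waiter);
  ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
  /*
   * The enqueue and the deadlock-detecting chainwalk above run while
   * hb->lock is still held, so hb->lock (an rt_mutex on RT) shows up
   * in our blocking chain.
   */
  spin_unlock(q.lock_ptr);
  ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
  /* the actual blocking only happens here, after hb->lock was dropped */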

To avoid this problem, do the same thing we do in futex_unlock_pi()
and drop hb->lock after acquiring wait_lock. This still fully
serializes against futex_unlock_pi(), since adding to the wait_list
does the very same lock dance, and removing it holds both locks.
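
Concretely, the lock dance on the futex_lock_pi() side becomes
(simplified; see the first kernel/futex.c hunk below):

  raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
  spin_unlock(q.lock_ptr);     /* drop hb->lock before enqueueing */
  ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
  raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

which matches the sequence futex_unlock_pi() already uses (a sketch,
per the existing code around the last kernel/futex.c hunk below):

  raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
  spin_unlock(&hb->lock);
  ret = wake_futex_pi(uaddr, uval, pi_state);     /* drops wait_lock */

Since both sides take wait_lock before letting go of hb->lock, there is
no point where neither lock is held, which is what provides the
serialization.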

Reported-by: Sebastian Andrzej Siewior <[email protected]>
Suggested-by: Thomas Gleixner <[email protected]>
Tested-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
 kernel/futex.c                  |   30 +++++++++++++++++-------
 kernel/locking/rtmutex.c        |   49 ++++++++++++++++++++++------------------
 kernel/locking/rtmutex_common.h |    3 ++
 3 files changed, 52 insertions(+), 30 deletions(-)

--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2654,20 +2654,33 @@ static int futex_lock_pi(u32 __user *uad
                goto no_block;
        }
 
+       rt_mutex_init_waiter(&rt_waiter);
+
        /*
-        * We must add ourselves to the rt_mutex waitlist while holding hb->lock
-        * such that the hb and rt_mutex wait lists match.
+        * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+        * hold it while doing rt_mutex_start_proxy(), because then it will
+        * include hb->lock in the blocking chain, even though we'll not in
+        * fact hold it while blocking. This will lead it to report -EDEADLK
+        * and BUG when futex_unlock_pi() interleaves with this.
+        *
+        * Therefore acquire wait_lock while holding hb->lock, but drop the
+        * latter before calling rt_mutex_start_proxy_lock(). This still fully
+        * serializes against futex_unlock_pi() as that does the exact same
+        * lock handoff sequence.
         */
-       rt_mutex_init_waiter(&rt_waiter);
-       ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+       raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+       spin_unlock(q.lock_ptr);
+       ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+       raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
        if (ret) {
                if (ret == 1)
                        ret = 0;
 
+               spin_lock(q.lock_ptr);
                goto no_block;
        }
 
-       spin_unlock(q.lock_ptr);
 
        if (unlikely(to))
                hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
@@ -2680,6 +2693,9 @@ static int futex_lock_pi(u32 __user *uad
         * first acquire the hb->lock before removing the lock from the
         * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
         * wait lists consistent.
+        *
+        * In particular; it is important that futex_unlock_pi() can not
+        * observe this inconsistency.
         */
        if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
                ret = 0;
@@ -2791,10 +2807,6 @@ static int futex_unlock_pi(u32 __user *u
 
                get_pi_state(pi_state);
                /*
-                * Since modifying the wait_list is done while holding both
-                * hb->lock and wait_lock, holding either is sufficient to
-                * observe it.
-                *
                 * By taking wait_lock while still holding hb->lock, we ensure
                 * there is no point where we hold neither; and therefore
                 * wake_futex_pi() must observe a state consistent with what we
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1669,31 +1669,14 @@ void rt_mutex_proxy_unlock(struct rt_mut
        rt_mutex_set_owner(lock, NULL);
 }
 
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock:              the rt_mutex to take
- * @waiter:            the pre-initialized rt_mutex_waiter
- * @task:              the task to prepare
- *
- * Returns:
- *  0 - task blocked on lock
- *  1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for FUTEX_REQUEUE_PI support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                              struct rt_mutex_waiter *waiter,
                              struct task_struct *task)
 {
        int ret;
 
-       raw_spin_lock_irq(&lock->wait_lock);
-
-       if (try_to_take_rt_mutex(lock, task, NULL)) {
-               raw_spin_unlock_irq(&lock->wait_lock);
+       if (try_to_take_rt_mutex(lock, task, NULL))
                return 1;
-       }
 
        /* We enforce deadlock detection for futexes */
        ret = task_blocks_on_rt_mutex(lock, waiter, task,
@@ -1712,12 +1695,36 @@ int rt_mutex_start_proxy_lock(struct rt_
        if (unlikely(ret))
                remove_waiter(lock, waiter);
 
-       raw_spin_unlock_irq(&lock->wait_lock);
-
        debug_rt_mutex_print_deadlock(waiter);
 
        return ret;
 }
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock:              the rt_mutex to take
+ * @waiter:            the pre-initialized rt_mutex_waiter
+ * @task:              the task to prepare
+ *
+ * Returns:
+ *  0 - task blocked on lock
+ *  1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                             struct rt_mutex_waiter *waiter,
+                             struct task_struct *task)
+{
+       int ret;
+
+       raw_spin_lock_irq(&lock->wait_lock);
+       ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+       raw_spin_unlock_irq(&lock->wait_lock);
+
+       return ret;
+}
 
 /**
  * rt_mutex_next_owner - return the next owner of the lock
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -104,6 +104,9 @@ extern void rt_mutex_init_proxy_locked(s
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
                                  struct task_struct *proxy_owner);
 extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                                    struct rt_mutex_waiter *waiter,
+                                    struct task_struct *task);
 extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                                     struct rt_mutex_waiter *waiter,
                                     struct task_struct *task);

