Ping. Could someone comment on this bug? On 2016/04/26 at 16:30, Xunlei Pang wrote: > A crash happened while I was playing with deadline PI rtmutex. > > BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 > IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30 > PGD 232a75067 PUD 230947067 PMD 0 > Oops: 0000 [#1] SMP > CPU: 1 PID: 10994 Comm: a.out Not tainted > > Call Trace: > [<ffffffff810cf8aa>] ? enqueue_task_dl+0x2a/0x320 > [<ffffffff810b658c>] enqueue_task+0x2c/0x80 > [<ffffffff810ba763>] activate_task+0x23/0x30 > [<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260 > [<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20 > [<ffffffff8164e783>] __schedule+0xd3/0x900 > [<ffffffff8164efd9>] schedule+0x29/0x70 > [<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0 > [<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190 > [<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60 > [<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390 > [<ffffffff810cfa15>] ? enqueue_task_dl+0x195/0x320 > [<ffffffff810d0bac>] ? prio_changed_dl+0x6c/0x90 > [<ffffffff810ed8b0>] do_futex+0x190/0x5b0 > [<ffffffff810edd50>] SyS_futex+0x80/0x180 > [<ffffffff8165a089>] system_call_fastpath+0x16/0x1b > RIP [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30 > > This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi() > are only protected by pi_lock when operating pi waiters, while > rt_mutex_get_top_task() will access them with rq lock held but > not holding pi_lock. > > In order to tackle it, we introduce new "pi_top_task" pointer > cached in task_struct, and add new rt_mutex_update_top_task() > to update its value; it can be called by rt_mutex_setprio() > which holds both owner's pi_lock and rq lock. Thus "pi_top_task" > can be safely accessed by enqueue_task_dl() under rq lock. 
> > One problem is when rt_mutex_adjust_prio()->...->rt_mutex_setprio(), > at that time rtmutex lock was released and owner was marked off, > this can cause the dereferenced "pi_top_task" to be a running one (as it > can be falsely woken up by others before rt_mutex_setprio() is > made to update "pi_top_task"). We solve this by directly calling > __rt_mutex_adjust_prio() in mark_wakeup_next_waiter(), which holds > pi_lock and rtmutex lock, and remove rt_mutex_adjust_prio(). Since > we have now moved the deboost point, in order to avoid current being > preempted due to deboost earlier before wake_up_q(), we also moved > preempt_disable() before unlocking rtmutex. > > Originally-From: Peter Zijlstra <pet...@infradead.org> > Signed-off-by: Xunlei Pang <xlp...@redhat.com> > --- > v3 -> v4: > Cache a task_struct pi_top_task pointer instead of rt_mutex_waiter. > > include/linux/init_task.h | 1 + > include/linux/sched.h | 2 ++ > include/linux/sched/rt.h | 1 + > kernel/fork.c | 1 + > kernel/locking/rtmutex.c | 65 > ++++++++++++++++++++--------------------- > kernel/sched/core.c | 2 ++ > 6 files changed, 34 insertions(+), 38 deletions(-) > > diff --git a/include/linux/init_task.h b/include/linux/init_task.h > index f2cb8d4..ca088b1 100644 > --- a/include/linux/init_task.h > +++ b/include/linux/init_task.h > @@ -162,6 +162,7 @@ extern struct task_group root_task_group; > #ifdef CONFIG_RT_MUTEXES > # define INIT_RT_MUTEXES(tsk) > \ > .pi_waiters = RB_ROOT, \ > + .pi_top_task = NULL, \ > .pi_waiters_leftmost = NULL, > #else > # define INIT_RT_MUTEXES(tsk) > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 45e848c..322c0ad 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1622,6 +1622,8 @@ struct task_struct { > /* PI waiters blocked on a rt_mutex held by this task */ > struct rb_root pi_waiters; > struct rb_node *pi_waiters_leftmost; > + /* Updated under owner's pi_lock and rq lock */ > + struct task_struct *pi_top_task; > /* Deadlock 
detection and priority inheritance handling */ > struct rt_mutex_waiter *pi_blocked_on; > #endif > diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h > index a30b172..60d0c47 100644 > --- a/include/linux/sched/rt.h > +++ b/include/linux/sched/rt.h > @@ -19,6 +19,7 @@ static inline int rt_task(struct task_struct *p) > extern int rt_mutex_getprio(struct task_struct *p); > extern void rt_mutex_setprio(struct task_struct *p, int prio); > extern int rt_mutex_get_effective_prio(struct task_struct *task, int > newprio); > +extern void rt_mutex_update_top_task(struct task_struct *p); > extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); > extern void rt_mutex_adjust_pi(struct task_struct *p); > static inline bool tsk_is_pi_blocked(struct task_struct *tsk) > diff --git a/kernel/fork.c b/kernel/fork.c > index a453546..cb01dfc 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -1225,6 +1225,7 @@ static void rt_mutex_init_task(struct task_struct *p) > #ifdef CONFIG_RT_MUTEXES > p->pi_waiters = RB_ROOT; > p->pi_waiters_leftmost = NULL; > + p->pi_top_task = NULL; > p->pi_blocked_on = NULL; > #endif > } > diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c > index d87f99e..969e436 100644 > --- a/kernel/locking/rtmutex.c > +++ b/kernel/locking/rtmutex.c > @@ -256,6 +256,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct > rt_mutex_waiter *waiter) > RB_CLEAR_NODE(&waiter->pi_tree_entry); > } > > +void rt_mutex_update_top_task(struct task_struct *p) > +{ > + if (!task_has_pi_waiters(p)) { > + p->pi_top_task = NULL; > + return; > + } > + > + p->pi_top_task = task_top_pi_waiter(p)->task; > +} > + > /* > * Calculate task priority from the waiter tree priority > * > @@ -273,10 +283,7 @@ int rt_mutex_getprio(struct task_struct *task) > > struct task_struct *rt_mutex_get_top_task(struct task_struct *task) > { > - if (likely(!task_has_pi_waiters(task))) > - return NULL; > - > - return task_top_pi_waiter(task)->task; > + return 
task->pi_top_task; > } > > /* > @@ -285,12 +292,12 @@ struct task_struct *rt_mutex_get_top_task(struct > task_struct *task) > */ > int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) > { > - if (!task_has_pi_waiters(task)) > + struct task_struct *top_task = rt_mutex_get_top_task(task); > + > + if (!top_task) > return newprio; > > - if (task_top_pi_waiter(task)->task->prio <= newprio) > - return task_top_pi_waiter(task)->task->prio; > - return newprio; > + return min(top_task->prio, newprio); > } > > /* > @@ -307,24 +314,6 @@ static void __rt_mutex_adjust_prio(struct task_struct > *task) > } > > /* > - * Adjust task priority (undo boosting). Called from the exit path of > - * rt_mutex_slowunlock() and rt_mutex_slowlock(). > - * > - * (Note: We do this outside of the protection of lock->wait_lock to > - * allow the lock to be taken while or before we readjust the priority > - * of task. We do not use the spin_xx_mutex() variants here as we are > - * outside of the debug path.) > - */ > -void rt_mutex_adjust_prio(struct task_struct *task) > -{ > - unsigned long flags; > - > - raw_spin_lock_irqsave(&task->pi_lock, flags); > - __rt_mutex_adjust_prio(task); > - raw_spin_unlock_irqrestore(&task->pi_lock, flags); > -} > - > -/* > * Deadlock detection is conditional: > * > * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted > @@ -987,6 +976,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head > *wake_q, > * lock->wait_lock. > */ > rt_mutex_dequeue_pi(current, waiter); > + __rt_mutex_adjust_prio(current); > > /* > * As we are waking up the top waiter, and the waiter stays > @@ -1325,6 +1315,16 @@ static bool __sched rt_mutex_slowunlock(struct > rt_mutex *lock, > */ > mark_wakeup_next_waiter(wake_q, lock); > > + /* > + * We should deboost before waking the top waiter task such that > + * we don't run two tasks with the 'same' priority. 
This however > + * can lead to prio-inversion if we would get preempted after > + * the deboost but before waking our high-prio task, hence the > + * preempt_disable before unlock. Pairs with preempt_enable() in > + * rt_mutex_postunlock(); > + */ > + preempt_disable(); > + > raw_spin_unlock_irqrestore(&lock->wait_lock, flags); > > /* check PI boosting */ > @@ -1400,20 +1400,9 @@ rt_mutex_fastunlock(struct rt_mutex *lock, > */ > void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost) > { > - /* > - * We should deboost before waking the top waiter task such that > - * we don't run two tasks with the 'same' priority. This however > - * can lead to prio-inversion if we would get preempted after > - * the deboost but before waking our high-prio task, hence the > - * preempt_disable. > - */ > - if (deboost) { > - preempt_disable(); > - rt_mutex_adjust_prio(current); > - } > - > wake_up_q(wake_q); > > + /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ > if (deboost) > preempt_enable(); > } > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 1159423..60f8166 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -3480,6 +3480,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) > goto out_unlock; > } > > + rt_mutex_update_top_task(p); > + > trace_sched_pi_setprio(p, prio); > oldprio = p->prio; >