Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-09-05 Thread Steven Rostedt

Sorry for taking so long to review. So many other things to do :-/

On Fri, 23 Aug 2013 14:26:39 +0800
Lai Jiangshan  wrote:

> [PATCH] rcu/rt_mutex: eliminate a kind of deadlock for rcu read site

"rcu read site"?

This is specific to boosting, thus boosting should be in the subject,
perhaps something like:

"Eliminate deadlock due to rcu boosting"

?

> 
> Currently, rtmutex's lock->wait_lock doesn't disable softirqs or irqs, so it
> will cause an rcu read-site deadlock when an rcu read-side critical section
> overlaps with any softirq-context/irq-context lock.
> 
> @L is a spinlock of softirq or irq context.
> 
> CPU1                                    cpu2(rcu boost)
> rcu_read_lock()                         rt_mutex_lock()
> <preemption and reschedule back>          raw_spin_lock(lock->wait_lock)
> spin_lock_XX(L)                         <interrupt and doing softirq or irq>
> rcu_read_unlock()                       do_softirq()
>   rcu_read_unlock_special()
>     rt_mutex_unlock()
>       raw_spin_lock(lock->wait_lock)      spin_lock_XX(L)  **DEADLOCK**
> 
> This patch fixes this kind of deadlock by removing rt_mutex_unlock() from
> rcu_read_unlock(); a new rt_mutex_rcu_deboost_unlock() is called instead.
> Thus rtmutex's lock->wait_lock will no longer be taken from rcu_read_unlock().
> 
> This patch does not eliminate all kinds of rcu-read-site deadlock:
> if @L is a scheduler lock, it will still deadlock, and we should apply Paul's
> rule in this case (avoid overlapping, or use preempt_disable()).
> 
> rt_mutex_rcu_deboost_unlock() requires that the @waiter is queued, so we
> can't directly call rt_mutex_lock(&mtx) in the rcu_boost thread;
> we split rt_mutex_lock(&mtx) into two steps, just like pi-futex.
> This results in an internal state in the rcu_boost thread and makes
> the rcu_boost thread a bit more complicated.
> 
> Thanks
> Lai
> 
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 5cd0f09..8830874 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -102,7 +102,7 @@ extern struct group_info init_groups;
>  
>  #ifdef CONFIG_RCU_BOOST
>  #define INIT_TASK_RCU_BOOST()
> \
> - .rcu_boost_mutex = NULL,
> + .rcu_boost_waiter = NULL,
>  #else
>  #define INIT_TASK_RCU_BOOST()
>  #endif
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index e9995eb..1eca99f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1078,7 +1078,7 @@ struct task_struct {
>   struct rcu_node *rcu_blocked_node;
>  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
>  #ifdef CONFIG_RCU_BOOST
> - struct rt_mutex *rcu_boost_mutex;
> + struct rt_mutex_waiter *rcu_boost_waiter;
>  #endif /* #ifdef CONFIG_RCU_BOOST */
>  
>  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
> @@ -1723,7 +1723,7 @@ static inline void rcu_copy_process(struct task_struct 
> *p)
>   p->rcu_blocked_node = NULL;
>  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
>  #ifdef CONFIG_RCU_BOOST
> - p->rcu_boost_mutex = NULL;
> + p->rcu_boost_waiter = NULL;
>  #endif /* #ifdef CONFIG_RCU_BOOST */
>   INIT_LIST_HEAD(&p->rcu_node_entry);
>  }
> diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> index 769e12e..d207ddd 100644
> --- a/kernel/rcutree_plugin.h
> +++ b/kernel/rcutree_plugin.h
> @@ -33,6 +33,7 @@
>  #define RCU_KTHREAD_PRIO 1
>  
>  #ifdef CONFIG_RCU_BOOST
> +#include "rtmutex_common.h"
>  #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
>  #else
>  #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
> @@ -340,7 +341,7 @@ void rcu_read_unlock_special(struct task_struct *t)
>   unsigned long flags;
>   struct list_head *np;
>  #ifdef CONFIG_RCU_BOOST
> - struct rt_mutex *rbmp = NULL;
> + struct rt_mutex_waiter *waiter = NULL;
>  #endif /* #ifdef CONFIG_RCU_BOOST */
>   struct rcu_node *rnp;
>   int special;
> @@ -397,10 +398,10 @@ void rcu_read_unlock_special(struct task_struct *t)
>  #ifdef CONFIG_RCU_BOOST
>   if (&t->rcu_node_entry == rnp->boost_tasks)
>   rnp->boost_tasks = np;
> - /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
> - if (t->rcu_boost_mutex) {
> - rbmp = t->rcu_boost_mutex;
> - t->rcu_boost_mutex = NULL;
> + /* Snapshot/clear ->rcu_boost_waiter with rcu_node lock held. */
> + if (t->rcu_boost_waiter) {
> + waiter = t->rcu_boost_waiter;
> + t->rcu_boost_waiter = NULL;
>   }
>  #endif /* #ifdef CONFIG_RCU_BOOST */
>  
> @@ -426,8 +427,8 @@ void rcu_read_unlock_special(struct task_struct *t)
>  
>  #ifdef CONFIG_RCU_BOOST
>   /* Unboost if we were boosted. */
> - if (rbmp)
> - rt_mutex_unlock(rbmp);
> + if (waiter)
> + rt_mutex_rcu_deboost_unlock(t, waiter);
>  #endif /* #ifdef CONFIG_RCU_BOOST */
>  
>   /*
> @@ -1129,9 +1130,6 @@ void exit_rcu(void)
>  #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU 

Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-29 Thread Paul E. McKenney
On Mon, Aug 26, 2013 at 10:39:32AM +0800, Lai Jiangshan wrote:
> On 08/26/2013 01:43 AM, Paul E. McKenney wrote:
> > On Sun, Aug 25, 2013 at 11:19:37PM +0800, Lai Jiangshan wrote:
> >> Hi, Steven
> >>
> >> Any comments about this patch?
> > 
> > For whatever it is worth, it ran without incident for two hours worth
> > of rcutorture on my P5 test (boosting but no CPU hotplug).
> > 
> > Lai, do you have a specific test for this patch?  
> 
> Also rcutorture.
> (A special module is added to ensure all paths of my code are covered.)

OK, good!  Could you please send along your rcutorture changes as well?

Also, it would be good to have Steven Rostedt's Acked-by or Reviewed-by.

> > Your deadlock
> > scenario looks plausible, but is apparently not occurring in the
> > mainline kernel.
> 
> Yes, you can leave this possible bug until the real problem happens
> or just disallow overlapping.
> I can write some debug code for it which allows us to find out
> the problems earlier.
> 
> I guess this is a useful usage pattern of rcu:
> 
> again:
>   rcu_read_lock();
>   obj = read_dereference(ptr);
>   spin_lock_XX(obj->lock);
>   if (obj is invalid) {
>   spin_unlock_XX(obj->lock);
>   rcu_read_unlock();
>   goto again;
>   }
>   rcu_read_unlock();
>   # use obj
>   spin_unlock_XX(obj->lock);
> 
> If we encourage this pattern, we should fix all the related problems.

Given that I have had to ask people to move away from this pattern,
it would be good to allow it to work.  The transformation to currently
permitted usage is as follows, for whatever it is worth:

again:
disable_XX();
rcu_read_lock();
obj = read_dereference(ptr);
spin_lock(obj->lock);
if (obj is invalid) {
spin_unlock_XX(obj->lock);
rcu_read_unlock();
goto again;
}
rcu_read_unlock();
# use obj
spin_unlock_XX(obj->lock);

In mainline, this prevents preemption within the RCU read-side critical
section, avoiding the problem.
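
For a concrete reading of the above, here is one way the transformed pattern
could look when obj->lock is a softirq-context lock, so that disable_XX()
becomes local_bh_disable().  This is only an illustrative sketch of the
pattern; the struct, field, and variable names are invented for the example:

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct foo {
	spinlock_t lock;		/* also taken from softirq context */
	bool dead;			/* the "obj is invalid" condition */
};

struct foo __rcu *foo_ptr;		/* assume it is never NULL here */

void foo_use(void)
{
	struct foo *obj;

again:
	local_bh_disable();		/* the disable_XX() step */
	rcu_read_lock();
	obj = rcu_dereference(foo_ptr);
	spin_lock(&obj->lock);		/* plain lock: bh is already off */
	if (obj->dead) {
		spin_unlock(&obj->lock);
		rcu_read_unlock();
		local_bh_enable();
		goto again;
	}
	rcu_read_unlock();		/* reader was never preemptible */
	/* use obj, still protected by obj->lock */
	spin_unlock(&obj->lock);
	local_bh_enable();
}

Because bh (and therefore preemption) is disabled across the whole read-side
critical section, the reader can never be priority-boosted, so this
rcu_read_unlock() never has any reason to touch rtmutex's ->wait_lock.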

That said, if we allow your original pattern, that would be even better!

Thanx, Paul

> Thanks,
> Lai
> 
> > 
> > Thanx, Paul
> > 
> >> Thanks,
> >> Lai
> >>
> >>
> >> On Fri, Aug 23, 2013 at 2:26 PM, Lai Jiangshan  
> >> wrote:
> >>
> >>> [PATCH] rcu/rt_mutex: eliminate a kind of deadlock for rcu read site
> >>>
> >>> Currently, rtmutex's lock->wait_lock doesn't disable softirqs or irqs, so it
> >>> will cause an rcu read-site deadlock when an rcu read-side critical section
> >>> overlaps with any softirq-context/irq-context lock.
> >>>
> >>> @L is a spinlock of softirq or irq context.
> >>>
> >>> CPU1                                    cpu2(rcu boost)
> >>> rcu_read_lock()                         rt_mutex_lock()
> >>> <preemption and reschedule back>          raw_spin_lock(lock->wait_lock)
> >>> spin_lock_XX(L)                         <interrupt and doing softirq or irq>
> >>> rcu_read_unlock()                       do_softirq()
> >>>   rcu_read_unlock_special()
> >>>     rt_mutex_unlock()
> >>>       raw_spin_lock(lock->wait_lock)      spin_lock_XX(L)  **DEADLOCK**
> >>>
> >>> This patch fixes this kind of deadlock by removing rt_mutex_unlock() from
> >>> rcu_read_unlock(); a new rt_mutex_rcu_deboost_unlock() is called instead.
> >>> Thus rtmutex's lock->wait_lock will no longer be taken from rcu_read_unlock().
> >>>
> >>> This patch does not eliminate all kinds of rcu-read-site deadlock:
> >>> if @L is a scheduler lock, it will still deadlock, and we should apply Paul's
> >>> rule in this case (avoid overlapping, or use preempt_disable()).
> >>>
> >>> rt_mutex_rcu_deboost_unlock() requires that the @waiter is queued, so we
> >>> can't directly call rt_mutex_lock(&mtx) in the rcu_boost thread;
> >>> we split rt_mutex_lock(&mtx) into two steps, just like pi-futex.
> >>> This results in an internal state in the rcu_boost thread and makes
> >>> the rcu_boost thread a bit more complicated.
> >>>
> >>> Thanks
> >>> Lai
> >>>
> >>> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> >>> index 5cd0f09..8830874 100644
> >>> --- a/include/linux/init_task.h
> >>> +++ b/include/linux/init_task.h
> >>> @@ -102,7 +102,7 @@ extern struct group_info init_groups;
> >>>
> >>>  #ifdef CONFIG_RCU_BOOST
> >>>  #define INIT_TASK_RCU_BOOST()  \
> >>> -   .rcu_boost_mutex = NULL,
> >>> +   .rcu_boost_waiter = NULL,
> >>>  #else
> >>>  #define INIT_TASK_RCU_BOOST()
> >>>  #endif
> >>> diff --git a/include/linux/sched.h b/include/linux/sched.h
> >>> index e9995eb..1eca99f 100644
> >>> --- a/include/linux/sched.h
> >>> +++ b/include/linux/sched.h
> >>> @@ -1078,7 +1078,7 @@ struct task_struct {
> >>> struct rcu_node *rcu_blocked_node;
> >>>  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
> >>>  #ifdef CONFIG_RCU_BOOST
> >>> -   struct rt_mutex *rcu_boost_mutex;
> >>> +   struct rt_mutex_waiter *rcu_boost_waiter;

Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-25 Thread Lai Jiangshan
On 08/26/2013 01:43 AM, Paul E. McKenney wrote:
> On Sun, Aug 25, 2013 at 11:19:37PM +0800, Lai Jiangshan wrote:
>> Hi, Steven
>>
>> Any comments about this patch?
> 
> For whatever it is worth, it ran without incident for two hours worth
> of rcutorture on my P5 test (boosting but no CPU hotplug).
> 
> Lai, do you have a specific test for this patch?  

Also rcutorture.
(A special module is added to ensure all paths of my code are covered.)

> Your deadlock
> scenario looks plausible, but is apparently not occurring in the
> mainline kernel.

Yes, you can leave this possible bug until the real problem happens
or just disallow overlapping.
I can write some debug code for it which allows us to find out
the problems earlier.

I guess this is a useful usage pattern of rcu:

again:
rcu_read_lock();
obj = read_dereference(ptr);
spin_lock_XX(obj->lock);
if (obj is invalid) {
spin_unlock_XX(obj->lock);
rcu_read_unlock();
goto again;
}
rcu_read_unlock();
# use obj
spin_unlock_XX(obj->lock);

If we encourage this pattern, we should fix all the related problems.
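
To make the pattern concrete, suppose obj->lock is a lock that is also taken
from softirq context; that is exactly the overlap discussed in the patch,
since rcu_read_unlock() runs while the lock is held.  A minimal sketch (all
struct, field, and variable names are invented for the example):

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct foo {
	spinlock_t lock;		/* also taken from softirq context */
	bool dead;			/* the "obj is invalid" condition */
};

struct foo __rcu *foo_ptr;		/* assume it is never NULL here */

void foo_use(void)
{
	struct foo *obj;

again:
	rcu_read_lock();		/* reader may be preempted and boosted */
	obj = rcu_dereference(foo_ptr);
	spin_lock_bh(&obj->lock);	/* the spin_lock_XX() step */
	if (obj->dead) {
		spin_unlock_bh(&obj->lock);
		rcu_read_unlock();
		goto again;
	}
	rcu_read_unlock();		/* may enter rcu_read_unlock_special() */
	/* use obj, protected only by obj->lock from here on */
	spin_unlock_bh(&obj->lock);
}

If the reader was preempted and boosted inside the read-side section, the
rcu_read_unlock() above ends up in rcu_read_unlock_special() ->
rt_mutex_unlock() -> raw_spin_lock(lock->wait_lock) while obj->lock is held,
which is the CPU1 column of the deadlock diagram in the patch description.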

Thanks,
Lai

> 
>   Thanx, Paul
> 
>> Thanks,
>> Lai
>>
>>
>> On Fri, Aug 23, 2013 at 2:26 PM, Lai Jiangshan  wrote:
>>
>>> [PATCH] rcu/rt_mutex: eliminate a kind of deadlock for rcu read site
>>>
>>> Currently, rtmutex's lock->wait_lock doesn't disable softirqs or irqs, so it
>>> will cause an rcu read-site deadlock when an rcu read-side critical section
>>> overlaps with any softirq-context/irq-context lock.
>>>
>>> @L is a spinlock of softirq or irq context.
>>>
>>> CPU1                                    cpu2(rcu boost)
>>> rcu_read_lock()                         rt_mutex_lock()
>>> <preemption and reschedule back>          raw_spin_lock(lock->wait_lock)
>>> spin_lock_XX(L)                         <interrupt and doing softirq or irq>
>>> rcu_read_unlock()                       do_softirq()
>>>   rcu_read_unlock_special()
>>>     rt_mutex_unlock()
>>>       raw_spin_lock(lock->wait_lock)      spin_lock_XX(L)  **DEADLOCK**
>>>
>>> This patch fixes this kind of deadlock by removing rt_mutex_unlock() from
>>> rcu_read_unlock(); a new rt_mutex_rcu_deboost_unlock() is called instead.
>>> Thus rtmutex's lock->wait_lock will no longer be taken from rcu_read_unlock().
>>>
>>> This patch does not eliminate all kinds of rcu-read-site deadlock:
>>> if @L is a scheduler lock, it will still deadlock, and we should apply Paul's
>>> rule in this case (avoid overlapping, or use preempt_disable()).
>>>
>>> rt_mutex_rcu_deboost_unlock() requires that the @waiter is queued, so we
>>> can't directly call rt_mutex_lock(&mtx) in the rcu_boost thread;
>>> we split rt_mutex_lock(&mtx) into two steps, just like pi-futex.
>>> This results in an internal state in the rcu_boost thread and makes
>>> the rcu_boost thread a bit more complicated.
>>>
>>> Thanks
>>> Lai
>>>
>>> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
>>> index 5cd0f09..8830874 100644
>>> --- a/include/linux/init_task.h
>>> +++ b/include/linux/init_task.h
>>> @@ -102,7 +102,7 @@ extern struct group_info init_groups;
>>>
>>>  #ifdef CONFIG_RCU_BOOST
>>>  #define INIT_TASK_RCU_BOOST()  \
>>> -   .rcu_boost_mutex = NULL,
>>> +   .rcu_boost_waiter = NULL,
>>>  #else
>>>  #define INIT_TASK_RCU_BOOST()
>>>  #endif
>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>> index e9995eb..1eca99f 100644
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -1078,7 +1078,7 @@ struct task_struct {
>>> struct rcu_node *rcu_blocked_node;
>>>  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
>>>  #ifdef CONFIG_RCU_BOOST
>>> -   struct rt_mutex *rcu_boost_mutex;
>>> +   struct rt_mutex_waiter *rcu_boost_waiter;
>>>  #endif /* #ifdef CONFIG_RCU_BOOST */
>>>
>>>  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
>>> @@ -1723,7 +1723,7 @@ static inline void rcu_copy_process(struct
>>> task_struct *p)
>>> p->rcu_blocked_node = NULL;
>>>  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
>>>  #ifdef CONFIG_RCU_BOOST
>>> -   p->rcu_boost_mutex = NULL;
>>> +   p->rcu_boost_waiter = NULL;
>>>  #endif /* #ifdef CONFIG_RCU_BOOST */
>>> INIT_LIST_HEAD(&p->rcu_node_entry);
>>>  }
>>> diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
>>> index 769e12e..d207ddd 100644
>>> --- a/kernel/rcutree_plugin.h
>>> +++ b/kernel/rcutree_plugin.h
>>> @@ -33,6 +33,7 @@
>>>  #define RCU_KTHREAD_PRIO 1
>>>
>>>  #ifdef CONFIG_RCU_BOOST
>>> +#include "rtmutex_common.h"
>>>  #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
>>>  #else
>>>  #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
>>> @@ -340,7 +341,7 @@ void rcu_read_unlock_special(struct task_struct *t)
>>> unsigned long flags;
>>> struct list_head *np;
>>>  #ifdef CONFIG_RCU_BOOST
>>> -   struct rt_mutex *rbmp = NULL;
>>> +   struct rt_mutex_waiter *waiter = NULL;
>>>  #endif /* #ifdef CONFIG_RCU_BOOST */
>>> 

Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-25 Thread Paul E. McKenney
On Sun, Aug 25, 2013 at 11:19:37PM +0800, Lai Jiangshan wrote:
> Hi, Steven
> 
> Any comments about this patch?

For whatever it is worth, it ran without incident for two hours worth
of rcutorture on my P5 test (boosting but no CPU hotplug).

Lai, do you have a specific test for this patch?  Your deadlock
scenario looks plausible, but is apparently not occurring in the
mainline kernel.

Thanx, Paul

> Thanks,
> Lai
> 
> 
> On Fri, Aug 23, 2013 at 2:26 PM, Lai Jiangshan  wrote:
> 
> > [PATCH] rcu/rt_mutex: eliminate a kind of deadlock for rcu read site
> >
> > Currently, rtmutex's lock->wait_lock doesn't disable softirqs or irqs, so it
> > will cause an rcu read-site deadlock when an rcu read-side critical section
> > overlaps with any softirq-context/irq-context lock.
> >
> > @L is a spinlock of softirq or irq context.
> >
> > CPU1                                    cpu2(rcu boost)
> > rcu_read_lock()                         rt_mutex_lock()
> > <preemption and reschedule back>          raw_spin_lock(lock->wait_lock)
> > spin_lock_XX(L)                         <interrupt and doing softirq or irq>
> > rcu_read_unlock()                       do_softirq()
> >   rcu_read_unlock_special()
> >     rt_mutex_unlock()
> >       raw_spin_lock(lock->wait_lock)      spin_lock_XX(L)  **DEADLOCK**
> >
> > This patch fixes this kind of deadlock by removing rt_mutex_unlock() from
> > rcu_read_unlock(); a new rt_mutex_rcu_deboost_unlock() is called instead.
> > Thus rtmutex's lock->wait_lock will no longer be taken from rcu_read_unlock().
> >
> > This patch does not eliminate all kinds of rcu-read-site deadlock:
> > if @L is a scheduler lock, it will still deadlock, and we should apply Paul's
> > rule in this case (avoid overlapping, or use preempt_disable()).
> >
> > rt_mutex_rcu_deboost_unlock() requires that the @waiter is queued, so we
> > can't directly call rt_mutex_lock(&mtx) in the rcu_boost thread;
> > we split rt_mutex_lock(&mtx) into two steps, just like pi-futex.
> > This results in an internal state in the rcu_boost thread and makes
> > the rcu_boost thread a bit more complicated.
> >
> > Thanks
> > Lai
> >
> > diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> > index 5cd0f09..8830874 100644
> > --- a/include/linux/init_task.h
> > +++ b/include/linux/init_task.h
> > @@ -102,7 +102,7 @@ extern struct group_info init_groups;
> >
> >  #ifdef CONFIG_RCU_BOOST
> >  #define INIT_TASK_RCU_BOOST()  \
> > -   .rcu_boost_mutex = NULL,
> > +   .rcu_boost_waiter = NULL,
> >  #else
> >  #define INIT_TASK_RCU_BOOST()
> >  #endif
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index e9995eb..1eca99f 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1078,7 +1078,7 @@ struct task_struct {
> > struct rcu_node *rcu_blocked_node;
> >  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
> >  #ifdef CONFIG_RCU_BOOST
> > -   struct rt_mutex *rcu_boost_mutex;
> > +   struct rt_mutex_waiter *rcu_boost_waiter;
> >  #endif /* #ifdef CONFIG_RCU_BOOST */
> >
> >  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
> > @@ -1723,7 +1723,7 @@ static inline void rcu_copy_process(struct
> > task_struct *p)
> > p->rcu_blocked_node = NULL;
> >  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
> >  #ifdef CONFIG_RCU_BOOST
> > -   p->rcu_boost_mutex = NULL;
> > +   p->rcu_boost_waiter = NULL;
> >  #endif /* #ifdef CONFIG_RCU_BOOST */
> > INIT_LIST_HEAD(&p->rcu_node_entry);
> >  }
> > diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> > index 769e12e..d207ddd 100644
> > --- a/kernel/rcutree_plugin.h
> > +++ b/kernel/rcutree_plugin.h
> > @@ -33,6 +33,7 @@
> >  #define RCU_KTHREAD_PRIO 1
> >
> >  #ifdef CONFIG_RCU_BOOST
> > +#include "rtmutex_common.h"
> >  #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
> >  #else
> >  #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
> > @@ -340,7 +341,7 @@ void rcu_read_unlock_special(struct task_struct *t)
> > unsigned long flags;
> > struct list_head *np;
> >  #ifdef CONFIG_RCU_BOOST
> > -   struct rt_mutex *rbmp = NULL;
> > +   struct rt_mutex_waiter *waiter = NULL;
> >  #endif /* #ifdef CONFIG_RCU_BOOST */
> > struct rcu_node *rnp;
> > int special;
> > @@ -397,10 +398,10 @@ void rcu_read_unlock_special(struct task_struct *t)
> >  #ifdef CONFIG_RCU_BOOST
> > if (&t->rcu_node_entry == rnp->boost_tasks)
> > rnp->boost_tasks = np;
> > -   /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock
> > held. */
> > -   if (t->rcu_boost_mutex) {
> > -   rbmp = t->rcu_boost_mutex;
> > -   t->rcu_boost_mutex = NULL;
> > +   /* Snapshot/clear ->rcu_boost_waiter with rcu_node lock
> > held. */
> > +   if (t->rcu_boost_waiter) {
> > +   waiter = t->rcu_boost_waiter;
> > +   t->rcu_boost_waiter = NULL;
> > }
> >  #endif /* #ifdef 

Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-23 Thread Lai Jiangshan
[PATCH] rcu/rt_mutex: eliminate a kind of deadlock for rcu read site

Currently, rtmutex's lock->wait_lock doesn't disable softirqs or irqs, so it
will cause an rcu read-site deadlock when an rcu read-side critical section
overlaps with any softirq-context/irq-context lock.

@L is a spinlock of softirq or irq context.

CPU1                                    cpu2(rcu boost)
rcu_read_lock()                         rt_mutex_lock()
<preemption and reschedule back>          raw_spin_lock(lock->wait_lock)
spin_lock_XX(L)                         <interrupt and doing softirq or irq>
rcu_read_unlock()                       do_softirq()
  rcu_read_unlock_special()
    rt_mutex_unlock()
      raw_spin_lock(lock->wait_lock)      spin_lock_XX(L)  **DEADLOCK**

This patch fixes this kind of deadlock by removing rt_mutex_unlock() from
rcu_read_unlock(); a new rt_mutex_rcu_deboost_unlock() is called instead.
Thus rtmutex's lock->wait_lock will no longer be taken from rcu_read_unlock().

This patch does not eliminate all kinds of rcu-read-site deadlock:
if @L is a scheduler lock, it will still deadlock, and we should apply Paul's
rule in this case (avoid overlapping, or use preempt_disable()).

rt_mutex_rcu_deboost_unlock() requires that the @waiter is queued, so we
can't directly call rt_mutex_lock(&mtx) in the rcu_boost thread;
we split rt_mutex_lock(&mtx) into two steps, just like pi-futex.
This results in an internal state in the rcu_boost thread and makes
the rcu_boost thread a bit more complicated.

Thanks
Lai

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 5cd0f09..8830874 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -102,7 +102,7 @@ extern struct group_info init_groups;
 
 #ifdef CONFIG_RCU_BOOST
 #define INIT_TASK_RCU_BOOST()  \
-   .rcu_boost_mutex = NULL,
+   .rcu_boost_waiter = NULL,
 #else
 #define INIT_TASK_RCU_BOOST()
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e9995eb..1eca99f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1078,7 +1078,7 @@ struct task_struct {
struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 #ifdef CONFIG_RCU_BOOST
-   struct rt_mutex *rcu_boost_mutex;
+   struct rt_mutex_waiter *rcu_boost_waiter;
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -1723,7 +1723,7 @@ static inline void rcu_copy_process(struct task_struct *p)
p->rcu_blocked_node = NULL;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 #ifdef CONFIG_RCU_BOOST
-   p->rcu_boost_mutex = NULL;
+   p->rcu_boost_waiter = NULL;
 #endif /* #ifdef CONFIG_RCU_BOOST */
INIT_LIST_HEAD(&p->rcu_node_entry);
 }
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 769e12e..d207ddd 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -33,6 +33,7 @@
 #define RCU_KTHREAD_PRIO 1
 
 #ifdef CONFIG_RCU_BOOST
+#include "rtmutex_common.h"
 #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
 #else
 #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
@@ -340,7 +341,7 @@ void rcu_read_unlock_special(struct task_struct *t)
unsigned long flags;
struct list_head *np;
 #ifdef CONFIG_RCU_BOOST
-   struct rt_mutex *rbmp = NULL;
+   struct rt_mutex_waiter *waiter = NULL;
 #endif /* #ifdef CONFIG_RCU_BOOST */
struct rcu_node *rnp;
int special;
@@ -397,10 +398,10 @@ void rcu_read_unlock_special(struct task_struct *t)
 #ifdef CONFIG_RCU_BOOST
if (&t->rcu_node_entry == rnp->boost_tasks)
rnp->boost_tasks = np;
-   /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
-   if (t->rcu_boost_mutex) {
-   rbmp = t->rcu_boost_mutex;
-   t->rcu_boost_mutex = NULL;
+   /* Snapshot/clear ->rcu_boost_waiter with rcu_node lock held. */
+   if (t->rcu_boost_waiter) {
+   waiter = t->rcu_boost_waiter;
+   t->rcu_boost_waiter = NULL;
}
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
@@ -426,8 +427,8 @@ void rcu_read_unlock_special(struct task_struct *t)
 
 #ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
-   if (rbmp)
-   rt_mutex_unlock(rbmp);
+   if (waiter)
+   rt_mutex_rcu_deboost_unlock(t, waiter);
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
/*
@@ -1129,9 +1130,6 @@ void exit_rcu(void)
 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 
 #ifdef CONFIG_RCU_BOOST
-
-#include "rtmutex_common.h"
-
 #ifdef CONFIG_RCU_TRACE
 
 static void rcu_initiate_boost_trace(struct rcu_node *rnp)
@@ -1181,14 +1179,15 @@ static int rcu_boost(struct rcu_node *rnp)
 {
unsigned long flags;
struct rt_mutex mtx;
+   struct rt_mutex_waiter rcu_boost_waiter;
struct task_struct *t;
struct list_head *tb;
+   int ret;
 
if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
  

Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-22 Thread Steven Rostedt
On Thu, 22 Aug 2013 10:34:31 -0400
Steven Rostedt  wrote:

> On Thu, 22 Aug 2013 22:23:09 +0800
> Lai Jiangshan  wrote:
> 
> 
> > > By making it an irq-safe lock, we need to disable interrupts every time
> > > it is taken, which means the entire pi-chain walk in
> > > rt_mutex_adjust_prio_chain() will pretty much be with interrupts
> > > disabled.
> > 
> > 
> > I didn't catch your meaning.
> > rt_mutex_adjust_prio_chain() is called without wait_lock held.
> > The current critical sections of wait_lock are really short.
> > 
> 
> There is quite a bit of overhead to enable and disable interrupts. If
> we are doing this in a very fast path, it's going to slow down -rt even
> more.

Looking at the rt_mutex_adjust_prio_chain(), it's not that bad because
the pi_lock needs irqs disabled too, and we would just extend that
section.

But it still extends irqs off.

But in -rt, it is used in the rt_spin_lock_slowlock() where it can exit
out of the code without ever having to disable interrupts.

> 
> It would be really nice to avoid making wait_lock be irq safe.

I still would like to exhaust other options before just making
wait_lock irq safe.

-- Steve


Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-22 Thread Steven Rostedt
On Thu, 22 Aug 2013 22:23:09 +0800
Lai Jiangshan  wrote:


> > By making it an irq-safe lock, we need to disable interrupts every time
> > it is taken, which means the entire pi-chain walk in
> > rt_mutex_adjust_prio_chain() will pretty much be with interrupts
> > disabled.
> 
> 
> I didn't catch your meaning.
> rt_mutex_adjust_prio_chain() is called without wait_lock held.
> The current critical sections of wait_lock are really short.
> 

There is quite a bit of overhead to enable and disable interrupts. If
we are doing this in a very fast path, it's going to slow down -rt even
more.

It would be really nice to avoid making wait_lock be irq safe.

-- Steve


Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-21 Thread Paul E. McKenney
On Wed, Aug 21, 2013 at 11:25:55AM +0800, Lai Jiangshan wrote:
> On 08/21/2013 11:17 AM, Paul E. McKenney wrote:
> > On Sat, Aug 10, 2013 at 08:07:15AM -0700, Paul E. McKenney wrote:
> >> On Sat, Aug 10, 2013 at 11:43:59AM +0800, Lai Jiangshan wrote:
> > 
> > [ . . . ]
> > 
> >>> So I have to narrow the range of suspect locks. Two choices:
> >>> A) don't call rt_mutex_unlock() from rcu_read_unlock(), only call it
> >>>from rcu_preempt_not_context_switch(). we need to rework these
> >>>two functions and it will add complexity to RCU, and it also still
> >>>adds some probability of deferring.
> >>
> >> One advantage of bh-disable locks is that enabling bh checks
> >> TIF_NEED_RESCHED, so that there is no deferring beyond that
> >> needed by bh disable.  The same of course applies to preempt_disable().
> >>
> >> So one approach is to defer when rcu_read_unlock_special() is entered
> >> with either preemption or bh disabled.  Your current set_need_resched()
> >> trick would work fine in this case.  Unfortunately, re-enabling interrupts
> >> does -not- check TIF_NEED_RESCHED, which is why we have latency problems
> >> in that case.  (Hence my earlier question about making self-IPI safe
> >> on all arches, which would result in an interrupt as soon as interrupts
> >> were re-enabled.)
> >>
> >> Another possibility is to defer only when preemption or bh are disabled
> >> on entry to rcu_read_unlock_special(), but to retain the current
> >> (admittedly ugly) nesting rules for the scheduler locks.
> > 
> > Would you be willing to do a patch that deferred rt_mutex_unlock() in
> > the preempt/bh cases?  This of course does not solve the irq-disable
> > case, but it should at least narrow the problem to the scheduler locks.
> > 
> > Not a big hurry, given the testing required, this is 3.13 or 3.14 material,
> > I think.
> > 
> > If you are busy, no problem, I can do it, just figured you have priority
> > if you want it.
> 
> I'm writing a special rt_mutex_unlock() for rcu deboost only.
> I hope Steven accepts it.

That would be very cool, though if I understand the requirements,
especially for -rt, very challenging.  Looking forward to seeing it!

Thanx, Paul



Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-20 Thread Lai Jiangshan
On 08/21/2013 11:17 AM, Paul E. McKenney wrote:
> On Sat, Aug 10, 2013 at 08:07:15AM -0700, Paul E. McKenney wrote:
>> On Sat, Aug 10, 2013 at 11:43:59AM +0800, Lai Jiangshan wrote:
> 
> [ . . . ]
> 
>>> So I have to narrow the range of suspect locks. Two choices:
>>> A) don't call rt_mutex_unlock() from rcu_read_unlock(), only call it
>>>from rcu_preempt_not_context_switch(). we need to rework these
>>>two functions and it will add complexity to RCU, and it also still
>>>adds some probability of deferring.
>>
>> One advantage of bh-disable locks is that enabling bh checks
>> TIF_NEED_RESCHED, so that there is no deferring beyond that
>> needed by bh disable.  The same of course applies to preempt_disable().
>>
>> So one approach is to defer when rcu_read_unlock_special() is entered
>> with either preemption or bh disabled.  Your current set_need_resched()
>> trick would work fine in this case.  Unfortunately, re-enabling interrupts
>> does -not- check TIF_NEED_RESCHED, which is why we have latency problems
>> in that case.  (Hence my earlier question about making self-IPI safe
>> on all arches, which would result in an interrupt as soon as interrupts
>> were re-enabled.)
>>
>> Another possibility is to defer only when preemption or bh are disabled
>> on entry to rcu_read_unlock_special(), but to retain the current
>> (admittedly ugly) nesting rules for the scheduler locks.
> 
> Would you be willing to do a patch that deferred rt_mutex_unlock() in
> the preempt/bh cases?  This of course does not solve the irq-disable
> case, but it should at least narrow the problem to the scheduler locks.
> 
> Not a big hurry, given the testing required, this is 3.13 or 3.14 material,
> I think.
> 
> If you are busy, no problem, I can do it, just figured you have priority
> if you want it.
> 
>   


I'm writing a special rt_mutex_unlock() for rcu deboost only.
I hope Steven accepts it.

Thanks,
Lai


Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-20 Thread Paul E. McKenney
On Sat, Aug 10, 2013 at 08:07:15AM -0700, Paul E. McKenney wrote:
> On Sat, Aug 10, 2013 at 11:43:59AM +0800, Lai Jiangshan wrote:

[ . . . ]

> > So I have to narrow the range of suspect locks. Two choices:
> > A) don't call rt_mutex_unlock() from rcu_read_unlock(), only call it
> >from rcu_preempt_not_context_switch(). we need to rework these
> >two functions and it will add complexity to RCU, and it also still
> >adds some probability of deferring.
> 
> One advantage of bh-disable locks is that enabling bh checks
> TIF_NEED_RESCHED, so that there is no deferring beyond that
> needed by bh disable.  The same of course applies to preempt_disable().
> 
> So one approach is to defer when rcu_read_unlock_special() is entered
> with either preemption or bh disabled.  Your current set_need_resched()
> trick would work fine in this case.  Unfortunately, re-enabling interrupts
> does -not- check TIF_NEED_RESCHED, which is why we have latency problems
> in that case.  (Hence my earlier question about making self-IPI safe
> on all arches, which would result in an interrupt as soon as interrupts
> were re-enabled.)
> 
> Another possibility is to defer only when preemption or bh are disabled
> on entry to rcu_read_unlock_special(), but to retain the current
> (admittedly ugly) nesting rules for the scheduler locks.

Would you be willing to do a patch that deferred rt_mutex_unlock() in
the preempt/bh cases?  This of course does not solve the irq-disable
case, but it should at least narrow the problem to the scheduler locks.

Not a big hurry, given the testing required, this is 3.13 or 3.14 material,
I think.

If you are busy, no problem, I can do it, just figured you have priority
if you want it.

Thanx, Paul

> > B) change rtmutex's lock->wait_lock to irqs-disabled.
> 
> I have to defer to Steven on this one.
> 
>   Thanx, Paul
> 
> > 4) In the view of rtmutex, I think it will be better if ->wait_lock is 
> > irqs-disabled.
> >A) like trylock of mutex/rw_sem, we may call rt_mutex_trylock() in irq 
> > in future.
> >B) the critical section of ->wait_lock is short,
> >   making it irqs-disabled doesn't hurt responsiveness/latency.
> >C) almost all time of the critical section of ->wait_lock is 
> > irqs-disabled
> >   (due to task->pi_lock), I think converting whole time of the critical 
> > section
> >   of ->wait_lock to irqs-disabled is OK.
> > 
> > So I hope you change rtmutex's lock->wait_lock.
> > 
> > Any feedback from anyone is welcome.
> > 
> > Thanks,
> > Lai
> > 
> > On 08/09/2013 04:40 AM, Paul E. McKenney wrote:
> > > On Wed, Aug 07, 2013 at 06:25:01PM +0800, Lai Jiangshan wrote:
> > >> Background)
> > >>
> > >> Although all articles declare that the rcu read site is deadlock-immune,
> > >> it is not true for rcu-preempt: it will deadlock if the rcu read site
> > >> overlaps with a scheduler lock.
> > >>
> > >> ec433f0c, 10f39bb1 and 016a8d5b just partially solve it. But the rcu read
> > >> site is still not deadlock-immune. And the problem described in 016a8d5b
> > >> still exists (rcu_read_unlock_special() calls wake_up).
> > >>
> > >> Aim)
> > >>
> > >> We want to fix the problem for good: we want to keep the rcu read site
> > >> deadlock-immune, as the books say.
> > >>
> > >> How)
> > >>
> > >> The problem is solved by "if rcu_read_unlock_special() is called inside
> > >> any lock which can be (chained) nested in rcu_read_unlock_special(),
> > >> we defer rcu_read_unlock_special()".
> > >> This kind locks include rnp->lock, scheduler locks, perf ctx->lock, locks
> > >> in printk()/WARN_ON() and all locks nested in these locks or chained 
> > >> nested
> > >> in these locks.
> > >>
> > >> The problem is reduced to "how to distinguish all these locks(context)",
> > >> We don't distinguish all these locks, we know that all these locks
> > >> should be nested in local_irqs_disable().
> > >>
> > >> we just consider if rcu_read_unlock_special() is called in irqs-disabled
> > >> context, it may be called in these suspect locks, we should defer
> > >> rcu_read_unlock_special().
> > >>
> > >> The algorithm enlarges the probability of deferring, but the probability
> > >> is still very very low.
> > >>
> > >> Deferring does add a small overhead, but it offers us:
> > >>  1) really deadlock-immunity for rcu read site
> > >>  2) remove the overhead of the irq-work(250 times per second in avg.)
> > > 
> > > One problem here -- it may take quite some time for a set_need_resched()
> > > to take effect.  This is especially a problem for RCU priority boosting,
> > > but can also needlessly delay preemptible-RCU grace periods because
> > > local_irq_restore() and friends don't check the TIF_NEED_RESCHED bit.
> > > 
> > > OK, alternatives...
> > > 
> > > o Keep the current rule saying that if the scheduler is going
> > >   to exit an RCU read-side critical section while holding



Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-12 Thread Paul E. McKenney
On Mon, Aug 12, 2013 at 06:21:26PM +0200, Peter Zijlstra wrote:
> On Mon, Aug 12, 2013 at 08:16:18AM -0700, Paul E. McKenney wrote:
> > On Mon, Aug 12, 2013 at 03:55:44PM +0200, Peter Zijlstra wrote:
> > > On Fri, Aug 09, 2013 at 05:31:27PM +0800, Lai Jiangshan wrote:
> > > > On 08/09/2013 04:40 AM, Paul E. McKenney wrote:
> > > > > One problem here -- it may take quite some time for a 
> > > > > set_need_resched()
> > > > > to take effect.  This is especially a problem for RCU priority 
> > > > > boosting,
> > > > > but can also needlessly delay preemptible-RCU grace periods because
> > > > > local_irq_restore() and friends don't check the TIF_NEED_RESCHED bit.
> > > > 
> > > > 
> > > > The final effect of deboosting(rt_mutex_unlock()) is also accomplished
> > > > via set_need_resched()/set_tsk_need_resched().
> > > > set_need_resched() is enough for RCU priority boosting issue here.
> > > 
> > > But there's a huge difference between the boosting and deboosting side
> > > of things. rcu_read_unlock_special() starts the boost, the deboosting
> > > only matters if/when you reschedule. 
> > 
> > Or if there is a pre-existing runnable task whose priority is such that
> > deboosting makes it the highest-priority task.
> 
> Right, I got horribly lost in rt_mutex, but I suspect we deal with that
> case the right way. -rt people would've noticed us screwing that up ;-)
> 
> But there too, we're fully limited to how fast we can get a
> reschedule(). Deboosting sooner than we can reschedule to run the other
> task is effectively pointless. The converse is obviously not true; we
> must not be able to reschedule sooner than we can deboost ;-)

In addition, the proposed change was to defer the deboost based on
a set_need_resched(), which would provide additional opportunity for
delay -- the running task would retain its high priority until the
scheduler acted on the set_need_resched().

Thanx, Paul



Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-12 Thread Peter Zijlstra
On Mon, Aug 12, 2013 at 08:16:18AM -0700, Paul E. McKenney wrote:
> On Mon, Aug 12, 2013 at 03:55:44PM +0200, Peter Zijlstra wrote:
> > On Fri, Aug 09, 2013 at 05:31:27PM +0800, Lai Jiangshan wrote:
> > > On 08/09/2013 04:40 AM, Paul E. McKenney wrote:
> > > > One problem here -- it may take quite some time for a set_need_resched()
> > > > to take effect.  This is especially a problem for RCU priority boosting,
> > > > but can also needlessly delay preemptible-RCU grace periods because
> > > > local_irq_restore() and friends don't check the TIF_NEED_RESCHED bit.
> > > 
> > > 
> > > The final effect of deboosting(rt_mutex_unlock()) is also accomplished
> > > via set_need_resched()/set_tsk_need_resched().
> > > set_need_resched() is enough for RCU priority boosting issue here.
> > 
> > But there's a huge difference between the boosting and deboosting side
> > of things. rcu_read_unlock_special() starts the boost, the deboosting
> > only matters if/when you reschedule. 
> 
> Or if there is a pre-existing runnable task whose priority is such that
> deboosting makes it the highest-priority task.

Right, I got horribly lost in rt_mutex, but I suspect we deal with that
case the right way. -rt people would've noticed us screwing that up ;-)

But there too, we're fully limited to how fast we can get a
reschedule(). Deboosting sooner than we can reschedule to run the other
task is effectively pointless. The converse is obviously not true; we
must not be able to reschedule sooner than we can deboost ;-)


Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-12 Thread Paul E. McKenney
On Mon, Aug 12, 2013 at 03:55:44PM +0200, Peter Zijlstra wrote:
> On Fri, Aug 09, 2013 at 05:31:27PM +0800, Lai Jiangshan wrote:
> > On 08/09/2013 04:40 AM, Paul E. McKenney wrote:
> > > One problem here -- it may take quite some time for a set_need_resched()
> > > to take effect.  This is especially a problem for RCU priority boosting,
> > > but can also needlessly delay preemptible-RCU grace periods because
> > > local_irq_restore() and friends don't check the TIF_NEED_RESCHED bit.
> > 
> > 
> > The final effect of deboosting(rt_mutex_unlock()) is also accomplished
> > via set_need_resched()/set_tsk_need_resched().
> > set_need_resched() is enough for RCU priority boosting issue here.
> 
> But there's a huge difference between the boosting and deboosting side
> of things. rcu_read_unlock_special() starts the boost, the deboosting
> only matters if/when you reschedule. 

Or if there is a pre-existing runnable task whose priority is such that
deboosting makes it the highest-priority task.

Thanx, Paul



Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-12 Thread Peter Zijlstra
On Mon, Aug 12, 2013 at 10:10:08AM -0400, Steven Rostedt wrote:
> On Mon, 12 Aug 2013 15:53:10 +0200
> Peter Zijlstra  wrote:
> 
> > On Sat, Aug 10, 2013 at 11:43:59AM +0800, Lai Jiangshan wrote:
> > > Hi, Steven
> > > 
> > > I was considering rtmutex's lock->wait_lock is a scheduler lock,
> > > But it is not, and it is just a spinlock of process context.
> > > I hope you change it to a spinlock of irq context.
> > 
> > rwmutex::wait_lock is irq-safe; it had better be because its taken under
> > task_struct::pi_lock. 
> 
> It is? I thought it was the other way around. That is, pi_lock is taken
> under wait_lock.

Urgh, right you are. 


Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-12 Thread Steven Rostedt
On Mon, 12 Aug 2013 15:53:10 +0200
Peter Zijlstra  wrote:

> On Sat, Aug 10, 2013 at 11:43:59AM +0800, Lai Jiangshan wrote:
> > Hi, Steven
> > 
> > I was considering rtmutex's lock->wait_lock is a scheduler lock,
> > But it is not, and it is just a spinlock of process context.
> > I hope you change it to a spinlock of irq context.
> 
> rwmutex::wait_lock is irq-safe; it had better be because its taken under
> task_struct::pi_lock. 

It is? I thought it was the other way around. That is, pi_lock is taken
under wait_lock.

task_blocks_on_rt_mutex() is called with wait_lock held. The first
thing it does is to grab the pi_lock.

The wait_lock is the mutex internal lock. Currently, no interrupt
context should take that lock, nor should there be an interrupt lock
held when it is taken. The lock should be taken in the same contexts
that a mutex can be taken in.

By making it a irq-safe lock, we need to disable interrupts every time
it is taken, which means the entire pi-chain walk in
rt_mutex_adjust_prio_chain() will pretty much be with interrupts
disabled. I really would like to avoid making wait_lock irq-safe if we
can avoid it. I'm sure it will have a large impact to the -rt kernel if
we convert it.
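
For illustration, a minimal sketch of the nesting Steven describes -- the
rtmutex-internal wait_lock taken first in process context, with the irq-safe
task->pi_lock taken inside it.  The structure and function names below are
simplified stand-ins, not the actual rtmutex code:

	#include <linux/spinlock.h>
	#include <linux/sched.h>

	struct rt_mutex_sketch {
		raw_spinlock_t wait_lock;	/* rtmutex-internal lock, process context */
	};

	static void block_on_rt_mutex_sketch(struct rt_mutex_sketch *lock,
					     struct task_struct *task)
	{
		unsigned long flags;

		/* outer lock: taken without disabling interrupts today */
		raw_spin_lock(&lock->wait_lock);

		/* inner lock: task->pi_lock is irq-safe, so irqs are off only here */
		raw_spin_lock_irqsave(&task->pi_lock, flags);

		/* ... enqueue the waiter and walk the PI chain ... */

		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
		raw_spin_unlock(&lock->wait_lock);
	}

Making wait_lock itself irq-safe would push the irq-disabled region out to the
outer lock, which is the cost being pointed out for the pi-chain walk.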


-- Steve


Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-12 Thread Peter Zijlstra
On Fri, Aug 09, 2013 at 05:31:27PM +0800, Lai Jiangshan wrote:
> On 08/09/2013 04:40 AM, Paul E. McKenney wrote:
> > One problem here -- it may take quite some time for a set_need_resched()
> > to take effect.  This is especially a problem for RCU priority boosting,
> > but can also needlessly delay preemptible-RCU grace periods because
> > local_irq_restore() and friends don't check the TIF_NEED_RESCHED bit.
> 
> 
> The final effect of deboosting(rt_mutex_unlock()) is also accomplished
> via set_need_resched()/set_tsk_need_resched().
> set_need_resched() is enough for RCU priority boosting issue here.

But there's a huge difference between the boosting and deboosting side
of things. rcu_read_unlock_special() starts the boost, the deboosting
only matters if/when you reschedule. 


Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-12 Thread Peter Zijlstra
On Sat, Aug 10, 2013 at 11:43:59AM +0800, Lai Jiangshan wrote:
> Hi, Steven
> 
> I was considering rtmutex's lock->wait_lock is a scheduler lock,
> But it is not, and it is just a spinlock of process context.
> I hope you change it to a spinlock of irq context.

rwmutex::wait_lock is irq-safe; it had better be because its taken under
task_struct::pi_lock. 


Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-10 Thread Paul E. McKenney
On Sat, Aug 10, 2013 at 08:07:15AM -0700, Paul E. McKenney wrote:
> On Sat, Aug 10, 2013 at 11:43:59AM +0800, Lai Jiangshan wrote:
> > Hi, Steven
> > 
> > I was considering rtmutex's lock->wait_lock is a scheduler lock,
> > But it is not, and it is just a spinlock of process context.
> > I hope you change it to a spinlock of irq context.
> > 
> > 1) it causes rcu read site more deadlockable, example:
> > x is a spinlock of softirq context.
> > 
> > CPU1                                   cpu2(rcu boost)
> > rcu_read_lock()                        rt_mutex_lock()
> > <preemption and reschedule back>         raw_spin_lock(lock->wait_lock)
> > spin_lock_bh(x)                        <interrupt and doing softirq after irq>
> > rcu_read_unlock()                        do_softirq()
> >   rcu_read_unlock_special()
> >     rt_mutex_unlock()
> >       raw_spin_lock(lock->wait_lock)     spin_lock_bh(x)  DEADLOCK
> > 
> > This example can happen on any one of these code:
> > without my patchset
> > with my patchset
> > with my patchset but reverted patch2
> > 
> > 2) why it causes more deadlockable: it extends the range of suspect locks.
> > #DEFINE suspect locks: any lock can be (chained) nested in 
> > rcu_read_unlock_special().
> > 
> > So the suspect locks are: rnp->lock, scheduler locks, rtmutex's 
> > lock->wait_lock,
> > locks in printk()/WARN_ON() and the locks which can be chained/indirectly 
> > nested
> > in the above locks.
> > 
> > If the rtmutex's lock->wait_lock is a spinlock of irq context, all suspect 
> > locks are
> > some spinlocks of irq context.
> > 
> > If the rtmutex's lock->wait_lock is a spinlock of process context, suspect 
> > locks
> > will be extended to, all spinlocks of irq context, all spinlocks of softirq 
> > context,
> > and (all spinlocks of process context but nested in rtmutex's 
> > lock->wait_lock).
> > 
> > We can see from the definition, if rcu_read_unlock_special() is called from
> > any suspect lock, it may be deadlock like the example. the rtmutex's 
> > lock->wait_lock
> > extends the range of suspect locks, it causes more deadlockable.
> > 
> > 3) How my algorithm works, why smaller range of suspect locks help us.
> > Since rcu_read_unlock_special() can't be called from suspect locks context,
> > we should defer rcu_read_unlock_special() when in these contexts.
> > It is hard to find out current context is suspect locks context or not,
> > but we can determine it based on irq/softirq/process context.
> > 
> > if all suspect locks are some spinlocks of irq context:
> > if (irqs_disabled) /* we may be in suspect locks context */
> > defer rcu_read_unlock_special().
> > 
> > if all suspect locks are some spinlocks of irq/softirq/process context:
> > if (irqs_disabled || in_atomic()) /* we may be in suspect locks context 
> > */
> > defer rcu_read_unlock_special().
> > In this case, the deferring becomes much larger; I can't accept it.
> > So I have to narrow the range of suspect locks. Two choices:
> > A) don't call rt_mutex_unlock() from rcu_read_unlock(), only call it
> >from rcu_preempt_not_context_switch(). we need to rework these
> >two functions and it will add complexity to RCU, and it also still
> >adds some probability of deferring.
> 
> One advantage of bh-disable locks is that enabling bh checks
> TIF_NEED_RESCHED, so that there is no deferring beyond that
> needed by bh disable.  The same of course applies to preempt_disable().
> 
> So one approach is to defer when rcu_read_unlock_special() is entered
> with either preemption or bh disabled.  Your current set_need_resched()
> trick would work fine in this case.  Unfortunately, re-enabling interrupts
> does -not- check TIF_NEED_RESCHED, which is why we have latency problems
> in that case.  (Hence my earlier question about making self-IPI safe
> on all arches, which would result in an interrupt as soon as interrupts
> were re-enabled.)
> 
> Another possibility is to defer only when preemption or bh are disabled
> on entry to rcu_read_unlock_special(), but to retain the current
> (admittedly ugly) nesting rules for the scheduler locks.
> 
> > B) change rtmutex's lock->wait_lock to irqs-disabled.
> 
> I have to defer to Steven on this one.

C) Remove support for RCU priority boosting.

I am reluctant to do this, but...

Thanx, Paul

> > 4) In the view of rtmutex, I think it will be better if ->wait_lock is 
> > irqs-disabled.
> >A) like trylock of mutex/rw_sem, we may call rt_mutex_trylock() in irq 
> > in future.
> >B) the critical section of ->wait_lock is short,
> >   making it irqs-disabled doesn't hurt responsiveness/latency.
> >C) almost all time of the critical section of ->wait_lock is 
> > irqs-disabled
> >   (due to task->pi_lock), I think converting whole time of the critical 
> > section
> >   of ->wait_lock to irqs-disabled is OK.
> > 
> > So I hope you change rtmutex's lock->wait_lock.
> > 
> > Any feedback 

Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-10 Thread Paul E. McKenney
On Sat, Aug 10, 2013 at 11:43:59AM +0800, Lai Jiangshan wrote:
> Hi, Steven
> 
> I was considering rtmutex's lock->wait_lock is a scheduler lock,
> But it is not, and it is just a spinlock of process context.
> I hope you change it to a spinlock of irq context.
> 
> 1) it causes rcu read site more deadlockable, example:
> x is a spinlock of softirq context.
> 
> CPU1                                   cpu2(rcu boost)
> rcu_read_lock()                        rt_mutex_lock()
> <preemption and reschedule back>         raw_spin_lock(lock->wait_lock)
> spin_lock_bh(x)                        <interrupt and doing softirq after irq>
> rcu_read_unlock()                        do_softirq()
>   rcu_read_unlock_special()
>     rt_mutex_unlock()
>       raw_spin_lock(lock->wait_lock)     spin_lock_bh(x)  DEADLOCK
> 
> This example can happen on any one of these code:
>   without my patchset
>   with my patchset
>   with my patchset but reverted patch2
> 
> 2) why it causes more deadlockable: it extends the range of suspect locks.
> #DEFINE suspect locks: any lock can be (chained) nested in 
> rcu_read_unlock_special().
> 
> So the suspect locks are: rnp->lock, scheduler locks, rtmutex's 
> lock->wait_lock,
> locks in printk()/WARN_ON() and the locks which can be chained/indirectly 
> nested
> in the above locks.
> 
> If the rtmutex's lock->wait_lock is a spinlock of irq context, all suspect 
> locks are
> some spinlocks of irq context.
> 
> If the rtmutex's lock->wait_lock is a spinlock of process context, suspect 
> locks
> will be extended to, all spinlocks of irq context, all spinlocks of softirq 
> context,
> and (all spinlocks of process context but nested in rtmutex's 
> lock->wait_lock).
> 
> We can see from the definition, if rcu_read_unlock_special() is called from
> any suspect lock, it may be deadlock like the example. the rtmutex's 
> lock->wait_lock
> extends the range of suspect locks, it causes more deadlockable.
> 
> 3) How my algorithm works, why smaller range of suspect locks help us.
> Since rcu_read_unlock_special() can't be called from suspect locks context,
> we should defer rcu_read_unlock_special() when in these contexts.
> It is hard to find out current context is suspect locks context or not,
> but we can determine it based on irq/softirq/process context.
> 
> if all suspect locks are some spinlocks of irq context:
>   if (irqs_disabled) /* we may be in suspect locks context */
>   defer rcu_read_unlock_special().
> 
> if all suspect locks are some spinlocks of irq/softirq/process context:
>   if (irqs_disabled || in_atomic()) /* we may be in suspect locks context 
> */
>   defer rcu_read_unlock_special().
> In this case, the deferring becomes much larger; I can't accept it.
> So I have to narrow the range of suspect locks. Two choices:
> A) don't call rt_mutex_unlock() from rcu_read_unlock(), only call it
>from rcu_preempt_not_context_switch(). we need to rework these
>two functions and it will add complexity to RCU, and it also still
>adds some probability of deferring.

One advantage of bh-disable locks is that enabling bh checks
TIF_NEED_RESCHED, so that there is no deferring beyond that
needed by bh disable.  The same of course applies to preempt_disable().

So one approach is to defer when rcu_read_unlock_special() is entered
with either preemption or bh disabled.  Your current set_need_resched()
trick would work fine in this case.  Unfortunately, re-enabling interrupts
does -not- check TIF_NEED_RESCHED, which is why we have latency problems
in that case.  (Hence my earlier question about making self-IPI safe
on all arches, which would result in an interrupt as soon as interrupts
were re-enabled.)
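
The self-IPI idea above already has a generic wrapper in the kernel: irq_work,
which uses a self-IPI where the architecture supports it (the same facility
whose ~250-per-second overhead was mentioned earlier in the thread).  A rough
sketch of kicking the deferred work from hard-irq context; the callback and
what it does are hypothetical, only the irq_work calls themselves are real:

	#include <linux/irq_work.h>

	static void rcu_unlock_special_irq_work(struct irq_work *work)
	{
		/* hard-irq context, entered once interrupts are enabled again */
		/* ... finish the deferred rcu_read_unlock_special() work ... */
	}

	static struct irq_work rcu_unlock_special_work;

	static void rcu_defer_unlock_special_sketch(void)
	{
		/* one-time init would normally happen elsewhere */
		init_irq_work(&rcu_unlock_special_work, rcu_unlock_special_irq_work);

		/* raises a self-IPI where the arch supports it; otherwise the
		 * work is only run from the next timer tick */
		irq_work_queue(&rcu_unlock_special_work);
	}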

Another possibility is to defer only when preemption or bh are disabled
> on entry to rcu_read_unlock_special(), but to retain the current
(admittedly ugly) nesting rules for the scheduler locks.

> B) change rtmutex's lock->wait_lock to irqs-disabled.

I have to defer to Steven on this one.

Thanx, Paul

> 4) In the view of rtmutex, I think it will be better if ->wait_lock is 
> irqs-disabled.
>A) like trylock of mutex/rw_sem, we may call rt_mutex_trylock() in irq in 
> future.
>B) the critical section of ->wait_lock is short,
>   making it irqs-disabled doesn't hurt responsiveness/latency.
>C) almost all time of the critical section of ->wait_lock is irqs-disabled
>   (due to task->pi_lock), I think converting whole time of the critical 
> section
>   of ->wait_lock to irqs-disabled is OK.
> 
> So I hope you change rtmutex's lock->wait_lock.
> 
> Any feedback from anyone is welcome.
> 
> Thanks,
> Lai
> 
> On 08/09/2013 04:40 AM, Paul E. McKenney wrote:
> > On Wed, Aug 07, 2013 at 06:25:01PM +0800, Lai Jiangshan wrote:
> >> Background)
> >>
> >> Although all articles declare that the rcu read site is deadlock-immune,
> >> it is not true for rcu-preempt: it will deadlock if the rcu read site
> >> overlaps with 


Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-09 Thread Lai Jiangshan
Hi, Steven

I was considering rtmutex's lock->wait_lock is a scheduler lock,
But it is not, and it is just a spinlock of process context.
I hope you change it to a spinlock of irq context.

1) it causes rcu read site more deadlockable, example:
x is a spinlock of softirq context.

CPU1                                   cpu2(rcu boost)
rcu_read_lock()                        rt_mutex_lock()
<preemption and reschedule back>         raw_spin_lock(lock->wait_lock)
spin_lock_bh(x)                        <interrupt and doing softirq after irq>
rcu_read_unlock()                        do_softirq()
  rcu_read_unlock_special()
    rt_mutex_unlock()
      raw_spin_lock(lock->wait_lock)     spin_lock_bh(x)  DEADLOCK

This example can happen on any one of these code:
without my patchset
with my patchset
with my patchset but reverted patch2
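
Stripped of the RCU specifics, this is a plain ABBA lock-order inversion.  A
self-contained user-space analogy (hypothetical, for illustration only --
"wait_lock" stands for rtmutex's lock->wait_lock and "x" for the
softirq-context spinlock; the program deadlocks by design):

	#include <pthread.h>
	#include <unistd.h>

	static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER;

	static void *cpu1(void *arg)	/* holds x, then wants wait_lock */
	{
		(void)arg;
		pthread_mutex_lock(&x);		/* spin_lock_bh(x) */
		sleep(1);			/* widen the race window */
		pthread_mutex_lock(&wait_lock);	/* rt_mutex_unlock() -> wait_lock */
		pthread_mutex_unlock(&wait_lock);
		pthread_mutex_unlock(&x);
		return NULL;
	}

	static void *cpu2(void *arg)	/* holds wait_lock, then wants x */
	{
		(void)arg;
		pthread_mutex_lock(&wait_lock);	/* rt_mutex_lock() -> wait_lock */
		sleep(1);
		pthread_mutex_lock(&x);		/* do_softirq() -> spin_lock_bh(x) */
		pthread_mutex_unlock(&x);
		pthread_mutex_unlock(&wait_lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t t1, t2;

		pthread_create(&t1, NULL, cpu1, NULL);
		pthread_create(&t2, NULL, cpu2, NULL);
		pthread_join(t1, NULL);		/* never returns: ABBA deadlock */
		pthread_join(t2, NULL);
		return 0;
	}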

2) why it causes more deadlockable: it extends the range of suspect locks.
#DEFINE suspect locks: any lock can be (chained) nested in 
rcu_read_unlock_special().

So the suspect locks are: rnp->lock, scheduler locks, rtmutex's lock->wait_lock,
locks in printk()/WARN_ON() and the locks which can be chained/indirectly nested
in the above locks.

If the rtmutex's lock->wait_lock is a spinlock of irq context, all suspect 
locks are
some spinlocks of irq context.

If the rtmutex's lock->wait_lock is a spinlock of process context, suspect locks
will be extended to, all spinlocks of irq context, all spinlocks of softirq 
context,
and (all spinlocks of process context but nested in rtmutex's lock->wait_lock).

We can see from the definition, if rcu_read_unlock_special() is called from
any suspect lock, it may be deadlock like the example. the rtmutex's 
lock->wait_lock
extends the range of suspect locks, it causes more deadlockable.

3) How my algorithm works, why smaller range of suspect locks help us.
Since rcu_read_unlock_special() can't be called from suspect locks context,
we should defer rcu_read_unlock_special() when in these contexts.
It is hard to find out current context is suspect locks context or not,
but we can determine it based on irq/softirq/process context.

if all suspect locks are some spinlocks of irq context:
	if (irqs_disabled) /* we may be in suspect locks context */
		defer rcu_read_unlock_special().

if all suspect locks are some spinlocks of irq/softirq/process context:
	if (irqs_disabled || in_atomic()) /* we may be in suspect locks context */
		defer rcu_read_unlock_special().
In this case, the deferring becomes much larger; I can't accept it.
So I have to narrow the range of suspect locks. Two choices:
A) don't call rt_mutex_unlock() from rcu_read_unlock(), only call it
   from rcu_preempt_not_context_switch(). we need to rework these
   two functions and it will add complexity to RCU, and it also still
   adds some probability of deferring.
B) change rtmutex's lock->wait_lock to irqs-disabled.
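
To make the deferral checks above concrete, a rough sketch of what the broader
of the two tests (the irq/softirq/process one) could look like at the top of
rcu_read_unlock_special().  The helper name and its placement are hypothetical;
only irqs_disabled(), in_atomic() and set_need_resched() are existing kernel
interfaces:

	#include <linux/hardirq.h>
	#include <linux/sched.h>

	/* true if we might be running under one of the "suspect" locks */
	static bool rcu_read_unlock_should_defer(void)
	{
		return irqs_disabled() || in_atomic();
	}

	static void rcu_read_unlock_special_sketch(struct task_struct *t)
	{
		if (rcu_read_unlock_should_defer()) {
			/* finish the work later, from a context that cannot
			 * be nested inside a suspect lock */
			set_need_resched();
			return;
		}
		/* ... normal special processing, including any rt_mutex_unlock() ... */
	}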

4) In the view of rtmutex, I think it will be better if ->wait_lock is 
irqs-disabled.
   A) like trylock of mutex/rw_sem, we may call rt_mutex_trylock() in irq in 
future.
   B) the critical section of ->wait_lock is short,
  making it irqs-disabled doesn't hurt responsiveness/latency.
   C) almost all time of the critical section of ->wait_lock is irqs-disabled
  (due to task->pi_lock), I think converting whole time of the critical 
section
  of ->wait_lock to irqs-disabled is OK.

So I hope you change rtmutex's lock->wait_lock.
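
For concreteness, the conversion being asked for is the ordinary irqsave
pattern applied to the places that take ->wait_lock -- a sketch of the shape
only, not an actual patch to the rtmutex code:

	#include <linux/rtmutex.h>
	#include <linux/spinlock.h>

	/* today: wait_lock is taken as a process-context raw spinlock */
	static void wait_lock_today(struct rt_mutex *lock)
	{
		raw_spin_lock(&lock->wait_lock);
		/* ... enqueue/dequeue waiters, adjust priorities ... */
		raw_spin_unlock(&lock->wait_lock);
	}

	/* proposed: take it with interrupts disabled, so it could also be
	 * taken from irq/softirq context (e.g. a future rt_mutex_trylock()
	 * from irq, or the rcu deboost path) */
	static void wait_lock_proposed(struct rt_mutex *lock)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&lock->wait_lock, flags);
		/* ... enqueue/dequeue waiters, adjust priorities ... */
		raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
	}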

Any feedback from anyone is welcome.

Thanks,
Lai

On 08/09/2013 04:40 AM, Paul E. McKenney wrote:
> On Wed, Aug 07, 2013 at 06:25:01PM +0800, Lai Jiangshan wrote:
>> Background)
>>
>> Although all articles declare that the rcu read site is deadlock-immune,
>> it is not true for rcu-preempt: it will deadlock if the rcu read site
>> overlaps with a scheduler lock.
>>
>> ec433f0c, 10f39bb1 and 016a8d5b just partially solve it. But the rcu read site
>> is still not deadlock-immune. And the problem described in 016a8d5b
>> still exists (rcu_read_unlock_special() calls wake_up).
>>
>> Aim)
>>
>> We want to fix the problem for good: we want to keep the rcu read site
>> deadlock-immune, as the books say.
>>
>> How)
>>
>> The problem is solved by "if rcu_read_unlock_special() is called inside
>> any lock which can be (chained) nested in rcu_read_unlock_special(),
>> we defer rcu_read_unlock_special()".
>> This kind locks include rnp->lock, scheduler locks, perf ctx->lock, locks
>> in printk()/WARN_ON() and all locks nested in these locks or chained nested
>> in these locks.
>>
>> The problem is reduced to "how to distinguish all these locks(context)",
>> We don't distinguish all these locks, we know that all these locks
>> should be nested in local_irqs_disable().
>>
>> we just consider if rcu_read_unlock_special() is called in irqs-disabled
>> context, it may be called in these suspect locks, we should defer
>> rcu_read_unlock_special().
>>
>> The algorithm enlarges the probability of deferring, but the probability
>> is still very very low.
>>

Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-09 Thread Paul E. McKenney
On Fri, Aug 09, 2013 at 05:31:27PM +0800, Lai Jiangshan wrote:
> On 08/09/2013 04:40 AM, Paul E. McKenney wrote:
> > On Wed, Aug 07, 2013 at 06:25:01PM +0800, Lai Jiangshan wrote:
> >> Background)
> >>
> >> Although all articles declare that rcu read site is deadlock-immunity.
> >> It is not true for rcu-preempt, it will be deadlock if rcu read site
> >> overlaps with scheduler lock.
> >>
> >> ec433f0c, 10f39bb1 and 016a8d5b just partially solve it. But rcu read site
> >> is still not deadlock-immunity. And the problem described in 016a8d5b
> >> is still existed(rcu_read_unlock_special() calls wake_up).
> >>
> >> Aim)
> >>
> >> We want to fix the problem forever, we want to keep rcu read site
> >> is deadlock-immunity as books say.
> >>
> >> How)
> >>
> >> The problem is solved by "if rcu_read_unlock_special() is called inside
> >> any lock which can be (chained) nested in rcu_read_unlock_special(),
> >> we defer rcu_read_unlock_special()".
> >> This kind locks include rnp->lock, scheduler locks, perf ctx->lock, locks
> >> in printk()/WARN_ON() and all locks nested in these locks or chained nested
> >> in these locks.
> >>
> >> The problem is reduced to "how to distinguish all these locks(context)",
> >> We don't distinguish all these locks, we know that all these locks
> >> should be nested in local_irqs_disable().
> >>
> >> we just consider if rcu_read_unlock_special() is called in irqs-disabled
> >> context, it may be called in these suspect locks, we should defer
> >> rcu_read_unlock_special().
> >>
> >> The algorithm enlarges the probability of deferring, but the probability
> >> is still very very low.
> >>
> >> Deferring does add a small overhead, but it offers us:
> >>1) really deadlock-immunity for rcu read site
> >>2) remove the overhead of the irq-work(250 times per second in avg.)
> > 
> > One problem here -- it may take quite some time for a set_need_resched()
> > to take effect.  This is especially a problem for RCU priority boosting,
> > but can also needlessly delay preemptible-RCU grace periods because
> > local_irq_restore() and friends don't check the TIF_NEED_RESCHED bit.
> 
> The final effect of deboosting(rt_mutex_unlock()) is also accomplished
> via set_need_resched()/set_tsk_need_resched().
> set_need_resched() is enough for RCU priority boosting issue here.

Eventually, yes.  But all that set_need_resched() does is set the
TIF_NEED_RESCHED flag.  This is checked by the outermost preempt_enable(),
return from interrupt, return to userspace, and things like
cond_resched().  So it might well be quite some time until the boosted
reader actually gets around to deboosting itself.
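Roughly speaking, the only places that act on the flag look like the
following (a simplified sketch with a hypothetical name, not the real
implementation):

static inline int sketch_cond_resched(void)	/* hypothetical */
{
	if (test_thread_flag(TIF_NEED_RESCHED) && !preempt_count()) {
		schedule();	/* only here does the boosted reader deboost */
		return 1;
	}
	return 0;
}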

> Since rcu_read_unlock_special() is deferred, it does take quite some time for
> QS report to take effect.

Agreed.

> > OK, alternatives...
> > 
> > o   Keep the current rule saying that if the scheduler is going
> > to exit an RCU read-side critical section while holding
> > one of its spinlocks, preemption has to have been disabled
> 
> Since rtmutex's lock->wait_lock is neither irqs-disabled nor bh-disabled.

Yep, because this rule prevents the call to rt_mutex_unlock() from
happening whenever one of the scheduler's irq-disable locks is held.

> This kind of spinlock includes scheduler locks, rtmutex's lock->wait_lock,
> and all locks that can be acquired in irq/softirq context.
> 
> So this rule would apply not only to scheduler locks, but also
> to almost all spinlocks in the kernel.

No, only those locks acquired by the scheduler in the wakeup path.
Or am I missing something here?

> I found it hard to accept that the rcu read site is not deadlock-immune.

So did I, see http://lwn.net/Articles/453002/.  RCU was a victim of its
own success.  ;-)

And it would be really cool to restore full deadlock immunity to
rcu_read_unlock(), no two ways about it!  Hmmm...  Any way that a
self-IPI could be made safe for all architectures?
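For concreteness, the sort of thing I have in mind is roughly this (a rough
sketch; whether the self-IPI below is safe on every architecture is exactly
the open question):

	/* kick the local CPU so the deferred work runs from the irq-return path */
	preempt_disable();
	smp_send_reschedule(smp_processor_id());	/* IPI to self */
	preempt_enable();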

Thanx, Paul

> Thanks,
> Lai
> 
> > throughout the full duration of that critical section.
> > Well, we can certainly do this, but it would be nice to get
> > rid of this rule.
> > 
> > o   Use per-CPU variables, possibly injecting delay.  This has ugly
> > disadvantages as noted above.
> > 
> > o   irq_work_queue() can wait a jiffy (or on some architectures,
> > quite a bit longer) before actually doing anything.
> > 
> > o   raise_softirq() is more immediate and is an easy change, but
> > adds a softirq vector -- which people are really trying to
> > get rid of.  Also, wakeup_softirqd() calls things that acquire
> > the scheduler locks, which is exactly what we were trying to
> > avoid doing.
> > 
> > o   invoke_rcu_core() can invoke raise_softirq() as above.
> > 
> > o   IPI to self.  From what I can see, not all architectures
> > support this.  Easy to fake if you have at least two CPUs,
> > but not so good from an OS jitter viewpoint...
> > 
> > o   Add a check to 

Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-09 Thread Lai Jiangshan
On 08/09/2013 04:40 AM, Paul E. McKenney wrote:
> On Wed, Aug 07, 2013 at 06:25:01PM +0800, Lai Jiangshan wrote:
>> Background)
>>
>> Although all articles declare that rcu read site is deadlock-immunity.
>> It is not true for rcu-preempt, it will be deadlock if rcu read site
>> overlaps with scheduler lock.
>>
>> ec433f0c, 10f39bb1 and 016a8d5b just partially solve it. But rcu read site
>> is still not deadlock-immunity. And the problem described in 016a8d5b
>> is still existed(rcu_read_unlock_special() calls wake_up).
>>
>> Aim)
>>
>> We want to fix the problem forever, we want to keep rcu read site
>> is deadlock-immunity as books say.
>>
>> How)
>>
>> The problem is solved by "if rcu_read_unlock_special() is called inside
>> any lock which can be (chained) nested in rcu_read_unlock_special(),
>> we defer rcu_read_unlock_special()".
>> This kind locks include rnp->lock, scheduler locks, perf ctx->lock, locks
>> in printk()/WARN_ON() and all locks nested in these locks or chained nested
>> in these locks.
>>
>> The problem is reduced to "how to distinguish all these locks(context)",
>> We don't distinguish all these locks, we know that all these locks
>> should be nested in local_irqs_disable().
>>
>> we just consider if rcu_read_unlock_special() is called in irqs-disabled
>> context, it may be called in these suspect locks, we should defer
>> rcu_read_unlock_special().
>>
>> The algorithm enlarges the probability of deferring, but the probability
>> is still very very low.
>>
>> Deferring does add a small overhead, but it offers us:
>>  1) really deadlock-immunity for rcu read site
>>  2) remove the overhead of the irq-work(250 times per second in avg.)
> 
> One problem here -- it may take quite some time for a set_need_resched()
> to take effect.  This is especially a problem for RCU priority boosting,
> but can also needlessly delay preemptible-RCU grace periods because
> local_irq_restore() and friends don't check the TIF_NEED_RESCHED bit.


The final effect of deboosting(rt_mutex_unlock()) is also accomplished
via set_need_resched()/set_tsk_need_resched().
set_need_resched() is enough for RCU priority boosting issue here.

Since rcu_read_unlock_special() is deferred, it does take quite some time for
QS report to take effect.


> 
> OK, alternatives...
> 
> o Keep the current rule saying that if the scheduler is going
>   to exit an RCU read-side critical section while holding
>   one of its spinlocks, preemption has to have been disabled

Since rtmutex's lock->wait_lock is neither irqs-disabled nor bh-disabled.

This kind of spinlock includes scheduler locks, rtmutex's lock->wait_lock,
and all locks that can be acquired in irq/softirq context.

So this rule would apply not only to scheduler locks, but also
to almost all spinlocks in the kernel.

I found it hard to accept that the rcu read site is not deadlock-immune.

Thanks,
Lai

>   throughout the full duration of that critical section.
>   Well, we can certainly do this, but it would be nice to get
>   rid of this rule.
> 
> o Use per-CPU variables, possibly injecting delay.  This has ugly
>   disadvantages as noted above.
> 
> o irq_work_queue() can wait a jiffy (or on some architectures,
>   quite a bit longer) before actually doing anything.
> 
> o raise_softirq() is more immediate and is an easy change, but
>   adds a softirq vector -- which people are really trying to
>   get rid of.  Also, wakeup_softirqd() calls things that acquire
>   the scheduler locks, which is exactly what we were trying to
>   avoid doing.
> 
> o invoke_rcu_core() can invoke raise_softirq() as above.
> 
> o IPI to self.  From what I can see, not all architectures
>   support this.  Easy to fake if you have at least two CPUs,
>   but not so good from an OS jitter viewpoint...
> 
> o Add a check to local_irq_disable() and friends.  I would guess
>   that this suggestion would not make architecture maintainers
>   happy.
> 
> Other thoughts?
> 
>   Thanx, Paul
> 
>> Signed-off-by: Lai Jiangshan 
>> ---
>>  include/linux/rcupdate.h |2 +-
>>  kernel/rcupdate.c|2 +-
>>  kernel/rcutree_plugin.h  |   47 
>> +
>>  3 files changed, 44 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
>> index 4b14bdc..00b4220 100644
>> --- a/include/linux/rcupdate.h
>> +++ b/include/linux/rcupdate.h
>> @@ -180,7 +180,7 @@ extern void synchronize_sched(void);
>>
>>  extern void __rcu_read_lock(void);
>>  extern void __rcu_read_unlock(void);
>> -extern void rcu_read_unlock_special(struct task_struct *t);
>> +extern void rcu_read_unlock_special(struct task_struct *t, bool unlock);
>>  void synchronize_rcu(void);
>>
>>  /*
>> diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
>> index cce6ba8..33b89a3 100644
>> --- a/kernel/rcupdate.c
>> +++ 

Re: [PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-08 Thread Paul E. McKenney
On Wed, Aug 07, 2013 at 06:25:01PM +0800, Lai Jiangshan wrote:
> Background)
> 
> Although all articles declare that rcu read site is deadlock-immunity.
> It is not true for rcu-preempt, it will be deadlock if rcu read site
> overlaps with scheduler lock.
> 
> ec433f0c, 10f39bb1 and 016a8d5b just partially solve it. But rcu read site
> is still not deadlock-immunity. And the problem described in 016a8d5b
> is still existed(rcu_read_unlock_special() calls wake_up).
> 
> Aim)
> 
> We want to fix the problem forever, we want to keep rcu read site
> is deadlock-immunity as books say.
> 
> How)
> 
> The problem is solved by "if rcu_read_unlock_special() is called inside
> any lock which can be (chained) nested in rcu_read_unlock_special(),
> we defer rcu_read_unlock_special()".
> This kind locks include rnp->lock, scheduler locks, perf ctx->lock, locks
> in printk()/WARN_ON() and all locks nested in these locks or chained nested
> in these locks.
> 
> The problem is reduced to "how to distinguish all these locks(context)",
> We don't distinguish all these locks, we know that all these locks
> should be nested in local_irqs_disable().
> 
> we just consider if rcu_read_unlock_special() is called in irqs-disabled
> context, it may be called in these suspect locks, we should defer
> rcu_read_unlock_special().
> 
> The algorithm enlarges the probability of deferring, but the probability
> is still very very low.
> 
> Deferring does add a small overhead, but it offers us:
>   1) really deadlock-immunity for rcu read site
>   2) remove the overhead of the irq-work(250 times per second in avg.)

One problem here -- it may take quite some time for a set_need_resched()
to take effect.  This is especially a problem for RCU priority boosting,
but can also needlessly delay preemptible-RCU grace periods because
local_irq_restore() and friends don't check the TIF_NEED_RESCHED bit.

OK, alternatives...

o   Keep the current rule saying that if the scheduler is going
to exit an RCU read-side critical section while holding
one of its spinlocks, preemption has to have been disabled
throughout the full duration of that critical section.
Well, we can certainly do this, but it would be nice to get
rid of this rule.

o   Use per-CPU variables, possibly injecting delay.  This has ugly
disadvantages as noted above.

o   irq_work_queue() can wait a jiffy (or on some architectures,
quite a bit longer) before actually doing anything.

o   raise_softirq() is more immediate and is an easy change, but
adds a softirq vector -- which people are really trying to
get rid of.  Also, wakeup_softirqd() calls things that acquire
the scheduler locks, which is exactly what we were trying to
avoid doing.

o   invoke_rcu_core() can invoke raise_softirq() as above.

o   IPI to self.  From what I can see, not all architectures
support this.  Easy to fake if you have at least two CPUs,
but not so good from an OS jitter viewpoint...

o   Add a check to local_irq_disable() and friends.  I would guess
that this suggestion would not make architecture maintainers
happy.

Other thoughts?

Thanx, Paul

> Signed-off-by: Lai Jiangshan 
> ---
>  include/linux/rcupdate.h |2 +-
>  kernel/rcupdate.c|2 +-
>  kernel/rcutree_plugin.h  |   47 +
>  3 files changed, 44 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 4b14bdc..00b4220 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -180,7 +180,7 @@ extern void synchronize_sched(void);
> 
>  extern void __rcu_read_lock(void);
>  extern void __rcu_read_unlock(void);
> -extern void rcu_read_unlock_special(struct task_struct *t);
> +extern void rcu_read_unlock_special(struct task_struct *t, bool unlock);
>  void synchronize_rcu(void);
> 
>  /*
> diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
> index cce6ba8..33b89a3 100644
> --- a/kernel/rcupdate.c
> +++ b/kernel/rcupdate.c
> @@ -90,7 +90,7 @@ void __rcu_read_unlock(void)
>  #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
>   barrier();  /* assign before ->rcu_read_unlock_special load */
>   if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
> - rcu_read_unlock_special(t);
> + rcu_read_unlock_special(t, true);
>   barrier();  /* ->rcu_read_unlock_special load before assign */
>   t->rcu_read_lock_nesting = 0;
>   }
> diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> index fc8b36f..997b424 100644
> --- a/kernel/rcutree_plugin.h
> +++ b/kernel/rcutree_plugin.h
> @@ -242,15 +242,16 @@ static void rcu_preempt_note_context_switch(int cpu)
>  ? rnp->gpnum
>

[PATCH 5/8] rcu: eliminate deadlock for rcu read site

2013-08-07 Thread Lai Jiangshan
Background)

Although all articles declare that the rcu read site is deadlock-immune,
this is not true for rcu-preempt: it will deadlock if the rcu read site
overlaps with a scheduler lock.

ec433f0c, 10f39bb1 and 016a8d5b only partially solve it; the rcu read site
is still not deadlock-immune, and the problem described in 016a8d5b
still exists (rcu_read_unlock_special() calls wake_up).

Aim)

We want to fix the problem for good: we want the rcu read site
to be deadlock-immune, as the books say.

How)

The problem is solved by the rule "if rcu_read_unlock_special() is called
inside any lock which can be (chained) nested in rcu_read_unlock_special(),
we defer rcu_read_unlock_special()".
These locks include rnp->lock, scheduler locks, perf ctx->lock, locks
taken in printk()/WARN_ON(), and all locks nested in these locks or chained
nested in them.

The problem is thus reduced to "how to distinguish all these locks (contexts)".
We don't distinguish each of these locks individually; we only rely on the
fact that all of them must be nested inside local_irq_disable().

So we just check: if rcu_read_unlock_special() is called in an irqs-disabled
context, it may be running under one of these suspect locks, and we defer
rcu_read_unlock_special().

This check enlarges the probability of deferring, but the probability
is still very low.

Deferring does add a small overhead, but it offers us:
1) real deadlock immunity for the rcu read site
2) removal of the irq-work overhead (250 times per second on average)

Signed-off-by: Lai Jiangshan 
---
 include/linux/rcupdate.h |2 +-
 kernel/rcupdate.c|2 +-
 kernel/rcutree_plugin.h  |   47 +
 3 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 4b14bdc..00b4220 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -180,7 +180,7 @@ extern void synchronize_sched(void);
 
 extern void __rcu_read_lock(void);
 extern void __rcu_read_unlock(void);
-extern void rcu_read_unlock_special(struct task_struct *t);
+extern void rcu_read_unlock_special(struct task_struct *t, bool unlock);
 void synchronize_rcu(void);
 
 /*
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index cce6ba8..33b89a3 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -90,7 +90,7 @@ void __rcu_read_unlock(void)
 #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
barrier();  /* assign before ->rcu_read_unlock_special load */
if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
-   rcu_read_unlock_special(t);
+   rcu_read_unlock_special(t, true);
barrier();  /* ->rcu_read_unlock_special load before assign */
t->rcu_read_lock_nesting = 0;
}
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index fc8b36f..997b424 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -242,15 +242,16 @@ static void rcu_preempt_note_context_switch(int cpu)
   ? rnp->gpnum
   : rnp->gpnum + 1);
raw_spin_unlock_irqrestore(>lock, flags);
-   } else if (t->rcu_read_lock_nesting < 0 &&
-  !WARN_ON_ONCE(t->rcu_read_lock_nesting != INT_MIN) &&
-  t->rcu_read_unlock_special) {
+   } else if (t->rcu_read_lock_nesting == 0 ||
+  (t->rcu_read_lock_nesting < 0 &&
+  !WARN_ON_ONCE(t->rcu_read_lock_nesting != INT_MIN))) {
 
/*
 * Complete exit from RCU read-side critical section on
 * behalf of preempted instance of __rcu_read_unlock().
 */
-   rcu_read_unlock_special(t);
+   if (t->rcu_read_unlock_special)
+   rcu_read_unlock_special(t, false);
}
 
/*
@@ -333,7 +334,7 @@ static struct list_head *rcu_next_node_entry(struct 
task_struct *t,
  * notify RCU core processing or task having blocked during the RCU
  * read-side critical section.
  */
-void rcu_read_unlock_special(struct task_struct *t)
+void rcu_read_unlock_special(struct task_struct *t, bool unlock)
 {
int empty;
int empty_exp;
@@ -364,6 +365,42 @@ void rcu_read_unlock_special(struct task_struct *t)
 
/* Clean up if blocked during RCU read-side critical section. */
if (special & RCU_READ_UNLOCK_BLOCKED) {
+   /*
+* If rcu read lock overlaps with scheduler lock,
+* rcu_read_unlock_special() may lead to deadlock:
+*
+* rcu_read_lock();
+* preempt_schedule[_irq]() (when preemption)
+* scheduler lock; (or some other locks can be (chained) nested
+*  in rcu_read_unlock_special()/rnp->lock)
+* access and check rcu data
+* rcu_read_unlock();
+*   
