Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-08-13 Thread Paul E. McKenney
On Fri, Aug 09, 2013 at 06:20:59PM +0200, Frederic Weisbecker wrote:
> On Fri, Jul 26, 2013 at 04:19:23PM -0700, Paul E. McKenney wrote:
> > diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> > index 3edae39..ff84bed 100644
> > --- a/kernel/rcutree_plugin.h
> > +++ b/kernel/rcutree_plugin.h
> > @@ -28,7 +28,7 @@
> >  #include <linux/gfp.h>
> >  #include <linux/oom.h>
> >  #include <linux/smpboot.h>
> > -#include <linux/tick.h>
> > +#include "time/tick-internal.h"
> >  
> >  #define RCU_KTHREAD_PRIO 1
> >  
> > @@ -2395,12 +2395,12 @@ static void rcu_kick_nohz_cpu(int cpu)
> >   * most active flavor of RCU.
> >   */
> >  #ifdef CONFIG_PREEMPT_RCU
> > -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_preempt_state;
> > +static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
> >  #else /* #ifdef CONFIG_PREEMPT_RCU */
> > -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_sched_state;
> > +static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
> >  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
> 
> Ah you fixed it here. Ok :)

Bisectability and all that.  ;-)

> > -static int __maybe_unused full_sysidle_state; /* Current system-idle state. */
> > +static int full_sysidle_state; /* Current system-idle state. */
> >  #define RCU_SYSIDLE_NOT    0   /* Some CPU is not idle. */
> >  #define RCU_SYSIDLE_SHORT  1   /* All CPUs idle for brief period. */
> >  #define RCU_SYSIDLE_LONG   2   /* All CPUs idle for long enough. */
> [...]
> > +/*
> > + * Check to see if the system is fully idle, other than the timekeeping CPU.
> > + * The caller must have disabled interrupts.
> > + */
> > +bool rcu_sys_is_idle(void)
> > +{
> > +   static struct rcu_sysidle_head rsh;
> > +   int rss = ACCESS_ONCE(full_sysidle_state);
> > +
> > +   if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
> > +   return false;
> > +
> > +   /* Handle small-system case by doing a full scan of CPUs. */
> > +   if (nr_cpu_ids <= RCU_SYSIDLE_SMALL) {
> 
> I don't understand how the nr_cpu_ids > RCU_SYSIDLE_SMALL case is handled.
> There don't seem to be other calls of rcu_sysidle_check_cpu() than for
> small systems.

The other calls are from kernel/rcutree.c from the force-quiescent-state
code.  If we have a big system, we don't check until we have some other
reason to touch the cache lines.  If we have a small system, we just
dig through them on transition to idle.
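
For illustration, a minimal sketch of that split (illustrative only, not
the patch's code; the helper name rcu_sysidle_scan_here() is hypothetical,
and RCU_SYSIDLE_SMALL is the threshold the patch defines):

	/* Sketch: should the timekeeping CPU scan the other CPUs itself? */
	static bool rcu_sysidle_scan_here(void)
	{
		/*
		 * Small system: cheap enough to sweep every CPU's
		 * ->dynticks_idle counter directly on transition to idle.
		 */
		if (nr_cpu_ids <= RCU_SYSIDLE_SMALL)
			return true;
		/*
		 * Large system: leave the per-CPU cache lines alone here;
		 * rcu_sysidle_check_cpu() runs instead from the
		 * force-quiescent-state path in kernel/rcutree.c
		 * (dyntick_save_progress_counter()), which touches those
		 * lines anyway.
		 */
		return false;
	}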

Thanx, Paul


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-08-09 Thread Frederic Weisbecker
On Fri, Jul 26, 2013 at 04:19:23PM -0700, Paul E. McKenney wrote:
> diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> index 3edae39..ff84bed 100644
> --- a/kernel/rcutree_plugin.h
> +++ b/kernel/rcutree_plugin.h
> @@ -28,7 +28,7 @@
>  #include <linux/gfp.h>
>  #include <linux/oom.h>
>  #include <linux/smpboot.h>
> -#include <linux/tick.h>
> +#include "time/tick-internal.h"
>  
>  #define RCU_KTHREAD_PRIO 1
>  
> @@ -2395,12 +2395,12 @@ static void rcu_kick_nohz_cpu(int cpu)
>   * most active flavor of RCU.
>   */
>  #ifdef CONFIG_PREEMPT_RCU
> -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_preempt_state;
> +static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
>  #else /* #ifdef CONFIG_PREEMPT_RCU */
> -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_sched_state;
> +static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
>  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */

Ah you fixed it here. Ok :)

>  
> -static int __maybe_unused full_sysidle_state; /* Current system-idle state. */
> +static int full_sysidle_state;   /* Current system-idle state. */
>  #define RCU_SYSIDLE_NOT  0   /* Some CPU is not idle. */
>  #define RCU_SYSIDLE_SHORT1   /* All CPUs idle for brief period. */
>  #define RCU_SYSIDLE_LONG 2   /* All CPUs idle for long enough. */
[...]
> +/*
> + * Check to see if the system is fully idle, other than the timekeeping CPU.
> + * The caller must have disabled interrupts.
> + */
> +bool rcu_sys_is_idle(void)
> +{
> + static struct rcu_sysidle_head rsh;
> + int rss = ACCESS_ONCE(full_sysidle_state);
> +
> + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
> + return false;
> +
> + /* Handle small-system case by doing a full scan of CPUs. */
> + if (nr_cpu_ids <= RCU_SYSIDLE_SMALL) {

I don't understand how the nr_cpu_ids > RCU_SYSIDLE_SMALL case is handled.
There don't seem to be other calls of rcu_sysidle_check_cpu() than for
small systems.


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-29 Thread Paul E. McKenney
On Mon, Jul 29, 2013 at 04:19:48PM +0800, Lai Jiangshan wrote:
> On 07/27/2013 07:19 AM, Paul E. McKenney wrote:
> > From: "Paul E. McKenney" 
> > 
> > This commit adds the state machine that takes the per-CPU idle data
> > as input and produces a full-system-idle indication as output.  This
> > state machine is driven out of RCU's quiescent-state-forcing
> > mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU
> > idle state and then rcu_sysidle_report() to drive the state machine.
> > 
> > The full-system-idle state is sampled using rcu_sys_is_idle(), which
> > also drives the state machine if RCU is idle (and does so by forcing
> > RCU to become non-idle).  This function returns true if all but the
> > timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long
> > enough to avoid memory contention on the full_sysidle_state state
> > variable.  The rcu_sysidle_force_exit() may be called externally
> > to reset the state machine back into non-idle state.
> > 
> > Signed-off-by: Paul E. McKenney <paul...@linux.vnet.ibm.com>
> > Cc: Frederic Weisbecker <fweis...@gmail.com>
> > Cc: Steven Rostedt <rost...@goodmis.org>
> > ---
> >  include/linux/rcupdate.h |  18 +++
> >  kernel/rcutree.c |  16 ++-
> >  kernel/rcutree.h |   5 +
> >  kernel/rcutree_plugin.h  | 284 ++-
> >  4 files changed, 316 insertions(+), 7 deletions(-)
> > 
> > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> > index 48f1ef9..1aa8d8c 100644
> > --- a/include/linux/rcupdate.h
> > +++ b/include/linux/rcupdate.h
> > @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
> >  #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
> >  
> >  
> > +/* Only for use by adaptive-ticks code. */
> > +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
> > +extern bool rcu_sys_is_idle(void);
> > +extern void rcu_sysidle_force_exit(void);
> > +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> > +
> > +static inline bool rcu_sys_is_idle(void)
> > +{
> > +   return false;
> > +}
> > +
> > +static inline void rcu_sysidle_force_exit(void)
> > +{
> > +}
> > +
> > +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> > +
> > +
> >  #endif /* __LINUX_RCUPDATE_H */
> > diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> > index 725524e..aa6d96e 100644
> > --- a/kernel/rcutree.c
> > +++ b/kernel/rcutree.c
> > @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
> >  bool *isidle, unsigned long *maxj)
> >  {
> > rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
> > +   rcu_sysidle_check_cpu(rdp, isidle, maxj);
> > return (rdp->dynticks_snap & 0x1) == 0;
> >  }
> >  
> > @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
> > rsp->n_force_qs++;
> > if (fqs_state == RCU_SAVE_DYNTICK) {
> > /* Collect dyntick-idle snapshots. */
> > +   if (is_sysidle_rcu_state(rsp)) {
> > +   isidle = 1;
> 
> isidle = true;
> the type of isidle is bool
> 
> > +   maxj = jiffies - ULONG_MAX / 4;
> > +   }
> > force_qs_rnp(rsp, dyntick_save_progress_counter,
> >  &isidle, &maxj);
> > +   rcu_sysidle_report_gp(rsp, isidle, maxj);
> > fqs_state = RCU_FORCE_QS;
> > } else {
> > /* Handle dyntick-idle and offline CPUs. */
> > +   isidle = 0;
> 
> isidle = false;
> 
> > force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
> > }
> > /* Clear flag to prevent immediate re-entry. */
> > @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp,
> > cpu = rnp->grplo;
> > bit = 1;
> > for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
> > -   if ((rnp->qsmask & bit) != 0 &&
> > -   f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
> > -   mask |= bit;
> > +   if ((rnp->qsmask & bit) != 0) {
> > +   if ((rnp->qsmaskinit & bit) != 0)
> > +   *isidle = 0;
> 
> *isidle = false

All good catches, fixed.

> > +   if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
> > +   mask |= bit;
> > +   }
> > }
> > if (mask != 0) {
> >  
> > diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> > index 1895043..e0de5dc 100644
> > --- a/kernel/rcutree.h
> > +++ b/kernel/rcutree.h
> > @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu);
> >  static bool init_nocb_callback_list(struct rcu_data *rdp);
> >  static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
> >  static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
> > +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> > + unsigned long *maxj);
> > +static bool is_sysidle_rcu_state(struct rcu_state *rsp);
> > 

Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-29 Thread Lai Jiangshan
On 07/27/2013 07:19 AM, Paul E. McKenney wrote:
> From: "Paul E. McKenney" 
> 
> This commit adds the state machine that takes the per-CPU idle data
> as input and produces a full-system-idle indication as output.  This
> state machine is driven out of RCU's quiescent-state-forcing
> mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU
> idle state and then rcu_sysidle_report() to drive the state machine.
> 
> The full-system-idle state is sampled using rcu_sys_is_idle(), which
> also drives the state machine if RCU is idle (and does so by forcing
> RCU to become non-idle).  This function returns true if all but the
> timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long
> enough to avoid memory contention on the full_sysidle_state state
> variable.  The rcu_sysidle_force_exit() may be called externally
> to reset the state machine back into non-idle state.
> 
> Signed-off-by: Paul E. McKenney <paul...@linux.vnet.ibm.com>
> Cc: Frederic Weisbecker <fweis...@gmail.com>
> Cc: Steven Rostedt <rost...@goodmis.org>
> ---
>  include/linux/rcupdate.h |  18 +++
>  kernel/rcutree.c |  16 ++-
>  kernel/rcutree.h |   5 +
>  kernel/rcutree_plugin.h  | 284 ++-
>  4 files changed, 316 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 48f1ef9..1aa8d8c 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
>  #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
>  
>  
> +/* Only for use by adaptive-ticks code. */
> +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
> +extern bool rcu_sys_is_idle(void);
> +extern void rcu_sysidle_force_exit(void);
> +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> +
> +static inline bool rcu_sys_is_idle(void)
> +{
> + return false;
> +}
> +
> +static inline void rcu_sysidle_force_exit(void)
> +{
> +}
> +
> +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> +
> +
>  #endif /* __LINUX_RCUPDATE_H */
> diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> index 725524e..aa6d96e 100644
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
>bool *isidle, unsigned long *maxj)
>  {
>   rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
> + rcu_sysidle_check_cpu(rdp, isidle, maxj);
>   return (rdp->dynticks_snap & 0x1) == 0;
>  }
>  
> @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
>   rsp->n_force_qs++;
>   if (fqs_state == RCU_SAVE_DYNTICK) {
>   /* Collect dyntick-idle snapshots. */
> + if (is_sysidle_rcu_state(rsp)) {
> + isidle = 1;

isidle = true;
the type of isidle is bool

> + maxj = jiffies - ULONG_MAX / 4;
> + }
>   force_qs_rnp(rsp, dyntick_save_progress_counter,
>&isidle, &maxj);
> + rcu_sysidle_report_gp(rsp, isidle, maxj);
>   fqs_state = RCU_FORCE_QS;
>   } else {
>   /* Handle dyntick-idle and offline CPUs. */
> + isidle = 0;

isidle = false;

>   force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
>   }
>   /* Clear flag to prevent immediate re-entry. */
> @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp,
>   cpu = rnp->grplo;
>   bit = 1;
>   for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
> - if ((rnp->qsmask & bit) != 0 &&
> - f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
> - mask |= bit;
> + if ((rnp->qsmask & bit) != 0) {
> + if ((rnp->qsmaskinit & bit) != 0)
> + *isidle = 0;

*isidle = false

> + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
> + mask |= bit;
> + }
>   }
>   if (mask != 0) {
>  
> diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> index 1895043..e0de5dc 100644
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu);
>  static bool init_nocb_callback_list(struct rcu_data *rdp);
>  static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
>  static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
> +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> +   unsigned long *maxj);
> +static bool is_sysidle_rcu_state(struct rcu_state *rsp);
> +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
> +   unsigned long maxj);
>  static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
>  
>  #endif /* #ifndef RCU_TREE_NONCORE */
> diff --git 


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-27 Thread Frederic Weisbecker
On Fri, Jul 26, 2013 at 03:52:12PM -0700, Paul E. McKenney wrote:
> On Thu, Jul 25, 2013 at 01:26:44AM +0200, Frederic Weisbecker wrote:
> > I don't know because I encountered some troubles to build it, I'm seeing
> > thousands of lines like this:
> > 
> > Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line 
> > 534.
> > /usr/bin/a2ping: not a GS output from gs -dSAFER
> > ./cartoons/whippersnapper300.eps -> ./cartoons/whippersnapper300.pdf
> > Name "main::opt_extra" used only once: possible typo at /usr/bin/a2ping 
> > line 546.
> > Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line 
> > 534.
> > /usr/bin/a2ping: not a GS output from gs -dSAFER
> > make: *** [embedfonts] Error 1
> 
> Strange.  My version of a2ping is Ubuntu 12.04's:
> 
> a2ping.pl 2.77p, 2006-11-15 -- Written by p...@fazekas.hu from April 2003.
> 
> You have something more recent?

I run Fedora 15 (yeah, too lazy to upgrade), and:

a2ping.pl 2.77p, 2004-04-28 -- Written by p...@fazekas.hu from April 2003

> > So back to the issue, I think we made nice progresses with my rusty brain ;-)
> > But just to be clear, I'm pasting that again for just a few precisions:
> > 
> > CPU 0                                    CPU 1
> > 
> >    cmpxchg(&full_sysidle_state,          //CPU 1 wakes up
> >            RCU_SYSIDLE_SHORT,            atomic_inc(rdtp(1)->dynticks_idle)
> >            RCU_SYSIDLE_LONG);
> > 
> >    smp_mb() //cmpxchg                    smp_mb()
> >    atomic_read(rdtp(1)->dynticks_idle)   ACCESS_ONCE(full_sysidle_state)
> >                                          //CPU 0 goes to sleep
> > 
> > 
> > 
> > 1) If CPU 0 sets RCU_SYSIDLE_LONG and sees dynticks_idle as even, do we
> > have the _guarantee_ that later CPU 1 sees full_sysidle_state ==
> > RCU_SYSIDLE_LONG (or any later full_sysidle_state value) due to the
> > connection between atomic_read / atomic_inc and the barriers that come
> > along?
> 
> No, we do not have this guarantee.
> 
> What happens instead is that CPU 0 later sets RCU_SYSIDLE_FULL after
> having again seen CPU 1's ->dynticks_idle having an even (idle) value.
> If CPU 1 later increments its ->dynticks_idle to an odd (non-idle) value,
> then does a memory barrier, and then reads full_sysidle_state, then CPU
> 1 is guaranteed to see RCU_SYSIDLE_LONG.  Please note that CPU 1 is -not-
> obligated to see RCU_SYSIDLE_FULL, much less RCU_SYSIDLE_NOTED.

Aah. I think I got it now. So lay this out like what follows:

  void cpu_0(int old, int new)
  {
       cmpxchg(&full_sysidle_state, old, new);
       smp_mb(); /* implied by cmpxchg() */
       atomic_read(rdtp(1)->dynticks_idle);
  }

  void cpu_1(void)
  {
       atomic_inc(rdtp(1)->dynticks_idle);
       smp_mb();
       ACCESS_ONCE(full_sysidle_state);
  }
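
(Rendered as a compilable C11 sketch for reference, with standard atomics
standing in for the kernel primitives; the two globals model
full_sysidle_state and rdtp(1)->dynticks_idle, and the seq_cst fences
model smp_mb(). This is a hedged re-rendering of the pattern above, not
kernel code:)

  #include <stdatomic.h>

  static atomic_int full_sysidle_state;  /* models the kernel variable */
  static atomic_int dynticks_idle;       /* models rdtp(1)->dynticks_idle */

  static void cpu_0(int old, int new)
  {
       /* cmpxchg(): value-returning RMW with full-barrier semantics. */
       atomic_compare_exchange_strong(&full_sysidle_state, &old, new);
       atomic_thread_fence(memory_order_seq_cst);  /* smp_mb() */
       (void)atomic_load_explicit(&dynticks_idle, memory_order_relaxed);
  }

  static void cpu_1(void)
  {
       atomic_fetch_add(&dynticks_idle, 1);        /* atomic_inc() */
       atomic_thread_fence(memory_order_seq_cst);  /* smp_mb() */
       (void)atomic_load_explicit(&full_sysidle_state, memory_order_relaxed);
  }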


And this scenario:

  CPU 0                                        CPU 1

  cpu_0(RCU_SYSIDLE_NOT, RCU_SYSIDLE_SHORT)
  cpu_0(RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG)
                                               cpu_1() // guaranteed to see at least RCU_SYSIDLE_SHORT
  cpu_0(RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL)
                                               cpu_1() // guaranteed to see at least RCU_SYSIDLE_LONG

IIUC, the double slash comments should be true.
If so I can observe that even if we don't have memory commit guarantees,
there seems to be some forward progress guarantee against the update and
read sequences.

What usually guarantees such forward progress on the sequences is when we
have some locking or atomic ops updates that also return a value
(inc_return, cmpxchg, ...) mirroring on both sides.

I can't find anything in atomic_ops.txt or in your book (ok sorry I only
read a few pages on the barriers chapter :o) that describes such forward
progress guarantee.

Would I retrieve such a guarantee on a generic sequence like this?

//A = B = 0

 CPU 0CPU 1

 store A = 1  store B = 1
 mb() mb()
 read B   read A
 
 store A = 2  store B = 2
 mb() mb()
 read B   read A

On the second sequence can I deduce that A and B as respectively read by
CPU 0 and CPU 1 are at least 1 and might be 2?

> 
> However, when CPU 1 does an atomic operation on full_sysidle_state that
> returns the old value, CPU 1 is guaranteed to get the most recent state.
> (Without this guarantee, we could have a cactus state of values for
> full_sysidle_state, which might make alternate-universe fans happy,
> but which would be really hard to program.)
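
(A short illustration of that distinction, in the same hedged C11 modeling
as the sketch above; RCU_SYSIDLE_NOT is 0 as in the patch, and the function
name is hypothetical. A plain load may lawfully return an older value, but
a value-returning read-modify-write always observes the latest value in
the variable's modification order:)

  #define RCU_SYSIDLE_NOT 0   /* Some CPU is not idle. */

  static int read_latest_state(void)
  {
       /*
        * atomic_exchange() both reads the most recent state and
        * forces the state machine back to non-idle, in the spirit
        * of rcu_sysidle_force_exit().
        */
       return atomic_exchange(&full_sysidle_state, RCU_SYSIDLE_NOT);
  }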

Haha! Yeah for the cmpxchg I'm ok.

> 
> This is why we need an RCU_SYSIDLE_LONG state in addition to
> RCU_SYSIDLE_SHORT and RCU_SYSIDLE_FULL.
> 
> > 2) You taught me once that barrier != memory committed, and it has been
> > one of the hardest trauma in my life. How can we be sure that CPU 1 sees
> > memory as committed from CPU 0? The only fact that we read an even value
> > from CPU 0 is enough for the connection between the atomic_read() and
> > atomic_inc() and all the barriers that come along?
> 
> My condolences on the trauma!  Now, if you will 

[PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-26 Thread Paul E. McKenney
From: "Paul E. McKenney" 

This commit adds the state machine that takes the per-CPU idle data
as input and produces a full-system-idle indication as output.  This
state machine is driven out of RCU's quiescent-state-forcing
mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU
idle state and then rcu_sysidle_report() to drive the state machine.

The full-system-idle state is sampled using rcu_sys_is_idle(), which
also drives the state machine if RCU is idle (and does so by forcing
RCU to become non-idle).  This function returns true if all but the
timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long
enough to avoid memory contention on the full_sysidle_state state
variable.  The rcu_sysidle_force_exit() may be called externally
to reset the state machine back into non-idle state.
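
(For orientation, a sketch of the full state ladder this patch introduces.
The first three defines appear in hunks quoted elsewhere in this thread;
the FULL and FULL_NOTED values follow the review discussion and are shown
as a sketch rather than a verbatim excerpt:)

  #define RCU_SYSIDLE_NOT         0  /* Some CPU is not idle. */
  #define RCU_SYSIDLE_SHORT       1  /* All CPUs idle for brief period. */
  #define RCU_SYSIDLE_LONG        2  /* All CPUs idle for long enough. */
  #define RCU_SYSIDLE_FULL        3  /* All CPUs idle, ready for sysidle. */
  #define RCU_SYSIDLE_FULL_NOTED  4  /* Actually entered sysidle state. */

  /*
   * Each force-quiescent-state pass in which every non-timekeeping CPU
   * remains idle advances the state one rung (NOT -> SHORT -> LONG ->
   * FULL -> FULL_NOTED); any CPU going non-idle, or a call to
   * rcu_sysidle_force_exit(), drops the machine back to RCU_SYSIDLE_NOT.
   */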

Signed-off-by: Paul E. McKenney <paul...@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweis...@gmail.com>
Cc: Steven Rostedt <rost...@goodmis.org>
---
 include/linux/rcupdate.h |  18 +++
 kernel/rcutree.c |  16 ++-
 kernel/rcutree.h |   5 +
 kernel/rcutree_plugin.h  | 284 ++-
 4 files changed, 316 insertions(+), 7 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 48f1ef9..1aa8d8c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 
 
+/* Only for use by adaptive-ticks code. */
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+extern bool rcu_sys_is_idle(void);
+extern void rcu_sysidle_force_exit(void);
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+static inline bool rcu_sys_is_idle(void)
+{
+   return false;
+}
+
+static inline void rcu_sysidle_force_exit(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 725524e..aa6d96e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 bool *isidle, unsigned long *maxj)
 {
rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+   rcu_sysidle_check_cpu(rdp, isidle, maxj);
return (rdp->dynticks_snap & 0x1) == 0;
 }
 
@@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
rsp->n_force_qs++;
if (fqs_state == RCU_SAVE_DYNTICK) {
/* Collect dyntick-idle snapshots. */
+   if (is_sysidle_rcu_state(rsp)) {
+   isidle = 1;
+   maxj = jiffies - ULONG_MAX / 4;
+   }
force_qs_rnp(rsp, dyntick_save_progress_counter,
 &isidle, &maxj);
+   rcu_sysidle_report_gp(rsp, isidle, maxj);
fqs_state = RCU_FORCE_QS;
} else {
/* Handle dyntick-idle and offline CPUs. */
+   isidle = 0;
force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
}
/* Clear flag to prevent immediate re-entry. */
@@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp,
cpu = rnp->grplo;
bit = 1;
for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
-   if ((rnp->qsmask & bit) != 0 &&
-   f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
-   mask |= bit;
+   if ((rnp->qsmask & bit) != 0) {
+   if ((rnp->qsmaskinit & bit) != 0)
+   *isidle = 0;
+   if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
+   mask |= bit;
+   }
}
if (mask != 0) {
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1895043..e0de5dc 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu);
 static bool init_nocb_callback_list(struct rcu_data *rdp);
 static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
 static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+ unsigned long *maxj);
+static bool is_sysidle_rcu_state(struct rcu_state *rsp);
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+ unsigned long maxj);
 static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
 
 #endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3edae39..ff84bed 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,7 +28,7 @@
 #include <linux/gfp.h>
 #include <linux/oom.h>
 #include <linux/smpboot.h>
-#include <linux/tick.h>
+#include "time/tick-internal.h"
 
 #define RCU_KTHREAD_PRIO 1
 
@@ -2395,12 +2395,12 @@ static void 

Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-26 Thread Paul E. McKenney
On Thu, Jul 25, 2013 at 01:26:44AM +0200, Frederic Weisbecker wrote:
> On Wed, Jul 24, 2013 at 03:09:02PM -0700, Paul E. McKenney wrote:
> > On Wed, Jul 24, 2013 at 08:09:04PM +0200, Frederic Weisbecker wrote:
> > > On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote:
> > > > > Lets summarize the last sequence, the following happens ordered by 
> > > > > time:
> > > > > 
> > > > > CPU 0  CPU 1
> > > > > 
> > > > >  cmpxchg(&full_sysidle_state,
> > > > >  RCU_SYSIDLE_SHORT,
> > > > >  RCU_SYSIDLE_LONG);
> > > > > 
> > > > >  smp_mb() //cmpxchg
> > > > > 
> > > > >  atomic_read(rdtp(1)->dynticks_idle)
> > > > > 
> > > > >  //CPU 0 goes to sleep
> > > > >//CPU 1 wakes up
> > > > >
> > > > > atomic_inc(rdtp(1)->dynticks_idle)
> > > > > 
> > > > >smp_mb()
> > > > > 
> > > > >ACCESS_ONCE(full_sysidle_state)
> > > > > 
> > > > > 
> > > > > Are you suggesting that because the CPU 1 executes its atomic_inc() 
> > > > > _after_ (in terms
> > > > > of absolute time) the atomic_read of CPU 0, the ordering settled in 
> > > > > both sides guarantees
> > > > > that the value read from CPU 1 is the one from the cmpxchg that 
> > > > > precedes the atomic_read,
> > > > > or FULL or FULL_NOTED that happen later.
> > > > > 
> > > > > If so that's a big lesson for me. 
> > > > 
> > > > It is not absolute time that matters.  Instead, it is the fact that
> > > > CPU 0, when reading from ->dynticks_idle, read the old value before the
> > > > atomic_inc().  Therefore, anything CPU 0 did before that memory barrier
> > > > preceding CPU 0's read must come before anything CPU 1 did after that
> > > > memory barrier following the atomic_inc().  For this to work, there
> > > > must be some access to the same variable on each CPU.
> > > 
> > > Aren't we in the following situation?
> > > 
> > > CPU 0  CPU 1
> > > 
> > > STORE ASTORE B
> > > LOAD B LOAD A
> > > 
> > > 
> > > If so and referring to your perfbook, this is an "ears to mouth" 
> > > situation.
> > > And it seems to describe there is no strong guarantee in that situation.
> > 
> > "Yes" to the first, but on modern hardware, "no" to the second.  The key
> > paragraph is Section 12.2.4.5:
> > 
> > The following pairings from Table 12.1 can be used on modern
> > hardware, but might fail on some systems that were produced in
> > the 1990s. However, these can safely be used on all mainstream
> > hardware introduced since the year 2000.
> 
> Right I missed that!

Nor are you alone.  ;-)

> > That said, you are not the first to be confused by this, so I might need
> > to rework this section to make it clear that each can in fact be used on
> > modern hardware.
> > 
> > If you happen to have an old Sequent NUMA-Q or Symmetry box lying around,
> > things are a bit different.  On the other hand, I don't believe that any
> > of these old boxes are still running Linux.  (Hey, I am as sentimental as
> > the next guy, but there are limits!)
> > 
> > I updated this section and pushed it, please let me know if this helps!
> 
> I don't know because I encountered some troubles to build it, I'm seeing
> thousands of lines like this:
> 
> Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line 
> 534.
> /usr/bin/a2ping: not a GS output from gs -dSAFER
> ./cartoons/whippersnapper300.eps -> ./cartoons/whippersnapper300.pdf
> Name "main::opt_extra" used only once: possible typo at /usr/bin/a2ping line 
> 546.
> Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line 
> 534.
> /usr/bin/a2ping: not a GS output from gs -dSAFER
> make: *** [embedfonts] Error 1

Strange.  My version of a2ping is Ubuntu 12.04's:

a2ping.pl 2.77p, 2006-11-15 -- Written by p...@fazekas.hu from April 2003.

You have something more recent?

> Anyway I looked at the diff and it looks indeed clearer, thanks!

Glad it helped!

> So back to the issue, I think we made nice progresses with my rusty brain ;-)
> But just to be clear, I'm pasting that again for just a few precisions:
> 
> CPU 0                                    CPU 1
> 
>    cmpxchg(&full_sysidle_state,          //CPU 1 wakes up
>            RCU_SYSIDLE_SHORT,            atomic_inc(rdtp(1)->dynticks_idle)
>            RCU_SYSIDLE_LONG);
> 
>    smp_mb() //cmpxchg                    smp_mb()
>    atomic_read(rdtp(1)->dynticks_idle)   ACCESS_ONCE(full_sysidle_state)
>                                          //CPU 0 goes to sleep
> 
> 
> 
> 1) If CPU 0 sets RCU_SYSIDLE_LONG and sees dynticks_idle as even, do we
> have the _guarantee_ that later CPU 1 sees full_sysidle_state ==
> RCU_SYSIDLE_LONG (or any later full_sysidle_state value) due to the
> connection between atomic_read / atomic_inc and the barriers that come
> along?

No, we do not have this guarantee.

What happens instead is that CPU 0 later sets RCU_SYSIDLE_FULL after
having again seen CPU 1's ->dynticks_idle having an even (idle) value.
If CPU 1 later increments its ->dynticks_idle to an odd (non-idle) value,
then does a memory barrier, and then reads full_sysidle_state, then CPU 1
is guaranteed to see RCU_SYSIDLE_LONG.  Please note that CPU 1 is -not-
obligated to see RCU_SYSIDLE_FULL, much less RCU_SYSIDLE_NOTED.


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-24 Thread Frederic Weisbecker
On Wed, Jul 24, 2013 at 03:09:02PM -0700, Paul E. McKenney wrote:
> On Wed, Jul 24, 2013 at 08:09:04PM +0200, Frederic Weisbecker wrote:
> > On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote:
> > > > Lets summarize the last sequence, the following happens ordered by time:
> > > > 
> > > > CPU 0  CPU 1
> > > > 
> > > >  cmpxchg(&full_sysidle_state,
> > > >  RCU_SYSIDLE_SHORT,
> > > >  RCU_SYSIDLE_LONG);
> > > > 
> > > >  smp_mb() //cmpxchg
> > > > 
> > > >  atomic_read(rdtp(1)->dynticks_idle)
> > > > 
> > > >  //CPU 0 goes to sleep
> > > >//CPU 1 wakes up
> > > >
> > > > atomic_inc(rdtp(1)->dynticks_idle)
> > > > 
> > > >smp_mb()
> > > > 
> > > >ACCESS_ONCE(full_sysidle_state)
> > > > 
> > > > 
> > > > Are you suggesting that because the CPU 1 executes its atomic_inc() 
> > > > _after_ (in terms
> > > > of absolute time) the atomic_read of CPU 0, the ordering settled in 
> > > > both sides guarantees
> > > > that the value read from CPU 1 is the one from the cmpxchg that 
> > > > precedes the atomic_read,
> > > > or FULL or FULL_NOTED that happen later.
> > > > 
> > > > If so that's a big lesson for me. 
> > > 
> > > It is not absolute time that matters.  Instead, it is the fact that
> > > CPU 0, when reading from ->dynticks_idle, read the old value before the
> > > atomic_inc().  Therefore, anything CPU 0 did before that memory barrier
> > > preceding CPU 0's read must come before anything CPU 1 did after that
> > > memory barrier following the atomic_inc().  For this to work, there
> > > must be some access to the same variable on each CPU.
> > 
> > Aren't we in the following situation?
> > 
> > CPU 0  CPU 1
> > 
> > STORE ASTORE B
> > LOAD B LOAD A
> > 
> > 
> > If so and referring to your perfbook, this is an "ears to mouth" situation.
> > And it seems to describe there is no strong guarantee in that situation.
> 
> "Yes" to the first, but on modern hardware, "no" to the second.  The key
> paragraph is Section 12.2.4.5:
> 
>   The following pairings from Table 12.1 can be used on modern
>   hardware, but might fail on some systems that were produced in
>   the 1990s. However, these can safely be used on all mainstream
>   hardware introduced since the year 2000.

Right I missed that!

> 
> That said, you are not the first to be confused by this, so I might need
> to rework this section to make it clear that each can in fact be used on
> modern hardware.
> 
> If you happen to have an old Sequent NUMA-Q or Symmetry box lying around,
> things are a bit different.  On the other hand, I don't believe that any
> of these old boxes are still running Linux.  (Hey, I am as sentimental as
> the next guy, but there are limits!)
> 
> I updated this section and pushed it, please let me know if this helps!

I don't know, because I ran into some trouble building it; I'm seeing
thousands of lines like this:

Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line 534.
/usr/bin/a2ping: not a GS output from gs -dSAFER
./cartoons/whippersnapper300.eps -> ./cartoons/whippersnapper300.pdf
Name "main::opt_extra" used only once: possible typo at /usr/bin/a2ping line 546.
Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line 534.
/usr/bin/a2ping: not a GS output from gs -dSAFER
make: *** [embedfonts] Error 1

Anyway I looked at the diff and it looks indeed clearer, thanks!

So back to the issue. I think we made nice progress with my rusty brain ;-)
But just to be clear, I'm pasting that again for just a few precisions:

CPU 0                                     CPU 1

cmpxchg(&full_sysidle_state,              //CPU 1 wakes up
        RCU_SYSIDLE_SHORT,                atomic_inc(rdtp(1)->dynticks_idle)
        RCU_SYSIDLE_LONG);

smp_mb() //cmpxchg                        smp_mb()
atomic_read(rdtp(1)->dynticks_idle)       ACCESS_ONCE(full_sysidle_state)
//CPU 0 goes to sleep



1) If CPU 0 sets RCU_SYSIDLE_LONG and sees dynticks_idle as even, do we have
the _guarantee_ that later CPU 1 sees full_sysidle_state == RCU_SYSIDLE_LONG
(or any later full_sysidle_state value) due to the connection between
atomic_read / atomic_inc and the barriers that come along?

2) You taught me once that barrier != memory committed, and it has been one of
the hardest traumas in my life. How can we be sure that CPU 1 sees memory as
committed from CPU 0? Is the mere fact that CPU 0 read an even value enough,
given the connection between the atomic_read() and atomic_inc() and all the
barriers that come along?

3) In your book it says: "recent hardware would guarantee that at least one of
the loads saw the value stored by the corresponding store."

At least one? So in our example, CPU 0 could see dynticks_idle as even (success
to see some prior store done in CPU 1) but following the above statement 
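
(The pattern in questions 1-3 is the classic store-buffering shape.  A
minimal user-space sketch of the "at least one load sees the store"
guarantee, assuming C11 seq_cst operations as stand-ins for the kernel's
smp_mb() pairing, with x and y playing the roles of full_sysidle_state and
->dynticks_idle:)

	#include <stdatomic.h>
	#include <pthread.h>
	#include <assert.h>

	/* Store-buffering litmus test: with a full barrier between the
	 * store and the load on each side, the outcome r0 == 0 && r1 == 0
	 * is forbidden, i.e. at least one load sees the other's store. */
	static atomic_int x, y;
	static int r0, r1;

	static void *cpu0(void *arg)
	{
		atomic_store(&x, 1);	/* cmpxchg(&full_sysidle_state, ...) */
		r0 = atomic_load(&y);	/* atomic_read(->dynticks_idle)      */
		return NULL;
	}

	static void *cpu1(void *arg)
	{
		atomic_store(&y, 1);	/* atomic_inc(->dynticks_idle)       */
		r1 = atomic_load(&x);	/* ACCESS_ONCE(full_sysidle_state)   */
		return NULL;
	}

	int main(void)
	{
		pthread_t t0, t1;

		pthread_create(&t0, NULL, cpu0, NULL);
		pthread_create(&t1, NULL, cpu1, NULL);
		pthread_join(t0, NULL);
		pthread_join(t1, NULL);
		assert(r0 == 1 || r1 == 1);	/* 0/0 cannot happen */
		return 0;
	}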

Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-24 Thread Paul E. McKenney
On Wed, Jul 24, 2013 at 08:09:04PM +0200, Frederic Weisbecker wrote:
> On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote:
> > > Lets summarize the last sequence, the following happens ordered by time:
> > > 
> > > CPU 0  CPU 1
> > > 
> > >  cmpxchg(&full_sysidle_state,
> > >  RCU_SYSIDLE_SHORT,
> > >  RCU_SYSIDLE_LONG);
> > > 
> > >  smp_mb() //cmpxchg
> > > 
> > >  atomic_read(rdtp(1)->dynticks_idle)
> > > 
> > >  //CPU 0 goes to sleep
> > >//CPU 1 wakes up
> > >atomic_inc(rdtp(1)->dynticks_idle)
> > > 
> > >smp_mb()
> > > 
> > >ACCESS_ONCE(full_sysidle_state)
> > > 
> > > 
> > > Are you suggesting that because the CPU 1 executes its atomic_inc()
> > > _after_ (in terms of absolute time) the atomic_read of CPU 0, the
> > > ordering settled in both sides guarantees that the value read from
> > > CPU 1 is the one from the cmpxchg that precedes the atomic_read, or
> > > FULL or FULL_NOTED that happen later.
> > > 
> > > If so that's a big lesson for me. 
> > 
> > It is not absolute time that matters.  Instead, it is the fact that
> > CPU 0, when reading from ->dynticks_idle, read the old value before the
> > atomic_inc().  Therefore, anything CPU 0 did before that memory barrier
> > preceding CPU 0's read must come before anything CPU 1 did after that
> > memory barrier following the atomic_inc().  For this to work, there
> > must be some access to the same variable on each CPU.
> 
> Aren't we in the following situation?
> 
> CPU 0  CPU 1
> 
> STORE ASTORE B
> LOAD B LOAD A
> 
> 
> If so and referring to your perfbook, this is an "ears to mouth" situation.
> And it seems to describe there is no strong guarantee in that situation.

"Yes" to the first, but on modern hardware, "no" to the second.  The key
paragraph is Section 12.2.4.5:

The following pairings from Table 12.1 can be used on modern
hardware, but might fail on some systems that were produced in
the 1990s. However, these can safely be used on all mainstream
hardware introduced since the year 2000.

That said, you are not the first to be confused by this, so I might need
to rework this section to make it clear that each can in fact be used on
modern hardware.

If you happen to have an old Sequent NUMA-Q or Symmetry box lying around,
things are a bit different.  On the other hand, I don't believe that any
of these old boxes are still running Linux.  (Hey, I am as sentimental as
the next guy, but there are limits!)

I updated this section and pushed it, please let me know if this helps!

Thanx, Paul

> > Or, if you must think in terms of time, you need a separate independent
> > timeline for each variable, with no direct mapping from one timeline to
> > another, except resulting from memory-barrier interactions.
> > 
> > Thanx, Paul
> > 
> 



Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-24 Thread Frederic Weisbecker
On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote:
> > Lets summarize the last sequence, the following happens ordered by time:
> > 
> > CPU 0  CPU 1
> > 
> >  cmpxchg(&full_sysidle_state,
> >  RCU_SYSIDLE_SHORT,
> >  RCU_SYSIDLE_LONG);
> > 
> >  smp_mb() //cmpxchg
> > 
> >  atomic_read(rdtp(1)->dynticks_idle)
> > 
> >  //CPU 0 goes to sleep
> >//CPU 1 wakes up
> >atomic_inc(rdtp(1)->dynticks_idle)
> > 
> >smp_mb()
> > 
> >ACCESS_ONCE(full_sysidle_state)
> > 
> > 
> > Are you suggesting that because the CPU 1 executes its atomic_inc()
> > _after_ (in terms of absolute time) the atomic_read of CPU 0, the
> > ordering settled in both sides guarantees that the value read from CPU 1
> > is the one from the cmpxchg that precedes the atomic_read, or FULL or
> > FULL_NOTED that happen later.
> > 
> > If so that's a big lesson for me. 
> 
> It is not absolute time that matters.  Instead, it is the fact that
> CPU 0, when reading from ->dynticks_idle, read the old value before the
> atomic_inc().  Therefore, anything CPU 0 did before that memory barrier
> preceding CPU 0's read must come before anything CPU 1 did after that
> memory barrier following the atomic_inc().  For this to work, there
> must be some access to the same variable on each CPU.

Aren't we in the following situation?

CPU 0  CPU 1

STORE ASTORE B
LOAD B LOAD A


If so and referring to your perfbook, this is an "ears to mouth" situation.
And it seems to describe there is no strong guarantee in that situation.

> 
> Or, if you must think in terms of time, you need a separate independent
> timeline for each variable, with no direct mapping from one timeline to
> another, except resulting from memory-barrier interactions.
> 
>   Thanx, Paul
> 


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Paul E. McKenney
On Fri, Jul 19, 2013 at 04:12:08AM +0200, Frederic Weisbecker wrote:
> On Thu, Jul 18, 2013 at 05:24:08PM -0700, Paul E. McKenney wrote:
> > On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote:
> > > On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote:
> > > > 1. Some CPU coming out of idle:
> > > > 
> > > > o   rcu_sysidle_exit():
> > > > 
> > > > smp_mb__before_atomic_inc();
> > > > atomic_inc(&rdtp->dynticks_idle);
> > > > smp_mb__after_atomic_inc(); /* A */
> > > > 
> > > > o   rcu_sysidle_force_exit():
> > > > 
> > > > oldstate = ACCESS_ONCE(full_sysidle_state);
> > > > 
> > > > 2. RCU GP kthread:
> > > > 
> > > > o   rcu_sysidle():
> > > > 
> > > > cmpxchg(&full_sysidle_state, RCU_SYSIDLE_SHORT,
> > > > RCU_SYSIDLE_LONG);
> > > > /* B */
> > > > 
> > > > o   rcu_sysidle_check_cpu():
> > > > 
> > > > cur = atomic_read(&rdtp->dynticks_idle);
> > > > 
> > > > Memory barrier A pairs with memory barrier B, so that if #1's load
> > > > from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's
> > > > atomic_inc() must be visible to #2's atomic_read().  This will cause #2
> > > > to recognize that the CPU came out of idle, which will in turn cause it
> > > > to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in
> > > > full_sysidle_state being set to RCU_SYSIDLE_NOT.
> > > 
> > > Ok I get it for that direction.
> > > Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays
> > > so.
> > > 
> > > CPU 0 then rounds and see that all CPUs are idle, until it finally sets
> > > up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep.
> > > 
> > > Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT
> > > otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it
> > > send the IPI.
> > > 
> > > What provides the guarantee that CPU 1 sees a value above
> > > RCU_SYSIDLE_SHORT? Not on the cmpxchg but when it first dereferences
> > > with ACCESS_ONCE.
> > 
> > The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT,
> > scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving
> > to RCU_SYSIDLE_FULL.  Given CPU 1 has been idle all this time, CPU 0
> > will have read its ->dynticks_idle counter on each scan and seen it
> > having an even value.  When CPU 1 comes out of idle, it will atomically
> > increment its ->dyntick_idle(), which will happen after CPU 0's read of
> > ->dyntick_idle() during its last scan.
> > 
> > Because of the memory-barrier pairing above, this means that CPU
> > 1's read from full_sysidle_state must follow the cmpxchg() that
> > set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily
> > the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and
> > RCU_SYSIDLE_FULL_NOTED).  But because RCU_SYSIDLE_LONG is greater than
> > RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period.
> 
> Lets summarize the last sequence, the following happens ordered by time:
> 
> CPU 0  CPU 1
> 
>  cmpxchg(&full_sysidle_state,
>  RCU_SYSIDLE_SHORT,
>  RCU_SYSIDLE_LONG);
> 
>  smp_mb() //cmpxchg
> 
>  atomic_read(rdtp(1)->dynticks_idle)
> 
>  //CPU 0 goes to sleep
>//CPU 1 wakes up
>atomic_inc(rdtp(1)->dynticks_idle)
> 
>smp_mb()
> 
>ACCESS_ONCE(full_sysidle_state)
> 
> 
> Are you suggesting that because the CPU 1 executes its atomic_inc()
> _after_ (in terms of absolute time) the atomic_read of CPU 0, the ordering
> settled in both sides guarantees that the value read from CPU 1 is the one
> from the cmpxchg that precedes the atomic_read, or FULL or FULL_NOTED that
> happen later.
> 
> If so that's a big lesson for me. 

It is not absolute time that matters.  Instead, it is the fact that
CPU 0, when reading from ->dynticks_idle, read the old value before the
atomic_inc().  Therefore, anything CPU 0 did before that memory barrier
preceding CPU 0's read must come before anything CPU 1 did after that
memory barrier following the atomic_inc().  For this to work, there
must be some access to the same variable on each CPU.

Or, if you must think in terms of time, you need a separate independent
timeline for each variable, with no direct mapping from one timeline to
another, except resulting from memory-barrier interactions.
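
(The per-variable-timeline point can be checked with a small C11 sketch,
using seq_cst operations as stand-ins for the smp_mb() pairs; the names are
illustrative, not the kernel code:)

	#include <stdatomic.h>

	static atomic_int data;	/* something CPU 0 wrote before its barrier */
	static atomic_int flag;	/* the variable both CPUs access            */

	static int cpu0(void)
	{
		atomic_store(&data, 1);		/* before the barrier    */
		return atomic_load(&flag);	/* may see the old value */
	}

	static int cpu1(void)
	{
		atomic_fetch_add(&flag, 1);	/* the atomic_inc()   */
		return atomic_load(&data);	/* after the barrier  */
	}

	/* If cpu0() read the old (pre-increment) flag, then cpu1() must
	 * read data == 1: everything CPU 0 did before its barrier is
	 * visible to everything CPU 1 does after its barrier, precisely
	 * because both sides accessed the same variable, flag. */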

Thanx, Paul



Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Frederic Weisbecker
On Thu, Jul 18, 2013 at 05:24:08PM -0700, Paul E. McKenney wrote:
> On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote:
> > On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote:
> > > 1. Some CPU coming out of idle:
> > > 
> > > o rcu_sysidle_exit():
> > > 
> > >   smp_mb__before_atomic_inc();
> > >   atomic_inc(&rdtp->dynticks_idle);
> > >   smp_mb__after_atomic_inc(); /* A */
> > > 
> > > o rcu_sysidle_force_exit():
> > > 
> > >   oldstate = ACCESS_ONCE(full_sysidle_state);
> > > 
> > > 2. RCU GP kthread:
> > > 
> > > o rcu_sysidle():
> > > 
> > >   cmpxchg(&full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
> > >   /* B */
> > > 
> > > o rcu_sysidle_check_cpu():
> > > 
> > >   cur = atomic_read(&rdtp->dynticks_idle);
> > > 
> > > Memory barrier A pairs with memory barrier B, so that if #1's load
> > > from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's
> > > atomic_inc() must be visible to #2's atomic_read().  This will cause #2
> > > to recognize that the CPU came out of idle, which will in turn cause it
> > > to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in
> > > full_sysidle_state being set to RCU_SYSIDLE_NOT.
> > 
> > Ok I get it for that direction.
> > Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays
> > so.
> > 
> > CPU 0 then rounds and see that all CPUs are idle, until it finally sets
> > up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep.
> > 
> > Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT
> > otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send
> > the IPI.
> > 
> > What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT?
> > Not on the cmpxchg but when it first dereferences with ACCESS_ONCE.
> 
> The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT,
> scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving
> to RCU_SYSIDLE_FULL.  Given CPU 1 has been idle all this time, CPU 0
> will have read its ->dynticks_idle counter on each scan and seen it
> having an even value.  When CPU 1 comes out of idle, it will atomically
> increment its ->dyntick_idle(), which will happen after CPU 0's read of
> ->dyntick_idle() during its last scan.
> 
> Because of the memory-barrier pairing above, this means that CPU
> 1's read from full_sysidle_state must follow the cmpxchg() that
> set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily
> the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and
> RCU_SYSIDLE_FULL_NOTED).  But because RCU_SYSIDLE_LONG is greater than
> RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period.

Let's summarize the last sequence; the following happens, ordered by time:

CPU 0  CPU 1

 cmpxchg(&full_sysidle_state,
 RCU_SYSIDLE_SHORT,
 RCU_SYSIDLE_LONG);

 smp_mb() //cmpxchg

 atomic_read(rdtp(1)->dynticks_idle)

 //CPU 0 goes to sleep
   //CPU 1 wakes up
   atomic_inc(rdtp(1)->dynticks_idle)

   smp_mb()

   ACCESS_ONCE(full_sysidle_state)


Are you suggesting that because the CPU 1 executes its atomic_inc() _after_
(in terms of absolute time) the atomic_read of CPU 0, the ordering settled in
both sides guarantees that the value read from CPU 1 is the one from the
cmpxchg that precedes the atomic_read, or FULL or FULL_NOTED that happen
later.

If so that's a big lesson for me. 


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Paul E. McKenney
On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote:
> On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote:
> > On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote:
> > > On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote:
> > > > On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote:
> > > > > So it's like:
> > > > > 
> > > > > CPU 0  CPU 1
> > > > > 
> > > > > read I write I
> > > > > smp_mb()   smp_mb()
> > > > > cmpxchg S  read S
> > > > > 
> > > > I still can't find what guarantees we don't read a value in CPU 1
> > > > that is way below what we want.
> > > > 
> > > > One key point is that there is a second cycle from LONG to FULL.
> > > > 
> > > > (Not saying that there is not a bug -- there might well be.  In fact,
> > > > I am starting to think that I need to do another Promela model...
> > > 
> > > Now I'm very confused :)
> > 
> > To quote a Nobel Laureate who presented at an ISEF here in Portland some
> > years back, "Confusion is the most productive state of mind."  ;-)
> 
> Then I must be a very productive guy!

So that is your secret!  ;-)

> > > I'm far from being a specialist on these matters but I would really love 
> > > to
> > > understand this patchset. Is there any documentation somewhere I can read
> > > that could help, something about cycles of committed memory or something?
> > 
> > Documentation/memory-barriers.txt should suffice for this.  If you want
> > more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf
> > 
> > But memory-barrier pairing suffices here.  Here is case 2 from my
> > earlier email in more detail.  The comments with capital letters
> > mark important memory barriers, some of which are buried in atomic
> > operations.
> > 
> > 1. Some CPU coming out of idle:
> > 
> > o   rcu_sysidle_exit():
> > 
> > smp_mb__before_atomic_inc();
> > atomic_inc(&rdtp->dynticks_idle);
> > smp_mb__after_atomic_inc(); /* A */
> > 
> > o   rcu_sysidle_force_exit():
> > 
> > oldstate = ACCESS_ONCE(full_sysidle_state);
> > 
> > 2. RCU GP kthread:
> > 
> > o   rcu_sysidle():
> > 
> > cmpxchg(&full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
> > /* B */
> > 
> > o   rcu_sysidle_check_cpu():
> > 
> > cur = atomic_read(&rdtp->dynticks_idle);
> > 
> > Memory barrier A pairs with memory barrier B, so that if #1's load
> > from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's
> > atomic_inc() must be visible to #2's atomic_read().  This will cause #2
> > to recognize that the CPU came out of idle, which will in turn cause it
> > to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in
> > full_sysidle_state being set to RCU_SYSIDLE_NOT.
> 
> Ok I get it for that direction.
> Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays
> so.
> 
> CPU 0 then rounds and see that all CPUs are idle, until it finally sets
> up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep.
> 
> Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT
> otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send
> the IPI.
> 
> What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT?
> Not on the cmpxchg but when it first dereferences with ACCESS_ONCE.

The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT,
scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving
to RCU_SYSIDLE_FULL.  Given CPU 1 has been idle all this time, CPU 0
will have read its ->dynticks_idle counter on each scan and seen it
having an even value.  When CPU 1 comes out of idle, it will atomically
increment its ->dyntick_idle(), which will happen after CPU 0's read of
->dyntick_idle() during its last scan.

Because of the memory-barrier pairing above, this means that CPU
1's read from full_sysidle_state must follow the cmpxchg() that
set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily
the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and
RCU_SYSIDLE_FULL_NOTED).  But because RCU_SYSIDLE_LONG is greater than
RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period.
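
(The scan/advance discipline being described might be sketched as follows:
a simplified user-space model in C11 with an abbreviated state list, leaving
out the timing, the small-system path, and the wakeup side:)

	#include <stdatomic.h>
	#include <stdbool.h>

	#define NR_CPUS 4

	enum { RCU_SYSIDLE_NOT, RCU_SYSIDLE_SHORT,
	       RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL };

	static atomic_int dynticks_idle[NR_CPUS];	/* even = idle */
	static atomic_int full_sysidle_state;

	/* One full scan: advance the state machine by at most one step,
	 * and only if every CPU was seen idle (even) on this pass.  A CPU
	 * waking up can therefore have missed at most the most recent
	 * transition, never two of them. */
	static void gp_scan_once(void)
	{
		bool all_idle = true;
		int cpu, s;

		for (cpu = 0; cpu < NR_CPUS; cpu++)
			if (atomic_load(&dynticks_idle[cpu]) & 1)
				all_idle = false;

		s = atomic_load(&full_sysidle_state);
		if (!all_idle)
			atomic_store(&full_sysidle_state, RCU_SYSIDLE_NOT);
		else if (s < RCU_SYSIDLE_FULL)
			atomic_compare_exchange_strong(&full_sysidle_state,
						       &s, s + 1);
	}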

Thanx, Paul



Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Frederic Weisbecker
On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote:
> On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote:
> > On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote:
> > > On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote:
> > > > So it's like:
> > > > 
> > > > CPU 0  CPU 1
> > > > 
> > > > read I write I
> > > > smp_mb()   smp_mb()
> > > > cmpxchg S  read S
> > > > 
> > > > I still can't find what guarantees we don't read a value in CPU 1
> > > > that is way below what we want.
> > > 
> > > One key point is that there is a second cycle from LONG to FULL.
> > > 
> > > (Not saying that there is not a bug -- there might well be.  In fact,
> > > I am starting to think that I need to do another Promela model...
> > 
> > Now I'm very confused :)
> 
> To quote a Nobel Laureate who presented at an ISEF here in Portland some
> years back, "Confusion is the most productive state of mind."  ;-)

Then I must be a very productive guy!

> 
> > I'm far from being a specialist on these matters but I would really love to
> > understand this patchset. Is there any documentation somewhere I can read
> > that could help, something about cycles of committed memory or something?
> 
> Documentation/memory-barriers.txt should suffice for this.  If you want
> more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf
> 
> But memory-barrier pairing suffices here.  Here is case 2 from my
> earlier email in more detail.  The comments with capital letters
> mark important memory barriers, some of which are buried in atomic
> operations.
> 
> 1. Some CPU coming out of idle:
> 
> o rcu_sysidle_exit():
> 
>   smp_mb__before_atomic_inc();
>   atomic_inc(&rdtp->dynticks_idle);
>   smp_mb__after_atomic_inc(); /* A */
> 
> o rcu_sysidle_force_exit():
> 
>   oldstate = ACCESS_ONCE(full_sysidle_state);
> 
> 2. RCU GP kthread:
> 
> o rcu_sysidle():
> 
>   cmpxchg(&full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
>   /* B */
> 
> o rcu_sysidle_check_cpu():
> 
>   cur = atomic_read(&rdtp->dynticks_idle);
> 
> Memory barrier A pairs with memory barrier B, so that if #1's load
> from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's
> atomic_inc() must be visible to #2's atomic_read().  This will cause #2
> to recognize that the CPU came out of idle, which will in turn cause it
> to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in
> full_sysidle_state being set to RCU_SYSIDLE_NOT.

Ok I get it for that direction.
Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays
so.

CPU 0 then makes its rounds and sees that all CPUs are idle, until it
finally sets up RCU_SYSIDLE_SHORT_FULL and goes to sleep.

Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT
otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send
the IPI.

What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT?
Not on the cmpxchg but when it first dereferences with ACCESS_ONCE.


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Paul E. McKenney
On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote:
> On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote:
> > On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote:
> > > So it's like:
> > > 
> > > CPU 0  CPU 1
> > > 
> > > read I write I
> > > smp_mb()   smp_mb()
> > > cmpxchg S  read S
> > > 
> > > I still can't find what guarantees we don't read a value in CPU 1 that
> > > is way below what we want.
> > 
> > One key point is that there is a second cycle from LONG to FULL.
> > 
> > (Not saying that there is not a bug -- there might well be.  In fact,
> > I am starting to think that I need to do another Promela model...
> 
> Now I'm very confused :)

To quote a Nobel Laureate who presented at an ISEF here in Portland some
years back, "Confusion is the most productive state of mind."  ;-)

> I'm far from being a specialist on these matters but I would really love to
> understand this patchset. Is there any documentation somewhere I can read
> that could help, something about cycles of committed memory or something?

Documentation/memory-barriers.txt should suffice for this.  If you want
more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf

But memory-barrier pairing suffices here.  Here is case 2 from my
earlier email in more detail.  The comments with capital letters
mark important memory barriers, some of which are buried in atomic
operations.

1. Some CPU coming out of idle:

o   rcu_sysidle_exit():

smp_mb__before_atomic_inc();
atomic_inc(&rdtp->dynticks_idle);
smp_mb__after_atomic_inc(); /* A */

o   rcu_sysidle_force_exit():

oldstate = ACCESS_ONCE(full_sysidle_state);

2. RCU GP kthread:

o   rcu_sysidle():

cmpxchg(&full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
/* B */

o   rcu_sysidle_check_cpu():

cur = atomic_read(&rdtp->dynticks_idle);

Memory barrier A pairs with memory barrier B, so that if #1's load
from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's
atomic_inc() must be visible to #2's atomic_read().  This will cause #2
to recognize that the CPU came out of idle, which will in turn cause it
to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in
full_sysidle_state being set to RCU_SYSIDLE_NOT.
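
(Restated as a stand-alone sketch, with C11 seq_cst atomics standing in for
the smp_mb()-style barriers A and B; the helpers are simplifications of the
functions named above, not the kernel code:)

	#include <stdatomic.h>

	enum { RCU_SYSIDLE_NOT, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG };

	static atomic_int full_sysidle_state;
	static atomic_int dynticks_idle;  /* even = idle, odd = non-idle */

	/* #1: CPU coming out of idle (cf. rcu_sysidle_exit()). */
	static int idle_exit_oldstate(void)
	{
		atomic_fetch_add(&dynticks_idle, 1);	/* inc + barrier A */
		return atomic_load(&full_sysidle_state);
	}

	/* #2: RCU GP kthread (cf. rcu_sysidle()/rcu_sysidle_check_cpu()).
	 * Returns nonzero if the counter was sampled odd, i.e. the CPU
	 * was seen as non-idle and the state machine must be cancelled. */
	static int gp_kthread_sees_nonidle(void)
	{
		int expected = RCU_SYSIDLE_SHORT;

		atomic_compare_exchange_strong(&full_sysidle_state, &expected,
					       RCU_SYSIDLE_LONG); /* barrier B */
		return atomic_load(&dynticks_idle) & 1;
	}

	/* The pairing forbids the combination "#1 reads RCU_SYSIDLE_SHORT"
	 * and "#2 reads an even counter": one side must see the other. */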

Thanx, Paul

> > > > Unfortunately, the reasoning in #2 above does not hold in the small-CPU
> > > > case because there is the possibility of both the timekeeping CPU and
> > > > the RCU grace-period kthread concurrently advancing the state machine.
> > > > This would be bad, good catch!!!
> > > 
> > > It's not like I spotted anything myself but you're welcome :)
> > 
> > I will take them any way I can get them.  ;-)
> > 
> > > > The patch below (untested) is an attempt to fix this.  If it actually
> > > > works, I will merge it in with 6/7.
> > > > 
> > > > Anything else I missed?  ;-)
> > > 
> > > Well I guess I'll wait one more night before trying to understand
> > > the below ;)
> > 
> > The key point is that the added check means that either the timekeeping
> > CPU is advancing the state machine (if there are few CPUs) or the
> > RCU grace-period kthread is (if there are many CPUs), but never both.
> > Or that is the intent, anyway!
> 
> Yeah got that.
> 
> Thanks!
> 



Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Frederic Weisbecker
On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote:
> On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote:
> > So it's like:
> > 
> > CPU 0  CPU 1
> > 
> > read I write I
> > smp_mb()   smp_mb()
> > cmpxchg S  read S
> > 
> > I still can't find what guarantees we don't read a value in CPU 1 that
> > is way below what we want.
> 
> One key point is that there is a second cycle from LONG to FULL.
> 
> (Not saying that there is not a bug -- there might well be.  In fact,
> I am starting to think that I need to do another Promela model...

Now I'm very confused :)

I'm far from being a specialist on these matters but I would really love to
understand this patchset. Is there any documentation somewhere I can read
that could help, something about cycles of committed memory or something?


> 
> > > Unfortunately, the reasoning in #2 above does not hold in the small-CPU
> > > case because there is the possibility of both the timekeeping CPU and
> > > the RCU grace-period kthread concurrently advancing the state machine.
> > > This would be bad, good catch!!!
> > 
> > It's not like I spotted anything myself but you're welcome :)
> 
> I will take them any way I can get them.  ;-)
> 
> > > The patch below (untested) is an attempt to fix this.  If it actually
> > > works, I will merge it in with 6/7.
> > > 
> > > Anything else I missed?  ;-)
> > 
> > Well I guess I'll wait one more night before trying to understand
> > the below ;)
> 
> The key point is that the added check means that either the timekeeping
> CPU is advancing the state machine (if there are few CPUs) or the
> RCU grace-period kthread is (if there are many CPUs), but never both.
> Or that is the intent, anyway!

Yeah got that.

Thanks!
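
(That "one party or the other, never both" rule might be sketched as a
single size-based dispatch; the threshold name and both helpers below are
hypothetical stand-ins, not the kernel's identifiers:)

	#include <stdbool.h>

	#define SYSIDLE_SMALL 8	/* assumed threshold, not the kernel's value */

	static void full_scan_and_advance(void) { }	/* hypothetical */
	static void advance_from_fqs_scan(void) { }	/* hypothetical */

	/* Small systems: the timekeeping CPU digs through the per-CPU
	 * state itself.  Large systems: only the RCU GP kthread advances
	 * the state machine, from the force-quiescent-state scan. */
	static void sysidle_advance(int nr_cpu_ids, bool on_timekeeping_cpu,
				    bool in_gp_kthread)
	{
		if (nr_cpu_ids <= SYSIDLE_SMALL) {
			if (on_timekeeping_cpu)
				full_scan_and_advance();
		} else {
			if (in_gp_kthread)
				advance_from_fqs_scan();
		}
	}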


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Frederic Weisbecker
On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote:
 On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote:
  So it's like:
  
  CPU 0  CPU 1
  
  read I write I
  smp_mb()   smp_mb()
  cmpxchg S  read S
  
  I still can't find what guarantees we don't read a value in CPU 1 that is 
  way below
  what we want.
 
 One key point is that there is a second cycle from LONG to FULL.
 
 (Not saying that there is not a bug -- there might well be.  In fact,
 I am starting to think that I need to do another Promela model...

Now I'm very confused :)

I'm far from being a specialist on these matters but I would really love to
understand this patchset. Is there any documentation somewhere I can read
that could help, something about cycles of committed memory or something?


 
   Unfortunately, the reasoning in #2 above does not hold in the small-CPU
   case because there is the possibility of both the timekeeping CPU and
   the RCU grace-period kthread concurrently advancing the state machine.
   This would be bad, good catch!!!
  
  It's not like I spotted anything myself but you're welcome :)
 
 I will take them any way I can get them.  ;-)
 
   The patch below (untested) is an attempt to fix this.  If it actually
   works, I will merge it in with 6/7.
   
   Anything else I missed?  ;-)
  
  Well I guess I'll wait one more night before trying to understand
  the below ;)
 
 The key point is that the added check means that either the timekeeping
 CPU is advancing the state machine (if there are few CPUs) or the
 RCU grace-period kthread is (if there are many CPUs), but never both.
 Or that is the intent, anyway!

Yeah got that.

Thanks!
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Paul E. McKenney
On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote:
 On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote:
  On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote:
   So it's like:
   
   CPU 0  CPU 1
   
   read I write I
   smp_mb()   smp_mb()
   cmpxchg S  read S
   
   I still can't find what guarantees we don't read a value in CPU 1 that is 
   way below
   what we want.
  
  One key point is that there is a second cycle from LONG to FULL.
  
  (Not saying that there is not a bug -- there might well be.  In fact,
  I am starting to think that I need to do another Promela model...
 
 Now I'm very confused :)

To quote a Nobel Laureate who presented at an ISEF here in Portland some
years back, Confusion is the most productive state of mind.  ;-)

 I'm far from being a specialist on these matters but I would really love to
 understand this patchset. Is there any documentation somewhere I can read
 that could help, something about cycles of committed memory or something?

Documentation/memory-barriers.txt should suffice for this.  If you want
more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf

But memory-barrier pairing suffices here.  Here is case 2 from my
earlier email in more detail.  The comments with capital letters
mark important memory barriers, some of which are buried in atomic
operations.

1. Some CPU coming out of idle:

o   rcu_sysidle_exit():

smp_mb__before_atomic_inc();
atomic_inc(rdtp-dynticks_idle);
smp_mb__after_atomic_inc(); /* A */

o   rcu_sysidle_force_exit():

oldstate = ACCESS_ONCE(full_sysidle_state);

2. RCU GP kthread:

o   rcu_sysidle():

cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
/* B */

o   rcu_sysidle_check_cpu():

cur = atomic_read(rdtp-dynticks_idle);

Memory barrier A pairs with memory barrier B, so that if #1's load
from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's
atomic_inc() must be visible to #2's atomic_read().  This will cause #2
to recognize that the CPU came out of idle, which will in turn cause it
to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in
full_sysidle_state being set to RCU_SYSIDLE_NOT.

Thanx, Paul

Unfortunately, the reasoning in #2 above does not hold in the small-CPU
case because there is the possibility of both the timekeeping CPU and
the RCU grace-period kthread concurrently advancing the state machine.
This would be bad, good catch!!!
   
   It's not like I spotted anything myself but you're welcome :)
  
  I will take them any way I can get them.  ;-)
  
The patch below (untested) is an attempt to fix this.  If it actually
works, I will merge it in with 6/7.

Anything else I missed?  ;-)
   
   Well I guess I'll wait one more night before trying to understand
   the below ;)
  
  The key point is that the added check means that either the timekeeping
  CPU is advancing the state machine (if there are few CPUs) or the
  RCU grace-period kthread is (if there are many CPUs), but never both.
  Or that is the intent, anyway!
 
 Yeah got that.
 
 Thanks!
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Frederic Weisbecker
On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote:
 On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote:
  On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote:
   On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote:
So it's like:

CPU 0  CPU 1

read I write I
smp_mb()   smp_mb()
cmpxchg S  read S

I still can't find what guarantees we don't read a value in CPU 1 that 
is way below
what we want.
   
   One key point is that there is a second cycle from LONG to FULL.
   
   (Not saying that there is not a bug -- there might well be.  In fact,
   I am starting to think that I need to do another Promela model...
  
  Now I'm very confused :)
 
 To quote a Nobel Laureate who presented at an ISEF here in Portland some
 years back, Confusion is the most productive state of mind.  ;-)

Then I must be a very productive guy!

 
  I'm far from being a specialist on these matters but I would really love to
  understand this patchset. Is there any documentation somewhere I can read
  that could help, something about cycles of committed memory or something?
 
 Documentation/memory-barriers.txt should suffice for this.  If you want
 more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf
 
 But memory-barrier pairing suffices here.  Here is case 2 from my
 earlier email in more detail.  The comments with capital letters
 mark important memory barriers, some of which are buried in atomic
 operations.
 
 1. Some CPU coming out of idle:
 
 o rcu_sysidle_exit():
 
   smp_mb__before_atomic_inc();
   atomic_inc(rdtp-dynticks_idle);
   smp_mb__after_atomic_inc(); /* A */
 
 o rcu_sysidle_force_exit():
 
   oldstate = ACCESS_ONCE(full_sysidle_state);
 
 2. RCU GP kthread:
 
 o rcu_sysidle():
 
   cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
   /* B */
 
 o rcu_sysidle_check_cpu():
 
   cur = atomic_read(rdtp-dynticks_idle);
 
 Memory barrier A pairs with memory barrier B, so that if #1's load
 from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's
 atomic_inc() must be visible to #2's atomic_read().  This will cause #2
 to recognize that the CPU came out of idle, which will in turn cause it
 to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in
 full_sysidle_state being set to RCU_SYSIDLE_NOT.

Ok I get it for that direction.
Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays
so.

CPU 0 then rounds and see that all CPUs are idle, until it finally sets
up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep.

Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT
otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send
the IPI.

What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT?
Not on the cmpxchg but when it first dereference with ACCESS_ONCE.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Paul E. McKenney
On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote:
 On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote:
  On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote:
   On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote:
On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote:
 So it's like:
 
 CPU 0  CPU 1
 
 read I write I
 smp_mb()   smp_mb()
 cmpxchg S  read S
 
 I still can't find what guarantees we don't read a value in CPU 1 
 that is way below
 what we want.

One key point is that there is a second cycle from LONG to FULL.

(Not saying that there is not a bug -- there might well be.  In fact,
I am starting to think that I need to do another Promela model...
   
   Now I'm very confused :)
  
  To quote a Nobel Laureate who presented at an ISEF here in Portland some
  years back, Confusion is the most productive state of mind.  ;-)
 
 Then I must be a very productive guy!

So that is your secret!  ;-)

   I'm far from being a specialist on these matters but I would really love 
   to
   understand this patchset. Is there any documentation somewhere I can read
   that could help, something about cycles of committed memory or something?
  
  Documentation/memory-barriers.txt should suffice for this.  If you want
  more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf
  
  But memory-barrier pairing suffices here.  Here is case 2 from my
  earlier email in more detail.  The comments with capital letters
  mark important memory barriers, some of which are buried in atomic
  operations.
  
  1. Some CPU coming out of idle:
  
  o   rcu_sysidle_exit():
  
  smp_mb__before_atomic_inc();
  atomic_inc(rdtp-dynticks_idle);
  smp_mb__after_atomic_inc(); /* A */
  
  o   rcu_sysidle_force_exit():
  
  oldstate = ACCESS_ONCE(full_sysidle_state);
  
  2. RCU GP kthread:
  
  o   rcu_sysidle():
  
  cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
  /* B */
  
  o   rcu_sysidle_check_cpu():
  
  cur = atomic_read(rdtp-dynticks_idle);
  
  Memory barrier A pairs with memory barrier B, so that if #1's load
  from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's
  atomic_inc() must be visible to #2's atomic_read().  This will cause #2
  to recognize that the CPU came out of idle, which will in turn cause it
  to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in
  full_sysidle_state being set to RCU_SYSIDLE_NOT.
 
 Ok I get it for that direction.
 Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays
 so.
 
 CPU 0 then rounds and see that all CPUs are idle, until it finally sets
 up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep.
 
 Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT
 otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send
 the IPI.
 
 What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT?
 Not on the cmpxchg but when it first dereference with ACCESS_ONCE.

The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT,
scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving
to RCU_SYSIDLE_FULL.  Given CPU 1 has been idle all this time, CPU 0
will have read its -dynticks_idle counter on each scan and seen it
having an even value.  When CPU 1 comes out of idle, it will atomically
increment its -dyntick_idle(), which will happen after CPU 0's read of
-dyntick_idle() during its last scan.

Because of the memory-barrier pairing above, this means that CPU
1's read from full_sysidle_state must follow the cmpxchg() that
set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily
the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and
RCU_SYSIDLE_FULL_NOTED).  But because RCU_SYSIDLE_LONG is greater than
RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period.

Thanx, Paul

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Frederic Weisbecker
On Thu, Jul 18, 2013 at 05:24:08PM -0700, Paul E. McKenney wrote:
 On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote:
  On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote:
   1. Some CPU coming out of idle:
   
   o rcu_sysidle_exit():
   
 smp_mb__before_atomic_inc();
 atomic_inc(rdtp-dynticks_idle);
 smp_mb__after_atomic_inc(); /* A */
   
   o rcu_sysidle_force_exit():
   
 oldstate = ACCESS_ONCE(full_sysidle_state);
   
   2. RCU GP kthread:
   
   o rcu_sysidle():
   
 cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
 /* B */
   
   o rcu_sysidle_check_cpu():
   
 cur = atomic_read(rdtp-dynticks_idle);
   
   Memory barrier A pairs with memory barrier B, so that if #1's load
   from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's
   atomic_inc() must be visible to #2's atomic_read().  This will cause #2
   to recognize that the CPU came out of idle, which will in turn cause it
   to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in
   full_sysidle_state being set to RCU_SYSIDLE_NOT.
  
  Ok I get it for that direction.
  Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays
  so.
  
  CPU 0 then rounds and see that all CPUs are idle, until it finally sets
  up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep.
  
  Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT
  otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send
  the IPI.
  
  What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT?
  Not on the cmpxchg but when it first dereference with ACCESS_ONCE.
 
 The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT,
 scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving
 to RCU_SYSIDLE_FULL.  Given CPU 1 has been idle all this time, CPU 0
 will have read its -dynticks_idle counter on each scan and seen it
 having an even value.  When CPU 1 comes out of idle, it will atomically
 increment its -dyntick_idle(), which will happen after CPU 0's read of
 -dyntick_idle() during its last scan.
 
 Because of the memory-barrier pairing above, this means that CPU
 1's read from full_sysidle_state must follow the cmpxchg() that
 set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily
 the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and
 RCU_SYSIDLE_FULL_NOTED).  But because RCU_SYSIDLE_LONG is greater than
 RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period.

Lets summarize the last sequence, the following happens ordered by time:

CPU 0  CPU 1

 cmpxchg(full_sysidle_state,
 RCU_SYSIDLE_SHORT,
 RCU_SYSIDLE_LONG);

 smp_mb() //cmpxchg

 atomic_read(rdtp(1)-dynticks_idle)

 //CPU 0 goes to sleep
   //CPU 1 wakes up
   atomic_inc(rdtp(1)-dynticks_idle)

   smp_mb()

   ACCESS_ONCE(full_sysidle_state)


Are you suggesting that because the CPU 1 executes its atomic_inc() _after_ (in 
terms
of absolute time) the atomic_read of CPU 0, the ordering settled in both sides 
guarantees
that the value read from CPU 1 is the one from the cmpxchg that precedes the 
atomic_read,
or FULL or FULL_NOTED that happen later.

If so that's a big lesson for me. 
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-18 Thread Paul E. McKenney
On Fri, Jul 19, 2013 at 04:12:08AM +0200, Frederic Weisbecker wrote:
 On Thu, Jul 18, 2013 at 05:24:08PM -0700, Paul E. McKenney wrote:
  On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote:
   On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote:
1. Some CPU coming out of idle:

o   rcu_sysidle_exit():

smp_mb__before_atomic_inc();
atomic_inc(rdtp-dynticks_idle);
smp_mb__after_atomic_inc(); /* A */

o   rcu_sysidle_force_exit():

oldstate = ACCESS_ONCE(full_sysidle_state);

2. RCU GP kthread:

o   rcu_sysidle():

cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, 
RCU_SYSIDLE_LONG);
/* B */

o   rcu_sysidle_check_cpu():

cur = atomic_read(rdtp-dynticks_idle);

Memory barrier A pairs with memory barrier B, so that if #1's load
from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's
atomic_inc() must be visible to #2's atomic_read().  This will cause #2
to recognize that the CPU came out of idle, which will in turn cause it
to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in
full_sysidle_state being set to RCU_SYSIDLE_NOT.
   
   Ok I get it for that direction.
   Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays
   so.
   
    CPU 0 then does its rounds and sees that all CPUs are idle, until it finally
    sets up RCU_SYSIDLE_FULL and finally goes to sleep.
   
    Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT
    otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it
    send the IPI.
   
    What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT?
    Not on the cmpxchg but when it first dereferences with ACCESS_ONCE.
  
  The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT,
  scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving
  to RCU_SYSIDLE_FULL.  Given CPU 1 has been idle all this time, CPU 0
  will have read its ->dynticks_idle counter on each scan and seen it
  having an even value.  When CPU 1 comes out of idle, it will atomically
  increment its ->dynticks_idle, which will happen after CPU 0's read of
  ->dynticks_idle during its last scan.
  
  Because of the memory-barrier pairing above, this means that CPU
  1's read from full_sysidle_state must follow the cmpxchg() that
  set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily
  the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and
  RCU_SYSIDLE_FULL_NOTED).  But because RCU_SYSIDLE_LONG is greater than
  RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period.
 
 Let's summarize the last sequence; the following happens, ordered by time:
 
 CPU 0                                        CPU 1
 
 cmpxchg(full_sysidle_state,
         RCU_SYSIDLE_SHORT,
         RCU_SYSIDLE_LONG);
 
 smp_mb() //cmpxchg
 
 atomic_read(rdtp(1)->dynticks_idle)
 
 //CPU 0 goes to sleep
                                              //CPU 1 wakes up
                                              atomic_inc(rdtp(1)->dynticks_idle)
 
                                              smp_mb()
 
                                              ACCESS_ONCE(full_sysidle_state)
 
 
 Are you suggesting that because CPU 1 executes its atomic_inc() _after_
 (in terms of absolute time) the atomic_read() of CPU 0, the ordering
 settled on both sides guarantees that the value read by CPU 1 is the one
 from the cmpxchg that precedes the atomic_read(), or FULL or FULL_NOTED
 that happen later?
 
 If so, that's a big lesson for me.

It is not absolute time that matters.  Instead, it is the fact that
CPU 0, when reading from ->dynticks_idle, read the old value before the
atomic_inc().  Therefore, anything CPU 0 did before that memory barrier
preceding CPU 0's read must come before anything CPU 1 did after that
memory barrier following the atomic_inc().  For this to work, there
must be some access to the same variable on each CPU.

Or, if you must think in terms of time, you need a separate independent
timeline for each variable, with no direct mapping from one timeline to
another, except resulting from memory-barrier interactions.
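
To make that last point concrete, here is a minimal userspace analogue of
the pairing in C11 atomics -- an illustration added for this archive, not
kernel code; "state" stands in for full_sysidle_state and "dynticks_idle"
for ->dynticks_idle:

#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

static atomic_int state = 1;		/* 1 == "SHORT" */
static atomic_int dynticks_idle = 0;	/* even == idle  */

static int gp_kthread(void *arg)	/* plays CPU 0 */
{
	int expected = 1;

	(void)arg;
	/* cmpxchg(SHORT -> LONG): a full barrier in itself. */
	atomic_compare_exchange_strong(&state, &expected, 2);
	/* Scan: sample the idle counter only after the cmpxchg. */
	printf("GP kthread saw dynticks_idle=%d\n",
	       atomic_load(&dynticks_idle));
	return 0;
}

static int idle_exit(void *arg)		/* plays CPU 1 */
{
	(void)arg;
	atomic_fetch_add(&dynticks_idle, 1);	/* now odd: non-idle */
	/*
	 * If the GP kthread's load above returned the old (even)
	 * value, this load cannot return anything older than LONG:
	 * the two accesses to dynticks_idle tie the timelines together.
	 */
	printf("idle-exit CPU saw state=%d\n", atomic_load(&state));
	return 0;
}

int main(void)
{
	thrd_t t0, t1;

	thrd_create(&t0, gp_kthread, NULL);
	thrd_create(&t1, idle_exit, NULL);
	thrd_join(t0, NULL);
	thrd_join(t1, NULL);
	return 0;
}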

Thanx, Paul



Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-17 Thread Paul E. McKenney
On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote:
> On Wed, Jul 17, 2013 at 05:41:41PM -0700, Paul E. McKenney wrote:
> > On Thu, Jul 18, 2013 at 01:31:21AM +0200, Frederic Weisbecker wrote:
> > > I'm missing a key here.
> > > 
> > > Let's imagine that the timekeeper has finally set full_sysidle_state = 
> > > RCU_SYSIDLE_FULL_NOTED
> > > with cmpxchg, what guarantees that this CPU is not seeing a stale 
> > > RCU_SYSIDLE_SHORT value
> > > for example?
> > 
> > Good question!  Let's see if I have a reasonable answer.  ;-)
> > 
> > I am going to start with the large-CPU case, so that the state is advanced
> > only by the grace-period kthread.
> > 
> > 1.  Case 1: the timekeeper CPU invoked rcu_sysidle_force_exit().
> > In this case, this is the same CPU that set full_sysidle_state
> > to RCU_SYSIDLE_FULL_NOTED, so it is guaranteed not to see a
> > stale value.
> > 
> > 2.  Case 2: Some CPU came out of idle, and invoked rcu_sysidle_exit().
> > In this case, if this CPU reads a RCU_SYSIDLE_SHORT from
> > full_sysidle_state, this read must have come before the
> > cmpxchg() (on some other CPU) that set it to RCU_SYSIDLE_LONG.
> > Because this CPU's read from full_sysidle_state was preceded by
> > an atomic_inc() that updated this CPU's ->dynticks_idle, that
> > update must also precede the cmpxchg() that set full_sysidle_state
> > to RCU_SYSIDLE_LONG.  Because the state advancing is done from
> > within a single thread, the subsequent scan is guaranteed to see
> > the first CPU's update of ->dynticks_idle, and will therefore
> > refrain from advancing full_sysidle_state to RCU_SYSIDLE_FULL.
> > 
> > This will in turn prevent the timekeeping thread from advancing
> > the state to RCU_SYSIDLE_FULL_NOTED, so this scenario cannot
> > happen.
> 
> Ok, so IIUC the safety is guaranteed in the following ordering:
> 
> CPU 0                                        CPU 1
> 
> idle = 1                                     smp_mb()
> //for each cpu
> if (atomic_read(rdp(1)->dyntick_idle) & 1)   atomic_inc(rdp->dyntick_idle)
>       idle = 0                               smp_mb()
> 
> if (idle)
>      cmpxchg(full_sysidle_state, SHORT, LONG)  while (full_sysidle_state > SHORT)
>                                                      //reset with cmpxchg
> 
> So it's like:
> 
> CPU 0      CPU 1
> 
> read I     write I
> smp_mb()   smp_mb()
> cmpxchg S  read S
> 
> I still can't find what guarantees we don't read a value in CPU 1 that
> is way below what we want.

One key point is that there is a second cycle from LONG to FULL.

(Not saying that there is not a bug -- there might well be.  In fact,
I am starting to think that I need to do another Promela model...)

> > Unfortunately, the reasoning in #2 above does not hold in the small-CPU
> > case because there is the possibility of both the timekeeping CPU and
> > the RCU grace-period kthread concurrently advancing the state machine.
> > This would be bad, good catch!!!
> 
> It's not like I spotted anything myself but you're welcome :)

I will take them any way I can get them.  ;-)

> > The patch below (untested) is an attempt to fix this.  If it actually
> > works, I will merge it in with 6/7.
> > 
> > Anything else I missed?  ;-)
> 
> Well I guess I'll wait one more night before trying to understand
> the below ;)

The key point is that the added check means that either the timekeeping
CPU is advancing the state machine (if there are few CPUs) or the
RCU grace-period kthread is (if there are many CPUs), but never both.
Or that is the intent, anyway!
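
A sketch of that arrangement, reconstructed from the hunks quoted below
(rcu_sysidle() and rcu_sysidle_cancel() are the advance/reset helpers from
the patch; the exact cutoff test in the merged code may differ):

static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
			       unsigned long maxj, bool gpkt)
{
	if (rsp != rcu_sysidle_state)
		return;  /* Wrong flavor, ignore. */
	if (gpkt && nr_cpu_ids <= RCU_SYSIDLE_SMALL)
		return;  /* Small system: timekeeping CPU runs the machine. */
	if (isidle)
		rcu_sysidle(maxj);    /* More idle! */
	else
		rcu_sysidle_cancel(); /* Idle is over. */
}

/* Wrapper used by the grace-period kthread (gpkt == true). */
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
				  unsigned long maxj)
{
	rcu_sysidle_report(rsp, isidle, maxj, true);
}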

Thanx, Paul

> Thanks!
> 
> > 
> > Thanx, Paul
> > 
> > 
> > 
> > Signed-off-by: Paul E. McKenney 
> > 
> > diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> > index aa3f525..fe83085 100644
> > --- a/kernel/rcutree.c
> > +++ b/kernel/rcutree.c
> > @@ -1364,7 +1364,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
> > 	}
> > 	force_qs_rnp(rsp, dyntick_save_progress_counter,
> > 		     &isidle, &maxj);
> > -   rcu_sysidle_report(rsp, isidle, maxj);
> > +   rcu_sysidle_report_gp(rsp, isidle, maxj);
> > fqs_state = RCU_FORCE_QS;
> > } else {
> > /* Handle dyntick-idle and offline CPUs. */
> > diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> > index 1602c21..657b415 100644
> > --- a/kernel/rcutree.h
> > +++ b/kernel/rcutree.h
> > @@ -559,8 +559,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> >  

Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-17 Thread Frederic Weisbecker
On Wed, Jul 17, 2013 at 05:41:41PM -0700, Paul E. McKenney wrote:
> On Thu, Jul 18, 2013 at 01:31:21AM +0200, Frederic Weisbecker wrote:
> > I'm missing a key here.
> > 
> > Let's imagine that the timekeeper has finally set full_sysidle_state = 
> > RCU_SYSIDLE_FULL_NOTED
> > with cmpxchg, what guarantees that this CPU is not seeing a stale 
> > RCU_SYSIDLE_SHORT value
> > for example?
> 
> Good question!  Let's see if I have a reasonable answer.  ;-)
> 
> I am going to start with the large-CPU case, so that the state is advanced
> only by the grace-period kthread.
> 
> 1.  Case 1: the timekeeper CPU invoked rcu_sysidle_force_exit().
>   In this case, this is the same CPU that set full_sysidle_state
>   to RCU_SYSIDLE_FULL_NOTED, so it is guaranteed not to see a
>   stale value.
> 
> 2.  Case 2: Some CPU came out of idle, and invoked rcu_sysidle_exit().
>   In this case, if this CPU reads a RCU_SYSIDLE_SHORT from
>   full_sysidle_state, this read must have come before the
>   cmpxchg() (on some other CPU) that set it to RCU_SYSIDLE_LONG.
>   Because this CPU's read from full_sysidle_state was preceded by
>   an atomic_inc() that updated this CPU's ->dynticks_idle, that
>   update must also precede the cmpxchg() that set full_sysidle_state
>   to RCU_SYSIDLE_LONG.  Because the state advancing is done from
>   within a single thread, the subsequent scan is guaranteed to see
>   the first CPU's update of ->dynticks_idle, and will therefore
>   refrain from advancing full_sysidle_state to RCU_SYSIDLE_FULL.
> 
>   This will in turn prevent the timekeeping thread from advancing
>   the state to RCU_SYSIDLE_FULL_NOTED, so this scenario cannot
>   happen.

Ok, so IIUC the safety is guaranteed in the following ordering:

CPU 0                                        CPU 1

idle = 1                                     smp_mb()
//for each cpu
if (atomic_read(rdp(1)->dyntick_idle) & 1)   atomic_inc(rdp->dyntick_idle)
      idle = 0                               smp_mb()

if (idle)
     cmpxchg(full_sysidle_state, SHORT, LONG)  while (full_sysidle_state > SHORT)
                                                     //reset with cmpxchg

So it's like:

CPU 0      CPU 1

read I     write I
smp_mb()   smp_mb()
cmpxchg S  read S

I still can't find what guarantees we don't read a value in CPU 1 that
is way below what we want.

> 
> Unfortunately, the reasoning in #2 above does not hold in the small-CPU
> case because there is the possibility of both the timekeeping CPU and
> the RCU grace-period kthread concurrently advancing the state machine.
> This would be bad, good catch!!!

It's not like I spotted anything myself but you're welcome :)

> 
> The patch below (untested) is an attempt to fix this.  If it actually
> works, I will merge it in with 6/7.
> 
> Anything else I missed?  ;-)

Well I guess I'll wait one more night before trying to understand
the below ;)

Thanks!

> 
>   Thanx, Paul
> 
> 
> 
> Signed-off-by: Paul E. McKenney 
> 
> diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> index aa3f525..fe83085 100644
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -1364,7 +1364,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
>   }
> 	force_qs_rnp(rsp, dyntick_save_progress_counter,
> 		     &isidle, &maxj);
> - rcu_sysidle_report(rsp, isidle, maxj);
> + rcu_sysidle_report_gp(rsp, isidle, maxj);
>   fqs_state = RCU_FORCE_QS;
>   } else {
>   /* Handle dyntick-idle and offline CPUs. */
> diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> index 1602c21..657b415 100644
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -559,8 +559,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> unsigned long *maxj);
>  static bool is_sysidle_rcu_state(struct rcu_state *rsp);
>  static void rcu_bind_gp_kthread(void);
> -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
> -unsigned long maxj);
> +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
> +   unsigned long maxj);
>  static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
>  
>  #endif /* #ifndef RCU_TREE_NONCORE */
> diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> index a4d44c3..f65d9c2 100644
> --- a/kernel/rcutree_plugin.h
> +++ b/kernel/rcutree_plugin.h
> @@ -2655,14 +2655,22 @@ static void rcu_sysidle_cancel(void)
>   * scan of the CPUs' dyntick-idle state.

Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-17 Thread Paul E. McKenney
On Thu, Jul 18, 2013 at 01:31:21AM +0200, Frederic Weisbecker wrote:
> On Mon, Jul 08, 2013 at 06:30:05PM -0700, Paul E. McKenney wrote:
> >  }
> >  
> >  /*
> > + * Unconditionally force exit from full system-idle state.  This is
> > + * invoked when a normal CPU exits idle, but must be called separately
> > + * for the timekeeping CPU (tick_do_timer_cpu).  The reason for this
> > + * is that the timekeeping CPU is permitted to take scheduling-clock
> > + * interrupts while the system is in system-idle state, and of course
> > + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
> > + * interrupt from any other type of interrupt.
> > + */
> > +void rcu_sysidle_force_exit(void)
> > +{
> > +   int oldstate = ACCESS_ONCE(full_sysidle_state);
> > +   int newoldstate;
> > +
> > +   /*
> > +* Each pass through the following loop attempts to exit full
> > +* system-idle state.  If contention proves to be a problem,
> > +* a trylock-based contention tree could be used here.
> > +*/
> > +   while (oldstate > RCU_SYSIDLE_SHORT) {
> 
> I'm missing a key here.
> 
> Let's imagine that the timekeeper has finally set full_sysidle_state = 
> RCU_SYSIDLE_FULL_NOTED
> with cmpxchg, what guarantees that this CPU is not seeing a stale 
> RCU_SYSIDLE_SHORT value
> for example?

Good question!  Let's see if I have a reasonable answer.  ;-)

I am going to start with the large-CPU case, so that the state is advanced
only by the grace-period kthread.

1.  Case 1: the timekeeper CPU invoked rcu_sysidle_force_exit().
In this case, this is the same CPU that set full_sysidle_state
to RCU_SYSIDLE_FULL_NOTED, so it is guaranteed not to see a
stale value.

2.  Case 2: Some CPU came out of idle, and invoked rcu_sysidle_exit().
In this case, if this CPU reads a RCU_SYSIDLE_SHORT from
full_sysidle_state, this read must have come before the
cmpxchg() (on some other CPU) that set it to RCU_SYSIDLE_LONG.
Because this CPU's read from full_sysidle_state was preceded by
an atomic_inc() that updated this CPU's ->dynticks_idle, that
update must also precede the cmpxchg() that set full_sysidle_state
to RCU_SYSIDLE_LONG.  Because the state advancing is done from
within a single thread, the subsequent scan is guaranteed to see
the first CPU's update of ->dynticks_idle, and will therefore
refrain from advancing full_sysidle_state to RCU_SYSIDLE_FULL.

This will in turn prevent the timekeeping thread from advancing
the state to RCU_SYSIDLE_FULL_NOTED, so this scenario cannot
happen.
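
For reference, the idle-exit ordering that case 2 relies on, condensed
from the patch (the nesting bookkeeping and the timekeeping-CPU early
return are omitted here):

static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
{
	/* Record end of idle period on this CPU. */
	smp_mb__before_atomic_inc();
	atomic_inc(&rdtp->dynticks_idle);	/* Counter now odd: non-idle. */
	smp_mb__after_atomic_inc();		/* Full barrier after the inc. */

	/*
	 * Only after the counter update is ordered does this CPU look at
	 * the global state, so its read of full_sysidle_state cannot be
	 * reordered before its own ->dynticks_idle increment.
	 */
	rcu_sysidle_force_exit();
}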

Unfortunately, the reasoning in #2 above does not hold in the small-CPU
case because there is the possibility of both the timekeeping CPU and
the RCU grace-period kthread concurrently advancing the state machine.
This would be bad, good catch!!!

The patch below (untested) is an attempt to fix this.  If it actually
works, I will merge it in with 6/7.

Anything else I missed?  ;-)

Thanx, Paul



Signed-off-by: Paul E. McKenney 

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index aa3f525..fe83085 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1364,7 +1364,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
}
	force_qs_rnp(rsp, dyntick_save_progress_counter,
		     &isidle, &maxj);
-   rcu_sysidle_report(rsp, isidle, maxj);
+   rcu_sysidle_report_gp(rsp, isidle, maxj);
fqs_state = RCU_FORCE_QS;
} else {
/* Handle dyntick-idle and offline CPUs. */
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1602c21..657b415 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -559,8 +559,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
  unsigned long *maxj);
 static bool is_sysidle_rcu_state(struct rcu_state *rsp);
 static void rcu_bind_gp_kthread(void);
-static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
-  unsigned long maxj);
+static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
+ unsigned long maxj);
 static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
 
 #endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a4d44c3..f65d9c2 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -2655,14 +2655,22 @@ static void rcu_sysidle_cancel(void)
  * scan of the CPUs' dyntick-idle state.
  */
 static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
-  unsigned long maxj)
+  unsigned long maxj, bool gpkt)
 {
if (rsp != rcu_sysidle_state)
return;  

Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-17 Thread Frederic Weisbecker
On Mon, Jul 08, 2013 at 06:30:05PM -0700, Paul E. McKenney wrote:
>  }
>  
>  /*
> + * Unconditionally force exit from full system-idle state.  This is
> + * invoked when a normal CPU exits idle, but must be called separately
> + * for the timekeeping CPU (tick_do_timer_cpu).  The reason for this
> + * is that the timekeeping CPU is permitted to take scheduling-clock
> + * interrupts while the system is in system-idle state, and of course
> + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
> + * interrupt from any other type of interrupt.
> + */
> +void rcu_sysidle_force_exit(void)
> +{
> + int oldstate = ACCESS_ONCE(full_sysidle_state);
> + int newoldstate;
> +
> + /*
> +  * Each pass through the following loop attempts to exit full
> +  * system-idle state.  If contention proves to be a problem,
> +  * a trylock-based contention tree could be used here.
> +  */
> + while (oldstate > RCU_SYSIDLE_SHORT) {

I'm missing a key here.

Let's imagine that the timekeeper has finally set full_sysidle_state = 
RCU_SYSIDLE_FULL_NOTED
with cmpxchg, what guarantees that this CPU is not seeing a stale 
RCU_SYSIDLE_SHORT value
for example?
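
For reference, the remainder of the loop, which the archive truncated
above -- reconstructed from the posted patch, so treat as approximate:

	while (oldstate > RCU_SYSIDLE_SHORT) {
		newoldstate = cmpxchg(&full_sysidle_state,
				      oldstate, RCU_SYSIDLE_NOT);
		if (oldstate == newoldstate &&
		    oldstate == RCU_SYSIDLE_FULL_NOTED) {
			rcu_kick_nohz_cpu(tick_do_timer_cpu);
			return; /* We cleared it, done! */
		}
		oldstate = newoldstate;
	}
	smp_mb(); /* Order initial oldstate with subsequent usage. */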


[PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine

2013-07-08 Thread Paul E. McKenney
From: "Paul E. McKenney" 

This commit adds the state machine that takes the per-CPU idle data
as input and produces a full-system-idle indication as output.  This
state machine is driven out of RCU's quiescent-state-forcing
mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU
idle state and then rcu_sysidle_report() to drive the state machine.

The full-system-idle state is sampled using rcu_sys_is_idle(), which
also drives the state machine if RCU is idle (and does so by forcing
RCU to become non-idle).  This function returns true if all but the
timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long
enough to avoid memory contention on the full_sysidle_state state
variable.  The rcu_sysidle_force_exit() may be called externally
to reset the state machine back into non-idle state.
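
For readers skimming the log, these are the states the machine steps
through, as #defined in the body of this patch (ascending values mean
"more idle"; comments condensed):

#define RCU_SYSIDLE_NOT		0	/* Some CPU is not idle. */
#define RCU_SYSIDLE_SHORT	1	/* All CPUs idle for brief period. */
#define RCU_SYSIDLE_LONG	2	/* All CPUs idle for long enough. */
#define RCU_SYSIDLE_FULL	3	/* All CPUs idle, ready for sysidle. */
#define RCU_SYSIDLE_FULL_NOTED	4	/* Actually entered sysidle state. */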

Signed-off-by: Paul E. McKenney 
Cc: Frederic Weisbecker 
Cc: Steven Rostedt 

Conflicts:

kernel/rcutree.h
kernel/rcutree_plugin.h
---
 include/linux/rcupdate.h |  18 
 kernel/rcutree.c |  16 ++-
 kernel/rcutree.h |   5 +
 kernel/rcutree_plugin.h  | 275 ++-
 4 files changed, 307 insertions(+), 7 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 48f1ef9..1aa8d8c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 
 
+/* Only for use by adaptive-ticks code. */
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+extern bool rcu_sys_is_idle(void);
+extern void rcu_sysidle_force_exit(void);
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+static inline bool rcu_sys_is_idle(void)
+{
+   return false;
+}
+
+static inline void rcu_sysidle_force_exit(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 9971f86..06cfd75 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 bool *isidle, unsigned long *maxj)
 {
	rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+   rcu_sysidle_check_cpu(rdp, isidle, maxj);
return (rdp->dynticks_snap & 0x1) == 0;
 }
 
@@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
rsp->n_force_qs++;
if (fqs_state == RCU_SAVE_DYNTICK) {
/* Collect dyntick-idle snapshots. */
+   if (is_sysidle_rcu_state(rsp)) {
+   isidle = 1;
+   maxj = jiffies - ULONG_MAX / 4;
+   }
	force_qs_rnp(rsp, dyntick_save_progress_counter,
		     &isidle, &maxj);
+   rcu_sysidle_report(rsp, isidle, maxj);
fqs_state = RCU_FORCE_QS;
} else {
/* Handle dyntick-idle and offline CPUs. */
+   isidle = 0;
force_qs_rnp(rsp, rcu_implicit_dynticks_qs, , );
}
/* Clear flag to prevent immediate re-entry. */
@@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp,
cpu = rnp->grplo;
bit = 1;
for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
-   if ((rnp->qsmask & bit) != 0 &&
-   f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
-   mask |= bit;
+   if ((rnp->qsmask & bit) != 0) {
+   if ((rnp->qsmaskinit & bit) != 0)
+   *isidle = 0;
+   if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
+   mask |= bit;
+   }
}
if (mask != 0) {
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1895043..7326a3c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu);
 static bool init_nocb_callback_list(struct rcu_data *rdp);
 static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
 static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
+static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
+ unsigned long *maxj);
+static bool is_sysidle_rcu_state(struct rcu_state *rsp);
+static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
+  unsigned long maxj);
 static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
 
 #endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3edae39..b47ffb0 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,7 +28,7 @@
 #include <linux/gfp.h>
 #include <linux/oom.h>
 #include <linux/smpboot.h>
-#include <linux/tick.h>
+#include "time/tick-internal.h"
 
 
