Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Fri, Aug 09, 2013 at 06:20:59PM +0200, Frederic Weisbecker wrote: > On Fri, Jul 26, 2013 at 04:19:23PM -0700, Paul E. McKenney wrote: > > diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h > > index 3edae39..ff84bed 100644 > > --- a/kernel/rcutree_plugin.h > > +++ b/kernel/rcutree_plugin.h > > @@ -28,7 +28,7 @@ > > #include > > #include > > #include > > -#include > > +#include "time/tick-internal.h" > > > > #define RCU_KTHREAD_PRIO 1 > > > > @@ -2395,12 +2395,12 @@ static void rcu_kick_nohz_cpu(int cpu) > > * most active flavor of RCU. > > */ > > #ifdef CONFIG_PREEMPT_RCU > > -static struct rcu_state __maybe_unused *rcu_sysidle_state = > > _preempt_state; > > +static struct rcu_state *rcu_sysidle_state = _preempt_state; > > #else /* #ifdef CONFIG_PREEMPT_RCU */ > > -static struct rcu_state __maybe_unused *rcu_sysidle_state = > > _sched_state; > > +static struct rcu_state *rcu_sysidle_state = _sched_state; > > #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ > > Ah you fixed it here. Ok :) Bisectability and all that. ;-) > > -static int __maybe_unused full_sysidle_state; /* Current system-idle > > state. */ > > +static int full_sysidle_state; /* Current system-idle state. */ > > #define RCU_SYSIDLE_NOT0 /* Some CPU is not idle. */ > > #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ > > #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ > [...] > > +/* > > + * Check to see if the system is fully idle, other than the timekeeping > > CPU. > > + * The caller must have disabled interrupts. > > + */ > > +bool rcu_sys_is_idle(void) > > +{ > > + static struct rcu_sysidle_head rsh; > > + int rss = ACCESS_ONCE(full_sysidle_state); > > + > > + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) > > + return false; > > + > > + /* Handle small-system case by doing a full scan of CPUs. */ > > + if (nr_cpu_ids <= RCU_SYSIDLE_SMALL) { > > I don't understand how the nr_cpu_ids > RCU_SYSIDLE_SMALL is handled. 
There > don't > seem to be other calls of rcu_sysidle_check_cpu() than for small systems. The other calls are from kernel/rcutree.c from the force-quiescent-state code. If we have a big system, we don't check until we have some other reason to touch the cache lines. If we have a small system, we just dig through them on transition to idle. Thanx, Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Fri, Aug 09, 2013 at 06:20:59PM +0200, Frederic Weisbecker wrote: On Fri, Jul 26, 2013 at 04:19:23PM -0700, Paul E. McKenney wrote: diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3edae39..ff84bed 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7 +28,7 @@ #include linux/gfp.h #include linux/oom.h #include linux/smpboot.h -#include linux/tick.h +#include time/tick-internal.h #define RCU_KTHREAD_PRIO 1 @@ -2395,12 +2395,12 @@ static void rcu_kick_nohz_cpu(int cpu) * most active flavor of RCU. */ #ifdef CONFIG_PREEMPT_RCU -static struct rcu_state __maybe_unused *rcu_sysidle_state = rcu_preempt_state; +static struct rcu_state *rcu_sysidle_state = rcu_preempt_state; #else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state __maybe_unused *rcu_sysidle_state = rcu_sched_state; +static struct rcu_state *rcu_sysidle_state = rcu_sched_state; #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ Ah you fixed it here. Ok :) Bisectability and all that. ;-) -static int __maybe_unused full_sysidle_state; /* Current system-idle state. */ +static int full_sysidle_state; /* Current system-idle state. */ #define RCU_SYSIDLE_NOT0 /* Some CPU is not idle. */ #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ [...] +/* + * Check to see if the system is fully idle, other than the timekeeping CPU. + * The caller must have disabled interrupts. + */ +bool rcu_sys_is_idle(void) +{ + static struct rcu_sysidle_head rsh; + int rss = ACCESS_ONCE(full_sysidle_state); + + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) + return false; + + /* Handle small-system case by doing a full scan of CPUs. */ + if (nr_cpu_ids = RCU_SYSIDLE_SMALL) { I don't understand how the nr_cpu_ids RCU_SYSIDLE_SMALL is handled. There don't seem to be other calls of rcu_sysidle_check_cpu() than for small systems. 
The other calls are from kernel/rcutree.c from the force-quiescent-state code. If we have a big system, we don't check until we have some other reason to touch the cache lines. If we have a small system, we just dig through them on transition to idle. Thanx, Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Fri, Jul 26, 2013 at 04:19:23PM -0700, Paul E. McKenney wrote: > diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h > index 3edae39..ff84bed 100644 > --- a/kernel/rcutree_plugin.h > +++ b/kernel/rcutree_plugin.h > @@ -28,7 +28,7 @@ > #include > #include > #include > -#include > +#include "time/tick-internal.h" > > #define RCU_KTHREAD_PRIO 1 > > @@ -2395,12 +2395,12 @@ static void rcu_kick_nohz_cpu(int cpu) > * most active flavor of RCU. > */ > #ifdef CONFIG_PREEMPT_RCU > -static struct rcu_state __maybe_unused *rcu_sysidle_state = > _preempt_state; > +static struct rcu_state *rcu_sysidle_state = _preempt_state; > #else /* #ifdef CONFIG_PREEMPT_RCU */ > -static struct rcu_state __maybe_unused *rcu_sysidle_state = _sched_state; > +static struct rcu_state *rcu_sysidle_state = _sched_state; > #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ Ah you fixed it here. Ok :) > > -static int __maybe_unused full_sysidle_state; /* Current system-idle state. > */ > +static int full_sysidle_state; /* Current system-idle state. */ > #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ > #define RCU_SYSIDLE_SHORT1 /* All CPUs idle for brief period. */ > #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ [...] > +/* > + * Check to see if the system is fully idle, other than the timekeeping CPU. > + * The caller must have disabled interrupts. > + */ > +bool rcu_sys_is_idle(void) > +{ > + static struct rcu_sysidle_head rsh; > + int rss = ACCESS_ONCE(full_sysidle_state); > + > + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) > + return false; > + > + /* Handle small-system case by doing a full scan of CPUs. */ > + if (nr_cpu_ids <= RCU_SYSIDLE_SMALL) { I don't understand how the nr_cpu_ids > RCU_SYSIDLE_SMALL is handled. There don't seem to be other calls of rcu_sysidle_check_cpu() than for small systems. 
-- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Fri, Jul 26, 2013 at 04:19:23PM -0700, Paul E. McKenney wrote: diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3edae39..ff84bed 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7 +28,7 @@ #include linux/gfp.h #include linux/oom.h #include linux/smpboot.h -#include linux/tick.h +#include time/tick-internal.h #define RCU_KTHREAD_PRIO 1 @@ -2395,12 +2395,12 @@ static void rcu_kick_nohz_cpu(int cpu) * most active flavor of RCU. */ #ifdef CONFIG_PREEMPT_RCU -static struct rcu_state __maybe_unused *rcu_sysidle_state = rcu_preempt_state; +static struct rcu_state *rcu_sysidle_state = rcu_preempt_state; #else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state __maybe_unused *rcu_sysidle_state = rcu_sched_state; +static struct rcu_state *rcu_sysidle_state = rcu_sched_state; #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ Ah you fixed it here. Ok :) -static int __maybe_unused full_sysidle_state; /* Current system-idle state. */ +static int full_sysidle_state; /* Current system-idle state. */ #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ #define RCU_SYSIDLE_SHORT1 /* All CPUs idle for brief period. */ #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ [...] +/* + * Check to see if the system is fully idle, other than the timekeeping CPU. + * The caller must have disabled interrupts. + */ +bool rcu_sys_is_idle(void) +{ + static struct rcu_sysidle_head rsh; + int rss = ACCESS_ONCE(full_sysidle_state); + + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) + return false; + + /* Handle small-system case by doing a full scan of CPUs. */ + if (nr_cpu_ids = RCU_SYSIDLE_SMALL) { I don't understand how the nr_cpu_ids RCU_SYSIDLE_SMALL is handled. There don't seem to be other calls of rcu_sysidle_check_cpu() than for small systems. 
-- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Mon, Jul 29, 2013 at 04:19:48PM +0800, Lai Jiangshan wrote: > On 07/27/2013 07:19 AM, Paul E. McKenney wrote: > > From: "Paul E. McKenney" > > > > This commit adds the state machine that takes the per-CPU idle data > > as input and produces a full-system-idle indication as output. This > > state machine is driven out of RCU's quiescent-state-forcing > > mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU > > idle state and then rcu_sysidle_report() to drive the state machine. > > > > The full-system-idle state is sampled using rcu_sys_is_idle(), which > > also drives the state machine if RCU is idle (and does so by forcing > > RCU to become non-idle). This function returns true if all but the > > timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long > > enough to avoid memory contention on the full_sysidle_state state > > variable. The rcu_sysidle_force_exit() may be called externally > > to reset the state machine back into non-idle state. > > > > Signed-off-by: Paul E. McKenney > > Cc: Frederic Weisbecker > > Cc: Steven Rostedt > > --- > > include/linux/rcupdate.h | 18 +++ > > kernel/rcutree.c | 16 ++- > > kernel/rcutree.h | 5 + > > kernel/rcutree_plugin.h | 284 > > ++- > > 4 files changed, 316 insertions(+), 7 deletions(-) > > > > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h > > index 48f1ef9..1aa8d8c 100644 > > --- a/include/linux/rcupdate.h > > +++ b/include/linux/rcupdate.h > > @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return > > false; } > > #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ > > > > > > +/* Only for use by adaptive-ticks code. 
*/ > > +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE > > +extern bool rcu_sys_is_idle(void); > > +extern void rcu_sysidle_force_exit(void); > > +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ > > + > > +static inline bool rcu_sys_is_idle(void) > > +{ > > + return false; > > +} > > + > > +static inline void rcu_sysidle_force_exit(void) > > +{ > > +} > > + > > +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ > > + > > + > > #endif /* __LINUX_RCUPDATE_H */ > > diff --git a/kernel/rcutree.c b/kernel/rcutree.c > > index 725524e..aa6d96e 100644 > > --- a/kernel/rcutree.c > > +++ b/kernel/rcutree.c > > @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct > > rcu_data *rdp, > > bool *isidle, unsigned long *maxj) > > { > > rdp->dynticks_snap = atomic_add_return(0, >dynticks->dynticks); > > + rcu_sysidle_check_cpu(rdp, isidle, maxj); > > return (rdp->dynticks_snap & 0x1) == 0; > > } > > > > @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int > > fqs_state_in) > > rsp->n_force_qs++; > > if (fqs_state == RCU_SAVE_DYNTICK) { > > /* Collect dyntick-idle snapshots. */ > > + if (is_sysidle_rcu_state(rsp)) { > > + isidle = 1; > > isidle = true; > the type of isidle is bool > > > + maxj = jiffies - ULONG_MAX / 4; > > + } > > force_qs_rnp(rsp, dyntick_save_progress_counter, > > , ); > > + rcu_sysidle_report_gp(rsp, isidle, maxj); > > fqs_state = RCU_FORCE_QS; > > } else { > > /* Handle dyntick-idle and offline CPUs. */ > > + isidle = 0; > > isidle = false; > > > force_qs_rnp(rsp, rcu_implicit_dynticks_qs, , ); > > } > > /* Clear flag to prevent immediate re-entry. 
*/ > > @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp, > > cpu = rnp->grplo; > > bit = 1; > > for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { > > - if ((rnp->qsmask & bit) != 0 && > > - f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) > > - mask |= bit; > > + if ((rnp->qsmask & bit) != 0) { > > + if ((rnp->qsmaskinit & bit) != 0) > > + *isidle = 0; > > *isidle = false All good catches, fixed. > > + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) > > + mask |= bit; > > + } > > } > > if (mask != 0) { > > > > diff --git a/kernel/rcutree.h b/kernel/rcutree.h > > index 1895043..e0de5dc 100644 > > --- a/kernel/rcutree.h > > +++ b/kernel/rcutree.h > > @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu); > > static bool init_nocb_callback_list(struct rcu_data *rdp); > > static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); > > static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); > > +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, > > + unsigned long *maxj); > > +static bool is_sysidle_rcu_state(struct rcu_state *rsp); > >
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On 07/27/2013 07:19 AM, Paul E. McKenney wrote: > From: "Paul E. McKenney" > > This commit adds the state machine that takes the per-CPU idle data > as input and produces a full-system-idle indication as output. This > state machine is driven out of RCU's quiescent-state-forcing > mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU > idle state and then rcu_sysidle_report() to drive the state machine. > > The full-system-idle state is sampled using rcu_sys_is_idle(), which > also drives the state machine if RCU is idle (and does so by forcing > RCU to become non-idle). This function returns true if all but the > timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long > enough to avoid memory contention on the full_sysidle_state state > variable. The rcu_sysidle_force_exit() may be called externally > to reset the state machine back into non-idle state. > > Signed-off-by: Paul E. McKenney > Cc: Frederic Weisbecker > Cc: Steven Rostedt > --- > include/linux/rcupdate.h | 18 +++ > kernel/rcutree.c | 16 ++- > kernel/rcutree.h | 5 + > kernel/rcutree_plugin.h | 284 > ++- > 4 files changed, 316 insertions(+), 7 deletions(-) > > diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h > index 48f1ef9..1aa8d8c 100644 > --- a/include/linux/rcupdate.h > +++ b/include/linux/rcupdate.h > @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return > false; } > #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ > > > +/* Only for use by adaptive-ticks code. 
*/ > +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE > +extern bool rcu_sys_is_idle(void); > +extern void rcu_sysidle_force_exit(void); > +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ > + > +static inline bool rcu_sys_is_idle(void) > +{ > + return false; > +} > + > +static inline void rcu_sysidle_force_exit(void) > +{ > +} > + > +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ > + > + > #endif /* __LINUX_RCUPDATE_H */ > diff --git a/kernel/rcutree.c b/kernel/rcutree.c > index 725524e..aa6d96e 100644 > --- a/kernel/rcutree.c > +++ b/kernel/rcutree.c > @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data > *rdp, >bool *isidle, unsigned long *maxj) > { > rdp->dynticks_snap = atomic_add_return(0, >dynticks->dynticks); > + rcu_sysidle_check_cpu(rdp, isidle, maxj); > return (rdp->dynticks_snap & 0x1) == 0; > } > > @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int > fqs_state_in) > rsp->n_force_qs++; > if (fqs_state == RCU_SAVE_DYNTICK) { > /* Collect dyntick-idle snapshots. */ > + if (is_sysidle_rcu_state(rsp)) { > + isidle = 1; isidle = true; the type of isidle is bool > + maxj = jiffies - ULONG_MAX / 4; > + } > force_qs_rnp(rsp, dyntick_save_progress_counter, >, ); > + rcu_sysidle_report_gp(rsp, isidle, maxj); > fqs_state = RCU_FORCE_QS; > } else { > /* Handle dyntick-idle and offline CPUs. */ > + isidle = 0; isidle = false; > force_qs_rnp(rsp, rcu_implicit_dynticks_qs, , ); > } > /* Clear flag to prevent immediate re-entry. 
*/ > @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp, > cpu = rnp->grplo; > bit = 1; > for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { > - if ((rnp->qsmask & bit) != 0 && > - f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) > - mask |= bit; > + if ((rnp->qsmask & bit) != 0) { > + if ((rnp->qsmaskinit & bit) != 0) > + *isidle = 0; *isidle = false > + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) > + mask |= bit; > + } > } > if (mask != 0) { > > diff --git a/kernel/rcutree.h b/kernel/rcutree.h > index 1895043..e0de5dc 100644 > --- a/kernel/rcutree.h > +++ b/kernel/rcutree.h > @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu); > static bool init_nocb_callback_list(struct rcu_data *rdp); > static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); > static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); > +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, > + unsigned long *maxj); > +static bool is_sysidle_rcu_state(struct rcu_state *rsp); > +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, > + unsigned long maxj); > static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); > > #endif /* #ifndef RCU_TREE_NONCORE */ > diff --git
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On 07/27/2013 07:19 AM, Paul E. McKenney wrote: From: Paul E. McKenney paul...@linux.vnet.ibm.com This commit adds the state machine that takes the per-CPU idle data as input and produces a full-system-idle indication as output. This state machine is driven out of RCU's quiescent-state-forcing mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU idle state and then rcu_sysidle_report() to drive the state machine. The full-system-idle state is sampled using rcu_sys_is_idle(), which also drives the state machine if RCU is idle (and does so by forcing RCU to become non-idle). This function returns true if all but the timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long enough to avoid memory contention on the full_sysidle_state state variable. The rcu_sysidle_force_exit() may be called externally to reset the state machine back into non-idle state. Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com Cc: Frederic Weisbecker fweis...@gmail.com Cc: Steven Rostedt rost...@goodmis.org --- include/linux/rcupdate.h | 18 +++ kernel/rcutree.c | 16 ++- kernel/rcutree.h | 5 + kernel/rcutree_plugin.h | 284 ++- 4 files changed, 316 insertions(+), 7 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48f1ef9..1aa8d8c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +/* Only for use by adaptive-ticks code. 
*/ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE +extern bool rcu_sys_is_idle(void); +extern void rcu_sysidle_force_exit(void); +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static inline bool rcu_sys_is_idle(void) +{ + return false; +} + +static inline void rcu_sysidle_force_exit(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 725524e..aa6d96e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { rdp-dynticks_snap = atomic_add_return(0, rdp-dynticks-dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); return (rdp-dynticks_snap 0x1) == 0; } @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) rsp-n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; isidle = true; the type of isidle is bool + maxj = jiffies - ULONG_MAX / 4; + } force_qs_rnp(rsp, dyntick_save_progress_counter, isidle, maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ + isidle = 0; isidle = false; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, isidle, maxj); } /* Clear flag to prevent immediate re-entry. 
*/ @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp, cpu = rnp-grplo; bit = 1; for (; cpu = rnp-grphi; cpu++, bit = 1) { - if ((rnp-qsmask bit) != 0 - f(per_cpu_ptr(rsp-rda, cpu), isidle, maxj)) - mask |= bit; + if ((rnp-qsmask bit) != 0) { + if ((rnp-qsmaskinit bit) != 0) + *isidle = 0; *isidle = false + if (f(per_cpu_ptr(rsp-rda, cpu), isidle, maxj)) + mask |= bit; + } } if (mask != 0) { diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1895043..e0de5dc 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Mon, Jul 29, 2013 at 04:19:48PM +0800, Lai Jiangshan wrote: On 07/27/2013 07:19 AM, Paul E. McKenney wrote: From: Paul E. McKenney paul...@linux.vnet.ibm.com This commit adds the state machine that takes the per-CPU idle data as input and produces a full-system-idle indication as output. This state machine is driven out of RCU's quiescent-state-forcing mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU idle state and then rcu_sysidle_report() to drive the state machine. The full-system-idle state is sampled using rcu_sys_is_idle(), which also drives the state machine if RCU is idle (and does so by forcing RCU to become non-idle). This function returns true if all but the timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long enough to avoid memory contention on the full_sysidle_state state variable. The rcu_sysidle_force_exit() may be called externally to reset the state machine back into non-idle state. Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com Cc: Frederic Weisbecker fweis...@gmail.com Cc: Steven Rostedt rost...@goodmis.org --- include/linux/rcupdate.h | 18 +++ kernel/rcutree.c | 16 ++- kernel/rcutree.h | 5 + kernel/rcutree_plugin.h | 284 ++- 4 files changed, 316 insertions(+), 7 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48f1ef9..1aa8d8c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +/* Only for use by adaptive-ticks code. 
*/ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE +extern bool rcu_sys_is_idle(void); +extern void rcu_sysidle_force_exit(void); +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static inline bool rcu_sys_is_idle(void) +{ + return false; +} + +static inline void rcu_sysidle_force_exit(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 725524e..aa6d96e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { rdp-dynticks_snap = atomic_add_return(0, rdp-dynticks-dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); return (rdp-dynticks_snap 0x1) == 0; } @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) rsp-n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; isidle = true; the type of isidle is bool + maxj = jiffies - ULONG_MAX / 4; + } force_qs_rnp(rsp, dyntick_save_progress_counter, isidle, maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ + isidle = 0; isidle = false; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, isidle, maxj); } /* Clear flag to prevent immediate re-entry. */ @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp, cpu = rnp-grplo; bit = 1; for (; cpu = rnp-grphi; cpu++, bit = 1) { - if ((rnp-qsmask bit) != 0 - f(per_cpu_ptr(rsp-rda, cpu), isidle, maxj)) - mask |= bit; + if ((rnp-qsmask bit) != 0) { + if ((rnp-qsmaskinit bit) != 0) + *isidle = 0; *isidle = false All good catches, fixed. 
+ if (f(per_cpu_ptr(rsp-rda, cpu), isidle, maxj)) + mask |= bit; + } } if (mask != 0) { diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1895043..e0de5dc 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Fri, Jul 26, 2013 at 03:52:12PM -0700, Paul E. McKenney wrote: > On Thu, Jul 25, 2013 at 01:26:44AM +0200, Frederic Weisbecker wrote: > > I don't know because I encountered some troubles to build it, I'm seeing > > thousand > > lines like this: > > > > Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line > > 534. > > /usr/bin/a2ping: not a GS output from gs -dSAFER > > ./cartoons/whippersnapper300.eps -> ./cartoons/whippersnapper300.pdf > > Name "main::opt_extra" used only once: possible typo at /usr/bin/a2ping > > line 546. > > Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line > > 534. > > /usr/bin/a2ping: not a GS output from gs -dSAFER > > make: *** [embedfonts] Error 1 > > Strange. My version of a2ping is Ubuntu 12.04's: > > a2ping.pl 2.77p, 2006-11-15 -- Written by from April 2003. > > You have something more recent? I run Fedora 15 (yeah, too lazy to upgrade), and: a2ping.pl 2.77p, 2004-04-28 -- Written by from April 2003 > > So back to the issue, I think we made nice progresses with my rusty brain > > ;-) > > But just to be clear, I'm pasting that again for just a few precisions: > > > > CPU 0CPU 1 > > > >cmpxchg(_sysidle_state, //CPU 1 wakes up > > RCU_SYSIDLE_SHORT, > > atomic_inc(rdtp(1)->dynticks_idle) > > RCU_SYSIDLE_LONG); > > > >smp_mb() //cmpxchgsmp_mb() > >atomic_read(rdtp(1)->dynticks_idle) ACCESS_ONCE(full_sysidle_state > > //CPU 0 goes to sleep > > > > > > > > 1) If CPU 0 sets RCU_SYSIDLE_LONG and sees dynticks_idle as even, do we > > have the _guarantee_ > > that later CPU 1 sees full_sysidle_state == RCU_SYSIDLE_LONG (or any later > > full_sysidle_state value) > > due to the connection between atomic_read / atomic_inc and the barriers > > that come along? > > No, we do not have this guarantee. > > What happens instead is that CPU 0 later sets RCU_SYSIDLE_FULL after > having again seen CPU 1's ->dynticks_idle having an even (idle) value. 
> If CPU 1 later increments its ->dynticks_idle to an odd (non-idle) value, > then does a memory barrier, and then reads full_sysidle_state, then CPU > 1 is guaranteed to see RCU_SYSIDLE_LONG. Please note that CPU 1 is -not- > obligated to see RCU_SYSIDLE_FULL, much less RCU_SYSIDLE_NOTED. Aah. I think I got it now. So lay this out like what follows: void cpu_0(int old, int new) { cmpxchg(_sysidle_state, old, new); smp_mb() atomic_read(rdtp(1)->dynticks_idle) } void cpu_1(void) { atomic_inc(rdtp(1)->dynticks_idle) smp_mb() ACCESS_ONCE(full_sysidle_stat) } And this scenario: CPU 0 CPU 1 cpu_0(RCU_SYSIDLE_NOT, RCU_SYSIDLE_SHORT) cpu_0(RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG) cpu_1() // guaranteed to see at least RCU_SYSIDLE_SHORT cpu_0(RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL) cpu_1() // guaranteed to see at least RCU_SYSIDLE_LONG IIUC, the double slah comments should be true. If so I can observe that even if we don't have memory commit guarantees, there seem to be some forward progress guarantee against the update and read sequences. What usually guarantees such forward progress on the sequences is when we have some locking or atomic ops updates that also return a value (inc_return, cmpxchg, ...) mirroring on both sides. I can't find anything in atomic_ops.txt or in your book (ok sorry I only read a few pages on the barriers chapter :o) that describes such forward progress guarantee. Would I retrieve such a guarantee on a generic sequence like this? //A = B = 0 CPU 0CPU 1 store A = 1 store B = 1 mb() mb() read B read A store A = 2 store B = 2 mb() mb() read B read A On the second sequence can I deduce that A and B as respectively read by CPU 0 and CPU 1 are at least 1 and might be 2? > > However, when CPU 1 does an atomic operation on full_sysidle_state that > returns the old value, CPU 1 is guaranteed to get the most recent state. 
> (Without this guarantee, we could have a cactus state of values for > full_sysidle_state, which might make alternate-universe fans happy, > but which would be really hard to program.) Haha! Yeah for the cmpxchg I'm ok. > > This is why we need an RCU_SYSIDLE_LONG state in addition to > RCU_SYSIDLE_SHORT and RCU_SYSIDLE_FULL. > > > 2) You taught me once that barrier != memory committed, and it has been one > > of the hardest trauma > > in my life. How can we be sure that CPU 1 sees memory as committed from CPU > > 0? The only fact that > > we read an even value from CPU 0 is enough for the connection between the > > atomic_read() and atomic_inc() > >
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Fri, Jul 26, 2013 at 03:52:12PM -0700, Paul E. McKenney wrote: On Thu, Jul 25, 2013 at 01:26:44AM +0200, Frederic Weisbecker wrote: I don't know because I encountered some troubles to build it, I'm seeing thousand lines like this: Name main::opt_help used only once: possible typo at /usr/bin/a2ping line 534. /usr/bin/a2ping: not a GS output from gs -dSAFER ./cartoons/whippersnapper300.eps - ./cartoons/whippersnapper300.pdf Name main::opt_extra used only once: possible typo at /usr/bin/a2ping line 546. Name main::opt_help used only once: possible typo at /usr/bin/a2ping line 534. /usr/bin/a2ping: not a GS output from gs -dSAFER make: *** [embedfonts] Error 1 Strange. My version of a2ping is Ubuntu 12.04's: a2ping.pl 2.77p, 2006-11-15 -- Written by p...@fazekas.hu from April 2003. You have something more recent? I run Fedora 15 (yeah, too lazy to upgrade), and: a2ping.pl 2.77p, 2004-04-28 -- Written by p...@fazekas.hu from April 2003 So back to the issue, I think we made nice progresses with my rusty brain ;-) But just to be clear, I'm pasting that again for just a few precisions: CPU 0CPU 1 cmpxchg(full_sysidle_state, //CPU 1 wakes up RCU_SYSIDLE_SHORT, atomic_inc(rdtp(1)-dynticks_idle) RCU_SYSIDLE_LONG); smp_mb() //cmpxchgsmp_mb() atomic_read(rdtp(1)-dynticks_idle) ACCESS_ONCE(full_sysidle_state //CPU 0 goes to sleep 1) If CPU 0 sets RCU_SYSIDLE_LONG and sees dynticks_idle as even, do we have the _guarantee_ that later CPU 1 sees full_sysidle_state == RCU_SYSIDLE_LONG (or any later full_sysidle_state value) due to the connection between atomic_read / atomic_inc and the barriers that come along? No, we do not have this guarantee. What happens instead is that CPU 0 later sets RCU_SYSIDLE_FULL after having again seen CPU 1's -dynticks_idle having an even (idle) value. If CPU 1 later increments its -dynticks_idle to an odd (non-idle) value, then does a memory barrier, and then reads full_sysidle_state, then CPU 1 is guaranteed to see RCU_SYSIDLE_LONG. 
Please note that CPU 1 is -not- obligated to see RCU_SYSIDLE_FULL, much less RCU_SYSIDLE_NOTED. Aah. I think I got it now. So lay this out like what follows: void cpu_0(int old, int new) { cmpxchg(full_sysidle_state, old, new); smp_mb() atomic_read(rdtp(1)-dynticks_idle) } void cpu_1(void) { atomic_inc(rdtp(1)-dynticks_idle) smp_mb() ACCESS_ONCE(full_sysidle_stat) } And this scenario: CPU 0 CPU 1 cpu_0(RCU_SYSIDLE_NOT, RCU_SYSIDLE_SHORT) cpu_0(RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG) cpu_1() // guaranteed to see at least RCU_SYSIDLE_SHORT cpu_0(RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL) cpu_1() // guaranteed to see at least RCU_SYSIDLE_LONG IIUC, the double slah comments should be true. If so I can observe that even if we don't have memory commit guarantees, there seem to be some forward progress guarantee against the update and read sequences. What usually guarantees such forward progress on the sequences is when we have some locking or atomic ops updates that also return a value (inc_return, cmpxchg, ...) mirroring on both sides. I can't find anything in atomic_ops.txt or in your book (ok sorry I only read a few pages on the barriers chapter :o) that describes such forward progress guarantee. Would I retrieve such a guarantee on a generic sequence like this? //A = B = 0 CPU 0CPU 1 store A = 1 store B = 1 mb() mb() read B read A store A = 2 store B = 2 mb() mb() read B read A On the second sequence can I deduce that A and B as respectively read by CPU 0 and CPU 1 are at least 1 and might be 2? However, when CPU 1 does an atomic operation on full_sysidle_state that returns the old value, CPU 1 is guaranteed to get the most recent state. (Without this guarantee, we could have a cactus state of values for full_sysidle_state, which might make alternate-universe fans happy, but which would be really hard to program.) Haha! Yeah for the cmpxchg I'm ok. This is why we need an RCU_SYSIDLE_LONG state in addition to RCU_SYSIDLE_SHORT and RCU_SYSIDLE_FULL. 
2) You taught me once that barrier != memory committed, and it has been one of the hardest trauma in my life. How can we be sure that CPU 1 sees memory as committed from CPU 0? The only fact that we read an even value from CPU 0 is enough for the connection between the atomic_read() and atomic_inc() and all the barriers that come along? My condolences on the trauma! Now, if you will
[PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
From: "Paul E. McKenney" This commit adds the state machine that takes the per-CPU idle data as input and produces a full-system-idle indication as output. This state machine is driven out of RCU's quiescent-state-forcing mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU idle state and then rcu_sysidle_report() to drive the state machine. The full-system-idle state is sampled using rcu_sys_is_idle(), which also drives the state machine if RCU is idle (and does so by forcing RCU to become non-idle). This function returns true if all but the timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long enough to avoid memory contention on the full_sysidle_state state variable. The rcu_sysidle_force_exit() may be called externally to reset the state machine back into non-idle state. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Steven Rostedt --- include/linux/rcupdate.h | 18 +++ kernel/rcutree.c | 16 ++- kernel/rcutree.h | 5 + kernel/rcutree_plugin.h | 284 ++- 4 files changed, 316 insertions(+), 7 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48f1ef9..1aa8d8c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +/* Only for use by adaptive-ticks code. 
*/ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE +extern bool rcu_sys_is_idle(void); +extern void rcu_sysidle_force_exit(void); +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static inline bool rcu_sys_is_idle(void) +{ + return false; +} + +static inline void rcu_sysidle_force_exit(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 725524e..aa6d96e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { rdp->dynticks_snap = atomic_add_return(0, >dynticks->dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); return (rdp->dynticks_snap & 0x1) == 0; } @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; + maxj = jiffies - ULONG_MAX / 4; + } force_qs_rnp(rsp, dyntick_save_progress_counter, , ); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ + isidle = 0; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, , ); } /* Clear flag to prevent immediate re-entry. 
*/ @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp, cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && - f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) - mask |= bit; + if ((rnp->qsmask & bit) != 0) { + if ((rnp->qsmaskinit & bit) != 0) + *isidle = 0; + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + mask |= bit; + } } if (mask != 0) { diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1895043..e0de5dc 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3edae39..ff84bed 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include "time/tick-internal.h" #define RCU_KTHREAD_PRIO 1 @@ -2395,12 +2395,12 @@ static void
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 25, 2013 at 01:26:44AM +0200, Frederic Weisbecker wrote: > On Wed, Jul 24, 2013 at 03:09:02PM -0700, Paul E. McKenney wrote: > > On Wed, Jul 24, 2013 at 08:09:04PM +0200, Frederic Weisbecker wrote: > > > On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote: > > > > > Lets summarize the last sequence, the following happens ordered by > > > > > time: > > > > > > > > > > CPU 0 CPU 1 > > > > > > > > > > cmpxchg(_sysidle_state, > > > > > RCU_SYSIDLE_SHORT, > > > > > RCU_SYSIDLE_LONG); > > > > > > > > > > smp_mb() //cmpxchg > > > > > > > > > > atomic_read(rdtp(1)->dynticks_idle) > > > > > > > > > > //CPU 0 goes to sleep > > > > >//CPU 1 wakes up > > > > > > > > > > atomic_inc(rdtp(1)->dynticks_idle) > > > > > > > > > >smp_mb() > > > > > > > > > >ACCESS_ONCE(full_sysidle_state) > > > > > > > > > > > > > > > Are you suggesting that because the CPU 1 executes its atomic_inc() > > > > > _after_ (in terms > > > > > of absolute time) the atomic_read of CPU 0, the ordering settled in > > > > > both sides guarantees > > > > > that the value read from CPU 1 is the one from the cmpxchg that > > > > > precedes the atomic_read, > > > > > or FULL or FULL_NOTED that happen later. > > > > > > > > > > If so that's a big lesson for me. > > > > > > > > It is not absolute time that matters. Instead, it is the fact that > > > > CPU 0, when reading from ->dynticks_idle, read the old value before the > > > > atomic_inc(). Therefore, anything CPU 0 did before that memory barrier > > > > preceding CPU 0's read must come before anything CPU 1 did after that > > > > memory barrier following the atomic_inc(). For this to work, there > > > > must be some access to the same variable on each CPU. > > > > > > Aren't we in the following situation? > > > > > > CPU 0 CPU 1 > > > > > > STORE ASTORE B > > > LOAD B LOAD A > > > > > > > > > If so and referring to your perfbook, this is an "ears to mouth" > > > situation. 
> > > And it seems to describe there is no strong guarantee in that situation. > > > > "Yes" to the first, but on modern hardware, "no" to the second. The key > > paragraph is Section 12.2.4.5: > > > > The following pairings from Table 12.1 can be used on modern > > hardware, but might fail on some systems that were produced in > > the 1990s. However, these can safely be used on all mainstream > > hardware introduced since the year 2000. > > Right I missed that! Nor are you alone. ;-) > > That said, you are not the first to be confused by this, so I might need > > to rework this section to make it clear that each can in fact be used on > > modern hardware. > > > > If you happen to have an old Sequent NUMA-Q or Symmetry box lying around, > > things are a bit different. On the other hand, I don't believe that any > > of these old boxes are still running Linux. (Hey, I am as sentimental as > > the next guy, but there are limits!) > > > > I updated this section and pushed it, please let me know if this helps! > > I don't know because I encountered some troubles to build it, I'm seeing > thousand > lines like this: > > Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line > 534. > /usr/bin/a2ping: not a GS output from gs -dSAFER > ./cartoons/whippersnapper300.eps -> ./cartoons/whippersnapper300.pdf > Name "main::opt_extra" used only once: possible typo at /usr/bin/a2ping line > 546. > Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line > 534. > /usr/bin/a2ping: not a GS output from gs -dSAFER > make: *** [embedfonts] Error 1 Strange. My version of a2ping is Ubuntu 12.04's: a2ping.pl 2.77p, 2006-11-15 -- Written by from April 2003. You have something more recent? > Anyway I looked at the diff and it looks indeed clearer, thanks! Glad it helped! 
> So back to the issue, I think we made nice progresses with my rusty brain ;-) > But just to be clear, I'm pasting that again for just a few precisions: > > CPU 0CPU 1 > >cmpxchg(_sysidle_state, //CPU 1 wakes up > RCU_SYSIDLE_SHORT, > atomic_inc(rdtp(1)->dynticks_idle) > RCU_SYSIDLE_LONG); > >smp_mb() //cmpxchgsmp_mb() >atomic_read(rdtp(1)->dynticks_idle) ACCESS_ONCE(full_sysidle_state > //CPU 0 goes to sleep > > > > 1) If CPU 0 sets RCU_SYSIDLE_LONG and sees dynticks_idle as even, do we have > the _guarantee_ > that later CPU 1 sees full_sysidle_state == RCU_SYSIDLE_LONG (or any later > full_sysidle_state value) > due to the connection between atomic_read /
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 25, 2013 at 01:26:44AM +0200, Frederic Weisbecker wrote: On Wed, Jul 24, 2013 at 03:09:02PM -0700, Paul E. McKenney wrote: On Wed, Jul 24, 2013 at 08:09:04PM +0200, Frederic Weisbecker wrote: On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote: Lets summarize the last sequence, the following happens ordered by time: CPU 0 CPU 1 cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); smp_mb() //cmpxchg atomic_read(rdtp(1)-dynticks_idle) //CPU 0 goes to sleep //CPU 1 wakes up atomic_inc(rdtp(1)-dynticks_idle) smp_mb() ACCESS_ONCE(full_sysidle_state) Are you suggesting that because the CPU 1 executes its atomic_inc() _after_ (in terms of absolute time) the atomic_read of CPU 0, the ordering settled in both sides guarantees that the value read from CPU 1 is the one from the cmpxchg that precedes the atomic_read, or FULL or FULL_NOTED that happen later. If so that's a big lesson for me. It is not absolute time that matters. Instead, it is the fact that CPU 0, when reading from -dynticks_idle, read the old value before the atomic_inc(). Therefore, anything CPU 0 did before that memory barrier preceding CPU 0's read must come before anything CPU 1 did after that memory barrier following the atomic_inc(). For this to work, there must be some access to the same variable on each CPU. Aren't we in the following situation? CPU 0 CPU 1 STORE ASTORE B LOAD B LOAD A If so and referring to your perfbook, this is an ears to mouth situation. And it seems to describe there is no strong guarantee in that situation. Yes to the first, but on modern hardware, no to the second. The key paragraph is Section 12.2.4.5: The following pairings from Table 12.1 can be used on modern hardware, but might fail on some systems that were produced in the 1990s. However, these can safely be used on all mainstream hardware introduced since the year 2000. Right I missed that! Nor are you alone. 
;-) That said, you are not the first to be confused by this, so I might need to rework this section to make it clear that each can in fact be used on modern hardware. If you happen to have an old Sequent NUMA-Q or Symmetry box lying around, things are a bit different. On the other hand, I don't believe that any of these old boxes are still running Linux. (Hey, I am as sentimental as the next guy, but there are limits!) I updated this section and pushed it, please let me know if this helps! I don't know because I encountered some troubles to build it, I'm seeing thousand lines like this: Name main::opt_help used only once: possible typo at /usr/bin/a2ping line 534. /usr/bin/a2ping: not a GS output from gs -dSAFER ./cartoons/whippersnapper300.eps - ./cartoons/whippersnapper300.pdf Name main::opt_extra used only once: possible typo at /usr/bin/a2ping line 546. Name main::opt_help used only once: possible typo at /usr/bin/a2ping line 534. /usr/bin/a2ping: not a GS output from gs -dSAFER make: *** [embedfonts] Error 1 Strange. My version of a2ping is Ubuntu 12.04's: a2ping.pl 2.77p, 2006-11-15 -- Written by p...@fazekas.hu from April 2003. You have something more recent? Anyway I looked at the diff and it looks indeed clearer, thanks! Glad it helped! So back to the issue, I think we made nice progresses with my rusty brain ;-) But just to be clear, I'm pasting that again for just a few precisions: CPU 0CPU 1 cmpxchg(full_sysidle_state, //CPU 1 wakes up RCU_SYSIDLE_SHORT, atomic_inc(rdtp(1)-dynticks_idle) RCU_SYSIDLE_LONG); smp_mb() //cmpxchgsmp_mb() atomic_read(rdtp(1)-dynticks_idle) ACCESS_ONCE(full_sysidle_state //CPU 0 goes to sleep 1) If CPU 0 sets RCU_SYSIDLE_LONG and sees dynticks_idle as even, do we have the _guarantee_ that later CPU 1 sees full_sysidle_state == RCU_SYSIDLE_LONG (or any later full_sysidle_state value) due to the connection between atomic_read / atomic_inc and the barriers that come along? No, we do not have this guarantee. 
What happens instead is that CPU 0 later sets RCU_SYSIDLE_FULL after having again seen CPU 1's ->dynticks_idle having an even (idle) value. If CPU 1 later increments its ->dynticks_idle to an odd (non-idle) value, then
[PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
From: Paul E. McKenney paul...@linux.vnet.ibm.com This commit adds the state machine that takes the per-CPU idle data as input and produces a full-system-idle indication as output. This state machine is driven out of RCU's quiescent-state-forcing mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU idle state and then rcu_sysidle_report() to drive the state machine. The full-system-idle state is sampled using rcu_sys_is_idle(), which also drives the state machine if RCU is idle (and does so by forcing RCU to become non-idle). This function returns true if all but the timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long enough to avoid memory contention on the full_sysidle_state state variable. The rcu_sysidle_force_exit() may be called externally to reset the state machine back into non-idle state. Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com Cc: Frederic Weisbecker fweis...@gmail.com Cc: Steven Rostedt rost...@goodmis.org --- include/linux/rcupdate.h | 18 +++ kernel/rcutree.c | 16 ++- kernel/rcutree.h | 5 + kernel/rcutree_plugin.h | 284 ++- 4 files changed, 316 insertions(+), 7 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48f1ef9..1aa8d8c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +/* Only for use by adaptive-ticks code. 
*/ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE +extern bool rcu_sys_is_idle(void); +extern void rcu_sysidle_force_exit(void); +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static inline bool rcu_sys_is_idle(void) +{ + return false; +} + +static inline void rcu_sysidle_force_exit(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 725524e..aa6d96e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { rdp-dynticks_snap = atomic_add_return(0, rdp-dynticks-dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); return (rdp-dynticks_snap 0x1) == 0; } @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) rsp-n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; + maxj = jiffies - ULONG_MAX / 4; + } force_qs_rnp(rsp, dyntick_save_progress_counter, isidle, maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ + isidle = 0; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, isidle, maxj); } /* Clear flag to prevent immediate re-entry. 
*/ @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp, cpu = rnp-grplo; bit = 1; for (; cpu = rnp-grphi; cpu++, bit = 1) { - if ((rnp-qsmask bit) != 0 - f(per_cpu_ptr(rsp-rda, cpu), isidle, maxj)) - mask |= bit; + if ((rnp-qsmask bit) != 0) { + if ((rnp-qsmaskinit bit) != 0) + *isidle = 0; + if (f(per_cpu_ptr(rsp-rda, cpu), isidle, maxj)) + mask |= bit; + } } if (mask != 0) { diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1895043..e0de5dc 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3edae39..ff84bed 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7 +28,7 @@ #include linux/gfp.h #include linux/oom.h #include
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Wed, Jul 24, 2013 at 03:09:02PM -0700, Paul E. McKenney wrote: > On Wed, Jul 24, 2013 at 08:09:04PM +0200, Frederic Weisbecker wrote: > > On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote: > > > > Lets summarize the last sequence, the following happens ordered by time: > > > > > > > > CPU 0 CPU 1 > > > > > > > > cmpxchg(_sysidle_state, > > > > RCU_SYSIDLE_SHORT, > > > > RCU_SYSIDLE_LONG); > > > > > > > > smp_mb() //cmpxchg > > > > > > > > atomic_read(rdtp(1)->dynticks_idle) > > > > > > > > //CPU 0 goes to sleep > > > >//CPU 1 wakes up > > > > > > > > atomic_inc(rdtp(1)->dynticks_idle) > > > > > > > >smp_mb() > > > > > > > >ACCESS_ONCE(full_sysidle_state) > > > > > > > > > > > > Are you suggesting that because the CPU 1 executes its atomic_inc() > > > > _after_ (in terms > > > > of absolute time) the atomic_read of CPU 0, the ordering settled in > > > > both sides guarantees > > > > that the value read from CPU 1 is the one from the cmpxchg that > > > > precedes the atomic_read, > > > > or FULL or FULL_NOTED that happen later. > > > > > > > > If so that's a big lesson for me. > > > > > > It is not absolute time that matters. Instead, it is the fact that > > > CPU 0, when reading from ->dynticks_idle, read the old value before the > > > atomic_inc(). Therefore, anything CPU 0 did before that memory barrier > > > preceding CPU 0's read must come before anything CPU 1 did after that > > > memory barrier following the atomic_inc(). For this to work, there > > > must be some access to the same variable on each CPU. > > > > Aren't we in the following situation? > > > > CPU 0 CPU 1 > > > > STORE ASTORE B > > LOAD B LOAD A > > > > > > If so and referring to your perfbook, this is an "ears to mouth" situation. > > And it seems to describe there is no strong guarantee in that situation. > > "Yes" to the first, but on modern hardware, "no" to the second. 
The key > paragraph is Section 12.2.4.5: > > The following pairings from Table 12.1 can be used on modern > hardware, but might fail on some systems that were produced in > the 1990s. However, these can safely be used on all mainstream > hardware introduced since the year 2000. Right I missed that! > > That said, you are not the first to be confused by this, so I might need > to rework this section to make it clear that each can in fact be used on > modern hardware. > > If you happen to have an old Sequent NUMA-Q or Symmetry box lying around, > things are a bit different. On the other hand, I don't believe that any > of these old boxes are still running Linux. (Hey, I am as sentimental as > the next guy, but there are limits!) > > I updated this section and pushed it, please let me know if this helps! I don't know because I encountered some troubles to build it, I'm seeing thousand lines like this: Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line 534. /usr/bin/a2ping: not a GS output from gs -dSAFER ./cartoons/whippersnapper300.eps -> ./cartoons/whippersnapper300.pdf Name "main::opt_extra" used only once: possible typo at /usr/bin/a2ping line 546. Name "main::opt_help" used only once: possible typo at /usr/bin/a2ping line 534. /usr/bin/a2ping: not a GS output from gs -dSAFER make: *** [embedfonts] Error 1 Anyway I looked at the diff and it looks indeed clearer, thanks! 
So back to the issue, I think we made nice progresses with my rusty brain ;-) But just to be clear, I'm pasting that again for just a few precisions: CPU 0CPU 1 cmpxchg(_sysidle_state, //CPU 1 wakes up RCU_SYSIDLE_SHORT, atomic_inc(rdtp(1)->dynticks_idle) RCU_SYSIDLE_LONG); smp_mb() //cmpxchgsmp_mb() atomic_read(rdtp(1)->dynticks_idle) ACCESS_ONCE(full_sysidle_state //CPU 0 goes to sleep 1) If CPU 0 sets RCU_SYSIDLE_LONG and sees dynticks_idle as even, do we have the _guarantee_ that later CPU 1 sees full_sysidle_state == RCU_SYSIDLE_LONG (or any later full_sysidle_state value) due to the connection between atomic_read / atomic_inc and the barriers that come along? 2) You taught me once that barrier != memory committed, and it has been one of the hardest trauma in my life. How can we be sure that CPU 1 sees memory as committed from CPU 0? The only fact that we read an even value from CPU 0 is enough for the connection between the atomic_read() and atomic_inc() and all the barriers that come along? 3) In your book it says: "recent hardware would guarantee that at least one of the loads saw
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Wed, Jul 24, 2013 at 08:09:04PM +0200, Frederic Weisbecker wrote: > On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote: > > > Lets summarize the last sequence, the following happens ordered by time: > > > > > > CPU 0 CPU 1 > > > > > > cmpxchg(_sysidle_state, > > > RCU_SYSIDLE_SHORT, > > > RCU_SYSIDLE_LONG); > > > > > > smp_mb() //cmpxchg > > > > > > atomic_read(rdtp(1)->dynticks_idle) > > > > > > //CPU 0 goes to sleep > > >//CPU 1 wakes up > > >atomic_inc(rdtp(1)->dynticks_idle) > > > > > >smp_mb() > > > > > >ACCESS_ONCE(full_sysidle_state) > > > > > > > > > Are you suggesting that because the CPU 1 executes its atomic_inc() > > > _after_ (in terms > > > of absolute time) the atomic_read of CPU 0, the ordering settled in both > > > sides guarantees > > > that the value read from CPU 1 is the one from the cmpxchg that precedes > > > the atomic_read, > > > or FULL or FULL_NOTED that happen later. > > > > > > If so that's a big lesson for me. > > > > It is not absolute time that matters. Instead, it is the fact that > > CPU 0, when reading from ->dynticks_idle, read the old value before the > > atomic_inc(). Therefore, anything CPU 0 did before that memory barrier > > preceding CPU 0's read must come before anything CPU 1 did after that > > memory barrier following the atomic_inc(). For this to work, there > > must be some access to the same variable on each CPU. > > Aren't we in the following situation? > > CPU 0 CPU 1 > > STORE ASTORE B > LOAD B LOAD A > > > If so and referring to your perfbook, this is an "ears to mouth" situation. > And it seems to describe there is no strong guarantee in that situation. "Yes" to the first, but on modern hardware, "no" to the second. The key paragraph is Section 12.2.4.5: The following pairings from Table 12.1 can be used on modern hardware, but might fail on some systems that were produced in the 1990s. However, these can safely be used on all mainstream hardware introduced since the year 2000. 
That said, you are not the first to be confused by this, so I might need to rework this section to make it clear that each can in fact be used on modern hardware. If you happen to have an old Sequent NUMA-Q or Symmetry box lying around, things are a bit different. On the other hand, I don't believe that any of these old boxes are still running Linux. (Hey, I am as sentimental as the next guy, but there are limits!) I updated this section and pushed it, please let me know if this helps! Thanx, Paul > > Or, if you must think in terms of time, you need a separate independent > > timeline for each variable, with no direct mapping from one timeline to > > another, except resulting from memory-barrier interactions. > > > > Thanx, Paul > > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote: > > Lets summarize the last sequence, the following happens ordered by time: > > > > CPU 0 CPU 1 > > > > cmpxchg(_sysidle_state, > > RCU_SYSIDLE_SHORT, > > RCU_SYSIDLE_LONG); > > > > smp_mb() //cmpxchg > > > > atomic_read(rdtp(1)->dynticks_idle) > > > > //CPU 0 goes to sleep > >//CPU 1 wakes up > >atomic_inc(rdtp(1)->dynticks_idle) > > > >smp_mb() > > > >ACCESS_ONCE(full_sysidle_state) > > > > > > Are you suggesting that because the CPU 1 executes its atomic_inc() _after_ > > (in terms > > of absolute time) the atomic_read of CPU 0, the ordering settled in both > > sides guarantees > > that the value read from CPU 1 is the one from the cmpxchg that precedes > > the atomic_read, > > or FULL or FULL_NOTED that happen later. > > > > If so that's a big lesson for me. > > It is not absolute time that matters. Instead, it is the fact that > CPU 0, when reading from ->dynticks_idle, read the old value before the > atomic_inc(). Therefore, anything CPU 0 did before that memory barrier > preceding CPU 0's read must come before anything CPU 1 did after that > memory barrier following the atomic_inc(). For this to work, there > must be some access to the same variable on each CPU. Aren't we in the following situation? CPU 0 CPU 1 STORE ASTORE B LOAD B LOAD A If so and referring to your perfbook, this is an "ears to mouth" situation. And it seems to describe there is no strong guarantee in that situation. > > Or, if you must think in terms of time, you need a separate independent > timeline for each variable, with no direct mapping from one timeline to > another, except resulting from memory-barrier interactions. > > Thanx, Paul > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Wed, Jul 24, 2013 at 08:09:04PM +0200, Frederic Weisbecker wrote: On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote: Lets summarize the last sequence, the following happens ordered by time: CPU 0 CPU 1 cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); smp_mb() //cmpxchg atomic_read(rdtp(1)-dynticks_idle) //CPU 0 goes to sleep //CPU 1 wakes up atomic_inc(rdtp(1)-dynticks_idle) smp_mb() ACCESS_ONCE(full_sysidle_state) Are you suggesting that because the CPU 1 executes its atomic_inc() _after_ (in terms of absolute time) the atomic_read of CPU 0, the ordering settled in both sides guarantees that the value read from CPU 1 is the one from the cmpxchg that precedes the atomic_read, or FULL or FULL_NOTED that happen later. If so that's a big lesson for me. It is not absolute time that matters. Instead, it is the fact that CPU 0, when reading from -dynticks_idle, read the old value before the atomic_inc(). Therefore, anything CPU 0 did before that memory barrier preceding CPU 0's read must come before anything CPU 1 did after that memory barrier following the atomic_inc(). For this to work, there must be some access to the same variable on each CPU. Aren't we in the following situation? CPU 0 CPU 1 STORE ASTORE B LOAD B LOAD A If so and referring to your perfbook, this is an ears to mouth situation. And it seems to describe there is no strong guarantee in that situation. Yes to the first, but on modern hardware, no to the second. The key paragraph is Section 12.2.4.5: The following pairings from Table 12.1 can be used on modern hardware, but might fail on some systems that were produced in the 1990s. However, these can safely be used on all mainstream hardware introduced since the year 2000. That said, you are not the first to be confused by this, so I might need to rework this section to make it clear that each can in fact be used on modern hardware. 
If you happen to have an old Sequent NUMA-Q or Symmetry box lying around, things are a bit different. On the other hand, I don't believe that any of these old boxes are still running Linux. (Hey, I am as sentimental as the next guy, but there are limits!) I updated this section and pushed it, please let me know if this helps! Thanx, Paul Or, if you must think in terms of time, you need a separate independent timeline for each variable, with no direct mapping from one timeline to another, except resulting from memory-barrier interactions. Thanx, Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Wed, Jul 24, 2013 at 03:09:02PM -0700, Paul E. McKenney wrote: On Wed, Jul 24, 2013 at 08:09:04PM +0200, Frederic Weisbecker wrote: On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote: Lets summarize the last sequence, the following happens ordered by time: CPU 0 CPU 1 cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); smp_mb() //cmpxchg atomic_read(rdtp(1)-dynticks_idle) //CPU 0 goes to sleep //CPU 1 wakes up atomic_inc(rdtp(1)-dynticks_idle) smp_mb() ACCESS_ONCE(full_sysidle_state) Are you suggesting that because the CPU 1 executes its atomic_inc() _after_ (in terms of absolute time) the atomic_read of CPU 0, the ordering settled in both sides guarantees that the value read from CPU 1 is the one from the cmpxchg that precedes the atomic_read, or FULL or FULL_NOTED that happen later. If so that's a big lesson for me. It is not absolute time that matters. Instead, it is the fact that CPU 0, when reading from -dynticks_idle, read the old value before the atomic_inc(). Therefore, anything CPU 0 did before that memory barrier preceding CPU 0's read must come before anything CPU 1 did after that memory barrier following the atomic_inc(). For this to work, there must be some access to the same variable on each CPU. Aren't we in the following situation? CPU 0 CPU 1 STORE ASTORE B LOAD B LOAD A If so and referring to your perfbook, this is an ears to mouth situation. And it seems to describe there is no strong guarantee in that situation. Yes to the first, but on modern hardware, no to the second. The key paragraph is Section 12.2.4.5: The following pairings from Table 12.1 can be used on modern hardware, but might fail on some systems that were produced in the 1990s. However, these can safely be used on all mainstream hardware introduced since the year 2000. Right I missed that! That said, you are not the first to be confused by this, so I might need to rework this section to make it clear that each can in fact be used on modern hardware. 
If you happen to have an old Sequent NUMA-Q or Symmetry box lying around, things are a bit different. On the other hand, I don't believe that any of these old boxes are still running Linux. (Hey, I am as sentimental as the next guy, but there are limits!) I updated this section and pushed it, please let me know if this helps! I don't know because I encountered some troubles to build it, I'm seeing thousand lines like this: Name main::opt_help used only once: possible typo at /usr/bin/a2ping line 534. /usr/bin/a2ping: not a GS output from gs -dSAFER ./cartoons/whippersnapper300.eps - ./cartoons/whippersnapper300.pdf Name main::opt_extra used only once: possible typo at /usr/bin/a2ping line 546. Name main::opt_help used only once: possible typo at /usr/bin/a2ping line 534. /usr/bin/a2ping: not a GS output from gs -dSAFER make: *** [embedfonts] Error 1 Anyway I looked at the diff and it looks indeed clearer, thanks! So back to the issue, I think we made nice progresses with my rusty brain ;-) But just to be clear, I'm pasting that again for just a few precisions: CPU 0CPU 1 cmpxchg(full_sysidle_state, //CPU 1 wakes up RCU_SYSIDLE_SHORT, atomic_inc(rdtp(1)-dynticks_idle) RCU_SYSIDLE_LONG); smp_mb() //cmpxchgsmp_mb() atomic_read(rdtp(1)-dynticks_idle) ACCESS_ONCE(full_sysidle_state //CPU 0 goes to sleep 1) If CPU 0 sets RCU_SYSIDLE_LONG and sees dynticks_idle as even, do we have the _guarantee_ that later CPU 1 sees full_sysidle_state == RCU_SYSIDLE_LONG (or any later full_sysidle_state value) due to the connection between atomic_read / atomic_inc and the barriers that come along? 2) You taught me once that barrier != memory committed, and it has been one of the hardest trauma in my life. How can we be sure that CPU 1 sees memory as committed from CPU 0? The only fact that we read an even value from CPU 0 is enough for the connection between the atomic_read() and atomic_inc() and all the barriers that come along? 
3) In your book it says: "recent hardware would guarantee that at least one of the loads saw the value stored by the corresponding store." At least one? So in our example, CPU 0 could see dynticks_idle as even (i.e., succeed in seeing some prior store done by CPU 1) but, following the above statement
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 10:06:25PM -0700, Paul E. McKenney wrote: Lets summarize the last sequence, the following happens ordered by time: CPU 0 CPU 1 cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); smp_mb() //cmpxchg atomic_read(rdtp(1)-dynticks_idle) //CPU 0 goes to sleep //CPU 1 wakes up atomic_inc(rdtp(1)-dynticks_idle) smp_mb() ACCESS_ONCE(full_sysidle_state) Are you suggesting that because the CPU 1 executes its atomic_inc() _after_ (in terms of absolute time) the atomic_read of CPU 0, the ordering settled in both sides guarantees that the value read from CPU 1 is the one from the cmpxchg that precedes the atomic_read, or FULL or FULL_NOTED that happen later. If so that's a big lesson for me. It is not absolute time that matters. Instead, it is the fact that CPU 0, when reading from -dynticks_idle, read the old value before the atomic_inc(). Therefore, anything CPU 0 did before that memory barrier preceding CPU 0's read must come before anything CPU 1 did after that memory barrier following the atomic_inc(). For this to work, there must be some access to the same variable on each CPU. Aren't we in the following situation? CPU 0 CPU 1 STORE ASTORE B LOAD B LOAD A If so and referring to your perfbook, this is an ears to mouth situation. And it seems to describe there is no strong guarantee in that situation. Or, if you must think in terms of time, you need a separate independent timeline for each variable, with no direct mapping from one timeline to another, except resulting from memory-barrier interactions. Thanx, Paul -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Fri, Jul 19, 2013 at 04:12:08AM +0200, Frederic Weisbecker wrote: > On Thu, Jul 18, 2013 at 05:24:08PM -0700, Paul E. McKenney wrote: > > On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote: > > > On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote: > > > > 1. Some CPU coming out of idle: > > > > > > > > o rcu_sysidle_exit(): > > > > > > > > smp_mb__before_atomic_inc(); > > > > atomic_inc(>dynticks_idle); > > > > smp_mb__after_atomic_inc(); /* A */ > > > > > > > > o rcu_sysidle_force_exit(): > > > > > > > > oldstate = ACCESS_ONCE(full_sysidle_state); > > > > > > > > 2. RCU GP kthread: > > > > > > > > o rcu_sysidle(): > > > > > > > > cmpxchg(_sysidle_state, RCU_SYSIDLE_SHORT, > > > > RCU_SYSIDLE_LONG); > > > > /* B */ > > > > > > > > o rcu_sysidle_check_cpu(): > > > > > > > > cur = atomic_read(>dynticks_idle); > > > > > > > > Memory barrier A pairs with memory barrier B, so that if #1's load > > > > from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's > > > > atomic_inc() must be visible to #2's atomic_read(). This will cause #2 > > > > to recognize that the CPU came out of idle, which will in turn cause it > > > > to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in > > > > full_sysidle_state being set to RCU_SYSIDLE_NOT. > > > > > > Ok I get it for that direction. > > > Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays > > > so. > > > > > > CPU 0 then rounds and see that all CPUs are idle, until it finally sets > > > up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep. > > > > > > Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT > > > otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it > > > send > > > the IPI. > > > > > > What provides the guarantee that CPU 1 sees a value above > > > RCU_SYSIDLE_SHORT? > > > Not on the cmpxchg but when it first dereference with ACCESS_ONCE. 
> > > > The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT, > > scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving > > to RCU_SYSIDLE_FULL. Given CPU 1 has been idle all this time, CPU 0 > > will have read its ->dynticks_idle counter on each scan and seen it > > having an even value. When CPU 1 comes out of idle, it will atomically > > increment its ->dyntick_idle(), which will happen after CPU 0's read of > > ->dyntick_idle() during its last scan. > > > > Because of the memory-barrier pairing above, this means that CPU > > 1's read from full_sysidle_state must follow the cmpxchg() that > > set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily > > the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and > > RCU_SYSIDLE_FULL_NOTED). But because RCU_SYSIDLE_LONG is greater than > > RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period. > > Lets summarize the last sequence, the following happens ordered by time: > > CPU 0 CPU 1 > > cmpxchg(_sysidle_state, > RCU_SYSIDLE_SHORT, > RCU_SYSIDLE_LONG); > > smp_mb() //cmpxchg > > atomic_read(rdtp(1)->dynticks_idle) > > //CPU 0 goes to sleep >//CPU 1 wakes up >atomic_inc(rdtp(1)->dynticks_idle) > >smp_mb() > >ACCESS_ONCE(full_sysidle_state) > > > Are you suggesting that because the CPU 1 executes its atomic_inc() _after_ > (in terms > of absolute time) the atomic_read of CPU 0, the ordering settled in both > sides guarantees > that the value read from CPU 1 is the one from the cmpxchg that precedes the > atomic_read, > or FULL or FULL_NOTED that happen later. > > If so that's a big lesson for me. It is not absolute time that matters. Instead, it is the fact that CPU 0, when reading from ->dynticks_idle, read the old value before the atomic_inc(). Therefore, anything CPU 0 did before that memory barrier preceding CPU 0's read must come before anything CPU 1 did after that memory barrier following the atomic_inc(). 
For this to work, there must be some access to the same variable on each CPU. Or, if you must think in terms of time, you need a separate independent timeline for each variable, with no direct mapping from one timeline to another, except resulting from memory-barrier interactions. Thanx, Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 05:24:08PM -0700, Paul E. McKenney wrote: > On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote: > > On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote: > > > 1. Some CPU coming out of idle: > > > > > > o rcu_sysidle_exit(): > > > > > > smp_mb__before_atomic_inc(); > > > atomic_inc(>dynticks_idle); > > > smp_mb__after_atomic_inc(); /* A */ > > > > > > o rcu_sysidle_force_exit(): > > > > > > oldstate = ACCESS_ONCE(full_sysidle_state); > > > > > > 2. RCU GP kthread: > > > > > > o rcu_sysidle(): > > > > > > cmpxchg(_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); > > > /* B */ > > > > > > o rcu_sysidle_check_cpu(): > > > > > > cur = atomic_read(>dynticks_idle); > > > > > > Memory barrier A pairs with memory barrier B, so that if #1's load > > > from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's > > > atomic_inc() must be visible to #2's atomic_read(). This will cause #2 > > > to recognize that the CPU came out of idle, which will in turn cause it > > > to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in > > > full_sysidle_state being set to RCU_SYSIDLE_NOT. > > > > Ok I get it for that direction. > > Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays > > so. > > > > CPU 0 then rounds and see that all CPUs are idle, until it finally sets > > up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep. > > > > Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT > > otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send > > the IPI. > > > > What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT? > > Not on the cmpxchg but when it first dereference with ACCESS_ONCE. > > The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT, > scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving > to RCU_SYSIDLE_FULL. 
Given CPU 1 has been idle all this time, CPU 0 > will have read its ->dynticks_idle counter on each scan and seen it > having an even value. When CPU 1 comes out of idle, it will atomically > increment its ->dyntick_idle(), which will happen after CPU 0's read of > ->dyntick_idle() during its last scan. > > Because of the memory-barrier pairing above, this means that CPU > 1's read from full_sysidle_state must follow the cmpxchg() that > set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily > the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and > RCU_SYSIDLE_FULL_NOTED). But because RCU_SYSIDLE_LONG is greater than > RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period. Lets summarize the last sequence, the following happens ordered by time: CPU 0 CPU 1 cmpxchg(_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); smp_mb() //cmpxchg atomic_read(rdtp(1)->dynticks_idle) //CPU 0 goes to sleep //CPU 1 wakes up atomic_inc(rdtp(1)->dynticks_idle) smp_mb() ACCESS_ONCE(full_sysidle_state) Are you suggesting that because the CPU 1 executes its atomic_inc() _after_ (in terms of absolute time) the atomic_read of CPU 0, the ordering settled in both sides guarantees that the value read from CPU 1 is the one from the cmpxchg that precedes the atomic_read, or FULL or FULL_NOTED that happen later. If so that's a big lesson for me. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote: > On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote: > > On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote: > > > On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote: > > > > On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote: > > > > > So it's like: > > > > > > > > > > CPU 0 CPU 1 > > > > > > > > > > read I write I > > > > > smp_mb() smp_mb() > > > > > cmpxchg S read S > > > > > > > > > > I still can't find what guarantees we don't read a value in CPU 1 > > > > > that is way below > > > > > what we want. > > > > > > > > One key point is that there is a second cycle from LONG to FULL. > > > > > > > > (Not saying that there is not a bug -- there might well be. In fact, > > > > I am starting to think that I need to do another Promela model... > > > > > > Now I'm very confused :) > > > > To quote a Nobel Laureate who presented at an ISEF here in Portland some > > years back, "Confusion is the most productive state of mind." ;-) > > Then I must be a very productive guy! So that is your secret! ;-) > > > I'm far from being a specialist on these matters but I would really love > > > to > > > understand this patchset. Is there any documentation somewhere I can read > > > that could help, something about cycles of committed memory or something? > > > > Documentation/memory-barriers.txt should suffice for this. If you want > > more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf > > > > But memory-barrier pairing suffices here. Here is case 2 from my > > earlier email in more detail. The comments with capital letters > > mark important memory barriers, some of which are buried in atomic > > operations. > > > > 1. 
Some CPU coming out of idle: > > > > o rcu_sysidle_exit(): > > > > smp_mb__before_atomic_inc(); > > atomic_inc(>dynticks_idle); > > smp_mb__after_atomic_inc(); /* A */ > > > > o rcu_sysidle_force_exit(): > > > > oldstate = ACCESS_ONCE(full_sysidle_state); > > > > 2. RCU GP kthread: > > > > o rcu_sysidle(): > > > > cmpxchg(_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); > > /* B */ > > > > o rcu_sysidle_check_cpu(): > > > > cur = atomic_read(>dynticks_idle); > > > > Memory barrier A pairs with memory barrier B, so that if #1's load > > from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's > > atomic_inc() must be visible to #2's atomic_read(). This will cause #2 > > to recognize that the CPU came out of idle, which will in turn cause it > > to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in > > full_sysidle_state being set to RCU_SYSIDLE_NOT. > > Ok I get it for that direction. > Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays > so. > > CPU 0 then rounds and see that all CPUs are idle, until it finally sets > up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep. > > Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT > otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send > the IPI. > > What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT? > Not on the cmpxchg but when it first dereference with ACCESS_ONCE. The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT, scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving to RCU_SYSIDLE_FULL. Given CPU 1 has been idle all this time, CPU 0 will have read its ->dynticks_idle counter on each scan and seen it having an even value. When CPU 1 comes out of idle, it will atomically increment its ->dyntick_idle(), which will happen after CPU 0's read of ->dyntick_idle() during its last scan. 
Because of the memory-barrier pairing above, this means that CPU 1's read from full_sysidle_state must follow the cmpxchg() that set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and RCU_SYSIDLE_FULL_NOTED). But because RCU_SYSIDLE_LONG is greater than RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period. Thanx, Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote: > On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote: > > On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote: > > > On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote: > > > > So it's like: > > > > > > > > CPU 0 CPU 1 > > > > > > > > read I write I > > > > smp_mb() smp_mb() > > > > cmpxchg S read S > > > > > > > > I still can't find what guarantees we don't read a value in CPU 1 that > > > > is way below > > > > what we want. > > > > > > One key point is that there is a second cycle from LONG to FULL. > > > > > > (Not saying that there is not a bug -- there might well be. In fact, > > > I am starting to think that I need to do another Promela model... > > > > Now I'm very confused :) > > To quote a Nobel Laureate who presented at an ISEF here in Portland some > years back, "Confusion is the most productive state of mind." ;-) Then I must be a very productive guy! > > > I'm far from being a specialist on these matters but I would really love to > > understand this patchset. Is there any documentation somewhere I can read > > that could help, something about cycles of committed memory or something? > > Documentation/memory-barriers.txt should suffice for this. If you want > more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf > > But memory-barrier pairing suffices here. Here is case 2 from my > earlier email in more detail. The comments with capital letters > mark important memory barriers, some of which are buried in atomic > operations. > > 1. Some CPU coming out of idle: > > o rcu_sysidle_exit(): > > smp_mb__before_atomic_inc(); > atomic_inc(>dynticks_idle); > smp_mb__after_atomic_inc(); /* A */ > > o rcu_sysidle_force_exit(): > > oldstate = ACCESS_ONCE(full_sysidle_state); > > 2. 
RCU GP kthread: > > o rcu_sysidle(): > > cmpxchg(_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); > /* B */ > > o rcu_sysidle_check_cpu(): > > cur = atomic_read(>dynticks_idle); > > Memory barrier A pairs with memory barrier B, so that if #1's load > from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's > atomic_inc() must be visible to #2's atomic_read(). This will cause #2 > to recognize that the CPU came out of idle, which will in turn cause it > to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in > full_sysidle_state being set to RCU_SYSIDLE_NOT. Ok I get it for that direction. Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays so. CPU 0 then rounds and see that all CPUs are idle, until it finally sets up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep. Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send the IPI. What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT? Not on the cmpxchg but when it first dereference with ACCESS_ONCE. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote: > On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote: > > On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote: > > > So it's like: > > > > > > CPU 0 CPU 1 > > > > > > read I write I > > > smp_mb() smp_mb() > > > cmpxchg S read S > > > > > > I still can't find what guarantees we don't read a value in CPU 1 that is > > > way below > > > what we want. > > > > One key point is that there is a second cycle from LONG to FULL. > > > > (Not saying that there is not a bug -- there might well be. In fact, > > I am starting to think that I need to do another Promela model... > > Now I'm very confused :) To quote a Nobel Laureate who presented at an ISEF here in Portland some years back, "Confusion is the most productive state of mind." ;-) > I'm far from being a specialist on these matters but I would really love to > understand this patchset. Is there any documentation somewhere I can read > that could help, something about cycles of committed memory or something? Documentation/memory-barriers.txt should suffice for this. If you want more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf But memory-barrier pairing suffices here. Here is case 2 from my earlier email in more detail. The comments with capital letters mark important memory barriers, some of which are buried in atomic operations. 1. Some CPU coming out of idle: o rcu_sysidle_exit(): smp_mb__before_atomic_inc(); atomic_inc(>dynticks_idle); smp_mb__after_atomic_inc(); /* A */ o rcu_sysidle_force_exit(): oldstate = ACCESS_ONCE(full_sysidle_state); 2. RCU GP kthread: o rcu_sysidle(): cmpxchg(_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); /* B */ o rcu_sysidle_check_cpu(): cur = atomic_read(>dynticks_idle); Memory barrier A pairs with memory barrier B, so that if #1's load from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's atomic_inc() must be visible to #2's atomic_read(). 
This will cause #2 to recognize that the CPU came out of idle, which will in turn cause it to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in full_sysidle_state being set to RCU_SYSIDLE_NOT. Thanx, Paul > > > > Unfortunately, the reasoning in #2 above does not hold in the small-CPU > > > > case because there is the possibility of both the timekeeping CPU and > > > > the RCU grace-period kthread concurrently advancing the state machine. > > > > This would be bad, good catch!!! > > > > > > It's not like I spotted anything myself but you're welcome :) > > > > I will take them any way I can get them. ;-) > > > > > > The patch below (untested) is an attempt to fix this. If it actually > > > > works, I will merge it in with 6/7. > > > > > > > > Anything else I missed? ;-) > > > > > > Well I guess I'll wait one more night before trying to understand > > > the below ;) > > > > The key point is that the added check means that either the timekeeping > > CPU is advancing the state machine (if there are few CPUs) or the > > RCU grace-period kthread is (if there are many CPUs), but never both. > > Or that is the intent, anyway! > > Yeah got that. > > Thanks! > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote: > On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote: > > So it's like: > > > > CPU 0 CPU 1 > > > > read I write I > > smp_mb() smp_mb() > > cmpxchg S read S > > > > I still can't find what guarantees we don't read a value in CPU 1 that is > > way below > > what we want. > > One key point is that there is a second cycle from LONG to FULL. > > (Not saying that there is not a bug -- there might well be. In fact, > I am starting to think that I need to do another Promela model... Now I'm very confused :) I'm far from being a specialist on these matters but I would really love to understand this patchset. Is there any documentation somewhere I can read that could help, something about cycles of committed memory or something? > > > > Unfortunately, the reasoning in #2 above does not hold in the small-CPU > > > case because there is the possibility of both the timekeeping CPU and > > > the RCU grace-period kthread concurrently advancing the state machine. > > > This would be bad, good catch!!! > > > > It's not like I spotted anything myself but you're welcome :) > > I will take them any way I can get them. ;-) > > > > The patch below (untested) is an attempt to fix this. If it actually > > > works, I will merge it in with 6/7. > > > > > > Anything else I missed? ;-) > > > > Well I guess I'll wait one more night before trying to understand > > the below ;) > > The key point is that the added check means that either the timekeeping > CPU is advancing the state machine (if there are few CPUs) or the > RCU grace-period kthread is (if there are many CPUs), but never both. > Or that is the intent, anyway! Yeah got that. Thanks! -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote: On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote: So it's like: CPU 0 CPU 1 read I write I smp_mb() smp_mb() cmpxchg S read S I still can't find what guarantees we don't read a value in CPU 1 that is way below what we want. One key point is that there is a second cycle from LONG to FULL. (Not saying that there is not a bug -- there might well be. In fact, I am starting to think that I need to do another Promela model... Now I'm very confused :) I'm far from being a specialist on these matters but I would really love to understand this patchset. Is there any documentation somewhere I can read that could help, something about cycles of committed memory or something? Unfortunately, the reasoning in #2 above does not hold in the small-CPU case because there is the possibility of both the timekeeping CPU and the RCU grace-period kthread concurrently advancing the state machine. This would be bad, good catch!!! It's not like I spotted anything myself but you're welcome :) I will take them any way I can get them. ;-) The patch below (untested) is an attempt to fix this. If it actually works, I will merge it in with 6/7. Anything else I missed? ;-) Well I guess I'll wait one more night before trying to understand the below ;) The key point is that the added check means that either the timekeeping CPU is advancing the state machine (if there are few CPUs) or the RCU grace-period kthread is (if there are many CPUs), but never both. Or that is the intent, anyway! Yeah got that. Thanks! -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote: On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote: On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote: So it's like: CPU 0 CPU 1 read I write I smp_mb() smp_mb() cmpxchg S read S I still can't find what guarantees we don't read a value in CPU 1 that is way below what we want. One key point is that there is a second cycle from LONG to FULL. (Not saying that there is not a bug -- there might well be. In fact, I am starting to think that I need to do another Promela model... Now I'm very confused :) To quote a Nobel Laureate who presented at an ISEF here in Portland some years back, Confusion is the most productive state of mind. ;-) I'm far from being a specialist on these matters but I would really love to understand this patchset. Is there any documentation somewhere I can read that could help, something about cycles of committed memory or something? Documentation/memory-barriers.txt should suffice for this. If you want more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf But memory-barrier pairing suffices here. Here is case 2 from my earlier email in more detail. The comments with capital letters mark important memory barriers, some of which are buried in atomic operations. 1. Some CPU coming out of idle: o rcu_sysidle_exit(): smp_mb__before_atomic_inc(); atomic_inc(rdtp-dynticks_idle); smp_mb__after_atomic_inc(); /* A */ o rcu_sysidle_force_exit(): oldstate = ACCESS_ONCE(full_sysidle_state); 2. RCU GP kthread: o rcu_sysidle(): cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); /* B */ o rcu_sysidle_check_cpu(): cur = atomic_read(rdtp-dynticks_idle); Memory barrier A pairs with memory barrier B, so that if #1's load from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's atomic_inc() must be visible to #2's atomic_read(). 
This will cause #2 to recognize that the CPU came out of idle, which will in turn cause it to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in full_sysidle_state being set to RCU_SYSIDLE_NOT. Thanx, Paul Unfortunately, the reasoning in #2 above does not hold in the small-CPU case because there is the possibility of both the timekeeping CPU and the RCU grace-period kthread concurrently advancing the state machine. This would be bad, good catch!!! It's not like I spotted anything myself but you're welcome :) I will take them any way I can get them. ;-) The patch below (untested) is an attempt to fix this. If it actually works, I will merge it in with 6/7. Anything else I missed? ;-) Well I guess I'll wait one more night before trying to understand the below ;) The key point is that the added check means that either the timekeeping CPU is advancing the state machine (if there are few CPUs) or the RCU grace-period kthread is (if there are many CPUs), but never both. Or that is the intent, anyway! Yeah got that. Thanks! -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote: On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote: On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote: On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote: So it's like: CPU 0 CPU 1 read I write I smp_mb() smp_mb() cmpxchg S read S I still can't find what guarantees we don't read a value in CPU 1 that is way below what we want. One key point is that there is a second cycle from LONG to FULL. (Not saying that there is not a bug -- there might well be. In fact, I am starting to think that I need to do another Promela model... Now I'm very confused :) To quote a Nobel Laureate who presented at an ISEF here in Portland some years back, Confusion is the most productive state of mind. ;-) Then I must be a very productive guy! I'm far from being a specialist on these matters but I would really love to understand this patchset. Is there any documentation somewhere I can read that could help, something about cycles of committed memory or something? Documentation/memory-barriers.txt should suffice for this. If you want more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf But memory-barrier pairing suffices here. Here is case 2 from my earlier email in more detail. The comments with capital letters mark important memory barriers, some of which are buried in atomic operations. 1. Some CPU coming out of idle: o rcu_sysidle_exit(): smp_mb__before_atomic_inc(); atomic_inc(rdtp-dynticks_idle); smp_mb__after_atomic_inc(); /* A */ o rcu_sysidle_force_exit(): oldstate = ACCESS_ONCE(full_sysidle_state); 2. 
RCU GP kthread: o rcu_sysidle(): cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); /* B */ o rcu_sysidle_check_cpu(): cur = atomic_read(rdtp-dynticks_idle); Memory barrier A pairs with memory barrier B, so that if #1's load from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's atomic_inc() must be visible to #2's atomic_read(). This will cause #2 to recognize that the CPU came out of idle, which will in turn cause it to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in full_sysidle_state being set to RCU_SYSIDLE_NOT. Ok I get it for that direction. Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays so. CPU 0 then rounds and see that all CPUs are idle, until it finally sets up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep. Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send the IPI. What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT? Not on the cmpxchg but when it first dereference with ACCESS_ONCE. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote: On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote: On Thu, Jul 18, 2013 at 04:24:51PM +0200, Frederic Weisbecker wrote: On Wed, Jul 17, 2013 at 08:39:21PM -0700, Paul E. McKenney wrote: On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote: So it's like: CPU 0 CPU 1 read I write I smp_mb() smp_mb() cmpxchg S read S I still can't find what guarantees we don't read a value in CPU 1 that is way below what we want. One key point is that there is a second cycle from LONG to FULL. (Not saying that there is not a bug -- there might well be. In fact, I am starting to think that I need to do another Promela model... Now I'm very confused :) To quote a Nobel Laureate who presented at an ISEF here in Portland some years back, Confusion is the most productive state of mind. ;-) Then I must be a very productive guy! So that is your secret! ;-) I'm far from being a specialist on these matters but I would really love to understand this patchset. Is there any documentation somewhere I can read that could help, something about cycles of committed memory or something? Documentation/memory-barriers.txt should suffice for this. If you want more rigor, http://www.cl.cam.ac.uk/~pes20/ppc-supplemental/test7.pdf But memory-barrier pairing suffices here. Here is case 2 from my earlier email in more detail. The comments with capital letters mark important memory barriers, some of which are buried in atomic operations. 1. Some CPU coming out of idle: o rcu_sysidle_exit(): smp_mb__before_atomic_inc(); atomic_inc(rdtp-dynticks_idle); smp_mb__after_atomic_inc(); /* A */ o rcu_sysidle_force_exit(): oldstate = ACCESS_ONCE(full_sysidle_state); 2. 
RCU GP kthread: o rcu_sysidle(): cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); /* B */ o rcu_sysidle_check_cpu(): cur = atomic_read(rdtp-dynticks_idle); Memory barrier A pairs with memory barrier B, so that if #1's load from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's atomic_inc() must be visible to #2's atomic_read(). This will cause #2 to recognize that the CPU came out of idle, which will in turn cause it to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in full_sysidle_state being set to RCU_SYSIDLE_NOT. Ok I get it for that direction. Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays so. CPU 0 then rounds and see that all CPUs are idle, until it finally sets up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep. Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send the IPI. What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT? Not on the cmpxchg but when it first dereference with ACCESS_ONCE. The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT, scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving to RCU_SYSIDLE_FULL. Given CPU 1 has been idle all this time, CPU 0 will have read its -dynticks_idle counter on each scan and seen it having an even value. When CPU 1 comes out of idle, it will atomically increment its -dyntick_idle(), which will happen after CPU 0's read of -dyntick_idle() during its last scan. Because of the memory-barrier pairing above, this means that CPU 1's read from full_sysidle_state must follow the cmpxchg() that set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and RCU_SYSIDLE_FULL_NOTED). But because RCU_SYSIDLE_LONG is greater than RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period. 
Thanx, Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 05:24:08PM -0700, Paul E. McKenney wrote: On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote: On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote: 1. Some CPU coming out of idle: o rcu_sysidle_exit(): smp_mb__before_atomic_inc(); atomic_inc(rdtp-dynticks_idle); smp_mb__after_atomic_inc(); /* A */ o rcu_sysidle_force_exit(): oldstate = ACCESS_ONCE(full_sysidle_state); 2. RCU GP kthread: o rcu_sysidle(): cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); /* B */ o rcu_sysidle_check_cpu(): cur = atomic_read(rdtp-dynticks_idle); Memory barrier A pairs with memory barrier B, so that if #1's load from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's atomic_inc() must be visible to #2's atomic_read(). This will cause #2 to recognize that the CPU came out of idle, which will in turn cause it to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in full_sysidle_state being set to RCU_SYSIDLE_NOT. Ok I get it for that direction. Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays so. CPU 0 then rounds and see that all CPUs are idle, until it finally sets up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep. Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send the IPI. What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT? Not on the cmpxchg but when it first dereference with ACCESS_ONCE. The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT, scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving to RCU_SYSIDLE_FULL. Given CPU 1 has been idle all this time, CPU 0 will have read its -dynticks_idle counter on each scan and seen it having an even value. When CPU 1 comes out of idle, it will atomically increment its -dyntick_idle(), which will happen after CPU 0's read of -dyntick_idle() during its last scan. 
Because of the memory-barrier pairing above, this means that CPU 1's read from full_sysidle_state must follow the cmpxchg() that set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and RCU_SYSIDLE_FULL_NOTED). But because RCU_SYSIDLE_LONG is greater than RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period. Lets summarize the last sequence, the following happens ordered by time: CPU 0 CPU 1 cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); smp_mb() //cmpxchg atomic_read(rdtp(1)-dynticks_idle) //CPU 0 goes to sleep //CPU 1 wakes up atomic_inc(rdtp(1)-dynticks_idle) smp_mb() ACCESS_ONCE(full_sysidle_state) Are you suggesting that because the CPU 1 executes its atomic_inc() _after_ (in terms of absolute time) the atomic_read of CPU 0, the ordering settled in both sides guarantees that the value read from CPU 1 is the one from the cmpxchg that precedes the atomic_read, or FULL or FULL_NOTED that happen later. If so that's a big lesson for me. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Fri, Jul 19, 2013 at 04:12:08AM +0200, Frederic Weisbecker wrote: On Thu, Jul 18, 2013 at 05:24:08PM -0700, Paul E. McKenney wrote: On Fri, Jul 19, 2013 at 12:46:21AM +0200, Frederic Weisbecker wrote: On Thu, Jul 18, 2013 at 09:47:49AM -0700, Paul E. McKenney wrote: 1. Some CPU coming out of idle: o rcu_sysidle_exit(): smp_mb__before_atomic_inc(); atomic_inc(rdtp-dynticks_idle); smp_mb__after_atomic_inc(); /* A */ o rcu_sysidle_force_exit(): oldstate = ACCESS_ONCE(full_sysidle_state); 2. RCU GP kthread: o rcu_sysidle(): cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); /* B */ o rcu_sysidle_check_cpu(): cur = atomic_read(rdtp-dynticks_idle); Memory barrier A pairs with memory barrier B, so that if #1's load from full_sysidle_state sees RCU_SYSIDLE_SHORT, we know that #1's atomic_inc() must be visible to #2's atomic_read(). This will cause #2 to recognize that the CPU came out of idle, which will in turn cause it to invoke rcu_sysidle_cancel() instead of rcu_sysidle(), resulting in full_sysidle_state being set to RCU_SYSIDLE_NOT. Ok I get it for that direction. Now imagine CPU 0 is the RCU GP kthread (#2) and CPU 1 is idle and stays so. CPU 0 then rounds and see that all CPUs are idle, until it finally sets up RCU_SYSIDLE_SHORT_FULL and finally goes to sleep. Then CPU 1 wakes up. It really has to see a value above RCU_SYSIDLE_SHORT otherwise it won't do the cmpxchg and see the FULL_NOTED that makes it send the IPI. What provides the guarantee that CPU 1 sees a value above RCU_SYSIDLE_SHORT? Not on the cmpxchg but when it first dereference with ACCESS_ONCE. The trick is that CPU 0 will have scanned, moved to RCU_SYSIDLE_SHORT, scanned, moved to RCU_SYSIDLE_LONG, then scanned again before moving to RCU_SYSIDLE_FULL. Given CPU 1 has been idle all this time, CPU 0 will have read its -dynticks_idle counter on each scan and seen it having an even value. 
When CPU 1 comes out of idle, it will atomically increment its -dyntick_idle(), which will happen after CPU 0's read of -dyntick_idle() during its last scan. Because of the memory-barrier pairing above, this means that CPU 1's read from full_sysidle_state must follow the cmpxchg() that set full_sysidle_state to RCU_SYSIDLE_LONG (though not necessarily the two later cmpxchg()s that set it to RCU_SYSIDLE_FULL and RCU_SYSIDLE_FULL_NOTED). But because RCU_SYSIDLE_LONG is greater than RCU_SYSIDLE_SHORT, CPU 1 will take action to end the idle period. Lets summarize the last sequence, the following happens ordered by time: CPU 0 CPU 1 cmpxchg(full_sysidle_state, RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); smp_mb() //cmpxchg atomic_read(rdtp(1)-dynticks_idle) //CPU 0 goes to sleep //CPU 1 wakes up atomic_inc(rdtp(1)-dynticks_idle) smp_mb() ACCESS_ONCE(full_sysidle_state) Are you suggesting that because the CPU 1 executes its atomic_inc() _after_ (in terms of absolute time) the atomic_read of CPU 0, the ordering settled in both sides guarantees that the value read from CPU 1 is the one from the cmpxchg that precedes the atomic_read, or FULL or FULL_NOTED that happen later. If so that's a big lesson for me. It is not absolute time that matters. Instead, it is the fact that CPU 0, when reading from -dynticks_idle, read the old value before the atomic_inc(). Therefore, anything CPU 0 did before that memory barrier preceding CPU 0's read must come before anything CPU 1 did after that memory barrier following the atomic_inc(). For this to work, there must be some access to the same variable on each CPU. Or, if you must think in terms of time, you need a separate independent timeline for each variable, with no direct mapping from one timeline to another, except resulting from memory-barrier interactions. 
Thanx, Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote: > On Wed, Jul 17, 2013 at 05:41:41PM -0700, Paul E. McKenney wrote: > > On Thu, Jul 18, 2013 at 01:31:21AM +0200, Frederic Weisbecker wrote: > > > I'm missing a key here. > > > > > > Let's imagine that the timekeeper has finally set full_sysidle_state = > > > RCU_SYSIDLE_FULL_NOTED > > > with cmpxchg, what guarantees that this CPU is not seeing a stale > > > RCU_SYSIDLE_SHORT value > > > for example? > > > > Good question! Let's see if I have a reasonable answer. ;-) > > > > I am going to start with the large-CPU case, so that the state is advanced > > only by the grace-period kthread. > > > > 1. Case 1: the timekeeper CPU invoked rcu_sysidle_force_exit(). > > In this case, this is the same CPU that set full_sysidle_state > > to RCU_SYSIDLE_FULL_NOTED, so it is guaranteed not to see a > > stale value. > > > > 2. Case 2: Some CPU came out of idle, and invoked rcu_sysidle_exit(). > > In this case, if this CPU reads a RCU_SYSIDLE_SHORT from > > full_sysidle_state, this read must have come before the > > cmpxchg() (on some other CPU) that set it to RCU_SYSIDLE_LONG. > > Because this CPU's read from full_sysidle_state was preceded by > > an atomic_inc() that updated this CPU's ->dynticks_idle, that > > update must also precede the cmpxchg() that set full_sysidle_state > > to RCU_SYSIDLE_LONG. Because the state advancing is done from > > within a single thread, the subsequent scan is guaranteed to see > > the first CPU's update of ->dynticks_idle, and will therefore > > refrain from advancing full_sysidle_state to RCU_SYSIDLE_FULL. > > > > This will in turn prevent the timekeeping thread from advancing > > the state to RCU_SYSIDLE_FULL_NOTED, so this scenario cannot > > happen. 
> > Ok, so IIUC the safety is guaranteed in the following ordering: > > CPU 0 CPU 1 > > idle = 1smp_mb() > //for each cpu > if (atomic_read(rdp(1)->dyntick_idle) & 1) > atomic_inc(rdp->dyntick_idle) > idle = 0 smp_mb() > > if (idle) > cmpxchg(full_sysidle_state, SHORT, LONG) while > (full_sysidle_state > SHORT) > //reset with > cmpxchg > > So it's like: > > CPU 0 CPU 1 > > read I write I > smp_mb() smp_mb() > cmpxchg S read S > > I still can't find what guarantees we don't read a value in CPU 1 that is way > below > what we want. One key point is that there is a second cycle from LONG to FULL. (Not saying that there is not a bug -- there might well be. In fact, I am starting to think that I need to do another Promela model...) > > Unfortunately, the reasoning in #2 above does not hold in the small-CPU > > case because there is the possibility of both the timekeeping CPU and > > the RCU grace-period kthread concurrently advancing the state machine. > > This would be bad, good catch!!! > > It's not like I spotted anything myself but you're welcome :) I will take them any way I can get them. ;-) > > The patch below (untested) is an attempt to fix this. If it actually > > works, I will merge it in with 6/7. > > > > Anything else I missed? ;-) > > Well I guess I'll wait one more night before trying to understand > the below ;) The key point is that the added check means that either the timekeeping CPU is advancing the state machine (if there are few CPUs) or the RCU grace-period kthread is (if there are many CPUs), but never both. Or that is the intent, anyway! Thanx, Paul > Thanks! > > > > > Thanx, Paul > > > > > > > > Signed-off-by: Paul E. 
McKenney > > > > diff --git a/kernel/rcutree.c b/kernel/rcutree.c > > index aa3f525..fe83085 100644 > > --- a/kernel/rcutree.c > > +++ b/kernel/rcutree.c > > @@ -1364,7 +1364,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int > > fqs_state_in) > > } > > force_qs_rnp(rsp, dyntick_save_progress_counter, > > , ); > > - rcu_sysidle_report(rsp, isidle, maxj); > > + rcu_sysidle_report_gp(rsp, isidle, maxj); > > fqs_state = RCU_FORCE_QS; > > } else { > > /* Handle dyntick-idle and offline CPUs. */ > > diff --git a/kernel/rcutree.h b/kernel/rcutree.h > > index 1602c21..657b415 100644 > > --- a/kernel/rcutree.h > > +++ b/kernel/rcutree.h > > @@ -559,8 +559,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, > > bool *isidle, > >
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Wed, Jul 17, 2013 at 05:41:41PM -0700, Paul E. McKenney wrote: > On Thu, Jul 18, 2013 at 01:31:21AM +0200, Frederic Weisbecker wrote: > > I'm missing a key here. > > > > Let's imagine that the timekeeper has finally set full_sysidle_state = > > RCU_SYSIDLE_FULL_NOTED > > with cmpxchg, what guarantees that this CPU is not seeing a stale > > RCU_SYSIDLE_SHORT value > > for example? > > Good question! Let's see if I have a reasonable answer. ;-) > > I am going to start with the large-CPU case, so that the state is advanced > only by the grace-period kthread. > > 1.Case 1: the timekeeper CPU invoked rcu_sysidle_force_exit(). > In this case, this is the same CPU that set full_sysidle_state > to RCU_SYSIDLE_FULL_NOTED, so it is guaranteed not to see a > stale value. > > 2.Case 2: Some CPU came out of idle, and invoked rcu_sysidle_exit(). > In this case, if this CPU reads a RCU_SYSIDLE_SHORT from > full_sysidle_state, this read must have come before the > cmpxchg() (on some other CPU) that set it to RCU_SYSIDLE_LONG. > Because this CPU's read from full_sysidle_state was preceded by > an atomic_inc() that updated this CPU's ->dynticks_idle, that > update must also precede the cmpxchg() that set full_sysidle_state > to RCU_SYSIDLE_LONG. Because the state advancing is done from > within a single thread, the subsequent scan is guaranteed to see > the first CPU's update of ->dynticks_idle, and will therefore > refrain from advancing full_sysidle_state to RCU_SYSIDLE_FULL. > > This will in turn prevent the timekeeping thread from advancing > the state to RCU_SYSIDLE_FULL_NOTED, so this scenario cannot > happen. 
Ok, so IIUC the safety is guaranteed in the following ordering: CPU 0 CPU 1 idle = 1smp_mb() //for each cpu if (atomic_read(rdp(1)->dyntick_idle) & 1) atomic_inc(rdp->dyntick_idle) idle = 0 smp_mb() if (idle) cmpxchg(full_sysidle_state, SHORT, LONG) while (full_sysidle_state > SHORT) //reset with cmpxchg So it's like: CPU 0 CPU 1 read I write I smp_mb() smp_mb() cmpxchg S read S I still can't find what guarantees we don't read a value in CPU 1 that is way below what we want. > > Unfortunately, the reasoning in #2 above does not hold in the small-CPU > case because there is the possibility of both the timekeeping CPU and > the RCU grace-period kthread concurrently advancing the state machine. > This would be bad, good catch!!! It's not like I spotted anything myself but you're welcome :) > > The patch below (untested) is an attempt to fix this. If it actually > works, I will merge it in with 6/7. > > Anything else I missed? ;-) Well I guess I'll wait one more night before trying to understand the below ;) Thanks! > > Thanx, Paul > > > > Signed-off-by: Paul E. McKenney > > diff --git a/kernel/rcutree.c b/kernel/rcutree.c > index aa3f525..fe83085 100644 > --- a/kernel/rcutree.c > +++ b/kernel/rcutree.c > @@ -1364,7 +1364,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) > } > force_qs_rnp(rsp, dyntick_save_progress_counter, >, ); > - rcu_sysidle_report(rsp, isidle, maxj); > + rcu_sysidle_report_gp(rsp, isidle, maxj); > fqs_state = RCU_FORCE_QS; > } else { > /* Handle dyntick-idle and offline CPUs. 
*/ > diff --git a/kernel/rcutree.h b/kernel/rcutree.h > index 1602c21..657b415 100644 > --- a/kernel/rcutree.h > +++ b/kernel/rcutree.h > @@ -559,8 +559,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, > bool *isidle, > unsigned long *maxj); > static bool is_sysidle_rcu_state(struct rcu_state *rsp); > static void rcu_bind_gp_kthread(void); > -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, > -unsigned long maxj); > +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, > + unsigned long maxj); > static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); > > #endif /* #ifndef RCU_TREE_NONCORE */ > diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h > index a4d44c3..f65d9c2 100644 > --- a/kernel/rcutree_plugin.h > +++ b/kernel/rcutree_plugin.h > @@ -2655,14 +2655,22 @@ static void rcu_sysidle_cancel(void) > * scan of the CPUs' dyntick-idle state.
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 01:31:21AM +0200, Frederic Weisbecker wrote: > On Mon, Jul 08, 2013 at 06:30:05PM -0700, Paul E. McKenney wrote: > > } > > > > /* > > + * Unconditionally force exit from full system-idle state. This is > > + * invoked when a normal CPU exits idle, but must be called separately > > + * for the timekeeping CPU (tick_do_timer_cpu). The reason for this > > + * is that the timekeeping CPU is permitted to take scheduling-clock > > + * interrupts while the system is in system-idle state, and of course > > + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock > > + * interrupt from any other type of interrupt. > > + */ > > +void rcu_sysidle_force_exit(void) > > +{ > > + int oldstate = ACCESS_ONCE(full_sysidle_state); > > + int newoldstate; > > + > > + /* > > +* Each pass through the following loop attempts to exit full > > +* system-idle state. If contention proves to be a problem, > > +* a trylock-based contention tree could be used here. > > +*/ > > + while (oldstate > RCU_SYSIDLE_SHORT) { > > I'm missing a key here. > > Let's imagine that the timekeeper has finally set full_sysidle_state = > RCU_SYSIDLE_FULL_NOTED > with cmpxchg, what guarantees that this CPU is not seeing a stale > RCU_SYSIDLE_SHORT value > for example? Good question! Let's see if I have a reasonable answer. ;-) I am going to start with the large-CPU case, so that the state is advanced only by the grace-period kthread. 1. Case 1: the timekeeper CPU invoked rcu_sysidle_force_exit(). In this case, this is the same CPU that set full_sysidle_state to RCU_SYSIDLE_FULL_NOTED, so it is guaranteed not to see a stale value. 2. Case 2: Some CPU came out of idle, and invoked rcu_sysidle_exit(). In this case, if this CPU reads a RCU_SYSIDLE_SHORT from full_sysidle_state, this read must have come before the cmpxchg() (on some other CPU) that set it to RCU_SYSIDLE_LONG. 
Because this CPU's read from full_sysidle_state was preceded by an atomic_inc() that updated this CPU's ->dynticks_idle, that update must also precede the cmpxchg() that set full_sysidle_state to RCU_SYSIDLE_LONG. Because the state advancing is done from within a single thread, the subsequent scan is guaranteed to see the first CPU's update of ->dynticks_idle, and will therefore refrain from advancing full_sysidle_state to RCU_SYSIDLE_FULL. This will in turn prevent the timekeeping thread from advancing the state to RCU_SYSIDLE_FULL_NOTED, so this scenario cannot happen. Unfortunately, the reasoning in #2 above does not hold in the small-CPU case because there is the possibility of both the timekeeping CPU and the RCU grace-period kthread concurrently advancing the state machine. This would be bad, good catch!!! The patch below (untested) is an attempt to fix this. If it actually works, I will merge it in with 6/7. Anything else I missed? ;-) Thanx, Paul Signed-off-by: Paul E. McKenney diff --git a/kernel/rcutree.c b/kernel/rcutree.c index aa3f525..fe83085 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1364,7 +1364,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) } force_qs_rnp(rsp, dyntick_save_progress_counter, , ); - rcu_sysidle_report(rsp, isidle, maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. 
*/ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1602c21..657b415 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -559,8 +559,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj); static bool is_sysidle_rcu_state(struct rcu_state *rsp); static void rcu_bind_gp_kthread(void); -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, - unsigned long maxj); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a4d44c3..f65d9c2 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2655,14 +2655,22 @@ static void rcu_sysidle_cancel(void) * scan of the CPUs' dyntick-idle state. */ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, - unsigned long maxj) + unsigned long maxj, bool gpkt) { if (rsp != rcu_sysidle_state) return;
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Mon, Jul 08, 2013 at 06:30:05PM -0700, Paul E. McKenney wrote: > } > > /* > + * Unconditionally force exit from full system-idle state. This is > + * invoked when a normal CPU exits idle, but must be called separately > + * for the timekeeping CPU (tick_do_timer_cpu). The reason for this > + * is that the timekeeping CPU is permitted to take scheduling-clock > + * interrupts while the system is in system-idle state, and of course > + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock > + * interrupt from any other type of interrupt. > + */ > +void rcu_sysidle_force_exit(void) > +{ > + int oldstate = ACCESS_ONCE(full_sysidle_state); > + int newoldstate; > + > + /* > + * Each pass through the following loop attempts to exit full > + * system-idle state. If contention proves to be a problem, > + * a trylock-based contention tree could be used here. > + */ > + while (oldstate > RCU_SYSIDLE_SHORT) { I'm missing a key here. Let's imagine that the timekeeper has finally set full_sysidle_state = RCU_SYSIDLE_FULL_NOTED with cmpxchg, what guarantees that this CPU is not seeing a stale RCU_SYSIDLE_SHORT value for example? -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Mon, Jul 08, 2013 at 06:30:05PM -0700, Paul E. McKenney wrote: } /* + * Unconditionally force exit from full system-idle state. This is + * invoked when a normal CPU exits idle, but must be called separately + * for the timekeeping CPU (tick_do_timer_cpu). The reason for this + * is that the timekeeping CPU is permitted to take scheduling-clock + * interrupts while the system is in system-idle state, and of course + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock + * interrupt from any other type of interrupt. + */ +void rcu_sysidle_force_exit(void) +{ + int oldstate = ACCESS_ONCE(full_sysidle_state); + int newoldstate; + + /* + * Each pass through the following loop attempts to exit full + * system-idle state. If contention proves to be a problem, + * a trylock-based contention tree could be used here. + */ + while (oldstate > RCU_SYSIDLE_SHORT) { I'm missing a key here. Let's imagine that the timekeeper has finally set full_sysidle_state = RCU_SYSIDLE_FULL_NOTED with cmpxchg, what guarantees that this CPU is not seeing a stale RCU_SYSIDLE_SHORT value for example? -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 01:31:21AM +0200, Frederic Weisbecker wrote: On Mon, Jul 08, 2013 at 06:30:05PM -0700, Paul E. McKenney wrote: } /* + * Unconditionally force exit from full system-idle state. This is + * invoked when a normal CPU exits idle, but must be called separately + * for the timekeeping CPU (tick_do_timer_cpu). The reason for this + * is that the timekeeping CPU is permitted to take scheduling-clock + * interrupts while the system is in system-idle state, and of course + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock + * interrupt from any other type of interrupt. + */ +void rcu_sysidle_force_exit(void) +{ + int oldstate = ACCESS_ONCE(full_sysidle_state); + int newoldstate; + + /* +* Each pass through the following loop attempts to exit full +* system-idle state. If contention proves to be a problem, +* a trylock-based contention tree could be used here. +*/ + while (oldstate RCU_SYSIDLE_SHORT) { I'm missing a key here. Let's imagine that the timekeeper has finally set full_sysidle_state = RCU_SYSIDLE_FULL_NOTED with cmpxchg, what guarantees that this CPU is not seeing a stale RCU_SYSIDLE_SHORT value for example? Good question! Let's see if I have a reasonable answer. ;-) I am going to start with the large-CPU case, so that the state is advanced only by the grace-period kthread. 1. Case 1: the timekeeper CPU invoked rcu_sysidle_force_exit(). In this case, this is the same CPU that set full_sysidle_state to RCU_SYSIDLE_FULL_NOTED, so it is guaranteed not to see a stale value. 2. Case 2: Some CPU came out of idle, and invoked rcu_sysidle_exit(). In this case, if this CPU reads a RCU_SYSIDLE_SHORT from full_sysidle_state, this read must have come before the cmpxchg() (on some other CPU) that set it to RCU_SYSIDLE_LONG. Because this CPU's read from full_sysidle_state was preceded by an atomic_inc() that updated this CPU's -dynticks_idle, that update must also precede the cmpxchg() that set full_sysidle_state to RCU_SYSIDLE_LONG. 
Because the state advancing is done from within a single thread, the subsequent scan is guaranteed to see the first CPU's update of -dynticks_idle, and will therefore refrain from advancing full_sysidle_state to RCU_SYSIDLE_FULL. This will in turn prevent the timekeeping thread from advancing the state to RCU_SYSIDLE_FULL_NOTED, so this scenario cannot happen. Unfortunately, the reasoning in #2 above does not hold in the small-CPU case because there is the possibility of both the timekeeping CPU and the RCU grace-period kthread concurrently advancing the state machine. This would be bad, good catch!!! The patch below (untested) is an attempt to fix this. If it actually works, I will merge it in with 6/7. Anything else I missed? ;-) Thanx, Paul Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com diff --git a/kernel/rcutree.c b/kernel/rcutree.c index aa3f525..fe83085 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1364,7 +1364,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) } force_qs_rnp(rsp, dyntick_save_progress_counter, isidle, maxj); - rcu_sysidle_report(rsp, isidle, maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. 
*/ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1602c21..657b415 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -559,8 +559,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj); static bool is_sysidle_rcu_state(struct rcu_state *rsp); static void rcu_bind_gp_kthread(void); -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, - unsigned long maxj); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a4d44c3..f65d9c2 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2655,14 +2655,22 @@ static void rcu_sysidle_cancel(void) * scan of the CPUs' dyntick-idle state. */ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, - unsigned long maxj) + unsigned long maxj, bool gpkt) { if (rsp != rcu_sysidle_state) return; /* Wrong flavor,
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Wed, Jul 17, 2013 at 05:41:41PM -0700, Paul E. McKenney wrote: On Thu, Jul 18, 2013 at 01:31:21AM +0200, Frederic Weisbecker wrote: I'm missing a key here. Let's imagine that the timekeeper has finally set full_sysidle_state = RCU_SYSIDLE_FULL_NOTED with cmpxchg, what guarantees that this CPU is not seeing a stale RCU_SYSIDLE_SHORT value for example? Good question! Let's see if I have a reasonable answer. ;-) I am going to start with the large-CPU case, so that the state is advanced only by the grace-period kthread. 1.Case 1: the timekeeper CPU invoked rcu_sysidle_force_exit(). In this case, this is the same CPU that set full_sysidle_state to RCU_SYSIDLE_FULL_NOTED, so it is guaranteed not to see a stale value. 2.Case 2: Some CPU came out of idle, and invoked rcu_sysidle_exit(). In this case, if this CPU reads a RCU_SYSIDLE_SHORT from full_sysidle_state, this read must have come before the cmpxchg() (on some other CPU) that set it to RCU_SYSIDLE_LONG. Because this CPU's read from full_sysidle_state was preceded by an atomic_inc() that updated this CPU's -dynticks_idle, that update must also precede the cmpxchg() that set full_sysidle_state to RCU_SYSIDLE_LONG. Because the state advancing is done from within a single thread, the subsequent scan is guaranteed to see the first CPU's update of -dynticks_idle, and will therefore refrain from advancing full_sysidle_state to RCU_SYSIDLE_FULL. This will in turn prevent the timekeeping thread from advancing the state to RCU_SYSIDLE_FULL_NOTED, so this scenario cannot happen. 
Ok, so IIUC the safety is guaranteed in the following ordering: CPU 0 CPU 1 idle = 1smp_mb() //for each cpu if (atomic_read(rdp(1)-dyntick_idle) 1) atomic_inc(rdp-dyntick_idle) idle = 0 smp_mb() if (idle) cmpxchg(full_sysidle_state, SHORT, LONG) while (full_sysidle_state SHORT) //reset with cmpxchg So it's like: CPU 0 CPU 1 read I write I smp_mb() smp_mb() cmpxchg S read S I still can't find what guarantees we don't read a value in CPU 1 that is way below what we want. Unfortunately, the reasoning in #2 above does not hold in the small-CPU case because there is the possibility of both the timekeeping CPU and the RCU grace-period kthread concurrently advancing the state machine. This would be bad, good catch!!! It's not like I spotted anything myself but you're welcome :) The patch below (untested) is an attempt to fix this. If it actually works, I will merge it in with 6/7. Anything else I missed? ;-) Well I guess I'll wait one more night before trying to understand the below ;) Thanks! Thanx, Paul Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com diff --git a/kernel/rcutree.c b/kernel/rcutree.c index aa3f525..fe83085 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1364,7 +1364,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) } force_qs_rnp(rsp, dyntick_save_progress_counter, isidle, maxj); - rcu_sysidle_report(rsp, isidle, maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. 
*/ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1602c21..657b415 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -559,8 +559,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj); static bool is_sysidle_rcu_state(struct rcu_state *rsp); static void rcu_bind_gp_kthread(void); -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, -unsigned long maxj); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a4d44c3..f65d9c2 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2655,14 +2655,22 @@ static void rcu_sysidle_cancel(void) * scan of the CPUs' dyntick-idle state. */ static void rcu_sysidle_report(struct rcu_state *rsp,
Re: [PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
On Thu, Jul 18, 2013 at 03:33:01AM +0200, Frederic Weisbecker wrote: On Wed, Jul 17, 2013 at 05:41:41PM -0700, Paul E. McKenney wrote: On Thu, Jul 18, 2013 at 01:31:21AM +0200, Frederic Weisbecker wrote: I'm missing a key here. Let's imagine that the timekeeper has finally set full_sysidle_state = RCU_SYSIDLE_FULL_NOTED with cmpxchg, what guarantees that this CPU is not seeing a stale RCU_SYSIDLE_SHORT value for example? Good question! Let's see if I have a reasonable answer. ;-) I am going to start with the large-CPU case, so that the state is advanced only by the grace-period kthread. 1. Case 1: the timekeeper CPU invoked rcu_sysidle_force_exit(). In this case, this is the same CPU that set full_sysidle_state to RCU_SYSIDLE_FULL_NOTED, so it is guaranteed not to see a stale value. 2. Case 2: Some CPU came out of idle, and invoked rcu_sysidle_exit(). In this case, if this CPU reads a RCU_SYSIDLE_SHORT from full_sysidle_state, this read must have come before the cmpxchg() (on some other CPU) that set it to RCU_SYSIDLE_LONG. Because this CPU's read from full_sysidle_state was preceded by an atomic_inc() that updated this CPU's ->dynticks_idle, that update must also precede the cmpxchg() that set full_sysidle_state to RCU_SYSIDLE_LONG. Because the state advancing is done from within a single thread, the subsequent scan is guaranteed to see the first CPU's update of ->dynticks_idle, and will therefore refrain from advancing full_sysidle_state to RCU_SYSIDLE_FULL. This will in turn prevent the timekeeping thread from advancing the state to RCU_SYSIDLE_FULL_NOTED, so this scenario cannot happen. 
Ok, so IIUC the safety is guaranteed in the following ordering: CPU 0 CPU 1 idle = 1smp_mb() //for each cpu if (atomic_read(rdp(1)-dyntick_idle) 1) atomic_inc(rdp-dyntick_idle) idle = 0 smp_mb() if (idle) cmpxchg(full_sysidle_state, SHORT, LONG) while (full_sysidle_state SHORT) //reset with cmpxchg So it's like: CPU 0 CPU 1 read I write I smp_mb() smp_mb() cmpxchg S read S I still can't find what guarantees we don't read a value in CPU 1 that is way below what we want. One key point is that there is a second cycle from LONG to FULL. (Not saying that there is not a bug -- there might well be. In fact, I am starting to think that I need to do another Promela model...) Unfortunately, the reasoning in #2 above does not hold in the small-CPU case because there is the possibility of both the timekeeping CPU and the RCU grace-period kthread concurrently advancing the state machine. This would be bad, good catch!!! It's not like I spotted anything myself but you're welcome :) I will take them any way I can get them. ;-) The patch below (untested) is an attempt to fix this. If it actually works, I will merge it in with 6/7. Anything else I missed? ;-) Well I guess I'll wait one more night before trying to understand the below ;) The key point is that the added check means that either the timekeeping CPU is advancing the state machine (if there are few CPUs) or the RCU grace-period kthread is (if there are many CPUs), but never both. Or that is the intent, anyway! Thanx, Paul Thanks! Thanx, Paul Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com diff --git a/kernel/rcutree.c b/kernel/rcutree.c index aa3f525..fe83085 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1364,7 +1364,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) } force_qs_rnp(rsp, dyntick_save_progress_counter, isidle, maxj); - rcu_sysidle_report(rsp, isidle, maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. 
*/ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1602c21..657b415 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -559,8 +559,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj); static bool is_sysidle_rcu_state(struct rcu_state *rsp); static void rcu_bind_gp_kthread(void);
[PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
From: "Paul E. McKenney" This commit adds the state machine that takes the per-CPU idle data as input and produces a full-system-idle indication as output. This state machine is driven out of RCU's quiescent-state-forcing mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU idle state and then rcu_sysidle_report() to drive the state machine. The full-system-idle state is sampled using rcu_sys_is_idle(), which also drives the state machine if RCU is idle (and does so by forcing RCU to become non-idle). This function returns true if all but the timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long enough to avoid memory contention on the full_sysidle_state state variable. The rcu_sysidle_force_exit() may be called externally to reset the state machine back into non-idle state. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Steven Rostedt Conflicts: kernel/rcutree.h kernel/rcutree_plugin.h --- include/linux/rcupdate.h | 18 kernel/rcutree.c | 16 ++- kernel/rcutree.h | 5 + kernel/rcutree_plugin.h | 275 ++- 4 files changed, 307 insertions(+), 7 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48f1ef9..1aa8d8c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +/* Only for use by adaptive-ticks code. 
*/ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE +extern bool rcu_sys_is_idle(void); +extern void rcu_sysidle_force_exit(void); +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static inline bool rcu_sys_is_idle(void) +{ + return false; +} + +static inline void rcu_sysidle_force_exit(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 9971f86..06cfd75 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { rdp->dynticks_snap = atomic_add_return(0, >dynticks->dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); return (rdp->dynticks_snap & 0x1) == 0; } @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; + maxj = jiffies - ULONG_MAX / 4; + } force_qs_rnp(rsp, dyntick_save_progress_counter, , ); + rcu_sysidle_report(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ + isidle = 0; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, , ); } /* Clear flag to prevent immediate re-entry. 
*/ @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp, cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && - f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) - mask |= bit; + if ((rnp->qsmask & bit) != 0) { + if ((rnp->qsmaskinit & bit) != 0) + *isidle = 0; + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + mask |= bit; + } } if (mask != 0) { diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1895043..7326a3c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3edae39..b47ffb0 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include "time/tick-internal.h"
[PATCH RFC nohz_full 6/7] nohz_full: Add full-system-idle state machine
From: Paul E. McKenney paul...@linux.vnet.ibm.com This commit adds the state machine that takes the per-CPU idle data as input and produces a full-system-idle indication as output. This state machine is driven out of RCU's quiescent-state-forcing mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU idle state and then rcu_sysidle_report() to drive the state machine. The full-system-idle state is sampled using rcu_sys_is_idle(), which also drives the state machine if RCU is idle (and does so by forcing RCU to become non-idle). This function returns true if all but the timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long enough to avoid memory contention on the full_sysidle_state state variable. The rcu_sysidle_force_exit() may be called externally to reset the state machine back into non-idle state. Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com Cc: Frederic Weisbecker fweis...@gmail.com Cc: Steven Rostedt rost...@goodmis.org Conflicts: kernel/rcutree.h kernel/rcutree_plugin.h --- include/linux/rcupdate.h | 18 kernel/rcutree.c | 16 ++- kernel/rcutree.h | 5 + kernel/rcutree_plugin.h | 275 ++- 4 files changed, 307 insertions(+), 7 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48f1ef9..1aa8d8c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +/* Only for use by adaptive-ticks code. 
*/ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE +extern bool rcu_sys_is_idle(void); +extern void rcu_sysidle_force_exit(void); +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static inline bool rcu_sys_is_idle(void) +{ + return false; +} + +static inline void rcu_sysidle_force_exit(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 9971f86..06cfd75 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -718,6 +718,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { rdp-dynticks_snap = atomic_add_return(0, rdp-dynticks-dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); return (rdp-dynticks_snap 0x1) == 0; } @@ -1356,11 +1357,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) rsp-n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; + maxj = jiffies - ULONG_MAX / 4; + } force_qs_rnp(rsp, dyntick_save_progress_counter, isidle, maxj); + rcu_sysidle_report(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ + isidle = 0; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, isidle, maxj); } /* Clear flag to prevent immediate re-entry. 
*/ @@ -2087,9 +2094,12 @@ static void force_qs_rnp(struct rcu_state *rsp, cpu = rnp-grplo; bit = 1; for (; cpu = rnp-grphi; cpu++, bit = 1) { - if ((rnp-qsmask bit) != 0 - f(per_cpu_ptr(rsp-rda, cpu), isidle, maxj)) - mask |= bit; + if ((rnp-qsmask bit) != 0) { + if ((rnp-qsmaskinit bit) != 0) + *isidle = 0; + if (f(per_cpu_ptr(rsp-rda, cpu), isidle, maxj)) + mask |= bit; + } } if (mask != 0) { diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1895043..7326a3c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3edae39..b47ffb0 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7