On 08/20/2013 10:47 AM, Paul E. McKenney wrote:
> From: "Paul E. McKenney" <paul...@linux.vnet.ibm.com>
> 
> This commit adds the state machine that takes the per-CPU idle data
> as input and produces a full-system-idle indication as output.  This
> state machine is driven out of RCU's quiescent-state-forcing
> mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU
> idle state and then rcu_sysidle_report() to drive the state machine.
> 
> The full-system-idle state is sampled using rcu_sys_is_idle(), which
> also drives the state machine if RCU is idle (and does so by forcing
> RCU to become non-idle).  This function returns true if all but the
> timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long
> enough to avoid memory contention on the full_sysidle_state state
> variable.  The rcu_sysidle_force_exit() may be called externally
> to reset the state machine back into non-idle state.
> 
> For large systems the state machine is driven out of RCU's
> force-quiescent-state logic, which provides good scalability at the price
> of millisecond-scale latencies on the transition to full-system-idle
> state.  This is not so good for battery-powered systems, which are usually
> small enough that they don't need to care about scalability, but which
> do care deeply about energy efficiency.  Small systems therefore drive
> the state machine directly out of the idle-entry code.  The number of
> CPUs in a "small" system is defined by a new NO_HZ_FULL_SYSIDLE_SMALL
> Kconfig parameter, which defaults to 8.  Note that this is a build-time
> definition.
> 
> Signed-off-by: Paul E. McKenney <paul...@linux.vnet.ibm.com>
> Cc: Frederic Weisbecker <fweis...@gmail.com>
> Cc: Steven Rostedt <rost...@goodmis.org>
> Cc: Lai Jiangshan <la...@cn.fujitsu.com>
> [ paulmck: Use true and false for boolean constants per Lai Jiangshan. ]
> Reviewed-by: Josh Triplett <j...@joshtriplett.org>
> ---
>  include/linux/rcupdate.h |  18 +++
>  kernel/rcutree.c         |  16 ++-
>  kernel/rcutree.h         |   5 +
>  kernel/rcutree_plugin.h  | 284 ++++++++++++++++++++++++++++++++++++++++++++++-
>  kernel/time/Kconfig      |  27 +++++
>  5 files changed, 343 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index 30bea9c..f1f1bc3 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
>  #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
>  
>  
> +/* Only for use by adaptive-ticks code. */
> +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
> +extern bool rcu_sys_is_idle(void);
> +extern void rcu_sysidle_force_exit(void);
> +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> +
> +static inline bool rcu_sys_is_idle(void)
> +{
> +     return false;
> +}
> +
> +static inline void rcu_sysidle_force_exit(void)
> +{
> +}
> +
> +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
> +
> +
>  #endif /* __LINUX_RCUPDATE_H */
> diff --git a/kernel/rcutree.c b/kernel/rcutree.c
> index 7b5be56..eca70f44 100644
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
> @@ -734,6 +734,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
>                                        bool *isidle, unsigned long *maxj)
>  {
>       rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
> +     rcu_sysidle_check_cpu(rdp, isidle, maxj);
>       return (rdp->dynticks_snap & 0x1) == 0;
>  }
>  
> @@ -1373,11 +1374,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
>       rsp->n_force_qs++;
>       if (fqs_state == RCU_SAVE_DYNTICK) {
>               /* Collect dyntick-idle snapshots. */
> +             if (is_sysidle_rcu_state(rsp)) {
> +                     isidle = 1;
> +                     maxj = jiffies - ULONG_MAX / 4;
> +             }
>               force_qs_rnp(rsp, dyntick_save_progress_counter,
>                            &isidle, &maxj);
> +             rcu_sysidle_report_gp(rsp, isidle, maxj);
>               fqs_state = RCU_FORCE_QS;
>       } else {
>               /* Handle dyntick-idle and offline CPUs. */
> +             isidle = 0;
>               force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
>       }
>       /* Clear flag to prevent immediate re-entry. */
> @@ -2103,9 +2110,12 @@ static void force_qs_rnp(struct rcu_state *rsp,
>               cpu = rnp->grplo;
>               bit = 1;
>               for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
> -                     if ((rnp->qsmask & bit) != 0 &&
> -                         f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
> -                             mask |= bit;
> +                     if ((rnp->qsmask & bit) != 0) {
> +                             if ((rnp->qsmaskinit & bit) != 0)
> +                                     *isidle = 0;
> +                             if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
> +                                     mask |= bit;
> +                     }
>               }
>               if (mask != 0) {
>  
> diff --git a/kernel/rcutree.h b/kernel/rcutree.h
> index 9dd8b17..6fd3659 100644
> --- a/kernel/rcutree.h
> +++ b/kernel/rcutree.h
> @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu);
>  static bool init_nocb_callback_list(struct rcu_data *rdp);
>  static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
>  static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
> +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> +                               unsigned long *maxj);
> +static bool is_sysidle_rcu_state(struct rcu_state *rsp);
> +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
> +                               unsigned long maxj);
>  static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
>  
>  #endif /* #ifndef RCU_TREE_NONCORE */
> diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
> index a7419ce..90c3fba 100644
> --- a/kernel/rcutree_plugin.h
> +++ b/kernel/rcutree_plugin.h
> @@ -28,7 +28,7 @@
>  #include <linux/gfp.h>
>  #include <linux/oom.h>
>  #include <linux/smpboot.h>
> -#include <linux/tick.h>
> +#include "time/tick-internal.h"
>  
>  #define RCU_KTHREAD_PRIO 1
>  
> @@ -2382,12 +2382,12 @@ static void rcu_kick_nohz_cpu(int cpu)
>   * most active flavor of RCU.
>   */
>  #ifdef CONFIG_PREEMPT_RCU
> -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_preempt_state;
> +static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
>  #else /* #ifdef CONFIG_PREEMPT_RCU */
> -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_sched_state;
> +static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
>  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
>  
> -static int __maybe_unused full_sysidle_state; /* Current system-idle state. */
> +static int full_sysidle_state;               /* Current system-idle state. */
>  #define RCU_SYSIDLE_NOT              0       /* Some CPU is not idle. */
>  #define RCU_SYSIDLE_SHORT    1       /* All CPUs idle for brief period. */
>  #define RCU_SYSIDLE_LONG     2       /* All CPUs idle for long enough. */
> @@ -2431,6 +2431,38 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
>  }
>  
>  /*
> + * Unconditionally force exit from full system-idle state.  This is
> + * invoked when a normal CPU exits idle, but must be called separately
> + * for the timekeeping CPU (tick_do_timer_cpu).  The reason for this
> + * is that the timekeeping CPU is permitted to take scheduling-clock
> + * interrupts while the system is in system-idle state, and of course
> + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
> + * interrupt from any other type of interrupt.
> + */
> +void rcu_sysidle_force_exit(void)
> +{
> +     int oldstate = ACCESS_ONCE(full_sysidle_state);
> +     int newoldstate;
> +
> +     /*
> +      * Each pass through the following loop attempts to exit full
> +      * system-idle state.  If contention proves to be a problem,
> +      * a trylock-based contention tree could be used here.
> +      */
> +     while (oldstate > RCU_SYSIDLE_SHORT) {
> +             newoldstate = cmpxchg(&full_sysidle_state,
> +                                   oldstate, RCU_SYSIDLE_NOT);
> +             if (oldstate == newoldstate &&
> +                 oldstate == RCU_SYSIDLE_FULL_NOTED) {
> +                     rcu_kick_nohz_cpu(tick_do_timer_cpu);
> +                     return; /* We cleared it, done! */
> +             }
> +             oldstate = newoldstate;
> +     }
> +     smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
> +}
> +
> +/*
>   * Invoked to note entry to irq or task transition from idle.  Note that
>   * usermode execution does -not- count as idle here!  The caller must
>   * have disabled interrupts.
> @@ -2463,6 +2495,235 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
>       atomic_inc(&rdtp->dynticks_idle);
>       smp_mb__after_atomic_inc();
>       WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
> +
> +     /*
> +      * If we are the timekeeping CPU, we are permitted to be non-idle
> +      * during a system-idle state.  This must be the case, because
> +      * the timekeeping CPU has to take scheduling-clock interrupts
> +      * during the time that the system is transitioning to full
> +      * system-idle state.  This means that the timekeeping CPU must
> +      * invoke rcu_sysidle_force_exit() directly if it does anything
> +      * more than take a scheduling-clock interrupt.
> +      */
> +     if (smp_processor_id() == tick_do_timer_cpu)
> +             return;
> +
> +     /* Update system-idle state: We are clearly no longer fully idle! */
> +     rcu_sysidle_force_exit();
> +}
> +
> +/*
> + * Check to see if the current CPU is idle.  Note that usermode execution
> + * does not count as idle.  The caller must have disabled interrupts.
> + */
> +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> +                               unsigned long *maxj)
> +{
> +     int cur;
> +     unsigned long j;
> +     struct rcu_dynticks *rdtp = rdp->dynticks;
> +
> +     /*
> +      * If some other CPU has already reported non-idle, if this is
> +      * not the flavor of RCU that tracks sysidle state, or if this
> +      * is an offline or the timekeeping CPU, nothing to do.
> +      */
> +     if (!*isidle || rdp->rsp != rcu_sysidle_state ||
> +         cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
> +             return;
> +     /* WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); */
> +
> +     /* Pick up current idle and NMI-nesting counter and check. */
> +     cur = atomic_read(&rdtp->dynticks_idle);
> +     if (cur & 0x1) {
> +             *isidle = false; /* We are not idle! */
> +             return;
> +     }
> +     smp_mb(); /* Read counters before timestamps. */
> +
> +     /* Pick up timestamps. */
> +     j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
> +     /* If this CPU entered idle more recently, update maxj timestamp. */
> +     if (ULONG_CMP_LT(*maxj, j))
> +             *maxj = j;
> +}
> +
> +/*
> + * Is this the flavor of RCU that is handling full-system idle?
> + */
> +static bool is_sysidle_rcu_state(struct rcu_state *rsp)
> +{
> +     return rsp == rcu_sysidle_state;
> +}
> +
> +/*
> + * Return a delay in jiffies based on the number of CPUs, rcu_node
> + * leaf fanout, and jiffies tick rate.  The idea is to allow larger
> + * systems more time to transition to full-idle state in order to
> + * avoid the cache thrashing that otherwise occurs on the state variable.
> + * Really small systems (less than a couple of tens of CPUs) should
> + * instead use a single global atomically incremented counter, and later
> + * versions of this will automatically reconfigure themselves accordingly.
> + */
> +static unsigned long rcu_sysidle_delay(void)
> +{
> +     if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
> +             return 0;
> +     return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
> +}
> +
> +/*
> + * Advance the full-system-idle state.  This is invoked when all of
> + * the non-timekeeping CPUs are idle.
> + */
> +static void rcu_sysidle(unsigned long j)
> +{
> +     /* Check the current state. */
> +     switch (ACCESS_ONCE(full_sysidle_state)) {
> +     case RCU_SYSIDLE_NOT:
> +
> +             /* First time all are idle, so note a short idle period. */
> +             ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
> +             break;
> +
> +     case RCU_SYSIDLE_SHORT:
> +
> +             /*
> +              * Idle for a bit, time to advance to next state?
> +              * cmpxchg failure means race with non-idle, let them win.
> +              */
> +             if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
> +                     (void)cmpxchg(&full_sysidle_state,
> +                                   RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
> +             break;
> +
> +     case RCU_SYSIDLE_LONG:
> +
> +             /*
> +              * Do an additional check pass before advancing to full.
> +              * cmpxchg failure means race with non-idle, let them win.
> +              */
> +             if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
> +                     (void)cmpxchg(&full_sysidle_state,
> +                                   RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
> +             break;
> +
> +     default:
> +             break;
> +     }
> +}
> +
> +/*
> + * Found a non-idle non-timekeeping CPU, so kick the system-idle state
> + * back to the beginning.
> + */
> +static void rcu_sysidle_cancel(void)
> +{
> +     smp_mb();
> +     ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
> +}
> +
> +/*
> + * Update the sysidle state based on the results of a force-quiescent-state
> + * scan of the CPUs' dyntick-idle state.
> + */
> +static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
> +                            unsigned long maxj, bool gpkt)
> +{
> +     if (rsp != rcu_sysidle_state)
> +             return;  /* Wrong flavor, ignore. */
> +     if (isidle) {
> +             if (gpkt && nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
> +                     rcu_sysidle(maxj);    /* More idle! */
> +     } else {
> +             rcu_sysidle_cancel(); /* Idle is over. */
> +     }

"gpkt" is always equal to "nr_cpu_ids > RCU_SYSIDLE_SMALL",

so we can remove "gpkt" argument and rcu_sysidle_report_gp(
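
For example (untested sketch; this just folds the size check into
rcu_sysidle_report() itself, using the names from the patch):

static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
			       unsigned long maxj)
{
	if (rsp != rcu_sysidle_state)
		return;  /* Wrong flavor, ignore. */
	if (isidle) {
		/* Was: gpkt && nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL */
		if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
			rcu_sysidle(maxj);    /* More idle! */
	} else {
		rcu_sysidle_cancel(); /* Idle is over. */
	}
}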

> +}
> +
> +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
> +                               unsigned long maxj)
> +{
> +     rcu_sysidle_report(rsp, isidle, maxj, true);
> +}
> +
> +/* Callback and function for forcing an RCU grace period. */
> +struct rcu_sysidle_head {
> +     struct rcu_head rh;
> +     int inuse;
> +};
> +
> +static void rcu_sysidle_cb(struct rcu_head *rhp)
> +{
> +     struct rcu_sysidle_head *rshp;
> +
> +     smp_mb();  /* grace period precedes setting inuse. */

Why do we need this mb()?


> +     rshp = container_of(rhp, struct rcu_sysidle_head, rh);
> +     ACCESS_ONCE(rshp->inuse) = 0;
> +}
> +
> +/*
> + * Check to see if the system is fully idle, other than the timekeeping CPU.
> + * The caller must have disabled interrupts.
> + */
> +bool rcu_sys_is_idle(void)
> +{
> +     static struct rcu_sysidle_head rsh;
> +     int rss = ACCESS_ONCE(full_sysidle_state);
> +
> +     if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
> +             return false;
> +
> +     /* Handle small-system case by doing a full scan of CPUs. */
> +     if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
> +             int oldrss = rss - 1;
> +
> +             /*
> +              * One pass to advance to each state up to _FULL.
> +              * Give up if any pass fails to advance the state.
> +              */
> +             while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
> +                     int cpu;
> +                     bool isidle = true;
> +                     unsigned long maxj = jiffies - ULONG_MAX / 4;
> +                     struct rcu_data *rdp;
> +
> +                     /* Scan all the CPUs looking for nonidle CPUs. */
> +                     for_each_possible_cpu(cpu) {
> +                             rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
> +                             rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
> +                             if (!isidle)
> +                                     break;
> +                     }
> +                     rcu_sysidle_report(rcu_sysidle_state,
> +                                        isidle, maxj, false);
> +                     oldrss = rss;
> +                     rss = ACCESS_ONCE(full_sysidle_state);
> +             }
> +     }

I don't think it is a good idea to move the overhead to fqs when
nr_cpu_ids > 8: the total overhead will not be reduced, and it may
even increase.

I think we can instead do the calculation on the timekeeping CPU when
nr_cpu_ids > 8 and the timekeeping CPU is idle.
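
Roughly like this (untested sketch with a hypothetical helper name; it
reuses rcu_sysidle_check_cpu() and rcu_sysidle_report() from the patch,
and would be invoked from the timekeeping CPU's idle-entry path instead
of from the fqs scan):

/* Scan the CPUs from the timekeeping CPU itself, just before it
 * goes idle, so that fqs need not carry the sysidle overhead. */
static void rcu_sysidle_scan_from_timekeeper(void)
{
	int cpu;
	bool isidle = true;
	unsigned long maxj = jiffies - ULONG_MAX / 4;
	struct rcu_data *rdp;

	for_each_possible_cpu(cpu) {
		rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
		rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
		if (!isidle)
			break;
	}
	rcu_sysidle_report(rcu_sysidle_state, isidle, maxj, false);
}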

> +
> +     /* If this is the first observation of an idle period, record it. */
> +     if (rss == RCU_SYSIDLE_FULL) {
> +             rss = cmpxchg(&full_sysidle_state,
> +                           RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
> +             return rss == RCU_SYSIDLE_FULL;
> +     }
> +
> +     smp_mb(); /* ensure rss load happens before later caller actions. */
> +
> +     /* If already fully idle, tell the caller (in case of races). */
> +     if (rss == RCU_SYSIDLE_FULL_NOTED)
> +             return true;
> +
> +     /*
> +      * If we aren't there yet, and a grace period is not in flight,
> +      * initiate a grace period.  Either way, tell the caller that
> +      * we are not there yet.
> +      */
> +     if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
> +         !rcu_gp_in_progress(rcu_sysidle_state) &&
> +         !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
> +             call_rcu(&rsh.rh, rcu_sysidle_cb);

Why do we need to use xchg() here? Who would it race with?
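
If rcu_sys_is_idle() only ever runs on the timekeeping CPU, the only
other writer to rsh.inuse is rcu_sysidle_cb() clearing it after the
grace period, so something like this would seem to suffice (untested):

	if (!ACCESS_ONCE(rsh.inuse)) {
		ACCESS_ONCE(rsh.inuse) = 1;
		call_rcu(&rsh.rh, rcu_sysidle_cb);
	}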


Thanks,
Lai


> +     return false;
>  }
>  
>  /*
> @@ -2483,6 +2744,21 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
>  {
>  }
>  
> +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
> +                               unsigned long *maxj)
> +{
> +}
> +
> +static bool is_sysidle_rcu_state(struct rcu_state *rsp)
> +{
> +     return false;
> +}
> +
> +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
> +                               unsigned long maxj)
> +{
> +}
> +
>  static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
>  {
>  }
> diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
> index c7d2fd6..3381f09 100644
> --- a/kernel/time/Kconfig
> +++ b/kernel/time/Kconfig
> @@ -157,6 +157,33 @@ config NO_HZ_FULL_SYSIDLE
>  
>        Say N if you are unsure.
>  
> +config NO_HZ_FULL_SYSIDLE_SMALL
> +     int "Number of CPUs above which large-system approach is used"
> +     depends on NO_HZ_FULL_SYSIDLE
> +     range 1 NR_CPUS
> +     default 8
> +     help
> +      The full-system idle detection mechanism takes a lazy approach
> +      on large systems, as is required to attain decent scalability.
> +      However, on smaller systems, scalability is not anywhere near as
> +      large a concern as is energy efficiency.  The sysidle subsystem
> +      therefore uses a fast but non-scalable algorithm for small
> +      systems and a lazier but scalable algorithm for large systems.
> +      This Kconfig parameter defines the number of CPUs in the largest
> +      system that will be considered to be "small".
> +
> +      The default value will be fine in most cases.  Battery-powered
> +      systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
> +      numbers of CPUs, and (3) are suffering from battery-lifetime
> +      problems due to long sysidle latencies might wish to experiment
> +      with larger values for this Kconfig parameter.  On the other
> +      hand, they might be even better served by disabling NO_HZ_FULL
> +      entirely, given that NO_HZ_FULL is intended for HPC and
> +      real-time workloads that at present do not tend to be run on
> +      battery-powered systems.
> +
> +      Take the default if you are unsure.
> +
>  config NO_HZ
>       bool "Old Idle dynticks config"
>       depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
