Re: [PATCH 05/32] nohz: Adaptive tick stop and restart on nohz cpuset
On Mon, 2012-10-29 at 16:27 -0400, Steven Rostedt wrote: > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -1196,6 +1196,29 @@ static void update_avg(u64 *avg, u64 sample) > } > #endif > > +#ifdef CONFIG_CPUSETS_NO_HZ > +bool sched_can_stop_tick(void) > +{ > + struct rq *rq; > + > + rq = this_rq(); > + > + /* > + * This is called right after cpuset_adaptive_nohz() that See below (for this caller). > + * uses atomic_add_return() so that we are ordered against > + * cpu_adaptive_nohz_ref. When inc_nr_running() sends an > + * IPI to this CPU, we are guaranteed to see the update on > + * nr_running. > + */ > + > + /* More than one running task need preemption */ > + if (rq->nr_running > 1) > + return false; > + > + return true; > +} > +#endif > + > static void > ttwu_stat(struct task_struct *p, int cpu, int wake_flags) > { > @@ -1897,6 +1920,7 @@ context_switch(struct rq *rq, struct task_struct *prev, >* frame will be invalid. >*/ > finish_task_switch(this_rq(), prev); > + tick_nohz_post_schedule(); > } > > /* > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index 7a7db09..c6cd9ec 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -1,6 +1,7 @@ > > #include > #include > +#include > #include > #include > > @@ -927,6 +928,17 @@ static inline u64 steal_ticks(u64 steal) > static inline void inc_nr_running(struct rq *rq) > { > rq->nr_running++; > + > + if (rq->nr_running == 2) { > + /* > + * cpuset_cpu_adaptive_nohz() uses atomic_add_return() > + * to order against rq->nr_running updates. This way > + * the CPU that receives the IPI is guaranteed to see > + * the update on nr_running without the rq->lock. > + */ > + if (cpuset_cpu_adaptive_nohz(rq->cpu)) > + smp_cpuset_update_nohz(rq->cpu); > + } > } > > static inline void dec_nr_running(struct rq *rq) Should we add one for dec_nr_running()? Or is this done elsewhere. I would think that there's a good chance that we can miss a chance to stop the tick. 
> diff --git a/kernel/softirq.c b/kernel/softirq.c > index cc96bdc..e06b8eb 100644 > --- a/kernel/softirq.c > +++ b/kernel/softirq.c > @@ -25,6 +25,7 @@ > #include > #include > #include > +#include > > #define CREATE_TRACE_POINTS > #include > @@ -307,7 +308,8 @@ void irq_enter(void) > int cpu = smp_processor_id(); > > rcu_irq_enter(); > - if (is_idle_task(current) && !in_interrupt()) { > + > + if ((is_idle_task(current) || cpuset_adaptive_nohz()) && > !in_interrupt()) { > /* >* Prevent raise_softirq from needlessly waking up ksoftirqd >* here, as softirq will be serviced on return from interrupt. > @@ -349,7 +351,7 @@ void irq_exit(void) > > #ifdef CONFIG_NO_HZ > /* Make sure that timer wheel updates are propagated */ > - if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) > + if (!in_interrupt()) > tick_nohz_irq_exit(); > #endif > rcu_irq_exit(); > diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c > index c7a78c6..35047b2 100644 > --- a/kernel/time/tick-sched.c > +++ b/kernel/time/tick-sched.c > @@ -512,6 +512,24 @@ void tick_nohz_idle_enter(void) > local_irq_enable(); > } > > +static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts) > +{ > +#ifdef CONFIG_CPUSETS_NO_HZ > + int cpu = smp_processor_id(); > + > + if (!cpuset_adaptive_nohz() || is_idle_task(current)) > + return; The above is most likely true. Lets remove the memory barrier in cpuset_adaptive_nohz(), just add an explicit one here, in the slow path. /* Before checking the below conditions, we must first * make sure that the cpuset/nohz is active, so we do * not miss a deactivating IPI. * ie. when nr_running == 2, an IPI is sent, and this * code must see the nr_running changed after testing * if the current CPU is adaptive nohz. 
*/ smp_mb(); > + > + if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) > + return; > + > + if (!sched_can_stop_tick()) > + return; > + > + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); > +#endif > +} > + > /** > * tick_nohz_irq_exit - update next tick event from interrupt exit > * > @@ -524,10 +542,12 @@ void tick_nohz_irq_exit(void) > { > struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); > > - if (!ts->inidle) > - return; > - > - __tick_nohz_idle_enter(ts); > + if (ts->inidle) { > + if (!need_resched()) > + __tick_nohz_idle_enter(ts); > + } else { > + tick_nohz_cpuset_stop_tick(ts); > + } > } > > /** > @@ -568,7 +588,7 @@ static void
Re: [PATCH 05/32] nohz: Adaptive tick stop and restart on nohz cpuset
On Mon, 2012-10-29 at 16:27 -0400, Steven Rostedt wrote: --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1196,6 +1196,29 @@ static void update_avg(u64 *avg, u64 sample) } #endif +#ifdef CONFIG_CPUSETS_NO_HZ +bool sched_can_stop_tick(void) +{ + struct rq *rq; + + rq = this_rq(); + + /* + * This is called right after cpuset_adaptive_nohz() that See below (for this caller). + * uses atomic_add_return() so that we are ordered against + * cpu_adaptive_nohz_ref. When inc_nr_running() sends an + * IPI to this CPU, we are guaranteed to see the update on + * nr_running. + */ + + /* More than one running task need preemption */ + if (rq->nr_running > 1) + return false; + + return true; +} +#endif + static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -1897,6 +1920,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * frame will be invalid. */ finish_task_switch(this_rq(), prev); + tick_nohz_post_schedule(); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7a7db09..c6cd9ec 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,6 +1,7 @@ #include <linux/sched.h> #include <linux/mutex.h> +#include <linux/cpuset.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> @@ -927,6 +928,17 @@ static inline u64 steal_ticks(u64 steal) static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; + + if (rq->nr_running == 2) { + /* + * cpuset_cpu_adaptive_nohz() uses atomic_add_return() + * to order against rq->nr_running updates. This way + * the CPU that receives the IPI is guaranteed to see + * the update on nr_running without the rq->lock. + */ + if (cpuset_cpu_adaptive_nohz(rq->cpu)) + smp_cpuset_update_nohz(rq->cpu); + } } static inline void dec_nr_running(struct rq *rq) Should we add one for dec_nr_running()? Or is this done elsewhere. I would think that there's a good chance that we can miss a chance to stop the tick. 
diff --git a/kernel/softirq.c b/kernel/softirq.c index cc96bdc..e06b8eb 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -25,6 +25,7 @@ #include <linux/smp.h> #include <linux/smpboot.h> #include <linux/tick.h> +#include <linux/cpuset.h> #define CREATE_TRACE_POINTS #include <trace/events/irq.h> @@ -307,7 +308,8 @@ void irq_enter(void) int cpu = smp_processor_id(); rcu_irq_enter(); - if (is_idle_task(current) && !in_interrupt()) { + + if ((is_idle_task(current) || cpuset_adaptive_nohz()) && !in_interrupt()) { /* * Prevent raise_softirq from needlessly waking up ksoftirqd * here, as softirq will be serviced on return from interrupt. @@ -349,7 +351,7 @@ void irq_exit(void) #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ - if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) + if (!in_interrupt()) tick_nohz_irq_exit(); #endif rcu_irq_exit(); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c7a78c6..35047b2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -512,6 +512,24 @@ void tick_nohz_idle_enter(void) local_irq_enable(); } +static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts) +{ +#ifdef CONFIG_CPUSETS_NO_HZ + int cpu = smp_processor_id(); + + if (!cpuset_adaptive_nohz() || is_idle_task(current)) + return; The above is most likely true. Lets remove the memory barrier in cpuset_adaptive_nohz(), just add an explicit one here, in the slow path. /* Before checking the below conditions, we must first * make sure that the cpuset/nohz is active, so we do * not miss a deactivating IPI. * ie. when nr_running == 2, an IPI is sent, and this * code must see the nr_running changed after testing * if the current CPU is adaptive nohz. 
*/ smp_mb(); + + if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) + return; + + if (!sched_can_stop_tick()) + return; + + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); +#endif +} + /** * tick_nohz_irq_exit - update next tick event from interrupt exit * @@ -524,10 +542,12 @@ void tick_nohz_irq_exit(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - if (!ts->inidle) - return; - - __tick_nohz_idle_enter(ts); + if (ts->inidle) { + if (!need_resched()) + __tick_nohz_idle_enter(ts); + } else { + tick_nohz_cpuset_stop_tick(ts); + } } /** @@ -568,7 +588,7 @@ static void tick_nohz_restart(struct
[PATCH 05/32] nohz: Adaptive tick stop and restart on nohz cpuset
From: Frederic Weisbecker When a CPU is included in a nohz cpuset, try to switch it to nohz mode from the interrupt exit path if it is running a single non-idle task. Then restart the tick if necessary if we are enqueuing a second task while the timer is stopped, so that the scheduler tick is rearmed. [TODO: Handle the many things done from scheduler_tick()] [ Included build fix from Geoff Levand ] Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner --- arch/x86/kernel/smp.c|2 ++ include/linux/sched.h|6 include/linux/tick.h | 11 +- init/Kconfig |2 +- kernel/sched/core.c | 24 + kernel/sched/sched.h | 12 +++ kernel/softirq.c |6 ++-- kernel/time/tick-sched.c | 86 +- 8 files changed, 137 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 4c0b7d2..0bad72d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -275,6 +276,7 @@ void smp_cpuset_update_nohz_interrupt(struct pt_regs *regs) { ack_APIC_irq(); irq_enter(); + tick_nohz_check_adaptive(); inc_irq_stat(irq_call_count); irq_exit(); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dd42a0..749752e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2753,6 +2753,12 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +#ifdef CONFIG_CPUSETS_NO_HZ +extern bool sched_can_stop_tick(void); +#else +static inline bool sched_can_stop_tick(void) { return false; } +#endif + #ifdef CONFIG_MM_OWNER extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct 
*mm, struct task_struct *p); diff --git a/include/linux/tick.h b/include/linux/tick.h index f37fceb..9b66fd3 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -124,11 +124,12 @@ static inline int tick_oneshot_mode_active(void) { return 0; } # ifdef CONFIG_NO_HZ extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); +extern void tick_nohz_restart_sched_tick(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); -# else +# else /* !NO_HZ */ static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } @@ -142,4 +143,12 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ +#ifdef CONFIG_CPUSETS_NO_HZ +extern void tick_nohz_check_adaptive(void); +extern void tick_nohz_post_schedule(void); +#else /* !CPUSETS_NO_HZ */ +static inline void tick_nohz_check_adaptive(void) { } +static inline void tick_nohz_post_schedule(void) { } +#endif /* CPUSETS_NO_HZ */ + #endif diff --git a/init/Kconfig b/init/Kconfig index ffdeeab..418e078 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -751,7 +751,7 @@ config PROC_PID_CPUSET config CPUSETS_NO_HZ bool "Tickless cpusets" - depends on CPUSETS && HAVE_CPUSETS_NO_HZ + depends on CPUSETS && HAVE_CPUSETS_NO_HZ && NO_HZ && HIGH_RES_TIMERS help This options let you apply a nohz property to a cpuset such that the periodic timer tick tries to be avoided when possible on diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927f..2716b79 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1196,6 +1196,29 @@ static void update_avg(u64 *avg, u64 sample) } #endif +#ifdef CONFIG_CPUSETS_NO_HZ +bool sched_can_stop_tick(void) +{ + struct rq *rq; + + rq = this_rq(); + + 
/* +* This is called right after cpuset_adaptive_nohz() that +* uses atomic_add_return() so that we are ordered against +* cpu_adaptive_nohz_ref. When inc_nr_running() sends an +* IPI to this CPU, we are guaranteed to see the update on +* nr_running. +*/ + + /* More than one running task need preemption */ + if (rq->nr_running > 1) + return false; + + return true; +} +#endif + static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -1897,6 +1920,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * frame will be invalid. */
[PATCH 05/32] nohz: Adaptive tick stop and restart on nohz cpuset
From: Frederic Weisbecker fweis...@gmail.com When a CPU is included in a nohz cpuset, try to switch it to nohz mode from the interrupt exit path if it is running a single non-idle task. Then restart the tick if necessary if we are enqueuing a second task while the timer is stopped, so that the scheduler tick is rearmed. [TODO: Handle the many things done from scheduler_tick()] [ Included build fix from Geoff Levand ] Signed-off-by: Frederic Weisbecker fweis...@gmail.com Cc: Alessio Igor Bogani abog...@kernel.org Cc: Andrew Morton a...@linux-foundation.org Cc: Avi Kivity a...@redhat.com Cc: Chris Metcalf cmetc...@tilera.com Cc: Christoph Lameter c...@linux.com Cc: Daniel Lezcano daniel.lezc...@linaro.org Cc: Geoff Levand ge...@infradead.org Cc: Gilad Ben Yossef gi...@benyossef.com Cc: Hakan Akkan hakanak...@gmail.com Cc: Ingo Molnar mi...@kernel.org Cc: Kevin Hilman khil...@ti.com Cc: Max Krasnyansky m...@qualcomm.com Cc: Paul E. McKenney paul...@linux.vnet.ibm.com Cc: Peter Zijlstra pet...@infradead.org Cc: Stephen Hemminger shemmin...@vyatta.com Cc: Steven Rostedt rost...@goodmis.org Cc: Sven-Thorsten Dietrich thebigcorporat...@gmail.com Cc: Thomas Gleixner t...@linutronix.de --- arch/x86/kernel/smp.c|2 ++ include/linux/sched.h|6 include/linux/tick.h | 11 +- init/Kconfig |2 +- kernel/sched/core.c | 24 + kernel/sched/sched.h | 12 +++ kernel/softirq.c |6 ++-- kernel/time/tick-sched.c | 86 +- 8 files changed, 137 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 4c0b7d2..0bad72d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -23,6 +23,7 @@ #include <linux/interrupt.h> #include <linux/cpu.h> #include <linux/gfp.h> +#include <linux/tick.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> @@ -275,6 +276,7 @@ void smp_cpuset_update_nohz_interrupt(struct pt_regs *regs) { ack_APIC_irq(); irq_enter(); + tick_nohz_check_adaptive(); inc_irq_stat(irq_call_count); irq_exit(); } diff --git a/include/linux/sched.h 
b/include/linux/sched.h index 0dd42a0..749752e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2753,6 +2753,12 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +#ifdef CONFIG_CPUSETS_NO_HZ +extern bool sched_can_stop_tick(void); +#else +static inline bool sched_can_stop_tick(void) { return false; } +#endif + #ifdef CONFIG_MM_OWNER extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); diff --git a/include/linux/tick.h b/include/linux/tick.h index f37fceb..9b66fd3 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -124,11 +124,12 @@ static inline int tick_oneshot_mode_active(void) { return 0; } # ifdef CONFIG_NO_HZ extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); +extern void tick_nohz_restart_sched_tick(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); -# else +# else /* !NO_HZ */ static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } @@ -142,4 +143,12 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ +#ifdef CONFIG_CPUSETS_NO_HZ +extern void tick_nohz_check_adaptive(void); +extern void tick_nohz_post_schedule(void); +#else /* !CPUSETS_NO_HZ */ +static inline void tick_nohz_check_adaptive(void) { } +static inline void tick_nohz_post_schedule(void) { } +#endif /* CPUSETS_NO_HZ */ + #endif diff --git a/init/Kconfig b/init/Kconfig index ffdeeab..418e078 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -751,7 +751,7 @@ config PROC_PID_CPUSET config CPUSETS_NO_HZ bool "Tickless cpusets" - depends on CPUSETS && HAVE_CPUSETS_NO_HZ + depends on CPUSETS && 
HAVE_CPUSETS_NO_HZ && NO_HZ && HIGH_RES_TIMERS help This options let you apply a nohz property to a cpuset such that the periodic timer tick tries to be avoided when possible on diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927f..2716b79 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1196,6 +1196,29 @@ static void update_avg(u64 *avg, u64 sample) } #endif +#ifdef CONFIG_CPUSETS_NO_HZ +bool sched_can_stop_tick(void) +{ + struct rq *rq; + + rq = this_rq(); + + /* +* This is called right after cpuset_adaptive_nohz() that +* uses atomic_add_return() so that we are ordered against +* cpu_adaptive_nohz_ref. When inc_nr_running() sends an +* IPI to this