Re: [PATCH 05/32] nohz: Adaptive tick stop and restart on nohz cpuset

2012-10-30 Thread Steven Rostedt
On Mon, 2012-10-29 at 16:27 -0400, Steven Rostedt wrote:

> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1196,6 +1196,29 @@ static void update_avg(u64 *avg, u64 sample)
>  }
>  #endif
>  
> +#ifdef CONFIG_CPUSETS_NO_HZ
> +bool sched_can_stop_tick(void)
> +{
> + struct rq *rq;
> +
> + rq = this_rq();
> +
> + /*
> +  * This is called right after cpuset_adaptive_nohz() that

See below (for this caller).


> +  * uses atomic_add_return() so that we are ordered against
> +  * cpu_adaptive_nohz_ref. When inc_nr_running() sends an
> +  * IPI to this CPU, we are guaranteed to see the update on
> +  * nr_running.
> +  */
> +
> + /* More than one running task need preemption */
> + if (rq->nr_running > 1)
> + return false;
> +
> + return true;
> +}
> +#endif
> +
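For reference, the ordering that comment relies on pairs roughly like this
(a sketch only; cpu_adaptive_nohz_ref and the atomic_add_return() inside
cpuset_adaptive_nohz() come from earlier patches in this series and are
assumed here):

	/* CPU A -- enqueue side, inc_nr_running(): */
	rq->nr_running++;				/* plain store             */
	if (cpuset_cpu_adaptive_nohz(rq->cpu))		/* reads the nohz refcount */
		smp_cpuset_update_nohz(rq->cpu);	/* IPI to CPU B            */

	/* CPU B -- irq exit path on the adaptive-nohz CPU: */
	if (cpuset_adaptive_nohz() &&		/* atomic_add_return(): full barrier  */
	    !sched_can_stop_tick())		/* ...so the nr_running store is seen */
		; /* more than one task: leave the tick running */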
>  static void
>  ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
>  {
> @@ -1897,6 +1920,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
>* frame will be invalid.
>*/
>   finish_task_switch(this_rq(), prev);
> + tick_nohz_post_schedule();
>  }
>  
>  /*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 7a7db09..c6cd9ec 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1,6 +1,7 @@
>  
>  #include <linux/sched.h>
>  #include <linux/mutex.h>
> +#include <linux/cpuset.h>
>  #include <linux/spinlock.h>
>  #include <linux/stop_machine.h>
>  
> @@ -927,6 +928,17 @@ static inline u64 steal_ticks(u64 steal)
>  static inline void inc_nr_running(struct rq *rq)
>  {
>   rq->nr_running++;
> +
> + if (rq->nr_running == 2) {
> + /*
> +  * cpuset_cpu_adaptive_nohz() uses atomic_add_return()
> +  * to order against rq->nr_running updates. This way
> +  * the CPU that receives the IPI is guaranteed to see
> +  * the update on nr_running without the rq->lock.
> +  */
> + if (cpuset_cpu_adaptive_nohz(rq->cpu))
> + smp_cpuset_update_nohz(rq->cpu);
> + }
>  }
>  
>  static inline void dec_nr_running(struct rq *rq)

Should we add one for dec_nr_running()? Or is this done elsewhere? I would
think there's a good chance that we otherwise miss an opportunity to stop
the tick.
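
Something along these lines (untested, just to sketch the idea) would mirror
the inc_nr_running() hook and poke the CPU once it drops back to a single
runnable task, so it can try to stop the tick again:

	static inline void dec_nr_running(struct rq *rq)
	{
		rq->nr_running--;

		/*
		 * Mirror inc_nr_running(): when only one runnable task is
		 * left, tell an adaptive-nohz CPU so it can re-evaluate
		 * stopping the tick from the IPI/irq exit path.
		 */
		if (rq->nr_running == 1) {
			if (cpuset_cpu_adaptive_nohz(rq->cpu))
				smp_cpuset_update_nohz(rq->cpu);
		}
	}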


> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index cc96bdc..e06b8eb 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -25,6 +25,7 @@
>  #include <linux/smp.h>
>  #include <linux/smpboot.h>
>  #include <linux/tick.h>
> +#include <linux/cpuset.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/irq.h>
> @@ -307,7 +308,8 @@ void irq_enter(void)
>   int cpu = smp_processor_id();
>  
>   rcu_irq_enter();
> - if (is_idle_task(current) && !in_interrupt()) {
> +
> + if ((is_idle_task(current) || cpuset_adaptive_nohz()) && !in_interrupt()) {
>   /*
>* Prevent raise_softirq from needlessly waking up ksoftirqd
>* here, as softirq will be serviced on return from interrupt.
> @@ -349,7 +351,7 @@ void irq_exit(void)
>  
>  #ifdef CONFIG_NO_HZ
>   /* Make sure that timer wheel updates are propagated */
> - if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
> + if (!in_interrupt())
>   tick_nohz_irq_exit();
>  #endif
>   rcu_irq_exit();
> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
> index c7a78c6..35047b2 100644
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -512,6 +512,24 @@ void tick_nohz_idle_enter(void)
>   local_irq_enable();
>  }
>  
> +static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts)
> +{
> +#ifdef CONFIG_CPUSETS_NO_HZ
> + int cpu = smp_processor_id();
> +
> + if (!cpuset_adaptive_nohz() || is_idle_task(current))
> + return;

The above is most likely true. Let's remove the memory barrier in
cpuset_adaptive_nohz() and just add an explicit one here, in the slow path.

/* Before checking the below conditions, we must first
 * make sure that the cpuset/nohz is active, so we do
 * not miss a deactivating IPI.
 * I.e. when nr_running becomes 2, an IPI is sent, and this
 * code must see the nr_running change after testing
 * whether the current CPU is adaptive nohz.
 */
smp_mb();
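
Putting it together, the helper would then read something like this
(untested sketch, assuming the implicit barrier is dropped from
cpuset_adaptive_nohz()):

	static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts)
	{
	#ifdef CONFIG_CPUSETS_NO_HZ
		int cpu = smp_processor_id();

		if (!cpuset_adaptive_nohz() || is_idle_task(current))
			return;

		/*
		 * Pair with the ordering on the enqueue/IPI side so the
		 * nr_running update cannot be missed below.
		 */
		smp_mb();

		if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
			return;

		if (!sched_can_stop_tick())
			return;

		tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
	#endif
	}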

> +
> + if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
> + return;
> +
> + if (!sched_can_stop_tick())
> + return;
> +
> + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
> +#endif
> +}
> +
>  /**
>   * tick_nohz_irq_exit - update next tick event from interrupt exit
>   *
> @@ -524,10 +542,12 @@ void tick_nohz_irq_exit(void)
>  {
>   struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
>  
> - if (!ts->inidle)
> - return;
> -
> - __tick_nohz_idle_enter(ts);
> + if (ts->inidle) {
> + if (!need_resched())
> + __tick_nohz_idle_enter(ts);
> + } else {
> + tick_nohz_cpuset_stop_tick(ts);
> + }
>  }
>  
>  /**
> @@ -568,7 +588,7 @@ static void tick_nohz_restart(struct

[PATCH 05/32] nohz: Adaptive tick stop and restart on nohz cpuset

2012-10-29 Thread Steven Rostedt
From: Frederic Weisbecker 

When a CPU is included in a nohz cpuset, try to switch
it to nohz mode from the interrupt exit path if it is running
a single non-idle task.

Then restart the tick if necessary when we enqueue a
second task while the timer is stopped, so that the scheduler
tick is rearmed.

[TODO: Handle the many things done from scheduler_tick()]

[ Included build fix from Geoff Levand ]

Signed-off-by: Frederic Weisbecker 
Cc: Alessio Igor Bogani 
Cc: Andrew Morton 
Cc: Avi Kivity 
Cc: Chris Metcalf 
Cc: Christoph Lameter 
Cc: Daniel Lezcano 
Cc: Geoff Levand 
Cc: Gilad Ben Yossef 
Cc: Hakan Akkan 
Cc: Ingo Molnar 
Cc: Kevin Hilman 
Cc: Max Krasnyansky 
Cc: Paul E. McKenney 
Cc: Peter Zijlstra 
Cc: Stephen Hemminger 
Cc: Steven Rostedt 
Cc: Sven-Thorsten Dietrich 
Cc: Thomas Gleixner 
---
 arch/x86/kernel/smp.c|2 ++
 include/linux/sched.h|6 
 include/linux/tick.h |   11 +-
 init/Kconfig |2 +-
 kernel/sched/core.c  |   24 +
 kernel/sched/sched.h |   12 +++
 kernel/softirq.c |6 ++--
 kernel/time/tick-sched.c |   86 +-
 8 files changed, 137 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 4c0b7d2..0bad72d 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -23,6 +23,7 @@
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
 #include <linux/gfp.h>
+#include <linux/tick.h>
 
 #include <asm/mtrr.h>
 #include <asm/tlbflush.h>
@@ -275,6 +276,7 @@ void smp_cpuset_update_nohz_interrupt(struct pt_regs *regs)
 {
ack_APIC_irq();
irq_enter();
+   tick_nohz_check_adaptive();
inc_irq_stat(irq_call_count);
irq_exit();
 }
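
The handler defers the real work to tick_nohz_check_adaptive(), which is
added in the kernel/time/tick-sched.c part of this patch. A rough sketch of
what that check amounts to (not the exact hunk; it only uses helpers this
patch declares):

	void tick_nohz_check_adaptive(void)
	{
		struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

		/*
		 * If the tick is stopped for a non-idle task but the
		 * scheduler needs it again (e.g. a second task was just
		 * enqueued), restart it.
		 */
		if (ts->tick_stopped && !is_idle_task(current)) {
			if (!sched_can_stop_tick())
				tick_nohz_restart_sched_tick();
		}
	}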
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a0..749752e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2753,6 +2753,12 @@ static inline void inc_syscw(struct task_struct *tsk)
 #define TASK_SIZE_OF(tsk)  TASK_SIZE
 #endif
 
+#ifdef CONFIG_CPUSETS_NO_HZ
+extern bool sched_can_stop_tick(void);
+#else
+static inline bool sched_can_stop_tick(void) { return false; }
+#endif
+
 #ifdef CONFIG_MM_OWNER
 extern void mm_update_next_owner(struct mm_struct *mm);
 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/tick.h b/include/linux/tick.h
index f37fceb..9b66fd3 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -124,11 +124,12 @@ static inline int tick_oneshot_mode_active(void) { return 0; }
 # ifdef CONFIG_NO_HZ
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
+extern void tick_nohz_restart_sched_tick(void);
 extern void tick_nohz_irq_exit(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
-# else
+# else /* !NO_HZ */
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
 
@@ -142,4 +143,12 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
 # endif /* !NO_HZ */
 
+#ifdef CONFIG_CPUSETS_NO_HZ
+extern void tick_nohz_check_adaptive(void);
+extern void tick_nohz_post_schedule(void);
+#else /* !CPUSETS_NO_HZ */
+static inline void tick_nohz_check_adaptive(void) { }
+static inline void tick_nohz_post_schedule(void) { }
+#endif /* CPUSETS_NO_HZ */
+
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index ffdeeab..418e078 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -751,7 +751,7 @@ config PROC_PID_CPUSET
 
 config CPUSETS_NO_HZ
bool "Tickless cpusets"
-   depends on CPUSETS && HAVE_CPUSETS_NO_HZ
+   depends on CPUSETS && HAVE_CPUSETS_NO_HZ && NO_HZ && HIGH_RES_TIMERS
help
  This options let you apply a nohz property to a cpuset such
 that the periodic timer tick tries to be avoided when possible on
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927f..2716b79 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1196,6 +1196,29 @@ static void update_avg(u64 *avg, u64 sample)
 }
 #endif
 
+#ifdef CONFIG_CPUSETS_NO_HZ
+bool sched_can_stop_tick(void)
+{
+   struct rq *rq;
+
+   rq = this_rq();
+
+   /*
+* This is called right after cpuset_adaptive_nohz() that
+* uses atomic_add_return() so that we are ordered against
+* cpu_adaptive_nohz_ref. When inc_nr_running() sends an
+* IPI to this CPU, we are guaranteed to see the update on
+* nr_running.
+*/
+
+   /* More than one running task need preemption */
+   if (rq->nr_running > 1)
+   return false;
+
+   return true;
+}
+#endif
+
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
@@ -1897,6 +1920,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 * frame will be invalid.
 */
