Re: [PATCH v2 1/3] sched: introduce distinct per-cpu load average

2012-10-22 Thread Andrea Righi
On Mon, Oct 22, 2012 at 01:10:40PM +0200, Peter Zijlstra wrote:
> On Sat, 2012-10-20 at 21:06 +0200, Andrea Righi wrote:
> > @@ -383,13 +383,7 @@ struct rq {
> > struct list_head leaf_rt_rq_list;
> >  #endif
> >  
> 
> > +   unsigned long __percpu *nr_uninterruptible; 
> 
> This is O(nr_cpus^2) memory..
> 

Correct. This doesn't add much overhead to the wakeup/sleep path, but on
large SMP systems it's bad both in terms of memory footprint and in terms
of the overhead it adds to the other parts of the code.
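
As an illustration, here is a minimal user-space model -- not kernel code,
and NR_CPUS=4096 is only an assumed example value -- of why the layout is
O(nr_cpus^2): every runqueue carries one counter slot per possible cpu, and
the global nr_uninterruptible() sums a per-cpu sum for every cpu.

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4096	/* assumed example of a large SMP box */

/* models a runqueue carrying an "unsigned long __percpu *" counter */
struct rq_model {
	long *nr_uninterruptible;	/* one slot per possible cpu */
};

int main(void)
{
	static struct rq_model rq[NR_CPUS];
	size_t bytes = 0;
	long total = 0;
	int cpu, i;

	/* one percpu allocation per runqueue -> NR_CPUS * NR_CPUS slots */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		rq[cpu].nr_uninterruptible = calloc(NR_CPUS, sizeof(long));
		if (!rq[cpu].nr_uninterruptible)
			return 1;
		bytes += NR_CPUS * sizeof(long);
	}
	printf("counter storage: %zu MiB\n", bytes >> 20);

	/* nr_uninterruptible(): a sum over cpus of a per-cpu sum -> O(n^2) */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (i = 0; i < NR_CPUS; i++)
			total += rq[cpu].nr_uninterruptible[i];
	printf("global nr_uninterruptible: %ld\n", total);
	return 0;
}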

> 
> > +unsigned long nr_uninterruptible_cpu(int cpu)
> > +{
> > +   struct rq *this = cpu_rq(cpu);
> > +   unsigned long val = 0;
> > +   int i;
> > +
> > +   for_each_online_cpu(i)
> > +   val += per_cpu(*this->nr_uninterruptible, i);
> > +
> > +   return val;
> > +}
> > 
> > 
> I suspect you've got an accounting leak here on hot-plug.

And I think you're right about the accounting leak with cpu hotplug.
I'll run more tests on this part and try to come up with a better
approach to the nr_uninterruptible accounting in general.

Thanks!
-Andrea

> > 
> >  unsigned long nr_uninterruptible(void)
> >  {
> > unsigned long i, sum = 0;
> >  
> > for_each_possible_cpu(i)
> > -   sum += cpu_rq(i)->nr_uninterruptible;
> > +   sum += nr_uninterruptible_cpu(i);
> >  
> > /*
> >  * Since we read the counters lockless, it might be slightly
> 
> And this makes O(n^2) runtime!
> 


Re: [PATCH v2 1/3] sched: introduce distinct per-cpu load average

2012-10-22 Thread Peter Zijlstra
On Sat, 2012-10-20 at 21:06 +0200, Andrea Righi wrote:
> @@ -383,13 +383,7 @@ struct rq {
> struct list_head leaf_rt_rq_list;
>  #endif
>  

> +   unsigned long __percpu *nr_uninterruptible; 

This is O(nr_cpus^2) memory..


> +unsigned long nr_uninterruptible_cpu(int cpu)
> +{
> +   struct rq *this = cpu_rq(cpu);
> +   unsigned long val = 0;
> +   int i;
> +
> +   for_each_online_cpu(i)
> +   val += per_cpu(*this->nr_uninterruptible, i);
> +
> +   return val;
> +}
> 
> 
I suspect you've got an accounting leak here on hot-plug.
> 
>  unsigned long nr_uninterruptible(void)
>  {
> unsigned long i, sum = 0;
>  
> for_each_possible_cpu(i)
> -   sum += cpu_rq(i)->nr_uninterruptible;
> +   sum += nr_uninterruptible_cpu(i);
>  
> /*
>  * Since we read the counters lockless, it might be slightly

And this makes O(n^2) runtime!


[PATCH v2 1/3] sched: introduce distinct per-cpu load average

2012-10-20 Thread Andrea Righi
Account load average, nr_running and nr_uninterruptible tasks per-cpu.

The new task_struct attribute on_cpu_uninterruptible is added to keep
track of the cpu on which a task is deactivated when it is put into the
uninterruptible sleep state.

Moreover, rq->nr_uninterruptible is converted to a percpu variable to
maintain a coherent nr_uninterruptible counter for each CPU (rather than
having a single global counter defined as the sum over all CPUs). This
adds less performance overhead than introducing atomic operations in the
wakeup/sleep path.
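
As a rough illustration of that trade-off, the sketch below is a plain
user-space model (C11; NCPU and all names are made up, it is not kernel
code): a shared counter needs an atomic read-modify-write on every
sleep/wakeup, while the percpu scheme turns the hot path into a plain
increment of a cpu-local slot and defers the cost to readers, which have
to sum the slots.

#include <stdatomic.h>
#include <stdio.h>

#define NCPU 8				/* assumed example value */

static atomic_long shared_nr_unint;	/* alternative: one shared atomic counter */
static long percpu_nr_unint[NCPU];	/* modeled percpu scheme: one slot per cpu */

static void task_sleeps(int cpu)
{
	/* shared counter: atomic RMW on every sleep/wakeup, the cacheline
	 * can bounce between all cpus */
	atomic_fetch_add(&shared_nr_unint, 1);

	/* percpu counter: plain local increment, no atomics on the hot path */
	percpu_nr_unint[cpu]++;
}

int main(void)
{
	long sum = 0;
	int i;

	task_sleeps(2);
	task_sleeps(5);

	/* the percpu scheme pays at read time: sum all the slots */
	for (i = 0; i < NCPU; i++)
		sum += percpu_nr_unint[i];

	printf("shared=%ld percpu-sum=%ld\n",
	       (long)atomic_load(&shared_nr_unint), sum);
	return 0;
}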

This feature is required by the cpusets cgroup subsystem to report the
load average per-cpuset.

Signed-off-by: Andrea Righi <and...@betterlinux.com>
---
 include/linux/sched.h |6 +++
 kernel/sched/core.c   |  112 ++---
 kernel/sched/debug.c  |3 +-
 kernel/sched/sched.h  |8 +---
 4 files changed, 105 insertions(+), 24 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a0..e5dfe2a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -80,6 +80,8 @@ struct blk_plug;
  */
 extern unsigned long avenrun[];/* Load averages */
 extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
+extern void get_cpu_avenrun(unsigned long *loads, int cpu,
+   unsigned long offset, int shift);
 
 #define FSHIFT 11  /* nr of bits of precision */
 #define FIXED_1 (1 << FSHIFT) /* 1.0 as fixed-point */
@@ -98,7 +100,9 @@ extern int nr_threads;
 DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
+extern unsigned long nr_running_cpu(int cpu);
 extern unsigned long nr_uninterruptible(void);
+extern unsigned long nr_uninterruptible_cpu(int cpu);
 extern unsigned long nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
 extern unsigned long this_cpu_load(void);
@@ -1197,6 +1201,8 @@ struct task_struct {
 #ifdef CONFIG_SMP
 struct llist_node wake_entry;
 int on_cpu;
+   /* Used to keep track of nr_uninterruptible tasks per-cpu */
+   int on_cpu_uninterruptible;
 #endif
 int on_rq;

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927f..a1487ee 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -726,16 +726,20 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)

 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-   if (task_contributes_to_load(p))
-   rq->nr_uninterruptible--;
+   if (task_contributes_to_load(p)) {
+   struct rq *prev_rq = cpu_rq(p->on_cpu_uninterruptible);
+   __this_cpu_dec(*prev_rq->nr_uninterruptible);
+   }
 
enqueue_task(rq, p, flags);
 }
 
 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-   if (task_contributes_to_load(p))
-   rq->nr_uninterruptible++;
+   if (task_contributes_to_load(p)) {
+   __this_cpu_inc(*rq->nr_uninterruptible);
+   p->on_cpu_uninterruptible = cpu_of(rq);
+   }
 
dequeue_task(rq, p, flags);
 }
@@ -1277,8 +1281,10 @@ static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 #ifdef CONFIG_SMP
-   if (p->sched_contributes_to_load)
-   rq->nr_uninterruptible--;
+   if (p->sched_contributes_to_load) {
+   struct rq *prev_rq = cpu_rq(p->on_cpu_uninterruptible);
+   __this_cpu_dec(*prev_rq->nr_uninterruptible);
+   }
 #endif
 
ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
@@ -1916,12 +1922,17 @@ unsigned long nr_running(void)
return sum;
 }
 
+unsigned long nr_running_cpu(int cpu)
+{
+   return cpu_rq(cpu)->nr_running;
+}
+
 unsigned long nr_uninterruptible(void)
 {
unsigned long i, sum = 0;
 
for_each_possible_cpu(i)
-   sum += cpu_rq(i)->nr_uninterruptible;
+   sum += nr_uninterruptible_cpu(i);
 
/*
 * Since we read the counters lockless, it might be slightly
@@ -1933,6 +1944,18 @@ unsigned long nr_uninterruptible(void)
return sum;
 }
 
+unsigned long nr_uninterruptible_cpu(int cpu)
+{
+   struct rq *this = cpu_rq(cpu);
+   unsigned long val = 0;
+   int i;
+
+   for_each_online_cpu(i)
+   val += per_cpu(*this->nr_uninterruptible, i);
+
+   return val;
+}
+
 unsigned long long nr_context_switches(void)
 {
int i;
@@ -1980,7 +2003,8 @@ unsigned long this_cpu_load(void)
  *
  *   nr_active = 0;
  *   for_each_possible_cpu(cpu)
- * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ * nr_active += cpu_of(cpu)->nr_running +
+ *  (cpu_of(cpu)->nr_uninterruptible;
  *
  *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
  *
@@ -2004,13 +2028,6 @@ unsigned long this_cpu_load(void)
  *This places an upper-bound on the IRQ-off latency of the machine. Then
  *again, being late doesn't loose the delta, just wrecks the sample.
  *
- *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
- *this would add another cross-cpu cacheline miss and atomic operation
- *to the wakeup path. Instead we increment on whatever cpu the task ran
- *when it went into uninterruptible state and decrement on whatever cpu
- *did the wakeup. This means that only the sum of nr_uninterruptible over
- *all cpus yields the correct result.
- *
  *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
  */
 
@@ -2035,12 +2052,15 @@ void get_avenrun(unsigned long *loads, unsigned long 
offset, int shift)
loads[2] = (avenrun[2] + offset) << shift;
 }
 
+static DEFINE_PER_CPU(unsigned long [3], cpu_avenrun);
+
 static long calc_load_fold_active(struct rq *this_rq)
 {
long nr_active, delta = 0;
+   int cpu = cpu_of(this_rq);
 
nr_active = this_rq->nr_running;
-   nr_active += (long) this_rq->nr_uninterruptible;
+   
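
For reference, a hypothetical consumer of the new interface, modeled on
fs/proc/loadavg.c: the function name cpu_loadavg_show and its seq_file
plumbing are made up for this sketch, only get_cpu_avenrun() and
nr_running_cpu() come from the patch above.

#include <linux/seq_file.h>
#include <linux/sched.h>

#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

/* print one cpu's 1/5/15 minute load averages, /proc/loadavg style */
static int cpu_loadavg_show(struct seq_file *m, int cpu)
{
	unsigned long avnrun[3];

	/* same fixed-point rounding offset that /proc/loadavg uses */
	get_cpu_avenrun(avnrun, cpu, FIXED_1 / 200, 0);

	seq_printf(m, "cpu%d %lu.%02lu %lu.%02lu %lu.%02lu %lu\n", cpu,
		   LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
		   LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
		   LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
		   nr_running_cpu(cpu));
	return 0;
}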
