Re: [PATCH v2 05/12] sched/fair: Optimize __update_sched_avg()

2016-05-03 Thread Peter Zijlstra
On Tue, May 03, 2016 at 05:54:31AM +0800, Yuyang Du wrote:
> __update_sched_avg() has these steps:
> 1. add what is left of the last incomplete period
> 2. decay old sum
> 3. accumulate new sum since last_update_time
> 4. add the current incomplete period
> 5. update averages
> 
> Previously, we computed steps 1, 3, and 4 separately, which made each
> of them ugly in code and costly in overhead. But they actually all do
> the same thing, so we combine them. The result is much cleaner code
> and fewer CPU cycles.

I would very much like to see an explanation of how we can fold all that
here, without me having to untangle the code first.

That would also help me verify that the code does indeed implement what
you meant it to, etc.




[PATCH v2 05/12] sched/fair: Optimize __update_sched_avg()

2016-05-02 Thread Yuyang Du
__update_sched_avg() has these steps:
1. add what is left of the last incomplete period
2. decay old sum
3. accumulate new sum since last_update_time
4. add the current incomplete period
5. update averages

Previously, we computed steps 1, 3, and 4 separately, which made each
of them ugly in code and costly in overhead. But they actually all do
the same thing, so we combine them. The result is much cleaner code
and fewer CPU cycles.
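
With p full periods crossed since last_update_time, steps 1, 3, and 4
collapse into a single geometric series: d1*y^p + 1024*\Sum_{n=1..p-1} y^n
+ d3, where d1 is what was left of the old incomplete period and d3 is the
time in the new incomplete period; step 2 simply decays the old *_sum by
y^p before this contribution is added. Below is a rough standalone sketch
of the folded computation (illustrative only: decay(), full_periods_sum()
and folded_contrib() are made-up names, and it uses floating point where
the kernel uses fixed-point lookup tables):

/*
 * Rough sketch, not the kernel implementation: floating point stands in
 * for the fixed-point decay/sum tables.
 */
#include <stdio.h>
#include <math.h>

#define HALFLIFE 32	/* periods per half-life, i.e. y^32 == 1/2 */

/* val * y^n */
static double decay(double val, unsigned int n)
{
	return val * pow(0.5, (double)n / HALFLIFE);
}

/* \Sum_{k=1..n} 1024 * y^k: contribution of n full periods */
static double full_periods_sum(unsigned int n)
{
	double sum = 0.0;

	for (unsigned int k = 1; k <= n; k++)
		sum += decay(1024.0, k);
	return sum;
}

/*
 * periods:        full period boundaries crossed since last_update_time
 * period_contrib: part of the old period that was already accounted
 * remainder:      the new period_contrib (time into the now-incomplete period)
 */
static double folded_contrib(unsigned int periods, unsigned int period_contrib,
			     unsigned int remainder)
{
	if (!periods)					/* still the same period */
		return remainder - period_contrib;

	return decay(1024 - period_contrib, periods)	/* step 1, decayed */
		+ full_periods_sum(periods - 1)		/* step 3 */
		+ remainder;				/* step 4 */
}

int main(void)
{
	/* e.g. 3 periods crossed, 200us already accounted, 300us into new period */
	printf("%.1f\n", folded_contrib(3, 200, 300));
	return 0;
}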

Signed-off-by: Yuyang Du 
---
 kernel/sched/fair.c |  185 +--
 1 file changed, 92 insertions(+), 93 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1655280..ea99c2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -668,7 +668,7 @@ static unsigned long task_h_load(struct task_struct *p);
  */
 #define SCHED_AVG_HALFLIFE 32  /* number of periods as a half-life */
 #define SCHED_AVG_MAX 47742/* maximum possible sched avg */
-#define SCHED_AVG_MAX_N 345	/* number of full periods to produce SCHED_AVG_MAX */
+#define SCHED_AVG_MAX_N 347	/* number of full periods to produce SCHED_AVG_MAX */
 
 /* Give new sched_entity start runnable values to heavy its load in infant time */
 void init_entity_runnable_average(struct sched_entity *se)
@@ -2606,7 +2606,7 @@ static const u32 __accumulated_sum_N[] = {
 
 /*
  * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- * lower integers.
+ * lower integers. Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11
  */
 static const u32 __accumulated_sum_N32[] = {
0, 23371, 35056, 40899, 43820, 45281,
@@ -2649,20 +2649,31 @@ static __always_inline u64 __decay_sum(u64 val, u32 n)
  * We can compute this efficiently by combining:
  * y^32 = 1/2 with precomputed \Sum 1024*y^n   (where n < 32)
  */
-static __always_inline u32 __accumulate_sum(u32 n)
+static __always_inline u32
+__accumulate_sum(u32 periods, u32 period_contrib, u32 remainder)
 {
-   u32 contrib = 0;
+   u32 contrib;
 
-   if (likely(n <= SCHED_AVG_HALFLIFE))
-   return __accumulated_sum_N[n];
-   else if (unlikely(n >= SCHED_AVG_MAX_N))
+   if (!periods)
+   return remainder - period_contrib;
+
+   if (unlikely(periods >= SCHED_AVG_MAX_N))
return SCHED_AVG_MAX;
 
-   /* Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11 */
-   contrib = __accumulated_sum_N32[n/SCHED_AVG_HALFLIFE];
-   n %= SCHED_AVG_HALFLIFE;
-   contrib = __decay_sum(contrib, n);
-   return contrib + __accumulated_sum_N[n];
+   remainder += __decay_sum((u64)(1024 - period_contrib), periods);
+
+   periods -= 1;
+   if (likely(periods <= SCHED_AVG_HALFLIFE))
+   contrib = __accumulated_sum_N[periods];
+   else {
+   /*(periods>>5) = (periods/SCHED_AVG_HALFLIFE) */
+   contrib = __accumulated_sum_N32[periods/SCHED_AVG_HALFLIFE];
+   periods %= SCHED_AVG_HALFLIFE;
+   contrib = __decay_sum(contrib, periods);
+   contrib += __accumulated_sum_N[periods];
+   }
+
+   return contrib + remainder;
 }
 
 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
@@ -2671,6 +2682,55 @@ static __always_inline u32 __accumulate_sum(u32 n)
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
+static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
+   struct cfs_rq *cfs_rq, int cpu, unsigned long weight, int running)
+{
+   u32 contrib, periods;
+   unsigned long scale_freq, scale_cpu;
+
+   scale_freq = arch_scale_freq_capacity(NULL, cpu);
+   scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+   delta += sa->period_contrib;
+   periods = delta >> 10; /* A period is 1024us (~1ms) */
+
+   /*
+* Accumulating *_sum has two steps.
+*
+* Step 1: decay old *_sum if we crossed period boundaries.
+*/
+   if (periods) {
+   sa->load_sum = __decay_sum(sa->load_sum, periods);
+   if (cfs_rq) {
+   cfs_rq->runnable_load_sum =
+   __decay_sum(cfs_rq->runnable_load_sum, periods);
+   }
+   sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
+   }
+
+   /*
+* Step 2: accumulate new *_sum since last_update_time. This at most has
+* three parts (at least one part): (1) remainder of incomplete last
+* period, (2) full periods since (1), and (3) incomplete current period.
+*
+* Fortunately, we can (and should) do all these three at once.
+*/
+   delta %= 1024;
+   contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+   sa->period_contrib = delta;
+
+   contrib = cap_scale(contrib, scale_freq);
+   if (weight) {
+   sa->load_sum += weight * contrib;
+   if (cfs_rq)
+ 
