util_avg is currently updated along the task group hierarchy, but the
util_avgs of all group entities and cfs_rqs other than the top cfs_rq
are never used, so those updates are wasted work. More importantly, the
top cfs_rq's util_avg does not promptly reflect the migration of a group
task, because the task's util_avg is accounted to its direct cfs_rq and
only slowly trickles through the hierarchy to the top cfs_rq.

This patch therefore proposes a flat hierarchy for util_avg: all cfs
tasks attached to a rq are treated as one flat pool, disregarding their
task group hierarchy, so the rq's util is simply the sum of the utils of
all its cfs tasks.
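
As a rough illustration (a standalone userspace sketch with made-up toy
types, not the kernel implementation; decay is ignored), the invariant
being maintained is that task util attaches to and detaches from the rq
directly, with nothing maintained per group entity or group cfs_rq:

  #include <stdio.h>

  struct toy_task { unsigned long util_avg; };
  struct toy_rq   { unsigned long util_avg; };

  static void toy_attach(struct toy_rq *rq, struct toy_task *p)
  {
          rq->util_avg += p->util_avg;    /* task arrives on this rq */
  }

  static void toy_detach(struct toy_rq *rq, struct toy_task *p)
  {
          rq->util_avg -= p->util_avg;    /* task migrates away / exits */
  }

  int main(void)
  {
          struct toy_rq rq = { 0 };
          struct toy_task a = { 300 }, b = { 200 };

          toy_attach(&rq, &a);
          toy_attach(&rq, &b);
          printf("rq util = %lu\n", rq.util_avg);  /* 500 */
          toy_detach(&rq, &a);
          printf("rq util = %lu\n", rq.util_avg);  /* 200 */
          return 0;
  }

So rq->util_avg tracks the (decayed) sum of the utils of the cfs tasks
currently accounted to that rq, regardless of their task groups.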

Regarding overhead, the rq's util updates may be more costly because
they can be very frequent, but we no longer update any group entity's
util, so the net overhead should roughly even out.

Signed-off-by: Yuyang Du <[email protected]>
---
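Note (illustration only, not part of the patch): the new-task util_avg
seeding in post_init_entity_sched_avg() below gives each new task half
of the remaining utilization budget, so the rq util converges towards
SCHED_CAPACITY_SCALE. A standalone sketch reproducing the series quoted
in the patch's comment (decay ignored):

  #include <stdio.h>

  #define SCHED_CAPACITY_SCALE 1024UL

  int main(void)
  {
          unsigned long rq_util = 0;
          int n;

          for (n = 1; n <= 7; n++) {
                  long cap = (long)(SCHED_CAPACITY_SCALE - rq_util) / 2;

                  if (cap <= 0)
                          break;
                  rq_util += cap;
                  /* task: 512, 256, 128,  64,  32,   16,    8 */
                  /*   rq: 512, 768, 896, 960, 992, 1008, 1016 */
                  printf("task %d: util_avg=%ld, rq util_avg=%lu\n",
                         n, cap, rq_util);
          }
          return 0;
  }
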
 kernel/sched/core.c  |    1 +
 kernel/sched/debug.c |   13 +++--
 kernel/sched/fair.c  |  140 +++++++++++++++++++++++++++++++-------------------
 kernel/sched/sched.h |    5 +-
 4 files changed, 101 insertions(+), 58 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 91fe97f9..d215d04 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7475,6 +7475,7 @@ void __init sched_init(void)
 #ifdef CONFIG_NO_HZ_FULL
                rq->last_sched_tick = 0;
 #endif
+               atomic_long_set(&rq->removed_util_avg, 0);
 #endif /* CONFIG_SMP */
                init_rq_hrtick(rq);
                atomic_set(&rq->nr_iowait, 0);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a0a999..14dc121 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -395,7 +395,6 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
        P(se->load.weight);
 #ifdef CONFIG_SMP
        P(se->avg.load_avg);
-       P(se->avg.util_avg);
 #endif
 #undef PN
 #undef P
@@ -462,6 +461,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
                print_task(m, rq, p);
        }
        rcu_read_unlock();
+
+#ifdef CONFIG_SMP
+       SEQ_printf(m, "\nutilization: \n");
+       SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
+                       rq->avg.util_avg);
+       SEQ_printf(m, "  .%-30s: %ld\n", "removed_util_avg",
+                       atomic_long_read(&rq->removed_util_avg));
+#endif
 }
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -510,12 +517,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
                        cfs_rq->avg.load_avg);
        SEQ_printf(m, "  .%-30s: %lu\n", "runnable_load_avg",
                        cfs_rq->runnable_load_avg);
-       SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
-                       cfs_rq->avg.util_avg);
        SEQ_printf(m, "  .%-30s: %ld\n", "removed_load_avg",
                        atomic_long_read(&cfs_rq->removed_load_avg));
-       SEQ_printf(m, "  .%-30s: %ld\n", "removed_util_avg",
-                       atomic_long_read(&cfs_rq->removed_util_avg));
 #ifdef CONFIG_FAIR_GROUP_SCHED
        SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",
                        cfs_rq->tg_load_avg_contrib);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1d4640c..b514129 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -703,47 +703,30 @@ static void attach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *
 
 /*
  * With new tasks being created, their initial util_avgs are extrapolated
- * based on the cfs_rq's current util_avg:
+ * based on the rq's current util_avg. To make the util_avg converge, we
+ * cap the util_avg of successive tasks to only 1/2 of the left utilization
+ * budget:
  *
- *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
- *
- * However, in many cases, the above util_avg does not give a desired
- * value. Moreover, the sum of the util_avgs may be divergent, such
- * as when the series is a harmonic series.
- *
- * To solve this problem, we also cap the util_avg of successive tasks to
- * only 1/2 of the left utilization budget:
- *
- *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *   util_avg = (1024 - rq->avg.util_avg) / 2^n
  *
  * where n denotes the nth task.
  *
  * For example, a simplest series from the beginning would be like:
  *
- *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
- * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
- *
- * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
- * if util_avg > util_avg_cap.
+ *  task util_avg: 512, 256, 128,  64,  32,   16,    8, ...
+ *    rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
  */
 void post_init_entity_sched_avg(struct sched_entity *se)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       struct rq *rq = rq_of(cfs_rq);
        struct sched_avg *sa = &se->avg;
-       long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+       long cap = (long)(SCHED_CAPACITY_SCALE - rq->avg.util_avg) / 2;
        u64 now = cfs_rq_clock_task(cfs_rq);
        int tg_update;
 
        if (cap > 0) {
-               if (cfs_rq->avg.util_avg != 0) {
-                       sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
-                       sa->util_avg /= (cfs_rq->avg.load_avg + 1);
-
-                       if (sa->util_avg > cap)
-                               sa->util_avg = cap;
-               } else {
-                       sa->util_avg = cap;
-               }
+               sa->util_avg = cap;
                sa->util_sum = sa->util_avg * SCHED_AVG_MAX;
        }
 
@@ -2676,8 +2659,45 @@ __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
-static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
-       struct cfs_rq *cfs_rq, int cpu, unsigned long weight, int running)
+static __always_inline void
+__update_rq_util_avg(struct rq *rq, unsigned long scale_freq, unsigned long scale_cpu)
+{
+       u32 contrib;
+       struct sched_avg *sa = &rq->avg;
+       u64 delta, periods, now = rq_clock_task(rq);
+
+       /*
+        * delta is in ~us (ns >> 10); periods below is in ~ms (us >> 10).
+        */
+       delta = (now - sa->last_update_time) >> 10;
+       if (!delta)
+               return;
+       sa->last_update_time = now;
+
+       delta += sa->period_contrib;
+       periods = delta >> 10;
+
+       /* Step 1: decay */
+       if (periods)
+               sa->util_sum = __decay_sum(sa->util_sum, periods);
+
+       /* Step 2: accumulate */
+       delta %= 1024;
+       contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+       sa->period_contrib = delta;
+
+       contrib = cap_scale(contrib, scale_freq);
+       if (rq->cfs.curr != NULL) /* new running */
+               sa->util_sum += contrib * scale_cpu;
+
+       /* Step 3: update avg */
+       if (periods)
+               sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
+}
+
+static __always_inline u32
+accumulate_sum(u64 delta, struct sched_avg *sa, struct cfs_rq *cfs_rq, int cpu,
+              unsigned long weight, int running, int update_util)
 {
        u32 contrib;
        u64 periods;
@@ -2696,11 +2716,11 @@ static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
         */
        if (periods) {
                sa->load_sum = __decay_sum(sa->load_sum, periods);
-               if (cfs_rq) {
+               if (cfs_rq)
                        cfs_rq->runnable_load_sum =
                                __decay_sum(cfs_rq->runnable_load_sum, periods);
-               }
-               sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
+               else if (update_util)
+                       sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
        }
 
        /*
@@ -2720,9 +2740,12 @@ static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
                if (cfs_rq)
                        cfs_rq->runnable_load_sum += weight * contrib;
        }
-       if (running)
+       if (running && update_util)
                sa->util_sum += contrib * scale_cpu;
 
+       if (cfs_rq)
+               __update_rq_util_avg(rq_of(cfs_rq), scale_freq, scale_cpu);
+
        return periods;
 }
 
@@ -2759,6 +2782,7 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
                   unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
        u64 delta;
+       int update_util = 0;
 
        delta = now - sa->last_update_time;
        /*
@@ -2780,24 +2804,30 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
        sa->last_update_time = now;
 
        /*
+        * We update util_sum together with load_sum iff it is a task
+        */
+       if (!cfs_rq && entity_is_task(container_of(sa, struct sched_entity, avg)))
+               update_util = 1;
+
+       /*
         * Now we know we crossed measurement unit boundaries. The *_avg
         * accrues by two steps:
         *
         * Step 1: accumulate *_sum since last_update_time. If we haven't
         * crossed period boundaries, finish.
         */
-       if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running))
+       if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running, update_util))
                return 0;
 
        /*
         * Step 2: update *_avg.
         */
        sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
-       if (cfs_rq) {
+       if (cfs_rq)
                cfs_rq->runnable_load_avg =
                        div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
-       }
-       sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
+       else if (update_util)
+               sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
 
        return 1;
 }
@@ -2897,8 +2927,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
                 *
                 * See cpu_util().
                 */
-               cpufreq_update_util(rq_clock(rq),
-                                   min(cfs_rq->avg.util_avg, max), max);
+               cpufreq_update_util(rq_clock(rq), min(rq->avg.util_avg, max), max);
        }
 }
 
@@ -2941,18 +2970,21 @@ update_cfs_rq_sched_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 {
        struct sched_avg *sa = &cfs_rq->avg;
        int decayed, removed_load = 0, removed_util = 0;
+       struct rq *rq = rq_of(cfs_rq);
 
        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+
                sub_positive(&sa->load_avg, r);
                sub_positive(&sa->load_sum, r * SCHED_AVG_MAX);
                removed_load = 1;
        }
 
-       if (atomic_long_read(&cfs_rq->removed_util_avg)) {
-               long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-               sub_positive(&sa->util_avg, r);
-               sub_positive(&sa->util_sum, r * SCHED_AVG_MAX);
+       if (atomic_long_read(&rq->removed_util_avg)) {
+               long r = atomic_long_xchg(&rq->removed_util_avg, 0);
+
+               sub_positive(&rq->avg.util_avg, r);
+               sub_positive(&rq->avg.util_sum, r * SCHED_AVG_MAX);
                removed_util = 1;
        }
 
@@ -2999,6 +3031,8 @@ static inline void update_sched_avg(struct sched_entity *se, int update_tg)
  */
 static void attach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+       struct rq *rq = rq_of(cfs_rq);
+
        if (!sched_feat(ATTACH_AGE_LOAD))
                goto skip_aging;
 
@@ -3022,8 +3056,8 @@ skip_aging:
        se->avg.last_update_time = cfs_rq->avg.last_update_time;
        cfs_rq->avg.load_avg += se->avg.load_avg;
        cfs_rq->avg.load_sum += se->avg.load_sum;
-       cfs_rq->avg.util_avg += se->avg.util_avg;
-       cfs_rq->avg.util_sum += se->avg.util_sum;
+       rq->avg.util_avg += se->avg.util_avg;
+       rq->avg.util_sum += se->avg.util_sum;
 
        cfs_rq_util_change(cfs_rq);
 }
@@ -3038,14 +3072,16 @@ skip_aging:
  */
 static void detach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+       struct rq *rq = rq_of(cfs_rq);
+
        __update_sched_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
                           &se->avg, se->on_rq * se->load.weight,
                           cfs_rq->curr == se, NULL);
 
        sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
        sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
-       sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
-       sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+       sub_positive(&rq->avg.util_avg, se->avg.util_avg);
+       sub_positive(&rq->avg.util_sum, se->avg.util_sum);
 
        cfs_rq_util_change(cfs_rq);
 }
@@ -3117,6 +3153,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 static void remove_entity_sched_avg(struct sched_entity *se)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       struct rq *rq = rq_of(cfs_rq);
        u64 last_update_time;
 
        /*
@@ -3134,7 +3171,7 @@ static void remove_entity_sched_avg(struct sched_entity *se)
        __update_sched_avg(last_update_time, cpu_of(rq_of(cfs_rq)),
                           &se->avg, 0, 0, NULL);
        atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
-       atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+       atomic_long_add(se->avg.util_avg, &rq->removed_util_avg);
 }
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -5332,7 +5369,7 @@ done:
  * compare the utilization with the capacity of the CPU that is available for
  * CFS task (ie cpu_capacity).
  *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * rq->avg.util_avg is the sum of running time of runnable tasks plus the
  * recent utilization of currently non-runnable tasks on a CPU. It represents
  * the amount of utilization of a CPU in the range [0..capacity_orig] where
  * capacity_orig is the cpu_capacity available at the highest frequency
@@ -5341,9 +5378,9 @@ done:
  * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
  * the running time on this CPU scaled by capacity_curr.
  *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * Nevertheless, rq->avg.util_avg can be higher than capacity_curr or even
  * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * rq->avg.util_avg or just after migrating tasks and new task wakeups until
  * the average stabilizes with the new running time. We need to check that the
  * utilization stays within the range of [0..capacity_orig] and cap it if
  * necessary. Without utilization capping, a group could be seen as overloaded
@@ -5354,7 +5391,7 @@ done:
  */
 static int cpu_util(int cpu)
 {
-       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+       unsigned long util = cpu_rq(cpu)->avg.util_avg;
        unsigned long capacity = capacity_orig_of(cpu);
 
        return (util >= capacity) ? capacity : util;
@@ -8533,7 +8570,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #endif
 #ifdef CONFIG_SMP
        atomic_long_set(&cfs_rq->removed_load_avg, 0);
-       atomic_long_set(&cfs_rq->removed_util_avg, 0);
 #endif
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4723d4a..f6785c1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -398,7 +398,7 @@ struct cfs_rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
        unsigned long tg_load_avg_contrib;
 #endif
-       atomic_long_t removed_load_avg, removed_util_avg;
+       atomic_long_t removed_load_avg;
 #ifndef CONFIG_64BIT
        u64 load_last_update_time_copy;
 #endif
@@ -662,6 +662,9 @@ struct rq {
 
        /* This is used to determine avg_idle's max value */
        u64 max_idle_balance_cost;
+
+       struct sched_avg avg;
+       atomic_long_t removed_util_avg;
 #endif
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-- 
1.7.9.5
