When a new cgroup is created, the scheduler attaches the child cgroup
to its parent and also increases the parent's task_group load_avg to
account for the increased load, via the following path:

  sched_create_group()
    alloc_fair_sched_group()
      sched_online_group()
        online_fair_sched_group()
          for_each_possible_cpu()
            post_init_entity_util_avg()
              update_tg_load_avg()

However, the parent's load_avg is shared by all CPUs, so it ends up
being increased once per possible CPU.  For example, with 8 possible
CPUs (even when only 1 CPU remains available after hotplugging the
others out), creating the empty cgroups /grp1 and /grp1/grp11 leaves
the task_groups' load_avg at 8192 (8 * 1024) and 1024 respectively,
whereas the desired load_avg for both cgroups is 1024, which is what
currently happens when booting with a single CPU.
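
For reference, the inflation comes from update_tg_load_avg(): once the
new group entity's initial load (1024) has been attached to the parent
cfs_rq, each per-CPU call folds that delta into the task_group's
shared load_avg.  A condensed, illustrative sketch of that
accumulation (not the exact kernel code; the ratelimiting check is
omitted):

  /*
   * Illustrative sketch only: propagate the parent cfs_rq's newly
   * attached load into the shared tg->load_avg.  Called once per
   * possible CPU for a freshly created child group, the shared
   * counter grows by roughly 1024 * num_possible_cpus() instead of
   * the intended 1024.
   */
  static void update_tg_load_avg_sketch(struct cfs_rq *cfs_rq)
  {
          long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;

          atomic_long_add(delta, &cfs_rq->tg->load_avg);
          cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
  }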

Such incorrect load_avg accounting causes quite steep unfairness
between tasks that live in different cgroups.  Consider a scenario
with 1 online CPU, 4 possible CPUs, and two CPU-bound tasks, one
running in the parent (root) cgroup and one in the child cgroup:

  # echo 0 > /sys/devices/system/cpu/cpu1/online
  # echo 0 > /sys/devices/system/cpu/cpu2/online
  # echo 0 > /sys/devices/system/cpu/cpu3/online
  # cat /sys/devices/system/cpu/online
  0
  # mkdir /sys/fs/cgroup/grp1
  # dd if=/dev/zero of=/dev/null &
  # echo $! > /sys/fs/cgroup/tasks
  # dd if=/dev/zero of=/dev/null &
  # echo $! > /sys/fs/cgroup/grp1/tasks

After 3 seconds, the task in the root cgroup has received about 4
times as much execution time as the task in the child cgroup, because
the number of possible CPUs is 4 and the scheduler therefore thinks
the root cgroup has 4 times more load than the child cgroup.

  dd (2029, #threads: 1)
  se.exec_start                                :        562900.460656
  se.sum_exec_runtime                          :          2573.175002
  dd (2032, #threads: 1)
  se.exec_start                                :        562900.037152
  se.sum_exec_runtime                          :           655.439360
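
The 4:1 split seen above follows from how the group entity's weight is
derived from the shared load_avg.  A back-of-the-envelope userspace
model (assumption: the group entity's weight scales roughly as
tg->shares * local_load / tg->load_avg, which is approximately what
calc_cfs_shares() computes when its clamping is ignored):

  /*
   * Toy model of the unfair split, not kernel code.  With 4 possible
   * CPUs the child task_group's load_avg is inflated to ~4096 even
   * though only ~1024 of real load sits on the single online CPU, so
   * the child group's entity gets only a quarter of its shares there.
   */
  #include <stdio.h>

  int main(void)
  {
          const long tg_shares   = 1024;  /* default cpu.shares */
          const long local_load  = 1024;  /* the dd running in /grp1 on cpu0 */
          const long tg_load_avg = 4096;  /* inflated: 4 possible CPUs x 1024 */
          const long root_task_w = 1024;  /* nice-0 weight of the root-cgroup dd */
          long grp_se_weight     = tg_shares * local_load / tg_load_avg;

          printf("child group se weight = %ld\n", grp_se_weight);   /* 256 */
          printf("expected runtime ratio ~ %ld:1\n",
                 root_task_w / grp_se_weight);                      /* 4:1 */
          return 0;
  }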

In contrast, booting the same system with maxcpus=1 makes both tasks
run evenly:

  dd (1952, #threads: 1)
  se.exec_start                                :         75660.457449
  se.sum_exec_runtime                          :          1754.045078
  dd (1955, #threads: 1)
  se.exec_start                                :         75680.029689
  se.sum_exec_runtime                          :          1768.195390

Fix this fairness problem by updating the parent task_group's load_avg
only once when a new child cgroup is created.

Cc: Ingo Molnar <mi...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Joonwoo Park <joonw...@codeaurora.org>
---
 kernel/sched/core.c  | 2 +-
 kernel/sched/fair.c  | 9 ++++++---
 kernel/sched/sched.h | 3 ++-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94732d1..2cf46aa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2577,7 +2577,7 @@ void wake_up_new_task(struct task_struct *p)
        __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
        rq = __task_rq_lock(p, &rf);
-       post_init_entity_util_avg(&p->se);
+       post_init_entity_util_avg(&p->se, true);
 
        activate_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 502e95a..71c08a8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -730,7 +730,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
  * if util_avg > util_avg_cap.
  */
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct sched_entity *se, bool update_tg_load)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        struct sched_avg *sa = &se->avg;
@@ -770,7 +770,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
 
        update_cfs_rq_load_avg(now, cfs_rq, false);
        attach_entity_load_avg(cfs_rq, se);
-       update_tg_load_avg(cfs_rq, false);
+       if (update_tg_load)
+               update_tg_load_avg(cfs_rq, false);
 }
 
 #else /* !CONFIG_SMP */
@@ -8872,15 +8873,17 @@ void online_fair_sched_group(struct task_group *tg)
        struct sched_entity *se;
        struct rq *rq;
        int i;
+       bool update_tg_load = true;
 
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
                se = tg->se[i];
 
                raw_spin_lock_irq(&rq->lock);
-               post_init_entity_util_avg(se);
+               post_init_entity_util_avg(se, update_tg_load);
                sync_throttle(tg, i);
                raw_spin_unlock_irq(&rq->lock);
+               update_tg_load = false;
        }
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 055f935..6ab89af 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1356,7 +1356,8 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
-extern void post_init_entity_util_avg(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct sched_entity *se,
+                                     bool update_tg_load);
 
 #ifdef CONFIG_NO_HZ_FULL
 extern bool sched_can_stop_tick(struct rq *rq);
-- 
2.9.3
