Now that enqueue_task_fair and dequeue_task_fair no longer iterate up
the hierarchy all the time, a method to lazily propagate sum_exec_runtime
up the hierarchy is necessary.

Once per tick, propagate the newly accumulated exec_runtime up the
hierarchy and feed it into CFS bandwidth control.
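
In other words, each tick pushes only the delta accumulated since the
last propagation up to the parent entity and charges it against the
group's bandwidth; the new propagated_exec_runtime field remembers the
last value that was pushed up. As a minimal sketch (mirroring the
helper added to fair.c below):

	/* amount run since the last propagation */
	u64 diff = se->sum_exec_runtime - se->propagated_exec_runtime;

	if (se->parent) {
		se->parent->sum_exec_runtime += diff;
		account_cfs_rq_runtime(cfs_rq, diff);
	}

	se->propagated_exec_runtime = se->sum_exec_runtime;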

Remove the now-pointless call to account_cfs_rq_runtime from update_curr,
which is always called with a root cfs_rq; bandwidth accounting happens
when the runtime is propagated up the hierarchy instead.

Signed-off-by: Rik van Riel <r...@surriel.com>
---
 include/linux/sched.h |  1 +
 kernel/sched/core.c   |  1 +
 kernel/sched/fair.c   | 22 ++++++++++++++++++++--
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 901c710363e7..bdca15b3afe7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -454,6 +454,7 @@ struct sched_entity {
        int                             depth;
        unsigned long                   enqueued_h_load;
        unsigned long                   enqueued_h_weight;
+       u64                             propagated_exec_runtime;
        struct load_weight              h_load;
        struct sched_entity             *parent;
        /* rq on which this entity is (to be) queued: */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fbd96900f715..9915d20e84a9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2137,6 +2137,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+       p->se.propagated_exec_runtime   = 0;
        p->se.cfs_rq                    = NULL;
 #endif
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5cfa3dbeba49..d6c881c5c4d5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -898,8 +898,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
        trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
        cgroup_account_cputime(curtask, delta_exec);
        account_group_exec_runtime(curtask, delta_exec);
-
-       account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
 static void update_curr_fair(struct rq *rq)
@@ -3412,6 +3410,20 @@ static inline bool skip_blocked_update(struct sched_entity *se)
        return true;
 }
 
+static void propagate_exec_runtime(struct cfs_rq *cfs_rq,
+                               struct sched_entity *se)
+{
+       struct sched_entity *parent = se->parent;
+       u64 diff = se->sum_exec_runtime - se->propagated_exec_runtime;
+
+       if (parent) {
+               parent->sum_exec_runtime += diff;
+               account_cfs_rq_runtime(cfs_rq, diff);
+       }
+
+       se->propagated_exec_runtime = se->sum_exec_runtime;
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -3423,6 +3435,11 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
 
 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
 
+static inline void propagate_exec_runtime(struct cfs_rq *cfs_rq,
+                               struct sched_entity *se)
+{
+}
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 /**
@@ -10157,6 +10174,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se, int flags)
                        if (!(flags & DO_ATTACH))
                                break;
 
+               propagate_exec_runtime(cfs_rq, se);
                update_cfs_group(se);
        }
 }
-- 
2.20.1