From: Peter Zijlstra <[email protected]>

This patch is a combination of

   http://lkml.kernel.org/r/[email protected]
 + http://lkml.kernel.org/r/[email protected]
 + build fix & use shares_avg for propagating load_avg instead of runnable

This fixes the propagation problem described in the thread below, while
keeping the group se->avg.load_avg in line with the matching
cfs_rq->avg.load_avg.

   http://lkml.kernel.org/r/[email protected]
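For reference, the approximation all three shares_type variants share boils
down to the standalone sketch below (simplified: the parameter names and the
literal MIN_SHARES value of 2 are illustrative stand-ins, not the kernel data
structures):

        /* Standalone sketch of the shares approximation; not kernel code. */
        #include <stdio.h>

        /*
         * shares ~= tg_shares * local_load / tg_weight, where tg_weight is
         * the group-wide load sum with this CPU's stale contribution
         * replaced by the freshly read local_load.
         */
        static long calc_shares(long tg_shares, long tg_load_avg,
                                long tg_load_avg_contrib, long local_load)
        {
                long tg_weight, shares;

                tg_weight  = tg_load_avg;
                tg_weight -= tg_load_avg_contrib; /* drop stale local part   */
                tg_weight += local_load;          /* bias towards local_load */

                shares = tg_shares * local_load;
                if (tg_weight)
                        shares /= tg_weight;

                /* clamp_t(long, shares, MIN_SHARES, tg_shares) */
                if (shares < 2)
                        shares = 2;
                if (shares > tg_shares)
                        shares = tg_shares;

                return shares;
        }

        int main(void)
        {
                /* e.g. two CPUs contributing 512 each to a tg with shares 1024 */
                printf("%ld\n", calc_shares(1024, 1024, 512, 512)); /* 512 */
                return 0;
        }

The three enum values only change what local_load is: shares_weight uses
cfs_rq->load.weight (the upper bound, to help ramp-up), shares_avg uses
cfs_rq->avg.load_avg, and shares_runnable uses cfs_rq->runnable_load_avg.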

---
 kernel/sched/fair.c |   98 +++++++++++++++++++++++++---------------------------
 1 file changed, 48 insertions(+), 50 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2636,26 +2636,57 @@ account_entity_dequeue(struct cfs_rq *cf
        cfs_rq->nr_running--;
 }
 
+enum shares_type {
+       shares_runnable,
+       shares_avg,
+       shares_weight,
+};
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+static long
+calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+               enum shares_type shares_type)
 {
-       long tg_weight, load, shares;
+       long tg_weight, tg_shares, load, shares;
 
-       /*
-        * This really should be: cfs_rq->avg.load_avg, but instead we use
-        * cfs_rq->load.weight, which is its upper bound. This helps ramp up
-        * the shares for small weight interactive tasks.
-        */
-       load = scale_load_down(cfs_rq->load.weight);
+       tg_shares = READ_ONCE(tg->shares);
+
+       switch (shares_type) {
+       case shares_runnable:
+               /*
+                * Instead of the correct cfs_rq->avg.load_avg we use
+                * cfs_rq->runnable_load_avg, which does not include the
+                * blocked load.
+                */
+               load = cfs_rq->runnable_load_avg;
+               break;
+
+       case shares_avg:
+               load = cfs_rq->avg.load_avg;
+               break;
+
+       case shares_weight:
+               /*
+                * Instead of the correct cfs_rq->avg.load_avg we use
+                * cfs_rq->load.weight, which is its upper bound. This helps
+                * ramp up the shares for small weight interactive tasks.
+                */
+               load = scale_load_down(cfs_rq->load.weight);
+               break;
+       }
 
        tg_weight = atomic_long_read(&tg->load_avg);
 
-       /* Ensure tg_weight >= load */
+       /*
+        * This ensures the sum is up-to-date for this CPU, in case of the other
+        * two approximations it biases the sum towards their value and in case
+        * of (near) UP ensures the division ends up <= 1.
+        */
        tg_weight -= cfs_rq->tg_load_avg_contrib;
        tg_weight += load;
 
-       shares = (tg->shares * load);
+       shares = (tg_shares * load);
        if (tg_weight)
                shares /= tg_weight;
 
@@ -2671,15 +2702,11 @@ static long calc_cfs_shares(struct cfs_r
         * case no task is runnable on a CPU MIN_SHARES=2 should be returned
         * instead of 0.
         */
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       if (shares > tg->shares)
-               shares = tg->shares;
-
-       return shares;
+       return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
 # else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+static inline long
+calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, enum shares_type shares_type)
 {
        return tg->shares;
 }
@@ -2721,7 +2748,7 @@ static void update_cfs_shares(struct sch
        if (likely(se->load.weight == tg->shares))
                return;
 #endif
-       shares = calc_cfs_shares(cfs_rq, tg);
+       shares = calc_cfs_shares(cfs_rq, tg, shares_weight);
 
        reweight_entity(cfs_rq_of(se), se, shares);
 }
@@ -3078,39 +3105,10 @@ static inline void
 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-       long delta, load = gcfs_rq->avg.load_avg;
-
-       /*
-        * If the load of group cfs_rq is null, the load of the
-        * sched_entity will also be null so we can skip the formula
-        */
-       if (load) {
-               long tg_load;
-
-               /* Get tg's load and ensure tg_load > 0 */
-               tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
-               /* Ensure tg_load >= load and updated with current load*/
-               tg_load -= gcfs_rq->tg_load_avg_contrib;
-               tg_load += load;
-
-               /*
-                * We need to compute a correction term in the case that the
-                * task group is consuming more CPU than a task of equal
-                * weight. A task with a weight equals to tg->shares will have
-                * a load less or equal to scale_load_down(tg->shares).
-                * Similarly, the sched_entities that represent the task group
-                * at parent level, can't have a load higher than
-                * scale_load_down(tg->shares). And the Sum of sched_entities'
-                * load must be <= scale_load_down(tg->shares).
-                */
-               if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
-                       /* scale gcfs_rq's load into tg's shares*/
-                       load *= scale_load_down(gcfs_rq->tg->shares);
-                       load /= tg_load;
-               }
-       }
+       long load, delta;
 
+       load = scale_load_down(calc_cfs_shares(gcfs_rq, gcfs_rq->tg,
+                                              shares_avg));
        delta = load - se->avg.load_avg;
 
        /* Nothing to update */
