Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

   # HEAD: a4c3c04974d648ee6e1a09ef4131eb32a02ab494 sched/fair: Update and fix the runnable propagation rule

This includes a fix for the add_wait_queue() queue ordering brown paperbag bug, 
plus PELT accounting fixes for cgroups scheduling artifacts.

 Thanks,

        Ingo

------------------>
Omar Sandoval (1):
      sched/wait: Fix add_wait_queue() behavioral change

Vincent Guittot (1):
      sched/fair: Update and fix the runnable propagation rule


 kernel/sched/fair.c | 102 +++++++++++++++++++++++++++++++++++++---------------
 kernel/sched/wait.c |   2 +-
 2 files changed, 74 insertions(+), 30 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4037e19bbca2..2fe3aa853e4d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3413,9 +3413,9 @@ void set_task_rq_fair(struct sched_entity *se,
  * _IFF_ we look at the pure running and runnable sums. Because they
  * represent the very same entity, just at different points in the hierarchy.
  *
- *
- * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
- * simply copies the running sum over.
+ * Per the above update_tg_cfs_util() is trivial and simply copies the running
+ * sum over (but still wrong, because the group entity and group rq do not have
+ * their PELT windows aligned).
  *
  * However, update_tg_cfs_runnable() is more complex. So we have:
  *
@@ -3424,11 +3424,11 @@ void set_task_rq_fair(struct sched_entity *se,
  * And since, like util, the runnable part should be directly transferable,
 * the following would _appear_ to be the straightforward approach:
  *
- *   grq->avg.load_avg = grq->load.weight * grq->avg.running_avg       (3)
+ *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg      (3)
  *
  * And per (1) we have:
  *
- *   ge->avg.running_avg == grq->avg.running_avg
+ *   ge->avg.runnable_avg == grq->avg.runnable_avg
  *
  * Which gives:
  *
@@ -3447,27 +3447,28 @@ void set_task_rq_fair(struct sched_entity *se,
  * to (shortly) return to us. This only works by keeping the weights as
  * integral part of the sum. We therefore cannot decompose as per (3).
  *
- * OK, so what then?
+ * Another reason this doesn't work is that runnable isn't a 0-sum entity.
+ * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
+ * rq itself is runnable anywhere between 2/3 and 1 depending on how the
+ * runnable section of these tasks overlap (or not). If they were to perfectly
+ * align the rq as a whole would be runnable 2/3 of the time. If however we
+ * always have at least 1 runnable task, the rq as a whole is always runnable.
  *
+ * So we'll have to approximate.. :/
  *
- * Another way to look at things is:
+ * Given the constraint:
  *
- *   grq->avg.load_avg = \Sum se->avg.load_avg
+ *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
  *
- * Therefore, per (2):
+ * We can construct a rule that adds runnable to a rq by assuming minimal
+ * overlap.
  *
- *   grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ * On removal, we'll assume each task is equally runnable; which yields:
  *
- * And the very thing we're propagating is a change in that sum (someone
- * joined/left). So we can easily know the runnable change, which would be, per
- * (2) the already tracked se->load_avg divided by the corresponding
- * se->weight.
+ *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
  *
- * Basically (4) but in differential form:
+ * XXX: only do this for the part of runnable > running ?
  *
- *   d(runnable_avg) += se->avg.load_avg / se->load.weight
- *                                                                (5)
- *   ge->avg.load_avg += ge->load.weight * d(runnable_avg)
  */
 
 static inline void
@@ -3479,6 +3480,14 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
        if (!delta)
                return;
 
+       /*
+        * The relation between sum and avg is:
+        *
+        *   LOAD_AVG_MAX - 1024 + sa->period_contrib
+        *
+        * however, the PELT windows are not aligned between grq and gse.
+        */
+
        /* Set new sched_entity's utilization */
        se->avg.util_avg = gcfs_rq->avg.util_avg;
        se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
@@ -3491,33 +3500,68 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
 static inline void
 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-       long runnable_sum = gcfs_rq->prop_runnable_sum;
-       long runnable_load_avg, load_avg;
-       s64 runnable_load_sum, load_sum;
+       long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+       unsigned long runnable_load_avg, load_avg;
+       u64 runnable_load_sum, load_sum = 0;
+       s64 delta_sum;
 
        if (!runnable_sum)
                return;
 
        gcfs_rq->prop_runnable_sum = 0;
 
+       if (runnable_sum >= 0) {
+               /*
+                * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
+                * the CPU is saturated running == runnable.
+                */
+               runnable_sum += se->avg.load_sum;
+               runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+       } else {
+               /*
+                * Estimate the new unweighted runnable_sum of the gcfs_rq by
+                * assuming all tasks are equally runnable.
+                */
+               if (scale_load_down(gcfs_rq->load.weight)) {
+                       load_sum = div_s64(gcfs_rq->avg.load_sum,
+                               scale_load_down(gcfs_rq->load.weight));
+               }
+
+               /* But make sure to not inflate se's runnable */
+               runnable_sum = min(se->avg.load_sum, load_sum);
+       }
+
+       /*
+        * runnable_sum can't be lower than running_sum.
+        * As the running sum is scaled with CPU capacity whereas the
+        * runnable sum is not, we rescale running_sum first.
+        */
+       running_sum = se->avg.util_sum /
+               arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+       runnable_sum = max(runnable_sum, running_sum);
+
        load_sum = (s64)se_weight(se) * runnable_sum;
        load_avg = div_s64(load_sum, LOAD_AVG_MAX);
 
-       add_positive(&se->avg.load_sum, runnable_sum);
-       add_positive(&se->avg.load_avg, load_avg);
+       delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
+       delta_avg = load_avg - se->avg.load_avg;
 
-       add_positive(&cfs_rq->avg.load_avg, load_avg);
-       add_positive(&cfs_rq->avg.load_sum, load_sum);
+       se->avg.load_sum = runnable_sum;
+       se->avg.load_avg = load_avg;
+       add_positive(&cfs_rq->avg.load_avg, delta_avg);
+       add_positive(&cfs_rq->avg.load_sum, delta_sum);
 
        runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
        runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
+       delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
+       delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
 
-       add_positive(&se->avg.runnable_load_sum, runnable_sum);
-       add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
+       se->avg.runnable_load_sum = runnable_sum;
+       se->avg.runnable_load_avg = runnable_load_avg;
 
        if (se->on_rq) {
-               add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
-               add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
+               add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
+               add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
        }
 }
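
[ To make the new propagation rule concrete, here is a standalone
  userspace sketch of the estimates the fair.c hunk above computes:
  clip at LOAD_AVG_MAX when runnable is added, the equal-runnability
  division when it is removed, and the running_sum floor. This is
  plain C, not kernel code: all input values are invented, and
  min_l()/max_l() are local stand-ins for the kernel's min()/max()
  macros. ]

#include <stdio.h>

#define LOAD_AVG_MAX 47742	/* maximum possible PELT sum */

static long min_l(long a, long b) { return a < b ? a : b; }
static long max_l(long a, long b) { return a > b ? a : b; }

int main(void)
{
	/* Invented example values: */
	long prop_runnable_sum = -8000;	/* propagated change; < 0 means removal */
	long se_load_sum       = 30000;	/* se->avg.load_sum */
	long grq_load_sum      = 40960;	/* gcfs_rq->avg.load_sum (weighted) */
	long grq_weight        = 2;	/* scale_load_down(gcfs_rq->load.weight) */
	long running_sum       = 15000;	/* se->avg.util_sum rescaled by capacity */
	long runnable_sum;

	if (prop_runnable_sum >= 0) {
		/* Addition: assume minimal overlap, clip at LOAD_AVG_MAX. */
		runnable_sum = min_l(se_load_sum + prop_runnable_sum,
				     LOAD_AVG_MAX);
	} else {
		/* Removal: assume all of the grq's tasks are equally runnable. */
		long est = grq_load_sum / grq_weight;	/* unweighted estimate */

		runnable_sum = min_l(se_load_sum, est);	/* don't inflate se */
	}

	/* A running task is by definition runnable: floor at running_sum. */
	runnable_sum = max_l(runnable_sum, running_sum);

	printf("new runnable_sum = %ld\n", runnable_sum);	/* prints 20480 */
	return 0;
}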
 
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 98feab7933c7..929ecb7d6b78 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct 
wait_queue_entry *wq
 
        wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
        spin_lock_irqsave(&wq_head->lock, flags);
-       __add_wait_queue_entry_tail(wq_head, wq_entry);
+       __add_wait_queue(wq_head, wq_entry);
        spin_unlock_irqrestore(&wq_head->lock, flags);
 }
 EXPORT_SYMBOL(add_wait_queue);
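
[ For the wait.c one-liner: wakeups scan the queue from the head and
  stop after waking the first WQ_FLAG_EXCLUSIVE entry, so non-exclusive
  entries must be inserted at the head, as add_wait_queue() does again
  after this fix. Below is a toy userspace model of the before/after
  behavior; the types and helpers are invented for illustration, not
  the kernel's wait.c API. ]

#include <stdio.h>
#include <stdbool.h>

struct waiter {
	const char *name;
	bool exclusive;
	struct waiter *next;
};

/* Head insertion: what add_wait_queue() does again after the fix. */
static void add_head(struct waiter **q, struct waiter *w)
{
	w->next = *q;
	*q = w;
}

/* Tail insertion: reserved for exclusive waiters. */
static void add_tail(struct waiter **q, struct waiter *w)
{
	while (*q)
		q = &(*q)->next;
	w->next = NULL;
	*q = w;
}

/* Wake all non-exclusive waiters, stop after the first exclusive one. */
static void wake_up(struct waiter *q)
{
	for (; q; q = q->next) {
		printf("woken: %s\n", q->name);
		if (q->exclusive)
			break;
	}
}

int main(void)
{
	struct waiter excl = { "exclusive",     true,  NULL };
	struct waiter nonx = { "non-exclusive", false, NULL };
	struct waiter *q = NULL;

	add_tail(&q, &excl);	/* an exclusive waiter is already queued */
	add_head(&q, &nonx);	/* fixed behavior; the bug used tail insertion */

	wake_up(q);		/* wakes both; with add_tail() here, only "exclusive" */
	return 0;
}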
