[tip:sched/core] sched: Guarantee new group-entities always have weight

2013-10-29 Thread tip-bot for Paul Turner
Commit-ID:  0ac9b1c21874d2490331233b3242085f8151e166
Gitweb: http://git.kernel.org/tip/0ac9b1c21874d2490331233b3242085f8151e166
Author: Paul Turner 
AuthorDate: Wed, 16 Oct 2013 11:16:27 -0700
Committer:  Ingo Molnar 
CommitDate: Tue, 29 Oct 2013 12:02:23 +0100

sched: Guarantee new group-entities always have weight

Currently, group entity load-weights are initialized to zero. This
admits some races with respect to the first time they are re-weighted in
early use. (Let g[x] denote the se for "g" on cpu "x".)

Suppose that we have root->a and that a enters a throttled state,
immediately followed by a[0]->t1 (the only task running on cpu[0])
blocking:

  put_prev_task(group_cfs_rq(a[0]), t1)
  put_prev_entity(..., t1)
  check_cfs_rq_runtime(group_cfs_rq(a[0]))
  throttle_cfs_rq(group_cfs_rq(a[0]))

Then, before unthrottling occurs, let a[0]->b[0]->t2 wake for the first
time:

  enqueue_task_fair(rq[0], t2)
  enqueue_entity(group_cfs_rq(b[0]), t2)
  enqueue_entity_load_avg(group_cfs_rq(b[0]), t2)
  account_entity_enqueue(group_cfs_rq(b[0]), t2)
  update_cfs_shares(group_cfs_rq(b[0]))
  < skipped because b is part of a throttled hierarchy >
  enqueue_entity(group_cfs_rq(a[0]), b[0])
  ...

We now have b[0] enqueued, yet group_cfs_rq(a[0])->load.weight == 0
which violates invariants in several code-paths. Eliminate the
possibility of this by initializing group entity weight.
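
As a minimal userspace sketch (not kernel code; the toy_ratio() helper and the
percentages are invented), this is the shape of the invariant a zero-weight
enqueued entity breaks: share-style calculations divide by the cfs_rq's
aggregate weight, and that aggregate stays 0 when the only enqueued entity was
created with weight 0. Starting new group entities at NICE_0_LOAD keeps the
denominator positive until the first real re-weight:

#include <assert.h>
#include <stdio.h>

#define NICE_0_LOAD 1024UL

/* Toy stand-in for a weight-ratio computed against the cfs_rq's total load. */
static unsigned long toy_ratio(unsigned long entity_weight,
                               unsigned long cfs_rq_weight)
{
        assert(cfs_rq_weight != 0);     /* the invariant the patch protects */
        return (entity_weight * 100UL) / cfs_rq_weight;
}

int main(void)
{
        /* After the patch a new group entity is born with NICE_0_LOAD ... */
        unsigned long se_weight = NICE_0_LOAD;
        /* ... so enqueueing it leaves the parent cfs_rq with non-zero load. */
        unsigned long cfs_rq_weight = se_weight;

        printf("ratio = %lu%%\n", toy_ratio(se_weight, cfs_rq_weight));
        /* With the old initial weight of 0 the division above would trap. */
        return 0;
}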

Signed-off-by: Paul Turner 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20131016181627.22647.47543.st...@sword-of-the-dawn.mtv.corp.google.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f6308cb..0923ab2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7198,7 +7198,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
se->cfs_rq = parent->my_q;
 
se->my_q = cfs_rq;
-   update_load_set(&se->load, 0);
+   /* guarantee group entities always have weight */
+   update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
 }
 


[tip:sched/core] sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking

2012-10-24 Thread tip-bot for Paul Turner
Commit-ID:  f4e26b120b9de84cb627bc7361ba43cfdc51341f
Gitweb: http://git.kernel.org/tip/f4e26b120b9de84cb627bc7361ba43cfdc51341f
Author: Paul Turner 
AuthorDate: Thu, 4 Oct 2012 13:18:32 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Oct 2012 10:27:31 +0200

sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking

While per-entity load-tracking is generally useful beyond computing the shares
distribution (e.g. runnable-based load-balance, which is in progress, governors,
power-management, etc.), these facilities are not yet consumers of this data.

This dependency may be trivially reverted when the information is required;
until then, avoid paying the overhead for calculations we will not use.
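
A compressed sketch of the guard idiom the patch applies (the CONFIG_* defines
below are stand-ins for the real Kconfig symbols, and the function name is
invented): narrow the #ifdef and provide an empty inline stub so callers compile
unchanged while the overhead disappears:

#include <stdio.h>

#define CONFIG_SMP 1
/* #define CONFIG_FAIR_GROUP_SCHED 1 */  /* define to compile the tracking in */

#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
static void load_tracking_update(void)
{
        printf("per-entity load tracking compiled in\n");
}
#else
/* Empty stub: callers compile unchanged and the overhead simply vanishes. */
static inline void load_tracking_update(void) { }
#endif

int main(void)
{
        load_tracking_update();
        return 0;
}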

Signed-off-by: Paul Turner 
Reviewed-by: Ben Segall 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20120823141507.422162...@google.com
Signed-off-by: Ingo Molnar 
---
 include/linux/sched.h |8 +++-
 kernel/sched/core.c   |7 ++-
 kernel/sched/fair.c   |   13 +++--
 kernel/sched/sched.h  |9 -
 4 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e483ccb..e1581a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1168,7 +1168,13 @@ struct sched_entity {
/* rq "owned" by this entity/group: */
struct cfs_rq   *my_q;
 #endif
-#ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+   /* Per-entity load-tracking */
struct sched_avgavg;
 #endif
 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f268600..5dae0d2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1526,7 +1526,12 @@ static void __sched_fork(struct task_struct *p)
p->se.vruntime  = 0;
INIT_LIST_HEAD(&p->se.group_node);
 
-#ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
p->se.avg.runnable_avg_period = 0;
p->se.avg.runnable_avg_sum = 0;
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6ecf455..3e6a353 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -882,7 +882,8 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-#ifdef CONFIG_SMP
+/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
 /*
  * We choose a half-life close to 1 scheduling period.
  * Note: The tables below are dependent on this value.
@@ -3174,6 +3175,12 @@ unlock:
 }
 
 /*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
@@ -3196,6 +3203,7 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
}
 }
+#endif
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -5773,8 +5781,9 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
+#ifdef CONFIG_FAIR_GROUP_SCHED
.migrate_task_rq= migrate_task_rq_fair,
-
+#endif
.rq_online  = rq_online_fair,
.rq_offline = rq_offline_fair,
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0a75a43..5eca173 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -225,6 +225,12 @@ struct cfs_rq {
 #endif
 
 #ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * CFS Load tracking
 * Under CFS, load is tracked on a per-entity basis and aggregated up.
@@ -234,7 +240,8 @@ struct cfs_rq {
u64 runnable_load_avg, blocked_load_avg;
atomic64_t decay_counter, removed_load;
u64 last_decay;
-
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+/* These always depend on CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_FAIR_GROUP_SCHED
u32 tg_runnable_contrib;
u64 tg_load_contrib;

[tip:sched/core] sched: Update_cfs_shares at period edge

2012-10-24 Thread tip-bot for Paul Turner
Commit-ID:  f269ae0469fc882332bdfb5db15d3c1315fe2a10
Gitweb: http://git.kernel.org/tip/f269ae0469fc882332bdfb5db15d3c1315fe2a10
Author: Paul Turner 
AuthorDate: Thu, 4 Oct 2012 13:18:31 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Oct 2012 10:27:29 +0200

sched: Update_cfs_shares at period edge

Now that our measurement intervals are small (~1ms), we can amortize the
posting of update_shares() to roughly once per period overflow.  This is a
large cost saving for frequently switching tasks.
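
A hedged userspace sketch of the amortization (the 1ms period, field names and
helpers below are illustrative, not the kernel's plumbing): rather than
recomputing shares on every switch, accumulate elapsed time and only recompute
when a period boundary is crossed:

#include <stdbool.h>
#include <stdio.h>

#define PERIOD_NS 1000000ULL            /* ~1ms measurement period (illustrative) */

struct toy_rq {
        unsigned long long period_pos;  /* position within the current period */
        unsigned long shares_updates;
};

/* Returns true when the accumulated time crossed a period edge. */
static bool toy_advance(struct toy_rq *rq, unsigned long long delta_ns)
{
        rq->period_pos += delta_ns;
        if (rq->period_pos < PERIOD_NS)
                return false;
        rq->period_pos %= PERIOD_NS;
        return true;
}

static void toy_update_shares(struct toy_rq *rq)
{
        rq->shares_updates++;           /* stand-in for the expensive update */
}

int main(void)
{
        struct toy_rq rq = { 0, 0 };

        /* Many short scheduling events; shares only recomputed at period edges. */
        for (int i = 0; i < 10; i++)
                if (toy_advance(&rq, 300000ULL))        /* 0.3ms per event */
                        toy_update_shares(&rq);
        printf("share recomputations: %lu (events: 10)\n", rq.shares_updates);
        return 0;
}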

Signed-off-by: Paul Turner 
Reviewed-by: Ben Segall 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20120823141507.200772...@google.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c |   18 ++
 1 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dcc27d8..002a769 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1187,6 +1187,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
}
 
__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+   update_cfs_shares(cfs_rq);
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1396,9 +1397,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 * Update run-time statistics of the 'current'.
 */
update_curr(cfs_rq);
-   enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
account_entity_enqueue(cfs_rq, se);
-   update_cfs_shares(cfs_rq);
+   enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -1471,7 +1471,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 * Update run-time statistics of the 'current'.
 */
update_curr(cfs_rq);
-   dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
@@ -1491,8 +1490,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
-   se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
+   dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 
/*
 * Normalize the entity after updating the min_vruntime because the
@@ -1506,7 +1505,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
return_cfs_rq_runtime(cfs_rq);
 
update_min_vruntime(cfs_rq);
-   update_cfs_shares(cfs_rq);
+   se->on_rq = 0;
 }
 
 /*
@@ -2518,8 +2517,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
 
-   update_cfs_shares(cfs_rq);
update_entity_load_avg(se, 1);
+   update_cfs_rq_blocked_load(cfs_rq, 0);
}
 
if (!se) {
@@ -2579,8 +2578,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
 
-   update_cfs_shares(cfs_rq);
update_entity_load_avg(se, 1);
+   update_cfs_rq_blocked_load(cfs_rq, 0);
}
 
if (!se) {
@@ -5639,8 +5638,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
se = tg->se[i];
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
-   for_each_sched_entity(se)
+   for_each_sched_entity(se) {
update_cfs_shares(group_cfs_rq(se));
+   /* update contribution to parent */
+   update_entity_load_avg(se, 1);
+   }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
 


[tip:sched/core] sched: Refactor update_shares_cpu() -> update_blocked_avgs()

2012-10-24 Thread tip-bot for Paul Turner
Commit-ID:  48a1675323fa1b7844e479ad2a4469f4558c0f79
Gitweb: http://git.kernel.org/tip/48a1675323fa1b7844e479ad2a4469f4558c0f79
Author: Paul Turner 
AuthorDate: Thu, 4 Oct 2012 13:18:31 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Oct 2012 10:27:28 +0200

sched: Refactor update_shares_cpu() -> update_blocked_avgs()

Now that running entities maintain their own load-averages, the work we must do
in update_shares() is largely restricted to the periodic decay of blocked
entities.  This allows us to be a little less pessimistic regarding our
occupancy on rq->lock and the associated rq->clock updates required.
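
A hedged pthread illustration of the locking shape after the refactor (purely a
userspace stand-in for rq->lock; all names invented): one lock acquisition and
one clock update cover the whole bottom-up walk, instead of a lock/unlock pair
per task group:

#include <pthread.h>
#include <stdio.h>

#define NR_GROUPS 4

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

static void update_one_group(int g)
{
        /* stand-in for the per-group blocked-load decay */
        printf("decayed blocked load for group %d\n", g);
}

/* After the refactor: one lock acquisition, one clock update, full walk. */
static void update_blocked_averages_sketch(void)
{
        pthread_mutex_lock(&rq_lock);
        /* the single rq clock update would happen once here */
        for (int g = 0; g < NR_GROUPS; g++)
                update_one_group(g);
        pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
        update_blocked_averages_sketch();
        return 0;
}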

Signed-off-by: Paul Turner 
Reviewed-by: Ben Segall 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20120823141507.133999...@google.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c |   50 +++---
 1 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 57fae95..dcc27d8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3639,20 +3639,15 @@ next:
 /*
  * update tg->load_weight by folding this cpu's load_avg
  */
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
 {
-   struct sched_entity *se;
-   struct cfs_rq *cfs_rq;
-   unsigned long flags;
-   struct rq *rq;
-
-   rq = cpu_rq(cpu);
-   se = tg->se[cpu];
-   cfs_rq = tg->cfs_rq[cpu];
+   struct sched_entity *se = tg->se[cpu];
+   struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
 
-   raw_spin_lock_irqsave(&rq->lock, flags);
+   /* throttled entities do not contribute to load */
+   if (throttled_hierarchy(cfs_rq))
+   return;
 
-   update_rq_clock(rq);
update_cfs_rq_blocked_load(cfs_rq, 1);
 
if (se) {
@@ -3669,32 +3664,33 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
list_del_leaf_cfs_rq(cfs_rq);
} else {
+   struct rq *rq = rq_of(cfs_rq);
update_rq_runnable_avg(rq, rq->nr_running);
}
-
-   raw_spin_unlock_irqrestore(&rq->lock, flags);
-
-   return 0;
 }
 
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
 {
-   struct cfs_rq *cfs_rq;
struct rq *rq = cpu_rq(cpu);
+   struct cfs_rq *cfs_rq;
+   unsigned long flags;
 
-   rcu_read_lock();
+   raw_spin_lock_irqsave(&rq->lock, flags);
+   update_rq_clock(rq);
/*
 * Iterates the task_group tree in a bottom up fashion, see
 * list_add_leaf_cfs_rq() for details.
 */
for_each_leaf_cfs_rq(rq, cfs_rq) {
-   /* throttled entities do not contribute to load */
-   if (throttled_hierarchy(cfs_rq))
-   continue;
-
-   update_shares_cpu(cfs_rq->tg, cpu);
+   /*
+* Note: We may want to consider periodically releasing
+* rq->lock about these updates so that creating many task
+* groups does not result in continually extending hold time.
+*/
+   __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
}
-   rcu_read_unlock();
+
+   raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 /*
@@ -3746,7 +3742,7 @@ static unsigned long task_h_load(struct task_struct *p)
return load;
 }
 #else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
 {
 }
 
@@ -4813,7 +4809,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 */
raw_spin_unlock(&this_rq->lock);
 
-   update_shares(this_cpu);
+   update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@ -5068,7 +5064,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
int update_next_balance = 0;
int need_serialize;
 
-   update_shares(cpu);
+   update_blocked_averages(cpu);
 
rcu_read_lock();
for_each_domain(cpu, sd) {


[tip:sched/core] sched: Replace update_shares weight distribution with per-entity computation

2012-10-24 Thread tip-bot for Paul Turner
Commit-ID:  82958366cfea1a50e7e90907b2d55ae29ed69974
Gitweb: http://git.kernel.org/tip/82958366cfea1a50e7e90907b2d55ae29ed69974
Author: Paul Turner 
AuthorDate: Thu, 4 Oct 2012 13:18:31 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Oct 2012 10:27:28 +0200

sched: Replace update_shares weight distribution with per-entity computation

Now that the machinery is in place to compute contributed load in a bottom-up
fashion, replace the shares distribution code within update_shares()
accordingly.

Signed-off-by: Paul Turner 
Reviewed-by: Ben Segall 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20120823141507.061208...@google.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/debug.c |8 ---
 kernel/sched/fair.c  |  157 --
 kernel/sched/sched.h |   36 
 3 files changed, 36 insertions(+), 165 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 71b0ea3..2cd3c1b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -218,14 +218,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_SMP
-   SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_avg",
-   SPLIT_NS(cfs_rq->load_avg));
-   SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_period",
-   SPLIT_NS(cfs_rq->load_period));
-   SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib",
-   cfs_rq->load_contribution);
-   SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
-   atomic_read(&cfs_rq->tg->load_weight));
SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 873c9f5..57fae95 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -658,9 +658,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -680,10 +677,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-   cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -806,72 +799,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 # ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
-   int global_update)
-{
-   struct task_group *tg = cfs_rq->tg;
-   long load_avg;
-
-   load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
-   load_avg -= cfs_rq->load_contribution;
-
-   if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
-   atomic_add(load_avg, &tg->load_weight);
-   cfs_rq->load_contribution += load_avg;
-   }
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-   u64 period = sysctl_sched_shares_window;
-   u64 now, delta;
-   unsigned long load = cfs_rq->load.weight;
-
-   if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
-   return;
-
-   now = rq_of(cfs_rq)->clock_task;
-   delta = now - cfs_rq->load_stamp;
-
-   /* truncate load history at 4 idle periods */
-   if (cfs_rq->load_stamp > cfs_rq->load_last &&
-   now - cfs_rq->load_last > 4 * period) {
-   cfs_rq->load_period = 0;
-   cfs_rq->load_avg = 0;
-   delta = period - 1;
-   }
-
-   cfs_rq->load_stamp = now;
-   cfs_rq->load_unacc_exec_time = 0;
-   cfs_rq->load_period += delta;
-   if (load) {
-   cfs_rq->load_last = now;
-   cfs_rq->load_avg += delta * load;
-   }
-
-   /* consider updating load contribution on each fold or truncate */
-   if (global_update || cfs_rq->load_period > period
-   || !cfs_rq->load_period)
-   update_cfs_rq_load_contribution(cfs_rq, global_update);
-
-   while (cfs_rq->load_period > period) {
-   /*
-* Inline assembly required to prevent the compiler
-* optimising this loop into a divmod call.
-* See __iter_div_u64_rem() for another ex

[tip:sched/core] sched: Maintain runnable averages across throttled periods

2012-10-24 Thread tip-bot for Paul Turner
Commit-ID:  f1b17280efbd21873d1db8631117bdbccbcb39a2
Gitweb: http://git.kernel.org/tip/f1b17280efbd21873d1db8631117bdbccbcb39a2
Author: Paul Turner 
AuthorDate: Thu, 4 Oct 2012 13:18:31 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Oct 2012 10:27:27 +0200

sched: Maintain runnable averages across throttled periods

With bandwidth control, tracked entities may cease execution according to
user-specified bandwidth limits.  Charging this time as either throttled or
blocked, however, is incorrect and would falsely skew in either direction.

What we actually want is for any throttled periods to be "invisible" to
load-tracking as they are removed from the system for that interval and
contribute normally otherwise.

Do this by moderating the progression of time to omit any periods in which the
entity belonged to a throttled hierarchy.
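
A compressed userspace sketch of the clock moderation, mirroring
cfs_rq_clock_task() from the diff below (struct and values invented): while
throttled the clock is frozen at the throttle timestamp, and afterwards the
accumulated throttled time is subtracted so those periods never appear in load
tracking:

#include <stdio.h>

struct toy_cfs_rq {
        int throttle_count;             /* >0 while inside a throttled hierarchy */
        unsigned long long clock_task;                  /* raw rq task clock */
        unsigned long long throttled_clock_task;        /* frozen value */
        unsigned long long throttled_clock_task_time;   /* total time throttled */
};

/* Task clock as seen by load tracking: throttled periods are invisible. */
static unsigned long long toy_cfs_rq_clock_task(const struct toy_cfs_rq *cfs_rq)
{
        if (cfs_rq->throttle_count)
                return cfs_rq->throttled_clock_task;
        return cfs_rq->clock_task - cfs_rq->throttled_clock_task_time;
}

int main(void)
{
        /* 10ms of wall time, of which 4ms was spent throttled. */
        struct toy_cfs_rq cfs_rq = {
                .throttle_count = 0,
                .clock_task = 10000000ULL,
                .throttled_clock_task = 0,
                .throttled_clock_task_time = 4000000ULL,
        };

        printf("load-tracking clock: %llu ns\n", toy_cfs_rq_clock_task(&cfs_rq));
        return 0;
}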

Signed-off-by: Paul Turner 
Reviewed-by: Ben Segall 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20120823141506.998912...@google.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c  |   50 --
 kernel/sched/sched.h |3 ++-
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9e49722..873c9f5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1222,15 +1222,26 @@ static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
cfs_rq->blocked_load_avg = 0;
 }
 
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
 /* Update a sched_entity's runnable average */
 static inline void update_entity_load_avg(struct sched_entity *se,
  int update_cfs_rq)
 {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
long contrib_delta;
+   u64 now;
 
-   if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg,
- se->on_rq))
+   /*
+* For a group entity we need to use their owned cfs_rq_clock_task() in
+* case they are the parent of a throttled hierarchy.
+*/
+   if (entity_is_task(se))
+   now = cfs_rq_clock_task(cfs_rq);
+   else
+   now = cfs_rq_clock_task(group_cfs_rq(se));
+
+   if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
return;
 
contrib_delta = __update_entity_load_avg_contrib(se);
@@ -1250,7 +1261,7 @@ static inline void update_entity_load_avg(struct sched_entity *se,
  */
 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 {
-   u64 now = rq_of(cfs_rq)->clock_task >> 20;
+   u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
u64 decays;
 
decays = now - cfs_rq->last_decay;
@@ -1841,6 +1852,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return &tg->cfs_bandwidth;
 }
 
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+   if (unlikely(cfs_rq->throttle_count))
+   return cfs_rq->throttled_clock_task;
+
+   return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+}
+
 /* returns 0 on failure to allocate runtime */
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
@@ -1991,6 +2011,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->load_stamp += delta;
cfs_rq->load_last += delta;
 
+   /* adjust cfs_rq_clock_task() */
+   cfs_rq->throttled_clock_task_time += rq->clock_task -
+cfs_rq->throttled_clock_task;
+
/* update entity weight now that we are on_rq again */
update_cfs_shares(cfs_rq);
}
@@ -2005,8 +2029,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
/* group is entering throttled state, record last load */
-   if (!cfs_rq->throttle_count)
+   if (!cfs_rq->throttle_count) {
update_cfs_load(cfs_rq, 0);
+   cfs_rq->throttled_clock_task = rq->clock_task;
+   }
cfs_rq->throttle_count++;
 
return 0;
@@ -2021,7 +2047,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
-   /* account load preceding throttle */
+   /* freeze hierarchy runnable averages while throttled */
rcu_read_lock();
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
@@ -2045,7 +2071,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rq->nr_running -= task_delta;
 
cfs_rq->throttled = 1;
-   cfs_rq->throttled_timestamp = rq->clock;
+   cfs_rq->throttled_clock = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq-

[tip:sched/core] sched: Compute load contribution by a group entity

2012-10-24 Thread tip-bot for Paul Turner
Commit-ID:  8165e145ceb62fc338e099c9b12b3239c83d2f8e
Gitweb: http://git.kernel.org/tip/8165e145ceb62fc338e099c9b12b3239c83d2f8e
Author: Paul Turner 
AuthorDate: Thu, 4 Oct 2012 13:18:31 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Oct 2012 10:27:25 +0200

sched: Compute load contribution by a group entity

Unlike task entities, which have a fixed weight, group entities instead own a
fraction of their parenting task_group's shares as their contributed weight.

Compute this fraction so that we can correctly account hierarchies and shared
entity nodes.
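
A hedged worked example of the fraction computed by
__update_group_entity_contrib() in the diff below (the numbers are invented):
the group entity contributes its cfs_rq's share of the task_group's total
tracked load, scaled by tg->shares:

#include <stdint.h>
#include <stdio.h>

/* contribution = tg->shares * (this cfs_rq's tracked load / tg total load) */
static uint64_t group_contrib_sketch(uint64_t tg_load_contrib,
                                     uint64_t tg_shares,
                                     uint64_t tg_load_avg)
{
        /* the +1 avoids a divide-by-zero, as in the kernel code below */
        return (tg_load_contrib * tg_shares) / (tg_load_avg + 1);
}

int main(void)
{
        /* This cpu's cfs_rq holds 512 of the group's 2048 units of load,
         * and the group as a whole is worth 1024 shares (values invented). */
        printf("group se contribution: %llu\n",
               (unsigned long long)group_contrib_sketch(512, 1024, 2048));
        /* -> roughly a quarter of the group's 1024 shares */
        return 0;
}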

Signed-off-by: Paul Turner 
Reviewed-by: Ben Segall 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20120823141506.855074...@google.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c |   33 +++--
 1 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index db78822..e20cb26 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1117,22 +1117,43 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
cfs_rq->tg_load_contrib += tg_contrib;
}
 }
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+   struct cfs_rq *cfs_rq = group_cfs_rq(se);
+   struct task_group *tg = cfs_rq->tg;
+   u64 contrib;
+
+   contrib = cfs_rq->tg_load_contrib * tg->shares;
+   se->avg.load_avg_contrib = div64_u64(contrib,
+atomic64_read(&tg->load_avg) + 1);
+}
 #else
 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 int force_update) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 #endif
 
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+   u32 contrib;
+
+   /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+   contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+   contrib /= (se->avg.runnable_avg_period + 1);
+   se->avg.load_avg_contrib = scale_load(contrib);
+}
+
 /* Compute the current contribution to load_avg by se, return any delta */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
long old_contrib = se->avg.load_avg_contrib;
 
-   if (!entity_is_task(se))
-   return 0;
-
-   se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum *
-se->load.weight,
-se->avg.runnable_avg_period + 1);
+   if (entity_is_task(se)) {
+   __update_task_entity_contrib(se);
+   } else {
+   __update_group_entity_contrib(se);
+   }
 
return se->avg.load_avg_contrib - old_contrib;
 }


[tip:sched/core] sched: Aggregate total task_group load

2012-10-24 Thread tip-bot for Paul Turner
Commit-ID:  c566e8e9e44b72b53091da20e2dedefc730f2ee2
Gitweb: http://git.kernel.org/tip/c566e8e9e44b72b53091da20e2dedefc730f2ee2
Author: Paul Turner 
AuthorDate: Thu, 4 Oct 2012 13:18:30 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Oct 2012 10:27:24 +0200

sched: Aggregate total task_group load

Maintain a global running sum of the average load seen on each cfs_rq belonging
to each task group so that it may be used in calculating an appropriate
shares:weight distribution.
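
A hedged sketch of the update policy in __update_cfs_rq_tg_load_contrib() from
the diff below (userspace C11 atomics stand in for atomic64_t; values invented):
the per-cfs_rq delta is only folded into the shared sum when forced or when it
has drifted by more than 1/8 of the previous contribution:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static _Atomic long long tg_load_avg;   /* global running sum for the group */

struct toy_cfs_rq {
        long long runnable_load_avg;
        long long blocked_load_avg;
        long long tg_load_contrib;      /* what we last folded in */
};

static void fold_tg_load(struct toy_cfs_rq *cfs_rq, bool force)
{
        long long delta = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg
                          - cfs_rq->tg_load_contrib;

        /* only touch the shared counter when the drift is significant */
        if (force || llabs(delta) > cfs_rq->tg_load_contrib / 8) {
                atomic_fetch_add(&tg_load_avg, delta);
                cfs_rq->tg_load_contrib += delta;
        }
}

int main(void)
{
        struct toy_cfs_rq cfs_rq = { .runnable_load_avg = 900,
                                     .blocked_load_avg = 100,
                                     .tg_load_contrib = 800 };

        fold_tg_load(&cfs_rq, false);   /* drift of 200 > 800/8, so it folds */
        printf("tg_load_avg = %lld\n", (long long)atomic_load(&tg_load_avg));
        return 0;
}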

Signed-off-by: Paul Turner 
Reviewed-by: Ben Segall 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20120823141506.792901...@google.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/debug.c |4 
 kernel/sched/fair.c  |   22 ++
 kernel/sched/sched.h |4 
 3 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2d2e2b3..2908923 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -230,6 +230,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->runnable_load_avg);
SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
cfs_rq->blocked_load_avg);
+   SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_avg",
+   atomic64_read(&cfs_rq->tg->load_avg));
+   SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",
+   cfs_rq->tg_load_contrib);
 #endif
 
print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 74dc29b..db78822 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1102,6 +1102,26 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
return decays;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+int force_update)
+{
+   struct task_group *tg = cfs_rq->tg;
+   s64 tg_contrib;
+
+   tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+   tg_contrib -= cfs_rq->tg_load_contrib;
+
+   if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+   atomic64_add(tg_contrib, &tg->load_avg);
+   cfs_rq->tg_load_contrib += tg_contrib;
+   }
+}
+#else
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+int force_update) {}
+#endif
+
 /* Compute the current contribution to load_avg by se, return any delta */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
@@ -1172,6 +1192,8 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
atomic64_add(decays, &cfs_rq->decay_counter);
cfs_rq->last_decay = now;
}
+
+   __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 30236ab..924a990 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,7 @@ struct task_group {
unsigned long shares;
 
atomic_t load_weight;
+   atomic64_t load_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -232,6 +233,9 @@ struct cfs_rq {
u64 runnable_load_avg, blocked_load_avg;
atomic64_t decay_counter, removed_load;
u64 last_decay;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+   u64 tg_load_contrib;
+#endif
 #endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */


[tip:sched/core] sched: Account for blocked load waking back up

2012-10-24 Thread tip-bot for Paul Turner
Commit-ID:  aff3e49881fa71c5ee1bbc470e1dff9548d9
Gitweb: http://git.kernel.org/tip/aff3e49881fa71c5ee1bbc470e1dff9548d9
Author: Paul Turner 
AuthorDate: Thu, 4 Oct 2012 13:18:30 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Oct 2012 10:27:23 +0200

sched: Account for blocked load waking back up

When a running entity blocks we migrate its tracked load to
cfs_rq->blocked_runnable_avg.  In the sleep case this occurs while holding
rq->lock and so is a natural transition.  Wake-ups however, are potentially
asynchronous in the presence of migration and so special care must be taken.

We use an atomic counter to track such migrated load, taking care to match this
with the previously introduced decay counters so that we don't migrate too much
load.
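
A hedged userspace sketch of the asynchronous hand-off described above (C11
atomics and invented names; the real code is in the diff below): a wake-up
migration adds the departing contribution to an atomic removed-load counter
without rq->lock, and the next locked update atomically swaps it out and
subtracts it from the blocked sum:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long long removed_load;          /* filled without rq->lock */
static long long blocked_load_avg = 1000;       /* protected by rq->lock */

/* Wake-up migration path: the waking cpu cannot take the remote rq->lock. */
static void migrate_away_sketch(long long contrib)
{
        atomic_fetch_add(&removed_load, contrib);
}

/* Next locked blocked-load update on the source cpu. */
static void update_blocked_load_sketch(void)
{
        long long removed = atomic_exchange(&removed_load, 0);

        if (removed)
                blocked_load_avg -= removed;    /* drop the migrated load */
}

int main(void)
{
        migrate_away_sketch(300);               /* task wakes on another cpu */
        update_blocked_load_sketch();
        printf("blocked_load_avg = %lld\n", blocked_load_avg);
        return 0;
}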

Signed-off-by: Paul Turner 
Reviewed-by: Ben Segall 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20120823141506.726077...@google.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c  |  100 --
 kernel/sched/sched.h |2 +-
 2 files changed, 81 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5e602e6..74dc29b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -259,7 +259,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
 }
 
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq);
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+  int force_update);
 
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
@@ -281,7 +282,7 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 
cfs_rq->on_list = 1;
/* We should have no load, but we need to update last_decay. */
-   update_cfs_rq_blocked_load(cfs_rq);
+   update_cfs_rq_blocked_load(cfs_rq, 0);
}
 }
 
@@ -1086,17 +1087,19 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 }
 
 /* Synchronize an entity's decay with its parenting cfs_rq.*/
-static inline void __synchronize_entity_decay(struct sched_entity *se)
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 decays = atomic64_read(&cfs_rq->decay_counter);
 
decays -= se->avg.decay_count;
if (!decays)
-   return;
+   return 0;
 
se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
se->avg.decay_count = 0;
+
+   return decays;
 }
 
 /* Compute the current contribution to load_avg by se, return any delta */
@@ -1149,20 +1152,26 @@ static inline void update_entity_load_avg(struct sched_entity *se,
  * Decay the load contributed by all blocked children and account this so that
  * their contribution may appropriately discounted when they wake up.
  */
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq)
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 {
u64 now = rq_of(cfs_rq)->clock_task >> 20;
u64 decays;
 
decays = now - cfs_rq->last_decay;
-   if (!decays)
+   if (!decays && !force_update)
return;
 
-   cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
- decays);
-   atomic64_add(decays, &cfs_rq->decay_counter);
+   if (atomic64_read(&cfs_rq->removed_load)) {
+   u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+   subtract_blocked_load_contrib(cfs_rq, removed_load);
+   }
 
-   cfs_rq->last_decay = now;
+   if (decays) {
+   cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+ decays);
+   atomic64_add(decays, &cfs_rq->decay_counter);
+   cfs_rq->last_decay = now;
+   }
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1175,20 +1184,42 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
  struct sched_entity *se,
  int wakeup)
 {
-   /* we track migrations using entity decay_count == 0 */
-   if (unlikely(!se->avg.decay_count)) {
+   /*
+* We track migrations using entity decay_count <= 0, on a wake-up
+* migration we use a negative decay count to track the remote decays
+* accumulated while sleeping.
+*/
+   if (unlikely(se->avg.decay_count <= 0)) {
se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+   if (se->avg.decay_count) {
+   /*
+* In a wake-up migration we have to approximate the
+* time sleeping.  This is because we can't synchronize
+

[tip:sched/core] sched: Maintain the load contribution of blocked entities

2012-10-24 Thread tip-bot for Paul Turner
Commit-ID:  9ee474f55664ff63111c843099d365e7ecffb56f
Gitweb: http://git.kernel.org/tip/9ee474f55664ff63111c843099d365e7ecffb56f
Author: Paul Turner 
AuthorDate: Thu, 4 Oct 2012 13:18:30 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Oct 2012 10:27:22 +0200

sched: Maintain the load contribution of blocked entities

We are currently maintaining:

  runnable_load(cfs_rq) = \Sum task_load(t)

For all running children t of cfs_rq.  While this can be naturally updated for
tasks in a runnable state (as they are scheduled), this does not account for
the load contributed by blocked task entities.

This can be solved by introducing a separate accounting for blocked load:

  blocked_load(cfs_rq) = \Sum runnable(b) * weight(b)

Obviously we do not want to iterate over all blocked entities to account for
their decay; instead we observe that:

  runnable_load(t) = \Sum p_i*y^i

and that to account for an additional idle period we only need to compute:

  y*runnable_load(t).

This means that we can compute all blocked entities at once by evaluating:

  blocked_load(cfs_rq)` = y * blocked_load(cfs_rq)

Finally we maintain a decay counter so that when a sleeping entity re-awakens
we can determine how much of its load should be removed from the blocked sum.
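
A hedged floating-point sketch of the two mechanisms above (the kernel uses
fixed-point tables, not pow(); the decay factor and names are illustrative):
one multiply per elapsed period decays the entire blocked sum, and a per-cfs_rq
decay counter lets a re-awakening entity catch up the periods it slept through
before its contribution is removed:

#include <math.h>
#include <stdio.h>

#define DECAY_Y 0.9785  /* illustrative per-period decay factor, y < 1 */

struct toy_cfs_rq {
        double blocked_load_avg;
        unsigned long long decay_counter;       /* total periods ever decayed */
};

/* One multiply per idle period decays every blocked entity at once. */
static void decay_blocked_sketch(struct toy_cfs_rq *cfs_rq,
                                 unsigned long long periods)
{
        cfs_rq->blocked_load_avg *= pow(DECAY_Y, (double)periods);
        cfs_rq->decay_counter += periods;
}

/* On wakeup, decay the entity by the periods it slept through and remove it. */
static double wake_entity_sketch(struct toy_cfs_rq *cfs_rq,
                                 double contrib,
                                 unsigned long long entity_decay_count)
{
        unsigned long long missed = cfs_rq->decay_counter - entity_decay_count;

        contrib *= pow(DECAY_Y, (double)missed);
        cfs_rq->blocked_load_avg -= contrib;
        return contrib;
}

int main(void)
{
        struct toy_cfs_rq cfs_rq = { .blocked_load_avg = 500.0,
                                     .decay_counter = 0 };
        double contrib = 200.0;         /* entity blocks with this contribution */
        unsigned long long snap = cfs_rq.decay_counter;

        decay_blocked_sketch(&cfs_rq, 8);       /* 8 idle periods go by */
        contrib = wake_entity_sketch(&cfs_rq, contrib, snap);
        printf("woken contrib %.1f, blocked sum left %.1f\n",
               contrib, cfs_rq.blocked_load_avg);
        return 0;
}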

Signed-off-by: Paul Turner 
Reviewed-by: Ben Segall 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20120823141506.585389...@google.com
Signed-off-by: Ingo Molnar 
---
 include/linux/sched.h |1 +
 kernel/sched/core.c   |1 -
 kernel/sched/debug.c  |3 +
 kernel/sched/fair.c   |  128 -
 kernel/sched/sched.h  |4 +-
 5 files changed, 122 insertions(+), 15 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 81d8b1b..b1831ac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1103,6 +1103,7 @@ struct sched_avg {
 */
u32 runnable_avg_sum, runnable_avg_period;
u64 last_runnable_update;
+   s64 decay_count;
unsigned long load_avg_contrib;
 };
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fd9d085..00898f1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1528,7 +1528,6 @@ static void __sched_fork(struct task_struct *p)
p->se.avg.runnable_avg_period = 0;
p->se.avg.runnable_avg_sum = 0;
 #endif
-
 #ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c953a89..2d2e2b3 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -95,6 +95,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P(se->avg.runnable_avg_sum);
P(se->avg.runnable_avg_period);
P(se->avg.load_avg_contrib);
+   P(se->avg.decay_count);
 #endif
 #undef PN
 #undef P
@@ -227,6 +228,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
atomic_read(&cfs_rq->tg->load_weight));
SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
+   SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
+   cfs_rq->blocked_load_avg);
 #endif
 
print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77af759..8319417 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -259,6 +259,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
 }
 
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq);
+
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
if (!cfs_rq->on_list) {
@@ -278,6 +280,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
 
cfs_rq->on_list = 1;
+   /* We should have no load, but we need to update last_decay. */
+   update_cfs_rq_blocked_load(cfs_rq);
}
 }
 
@@ -1081,6 +1085,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
return decayed;
 }
 
+/* Synchronize an entity's decay with its parenting cfs_rq.*/
+static inline void __synchronize_entity_decay(struct sched_entity *se)
+{
+   struct cfs_rq *cfs_rq = cfs_rq_of(se);
+   u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+   decays -= se->avg.decay_count;
+   if (!decays)
+   return;
+
+   se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+   se->avg.decay_count = 0;
+}
+
 /* Compute the current contribution to load_avg by se, return any delta */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
@@ -1096,8 +1114,18 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
return se->avg.load_avg_contrib - old_contrib;
 }
 
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+ 

[tip:sched/core] sched: Aggregate load contributed by task entities on parenting cfs_rq

2012-10-24 Thread tip-bot for Paul Turner
Commit-ID:  2dac754e10a5d41d94d2d2365c0345d4f215a266
Gitweb: http://git.kernel.org/tip/2dac754e10a5d41d94d2d2365c0345d4f215a266
Author: Paul Turner 
AuthorDate: Thu, 4 Oct 2012 13:18:30 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Oct 2012 10:27:21 +0200

sched: Aggregate load contributed by task entities on parenting cfs_rq

For a given task t, we can compute its contribution to load as:

  task_load(t) = runnable_avg(t) * weight(t)

On a parenting cfs_rq we can then aggregate:

  runnable_load(cfs_rq) = \Sum task_load(t), for all runnable children t

Maintain this bottom up, with task entities adding their contributed load to
the parenting cfs_rq sum.  When a task entity's load changes we add the same
delta to the maintained sum.
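
A hedged userspace sketch of the delta bookkeeping (field names invented,
arithmetic simplified): each task entity keeps load_avg_contrib derived from
its runnable average and weight, and when that contribution changes only the
delta is applied to the parent cfs_rq's runnable sum:

#include <stdio.h>

struct toy_se {
        unsigned long weight;
        unsigned long runnable_avg_sum;         /* decayed runnable time */
        unsigned long runnable_avg_period;      /* decayed total time */
        long load_avg_contrib;                  /* last posted contribution */
};

struct toy_cfs_rq {
        long runnable_load_avg;                 /* sum of task contributions */
};

/* Recompute se's contribution and return the delta against the old value. */
static long update_contrib(struct toy_se *se)
{
        long old = se->load_avg_contrib;

        se->load_avg_contrib = (long)((unsigned long long)se->runnable_avg_sum *
                                      se->weight /
                                      (se->runnable_avg_period + 1));
        return se->load_avg_contrib - old;
}

int main(void)
{
        struct toy_cfs_rq cfs_rq = { 0 };
        struct toy_se se = { .weight = 1024,
                             .runnable_avg_sum = 50,
                             .runnable_avg_period = 100,
                             .load_avg_contrib = 0 };

        /* Enqueue: post the full contribution. */
        cfs_rq.runnable_load_avg += update_contrib(&se);

        /* Later the task has been runnable more of the time ... */
        se.runnable_avg_sum = 80;
        /* ... so only the delta is added to the maintained sum. */
        cfs_rq.runnable_load_avg += update_contrib(&se);

        printf("runnable_load_avg = %ld\n", cfs_rq.runnable_load_avg);
        return 0;
}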

Signed-off-by: Paul Turner 
Reviewed-by: Ben Segall 
Signed-off-by: Peter Zijlstra 
Link: http://lkml.kernel.org/r/20120823141506.514678...@google.com
Signed-off-by: Ingo Molnar 
---
 include/linux/sched.h |1 +
 kernel/sched/debug.c  |3 ++
 kernel/sched/fair.c   |   51 +---
 kernel/sched/sched.h  |   10 -
 4 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 418fc6d..81d8b1b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1103,6 +1103,7 @@ struct sched_avg {
 */
u32 runnable_avg_sum, runnable_avg_period;
u64 last_runnable_update;
+   unsigned long load_avg_contrib;
 };
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4240abc..c953a89 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 #ifdef CONFIG_SMP
P(se->avg.runnable_avg_sum);
P(se->avg.runnable_avg_period);
+   P(se->avg.load_avg_contrib);
 #endif
 #undef PN
 #undef P
@@ -224,6 +225,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->load_contribution);
SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
atomic_read(&cfs_rq->tg->load_weight));
+   SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
+   cfs_rq->runnable_load_avg);
 #endif
 
print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8c5468f..77af759 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1081,20 +1081,63 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
return decayed;
 }
 
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+   long old_contrib = se->avg.load_avg_contrib;
+
+   if (!entity_is_task(se))
+   return 0;
+
+   se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum *
+se->load.weight,
+se->avg.runnable_avg_period + 1);
+
+   return se->avg.load_avg_contrib - old_contrib;
+}
+
 /* Update a sched_entity's runnable average */
 static inline void update_entity_load_avg(struct sched_entity *se)
 {
-   __update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
-se->on_rq);
+   struct cfs_rq *cfs_rq = cfs_rq_of(se);
+   long contrib_delta;
+
+   if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg,
+ se->on_rq))
+   return;
+
+   contrib_delta = __update_entity_load_avg_contrib(se);
+   if (se->on_rq)
+   cfs_rq->runnable_load_avg += contrib_delta;
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
 }
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se)
+{
+   update_entity_load_avg(se);
+   cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+}
+
+/* Remove se's load from this cfs_rq child load-average */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se)
+{
+   update_entity_load_avg(se);
+   cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+}
 #else
 static inline void update_entity_load_avg(struct sched_entity *se) {}
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struct sched_entity *se) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+ struc