of ghost CPUs) but if one CPU is fully used by rt tasks (and its capacity is
reduced to nearly nothing), the capacity factor of the group will still be 4
(div_round_closest(3*1512/1024) = 5 which is cap to 4 with [0]).
So, this patch tries to solve this issue by removing capacity_factor and
replacing it with the 2 following metrics :
-The available CPU's capacity for CFS tasks which is already used by
load_balance.
-The usage of the CPU by the CFS tasks. For the latter, utilization_avg_contrib
has been re-introduced to compute the usage of a CPU by CFS tasks.
group_capacity_factor and group_has_free_capacity has been removed and replaced
by group_no_capacity. We compare the number of task with the number of CPUs and
we evaluate the level of utilization of the CPUs to define if a group is
overloaded or if a group has capacity to handle more tasks.
For SD_PREFER_SIBLING, a group is tagged overloaded if it has more than 1 task
so it will be selected in priority (among the overloaded groups). Since [1],
SD_PREFER_SIBLING is no more concerned by the computation of load_above_capacity
because local is not overloaded.
Finally, the sched_group->sched_group_capacity->capacity_orig has been removed
because it's no more used during load balance.
[1] https://lkml.org/lkml/2014/8/12/295
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/core.c | 12 -----
kernel/sched/fair.c | 150 +++++++++++++++++++++++----------------------------
kernel/sched/sched.h | 2 +-
3 files changed, 69 insertions(+), 95 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 45ae52d..37fb92c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5373,17 +5373,6 @@ static int sched_domain_debug_one(struct sched_domain
*sd, int cpu, int level,
break;
}
- /*
- * Even though we initialize ->capacity to something semi-sane,
- * we leave capacity_orig unset. This allows us to detect if
- * domain iteration is still funny without causing /0 traps.
- */
- if (!group->sgc->capacity_orig) {
- printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_capacity not
set\n");
- break;
- }
-
if (!cpumask_weight(sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
@@ -5868,7 +5857,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int
cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE *
cpumask_weight(sg_span);
- sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ca5656..c04f2b2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5710,11 +5710,10 @@ struct sg_lb_stats {
unsigned long group_capacity;
unsigned long group_usage; /* Total usage of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
- unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
- int group_has_free_capacity;
+ int group_no_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5855,7 +5854,6 @@ static void update_cpu_capacity(struct sched_domain *sd,
int cpu)
capacity >>= SCHED_CAPACITY_SHIFT;
cpu_rq(cpu)->cpu_capacity_orig = capacity;
- sdg->sgc->capacity_orig = capacity;
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
@@ -5871,7 +5869,7 @@ void update_group_capacity(struct sched_domain *sd, int
cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, capacity_orig;
+ unsigned long capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -5883,7 +5881,7 @@ void update_group_capacity(struct sched_domain *sd, int
cpu)
return;
}
- capacity_orig = capacity = 0;
+ capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -5903,19 +5901,15 @@ void update_group_capacity(struct sched_domain *sd, int
cpu)
* Use capacity_of(), which is set irrespective of
domains
* in update_cpu_capacity().
*
- * This avoids capacity/capacity_orig from being 0 and
+ * This avoids capacity from being 0 and
* causing divide-by-zero issues on boot.
- *
- * Runtime updates will correct capacity_orig.
*/
if (unlikely(!rq->sd)) {
- capacity_orig += capacity_orig_of(cpu);
capacity += capacity_of(cpu);
continue;
}
sgc = rq->sd->groups->sgc;
- capacity_orig += sgc->capacity_orig;
capacity += sgc->capacity;
}
} else {
@@ -5926,42 +5920,15 @@ void update_group_capacity(struct sched_domain *sd, int
cpu)
group = child->groups;
do {
- capacity_orig += group->sgc->capacity_orig;
capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
- sdg->sgc->capacity_orig = capacity_orig;
sdg->sgc->capacity = capacity;
}
/*
- * Try and fix up capacity for tiny siblings, this is needed when
- * things like SD_ASYM_PACKING need f_b_g to select another sibling
- * which on its own isn't powerful enough.
- *
- * See update_sd_pick_busiest() and check_asym_packing().
- */
-static inline int
-fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
-{
- /*
- * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
- */
- if (!(sd->flags & SD_SHARE_CPUCAPACITY))
- return 0;
-
- /*
- * If ~90% of the cpu_capacity is still there, we're good.
- */
- if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
- return 1;
-
- return 0;
-}
-
-/*
* Check whether the capacity of the rq has been noticeably reduced by side
* activity. The imbalance_pct is used for the threshold.
* Return true is the capacity is reduced
@@ -6008,37 +5975,54 @@ static inline int sg_imbalanced(struct sched_group
*group)
}
/*
- * Compute the group capacity factor.
- *
- * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
- * first dividing out the smt factor and computing the actual number of cores
- * and limit unit capacity with that.
+ * group_has_capacity returns true if the group has spare capacity that could
+ * be used by some tasks. We consider that a group has spare capacity if the
+ * number of task is smaller than the number of CPUs or if the usage is lower
+ * than the available capacity for CFS tasks. For the latter, we use a
+ * threshold to stabilize the state, to take into account the variance of the
+ * tasks' load and to return true if the available capacity in meaningful for
+ * the load balancer. As an example, an available capacity of 1% can appear
+ * but it doesn't make any benefit for the load balance.
*/
-static inline int sg_capacity_factor(struct lb_env *env, struct sched_group
*group)
+static inline bool
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
{
- unsigned int capacity_factor, smt, cpus;
- unsigned int capacity, capacity_orig;
+ if ((sgs->group_capacity * 100) >
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
- capacity = group->sgc->capacity;
- capacity_orig = group->sgc->capacity_orig;
- cpus = group->group_weight;
+ if (sgs->sum_nr_running < sgs->group_weight)
+ return true;
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
- capacity_factor = cpus / smt; /* cores */
+ return false;
+}
- capacity_factor = min_t(unsigned,
- capacity_factor, DIV_ROUND_CLOSEST(capacity,
SCHED_CAPACITY_SCALE));
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
+/*
+ * group_is_overloaded returns true if the group has more tasks than it can
+ * handle. We consider that a group is overloaded if the number of tasks is
+ * greater than the number of CPUs and the tasks already use all available
+ * capacity for CFS tasks. For the latter, we use a threshold to stabilize
+ * the state, to take into account the variance of tasks' load and to return
+ * true if available capacity is no more meaningful for load balancer
+ */
+static inline bool
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running <= sgs->group_weight)
+ return false;
+
+ if ((sgs->group_capacity * 100) <
+ (sgs->group_usage * env->sd->imbalance_pct))
+ return true;
- return capacity_factor;
+ return false;
}
-static enum group_type
-group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
+static enum group_type group_classify(struct lb_env *env,
+ struct sched_group *group,
+ struct sg_lb_stats *sgs)
{
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
+ if (sgs->group_no_capacity)
return group_overloaded;
if (sg_imbalanced(group))
@@ -6099,11 +6083,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->load_per_task = sgs->sum_weighted_load /
sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
- sgs->group_capacity_factor = sg_capacity_factor(env, group);
- sgs->group_type = group_classify(group, sgs);
- if (sgs->group_capacity_factor > sgs->sum_nr_running)
- sgs->group_has_free_capacity = 1;
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
+ sgs->group_type = group_classify(env, group, sgs);
}
/**
@@ -6225,17 +6207,20 @@ static inline void update_sd_lb_stats(struct lb_env
*env, struct sd_lb_stats *sd
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity factor to one so that we'll try
+ * first, lower the sg capacity so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity_factor.
The
- * extra check prevents the case where you always pull from the
- * heaviest group when it is already under-utilized (possible
- * with a large weight task outweighs the tasks on the system).
+ * these excess tasks. The extra check prevents the case where
+ * you always pull from the heaviest group when it is already
+ * under-utilized (possible with a large weight task outweighs
+ * the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity)
- sgs->group_capacity_factor =
min(sgs->group_capacity_factor, 1U);
+ group_has_capacity(env, &sds->local_stat) &&
+ (sgs->sum_nr_running > 1)) {
+ sgs->group_no_capacity = 1;
+ sgs->group_type = group_overloaded;
+ }
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -6414,11 +6399,12 @@ static inline void calculate_imbalance(struct lb_env
*env, struct sd_lb_stats *s
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
- load_above_capacity =
- (busiest->sum_nr_running -
busiest->group_capacity_factor);
-
- load_above_capacity *= (SCHED_LOAD_SCALE *
SCHED_CAPACITY_SCALE);
- load_above_capacity /= busiest->group_capacity;
+ load_above_capacity = busiest->sum_nr_running *
+ SCHED_LOAD_SCALE;
+ if (load_above_capacity > busiest->group_capacity)
+ load_above_capacity -= busiest->group_capacity;
+ else
+ load_above_capacity = ~0UL;
}
/*
@@ -6481,6 +6467,7 @@ static struct sched_group *find_busiest_group(struct
lb_env *env)
local = &sds.local_stat;
busiest = &sds.busiest_stat;
+ /* ASYM feature bypasses nice load balance check */
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
check_asym_packing(env, &sds))
return sds.busiest;
@@ -6501,8 +6488,8 @@ static struct sched_group *find_busiest_group(struct
lb_env *env)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
- !busiest->group_has_free_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ busiest->group_no_capacity)
goto force_balance;
/*
@@ -6561,7 +6548,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long capacity, capacity_factor, wl;
+ unsigned long capacity, wl;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6590,9 +6577,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
continue;
capacity = capacity_of(i);
- capacity_factor = DIV_ROUND_CLOSEST(capacity,
SCHED_CAPACITY_SCALE);
- if (!capacity_factor)
- capacity_factor = fix_small_capacity(env->sd, group);
wl = weighted_cpuload(i);
@@ -6600,7 +6584,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu capacity.
*/
- if (capacity_factor && rq->nr_running == 1 && wl >
env->imbalance)
+
+ if (rq->nr_running == 1 && wl > env->imbalance &&
+ !check_cpu_capacity(rq, env->sd))
continue;
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 35eca7d..7963981 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -759,7 +759,7 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity, capacity_orig;
+ unsigned int capacity;
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*