On Thu, 15 Apr 2021 at 19:58, Valentin Schneider <valentin.schnei...@arm.com> wrote: > > Consider the following (hypothetical) asymmetric CPU capacity topology, > with some amount of capacity pressure (RT | DL | IRQ | thermal): > > DIE [ ] > MC [ ][ ] > 0 1 2 3 > > | CPU | capacity_orig | capacity | > |-----+---------------+----------| > | 0 | 870 | 860 | > | 1 | 870 | 600 | > | 2 | 1024 | 850 | > | 3 | 1024 | 860 | > > If CPU1 has a misfit task, then CPU0, CPU2 and CPU3 are valid candidates to > grant the task an uplift in CPU capacity. Consider CPU0 and CPU3 as > sufficiently busy, i.e. don't have enough spare capacity to accommodate > CPU1's misfit task. This would then fall on CPU2 to pull the task. > > This currently won't happen, because CPU2 will fail > > capacity_greater(capacity_of(CPU2), sg->sgc->max_capacity) > > in update_sd_pick_busiest(), where 'sg' is the [0, 1] group at DIE > level. In this case, the max_capacity is that of CPU0's, which is at this > point in time greater than that of CPU2's. This comparison doesn't make > much sense, given that the only CPUs we should care about in this scenario > are CPU1 (the CPU with the misfit task) and CPU2 (the load-balance > destination CPU). > > Aggregate a misfit task's load into sgs->group_misfit_task_load only if > env->dst_cpu would grant it a capacity uplift. > > Note that the aforementioned capacity vs sgc->max_capacity comparison was > meant to prevent misfit task downmigration: candidate groups classified as > group_misfit_task but with a higher (max) CPU capacity than the destination > CPU > would be discarded. This change makes it so said group_misfit_task > classification can't happen anymore, which may cause some undesired > downmigrations. > > Further tweak find_busiest_queue() to ensure this doesn't happen. Also note > find_busiest_queue() can now iterate over CPUs with a higher capacity than > the local CPU's, so add a capacity check there. 
> > Signed-off-by: Valentin Schneider <valentin.schnei...@arm.com> > --- > kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++------------- > 1 file changed, 45 insertions(+), 18 deletions(-) > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 9b8ae02f1994..d2d1a69d7aa7 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -5759,6 +5759,12 @@ static unsigned long capacity_of(int cpu) > return cpu_rq(cpu)->cpu_capacity; > } > > +/* Is CPU a's capacity noticeably greater than CPU b's? */ > +static inline bool cpu_capacity_greater(int a, int b) > +{ > + return capacity_greater(capacity_of(a), capacity_of(b)); > +} > + > static void record_wakee(struct task_struct *p) > { > /* > @@ -7486,6 +7492,7 @@ struct lb_env { > > enum fbq_type fbq_type; > enum migration_type migration_type; > + enum group_type src_grp_type; > struct list_head tasks; > }; > > @@ -8447,6 +8454,32 @@ static bool update_nohz_stats(struct rq *rq) > #endif > } > > +static inline void update_sg_lb_misfit_stats(struct lb_env *env, > + struct sched_group *group, > + struct sg_lb_stats *sgs, > + int *sg_status, > + int cpu) > +{ > + struct rq *rq = cpu_rq(cpu); > + > + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY) || > + !rq->misfit_task_load) > + return; > + > + *sg_status |= SG_OVERLOAD; > + > + /* > + * Don't attempt to maximize load for misfit tasks that can't be > + * granted a CPU capacity uplift. > + */ > + if (cpu_capacity_greater(env->dst_cpu, cpu)) { > + sgs->group_misfit_task_load = max( > + sgs->group_misfit_task_load, > + rq->misfit_task_load); > + } > + > +} > + > /** > * update_sg_lb_stats - Update sched_group's statistics for load balancing. > * @env: The load balancing environment. 
> @@ -8498,12 +8531,7 @@ static inline void update_sg_lb_stats(struct lb_env > *env, > if (local_group) > continue; > > - /* Check for a misfit task on the cpu */ > - if (env->sd->flags & SD_ASYM_CPUCAPACITY && > - sgs->group_misfit_task_load < rq->misfit_task_load) { > - sgs->group_misfit_task_load = rq->misfit_task_load; > - *sg_status |= SG_OVERLOAD; > - } > + update_sg_lb_misfit_stats(env, group, sgs, sg_status, i); > } > > /* Check if dst CPU is idle and preferred to this group */ > @@ -8550,15 +8578,9 @@ static bool update_sd_pick_busiest(struct lb_env *env, > if (!sgs->sum_h_nr_running) > return false; > > - /* > - * Don't try to pull misfit tasks we can't help. > - * We can use max_capacity here as reduction in capacity on some > - * CPUs in the group should either be possible to resolve > - * internally or be covered by avg_load imbalance (eventually). > - */ > + /* Don't try to pull misfit tasks we can't help */ > if (sgs->group_type == group_misfit_task && > - (!capacity_greater(capacity_of(env->dst_cpu), > sg->sgc->max_capacity) || > - sds->local_stat.group_type != group_has_spare)) > + sds->local_stat.group_type != group_has_spare) > return false; > > if (sgs->group_type > busiest->group_type) > @@ -9288,6 +9310,8 @@ static struct sched_group *find_busiest_group(struct > lb_env *env) > if (!sds.busiest) > goto out_balanced; > > + env->src_grp_type = busiest->group_type; > + > /* Misfit tasks should be dealt with regardless of the avg load */ > if (busiest->group_type == group_misfit_task) > goto force_balance; > @@ -9441,8 +9465,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, > * average load. 
> */ > if (env->sd->flags & SD_ASYM_CPUCAPACITY && > - !capacity_greater(capacity_of(env->dst_cpu), capacity) && > - nr_running == 1) > + env->src_grp_type <= group_fully_busy && > + !capacity_greater(capacity_of(env->dst_cpu), capacity)) > continue; > > switch (env->migration_type) { > @@ -9504,15 +9528,18 @@ static struct rq *find_busiest_queue(struct lb_env > *env, > case migrate_misfit: > /* > * For ASYM_CPUCAPACITY domains with misfit tasks we > - * simply seek the "biggest" misfit task. > + * simply seek the "biggest" misfit task we can > + * accommodate. > */ > + if (!cpu_capacity_greater(env->dst_cpu, i))
Use the same level of interface as above. This makes the code and the condition easier to follow in find_busiest_queue(): capacity_greater(capacity_of(env->dst_cpu), capacity_of(i)) > + continue; > + > if (rq->misfit_task_load > busiest_load) { > busiest_load = rq->misfit_task_load; > busiest = rq; > } > > break; > - > } > } > > -- > 2.25.1 >