On 12/12/2012 09:31 PM, Vincent Guittot wrote: > During the creation of sched_domain, we define a pack buddy CPU for each CPU > when one is available. We want to pack at all levels where a group of CPUs can > be power gated independently from others. > On a system that can't power gate a group of CPUs independently, the flag is > set at all sched_domain levels and the buddy is set to -1. This is the default > behavior. > On a dual-cluster / dual-core system which can power gate each core and > cluster independently, the buddy configuration will be : > > | Cluster 0 | Cluster 1 | > | CPU0 | CPU1 | CPU2 | CPU3 | > ----------------------------------- > buddy | CPU0 | CPU0 | CPU0 | CPU2 | > > Small tasks tend to slip out of the periodic load balance so the best place > to choose to migrate them is during their wake up. The decision is in O(1) as > we only check against one buddy CPU
Just have a little worry about the scalability on a big machine, like on a 4 sockets NUMA machine * 8 cores * HT machine, the buddy cpu in whole system need care 64 LCPUs. and in your case cpu0 just care 4 LCPU. That is different on task distribution decision. > > Signed-off-by: Vincent Guittot <vincent.guit...@linaro.org> > --- > kernel/sched/core.c | 1 + > kernel/sched/fair.c | 110 > ++++++++++++++++++++++++++++++++++++++++++++++++++ > kernel/sched/sched.h | 5 +++ > 3 files changed, 116 insertions(+) > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 4f36e9d..3436aad 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -5693,6 +5693,7 @@ cpu_attach_domain(struct sched_domain *sd, struct > root_domain *rd, int cpu) > rcu_assign_pointer(rq->sd, sd); > destroy_sched_domains(tmp, cpu); > > + update_packing_domain(cpu); > update_domain_cache(cpu); > } > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 9916d41..fc93d96 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -163,6 +163,73 @@ void sched_init_granularity(void) > update_sysctl(); > } > > + > +#ifdef CONFIG_SMP > +/* > + * Save the id of the optimal CPU that should be used to pack small tasks > + * The value -1 is used when no buddy has been found > + */ > +DEFINE_PER_CPU(int, sd_pack_buddy); > + > +/* Look for the best buddy CPU that can be used to pack small tasks > + * We make the assumption that it doesn't wort to pack on CPU that share the > + * same powerline. We looks for the 1st sched_domain without the > + * SD_SHARE_POWERDOMAIN flag. 
Then We look for the sched_group witht the > lowest > + * power per core based on the assumption that their power efficiency is > + * better */ > +void update_packing_domain(int cpu) > +{ > + struct sched_domain *sd; > + int id = -1; > + > + sd = highest_flag_domain(cpu, SD_SHARE_POWERDOMAIN & SD_LOAD_BALANCE); > + if (!sd) > + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); > + else > + sd = sd->parent; > + > + while (sd && (sd->flags && SD_LOAD_BALANCE)) { > + struct sched_group *sg = sd->groups; > + struct sched_group *pack = sg; > + struct sched_group *tmp; > + > + /* > + * The sched_domain of a CPU points on the local sched_group > + * and the 1st CPU of this local group is a good candidate > + */ > + id = cpumask_first(sched_group_cpus(pack)); > + > + /* loop the sched groups to find the best one */ > + for (tmp = sg->next; tmp != sg; tmp = tmp->next) { > + if (tmp->sgp->power * pack->group_weight > > + pack->sgp->power * tmp->group_weight) > + continue; > + > + if ((tmp->sgp->power * pack->group_weight == > + pack->sgp->power * tmp->group_weight) > + && (cpumask_first(sched_group_cpus(tmp)) >= id)) > + continue; > + > + /* we have found a better group */ > + pack = tmp; > + > + /* Take the 1st CPU of the new group */ > + id = cpumask_first(sched_group_cpus(pack)); > + } > + > + /* Look for another CPU than itself */ > + if (id != cpu) > + break; > + > + sd = sd->parent; > + } > + > + pr_debug("CPU%d packing on CPU%d\n", cpu, id); > + per_cpu(sd_pack_buddy, cpu) = id; > +} > + > +#endif /* CONFIG_SMP */ > + > #if BITS_PER_LONG == 32 > # define WMULT_CONST (~0UL) > #else > @@ -5083,6 +5150,46 @@ static bool numa_allow_migration(struct task_struct > *p, int prev_cpu, int new_cp > return true; > } > > +static bool is_buddy_busy(int cpu) > +{ > + struct rq *rq = cpu_rq(cpu); > + > + /* > + * A busy buddy is a CPU with a high load or a small load with a lot of > + * running tasks. 
> + */ > + return ((rq->avg.runnable_avg_sum << rq->nr_running) > If nr_running a bit big, rq->avg.runnable_avg_sum << rq->nr_running is zero. you will get the wrong decision. > + rq->avg.runnable_avg_period); > +} > + > +static bool is_light_task(struct task_struct *p) > +{ > + /* A light task runs less than 25% in average */ > + return ((p->se.avg.runnable_avg_sum << 1) < > + p->se.avg.runnable_avg_period); 25% may not suitable for big machine. > +} > + > +static int check_pack_buddy(int cpu, struct task_struct *p) > +{ > + int buddy = per_cpu(sd_pack_buddy, cpu); > + > + /* No pack buddy for this CPU */ > + if (buddy == -1) > + return false; > + > + /* buddy is not an allowed CPU */ > + if (!cpumask_test_cpu(buddy, tsk_cpus_allowed(p))) > + return false; > + > + /* > + * If the task is a small one and the buddy is not overloaded, > + * we use buddy cpu > + */ > + if (!is_light_task(p) || is_buddy_busy(buddy)) > + return false; > + > + return true; > +} > > /* > * sched_balance_self: balance the current task (running on cpu) in domains > @@ -5120,6 +5227,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, > int wake_flags) > return p->ideal_cpu; > #endif > > + if (check_pack_buddy(cpu, p)) > + return per_cpu(sd_pack_buddy, cpu); > + > if (sd_flag & SD_BALANCE_WAKE) { > if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) > want_affine = 1; > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index 92ba891..3802fc4 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -892,6 +892,7 @@ extern const struct sched_class idle_sched_class; > > extern void trigger_load_balance(struct rq *rq, int cpu); > extern void idle_balance(int this_cpu, struct rq *this_rq); > +extern void update_packing_domain(int cpu); > > #else /* CONFIG_SMP */ > > @@ -899,6 +900,10 @@ static inline void idle_balance(int cpu, struct rq *rq) > { > } > > +static inline void update_packing_domain(int cpu) > +{ > +} > + > #endif > > extern void sysrq_sched_debug_show(void); 
> _______________________________________________ linaro-dev mailing list linaro-dev@lists.linaro.org http://lists.linaro.org/mailman/listinfo/linaro-dev