This refactoring attempts to achieve:

- Decouple waker/wakee with the three kinds of wakeup SD_* flags
  (SD_BALANCE_WAKE/FORK/EXEC).
- Form a complete topology view in the select path.
- Determine fast idle select vs. slow avg select with more info.

To enable this refactoring:

  echo NEW_SELECT > sched_features
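As a rough illustration (not part of the patch; the llc_size value is made
up and the decay of the counters is elided), the wakee bookkeeping and the
wakee_count/wakee_flips ratio that wake_wide2() below checks against
sd_llc_size can be sketched in userspace:

/*
 * Userspace sketch of the wakee bookkeeping (record_wakee) and the
 * ratio test used by wake_wide2(). Kernel details (jiffies decay,
 * per-cpu sd_llc_size) are stubbed; llc_size = 8 is illustrative.
 */
#include <stdio.h>

struct task {
        unsigned int wakee_flips;       /* distinct-wakee switches, decayed */
        unsigned int wakee_count;       /* total wakeups, decayed */
        int last_wakee;                 /* id of the task last woken */
};

/* Called on every wakeup; the once-per-HZ decay is elided here. */
static void record_wakee(struct task *waker, int wakee_id)
{
        if (waker->last_wakee != wakee_id) {
                waker->last_wakee = wakee_id;
                waker->wakee_flips++;
        }
        waker->wakee_count++;
}

/*
 * wakee_count / wakee_flips ~= wakeups per change of wakee: large for
 * a stable 1:1 waker/wakee pair, near 1 for a 1:N waker. wake_wide2()
 * compares this ratio against sd_llc_size.
 */
static unsigned int wakeups_per_flip(const struct task *t)
{
        if (t->wakee_flips)
                return t->wakee_count / t->wakee_flips;
        return !!t->wakee_count;
}

int main(void)
{
        struct task one_to_one = { .last_wakee = -1 };
        struct task one_to_many = { .last_wakee = -1 };
        int llc_size = 8;       /* stands in for this_cpu_read(sd_llc_size) */
        int i;

        for (i = 0; i < 64; i++) {
                record_wakee(&one_to_one, 1);           /* same partner */
                record_wakee(&one_to_many, i % 16);     /* 16 partners */
        }

        printf("1:1 pair: ratio %u, exceeds llc_size? %s\n",
               wakeups_per_flip(&one_to_one),
               wakeups_per_flip(&one_to_one) > llc_size ? "yes" : "no");
        printf("1:N pair: ratio %u, exceeds llc_size? %s\n",
               wakeups_per_flip(&one_to_many),
               wakeups_per_flip(&one_to_many) > llc_size ? "yes" : "no");
        return 0;
}

The ratio is the extra signal, beyond the raw wakee_flips already used by
wake_wide(), that wake_wide2() has available.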
Signed-off-by: Yuyang Du <yuyang...@intel.com>
---
 include/linux/sched.h   |    1 +
 kernel/sched/fair.c     |  284 ++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/features.h |    1 +
 3 files changed, 285 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4b2a0fa..c4b4b90 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1475,6 +1475,7 @@ struct task_struct {
         struct llist_node wake_entry;
         int on_cpu;
         unsigned int wakee_flips;
+        unsigned int wakee_count;
         unsigned long wakee_flip_decay_ts;
         struct task_struct *last_wakee;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f15461f..1ab41b8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4986,12 +4986,14 @@ static void record_wakee(struct task_struct *p)
          */
         if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
                 current->wakee_flips >>= 1;
+                current->wakee_count >>= 1;
                 current->wakee_flip_decay_ts = jiffies;
         }
 
         if (current->last_wakee != p) {
                 current->last_wakee = p;
                 current->wakee_flips++;
         }
+        current->wakee_count++;
 }
 
@@ -5025,6 +5027,45 @@ static int wake_wide(struct task_struct *p)
         return 1;
 }
 
+#define TP_IDENT        0x1        /* waker is wakee */
+#define TP_SMT          0x2        /* SMT sibling */
+#define TP_LLC          0x4        /* cpus_share_cache(waker, wakee) */
+#define TP_SHARE_CACHE  0x7        /* waker and wakee share cache */
+#define TP_LLC_SOCK     0x10       /* all CPUs have complete LLC coverage */
+#define TP_NOLLC_SOCK   0x20       /* !TP_LLC_SOCK */
+#define TP_NOLLC_XSOCK  0x40       /* cross socket */
+
+static int wake_wide2(int topology, struct task_struct *p)
+{
+        unsigned int master = current->wakee_count;
+        unsigned int slave = p->wakee_count;
+        unsigned int master_flips = current->wakee_flips;
+        unsigned int slave_flips = p->wakee_flips;
+        int factor = this_cpu_read(sd_llc_size);
+
+        if (master_flips)
+                master /= master_flips;
+        else
+                master = !!master;
+
+        if (slave_flips)
+                slave /= slave_flips;
+        else
+                slave = !!slave;
+
+        if (master < slave)
+                swap(master, slave);
+
+        /*
+         * The system is either TP_NOLLC_SOCK or TP_NOLLC_XSOCK; the waker
+         * and wakee may still share some cache though.
+         */
+        if (topology & TP_SHARE_CACHE)
+                return master > factor;
+
+        return slave > factor;
+}
+
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
         s64 this_load, load;
@@ -5399,6 +5440,20 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
         return cpu;
 }
 
+static int select_idle_cpu2(struct task_struct *p, struct sched_domain *sd, int target)
+{
+        int cpu, wrap;
+
+        for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+                if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+                        continue;
+                if (idle_cpu(cpu))
+                        break;
+        }
+
+        return cpu;
+}
+
 /*
  * Try and locate an idle core/thread in the LLC cache domain.
  */
@@ -5424,7 +5479,10 @@ static int select_idle_sibling(struct task_struct *p, int target)
         if ((unsigned)i < nr_cpumask_bits)
                 return i;
 
-        i = select_idle_cpu(p, sd, target);
+        if (sched_feat(NEW_SELECT))
+                i = select_idle_cpu2(p, sd, target);
+        else
+                i = select_idle_cpu(p, sd, target);
         if ((unsigned)i < nr_cpumask_bits)
                 return i;
 
@@ -5470,6 +5528,227 @@ static int cpu_util(int cpu)
 }
 
 /*
+ * XXX just copied from wake_affine().
+ * use util other than load for some topologies?
+ * real-time vs. avg
+ */
+static int
+wake_waker(int topology, struct sched_domain *sd, struct task_struct *p, int sync)
+{
+        s64 this_load, load;
+        s64 this_eff_load, prev_eff_load;
+        int idx, this_cpu, prev_cpu;
+        struct task_group *tg;
+        unsigned long weight;
+        int balanced;
+
+        idx = sd->wake_idx;
+        this_cpu = smp_processor_id();
+        prev_cpu = task_cpu(p);
+        load = source_load(prev_cpu, idx);
+        this_load = target_load(this_cpu, idx);
+
+        /*
+         * If sync wakeup then subtract the (maximum possible)
+         * effect of the currently running task from the load
+         * of the current CPU:
+         */
+        if (sync) {
+                tg = task_group(current);
+                weight = current->se.avg.load_avg;
+
+                this_load += effective_load(tg, this_cpu, -weight, -weight);
+                load += effective_load(tg, prev_cpu, 0, -weight);
+        }
+
+        tg = task_group(p);
+        weight = p->se.avg.load_avg;
+
+        /*
+         * In low-load situations, where prev_cpu is idle and this_cpu is idle
+         * due to the sync cause above having dropped this_load to 0, we'll
+         * always have an imbalance, but there's really nothing you can do
+         * about that, so that's good too.
+         *
+         * Otherwise check if either cpus are near enough in load to allow this
+         * task to be woken on this_cpu.
+         */
+        this_eff_load = 100;
+        this_eff_load *= capacity_of(prev_cpu);
+
+        prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+        prev_eff_load *= capacity_of(this_cpu);
+
+        if (this_load > 0) {
+                this_eff_load *= this_load +
+                        effective_load(tg, this_cpu, weight, weight);
+
+                prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+        }
+
+        balanced = this_eff_load <= prev_eff_load;
+
+        schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+
+        if (!balanced)
+                return 0;
+
+        schedstat_inc(sd, ttwu_move_affine);
+        schedstat_inc(p, se.statistics.nr_wakeups_affine);
+
+        return 1;
+}
+
+static int wake_idle(struct sched_domain *sd)
+{
+        u64 avg_cost = sd->avg_scan_cost, avg_idle = this_rq()->avg_idle;
+
+        /*
+         * Due to large variance we need a large fuzz factor; hackbench in
+         * particular is sensitive here.
+         */
+        if ((avg_idle / 512) < avg_cost)
+                return 0;
+
+        return 1;
+}
+
+static inline int
+waker_wakee_topology(int waker, int wakee, int *allowed, struct task_struct *p)
+{
+        int topology = 0;
+
+        if (static_branch_likely(&sched_llc_complete))
+                topology = TP_LLC_SOCK;
+
+        if (waker == wakee)
+                return TP_IDENT | topology;
+
+        *allowed = cpumask_test_cpu(waker, tsk_cpus_allowed(p));
+
+#ifdef CONFIG_SCHED_SMT
+        if (static_branch_likely(&sched_smt_present) &&
+            cpumask_test_cpu(waker, cpu_smt_mask(wakee)))
+                return TP_SMT | topology;
+#endif
+        if (cpus_share_cache(wakee, waker))
+                return TP_LLC | topology;
+
+        /* We are here without TP_LLC_SOCK for sure */
+        if (unlikely(cpus_share_socket(wakee, waker)))
+                return TP_NOLLC_SOCK;
+
+        return TP_NOLLC_XSOCK;
+}
+
+/*
+ * Notes:
+ * - how one-time locally suboptimal selects approach overall globally
+ *   optimal selects
+ * - certain randomness
+ * - spread vs. consolidate
+ * - load vs. util, real-time vs. avg
+ * - use topology more
+ */
+static int
+select_task_rq_fair2(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+{
+        struct sched_domain *tmp, *sd = NULL, *this_sd_llc = NULL;
+        int waker_allowed = 1, select_idle = 0;
+        int cpu = smp_processor_id(), new_cpu = prev_cpu;
+        int sync = wake_flags & WF_SYNC;
+        int topology = waker_wakee_topology(cpu, prev_cpu, &waker_allowed, p);
+
+        record_wakee(p);
+
+        rcu_read_lock();
+
+        for_each_domain(cpu, tmp) {
+                /* Stop if the domain flags say no */
+                if (!(tmp->flags & SD_LOAD_BALANCE) || !(tmp->flags & sd_flag))
+                        break;
+
+                sd = tmp;
+        }
+
+        if (!sd)
+                goto out_unlock;
+
+        this_sd_llc = rcu_dereference(*this_cpu_ptr(&sd_llc));
+        if (!this_sd_llc || (sd->span_weight < this_sd_llc->span_weight))
+                goto select_avg;
+
+        /* Now we can attempt to fast select an idle CPU */
+        if (topology & TP_LLC_SOCK) {
+
+                if (wake_idle(this_sd_llc))
+                        select_idle = 1;
+
+        } else if (!wake_wide2(topology, p))
+                select_idle = 1;
+
+        if (topology > TP_IDENT && waker_allowed &&
+            wake_waker(topology, this_sd_llc, p, sync))
+                new_cpu = cpu;
+
+        if (select_idle) {
+                /*
+                 * Scan the LLC domain for idle CPUs; this is dynamically
+                 * regulated by comparing the average scan cost (tracked in
+                 * this_sd_llc->avg_scan_cost) against the average idle time
+                 * for this rq (as found in this_rq->avg_idle).
+                 */
+                u64 time = local_clock();
+
+                new_cpu = select_idle_sibling(p, new_cpu);
+                time = local_clock() - time;
+                this_sd_llc->avg_scan_cost +=
+                        (s64)(time - this_sd_llc->avg_scan_cost) / 8;
+
+                goto out_unlock;
+        }
+
+select_avg:
+        while (sd) {
+                struct sched_group *group;
+                int weight;
+
+                if (!(sd->flags & sd_flag)) {
+                        sd = sd->child;
+                        continue;
+                }
+
+                group = find_idlest_group(sd, p, cpu, sd_flag);
+                if (!group) {
+                        sd = sd->child;
+                        continue;
+                }
+
+                new_cpu = find_idlest_cpu(group, p, cpu);
+                if (new_cpu == -1 || new_cpu == cpu) {
+                        /* Now try balancing at a lower domain level of cpu */
+                        sd = sd->child;
+                        continue;
+                }
+
+                /* Now try balancing at a lower domain level of new_cpu */
+                cpu = new_cpu;
+                weight = sd->span_weight;
+                sd = NULL;
+                for_each_domain(cpu, tmp) {
+                        if (weight <= tmp->span_weight)
+                                break;
+                        if (tmp->flags & sd_flag)
+                                sd = tmp;
+                }
+                /* while loop will break here if sd == NULL */
+        }
+
+out_unlock:
+        rcu_read_unlock();
+
+        return new_cpu;
+}
+
+/*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
@@ -5489,6 +5768,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
         int want_affine = 0;
         int sync = wake_flags & WF_SYNC;
 
+        if (sched_feat(NEW_SELECT))
+                return select_task_rq_fair2(p, prev_cpu, sd_flag, wake_flags);
+
         if (sd_flag & SD_BALANCE_WAKE) {
                 record_wakee(p);
                 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa..ea41750 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(NEW_SELECT, false)
--
1.7.9.5
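
For completeness, the scan-cost regulation around select_idle_sibling()
(see the avg_scan_cost comment in select_task_rq_fair2() above) can be
sketched standalone. This is an illustration only, not part of the patch;
all nanosecond inputs are invented:

/*
 * Standalone illustration of the fast idle-scan gating: avg_scan_cost
 * is an EWMA (weight 1/8) of observed scan times, and wake_idle()
 * skips the scan when the rq's average idle time, divided by the 512
 * fuzz factor, falls below that cost.
 */
#include <stdio.h>
#include <stdint.h>

static int64_t avg_scan_cost;        /* mirrors sd->avg_scan_cost */

/* mirrors wake_idle(): is a full LLC scan expected to pay off? */
static int wake_idle(uint64_t avg_idle)
{
        return (avg_idle / 512) >= (uint64_t)avg_scan_cost;
}

/* mirrors the avg_scan_cost update after select_idle_sibling() */
static void account_scan(uint64_t scan_time)
{
        avg_scan_cost += ((int64_t)scan_time - avg_scan_cost) / 8;
}

int main(void)
{
        /* a burst of expensive scans drives the average up ... */
        uint64_t scans[] = { 2000, 2500, 60000, 80000, 90000, 100000 };
        uint64_t avg_idle = 10 * 1000 * 1000;        /* 10ms rq idle time */
        unsigned int i;

        for (i = 0; i < sizeof(scans) / sizeof(scans[0]); i++) {
                account_scan(scans[i]);
                printf("scan %6lu ns -> avg_scan_cost %6ld, scan idle? %s\n",
                       (unsigned long)scans[i], (long)avg_scan_cost,
                       wake_idle(avg_idle) ? "yes" : "no");
        }
        return 0;
}

Once a few expensive scans push avg_scan_cost past avg_idle/512 (about
19.5us for a 10ms avg_idle), wake_idle() returns 0, select_idle stays 0,
and select_task_rq_fair2() falls through to the slow avg select.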