On Fri, 2017-11-24 at 19:46 +0100, Mike Galbraith wrote:
> 
> My view is you're barking up the wrong tree: you're making the idle
> data SIS is using more accurate, but I question the benefit.  That it
> makes an imperfect placement decision occasionally due to raciness is
> nearly meaningless compared to the cost of frequent bounce.
Playing with SIS (yet again), I ended up with the hack below, which
illustrates why I think the occasional races are nearly meaningless.
Box = i4790 desktop; masterx is master plus the hack.

netperf                   master      masterx
TCP_STREAM-1    Avg:       70495        71295
TCP_STREAM-2    Avg:       54388        66202
TCP_STREAM-4    Avg:       19316        21413
TCP_STREAM-8    Avg:        9678         8894
TCP_STREAM-16   Avg:        4286         4360
TCP_MAERTS-1    Avg:       69238        71799
TCP_MAERTS-2    Avg:       50729        65612
TCP_MAERTS-4    Avg:       19095        21984
TCP_MAERTS-8    Avg:        9405         8888
TCP_MAERTS-16   Avg:        4891         4371
TCP_RR-1        Avg:      198617       203291
TCP_RR-2        Avg:      152862       191761
TCP_RR-4        Avg:      112241       117888
TCP_RR-8        Avg:      104453       113260
TCP_RR-16       Avg:       50897        55280
UDP_RR-1        Avg:      250738       264214
UDP_RR-2        Avg:      196250       253352
UDP_RR-4        Avg:      152862       158819
UDP_RR-8        Avg:      143781       154071
UDP_RR-16       Avg:       68605        76492

tbench (MB/sec)      1        2        4        8       16
master             772     1207     1829     3516     3440
masterx            811     1466     1959     3737     3670

hackbench -l 10000 (sec)
5.917  5.990  5.957   avg 5.954   NO_SIS_DEBOUNCE
5.886  5.808  5.826   avg 5.840   SIS_DEBOUNCE

sched_migrate_task counts for tbench 2, commands run from the tracefs
mount point (/sys/kernel/debug/tracing):

echo 0 > tracing_on
echo 1 > events/sched/sched_migrate_task/enable
start endless tbench 2
for i in `seq 3`
do
	echo > trace
	echo 1 > tracing_on
	sleep 10
	echo 0 > tracing_on
	cat trace | grep tbench | wc -l
done

kde desktop idling
NO_SIS_DEBOUNCE    261    208    199
SIS_DEBOUNCE         8      6      0

add firefox playing youtube documentary
NO_SIS_DEBOUNCE  10906  10094  10774
SIS_DEBOUNCE        34     34     34

tbench 2 throughput as firefox runs
NO_SIS_DEBOUNCE  1129.63 MB/sec
SIS_DEBOUNCE     1462.53 MB/sec

Advisory: welding goggles.

---
 include/linux/sched.h   |    3 ++-
 kernel/sched/fair.c     |   41 +++++++++++++++++++++++++++++++++++++++--
 kernel/sched/features.h |    1 +
 3 files changed, 42 insertions(+), 3 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -541,7 +541,6 @@ struct task_struct {
 	unsigned int			ptrace;
 
 #ifdef CONFIG_SMP
-	struct llist_node		wake_entry;
 	int				on_cpu;
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/* Current CPU: */
@@ -549,8 +548,10 @@ struct task_struct {
 #endif
 	unsigned int			wakee_flips;
 	unsigned long			wakee_flip_decay_ts;
+	unsigned long			wakee_placed;
 	struct task_struct		*last_wakee;
 
+	struct llist_node		wake_entry;
 	int				wake_cpu;
 #endif
 	int				on_rq;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6174,6 +6174,9 @@ static int select_idle_sibling(struct ta
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
 
+	if (sched_feat(SIS_DEBOUNCE))
+		p->wakee_placed = jiffies;
+
 	return target;
 }
 
@@ -6258,6 +6261,22 @@ static int wake_cap(struct task_struct *
 	return min_cap * 1024 < task_util(p) * capacity_margin;
 }
 
+static bool task_placed(struct task_struct *p)
+{
+	return p->wakee_placed == jiffies;
+}
+
+static bool task_llc_affine_and_cold(struct task_struct *p, int cpu, int prev)
+{
+	int cold = sysctl_sched_migration_cost;
+
+	if (!cpus_share_cache(cpu, prev))
+		return false;
+	if (cold > 0 && rq_clock_task(cpu_rq(prev)) - p->se.exec_start > cold)
+		return true;
+	return false;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -6276,16 +6295,26 @@ select_task_rq_fair(struct task_struct *
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
 	int new_cpu = prev_cpu;
-	int want_affine = 0;
+	int want_affine = 0, want_debounce = 0;
 	int sync = wake_flags & WF_SYNC;
 
+	rcu_read_lock();
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
+		want_debounce = sched_feat(SIS_DEBOUNCE);
+		if (task_placed(p))
+			goto out_unlock;
+		/* Balance cold tasks to reduce hot task bounce tendency. */
+		if (want_debounce && task_llc_affine_and_cold(p, cpu, prev_cpu)) {
+			sd_flag |= SD_SHARE_PKG_RESOURCES;
+			sd = highest_flag_domain(prev_cpu, SD_SHARE_PKG_RESOURCES);
+			p->wakee_placed = jiffies;
+			goto pick_cpu_cold;
+		}
 		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
 			      && cpumask_test_cpu(cpu, &p->cpus_allowed);
 	}
 
-	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			break;
@@ -6315,6 +6344,7 @@ select_task_rq_fair(struct task_struct *
 			new_cpu = cpu;
 	}
 
+pick_cpu_cold:
 	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
 		/*
 		 * We're going to need the task's util for capacity_spare_wake
@@ -6329,9 +6359,13 @@ select_task_rq_fair(struct task_struct *
 
 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
 			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+		if (want_debounce && new_cpu == prev_cpu)
+			p->wakee_placed = jiffies;
+
 	} else {
 		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
 	}
+out_unlock:
 	rcu_read_unlock();
 
 	return new_cpu;
@@ -6952,6 +6986,9 @@ static int task_hot(struct task_struct *
 	if (sysctl_sched_migration_cost == 0)
 		return 0;
 
+	if (task_placed(p))
+		return 1;
+
 	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
 
 	return delta < (s64)sysctl_sched_migration_cost;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -57,6 +57,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_AVG_CPU, false)
 SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_DEBOUNCE, true)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
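
FWIW, for anyone wanting to A/B this themselves: the new bit is a plain
sched_features knob, so on a CONFIG_SCHED_DEBUG kernel it can be flipped
at runtime, along these lines (path assumes debugfs is mounted at
/sys/kernel/debug):

# show current feature bits; SIS_DEBOUNCE or NO_SIS_DEBOUNCE among them
cat /sys/kernel/debug/sched_features
# turn the hack off (stock behavior)
echo NO_SIS_DEBOUNCE > /sys/kernel/debug/sched_features
# and back on
echo SIS_DEBOUNCE > /sys/kernel/debug/sched_features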