The person who reported this is on vacation right now. I will follow up with more detailed information in two weeks.
///jon > -----Original Message----- > From: netdev-ow...@vger.kernel.org [mailto:netdev- > ow...@vger.kernel.org] On Behalf Of Peter Zijlstra > Sent: Monday, February 12, 2018 16:17 > To: Jon Maloy <jon.ma...@ericsson.com> > Cc: netdev@vger.kernel.org; mi...@kernel.org; David Miller > (da...@davemloft.net) <da...@davemloft.net>; Mike Galbraith > <umgwanakikb...@gmail.com>; Matt Fleming <m...@codeblueprint.co.uk> > Subject: Re: Serious performance degradation in Linux 4.15 > > On Fri, Feb 09, 2018 at 05:59:12PM +0000, Jon Maloy wrote: > > Command for TCP: > > "netperf TCP_STREAM (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t > TCP_STREAM -l 10 -- -O THROUGHPUT)" > > Command for TIPC: > > "netperf TIPC_STREAM (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t > TCP_STREAM -l 10 -- -O THROUGHPUT)" > > That looks like identical tests to me. And my netperf (debian testing) doesn't > appear to have -t TIPC_STREAM. > > Please try a coherent report and I'll have another look. Don't (again) forget > to > mention what kind of setup you're running this on. 
> > > On my IVB-EP (2 sockets, 10 cores, 2 threads), performance cpufreq, PTI=n > RETPOLINE=n, I get: > > > CPUS=`grep -c ^processor /proc/cpuinfo` > > for test in TCP_STREAM > do > for i in 1 $((CPUS/4)) $((CPUS/2)) $((CPUS)) $((CPUS*2)) > do > echo -n $test-$i ": " > > ( > for ((j=0; j<i; j++)) > do > netperf -t $test -4 -c -C -l 60 -P0 | head -1 & > done > > wait > ) | awk '{ n++; v+=$5; } END { print "Avg: " v/n }' > done > done > > > > NO_WA_OLD WA_IDLE WA_WEIGHT: > > TCP_STREAM-1 : Avg: 44139.8 > TCP_STREAM-10 : Avg: 27301.6 > TCP_STREAM-20 : Avg: 12701.5 > TCP_STREAM-40 : Avg: 5711.62 > TCP_STREAM-80 : Avg: 2870.16 > > > WA_OLD NO_WA_IDLE NO_WA_WEIGHT: > > TCP_STREAM-1 : Avg: 25293.1 > TCP_STREAM-10 : Avg: 28196.3 > TCP_STREAM-20 : Avg: 12463.7 > TCP_STREAM-40 : Avg: 5566.83 > TCP_STREAM-80 : Avg: 2630.03 > > --- > include/linux/sched/topology.h | 4 ++ > kernel/sched/fair.c | 99 > +++++++++++++++++++++++++++++++++++++----- > kernel/sched/features.h | 2 + > 3 files changed, 93 insertions(+), 12 deletions(-) > > diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h > index 26347741ba50..2cb74343c252 100644 > --- a/include/linux/sched/topology.h > +++ b/include/linux/sched/topology.h > @@ -72,6 +72,10 @@ struct sched_domain_shared { > atomic_t ref; > atomic_t nr_busy_cpus; > int has_idle_cores; > + > + unsigned long nr_running; > + unsigned long load; > + unsigned long capacity; > }; > > struct sched_domain { > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index > 5eb3ffc9be84..4a561311241a 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -5680,6 +5680,68 @@ static int wake_wide(struct task_struct *p) > return 1; > } > > +struct llc_stats { > + unsigned long nr_running; > + unsigned long load; > + unsigned long capacity; > + int has_capacity; > +}; > + > +static bool get_llc_stats(struct llc_stats *stats, int cpu) { > + struct sched_domain_shared *sds = > +rcu_dereference(per_cpu(sd_llc_shared, cpu)); > + > + 
if (!sds) > + return false; > + > + stats->nr_running = READ_ONCE(sds->nr_running); > + stats->load = READ_ONCE(sds->load); > + stats->capacity = READ_ONCE(sds->capacity); > + stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu); > + > + return true; > +} > + > +static int > +wake_affine_old(struct sched_domain *sd, struct task_struct *p, > + int this_cpu, int prev_cpu, int sync) { > + struct llc_stats prev_stats, this_stats; > + s64 this_eff_load, prev_eff_load; > + unsigned long task_load; > + > + if (!get_llc_stats(&prev_stats, prev_cpu) || > + !get_llc_stats(&this_stats, this_cpu)) > + return nr_cpumask_bits; > + > + if (sync) { > + unsigned long current_load = task_h_load(current); > + if (current_load > this_stats.load) > + return this_cpu; > + > + this_stats.load -= current_load; > + } > + > + if (prev_stats.has_capacity && prev_stats.nr_running < > this_stats.nr_running+1) > + return nr_cpumask_bits; > + > + if (this_stats.has_capacity && this_stats.nr_running+1 < > prev_stats.nr_running) > + return this_cpu; > + > + task_load = task_h_load(p); > + > + this_eff_load = 100; > + this_eff_load *= prev_stats.capacity; > + > + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; > + prev_eff_load *= this_stats.capacity; > + > + this_eff_load *= this_stats.load + task_load; > + prev_eff_load *= prev_stats.load - task_load; > + > + return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; > } > + > /* > * The purpose of wake_affine() is to quickly determine on which CPU we > can run > * soonest. 
For the purpose of speed we only consider the waking and > previous @@ -5756,6 +5818,9 @@ static int wake_affine(struct > sched_domain *sd, struct task_struct *p, > int this_cpu = smp_processor_id(); > int target = nr_cpumask_bits; > > + if (sched_feat(WA_OLD)) > + target = wake_affine_old(sd, p, this_cpu, prev_cpu, sync); > + > if (sched_feat(WA_IDLE)) > target = wake_affine_idle(this_cpu, prev_cpu, sync); > > @@ -6209,18 +6274,20 @@ static int select_idle_sibling(struct task_struct *p, > int prev, int target) > return prev; > > /* Check a recently used CPU as a potential idle candidate */ > - recent_used_cpu = p->recent_used_cpu; > - if (recent_used_cpu != prev && > - recent_used_cpu != target && > - cpus_share_cache(recent_used_cpu, target) && > - idle_cpu(recent_used_cpu) && > - cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { > - /* > - * Replace recent_used_cpu with prev as it is a potential > - * candidate for the next wake. > - */ > - p->recent_used_cpu = prev; > - return recent_used_cpu; > + if (sched_feat(SIS_RECENT)) { > + recent_used_cpu = p->recent_used_cpu; > + if (recent_used_cpu != prev && > + recent_used_cpu != target && > + cpus_share_cache(recent_used_cpu, target) && > + idle_cpu(recent_used_cpu) && > + cpumask_test_cpu(p->recent_used_cpu, &p- > >cpus_allowed)) { > + /* > + * Replace recent_used_cpu with prev as it is a > potential > + * candidate for the next wake. 
> + */ > + p->recent_used_cpu = prev; > + return recent_used_cpu; > + } > } > > sd = rcu_dereference(per_cpu(sd_llc, target)); @@ -7961,6 +8028,7 > @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) > */ > static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats > *sds) { > + struct sched_domain_shared *shared = env->sd->shared; > struct sched_domain *child = env->sd->child; > struct sched_group *sg = env->sd->groups; > struct sg_lb_stats *local = &sds->local_stat; @@ -8032,6 +8100,13 > @@ static inline void update_sd_lb_stats(struct lb_env *env, struct > sd_lb_stats *sd > if (env->dst_rq->rd->overload != overload) > env->dst_rq->rd->overload = overload; > } > + > + if (!shared) > + return; > + > + WRITE_ONCE(shared->nr_running, sds->total_running); > + WRITE_ONCE(shared->load, sds->total_load); > + WRITE_ONCE(shared->capacity, sds->total_capacity); > } > > /** > diff --git a/kernel/sched/features.h b/kernel/sched/features.h index > 9552fd5854bf..bdb0a66caaae 100644 > --- a/kernel/sched/features.h > +++ b/kernel/sched/features.h > @@ -57,6 +57,7 @@ SCHED_FEAT(TTWU_QUEUE, true) > */ > SCHED_FEAT(SIS_AVG_CPU, false) > SCHED_FEAT(SIS_PROP, true) > +SCHED_FEAT(SIS_RECENT, true) > > /* > * Issue a WARN when we do multiple update_rq_clock() calls @@ -82,6 > +83,7 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, > false) SCHED_FEAT(ATTACH_AGE_LOAD, true) > > +SCHED_FEAT(WA_OLD, false) > SCHED_FEAT(WA_IDLE, true) > SCHED_FEAT(WA_WEIGHT, true) > SCHED_FEAT(WA_BIAS, true)