RE: Serious performance degradation in Linux 4.15

Jon Maloy Tue, 13 Feb 2018 00:14:53 -0800

The person who reported this is on vacation right now. I will be back with more 
detailed info in two weeks.


///jon

> -----Original Message-----
> From: netdev-ow...@vger.kernel.org [mailto:netdev-
> ow...@vger.kernel.org] On Behalf Of Peter Zijlstra
> Sent: Monday, February 12, 2018 16:17
> To: Jon Maloy <jon.ma...@ericsson.com>
> Cc: netdev@vger.kernel.org; mi...@kernel.org; David Miller
> (da...@davemloft.net) <da...@davemloft.net>; Mike Galbraith
> <umgwanakikb...@gmail.com>; Matt Fleming <m...@codeblueprint.co.uk>
> Subject: Re: Serious performance degradation in Linux 4.15
> 
> On Fri, Feb 09, 2018 at 05:59:12PM +0000, Jon Maloy wrote:
> > Command for TCP:
> > "netperf TCP_STREAM  (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t TCP_STREAM -l 10 -- -O THROUGHPUT)"
> > Command for TIPC:
> > "netperf TIPC_STREAM (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t TCP_STREAM -l 10 -- -O THROUGHPUT)"
> 
> That looks like identical tests to me. And my netperf (debian testing) doesn't
> appear to have -t TIPC_STREAM.
> 
> Please try a coherent report and I'll have another look. Don't (again) forget to
> mention what kind of setup you're running this on.
> 
> 
> On my IVB-EP (2 sockets, 10 cores, 2 threads), performance cpufreq, PTI=n
> RETPOLINE=n, I get:
> 
> 
> CPUS=`grep -c ^processor /proc/cpuinfo`
> 
> for test in TCP_STREAM
> do
>         for i in 1 $((CPUS/4)) $((CPUS/2)) $((CPUS)) $((CPUS*2))
>         do
>                 echo -n $test-$i ": "
> 
>                 (
>                   for ((j=0; j<i; j++))
>                   do
>                         netperf -t $test -4 -c -C -l 60 -P0 | head -1 &
>                   done
> 
>                   wait
>                 ) | awk '{ n++; v+=$5; } END { print "Avg: " v/n }'
>         done
> done
> 
> 
> 
> NO_WA_OLD WA_IDLE WA_WEIGHT:
> 
> TCP_STREAM-1 : Avg: 44139.8
> TCP_STREAM-10 : Avg: 27301.6
> TCP_STREAM-20 : Avg: 12701.5
> TCP_STREAM-40 : Avg: 5711.62
> TCP_STREAM-80 : Avg: 2870.16
> 
> 
> WA_OLD NO_WA_IDLE NO_WA_WEIGHT:
> 
> TCP_STREAM-1 : Avg: 25293.1
> TCP_STREAM-10 : Avg: 28196.3
> TCP_STREAM-20 : Avg: 12463.7
> TCP_STREAM-40 : Avg: 5566.83
> TCP_STREAM-80 : Avg: 2630.03
> 
> ---
>  include/linux/sched/topology.h |  4 ++
>  kernel/sched/fair.c            | 99 +++++++++++++++++++++++++++++++++++++-----
>  kernel/sched/features.h        |  2 +
>  3 files changed, 93 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 26347741ba50..2cb74343c252 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -72,6 +72,10 @@ struct sched_domain_shared {
>       atomic_t        ref;
>       atomic_t        nr_busy_cpus;
>       int             has_idle_cores;
> +
> +     unsigned long   nr_running;
> +     unsigned long   load;
> +     unsigned long   capacity;
>  };
> 
>  struct sched_domain {
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5eb3ffc9be84..4a561311241a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5680,6 +5680,68 @@ static int wake_wide(struct task_struct *p)
>       return 1;
>  }
> 
> +struct llc_stats {
> +     unsigned long nr_running;
> +     unsigned long load;
> +     unsigned long capacity;
> +     int             has_capacity;
> +};
> +
> +static bool get_llc_stats(struct llc_stats *stats, int cpu)
> +{
> +     struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> +
> +     if (!sds)
> +             return false;
> +
> +     stats->nr_running = READ_ONCE(sds->nr_running);
> +     stats->load       = READ_ONCE(sds->load);
> +     stats->capacity   = READ_ONCE(sds->capacity);
> +     stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
> +
> +     return true;
> +}
> +
> +static int
> +wake_affine_old(struct sched_domain *sd, struct task_struct *p,
> +             int this_cpu, int prev_cpu, int sync)
> +{
> +     struct llc_stats prev_stats, this_stats;
> +     s64 this_eff_load, prev_eff_load;
> +     unsigned long task_load;
> +
> +     if (!get_llc_stats(&prev_stats, prev_cpu) ||
> +         !get_llc_stats(&this_stats, this_cpu))
> +             return nr_cpumask_bits;
> +
> +     if (sync) {
> +             unsigned long current_load = task_h_load(current);
> +             if (current_load > this_stats.load)
> +                     return this_cpu;
> +
> +             this_stats.load -= current_load;
> +     }
> +
> +     if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
> +             return nr_cpumask_bits;
> +
> +     if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
> +             return this_cpu;
> +
> +     task_load = task_h_load(p);
> +
> +     this_eff_load = 100;
> +     this_eff_load *= prev_stats.capacity;
> +
> +     prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
> +     prev_eff_load *= this_stats.capacity;
> +
> +     this_eff_load *= this_stats.load + task_load;
> +     prev_eff_load *= prev_stats.load - task_load;
> +
> +     return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
> +}
> +
>  /*
>   * The purpose of wake_affine() is to quickly determine on which CPU we can run
>   * soonest. For the purpose of speed we only consider the waking and previous
> @@ -5756,6 +5818,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
>       int this_cpu = smp_processor_id();
>       int target = nr_cpumask_bits;
> 
> +     if (sched_feat(WA_OLD))
> +             target = wake_affine_old(sd, p, this_cpu, prev_cpu, sync);
> +
>       if (sched_feat(WA_IDLE))
>               target = wake_affine_idle(this_cpu, prev_cpu, sync);
> 
> @@ -6209,18 +6274,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
>               return prev;
> 
>       /* Check a recently used CPU as a potential idle candidate */
> -     recent_used_cpu = p->recent_used_cpu;
> -     if (recent_used_cpu != prev &&
> -         recent_used_cpu != target &&
> -         cpus_share_cache(recent_used_cpu, target) &&
> -         idle_cpu(recent_used_cpu) &&
> -         cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
> -             /*
> -              * Replace recent_used_cpu with prev as it is a potential
> -              * candidate for the next wake.
> -              */
> -             p->recent_used_cpu = prev;
> -             return recent_used_cpu;
> +     if (sched_feat(SIS_RECENT)) {
> +             recent_used_cpu = p->recent_used_cpu;
> +             if (recent_used_cpu != prev &&
> +                 recent_used_cpu != target &&
> +                 cpus_share_cache(recent_used_cpu, target) &&
> +                 idle_cpu(recent_used_cpu) &&
> +                 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
> +                     /*
> +                      * Replace recent_used_cpu with prev as it is a potential
> +                      * candidate for the next wake.
> +                      */
> +                     p->recent_used_cpu = prev;
> +                     return recent_used_cpu;
> +             }
>       }
> 
>       sd = rcu_dereference(per_cpu(sd_llc, target));
> @@ -7961,6 +8028,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
>   */
>  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
>  {
> +     struct sched_domain_shared *shared = env->sd->shared;
>       struct sched_domain *child = env->sd->child;
>       struct sched_group *sg = env->sd->groups;
>       struct sg_lb_stats *local = &sds->local_stat;
> @@ -8032,6 +8100,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
>               if (env->dst_rq->rd->overload != overload)
>                       env->dst_rq->rd->overload = overload;
>       }
> +
> +     if (!shared)
> +             return;
> +
> +     WRITE_ONCE(shared->nr_running, sds->total_running);
> +     WRITE_ONCE(shared->load, sds->total_load);
> +     WRITE_ONCE(shared->capacity, sds->total_capacity);
>  }
> 
>  /**
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 9552fd5854bf..bdb0a66caaae 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -57,6 +57,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
>   */
>  SCHED_FEAT(SIS_AVG_CPU, false)
>  SCHED_FEAT(SIS_PROP, true)
> +SCHED_FEAT(SIS_RECENT, true)
> 
>  /*
>   * Issue a WARN when we do multiple update_rq_clock() calls
> @@ -82,6 +83,7 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
>  SCHED_FEAT(LB_MIN, false)
>  SCHED_FEAT(ATTACH_AGE_LOAD, true)
> 
> +SCHED_FEAT(WA_OLD, false)
>  SCHED_FEAT(WA_IDLE, true)
>  SCHED_FEAT(WA_WEIGHT, true)
>  SCHED_FEAT(WA_BIAS, true)

RE: Serious performance degradation in Linux 4.15

Reply via email to