On Fri, Feb 09, 2018 at 05:59:12PM +0000, Jon Maloy wrote:
> Command for TCP:
> "netperf TCP_STREAM  (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t 
> TCP_STREAM -l 10 -- -O THROUGHPUT)"
> Command for TIPC:
> "netperf TIPC_STREAM (netperf -n 4 -f m -c 4 -C 4 -P 1 -H 10.0.0.1 -t 
> TCP_STREAM -l 10 -- -O THROUGHPUT)"

Those look like identical tests to me. And my netperf (Debian testing)
doesn't appear to have -t TIPC_STREAM.

Please send a coherent report and I'll have another look. Don't (again)
forget to mention what kind of setup you're running this on.


On my IVB-EP (2 sockets, 10 cores per socket, 2 threads per core; 40 CPUs
total), performance cpufreq governor, PTI=n RETPOLINE=n, I get:


# Scale the instance counts with the number of online CPUs.
CPUS=`grep -c ^processor /proc/cpuinfo`

for test in TCP_STREAM
do
        for i in 1 $((CPUS/4)) $((CPUS/2)) $((CPUS)) $((CPUS*2))
        do
                echo -n $test-$i ": "

                # Launch $i concurrent netperf instances and average their
                # throughput (5th field of the -P0 summary line).
                (
                  for ((j=0; j<i; j++))
                  do
                        netperf -t $test -4 -c -C -l 60 -P0 | head -1 &
                  done

                  wait
                ) | awk '{ n++; v+=$5; } END { print "Avg: " v/n }'
        done
done
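
The feature sets below were flipped at runtime through the scheduler's
debugfs knob; a sketch, assuming CONFIG_SCHED_DEBUG=y with debugfs mounted
in the usual place:

  # first run: new heuristics, old path disabled
  echo NO_WA_OLD > /sys/kernel/debug/sched_features
  echo WA_IDLE > /sys/kernel/debug/sched_features
  echo WA_WEIGHT > /sys/kernel/debug/sched_features

  # second run: old path only
  echo WA_OLD > /sys/kernel/debug/sched_features
  echo NO_WA_IDLE > /sys/kernel/debug/sched_features
  echo NO_WA_WEIGHT > /sys/kernel/debug/sched_features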



NO_WA_OLD WA_IDLE WA_WEIGHT:

TCP_STREAM-1 : Avg: 44139.8
TCP_STREAM-10 : Avg: 27301.6
TCP_STREAM-20 : Avg: 12701.5
TCP_STREAM-40 : Avg: 5711.62
TCP_STREAM-80 : Avg: 2870.16


WA_OLD NO_WA_IDLE NO_WA_WEIGHT:

TCP_STREAM-1 : Avg: 25293.1
TCP_STREAM-10 : Avg: 28196.3
TCP_STREAM-20 : Avg: 12463.7
TCP_STREAM-40 : Avg: 5566.83
TCP_STREAM-80 : Avg: 2630.03

---
 include/linux/sched/topology.h |  4 ++
 kernel/sched/fair.c            | 99 +++++++++++++++++++++++++++++++++++++-----
 kernel/sched/features.h        |  2 +
 3 files changed, 93 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 26347741ba50..2cb74343c252 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -72,6 +72,10 @@ struct sched_domain_shared {
        atomic_t        ref;
        atomic_t        nr_busy_cpus;
        int             has_idle_cores;
+
+       unsigned long   nr_running;
+       unsigned long   load;
+       unsigned long   capacity;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eb3ffc9be84..4a561311241a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5680,6 +5680,68 @@ static int wake_wide(struct task_struct *p)
        return 1;
 }
 
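+/* Snapshot of the LLC-wide statistics published in sched_domain_shared. */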
+struct llc_stats {
+       unsigned long   nr_running;
+       unsigned long   load;
+       unsigned long   capacity;
+       int             has_capacity;
+};
+
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+       struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+       if (!sds)
+               return false;
+
+       stats->nr_running = READ_ONCE(sds->nr_running);
+       stats->load       = READ_ONCE(sds->load);
+       stats->capacity   = READ_ONCE(sds->capacity);
+       stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+
+       return true;
+}
+
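+/*
+ * Compare aggregate LLC statistics of the waking CPU and the task's
+ * previous CPU: prefer an LLC that still has spare capacity (fewer
+ * running tasks than CPUs), otherwise fall back to an effective-load
+ * comparison biased by half the imbalance_pct margin in favour of moving
+ * to the waking CPU (with imbalance_pct = 117, say, the waking LLC gets
+ * a 108:100 edge). Returns this_cpu if pulling the task looks
+ * beneficial, nr_cpumask_bits otherwise.
+ */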
+static int
+wake_affine_old(struct sched_domain *sd, struct task_struct *p,
+               int this_cpu, int prev_cpu, int sync)
+{
+       struct llc_stats prev_stats, this_stats;
+       s64 this_eff_load, prev_eff_load;
+       unsigned long task_load;
+
+       if (!get_llc_stats(&prev_stats, prev_cpu) ||
+           !get_llc_stats(&this_stats, this_cpu))
+               return nr_cpumask_bits;
+
+       if (sync) {
+               unsigned long current_load = task_h_load(current);
+               if (current_load > this_stats.load)
+                       return this_cpu;
+
+               this_stats.load -= current_load;
+       }
+
+       if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+               return nr_cpumask_bits;
+
+       if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
+               return this_cpu;
+
+       task_load = task_h_load(p);
+
+       this_eff_load = 100;
+       this_eff_load *= prev_stats.capacity;
+
+       prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+       prev_eff_load *= this_stats.capacity;
+
+       this_eff_load *= this_stats.load + task_load;
+       prev_eff_load *= prev_stats.load - task_load;
+
+       return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
+}
+
 /*
  * The purpose of wake_affine() is to quickly determine on which CPU we can run
  * soonest. For the purpose of speed we only consider the waking and previous
@@ -5756,6 +5818,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
        int this_cpu = smp_processor_id();
        int target = nr_cpumask_bits;
 
+       if (sched_feat(WA_OLD))
+               target = wake_affine_old(sd, p, this_cpu, prev_cpu, sync);
+
        if (sched_feat(WA_IDLE))
                target = wake_affine_idle(this_cpu, prev_cpu, sync);
 
@@ -6209,18 +6274,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
                return prev;
 
        /* Check a recently used CPU as a potential idle candidate */
-       recent_used_cpu = p->recent_used_cpu;
-       if (recent_used_cpu != prev &&
-           recent_used_cpu != target &&
-           cpus_share_cache(recent_used_cpu, target) &&
-           idle_cpu(recent_used_cpu) &&
-           cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
-               /*
-                * Replace recent_used_cpu with prev as it is a potential
-                * candidate for the next wake.
-                */
-               p->recent_used_cpu = prev;
-               return recent_used_cpu;
+       if (sched_feat(SIS_RECENT)) {
+               recent_used_cpu = p->recent_used_cpu;
+               if (recent_used_cpu != prev &&
+                   recent_used_cpu != target &&
+                   cpus_share_cache(recent_used_cpu, target) &&
+                   idle_cpu(recent_used_cpu) &&
+                   cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+                       /*
+                        * Replace recent_used_cpu with prev as it is a potential
+                        * candidate for the next wake.
+                        */
+                       p->recent_used_cpu = prev;
+                       return recent_used_cpu;
+               }
        }
 
        sd = rcu_dereference(per_cpu(sd_llc, target));
@@ -7961,6 +8028,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
+       struct sched_domain_shared *shared = env->sd->shared;
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
        struct sg_lb_stats *local = &sds->local_stat;
@@ -8032,6 +8100,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                if (env->dst_rq->rd->overload != overload)
                        env->dst_rq->rd->overload = overload;
        }
+
+       if (!shared)
+               return;
+
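+       /* Pairs with the READ_ONCE()s in get_llc_stats(). */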
+       WRITE_ONCE(shared->nr_running, sds->total_running);
+       WRITE_ONCE(shared->load, sds->total_load);
+       WRITE_ONCE(shared->capacity, sds->total_capacity);
 }
 
 /**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9552fd5854bf..bdb0a66caaae 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -57,6 +57,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_AVG_CPU, false)
 SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_RECENT, true)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
@@ -82,6 +83,7 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_OLD, false)
 SCHED_FEAT(WA_IDLE, true)
 SCHED_FEAT(WA_WEIGHT, true)
 SCHED_FEAT(WA_BIAS, true)
