Re: [PATCH 3/4] sched,numa: implement numa node level wake_affine

2017-06-26 Thread Peter Zijlstra
On Fri, Jun 23, 2017 at 12:55:29PM -0400, r...@redhat.com wrote:
> From: Rik van Riel 
> 
> Since select_idle_sibling can place a task anywhere on a socket,
> comparing loads between individual CPU cores makes no real sense
> for deciding whether to do an affine wakeup across sockets, either.
> 
> Instead, compare the load between the sockets in a similar way to how
> the load balancer and the NUMA balancing code do.

This seems to assume LLC == NUMA, which isn't strictly so.
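
E.g. with more than one LLC per node the node level check and the LLC
level check disagree for some CPU pairs.  Rough sketch only, not part
of the patch (helper name made up; cpu_to_node() and cpus_share_cache()
are the existing helpers):

	static inline bool same_node_but_no_shared_llc(int cpu_a, int cpu_b)
	{
		/* same NUMA node, but no shared last-level cache */
		return cpu_to_node(cpu_a) == cpu_to_node(cpu_b) &&
		       !cpus_share_cache(cpu_a, cpu_b);
	}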




[PATCH 3/4] sched,numa: implement numa node level wake_affine

2017-06-23 Thread riel
From: Rik van Riel 

Since select_idle_sibling can place a task anywhere on a socket,
comparing loads between individual CPU cores makes no real sense
for deciding whether to do an affine wakeup across sockets, either.

Instead, compare the load between the sockets in a similar way to how
the load balancer and the NUMA balancing code do.
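
For illustration (assuming the common imbalance_pct of 125 on the
wakeup domain; the exact value is not something this patch changes),
the node level comparison added below allows the affine wakeup when

    100 * prev_node_capacity * (this_node_load + task_load) <=
    112 * this_node_capacity * (prev_node_load - task_load)

where 112 = 100 + (125 - 100) / 2, i.e. with equal capacities the
waking node may end up roughly 12% busier than the node the task came
from before the wakeup is no longer treated as affine.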

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c | 130 
 1 file changed, 71 insertions(+), 59 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 949de24e36bd..d03a21e6627d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2590,6 +2590,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
 }
+
+/*
+ * Can p be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ */
+static inline bool numa_wake_affine(struct sched_domain *sd,
+   struct task_struct *p, int this_cpu,
+   int prev_cpu, int sync)
+{
+   struct numa_stats prev_load, this_load;
+   s64 this_eff_load, prev_eff_load;
+
+   update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
+   update_numa_stats(&this_load, cpu_to_node(this_cpu));
+
+   /*
+* If sync wakeup then subtract the (maximum possible)
+* effect of the currently running task from the load
+* of the current CPU:
+*/
+   if (sync) {
+   unsigned long current_load = task_h_load(current);
+
+   if (this_load.load > current_load)
+   this_load.load -= current_load;
+   else
+   this_load.load = 0;
+   }
+
+   /*
+* In low-load situations, where this_cpu's node is idle due to the
+* sync cause above having dropped this_load.load to 0, move the task.
+* Moving to an idle socket will not create a bad imbalance.
+*
+* Otherwise check if the nodes are near enough in load to allow this
+* task to be woken on this_cpu's node.
+*/
+   if (this_load.load > 0) {
+   unsigned long task_load = task_h_load(p);
+
+   this_eff_load = 100;
+   this_eff_load *= prev_load.compute_capacity;
+
+   prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+   prev_eff_load *= this_load.compute_capacity;
+
+   this_eff_load *= this_load.load + task_load;
+   prev_eff_load *= prev_load.load - task_load;
+
+   return this_eff_load <= prev_eff_load;
+   }
+
+   return true;
+}
 #else
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
@@ -2602,6 +2656,13 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
+
+static inline bool numa_wake_affine(struct sched_domain *sd,
+   struct task_struct *p, int this_cpu,
+   int prev_cpu, int sync)
+{
+   return true;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -5360,74 +5421,25 @@ static int wake_wide(struct task_struct *p)
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
   int prev_cpu, int sync)
 {
-   s64 this_load, load;
-   s64 this_eff_load, prev_eff_load;
-   int idx, this_cpu;
-   struct task_group *tg;
-   unsigned long weight;
-   int balanced;
-
-   idx   = sd->wake_idx;
-   this_cpu  = smp_processor_id();
-   load  = source_load(prev_cpu, idx);
-   this_load = target_load(this_cpu, idx);
+   int this_cpu = smp_processor_id();
+   bool affine = false;
 
/*
 * Common case: CPUs are in the same socket, and select_idle_sibling
 * will do its thing regardless of what we return.
 */
if (cpus_share_cache(prev_cpu, this_cpu))
-   return true;
-
-   /*
-* If sync wakeup then subtract the (maximum possible)
-* effect of the currently running task from the load
-* of the current CPU:
-*/
-   if (sync) {
-   tg = task_group(current);
-   weight = current->se.avg.load_avg;
-
-   this_load += effective_load(tg, this_cpu, -weight, -weight);
-   load += effective_load(tg, prev_cpu, 0, -weight);
-   }
-
-   tg = task_group(p);
-   weight = p->se.avg.load_avg;
-
-   /*
-* In low-load situations, where prev_cpu is idle and this_cpu is idle
-* due to the sync cause above having dropped this_load to 0, we'll
-* always have an imbalance, but there's really nothing you can do
-* about that, so that's good too.
-*
-* Otherwise check if either cpus are near enough in load to allow this
-* task to be wo