On Thu, May 21, 2020 at 01:41:32PM +0200, Peter Zijlstra wrote:
> On Thu, May 21, 2020 at 11:38:16AM +0100, Mel Gorman wrote:
> > IIUC, this patch front-loads as much work as possible before checking
> > if the task is on_rq, and then if the waker/wakee do not share a cache,
> > queues the task on the wake_list; otherwise it does a direct wakeup.
> > 
> > The advantage is that spinning is avoided on p->on_rq when p does not
> > share a cache. The disadvantage is that it may result in tasks being
> > stacked but this should only happen when the domain is overloaded and
> > select_task_rq() is unlikely to find an idle CPU. The load balancer would
> > soon correct the situation anyway.
> > 
> > In terms of netperf for my testing, the benefit is marginal because the
> > wakeups are primarily between tasks that share cache. It does trigger,
> > as perf indicates that some time is spent in ttwu_queue_remote() with
> > this patch; it's just that the overall time spent spinning on p->on_rq
> > is very similar. I'm still waiting on other workloads to complete to see
> > what the impact is.
> 
> So it might make sense to play with the exact conditions under which
> we'll attempt this remote queue; if we see a large 'local' p->on_cpu
> spin time, it might make sense to attempt the queue even in this case.
> 
> We could for example change it to:
> 
>       if (READ_ONCE(p->on_cpu) && ttwu_queue_remote(p, cpu, wake_flags | WF_ON_CPU))
>               goto unlock;
> 
> and then use that in ttwu_queue_remote() to differentiate between these
> two cases.
> 

>  #endif /* CONFIG_SMP */
>  
>       ttwu_queue(p, cpu, wake_flags);

Is something like this on top of your patch what you had in mind?
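
To restate the intent in one place before the patch: the wake_list is
used either when the waker and wakee do not share LLC, or when the
wakee is still descheduling and its CPU is about to go idle. Below is a
small standalone model of that decision; all names are illustrative
rather than the kernel's, and it exists only to make the two cases
explicit.

#include <stdbool.h>
#include <stdio.h>

#define WF_ON_RQ	0x08	/* wakee still descheduling on its old CPU */

struct model_rq {
	int llc_id;		/* last-level-cache domain of the CPU */
	int nr_running;		/* runnable tasks on the CPU */
};

/* Model of the ttwu_queue_wakelist() decision in the diff below. */
static bool use_wakelist(const struct model_rq *waker,
			 const struct model_rq *wakee, int wake_flags)
{
	/* Cross-LLC wakeup: queue remotely to avoid touching remote data. */
	if (waker->llc_id != wakee->llc_id)
		return true;

	/*
	 * Wakee is descheduling and will leave its CPU idle: offload the
	 * activation to that CPU instead of spinning on p->on_cpu.
	 */
	if ((wake_flags & WF_ON_RQ) && wakee->nr_running <= 1)
		return true;

	return false;	/* same LLC, busy CPU: direct wakeup */
}

int main(void)
{
	struct model_rq waker = { .llc_id = 0, .nr_running = 2 };
	struct model_rq wakee = { .llc_id = 0, .nr_running = 1 };

	printf("descheduling, same LLC: %d\n", use_wakelist(&waker, &wakee, WF_ON_RQ));
	wakee.llc_id = 1;
	printf("cross-LLC wakeup:       %d\n", use_wakelist(&waker, &wakee, 0));
	wakee.llc_id = 0;
	printf("same LLC, no flag:      %d\n", use_wakelist(&waker, &wakee, 0));
	return 0;
}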

---8<---

---
 kernel/sched/core.c  | 35 ++++++++++++++++++++++++++---------
 kernel/sched/sched.h |  3 ++-
 2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 987b8ecf2ee9..435ecf5820ee 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2330,13 +2330,19 @@ void scheduler_ipi(void)
        irq_exit();
 }
 
-static void __ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+/*
+ * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU on receipt of the IPI will queue the task via
+ * sched_ttwu_pending() for activation instead of the waking task
+ * activating and queueing the wakee.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
        struct rq *rq = cpu_rq(cpu);
 
        p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
 
-       if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+       if (llist_add(&p->wake_entry, &rq->wake_list)) {
                if (!set_nr_if_polling(rq->idle))
                        smp_send_reschedule(cpu);
                else
@@ -2373,12 +2379,23 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
        return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
-static bool ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
-       if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
-               sched_clock_cpu(cpu); /* Sync clocks across CPUs */
-               __ttwu_queue_remote(p, cpu, wake_flags);
-               return true;
+       if (sched_feat(TTWU_QUEUE)) {
+               /*
+                * If the CPU does not share cache then queue the task on the
+                * remote rq's wake_list to avoid accessing remote data.
+                * Alternatively, if the task is descheduling and is the only
+                * running task on its CPU, use the wake_list to offload the
+                * activation to that soon-to-be-idle CPU so the waker can
+                * continue. nr_running is checked to avoid task stacking.
+                */
+               if (!cpus_share_cache(smp_processor_id(), cpu) ||
+                   ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)) {
+                       sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+                       __ttwu_queue_wakelist(p, cpu, wake_flags);
+                       return true;
+               }
        }
 
        return false;
@@ -2391,7 +2408,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
        struct rq_flags rf;
 
 #if defined(CONFIG_SMP)
-       if (ttwu_queue_remote(p, cpu, wake_flags))
+       if (ttwu_queue_wakelist(p, cpu, wake_flags))
                return;
 #endif
 
@@ -2611,7 +2628,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         * let the waker make forward progress. This is safe because IRQs are
         * disabled and the IPI will deliver after on_cpu is cleared.
         */
-       if (READ_ONCE(p->on_cpu) && ttwu_queue_remote(p, cpu, wake_flags))
+       if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
                goto unlock;
 
        /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index db3a57675ccf..06297d1142a0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1688,7 +1688,8 @@ static inline int task_on_rq_migrating(struct task_struct *p)
  */
 #define WF_SYNC                        0x01            /* Waker goes to sleep after wakeup */
 #define WF_FORK                        0x02            /* Child wakeup after fork */
-#define WF_MIGRATED            0x4             /* Internal use, task got migrated */
+#define WF_MIGRATED            0x04            /* Internal use, task got migrated */
+#define WF_ON_RQ               0x08            /* Wakee is on_rq */
 
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
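
For completeness, the consumer of the wake_list on the wakee CPU is
sched_ttwu_pending(), which drains the llist and activates each queued
task under the local rq lock. Roughly, paraphrased from
kernel/sched/core.c of this era (treat it as a sketch rather than the
exact tree contents):

void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	struct llist_node *llist = llist_del_all(&rq->wake_list);
	struct task_struct *p, *t;
	struct rq_flags rf;

	if (!llist)
		return;

	rq_lock_irqsave(rq, &rf);
	update_rq_clock(rq);

	/* Activate each remotely queued task, honouring WF_MIGRATED. */
	llist_for_each_entry_safe(p, t, llist, wake_entry)
		ttwu_do_activate(rq, p,
				 p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);

	rq_unlock_irqrestore(rq, &rf);
}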
