On Wed, Oct 24, 2018 at 08:32:49AM +0530, Srikar Dronamraju wrote:
> Load balancer and NUMA balancer are not supposed to work on isolcpus.
> 
> Currently when setting sched affinity, there are no checks to see if the
> requested cpumask has CPUs from both isolcpus and housekeeping CPUs.
> 
> If user passes a mix of isolcpus and housekeeping CPUs, then
> NUMA balancer can pick an isolcpu to schedule.
> With this change, if a combination of isolcpus and housekeeping CPUs are
> provided, then we restrict ourselves to housekeeping CPUs.

I'm still not liking this much. This adds more special cases for
isolcpus. Also, I don't believe in correcting silly users; give 'em rope
and show them how to tie the knot.

Where does the numa balancer pick the 'wrong' CPU?

task_numa_migrate() checks to see if the task is currently part of a
SD_NUMA domain, otherwise it doesn't do anything. This means your
housekeeping mask spans multiple nodes to begin with, right?

But after that we seem to ignore the sched domains entirely;
task_numa_find_cpu() only tests cpus_allowed.

It appears to me the for_each_online_node() iteration in
task_numa_migrate() needs an additional test to see if the selected node
has any CPUs in the relevant sched_domain _at_all_.

A little something like the below -- except we also need to do something
about cpus_active_mask. Not been near a compiler.

---
 kernel/sched/fair.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ee271bb661cc..287ef7f0203b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1497,6 +1497,8 @@ struct task_numa_env {
        struct task_struct *best_task;
        long best_imp;
        int best_cpu;
+
+       cpumask_var_t cpus;
 };
 
 static void task_numa_assign(struct task_numa_env *env,
@@ -1704,7 +1706,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
         */
        maymove = !load_too_imbalanced(src_load, dst_load, env);
 
-       for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+       for_each_cpu_and(cpu, cpumask_of_node(env->dst_nid), env->cpus) {
                /* Skip this CPU if the source task cannot migrate */
                if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
                        continue;
@@ -1734,6 +1736,9 @@ static int task_numa_migrate(struct task_struct *p)
        int nid, ret, dist;
        long taskimp, groupimp;
 
+       if (!alloc_cpumask_var(&env.cpus, GFP_KERNEL))
+               return -ENOMEM;
+
        /*
         * Pick the lowest SD_NUMA domain, as that would have the smallest
         * imbalance and would be the first to start moving tasks about.
@@ -1744,20 +1749,23 @@ static int task_numa_migrate(struct task_struct *p)
         */
        rcu_read_lock();
        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-       if (sd)
-               env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
-       rcu_read_unlock();
-
        /*
         * Cpusets can break the scheduler domain tree into smaller
         * balance domains, some of which do not cross NUMA boundaries.
         * Tasks that are "trapped" in such domains cannot be migrated
         * elsewhere, so there is no point in (re)trying.
         */
-       if (unlikely(!sd)) {
+       if (!sd) {
                sched_setnuma(p, task_node(p));
-               return -EINVAL;
+               rcu_read_unlock();
+               ret = -EINVAL;
+               goto out;
        }
+       env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+       while (sd->parent)
+               sd = sd->parent;
+       cpumask_copy(env.cpus, sched_domain_span(sd));
+       rcu_read_unlock();
 
        env.dst_nid = p->numa_preferred_nid;
        dist = env.dist = node_distance(env.src_nid, env.dst_nid);
@@ -1783,6 +1791,9 @@ static int task_numa_migrate(struct task_struct *p)
                        if (nid == env.src_nid || nid == p->numa_preferred_nid)
                                continue;
 
+                       if (!cpumask_intersects(cpumask_of_node(nid), env.cpus))
+                               continue;
+
                        dist = node_distance(env.src_nid, env.dst_nid);
                        if (sched_numa_topology_type == NUMA_BACKPLANE &&
                                                dist != env.dist) {
@@ -1822,8 +1833,10 @@ static int task_numa_migrate(struct task_struct *p)
        }
 
        /* No better CPU than the current one was found. */
-       if (env.best_cpu == -1)
-               return -EAGAIN;
+       if (env.best_cpu == -1) {
+               ret = -EAGAIN;
+               goto out;
+       }
 
        best_rq = cpu_rq(env.best_cpu);
        if (env.best_task == NULL) {
@@ -1831,7 +1844,7 @@ static int task_numa_migrate(struct task_struct *p)
                WRITE_ONCE(best_rq->numa_migrate_on, 0);
                if (ret != 0)
                        trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
-               return ret;
+               goto out;
        }
 
        ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
@@ -1840,6 +1853,9 @@ static int task_numa_migrate(struct task_struct *p)
        if (ret != 0)
                trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
        put_task_struct(env.best_task);
+
+out:
+       free_cpumask_var(&env.cpus);
        return ret;
 }
 

Reply via email to