On Mon, Aug 21, 2017 at 04:21:28PM +0100, Brendan Jackman wrote:
> The current use of returning NULL from find_idlest_group is broken in
> two cases:
> 
> a) The local group is not allowed.
> 
>    In this case, we currently do not change this_runnable_load or
>    this_avg_load from their initial values of 0, which means we return
>    NULL regardless of the load of the other, allowed groups. This
>    results in pointlessly continuing the find_idlest_group search
>    within the local group and then returning prev_cpu from
>    select_task_rq_fair.

> b) smp_processor_id() is the "idlest" and != prev_cpu.
> 
>    find_idlest_group also returns NULL when the local group is
>    allowed and is the idlest. The caller then continues the
>    find_idlest_group search at a lower level of the current CPU's
>    sched_domain hierarchy. However, new_cpu is not updated. This means
>    the search is pointless and we return prev_cpu from
>    select_task_rq_fair.
> 

I think it's much simpler than that.. but it's late, so who knows ;-)

Both cases seem predicated on the assumption that we'll return @cpu when
we don't find any idler CPU. Consider: if the local group is the idlest,
we should stick with @cpu and simply proceed with the child domain.
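
(An aside to make that concrete: below is a toy model of the pre-patch
slow path, plain userspace C rather than kernel code, with made-up
per-level "proposals" standing in for find_idlest_group(). When every
level comes back empty, the loop never writes new_cpu and the caller is
left holding the prev_cpu it was seeded with.)

#include <stdio.h>

/* Toy model only; not kernel code. */
static int pick_cpu(int prev_cpu, const int *proposal, int nr_levels)
{
        int new_cpu = prev_cpu;                 /* pre-patch seed */
        int level;

        for (level = 0; level < nr_levels; level++) {
                if (proposal[level] < 0)        /* "find_idlest_group() == NULL" */
                        continue;               /* descend; new_cpu untouched */
                new_cpu = proposal[level];      /* found something idler */
        }
        return new_cpu;
}

int main(void)
{
        const int no_luck[] = { -1, -1, -1 };   /* every level says NULL */

        /* Task last ran on CPU 0, we are waking on CPU 4: we still get 0. */
        printf("picked CPU %d\n", pick_cpu(0, no_luck, 3));
        return 0;
}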

The confusion, and the bugs, seem to have snuck in when we started
considering @prev_cpu, whenever that was. The below is mostly code
movement to put that whole while(sd) loop into its own function.

The effective change is setting @new_cpu = @cpu when we start that loop:

@@ -6023,6 +6023,8 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
                struct sched_group *group;
                int weight;
 
+               new_cpu = cpu;
+
                if (!(sd->flags & sd_flag)) {
                        sd = sd->child;
                        continue;
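
Under that toy model the seeded variant degrades to @cpu, which is the
behaviour argued for above (again not kernel code, just the pattern the
hunk establishes):

/* Same toy walk as above; only the seed changes, mirroring the hunk. */
static int pick_cpu_fixed(int cpu, const int *proposal, int nr_levels)
{
        int new_cpu = cpu;                      /* seed with @cpu, not prev_cpu */
        int level;

        for (level = 0; level < nr_levels; level++) {
                if (proposal[level] < 0)
                        continue;
                new_cpu = proposal[level];
        }
        return new_cpu;                         /* == cpu when every level was NULL */
}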




---
 kernel/sched/fair.c | 83 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 48 insertions(+), 35 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c77e4b1d51c0..3e77265c480a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5588,10 +5588,10 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
 }
 
 /*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
        unsigned long load, min_load = ULONG_MAX;
        unsigned int min_exit_latency = UINT_MAX;
@@ -5640,6 +5640,50 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
        return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
+static int
+find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int sd_flag)
+{
+       struct sched_domain *tmp;
+       int new_cpu = cpu;
+
+       while (sd) {
+               struct sched_group *group;
+               int weight;
+
+               if (!(sd->flags & sd_flag)) {
+                       sd = sd->child;
+                       continue;
+               }
+
+               group = find_idlest_group(sd, p, cpu, sd_flag);
+               if (!group) {
+                       sd = sd->child;
+                       continue;
+               }
+
+               new_cpu = find_idlest_group_cpu(group, p, cpu);
+               if (new_cpu == -1 || new_cpu == cpu) {
+                       /* Now try balancing at a lower domain level of cpu */
+                       sd = sd->child;
+                       continue;
+               }
+
+               /* Now try balancing at a lower domain level of new_cpu */
+               cpu = new_cpu;
+               weight = sd->span_weight;
+               sd = NULL;
+               for_each_domain(cpu, tmp) {
+                       if (weight <= tmp->span_weight)
+                               break;
+                       if (tmp->flags & sd_flag)
+                               sd = tmp;
+               }
+               /* while loop will break here if sd == NULL */
+       }
+
+       return new_cpu;
+}
+
 /*
  * Implement a for_each_cpu() variant that starts the scan at a given cpu
  * (@start), and wraps around.
@@ -6019,39 +6063,8 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
                if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
                        new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-       } else while (sd) {
-               struct sched_group *group;
-               int weight;
-
-               if (!(sd->flags & sd_flag)) {
-                       sd = sd->child;
-                       continue;
-               }
-
-               group = find_idlest_group(sd, p, cpu, sd_flag);
-               if (!group) {
-                       sd = sd->child;
-                       continue;
-               }
-
-               new_cpu = find_idlest_cpu(group, p, cpu);
-               if (new_cpu == -1 || new_cpu == cpu) {
-                       /* Now try balancing at a lower domain level of cpu */
-                       sd = sd->child;
-                       continue;
-               }
-
-               /* Now try balancing at a lower domain level of new_cpu */
-               cpu = new_cpu;
-               weight = sd->span_weight;
-               sd = NULL;
-               for_each_domain(cpu, tmp) {
-                       if (weight <= tmp->span_weight)
-                               break;
-                       if (tmp->flags & sd_flag)
-                               sd = tmp;
-               }
-               /* while loop will break here if sd == NULL */
+       } else {
+               new_cpu = find_idlest_cpu(sd, p, cpu, sd_flag);
        }
        rcu_read_unlock();
 


