> -----Original Message-----
> From: Valentin Schneider [mailto:valentin.schnei...@arm.com]
> Sent: Saturday, January 23, 2021 1:40 AM
> To: linux-kernel@vger.kernel.org
> Cc: mi...@kernel.org; pet...@infradead.org; vincent.guit...@linaro.org;
> dietmar.eggem...@arm.com; morten.rasmus...@arm.com; mgor...@suse.de; Song Bao
> Hua (Barry Song) <song.bao....@hisilicon.com>
> Subject: [PATCH 1/1] sched/topology: Make sched_init_numa() use a set for the
> deduplicating sort
> 
> The deduplicating sort in sched_init_numa() assumes that the first line in
> the distance table contains all unique values in the entire table. I've
> been trying to pen what this exactly means for the topology, but it's not
> straightforward. For instance, topology.c uses this example:
> 
>   node   0   1   2   3
>     0:  10  20  20  30
>     1:  20  10  20  20
>     2:  20  20  10  20
>     3:  30  20  20  10
> 
>   0 ----- 1
>   |     / |
>   |   /   |
>   | /     |
>   2 ----- 3
> 
> Which works out just fine. However, if we swap nodes 0 and 1:
> 
>   1 ----- 0
>   |     / |
>   |   /   |
>   | /     |
>   2 ----- 3
> 
> we get this distance table:
> 
>   node   0  1  2  3
>     0:  10 20 20 20
>     1:  20 10 20 30
>     2:  20 20 10 20
>     3:  20 30 20 10
> 
> Which breaks the deduplicating sort (non-representative first line). In
> this case this would just be a renumbering exercise, but it so happens that
> we can have a deduplicating sort that goes through the whole table in O(n²)
> at the extra cost of a temporary memory allocation (i.e. any form of set).
> 
> The ACPI spec (SLIT) mentions distances are encoded on 8 bits. Following
> this, implement the set as a 256-bits bitmap. Should this not be
> satisfactory (i.e. we want to support 32-bit values), then we'll have to go
> for some other sparse set implementation.
> 
> This has the added benefit of letting us allocate just the right amount of
> memory for sched_domains_numa_distance[], rather than an arbitrary
> (nr_node_ids + 1).
> 
> Note: DT binding equivalent (distance-map) decodes distances as 32-bit
> values.
> 
> Signed-off-by: Valentin Schneider <valentin.schnei...@arm.com>

Hi Valentin, thanks.
It seems it didn't resolve my problem. The simplest code to work around
my problem is:
void sched_init_numa(void)
{
        ...
        next_distance = curr_distance;
        for (i = 0; i < nr_node_ids; i++) {
                for (j = 0; j < nr_node_ids; j++) {
                        for (k = 0; k < nr_node_ids; k++) {
+                               int ii = (i + 1) % nr_node_ids;
-                               int distance = node_distance(i, k);
+                               int distance = node_distance(ii, k);

                                if (distance > curr_distance &&
                                    (distance < next_distance ||
                                     next_distance == curr_distance))
                                        next_distance = distance;

                                /*

On the other hand, the patch made the kernel panic during boot:

[    1.596789] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000417c9000
[    1.598406] [ffff80002e8cc001] pgd=000000013ffff003,
p4d=000000013ffff003, pud=000000013fffe003, pmd=0000000000000000
[    1.600258] Internal error: Oops: 96000006 [#1] PREEMPT SMP
[    1.600730] Modules linked in:
[    1.601072] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G        W
  5.11.0-rc1-00079-g8da796ff6f58-dirty #114
[    1.601652] Hardware name: linux,dummy-virt (DT)
[    1.601917] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
[    1.602202] pc : build_sched_domains+0x3e4/0x1498
[    1.604050] lr : build_sched_domains+0x3c0/0x1498
[    1.604512] sp : ffff800011f1bce0
[    1.604654] x29: ffff800011f1bce0 x28: ffff800011b7a410
[    1.604924] x27: ffff800011b799c0 x26: ffff00000263b300
[    1.605227] x25: 00000000ffffffff x24: ffff00000263b3a0
[    1.605470] x23: 0000000000000000 x22: ffff800011b799c0
[    1.605813] x21: 0000000000000000 x20: ffff00000261db00
[    1.606140] x19: ffff0000025c7600 x18: 0000000000000010
[    1.606472] x17: 000000001472bb62 x16: 000000006e92504d
[    1.606885] x15: ffff000002600508 x14: 0000000000000116
[    1.607408] x13: ffff000002600508 x12: 00000000ffffffea
[    1.607681] x11: ffff800011c02fe8 x10: ffff800011beafa8
[    1.607931] x9 : ffff800011beb000 x8 : 0000000000017fe8
[    1.608225] x7 : 00000000000002a8 x6 : 00000000000000ff
[    1.608480] x5 : 0000000000000000 x4 : 0000000000000000
[    1.608690] x3 : 0000000000000000 x2 : ffff80002e8cc000
[    1.609048] x1 : 0000000000000001 x0 : 0000000000000001
[    1.609425] Call trace:
[    1.609550]  build_sched_domains+0x3e4/0x1498
[    1.609777]  sched_init_domains+0x88/0xb8
[    1.609937]  sched_init_smp+0x30/0x80
[    1.610170]  kernel_init_freeable+0xf4/0x238
[    1.610388]  kernel_init+0x14/0x118
[    1.610559]  ret_from_fork+0x10/0x30
[    1.611234] Code: b4000201 93407eb7 aa0103e0 f8777ac2 (f8626800)
[    1.613107] ---[ end trace 01540465b01c8e3b ]---
[    1.614871] Kernel panic - not syncing: Attempted to kill init!
exitcode=0x0000000b
[    1.615433] SMP: stopping secondary CPUs
[    1.616415] ---[ end Kernel panic - not syncing: Attempted to kill
init! exitcode=0x0000000b ]---

with the below topology:
qemu-system-aarch64 -M virt -nographic \
 -smp cpus=8 \
 -numa node,cpus=0-1,nodeid=0 \
 -numa node,cpus=2-3,nodeid=1 \
 -numa node,cpus=4-5,nodeid=2 \
 -numa node,cpus=6-7,nodeid=3 \
 -numa dist,src=0,dst=1,val=12 \
 -numa dist,src=0,dst=2,val=20 \
 -numa dist,src=0,dst=3,val=22 \
 -numa dist,src=1,dst=2,val=22 \
 -numa dist,src=2,dst=3,val=12 \
 -numa dist,src=1,dst=3,val=24 \


The panic address is *1294:

                        if (sdd->sd) {
    1280:       f9400e61        ldr     x1, [x19, #24]
    1284:       b4000201        cbz     x1, 12c4 <build_sched_domains+0x414>
                                sd = *per_cpu_ptr(sdd->sd, j);
    1288:       93407eb7        sxtw    x23, w21
    128c:       aa0103e0        mov     x0, x1
    1290:       f8777ac2        ldr     x2, [x22, x23, lsl #3]
    *1294:       f8626800        ldr     x0, [x0, x2]
                                if (sd && (sd->flags & SD_OVERLAP))
    1298:       b4000120        cbz     x0, 12bc <build_sched_domains+0x40c>
    129c:       b9403803        ldr     w3, [x0, #56]
    12a0:       365800e3        tbz     w3, #11, 12bc
<build_sched_domains+0x40c>
                                        free_sched_groups(sd->groups, 0);
    12a4:       f9400800        ldr     x0, [x0, #16]
        if (!sg)

Thanks
Barry

> ---
>  include/linux/topology.h |  1 +
>  kernel/sched/topology.c  | 99 +++++++++++++++++++---------------------
>  2 files changed, 49 insertions(+), 51 deletions(-)
> 
> diff --git a/include/linux/topology.h b/include/linux/topology.h
> index ad03df1cc266..7634cd737061 100644
> --- a/include/linux/topology.h
> +++ b/include/linux/topology.h
> @@ -48,6 +48,7 @@ int arch_update_cpu_topology(void);
>  /* Conform to ACPI 2.0 SLIT distance definitions */
>  #define LOCAL_DISTANCE               10
>  #define REMOTE_DISTANCE              20
> +#define DISTANCE_BITS           8
>  #ifndef node_distance
>  #define node_distance(from,to)       ((from) == (to) ? LOCAL_DISTANCE :
> REMOTE_DISTANCE)
>  #endif
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 5d3675c7a76b..bf5c9bd10bfe 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1596,66 +1596,58 @@ static void init_numa_topology_type(void)
>       }
>  }
> 
> +
> +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
> +
>  void sched_init_numa(void)
>  {
> -     int next_distance, curr_distance = node_distance(0, 0);
>       struct sched_domain_topology_level *tl;
> -     int level = 0;
> -     int i, j, k;
> -
> -     sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1),
> GFP_KERNEL);
> -     if (!sched_domains_numa_distance)
> -             return;
> -
> -     /* Includes NUMA identity node at level 0. */
> -     sched_domains_numa_distance[level++] = curr_distance;
> -     sched_domains_numa_levels = level;
> +     unsigned long *distance_map;
> +     int nr_levels = 0;
> +     int i, j;
> 
>       /*
>        * O(nr_nodes^2) deduplicating selection sort -- in order to find the
>        * unique distances in the node_distance() table.
> -      *
> -      * Assumes node_distance(0,j) includes all distances in
> -      * node_distance(i,j) in order to avoid cubic time.
>        */
> -     next_distance = curr_distance;
> +     distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
> +     if (!distance_map)
> +             return;
> +
> +     bitmap_zero(distance_map, NR_DISTANCE_VALUES);
>       for (i = 0; i < nr_node_ids; i++) {
>               for (j = 0; j < nr_node_ids; j++) {
> -                     for (k = 0; k < nr_node_ids; k++) {
> -                             int distance = node_distance(i, k);
> -
> -                             if (distance > curr_distance &&
> -                                 (distance < next_distance ||
> -                                  next_distance == curr_distance))
> -                                     next_distance = distance;
> -
> -                             /*
> -                              * While not a strong assumption it would be 
> nice to know
> -                              * about cases where if node A is connected to 
> B, B is not
> -                              * equally connected to A.
> -                              */
> -                             if (sched_debug() && node_distance(k, i) != 
> distance)
> -                                     sched_numa_warn("Node-distance not 
> symmetric");
> +                     int distance = node_distance(i, j);
> 
> -                             if (sched_debug() && i && 
> !find_numa_distance(distance))
> -                                     sched_numa_warn("Node-0 not 
> representative");
> +                     if (distance < LOCAL_DISTANCE || distance >= 
> NR_DISTANCE_VALUES) {
> +                             sched_numa_warn("Invalid distance value range");
> +                             return;
>                       }
> -                     if (next_distance != curr_distance) {
> -                             sched_domains_numa_distance[level++] = 
> next_distance;
> -                             sched_domains_numa_levels = level;
> -                             curr_distance = next_distance;
> -                     } else break;
> +
> +                     bitmap_set(distance_map, distance, 1);
>               }
> +     }
> +     /*
> +      * We can now figure out how many unique distance values there are and
> +      * allocate memory accordingly.
> +      */
> +     nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
> 
> -             /*
> -              * In case of sched_debug() we verify the above assumption.
> -              */
> -             if (!sched_debug())
> -                     break;
> +     sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int),
> GFP_KERNEL);
> +     if (!sched_domains_numa_distance) {
> +             bitmap_free(distance_map);
> +             return;
> +     }
> +
> +     for (i = 0, j = 0; i < nr_levels; i++, j++) {
> +             j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
> +             sched_domains_numa_distance[i] = j;
>       }
> 
> +     bitmap_free(distance_map);
> +
>       /*
> -      * 'level' contains the number of unique distances
> +      * 'nr_levels' contains the number of unique distances
>        *
>        * The sched_domains_numa_distance[] array includes the actual distance
>        * numbers.
> @@ -1664,15 +1656,15 @@ void sched_init_numa(void)
>       /*
>        * Here, we should temporarily reset sched_domains_numa_levels to 0.
>        * If it fails to allocate memory for array 
> sched_domains_numa_masks[][],
> -      * the array will contain less then 'level' members. This could be
> +      * the array will contain less then 'nr_levels' members. This could be
>        * dangerous when we use it to iterate array 
> sched_domains_numa_masks[][]
>        * in other functions.
>        *
> -      * We reset it to 'level' at the end of this function.
> +      * We reset it to 'nr_levels' at the end of this function.
>        */
>       sched_domains_numa_levels = 0;
> 
> -     sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
> +     sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels,
> GFP_KERNEL);
>       if (!sched_domains_numa_masks)
>               return;
> 
> @@ -1680,7 +1672,7 @@ void sched_init_numa(void)
>        * Now for each level, construct a mask per node which contains all
>        * CPUs of nodes that are that many hops away from us.
>        */
> -     for (i = 0; i < level; i++) {
> +     for (i = 0; i < nr_levels; i++) {
>               sched_domains_numa_masks[i] =
>                       kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
>               if (!sched_domains_numa_masks[i])
> @@ -1688,12 +1680,17 @@ void sched_init_numa(void)
> 
>               for (j = 0; j < nr_node_ids; j++) {
>                       struct cpumask *mask = kzalloc(cpumask_size(), 
> GFP_KERNEL);
> +                     int k;
> +
>                       if (!mask)
>                               return;
> 
>                       sched_domains_numa_masks[i][j] = mask;
> 
>                       for_each_node(k) {
> +                             if (sched_debug() && (node_distance(j, k) != 
> node_distance(k,
> j)))
> +                                     sched_numa_warn("Node-distance not 
> symmetric");
> +
>                               if (node_distance(j, k) > 
> sched_domains_numa_distance[i])
>                                       continue;
> 
> @@ -1705,7 +1702,7 @@ void sched_init_numa(void)
>       /* Compute default topology size */
>       for (i = 0; sched_domain_topology[i].mask; i++);
> 
> -     tl = kzalloc((i + level + 1) *
> +     tl = kzalloc((i + nr_levels) *
>                       sizeof(struct sched_domain_topology_level), GFP_KERNEL);
>       if (!tl)
>               return;
> @@ -1728,7 +1725,7 @@ void sched_init_numa(void)
>       /*
>        * .. and append 'j' levels of NUMA goodness.
>        */
> -     for (j = 1; j < level; i++, j++) {
> +     for (j = 1; j < nr_levels; i++, j++) {
>               tl[i] = (struct sched_domain_topology_level){
>                       .mask = sd_numa_mask,
>                       .sd_flags = cpu_numa_flags,
> @@ -1740,8 +1737,8 @@ void sched_init_numa(void)
> 
>       sched_domain_topology = tl;
> 
> -     sched_domains_numa_levels = level;
> -     sched_max_numa_distance = sched_domains_numa_distance[level - 1];
> +     sched_domains_numa_levels = nr_levels;
> +     sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
> 
>       init_numa_topology_type();
>  }
> --
> 2.27.0

Reply via email to