Hi Laurent,

Laurent Dufour <lduf...@linux.ibm.com> writes:
> When a CPU is hot added, the CPU ids are taken from the available mask from
> the lower possible set. If that set of values was previously used for CPU
> attached to a different node, this seems to application like if these CPUs
> have migrated from a node to another one which is not expected in real
> life.

This seems like a problem that could affect other architectures or
platforms? I guess as long as arch code is responsible for placing new
CPUs in cpu_present_mask, that code will have the responsibility of
ensuring CPU IDs' NUMA assignments remain stable.

[...]

> The effect of this patch can be seen by removing and adding CPUs using the
> Qemu monitor. In the following case, the first CPU from the node 2 is
> removed, then the first one from the node 1 is removed too. Later, the
> first CPU of the node 2 is added back. Without that patch, the kernel will
> numbered these CPUs using the first CPU ids available which are the ones
> freed when removing the second CPU of the node 0. This leads to the CPU ids
> 16-23 to move from the node 1 to the node 2. With the patch applied, the
> CPU ids 32-39 are used since they are the lowest free ones which have not
> been used on another node.
>
> At boot time:
> [root@vm40 ~]# numactl -H | grep cpus
> node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
> node 1 cpus: 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
> node 2 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
>
> Vanilla kernel, after the CPU hot unplug/plug operations:
> [root@vm40 ~]# numactl -H | grep cpus
> node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
> node 1 cpus: 24 25 26 27 28 29 30 31
> node 2 cpus: 16 17 18 19 20 21 22 23 40 41 42 43 44 45 46 47
>
> Patched kernel, after the CPU hot unplug/plug operations:
> [root@vm40 ~]# numactl -H | grep cpus
> node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
> node 1 cpus: 24 25 26 27 28 29 30 31
> node 2 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47

Good demonstration of the problem. CPUs 16-23 "move" from node 1 to
node 2.


> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 12cbffd3c2e3..48c7943b25b0 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -39,6 +39,8 @@
>  /* This version can't take the spinlock, because it never returns */
>  static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE;
>  
> +static cpumask_var_t node_recorded_ids_map[MAX_NUMNODES];

I guess this should have documentation that it must be
accessed/manipulated with cpu_add_remove_lock held?


> +
>  static void rtas_stop_self(void)
>  {
>       static struct rtas_args args;
> @@ -151,29 +153,61 @@ static void pseries_cpu_die(unsigned int cpu)
>   */
>  static int pseries_add_processor(struct device_node *np)
>  {
> -     unsigned int cpu;
> +     unsigned int cpu, node;
>       cpumask_var_t candidate_mask, tmp;
> -     int err = -ENOSPC, len, nthreads, i;
> +     int err = -ENOSPC, len, nthreads, i, nid;

>From eight local vars to ten, and the two new variables' names are
"node" and "nid". More distinctive names would help readers.


>       const __be32 *intserv;
> +     bool force_reusing = false;
>  
>       intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
>       if (!intserv)
>               return 0;
>  
> -     zalloc_cpumask_var(&candidate_mask, GFP_KERNEL);
> -     zalloc_cpumask_var(&tmp, GFP_KERNEL);
> +     alloc_cpumask_var(&candidate_mask, GFP_KERNEL);
> +     alloc_cpumask_var(&tmp, GFP_KERNEL);
> +
> +     /*
> +      * Fetch from the DT nodes read by dlpar_configure_connector() the NUMA
> +      * node id the added CPU belongs to.
> +      */
> +     nid = of_node_to_nid(np);
> +     if (nid < 0 || !node_possible(nid))
> +             nid = first_online_node;
>  
>       nthreads = len / sizeof(u32);
> -     for (i = 0; i < nthreads; i++)
> -             cpumask_set_cpu(i, tmp);
>  
>       cpu_maps_update_begin();
>  
>       BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask));
>  
> +again:
> +     cpumask_clear(candidate_mask);
> +     cpumask_clear(tmp);
> +     for (i = 0; i < nthreads; i++)
> +             cpumask_set_cpu(i, tmp);
> +
>       /* Get a bitmap of unoccupied slots. */
>       cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
> +
> +     /*
> +      * Remove free ids previously assigned on the other nodes. We can walk
> +      * only online nodes because once a node became online it is not turned
> +      * offlined back.
> +      */
> +     if (!force_reusing)
> +             for_each_online_node(node) {
> +                     if (node == nid) /* Keep our node's recorded ids */
> +                             continue;
> +                     cpumask_andnot(candidate_mask, candidate_mask,
> +                                    node_recorded_ids_map[node]);
> +             }
> +
>       if (cpumask_empty(candidate_mask)) {
> +             if (!force_reusing) {
> +                     force_reusing = true;
> +                     goto again;
> +             }
> +

Hmm I'd encourage you to work toward a solution that doesn't involve
adding backwards jumps and a bool flag to this code.

The function already mixes concerns and this change makes it a bit more
difficult to follow. I'd suggest that you first factor out into a
separate function the parts that allocate a suitable range from
cpu_possible_mask, and only then introduce the behavior change
constraining the results.


>               /* If we get here, it most likely means that NR_CPUS is
>                * less than the partition's max processors setting.
>                */
> @@ -191,12 +225,36 @@ static int pseries_add_processor(struct device_node *np)
>                       cpumask_shift_left(tmp, tmp, nthreads);
>  
>       if (cpumask_empty(tmp)) {
> +             if (!force_reusing) {
> +                     force_reusing = true;
> +                     goto again;
> +             }
>               printk(KERN_ERR "Unable to find space in cpu_present_mask for"
>                      " processor %pOFn with %d thread(s)\n", np,
>                      nthreads);
>               goto out_unlock;
>       }
>  
> +     /* Record the newly used CPU ids for the associate node. */
> +     cpumask_or(node_recorded_ids_map[nid], node_recorded_ids_map[nid], tmp);
> +
> +     /*
> +      * If we force reusing the id, remove these ids from any node which was
> +      * previously using it.
> +      */
> +     if (force_reusing) {
> +             cpu = cpumask_first(tmp);
> +             pr_warn("Reusing free CPU ids %d-%d from another node\n",
> +                     cpu, cpu + nthreads - 1);
> +
> +             for_each_online_node(node) {
> +                     if (node == nid)
> +                             continue;
> +                     cpumask_andnot(node_recorded_ids_map[node],
> +                                    node_recorded_ids_map[node], tmp);
> +             }
> +     }
> +

I don't know, should we not fail the request instead of doing the
ABI-breaking thing the code in this change is trying to prevent? I don't
think a warning in the kernel log is going to help any application that
would be affected by this.


>       for_each_cpu(cpu, tmp) {
>               BUG_ON(cpu_present(cpu));
>               set_cpu_present(cpu, true);
> @@ -889,6 +947,7 @@ static struct notifier_block pseries_smp_nb = {
>  static int __init pseries_cpu_hotplug_init(void)
>  {
>       int qcss_tok;
> +     unsigned int node;
>  
>  #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
>       ppc_md.cpu_probe = dlpar_cpu_probe;
> @@ -910,8 +969,18 @@ static int __init pseries_cpu_hotplug_init(void)
>       smp_ops->cpu_die = pseries_cpu_die;
>  
>       /* Processors can be added/removed only on LPAR */
> -     if (firmware_has_feature(FW_FEATURE_LPAR))
> +     if (firmware_has_feature(FW_FEATURE_LPAR)) {
> +             for_each_node(node) {
> +                     alloc_bootmem_cpumask_var(&node_recorded_ids_map[node]);
> +
> +                     /* Record ids of CPU added at boot time */
> +                     cpumask_or(node_recorded_ids_map[node],
> +                                node_recorded_ids_map[node],
> +                                node_to_cpumask_map[node]);
> +             }
> +
>               of_reconfig_notifier_register(&pseries_smp_nb);
> +     }

This looks OK.

Reply via email to