Re: [PATCH] powerpc/nvdimm: Pick the nearby online node if the device node is not online

2019-07-15 Thread Aneesh Kumar K.V
"Aneesh Kumar K.V"  writes:

> This is similar to what ACPI does. Nvdimm layer doesn't bring the SCM device
> numa node online. Hence we need to make sure we always use an online node
> as ndr_desc.numa_node. Otherwise this result in kernel crashes. The target
> node is used by dax/kmem and that will bring up the numa node online 
> correctly.
>
> Without this patch, we do hit kernel crash as below because we try to access
> uninitialized NODE_DATA in different code paths.
>
> cpu 0x0: Vector: 300 (Data Access) at [c000fac53170]
> pc: c04bbc50: ___slab_alloc+0x120/0xca0
> lr: c04bc834: __slab_alloc+0x64/0xc0
> sp: c000fac53400
>msr: 82009033
>dar: 73e8
>  dsisr: 8
>   current = 0xc000fabb6d80
>   paca= 0xc387   irqmask: 0x03   irq_happened: 0x01
> pid   = 7, comm = kworker/u16:0
> Linux version 5.2.0-06234-g76bd729b2644 (kvaneesh@ltc-boston123) (gcc version 
> 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)) #135 SMP Thu Jul 11 05:36:30 CDT 2019
> enter ? for help
> [link register   ] c04bc834 __slab_alloc+0x64/0xc0
> [c000fac53400] c000fac53480 (unreliable)
> [c000fac53500] c04bc818 __slab_alloc+0x48/0xc0
> [c000fac53560] c04c30a0 __kmalloc_node_track_caller+0x3c0/0x6b0
> [c000fac535d0] c0cfafe4 devm_kmalloc+0x74/0xc0
> [c000fac53600] c0d69434 nd_region_activate+0x144/0x560
> [c000fac536d0] c0d6b19c nd_region_probe+0x17c/0x370
> [c000fac537b0] c0d6349c nvdimm_bus_probe+0x10c/0x230
> [c000fac53840] c0cf3cc4 really_probe+0x254/0x4e0
> [c000fac538d0] c0cf429c driver_probe_device+0x16c/0x1e0
> [c000fac53950] c0cf0b44 bus_for_each_drv+0x94/0x130
> [c000fac539b0] c0cf392c __device_attach+0xdc/0x200
> [c000fac53a50] c0cf231c bus_probe_device+0x4c/0xf0
> [c000fac53a90] c0ced268 device_add+0x528/0x810
> [c000fac53b60] c0d62a58 nd_async_device_register+0x28/0xa0
> [c000fac53bd0] c01ccb8c async_run_entry_fn+0xcc/0x1f0
> [c000fac53c50] c01bcd9c process_one_work+0x46c/0x860
> [c000fac53d20] c01bd4f4 worker_thread+0x364/0x5f0
> [c000fac53db0] c01c7260 kthread+0x1b0/0x1c0
> [c000fac53e20] c000b954 ret_from_kernel_thread+0x5c/0x68
>
> With the patch we get
>
>  # numactl -H
> available: 2 nodes (0-1)
> node 0 cpus:
> node 0 size: 0 MB
> node 0 free: 0 MB
> node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
> 25 26 27 28 29 30 31
> node 1 size: 130865 MB
> node 1 free: 129130 MB
> node distances:
> node   0   1
>   0:  10  20
>   1:  20  10
>  # cat /sys/bus/nd/devices/region0/numa_node
> 0
>  # dmesg | grep papr_scm
> [   91.332305] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Region 
> registered with target node 2 and online node 0
>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/platforms/pseries/papr_scm.c | 30 +--
>  1 file changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index c8ec670ee924..4abb0ecda30a 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -255,12 +255,32 @@ static const struct attribute_group 
> *papr_scm_dimm_groups[] = {
>   NULL,
>  };
>  
> +static inline int papr_scm_node(int node)
> +{
> + int min_dist = INT_MAX, dist;
> + int nid, min_node;
> +
> + if (node_online(node))
> + return node;

We should handle NUMA_NO_NODE here.

modified   arch/powerpc/platforms/pseries/papr_scm.c
@@ -260,7 +260,7 @@ static inline int papr_scm_node(int node)
int min_dist = INT_MAX, dist;
int nid, min_node;
 
-   if (node_online(node))
+   if ((node == NUMA_NO_NODE) || node_online(node))
return node;
 
min_node = first_online_node;

Will send an updated patch.

> +
> + min_node = first_online_node;
> + for_each_online_node(nid) {
> + dist = node_distance(node, nid);
> + if (dist < min_dist) {
> + min_dist = dist;
> + min_node = nid;
> + }
> + }
> + return min_node;
> +}
> +
>  static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  {
>   struct device *dev = >pdev->dev;
>   struct nd_mapping_desc mapping;
>   struct nd_region_desc ndr_desc;
>   unsigned long dimm_flags;
> + int target_nid, online_nid;
>  
>   p->bus_desc.ndctl = papr_scm_ndctl;
>   p->bus_desc.module = THIS_MODULE;
> @@ -299,8 +319,11 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  
>   memset(_desc, 0, sizeof(ndr_desc));
>   ndr_desc.attr_groups = region_attr_groups;
> - ndr_desc.numa_node = dev_to_node(>pdev->dev);
> - ndr_desc.target_node = ndr_desc.numa_node;
> + target_nid = 

[PATCH] powerpc/nvdimm: Pick the nearby online node if the device node is not online

2019-07-11 Thread Aneesh Kumar K.V
This is similar to what ACPI does. Nvdimm layer doesn't bring the SCM device
numa node online. Hence we need to make sure we always use an online node
as ndr_desc.numa_node. Otherwise this result in kernel crashes. The target
node is used by dax/kmem and that will bring up the numa node online correctly.

Without this patch, we do hit kernel crash as below because we try to access
uninitialized NODE_DATA in different code paths.

cpu 0x0: Vector: 300 (Data Access) at [c000fac53170]
pc: c04bbc50: ___slab_alloc+0x120/0xca0
lr: c04bc834: __slab_alloc+0x64/0xc0
sp: c000fac53400
   msr: 82009033
   dar: 73e8
 dsisr: 8
  current = 0xc000fabb6d80
  paca= 0xc387   irqmask: 0x03   irq_happened: 0x01
pid   = 7, comm = kworker/u16:0
Linux version 5.2.0-06234-g76bd729b2644 (kvaneesh@ltc-boston123) (gcc version 
7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)) #135 SMP Thu Jul 11 05:36:30 CDT 2019
enter ? for help
[link register   ] c04bc834 __slab_alloc+0x64/0xc0
[c000fac53400] c000fac53480 (unreliable)
[c000fac53500] c04bc818 __slab_alloc+0x48/0xc0
[c000fac53560] c04c30a0 __kmalloc_node_track_caller+0x3c0/0x6b0
[c000fac535d0] c0cfafe4 devm_kmalloc+0x74/0xc0
[c000fac53600] c0d69434 nd_region_activate+0x144/0x560
[c000fac536d0] c0d6b19c nd_region_probe+0x17c/0x370
[c000fac537b0] c0d6349c nvdimm_bus_probe+0x10c/0x230
[c000fac53840] c0cf3cc4 really_probe+0x254/0x4e0
[c000fac538d0] c0cf429c driver_probe_device+0x16c/0x1e0
[c000fac53950] c0cf0b44 bus_for_each_drv+0x94/0x130
[c000fac539b0] c0cf392c __device_attach+0xdc/0x200
[c000fac53a50] c0cf231c bus_probe_device+0x4c/0xf0
[c000fac53a90] c0ced268 device_add+0x528/0x810
[c000fac53b60] c0d62a58 nd_async_device_register+0x28/0xa0
[c000fac53bd0] c01ccb8c async_run_entry_fn+0xcc/0x1f0
[c000fac53c50] c01bcd9c process_one_work+0x46c/0x860
[c000fac53d20] c01bd4f4 worker_thread+0x364/0x5f0
[c000fac53db0] c01c7260 kthread+0x1b0/0x1c0
[c000fac53e20] c000b954 ret_from_kernel_thread+0x5c/0x68

With the patch we get

 # numactl -H
available: 2 nodes (0-1)
node 0 cpus:
node 0 size: 0 MB
node 0 free: 0 MB
node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
25 26 27 28 29 30 31
node 1 size: 130865 MB
node 1 free: 129130 MB
node distances:
node   0   1
  0:  10  20
  1:  20  10
 # cat /sys/bus/nd/devices/region0/numa_node
0
 # dmesg | grep papr_scm
[   91.332305] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Region 
registered with target node 2 and online node 0

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/platforms/pseries/papr_scm.c | 30 +--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index c8ec670ee924..4abb0ecda30a 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -255,12 +255,32 @@ static const struct attribute_group 
*papr_scm_dimm_groups[] = {
NULL,
 };
 
+static inline int papr_scm_node(int node)
+{
+   int min_dist = INT_MAX, dist;
+   int nid, min_node;
+
+   if (node_online(node))
+   return node;
+
+   min_node = first_online_node;
+   for_each_online_node(nid) {
+   dist = node_distance(node, nid);
+   if (dist < min_dist) {
+   min_dist = dist;
+   min_node = nid;
+   }
+   }
+   return min_node;
+}
+
 static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 {
struct device *dev = >pdev->dev;
struct nd_mapping_desc mapping;
struct nd_region_desc ndr_desc;
unsigned long dimm_flags;
+   int target_nid, online_nid;
 
p->bus_desc.ndctl = papr_scm_ndctl;
p->bus_desc.module = THIS_MODULE;
@@ -299,8 +319,11 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 
memset(_desc, 0, sizeof(ndr_desc));
ndr_desc.attr_groups = region_attr_groups;
-   ndr_desc.numa_node = dev_to_node(>pdev->dev);
-   ndr_desc.target_node = ndr_desc.numa_node;
+   target_nid = dev_to_node(>pdev->dev);
+   online_nid = papr_scm_node(target_nid);
+   set_dev_node(>pdev->dev, online_nid);
+   ndr_desc.numa_node = online_nid;
+   ndr_desc.target_node = target_nid;
ndr_desc.res = >res;
ndr_desc.of_node = p->dn;
ndr_desc.provider_data = p;
@@ -318,6 +341,9 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
ndr_desc.res, p->dn);
goto err;
}
+   if (target_nid != online_nid)
+   dev_info(dev, "Region registered with target node %d and online 
node %d",
+