Re: [PATCH] powerpc/nvdimm: Pick the nearby online node if the device node is not online
"Aneesh Kumar K.V" writes: > This is similar to what ACPI does. Nvdimm layer doesn't bring the SCM device > numa node online. Hence we need to make sure we always use an online node > as ndr_desc.numa_node. Otherwise this result in kernel crashes. The target > node is used by dax/kmem and that will bring up the numa node online > correctly. > > Without this patch, we do hit kernel crash as below because we try to access > uninitialized NODE_DATA in different code paths. > > cpu 0x0: Vector: 300 (Data Access) at [c000fac53170] > pc: c04bbc50: ___slab_alloc+0x120/0xca0 > lr: c04bc834: __slab_alloc+0x64/0xc0 > sp: c000fac53400 >msr: 82009033 >dar: 73e8 > dsisr: 8 > current = 0xc000fabb6d80 > paca= 0xc387 irqmask: 0x03 irq_happened: 0x01 > pid = 7, comm = kworker/u16:0 > Linux version 5.2.0-06234-g76bd729b2644 (kvaneesh@ltc-boston123) (gcc version > 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)) #135 SMP Thu Jul 11 05:36:30 CDT 2019 > enter ? for help > [link register ] c04bc834 __slab_alloc+0x64/0xc0 > [c000fac53400] c000fac53480 (unreliable) > [c000fac53500] c04bc818 __slab_alloc+0x48/0xc0 > [c000fac53560] c04c30a0 __kmalloc_node_track_caller+0x3c0/0x6b0 > [c000fac535d0] c0cfafe4 devm_kmalloc+0x74/0xc0 > [c000fac53600] c0d69434 nd_region_activate+0x144/0x560 > [c000fac536d0] c0d6b19c nd_region_probe+0x17c/0x370 > [c000fac537b0] c0d6349c nvdimm_bus_probe+0x10c/0x230 > [c000fac53840] c0cf3cc4 really_probe+0x254/0x4e0 > [c000fac538d0] c0cf429c driver_probe_device+0x16c/0x1e0 > [c000fac53950] c0cf0b44 bus_for_each_drv+0x94/0x130 > [c000fac539b0] c0cf392c __device_attach+0xdc/0x200 > [c000fac53a50] c0cf231c bus_probe_device+0x4c/0xf0 > [c000fac53a90] c0ced268 device_add+0x528/0x810 > [c000fac53b60] c0d62a58 nd_async_device_register+0x28/0xa0 > [c000fac53bd0] c01ccb8c async_run_entry_fn+0xcc/0x1f0 > [c000fac53c50] c01bcd9c process_one_work+0x46c/0x860 > [c000fac53d20] c01bd4f4 worker_thread+0x364/0x5f0 > [c000fac53db0] c01c7260 kthread+0x1b0/0x1c0 > [c000fac53e20] c000b954 ret_from_kernel_thread+0x5c/0x68 > > With the patch we get > > # numactl -H > available: 2 nodes (0-1) > node 0 cpus: > node 0 size: 0 MB > node 0 free: 0 MB > node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 > 25 26 27 28 29 30 31 > node 1 size: 130865 MB > node 1 free: 129130 MB > node distances: > node 0 1 > 0: 10 20 > 1: 20 10 > # cat /sys/bus/nd/devices/region0/numa_node > 0 > # dmesg | grep papr_scm > [ 91.332305] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Region > registered with target node 2 and online node 0 > > Signed-off-by: Aneesh Kumar K.V > --- > arch/powerpc/platforms/pseries/papr_scm.c | 30 +-- > 1 file changed, 28 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/platforms/pseries/papr_scm.c > b/arch/powerpc/platforms/pseries/papr_scm.c > index c8ec670ee924..4abb0ecda30a 100644 > --- a/arch/powerpc/platforms/pseries/papr_scm.c > +++ b/arch/powerpc/platforms/pseries/papr_scm.c > @@ -255,12 +255,32 @@ static const struct attribute_group > *papr_scm_dimm_groups[] = { > NULL, > }; > > +static inline int papr_scm_node(int node) > +{ > + int min_dist = INT_MAX, dist; > + int nid, min_node; > + > + if (node_online(node)) > + return node; We should handle NUMA_NO_NODE here. modified arch/powerpc/platforms/pseries/papr_scm.c @@ -260,7 +260,7 @@ static inline int papr_scm_node(int node) int min_dist = INT_MAX, dist; int nid, min_node; - if (node_online(node)) + if ((node == NUMA_NO_NODE) || node_online(node)) return node; min_node = first_online_node; Will send an updated patch. > + > + min_node = first_online_node; > + for_each_online_node(nid) { > + dist = node_distance(node, nid); > + if (dist < min_dist) { > + min_dist = dist; > + min_node = nid; > + } > + } > + return min_node; > +} > + > static int papr_scm_nvdimm_init(struct papr_scm_priv *p) > { > struct device *dev = >pdev->dev; > struct nd_mapping_desc mapping; > struct nd_region_desc ndr_desc; > unsigned long dimm_flags; > + int target_nid, online_nid; > > p->bus_desc.ndctl = papr_scm_ndctl; > p->bus_desc.module = THIS_MODULE; > @@ -299,8 +319,11 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) > > memset(_desc, 0, sizeof(ndr_desc)); > ndr_desc.attr_groups = region_attr_groups; > - ndr_desc.numa_node = dev_to_node(>pdev->dev); > - ndr_desc.target_node = ndr_desc.numa_node; > + target_nid =
[PATCH] powerpc/nvdimm: Pick the nearby online node if the device node is not online
This is similar to what ACPI does. Nvdimm layer doesn't bring the SCM device numa node online. Hence we need to make sure we always use an online node as ndr_desc.numa_node. Otherwise this result in kernel crashes. The target node is used by dax/kmem and that will bring up the numa node online correctly. Without this patch, we do hit kernel crash as below because we try to access uninitialized NODE_DATA in different code paths. cpu 0x0: Vector: 300 (Data Access) at [c000fac53170] pc: c04bbc50: ___slab_alloc+0x120/0xca0 lr: c04bc834: __slab_alloc+0x64/0xc0 sp: c000fac53400 msr: 82009033 dar: 73e8 dsisr: 8 current = 0xc000fabb6d80 paca= 0xc387 irqmask: 0x03 irq_happened: 0x01 pid = 7, comm = kworker/u16:0 Linux version 5.2.0-06234-g76bd729b2644 (kvaneesh@ltc-boston123) (gcc version 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)) #135 SMP Thu Jul 11 05:36:30 CDT 2019 enter ? for help [link register ] c04bc834 __slab_alloc+0x64/0xc0 [c000fac53400] c000fac53480 (unreliable) [c000fac53500] c04bc818 __slab_alloc+0x48/0xc0 [c000fac53560] c04c30a0 __kmalloc_node_track_caller+0x3c0/0x6b0 [c000fac535d0] c0cfafe4 devm_kmalloc+0x74/0xc0 [c000fac53600] c0d69434 nd_region_activate+0x144/0x560 [c000fac536d0] c0d6b19c nd_region_probe+0x17c/0x370 [c000fac537b0] c0d6349c nvdimm_bus_probe+0x10c/0x230 [c000fac53840] c0cf3cc4 really_probe+0x254/0x4e0 [c000fac538d0] c0cf429c driver_probe_device+0x16c/0x1e0 [c000fac53950] c0cf0b44 bus_for_each_drv+0x94/0x130 [c000fac539b0] c0cf392c __device_attach+0xdc/0x200 [c000fac53a50] c0cf231c bus_probe_device+0x4c/0xf0 [c000fac53a90] c0ced268 device_add+0x528/0x810 [c000fac53b60] c0d62a58 nd_async_device_register+0x28/0xa0 [c000fac53bd0] c01ccb8c async_run_entry_fn+0xcc/0x1f0 [c000fac53c50] c01bcd9c process_one_work+0x46c/0x860 [c000fac53d20] c01bd4f4 worker_thread+0x364/0x5f0 [c000fac53db0] c01c7260 kthread+0x1b0/0x1c0 [c000fac53e20] c000b954 ret_from_kernel_thread+0x5c/0x68 With the patch we get # numactl -H available: 2 nodes (0-1) node 0 cpus: node 0 size: 0 MB node 0 free: 0 MB node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 node 1 size: 130865 MB node 1 free: 129130 MB node distances: node 0 1 0: 10 20 1: 20 10 # cat /sys/bus/nd/devices/region0/numa_node 0 # dmesg | grep papr_scm [ 91.332305] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Region registered with target node 2 and online node 0 Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/platforms/pseries/papr_scm.c | 30 +-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index c8ec670ee924..4abb0ecda30a 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -255,12 +255,32 @@ static const struct attribute_group *papr_scm_dimm_groups[] = { NULL, }; +static inline int papr_scm_node(int node) +{ + int min_dist = INT_MAX, dist; + int nid, min_node; + + if (node_online(node)) + return node; + + min_node = first_online_node; + for_each_online_node(nid) { + dist = node_distance(node, nid); + if (dist < min_dist) { + min_dist = dist; + min_node = nid; + } + } + return min_node; +} + static int papr_scm_nvdimm_init(struct papr_scm_priv *p) { struct device *dev = >pdev->dev; struct nd_mapping_desc mapping; struct nd_region_desc ndr_desc; unsigned long dimm_flags; + int target_nid, online_nid; p->bus_desc.ndctl = papr_scm_ndctl; p->bus_desc.module = THIS_MODULE; @@ -299,8 +319,11 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) memset(_desc, 0, sizeof(ndr_desc)); ndr_desc.attr_groups = region_attr_groups; - ndr_desc.numa_node = dev_to_node(>pdev->dev); - ndr_desc.target_node = ndr_desc.numa_node; + target_nid = dev_to_node(>pdev->dev); + online_nid = papr_scm_node(target_nid); + set_dev_node(>pdev->dev, online_nid); + ndr_desc.numa_node = online_nid; + ndr_desc.target_node = target_nid; ndr_desc.res = >res; ndr_desc.of_node = p->dn; ndr_desc.provider_data = p; @@ -318,6 +341,9 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) ndr_desc.res, p->dn); goto err; } + if (target_nid != online_nid) + dev_info(dev, "Region registered with target node %d and online node %d", +