Re: [PATCH v5 1/6] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-07-21 Thread Aneesh Kumar K.V

On 7/22/21 8:06 AM, David Gibson wrote:

On Thu, Jul 22, 2021 at 11:59:15AM +1000, David Gibson wrote:

On Mon, Jun 28, 2021 at 08:41:12PM +0530, Aneesh Kumar K.V wrote:

No functional change in this patch.


The new name does not match how you describe "primary domain index" in
the documentation from patch 6/6.  There it comes from the values in
associativity-reference-points, but here it simply comes from the
lengths of all the associativity properties.


No, sorry, I misread this code... misled by the old name, so it's a
good thing you're changing it.

But.. I'm still not sure the new name is accurate, either...

[snip]

if (form1_affinity) {
-   depth = of_read_number(distance_ref_points, 1);
+   index = of_read_number(distance_ref_points, 1);


AFAICT distance_ref_points hasn't been altered from the
of_get_property() at this point, so isn't this setting depth / index
to the number of entries in ref-points, rather than the value of the
first entry (which is what primary domain index is supposed to be)?



ibm,associativity-reference-points property format is as below.

# lsprop  ibm,associativity-reference-points
ibm,associativity-reference-points
		 00000004 00000002

It doesn't encode the number of elements as the first item.

For FORM1, the 1st element is the NUMA boundary index/primary_domain_index.
For FORM0, the 2nd element is the NUMA boundary index/primary_domain_index.
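
A minimal userspace sketch of that lookup (not kernel code; ntohl() stands
in for of_read_number(), and the property cells mirror the lsprop output
above):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	/* ibm,associativity-reference-points cells, big-endian like the DT */
	uint32_t ref_points[] = { htonl(4), htonl(2) };

	/* FORM1: the primary domain index is the first cell */
	unsigned int form1_index = ntohl(ref_points[0]);
	/* FORM0: the primary domain index is the second cell */
	unsigned int form0_index = ntohl(ref_points[1]);

	printf("FORM1 primary_domain_index = %u\n", form1_index);	/* 4 */
	printf("FORM0 primary_domain_index = %u\n", form0_index);	/* 2 */
	return 0;
}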



} else {
if (distance_ref_points_depth < 2) {
printk(KERN_WARNING "NUMA: "
@@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
goto err;
}
  
-		depth = of_read_number(&distance_ref_points[1], 1);

+   index = of_read_number(&distance_ref_points[1], 1);
}
  
  	/*

@@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
}
  
  	of_node_put(root);

-   return depth;
+   return index;
  
  err:

of_node_put(root);
@@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
int nid = default_nid;
int rc, index;
  
-	if ((min_common_depth < 0) || !numa_enabled)

+   if ((primary_domain_index < 0) || !numa_enabled)
return default_nid;
  
  	rc = of_get_assoc_arrays(&aa);

if (rc)
return default_nid;
  
-	if (min_common_depth <= aa.array_sz &&

+   if (primary_domain_index <= aa.array_sz &&
!(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
-   index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
+   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
nid = of_read_number(&aa.arrays[index], 1);
  
		if (nid == 0xffff || nid >= nr_node_ids)

@@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
return -1;
}
  
-	min_common_depth = find_min_common_depth();

+   primary_domain_index = find_primary_domain_index();
  
-	if (min_common_depth < 0) {

+   if (primary_domain_index < 0) {
/*
-* if we fail to parse min_common_depth from device tree
+* if we fail to parse primary_domain_index from device tree
 * mark the numa disabled, boot with numa disabled.
 */
numa_enabled = false;
-   return min_common_depth;
+   return primary_domain_index;
}
  
-	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

+   dbg("NUMA associativity depth for CPU/Memory: %d\n", 
primary_domain_index);
  
  	/*

 * Even though we connect cpus to numa domains later in SMP
@@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
goto out;
}
  
-	max_nodes = of_read_number(&domains[min_common_depth], 1);

+   max_nodes = of_read_number(&domains[primary_domain_index], 1);
for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
  
  	prop_length /= sizeof(int);

-   if (prop_length > min_common_depth + 2)
+   if (prop_length > primary_domain_index + 2)
coregroup_enabled = 1;
  
  out:

@@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
goto out;
  
  	index = of_read_number(associativity, 1);

-   if (index > min_common_depth + 1)
+   if (index > primary_domain_index + 1)
return of_read_number(&associativity[index - 1], 1);
  
  out:










[powerpc:merge] BUILD SUCCESS bbff45e5b6020c1259f90034926aef03dce1803d

2021-07-21 Thread kernel test robot
720
x86_64   randconfig-a001-20210720
x86_64   randconfig-a005-20210720
x86_64   randconfig-a004-20210720
x86_64   randconfig-a002-20210720
i386 randconfig-a005-20210720
i386 randconfig-a003-20210720
i386 randconfig-a004-20210720
i386 randconfig-a002-20210720
i386 randconfig-a001-20210720
i386 randconfig-a006-20210720
i386 randconfig-a005-20210722
i386 randconfig-a003-20210722
i386 randconfig-a004-20210722
i386 randconfig-a002-20210722
i386 randconfig-a001-20210722
i386 randconfig-a006-20210722
i386 randconfig-a005-20210719
i386 randconfig-a004-20210719
i386 randconfig-a006-20210719
i386 randconfig-a001-20210719
i386 randconfig-a003-20210719
i386 randconfig-a002-20210719
i386 randconfig-a016-20210720
i386 randconfig-a013-20210720
i386 randconfig-a012-20210720
i386 randconfig-a014-20210720
i386 randconfig-a011-20210720
i386 randconfig-a015-20210720
i386 randconfig-a016-20210721
i386 randconfig-a013-20210721
i386 randconfig-a012-20210721
i386 randconfig-a014-20210721
i386 randconfig-a011-20210721
i386 randconfig-a015-20210721
riscvnommu_k210_defconfig
riscvallyesconfig
riscv allnoconfig
riscv   defconfig
riscvallmodconfig
x86_64rhel-8.3-kselftests
um   x86_64_defconfig
um i386_defconfig
x86_64   allyesconfig
x86_64  defconfig
x86_64   rhel-8.3
x86_64  rhel-8.3-kbuiltin
x86_64  kexec

clang tested configs:
x86_64   randconfig-b001-20210720
x86_64   randconfig-a011-20210722
x86_64   randconfig-a003-20210721
x86_64   randconfig-a006-20210721
x86_64   randconfig-a001-20210721
x86_64   randconfig-a005-20210721
x86_64   randconfig-a004-20210721
x86_64   randconfig-a002-20210721

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


Re: [PATCH v5 1/6] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-07-21 Thread David Gibson
On Thu, Jul 22, 2021 at 11:59:15AM +1000, David Gibson wrote:
> On Mon, Jun 28, 2021 at 08:41:12PM +0530, Aneesh Kumar K.V wrote:
> > No functional change in this patch.
> 
> The new name does not match how you describe "primary domain index" in
> the documentation from patch 6/6.  There it comes from the values in
> associativity-reference-points, but here it simply comes from the
> lengths of all the associativity properties.

No, sorry, I misread this code... misled by the old name, so it's a
good thing you're changing it.

But.. I'm still not sure the new name is accurate, either...

[snip]
> > if (form1_affinity) {
> > -   depth = of_read_number(distance_ref_points, 1);
> > +   index = of_read_number(distance_ref_points, 1);

AFAICT distance_ref_points hasn't been altered from the
of_get_property() at this point, so isn't this setting depth / index
to the number of entries in ref-points, rather than the value of the
first entry (which is what primary domain index is supposed to be)?

> > } else {
> > if (distance_ref_points_depth < 2) {
> > printk(KERN_WARNING "NUMA: "
> > @@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
> > goto err;
> > }
> >  
> > -   depth = of_read_number(&distance_ref_points[1], 1);
> > +   index = of_read_number(&distance_ref_points[1], 1);
> > }
> >  
> > /*
> > @@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
> > }
> >  
> > of_node_put(root);
> > -   return depth;
> > +   return index;
> >  
> >  err:
> > of_node_put(root);
> > @@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
> > int nid = default_nid;
> > int rc, index;
> >  
> > -   if ((min_common_depth < 0) || !numa_enabled)
> > +   if ((primary_domain_index < 0) || !numa_enabled)
> > return default_nid;
> >  
> > rc = of_get_assoc_arrays(&aa);
> > if (rc)
> > return default_nid;
> >  
> > -   if (min_common_depth <= aa.array_sz &&
> > +   if (primary_domain_index <= aa.array_sz &&
> > !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
> > aa.n_arrays) {
> > -   index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
> > +   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> > nid = of_read_number(&aa.arrays[index], 1);
> >  
> > if (nid == 0xffff || nid >= nr_node_ids)
> > @@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
> > return -1;
> > }
> >  
> > -   min_common_depth = find_min_common_depth();
> > +   primary_domain_index = find_primary_domain_index();
> >  
> > -   if (min_common_depth < 0) {
> > +   if (primary_domain_index < 0) {
> > /*
> > -* if we fail to parse min_common_depth from device tree
> > +* if we fail to parse primary_domain_index from device tree
> >  * mark the numa disabled, boot with numa disabled.
> >  */
> > numa_enabled = false;
> > -   return min_common_depth;
> > +   return primary_domain_index;
> > }
> >  
> > -   dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
> > +   dbg("NUMA associativity depth for CPU/Memory: %d\n", 
> > primary_domain_index);
> >  
> > /*
> >  * Even though we connect cpus to numa domains later in SMP
> > @@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
> > goto out;
> > }
> >  
> > -   max_nodes = of_read_number(&domains[min_common_depth], 1);
> > +   max_nodes = of_read_number(&domains[primary_domain_index], 1);
> > for (i = 0; i < max_nodes; i++) {
> > if (!node_possible(i))
> > node_set(i, node_possible_map);
> > }
> >  
> > prop_length /= sizeof(int);
> > -   if (prop_length > min_common_depth + 2)
> > +   if (prop_length > primary_domain_index + 2)
> > coregroup_enabled = 1;
> >  
> >  out:
> > @@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
> > goto out;
> >  
> > index = of_read_number(associativity, 1);
> > -   if (index > min_common_depth + 1)
> > +   if (index > primary_domain_index + 1)
> > return of_read_number(&associativity[index - 1], 1);
> >  
> >  out:
> 



-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 2/6] powerpc/pseries: rename distance_ref_points_depth to max_associativity_domain_index

2021-07-21 Thread David Gibson
On Mon, Jun 28, 2021 at 08:41:13PM +0530, Aneesh Kumar K.V wrote:
> No functional change in this patch
> 
> Signed-off-by: Aneesh Kumar K.V 

Reviewed-by: David Gibson 

> ---
>  arch/powerpc/mm/numa.c | 20 ++--
>  1 file changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 8365b298ec48..132813dd1a6c 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
>  
>  #define MAX_DISTANCE_REF_POINTS 4
> -static int distance_ref_points_depth;
> +static int max_associativity_domain_index;
>  static const __be32 *distance_ref_points;
>  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
>  
> @@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  
>   int i, index;
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_associativity_domain_index; i++) {
>   index = be32_to_cpu(distance_ref_points[i]);
>   if (cpu1_assoc[index] == cpu2_assoc[index])
>   break;
> @@ -193,7 +193,7 @@ int __node_distance(int a, int b)
>   if (!form1_affinity)
>   return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_associativity_domain_index; i++) {
>   if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
>   break;
>  
> @@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
>   if (!form1_affinity)
>   return;
>  
> - for (i = 0; i < distance_ref_points_depth; i++) {
> + for (i = 0; i < max_associativity_domain_index; i++) {
>   const __be32 *entry;
>  
>   entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> @@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 
> *associativity)
>   nid = NUMA_NO_NODE;
>  
>   if (nid > 0 &&
> - of_read_number(associativity, 1) >= distance_ref_points_depth) {
> + of_read_number(associativity, 1) >= 
> max_associativity_domain_index) {
>   /*
>* Skip the length field and send start of associativity array
>*/
> @@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
>*/
>   distance_ref_points = of_get_property(root,
>   "ibm,associativity-reference-points",
> - &distance_ref_points_depth);
> + &max_associativity_domain_index);
>  
>   if (!distance_ref_points) {
>   dbg("NUMA: ibm,associativity-reference-points not found.\n");
>   goto err;
>   }
>  
> - distance_ref_points_depth /= sizeof(int);
> + max_associativity_domain_index /= sizeof(int);
>  
>   if (firmware_has_feature(FW_FEATURE_OPAL) ||
>   firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
> @@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
>   if (form1_affinity) {
>   index = of_read_number(distance_ref_points, 1);
>   } else {
> - if (distance_ref_points_depth < 2) {
> + if (max_associativity_domain_index < 2) {
>   printk(KERN_WARNING "NUMA: "
>   "short ibm,associativity-reference-points\n");
>   goto err;
> @@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
>* Warn and cap if the hardware supports more than
>* MAX_DISTANCE_REF_POINTS domains.
>*/
> - if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
> + if (max_associativity_domain_index > MAX_DISTANCE_REF_POINTS) {
>   printk(KERN_WARNING "NUMA: distance array capped at "
>   "%d entries\n", MAX_DISTANCE_REF_POINTS);
> - distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
> + max_associativity_domain_index = MAX_DISTANCE_REF_POINTS;
>   }
>  
>   of_node_put(root);

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 1/6] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-07-21 Thread David Gibson
On Mon, Jun 28, 2021 at 08:41:12PM +0530, Aneesh Kumar K.V wrote:
> No functional change in this patch.

The new name does not match how you describe "primary domain index" in
the documentation from patch 6/6.  There it comes from the values in
associativity-reference-points, but here it simply comes from the
lengths of all the associativity properties.

> 
> Reviewed-by: David Gibson 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/mm/numa.c | 38 +++---
>  1 file changed, 19 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index f2bf98bdcea2..8365b298ec48 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
>  EXPORT_SYMBOL(node_to_cpumask_map);
>  EXPORT_SYMBOL(node_data);
>  
> -static int min_common_depth;
> +static int primary_domain_index;
>  static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
>  
> @@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 
> *associativity)
>   if (!numa_enabled)
>   goto out;
>  
> - if (of_read_number(associativity, 1) >= min_common_depth)
> - nid = of_read_number(&associativity[min_common_depth], 1);
> + if (of_read_number(associativity, 1) >= primary_domain_index)
> + nid = of_read_number(&associativity[primary_domain_index], 1);
>  
> >   /* POWER4 LPAR uses 0xffff as invalid node */
> >   if (nid == 0xffff || nid >= nr_node_ids)
> @@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> -static int __init find_min_common_depth(void)
> +static int __init find_primary_domain_index(void)
>  {
> - int depth;
> + int index;
>   struct device_node *root;
>  
>   if (firmware_has_feature(FW_FEATURE_OPAL))
> @@ -326,7 +326,7 @@ static int __init find_min_common_depth(void)
>   }
>  
>   if (form1_affinity) {
> - depth = of_read_number(distance_ref_points, 1);
> + index = of_read_number(distance_ref_points, 1);
>   } else {
>   if (distance_ref_points_depth < 2) {
>   printk(KERN_WARNING "NUMA: "
> @@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
>   goto err;
>   }
>  
> - depth = of_read_number(&distance_ref_points[1], 1);
> + index = of_read_number(&distance_ref_points[1], 1);
>   }
>  
>   /*
> @@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
>   }
>  
>   of_node_put(root);
> - return depth;
> + return index;
>  
>  err:
>   of_node_put(root);
> @@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>   int nid = default_nid;
>   int rc, index;
>  
> - if ((min_common_depth < 0) || !numa_enabled)
> + if ((primary_domain_index < 0) || !numa_enabled)
>   return default_nid;
>  
>   rc = of_get_assoc_arrays(&aa);
>   if (rc)
>   return default_nid;
>  
> - if (min_common_depth <= aa.array_sz &&
> + if (primary_domain_index <= aa.array_sz &&
>   !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
> aa.n_arrays) {
> - index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
> + index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
>   nid = of_read_number(&aa.arrays[index], 1);
>  
> >   if (nid == 0xffff || nid >= nr_node_ids)
> @@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
>   return -1;
>   }
>  
> - min_common_depth = find_min_common_depth();
> + primary_domain_index = find_primary_domain_index();
>  
> - if (min_common_depth < 0) {
> + if (primary_domain_index < 0) {
>   /*
> -  * if we fail to parse min_common_depth from device tree
> +  * if we fail to parse primary_domain_index from device tree
>* mark the numa disabled, boot with numa disabled.
>*/
>   numa_enabled = false;
> - return min_common_depth;
> + return primary_domain_index;
>   }
>  
> - dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
> + dbg("NUMA associativity depth for CPU/Memory: %d\n", 
> primary_domain_index);
>  
>   /*
>* Even though we connect cpus to numa domains later in SMP
> @@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
>   goto out;
>   }
>  
> - max_nodes = of_read_number(&domains[min_common_depth], 1);
> + max_nodes = of_read_number(&domains[primary_domain_index], 1);
>   for (i = 0; i < max_nodes; i++) {
>   if (!node_possible(i))
>   node_set(i, node_possible_map);
>   }
>  
>   prop_length /= sizeof(int);
> - if (prop_length > min_common_d

Re: [PATCH v5 4/6] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-07-21 Thread David Gibson
On Mon, Jun 28, 2021 at 08:41:15PM +0530, Aneesh Kumar K.V wrote:
> The associativity details of the newly added resources are collected from
> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> distance details of the newly added numa node after the above call.
> 
> Instead of updating NUMA distance every time we lookup a node id
> from the associativity property, add helpers that can be used
> during boot which does this only once. Also remove the distance
> update from node id lookup helpers.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/mm/numa.c| 173 +-
>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
>  .../platforms/pseries/hotplug-memory.c|   2 +
>  arch/powerpc/platforms/pseries/pseries.h  |   1 +
>  4 files changed, 132 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 0ec16999beef..7b142f79d600 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -208,22 +208,6 @@ int __node_distance(int a, int b)
>  }
>  EXPORT_SYMBOL(__node_distance);
>  
> -static void initialize_distance_lookup_table(int nid,
> - const __be32 *associativity)
> -{
> - int i;
> -
> - if (affinity_form != FORM1_AFFINITY)
> - return;
> -
> - for (i = 0; i < max_associativity_domain_index; i++) {
> - const __be32 *entry;
> -
> - entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> - distance_lookup_table[nid][i] = of_read_number(entry, 1);
> - }
> -}
> -
>  /*
>   * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
>   * info is found.
> @@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 
> *associativity)
> >   /* POWER4 LPAR uses 0xffff as invalid node */
> >   if (nid == 0xffff || nid >= nr_node_ids)
>   nid = NUMA_NO_NODE;
> -
> - if (nid > 0 &&
> - of_read_number(associativity, 1) >= 
> max_associativity_domain_index) {
> - /*
> -  * Skip the length field and send start of associativity array
> -  */
> - initialize_distance_lookup_table(nid, associativity + 1);
> - }
> -
>  out:
>   return nid;
>  }
> @@ -287,6 +262,49 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> +static void __initialize_form1_numa_distance(const __be32 *associativity)
> +{
> + int i, nid;
> +
> + if (affinity_form != FORM1_AFFINITY)

Since this shouldn't be called on a !form1 system, this could be a WARN_ON().

> + return;
> +
> + if (of_read_number(associativity, 1) >= primary_domain_index) {
> + nid = of_read_number(&associativity[primary_domain_index], 1);

This computes the nid from the assoc array independently of
associativity_to_nid, which doesn't seem like a good idea.  Wouldn't
it be better to call associativity_to_nid(), then make the next bit
conditional on nid != NUMA_NO_NODE?
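
One way to read that (together with the WARN_ON() comment further up), as a
sketch against the helper quoted from the patch; illustration only, not code
from the series:

static void __initialize_form1_numa_distance(const __be32 *associativity)
{
	int i, nid;

	if (WARN_ON(affinity_form != FORM1_AFFINITY))
		return;

	/* reuse the existing helper instead of open-coding the nid lookup */
	nid = associativity_to_nid(associativity);
	if (nid == NUMA_NO_NODE)
		return;

	for (i = 0; i < max_associativity_domain_index; i++) {
		const __be32 *entry;

		entry = &associativity[be32_to_cpu(distance_ref_points[i])];
		distance_lookup_table[nid][i] = of_read_number(entry, 1);
	}
}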

> +
> + for (i = 0; i < max_associativity_domain_index; i++) {
> + const __be32 *entry;
> +
> + entry = 
> &associativity[be32_to_cpu(distance_ref_points[i])];
> + distance_lookup_table[nid][i] = of_read_number(entry, 
> 1);
> + }
> + }
> +}
> +
> +static void initialize_form1_numa_distance(struct device_node *node)
> +{
> + const __be32 *associativity;
> +
> + associativity = of_get_associativity(node);
> + if (!associativity)
> + return;
> +
> + __initialize_form1_numa_distance(associativity);
> +}
> +
> +/*
> + * Used to update distance information w.r.t newly added node.
> + */
> +void update_numa_distance(struct device_node *node)
> +{
> + if (affinity_form == FORM0_AFFINITY)
> + return;
> + else if (affinity_form == FORM1_AFFINITY) {
> + initialize_form1_numa_distance(node);
> + return;
> + }
> +}
> +
>  static int __init find_primary_domain_index(void)
>  {
>   int index;
> @@ -433,6 +451,48 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
>   return 0;
>  }
>  
> +static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
> +{
> + struct assoc_arrays aa = { .arrays = NULL };
> + int default_nid = NUMA_NO_NODE;
> + int nid = default_nid;
> + int rc, index;
> +
> + if ((primary_domain_index < 0) || !numa_enabled)

Under what circumstances could you get primary_domain_index < 0?

> + return default_nid;
> +
> + rc = of_get_assoc_arrays(&aa);
> + if (rc)
> + return default_nid;
> +
> + if (primary_domain_index <= aa.array_sz &&
> + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
> aa.n_arrays) {
> + index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;

Does anywhere verify that primary_domain_index <= aa.array_sz?
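
For what it is worth, the index arithmetic in that hunk works out like this
(a toy example with made-up sizes, not values from any real device tree):

#include <stdio.h>

int main(void)
{
	int array_sz = 5;		/* cells per associativity array        */
	int primary_domain_index = 4;	/* 1-based position of the node id cell */
	int aa_index = 3;		/* which lookup array this LMB points at */

	/* same formula as in the patch: flatten (array, cell) to one index */
	int index = aa_index * array_sz + primary_domain_index - 1;

	printf("node id lives in flattened cell %d\n", index);	/* 18 */
	return 0;
}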

> + 

Re: [PATCH v5 5/6] powerpc/pseries: Add a helper for form1 cpu distance

2021-07-21 Thread David Gibson
On Mon, Jun 28, 2021 at 08:41:16PM +0530, Aneesh Kumar K.V wrote:
> This helper is only used with the dispatch trace log collection.
> A later patch will add Form2 affinity support and this change helps
> in keeping that simpler. Also add a comment explaining we don't expect
> the code to be called with FORM0
> 
> Reviewed-by: David Gibson 
> Signed-off-by: Aneesh Kumar K.V 

What makes it a "relative_distance" rather than just a "distance"?
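
A userspace sketch of the Form 1 notion under discussion: the value is
"relative" in that it only counts how many reference-point levels two CPUs
fail to share; it is not an absolute latency. The reference points and
associativity arrays below are made up:

#include <stdio.h>
#include <stdint.h>

/* 1-based positions into the associativity arrays (hypothetical) */
static const int ref_points[] = { 4, 2 };
#define NR_REF_POINTS 2

static int form1_relative_distance(const uint32_t *c1, const uint32_t *c2)
{
	int i, dist = 0;

	/* count leading reference-point levels at which the CPUs differ */
	for (i = 0; i < NR_REF_POINTS; i++) {
		int idx = ref_points[i];

		if (c1[idx] == c2[idx])
			break;
		dist++;
	}
	return dist;
}

int main(void)
{
	/* cell 0 would be the length field in the real property */
	uint32_t cpu0[] = { 0, 0, 10, 0, 100 };
	uint32_t cpu1[] = { 0, 0, 10, 0, 101 };	/* same upper domain, different node */

	printf("relative distance = %d\n",
	       form1_relative_distance(cpu0, cpu1));	/* 1 */
	return 0;
}

(__node_distance() then turns such a mismatch count into an actual node
distance by doubling per level, which is what the Form 1 documentation in
patch 6/6 describes.)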

> ---
>  arch/powerpc/include/asm/topology.h   |  4 ++--
>  arch/powerpc/mm/numa.c| 10 +-
>  arch/powerpc/platforms/pseries/lpar.c |  4 ++--
>  3 files changed, 13 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index e4db64c0e184..ac8b5ed79832 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -36,7 +36,7 @@ static inline int pcibus_to_node(struct pci_bus *bus)
>cpu_all_mask : \
>cpumask_of_node(pcibus_to_node(bus)))
>  
> -extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
> +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
>  extern int __node_distance(int, int);
>  #define node_distance(a, b) __node_distance(a, b)
>  
> @@ -83,7 +83,7 @@ static inline void sysfs_remove_device_from_node(struct 
> device *dev,
>  
>  static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) 
> {}
>  
> -static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 
> *cpu2_assoc)
>  {
>   return 0;
>  }
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 7b142f79d600..c6293037a103 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 
> *cpu2_assoc)
>  {
>   int dist = 0;
>  
> @@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>   return dist;
>  }
>  
> +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +{
> + /* We should not get called with FORM0 */
> + VM_WARN_ON(affinity_form == FORM0_AFFINITY);
> +
> + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
> +}
> +
>  /* must hold reference to node during call */
>  static const __be32 *of_get_associativity(struct device_node *dev)
>  {
> diff --git a/arch/powerpc/platforms/pseries/lpar.c 
> b/arch/powerpc/platforms/pseries/lpar.c
> index dab356e3ff87..afefbdfe768d 100644
> --- a/arch/powerpc/platforms/pseries/lpar.c
> +++ b/arch/powerpc/platforms/pseries/lpar.c
> @@ -261,7 +261,7 @@ static int cpu_relative_dispatch_distance(int 
> last_disp_cpu, int cur_disp_cpu)
>   if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
>   return -EIO;
>  
> - return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
> + return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
>  }
>  
>  static int cpu_home_node_dispatch_distance(int disp_cpu)
> @@ -281,7 +281,7 @@ static int cpu_home_node_dispatch_distance(int disp_cpu)
>   if (!disp_cpu_assoc || !vcpu_assoc)
>   return -EIO;
>  
> - return cpu_distance(disp_cpu_assoc, vcpu_assoc);
> + return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc);
>  }
>  
>  static void update_vcpu_disp_stat(int disp_cpu)

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH v5 6/6] powerpc/pseries: Add support for FORM2 associativity

2021-07-21 Thread David Gibson
On Mon, Jun 28, 2021 at 08:41:17PM +0530, Aneesh Kumar K.V wrote:
> PAPR interface currently supports two different ways of communicating resource
> grouping details to the OS. These are referred to as Form 0 and Form 1
> associativity grouping. Form 0 is the older format and is now considered
> deprecated. This patch adds another resource grouping named FORM2.
> 
> Signed-off-by: Daniel Henrique Barboza 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  Documentation/powerpc/associativity.rst   | 103 ++
>  arch/powerpc/include/asm/firmware.h   |   3 +-
>  arch/powerpc/include/asm/prom.h   |   1 +
>  arch/powerpc/kernel/prom_init.c   |   3 +-
>  arch/powerpc/mm/numa.c| 157 ++
>  arch/powerpc/platforms/pseries/firmware.c |   1 +
>  6 files changed, 242 insertions(+), 26 deletions(-)
>  create mode 100644 Documentation/powerpc/associativity.rst
> 
> diff --git a/Documentation/powerpc/associativity.rst 
> b/Documentation/powerpc/associativity.rst
> new file mode 100644
> index ..31cc7da2c7a6
> --- /dev/null
> +++ b/Documentation/powerpc/associativity.rst
> @@ -0,0 +1,103 @@
> +
> +NUMA resource associativity
> +===========================
> +
> +Associativity represents the groupings of the various platform resources into
> +domains of substantially similar mean performance relative to resources 
> outside
> +of that domain. Resources subsets of a given domain that exhibit better
> +performance relative to each other than relative to other resources subsets
> +are represented as being members of a sub-grouping domain. This performance
> +characteristic is presented in terms of NUMA node distance within the Linux 
> kernel.
> +From the platform view, these groups are also referred to as domains.

Pretty hard to decipher, but that's typical for PAPR.

> +PAPR interface currently supports different ways of communicating these 
> resource
> +grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
> +associativity grouping. Form 0 is the older format and is now considered 
> deprecated.

Nit: s/older/oldest/ since there are now >2 forms.

> +Hypervisor indicates the type/form of associativity used via 
> "ibm,architecture-vec-5 property".
> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of 
> Form 0 or Form 1.
> +A value of 1 indicates the usage of Form 1 associativity. For Form 2 
> associativity
> +bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
> +
> +Form 0
> +------
> +Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE).
> +
> +Form 1
> +------
> +With Form 1 a combination of ibm,associativity-reference-points, and 
> ibm,associativity
> +device tree properties are used to determine the NUMA distance between 
> resource groups/domains.
> +
> +The “ibm,associativity” property contains a list of one or more numbers 
> (domainID)
> +representing the resource’s platform grouping domains.
> +
> +The “ibm,associativity-reference-points” property contains a list of one or 
> more numbers
> +(domainID index) that represents the 1 based ordinal in the associativity 
> lists.
> +The list of domainID indexes represents an increasing hierarchy of resource 
> grouping.
> +
> +ex:
> +{ primary domainID index, secondary domainID index, tertiary domainID 
> index.. }
> +
> +Linux kernel uses the domainID at the primary domainID index as the NUMA 
> node id.
> +Linux kernel computes NUMA distance between two domains by recursively 
> comparing
> +if they belong to the same higher-level domains. For mismatch at every higher
> +level of the resource group, the kernel doubles the NUMA distance between the
> +comparing domains.
> +
> +Form 2
> +------
> +Form 2 associativity format adds separate device tree properties 
> representing NUMA node distance
> +thereby making the node distance computation flexible. Form 2 also allows 
> flexible primary
> +domain numbering. With numa distance computation now detached from the index 
> value in
> +"ibm,associativity-reference-points" property, Form 2 allows a large number 
> of primary domain
> +ids at the same domainID index representing resource groups of different 
> performance/latency
> +characteristics.
> +
> +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 
> in the
> +"ibm,architecture-vec-5" property.
> +
> +"ibm,numa-lookup-index-table" property contains a list of one or more 
> numbers representing
> +the domainIDs present in the system. The offset of the domainID in this 
> property is
> +used as an index while computing numa distance information via 
> "ibm,numa-distance-table".
> +
> +prop-encoded-array: The number N of the domainIDs encoded as with 
> encode-int, followed by
> +N domainID encoded as with encode-int
> +
> +For ex:
> +"ibm,numa-lookup-index-table" =  {4, 0, 8, 250, 252}. The offset of domainID 
> 8 (2) is used when
> +computing the distan
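
The index lookup described in that (truncated) paragraph can be sketched in
userspace as below; only the offset computation is shown, since the
distance-table side of the example is cut off. The function returns the
1-based position among the domainIDs, which matches the "(2)" quoted for
domainID 8:

#include <stdio.h>
#include <stdint.h>

/* ibm,numa-lookup-index-table from the example: N = 4, then the domainIDs */
static const uint32_t lookup_index_table[] = { 4, 0, 8, 250, 252 };

static int domain_id_offset(uint32_t domain_id)
{
	uint32_t n = lookup_index_table[0];
	int i;

	for (i = 1; i <= (int)n; i++) {
		if (lookup_index_table[i] == domain_id)
			return i;
	}
	return -1;	/* unknown domainID */
}

int main(void)
{
	printf("offset of domainID 8 = %d\n", domain_id_offset(8));	/* 2 */
	return 0;
}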

Re: [PATCH v5 2/6] powerpc/pseries: rename distance_ref_points_depth to max_associativity_domain_index

2021-07-21 Thread David Gibson
On Thu, Jul 22, 2021 at 10:59:09AM +1000, David Gibson wrote:
> On Mon, Jun 28, 2021 at 08:41:13PM +0530, Aneesh Kumar K.V wrote:
> > No functional change in this patch
> > 
> > Signed-off-by: Aneesh Kumar K.V 
> 
> Reviewed-by: David Gibson 

No... wait, I take that back.  This change makes the code *more*
confusing.

"distance_ref_points_depth" is accurate - it's the length of the
distance_ref_points array.

"max_associativity_domain_index" is not.  That implies it's the
maximum value that a domain index can have - which it isn't.  You
could have 15 entries in every associativity array, but if only 2 of
them are referenced in distance_ref_points, then
"max_associativity_domain_index" would only be 2.

> 
> > ---
> >  arch/powerpc/mm/numa.c | 20 ++--
> >  1 file changed, 10 insertions(+), 10 deletions(-)
> > 
> > diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> > index 8365b298ec48..132813dd1a6c 100644
> > --- a/arch/powerpc/mm/numa.c
> > +++ b/arch/powerpc/mm/numa.c
> > @@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
> >  static int form1_affinity;
> >  
> >  #define MAX_DISTANCE_REF_POINTS 4
> > -static int distance_ref_points_depth;
> > +static int max_associativity_domain_index;
> >  static const __be32 *distance_ref_points;
> >  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
> >  
> > @@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> >  
> > int i, index;
> >  
> > -   for (i = 0; i < distance_ref_points_depth; i++) {
> > +   for (i = 0; i < max_associativity_domain_index; i++) {
> > index = be32_to_cpu(distance_ref_points[i]);
> > if (cpu1_assoc[index] == cpu2_assoc[index])
> > break;
> > @@ -193,7 +193,7 @@ int __node_distance(int a, int b)
> > if (!form1_affinity)
> > return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
> >  
> > -   for (i = 0; i < distance_ref_points_depth; i++) {
> > +   for (i = 0; i < max_associativity_domain_index; i++) {
> > if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
> > break;
> >  
> > @@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
> > if (!form1_affinity)
> > return;
> >  
> > -   for (i = 0; i < distance_ref_points_depth; i++) {
> > +   for (i = 0; i < max_associativity_domain_index; i++) {
> > const __be32 *entry;
> >  
> > entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> > @@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 
> > *associativity)
> > nid = NUMA_NO_NODE;
> >  
> > if (nid > 0 &&
> > -   of_read_number(associativity, 1) >= distance_ref_points_depth) {
> > +   of_read_number(associativity, 1) >= 
> > max_associativity_domain_index) {
> > /*
> >  * Skip the length field and send start of associativity array
> >  */
> > @@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
> >  */
> > distance_ref_points = of_get_property(root,
> > "ibm,associativity-reference-points",
> > -   &distance_ref_points_depth);
> > +   &max_associativity_domain_index);
> >  
> > if (!distance_ref_points) {
> > dbg("NUMA: ibm,associativity-reference-points not found.\n");
> > goto err;
> > }
> >  
> > -   distance_ref_points_depth /= sizeof(int);
> > +   max_associativity_domain_index /= sizeof(int);
> >  
> > if (firmware_has_feature(FW_FEATURE_OPAL) ||
> > firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
> > @@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
> > if (form1_affinity) {
> > index = of_read_number(distance_ref_points, 1);
> > } else {
> > -   if (distance_ref_points_depth < 2) {
> > +   if (max_associativity_domain_index < 2) {
> > printk(KERN_WARNING "NUMA: "
> > "short ibm,associativity-reference-points\n");
> > goto err;
> > @@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
> >  * Warn and cap if the hardware supports more than
> >  * MAX_DISTANCE_REF_POINTS domains.
> >  */
> > -   if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
> > +   if (max_associativity_domain_index > MAX_DISTANCE_REF_POINTS) {
> > printk(KERN_WARNING "NUMA: distance array capped at "
> > "%d entries\n", MAX_DISTANCE_REF_POINTS);
> > -   distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
> > +   max_associativity_domain_index = MAX_DISTANCE_REF_POINTS;
> > }
> >  
> > of_node_put(root);
> 



-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank yo

[powerpc:fixes-test] BUILD SUCCESS bc4188a2f56e821ea057aca6bf444e138d06c252

2021-07-21 Thread kernel test robot
 randconfig-a002-20210719
i386 randconfig-a005-20210720
i386 randconfig-a003-20210720
i386 randconfig-a004-20210720
i386 randconfig-a002-20210720
i386 randconfig-a001-20210720
i386 randconfig-a006-20210720
i386 randconfig-a016-20210720
i386 randconfig-a013-20210720
i386 randconfig-a012-20210720
i386 randconfig-a014-20210720
i386 randconfig-a011-20210720
i386 randconfig-a015-20210720
i386 randconfig-a016-20210721
i386 randconfig-a013-20210721
i386 randconfig-a012-20210721
i386 randconfig-a014-20210721
i386 randconfig-a011-20210721
i386 randconfig-a015-20210721
riscvnommu_k210_defconfig
riscvallyesconfig
riscvnommu_virt_defconfig
riscv allnoconfig
riscv   defconfig
riscv  rv32_defconfig
riscvallmodconfig
x86_64rhel-8.3-kselftests
um   x86_64_defconfig
um i386_defconfig
x86_64   allyesconfig
x86_64  defconfig
x86_64   rhel-8.3
x86_64  rhel-8.3-kbuiltin
x86_64  kexec

clang tested configs:
x86_64   randconfig-b001-20210720
x86_64   randconfig-a011-20210720
x86_64   randconfig-a016-20210720
x86_64   randconfig-a013-20210720
x86_64   randconfig-a014-20210720
x86_64   randconfig-a012-20210720
x86_64   randconfig-a015-20210720
x86_64   randconfig-a003-20210721
x86_64   randconfig-a006-20210721
x86_64   randconfig-a001-20210721
x86_64   randconfig-a005-20210721
x86_64   randconfig-a004-20210721
x86_64   randconfig-a002-20210721

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


Re: [PATCH printk v4 4/6] printk: remove NMI tracking

2021-07-21 Thread Petr Mladek
On Wed 2021-07-21 14:52:15, John Ogness wrote:
> On 2021-07-21, Petr Mladek  wrote:
> >> --- a/kernel/trace/trace.c
> >> +++ b/kernel/trace/trace.c
> >> @@ -9647,7 +9647,7 @@ void ftrace_dump(enum ftrace_dump_mode 
> >> oops_dump_mode)
> >>tracing_off();
> >>  
> >>local_irq_save(flags);
> >> -  printk_nmi_direct_enter();
> >> +  printk_deferred_enter();
> >
> > I would prefer not to manipulate the printk context here anymore,
> > as it was done in v3.
> >
> > printk_nmi_direct_enter() was added here by commit
> > 03fc7f9c99c1e7ae2925d4 ("printk/nmi: Prevent deadlock when
> > accessing the main log buffer in NMI"). It was _not_ about console
> > handling. The reason was to modify the default behavior under NMI
> > and store the messages directly into the main log buffer.
> >
> > When I think about it. The original fix was not correct. We should have
> > modified the context only when ftrace_dump() was really called under NMI:
> >
> > if (in_nmi())
> > printk_nmi_direct_enter();
> >
> > In other words, we should try to show the messages on the console
> > when ftrace_dump()/panic() is not called from NMI. It will help
> > to see all messages even when the ftrace buffers are bigger
> > than printk() ones.
> >
> > And we do not need any special handling here for NMI. vprintk()
> > in printk/printk_safe.c will do the right thing for us.
> 
> Agreed. We need to mention this behavior change in the commit
> message. Perhaps this as the commit message:
>
> All NMI contexts are handled the same as the safe context: store the
> message and defer printing. There is no need to have special NMI
> context tracking for this. Using in_nmi() is enough.
> 
> There are several parts of the kernel that are manually calling into
> the printk NMI context tracking in order to cause general printk
> deferred printing:
> 
> arch/arm/kernel/smp.c
> arch/powerpc/kexec/crash.c
> kernel/trace/trace.c
> 
> For arm/kernel/smp.c and powerpc/kexec/crash.c, provide a new
> function pair printk_deferred_enter/exit that explicitly achieves the
> same objective.
> 
> For ftrace, remove general printk deferring. This general deferment
> was added in commit 03fc7f9c99c1 ("printk/nmi: Prevent deadlock when
> accessing the main log buffer in NMI"), but really should have only
> been deferred when in NMI context. Since vprintk() now checks for
> NMI context when deciding to defer, ftrace does not need any special
> handling.

I would make it less focused on the deferring part and try to explain
the original purpose here, something like:

"For ftrace, remove the printk context manipulation completely. It was
added in commit 03fc7f9c99c1 ("printk/nmi: Prevent deadlock when
accessing the main log buffer in NMI"). The purpose was to enforce
storing messages directly into the ring buffer even in NMI context.
It really should have only modified the behavior in NMI context.
There is no need for a special behavior any longer. All messages are
always stored directly now. The console deferring is handled
transparently in vprintk()."

But I do not insist on it. And feel free to make it shorter.

Best Regards,
Petr


Re: [PATCH printk v4 4/6] printk: remove NMI tracking

2021-07-21 Thread Petr Mladek
On Thu 2021-07-15 21:39:57, John Ogness wrote:
> All NMI contexts are handled the same as the safe context: store the
> message and defer printing. There is no need to have special NMI
> context tracking for this. Using in_nmi() is enough.
> 
> There are several parts of the kernel that are manually calling into
> the printk NMI context tracking in order to cause general printk
> deferred printing:
> 
> arch/arm/kernel/smp.c
> arch/powerpc/kexec/crash.c
> kernel/trace/trace.c
> 
> For these users, provide a new function pair
> printk_deferred_enter/exit that explicitly achieves the same
> objective.
> 
> Signed-off-by: John Ogness 
> ---
>  arch/arm/kernel/smp.c   |  4 ++--
>  arch/powerpc/kexec/crash.c  |  2 +-
>  include/linux/hardirq.h |  2 --
>  include/linux/printk.h  | 31 +++
>  init/Kconfig|  5 -
>  kernel/printk/internal.h|  8 
>  kernel/printk/printk_safe.c | 37 +
>  kernel/trace/trace.c|  4 ++--
>  8 files changed, 25 insertions(+), 68 deletions(-)
> 
> diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
> index c7bb168b0d97..842427ff2b3c 100644
> --- a/arch/arm/kernel/smp.c
> +++ b/arch/arm/kernel/smp.c
> @@ -667,9 +667,9 @@ static void do_handle_IPI(int ipinr)
>   break;
>  
>   case IPI_CPU_BACKTRACE:
> - printk_nmi_enter();
> + printk_deferred_enter();
>   nmi_cpu_backtrace(get_irq_regs());
> - printk_nmi_exit();
> + printk_deferred_exit();
>   break;
>  
>   default:
> diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
> index 0196d0c211ac..1070378c8e35 100644
> --- a/arch/powerpc/kexec/crash.c
> +++ b/arch/powerpc/kexec/crash.c
> @@ -313,7 +313,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
>   int (*old_handler)(struct pt_regs *regs);
>  
>   /* Avoid hardlocking with irresponsive CPU holding logbuf_lock */
> - printk_nmi_enter();
> + printk_deferred_enter();
>  
>   /*
>* This function is only called after the system
> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index 69bc86ea382c..76878b357ffa 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -116,7 +116,6 @@ extern void rcu_nmi_exit(void);
>   do {\
>   lockdep_off();  \
>   arch_nmi_enter();   \
> - printk_nmi_enter(); \
>   BUG_ON(in_nmi() == NMI_MASK);   \
>   __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);   \
>   } while (0)
> @@ -135,7 +134,6 @@ extern void rcu_nmi_exit(void);
>   do {\
>   BUG_ON(!in_nmi());  \
>   __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);   \
> - printk_nmi_exit();  \
>   arch_nmi_exit();\
>   lockdep_on();   \
>   } while (0)
> diff --git a/include/linux/printk.h b/include/linux/printk.h
> index 664612f75dac..03f7ccedaf18 100644
> --- a/include/linux/printk.h
> +++ b/include/linux/printk.h
> @@ -149,18 +149,6 @@ static inline __printf(1, 2) __cold
>  void early_printk(const char *s, ...) { }
>  #endif
>  
> -#ifdef CONFIG_PRINTK_NMI
> -extern void printk_nmi_enter(void);
> -extern void printk_nmi_exit(void);
> -extern void printk_nmi_direct_enter(void);
> -extern void printk_nmi_direct_exit(void);
> -#else
> -static inline void printk_nmi_enter(void) { }
> -static inline void printk_nmi_exit(void) { }
> -static inline void printk_nmi_direct_enter(void) { }
> -static inline void printk_nmi_direct_exit(void) { }
> -#endif /* PRINTK_NMI */
> -
>  struct dev_printk_info;
>  
>  #ifdef CONFIG_PRINTK
> @@ -180,6 +168,16 @@ int printk(const char *fmt, ...);
>   */
>  __printf(1, 2) __cold int printk_deferred(const char *fmt, ...);
>  
> +extern void __printk_safe_enter(void);
> +extern void __printk_safe_exit(void);
> +/*
> + * The printk_deferred_enter/exit macros are available only as a hack for
> + * some code paths that need to defer all printk console printing. Interrupts
> + * must be disabled for the deferred duration.
> + */
> +#define printk_deferred_enter __printk_safe_enter
> +#define printk_deferred_exit __printk_safe_exit
> +
>  /*
>   * Please don't use printk_ratelimit(), because it shares ratelimiting state
>   * with all other unrelated printk_ratelimit() callsites.  Instead use
> @@ -223,6 +221,15 @@ int printk_deferred(const char *s, ...)
>  {
>   return 0;
>  }
> +
> +static inline void printk_deferred_enter(void)
> +{
> +}
> +
> +static inline void printk_deferred_exit(void)
> +{
> +}
> +
> 
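
For context, roughly how a caller outside printk is expected to use the new
pair, mirroring the smp.c and crash.c hunks above (the surrounding function
here is hypothetical):

static void example_atomic_dump(void)
{
	unsigned long flags;

	/* interrupts must stay disabled for the whole deferred section */
	local_irq_save(flags);
	printk_deferred_enter();

	/*
	 * messages are stored in the ring buffer immediately; only the
	 * console flush is deferred to irq_work
	 */
	printk("example deferred output\n");

	printk_deferred_exit();
	local_irq_restore(flags);
}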

Re: [PATCH printk v4 3/6] printk: remove safe buffers

2021-07-21 Thread Petr Mladek
On Thu 2021-07-15 21:39:56, John Ogness wrote:
> With @logbuf_lock removed, the high level printk functions for
> storing messages are lockless. Messages can be stored from any
> context, so there is no need for the NMI and safe buffers anymore.
> Remove the NMI and safe buffers.
> 
> Although the safe buffers are removed, the NMI and safe context
> tracking is still in place. In these contexts, store the message
> immediately but still use irq_work to defer the console printing.
> 
> Since printk recursion tracking is in place, safe context tracking
> for most of printk is not needed. Remove it. Only safe context
> tracking relating to the console and console_owner locks is left
> in place. This is because the console and console_owner locks are
> needed for the actual printing.
> 
> Signed-off-by: John Ogness 

Reviewed-by: Petr Mladek 

Best Regards,
Petr


Re: [PATCH v5 10/11] powerpc/pseries/iommu: Make use of DDW for indirect mapping

2021-07-21 Thread Frederic Barrat




On 21/07/2021 05:32, Alexey Kardashevskiy wrote:

+    struct iommu_table *newtbl;
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) {
+    const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;
+
+    /* Look for MMIO32 */
+    if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM)
+    break;
+    }
+
+    if (i == ARRAY_SIZE(pci->phb->mem_resources))
+    goto out_del_list;



So we exit and do nothing if there's no MMIO32 bar?
Isn't the intent just to figure out the MMIO32 area to reserve it when 
init'ing the table? In which case we could default to 0,0


I'm actually not clear why we are reserving this area on pseries.




If we do not reserve it, then the iommu code will allocate DMA pages 
from there and these addresses are MMIO32 from the kernel pov at least. 
I saw crashes when (I think) a device tried DMAing to the top 2GB of the 
bus space which happened to be a some other device's BAR.



hmmm... then figuring out the correct range needs more work. We could 
have more than one MMIO32 bar. And they don't have to be adjacent. I 
don't see that we are reserving any range on the initial table though 
(on pseries).


  Fred
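
For readers skimming the quoted loop: the test it applies amounts to "the
resource is MMIO but not 64-bit MMIO". A standalone sketch (flag values
copied from include/linux/ioport.h for illustration):

#include <stdio.h>

#define IORESOURCE_MEM		0x00000200
#define IORESOURCE_MEM_64	0x00100000

static int is_mmio32(unsigned long flags)
{
	const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM;

	/* MEM set, MEM_64 clear: a 32-bit MMIO window */
	return (flags & mask) == IORESOURCE_MEM;
}

int main(void)
{
	printf("%d\n", is_mmio32(IORESOURCE_MEM));			/* 1 */
	printf("%d\n", is_mmio32(IORESOURCE_MEM | IORESOURCE_MEM_64));	/* 0 */
	return 0;
}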


Re: [PATCH printk v4 4/6] printk: remove NMI tracking

2021-07-21 Thread John Ogness
On 2021-07-21, Petr Mladek  wrote:
> On Wed 2021-07-21 14:52:15, John Ogness wrote:
>> On 2021-07-21, Petr Mladek  wrote:
>> >> --- a/kernel/trace/trace.c
>> >> +++ b/kernel/trace/trace.c
>> >> @@ -9647,7 +9647,7 @@ void ftrace_dump(enum ftrace_dump_mode 
>> >> oops_dump_mode)
>> >>   tracing_off();
>> >>  
>> >>   local_irq_save(flags);
>> >> - printk_nmi_direct_enter();
>> >> + printk_deferred_enter();
>> >
>> > I would prefer not to manipulate the printk context here anymore,
>> > as it was done in v3.
>> >
>> > printk_nmi_direct_enter() was added here by commit
>> > 03fc7f9c99c1e7ae2925d4 ("printk/nmi: Prevent deadlock when
>> > accessing the main log buffer in NMI"). It was _not_ about console
>> > handling. The reason was to modify the default behavior under NMI
>> > and store the messages directly into the main log buffer.
>> >
>> > When I think about it. The original fix was not correct. We should have
>> > modified the context only when ftrace_dump() was really called under NMI:
>> >
>> >if (in_nmi())
>> >printk_nmi_direct_enter();
>> >
>> > In other words, we should try to show the messages on the console
>> > when ftrace_dump()/panic() is not called from NMI. It will help
>> > to see all messages even when the ftrace buffers are bigger
>> > than printk() ones.
>> >
>> > And we do not need any special handling here for NMI. vprintk()
>> > in printk/printk_safe.c will do the right thing for us.
>> 
>> Agreed. We need to mention this behavior change in the commit
>> message. Perhaps this as the commit message:
>>
>> All NMI contexts are handled the same as the safe context: store the
>> message and defer printing. There is no need to have special NMI
>> context tracking for this. Using in_nmi() is enough.
>> 
>> There are several parts of the kernel that are manually calling into
>> the printk NMI context tracking in order to cause general printk
>> deferred printing:
>> 
>> arch/arm/kernel/smp.c
>> arch/powerpc/kexec/crash.c
>> kernel/trace/trace.c
>> 
>> For arm/kernel/smp.c and powerpc/kexec/crash.c, provide a new
>> function pair printk_deferred_enter/exit that explicitly achieves the
>> same objective.
>> 
>> For ftrace, remove general printk deferring. This general deferment
>> was added in commit 03fc7f9c99c1 ("printk/nmi: Prevent deadlock when
>> accessing the main log buffer in NMI"), but really should have only
>> been deferred when in NMI context. Since vprintk() now checks for
>> NMI context when deciding to defer, ftrace does not need any special
>> handling.
>
> I would make it less focused on the deferring part and try to explain
> the original purpose here, something like:
>
> "For ftrace, remove the printk context manipulation completely. It was
> added in commit 03fc7f9c99c1 ("printk/nmi: Prevent deadlock when
> accessing the main log buffer in NMI"). The purpose was to enforce
> storing messages directly into the ring buffer even in NMI context.
> It really should have only modified the behavior in NMI context.
> There is no need for a special behavior any longer. All messages are
> always stored directly now. The console deferring is handled
> transparently in vprintk()."

Your wording is OK for me.

John Ogness


Re: [PATCH v4 0/5] bus: Make remove callback return void

2021-07-21 Thread Greg Kroah-Hartman
On Tue, Jul 13, 2021 at 09:35:17PM +0200, Uwe Kleine-König wrote:
> Hello,
> 
> this is v4 of the final patch set for my effort to make struct
> bus_type::remove return void.
> 
> The first four patches contain cleanups that make some of these
> callbacks (more obviously) always return 0. They are acked by the
> respective maintainers. Bjorn Helgaas explicitly asked to include the
> pci patch (#1) into this series, so Greg taking this is fine. I assume
> the s390 people are fine with Greg taking patches #2 to #4, too, they
> didn't explicitly said so though.
> 
> The last patch actually changes the prototype and so touches quite some
> drivers and has the potential to conflict with future developments, so I
> consider it beneficial to put these patches into next soon. I expect
> that it will be Greg who takes the complete series, he already confirmed
> via irc (for v2) to look into this series.
> 
> The only change compared to v3 is in the fourth patch where I modified a
> few more drivers to fix build failures. Some of them were found by build
> bots (thanks!), some of them I found myself using a regular expression
> search. The newly modified files are:
> 
>  arch/sparc/kernel/vio.c
>  drivers/nubus/bus.c
>  drivers/sh/superhyway/superhyway.c
>  drivers/vlynq/vlynq.c
>  drivers/zorro/zorro-driver.c
>  sound/ac97/bus.c
> 
> Best regards
> Uwe

Now queued up.  I can go make a git tag that people can pull from after
0-day is finished testing this to verify all is good, if others need it.

thanks,

greg k-h
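
For readers who only want the shape of the change the series makes, a sketch
of the bus_type callback before and after (not the exact hunk from the patch):

/* before the series */
	int (*remove)(struct device *dev);

/* after the final patch */
	void (*remove)(struct device *dev);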


Re: [PATCH printk v4 4/6] printk: remove NMI tracking

2021-07-21 Thread John Ogness
On 2021-07-21, Petr Mladek  wrote:
>> --- a/kernel/trace/trace.c
>> +++ b/kernel/trace/trace.c
>> @@ -9647,7 +9647,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
>>  tracing_off();
>>  
>>  local_irq_save(flags);
>> -printk_nmi_direct_enter();
>> +printk_deferred_enter();
>
> I would prefer not to manipulate the printk context here anymore,
> as it was done in v3.
>
> printk_nmi_direct_enter() was added here by commit
> 03fc7f9c99c1e7ae2925d4 ("printk/nmi: Prevent deadlock when
> accessing the main log buffer in NMI"). It was _not_ about console
> handling. The reason was to modify the default behavior under NMI
> and store the messages directly into the main log buffer.
>
> When I think about it. The original fix was not correct. We should have
> modified the context only when ftrace_dump() was really called under NMI:
>
>   if (in_nmi())
>   printk_nmi_direct_enter();
>
> In other words, we should try to show the messages on the console
> when ftrace_dump()/panic() is not called from NMI. It will help
> to see all messages even when the ftrace buffers are bigger
> than printk() ones.
>
> And we do not need any special handling here for NMI. vprintk()
> in printk/printk_safe.c will do the right thing for us.

Agreed. We need to mention this behavior change in the commit
message. Perhaps this as the commit message:

All NMI contexts are handled the same as the safe context: store the
message and defer printing. There is no need to have special NMI
context tracking for this. Using in_nmi() is enough.

There are several parts of the kernel that are manually calling into
the printk NMI context tracking in order to cause general printk
deferred printing:

arch/arm/kernel/smp.c
arch/powerpc/kexec/crash.c
kernel/trace/trace.c

For arm/kernel/smp.c and powerpc/kexec/crash.c, provide a new
function pair printk_deferred_enter/exit that explicitly achieves the
same objective.

For ftrace, remove general printk deferring. This general deferment
was added in commit 03fc7f9c99c1 ("printk/nmi: Prevent deadlock when
accessing the main log buffer in NMI"), but really should have only
been deferred when in NMI context. Since vprintk() now checks for
NMI context when deciding to defer, ftrace does not need any special
handling.

Signed-off-by: John Ogness 


Re: [PATCH v2] Revert "mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge"

2021-07-21 Thread Will Deacon
On Wed, 21 Jul 2021 17:02:13 +1000, Michael Ellerman wrote:
> This reverts commit c742199a014de23ee92055c2473d91fe5561ffdf.
> 
> c742199a014d ("mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge")
> breaks arm64 in at least two ways for configurations where PUD or PMD
> folding occur:
> 
>   1. We no longer install huge-vmap mappings and silently fall back to
>  page-granular entries, despite being able to install block entries
>  at what is effectively the PGD level.
> 
> [...]

Thank you Michael! I owe you a beer next time I see you, if we don't go
extinct before then.

Applied to arm64 (for-next/fixes), thanks!

[1/1] Revert "mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge"
  https://git.kernel.org/arm64/c/d8a719059b9d

Cheers,
-- 
Will

https://fixes.arm64.dev
https://next.arm64.dev
https://will.arm64.dev


Re: [RFC v2 2/2] drm/amd/display: Use PPC FPU functions

2021-07-21 Thread Christian König

Am 21.07.21 um 08:51 schrieb Christoph Hellwig:

On Wed, Jul 21, 2021 at 08:29:43AM +0200, Christian König wrote:

Looks good in general, but question is what about other architectures like
ARM?

DRM_AMD_DC_DCN currently requires X86 || PPC64.


And exactly that's the problem I'm noting here. At least officially AMD 
claims that we support ARM and some very brave souls still use the hardware 
together with MIPS as well.



Maybe a good thing would be to add a new KERNEL_FPU_API Kconfig symbol,
selected by x86 and powerpc (I think ppc32 should be fine too now) so
that we get these arch dependencies out of the driver.


Good idea.

Christian.


Re: [PATCH 1/2] KVM: PPC: Book3S: Fix CONFIG_TRANSACTIONAL_MEM=n crash

2021-07-21 Thread Michael Ellerman
On Fri, 16 Jul 2021 12:43:09 +1000, Nicholas Piggin wrote:
> When running CPU_FTR_P9_TM_HV_ASSIST, HFSCR[TM] is set for the guest
> even if the host has CONFIG_TRANSACTIONAL_MEM=n, which causes it to be
> unprepared to handle guest exits while transactional.
> 
> Normal guests don't have a problem because the HTM capability will not
> be advertised, but a rogue or buggy one could crash the host.

Applied to powerpc/fixes.

[1/2] KVM: PPC: Book3S: Fix CONFIG_TRANSACTIONAL_MEM=n crash
  https://git.kernel.org/powerpc/c/bd31ecf44b8e18ccb1e5f6b50f85de6922a60de3
[2/2] KVM: PPC: Fix kvm_arch_vcpu_ioctl vcpu_load leak
  https://git.kernel.org/powerpc/c/bc4188a2f56e821ea057aca6bf444e138d06c252

cheers


[PATCH v2] Revert "mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge"

2021-07-21 Thread Michael Ellerman
From: Jonathan Marek 

This reverts commit c742199a014de23ee92055c2473d91fe5561ffdf.

c742199a014d ("mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge")
breaks arm64 in at least two ways for configurations where PUD or PMD
folding occur:

  1. We no longer install huge-vmap mappings and silently fall back to
 page-granular entries, despite being able to install block entries
 at what is effectively the PGD level.

  2. If the linear map is backed with block mappings, these will now
 silently fail to be created in alloc_init_pud(), causing a panic
 early during boot.

The pgtable selftests caught this, although a fix has not been
forthcoming and Christophe is AWOL at the moment, so just revert the
change for now to get a working -rc3 on which we can queue patches for
5.15.

A simple revert breaks the build for 32-bit PowerPC 8xx machines, which
rely on the default function definitions when the corresponding
page-table levels are folded, since commit a6a8f7c4aa7e ("powerpc/8xx:
add support for huge pages on VMAP and VMALLOC"), eg:

  powerpc64-linux-ld: mm/vmalloc.o: in function `vunmap_pud_range':
  linux/mm/vmalloc.c:362: undefined reference to `pud_clear_huge'

To avoid that, add stubs for pud_clear_huge() and pmd_clear_huge() in
arch/powerpc/mm/nohash/8xx.c as suggested by Christophe.

Cc: Christophe Leroy 
Cc: Catalin Marinas 
Cc: Andrew Morton 
Cc: Nicholas Piggin 
Cc: Mike Rapoport 
Cc: Mark Rutland 
Cc: Geert Uytterhoeven 
Fixes: c742199a014d ("mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge")
Signed-off-by: Jonathan Marek 
Signed-off-by: Will Deacon 
Reviewed-by: Ard Biesheuvel 
Acked-by: Marc Zyngier 
[mpe: Fold in 8xx.c changes from Christophe and mention in change log]
Signed-off-by: Michael Ellerman 
Link: 
https://lore.kernel.org/linux-arm-kernel/camuhmdxshordox-xxaeufdw3wx2peggfsqhvshvznkcgk-y...@mail.gmail.com/
Link: https://lore.kernel.org/r/20210717160118.9855-1-jonat...@marek.ca
---
 arch/arm64/mm/mmu.c  | 20 
 arch/powerpc/mm/nohash/8xx.c | 10 ++
 arch/x86/mm/pgtable.c| 34 +++---
 include/linux/pgtable.h  | 26 +-
 4 files changed, 34 insertions(+), 56 deletions(-)

v2: Fold in suggestion from Christophe to add stubs for 8xx.

I kept the reviewed-by/acked-by tags from arm64 folks as the patch is
unchanged as far as arm64 is concerned.

Please take this via the arm64 tree.

cheers


diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index d74586508448..9ff0de1b2b93 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1339,7 +1339,6 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int 
*size, pgprot_t prot)
return dt_virt;
 }
 
-#if CONFIG_PGTABLE_LEVELS > 3
 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 {
pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
@@ -1354,16 +1353,6 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t 
prot)
return 1;
 }
 
-int pud_clear_huge(pud_t *pudp)
-{
-   if (!pud_sect(READ_ONCE(*pudp)))
-   return 0;
-   pud_clear(pudp);
-   return 1;
-}
-#endif
-
-#if CONFIG_PGTABLE_LEVELS > 2
 int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 {
pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));
@@ -1378,6 +1367,14 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t 
prot)
return 1;
 }
 
+int pud_clear_huge(pud_t *pudp)
+{
+   if (!pud_sect(READ_ONCE(*pudp)))
+   return 0;
+   pud_clear(pudp);
+   return 1;
+}
+
 int pmd_clear_huge(pmd_t *pmdp)
 {
if (!pmd_sect(READ_ONCE(*pmdp)))
@@ -1385,7 +1382,6 @@ int pmd_clear_huge(pmd_t *pmdp)
pmd_clear(pmdp);
return 1;
 }
-#endif
 
 int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
 {
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 60780e089118..0df9fe29dd56 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -240,3 +240,13 @@ void __init setup_kuap(bool disabled)
mtspr(SPRN_MD_AP, MD_APG_KUAP);
 }
 #endif
+
+int pud_clear_huge(pud_t *pud)
+{
+	return 0;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+	return 0;
+}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3364fe62b903..3481b35cb4ec 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -682,7 +682,6 @@ int p4d_clear_huge(p4d_t *p4d)
 }
 #endif
 
-#if CONFIG_PGTABLE_LEVELS > 3
 /**
  * pud_set_huge - setup kernel PUD mapping
  *
@@ -721,23 +720,6 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t 
prot)
return 1;
 }
 
-/**
- * pud_clear_huge - clear kernel PUD mapping when it is set
- *
- * Returns 1 on success and 0 on failure (no PUD map is found).
- */
-int pud_clear_huge(pud_t *pud)
-{
-   if (pud_large(*pud)) {
-   pud_clear(pud);
-   return 1;
-   }
-
-   return 0;
-}
-#endif
-
-#i