[PATCH v7 6/6] powerpc/pseries: Consolidate form1 distance initialization into a helper

2021-08-08 Thread Aneesh Kumar K.V
Currently, we duplicate parsing code for ibm,associativity and
ibm,associativity-lookup-arrays in the kernel. The associativity arrays provided
by these device tree properties are very similar and hence can share
a helper to parse the node id and NUMA distance details.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 104 +++--
 1 file changed, 58 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index fffb3c40f595..e6d47fcba335 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -171,26 +171,36 @@ static void unmap_cpu_from_node(unsigned long cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
-/*
- * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
- * info is found.
- */
-static int associativity_to_nid(const __be32 *associativity)
+static int __associativity_to_nid(const __be32 *associativity,
+ int max_array_sz)
 {
-   int nid = NUMA_NO_NODE;
+   int nid;
+   /*
+* primary_domain_index is 1 based array index.
+*/
+   int index = primary_domain_index  - 1;
 
-   if (!numa_enabled)
-   goto out;
+   if (!numa_enabled || index >= max_array_sz)
+   return NUMA_NO_NODE;
 
-   if (of_read_number(associativity, 1) >= primary_domain_index)
-   nid = of_read_number([primary_domain_index], 1);
+   nid = of_read_number([index], 1);
 
/* POWER4 LPAR uses 0x as invalid node */
if (nid == 0x || nid >= nr_node_ids)
nid = NUMA_NO_NODE;
-out:
return nid;
 }
+/*
+ * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
+ * info is found.
+ */
+static int associativity_to_nid(const __be32 *associativity)
+{
+   int array_sz = of_read_number(associativity, 1);
+
+   /* Skip the first element in the associativity array */
+   return __associativity_to_nid((associativity + 1), array_sz);
+}
 
 static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 
*cpu2_assoc)
 {
@@ -295,33 +305,39 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
-static void __initialize_form1_numa_distance(const __be32 *associativity)
+static void __initialize_form1_numa_distance(const __be32 *associativity,
+int max_array_sz)
 {
int i, nid;
 
if (affinity_form != FORM1_AFFINITY)
return;
 
-   nid = associativity_to_nid(associativity);
+   nid = __associativity_to_nid(associativity, max_array_sz);
if (nid != NUMA_NO_NODE) {
for (i = 0; i < distance_ref_points_depth; i++) {
const __be32 *entry;
+   int index = be32_to_cpu(distance_ref_points[i]) - 1;
+
+   /*
+* broken hierarchy, return with broken distance table
+*/
+   if (WARN(index >= max_array_sz, "Broken 
ibm,associativity property"))
+   return;
 
-   entry = 
[be32_to_cpu(distance_ref_points[i])];
+   entry = [index];
distance_lookup_table[nid][i] = of_read_number(entry, 
1);
}
}
 }
 
-static void initialize_form1_numa_distance(struct device_node *node)
+static void initialize_form1_numa_distance(const __be32 *associativity)
 {
-   const __be32 *associativity;
-
-   associativity = of_get_associativity(node);
-   if (!associativity)
-   return;
+   int array_sz;
 
-   __initialize_form1_numa_distance(associativity);
+   array_sz = of_read_number(associativity, 1);
+   /* Skip the first element in the associativity array */
+   __initialize_form1_numa_distance(associativity + 1, array_sz);
 }
 
 /*
@@ -334,7 +350,13 @@ void update_numa_distance(struct device_node *node)
if (affinity_form == FORM0_AFFINITY)
return;
else if (affinity_form == FORM1_AFFINITY) {
-   initialize_form1_numa_distance(node);
+   const __be32 *associativity;
+
+   associativity = of_get_associativity(node);
+   if (!associativity)
+   return;
+
+   initialize_form1_numa_distance(associativity);
return;
}
 
@@ -586,27 +608,17 @@ static int get_nid_and_numa_distance(struct drmem_lmb 
*lmb)
 
if (primary_domain_index <= aa.array_sz &&
!(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
-   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
-   nid = of_read_number([index], 1);
+   const __be32 *associativity;
 
-   if (nid == 0x || nid >= nr_node_ids)
-   nid = default_nid;
+   index = 

[PATCH v7 5/6] powerpc/pseries: Add support for FORM2 associativity

2021-08-08 Thread Aneesh Kumar K.V
PAPR interface currently supports two different ways of communicating resource
grouping details to the OS. These are referred to as Form 0 and Form 1
associativity grouping. Form 0 is the older format and is now considered
deprecated. This patch adds another resource grouping named FORM2.

Signed-off-by: Daniel Henrique Barboza 
Signed-off-by: Aneesh Kumar K.V 
---
 Documentation/powerpc/associativity.rst   | 103 +
 arch/powerpc/include/asm/firmware.h   |   3 +-
 arch/powerpc/include/asm/prom.h   |   1 +
 arch/powerpc/kernel/prom_init.c   |   3 +-
 arch/powerpc/mm/numa.c| 168 ++
 arch/powerpc/platforms/pseries/firmware.c |   1 +
 6 files changed, 252 insertions(+), 27 deletions(-)
 create mode 100644 Documentation/powerpc/associativity.rst

diff --git a/Documentation/powerpc/associativity.rst 
b/Documentation/powerpc/associativity.rst
new file mode 100644
index ..b6c89706ca03
--- /dev/null
+++ b/Documentation/powerpc/associativity.rst
@@ -0,0 +1,103 @@
+
+NUMA resource associativity
+=
+
+Associativity represents the groupings of the various platform resources into
+domains of substantially similar mean performance relative to resources outside
+of that domain. Resource subsets of a given domain that exhibit better
+performance relative to each other than relative to other resource subsets
+are represented as being members of a sub-grouping domain. This performance
+characteristic is presented in terms of NUMA node distance within the Linux 
kernel.
+From the platform view, these groups are also referred to as domains.
+
+PAPR interface currently supports different ways of communicating these 
resource
+grouping details to the OS. These are referred to as Form 0, Form 1 and Form 2
+associativity grouping. Form 0 is the oldest format and is now considered 
deprecated.
+
+Hypervisor indicates the type/form of associativity used via 
the "ibm,architecture-vec-5" property.
+Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of 
Form 0 or Form 1.
+A value of 1 indicates the usage of Form 1 associativity. For Form 2 
associativity
+bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.
+
+Form 0
+-
+Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE).
+
+Form 1
+-
+With Form 1 a combination of ibm,associativity-reference-points, and 
ibm,associativity
+device tree properties are used to determine the NUMA distance between 
resource groups/domains.
+
+The “ibm,associativity” property contains a list of one or more numbers 
(domainID)
+representing the resource’s platform grouping domains.
+
+The “ibm,associativity-reference-points” property contains a list of one or 
more numbers
+(domainID index) that represents the 1 based ordinal in the associativity 
lists.
+The list of domainID indexes represents an increasing hierarchy of resource 
grouping.
+
+ex:
+{ primary domainID index, secondary domainID index, tertiary domainID index.. }
+
+Linux kernel uses the domainID at the primary domainID index as the NUMA node 
id.
+Linux kernel computes NUMA distance between two domains by recursively 
comparing
+if they belong to the same higher-level domains. For mismatch at every higher
+level of the resource group, the kernel doubles the NUMA distance between the
+comparing domains.
+
+Form 2
+---
+Form 2 associativity format adds separate device tree properties representing 
NUMA node distance
+thereby making the node distance computation flexible. Form 2 also allows 
flexible primary
+domain numbering. With numa distance computation now detached from the index 
value in
+"ibm,associativity-reference-points" property, Form 2 allows a large number of 
primary domain
+ids at the same domainID index representing resource groups of different 
performance/latency
+characteristics.
+
+Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in 
the
+"ibm,architecture-vec-5" property.
+
+"ibm,numa-lookup-index-table" property contains a list of one or more numbers 
representing
+the domainIDs present in the system. The offset of the domainID in this 
property is
+used as an index while computing numa distance information via 
"ibm,numa-distance-table".
+
+prop-encoded-array: The number N of the domainIDs encoded as with encode-int, 
followed by
+N domainID encoded as with encode-int
+
+For ex:
+"ibm,numa-lookup-index-table" =  {4, 0, 8, 250, 252}. The offset of domainID 8 
(2) is used when
+computing the distance of domain 8 from other domains present in the system. 
For the rest of
+this document, this offset will be referred to as domain distance offset.
+
+"ibm,numa-distance-table" property contains a list of one or more numbers 
representing the NUMA
+distance between resource groups/domains present in the system.
+
+prop-encoded-array: The number N of the distance values encoded as with 
encode-int, 

[PATCH v7 4/6] powerpc/pseries: Add a helper for form1 cpu distance

2021-08-08 Thread Aneesh Kumar K.V
This helper is only used with the dispatch trace log collection.
A later patch will add Form2 affinity support and this change helps
in keeping that simpler. Also add a comment explaining we don't expect
the code to be called with FORM0.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/topology.h   |  4 ++--
 arch/powerpc/mm/numa.c| 10 +-
 arch/powerpc/platforms/pseries/lpar.c |  4 ++--
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index a6425a70c37b..a4712ecad3e9 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -36,7 +36,7 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 cpu_all_mask : \
 cpumask_of_node(pcibus_to_node(bus)))
 
-extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
+int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 
@@ -84,7 +84,7 @@ static inline void sysfs_remove_device_from_node(struct 
device *dev,
 
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}
 
-static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
 {
return 0;
 }
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index c695faf67d68..a244398a7766 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 
-int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 
*cpu2_assoc)
 {
int dist = 0;
 
@@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
return dist;
 }
 
+int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+   /* We should not get called with FORM0 */
+   VM_WARN_ON(affinity_form == FORM0_AFFINITY);
+
+   return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
+}
+
 /* must hold reference to node during call */
 static const __be32 *of_get_associativity(struct device_node *dev)
 {
diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index dab356e3ff87..afefbdfe768d 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -261,7 +261,7 @@ static int cpu_relative_dispatch_distance(int 
last_disp_cpu, int cur_disp_cpu)
if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc)
return -EIO;
 
-   return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
+   return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc);
 }
 
 static int cpu_home_node_dispatch_distance(int disp_cpu)
@@ -281,7 +281,7 @@ static int cpu_home_node_dispatch_distance(int disp_cpu)
if (!disp_cpu_assoc || !vcpu_assoc)
return -EIO;
 
-   return cpu_distance(disp_cpu_assoc, vcpu_assoc);
+   return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc);
 }
 
 static void update_vcpu_disp_stat(int disp_cpu)
-- 
2.31.1



[PATCH v7 3/6] powerpc/pseries: Consolidate different NUMA distance update code paths

2021-08-08 Thread Aneesh Kumar K.V
The associativity details of the newly added resources are collected from
the hypervisor via "ibm,configure-connector" rtas call. Update the numa
distance details of the newly added numa node after the above call.

Instead of updating NUMA distance every time we lookup a node id
from the associativity property, add helpers that can be used
during boot which do this only once. Also remove the distance
update from node id lookup helpers.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/topology.h   |   2 +
 arch/powerpc/mm/numa.c| 178 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
 .../platforms/pseries/hotplug-memory.c|   2 +
 4 files changed, 138 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index e4db64c0e184..a6425a70c37b 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -64,6 +64,7 @@ static inline int early_cpu_to_node(int cpu)
 }
 
 int of_drconf_to_nid_single(struct drmem_lmb *lmb);
+void update_numa_distance(struct device_node *node);
 
 #else
 
@@ -93,6 +94,7 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb 
*lmb)
return first_online_node;
 }
 
+static inline void update_numa_distance(struct device_node *node) {}
 #endif /* CONFIG_NUMA */
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 368719b14dcc..c695faf67d68 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -208,22 +208,6 @@ int __node_distance(int a, int b)
 }
 EXPORT_SYMBOL(__node_distance);
 
-static void initialize_distance_lookup_table(int nid,
-   const __be32 *associativity)
-{
-   int i;
-
-   if (affinity_form != FORM1_AFFINITY)
-   return;
-
-   for (i = 0; i < distance_ref_points_depth; i++) {
-   const __be32 *entry;
-
-   entry = [be32_to_cpu(distance_ref_points[i]) - 1];
-   distance_lookup_table[nid][i] = of_read_number(entry, 1);
-   }
-}
-
 /*
  * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
  * info is found.
@@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 
*associativity)
/* POWER4 LPAR uses 0x as invalid node */
if (nid == 0x || nid >= nr_node_ids)
nid = NUMA_NO_NODE;
-
-   if (nid > 0 &&
-   of_read_number(associativity, 1) >= distance_ref_points_depth) {
-   /*
-* Skip the length field and send start of associativity array
-*/
-   initialize_distance_lookup_table(nid, associativity + 1);
-   }
-
 out:
return nid;
 }
@@ -287,6 +262,48 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
+static void __initialize_form1_numa_distance(const __be32 *associativity)
+{
+   int i, nid;
+
+   if (affinity_form != FORM1_AFFINITY)
+   return;
+
+   nid = associativity_to_nid(associativity);
+   if (nid != NUMA_NO_NODE) {
+   for (i = 0; i < distance_ref_points_depth; i++) {
+   const __be32 *entry;
+
+   entry = 
[be32_to_cpu(distance_ref_points[i])];
+   distance_lookup_table[nid][i] = of_read_number(entry, 
1);
+   }
+   }
+}
+
+static void initialize_form1_numa_distance(struct device_node *node)
+{
+   const __be32 *associativity;
+
+   associativity = of_get_associativity(node);
+   if (!associativity)
+   return;
+
+   __initialize_form1_numa_distance(associativity);
+}
+
+/*
+ * Used to update distance information w.r.t newly added node.
+ */
+void update_numa_distance(struct device_node *node)
+{
+   if (affinity_form == FORM0_AFFINITY)
+   return;
+   else if (affinity_form == FORM1_AFFINITY) {
+   initialize_form1_numa_distance(node);
+   return;
+   }
+}
+
 static int __init find_primary_domain_index(void)
 {
int index;
@@ -433,6 +450,48 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
return 0;
 }
 
+static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
+{
+   struct assoc_arrays aa = { .arrays = NULL };
+   int default_nid = NUMA_NO_NODE;
+   int nid = default_nid;
+   int rc, index;
+
+   if ((primary_domain_index < 0) || !numa_enabled)
+   return default_nid;
+
+   rc = of_get_assoc_arrays();
+   if (rc)
+   return default_nid;
+
+   if (primary_domain_index <= aa.array_sz &&
+   !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
+   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
+   nid = of_read_number([index], 1);
+
+   if (nid == 0x || nid >= nr_node_ids)
+   

[PATCH v7 2/6] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY

2021-08-08 Thread Aneesh Kumar K.V
Also make related code cleanups that will allow adding FORM2_AFFINITY in
later patches. No functional change in this patch.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/firmware.h   |  4 +--
 arch/powerpc/include/asm/prom.h   |  2 +-
 arch/powerpc/kernel/prom_init.c   |  2 +-
 arch/powerpc/mm/numa.c| 35 ++-
 arch/powerpc/platforms/pseries/firmware.c |  2 +-
 5 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h 
b/arch/powerpc/include/asm/firmware.h
index 7604673787d6..60b631161360 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -44,7 +44,7 @@
 #define FW_FEATURE_OPALASM_CONST(0x1000)
 #define FW_FEATURE_SET_MODEASM_CONST(0x4000)
 #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x8000)
-#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0001)
+#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0001)
 #define FW_FEATURE_PRRNASM_CONST(0x0002)
 #define FW_FEATURE_DRMEM_V2ASM_CONST(0x0004)
 #define FW_FEATURE_DRC_INFOASM_CONST(0x0008)
@@ -69,7 +69,7 @@ enum {
FW_FEATURE_SPLPAR | FW_FEATURE_LPAR |
FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
-   FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
+   FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN |
FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 324a13351749..df9fec9d232c 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop,
 #define OV5_MSI0x0201  /* PCIe/MSI support */
 #define OV5_CMO0x0480  /* Cooperative Memory 
Overcommitment */
 #define OV5_XCMO   0x0440  /* Page Coalescing */
-#define OV5_TYPE1_AFFINITY 0x0580  /* Type 1 NUMA affinity */
+#define OV5_FORM1_AFFINITY 0x0580  /* FORM1 NUMA affinity */
 #define OV5_PRRN   0x0540  /* Platform Resource Reassignment */
 #define OV5_HP_EVT 0x0604  /* Hot Plug Event support */
 #define OV5_RESIZE_HPT 0x0601  /* Hash Page Table resizing */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index a5bf355ce1d6..57db605ad33a 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1096,7 +1096,7 @@ static const struct ibm_arch_vec 
ibm_architecture_vec_template __initconst = {
 #else
0,
 #endif
-   .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | 
OV5_FEAT(OV5_PRRN),
+   .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | 
OV5_FEAT(OV5_PRRN),
.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
.micro_checkpoint = 0,
.reserved0 = 0,
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 8365b298ec48..368719b14dcc 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data);
 
 static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
-static int form1_affinity;
+
+#define FORM0_AFFINITY 0
+#define FORM1_AFFINITY 1
+static int affinity_form;
 
 #define MAX_DISTANCE_REF_POINTS 4
 static int distance_ref_points_depth;
@@ -190,7 +193,7 @@ int __node_distance(int a, int b)
int i;
int distance = LOCAL_DISTANCE;
 
-   if (!form1_affinity)
+   if (affinity_form == FORM0_AFFINITY)
return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 
for (i = 0; i < distance_ref_points_depth; i++) {
@@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
 {
int i;
 
-   if (!form1_affinity)
+   if (affinity_form != FORM1_AFFINITY)
return;
 
for (i = 0; i < distance_ref_points_depth; i++) {
@@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void)
int index;
struct device_node *root;
 
+   /*
+* Check for which form of affinity.
+*/
+   if (firmware_has_feature(FW_FEATURE_OPAL)) {
+   affinity_form = FORM1_AFFINITY;
+   } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
+   dbg("Using form 1 affinity\n");
+   affinity_form = FORM1_AFFINITY;
+   } else
+   affinity_form = FORM0_AFFINITY;
+
if (firmware_has_feature(FW_FEATURE_OPAL))
root = of_find_node_by_path("/ibm,opal");
else
@@ -318,23 +332,16 @@ static 

[PATCH v7 1/6] powerpc/pseries: rename min_common_depth to primary_domain_index

2021-08-08 Thread Aneesh Kumar K.V
No functional change in this patch.

Reviewed-by: David Gibson 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 38 +++---
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index f2bf98bdcea2..8365b298ec48 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
 EXPORT_SYMBOL(node_to_cpumask_map);
 EXPORT_SYMBOL(node_data);
 
-static int min_common_depth;
+static int primary_domain_index;
 static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
@@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity)
if (!numa_enabled)
goto out;
 
-   if (of_read_number(associativity, 1) >= min_common_depth)
-   nid = of_read_number([min_common_depth], 1);
+   if (of_read_number(associativity, 1) >= primary_domain_index)
+   nid = of_read_number([primary_domain_index], 1);
 
/* POWER4 LPAR uses 0x as invalid node */
if (nid == 0x || nid >= nr_node_ids)
@@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL(of_node_to_nid);
 
-static int __init find_min_common_depth(void)
+static int __init find_primary_domain_index(void)
 {
-   int depth;
+   int index;
struct device_node *root;
 
if (firmware_has_feature(FW_FEATURE_OPAL))
@@ -326,7 +326,7 @@ static int __init find_min_common_depth(void)
}
 
if (form1_affinity) {
-   depth = of_read_number(distance_ref_points, 1);
+   index = of_read_number(distance_ref_points, 1);
} else {
if (distance_ref_points_depth < 2) {
printk(KERN_WARNING "NUMA: "
@@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
goto err;
}
 
-   depth = of_read_number(_ref_points[1], 1);
+   index = of_read_number(_ref_points[1], 1);
}
 
/*
@@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
}
 
of_node_put(root);
-   return depth;
+   return index;
 
 err:
of_node_put(root);
@@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
int nid = default_nid;
int rc, index;
 
-   if ((min_common_depth < 0) || !numa_enabled)
+   if ((primary_domain_index < 0) || !numa_enabled)
return default_nid;
 
rc = of_get_assoc_arrays();
if (rc)
return default_nid;
 
-   if (min_common_depth <= aa.array_sz &&
+   if (primary_domain_index <= aa.array_sz &&
!(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
-   index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
+   index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
nid = of_read_number([index], 1);
 
if (nid == 0x || nid >= nr_node_ids)
@@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
return -1;
}
 
-   min_common_depth = find_min_common_depth();
+   primary_domain_index = find_primary_domain_index();
 
-   if (min_common_depth < 0) {
+   if (primary_domain_index < 0) {
/*
-* if we fail to parse min_common_depth from device tree
+* if we fail to parse primary_domain_index from device tree
 * mark the numa disabled, boot with numa disabled.
 */
numa_enabled = false;
-   return min_common_depth;
+   return primary_domain_index;
}
 
-   dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
+   dbg("NUMA associativity depth for CPU/Memory: %d\n", 
primary_domain_index);
 
/*
 * Even though we connect cpus to numa domains later in SMP
@@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
goto out;
}
 
-   max_nodes = of_read_number([min_common_depth], 1);
+   max_nodes = of_read_number([primary_domain_index], 1);
for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
 
prop_length /= sizeof(int);
-   if (prop_length > min_common_depth + 2)
+   if (prop_length > primary_domain_index + 2)
coregroup_enabled = 1;
 
 out:
@@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
goto out;
 
index = of_read_number(associativity, 1);
-   if (index > min_common_depth + 1)
+   if (index > primary_domain_index + 1)
return of_read_number([index - 1], 1);
 
 out:
-- 
2.31.1



[PATCH v7 0/6] Add support for FORM2 associativity

2021-08-08 Thread Aneesh Kumar K.V
Form2 associativity adds a much more flexible NUMA topology layout
than what is provided by Form1. More details can be found in patch 5.

$ numactl -H
...
node distances:
node   0   1   2   3 
  0:  10  11  222  33 
  1:  44  10  55  66 
  2:  77  88  10  99 
  3:  101  121  132  10 
$

After DAX kmem memory add
# numactl -H
available: 5 nodes (0-4)
...
node distances:
node   0   1   2   3   4 
  0:  10  11  222  33  240 
  1:  44  10  55  66  255 
  2:  77  88  10  99  255 
  3:  101  121  132  10  230 
  4:  255  255  255  230  10 


PAPR SCM now uses the numa distance details to find the numa_node and target_node
for the device.

kvaneesh@ubuntu-guest:~$ ndctl  list -N -v 
[
  {
"dev":"namespace0.0",
"mode":"devdax",
"map":"dev",
"size":1071644672,
"uuid":"d333d867-3f57-44c8-b386-d4d3abdc2bf2",
"raw_uuid":"915361ad-fe6a-42dd-848f-d6dc9f5af362",
"daxregion":{
  "id":0,
  "size":1071644672,
  "devices":[
{
  "chardev":"dax0.0",
  "size":1071644672,
  "target_node":4,
  "mode":"devdax"
}
  ]
},
"align":2097152,
"numa_node":3
  }
]
kvaneesh@ubuntu-guest:~$ 


The above output is with a Qemu command line

-numa node,nodeid=4 \
-numa dist,src=0,dst=1,val=11 -numa dist,src=0,dst=2,val=222 -numa 
dist,src=0,dst=3,val=33 -numa dist,src=0,dst=4,val=240 \
-numa dist,src=1,dst=0,val=44 -numa dist,src=1,dst=2,val=55 -numa 
dist,src=1,dst=3,val=66 -numa dist,src=1,dst=4,val=255 \
-numa dist,src=2,dst=0,val=77 -numa dist,src=2,dst=1,val=88 -numa 
dist,src=2,dst=3,val=99 -numa dist,src=2,dst=4,val=255 \
-numa dist,src=3,dst=0,val=101 -numa dist,src=3,dst=1,val=121 -numa 
dist,src=3,dst=2,val=132 -numa dist,src=3,dst=4,val=230 \
-numa dist,src=4,dst=0,val=255 -numa dist,src=4,dst=1,val=255 -numa 
dist,src=4,dst=2,val=255 -numa dist,src=4,dst=3,val=230 \
-object 
memory-backend-file,id=memnvdimm1,prealloc=yes,mem-path=$PMEM_DISK,share=yes,size=${PMEM_SIZE}
  \
-device 
nvdimm,label-size=128K,memdev=memnvdimm1,id=nvdimm1,slot=4,uuid=72511b67-0b3b-42fd-8d1d-5be3cae8bcaa,node=4

Qemu changes can be found at 
https://lore.kernel.org/qemu-devel/20210616011944.2996399-1-danielhb...@gmail.com/

Changes from v6:
* Address review feedback 

Changes from v5:
* Fix build error reported by kernel test robot
* Address review feedback 

Changes from v4:
* Drop DLPAR related device tree property for now because both Qemu and PowerVM
  will provide the distance details of all possible NUMA nodes during boot.
* Rework numa distance code based on review feedback.

Changes from v3:
* Drop PAPR SCM specific changes and depend completely on NUMA distance 
information.

Changes from v2:
* Add nvdimm list to Cc:
* update PATCH 8 commit message.

Changes from v1:
* Update FORM2 documentation.
* rename max_domain_index to max_associativity_domain_index


Aneesh Kumar K.V (6):
  powerpc/pseries: rename min_common_depth to primary_domain_index
  powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY
  powerpc/pseries: Consolidate different NUMA distance update code paths
  powerpc/pseries: Add a helper for form1 cpu distance
  powerpc/pseries: Add support for FORM2 associativity
  powerpc/pseries: Consolidate form1 distance initialization into a
helper

 Documentation/powerpc/associativity.rst   | 103 +
 arch/powerpc/include/asm/firmware.h   |   7 +-
 arch/powerpc/include/asm/prom.h   |   3 +-
 arch/powerpc/include/asm/topology.h   |   6 +-
 arch/powerpc/kernel/prom_init.c   |   3 +-
 arch/powerpc/mm/numa.c| 433 ++
 arch/powerpc/platforms/pseries/firmware.c |   3 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
 .../platforms/pseries/hotplug-memory.c|   2 +
 arch/powerpc/platforms/pseries/lpar.c |   4 +-
 10 files changed, 455 insertions(+), 111 deletions(-)
 create mode 100644 Documentation/powerpc/associativity.rst

-- 
2.31.1



Re: [PATCH v6 6/6] powerpc/pseries: Consolidate form1 distance initialization into a helper

2021-08-08 Thread David Gibson
On Fri, Aug 06, 2021 at 09:53:59PM +0530, Aneesh Kumar K.V wrote:
> On 8/6/21 12:17 PM, David Gibson wrote:
> > On Tue, Jul 27, 2021 at 03:33:11PM +0530, Aneesh Kumar K.V wrote:
> > > Currently, we duplicate parsing code for ibm,associativity and
> > > ibm,associativity-lookup-arrays in the kernel. The associativity array 
> > > provided
> > > by these device tree properties are very similar and hence can use
> > > a helper to parse the node id and numa distance details.
> > 
> > Oh... sorry.. comments on the earlier patch were from before I read
> > and saw you adjusted things here.
> > 
> > > 
> > > Signed-off-by: Aneesh Kumar K.V 
> > > ---
> > >   arch/powerpc/mm/numa.c | 83 ++
> > >   1 file changed, 51 insertions(+), 32 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> > > index fffb3c40f595..7506251e17f2 100644
> > > --- a/arch/powerpc/mm/numa.c
> > > +++ b/arch/powerpc/mm/numa.c
> > > @@ -171,19 +171,19 @@ static void unmap_cpu_from_node(unsigned long cpu)
> > >   }
> > >   #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
> > > -/*
> > > - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> > > - * info is found.
> > > - */
> > > -static int associativity_to_nid(const __be32 *associativity)
> > > +static int __associativity_to_nid(const __be32 *associativity,
> > > +   int max_array_sz)
> > >   {
> > >   int nid = NUMA_NO_NODE;
> > > + /*
> > > +  * primary_domain_index is 1 based array index.
> > > +  */
> > > + int index = primary_domain_index  - 1;
> > > - if (!numa_enabled)
> > > + if (!numa_enabled || index >= max_array_sz)
> > >   goto out;
> > 
> > You don't need a goto, you can just return NUMA_NO_NODE.
> 
> updated
> 
> > 
> > > - if (of_read_number(associativity, 1) >= primary_domain_index)
> > > - nid = of_read_number([primary_domain_index], 1);
> > > + nid = of_read_number([index], 1);
> > >   /* POWER4 LPAR uses 0x as invalid node */
> > >   if (nid == 0x || nid >= nr_node_ids)
> > > @@ -191,6 +191,17 @@ static int associativity_to_nid(const __be32 
> > > *associativity)
> > >   out:
> > >   return nid;
> > >   }
> > > +/*
> > > + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> > > + * info is found.
> > > + */
> > > +static int associativity_to_nid(const __be32 *associativity)
> > > +{
> > > + int array_sz = of_read_number(associativity, 1);
> > > +
> > > + /* Skip the first element in the associativity array */
> > > + return __associativity_to_nid((associativity + 1), array_sz);
> > > +}
> > >   static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 
> > > *cpu2_assoc)
> > >   {
> > > @@ -295,24 +306,41 @@ int of_node_to_nid(struct device_node *device)
> > >   }
> > >   EXPORT_SYMBOL(of_node_to_nid);
> > > -static void __initialize_form1_numa_distance(const __be32 *associativity)
> > > +static void ___initialize_form1_numa_distance(const __be32 
> > > *associativity,
> > > +  int max_array_sz)
> > >   {
> > >   int i, nid;
> > >   if (affinity_form != FORM1_AFFINITY)
> > >   return;
> > > - nid = associativity_to_nid(associativity);
> > > + nid = __associativity_to_nid(associativity, max_array_sz);
> > >   if (nid != NUMA_NO_NODE) {
> > >   for (i = 0; i < distance_ref_points_depth; i++) {
> > >   const __be32 *entry;
> > > + int index = be32_to_cpu(distance_ref_points[i]) - 1;
> > > +
> > > + /*
> > > +  * broken hierarchy, return with broken distance table
> > 
> > WARN_ON, maybe?
> 
> 
> updated
> 
> > 
> > > +  */
> > > + if (index >= max_array_sz)
> > > + return;
> > > - entry = 
> > > [be32_to_cpu(distance_ref_points[i])];
> > > + entry = [index];
> > >   distance_lookup_table[nid][i] = 
> > > of_read_number(entry, 1);
> > >   }
> > >   }
> > >   }
> > > +static void __initialize_form1_numa_distance(const __be32 *associativity)
> > 
> > Do you actually use this in-between wrapper?
> 
> yes used in
> 
> static void initialize_form1_numa_distance(struct device_node *node)
> {
>   const __be32 *associativity;
> 
>   associativity = of_get_associativity(node);
>   if (!associativity)
>   return;
> 
>   __initialize_form1_numa_distance(associativity);

Right, but I mean in more than one place.  If this is the only user,
you might as well expand it inline here.

> }
> 
> 
> 
> > 
> > > +{
> > > + int array_sz;
> > > +
> > > + array_sz = of_read_number(associativity, 1);
> > > + /* Skip the first element in the associativity array */
> > > + ___initialize_form1_numa_distance(associativity + 1, array_sz);
> > > +}
> > > +
> > >   static 

Re: [PATCH v1 17/55] KVM: PPC: Book3S HV P9: Implement PMU save/restore in C

2021-08-08 Thread Athira Rajeev



> On 26-Jul-2021, at 9:19 AM, Nicholas Piggin  wrote:
> 
> Implement the P9 path PMU save/restore code in C, and remove the
> POWER9/10 code from the P7/8 path assembly.
> 
> -449 cycles (8533) POWER9 virt-mode NULL hcall
> 
> Signed-off-by: Nicholas Piggin 
> ---
> arch/powerpc/include/asm/asm-prototypes.h |   5 -
> arch/powerpc/kvm/book3s_hv.c  | 205 --
> arch/powerpc/kvm/book3s_hv_interrupts.S   |  13 +-
> arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  43 +
> 4 files changed, 200 insertions(+), 66 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
> b/arch/powerpc/include/asm/asm-prototypes.h
> index 222823861a67..41b8a1e1144a 100644
> --- a/arch/powerpc/include/asm/asm-prototypes.h
> +++ b/arch/powerpc/include/asm/asm-prototypes.h
> @@ -141,11 +141,6 @@ static inline void kvmppc_restore_tm_hv(struct kvm_vcpu 
> *vcpu, u64 msr,
>   bool preserve_nv) { }
> #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
> 
> -void kvmhv_save_host_pmu(void);
> -void kvmhv_load_host_pmu(void);
> -void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
> -void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
> -
> void kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu);
> 
> long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 2eef708c4354..d20b579ddcdf 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3735,6 +3735,188 @@ static noinline void kvmppc_run_core(struct 
> kvmppc_vcore *vc)
>   trace_kvmppc_run_core(vc, 1);
> }
> 
> +/*
> + * Privileged (non-hypervisor) host registers to save.
> + */
> +struct p9_host_os_sprs {
> + unsigned long dscr;
> + unsigned long tidr;
> + unsigned long iamr;
> + unsigned long amr;
> + unsigned long fscr;
> +
> + unsigned int pmc1;
> + unsigned int pmc2;
> + unsigned int pmc3;
> + unsigned int pmc4;
> + unsigned int pmc5;
> + unsigned int pmc6;
> + unsigned long mmcr0;
> + unsigned long mmcr1;
> + unsigned long mmcr2;
> + unsigned long mmcr3;
> + unsigned long mmcra;
> + unsigned long siar;
> + unsigned long sier1;
> + unsigned long sier2;
> + unsigned long sier3;
> + unsigned long sdar;
> +};
> +
> +static void freeze_pmu(unsigned long mmcr0, unsigned long mmcra)
> +{
> + if (!(mmcr0 & MMCR0_FC))
> + goto do_freeze;
> + if (mmcra & MMCRA_SAMPLE_ENABLE)
> + goto do_freeze;
> + if (cpu_has_feature(CPU_FTR_ARCH_31)) {
> + if (!(mmcr0 & MMCR0_PMCCEXT))
> + goto do_freeze;
> + if (!(mmcra & MMCRA_BHRB_DISABLE))
> + goto do_freeze;
> + }
> + return;
> +
> +do_freeze:
> + mmcr0 = MMCR0_FC;
> + mmcra = 0;
> + if (cpu_has_feature(CPU_FTR_ARCH_31)) {
> + mmcr0 |= MMCR0_PMCCEXT;
> + mmcra = MMCRA_BHRB_DISABLE;
> + }
> +
> + mtspr(SPRN_MMCR0, mmcr0);
> + mtspr(SPRN_MMCRA, mmcra);
> + isync();
> +}
> +
Hi Nick,

After freezing the PMU, do we need to clear “pmcregs_in_use” as well?
Also, can’t we unconditionally do the MMCR0/MMCRA freeze settings in here? Do
we need the if conditions for FC/PMCCEXT/BHRB?

Thanks
Athira
> +static void save_p9_host_pmu(struct p9_host_os_sprs *host_os_sprs)
> +{
> + if (ppc_get_pmu_inuse()) {
> + /*
> +  * It might be better to put PMU handling (at least for the
> +  * host) in the perf subsystem because it knows more about what
> +  * is being used.
> +  */
> +
> + /* POWER9, POWER10 do not implement HPMC or SPMC */
> +
> + host_os_sprs->mmcr0 = mfspr(SPRN_MMCR0);
> + host_os_sprs->mmcra = mfspr(SPRN_MMCRA);
> +
> + freeze_pmu(host_os_sprs->mmcr0, host_os_sprs->mmcra);
> +
> + host_os_sprs->pmc1 = mfspr(SPRN_PMC1);
> + host_os_sprs->pmc2 = mfspr(SPRN_PMC2);
> + host_os_sprs->pmc3 = mfspr(SPRN_PMC3);
> + host_os_sprs->pmc4 = mfspr(SPRN_PMC4);
> + host_os_sprs->pmc5 = mfspr(SPRN_PMC5);
> + host_os_sprs->pmc6 = mfspr(SPRN_PMC6);
> + host_os_sprs->mmcr1 = mfspr(SPRN_MMCR1);
> + host_os_sprs->mmcr2 = mfspr(SPRN_MMCR2);
> + host_os_sprs->sdar = mfspr(SPRN_SDAR);
> + host_os_sprs->siar = mfspr(SPRN_SIAR);
> + host_os_sprs->sier1 = mfspr(SPRN_SIER);
> +
> + if (cpu_has_feature(CPU_FTR_ARCH_31)) {
> + host_os_sprs->mmcr3 = mfspr(SPRN_MMCR3);
> + host_os_sprs->sier2 = mfspr(SPRN_SIER2);
> + host_os_sprs->sier3 = mfspr(SPRN_SIER3);
> + }
> + }
> +}
> +
> +static void load_p9_guest_pmu(struct kvm_vcpu *vcpu)
> +{
> + mtspr(SPRN_PMC1, vcpu->arch.pmc[0]);
> + mtspr(SPRN_PMC2, 

[PATCH v2] powerpc/kprobes: Fix kprobe Oops happens in booke

2021-08-08 Thread Pu Lehui
When using kprobes on a powerpc booke series processor, an Oops happens
as shown below:

/ # echo "p:myprobe do_nanosleep" > /sys/kernel/debug/tracing/kprobe_events
/ # echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
/ # sleep 1
[   50.076730] Oops: Exception in kernel mode, sig: 5 [#1]
[   50.077017] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500
[   50.077221] Modules linked in:
[   50.077462] CPU: 0 PID: 77 Comm: sleep Not tainted 
5.14.0-rc4-00022-g251a1524293d #21
[   50.077887] NIP:  c0b9c4e0 LR: c00ebecc CTR: 
[   50.078067] REGS: c3883de0 TRAP: 0700   Not tainted 
(5.14.0-rc4-00022-g251a1524293d)
[   50.078349] MSR:  00029000   CR: 24000228  XER: 2000
[   50.078675]
[   50.078675] GPR00: c00ebdf0 c3883e90 c313e300 c3883ea0 0001  
c3883ecc 0001
[   50.078675] GPR08: c100598c c00ea250 0004  24000222 102490c2 
bff4180c 101e60d4
[   50.078675] GPR16:  102454ac 0040 1024 10241100 102410f8 
1024 0050
[   50.078675] GPR24: 0002  c3883ea0 0001  c350 
3b9b8d50 
[   50.080151] NIP [c0b9c4e0] do_nanosleep+0x0/0x190
[   50.080352] LR [c00ebecc] hrtimer_nanosleep+0x14c/0x1e0
[   50.080638] Call Trace:
[   50.080801] [c3883e90] [c00ebdf0] hrtimer_nanosleep+0x70/0x1e0 (unreliable)
[   50.081110] [c3883f00] [c00ec004] sys_nanosleep_time32+0xa4/0x110
[   50.081336] [c3883f40] [c001509c] ret_from_syscall+0x0/0x28
[   50.081541] --- interrupt: c00 at 0x100a4d08
[   50.081749] NIP:  100a4d08 LR: 101b5234 CTR: 0003
[   50.081931] REGS: c3883f50 TRAP: 0c00   Not tainted 
(5.14.0-rc4-00022-g251a1524293d)
[   50.082183] MSR:  0002f902   CR: 24000222  XER: 
[   50.082457]
[   50.082457] GPR00: 00a2 bf980040 1024b4d0 bf980084 bf980084 6400 
00555345 fefefeff
[   50.082457] GPR08: 7f7f7f7f 101e 0069 0003 28000422 102490c2 
bff4180c 101e60d4
[   50.082457] GPR16:  102454ac 0040 1024 10241100 102410f8 
1024 0050
[   50.082457] GPR24: 0002 bf9803f4 1024   100039e0 
 102444e8
[   50.083789] NIP [100a4d08] 0x100a4d08
[   50.083917] LR [101b5234] 0x101b5234
[   50.084042] --- interrupt: c00
[   50.084238] Instruction dump:
[   50.084483] 4bfffc40 6000 6000 6000 9421fff0 39400402 914200c0 
38210010
[   50.084841] 4bfffc20    <7fe8> 7c0802a6 7c892378 
93c10048
[   50.085487] ---[ end trace f6fffe98e2fa8f3e ]---
[   50.085678]
Trace/breakpoint trap

There is no real mode for booke arch and the MMU translation is
always on. The corresponding MSR_IS/MSR_DS bit in booke is used
to switch the address space, but not for real mode judgment.

Fixes: 21f8b2fa3ca5 ("powerpc/kprobes: Ignore traps that happened in real mode")
Signed-off-by: Pu Lehui 
---
v1->v2:
- use IS_ENABLED(CONFIG_BOOKE) as suggested by Michael Ellerman and
  Christophe Leroy
- update Oops log to make problem clear

 arch/powerpc/kernel/kprobes.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index cbc28d1a2e1b..7a7cd6bda53e 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -292,7 +292,8 @@ int kprobe_handler(struct pt_regs *regs)
if (user_mode(regs))
return 0;
 
-   if (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR))
+   if (!IS_ENABLED(CONFIG_BOOKE) &&
+   (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR)))
return 0;
 
/*
-- 
2.17.1



Re: [PATCH 00/11] Implement generic prot_guest_has() helper function

2021-08-08 Thread Kuppuswamy, Sathyanarayanan

Hi Tom,

On 7/27/21 3:26 PM, Tom Lendacky wrote:

This patch series provides a generic helper function, prot_guest_has(),
to replace the sme_active(), sev_active(), sev_es_active() and
mem_encrypt_active() functions.

It is expected that as new protected virtualization technologies are
added to the kernel, they can all be covered by a single function call
instead of a collection of specific function calls all called from the
same locations.

The powerpc and s390 patches have been compile tested only. Can the
folks copied on this series verify that nothing breaks for them.


With this patch set, selecting ARCH_HAS_PROTECTED_GUEST and setting
CONFIG_AMD_MEM_ENCRYPT=n produces the following error.

ld: arch/x86/mm/ioremap.o: in function `early_memremap_is_setup_data':
arch/x86/mm/ioremap.c:672: undefined reference to `early_memremap_decrypted'

It looks like early_memremap_is_setup_data() is not protected with
the appropriate config option.




Cc: Andi Kleen 
Cc: Andy Lutomirski 
Cc: Ard Biesheuvel 
Cc: Baoquan He 
Cc: Benjamin Herrenschmidt 
Cc: Borislav Petkov 
Cc: Christian Borntraeger 
Cc: Daniel Vetter 
Cc: Dave Hansen 
Cc: Dave Young 
Cc: David Airlie 
Cc: Heiko Carstens 
Cc: Ingo Molnar 
Cc: Joerg Roedel 
Cc: Maarten Lankhorst 
Cc: Maxime Ripard 
Cc: Michael Ellerman 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Thomas Zimmermann 
Cc: Vasily Gorbik 
Cc: VMware Graphics 
Cc: Will Deacon 

---

Patches based on:
   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git master
   commit 79e920060fa7 ("Merge branch 'WIP/fixes'")

Tom Lendacky (11):
   mm: Introduce a function to check for virtualization protection
 features
   x86/sev: Add an x86 version of prot_guest_has()
   powerpc/pseries/svm: Add a powerpc version of prot_guest_has()
   x86/sme: Replace occurrences of sme_active() with prot_guest_has()
   x86/sev: Replace occurrences of sev_active() with prot_guest_has()
   x86/sev: Replace occurrences of sev_es_active() with prot_guest_has()
   treewide: Replace the use of mem_encrypt_active() with
 prot_guest_has()
   mm: Remove the now unused mem_encrypt_active() function
   x86/sev: Remove the now unused mem_encrypt_active() function
   powerpc/pseries/svm: Remove the now unused mem_encrypt_active()
 function
   s390/mm: Remove the now unused mem_encrypt_active() function

  arch/Kconfig   |  3 ++
  arch/powerpc/include/asm/mem_encrypt.h |  5 --
  arch/powerpc/include/asm/protected_guest.h | 30 +++
  arch/powerpc/platforms/pseries/Kconfig |  1 +
  arch/s390/include/asm/mem_encrypt.h|  2 -
  arch/x86/Kconfig   |  1 +
  arch/x86/include/asm/kexec.h   |  2 +-
  arch/x86/include/asm/mem_encrypt.h | 13 +
  arch/x86/include/asm/protected_guest.h | 27 ++
  arch/x86/kernel/crash_dump_64.c|  4 +-
  arch/x86/kernel/head64.c   |  4 +-
  arch/x86/kernel/kvm.c  |  3 +-
  arch/x86/kernel/kvmclock.c |  4 +-
  arch/x86/kernel/machine_kexec_64.c | 19 +++
  arch/x86/kernel/pci-swiotlb.c  |  9 ++--
  arch/x86/kernel/relocate_kernel_64.S   |  2 +-
  arch/x86/kernel/sev.c  |  6 +--
  arch/x86/kvm/svm/svm.c |  3 +-
  arch/x86/mm/ioremap.c  | 16 +++---
  arch/x86/mm/mem_encrypt.c  | 60 +++---
  arch/x86/mm/mem_encrypt_identity.c |  3 +-
  arch/x86/mm/pat/set_memory.c   |  3 +-
  arch/x86/platform/efi/efi_64.c |  9 ++--
  arch/x86/realmode/init.c   |  8 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  4 +-
  drivers/gpu/drm/drm_cache.c|  4 +-
  drivers/gpu/drm/vmwgfx/vmwgfx_drv.c|  4 +-
  drivers/gpu/drm/vmwgfx/vmwgfx_msg.c|  6 +--
  drivers/iommu/amd/init.c   |  7 +--
  drivers/iommu/amd/iommu.c  |  3 +-
  drivers/iommu/amd/iommu_v2.c   |  3 +-
  drivers/iommu/iommu.c  |  3 +-
  fs/proc/vmcore.c   |  6 +--
  include/linux/mem_encrypt.h|  4 --
  include/linux/protected_guest.h| 37 +
  kernel/dma/swiotlb.c   |  4 +-
  36 files changed, 218 insertions(+), 104 deletions(-)
  create mode 100644 arch/powerpc/include/asm/protected_guest.h
  create mode 100644 arch/x86/include/asm/protected_guest.h
  create mode 100644 include/linux/protected_guest.h



--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer


Re: [PATCH v2 1/2] sched/topology: Skip updating masks for non-online nodes

2021-08-08 Thread Valentin Schneider


A bit late, but technically the week isn't over yet! :D

On 23/07/21 20:09, Srikar Dronamraju wrote:
> * Valentin Schneider  [2021-07-13 17:32:14]:
>> Now, let's take examples from your cover letter:
>>
>>   node distances:
>>   node   0   1   2   3   4   5   6   7
>> 0:  10  20  40  40  40  40  40  40
>> 1:  20  10  40  40  40  40  40  40
>> 2:  40  40  10  20  40  40  40  40
>> 3:  40  40  20  10  40  40  40  40
>> 4:  40  40  40  40  10  20  40  40
>> 5:  40  40  40  40  20  10  40  40
>> 6:  40  40  40  40  40  40  10  20
>> 7:  40  40  40  40  40  40  20  10
>>
>> But the system boots with just nodes 0 and 1, thus only this distance
>> matrix is valid:
>>
>>   node   0   1
>> 0:  10  20
>> 1:  20  10
>>
>> topology_span_sane() is going to use tl->mask(cpu), and as you reported the
>> NODE topology level should cause issues. Let's assume all offline nodes say
>> they're 10 distance away from everyone else, and that we have one CPU per
>> node. This would give us:
>>
>
> No,
> All offline nodes would be at a distance of 10 from node 0 only.
> So here node distance of all offline nodes from node 1 would be 20.
>
>>   NODE->mask(0) == 0,2-7
>>   NODE->mask(1) == 1-7
>
> so
>
> NODE->mask(0) == 0,2-7
> NODE->mask(1) should be 1
> and NODE->mask(2-7) == 0,2-7
>

Ok, so that shouldn't trigger the warning.

>>
>> The intersection is 2-7, we'll trigger the WARN_ON().
>> Now, with the above snippet, we'll check if that intersection covers any
>> online CPU. For sched_init_domains(), cpu_map is cpu_active_mask, so we'd
>> end up with an empty intersection and we shouldn't warn - that's the theory
>> at least.
>
> Now let's say we onlined CPU 3 and node 3 which was at an actual distance
> of 20 from node 0.
>
> (If we only consider online CPUs, and since scheduler masks like
> sched_domains_numa_masks aren't updated with offline CPUs,)
> then
>
> NODE->mask(0) == 0
> NODE->mask(1) == 1
> NODE->mask(3) == 0,3
>

Wait, doesn't the distance matrix (without any offline node) say

  distance(0, 3) == 40

? We should have at the very least:

  node   0   1   2   3
0:  10  20  ??  40
1:  20  10  ??  40
2:  ??  ??  ??  ??
3:  40  40  ??  10

Regardless, NODE->mask(x) is sched_domains_numa_masks[0][x], if

  distance(0,3) > LOCAL_DISTANCE

then

  node0 ∉ NODE->mask(3)

> cpumask_and(intersect, tl->mask(cpu), tl->mask(i));
> if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && 
> cpumask_intersects(intersect, cpu_map))
>
> cpu_map is 0,1,3
> intersect is 0
>
> From above NODE->mask(0) is !equal to NODE->mask(1) and
> cpumask_intersects(intersect, cpu_map) is also true.
>
> I picked Node 3 since if Node 1 is online, we would have faked distance
> for Node 2 to be at distance of 40.
>
> Any node from 3 to 7, we would have faced the same problem.
>
>>
>> Looking at sd_numa_mask(), I think there's a bug with topology_span_sane():
>> it doesn't run in the right place wrt where sched_domains_curr_level is
>> updated. Could you try the below on top of the previous snippet?
>>
>> If that doesn't help, could you share the node distances / topology masks
>> that lead to the WARN_ON()? Thanks.
>>
>> ---
>> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
>> index b77ad49dc14f..cda69dfa4065 100644
>> --- a/kernel/sched/topology.c
>> +++ b/kernel/sched/topology.c
>> @@ -1516,13 +1516,6 @@ sd_init(struct sched_domain_topology_level *tl,
>>  int sd_id, sd_weight, sd_flags = 0;
>>  struct cpumask *sd_span;
>>
>> -#ifdef CONFIG_NUMA
>> -/*
>> - * Ugly hack to pass state to sd_numa_mask()...
>> - */
>> -sched_domains_curr_level = tl->numa_level;
>> -#endif
>> -
>>  sd_weight = cpumask_weight(tl->mask(cpu));
>>
>>  if (tl->sd_flags)
>> @@ -2131,7 +2124,12 @@ build_sched_domains(const struct cpumask *cpu_map, 
>> struct sched_domain_attr *att
>>
>>  sd = NULL;
>>  for_each_sd_topology(tl) {
>> -
>> +#ifdef CONFIG_NUMA
>> +/*
>> + * Ugly hack to pass state to sd_numa_mask()...
>> + */
>> +sched_domains_curr_level = tl->numa_level;
>> +#endif
>>  if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
>>  goto error;
>>
>>
>
> I tested with the above patch too. However, it is still not helping.
>
> Here is the log from my testing.
>
> At Boot.
>
> (Do remember to arrive at sched_max_numa_levels we faked the
> numa_distance of node 1 to be at 20 from node 0. All other offline
> nodes are at a distance of 10 from node 0.)
>

[...]

> ( First addition of a CPU to a non-online node esp node whose node
> distance was not faked.)
>
> numactl -H
> available: 3 nodes (0,5,7)
> node 0 cpus: 0 1 2 3 4 5 6 7
> node 0 size: 0 MB
> node 0 free: 0 MB
> node 5 cpus: 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 
> 32 33 34 35 40 41 42 43 48 49 50 51 56 57 58 59 64 65 66 67 72 73 74 75 76 77 
> 78 79 80