Re: [PATCH v2] Fix display of Maximum Memory

2020-01-16 Thread Michael Bringmann
On 1/15/20 11:53 PM, Michael Ellerman wrote:
> Michael Bringmann  writes:
>> Correct an overflow problem in the calculation and display of the Maximum
>> Memory value for syscfg, where 32 bits are insufficient.
>>
>> Signed-off-by: Michael Bringmann 
>> ---
>>  arch/powerpc/platforms/pseries/lparcfg.c | 8 
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/lparcfg.c 
>> b/arch/powerpc/platforms/pseries/lparcfg.c
>> index e33e8bc..f00411c 100644
>> --- a/arch/powerpc/platforms/pseries/lparcfg.c
>> +++ b/arch/powerpc/platforms/pseries/lparcfg.c
>> @@ -433,12 +433,12 @@ static void parse_em_data(struct seq_file *m)
>>  
>>  static void maxmem_data(struct seq_file *m)
>>  {
>> -unsigned long maxmem = 0;
>> +u64 maxmem = 0;
> 
> This is 64-bit only code, so u64 == unsigned long.
> 
>> -maxmem += drmem_info->n_lmbs * drmem_info->lmb_size;
>> -maxmem += hugetlb_total_pages() * PAGE_SIZE;
>> +maxmem += (u64)drmem_info->n_lmbs * drmem_info->lmb_size;
> 
> The only problem AFAICS is n_lmbs is int and lmb_size is u32, so this
> multiplication will overflow.
> 
>> +maxmem += (u64)hugetlb_total_pages() * PAGE_SIZE;
> 
> hugetlb_total_pages() already returns unsigned long.
> 
>> -seq_printf(m, "MaxMem=%ld\n", maxmem);
>> +seq_printf(m, "MaxMem=%llu\n", maxmem);
>>  }
> 
> This should be sufficient?
> 
> diff --git a/arch/powerpc/platforms/pseries/lparcfg.c 
> b/arch/powerpc/platforms/pseries/lparcfg.c
> index e33e8bc4b69b..38c306551f76 100644
> --- a/arch/powerpc/platforms/pseries/lparcfg.c
> +++ b/arch/powerpc/platforms/pseries/lparcfg.c
> @@ -435,10 +435,10 @@ static void maxmem_data(struct seq_file *m)
>  {
> unsigned long maxmem = 0;
>  
> -   maxmem += drmem_info->n_lmbs * drmem_info->lmb_size;
> +   maxmem += (unsigned long)drmem_info->n_lmbs * drmem_info->lmb_size;
> maxmem += hugetlb_total_pages() * PAGE_SIZE;
>  
> -   seq_printf(m, "MaxMem=%ld\n", maxmem);
> +   seq_printf(m, "MaxMem=%lu\n", maxmem);
>  }
>  
>  static int pseries_lparcfg_data(struct seq_file *m, void *v)
> 
> 
> cheers
> 

Trying it out.

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.ibm.com

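For reference, a minimal userspace sketch (not part of the thread) of the
overflow being discussed: n_lmbs (int) times lmb_size (u32) is evaluated as a
32-bit multiply and wraps before the result is widened, and casting one
operand is enough to force a 64-bit multiply.  The values below are made up
for illustration.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Made-up values: 2048 LMBs of 256 MB each, i.e. 512 GB of memory. */
	int n_lmbs = 2048;
	uint32_t lmb_size = 256 * 1024 * 1024;

	/* Both operands are 32-bit, so the multiply wraps before it is
	 * assigned to the 64-bit variable. */
	uint64_t wrong = n_lmbs * lmb_size;

	/* Casting one operand widens the whole multiplication to 64 bits. */
	uint64_t right = (uint64_t)n_lmbs * lmb_size;

	printf("wrong = %llu\n", (unsigned long long)wrong);	/* prints 0 */
	printf("right = %llu\n", (unsigned long long)right);	/* 549755813888 */
	return 0;
}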

[PATCH v2] Fix display of Maximum Memory

2020-01-15 Thread Michael Bringmann
Correct an overflow problem in the calculation and display of the Maximum
Memory value for syscfg, where 32 bits are insufficient.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/platforms/pseries/lparcfg.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/lparcfg.c 
b/arch/powerpc/platforms/pseries/lparcfg.c
index e33e8bc..f00411c 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -433,12 +433,12 @@ static void parse_em_data(struct seq_file *m)
 
 static void maxmem_data(struct seq_file *m)
 {
-   unsigned long maxmem = 0;
+   u64 maxmem = 0;
 
-   maxmem += drmem_info->n_lmbs * drmem_info->lmb_size;
-   maxmem += hugetlb_total_pages() * PAGE_SIZE;
+   maxmem += (u64)drmem_info->n_lmbs * drmem_info->lmb_size;
+   maxmem += (u64)hugetlb_total_pages() * PAGE_SIZE;
 
-   seq_printf(m, "MaxMem=%ld\n", maxmem);
+   seq_printf(m, "MaxMem=%llu\n", maxmem);
 }
 
 static int pseries_lparcfg_data(struct seq_file *m, void *v)
-- 
1.8.3.1



Re: [PATCH] Fix display of Maximum Memory

2020-01-15 Thread Michael Bringmann
On 1/14/20 11:41 PM, Christophe Leroy wrote:
> 
> 
> Le 14/01/2020 à 22:07, Michael Bringmann a écrit :
>> Correct an overflow problem in the calculation and display of the Maximum
>> Memory value for syscfg, where 32 bits are insufficient.
>>
>> Signed-off-by: Michael Bringmann 
>> ---
>>   arch/powerpc/platforms/pseries/lparcfg.c | 8 
>>   1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/lparcfg.c 
>> b/arch/powerpc/platforms/pseries/lparcfg.c
>> index 4ee2594..183aeb7 100644
>> --- a/arch/powerpc/platforms/pseries/lparcfg.c
>> +++ b/arch/powerpc/platforms/pseries/lparcfg.c
>> @@ -435,12 +435,12 @@ static void parse_em_data(struct seq_file *m)
>>
>>   static void maxmem_data(struct seq_file *m)
>>   {
>> -   unsigned long maxmem = 0;
>> +   unsigned long long maxmem = 0;
> 
> What about using u64 instead, for readability ?

Okay.
> 
>>
>> -   maxmem += drmem_info->n_lmbs * drmem_info->lmb_size;
>> -   maxmem += hugetlb_total_pages() * PAGE_SIZE;
>> +   maxmem += (unsigned long long)drmem_info->n_lmbs * (unsigned long long)drmem_info->lmb_size;
> 
> This line is likely too long. You only need to cast one of the two operands
> to force a 64-bit multiply. And using u64 would shorten the line.
> 
> Can both multiplications overflow ?

Yes.

> 
> Christophe
> 
>> +   maxmem += (unsigned long long)hugetlb_total_pages() * (unsigned long long)PAGE_SIZE;
>>
>> -   seq_printf(m, "MaxMem=%ld\n", maxmem);
>> +   seq_printf(m, "MaxMem=%llu\n", maxmem);
>>   }
>>
>>   static int pseries_lparcfg_data(struct seq_file *m, void *v)
>>
> 

Thanks.
-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.ibm.com


[PATCH] Fix display of Maximum Memory

2020-01-14 Thread Michael Bringmann
Correct an overflow problem in the calculation and display of the Maximum
Memory value for syscfg, where 32 bits are insufficient.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/platforms/pseries/lparcfg.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/lparcfg.c 
b/arch/powerpc/platforms/pseries/lparcfg.c
index 4ee2594..183aeb7 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -435,12 +435,12 @@ static void parse_em_data(struct seq_file *m)

 static void maxmem_data(struct seq_file *m)
 {
-   unsigned long maxmem = 0;
+   unsigned long long maxmem = 0;

-   maxmem += drmem_info->n_lmbs * drmem_info->lmb_size;
-   maxmem += hugetlb_total_pages() * PAGE_SIZE;
+   maxmem += (unsigned long long)drmem_info->n_lmbs * (unsigned long long)drmem_info->lmb_size;
+   maxmem += (unsigned long long)hugetlb_total_pages() * (unsigned long long)PAGE_SIZE;

-   seq_printf(m, "MaxMem=%ld\n", maxmem);
+   seq_printf(m, "MaxMem=%llu\n", maxmem);
 }

 static int pseries_lparcfg_data(struct seq_file *m, void *v)
-- 
1.8.3.1


REPOST [PATCH v04] powerpc/numa: Perform full re-add of CPU for PRRN/VPHN topology update

2019-03-05 Thread Michael Bringmann
On pseries systems, performing changes to a partition's affinity
can result in altering the nodes to which a CPU is assigned on the
current system.  For example, some systems are subject to resource
balancing operations by the operator or control software.  In such
environments, system CPUs may be in node 1 and 3 at boot, and be
moved to nodes 2, 3, and 5, for better performance.

The current implementation attempts to recognize such changes within
the powerpc-specific version of arch_update_cpu_topology to modify a
range of system data structures directly.  However, some scheduler
data structures may be inaccessible, or the timing of a node change
may still lead to corruption or error in other modules (e.g. user
space) which do not receive notification of these changes.

This patch modifies the PRRN/VPHN topology update worker function to
recognize an affinity change for a CPU, and to perform a full DLPAR
remove and add of the CPU instead of dynamically changing its node
to resolve this issue.

[Based upon patch submission:
Subject: [PATCH] powerpc/pseries: Perform full re-add of CPU for topology 
update post-migration
From: Nathan Fontenot 
Date: Tue Oct 30 05:43:36 AEDT 2018
]

[Replace patch submission:
Subject: [PATCH] powerpc/topology: Update numa mask when cpu node mapping 
changes
From: Srikar Dronamraju 
Date: Wed Oct 10 15:24:46 AEDT 2018
]

Signed-off-by: Michael Bringmann 
---
Changes in v04:
  -- Revise tests in topology_timer_fn to check vphn_enabled before prrn_enabled
  -- Remove unnecessary changes to numa_update_cpu_topology
Changes in v03:
  -- Fixed under-scheduling of topo updates.
Changes in v02:
  -- Reuse more of the previous implementation to reduce patch size
  -- Replace former calls to numa_update_cpu_topology(false) by
 topology_schedule_update
  -- Make sure that we report topology changes back through
 arch_update_cpu_topology
  -- Fix problem observed in powerpc next kernel with updating
 cpu_associativity_changes_mask in timer_topology_fn when both
 prrn_enabled and vphn_enabled, and many extra CPUs are possible,
 but not installed.
  -- Fix problem with updating cpu_associativity_changes_mask when
 VPHN associativity information does not arrive until after first
 call to update topology occurs.
---
 arch/powerpc/include/asm/topology.h |7 +
 arch/powerpc/kernel/rtasd.c |2 +
 arch/powerpc/mm/numa.c  |   47 +++
 3 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index f85e2b01c3df..79505c371fd5 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -42,7 +42,7 @@ extern void __init dump_numa_cpu_topology(void);

 extern int sysfs_add_device_to_node(struct device *dev, int nid);
 extern void sysfs_remove_device_from_node(struct device *dev, int nid);
-extern int numa_update_cpu_topology(bool cpus_locked);
+extern void topology_schedule_update(void);

 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node)
 {
@@ -77,10 +77,7 @@ static inline void sysfs_remove_device_from_node(struct 
device *dev,
 {
 }

-static inline int numa_update_cpu_topology(bool cpus_locked)
-{
-   return 0;
-}
+static inline void topology_schedule_update(void) {}

 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}

diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 8a1746d755c9..b1828de7ab78 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -285,7 +285,7 @@ static void handle_prrn_event(s32 scope)
 * the RTAS event.
 */
pseries_devicetree_update(-scope);
-   numa_update_cpu_topology(false);
+   topology_schedule_update();
 }

 static void handle_rtas_event(const struct rtas_error_log *log)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b5d1c45c1475..eb63479f09d7 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1077,6 +1077,8 @@ static int prrn_enabled;
 static void reset_topology_timer(void);
 static int topology_timer_secs = 1;
 static int topology_inited;
+static int topology_update_in_progress;
+static int topology_changed;

 /*
  * Change polling interval for associativity changes.
@@ -1297,9 +1299,9 @@ static int update_lookup_table(void *data)
  * Update the node maps and sysfs entries for each cpu whose home node
  * has changed. Returns 1 when the topology has changed, and 0 otherwise.
  *
- * cpus_locked says whether we already hold cpu_hotplug_lock.
+ * readd_cpus: Also readd any CPUs that have changed affinity
  */
-int numa_update_cpu_topology(bool cpus_locked)
+static int numa_update_cpu_topology(bool readd_cpus)
 {
unsigned int cpu, sibling, changed = 0;
struct topology_update_data *updates, *ud;
@@ -1307,7 +1309,8 @@ int numa_update_cpu_topology(bool cpus_locked)
struct device *dev

Re: [PATCH] powerpc/pseries: Fix dn reference error in dlpar_cpu_remove_by_index

2019-02-20 Thread Michael Bringmann
On 2/19/19 2:03 PM, Tyrel Datwyler wrote:
> On 02/19/2019 07:46 AM, Michael Bringmann wrote:
>> powerpc/pseries: Fix dn reference error in dlpar_cpu_remove_by_index()
>>
>> A reference to the device node of the CPU to be removed is released
>> upon successful removal of the associated CPU device.  If the call
>> to remove the CPU device fails, dlpar_cpu_remove_by_index() still
>> frees the reference and this leads to miscomparisons and/or
>> addressing errors later on.
>>
>> This problem may be observed when trying to DLPAR 'hot-remove' a CPU
>> from a system that has only a single CPU.  The operation will fail
>> because there is no other CPU to which the kernel operations may be
>> migrated, but the refcount will still be decremented.
>>
>> Signed-off-by: Michael Bringmann 
>>
>>
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
>> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> index 97feb6e..9537bb9 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> @@ -635,7 +635,8 @@ static int dlpar_cpu_remove_by_index(u32 drc_index)
>>  }
>>
>>  rc = dlpar_cpu_remove(dn, drc_index);
>> -of_node_put(dn);
>> +if (!rc)
>> +of_node_put(dn);
>>  return rc;
>>  }
>>
> 
> NACK!
> 
> The logic here is wrong. Here is the full function.
> 
> static int dlpar_cpu_remove_by_index(u32 drc_index)
> {
> struct device_node *dn;
> int rc;
> 
> dn = cpu_drc_index_to_dn(drc_index);
> if (!dn) {
> pr_warn("Cannot find CPU (drc index %x) to remove\n",
> drc_index);
> return -ENODEV;
> }
> 
> rc = dlpar_cpu_remove(dn, drc_index);
> of_node_put(dn);
> return rc;
> }
> 
> The call to cpu_drc_index_to_dn() returns a device_node with the reference 
> count
> incremented. So, regardless of the success or failure of the call to
> dlpar_cpu_remove() you need to release that reference.
> 
> If there is a reference counting issue it is somewhere else.

Okay.  Withdrawn while we look some more.

> -Tyrel

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com

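For reference, a minimal sketch (not from the thread) of the ownership rule
Tyrel describes: cpu_drc_index_to_dn() returns the node with its reference
count already incremented, so the caller must drop that reference on every
exit path.  Making the of_node_put() conditional on success, as the proposed
patch did, would leak the reference whenever dlpar_cpu_remove() fails.

static int remove_by_index_sketch(u32 drc_index)
{
	struct device_node *dn;
	int rc;

	dn = cpu_drc_index_to_dn(drc_index);	/* refcount +1 on success */
	if (!dn)
		return -ENODEV;

	rc = dlpar_cpu_remove(dn, drc_index);
	of_node_put(dn);	/* drop the lookup's reference whether rc is 0 or not */
	return rc;
}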


[PATCH] powerpc/pseries: Fix dn reference error in dlpar_cpu_remove_by_index

2019-02-19 Thread Michael Bringmann
powerpc/pseries: Fix dn reference error in dlpar_cpu_remove_by_index()

A reference to the device node of the CPU to be removed is released
upon successful removal of the associated CPU device.  If the call
to remove the CPU device fails, dlpar_cpu_remove_by_index() still
frees the reference and this leads to miscomparisons and/or
addressing errors later on.

This problem may be observed when trying to DLPAR 'hot-remove' a CPU
from a system that has only a single CPU.  The operation will fail
because there is no other CPU to which the kernel operations may be
migrated, but the refcount will still be decremented.

Signed-off-by: Michael Bringmann 


diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 97feb6e..9537bb9 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -635,7 +635,8 @@ static int dlpar_cpu_remove_by_index(u32 drc_index)
}
 
rc = dlpar_cpu_remove(dn, drc_index);
-   of_node_put(dn);
+   if (!rc)
+   of_node_put(dn);
return rc;
 }
 



Re: [PATCH v04] powerpc/numa: Perform full re-add of CPU for PRRN/VPHN topology update

2019-02-18 Thread Michael Bringmann



On 2/18/19 8:15 AM, Michal Suchánek wrote:
> On Mon, 18 Feb 2019 11:49:17 +0100
> Michal Suchánek  wrote:
> 
> Nevermind
> 
> Looks like some version of the patch is queued in powerpc/next already.

Might you be referring to,
[PATCH] powerpc/pseries: Perform full re-add of CPU for topology update 
post-migration
aka
81b6132 powerpc/pseries: Perform full re-add of CPU for topology update post-mig
in the powerpc-next tree?

That is for the case of device-tree changes observed after a migration.
This patch builds upon it for CPU affinity changes observed via PRRN/VPHN 
events.

> 
> Thanks
> 
> Michal

Thanks.

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



[PATCH v04] powerpc/numa: Perform full re-add of CPU for PRRN/VPHN topology update

2019-02-14 Thread Michael Bringmann
To: linuxppc-dev@lists.ozlabs.org
To: linux-ker...@vger.kernel.org
Benjamin Herrenschmidt 
Paul Mackerras 
Michael Ellerman 
Nathan Lynch 
Corentin Labbe 
Tyrel Datwyler 
Srikar Dronamraju 
Guenter Roeck 
Michael Bringmann 
"Oliver O'Halloran" 
Russell Currey 
Haren Myneni 
Al Viro 
Kees Cook 
Nicholas Piggin 
Rob Herring 
Juliet Kim 
Thomas Falcon 
Date: 2018-11-05 16:14:12 -0600
Subject: [PATCH v04] powerpc/numa: Perform full re-add of CPU for PRRN/VPHN 
topology update

On pseries systems, performing changes to a partition's affinity
can result in altering the nodes to which a CPU is assigned on the
current system.  For example, some systems are subject to resource
balancing operations by the operator or control software.  In such
environments, system CPUs may be in node 1 and 3 at boot, and be
moved to nodes 2, 3, and 5, for better performance.

The current implementation attempts to recognize such changes within
the powerpc-specific version of arch_update_cpu_topology to modify a
range of system data structures directly.  However, some scheduler
data structures may be inaccessible, or the timing of a node change
may still lead to corruption or error in other modules (e.g. user
space) which do not receive notification of these changes.

This patch modifies the PRRN/VPHN topology update worker function to
recognize an affinity change for a CPU, and to perform a full DLPAR
remove and add of the CPU instead of dynamically changing its node
to resolve this issue.

[Based upon patch submission:
Subject: [PATCH] powerpc/pseries: Perform full re-add of CPU for topology 
update post-migration
From: Nathan Fontenot 
Date: Tue Oct 30 05:43:36 AEDT 2018
]

[Replace patch submission:
Subject: [PATCH] powerpc/topology: Update numa mask when cpu node mapping 
changes
From: Srikar Dronamraju 
Date: Wed Oct 10 15:24:46 AEDT 2018
]

Signed-off-by: Michael Bringmann 
---
Changes in v04:
  -- Revise tests in topology_timer_fn to check vphn_enabled before prrn_enabled
  -- Remove unnecessary changes to numa_update_cpu_topology
Changes in v03:
  -- Fixed under-scheduling of topo updates.
Changes in v02:
  -- Reuse more of the previous implementation to reduce patch size
  -- Replace former calls to numa_update_cpu_topology(false) by
 topology_schedule_update
  -- Make sure that we report topology changes back through
 arch_update_cpu_topology
  -- Fix problem observed in powerpc next kernel with updating
 cpu_associativity_changes_mask in timer_topology_fn when both
 prrn_enabled and vphn_enabled, and many extra CPUs are possible,
 but not installed.
  -- Fix problem with updating cpu_associativity_changes_mask when
 VPHN associativity information does not arrive until after first
 call to update topology occurs.
---
 arch/powerpc/include/asm/topology.h |7 +
 arch/powerpc/kernel/rtasd.c |2 +
 arch/powerpc/mm/numa.c  |   47 +++
 3 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index f85e2b01c3df..79505c371fd5 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -42,7 +42,7 @@ extern void __init dump_numa_cpu_topology(void);
 
 extern int sysfs_add_device_to_node(struct device *dev, int nid);
 extern void sysfs_remove_device_from_node(struct device *dev, int nid);
-extern int numa_update_cpu_topology(bool cpus_locked);
+extern void topology_schedule_update(void);
 
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node)
 {
@@ -77,10 +77,7 @@ static inline void sysfs_remove_device_from_node(struct 
device *dev,
 {
 }
 
-static inline int numa_update_cpu_topology(bool cpus_locked)
-{
-   return 0;
-}
+static inline void topology_schedule_update(void) {}
 
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}
 
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 8a1746d755c9..b1828de7ab78 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -285,7 +285,7 @@ static void handle_prrn_event(s32 scope)
 * the RTAS event.
 */
pseries_devicetree_update(-scope);
-   numa_update_cpu_topology(false);
+   topology_schedule_update();
 }
 
 static void handle_rtas_event(const struct rtas_error_log *log)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b5d1c45c1475..eb63479f09d7 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1077,6 +1077,8 @@ static int prrn_enabled;
 static void reset_topology_timer(void);
 static int topology_timer_secs = 1;
 static int topology_inited;
+static int topology_update_in_progress;
+static int topology_changed;
 
 /*
  * Change polling interval for associativity changes.
@@ -1297,9 +1299,9 @@ static int update_lookup_table(void *data)
  * Update the node maps and sysfs entries for each cpu whose home node
  * has change

Re: [PATCH v03] powerpc/numa: Perform full re-add of CPU for PRRN/VPHN topology update

2019-02-08 Thread Michael Bringmann
On 2/7/19 11:44 PM, Srikar Dronamraju wrote:
>>
>>  int arch_update_cpu_topology(void)
>>  {
>> -return numa_update_cpu_topology(true);
>> +int changed = topology_changed;
>> +
>> +topology_changed = 0;
>> +return changed;
>>  }
>>
> 
> Do we need a powerpc override for arch_update_cpu_topology() now?  That
> topology_changed from some time back doesn't seem to have helped. The
> scheduler at least now neglects whether the topology changed or not.

I was dealing with a concurrency problem.  Revisiting again.
> 
> Also we can do away with the new topology_changed.
> 
>>  static void topology_work_fn(struct work_struct *work)
>>  {
>> -rebuild_sched_domains();
>> +lock_device_hotplug();
>> +if (numa_update_cpu_topology(true))
>> +rebuild_sched_domains();
>> +unlock_device_hotplug();
>>  }
> 
> Should this hunk be a separate patch by itself to say why
> rebuild_sched_domains with a changelog that explains why it should be under
> lock_device_hotplug? rebuild_sched_domains already takes cpuset_mutex. 
> So I am not sure if we need to take device_hotplug_lock.

topology_work_fn runs in its own thread like the DLPAR operations.
This patch adds calls to Nathan's 'dlpar_cpu_readd' from the topology_work_fn
thread.  The lock/unlock_device_hotplug calls guard against concurrency issues
with the DLPAR operations; grabbing that lock here avoids overlap with
those other operations.  This mod is dependent upon using dlpar_cpu_readd.

> 
>>  static DECLARE_WORK(topology_work, topology_work_fn);
>>
>> -static void topology_schedule_update(void)
>> +void topology_schedule_update(void)
>>  {
>> -schedule_work(&topology_work);
>> +if (!topology_update_in_progress)
>> +schedule_work(&topology_work);
>>  }
>>
>>  static void topology_timer_fn(struct timer_list *unused)
>>  {
>> +bool sdo = false;
> 
> Is sdo an abbreviation?

It stands for 'do the schedule update'.  Will remove per below.

> 
>> +
>> +if (topology_scans < 1)
>> +bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
>> +nr_cpumask_bits);
> 
> Why do we need topology_scan? Just to make sure
> cpu_associativity_changes_mask is populated only once?
> can't we use a static bool inside the function for the same?

I was running into a race condition.  On one of my test systems,
start_topology_update ran via shared_proc_topology_init, and the PHYP did
not provide any change info about the CPUs that early in the boot.
The first run erased the cpu bits in cpu_associativity_changes_mask,
and subsequent runs did not pay attention to the reported updates.
Taking another look.
> 
> 
>> +
>>  if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
>> -topology_schedule_update();
>> -else if (vphn_enabled) {
>> +sdo =  true;
>> +if (vphn_enabled) {
> 
> Any reason to remove the else above?
When both vphn_enabled and prrn_enabled were set, it was not calling
'update_cpu_associativity_changes_mask()', so it was not getting the
necessary change info.

>>  if (update_cpu_associativity_changes_mask() > 0)
>> -topology_schedule_update();
>> +sdo =  true;
>>  reset_topology_timer();
>>  }
>> +if (sdo)
>> +topology_schedule_update();
>> +topology_scans++;
>>  }
> 
> Are the above two hunks necessary? Not getting how the current changes are
> different from the previous.
Not important.  Will undo.
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



[PATCH v03] powerpc/numa: Perform full re-add of CPU for PRRN/VPHN topology update

2019-02-06 Thread Michael Bringmann
On pseries systems, performing changes to a partition's affinity
can result in altering the nodes to which a CPU is assigned on the
current system.  For example, some systems are subject to resource
balancing operations by the operator or control software.  In such
environments, system CPUs may be in node 1 and 3 at boot, and be
moved to nodes 2, 3, and 5, for better performance.

The current implementation attempts to recognize such changes within
the powerpc-specific version of arch_update_cpu_topology to modify a
range of system data structures directly.  However, some scheduler
data structures may be inaccessible, or the timing of a node change
may still lead to corruption or error in other modules (e.g. user
space) which do not receive notification of these changes.

This patch modifies the PRRN/VPHN topology update worker function to
recognize an affinity change for a CPU, and to perform a full DLPAR
remove and add of the CPU instead of dynamically changing its node
to resolve this issue.

[Based upon patch submission:
Subject: [PATCH] powerpc/pseries: Perform full re-add of CPU for topology 
update post-migration
From: Nathan Fontenot 
Date: Tue Oct 30 05:43:36 AEDT 2018
]

[Replace patch submission:
Subject: [PATCH] powerpc/topology: Update numa mask when cpu node mapping 
changes
From: Srikar Dronamraju 
Date: Wed Oct 10 15:24:46 AEDT 2018
]

Signed-off-by: Michael Bringmann 
---
Changes in v03:
  -- Fixed under-scheduling of topo updates.
Changes in v02:
  -- Reuse more of the previous implementation to reduce patch size
  -- Replace former calls to numa_update_cpu_topology(false) by
 topology_schedule_update
  -- Make sure that we report topology changes back through
 arch_update_cpu_topology
  -- Fix problem observed in powerpc next kernel with updating
 cpu_associativity_changes_mask in timer_topology_fn when both
 prrn_enabled and vphn_enabled, and many extra CPUs are possible,
 but not installed.
  -- Fix problem with updating cpu_associativity_changes_mask when
 VPHN associativity information does not arrive until after first
 call to update topology occurs.
---
 arch/powerpc/include/asm/topology.h |7 +---
 arch/powerpc/kernel/rtasd.c |2 +
 arch/powerpc/mm/numa.c  |   69 +++
 3 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index f85e2b0..79505c3 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -42,7 +42,7 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 
 extern int sysfs_add_device_to_node(struct device *dev, int nid);
 extern void sysfs_remove_device_from_node(struct device *dev, int nid);
-extern int numa_update_cpu_topology(bool cpus_locked);
+extern void topology_schedule_update(void);
 
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node)
 {
@@ -77,10 +77,7 @@ static inline void sysfs_remove_device_from_node(struct 
device *dev,
 {
 }
 
-static inline int numa_update_cpu_topology(bool cpus_locked)
-{
-   return 0;
-}
+static inline void topology_schedule_update(void) {}
 
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}
 
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 8a1746d..b1828de 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -285,7 +285,7 @@ static void handle_prrn_event(s32 scope)
 * the RTAS event.
 */
pseries_devicetree_update(-scope);
-   numa_update_cpu_topology(false);
+   topology_schedule_update();
 }
 
 static void handle_rtas_event(const struct rtas_error_log *log)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index ef6bdf1..a750ec0 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1077,6 +1077,9 @@ struct topology_update_data {
 static void reset_topology_timer(void);
 static int topology_timer_secs = 1;
 static int topology_inited;
+static int topology_update_in_progress;
+static int topology_changed;
+static unsigned long topology_scans;
 
 /*
  * Change polling interval for associativity changes.
@@ -1297,9 +1300,9 @@ static int update_lookup_table(void *data)
  * Update the node maps and sysfs entries for each cpu whose home node
  * has changed. Returns 1 when the topology has changed, and 0 otherwise.
  *
- * cpus_locked says whether we already hold cpu_hotplug_lock.
+ * readd_cpus: Also readd any CPUs that have changed affinity
  */
-int numa_update_cpu_topology(bool cpus_locked)
+static int numa_update_cpu_topology(bool readd_cpus)
 {
unsigned int cpu, sibling, changed = 0;
struct topology_update_data *updates, *ud;
@@ -1307,7 +1310,8 @@ int numa_update_cpu_topology(bool cpus_locked)
struct device *dev;
int weight, new_nid, i = 0;
 
-   if (!prrn_enabled && !vphn_enabled && topology_inited)
+   if

Re: [PATCH v02] powerpc/pseries: Check for ceded CPU's during LPAR migration

2019-02-01 Thread Michael Bringmann
See below.

On 1/31/19 3:53 PM, Michael Bringmann wrote:
> On 1/30/19 11:38 PM, Michael Ellerman wrote:
>> Michael Bringmann  writes:
>>> This patch is to check for ceded CPUs during LPM.  Some extreme
>>> tests encountered a problem where Linux has put some threads to
>>> sleep (possibly to save energy or something), LPM was attempted,
>>> and the Linux kernel didn't awaken the sleeping threads, but issued
>>> the H_JOIN for the active threads.  Since the sleeping threads
>>> are not awake, they can not issue the expected H_JOIN, and the
>>> partition would never suspend.  This patch wakes the sleeping
>>> threads back up.
>>
>> I don't think this is the right solution.
>>
>> Just after your for loop we do an on_each_cpu() call, which sends an IPI
>> to every CPU, and that should wake all CPUs up from CEDE.
>>
>> If that's not happening then there is a bug somewhere, and we need to
>> work out where.

From Pete Heyrman:
Either sending an IPI or H_PROD will awaken a logical processor that has ceded.
When you have one logical proc doing cede and another doing prod or IPI,
you have a race condition in that the prod/IPI can precede the cede request.
If you use prod, the hypervisor takes care of the synchronization by ignoring
a cede request if it was preceded by a prod.  With IPI, the interrupt is
delivered and could then be followed by a cede, so the OS would need to provide
synchronization.

Shouldn't this answer your concerns about race conditions and the suitability
of using H_PROD?

Michael

> 
> Let me explain the scenario of the LPM case that Pete Heyrman found, and
> that Nathan F. was working upon, previously.
> 
> In the scenario, the partition has 5 dedicated processors each with 8 threads
> running.
> 
> From the PHYP data we can see that on VP 0, threads 3, 4, 5, 6 and 7 issued
> a H_CEDE requesting to save energy by putting the requesting thread into
> sleep mode.  In this state, the thread will only be awakened by H_PROD from
> another running thread or from an external user action (power off, reboot
> and such).  Timers and external interrupts are disabled in this mode.
> 
> About 3 seconds later, as part of the LPM operation, the other 35 threads
> have all issued a H_JOIN request.  Join is part of the LPM process where
> the threads suspend themselves as part of the LPM operation so the partition
> can be migrated to the target server.
> 
> So, the current state is that the OS has suspended the execution of all the
> threads in the partition without successfully suspending all threads as part
> of LPM.
> 
> Net, OS has an issue where they suspended every processor thread so nothing
> can run.
> 
> This appears to be slightly different than the previous LPM stalls we have
> seen where the migration stalls because of cpus being taken offline and not
> making the H_JOIN call.
> 
> In this scenario we appear to have CPUs that have done an H_CEDE prior to
> the LPM. For these CPUs we would need to do a H_PROD to wake them back up
> so they can do a H_JOIN and allow the LPM to continue.
> 
> The problem is that Linux has some threads that they put to sleep (probably
> to save energy or something), LPM was attempted, Linux didn't awaken the
> sleeping threads but issued the H_JOIN for the active threads.  Since the
> sleeping threads don't issue the H_JOIN the partition will never suspend.
> 
> I am checking again with Pete regarding your concerns.
> 
> Thanks.
> 
>>
>>
>>> diff --git a/arch/powerpc/include/asm/plpar_wrappers.h 
>>> b/arch/powerpc/include/asm/plpar_wrappers.h
>>> index cff5a41..8292eff 100644
>>> --- a/arch/powerpc/include/asm/plpar_wrappers.h
>>> +++ b/arch/powerpc/include/asm/plpar_wrappers.h
>>> @@ -26,10 +26,8 @@ static inline void set_cede_latency_hint(u8 latency_hint)
>>> get_lppaca()->cede_latency_hint = latency_hint;
>>>  }
>>>  
>>> -static inline long cede_processor(void)
>>> -{
>>> -   return plpar_hcall_norets(H_CEDE);
>>> -}
>>> +int cpu_is_ceded(int cpu);
>>> +long cede_processor(void);
>>>  
>>>  static inline long extended_cede_processor(unsigned long latency_hint)
>>>  {
>>> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
>>> index de35bd8f..fea3d21 100644
>>> --- a/arch/powerpc/kernel/rtas.c
>>> +++ b/arch/powerpc/kernel/rtas.c
>>> @@ -44,6 +44,7 @@
>>>  #include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>  
>>>  /* This is here deliberately so it's only used in this file */

Re: [PATCH v02] powerpc/pseries: Check for ceded CPU's during LPAR migration

2019-01-31 Thread Michael Bringmann
On 1/31/19 4:21 PM, Tyrel Datwyler wrote:
> On 01/31/2019 01:53 PM, Michael Bringmann wrote:
>> On 1/30/19 11:38 PM, Michael Ellerman wrote:
>>> Michael Bringmann  writes:
>>>> This patch is to check for ceded CPUs during LPM.  Some extreme
>>>> tests encountered a problem where Linux has put some threads to
>>>> sleep (possibly to save energy or something), LPM was attempted,
>>>> and the Linux kernel didn't awaken the sleeping threads, but issued
>>>> the H_JOIN for the active threads.  Since the sleeping threads
>>>> are not awake, they can not issue the expected H_JOIN, and the
>>>> partition would never suspend.  This patch wakes the sleeping
>>>> threads back up.
>>>
>>> I don't think this is the right solution.
>>>
>>> Just after your for loop we do an on_each_cpu() call, which sends an IPI
>>> to every CPU, and that should wake all CPUs up from CEDE.
>>>
>>> If that's not happening then there is a bug somewhere, and we need to
>>> work out where.
>>
>> Let me explain the scenario of the LPM case that Pete Heyrman found, and
>> that Nathan F. was working upon, previously.
>>
>> In the scenario, the partition has 5 dedicated processors each with 8 threads
>> running.
> 
> Do we CEDE processors when running dedicated? I thought H_CEDE was part of the
> Shared Processor LPAR option.
> 
>>
>> From the PHYP data we can see that on VP 0, threads 3, 4, 5, 6 and 7 issued
>> a H_CEDE requesting to save energy by putting the requesting thread into
>> sleep mode.  In this state, the thread will only be awakened by H_PROD from
>> another running thread or from an external user action (power off, reboot
>> and such).  Timers and external interrupts are disabled in this mode.
> 
> Not according to PAPR. A CEDE'd processor should awaken if signaled by 
> external
> interrupt such as decrementer or IPI as well.

Checking these points with Pete H.
Thanks.

> 
> -Tyrel
> 
>>
>> About 3 seconds later, as part of the LPM operation, the other 35 threads
>> have all issued a H_JOIN request.  Join is part of the LPM process where
>> the threads suspend themselves as part of the LPM operation so the partition
>> can be migrated to the target server.
>>
>> So, the current state is that the OS has suspended the execution of all the
>> threads in the partition without successfully suspending all threads as part
>> of LPM.
>>
>> Net, OS has an issue where they suspended every processor thread so nothing
>> can run.
>>
>> This appears to be slightly different than the previous LPM stalls we have
>> seen where the migration stalls because of cpus being taken offline and not
>> making the H_JOIN call.
>>
>> In this scenario we appear to have CPUs that have done an H_CEDE prior to
>> the LPM. For these CPUs we would need to do a H_PROD to wake them back up
>> so they can do a H_JOIN and allow the LPM to continue.
>>
>> The problem is that Linux has some threads that they put to sleep (probably
>> to save energy or something), LPM was attempted, Linux didn't awaken the
>> sleeping threads but issued the H_JOIN for the active threads.  Since the
>> sleeping threads don't issue the H_JOIN the partition will never suspend.
>>
>> I am checking again with Pete regarding your concerns.
>>
>> Thanks.
>>
>>>
>>>
>>>> diff --git a/arch/powerpc/include/asm/plpar_wrappers.h 
>>>> b/arch/powerpc/include/asm/plpar_wrappers.h
>>>> index cff5a41..8292eff 100644
>>>> --- a/arch/powerpc/include/asm/plpar_wrappers.h
>>>> +++ b/arch/powerpc/include/asm/plpar_wrappers.h
>>>> @@ -26,10 +26,8 @@ static inline void set_cede_latency_hint(u8 
>>>> latency_hint)
>>>>get_lppaca()->cede_latency_hint = latency_hint;
>>>>  }
>>>>  
>>>> -static inline long cede_processor(void)
>>>> -{
>>>> -  return plpar_hcall_norets(H_CEDE);
>>>> -}
>>>> +int cpu_is_ceded(int cpu);
>>>> +long cede_processor(void);
>>>>  
>>>>  static inline long extended_cede_processor(unsigned long latency_hint)
>>>>  {
>>>> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
>>>> index de35bd8f..fea3d21 100644
>>>> --- a/arch/powerpc/kernel/rtas.c
>>>> +++ b/arch/powerpc/kernel/rtas.c
>>>> @@ -44,6 +44,7 @@
>>>>  #include 
>>>>  #include 
>>>>  #include 

Re: [PATCH v02] powerpc/pseries: Check for ceded CPU's during LPAR migration

2019-01-31 Thread Michael Bringmann
On 1/30/19 11:38 PM, Michael Ellerman wrote:
> Michael Bringmann  writes:
>> This patch is to check for ceded CPUs during LPM.  Some extreme
>> tests encountered a problem where Linux has put some threads to
>> sleep (possibly to save energy or something), LPM was attempted,
>> and the Linux kernel didn't awaken the sleeping threads, but issued
>> the H_JOIN for the active threads.  Since the sleeping threads
>> are not awake, they can not issue the expected H_JOIN, and the
>> partition would never suspend.  This patch wakes the sleeping
>> threads back up.
> 
> I don't think this is the right solution.
> 
> Just after your for loop we do an on_each_cpu() call, which sends an IPI
> to every CPU, and that should wake all CPUs up from CEDE.
> 
> If that's not happening then there is a bug somewhere, and we need to
> work out where.

Let me explain the scenario of the LPM case that Pete Heyrman found, and
that Nathan F. was working upon, previously.

In the scenario, the partition has 5 dedicated processors each with 8 threads
running.

From the PHYP data we can see that on VP 0, threads 3, 4, 5, 6 and 7 issued
a H_CEDE requesting to save energy by putting the requesting thread into
sleep mode.  In this state, the thread will only be awakened by H_PROD from
another running thread or from an external user action (power off, reboot
and such).  Timers and external interrupts are disabled in this mode.

About 3 seconds later, as part of the LPM operation, the other 35 threads
have all issued a H_JOIN request.  Join is part of the LPM process where
the threads suspend themselves as part of the LPM operation so the partition
can be migrated to the target server.

So, the current state is that the OS has suspended the execution of all the
threads in the partition without successfully suspending all threads as part
of LPM.

Net, OS has an issue where they suspended every processor thread so nothing
can run.

This appears to be slightly different than the previous LPM stalls we have
seen where the migration stalls because of cpus being taken offline and not
making the H_JOIN call.

In this scenario we appear to have CPUs that have done an H_CEDE prior to
the LPM. For these CPUs we would need to do a H_PROD to wake them back up
so they can do a H_JOIN and allow the LPM to continue.

The problem is that Linux has some threads that they put to sleep (probably
to save energy or something), LPM was attempted, Linux didn't awaken the
sleeping threads but issued the H_JOIN for the active threads.  Since the
sleeping threads don't issue the H_JOIN the partition will never suspend.

I am checking again with Pete regarding your concerns.

Thanks.

> 
> 
>> diff --git a/arch/powerpc/include/asm/plpar_wrappers.h 
>> b/arch/powerpc/include/asm/plpar_wrappers.h
>> index cff5a41..8292eff 100644
>> --- a/arch/powerpc/include/asm/plpar_wrappers.h
>> +++ b/arch/powerpc/include/asm/plpar_wrappers.h
>> @@ -26,10 +26,8 @@ static inline void set_cede_latency_hint(u8 latency_hint)
>>  get_lppaca()->cede_latency_hint = latency_hint;
>>  }
>>  
>> -static inline long cede_processor(void)
>> -{
>> -return plpar_hcall_norets(H_CEDE);
>> -}
>> +int cpu_is_ceded(int cpu);
>> +long cede_processor(void);
>>  
>>  static inline long extended_cede_processor(unsigned long latency_hint)
>>  {
>> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
>> index de35bd8f..fea3d21 100644
>> --- a/arch/powerpc/kernel/rtas.c
>> +++ b/arch/powerpc/kernel/rtas.c
>> @@ -44,6 +44,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
>>  /* This is here deliberately so it's only used in this file */
>>  void enter_rtas(unsigned long);
>> @@ -942,7 +943,7 @@ int rtas_ibm_suspend_me(u64 handle)
>>  struct rtas_suspend_me_data data;
>>  DECLARE_COMPLETION_ONSTACK(done);
>>  cpumask_var_t offline_mask;
>> -int cpuret;
>> +int cpuret, cpu;
>>  
>>  if (!rtas_service_present("ibm,suspend-me"))
>>  return -ENOSYS;
>> @@ -991,6 +992,11 @@ int rtas_ibm_suspend_me(u64 handle)
>>  goto out_hotplug_enable;
>>  }
>>  
>> +for_each_present_cpu(cpu) {
>> +if (cpu_is_ceded(cpu))
>> +plpar_hcall_norets(H_PROD, 
>> get_hard_smp_processor_id(cpu));
>> +}
> 
> There's a race condition here, there's nothing to prevent the CPUs you
> just PROD'ed from going back into CEDE before you do the on_each_cpu()
> call below> 
>>  /* Call function on all CPUs.  One of us will make the
>>   * rtas call
>>   */
>> diff --git a/arch/powerpc

[PATCH v02] powerpc/pseries: Check for ceded CPU's during LPAR migration

2019-01-30 Thread Michael Bringmann
This patch is to check for ceded CPUs during LPM.  Some extreme
tests encountered a problem where Linux has put some threads to
sleep (possibly to save energy or something), LPM was attempted,
and the Linux kernel didn't awaken the sleeping threads, but issued
the H_JOIN for the active threads.  Since the sleeping threads
are not awake, they can not issue the expected H_JOIN, and the
partition would never suspend.  This patch wakes the sleeping
threads back up.

Signed-off-by: Nathan Fontenot 
Signed-off-by: Gustavo Walbon 
---
Changes in v02:
   -- Rebase to latest powerpc kernel source.
---
 arch/powerpc/include/asm/plpar_wrappers.h |6 ++
 arch/powerpc/kernel/rtas.c|8 +++-
 arch/powerpc/platforms/pseries/setup.c|   18 ++
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/plpar_wrappers.h 
b/arch/powerpc/include/asm/plpar_wrappers.h
index cff5a41..8292eff 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -26,10 +26,8 @@ static inline void set_cede_latency_hint(u8 latency_hint)
get_lppaca()->cede_latency_hint = latency_hint;
 }
 
-static inline long cede_processor(void)
-{
-   return plpar_hcall_norets(H_CEDE);
-}
+int cpu_is_ceded(int cpu);
+long cede_processor(void);
 
 static inline long extended_cede_processor(unsigned long latency_hint)
 {
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index de35bd8f..fea3d21 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* This is here deliberately so it's only used in this file */
 void enter_rtas(unsigned long);
@@ -942,7 +943,7 @@ int rtas_ibm_suspend_me(u64 handle)
struct rtas_suspend_me_data data;
DECLARE_COMPLETION_ONSTACK(done);
cpumask_var_t offline_mask;
-   int cpuret;
+   int cpuret, cpu;
 
if (!rtas_service_present("ibm,suspend-me"))
return -ENOSYS;
@@ -991,6 +992,11 @@ int rtas_ibm_suspend_me(u64 handle)
goto out_hotplug_enable;
}
 
+   for_each_present_cpu(cpu) {
+   if (cpu_is_ceded(cpu))
+   plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu));
+   }
+
/* Call function on all CPUs.  One of us will make the
 * rtas call
 */
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index 41f62ca2..48ae6d4 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -331,6 +331,24 @@ static int alloc_dispatch_log_kmem_cache(void)
 }
 machine_early_initcall(pseries, alloc_dispatch_log_kmem_cache);
 
+static DEFINE_PER_CPU(int, cpu_ceded);
+
+int cpu_is_ceded(int cpu)
+{
+   return per_cpu(cpu_ceded, cpu);
+}
+
+long cede_processor(void)
+{
+   long rc;
+
+   per_cpu(cpu_ceded, raw_smp_processor_id()) = 1;
+   rc = plpar_hcall_norets(H_CEDE);
+   per_cpu(cpu_ceded, raw_smp_processor_id()) = 0;
+
+   return rc;
+}
+
 static void pseries_lpar_idle(void)
 {
/*



[REPOST PATCH v07 5/5] migration/memory: Support 'ibm,dynamic-memory-v2'

2019-01-29 Thread Michael Bringmann
migration/memory: This patch adds recognition for changes to the
associativity of memory blocks described by 'ibm,dynamic-memory-v2'.
If the associativity of an LMB has changed, it should be readded to
the system in order to update local and general kernel data structures.
This patch builds upon previous enhancements that scan the device-tree
"ibm,dynamic-memory" properties using the base LMB array, and a copy
derived from the updated properties.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index f7a40f4..23f5655 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -1184,7 +1184,8 @@ static int pseries_memory_notifier(struct notifier_block 
*nb,
err = pseries_remove_mem_node(rd->dn);
break;
case OF_RECONFIG_UPDATE_PROPERTY:
-   if (!strcmp(rd->prop->name, "ibm,dynamic-memory")) {
+   if (!strcmp(rd->prop->name, "ibm,dynamic-memory") ||
+   !strcmp(rd->prop->name, "ibm,dynamic-memory-v2")) {
struct drmem_lmb_info *dinfo =
drmem_lmbs_init(rd->prop);
if (!dinfo)



[REPOST PATCH v07 4/5] migration/memory: Evaluate LMB assoc changes

2019-01-29 Thread Michael Bringmann
migration/memory: This patch adds code that recognizes changes to
the associativity of memory blocks described by the device-tree
properties in order to drive equivalent 'hotplug' operations to
update local and general kernel data structures to reflect those
changes.  These differences may include:

* Evaluate 'ibm,dynamic-memory' properties when processing the
  updated device-tree properties of the system during Post Migration
  events (migration_store).  The new functionality looks for changes
  to the aa_index values for each drc_index/LMB to identify any memory
  blocks that should be readded.

* In an LPAR migration scenario, the "ibm,associativity-lookup-arrays"
  property may change.  In the event that a row of the array differs,
  locate all assigned memory blocks with that 'aa_index' and 're-add'
  them to the system memory block data structures.  In the process of
  the 're-add', the system routines will update the corresponding entry
  for the memory in the LMB structures and any other relevant kernel
  data structures.

A number of previous extensions made to the DRMEM code for scanning
device-tree properties and creating LMB arrays are used here to
ensure that the resulting code is simpler and more usable:

* Use new paired list iterator for the DRMEM LMB info arrays to find
  differences in old and new versions of properties.
* Use new iterator for copies of the DRMEM info arrays to evaluate
  completely new structures.
* Combine common code for parsing and evaluating memory description
  properties based on the DRMEM LMB array model to greatly simplify
  extension from the older property 'ibm,dynamic-memory' to the new
  property model of 'ibm,dynamic-memory-v2'.

For support, add a new pseries hotplug action for DLPAR operations,
PSERIES_HP_ELOG_ACTION_READD_MULTIPLE.  It is a variant of the READD
operation which performs the action upon multiple instances of the
resource at one time.  The operation is to be triggered by device-tree
analysis of updates by RTAS events analyzed by 'migration_store' during
post-migration processing.  It will be used for memory updates,
initially.

Signed-off-by: Michael Bringmann 
---
Changes in v07:
  -- Ensure return value from dlpar_memory_readd_multiple
Changes in v06:
  -- Rebase to powerpc next branch to account for recent code changes.
  -- Fix prototype problem when CONFIG_MEMORY_HOTPLUG not defined.
Changes in v05:
  -- Move common structure from numa.c + hotplug-memory.c to header file.
  -- Clarify some comments.
  -- Use walk_drmem_lmbs_pairs and callback instead of local loop
Changes in v04:
  -- Move dlpar_memory_readd_multiple() function definition and use
 into previous patch along with action constant definition.
  -- Correct spacing in patch
Changes in v03:
  -- Modify the code that parses the memory affinity attributes to
 mark relevant DRMEM LMB array entries using the internal_flags
 mechanism instead of generating unique hotplug actions for each
 memory block to be readded.  The change is intended to both
 simplify the code, and to require fewer resources on systems
 with huge amounts of memory.
  -- Save up notices about all affected LMB entries until the end of the
 'migration_store' operation at which point a single action is
 queued to scan the entire DRMEM array.
  -- Add READD_MULTIPLE function for memory that scans the DRMEM
 array to identify multiple entries that were marked previously.
 The corresponding memory blocks are to be readded to the system
 to update relevant data structures outside of the powerpc-
 specific code.
  -- Change dlpar_memory_pmt_changes_action to directly queue worker
 to pseries work queue.
---
 arch/powerpc/include/asm/topology.h |7 +
 arch/powerpc/mm/numa.c  |6 -
 arch/powerpc/platforms/pseries/hotplug-memory.c |  209 +++
 arch/powerpc/platforms/pseries/mobility.c   |3 
 arch/powerpc/platforms/pseries/pseries.h|8 +
 5 files changed, 187 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index a4a718d..fbe03df 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -135,5 +135,12 @@ static inline void shared_proc_topology_init(void) {}
 #endif
 #endif
 
+
+struct assoc_arrays {
+   u32 n_arrays;
+   u32 array_sz;
+   const __be32 *arrays;
+};
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 693ae1c..f1e7287 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -368,12 +368,6 @@ static unsigned long read_n_cells(int n, const __be32 
**buf)
return result;
 }
 
-struct assoc_arrays {
-   u32 n_arrays;
-   u32 array_sz;
-   const __be32 *arrays;
-};
-
 /*
  * Retrieve and validate the list of associativity arrays for drconf
  * m

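For reference, a sketch (assumed usage, not from the patch) of how the helpers
declared in patches 1/5 and 2/5 below might be combined for the post-migration
comparison described above; pmt_changes_cb() and pmt_changes() are
hypothetical names.

/* Hypothetical callback: compare the current LMB against its counterpart in
 * the updated copy and mark it for a later READD_MULTIPLE pass. */
static int pmt_changes_cb(struct drmem_lmb *cur, struct drmem_lmb *upd,
			  void *data)
{
	bool *changed = data;

	if ((cur->flags & DRMEM_LMB_ASSIGNED) && cur->aa_index != upd->aa_index) {
		drmem_mark_lmb_update(cur);
		*changed = true;
	}
	return 0;
}

/* Hypothetical caller: build a second LMB array from the updated property
 * and walk the two arrays in lock-step. */
static bool pmt_changes(struct property *prop)
{
	struct drmem_lmb_info *dinfo = drmem_lmbs_init(prop);
	bool changed = false;

	if (dinfo) {
		walk_drmem_lmbs_pairs(dinfo, pmt_changes_cb, &changed);
		drmem_lmbs_free(dinfo);
	}
	return changed;
}
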
[REPOST PATCH v07 3/5] migration/memory: Add hotplug READD_MULTIPLE

2019-01-29 Thread Michael Bringmann
migration/memory: This patch adds a new pseries hotplug action
for CPU and memory operations, PSERIES_HP_ELOG_ACTION_READD_MULTIPLE.
This is a variant of the READD operation which performs the action
upon multiple instances of the resource at one time.  The operation
is to be triggered by device-tree analysis of updates by RTAS events
analyzed by 'migration_store' during post-migration processing.  It
will be used for memory updates, initially.

Signed-off-by: Michael Bringmann 
---
Changes in v05:
  -- Provide dlpar_memory_readd_helper routine to compress some common code
Changes in v04:
  -- Move init of 'lmb->internal_flags' in init_drmem_v2_lmbs to
 previous patch.
  -- Pull in implementation of dlpar_memory_readd_multiple() to go
 with operation flag.
---
 arch/powerpc/include/asm/rtas.h |1 +
 arch/powerpc/platforms/pseries/hotplug-memory.c |   44 ---
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 0183e95..cc00451 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -333,6 +333,7 @@ struct pseries_hp_errorlog {
 #define PSERIES_HP_ELOG_ACTION_ADD 1
 #define PSERIES_HP_ELOG_ACTION_REMOVE  2
 #define PSERIES_HP_ELOG_ACTION_READD   3
+#define PSERIES_HP_ELOG_ACTION_READD_MULTIPLE  4
 
 #define PSERIES_HP_ELOG_ID_DRC_NAME1
 #define PSERIES_HP_ELOG_ID_DRC_INDEX   2
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 2b796da..9c76345 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -507,6 +507,19 @@ static int dlpar_memory_remove_by_index(u32 drc_index)
return rc;
 }
 
+static int dlpar_memory_readd_helper(struct drmem_lmb *lmb)
+{
+   int rc;
+
+   rc = dlpar_remove_lmb(lmb);
+   if (!rc) {
+   rc = dlpar_add_lmb(lmb);
+   if (rc)
+   dlpar_release_drc(lmb->drc_index);
+   }
+   return rc;
+}
+
 static int dlpar_memory_readd_by_index(u32 drc_index)
 {
struct drmem_lmb *lmb;
@@ -519,12 +532,7 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
for_each_drmem_lmb(lmb) {
if (lmb->drc_index == drc_index) {
lmb_found = 1;
-   rc = dlpar_remove_lmb(lmb);
-   if (!rc) {
-   rc = dlpar_add_lmb(lmb);
-   if (rc)
-   dlpar_release_drc(lmb->drc_index);
-   }
+   rc = dlpar_memory_readd_helper(lmb);
break;
}
}
@@ -541,6 +549,23 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
return rc;
 }
 
+static int dlpar_memory_readd_multiple(void)
+{
+   struct drmem_lmb *lmb;
+   int rc = 0;
+
+   pr_info("Attempting to update multiple LMBs\n");
+
+   for_each_drmem_lmb(lmb) {
+   if (drmem_lmb_update(lmb)) {
+   rc = dlpar_memory_readd_helper(lmb);
+   drmem_remove_lmb_update(lmb);
+   }
+   }
+
+   return rc;
+}
+
 static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 {
struct drmem_lmb *lmb, *start_lmb, *end_lmb;
@@ -641,6 +666,10 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
 {
return -EOPNOTSUPP;
 }
+static int dlpar_memory_readd_multiple(void)
+{
+   return -EOPNOTSUPP;
+}
 
 static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 {
@@ -918,6 +947,9 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_readd_by_index(drc_index);
break;
+   case PSERIES_HP_ELOG_ACTION_READD_MULTIPLE:
+   rc = dlpar_memory_readd_multiple();
+   break;
default:
pr_err("Invalid action (%d) specified\n", hp_elog->action);
rc = -EINVAL;



[REPOST PATCH v07 2/5] powerpc/drmem: Add internal_flags feature

2019-01-29 Thread Michael Bringmann
powerpc/drmem: Add internal_flags field to each LMB to allow
marking of kernel software-specific operations that need not
be exported to other users.  For instance, if information about
selected LMBs needs to be maintained for subsequent passes
through the system, it can be encoded into the LMB array itself
without requiring the allocation and maintenance of additional
data structures.

Signed-off-by: Michael Bringmann 
---
Changes in v04:
  -- Add another initialization of 'lmb->internal_flags' to
 init_drmem_v2_lmbs.
---
 arch/powerpc/include/asm/drmem.h |   18 ++
 arch/powerpc/mm/drmem.c  |3 +++
 2 files changed, 21 insertions(+)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index cfe8598..dbb3e6c 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -17,6 +17,7 @@ struct drmem_lmb {
u32 drc_index;
u32 aa_index;
u32 flags;
+   u32 internal_flags;
 };
 
 struct drmem_lmb_info {
@@ -94,6 +95,23 @@ static inline bool drmem_lmb_reserved(struct drmem_lmb *lmb)
return lmb->flags & DRMEM_LMB_RESERVED;
 }
 
+#define DRMEM_LMBINT_UPDATE0x0001
+
+static inline void drmem_mark_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags |= DRMEM_LMBINT_UPDATE;
+}
+
+static inline void drmem_remove_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags &= ~DRMEM_LMBINT_UPDATE;
+}
+
+static inline bool drmem_lmb_update(struct drmem_lmb *lmb)
+{
+   return lmb->internal_flags & DRMEM_LMBINT_UPDATE;
+}
+
 u64 drmem_lmb_memory_max(void);
 void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index ded9dbf..f199fe5 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -207,6 +207,7 @@ static void read_drconf_v1_cell(struct drmem_lmb *lmb,
 
lmb->aa_index = of_read_number(p++, 1);
lmb->flags = of_read_number(p++, 1);
+   lmb->internal_flags = 0;
 
*prop = p;
 }
@@ -265,6 +266,7 @@ static void __walk_drmem_v2_lmbs(const __be32 *prop, const 
__be32 *usm,
 
lmb.aa_index = dr_cell.aa_index;
lmb.flags = dr_cell.flags;
+   lmb.internal_flags = 0;
 
			func(&lmb, &usm);
}
@@ -441,6 +443,7 @@ static void init_drmem_v2_lmbs(const __be32 *prop,
 
lmb->aa_index = dr_cell.aa_index;
lmb->flags = dr_cell.flags;
+   lmb->internal_flags = 0;
}
}
 }



[REPOST PATCH v07 1/5] powerpc/drmem: Export 'dynamic-memory' loader

2019-01-29 Thread Michael Bringmann
powerpc/drmem: Export many of the functions of DRMEM to parse
"ibm,dynamic-memory" and "ibm,dynamic-memory-v2" during hotplug
operations and for Post Migration events.

Also modify the DRMEM initialization code to allow it to,

* Be called after system initialization
* Provide a separate user copy of the LMB array that it produces
* Free the user copy upon request

In addition, a couple of changes were made to make the creation
of additional copies of the LMB array more useful including,

* Add iterator function to work through a pair of drmem_info arrays
  with a callback function to apply specific tests (see the sketch
  below).
* Modify DRMEM code to replace usages of dt_root_addr_cells, and
  dt_mem_next_cell, as these are only available at first boot.
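
A rough sketch of how a caller might combine the new helpers (the
callback body, and its use of the internal_flags helper added
elsewhere in this series, are illustrative only):

static int assoc_changed_cb(struct drmem_lmb *cnt, struct drmem_lmb *oth,
			    void *data)
{
	/* tag the current LMB when its associativity differs from the copy */
	if (cnt->aa_index != oth->aa_index)
		drmem_mark_lmb_update(cnt);
	return 0;
}

static void check_lmb_assoc(struct property *prop)
{
	struct drmem_lmb_info *dinfo;

	dinfo = drmem_lmbs_init(prop);
	if (!dinfo)
		return;

	walk_drmem_lmbs_pairs(dinfo, assoc_changed_cb, NULL);
	drmem_lmbs_free(dinfo);
}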

Signed-off-by: Michael Bringmann 
---
Changes in v05:
  -- Add walk_drmem_lmbs_pairs to replace macro for_each_pair_lmb
---
 arch/powerpc/include/asm/drmem.h |   13 +
 arch/powerpc/mm/drmem.c  |   96 ++
 2 files changed, 89 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 7c1d8e7..cfe8598 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -35,6 +35,11 @@ struct drmem_lmb_info {
		&drmem_info->lmbs[0],   \
		&drmem_info->lmbs[drmem_info->n_lmbs - 1])
 
+#define for_each_dinfo_lmb(dinfo, lmb) \
+   for_each_drmem_lmb_in_range((lmb),  \
+   &dinfo->lmbs[0],\
+   &dinfo->lmbs[dinfo->n_lmbs - 1])
+
 /*
  * The of_drconf_cell_v1 struct defines the layout of the LMB data
  * specified in the ibm,dynamic-memory device tree property.
@@ -94,6 +99,14 @@ void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
 int drmem_update_dt(void);
 
+struct drmem_lmb_info *drmem_lmbs_init(struct property *prop);
+void drmem_lmbs_free(struct drmem_lmb_info *dinfo);
+int walk_drmem_lmbs_pairs(struct drmem_lmb_info *dinfo_oth,
+ int (*func)(struct drmem_lmb *cnt,
+   struct drmem_lmb *oth,
+   void *data),
+ void *data);
+
 #ifdef CONFIG_PPC_PSERIES
 void __init walk_drmem_lmbs_early(unsigned long node,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 3f18036..ded9dbf 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -20,6 +20,7 @@
 
 static struct drmem_lmb_info __drmem_info;
 struct drmem_lmb_info *drmem_info = &__drmem_info;
+static int n_root_addr_cells;
 
 u64 drmem_lmb_memory_max(void)
 {
@@ -193,12 +194,13 @@ int drmem_update_dt(void)
return rc;
 }
 
-static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
+static void read_drconf_v1_cell(struct drmem_lmb *lmb,
   const __be32 **prop)
 {
const __be32 *p = *prop;
 
-   lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+   lmb->base_addr = of_read_number(p, n_root_addr_cells);
+   p += n_root_addr_cells;
lmb->drc_index = of_read_number(p++, 1);
 
p++; /* skip reserved field */
@@ -209,7 +211,7 @@ static void __init read_drconf_v1_cell(struct drmem_lmb 
*lmb,
*prop = p;
 }
 
-static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
 {
struct drmem_lmb lmb;
@@ -225,13 +227,14 @@ static void __init __walk_drmem_v1_lmbs(const __be32 
*prop, const __be32 *usm,
}
 }
 
-static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
+static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
   const __be32 **prop)
 {
const __be32 *p = *prop;
 
dr_cell->seq_lmbs = of_read_number(p++, 1);
-   dr_cell->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+   dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
+   p += n_root_addr_cells;
dr_cell->drc_index = of_read_number(p++, 1);
dr_cell->aa_index = of_read_number(p++, 1);
dr_cell->flags = of_read_number(p++, 1);
@@ -239,7 +242,7 @@ static void __init read_drconf_v2_cell(struct 
of_drconf_cell_v2 *dr_cell,
*prop = p;
 }
 
-static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
 {
struct of_drconf_cell_v2 dr_cell;
@@ -275,6 +278,9 @@ void __init walk_drmem_lmbs_early(

[REPOST PATCH v07 0/5] powerpc/migration: Affinity fix for memory

2019-01-29 Thread Michael Bringmann
The migration of LPARs across Power systems affects many attributes
including that of the associativity of memory blocks.  The patches
in this set execute when a system is coming up fresh upon a migration
target.  They are intended to,

* Recognize changes to the associativity of memory recorded in
  internal data structures when compared to the latest copies in
  the device tree (e.g. ibm,dynamic-memory, ibm,dynamic-memory-v2).
* Recognize changes to the associativity mapping (e.g. ibm,
  associativity-lookup-arrays), locate all assigned memory blocks
  corresponding to each changed row, and readd all such blocks.
* Generate calls to other code layers to reset the data structures
  related to associativity of memory.
* Re-register the 'changed' entities into the target system.
  Re-registration of memory blocks mostly entails acting as if they
  have been newly hot-added into the target system.

This code builds upon features introduced in a previous patch set
that updates CPUs for affinity changes that may occur during LPM.

Signed-off-by: Michael Bringmann 

Michael Bringmann (5):
  powerpc/drmem: Export 'dynamic-memory' loader
  powerpc/drmem: Add internal_flags feature
  migration/memory: Add hotplug flags READD_MULTIPLE
  migration/memory: Evaluate LMB assoc changes
  migration/memory: Support 'ibm,dynamic-memory-v2'
---
Changes in v07:
  -- Provide more useful return value from dlpar_memory_readd_multiple
Changes in v06:
  -- Rebase to powerpc next branch to account for recent code changes.
  -- Fix prototype problem when CONFIG_MEMORY_HOTPLUG not defined.
Changes in v05:
  -- Add walk_drmem_lmbs_pairs to replace macro for_each_pair_lmb
  -- Use walk_drmem_lmbs_pairs and callback instead of local loop
  -- Provide dlpar_memory_readd_helper routine to compress some common code
  -- Move common structure from numa.c + hotplug-memory.c to header file.
  -- Clarify some comments.
Changes in v04:
  -- Move dlpar_memory_readd_multiple() to patch with new ACTION
 constant.
  -- Move init of 'lmb->internal_flags' in init_drmem_v2_lmbs to
 patch with other references to flag.
  -- Correct spacing in one of the patches
Changes in v03:
  -- Change operation to tag changed LMBs in DRMEM array instead of
 queuing a potentially huge number of structures.
  -- Added another hotplug queue event for CPU/memory operations
  -- Added internal_flags feature to DRMEM
  -- Improve the patch description language for the patch set.
  -- Revise patch set to queue worker for memory association
 updates directly to pseries worker queue.



Re: [RFC 1/6] powerpc:/drc Define interface to acquire arch-specific drc info

2019-01-29 Thread Michael Bringmann
On 1/29/19 3:31 AM, Michael Ellerman wrote:
> Tyrel Datwyler  writes:
>> On 12/14/2018 12:50 PM, Michael Bringmann wrote:
>>> Define interface to acquire arch-specific drc info to match against
>>> hotpluggable devices.  The current implementation exposes several
>>> pseries-specific dynamic memory properties in generic kernel code.
>>> This patch set provides an interface to pull that code out of the
>>> generic kernel.
>>>
>>> Signed-off-by: Michael Bringmann 
>>> ---
>>>  include/linux/topology.h |9 +
>>>  1 file changed, 9 insertions(+)
>>>
>>> diff --git a/include/linux/topology.h b/include/linux/topology.h
>>> index cb0775e..df97f5f 100644
>>> --- a/include/linux/topology.h
>>> +++ b/include/linux/topology.h
>>> @@ -44,6 +44,15 @@
>>
>> As far as I know pseries is the only platform that uses DR connectors, and I
>> highly doubt that any other powerpc platform or arch ever will. So, I'm not 
>> sure
>> that this is really generic enough to belong in topology.h.
> 
> Right. This does not belong in include/linux.
> 
>> If anything I would
>> suggest putting this in an include in arch/powerpc/include/ named something 
>> like
>> drcinfo.h or pseries-drc.h. That will make it visible to modules like rpaphp
>> that want/need to use this functionality.
> 
> Yeah that would make more sense.

If you see no objection to referencing a powerpc-specific function from
the code ...

> 
> Using "arch" in the name is wrong, it's pseries specific so
> pseries_find_drc_match() would be more appropriate.
> 
>>> +int arch_find_drc_match(struct device_node *dn,
>>> +   bool (*usercb)(struct device_node *dn,
>>> +   u32 drc_index, char *drc_name,
>>> +   char *drc_type, u32 drc_power_domain,
>>> +   void *data),
>>> +   char *opt_drc_type, char *opt_drc_name,
>>> +   bool match_drc_index, bool ck_php_type,
>>> +   void *data);
> 
> This function signature is kind of insane.
> 
> You end with calls like:
> 
> + return arch_find_drc_match(dn, rpaphp_add_slot_cb,
> + NULL, NULL, false, true, NULL);
> 
> Which is impossible to parse.
> 
> I feel like maybe this isn't the right level of abstraction.

...
I had already been considering simplifying the interface for these
calls to something like the following:

int rpaphp_check_drc_props(struct device_node *dn, char *drc_name,
char *drc_type)
{
return pseries_find_drc_match(dn, drc_type, drc_name);
}
...
int rpaphp_add_slot(struct device_node *dn)
{
   if (!dn->name || strcmp(dn->name, "pci"))
   return 0;

   return pseries_add_drc_slot(dn, rpaphp_add_slot_cb);
}
...

Further details would be hidden within the pseries code.


> 
> cheers

Regards

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [PATCH] powerpc/pseries: Perform full re-add of CPU for topology update post-migration

2019-01-29 Thread Michael Bringmann



On 1/29/19 3:37 AM, Michael Ellerman wrote:
> Michael Bringmann  writes:
> 
>> On 10/29/18 1:43 PM, Nathan Fontenot wrote:
>>> On pseries systems, performing a partition migration can result in
>>> altering the nodes a CPU is assigned to on the destination system. For
>>> example, pre-migration on the source system CPUs are in nodes 1 and 3,
>>> post-migration on the destination system CPUs are in nodes 2 and 3.
>>>
>>> Handling the node change for a CPU can cause corruption in the slab
>>> cache if we hit a timing where a CPU's node is changed while cache_reap()
>>> is invoked. The corruption occurs because the slab cache code appears
>>> to rely on the CPU and slab cache pages being on the same node.
>>>
>>> The current dynamic updating of a CPU's node done in arch/powerpc/mm/numa.c
>>> does not prevent us from hitting this scenario.
>>>
>>> Changing the device tree property update notification handler that
>>> recognizes an affinity change for a CPU to do a full DLPAR remove and
>>> add of the CPU instead of dynamically changing its node resolves this
>>> issue.
>>>
>>> Signed-off-by: Nathan Fontenot 
>> Signed-off-by: Michael W. Bringmann 

Tested-by: Michael W. Bringmann 

> 
> Are you sure that's what you meant? ie. you wrote some of the patch?
> 
> What I'd like is to get a Tested-by from you.
> 
> cheers
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [RFC 1/6] powerpc:/drc Define interface to acquire arch-specific drc info

2019-01-28 Thread Michael Bringmann
On 1/25/19 10:09 AM, Michael Bringmann wrote:
> Adding Nathan Lynch
> 
> On 1/24/19 6:04 PM, Tyrel Datwyler wrote:
>> On 12/14/2018 12:50 PM, Michael Bringmann wrote:
>>> Define interface to acquire arch-specific drc info to match against
>>> hotpluggable devices.  The current implementation exposes several
>>> pseries-specific dynamic memory properties in generic kernel code.
>>> This patch set provides an interface to pull that code out of the
>>> generic kernel.
>>>
>>> Signed-off-by: Michael Bringmann 
>>> ---
>>>  include/linux/topology.h |9 +
>>>  1 file changed, 9 insertions(+)
>>>
>>> diff --git a/include/linux/topology.h b/include/linux/topology.h
>>> index cb0775e..df97f5f 100644
>>> --- a/include/linux/topology.h
>>> +++ b/include/linux/topology.h
>>> @@ -44,6 +44,15 @@
>>
>> As far as I know pseries is the only platform that uses DR connectors, and I
>> highly doubt that any other powerpc platform or arch ever will. So, I'm not 
>> sure
>> that this is really generic enough to belong in topology.h. If anything I 
>> would
>> suggest putting this in an include in arch/powerpc/include/ named something 
>> like
>> drcinfo.h or pseries-drc.h. That will make it visible to modules like rpaphp
>> that want/need to use this functionality.

It looks like the 'rpaphp' and 'rpadlpar_io' modules are also dependent upon the
powerpc platform.  Shouldn't the relevant source files be moved completely to 
the
powerpc-specific directories out of drivers/pci/hotplug as well?

drivers/pci/hotplug/Kconfig has:

config HOTPLUG_PCI_RPA
tristate "RPA PCI Hotplug driver"
depends on PPC_PSERIES && EEH
help
  Say Y here if you have a RPA system that supports PCI Hotplug.

  To compile this driver as a module, choose M here: the
  module will be called rpaphp.

  When in doubt, say N.

config HOTPLUG_PCI_RPA_DLPAR
tristate "RPA Dynamic Logical Partitioning for I/O slots"
depends on HOTPLUG_PCI_RPA
help
  Say Y here if your system supports Dynamic Logical Partitioning
  for I/O slots.

  To compile this driver as a module, choose M here: the
  module will be called rpadlpar_io.

  When in doubt, say N.

Michael

>>
>> -Tyrel
>>
>>>  
>>>  int arch_update_cpu_topology(void);
>>>  
>>> +int arch_find_drc_match(struct device_node *dn,
>>> +   bool (*usercb)(struct device_node *dn,
>>> +   u32 drc_index, char *drc_name,
>>> +   char *drc_type, u32 drc_power_domain,
>>> +   void *data),
>>> +   char *opt_drc_type, char *opt_drc_name,
>>> +   bool match_drc_index, bool ck_php_type,
>>> +   void *data);
>>> +
>>>  /* Conform to ACPI 2.0 SLIT distance definitions */
>>>  #define LOCAL_DISTANCE 10
>>>  #define REMOTE_DISTANCE20
>>>
>>
>>
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [REPOST PATCH v08 0/5] powerpc/hotplug: Update affinity for migrated CPUs

2019-01-28 Thread Michael Bringmann
On 8/21/18 10:33 AM, m...@linux.vnet.ibm.com wrote:
> The migration of LPARs across Power systems affects many attributes
> including that of the associativity of CPUs.  The patches in this
> set execute when a system is coming up fresh upon a migration target.
> They are intended to,
> 
> * Recognize changes to the associativity of CPUs recorded in internal
>   data structures when compared to the latest copies in the device tree.
> * Generate calls to other code layers to reset the data structures
>   related to associativity of the CPUs.
> * Re-register the 'changed' entities into the target system.
>   Re-registration of CPUs mostly entails acting as if they have been
>   newly hot-added into the target system.
> 
> Signed-off-by: Michael Bringmann 

Retract this series in preference to
 [PATCH] powerpc/pseries: Perform full re-add of CPU for topology update 
post-migration

Michael

> 
> Michael Bringmann (5):
>   hotplug/cpu: Conditionally acquire/release DRC index
>   hotplug/cpu: Add operation queuing function
>   hotplug/cpu: Provide CPU readd operation
>   mobility/numa: Ensure numa update does not overlap
>   hotplug/pmt: Update topology after PMT
> ---
> Changes in patch:
>   -- Restructure and rearrange content of patches to co-locate
>  similar or related modifications
>   -- Rename pseries_update_drconf_cpu to pseries_update_processor
>   -- Simplify code to update CPU nodes during mobility checks.
>  Remove functions to generate extra HP_ELOG messages in favor
>  of direct function calls to dlpar_cpu_readd_by_index.
>   -- Revise code order in dlpar_cpu_readd_by_index() to present
>  more appropriate error codes from underlying layers of the
>  implementation.
>   -- Add hotplug device lock around all property updates
>   -- Add call to rebuild_sched_domains in case of changes
>   -- Various code cleanups and compaction
>   -- Rebase to 4.18 kernel
>   -- Change operation to run CPU readd after end of migration store.
>   -- Improve descriptive text
>   -- Cleanup patch reference to outdated function
>   -- Code cleanup a 'acquire_drc' check in dlpar_cpu_add.
>   -- Code cleanup a 'release_drc' check in dlpar_cpu_remove.
>   -- Add more information to patch descriptions.
>   -- More code cleanup
>   -- Rearrange call to rebuild_sched_domains to allow removal
>  of some locking code.
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [PATCH] powerpc/pseries: Perform full re-add of CPU for topology update post-migration

2019-01-28 Thread Michael Bringmann
On 10/29/18 1:43 PM, Nathan Fontenot wrote:
> On pseries systems, performing a partition migration can result in
> altering the nodes a CPU is assigned to on the destination system. For
> example, pre-migration on the source system CPUs are in nodes 1 and 3,
> post-migration on the destination system CPUs are in nodes 2 and 3.
> 
> Handling the node change for a CPU can cause corruption in the slab
> cache if we hit a timing where a CPU's node is changed while cache_reap()
> is invoked. The corruption occurs because the slab cache code appears
> to rely on the CPU and slab cache pages being on the same node.
> 
> The current dynamic updating of a CPU's node done in arch/powerpc/mm/numa.c
> does not prevent us from hitting this scenario.
> 
> Changing the device tree property update notification handler that
> recognizes an affinity change for a CPU to do a full DLPAR remove and
> add of the CPU instead of dynamically changing its node resolves this
> issue.
> 
> Signed-off-by: Nathan Fontenot 

> ---
>  arch/powerpc/include/asm/topology.h  |2 ++
>  arch/powerpc/mm/numa.c   |9 +
>  arch/powerpc/platforms/pseries/hotplug-cpu.c |   19 +++
>  3 files changed, 22 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h 
> b/arch/powerpc/include/asm/topology.h
> index a4a718dbfec6..f85e2b01c3df 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -132,6 +132,8 @@ static inline void shared_proc_topology_init(void) {}
>  #define topology_sibling_cpumask(cpu)(per_cpu(cpu_sibling_map, cpu))
>  #define topology_core_cpumask(cpu)   (per_cpu(cpu_core_map, cpu))
>  #define topology_core_id(cpu)(cpu_to_core_id(cpu))
> +
> +int dlpar_cpu_readd(int cpu);
>  #endif
>  #endif
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 693ae1c1acba..bb6a7b56bef7 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1461,13 +1461,6 @@ static void reset_topology_timer(void)
> 
>  #ifdef CONFIG_SMP
> 
> -static void stage_topology_update(int core_id)
> -{
> - cpumask_or(&cpu_associativity_changes_mask,
> - &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
> - reset_topology_timer();
> -}
> -
>  static int dt_update_callback(struct notifier_block *nb,
>   unsigned long action, void *data)
>  {
> @@ -1480,7 +1473,7 @@ static int dt_update_callback(struct notifier_block *nb,
>   !of_prop_cmp(update->prop->name, "ibm,associativity")) {
>   u32 core_id;
> - of_property_read_u32(update->dn, "reg", &core_id);
> - stage_topology_update(core_id);
> + rc = dlpar_cpu_readd(core_id);
>   rc = NOTIFY_OK;
>   }
>   break;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
> b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 2f8e62163602..97feb6e79f1a 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -802,6 +802,25 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add)
>   return rc;
>  }
> 
> +int dlpar_cpu_readd(int cpu)
> +{
> + struct device_node *dn;
> + struct device *dev;
> + u32 drc_index;
> + int rc;
> +
> + dev = get_cpu_device(cpu);
> + dn = dev->of_node;
> +
> + rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index);
> +
> + rc = dlpar_cpu_remove_by_index(drc_index);
> + if (!rc)
> + rc = dlpar_cpu_add(drc_index);
> +
> + return rc;
> +}
> +
>  int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
>  {
>   u32 count, drc_index;
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [RFC 5/6] powerpc/pci/hotplug: Use common drcinfo parsing

2019-01-25 Thread Michael Bringmann
Adding Nathan Lynch.

On 1/24/19 6:29 PM, Tyrel Datwyler wrote:
> On 01/14/2019 04:28 PM, Bjorn Helgaas wrote:
>> On Fri, Dec 14, 2018 at 02:51:31PM -0600, Michael Bringmann wrote:
>>> The implementation of the pseries-specific drc info properties
>>> is currently implemented in pseries-specific and non-pseries-specific
>>> files.  This patch set uses a new implementation of the device-tree
>>> parsing code for the properties.
>>>
>>> This patch refactors parsing of the pseries-specific drc-info properties
>>> out of rpaphp_core.c to use the common parser.  In the case where an
>>> architecture does not use these properties, an __weak copy of the
>>> function is provided with dummy return values.  Changes include creating
>>> appropriate callback functions and passing callback-specific data
>>> blocks into arch_find_drc_match.  All functions that were used just
>>> to support the previous parsing implementation have been moved.
>>>
>>> Signed-off-by: Michael Bringmann 
>>
>> This is fine with me.  Any rpaphp_core.c maintainers want to comment?
>> Tyrel?
> 
> It greatly simplifies the code in rpaphp_core.c, and as far as I can tell the
> refactoring maintains the existing functionality.
> 
> Acked-by: Tyrel Datwyler 
> 
>>
>> $ ./scripts/get_maintainer.pl -f drivers/pci/hotplug/rpaphp_core.c
>> Tyrel Datwyler  (supporter:IBM Power PCI Hotplug 
>> Driver for RPA-compliant...)
>> Benjamin Herrenschmidt  (supporter:LINUX FOR 
>> POWERPC (32-BIT AND 64-BIT))
>> Paul Mackerras  (supporter:LINUX FOR POWERPC (32-BIT AND 
>> 64-BIT))
>> Michael Ellerman  (supporter:LINUX FOR POWERPC (32-BIT 
>> AND 64-BIT))
>>
>>> ---
>>>  drivers/pci/hotplug/rpaphp_core.c |  232 
>>> -
>>>  1 file changed, 28 insertions(+), 204 deletions(-)
>>>
>>> diff --git a/drivers/pci/hotplug/rpaphp_core.c 
>>> b/drivers/pci/hotplug/rpaphp_core.c
>>> index bcd5d35..9ad7384 100644
>>> --- a/drivers/pci/hotplug/rpaphp_core.c
>>> +++ b/drivers/pci/hotplug/rpaphp_core.c
>>> @@ -154,182 +154,18 @@ static enum pci_bus_speed get_max_bus_speed(struct 
>>> slot *slot)
>>> return speed;
>>>  }
>>>  
>>> -static int get_children_props(struct device_node *dn, const int 
>>> **drc_indexes,
>>> -   const int **drc_names, const int **drc_types,
>>> -   const int **drc_power_domains)
>>> -{
>>> -   const int *indexes, *names, *types, *domains;
>>> -
>>> -   indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
>>> -   names = of_get_property(dn, "ibm,drc-names", NULL);
>>> -   types = of_get_property(dn, "ibm,drc-types", NULL);
>>> -   domains = of_get_property(dn, "ibm,drc-power-domains", NULL);
>>> -
>>> -   if (!indexes || !names || !types || !domains) {
>>> -   /* Slot does not have dynamically-removable children */
>>> -   return -EINVAL;
>>> -   }
>>> -   if (drc_indexes)
>>> -   *drc_indexes = indexes;
>>> -   if (drc_names)
>>> -   /* _names[1] contains NULL terminated slot names */
>>> -   *drc_names = names;
>>> -   if (drc_types)
>>> -   /* _types[1] contains NULL terminated slot types */
>>> -   *drc_types = types;
>>> -   if (drc_power_domains)
>>> -   *drc_power_domains = domains;
>>> -
>>> -   return 0;
>>> -}
>>> -
>>> -
>>>  /* Verify the existence of 'drc_name' and/or 'drc_type' within the
>>> - * current node.  First obtain it's my-drc-index property.  Next,
>>> - * obtain the DRC info from it's parent.  Use the my-drc-index for
>>> - * correlation, and obtain/validate the requested properties.
>>> + * current node.
>>>   */
>>>  
>>> -static int rpaphp_check_drc_props_v1(struct device_node *dn, char 
>>> *drc_name,
>>> -   char *drc_type, unsigned int my_index)
>>> -{
>>> -   char *name_tmp, *type_tmp;
>>> -   const int *indexes, *names;
>>> -   const int *types, *domains;
>>> -   int i, rc;
>>> -
>>> -   rc = get_children_props(dn->parent, &indexes, &names, &types, &domains);
>>> -   if (rc < 0) {
>>> -   return -EINVAL;
>>> -   }
>>> -
>>> -   name_tmp = (char *) &names[1];
>>> -   type_tmp = (char *) &types[1];
>>> -
>>>

Re: [RFC 1/6] powerpc:/drc Define interface to acquire arch-specific drc info

2019-01-25 Thread Michael Bringmann
Adding Nathan Lynch.

On 1/24/19 6:10 PM, Tyrel Datwyler wrote:
> On 12/14/2018 12:50 PM, Michael Bringmann wrote:
>> Define interface to acquire arch-specific drc info to match against
>> hotpluggable devices.  The current implementation exposes several
>> pseries-specific dynamic memory properties in generic kernel code.
>> This patch set provides an interface to pull that code out of the
>> generic kernel.
>>
>> Signed-off-by: Michael Bringmann 
>> ---
>>  include/linux/topology.h |9 +
>>  1 file changed, 9 insertions(+)
>>
>> diff --git a/include/linux/topology.h b/include/linux/topology.h
>> index cb0775e..df97f5f 100644
>> --- a/include/linux/topology.h
>> +++ b/include/linux/topology.h
>> @@ -44,6 +44,15 @@
>>  
>>  int arch_update_cpu_topology(void);
> 
> On another note a kern doc comment for this function would also be nice.
> 
> -Tyrel
> 
>>  
>> +int arch_find_drc_match(struct device_node *dn,
>> +bool (*usercb)(struct device_node *dn,
>> +u32 drc_index, char *drc_name,
>> +char *drc_type, u32 drc_power_domain,
>> +void *data),
>> +char *opt_drc_type, char *opt_drc_name,
>> +bool match_drc_index, bool ck_php_type,
>> +void *data);
>> +
>>  /* Conform to ACPI 2.0 SLIT distance definitions */
>>  #define LOCAL_DISTANCE  10
>>  #define REMOTE_DISTANCE 20
>>
> 
> 

-- 
Michael W. Bringmann
Linux I/O, Networking and Security Development
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [RFC 3/6] pseries/drcinfo: Pseries impl of arch_find_drc_info

2019-01-25 Thread Michael Bringmann
Adding Nathan Lynch.

Yes.  We can amend the title.

On 1/24/19 6:04 PM, Tyrel Datwyler wrote:
> On 12/14/2018 12:51 PM, Michael Bringmann wrote:
>> This patch provides a common interface to parse ibm,drc-indexes,
>> ibm,drc-names, ibm,drc-types, ibm,drc-power-domains, or ibm,drc-info.
>> The generic interface arch_find_drc_match is provided which accepts
>> callback functions that may be applied to examine the data for each
>> entry.
>>
> 
> The title of your patch is "pseries/drcinfo: Pseries impl of 
> arch_find_drc_info"
> but the name of the function you are ultimately implementing is
> arch_find_drc_match if I'm not mistaken.
> 
>> Signed-off-by: Michael Bringmann 
>> ---
>>  arch/powerpc/include/asm/prom.h |3 
>>  arch/powerpc/platforms/pseries/of_helpers.c |  299 
>> +++
>>  include/linux/topology.h|2 
>>  3 files changed, 298 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/prom.h 
>> b/arch/powerpc/include/asm/prom.h
>> index b04c5ce..910d1dc 100644
>> --- a/arch/powerpc/include/asm/prom.h
>> +++ b/arch/powerpc/include/asm/prom.h
>> @@ -91,9 +91,6 @@ struct of_drc_info {
>>  u32 last_drc_index;
>>  };
>>  
>> -extern int of_read_drc_info_cell(struct property **prop,
>> -const __be32 **curval, struct of_drc_info *data);
>> -
>>  
>>  /*
>>   * There are two methods for telling firmware what our capabilities are.
>> diff --git a/arch/powerpc/platforms/pseries/of_helpers.c 
>> b/arch/powerpc/platforms/pseries/of_helpers.c
>> index 0185e50..11c90cd 100644
>> --- a/arch/powerpc/platforms/pseries/of_helpers.c
>> +++ b/arch/powerpc/platforms/pseries/of_helpers.c
>> @@ -1,5 +1,7 @@
>>  // SPDX-License-Identifier: GPL-2.0
>>  
>> +#define pr_fmt(fmt) "drc: " fmt
>> +
>>  #include 
>>  #include 
>>  #include 
>> @@ -11,6 +13,12 @@
>>  
>>  #define MAX_DRC_NAME_LEN 64
>>  
>> +static int drc_debug;
>> +#define dbg(args...) if (drc_debug) { printk(KERN_DEBUG args); }
>> +#define err(arg...) printk(KERN_ERR args);
>> +#define info(arg...) printk(KERN_INFO args);
>> +#define warn(arg...) printk(KERN_WARNING args);
> 
> Its pretty standard these days to use the pr_debug, pr_err, pr_info, pr_warn
> variations over printk(LEVEL args).
> 
>> +
>>  /**
>>   * pseries_of_derive_parent - basically like dirname(1)
>>   * @path:  the full_name of a node to be added to the tree
>> @@ -46,7 +54,8 @@ struct device_node *pseries_of_derive_parent(const char 
>> *path)
>>  
>>  /* Helper Routines to convert between drc_index to cpu numbers */
>>  
>> -int of_read_drc_info_cell(struct property **prop, const __be32 **curval,
>> +static int of_read_drc_info_cell(struct property **prop,
>> +const __be32 **curval,
>>  struct of_drc_info *data)
>>  {
>>  const char *p;
>> @@ -90,4 +99,290 @@ int of_read_drc_info_cell(struct property **prop, const 
>> __be32 **curval,
>>  
>>  return 0;
>>  }
>> -EXPORT_SYMBOL(of_read_drc_info_cell);
>> +
>> +static int walk_drc_info(struct device_node *dn,
>> +bool (*usercb)(struct of_drc_info *drc,
>> +void *data,
>> +int *ret_code),
>> +char *opt_drc_type,
>> +void *data)
>> +{
>> +struct property *info;
>> +unsigned int entries;
>> +struct of_drc_info drc;
>> +const __be32 *value;
>> +int j, ret_code = -EINVAL;
>> +bool done = false;
>> +
>> +info = of_find_property(dn, "ibm,drc-info", NULL);
>> +if (info == NULL)
>> +return -EINVAL;
>> +
>> +value = info->value;
>> +entries = of_read_number(value++, 1);
>> +
>> +for (j = 0, done = 0; (j < entries) && (!done); j++) {
>> +of_read_drc_info_cell(&info, &value, &drc);
>> +
>> +if (opt_drc_type && strcmp(opt_drc_type, drc.drc_type))
>> +continue;
>> +
>> +done = usercb(&drc, data, &ret_code);
>> +}
>> +
>> +return ret_code;
>> +}
>> +
>> +static int get_children_props(struct device_node *dn, const int 
>> **drc_indexes,
>> +const int **drc_names, const int **drc_types,
>> +const int **drc_power_domains)
>> +{
>> +co

Re: [RFC 1/6] powerpc:/drc Define interface to acquire arch-specific drc info

2019-01-25 Thread Michael Bringmann
Adding Nathan Lynch

On 1/24/19 6:04 PM, Tyrel Datwyler wrote:
> On 12/14/2018 12:50 PM, Michael Bringmann wrote:
>> Define interface to acquire arch-specific drc info to match against
>> hotpluggable devices.  The current implementation exposes several
>> pseries-specific dynamic memory properties in generic kernel code.
>> This patch set provides an interface to pull that code out of the
>> generic kernel.
>>
>> Signed-off-by: Michael Bringmann 
>> ---
>>  include/linux/topology.h |9 +
>>  1 file changed, 9 insertions(+)
>>
>> diff --git a/include/linux/topology.h b/include/linux/topology.h
>> index cb0775e..df97f5f 100644
>> --- a/include/linux/topology.h
>> +++ b/include/linux/topology.h
>> @@ -44,6 +44,15 @@
> 
> As far as I know pseries is the only platform that uses DR connectors, and I
> highly doubt that any other powerpc platform or arch ever will. So, I'm not 
> sure
> that this is really generic enough to belong in topology.h. If anything I 
> would
> suggest putting this in an include in arch/powerpc/include/ named something 
> like
> drcinfo.h or pseries-drc.h. That will make it visible to modules like rpaphp
> that want/need to use this functionality.
> 
> -Tyrel
> 
>>  
>>  int arch_update_cpu_topology(void);
>>  
>> +int arch_find_drc_match(struct device_node *dn,
>> +bool (*usercb)(struct device_node *dn,
>> +u32 drc_index, char *drc_name,
>> +char *drc_type, u32 drc_power_domain,
>> +void *data),
>> +char *opt_drc_type, char *opt_drc_name,
>> +bool match_drc_index, bool ck_php_type,
>> +void *data);
>> +
>>  /* Conform to ACPI 2.0 SLIT distance definitions */
>>  #define LOCAL_DISTANCE  10
>>  #define REMOTE_DISTANCE 20
>>
> 
> 

-- 
Michael W. Bringmann
Linux I/O, Networking and Security Development
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



[PATCH] powerpc/pseries: Check for ceded CPUs during LPAR migration

2019-01-23 Thread Michael Bringmann
This patch checks for ceded CPUs during LPM.  Some extreme tests
encountered a problem where Linux had put some threads to sleep
(possibly to save energy), LPM was attempted, and the Linux kernel
did not awaken the sleeping threads, but issued the H_JOIN for the
active threads.  Since the sleeping threads are not awake, they
cannot issue the expected H_JOIN, and the partition would never
suspend.  This patch wakes the sleeping threads back up.

Signed-off-by: Nathan Fontenot 
Signed-off-by: Gustavo Walbon 
---
 arch/powerpc/include/asm/plpar_wrappers.h |6 ++
 arch/powerpc/kernel/rtas.c|6 ++
 arch/powerpc/platforms/pseries/setup.c|   18 ++
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/plpar_wrappers.h 
b/arch/powerpc/include/asm/plpar_wrappers.h
index cff5a41..8292eff 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -26,10 +26,8 @@ static inline void set_cede_latency_hint(u8 latency_hint)
get_lppaca()->cede_latency_hint = latency_hint;
 }
 
-static inline long cede_processor(void)
-{
-   return plpar_hcall_norets(H_CEDE);
-}
+int cpu_is_ceded(int cpu);
+long cede_processor(void);
 
 static inline long extended_cede_processor(unsigned long latency_hint)
 {
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index de35bd8f..9d9d08d 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* This is here deliberately so it's only used in this file */
 void enter_rtas(unsigned long);
@@ -991,6 +992,11 @@ int rtas_ibm_suspend_me(u64 handle)
goto out_hotplug_enable;
}
 
+   for_each_present_cpu(cpu) {
+   if (cpu_is_ceded(cpu))
+   plpar_hcall_norets(H_PROD, 
get_hard_smp_processor_id(cpu));
+   }
+
/* Call function on all CPUs.  One of us will make the
 * rtas call
 */
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index 4078a05..0106668 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -331,6 +331,24 @@ static int alloc_dispatch_log_kmem_cache(void)
 }
 machine_early_initcall(pseries, alloc_dispatch_log_kmem_cache);
 
+static DEFINE_PER_CPU(int, cpu_ceded);
+
+int cpu_is_ceded(int cpu)
+{
+   return per_cpu(cpu_ceded, cpu);
+}
+
+long cede_processor(void)
+{
+   long rc;
+
+   per_cpu(cpu_ceded, raw_smp_processor_id()) = 1;
+   rc = plpar_hcall_norets(H_CEDE);
+   per_cpu(cpu_ceded, raw_smp_processor_id()) = 0;
+
+   return rc;
+}
+
 static void pseries_lpar_idle(void)
 {
/*



Re: [PATCH] pseries/hotplug: Add more delay in pseries_cpu_die while waiting for rtas-stop

2019-01-14 Thread Michael Bringmann
On 1/9/19 12:08 AM, Gautham R Shenoy wrote:

> I did some testing during the holidays. Here are the observations:
> 
> 1) With just your patch (without any additional debug patch), if I run
> DLPAR on /off operations on a system that has SMT=off, I am able to
> see a crash involving RTAS stack corruption within an hour's time.
> 
> 2) With the debug patch (appended below) which has additional debug to
> capture the callers of stop-self, start-cpu, set-power-levels, the
> system is able to perform DLPAR on/off operations on a system with
> SMT=off for three days. And then, it crashed with the dead CPU showing
> a "Bad kernel stack pointer". From this log, I can clearly
> see that there were no concurrent calls to stop-self, start-cpu,
> set-power-levels. The only concurrent RTAS calls were the dying CPU
> calling "stop-self", and the CPU running the DLPAR operation calling
> "query-cpu-stopped-state". The crash signature is appended below as
> well.
> 
> 3) Modifying your patch to remove the udelay and increase the loop
> count from 25 to 1000 doesn't improve the situation. We are still able
> to see the crash.
> 
> 4) With my patch, even without any additional debug, I was able to
> observe the system run the tests successfully for over a week (I
> started the tests before the Christmas weekend, and forgot to turn it
> off!)

So does this mean that the problem is fixed with your patch?

> 
> It appears that there is a narrow race window involving rtas-stop-self
> and rtas-query-cpu-stopped-state calls that can be observed with your
> patch. Adding any printk's seems to reduce the probability of hitting
> this race window. It might be worth the while to check with RTAS
> folks, if they suspect something here.

What would the RTAS folks be looking at here?  The 'narrow race window'
is with respect to a patch that it sound like we should not be using.

Thanks.
Michael

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



[PATCH] powerpc/pseries: Split CPU readd for single CPU systems

2019-01-11 Thread Michael Bringmann
We have encountered cases where DLPAR CPU 'readd' fails on single
CPU platforms, because the system needs a minimum amount of resources
to keep operating.  The current implementation attempts to remove and
re-add all of the threads of a specified core at once, and will fail
if there is a problem removing any of the thread CPUs.  On single CPU
platforms, the system must hold onto at least some resources to keep
operating, i.e. at least one thread of a CPU.  So in such environments,
attempting to remove and re-add the single core and all of its CPU
threads in order to reset and flush system structures and/or caches
fails.

This problem has been observed on PowerVM and qemu environments.

This change attempts to resolve such situations by breaking up the
DLPAR CPU 'readd' operation into multiple steps, performing the
remove+readd of the CPU threads until an error occurs, and then
continuing the 'readd' operation for the threads that could not be
removed during the first phase of the operation.
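
In outline, the interface change below is that dlpar_cpu_add() and
dlpar_online_cpu() gain a (cpumask, partial) pair so that a later pass
can be restricted to a subset of the core's threads.  A very rough
caller sketch (first_phase_readd() is illustrative only; the actual
split between the phases is in the patch itself):

	cpumask_t touched;
	int rc;

	cpumask_clear(&touched);

	/* phase 1: per-thread remove/re-add, noting the threads handled */
	rc = first_phase_readd(drc_index, &touched);

	/* phase 2: partial re-add restricted to the threads noted above */
	if (rc)
		rc = dlpar_cpu_add(drc_index, &touched, true);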

Requires: ("powerpc/pseries: Perform full re-add of CPU for topology update 
post-migration")
Signed-off-by: Michael W. Bringmann 
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c |  187 --
 1 file changed, 117 insertions(+), 70 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 97feb6e..b33e066 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -342,7 +342,8 @@ static void pseries_remove_processor(struct device_node *np)
cpu_maps_update_done();
 }
 
-static int dlpar_online_cpu(struct device_node *dn)
+static int dlpar_online_cpu(struct device_node *dn, cpumask_t *whichcpus,
+   int partial)
 {
int rc = 0;
unsigned int cpu;
@@ -359,6 +360,8 @@ static int dlpar_online_cpu(struct device_node *dn)
cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
thread = be32_to_cpu(intserv[i]);
+   if (partial && !cpumask_test_cpu(thread, whichcpus))
+   continue;
for_each_present_cpu(cpu) {
if (get_hard_smp_processor_id(cpu) != thread)
continue;
@@ -371,7 +374,6 @@ static int dlpar_online_cpu(struct device_node *dn)
if (rc)
goto out;
cpu_maps_update_begin();
-
break;
}
if (cpu == num_possible_cpus())
@@ -432,7 +434,10 @@ static bool valid_cpu_drc_index(struct device_node 
*parent, u32 drc_index)
return found;
 }
 
-static ssize_t dlpar_cpu_add(u32 drc_index)
+static struct device_node *cpu_drc_index_to_dn(u32 drc_index);
+
+static ssize_t dlpar_cpu_add(u32 drc_index, cpumask_t *whichcpus,
+   bool partial)
 {
struct device_node *dn, *parent;
int rc, saved_rc;
@@ -445,10 +450,12 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
return -ENODEV;
}
 
-   if (dlpar_cpu_exists(parent, drc_index)) {
-   of_node_put(parent);
-   pr_warn("CPU with drc index %x already exists\n", drc_index);
-   return -EINVAL;
+   if (!parent) {
+   if (dlpar_cpu_exists(parent, drc_index)) {
+   of_node_put(parent);
+   pr_warn("CPU with drc index %x already exists\n", 
drc_index);
+   return -EINVAL;
+   }
}
 
if (!valid_cpu_drc_index(parent, drc_index)) {
@@ -457,49 +464,59 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
return -EINVAL;
}
 
-   rc = dlpar_acquire_drc(drc_index);
-   if (rc) {
-   pr_warn("Failed to acquire DRC, rc: %d, drc index: %x\n",
-   rc, drc_index);
-   of_node_put(parent);
-   return -EINVAL;
-   }
+   if (!partial) {
+   rc = dlpar_acquire_drc(drc_index);
+   if (rc) {
+   pr_warn("Failed to acquire DRC, rc: %d, drc index: 
%x\n",
+   rc, drc_index);
+   of_node_put(parent);
+   return -EINVAL;
+   }
 
-   dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
-   if (!dn) {
-   pr_warn("Failed call to configure-connector, drc index: %x\n",
-   drc_index);
-   dlpar_release_drc(drc_index);
-   of_node_put(parent);
-   return -EINVAL;
-   }
+   dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
+   if (!dn) {
+   pr_warn("Failed call to configure-connector, drc index: 
%x\n",
+   drc_index);
+   dlpar_release_drc(drc_index);
+

[PATCH] powerpc/pseries: Split CPU readd for single CPU systems

2019-01-10 Thread Michael Bringmann
[FYI: Please post to linuxppc-dev mailing list when you are ready.
  Good luck.]

We have encountered cases where DLPAR CPU 'readd' fails on single
CPU platforms, because the system needs a minimum amount of resources
to keep operating.  The current implementation attempts to remove and
re-add all of the threads of a specified core at once, and will fail
if there is a problem removing any of the thread CPUs.  On single CPU
platforms, the system must hold onto at least some resources to keep
operating, i.e. at least one thread of a CPU.  So in such environments,
attempting to remove and re-add the single core and all of its CPU
threads in order to reset and flush system structures and/or caches
fails.

This problem has been observed on PowerVM and qemu environments.

This change attempts to resolve such situations by breaking up the
DLPAR CPU 'readd' operation into multiple steps, performing the
remove+readd of the CPU threads until an error occurs, and then
continuing the 'readd' operation for the threads that could not be
removed during the first phase of the operation.

Requires: ("powerpc/pseries: Perform full re-add of CPU for topology update 
post-migration")
Signed-off-by: Michael W. Bringmann 
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c |  187 --
 1 file changed, 117 insertions(+), 70 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 97feb6e..b33e066 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -342,7 +342,8 @@ static void pseries_remove_processor(struct device_node *np)
cpu_maps_update_done();
 }
 
-static int dlpar_online_cpu(struct device_node *dn)
+static int dlpar_online_cpu(struct device_node *dn, cpumask_t *whichcpus,
+   int partial)
 {
int rc = 0;
unsigned int cpu;
@@ -359,6 +360,8 @@ static int dlpar_online_cpu(struct device_node *dn)
cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
thread = be32_to_cpu(intserv[i]);
+   if (partial && !cpumask_test_cpu(thread, whichcpus))
+   continue;
for_each_present_cpu(cpu) {
if (get_hard_smp_processor_id(cpu) != thread)
continue;
@@ -371,7 +374,6 @@ static int dlpar_online_cpu(struct device_node *dn)
if (rc)
goto out;
cpu_maps_update_begin();
-
break;
}
if (cpu == num_possible_cpus())
@@ -432,7 +434,10 @@ static bool valid_cpu_drc_index(struct device_node 
*parent, u32 drc_index)
return found;
 }
 
-static ssize_t dlpar_cpu_add(u32 drc_index)
+static struct device_node *cpu_drc_index_to_dn(u32 drc_index);
+
+static ssize_t dlpar_cpu_add(u32 drc_index, cpumask_t *whichcpus,
+   bool partial)
 {
struct device_node *dn, *parent;
int rc, saved_rc;
@@ -445,10 +450,12 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
return -ENODEV;
}
 
-   if (dlpar_cpu_exists(parent, drc_index)) {
-   of_node_put(parent);
-   pr_warn("CPU with drc index %x already exists\n", drc_index);
-   return -EINVAL;
+   if (!parent) {
+   if (dlpar_cpu_exists(parent, drc_index)) {
+   of_node_put(parent);
+   pr_warn("CPU with drc index %x already exists\n", 
drc_index);
+   return -EINVAL;
+   }
}
 
if (!valid_cpu_drc_index(parent, drc_index)) {
@@ -457,49 +464,59 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
return -EINVAL;
}
 
-   rc = dlpar_acquire_drc(drc_index);
-   if (rc) {
-   pr_warn("Failed to acquire DRC, rc: %d, drc index: %x\n",
-   rc, drc_index);
-   of_node_put(parent);
-   return -EINVAL;
-   }
+   if (!partial) {
+   rc = dlpar_acquire_drc(drc_index);
+   if (rc) {
+   pr_warn("Failed to acquire DRC, rc: %d, drc index: 
%x\n",
+   rc, drc_index);
+   of_node_put(parent);
+   return -EINVAL;
+   }
 
-   dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
-   if (!dn) {
-   pr_warn("Failed call to configure-connector, drc index: %x\n",
-   drc_index);
-   dlpar_release_drc(drc_index);
-   of_node_put(parent);
-   return -EINVAL;
-   }
+   dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
+   if (!dn) {
+   pr_warn("Failed call to configure-connector, drc index: 
%x\n",
+

Re: [PATCH v03] powerpc/mobility: Fix node detach/rename problem

2018-12-14 Thread Michael Bringmann
On 12/12/2018 08:57 PM, Michael Ellerman wrote:
> Frank Rowand  writes:
>> On 12/11/18 8:07 AM, Rob Herring wrote:
>>> On Tue, Dec 11, 2018 at 7:29 AM Michael Ellerman  
>>> wrote:
> ...
 diff --git a/drivers/of/base.c b/drivers/of/base.c
 index 09692c9b32a7..d8e4534c0686 100644
 --- a/drivers/of/base.c
 +++ b/drivers/of/base.c
 @@ -1190,6 +1190,10 @@ struct device_node *of_find_node_by_phandle(phandle 
 handle)
 if (phandle_cache[masked_handle] &&
 handle == phandle_cache[masked_handle]->phandle)
 np = phandle_cache[masked_handle];
 +
 +   /* If we find a detached node, remove it */
 +   if (of_node_check_flag(np, OF_DETACHED))
 +   np = phandle_cache[masked_handle] = NULL;
>>
>> The bug you found exposes a couple of different issues, a little bit
>> deeper than the proposed fix.  I'll work on a fuller fix tonight or
>> tomorrow.
> 
> OK thanks.
> 
>>> I'm wondering if we should explicitly remove the node from the cache
>>> when we set OF_DETACHED. Otherwise, it could be possible that the node
>>> pointer has been freed already. Or maybe we need both?
>>
>> Yes, it should be explicitly removed.  I may also add in a paranoia check in
>> of_find_node_by_phandle().
> 
> That seems best to me.

I agree that we should do both.

> 
> cheers

Michael

-- 
Michael W. Bringmann
Linux I/O, Networking and Security Development
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [PATCH 2/2] of: __of_detach_node() - remove node from phandle cache

2018-12-14 Thread Michael Bringmann
On 12/14/2018 11:20 AM, Rob Herring wrote:
> On Fri, Dec 14, 2018 at 12:43 AM  wrote:
>>
>> From: Frank Rowand 
>>
>> Non-overlay dynamic devicetree node removal may leave the node in
>> the phandle cache.  Subsequent calls to of_find_node_by_phandle()
>> will incorrectly find the stale entry.  Remove the node from the
>> cache.
>>
>> Add paranoia checks in of_find_node_by_phandle() as a second level
>> of defense (do not return cached node if detached, do not add node
>> to cache if detached).
>>
>> Reported-by: Michael Bringmann 
>> Signed-off-by: Frank Rowand 
>> ---
>>  drivers/of/base.c   | 29 -
>>  drivers/of/dynamic.c|  3 +++
>>  drivers/of/of_private.h |  4 
>>  3 files changed, 35 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/of/base.c b/drivers/of/base.c
>> index d599367cb92a..34a5125713c8 100644
>> --- a/drivers/of/base.c
>> +++ b/drivers/of/base.c
>> @@ -162,6 +162,27 @@ int of_free_phandle_cache(void)
>>  late_initcall_sync(of_free_phandle_cache);
>>  #endif
>>
>> +/*
>> + * Caller must hold devtree_lock.
>> + */
>> +void __of_free_phandle_cache_entry(phandle handle)
>> +{
>> +   phandle masked_handle;
>> +
>> +   if (!handle)
>> +   return;
>> +
>> +   masked_handle = handle & phandle_cache_mask;
>> +
>> +   if (phandle_cache) {
>> +   if (phandle_cache[masked_handle] &&
>> +   handle == phandle_cache[masked_handle]->phandle) {
>> +   of_node_put(phandle_cache[masked_handle]);
>> +   phandle_cache[masked_handle] = NULL;
>> +   }
>> +   }
>> +}
>> +
>>  void of_populate_phandle_cache(void)
>>  {
>> unsigned long flags;
>> @@ -1209,11 +1230,17 @@ struct device_node *of_find_node_by_phandle(phandle 
>> handle)
>> if (phandle_cache[masked_handle] &&
>> handle == phandle_cache[masked_handle]->phandle)
>> np = phandle_cache[masked_handle];
>> +   if (np && of_node_check_flag(np, OF_DETACHED)) {
>> +   of_node_put(np);
>> +   phandle_cache[masked_handle] = NULL;
> 
> This should never happen, right? Any time we set OF_DETACHED, the
> entry should get removed from the cache. I think we want a WARN here
> in case we're in an unexpected state.

We don't actually remove the pointer from the phandle cache when we set
OF_DETACHED in drivers/of/dynamic.c:__of_detach_node.  The phandle cache
is currently static within drivers/of/base.c.  There are a couple of
calls to of_populate_phandle_cache / of_free_phandle_cache within
drivers/of/overlay.c, but these are not involved in the device tree
updates that occur during LPAR migration.  A WARN here would only make
sense if we also arrange to clear the handle.
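
For reference, clearing the entry at detach time would be a one-line
addition in __of_detach_node() (a sketch only, using the
__of_free_phandle_cache_entry() helper that this patch adds to base.c):

	of_node_set_flag(np, OF_DETACHED);

	/* now that the node is detached, drop any stale cache entry for it */
	__of_free_phandle_cache_entry(np->phandle);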

> 
> Rob

Michael

> 
> 

-- 
Michael W. Bringmann
Linux I/O, Networking and Security Development
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



[RFC 6/6] powerpc: Enable support for ibm, drc-info devtree property

2018-12-14 Thread Michael Bringmann
Enable support for the new DRC device tree property "ibm,drc-info"
in the initial handshake between the Linux kernel and the front-end
processor.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/kernel/prom_init.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index f33ff41..5d20a4d 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -910,7 +910,7 @@ struct ibm_arch_vec {
.reserved2 = 0,
.reserved3 = 0,
.subprocessors = 1,
-   .byte22 = OV5_FEAT(OV5_DRMEM_V2),
+   .byte22 = OV5_FEAT(OV5_DRMEM_V2 | OV5_DRC_INFO),
.intarch = 0,
.mmu = 0,
.hash_ext = 0,



[RFC 5/6] powerpc/pci/hotplug: Use common drcinfo parsing

2018-12-14 Thread Michael Bringmann
Parsing of the pseries-specific drc-info properties is currently
implemented in both pseries-specific and non-pseries-specific files.
This patch set provides a new, common implementation of the
device-tree parsing code for these properties.

This patch refactors parsing of the pseries-specific drc-info properties
out of rpaphp_core.c to use the common parser.  In the case where an
architecture does not use these properties, a __weak copy of the
function is provided with dummy return values.  Changes include creating
appropriate callback functions and passing callback-specific data
blocks into arch_find_drc_match.  All functions that were used just
to support the previous parsing implementation have been moved.

Signed-off-by: Michael Bringmann 
---
 drivers/pci/hotplug/rpaphp_core.c |  232 -
 1 file changed, 28 insertions(+), 204 deletions(-)

diff --git a/drivers/pci/hotplug/rpaphp_core.c 
b/drivers/pci/hotplug/rpaphp_core.c
index bcd5d35..9ad7384 100644
--- a/drivers/pci/hotplug/rpaphp_core.c
+++ b/drivers/pci/hotplug/rpaphp_core.c
@@ -154,182 +154,18 @@ static enum pci_bus_speed get_max_bus_speed(struct slot 
*slot)
return speed;
 }
 
-static int get_children_props(struct device_node *dn, const int **drc_indexes,
-   const int **drc_names, const int **drc_types,
-   const int **drc_power_domains)
-{
-   const int *indexes, *names, *types, *domains;
-
-   indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
-   names = of_get_property(dn, "ibm,drc-names", NULL);
-   types = of_get_property(dn, "ibm,drc-types", NULL);
-   domains = of_get_property(dn, "ibm,drc-power-domains", NULL);
-
-   if (!indexes || !names || !types || !domains) {
-   /* Slot does not have dynamically-removable children */
-   return -EINVAL;
-   }
-   if (drc_indexes)
-   *drc_indexes = indexes;
-   if (drc_names)
-   /* _names[1] contains NULL terminated slot names */
-   *drc_names = names;
-   if (drc_types)
-   /* _types[1] contains NULL terminated slot types */
-   *drc_types = types;
-   if (drc_power_domains)
-   *drc_power_domains = domains;
-
-   return 0;
-}
-
-
 /* Verify the existence of 'drc_name' and/or 'drc_type' within the
- * current node.  First obtain it's my-drc-index property.  Next,
- * obtain the DRC info from it's parent.  Use the my-drc-index for
- * correlation, and obtain/validate the requested properties.
+ * current node.
  */
 
-static int rpaphp_check_drc_props_v1(struct device_node *dn, char *drc_name,
-   char *drc_type, unsigned int my_index)
-{
-   char *name_tmp, *type_tmp;
-   const int *indexes, *names;
-   const int *types, *domains;
-   int i, rc;
-
-   rc = get_children_props(dn->parent, &indexes, &names, &types, &domains);
-   if (rc < 0) {
-   return -EINVAL;
-   }
-
-   name_tmp = (char *) &names[1];
-   type_tmp = (char *) &types[1];
-
-   /* Iterate through parent properties, looking for my-drc-index */
-   for (i = 0; i < be32_to_cpu(indexes[0]); i++) {
-   if ((unsigned int) indexes[i + 1] == my_index)
-   break;
-
-   name_tmp += (strlen(name_tmp) + 1);
-   type_tmp += (strlen(type_tmp) + 1);
-   }
-
-   if (((drc_name == NULL) || (drc_name && !strcmp(drc_name, name_tmp))) &&
-   ((drc_type == NULL) || (drc_type && !strcmp(drc_type, type_tmp
-   return 0;
-
-   return -EINVAL;
-}
-
-static int rpaphp_check_drc_props_v2(struct device_node *dn, char *drc_name,
-   char *drc_type, unsigned int my_index)
-{
-   struct property *info;
-   unsigned int entries;
-   struct of_drc_info drc;
-   const __be32 *value;
-   char cell_drc_name[MAX_DRC_NAME_LEN];
-   int j, fndit;
-
-   info = of_find_property(dn->parent, "ibm,drc-info", NULL);
-   if (info == NULL)
-   return -EINVAL;
-
-   value = of_prop_next_u32(info, NULL, &entries);
-   if (!value)
-   return -EINVAL;
-
-   for (j = 0; j < entries; j++) {
-   of_read_drc_info_cell(&info, &value, &drc);
-
-   /* Should now know end of current entry */
-
-   if (my_index > drc.last_drc_index)
-   continue;
-
-   fndit = 1;
-   break;
-   }
-   /* Found it */
-
-   if (fndit)
-   sprintf(cell_drc_name, "%s%d", drc.drc_name_prefix, 
-   my_index);
-
-   if (((drc_name == NULL) ||
-(drc_name && !strcmp(drc_name, cell_drc_name))) &&
-   ((drc_type == NULL) ||
-(drc_type && !strcmp(drc_type, drc.drc_type
-   return 0;
-
-   r

[RFC 4/6] powerpc/pseries: Use common drcinfo parsing

2018-12-14 Thread Michael Bringmann
Parsing of the pseries-specific drc-info properties is currently
implemented in both pseries-specific and non-pseries-specific files.
This patch set provides a new, common implementation of the
device-tree parsing code for these properties.

This patch refactors parsing of the drc-info properties out of
pseries_energy.c and hotplug-cpu.c to use the common parser.
Changes include creating appropriate callback functions and
passing callback-specific data blocks into arch_find_drc_match.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c|   83 +++-
 arch/powerpc/platforms/pseries/pseries_energy.c |  157 ---
 2 files changed, 107 insertions(+), 133 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 2f8e621..ee3028c 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -411,23 +411,29 @@ static bool dlpar_cpu_exists(struct device_node *parent, 
u32 drc_index)
return found;
 }
 
-static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index)
+struct cpu_drc_index_struct {
+   u32 drc_index;
+};
+
+bool cpu_drc_index_cb(struct device_node *dn,
+   u32 drc_index, char *drc_name,
+   char *drc_type, u32 drc_power_domain,
+   void *data)
 {
-   bool found = false;
-   int rc, index;
+   struct cpu_drc_index_struct *cdata = data;
 
-   index = 0;
-   while (!found) {
-   u32 drc;
+   if (drc_index == cdata->drc_index)
+   return true;
+   return false;
+}
 
-   rc = of_property_read_u32_index(parent, "ibm,drc-indexes",
-   index++, &drc);
-   if (rc)
-   break;
+static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index)
+{
+   struct cpu_drc_index_struct cdata = { drc_index };
+   bool found = false;
 
-   if (drc == drc_index)
-   found = true;
-   }
+   found = arch_find_drc_match(parent, cpu_drc_index_cb,
+   "CPU", NULL, false, false, &cdata);
 
return found;
 }
@@ -721,11 +727,34 @@ static int dlpar_cpu_remove_by_count(u32 cpus_to_remove)
return rc;
 }
 
+struct cpus_to_add_cb_struct {
+   struct device_node *parent;
+   u32 *cpu_drcs;
+   u32 cpus_to_add;
+   u32 cpus_found;
+};
+
+static bool cpus_to_add_cb(struct device_node *dn,
+   u32 drc_index, char *drc_name,
+   char *drc_type, u32 drc_power_domain,
+   void *data)
+{
+   struct cpus_to_add_cb_struct *cdata = data;
+
+   if (cdata->cpus_found < cdata->cpus_to_add) {
+   if (!dlpar_cpu_exists(cdata->parent, drc_index))
+   cdata->cpu_drcs[cdata->cpus_found++] = drc_index;
+   }
+
+   return !(cdata->cpus_found < cdata->cpus_to_add);
+}
+
 static int find_dlpar_cpus_to_add(u32 *cpu_drcs, u32 cpus_to_add)
 {
struct device_node *parent;
-   int cpus_found = 0;
-   int index, rc;
+   struct cpus_to_add_cb_struct cdata = {
+   NULL, cpu_drcs, cpus_to_add, 0 };
+   int cpus_found;
 
parent = of_find_node_by_path("/cpus");
if (!parent) {
@@ -734,25 +763,13 @@ static int find_dlpar_cpus_to_add(u32 *cpu_drcs, u32 
cpus_to_add)
return -1;
}
 
-   /* Search the ibm,drc-indexes array for possible CPU drcs to
-* add. Note that the format of the ibm,drc-indexes array is
-* the number of entries in the array followed by the array
-* of drc values so we start looking at index = 1.
+   /* Search the appropriate property for possible CPU drcs
+* to add.
 */
-   index = 1;
-   while (cpus_found < cpus_to_add) {
-   u32 drc;
-
-   rc = of_property_read_u32_index(parent, "ibm,drc-indexes",
-   index++, &drc);
-   if (rc)
-   break;
-
-   if (dlpar_cpu_exists(parent, drc))
-   continue;
-
-   cpu_drcs[cpus_found++] = drc;
-   }
+   cdata.parent = parent;
+   arch_find_drc_match(parent, cpus_to_add_cb, "CPU",
+   NULL, false, false, &cdata);
+   cpus_found = cdata.cpus_found;
 
of_node_put(parent);
return cpus_found;
diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c 
b/arch/powerpc/platforms/pseries/pseries_energy.c
index 6ed2212..f7b9d86 100644
--- a/arch/powerpc/platforms/pseries/pseries_energy.c
+++ b/arch/powerpc/platforms/pseries/pseries_energy.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 
 #define MODULE_VERS "1.

[RFC 3/6] pseries/drcinfo: Pseries impl of arch_find_drc_info

2018-12-14 Thread Michael Bringmann
This patch provides a common interface to parse ibm,drc-indexes,
ibm,drc-names, ibm,drc-types, ibm,drc-power-domains, or ibm,drc-info.
The generic interface arch_find_drc_match is provided which accepts
callback functions that may be applied to examine the data for each
entry.
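
For illustration only (an editor's sketch, not part of the patch; the
struct, the callback name, and the 'parent' node pointer are
hypothetical), a caller that wants the first matching CPU DRC index
under a node could pair a small data block with a callback along
these lines:

struct first_cpu_drc {
	u32 drc_index;
	bool found;
};

static bool first_cpu_drc_cb(struct device_node *dn, u32 drc_index,
			     char *drc_name, char *drc_type,
			     u32 drc_power_domain, void *data)
{
	struct first_cpu_drc *fd = data;

	fd->drc_index = drc_index;
	fd->found = true;
	return true;	/* returning true ends the walk */
}

	/* ... in the caller, e.g. with parent = /cpus ... */
	struct first_cpu_drc fd = { 0, false };

	arch_find_drc_match(parent, first_cpu_drc_cb, "CPU", NULL,
			    false, false, &fd);

The "CPU" drc-type filter and the boolean return convention follow the
callers converted later in this series.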

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/prom.h |3 
 arch/powerpc/platforms/pseries/of_helpers.c |  299 +++
 include/linux/topology.h|2 
 3 files changed, 298 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index b04c5ce..910d1dc 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -91,9 +91,6 @@ struct of_drc_info {
u32 last_drc_index;
 };
 
-extern int of_read_drc_info_cell(struct property **prop,
-   const __be32 **curval, struct of_drc_info *data);
-
 
 /*
  * There are two methods for telling firmware what our capabilities are.
diff --git a/arch/powerpc/platforms/pseries/of_helpers.c 
b/arch/powerpc/platforms/pseries/of_helpers.c
index 0185e50..11c90cd 100644
--- a/arch/powerpc/platforms/pseries/of_helpers.c
+++ b/arch/powerpc/platforms/pseries/of_helpers.c
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#define pr_fmt(fmt) "drc: " fmt
+
 #include 
 #include 
 #include 
@@ -11,6 +13,12 @@
 
#define MAX_DRC_NAME_LEN 64
 
+static int drc_debug;
+#define dbg(args...) if (drc_debug) { printk(KERN_DEBUG args); }
+#define err(args...) printk(KERN_ERR args);
+#define info(args...) printk(KERN_INFO args);
+#define warn(args...) printk(KERN_WARNING args);
+
 /**
  * pseries_of_derive_parent - basically like dirname(1)
  * @path:  the full_name of a node to be added to the tree
@@ -46,7 +54,8 @@ struct device_node *pseries_of_derive_parent(const char *path)
 
 /* Helper Routines to convert between drc_index to cpu numbers */
 
-int of_read_drc_info_cell(struct property **prop, const __be32 **curval,
+static int of_read_drc_info_cell(struct property **prop,
+   const __be32 **curval,
struct of_drc_info *data)
 {
const char *p;
@@ -90,4 +99,290 @@ int of_read_drc_info_cell(struct property **prop, const 
__be32 **curval,
 
return 0;
 }
-EXPORT_SYMBOL(of_read_drc_info_cell);
+
+static int walk_drc_info(struct device_node *dn,
+   bool (*usercb)(struct of_drc_info *drc,
+   void *data,
+   int *ret_code),
+   char *opt_drc_type,
+   void *data)
+{
+   struct property *info;
+   unsigned int entries;
+   struct of_drc_info drc;
+   const __be32 *value;
+   int j, ret_code = -EINVAL;
+   bool done = false;
+
+   info = of_find_property(dn, "ibm,drc-info", NULL);
+   if (info == NULL)
+   return -EINVAL;
+
+   value = info->value;
+   entries = of_read_number(value++, 1);
+
+   for (j = 0, done = 0; (j < entries) && (!done); j++) {
+   of_read_drc_info_cell(&info, &value, &drc);
+
+   if (opt_drc_type && strcmp(opt_drc_type, drc.drc_type))
+   continue;
+
+   done = usercb(&drc, data, &ret_code);
+   }
+
+   return ret_code;
+}
+
+static int get_children_props(struct device_node *dn, const int **drc_indexes,
+   const int **drc_names, const int **drc_types,
+   const int **drc_power_domains)
+{
+   const int *indexes, *names, *types, *domains;
+
+   indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
+   names = of_get_property(dn, "ibm,drc-names", NULL);
+   types = of_get_property(dn, "ibm,drc-types", NULL);
+   domains = of_get_property(dn, "ibm,drc-power-domains", NULL);
+
+   if (!indexes || !names || !types || !domains) {
+   /* Slot does not have dynamically-removable children */
+   return -EINVAL;
+   }
+   if (drc_indexes)
+   *drc_indexes = indexes;
+   if (drc_names)
+   /* _names[1] contains NULL terminated slot names */
+   *drc_names = names;
+   if (drc_types)
+   /* _types[1] contains NULL terminated slot types */
+   *drc_types = types;
+   if (drc_power_domains)
+   *drc_power_domains = domains;
+
+   return 0;
+}
+
+static int is_php_type(char *drc_type)
+{
+   unsigned long value;
+   char *endptr;
+
+   /* PCI Hotplug nodes have an integer for drc_type */
+   value = simple_strtoul(drc_type, , 10);
+   if (endptr == drc_type)
+   return 0;
+
+   return 1;
+}
+
+/**
+ * is_php_dn() - return 1 if this is a hotpluggable pci slot, else 0
+ * @dn: target &device_node
+ * @indexes: passed to get_children_props()
+ * @names: passed to get_children_props()
+ * @types: returned from g

[RFC 2/6] pseries/drcinfo: Fix bug parsing ibm,drc-info

2018-12-14 Thread Michael Bringmann
Replace the use of of_prop_next_u32() when parsing the 'ibm,drc-info'
structure to simplify and reduce the parsing code.
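
Editor's note, for context (the entry layout below is inferred from
the parsing code in this patch and the rest of the series): each
ibm,drc-info entry consists of

	drc-type			(string)
	drc-name-prefix			(string)
	drc-index-start			(encode-int)
	drc-name-suffix-start		(encode-int)
	number-sequential-elements	(encode-int)
	sequential-increment		(encode-int)
	drc-power-domain		(encode-int)

Once the two leading strings have been skipped, the remaining five
cells sit back to back, so of_read_number(p2++, 1) can consume each of
them directly instead of stepping with of_prop_next_u32().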

Signed-off-by: Michael Bringmann 
Fixes: 3f38000eda48 ("powerpc/firmware: Add definitions for new drc-info 
firmware feature" -- end of patch series applied to powerpc next)
---
 arch/powerpc/platforms/pseries/of_helpers.c |   24 +---
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/of_helpers.c 
b/arch/powerpc/platforms/pseries/of_helpers.c
index 6df192f..0185e50 100644
--- a/arch/powerpc/platforms/pseries/of_helpers.c
+++ b/arch/powerpc/platforms/pseries/of_helpers.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+
 #include 
 #include 
 #include 
@@ -6,6 +7,9 @@
 #include 
 
 #include "of_helpers.h"
+#include "pseries.h"
+
+#define MAX_DRC_NAME_LEN 64
 
 /**
  * pseries_of_derive_parent - basically like dirname(1)
@@ -65,29 +69,19 @@ int of_read_drc_info_cell(struct property **prop, const 
__be32 **curval,
 
/* Get drc-index-start:encode-int */
p2 = (const __be32 *)p;
-   p2 = of_prop_next_u32(*prop, p2, >drc_index_start);
-   if (!p2)
-   return -EINVAL;
+   data->drc_index_start = of_read_number(p2++, 1);
 
/* Get drc-name-suffix-start:encode-int */
-   p2 = of_prop_next_u32(*prop, p2, >drc_name_suffix_start);
-   if (!p2)
-   return -EINVAL;
+   data->drc_name_suffix_start = of_read_number(p2++, 1);
 
/* Get number-sequential-elements:encode-int */
-   p2 = of_prop_next_u32(*prop, p2, >num_sequential_elems);
-   if (!p2)
-   return -EINVAL;
+   data->num_sequential_elems = of_read_number(p2++, 1);
 
/* Get sequential-increment:encode-int */
-   p2 = of_prop_next_u32(*prop, p2, >sequential_inc);
-   if (!p2)
-   return -EINVAL;
+   data->sequential_inc = of_read_number(p2++, 1);
 
/* Get drc-power-domain:encode-int */
-   p2 = of_prop_next_u32(*prop, p2, >drc_power_domain);
-   if (!p2)
-   return -EINVAL;
+   data->drc_power_domain = of_read_number(p2++, 1);
 
/* Should now know end of current entry */
(*curval) = (void *)p2;



[RFC 1/6] powerpc:/drc Define interface to acquire arch-specific drc info

2018-12-14 Thread Michael Bringmann
Define interface to acquire arch-specific drc info to match against
hotpluggable devices.  The current implementation exposes several
pseries-specific dynamic memory properties in generic kernel code.
This patch set provides an interface to pull that code out of the
generic kernel.

Signed-off-by: Michael Bringmann 
---
 include/linux/topology.h |9 +
 1 file changed, 9 insertions(+)

diff --git a/include/linux/topology.h b/include/linux/topology.h
index cb0775e..df97f5f 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -44,6 +44,15 @@
 
 int arch_update_cpu_topology(void);
 
+int arch_find_drc_match(struct device_node *dn,
+   bool (*usercb)(struct device_node *dn,
+   u32 drc_index, char *drc_name,
+   char *drc_type, u32 drc_power_domain,
+   void *data),
+   char *opt_drc_type, char *opt_drc_name,
+   bool match_drc_index, bool ck_php_type,
+   void *data);
+
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE 10
 #define REMOTE_DISTANCE20



[RFC 0/6] powerpc/pseries: Refactor code to centralize drcinfo parsing

2018-12-14 Thread Michael Bringmann
The pseries-specific drc info feature is currently implemented both
within and outside of the powerpc pseries code.  This patch set moves
the parsing code for the pseries-specific device-tree properties
ibm,drc-indexes, ibm,drc-names, ibm,drc-types, ibm,drc-power-domains,
and the compressed ibm,drc-info into the pseries platform-specific
code.

Signed-off-by: Michael W. Bringmann 

Michael Bringmann (6):
  powerpc:/drc Define interface to acquire arch-specific drc info
  pseries/drcinfo: Fix bug parsing ibm,drc-info
  pseries/drcinfo: Pseries impl of arch_find_drc_info
  powerpc/pseries: Use common drcinfo parsing
  powerpc/pci/hotplug: Use common drcinfo parsing
  powerpc: Enable support for ibm,drc-info devtree property



Re: [PATCH] pseries/hotplug: Add more delay in pseries_cpu_die while waiting for rtas-stop

2018-12-11 Thread Michael Bringmann
Note from Scott Mayes on latest crash:

Michael,

Since the partition crashed, I was able to get the last .2 seconds worth of 
RTAS call trace leading up to the crash.

Best I could tell from that bit of trace was that the removal of a processor 
involved the following steps:
-- Call to stop-self for a given thread
-- Repeated calls to query-cpu-stopped-state (which eventually indicated the 
thread was stopped)
-- Call to get-sensor-state for the thread to check its entity-state (9003) 
sensor which returned 'dr-entity-present'
-- Call to set-indicator to set the isolation-state (9001) indicator to ISOLATE 
state
-- Call to set-indicator to set the allocation-state (9003) indicator to 
UNUSABLE state

I noticed one example of thread x28 getting through all of these steps just 
fine, but for thread x20, although the
query-cpu-stopped-state returned 0 status (STOPPED), a subsequent call to 
set-indicator to ISOLATE
failed.  This failure was near the end of the trace, but was not the very last 
RTAS call made in the trace.
The set-indicator failure reported to Linux was a -9001 (Valid outstanding 
translation) which was mapped
from a 0x502 (Invalid thread state) return code from PHYP's H_SET_DR_STATE 
h-call.

On 12/10/2018 02:31 PM, Thiago Jung Bauermann wrote:
> 
> Hello Michael,
> 
> Michael Bringmann  writes:
> 
>> I have asked Scott Mayes to take a look at one of these crashes from
>> the phyp side.  I will let you know if he finds anything notable.
> 
> Thanks! It might make sense to test whether booting with
> cede_offline=off makes the bug go away.

Scott is looking at the system.  I will try once he is finished.

> 
> One suspicion I have is regarding the code handling CPU_STATE_INACTIVE.
> From what I understand, it is a powerpc-specific CPU state and from the
> perspective of the generic CPU hotplug state machine, inactive CPUs are
> already fully offline. Which means that the locking performed by the
> generic code state machine doesn't apply to transitioning CPUs from
> INACTIVE to OFFLINE state. Perhaps the bug is that there is more than
> one CPU making that transition at the same time? That would cause two
> CPUs to call RTAS stop-self.
> 
> I haven't checked whether this is really possible or not, though. It's
> just a conjecture.

Michael

> 
> --
> Thiago Jung Bauermann
> IBM Linux Technology Center
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



[RFC 3/3] powerpc/numa: Apply mapping between HW and kernel cpus

2018-12-11 Thread Michael Bringmann
Apply new interface to map external powerpc cpus across multiple
nodes to a range of kernel cpu values.  Mapping is intended to
prevent confusion within the kernel about the cpu+node mapping, and
the changes in configuration that may happen due to powerpc LPAR
migration or other associativity changes during the lifetime of a
system.  These interfaces exchange the thread_index provided by the
'ibm,ppc-interrupt-server#s' properties, for an internal index to
be used by kernel scheduling interfaces.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/mm/numa.c   |   45 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c |   15 +++--
 2 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 7d6bba264..59d7cd9 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1063,7 +1063,8 @@ u64 memory_hotplug_max(void)
 
 struct topology_update_data {
struct topology_update_data *next;
-   unsigned int cpu;
+   unsigned int old_cpu;
+   unsigned int new_cpu;
int old_nid;
int new_nid;
 };
@@ -1253,13 +1254,13 @@ static int update_cpu_topology(void *data)
 
for (update = data; update; update = update->next) {
int new_nid = update->new_nid;
-   if (cpu != update->cpu)
+   if (cpu != update->new_cpu)
continue;
 
-   unmap_cpu_from_node(cpu);
-   map_cpu_to_node(cpu, new_nid);
-   set_cpu_numa_node(cpu, new_nid);
-   set_cpu_numa_mem(cpu, local_memory_node(new_nid));
+   unmap_cpu_from_node(update->old_cpu);
+   map_cpu_to_node(update->new_cpu, new_nid);
+   set_cpu_numa_node(update->new_cpu, new_nid);
+   set_cpu_numa_mem(update->new_cpu, local_memory_node(new_nid));
vdso_getcpu_init();
}
 
@@ -1283,7 +1284,7 @@ static int update_lookup_table(void *data)
int nid, base, j;
 
nid = update->new_nid;
-   base = cpu_first_thread_sibling(update->cpu);
+   base = cpu_first_thread_sibling(update->new_cpu);
 
for (j = 0; j < threads_per_core; j++) {
update_numa_cpu_lookup_table(base + j, nid);
@@ -1305,7 +1306,7 @@ int numa_update_cpu_topology(bool cpus_locked)
struct topology_update_data *updates, *ud;
cpumask_t updated_cpus;
struct device *dev;
-   int weight, new_nid, i = 0;
+   int weight, new_nid, i = 0, ii;
 
if (!prrn_enabled && !vphn_enabled && topology_inited)
return 0;
@@ -1349,12 +1350,16 @@ int numa_update_cpu_topology(bool cpus_locked)
continue;
}
 
+   ii = 0;
for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
ud = &updates[i++];
ud->next = &updates[i];
-   ud->cpu = sibling;
ud->new_nid = new_nid;
ud->old_nid = numa_cpu_lookup_table[sibling];
+   ud->old_cpu = sibling;
+   ud->new_cpu = cpuremap_map_cpu(
+   get_hard_smp_processor_id(sibling),
+   ii++, new_nid);
cpumask_set_cpu(sibling, &updated_cpus);
}
cpu = cpu_last_thread_sibling(cpu);
@@ -1370,9 +1375,10 @@ int numa_update_cpu_topology(bool cpus_locked)
pr_debug("Topology update for the following CPUs:\n");
if (cpumask_weight(&updated_cpus)) {
for (ud = &updates[0]; ud; ud = ud->next) {
-   pr_debug("cpu %d moving from node %d "
- "to %d\n", ud->cpu,
- ud->old_nid, ud->new_nid);
+   pr_debug("cpu %d, node %d moving to"
+" cpu %d, node %d\n",
+ud->old_cpu, ud->old_nid,
+ud->new_cpu, ud->new_nid);
}
}
 
@@ -1409,13 +1415,20 @@ int numa_update_cpu_topology(bool cpus_locked)
 cpumask_of(raw_smp_processor_id()));
 
for (ud = &updates[0]; ud; ud = ud->next) {
-   unregister_cpu_under_node(ud->cpu, ud->old_nid);
-   register_cpu_under_node(ud->cpu, ud->new_nid);
+   unregister_cpu_under_node(ud->old_cpu, ud->old_nid);
+   register_cpu_under_node(ud->new_cpu, ud->new_nid);
 
-   dev = get_cpu_device(ud->cpu);
+   dev = get_cpu_device(ud->old_cpu);
if (dev)
kobject_uevent(&dev->kobj, KOBJ_CHANGE);
-

[RFC 2/3] powerpc/numa: Define mapping between HW and kernel cpus

2018-12-11 Thread Michael Bringmann
Define interface to map external powerpc cpus across multiple nodes
to a range of kernel cpu values.  Mapping is intended to prevent
confusion within the kernel about the cpu+node mapping, and the
changes in configuration that may happen due to powerpc LPAR
migration or other associativity changes during the lifetime of a
system.  These interfaces will be used entirely within the powerpc
kernel code to maintain separation between the machine and kernel
contexts.
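
For illustration only (an editor's sketch, not part of this patch; the
function name and the -ENODEV error convention are made up), a hotplug
path holding a hardware thread index read from
ibm,ppc-interrupt-server#s might use the proposed interface roughly
like this:

static int example_pick_kernel_cpu(int thread_index, int in_core_ndx,
				   int node)
{
	int cpu;

	/* Reuse an existing mapping if this hw thread was seen before */
	cpu = cpuremap_thread_to_cpu(thread_index);
	if (cpu == CPUREMAP_NO_CPU)
		cpu = cpuremap_map_cpu(thread_index, in_core_ndx, node);
	if (cpu == CPUREMAP_NO_CPU)
		return -ENODEV;

	/* Mark the kernel cpu as in use before onlining it */
	if (cpuremap_reserve_cpu(cpu) == CPUREMAP_NO_CPU)
		return -ENODEV;

	return cpu;
}

With CONFIG_CPUREMAP=n the stub versions below make all of these calls
identity mappings, so such a caller behaves exactly as it does today.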

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/topology.h   |   31 +++
 arch/powerpc/platforms/pseries/Kconfig|   10 ++
 arch/powerpc/platforms/pseries/Makefile   |1 
 arch/powerpc/platforms/pseries/cpuremap.c |  131 +
 4 files changed, 173 insertions(+)
 create mode 100644 arch/powerpc/platforms/pseries/cpuremap.c

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index 4621f40..db11969 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -135,5 +135,36 @@ static inline void shared_proc_topology_init(void) {}
 #endif
 #endif
 
+#define CPUREMAP_NO_CPU (~0)
+#define CPUREMAP_NO_THREAD (~0)
+
+#ifdef CONFIG_CPUREMAP
+extern int cpuremap_thread_to_cpu(int thread_index);
+   /* Return CPUREMAP_NO_CPU if not found */
+extern int cpuremap_map_cpu(int thread_index, int in_core_ndx, int node);
+   /* Return CPUREMAP_NO_CPU if fails */
+extern int cpuremap_reserve_cpu(int cpu);
+   /* Return CPUREMAP_NO_CPU if fails */
+extern int cpuremap_release_cpu(int cpu);
+   /* Return CPUREMAP_NO_CPU if fails */
+extern int cpuremap_cpu_to_thread(int cpu);
+   /* Return CPUREMAP_NO_THREAD if not found */
+extern void cpuremap_init(void);
+   /* Identify necessary constants & alloc memory at boot */
+#else
+static inline int cpuremap_thread_to_cpu(int thread_index)
+{
+   return thread_index;
+}
+static inline int cpuremap_map_cpu(int thread_index, int in_core_ndx, int node)
+{
+   return thread_index;
+}
+static inline int cpuremap_reserve_cpu(int cpu) { return cpu; }
+static inline int cpuremap_release_cpu(int cpu) { return cpu; }
+static inline int cpuremap_cpu_to_thread(int cpu) { return cpu; }
+static inline void cpuremap_init(void) {}
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index 2e4bd32..c35009f 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -145,3 +145,13 @@ config PAPR_SCM
tristate "Support for the PAPR Storage Class Memory interface"
help
  Enable access to hypervisor provided storage class memory.
+  Enable access to hypervisor provided storage class memory.
+
+config CPUREMAP
+bool "Support for mapping hw cpu+node to kernel index"
+depends on SMP && (PPC_PSERIES)
+---help---
+  Say Y here to be able to remap hw cpu+node to standardized
+  kernel CPUs at runtime on Pseries machines.
+
+  Say N if you are unsure.
diff --git a/arch/powerpc/platforms/pseries/Makefile 
b/arch/powerpc/platforms/pseries/Makefile
index a43ec84..ad49d8e 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_KEXEC_CORE)  += kexec.o
 obj-$(CONFIG_PSERIES_ENERGY)   += pseries_energy.o
 
 obj-$(CONFIG_HOTPLUG_CPU)  += hotplug-cpu.o
+obj-$(CONFIG_CPUREMAP) += cpuremap.o
 obj-$(CONFIG_MEMORY_HOTPLUG)   += hotplug-memory.o pmem.o
 
 obj-$(CONFIG_HVC_CONSOLE)  += hvconsole.o
diff --git a/arch/powerpc/platforms/pseries/cpuremap.c 
b/arch/powerpc/platforms/pseries/cpuremap.c
new file mode 100644
index 000..86fdf12
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/cpuremap.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct cpuremap_cpu {
+   int thread_index;
+   /* Set to thread_index from ibm,ppc-interrupt-server#s arrays
+* Don't clear when release'ed
+*/
+   int node;
+   bool in_use;
+   /* Set to true when reserve'ed
+* Don't clear when release'ed
+   */
+};
+
+struct cpuremap_struct {
+   int num_nodes;
+   int num_cores;
+   int num_threads_per_core;
+   struct cpuremap_cpu *threads;
+} cpuremap_data;
+
+
+void cpuremap_init(void)
+{
+   int i, k;
+
+   /* Identify necessary constants & alloc memory at boot */
+   cpuremap_data.num_threads_per_core = 8;
+   cpuremap_data.num_cores = 32;
+   cpuremap_data.num_nodes =
+   nr_cpu_ids /
+   (cpuremap_data.num_threads_per_core * cpuremap_data.num_cores);
+   cpuremap_data.

[RFC 1/3] powerpc/numa: Conditionally online new nodes

2018-12-11 Thread Michael Bringmann
Add an argument to allow the caller to determine whether the node
identified for a cpu after an associativity / affinity change should
be initialized.
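
For illustration (editor's note, not part of the patch): a caller that
only wants the prospective node for a cpu, without forcing that node
online, would pass false for the new argument:

	new_nid = find_and_online_cpu_nid(cpu, false);

while the existing hotplug and topology-update paths continue to pass
true, as in the hunks below.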

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/topology.h  |2 +-
 arch/powerpc/mm/numa.c   |6 +++---
 arch/powerpc/platforms/pseries/hotplug-cpu.c |2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index a4a718d..4621f40 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -90,7 +90,7 @@ static inline void update_numa_cpu_lookup_table(unsigned int 
cpu, int node) {}
 extern int start_topology_update(void);
 extern int stop_topology_update(void);
 extern int prrn_is_enabled(void);
-extern int find_and_online_cpu_nid(int cpu);
+extern int find_and_online_cpu_nid(int cpu, bool must_online);
 extern int timed_topology_update(int nsecs);
 extern void __init shared_proc_topology_init(void);
 #else
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 87f0dd0..7d6bba264 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1197,7 +1197,7 @@ static long vphn_get_associativity(unsigned long cpu,
return rc;
 }
 
-int find_and_online_cpu_nid(int cpu)
+int find_and_online_cpu_nid(int cpu, bool must_online)
 {
__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
int new_nid;
@@ -1210,7 +1210,7 @@ int find_and_online_cpu_nid(int cpu)
if (new_nid < 0 || !node_possible(new_nid))
new_nid = first_online_node;
 
-   if (NODE_DATA(new_nid) == NULL) {
+   if (must_online && (NODE_DATA(new_nid) == NULL)) {
 #ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Need to ensure that NODE_DATA is initialized for a node from
@@ -1337,7 +1337,7 @@ int numa_update_cpu_topology(bool cpus_locked)
continue;
}
 
-   new_nid = find_and_online_cpu_nid(cpu);
+   new_nid = find_and_online_cpu_nid(cpu, true);
 
if (new_nid == numa_cpu_lookup_table[cpu]) {
cpumask_andnot(_associativity_changes_mask,
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 2f8e621..620cb57 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -366,7 +366,7 @@ static int dlpar_online_cpu(struct device_node *dn)
!= CPU_STATE_OFFLINE);
cpu_maps_update_done();
timed_topology_update(1);
-   find_and_online_cpu_nid(cpu);
+   find_and_online_cpu_nid(cpu, true);
rc = device_online(get_cpu_device(cpu));
if (rc)
goto out;



[RFC 0/3] powerpc/pseries: Remap hw to kernel cpu indexes

2018-12-11 Thread Michael Bringmann
Define and apply new interface to map hardware-specific powerpc cpu
ids to a kernel specific range of cpu values.  Mapping is intended
to prevent confusion within the kernel about the cpu+node mapping,
and the changes in configuration that may happen due to powerpc LPAR
migration or other associativity changes during the lifetime of a
system.  These interfaces exchange the thread_index provided by the
'ibm,ppc-interrupt-server#s' properties, for an internal index to
be used by kernel scheduling interfaces.

Signed-off-by: Michael Bringmann 

Michael Bringmann (3):
  powerpc/numa: Conditionally online new nodes
  powerpc/numa: Define mapping between HW and kernel cpus
  powerpc/numa: Apply mapping between HW and kernel cpu



Re: [PATCH v03] powerpc/mobility: Fix node detach/rename problem

2018-12-11 Thread Michael Bringmann
--- Snip ---

>>
>> The mobility.c code continues on during the second migration, accepts
>> the definitions of the new nodes from the PHYP and ends up renaming
>> the new properties e.g.
>>
>> [ 4565.827296] Duplicate name in base, renamed to "ibm,platform-facilities#1"
>>
>> There is no check like 'of_node_check_flag(np, OF_DETACHED)' within
>> of_find_node_by_phandle to skip nodes that are detached, but still
>> present due to caching or use count considerations.  Also, note that
>> of_find_node_by_phandle also uses a 'phandle_cache' which does not
>> appear to be updated when of_detach_node() is invoked.
> 
> This seems like the real bug. Since the phandle cache was added we can
> now find detached nodes when we shouldn't be able to.
> 
> Does the patch below work?
> 
> cheers
> 
> diff --git a/drivers/of/base.c b/drivers/of/base.c
> index 09692c9b32a7..d8e4534c0686 100644
> --- a/drivers/of/base.c
> +++ b/drivers/of/base.c
> @@ -1190,6 +1190,10 @@ struct device_node *of_find_node_by_phandle(phandle 
> handle)
>   if (phandle_cache[masked_handle] &&
>   handle == phandle_cache[masked_handle]->phandle)
>   np = phandle_cache[masked_handle];
> +
> + /* If we find a detached node, remove it */
> + if (of_node_check_flag(np, OF_DETACHED))
> + np = phandle_cache[masked_handle] = NULL;
>   }
> 
>   if (!np) {
> 
> 

I think this would be a bit better for cases where masked values overlap:

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 09692c9..ec79129 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1188,8 +1188,13 @@ struct device_node *of_find_node_by_phandle(phandle 
handle)
 
if (phandle_cache) {
if (phandle_cache[masked_handle] &&
-   handle == phandle_cache[masked_handle]->phandle)
-   np = phandle_cache[masked_handle];
+   handle == phandle_cache[masked_handle]->phandle) {
+   np = phandle_cache[masked_handle];
+
+   /* If we find a detached node, remove it */
+   if (of_node_check_flag(np, OF_DETACHED))
+   np = phandle_cache[masked_handle] = NULL;
+   }
}
 
if (!np) {


Will try it out.  Wouldn't it be better to do this when the node is detached
in drivers/of/dynamic.c:__of_detach_node()?
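
An editor's sketch of that alternative (hypothetical and untested; the
helper name is invented, and the phandle_cache array and mask variable
are assumed to be reachable from drivers/of/base.c the same way they
are in the lookup code quoted above):

/* drivers/of/base.c */
static void of_free_phandle_cache_entry(phandle handle)
{
	phandle masked_handle;

	if (!phandle_cache || !handle)
		return;

	masked_handle = handle & phandle_cache_mask;
	if (phandle_cache[masked_handle] &&
	    phandle_cache[masked_handle]->phandle == handle)
		phandle_cache[masked_handle] = NULL;
}

__of_detach_node() (or of_detach_node()) would then call it with
np->phandle when the node is marked OF_DETACHED.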

Thanks.
Michael

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [PATCH v03] powerpc/mobility: Fix node detach/rename problem

2018-12-11 Thread Michael Bringmann



On 12/11/2018 07:29 AM, Michael Ellerman wrote:
> Hi Michael,
> 
> Please Cc the device tree folks on device tree patches, and also the
> original author of the patch that added the code you're modifying.
> 
> So I've added:
>   robh...@kernel.org
>   frowand.l...@gmail.com
>   devicet...@vger.kernel.org
>   linux-ker...@vger.kernel.org

Thanks.

> 
> Michael Bringmann  writes:
>> The PPC mobility code receives RTAS requests to delete nodes with
>> platform-/hardware-specific attributes when restarting the kernel
>> after a migration.  My example is for migration between a P8 Alpine
>> and a P8 Brazos.   Nodes to be deleted include 'ibm,random-v1',
>> 'ibm,platform-facilities', 'ibm,sym-encryption-v1', and,
>> 'ibm,compression-v1'.
>>
>> The mobility.c code calls 'of_detach_node' for the nodes and their
>> children.  This makes calls to detach the properties and to remove
>> the associated sysfs/kernfs files.
>>
>> Then new copies of the same nodes are next provided by the PHYP,
>> local copies are built, and a pointer to the 'struct device_node'
>> is passed to of_attach_node.  Before the call to of_attach_node,
>> the phandle is initialized to 0 when the data structure is alloced.
>> During the call to of_attach_node, it calls __of_attach_node which
>> pulls the actual name and phandle from just created sub-properties
>> named something like 'name' and 'ibm,phandle'.
>>
>> This is all fine for the first migration.  The problem occurs with
>> the second and subsequent migrations when the PHYP on the new system
>> wants to replace the same set of nodes again, referenced with the
>> same names and phandle values.
>>
>> On the second and subsequent migrations, the PHYP tells the system
>> to again delete the nodes 'ibm,platform-facilities', 'ibm,random-v1',
>> 'ibm,compression-v1', 'ibm,sym-encryption-v1'.  It specifies these
>> nodes by its known set of phandle values -- the same handles used
>> by the PHYP on the source system are known on the target system.
>> The mobility.c code calls of_find_node_by_phandle() with these values
>> and ends up locating the first instance of each node that was added
>> during the original boot, instead of the second instance of each node
>> created after the first migration.  The detach during the second
>> migration fails with errors like,
>>
>> [ 4565.030704] WARNING: CPU: 3 PID: 4787 at drivers/of/dynamic.c:252 
>> __of_detach_node+0x8/0xa0
>> [ 4565.030708] Modules linked in: nfsv3 nfs_acl nfs tcp_diag udp_diag 
>> inet_diag unix_diag af_packet_diag netlink_diag lockd grace fscache sunrpc 
>> xts vmx_crypto sg pseries_rng binfmt_misc ip_tables xfs libcrc32c sd_mod 
>> ibmveth ibmvscsi scsi_transport_srp dm_mirror dm_region_hash dm_log dm_mod
>> [ 4565.030733] CPU: 3 PID: 4787 Comm: drmgr Tainted: GW 
>> 4.18.0-rc1-wi107836-v05-120+ #201
>> [ 4565.030737] NIP:  c07c1ea8 LR: c07c1fb4 CTR: 
>> 00655170
>> [ 4565.030741] REGS: c003f302b690 TRAP: 0700   Tainted: GW   
>>(4.18.0-rc1-wi107836-v05-120+)
>> [ 4565.030745] MSR:  80010282b033 
>>   CR: 22288822  XER: 000a
>> [ 4565.030757] CFAR: c07c1fb0 IRQMASK: 1
>> [ 4565.030757] GPR00: c07c1fa4 c003f302b910 c114bf00 
>> c0038e68
>> [ 4565.030757] GPR04: 0001  80c008e0b4b8 
>> 
>> [ 4565.030757] GPR08:  0001 8003 
>> 2843
>> [ 4565.030757] GPR12: 8800 c0001ec9ae00 4000 
>> 
>> [ 4565.030757] GPR16:  0008  
>> f6ff
>> [ 4565.030757] GPR20: 0007  c003e9f1f034 
>> 0001
>> [ 4565.030757] GPR24:    
>> 
>> [ 4565.030757] GPR28: c1549d28 c1134828 c0038e68 
>> c003f302b930
>> [ 4565.030804] NIP [c07c1ea8] __of_detach_node+0x8/0xa0
>> [ 4565.030808] LR [c07c1fb4] of_detach_node+0x74/0xd0
>> [ 4565.030811] Call Trace:
>> [ 4565.030815] [c003f302b910] [c07c1fa4] 
>> of_detach_node+0x64/0xd0 (unreliable)
>> [ 4565.030821] [c003f302b980] [c00c33c4] 
>> dlpar_detach_node+0xb4/0x150
>> [ 4565.030826] [c003f302ba10] [c00c3ffc] delete_dt_node+0x3c/0x80
>> [ 4565.030831] [c003f302ba40] [c00c4380] 
>> pseries_devicetree_update+0x150/0x4f0
>> [ 4565.030

[PATCH v03] powerpc/mobility: Fix node detach/rename problem

2018-12-10 Thread Michael Bringmann
erties e.g.

[ 4565.827296] Duplicate name in base, renamed to "ibm,platform-facilities#1"

There is no check like 'of_node_check_flag(np, OF_DETACHED)' within
of_find_node_by_phandle to skip nodes that are detached, but still
present due to caching or use count considerations.  Also, note that
of_find_node_by_phandle also uses a 'phandle_cache' which does not
appear to be updated when of_detach_node() is invoked.

We don't appear to have anything that invalidates the phandle_cache
when a node is removed.

The right solution may be for __of_detach_node() to invalidate
phandle_cache for the node being detached.  Alternatively, we can
manually invalidate / rebuild the phandle_cache at the point of
LPAR migration.  The latter solution is presented here.

Signed-off-by: Michael Bringmann 
---
Changes in v03:
  -- Move private prototypes of phandle cache build functions to public
 header file.
---
 arch/powerpc/platforms/pseries/mobility.c |4 
 drivers/of/of_private.h   |2 --
 include/linux/of.h|3 +++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/mobility.c 
b/arch/powerpc/platforms/pseries/mobility.c
index 2f78890..7da222d 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -341,6 +341,8 @@ void post_mobility_fixup(void)
if (rc)
printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc);
 
+   of_free_phandle_cache();
+
rc = pseries_devicetree_update(MIGRATION_SCOPE);
if (rc)
printk(KERN_ERR "Post-mobility device tree update "
@@ -349,6 +351,8 @@ void post_mobility_fixup(void)
/* Possibly switch to a new RFI flush type */
pseries_setup_rfi_flush();
 
+   of_populate_phandle_cache();
+
return;
 }
 
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index 216175d..891d780 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -79,8 +79,6 @@ static inline void __of_detach_node_sysfs(struct device_node 
*np) {}
 #if defined(CONFIG_OF_OVERLAY)
 void of_overlay_mutex_lock(void);
 void of_overlay_mutex_unlock(void);
-int of_free_phandle_cache(void);
-void of_populate_phandle_cache(void);
 #else
 static inline void of_overlay_mutex_lock(void) {};
 static inline void of_overlay_mutex_unlock(void) {};
diff --git a/include/linux/of.h b/include/linux/of.h
index 99b0ebf..482fc52 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -1441,4 +1441,7 @@ static inline int of_overlay_notifier_unregister(struct 
notifier_block *nb)
 
 #endif
 
+int of_free_phandle_cache(void);
+void of_populate_phandle_cache(void);
+
 #endif /* _LINUX_OF_H */



Re: [PATCH] pseries/hotplug: Add more delay in pseries_cpu_die while waiting for rtas-stop

2018-12-10 Thread Michael Bringmann
I have asked Scott Mayes to take a look at one of these crashes from
the phyp side.  I will let you know if he finds anything notable.

Michael

On 12/07/2018 08:40 PM, Thiago Jung Bauermann wrote:
> 
> Gautham R Shenoy  writes:
>> On Fri, Dec 07, 2018 at 04:13:11PM +0530, Gautham R Shenoy wrote:
>>> Sure. I will test the patch and report back.
>>
>> I added the following debug patch on top of your patch, and after an
>> hour's run, the system crashed. Appending the log at the end.
> 
> Thank you very much for testing! Your debug patch was very helpful as
> well.
> 
>> I suppose we still need to increase the number of tries since we wait
>> only for 2.5ms looping before giving up.
> 
> Do you think it would have helped? In the debug output you posted I
> would have expected the following message to show up if the loop
> finished too early, and it didn't:
> 
> "Querying DEAD? cpu %i (%i) shows %i\n"
> 
> So I don't think increasing the loop length would have made a
> difference. In fact, the call to smp_query_cpu_stopped() always
> succeeded on the first iteration.
> 
> I think there is something else going on which we don't fully understand
> yet. From your other email:
> 
>> I agree that the Kernel has to respect RTAS's restriction. The PAPR
>> v2.8.1, Requirement R1-7.2.3-8 under section 7.2.3 says the following:
>>
>> "The stop-self service needs to be serialized with calls to the
>>  stop-self, start-cpu, and set-power-level services. The OS must
>>  be able to call RTAS services on other processors while the
>>  processor is stopped or being stopped"
>>
>> Thus the onus is on the OS to ensure that there are no concurrent rtas
>> calls with "stop-self" token.
> 
> As you say perhaps there's another call to stop-self, start-cpu or
> set-power-level being made concurrently. I don't currently see how more
> than one stop-self or start-cpu call could be in flight at the same time
> given that there are a number of locks being grabbed during CPU hotplug
> and unplug. OTOH the CPU that actually calls stop-self doesn't seem to
> grab any locks itself so it's a possibility.
> 
> As for set-power-level, it's only used in the case of PCI hotplug from
> what I can see, and that isn't part of the picture in this case, right?
> 
> We could address this problem directly by adding another lock separate
> from rtas.lock to serialize just these calls. The challenge is with
> stop-self, because the CPU calling it will never return to release the
> lock. Is it possible to grab a lock (or down a semaphore) in the CPU
> calling stop-self and then release the lock (or up the semaphore) in the
> CPU running pseries_cpu_die()?
> 
>>> There's also a race between the CPU driving the unplug and the CPU
>>> being unplugged which I think is not easy for the CPU being
>>> unplugged to win, which makes the busy loop in pseries_cpu_die() a
>>> bit fragile. I describe the race in the patch description.
>>>
>>> My solution to make the race less tight is to make the CPU driving
>>> the unplug to only start the busy loop only after the CPU being
>>> unplugged is in the CPU_STATE_OFFLINE state. At that point, we know
>>> that it either is about to call RTAS or it already has.
>>
>> Ah, yes this is good optimization. Though, I think we ought to
>> unconditionally wait until the target CPU has woken up from CEDE and
>> changed its state to CPU_STATE_OFFLINE. After if PROD failed, then we
>> would have caught it in dlpar_offline_cpu() itself.
> 
> I recently saw a QEMU-implemented hcall (H_LOGICAL_CI_LOAD) return
> success when it had been given an invalid memory address to load from,
> so my confidence in the error reporting of hcalls is a bit shaken. :-)
> 
> In that case the CPU would wait forever for the CPU state to change. If
> you believe 100 ms is too short a timeout, we could make it 500 ms or
> even 1s. What do you think?
> 
>> cpu 112 (hwid 112) Ready to die...
>> [DEBUG] Waited for CPU 112 to enter rtas: tries=0, time=65
>> cpu 113 (hwid 113) Ready to die...
>> [DEBUG] Waited for CPU 113 to enter rtas: tries=0, time=1139
>> cpu 114 (hwid 114) Ready to die...
>> [DEBUG] Waited for CPU 114 to enter rtas: tries=0, time=1036
>> cpu 115 (hwid 115) Ready to die...
>> [DEBUG] Waited for CPU 115 to enter rtas: tries=0, time=133
>> cpu 116 (hwid 116) Ready to die...
>> [DEBUG] Waited for CPU 116 to enter rtas: tries=0, time=1231
>> cpu 117 (hwid 117) Ready to die...
>> [DEBUG] Waited for CPU 117 to enter rtas: tries=0, time=1231
>> cpu 118 (hwid 118) Ready to die...
>> [DEBUG] Waited for CPU 118 to enter rtas: tries=0, time=1231
>> cpu 119 (hwid 119) Ready to die...
>> [DEBUG] Waited for CPU 119 to enter rtas: tries=0, time=1131
>> cpu 104 (hwid 104) Ready to die...
>> [DEBUG] Waited for CPU 104 to enter rtas: tries=0, time=40
> 
> Interesting, so 1.2 ms can pass before the dying CPU actually gets close
> to making the stop-self call. And even in those cases the retry loop is
> succeeding on the first try! So this shows 

[PATCH] powerpc/numa: Ensure nodes are inited for mem changes

2018-11-27 Thread Michael Bringmann
This patch fixes problems encountered at runtime where memory
affinity changes reference nodes that were not initialized during
system execution after boot.  We now initialize a node description
that may subsequently be used for memory or CPUs, so that it is not
referenced while still invalid.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/mm/numa.c |5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index d9109e0..f3714fa 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1026,6 +1026,11 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
pr_debug("new nid %d for %#010lx\n", nid, scn_addr);
}
 
+   if (NODE_DATA(nid) == NULL) {
+   if (try_online_node(nid))
+   nid = first_online_node;
+   }
+
return nid;
 }
 



[PATCH 4/4] powerpc/pseries: Relocate drmem.h to pseries

2018-11-27 Thread Michael Bringmann
The pseries-specific dynamic memory features are currently implemented
in several non-pseries-specific files.  This patch set moves the
device-tree parsing code for the properties ibm,dynamic-memory and
ibm,dynamic-memory-v2, and their in-kernel representation, into the
pseries platform-specific directory.

This patch moves drmem.h from arch/powerpc/include/asm to
arch/powerpc/platforms/pseries, and fixes the include file references
in the pseries files.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/drmem.h|  107 ---
 arch/powerpc/platforms/pseries/drmem.c  |2 
 arch/powerpc/platforms/pseries/drmem.h  |  107 +++
 arch/powerpc/platforms/pseries/hotplug-memory.c |2 
 arch/powerpc/platforms/pseries/lparcfg.c|2 
 5 files changed, 110 insertions(+), 110 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/drmem.h
 create mode 100644 arch/powerpc/platforms/pseries/drmem.h

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
deleted file mode 100644
index 7c1d8e7..000
--- a/arch/powerpc/include/asm/drmem.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * drmem.h: Power specific logical memory block representation
- *
- * Copyright 2017 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _ASM_POWERPC_LMB_H
-#define _ASM_POWERPC_LMB_H
-
-struct drmem_lmb {
-   u64 base_addr;
-   u32 drc_index;
-   u32 aa_index;
-   u32 flags;
-};
-
-struct drmem_lmb_info {
-   struct drmem_lmb*lmbs;
-   int n_lmbs;
-   u32 lmb_size;
-};
-
-extern struct drmem_lmb_info *drmem_info;
-
-#define for_each_drmem_lmb_in_range(lmb, start, end)   \
-   for ((lmb) = (start); (lmb) <= (end); (lmb)++)
-
-#define for_each_drmem_lmb(lmb)\
-   for_each_drmem_lmb_in_range((lmb),  \
-   &drmem_info->lmbs[0],   \
-   &drmem_info->lmbs[drmem_info->n_lmbs - 1])
-
-/*
- * The of_drconf_cell_v1 struct defines the layout of the LMB data
- * specified in the ibm,dynamic-memory device tree property.
- * The property itself is a 32-bit value specifying the number of
- * LMBs followed by an array of of_drconf_cell_v1 entries, one
- * per LMB.
- */
-struct of_drconf_cell_v1 {
-   __be64  base_addr;
-   __be32  drc_index;
-   __be32  reserved;
-   __be32  aa_index;
-   __be32  flags;
-};
-
-/*
- * Version 2 of the ibm,dynamic-memory property is defined as a
- * 32-bit value specifying the number of LMB sets followed by an
- * array of of_drconf_cell_v2 entries, one per LMB set.
- */
-struct of_drconf_cell_v2 {
-   u32 seq_lmbs;
-   u64 base_addr;
-   u32 drc_index;
-   u32 aa_index;
-   u32 flags;
-} __packed;
-
-#define DRCONF_MEM_ASSIGNED	0x00000008
-#define DRCONF_MEM_AI_INVALID	0x00000040
-#define DRCONF_MEM_RESERVED	0x00000080
-
-static inline u32 drmem_lmb_size(void)
-{
-   return drmem_info->lmb_size;
-}
-
-#define DRMEM_LMB_RESERVED	0x80000000
-
-static inline void drmem_mark_lmb_reserved(struct drmem_lmb *lmb)
-{
-   lmb->flags |= DRMEM_LMB_RESERVED;
-}
-
-static inline void drmem_remove_lmb_reservation(struct drmem_lmb *lmb)
-{
-   lmb->flags &= ~DRMEM_LMB_RESERVED;
-}
-
-static inline bool drmem_lmb_reserved(struct drmem_lmb *lmb)
-{
-   return lmb->flags & DRMEM_LMB_RESERVED;
-}
-
-u64 drmem_lmb_memory_max(void);
-void __init walk_drmem_lmbs(struct device_node *dn,
-   void (*func)(struct drmem_lmb *, const __be32 **));
-int drmem_update_dt(void);
-
-#ifdef CONFIG_PPC_PSERIES
-void __init walk_drmem_lmbs_early(unsigned long node,
-   void (*func)(struct drmem_lmb *, const __be32 **));
-#endif
-
-static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb)
-{
-   lmb->aa_index = 0x;
-}
-
-#endif /* _ASM_POWERPC_LMB_H */
diff --git a/arch/powerpc/platforms/pseries/drmem.c 
b/arch/powerpc/platforms/pseries/drmem.c
index 01ac651..a52f10e 100644
--- a/arch/powerpc/platforms/pseries/drmem.c
+++ b/arch/powerpc/platforms/pseries/drmem.c
@@ -17,7 +17,7 @@
 #include 
 #include 
 #include 
-#include <asm/drmem.h>
+#include "drmem.h"
 
 static struct drmem_lmb_info __drmem_info;
 struct drmem_lmb_info *drmem_info = &__drmem_info;
diff --git a/arch/powerpc/platforms/pseries/drmem.h 
b/arch/powerpc/platforms/pseries/drmem.h
new file mode 100644
index 000..7c1d8e7
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/drmem.h
@@ -0,0 +1,107 

[PATCH 4/4] powerpc/pseries: Move DRMEM processing out of numa.c

2018-11-27 Thread Michael Bringmann
The pseries-specific dynamic memory features are currently implemented
in several non-pseries-specific files.  This patch set moves the
device-tree parsing code for the properties ibm,dynamic-memory and
ibm,dynamic-memory-v2, and their in-kernel representation, into the
pseries platform-specific directory.

This patch refactors references to drmem features out of numa.c, so
that they can be moved to drmem.c.  Changes include exporting a few
support functions from numa.c via powerpc/include/asm/topology.h, and
the creation of platform function platform_parse_numa_properties that
any powerpc platform may implement.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/topology.h|   13 +
 arch/powerpc/mm/numa.c |  238 +++
 arch/powerpc/platforms/pseries/drmem.c |  330 
 3 files changed, 329 insertions(+), 252 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index a4a718d..0c1ad7e 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -135,5 +135,18 @@ static inline void shared_proc_topology_init(void) {}
 #endif
 #endif
 
+extern unsigned long numa_enforce_memory_limit(unsigned long start,
+   unsigned long size);
+extern void initialize_distance_lookup_table(int nid,
+   const __be32 *associativity);
+extern int fake_numa_create_new_node(unsigned long end_pfn,
+   unsigned int *nid);
+
+struct assoc_arrays {
+   u32 n_arrays;
+   u32 array_sz;
+   const __be32 *arrays;
+};
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 3a048e9..6c982df 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -39,7 +39,6 @@
 #include 
 #include 
 #include 
-#include 
 
 static int numa_enabled = 1;
 
@@ -87,8 +86,8 @@ static void __init setup_node_to_cpumask_map(void)
dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
 }
 
-static int __init fake_numa_create_new_node(unsigned long end_pfn,
-   unsigned int *nid)
+int __init fake_numa_create_new_node(unsigned long end_pfn,
+   unsigned int *nid)
 {
unsigned long long mem;
char *p = cmdline;
@@ -194,7 +193,7 @@ int __node_distance(int a, int b)
 }
 EXPORT_SYMBOL(__node_distance);
 
-static void initialize_distance_lookup_table(int nid,
+void initialize_distance_lookup_table(int nid,
const __be32 *associativity)
 {
int i;
@@ -209,6 +208,7 @@ static void initialize_distance_lookup_table(int nid,
distance_lookup_table[nid][i] = of_read_number(entry, 1);
}
 }
+EXPORT_SYMBOL(initialize_distance_lookup_table);
 
 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
  * info is found.
@@ -356,98 +356,6 @@ static void __init get_n_mem_cells(int *n_addr_cells, int 
*n_size_cells)
of_node_put(memory);
 }
 
-static unsigned long read_n_cells(int n, const __be32 **buf)
-{
-   unsigned long result = 0;
-
-   while (n--) {
-   result = (result << 32) | of_read_number(*buf, 1);
-   (*buf)++;
-   }
-   return result;
-}
-
-struct assoc_arrays {
-   u32 n_arrays;
-   u32 array_sz;
-   const __be32 *arrays;
-};
-
-/*
- * Retrieve and validate the list of associativity arrays for drconf
- * memory from the ibm,associativity-lookup-arrays property of the
- * device tree..
- *
- * The layout of the ibm,associativity-lookup-arrays property is a number N
- * indicating the number of associativity arrays, followed by a number M
- * indicating the size of each associativity array, followed by a list
- * of N associativity arrays.
- */
-static int of_get_assoc_arrays(struct assoc_arrays *aa)
-{
-   struct device_node *memory;
-   const __be32 *prop;
-   u32 len;
-
-   memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-   if (!memory)
-   return -1;
-
-   prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
-   if (!prop || len < 2 * sizeof(unsigned int)) {
-   of_node_put(memory);
-   return -1;
-   }
-
-   aa->n_arrays = of_read_number(prop++, 1);
-   aa->array_sz = of_read_number(prop++, 1);
-
-   of_node_put(memory);
-
-   /* Now that we know the number of arrays and size of each array,
-* revalidate the size of the property read in.
-*/
-   if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
-   return -1;
-
-   aa->arrays = prop;
-   return 0;
-}
-
-/*
- * This is like of_n

[PATCH 2/4] powerpc/pseries: Move DRMEM processing out of prom.c

2018-11-27 Thread Michael Bringmann
The pseries-specific dynamic memory features are currently implemented
in several non-pseries-specific files.  This patch set moves the
device-tree parsing code for the properties ibm,dynamic-memory and
ibm,dynamic-memory-v2, and their in-kernel representation, into the
pseries platform-specific directory.

This patch refactors references to drmem features out of prom.c, so
that they can be moved to drmem.c.  Changes include creating a
platform function, platform_early_init_dt_scan_memory_ppc, that any
powerpc platform may implement, and moving a support function to
arch/powerpc/include/asm/sparsemem.h.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/platform.h|   23 
 arch/powerpc/include/asm/prom.h|3 +
 arch/powerpc/include/asm/sparsemem.h   |   19 +++
 arch/powerpc/kernel/prom.c |   90 +---
 arch/powerpc/platforms/pseries/drmem.c |   73 ++
 5 files changed, 122 insertions(+), 86 deletions(-)
 create mode 100644 arch/powerpc/include/asm/platform.h

diff --git a/arch/powerpc/include/asm/platform.h 
b/arch/powerpc/include/asm/platform.h
new file mode 100644
index 000..36f0f9e
--- /dev/null
+++ b/arch/powerpc/include/asm/platform.h
@@ -0,0 +1,23 @@
+#ifndef _POWERPC_PLATFORM_H
+#define _POWERPC_PLATFORM_H
+#ifdef __KERNEL__
+
+/*
+ * Definitions for talking to the Platform-specific functions of PowerPC
+ *
+ * Copyright (C) 2018 Michael Bringmann, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include 
+
+/* Memory initialization support */
+extern int platform_early_init_dt_scan_memory_ppc(unsigned long node,
+   const char *uname,
+   int depth, void *data);
+
+#endif /* __KERNEL__ */
+#endif /* _POWERPC_PLATFORM_H */
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index b04c5ce..4504773 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -180,5 +180,8 @@ extern int of_read_drc_info_cell(struct property **prop,
 /* Option Vector 6: IBM PAPR hints */
#define OV6_LINUX  0x02    /* Linux is our OS */
 
+/* Other functions */
+extern bool validate_mem_limit(u64 base, u64 *size);
+
 #endif /* __KERNEL__ */
 #endif /* _POWERPC_PROM_H */
diff --git a/arch/powerpc/include/asm/sparsemem.h 
b/arch/powerpc/include/asm/sparsemem.h
index 68da493..25edfc2 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -32,5 +32,24 @@ static inline int hot_add_scn_to_nid(unsigned long scn_addr)
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
+
+#ifdef CONFIG_SPARSEMEM
+static inline bool validate_mem_limit(u64 base, u64 *size) 
+{
+   u64 max_mem = 1UL << (MAX_PHYSMEM_BITS);
+
+   if (base >= max_mem)
+   return false;
+   if ((base + *size) > max_mem)
+   *size = max_mem - base;
+   return true;
+}
+#else
+static inline bool validate_mem_limit(u64 base, u64 *size) 
+{
+   return true;
+}
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_SPARSEMEM_H */
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index fe758ce..ea32fee 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -58,7 +58,7 @@
 #include 
 #include 
 #include 
-#include <asm/drmem.h>
+#include <asm/platform.h>
 
 #include 
 
@@ -444,96 +444,14 @@ static int __init early_init_dt_scan_chosen_ppc(unsigned 
long node,
  * size if it cross the limit.
  */
 
-#ifdef CONFIG_SPARSEMEM
-static bool validate_mem_limit(u64 base, u64 *size)
-{
-   u64 max_mem = 1UL << (MAX_PHYSMEM_BITS);
-
-   if (base >= max_mem)
-   return false;
-   if ((base + *size) > max_mem)
-   *size = max_mem - base;
-   return true;
-}
-#else
-static bool validate_mem_limit(u64 base, u64 *size)
-{
-   return true;
-}
-#endif
-
-#ifdef CONFIG_PPC_PSERIES
-/*
- * Interpret the ibm dynamic reconfiguration memory LMBs.
- * This contains a list of memory blocks along with NUMA affinity
- * information.
- */
-static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
-   const __be32 **usm)
-{
-   u64 base, size;
-   int is_kexec_kdump = 0, rngs;
-
-   base = lmb->base_addr;
-   size = drmem_lmb_size();
-   rngs = 1;
-
-   /*
-* Skip this block if the reserved bit is set in flags
-* or if the block is not assigned to this partition.
-*/
-   if ((lmb->flags & DRCONF_MEM_RESERVED) ||
-   !(lmb->flags & DRCONF_MEM_ASSIGNED))
-   return;
-
-  

[PATCH 1/4] powerpc/pseries: Relocate drmem.c to pseries

2018-11-27 Thread Michael Bringmann
The pseries-specific dynamic memory features are currently implemented
in several non-pseries-specific files.  This patch set moves the
device-tree parsing code for the properties ibm,dynamic-memory and
ibm,dynamic-memory-v2, and their in-kernel representation, into the
pseries platform-specific directory.

This patch moves drmem.c from arch/powerpc/mm to
arch/powerpc/platforms/pseries.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/mm/Makefile|2 
 arch/powerpc/mm/drmem.c |  447 ---
 arch/powerpc/platforms/pseries/Makefile |3 
 arch/powerpc/platforms/pseries/drmem.c  |  447 +++
 4 files changed, 450 insertions(+), 449 deletions(-)
 delete mode 100644 arch/powerpc/mm/drmem.c
 create mode 100644 arch/powerpc/platforms/pseries/drmem.c

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index ca96e7b..06281e0f 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -9,7 +9,7 @@ CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
 
 obj-y  := fault.o mem.o pgtable.o mmap.o \
   init_$(BITS).o pgtable_$(BITS).o \
-  init-common.o mmu_context.o drmem.o
+  init-common.o mmu_context.o
 obj-$(CONFIG_PPC_MMU_NOHASH)   += mmu_context_nohash.o tlb_nohash.o \
   tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E)   += tlb_low_$(BITS)e.o
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
deleted file mode 100644
index 3f18036..000
--- a/arch/powerpc/mm/drmem.c
+++ /dev/null
@@ -1,447 +0,0 @@
-/*
- * Dynamic reconfiguration memory support
- *
- * Copyright 2017 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define pr_fmt(fmt) "drmem: " fmt
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-static struct drmem_lmb_info __drmem_info;
-struct drmem_lmb_info *drmem_info = &__drmem_info;
-
-u64 drmem_lmb_memory_max(void)
-{
-   struct drmem_lmb *last_lmb;
-
-   last_lmb = &drmem_info->lmbs[drmem_info->n_lmbs - 1];
-   return last_lmb->base_addr + drmem_lmb_size();
-}
-
-static u32 drmem_lmb_flags(struct drmem_lmb *lmb)
-{
-   /*
-* Return the value of the lmb flags field minus the reserved
-* bit used internally for hotplug processing.
-*/
-   return lmb->flags & ~DRMEM_LMB_RESERVED;
-}
-
-static struct property *clone_property(struct property *prop, u32 prop_sz)
-{
-   struct property *new_prop;
-
-   new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
-   if (!new_prop)
-   return NULL;
-
-   new_prop->name = kstrdup(prop->name, GFP_KERNEL);
-   new_prop->value = kzalloc(prop_sz, GFP_KERNEL);
-   if (!new_prop->name || !new_prop->value) {
-   kfree(new_prop->name);
-   kfree(new_prop->value);
-   kfree(new_prop);
-   return NULL;
-   }
-
-   new_prop->length = prop_sz;
-#if defined(CONFIG_OF_DYNAMIC)
-   of_property_set_flag(new_prop, OF_DYNAMIC);
-#endif
-   return new_prop;
-}
-
-static int drmem_update_dt_v1(struct device_node *memory,
- struct property *prop)
-{
-   struct property *new_prop;
-   struct of_drconf_cell_v1 *dr_cell;
-   struct drmem_lmb *lmb;
-   u32 *p;
-
-   new_prop = clone_property(prop, prop->length);
-   if (!new_prop)
-   return -1;
-
-   p = new_prop->value;
-   *p++ = cpu_to_be32(drmem_info->n_lmbs);
-
-   dr_cell = (struct of_drconf_cell_v1 *)p;
-
-   for_each_drmem_lmb(lmb) {
-   dr_cell->base_addr = cpu_to_be64(lmb->base_addr);
-   dr_cell->drc_index = cpu_to_be32(lmb->drc_index);
-   dr_cell->aa_index = cpu_to_be32(lmb->aa_index);
-   dr_cell->flags = cpu_to_be32(drmem_lmb_flags(lmb));
-
-   dr_cell++;
-   }
-
-   of_update_property(memory, new_prop);
-   return 0;
-}
-
-static void init_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
-   struct drmem_lmb *lmb)
-{
-   dr_cell->base_addr = cpu_to_be64(lmb->base_addr);
-   dr_cell->drc_index = cpu_to_be32(lmb->drc_index);
-   dr_cell->aa_index = cpu_to_be32(lmb->aa_index);
-   dr_cell->flags = cpu_to_be32(drmem_lmb_flags(lmb));
-}
-
-static int drmem_update_dt_v2(struct device_node *memory,
- struct property *prop)
-{
-   struct property *new_prop;
-   struct of_drconf_cell_v2 *d

[PATCH 0/4] powerpc/pseries: Refactor code to centralize drmem feature

2018-11-27 Thread Michael Bringmann
The implementation of the pseries-specific dynamic memory features
is currently implemented in several non-pseries-specific files.
This patch set moves the implementation of the device-tree parsing
code for the properties ibm,dynamic-memory, ibm,dynamic-memory-v2,
and its representation in the kernel into the platform-specific
directory for the Pseries features.

Signed-off-by: Michael Bringmann 

Michael Bringmann (4):
  powerpc/pseries: Relocate drmem.c to pseries
  powerpc/pseries: Move DRMEM processing out of prom.c
  powerpc/pseries: Move DRMEM processing out of numa.c
  powerpc/pseries: Relocate drmem.h to pseries



Re: [Internal Review PATCH] powerpc/pseries: Refactor code to centralize drmem feature

2018-11-27 Thread Michael Bringmann
Wrong Subject.  Will repost.

On 11/26/2018 02:36 PM, Michael Bringmann wrote:
> The implementation of the pseries-specific dynamic memory features
> is currently implemented in several non-pseries-specific files.
> This patch set moves the implementation of the device-tree parsing
> code for the properties ibm,dynamic-memory, ibm,dynamic-memory-v2,
> and its representation in the kernel into the platform-specific
> directory for the Pseries features.
> 
> Signed-off-by: Michael Bringmann 
> 
> Michael Bringmann (4):
>   powerpc/pseries: Relocate drmem.c to pseries
>   powerpc/pseries: Move DRMEM processing out of prom.c
>   powerpc/pseries: Move DRMEM processing out of numa.c
>   powerpc/pseries: Relocate drmem.h to pseries
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [Internal Review PATCH] powerpc/pseries: Remap hw to kernel cpu indexes

2018-11-27 Thread Michael Bringmann
This should have been posted as RFC.  Will repost.

On 11/26/2018 02:33 PM, Michael Bringmann wrote:
> Define and apply new interface to map hardware-specific powerpc cpu
> ids to a kernel specific range of cpu values.  Mapping is intended
> to prevent confusion within the kernel about the cpu+node mapping,
> and the changes in configuration that may happen due to powerpc LPAR
> migration or other associativity changes during the lifetime of a
> system.  These interfaces exchange the thread_index provided by the
> 'ibm,ppc-interrupt-server#s' properties, for an internal index to
> be used by kernel scheduling interfaces.
> 
> Signed-off-by: Michael Bringmann 
> 
> Michael Bringmann (3):
>   powerpc/numa: Conditionally online new nodes
>   powerpc/numa: Define mapping between HW and kernel cpus
>   powerpc/numa: Apply mapping between HW and kernel cpu
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



[Internal Review PATCH] powerpc/pseries: Relocate drmem.h to pseries

2018-11-26 Thread Michael Bringmann
The implementation of the pseries-specific dynamic memory features
is currently implemented in several non-pseries-specific files.
This patch set moves the implementation of the device-tree parsing
code for the properties ibm,dynamic-memory, ibm,dynamic-memory-v2,
and its representation in the kernel into the platform-specific
directory for the Pseries features.

This patch moves drmem.h from directory arch/powerpc/include/asm to
arch/powerpc/platforms/pseries, and fixes include file references
in pseries files.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/drmem.h|  107 ---
 arch/powerpc/platforms/pseries/drmem.c  |2 
 arch/powerpc/platforms/pseries/drmem.h  |  107 +++
 arch/powerpc/platforms/pseries/hotplug-memory.c |2 
 arch/powerpc/platforms/pseries/lparcfg.c|2 
 5 files changed, 110 insertions(+), 110 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/drmem.h
 create mode 100644 arch/powerpc/platforms/pseries/drmem.h

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
deleted file mode 100644
index 7c1d8e7..000
--- a/arch/powerpc/include/asm/drmem.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * drmem.h: Power specific logical memory block representation
- *
- * Copyright 2017 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _ASM_POWERPC_LMB_H
-#define _ASM_POWERPC_LMB_H
-
-struct drmem_lmb {
-   u64 base_addr;
-   u32 drc_index;
-   u32 aa_index;
-   u32 flags;
-};
-
-struct drmem_lmb_info {
-   struct drmem_lmb*lmbs;
-   int n_lmbs;
-   u32 lmb_size;
-};
-
-extern struct drmem_lmb_info *drmem_info;
-
-#define for_each_drmem_lmb_in_range(lmb, start, end)   \
-   for ((lmb) = (start); (lmb) <= (end); (lmb)++)
-
-#define for_each_drmem_lmb(lmb)\
-   for_each_drmem_lmb_in_range((lmb),  \
-   &drmem_info->lmbs[0],   \
-   &drmem_info->lmbs[drmem_info->n_lmbs - 1])
-
-/*
- * The of_drconf_cell_v1 struct defines the layout of the LMB data
- * specified in the ibm,dynamic-memory device tree property.
- * The property itself is a 32-bit value specifying the number of
- * LMBs followed by an array of of_drconf_cell_v1 entries, one
- * per LMB.
- */
-struct of_drconf_cell_v1 {
-   __be64  base_addr;
-   __be32  drc_index;
-   __be32  reserved;
-   __be32  aa_index;
-   __be32  flags;
-};
-
-/*
- * Version 2 of the ibm,dynamic-memory property is defined as a
- * 32-bit value specifying the number of LMB sets followed by an
- * array of of_drconf_cell_v2 entries, one per LMB set.
- */
-struct of_drconf_cell_v2 {
-   u32 seq_lmbs;
-   u64 base_addr;
-   u32 drc_index;
-   u32 aa_index;
-   u32 flags;
-} __packed;
-
-#define DRCONF_MEM_ASSIGNED 0x0008
-#define DRCONF_MEM_AI_INVALID  0x0040
-#define DRCONF_MEM_RESERVED 0x0080
-
-static inline u32 drmem_lmb_size(void)
-{
-   return drmem_info->lmb_size;
-}
-
-#define DRMEM_LMB_RESERVED 0x8000
-
-static inline void drmem_mark_lmb_reserved(struct drmem_lmb *lmb)
-{
-   lmb->flags |= DRMEM_LMB_RESERVED;
-}
-
-static inline void drmem_remove_lmb_reservation(struct drmem_lmb *lmb)
-{
-   lmb->flags &= ~DRMEM_LMB_RESERVED;
-}
-
-static inline bool drmem_lmb_reserved(struct drmem_lmb *lmb)
-{
-   return lmb->flags & DRMEM_LMB_RESERVED;
-}
-
-u64 drmem_lmb_memory_max(void);
-void __init walk_drmem_lmbs(struct device_node *dn,
-   void (*func)(struct drmem_lmb *, const __be32 **));
-int drmem_update_dt(void);
-
-#ifdef CONFIG_PPC_PSERIES
-void __init walk_drmem_lmbs_early(unsigned long node,
-   void (*func)(struct drmem_lmb *, const __be32 **));
-#endif
-
-static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb)
-{
-   lmb->aa_index = 0xffffffff;
-}
-
-#endif /* _ASM_POWERPC_LMB_H */
diff --git a/arch/powerpc/platforms/pseries/drmem.c 
b/arch/powerpc/platforms/pseries/drmem.c
index 01ac651..a52f10e 100644
--- a/arch/powerpc/platforms/pseries/drmem.c
+++ b/arch/powerpc/platforms/pseries/drmem.c
@@ -17,7 +17,7 @@
 #include 
 #include 
 #include 
-#include 
+#include "drmem.h"
 
 static struct drmem_lmb_info __drmem_info;
 struct drmem_lmb_info *drmem_info = &__drmem_info;
diff --git a/arch/powerpc/platforms/pseries/drmem.h 
b/arch/powerpc/platforms/pseries/drmem.h
new file mode 100644
index 000..7c1d8e7
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/drmem.h
@@ -0,0 +1,107 

[Internal Review PATCH] powerpc/pseries: Move DRMEM processing out of numa.c

2018-11-26 Thread Michael Bringmann
The implementation of the pseries-specific dynamic memory features
is currently implemented in several non-pseries-specific files.
This patch set moves the implementation of the device-tree parsing
code for the properties ibm,dynamic-memory, ibm,dynamic-memory-v2,
and its representation in the kernel into the platform-specific
directory for the Pseries features.

This patch refactors references to drmem features out of numa.c, so
that they can be moved to drmem.c.  Changes include exporting a few
support functions from numa.c via powerpc/include/asm/topology.h, and
the creation of platform function platform_parse_numa_properties that
any powerpc platform may implement.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/topology.h|   13 +
 arch/powerpc/mm/numa.c |  238 +++
 arch/powerpc/platforms/pseries/drmem.c |  330 
 3 files changed, 329 insertions(+), 252 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index a4a718d..0c1ad7e 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -135,5 +135,18 @@ static inline void shared_proc_topology_init(void) {}
 #endif
 #endif
 
+extern unsigned long numa_enforce_memory_limit(unsigned long start,
+   unsigned long size);
+extern void initialize_distance_lookup_table(int nid,
+   const __be32 *associativity);
+extern int fake_numa_create_new_node(unsigned long end_pfn,
+   unsigned int *nid);
+
+struct assoc_arrays {
+   u32 n_arrays;
+   u32 array_sz;
+   const __be32 *arrays;
+};
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 3a048e9..6c982df 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -39,7 +39,6 @@
 #include 
 #include 
 #include 
-#include 
 
 static int numa_enabled = 1;
 
@@ -87,8 +86,8 @@ static void __init setup_node_to_cpumask_map(void)
dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
 }
 
-static int __init fake_numa_create_new_node(unsigned long end_pfn,
-   unsigned int *nid)
+int __init fake_numa_create_new_node(unsigned long end_pfn,
+   unsigned int *nid)
 {
unsigned long long mem;
char *p = cmdline;
@@ -194,7 +193,7 @@ int __node_distance(int a, int b)
 }
 EXPORT_SYMBOL(__node_distance);
 
-static void initialize_distance_lookup_table(int nid,
+void initialize_distance_lookup_table(int nid,
const __be32 *associativity)
 {
int i;
@@ -209,6 +208,7 @@ static void initialize_distance_lookup_table(int nid,
distance_lookup_table[nid][i] = of_read_number(entry, 1);
}
 }
+EXPORT_SYMBOL(initialize_distance_lookup_table);
 
 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
  * info is found.
@@ -356,98 +356,6 @@ static void __init get_n_mem_cells(int *n_addr_cells, int 
*n_size_cells)
of_node_put(memory);
 }
 
-static unsigned long read_n_cells(int n, const __be32 **buf)
-{
-   unsigned long result = 0;
-
-   while (n--) {
-   result = (result << 32) | of_read_number(*buf, 1);
-   (*buf)++;
-   }
-   return result;
-}
-
-struct assoc_arrays {
-   u32 n_arrays;
-   u32 array_sz;
-   const __be32 *arrays;
-};
-
-/*
- * Retrieve and validate the list of associativity arrays for drconf
- * memory from the ibm,associativity-lookup-arrays property of the
- * device tree..
- *
- * The layout of the ibm,associativity-lookup-arrays property is a number N
- * indicating the number of associativity arrays, followed by a number M
- * indicating the size of each associativity array, followed by a list
- * of N associativity arrays.
- */
-static int of_get_assoc_arrays(struct assoc_arrays *aa)
-{
-   struct device_node *memory;
-   const __be32 *prop;
-   u32 len;
-
-   memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-   if (!memory)
-   return -1;
-
-   prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
-   if (!prop || len < 2 * sizeof(unsigned int)) {
-   of_node_put(memory);
-   return -1;
-   }
-
-   aa->n_arrays = of_read_number(prop++, 1);
-   aa->array_sz = of_read_number(prop++, 1);
-
-   of_node_put(memory);
-
-   /* Now that we know the number of arrays and size of each array,
-* revalidate the size of the property read in.
-*/
-   if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
-   return -1;
-
-   aa->arrays = prop;
-   return 0;
-}
-
-/*
- * This is like of_n
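
The deleted comment above spells out the ibm,associativity-lookup-arrays layout: a count N of arrays, a per-array size M, then N rows of M big-endian cells, with an LMB's aa_index selecting one row. As a rough stand-alone illustration of that layout only (not the kernel parser, and with the node id simply taken from the last cell of each row for brevity), a fake property decodes like this:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>          /* ntohl()/htonl(): device-tree cells are big-endian */

struct assoc_arrays {
        uint32_t n_arrays;
        uint32_t array_sz;
        const uint32_t *arrays;
};

int main(void)
{
        /* Fake property: N=2 associativity arrays of M=3 cells each. */
        uint32_t prop[] = {
                htonl(2), htonl(3),
                htonl(0), htonl(1), htonl(1),   /* row selected by aa_index 0 */
                htonl(0), htonl(2), htonl(2),   /* row selected by aa_index 1 */
        };
        struct assoc_arrays aa = {
                .n_arrays = ntohl(prop[0]),
                .array_sz = ntohl(prop[1]),
                .arrays   = &prop[2],
        };

        /* Same length check as of_get_assoc_arrays(): 2 + N*M cells expected. */
        if (sizeof(prop) < (aa.n_arrays * aa.array_sz + 2) * sizeof(uint32_t))
                return 1;

        for (uint32_t i = 0; i < aa.n_arrays; i++)
                printf("aa_index %u -> node %u\n", i,
                       ntohl(aa.arrays[i * aa.array_sz + aa.array_sz - 1]));
        return 0;
}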

[Internal Review PATCH] powerpc/pseries: Move DRMEM processing out of prom.c

2018-11-26 Thread Michael Bringmann
The implementation of the pseries-specific dynamic memory features
is currently implemented in several non-pseries-specific files.
This patch set moves the implementation of the device-tree parsing
code for the properties ibm,dynamic-memory, ibm,dynamic-memory-v2,
and its representation in the kernel into the platform-specific
directory for the Pseries features.

This patch refactors references to drmem features out of prom.c, so
that they can be moved to drmem.c.  Changes include creating a
platform function platform_early_init_dt_scan_memory_ppc that any
powerpc platform may implement, and moving a support function to
powerpc/include/asm/sparsemem.h

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/platform.h|   23 
 arch/powerpc/include/asm/prom.h|3 +
 arch/powerpc/include/asm/sparsemem.h   |   19 +++
 arch/powerpc/kernel/prom.c |   90 +---
 arch/powerpc/platforms/pseries/drmem.c |   73 ++
 5 files changed, 122 insertions(+), 86 deletions(-)
 create mode 100644 arch/powerpc/include/asm/platform.h

diff --git a/arch/powerpc/include/asm/platform.h 
b/arch/powerpc/include/asm/platform.h
new file mode 100644
index 000..36f0f9e
--- /dev/null
+++ b/arch/powerpc/include/asm/platform.h
@@ -0,0 +1,23 @@
+#ifndef _POWERPC_PLATFORM_H
+#define _POWERPC_PLATFORM_H
+#ifdef __KERNEL__
+
+/*
+ * Definitions for talking to the Platform-specific functions of PowerPC
+ *
+ * Copyright (C) 2018 Michael Bringmann, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include 
+
+/* Memory initialization support */
+extern int platform_early_init_dt_scan_memory_ppc(unsigned long node,
+   const char *uname,
+   int depth, void *data);
+
+#endif /* __KERNEL__ */
+#endif /* _POWERPC_PLATFORM_H */
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index b04c5ce..4504773 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -180,5 +180,8 @@ extern int of_read_drc_info_cell(struct property **prop,
 /* Option Vector 6: IBM PAPR hints */
 #define OV6_LINUX  0x02/* Linux is our OS */
 
+/* Other functions */
+extern bool validate_mem_limit(u64 base, u64 *size);
+
 #endif /* __KERNEL__ */
 #endif /* _POWERPC_PROM_H */
diff --git a/arch/powerpc/include/asm/sparsemem.h 
b/arch/powerpc/include/asm/sparsemem.h
index 68da493..25edfc2 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -32,5 +32,24 @@ static inline int hot_add_scn_to_nid(unsigned long scn_addr)
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
+
+#ifdef CONFIG_SPARSEMEM
+static inline bool validate_mem_limit(u64 base, u64 *size) 
+{
+   u64 max_mem = 1UL << (MAX_PHYSMEM_BITS);
+
+   if (base >= max_mem)
+   return false;
+   if ((base + *size) > max_mem)
+   *size = max_mem - base;
+   return true;
+}
+#else
+static inline bool validate_mem_limit(u64 base, u64 *size) 
+{
+   return true;
+}
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_SPARSEMEM_H */
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index fe758ce..ea32fee 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -58,7 +58,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #include 
 
@@ -444,96 +444,14 @@ static int __init early_init_dt_scan_chosen_ppc(unsigned 
long node,
  * size if it cross the limit.
  */
 
-#ifdef CONFIG_SPARSEMEM
-static bool validate_mem_limit(u64 base, u64 *size)
-{
-   u64 max_mem = 1UL << (MAX_PHYSMEM_BITS);
-
-   if (base >= max_mem)
-   return false;
-   if ((base + *size) > max_mem)
-   *size = max_mem - base;
-   return true;
-}
-#else
-static bool validate_mem_limit(u64 base, u64 *size)
-{
-   return true;
-}
-#endif
-
-#ifdef CONFIG_PPC_PSERIES
-/*
- * Interpret the ibm dynamic reconfiguration memory LMBs.
- * This contains a list of memory blocks along with NUMA affinity
- * information.
- */
-static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
-   const __be32 **usm)
-{
-   u64 base, size;
-   int is_kexec_kdump = 0, rngs;
-
-   base = lmb->base_addr;
-   size = drmem_lmb_size();
-   rngs = 1;
-
-   /*
-* Skip this block if the reserved bit is set in flags
-* or if the block is not assigned to this partition.
-*/
-   if ((lmb->flags & DRCONF_MEM_RESERVED) ||
-   !(lmb->flags & DRCONF_MEM_ASSIGNED))
-   return;
-
-  
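
The pseries half of this patch (the drmem.c hunks that provide platform_early_init_dt_scan_memory_ppc) is cut off by the archive. One plausible shape for the delegation, shown purely as an assumption and not as the posted code, is a weak default in common code that the platform overrides:

/* Sketch only, not the posted patch: platforms without a DRMEM-aware scan
 * fall back to the plain FDT memory walk. */
int __weak platform_early_init_dt_scan_memory_ppc(unsigned long node,
                                                  const char *uname,
                                                  int depth, void *data)
{
        return early_init_dt_scan_memory(node, uname, depth, data);
}

/* The pseries definition (in platforms/pseries/drmem.c) would then handle
 * ibm,dynamic-reconfiguration-memory itself and call the common scan for
 * ordinary memory nodes. */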

[Internal Review PATCH] powerpc/pseries: Relocate drmem.c to pseries

2018-11-26 Thread Michael Bringmann
The implementation of the pseries-specific dynamic memory features
is currently implemented in several non-pseries-specific files.
This patch set moves the implementation of the device-tree parsing
code for the properties ibm,dynamic-memory, ibm,dynamic-memory-v2,
and its representation in the kernel into the platform-specific
directory for the Pseries features.

This patch moves drmem.c from kernel directory arch/powerpc/mm to
powerpc/platforms/pseries.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/mm/Makefile|2 
 arch/powerpc/mm/drmem.c |  447 ---
 arch/powerpc/platforms/pseries/Makefile |3 
 arch/powerpc/platforms/pseries/drmem.c  |  447 +++
 4 files changed, 450 insertions(+), 449 deletions(-)
 delete mode 100644 arch/powerpc/mm/drmem.c
 create mode 100644 arch/powerpc/platforms/pseries/drmem.c

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index ca96e7b..06281e0f 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -9,7 +9,7 @@ CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
 
 obj-y  := fault.o mem.o pgtable.o mmap.o \
   init_$(BITS).o pgtable_$(BITS).o \
-  init-common.o mmu_context.o drmem.o
+  init-common.o mmu_context.o
 obj-$(CONFIG_PPC_MMU_NOHASH)   += mmu_context_nohash.o tlb_nohash.o \
   tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E)   += tlb_low_$(BITS)e.o
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
deleted file mode 100644
index 3f18036..000
--- a/arch/powerpc/mm/drmem.c
+++ /dev/null
@@ -1,447 +0,0 @@
-/*
- * Dynamic reconfiguration memory support
- *
- * Copyright 2017 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#define pr_fmt(fmt) "drmem: " fmt
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-static struct drmem_lmb_info __drmem_info;
-struct drmem_lmb_info *drmem_info = &__drmem_info;
-
-u64 drmem_lmb_memory_max(void)
-{
-   struct drmem_lmb *last_lmb;
-
-   last_lmb = &drmem_info->lmbs[drmem_info->n_lmbs - 1];
-   return last_lmb->base_addr + drmem_lmb_size();
-}
-
-static u32 drmem_lmb_flags(struct drmem_lmb *lmb)
-{
-   /*
-* Return the value of the lmb flags field minus the reserved
-* bit used internally for hotplug processing.
-*/
-   return lmb->flags & ~DRMEM_LMB_RESERVED;
-}
-
-static struct property *clone_property(struct property *prop, u32 prop_sz)
-{
-   struct property *new_prop;
-
-   new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
-   if (!new_prop)
-   return NULL;
-
-   new_prop->name = kstrdup(prop->name, GFP_KERNEL);
-   new_prop->value = kzalloc(prop_sz, GFP_KERNEL);
-   if (!new_prop->name || !new_prop->value) {
-   kfree(new_prop->name);
-   kfree(new_prop->value);
-   kfree(new_prop);
-   return NULL;
-   }
-
-   new_prop->length = prop_sz;
-#if defined(CONFIG_OF_DYNAMIC)
-   of_property_set_flag(new_prop, OF_DYNAMIC);
-#endif
-   return new_prop;
-}
-
-static int drmem_update_dt_v1(struct device_node *memory,
- struct property *prop)
-{
-   struct property *new_prop;
-   struct of_drconf_cell_v1 *dr_cell;
-   struct drmem_lmb *lmb;
-   u32 *p;
-
-   new_prop = clone_property(prop, prop->length);
-   if (!new_prop)
-   return -1;
-
-   p = new_prop->value;
-   *p++ = cpu_to_be32(drmem_info->n_lmbs);
-
-   dr_cell = (struct of_drconf_cell_v1 *)p;
-
-   for_each_drmem_lmb(lmb) {
-   dr_cell->base_addr = cpu_to_be64(lmb->base_addr);
-   dr_cell->drc_index = cpu_to_be32(lmb->drc_index);
-   dr_cell->aa_index = cpu_to_be32(lmb->aa_index);
-   dr_cell->flags = cpu_to_be32(drmem_lmb_flags(lmb));
-
-   dr_cell++;
-   }
-
-   of_update_property(memory, new_prop);
-   return 0;
-}
-
-static void init_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
-   struct drmem_lmb *lmb)
-{
-   dr_cell->base_addr = cpu_to_be64(lmb->base_addr);
-   dr_cell->drc_index = cpu_to_be32(lmb->drc_index);
-   dr_cell->aa_index = cpu_to_be32(lmb->aa_index);
-   dr_cell->flags = cpu_to_be32(drmem_lmb_flags(lmb));
-}
-
-static int drmem_update_dt_v2(struct device_node *memory,
- struct property *prop)
-{
-   struct property *new_prop;
-   struct of_drconf_cell_v2 *d

[Internal Review PATCH] powerpc/pseries: Refactor code to centralize drmem feature

2018-11-26 Thread Michael Bringmann
The implementation of the pseries-specific dynamic memory features
is currently implemented in several non-pseries-specific files.
This patch set moves the implementation of the device-tree parsing
code for the properties ibm,dynamic-memory, ibm,dynamic-memory-v2,
and its representation in the kernel into the platform-specific
directory for the Pseries features.

Signed-off-by: Michael Bringmann 

Michael Bringmann (4):
  powerpc/pseries: Relocate drmem.c to pseries
  powerpc/pseries: Move DRMEM processing out of prom.c
  powerpc/pseries: Move DRMEM processing out of numa.c
  powerpc/pseries: Relocate drmem.h to pseries



[Internal Review PATCH 2/3] powerpc/numa: Define mapping between HW and kernel cpus

2018-11-26 Thread Michael Bringmann
Define interface to map external powerpc cpus across multiple nodes
to a range of kernel cpu values.  Mapping is intended to prevent
confusion within the kernel about the cpu+node mapping, and the
changes in configuration that may happen due to powerpc LPAR
migration or other associativity changes during the lifetime of a
system.  These interfaces will be used entirely within the powerpc
kernel code to maintain separation between the machine and kernel
contexts.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/topology.h   |   31 +++
 arch/powerpc/platforms/pseries/Kconfig|   10 ++
 arch/powerpc/platforms/pseries/Makefile   |1 
 arch/powerpc/platforms/pseries/cpuremap.c |  131 +
 4 files changed, 173 insertions(+)
 create mode 100644 arch/powerpc/platforms/pseries/cpuremap.c

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index 4621f40..db11969 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -135,5 +135,36 @@ static inline void shared_proc_topology_init(void) {}
 #endif
 #endif
 
+#define CPUREMAP_NO_CPU (~0)
+#define CPUREMAP_NO_THREAD (~0)
+
+#ifdef CONFIG_CPUREMAP
+extern int cpuremap_thread_to_cpu(int thread_index);
+   /* Return CPUREMAP_NO_CPU if not found */
+extern int cpuremap_map_cpu(int thread_index, int in_core_ndx, int node);
+   /* Return CPUREMAP_NO_CPU if fails */
+extern int cpuremap_reserve_cpu(int cpu);
+   /* Return CPUREMAP_NO_CPU if fails */
+extern int cpuremap_release_cpu(int cpu);
+   /* Return CPUREMAP_NO_CPU if fails */
+extern int cpuremap_cpu_to_thread(int cpu);
+   /* Return CPUREMAP_NO_THREAD if not found */
+extern void cpuremap_init(void);
+   /* Identify necessary constants & alloc memory at boot */
+#else
+static inline int cpuremap_thread_to_cpu(int thread_index)
+{
+   return thread_index;
+}
+static inline int cpuremap_map_cpu(int thread_index, int in_core_ndx, int node)
+{
+   return thread_index;
+}
+static inline int cpuremap_reserve_cpu(int cpu) { return cpu; }
+static inline int cpuremap_release_cpu(int cpu) { return cpu; }
+static inline int cpuremap_cpu_to_thread(int cpu) { return cpu; }
+static inline void cpuremap_init(void) {}
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index 2e4bd32..c35009f 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -145,3 +145,13 @@ config PAPR_SCM
tristate "Support for the PAPR Storage Class Memory interface"
help
  Enable access to hypervisor provided storage class memory.
+  Enable access to hypervisor provided storage class memory.
+
+config CPUREMAP
+bool "Support for mapping hw cpu+node to kernel index"
+depends on SMP && (PPC_PSERIES)
+---help---
+  Say Y here to be able to remap hw cpu+node to standardized
+  kernel CPUs at runtime on Pseries machines.
+
+  Say N if you are unsure.
diff --git a/arch/powerpc/platforms/pseries/Makefile 
b/arch/powerpc/platforms/pseries/Makefile
index a43ec84..ad49d8e 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_KEXEC_CORE)  += kexec.o
 obj-$(CONFIG_PSERIES_ENERGY)   += pseries_energy.o
 
 obj-$(CONFIG_HOTPLUG_CPU)  += hotplug-cpu.o
+obj-$(CONFIG_CPUREMAP) += cpuremap.o
 obj-$(CONFIG_MEMORY_HOTPLUG)   += hotplug-memory.o pmem.o
 
 obj-$(CONFIG_HVC_CONSOLE)  += hvconsole.o
diff --git a/arch/powerpc/platforms/pseries/cpuremap.c 
b/arch/powerpc/platforms/pseries/cpuremap.c
new file mode 100644
index 000..86fdf12
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/cpuremap.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct cpuremap_cpu {
+   int thread_index;
+   /* Set to thread_index from ibm,ppc-interrupt-server#s arrays
+* Don't clear when release'ed
+*/
+   int node;
+   bool in_use;
+   /* Set to true when reserve'ed
+* Don't clear when release'ed
+   */
+};
+
+struct cpuremap_struct {
+   int num_nodes;
+   int num_cores;
+   int num_threads_per_core;
+   struct cpuremap_cpu *threads;
+} cpuremap_data;
+
+
+void cpuremap_init(void)
+{
+   int i, k;
+
+   /* Identify necessary constants & alloc memory at boot */
+   cpuremap_data.num_threads_per_core = 8;
+   cpuremap_data.num_cores = 32;
+   cpuremap_data.num_nodes =
+   nr_cpu_ids /
+   (cpuremap_data.num_threads_per_core * cpuremap_data.num_cores);
+   cpuremap_data.
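
cpuremap_init() is truncated above. As a plain-C model of the table it sets up (simplified: the in_core_ndx placement policy is dropped, and this is not the posted kernel code), the hardware-thread to kernel-cpu mapping behaves roughly as follows:

#include <stdio.h>

#define NR_CPUS         16
#define CPUREMAP_NO_CPU (~0)

struct cpuremap_cpu {
        int thread_index;       /* hw id from ibm,ppc-interrupt-server#s */
        int node;
        int in_use;
};

static struct cpuremap_cpu threads[NR_CPUS];

/* Hand the first free kernel cpu index to a hardware thread. */
static int cpuremap_map_cpu(int thread_index, int node)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (threads[cpu].in_use)
                        continue;
                threads[cpu].thread_index = thread_index;
                threads[cpu].node = node;
                threads[cpu].in_use = 1;
                return cpu;
        }
        return CPUREMAP_NO_CPU;
}

static int cpuremap_cpu_to_thread(int cpu)
{
        return threads[cpu].in_use ? threads[cpu].thread_index
                                   : CPUREMAP_NO_CPU;
}

int main(void)
{
        /* Sparse hardware ids land on dense kernel cpu numbers 0 and 1. */
        int a = cpuremap_map_cpu(0x20, 1);
        int b = cpuremap_map_cpu(0x28, 3);

        printf("hw 0x20 -> cpu %d, hw 0x28 -> cpu %d\n", a, b);
        printf("cpu %d -> hw 0x%x\n", b, cpuremap_cpu_to_thread(b));
        return 0;
}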

[Internal Review PATCH 3/3] powerpc/numa: Apply mapping between HW and kernel cpus

2018-11-26 Thread Michael Bringmann
Apply new interface to map external powerpc cpus across multiple
nodes to a range of kernel cpu values.  Mapping is intended to
prevent confusion within the kernel about the cpu+node mapping, and
the changes in configuration that may happen due to powerpc LPAR
migration or other associativity changes during the lifetime of a
system.  These interfaces exchange the thread_index provided by the
'ibm,ppc-interrupt-server#s' properties, for an internal index to
be used by kernel scheduling interfaces.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/mm/numa.c   |   45 +-
 arch/powerpc/platforms/pseries/hotplug-cpu.c |   15 +++--
 2 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 460d60f..9825fc9 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1063,7 +1063,8 @@ u64 memory_hotplug_max(void)
 
 struct topology_update_data {
struct topology_update_data *next;
-   unsigned int cpu;
+   unsigned int old_cpu;
+   unsigned int new_cpu;
int old_nid;
int new_nid;
 };
@@ -1253,13 +1254,13 @@ static int update_cpu_topology(void *data)
 
for (update = data; update; update = update->next) {
int new_nid = update->new_nid;
-   if (cpu != update->cpu)
+   if (cpu != update->new_cpu)
continue;
 
-   unmap_cpu_from_node(cpu);
-   map_cpu_to_node(cpu, new_nid);
-   set_cpu_numa_node(cpu, new_nid);
-   set_cpu_numa_mem(cpu, local_memory_node(new_nid));
+   unmap_cpu_from_node(update->old_cpu);
+   map_cpu_to_node(update->new_cpu, new_nid);
+   set_cpu_numa_node(update->new_cpu, new_nid);
+   set_cpu_numa_mem(update->new_cpu, local_memory_node(new_nid));
vdso_getcpu_init();
}
 
@@ -1283,7 +1284,7 @@ static int update_lookup_table(void *data)
int nid, base, j;
 
nid = update->new_nid;
-   base = cpu_first_thread_sibling(update->cpu);
+   base = cpu_first_thread_sibling(update->new_cpu);
 
for (j = 0; j < threads_per_core; j++) {
update_numa_cpu_lookup_table(base + j, nid);
@@ -1305,7 +1306,7 @@ int numa_update_cpu_topology(bool cpus_locked)
struct topology_update_data *updates, *ud;
cpumask_t updated_cpus;
struct device *dev;
-   int weight, new_nid, i = 0;
+   int weight, new_nid, i = 0, ii;
 
if (!prrn_enabled && !vphn_enabled && topology_inited)
return 0;
@@ -1349,12 +1350,16 @@ int numa_update_cpu_topology(bool cpus_locked)
continue;
}
 
+   ii = 0;
for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
ud = &updates[i++];
ud->next = &updates[i];
-   ud->cpu = sibling;
ud->new_nid = new_nid;
ud->old_nid = numa_cpu_lookup_table[sibling];
+   ud->old_cpu = sibling;
+   ud->new_cpu = cpuremap_map_cpu(
+   get_hard_smp_processor_id(sibling),
+   ii++, new_nid);
cpumask_set_cpu(sibling, &updated_cpus);
}
cpu = cpu_last_thread_sibling(cpu);
@@ -1370,9 +1375,10 @@ int numa_update_cpu_topology(bool cpus_locked)
pr_debug("Topology update for the following CPUs:\n");
if (cpumask_weight(&updated_cpus)) {
for (ud = &updates[0]; ud; ud = ud->next) {
-   pr_debug("cpu %d moving from node %d "
- "to %d\n", ud->cpu,
- ud->old_nid, ud->new_nid);
+   pr_debug("cpu %d, node %d moving to"
+" cpu %d, node %d\n",
+ud->old_cpu, ud->old_nid,
+ud->new_cpu, ud->new_nid);
}
}
 
@@ -1409,13 +1415,20 @@ int numa_update_cpu_topology(bool cpus_locked)
 cpumask_of(raw_smp_processor_id()));
 
for (ud = &updates[0]; ud; ud = ud->next) {
-   unregister_cpu_under_node(ud->cpu, ud->old_nid);
-   register_cpu_under_node(ud->cpu, ud->new_nid);
+   unregister_cpu_under_node(ud->old_cpu, ud->old_nid);
+   register_cpu_under_node(ud->new_cpu, ud->new_nid);
 
-   dev = get_cpu_device(ud->cpu);
+   dev = get_cpu_device(ud->old_cpu);
if (dev)
kobject_uevent(&dev->kobj, KOBJ_CHANGE);
-

[Internal Review PATCH 1/3] powerpc/numa: Conditionally online new nodes

2018-11-26 Thread Michael Bringmann
Add argument to allow caller to determine whether the node identified
for a cpu after an associativity / affinity change should be inited.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/topology.h  |2 +-
 arch/powerpc/mm/numa.c   |6 +++---
 arch/powerpc/platforms/pseries/hotplug-cpu.c |2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index a4a718d..4621f40 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -90,7 +90,7 @@ static inline void update_numa_cpu_lookup_table(unsigned int 
cpu, int node) {}
 extern int start_topology_update(void);
 extern int stop_topology_update(void);
 extern int prrn_is_enabled(void);
-extern int find_and_online_cpu_nid(int cpu);
+extern int find_and_online_cpu_nid(int cpu, bool must_online);
 extern int timed_topology_update(int nsecs);
 extern void __init shared_proc_topology_init(void);
 #else
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 3a048e9..460d60f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1197,7 +1197,7 @@ static long vphn_get_associativity(unsigned long cpu,
return rc;
 }
 
-int find_and_online_cpu_nid(int cpu)
+int find_and_online_cpu_nid(int cpu, bool must_online)
 {
__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
int new_nid;
@@ -1210,7 +1210,7 @@ int find_and_online_cpu_nid(int cpu)
if (new_nid < 0 || !node_possible(new_nid))
new_nid = first_online_node;
 
-   if (NODE_DATA(new_nid) == NULL) {
+   if (must_online && (NODE_DATA(new_nid) == NULL)) {
 #ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Need to ensure that NODE_DATA is initialized for a node from
@@ -1337,7 +1337,7 @@ int numa_update_cpu_topology(bool cpus_locked)
continue;
}
 
-   new_nid = find_and_online_cpu_nid(cpu);
+   new_nid = find_and_online_cpu_nid(cpu, true);
 
if (new_nid == numa_cpu_lookup_table[cpu]) {
cpumask_andnot(&cpu_associativity_changes_mask,
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 2f8e621..620cb57 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -366,7 +366,7 @@ static int dlpar_online_cpu(struct device_node *dn)
!= CPU_STATE_OFFLINE);
cpu_maps_update_done();
timed_topology_update(1);
-   find_and_online_cpu_nid(cpu);
+   find_and_online_cpu_nid(cpu, true);
rc = device_online(get_cpu_device(cpu));
if (rc)
goto out;



[Internal Review PATCH] powerpc/pseries: Remap hw to kernel cpu indexes

2018-11-26 Thread Michael Bringmann
Define and apply new interface to map hardware-specific powerpc cpu
ids to a kernel specific range of cpu values.  Mapping is intended
to prevent confusion within the kernel about the cpu+node mapping,
and the changes in configuration that may happen due to powerpc LPAR
migration or other associativity changes during the lifetime of a
system.  These interfaces exchange the thread_index provided by the
'ibm,ppc-interrupt-server#s' properties, for an internal index to
be used by kernel scheduling interfaces.

Signed-off-by: Michael Bringmann 

Michael Bringmann (3):
  powerpc/numa: Conditionally online new nodes
  powerpc/numa: Define mapping between HW and kernel cpus
  powerpc/numa: Apply mapping between HW and kernel cpu



[PATCH v02] powerpc/numa: Perform full re-add of CPU for PRRN/VPHN topology update

2018-11-17 Thread Michael Bringmann
On pseries systems, performing changes to a partition's affinity
can result in altering the nodes to which a CPU is assigned on the
current system.  For example, some systems are subject to resource
balancing operations by the operator or control software.  In such
environments, system CPUs may be in node 1 and 3 at boot, and be
moved to nodes 2, 3, and 5, for better performance.

The current implementation attempts to recognize such changes within
the powerpc-specific version of arch_update_cpu_topology to modify a
range of system data structures directly.  However, some scheduler
data structures may be inaccessible, or the timing of a node change
may still lead to corruption or error in other modules (e.g. user
space) which do not receive notification of these changes.

This patch modifies the PRRN/VPHN topology update worker function to
recognize an affinity change for a CPU, and to perform a full DLPAR
remove and add of the CPU instead of dynamically changing its node
to resolve this issue.

[Based upon patch submission:
Subject: [PATCH] powerpc/pseries: Perform full re-add of CPU for topology 
update post-migration
From: Nathan Fontenot 
Date: Tue Oct 30 05:43:36 AEDT 2018
]

[Replace patch submission:
Subject: [PATCH] powerpc/topology: Update numa mask when cpu node mapping 
changes
From: Srikar Dronamraju 
Date: Wed Oct 10 15:24:46 AEDT 2018
]

Signed-off-by: Michael Bringmann 
---
Changes in v02:
  -- Reuse more of the previous implementation to reduce patch size
  -- Replace former calls to numa_update_cpu_topology(false) by
 topology_schedule_update
  -- Make sure that we report topology changes back through
 arch_update_cpu_topology
  -- Fix problem observed in powerpc next kernel with updating
 cpu_associativity_changes_mask in timer_topology_fn when both
 prrn_enabled and vphn_enabled, and many extra CPUs are possible,
 but not installed.
  -- Fix problem with updating cpu_associativity_changes_mask when
 VPHN associativity information does not arrive until after first
 call to update topology occurs.
---
 arch/powerpc/include/asm/topology.h |7 +---
 arch/powerpc/kernel/rtasd.c |2 +
 arch/powerpc/mm/numa.c  |   69 +++
 3 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index f85e2b0..79505c3 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -42,7 +42,7 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 
 extern int sysfs_add_device_to_node(struct device *dev, int nid);
 extern void sysfs_remove_device_from_node(struct device *dev, int nid);
-extern int numa_update_cpu_topology(bool cpus_locked);
+extern void topology_schedule_update(void);
 
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node)
 {
@@ -77,10 +77,7 @@ static inline void sysfs_remove_device_from_node(struct 
device *dev,
 {
 }
 
-static inline int numa_update_cpu_topology(bool cpus_locked)
-{
-   return 0;
-}
+static inline void topology_schedule_update(void) {}
 
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}
 
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 38cadae..7e2777c 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -285,7 +285,7 @@ static void handle_prrn_event(s32 scope)
 * the RTAS event.
 */
pseries_devicetree_update(-scope);
-   numa_update_cpu_topology(false);
+   topology_schedule_update();
 }
 
 static void handle_rtas_event(const struct rtas_error_log *log)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 5c2cfaf..15e0e06 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1077,6 +1077,9 @@ struct topology_update_data {
 static void reset_topology_timer(void);
 static int topology_timer_secs = 1;
 static int topology_inited;
+static int topology_update_in_progress;
+static int topology_changed;
+static unsigned long topology_scans;
 
 /*
  * Change polling interval for associativity changes.
@@ -1297,9 +1300,9 @@ static int update_lookup_table(void *data)
  * Update the node maps and sysfs entries for each cpu whose home node
  * has changed. Returns 1 when the topology has changed, and 0 otherwise.
  *
- * cpus_locked says whether we already hold cpu_hotplug_lock.
+ * readd_cpus: Also readd any CPUs that have changed affinity
  */
-int numa_update_cpu_topology(bool cpus_locked)
+static int numa_update_cpu_topology(bool readd_cpus)
 {
unsigned int cpu, sibling, changed = 0;
struct topology_update_data *updates, *ud;
@@ -1307,7 +1310,8 @@ int numa_update_cpu_topology(bool cpus_locked)
struct device *dev;
int weight, new_nid, i = 0;
 
-   if (!prrn_enabled && !vphn_enabled && topology_inited)
+   if ((!prrn_enabled && !vphn_en
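
The hunk introducing topology_schedule_update() itself is not visible above. From the changelog and the review follow-up, its intent is simply to defer the update through the existing numa.c work item; the fragment below is a hedged reconstruction, not the posted v02 code (in particular, the readd_cpus argument value is a guess):

static void topology_work_fn(struct work_struct *work)
{
        /* readd_cpus=true assumed; the posted hunk is cut off above. */
        if (numa_update_cpu_topology(true))
                rebuild_sched_domains();
}
static DECLARE_WORK(topology_work, topology_work_fn);

void topology_schedule_update(void)
{
        schedule_work(&topology_work);
}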

Re: [PATCH] powerpc/numa: Perform full re-add of CPU for PRRN/VPHN topology update

2018-11-14 Thread Michael Bringmann



On 11/13/2018 02:39 AM, Srikar Dronamraju wrote:
>> -static void topology_work_fn(struct work_struct *work)
>> -{
>> -rebuild_sched_domains();
>> +if (changed)
>> +rebuild_sched_domains();
>>  }
>>  static DECLARE_WORK(topology_work, topology_work_fn);
>>
>> @@ -1553,7 +1424,6 @@ void __init shared_proc_topology_init(void)
>>  if (lppaca_shared_proc(get_lppaca())) {
>>  bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
>>  nr_cpumask_bits);
>> -numa_update_cpu_topology(false);
> 
> Shouldn't we be calling topology_schedule_update() here?

Agreed.

> 
>>  }
>>  }
>>
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



[PATCH] powerpc/numa: Perform full re-add of CPU for PRRN/VPHN topology update

2018-11-12 Thread Michael Bringmann
On pseries systems, performing changes to a partition's affinity
can result in altering the nodes to which a CPU is assigned on the
current system.  For example, some systems are subject to resource
balancing operations by the operator or control software.  In such
environments, system CPUs may be in node 1 and 3 at boot, and be
moved to nodes 2, 3, and 5, for better performance.

The current implementation attempts to recognize such changes within
the powerpc-specific version of arch_update_cpu_topology to modify a
range of system data structures directly.  However, some scheduler
data structures may be inaccessible, or the timing of a node change
may still lead to corruption or error in other modules (e.g. user
space) which do not receive notification of these changes.

This patch modifies the PRRN/VPHN topology update worker function to
recognize an affinity change for a CPU, and to perform a full DLPAR
remove and add of the CPU instead of dynamically changing its node
to resolve this issue.

[Based upon patch submission:
Subject: [PATCH] powerpc/pseries: Perform full re-add of CPU for topology 
update post-migration
From: Nathan Fontenot 
Date: Tue Oct 30 05:43:36 AEDT 2018
]

[Replace patch submission:
Subject: [PATCH] powerpc/topology: Update numa mask when cpu node mapping 
changes
From: Srikar Dronamraju 
Date: Wed Oct 10 15:24:46 AEDT 2018
]

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/topology.h |6 -
 arch/powerpc/kernel/rtasd.c |1 
 arch/powerpc/mm/numa.c  |  184 +--
 3 files changed, 27 insertions(+), 164 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index f85e2b0..9f85246 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -42,7 +42,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 
 extern int sysfs_add_device_to_node(struct device *dev, int nid);
 extern void sysfs_remove_device_from_node(struct device *dev, int nid);
-extern int numa_update_cpu_topology(bool cpus_locked);
 
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node)
 {
@@ -77,11 +76,6 @@ static inline void sysfs_remove_device_from_node(struct 
device *dev,
 {
 }
 
-static inline int numa_update_cpu_topology(bool cpus_locked)
-{
-   return 0;
-}
-
 static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}
 
 #endif /* CONFIG_NUMA */
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 38cadae..c161d74 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -285,7 +285,6 @@ static void handle_prrn_event(s32 scope)
 * the RTAS event.
 */
pseries_devicetree_update(-scope);
-   numa_update_cpu_topology(false);
 }
 
 static void handle_rtas_event(const struct rtas_error_log *log)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index be6216e..f79b65f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1236,96 +1236,25 @@ int find_and_online_cpu_nid(int cpu)
return new_nid;
 }
 
-/*
- * Update the CPU maps and sysfs entries for a single CPU when its NUMA
- * characteristics change. This function doesn't perform any locking and is
- * only safe to call from stop_machine().
- */
-static int update_cpu_topology(void *data)
-{
-   struct topology_update_data *update;
-   unsigned long cpu;
-
-   if (!data)
-   return -EINVAL;
-
-   cpu = smp_processor_id();
-
-   for (update = data; update; update = update->next) {
-   int new_nid = update->new_nid;
-   if (cpu != update->cpu)
-   continue;
-
-   unmap_cpu_from_node(cpu);
-   map_cpu_to_node(cpu, new_nid);
-   set_cpu_numa_node(cpu, new_nid);
-   set_cpu_numa_mem(cpu, local_memory_node(new_nid));
-   vdso_getcpu_init();
-   }
-
-   return 0;
-}
-
-static int update_lookup_table(void *data)
-{
-   struct topology_update_data *update;
-
-   if (!data)
-   return -EINVAL;
-
-   /*
-* Upon topology update, the numa-cpu lookup table needs to be updated
-* for all threads in the core, including offline CPUs, to ensure that
-* future hotplug operations respect the cpu-to-node associativity
-* properly.
-*/
-   for (update = data; update; update = update->next) {
-   int nid, base, j;
-
-   nid = update->new_nid;
-   base = cpu_first_thread_sibling(update->cpu);
-
-   for (j = 0; j < threads_per_core; j++) {
-   update_numa_cpu_lookup_table(base + j, nid);
-   }
-   }
-
-   return 0;
-}
-
-/*
- * Update the node maps and sysfs entries for each cpu whose home node
- * has changed. Returns 1 when the topology has changed, and 0 otherwise.
- *
- * cpus_l

Re: [PATCH v08 0/5] powerpc/hotplug: Update affinity for migrated CPUs

2018-11-05 Thread Michael Bringmann
On 10/29/2018 02:51 PM, Michal Suchánek wrote:
> On Sun, 29 Jul 2018 08:18:34 -0500
> Michael Bringmann  wrote:
> 
>> The migration of LPARs across Power systems affects many attributes
>> including that of the associativity of CPUs.  The patches in this
>> set execute when a system is coming up fresh upon a migration target.
>> They are intended to,
>>
>> * Recognize changes to the associativity of CPUs recorded in internal
>>   data structures when compared to the latest copies in the device
>> tree.
>> * Generate calls to other code layers to reset the data structures
>>   related to associativity of the CPUs.
>> * Re-register the 'changed' entities into the target system.
>>   Re-registration of CPUs mostly entails acting as if they have been
>>   newly hot-added into the target system.
>>
>> Signed-off-by: Michael Bringmann 
>>
> Hello,
> 
> what is the status of this patchset other than it no longer applies?

The current PowerPC systems already perform remove_processor /
add_processor operations during migration events that subsume
the necessary affinity changes.  This patchset was overkill,
so we pulled it.

Memory affinity on the other hand does need intervention, and that
patchset has been posted.  I was just about to ping Michael Ellerman
about the status from his end.
 
> 
> Thanks
> 
> Michal

Thanks.
Michael

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



[PATCH v07 5/5] migration/memory: Support 'ibm,dynamic-memory-v2'

2018-10-17 Thread Michael Bringmann
migration/memory: This patch adds recognition for changes to the
associativity of memory blocks described by 'ibm,dynamic-memory-v2'.
If the associativity of an LMB has changed, it should be readded to
the system in order to update local and general kernel data structures.
This patch builds upon previous enhancements that scan the device-tree
"ibm,dynamic-memory" properties using the base LMB array, and a copy
derived from the updated properties.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 6856010..03c5e49 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -1187,7 +1187,8 @@ static int pseries_memory_notifier(struct notifier_block 
*nb,
err = pseries_remove_mem_node(rd->dn);
break;
case OF_RECONFIG_UPDATE_PROPERTY:
-   if (!strcmp(rd->prop->name, "ibm,dynamic-memory")) {
+   if (!strcmp(rd->prop->name, "ibm,dynamic-memory") ||
+   !strcmp(rd->prop->name, "ibm,dynamic-memory-v2")) {
struct drmem_lmb_info *dinfo =
drmem_lmbs_init(rd->prop);
if (!dinfo)



[PATCH v07 4/5] migration/memory: Evaluate LMB assoc changes

2018-10-17 Thread Michael Bringmann
migration/memory: This patch adds code that recognizes changes to
the associativity of memory blocks described by the device-tree
properties in order to drive equivalent 'hotplug' operations to
update local and general kernel data structures to reflect those
changes.  These differences may include:

* Evaluate 'ibm,dynamic-memory' properties when processing the
  updated device-tree properties of the system during Post Migration
  events (migration_store).  The new functionality looks for changes
  to the aa_index values for each drc_index/LMB to identify any memory
  blocks that should be readded.

* In an LPAR migration scenario, the "ibm,associativity-lookup-arrays"
  property may change.  In the event that a row of the array differs,
  locate all assigned memory blocks with that 'aa_index' and 're-add'
  them to the system memory block data structures.  In the process of
  the 're-add', the system routines will update the corresponding entry
  for the memory in the LMB structures and any other relevant kernel
  data structures.

A number of previous extensions made to the DRMEM code for scanning
device-tree properties and creating LMB arrays are used here to
ensure that the resulting code is simpler and more usable:

* Use new paired list iterator for the DRMEM LMB info arrays to find
  differences in old and new versions of properties.
* Use new iterator for copies of the DRMEM info arrays to evaluate
  completely new structures.
* Combine common code for parsing and evaluating memory description
  properties based on the DRMEM LMB array model to greatly simplify
  extension from the older property 'ibm,dynamic-memory' to the new
  property model of 'ibm,dynamic-memory-v2'.

For support, add a new pseries hotplug action for DLPAR operations,
PSERIES_HP_ELOG_ACTION_READD_MULTIPLE.  It is a variant of the READD
operation which performs the action upon multiple instances of the
resource at one time.  The operation is to be triggered by device-tree
analysis of updates by RTAS events analyzed by 'migration_store' during
post-migration processing.  It will be used for memory updates,
initially.

Signed-off-by: Michael Bringmann 
---
Changes in v06:
  -- Rebase to powerpc next branch to account for recent code changes.
  -- Fix prototype problem when CONFIG_MEMORY_HOTPLUG not defined.
Changes in v05:
  -- Move common structure from numa.c + hotplug-memory.c to header file.
  -- Clarify some comments.
  -- Use walk_drmem_lmbs_pairs and callback instead of local loop
Changes in v04:
  -- Move dlpar_memory_readd_multiple() function definition and use
 into previous patch along with action constant definition.
  -- Correct spacing in patch
Changes in v03:
  -- Modify the code that parses the memory affinity attributes to
 mark relevant DRMEM LMB array entries using the internal_flags
 mechanism instead of generate unique hotplug actions for each
 memory block to be readded.  The change is intended to both
 simplify the code, and to require fewer resources on systems
 with huge amounts of memory.
  -- Save up notice about any and all LMB entries until the end of the
 'migration_store' operation at which point a single action is
 queued to scan the entire DRMEM array.
  -- Add READD_MULTIPLE function for memory that scans the DRMEM
 array to identify multiple entries that were marked previously.
 The corresponding memory blocks are to be readded to the system
 to update relevant data structures outside of the powerpc-
 specific code.
  -- Change dlpar_memory_pmt_changes_action to directly queue worker
 to pseries work queue.
---
 arch/powerpc/include/asm/topology.h |7 +
 arch/powerpc/mm/numa.c  |6 -
 arch/powerpc/platforms/pseries/hotplug-memory.c |  207 +++
 arch/powerpc/platforms/pseries/mobility.c   |3 
 arch/powerpc/platforms/pseries/pseries.h|8 +
 5 files changed, 186 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index a4a718d..fbe03df 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -135,5 +135,12 @@ static inline void shared_proc_topology_init(void) {}
 #endif
 #endif
 
+
+struct assoc_arrays {
+   u32 n_arrays;
+   u32 array_sz;
+   const __be32 *arrays;
+};
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 693ae1c..f1e7287 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -368,12 +368,6 @@ static unsigned long read_n_cells(int n, const __be32 
**buf)
return result;
 }
 
-struct assoc_arrays {
-   u32 n_arrays;
-   u32 array_sz;
-   const __be32 *arrays;
-};
-
 /*
  * Retrieve and validate the list of associativity arrays for drconf
  * memory from the ibm,associativity-lookup-arrays property of the
diff --
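
The hotplug-memory.c hunks that actually mark the changed LMBs are truncated above. A hedged sketch of the per-pair comparison described in the changelog, assuming a walk_drmem_lmbs_pairs() callback of roughly this shape (the signature is an assumption, not taken from the visible diff), could look like:

/* old: LMB entry built from the current device tree;
 * new: entry built from the updated ibm,dynamic-memory{,-v2} property. */
static int pmt_changes_cb(struct drmem_lmb *old, struct drmem_lmb *new,
                          void *data)
{
        bool *changed = data;

        /* Only assigned blocks whose associativity moved need a re-add. */
        if ((old->flags & DRCONF_MEM_ASSIGNED) &&
            old->aa_index != new->aa_index) {
                drmem_mark_lmb_update(old);     /* internal_flags helper, patch 2/5 below */
                *changed = true;
        }
        return 0;
}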

[PATCH v07 3/5] migration/memory: Add hotplug READD_MULTIPLE

2018-10-17 Thread Michael Bringmann
migration/memory: This patch adds a new pseries hotplug action
for CPU and memory operations, PSERIES_HP_ELOG_ACTION_READD_MULTIPLE.
This is a variant of the READD operation which performs the action
upon multiple instances of the resource at one time.  The operation
is to be triggered by device-tree analysis of updates by RTAS events
analyzed by 'migration_store' during post-migration processing.  It
will be used for memory updates, initially.

Signed-off-by: Michael Bringmann 
---
Changes in v07:
  -- Provide more useful return value from dlpar_memory_readd_multiple
Changes in v05:
  -- Provide dlpar_memory_readd_helper routine to compress some common code
Changes in v04:
  -- Move init of 'lmb->internal_flags' in init_drmem_v2_lmbs to
 previous patch.
  -- Pull in implementation of dlpar_memory_readd_multiple() to go
 with operation flag.
---
 arch/powerpc/include/asm/rtas.h |1 
 arch/powerpc/platforms/pseries/hotplug-memory.c |   47 ---
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 0183e95..cc00451 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -333,6 +333,7 @@ struct pseries_hp_errorlog {
 #define PSERIES_HP_ELOG_ACTION_ADD 1
 #define PSERIES_HP_ELOG_ACTION_REMOVE  2
 #define PSERIES_HP_ELOG_ACTION_READD   3
+#define PSERIES_HP_ELOG_ACTION_READD_MULTIPLE  4
 
 #define PSERIES_HP_ELOG_ID_DRC_NAME1
 #define PSERIES_HP_ELOG_ID_DRC_INDEX   2
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 2b796da..c44c6a6 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -507,6 +507,19 @@ static int dlpar_memory_remove_by_index(u32 drc_index)
return rc;
 }
 
+static int dlpar_memory_readd_helper(struct drmem_lmb *lmb)
+{
+   int rc;
+
+   rc = dlpar_remove_lmb(lmb);
+   if (!rc) {
+   rc = dlpar_add_lmb(lmb);
+   if (rc)
+   dlpar_release_drc(lmb->drc_index);
+   }
+   return rc;
+}
+
 static int dlpar_memory_readd_by_index(u32 drc_index)
 {
struct drmem_lmb *lmb;
@@ -519,12 +532,7 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
for_each_drmem_lmb(lmb) {
if (lmb->drc_index == drc_index) {
lmb_found = 1;
-   rc = dlpar_remove_lmb(lmb);
-   if (!rc) {
-   rc = dlpar_add_lmb(lmb);
-   if (rc)
-   dlpar_release_drc(lmb->drc_index);
-   }
+   rc = dlpar_memory_readd_helper(lmb);
break;
}
}
@@ -541,6 +549,26 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
return rc;
 }
 
+static int dlpar_memory_readd_multiple(void)
+{
+   struct drmem_lmb *lmb;
+   int rc = 0;
+
+   pr_info("Attempting to update multiple LMBs\n");
+
+   for_each_drmem_lmb(lmb) {
+   if (drmem_lmb_update(lmb)) {
+   rc |= dlpar_memory_readd_helper(lmb);
+   drmem_remove_lmb_update(lmb);
+   }
+   }
+
+   if (rc)
+   return -EIO;
+
+   return rc;
+}
+
 static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 {
struct drmem_lmb *lmb, *start_lmb, *end_lmb;
@@ -641,6 +669,10 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
 {
return -EOPNOTSUPP;
 }
+static int dlpar_memory_readd_multiple(void)
+{
+   return -EOPNOTSUPP;
+}
 
 static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 {
@@ -918,6 +950,9 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_readd_by_index(drc_index);
break;
+   case PSERIES_HP_ELOG_ACTION_READD_MULTIPLE:
+   rc = dlpar_memory_readd_multiple();
+   break;
default:
pr_err("Invalid action (%d) specified\n", hp_elog->action);
rc = -EINVAL;



[PATCH v07 2/5] powerpc/drmem: Add internal_flags feature

2018-10-17 Thread Michael Bringmann
powerpc/drmem: Add internal_flags field to each LMB to allow
marking of kernel software-specific operations that need not
be exported to other users.  For instance, if information about
selected LMBs needs to be maintained for subsequent passes
through the system, it can be encoded into the LMB array itself
without requiring the allocation and maintenance of additional
data structures.
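
A minimal usage sketch of the new helpers (the predicate and action
names below are hypothetical placeholders; only for_each_drmem_lmb and
the drmem_*_lmb_update helpers are assumed from this series):

	struct drmem_lmb *lmb;

	/* first pass: remember which LMBs need attention later */
	for_each_drmem_lmb(lmb) {
		if (lmb_needs_readd(lmb))	/* hypothetical predicate */
			drmem_mark_lmb_update(lmb);
	}

	/* second pass, possibly much later: act on the marks, then clear */
	for_each_drmem_lmb(lmb) {
		if (drmem_lmb_update(lmb)) {
			readd_lmb(lmb);		/* hypothetical action */
			drmem_remove_lmb_update(lmb);
		}
	}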

Signed-off-by: Michael Bringmann 
---
Changes in v04:
  -- Add another initialization of 'lmb->internal_flags' to
 init_drmem_v2_lmbs.
---
 arch/powerpc/include/asm/drmem.h |   18 ++
 arch/powerpc/mm/drmem.c  |3 +++
 2 files changed, 21 insertions(+)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index cfe8598..dbb3e6c 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -17,6 +17,7 @@ struct drmem_lmb {
u32 drc_index;
u32 aa_index;
u32 flags;
+   u32 internal_flags;
 };
 
 struct drmem_lmb_info {
@@ -94,6 +95,23 @@ static inline bool drmem_lmb_reserved(struct drmem_lmb *lmb)
return lmb->flags & DRMEM_LMB_RESERVED;
 }
 
+#define DRMEM_LMBINT_UPDATE0x0001
+
+static inline void drmem_mark_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags |= DRMEM_LMBINT_UPDATE;
+}
+
+static inline void drmem_remove_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags &= ~DRMEM_LMBINT_UPDATE;
+}
+
+static inline bool drmem_lmb_update(struct drmem_lmb *lmb)
+{
+   return lmb->internal_flags & DRMEM_LMBINT_UPDATE;
+}
+
 u64 drmem_lmb_memory_max(void);
 void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index ded9dbf..f199fe5 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -207,6 +207,7 @@ static void read_drconf_v1_cell(struct drmem_lmb *lmb,
 
lmb->aa_index = of_read_number(p++, 1);
lmb->flags = of_read_number(p++, 1);
+   lmb->internal_flags = 0;
 
*prop = p;
 }
@@ -265,6 +266,7 @@ static void __walk_drmem_v2_lmbs(const __be32 *prop, const 
__be32 *usm,
 
lmb.aa_index = dr_cell.aa_index;
lmb.flags = dr_cell.flags;
+   lmb.internal_flags = 0;
 
func(&lmb, &usm);
}
@@ -441,6 +443,7 @@ static void init_drmem_v2_lmbs(const __be32 *prop,
 
lmb->aa_index = dr_cell.aa_index;
lmb->flags = dr_cell.flags;
+   lmb->internal_flags = 0;
}
}
 }



[PATCH v07 1/5] powerpc/drmem: Export 'dynamic-memory' loader

2018-10-17 Thread Michael Bringmann
powerpc/drmem: Export many of the functions of DRMEM to parse
"ibm,dynamic-memory" and "ibm,dynamic-memory-v2" during hotplug
operations and for Post Migration events.

Also modify the DRMEM initialization code to allow it to,

* Be called after system initialization
* Provide a separate user copy of the LMB array that it produces
* Free the user copy upon request

In addition, a couple of changes were made to make the creation
of additional copies of the LMB array more useful including,

* Add iterator function to work through a pair of drmem_info arrays
  with a callback function to apply specific tests.
* Modify DRMEM code to replace usages of dt_root_addr_cells, and
  dt_mem_next_cell, as these are only available at first boot.
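
As a rough usage sketch of the exported loader plus the pair iterator
(the callback and wrapper names here are hypothetical; drmem_lmbs_init,
walk_drmem_lmbs_pairs and drmem_lmbs_free come from this patch, and
drmem_mark_lmb_update from the internal_flags patch in this series):

	/* Called once per LMB pair: 'cnt' walks the current kernel array,
	 * 'oth' the copy parsed from the updated device-tree property.
	 */
	static int mark_changed_lmb(struct drmem_lmb *cnt,
				    struct drmem_lmb *oth, void *data)
	{
		bool *changed = data;

		if (cnt->aa_index != oth->aa_index) {
			drmem_mark_lmb_update(cnt);
			*changed = true;
		}
		return 0;
	}

	static bool drmem_assoc_changed(struct property *new_prop)
	{
		struct drmem_lmb_info *dinfo;
		bool changed = false;

		dinfo = drmem_lmbs_init(new_prop);
		if (!dinfo)
			return false;

		walk_drmem_lmbs_pairs(dinfo, mark_changed_lmb, &changed);
		drmem_lmbs_free(dinfo);
		return changed;
	}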

Signed-off-by: Michael Bringmann 
---
Changes in v05:
  -- Add walk_drmem_lmbs_pairs to replace macro for_each_pair_lmb
---
 arch/powerpc/include/asm/drmem.h |   13 +
 arch/powerpc/mm/drmem.c  |   96 ++
 2 files changed, 89 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 7c1d8e7..cfe8598 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -35,6 +35,11 @@ struct drmem_lmb_info {
&drmem_info->lmbs[0],   \
&drmem_info->lmbs[drmem_info->n_lmbs - 1])
 
+#define for_each_dinfo_lmb(dinfo, lmb) \
+   for_each_drmem_lmb_in_range((lmb),  \
+   &dinfo->lmbs[0],\
+   &dinfo->lmbs[dinfo->n_lmbs - 1])
+
 /*
  * The of_drconf_cell_v1 struct defines the layout of the LMB data
  * specified in the ibm,dynamic-memory device tree property.
@@ -94,6 +99,14 @@ void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
 int drmem_update_dt(void);
 
+struct drmem_lmb_info *drmem_lmbs_init(struct property *prop);
+void drmem_lmbs_free(struct drmem_lmb_info *dinfo);
+int walk_drmem_lmbs_pairs(struct drmem_lmb_info *dinfo_oth,
+ int (*func)(struct drmem_lmb *cnt,
+   struct drmem_lmb *oth,
+   void *data),
+ void *data);
+
 #ifdef CONFIG_PPC_PSERIES
 void __init walk_drmem_lmbs_early(unsigned long node,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 3f18036..ded9dbf 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -20,6 +20,7 @@
 
 static struct drmem_lmb_info __drmem_info;
 struct drmem_lmb_info *drmem_info = &__drmem_info;
+static int n_root_addr_cells;
 
 u64 drmem_lmb_memory_max(void)
 {
@@ -193,12 +194,13 @@ int drmem_update_dt(void)
return rc;
 }
 
-static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
+static void read_drconf_v1_cell(struct drmem_lmb *lmb,
   const __be32 **prop)
 {
const __be32 *p = *prop;
 
-   lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+   lmb->base_addr = of_read_number(p, n_root_addr_cells);
+   p += n_root_addr_cells;
lmb->drc_index = of_read_number(p++, 1);
 
p++; /* skip reserved field */
@@ -209,7 +211,7 @@ static void __init read_drconf_v1_cell(struct drmem_lmb 
*lmb,
*prop = p;
 }
 
-static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
 {
struct drmem_lmb lmb;
@@ -225,13 +227,14 @@ static void __init __walk_drmem_v1_lmbs(const __be32 
*prop, const __be32 *usm,
}
 }
 
-static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
+static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
   const __be32 **prop)
 {
const __be32 *p = *prop;
 
dr_cell->seq_lmbs = of_read_number(p++, 1);
-   dr_cell->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+   dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
+   p += n_root_addr_cells;
dr_cell->drc_index = of_read_number(p++, 1);
dr_cell->aa_index = of_read_number(p++, 1);
dr_cell->flags = of_read_number(p++, 1);
@@ -239,7 +242,7 @@ static void __init read_drconf_v2_cell(struct 
of_drconf_cell_v2 *dr_cell,
*prop = p;
 }
 
-static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
 {
struct of_drconf_cell_v2 dr_cell;
@@ -275,6 +278,9 @@ void __init walk_drmem_lmbs_early(

[PATCH v07 0/5] powerpc/migration: Affinity fix for memory

2018-10-17 Thread Michael Bringmann
The migration of LPARs across Power systems affects many attributes
including that of the associativity of memory blocks.  The patches
in this set execute when a system is coming up fresh upon a migration
target.  They are intended to,

* Recognize changes to the associativity of memory recorded in
  internal data structures when compared to the latest copies in
  the device tree (e.g. ibm,dynamic-memory, ibm,dynamic-memory-v2).
* Recognize changes to the associativity mapping (e.g. ibm,
  associativity-lookup-arrays), locate all assigned memory blocks
  corresponding to each changed row, and readd all such blocks.
* Generate calls to other code layers to reset the data structures
  related to associativity of memory.
* Re-register the 'changed' entities into the target system.
  Re-registration of memory blocks mostly entails acting as if they
  have been newly hot-added into the target system.

This code builds upon features introduced in a previous patch set
that updates CPUs for affinity changes that may occur during LPM.

Signed-off-by: Michael Bringmann 

Michael Bringmann (5):
  powerpc/drmem: Export 'dynamic-memory' loader
  powerpc/drmem: Add internal_flags feature
  migration/memory: Add hotplug flags READD_MULTIPLE
  migration/memory: Evaluate LMB assoc changes
  migration/memory: Support 'ibm,dynamic-memory-v2'
---
Changes in v07:
  -- Provide more useful return value from dlpar_memory_readd_multiple
Changes in v06:
  -- Rebase to powerpc next branch to account for recent code changes.
  -- Fix prototype problem when CONFIG_MEMORY_HOTPLUG not defined.
Changes in v05:
  -- Add walk_drmem_lmbs_pairs to replace macro for_each_pair_lmb
  -- Use walk_drmem_lmbs_pairs and callback instead of local loop
  -- Provide dlpar_memory_readd_helper routine to compress some common code
  -- Move common structure from numa.c + hotplug-memory.c to header file.
  -- Clarify some comments.
Changes in v04:
  -- Move dlpar_memory_readd_multiple() to patch with new ACTION
 constant.
  -- Move init of 'lmb->internal_flags' in init_drmem_v2_lmbs to
 patch with other references to flag.
  -- Correct spacing in one of the patches
Changes in v03:
  -- Change operation to tag changed LMBs in DRMEM array instead of
 queuing a potentially huge number of structures.
  -- Added another hotplug queue event for CPU/memory operations
  -- Added internal_flags feature to DRMEM
  -- Improve the patch description language for the patch set.
  -- Revise patch set to queue worker for memory association
 updates directly to pseries worker queue.



Re: [PATCH v06 3/5] migration/memory: Add hotplug READD_MULTIPLE

2018-10-17 Thread Michael Bringmann
On 10/16/2018 07:48 PM, Michael Ellerman wrote:
> Michael Bringmann  writes:
>> On 10/16/2018 02:57 PM, Tyrel Datwyler wrote:
>>> On 10/15/2018 05:39 PM, Michael Ellerman wrote:
>>>> Michael Bringmann  writes:
>>>>> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
>>>>> b/arch/powerpc/platforms/pseries/hotplug-memory.c
>>>>> index 2b796da..9c76345 100644
>>>>> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
>>>>> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
>>>>> @@ -541,6 +549,23 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
>>>>>   return rc;
>>>>>  }
>>>>>
>>>>> +static int dlpar_memory_readd_multiple(void)
>>>>> +{
>>>>> + struct drmem_lmb *lmb;
>>>>> + int rc;
>>>>> +
>>>>> + pr_info("Attempting to update multiple LMBs\n");
>>>>> +
>>>>> + for_each_drmem_lmb(lmb) {
>>>>> + if (drmem_lmb_update(lmb)) {
>>>>> + rc = dlpar_memory_readd_helper(lmb);
>>>>> + drmem_remove_lmb_update(lmb);
>>>>> + }
>>>>> + }
>>>>> +
>>>>> + return rc;
>>>>> +}
>>>>
>>>> This leaves rc potentially uninitialised.
>>>>
>>>> What should the result be in that case, -EINVAL ?
>>>
>>> On another note if there are multiple LMBs to update the value of rc only 
>>> reflects the final dlpar_memory_readd_helper() call.
>>
>> Correct.  But that is what happens when we compress common code
>> between two disparate uses i.e. updating memory association after
>> a migration event with no reporting mechanism other than the console
>> log, vs re-adding a single LMB by index for the purposes of DLPAR / drmgr.
>>
>> I could discard the return value from dlpar_memory_readd_helper entirely
>> in this function and just return 0, but in my experience, once errors start
>> to occur in memory dlpar ops, they tend to keep on occurring, so I was
>> returning the last one.  We could also make the code smart enough to
>> capture and return the first/last non-zero return code.  I didn't believe
>> that the frequency of errors for this operation warranted the overhead.
> 
> The actual error value is probably not very relevant.
> 
> But dropping errors entirely is almost always a bad idea.
> 
> So I think you should at least return an error if any error occurred,
> that way at least an error will be returned up to the caller(s).
> 
> Something like:
> 
>   int rc;
> 
>   rc = 0;
>   for_each_drmem_lmb(lmb) {
>   if (drmem_lmb_update(lmb)) {
>   rc |= dlpar_memory_readd_helper(lmb);
>   drmem_remove_lmb_update(lmb);
>   }
>   }
> 
>   if (rc)
>   return -EIO;

Okay.

> 
> cheers
> 

Thanks.

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [PATCH v06 3/5] migration/memory: Add hotplug READD_MULTIPLE

2018-10-16 Thread Michael Bringmann
On 10/16/2018 02:57 PM, Tyrel Datwyler wrote:
> On 10/15/2018 05:39 PM, Michael Ellerman wrote:
>> Michael Bringmann  writes:
>>> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
>>> b/arch/powerpc/platforms/pseries/hotplug-memory.c
>>> index 2b796da..9c76345 100644
>>> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
>>> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
>>> @@ -541,6 +549,23 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
>>> return rc;
>>>  }
>>>
>>> +static int dlpar_memory_readd_multiple(void)
>>> +{
>>> +   struct drmem_lmb *lmb;
>>> +   int rc;
>>> +
>>> +   pr_info("Attempting to update multiple LMBs\n");
>>> +
>>> +   for_each_drmem_lmb(lmb) {
>>> +   if (drmem_lmb_update(lmb)) {
>>> +   rc = dlpar_memory_readd_helper(lmb);
>>> +   drmem_remove_lmb_update(lmb);
>>> +   }
>>> +   }
>>> +
>>> +   return rc;
>>> +}
>>
>> This leaves rc potentially uninitialised.
>>
>> What should the result be in that case, -EINVAL ?
> 
> On another note if there are multiple LMBs to update the value of rc only 
> reflects the final dlpar_memory_readd_helper() call.

Correct.  But that is what happens when we compress common code
between two disparate uses i.e. updating memory association after
a migration event with no reporting mechanism other than the console
log, vs re-adding a single LMB by index for the purposes of DLPAR / drmgr.

I could discard the return value from dlpar_memory_readd_helper entirely
in this function and just return 0, but in my experience, once errors start
to occur in memory dlpar ops, they tend to keep on occurring, so I was
returning the last one.  We could also make the code smart enough to
capture and return the first/last non-zero return code.  I didn't believe
that the frequency of errors for this operation warranted the overhead.

> 
> -Tyrel

Michael


-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [PATCH v06 3/5] migration/memory: Add hotplug READD_MULTIPLE

2018-10-16 Thread Michael Bringmann
On 10/15/2018 07:39 PM, Michael Ellerman wrote:
> Michael Bringmann  writes:
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
>> b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> index 2b796da..9c76345 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> @@ -541,6 +549,23 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
>>  return rc;
>>  }
>>
>> +static int dlpar_memory_readd_multiple(void)
>> +{
>> +struct drmem_lmb *lmb;
>> +int rc;
>> +
>> +pr_info("Attempting to update multiple LMBs\n");
>> +
>> +for_each_drmem_lmb(lmb) {
>> +if (drmem_lmb_update(lmb)) {
>> +rc = dlpar_memory_readd_helper(lmb);
>> +drmem_remove_lmb_update(lmb);
>> +}
>> +}
>> +
>> +return rc;
>> +}
> 
> This leaves rc potentially uninitialised.
> 
> What should the result be in that case, -EINVAL ?

I will force it to be zero (0).  Failure to find anything
to update is not an error.

> 
> cheers

Thanks.

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



[PATCH v06 2/5] powerpc/drmem: Add internal_flags feature

2018-10-15 Thread Michael Bringmann
powerpc/drmem: Add internal_flags field to each LMB to allow
marking of kernel software-specific operations that need not
be exported to other users.  For instance, if information about
selected LMBs needs to be maintained for subsequent passes
through the system, it can be encoded into the LMB array itself
without requiring the allocation and maintenance of additional
data structures.

Signed-off-by: Michael Bringmann 
---
Changes in v04:
  -- Add another initialization of 'lmb->internal_flags' to
 init_drmem_v2_lmbs.
---
 arch/powerpc/include/asm/drmem.h |   18 ++
 arch/powerpc/mm/drmem.c  |3 +++
 2 files changed, 21 insertions(+)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index cfe8598..dbb3e6c 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -17,6 +17,7 @@ struct drmem_lmb {
u32 drc_index;
u32 aa_index;
u32 flags;
+   u32 internal_flags;
 };

 struct drmem_lmb_info {
@@ -94,6 +95,23 @@ static inline bool drmem_lmb_reserved(struct drmem_lmb *lmb)
return lmb->flags & DRMEM_LMB_RESERVED;
 }

+#define DRMEM_LMBINT_UPDATE0x0001
+
+static inline void drmem_mark_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags |= DRMEM_LMBINT_UPDATE;
+}
+
+static inline void drmem_remove_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags &= ~DRMEM_LMBINT_UPDATE;
+}
+
+static inline bool drmem_lmb_update(struct drmem_lmb *lmb)
+{
+   return lmb->internal_flags & DRMEM_LMBINT_UPDATE;
+}
+
 u64 drmem_lmb_memory_max(void);
 void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index ded9dbf..f199fe5 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -207,6 +207,7 @@ static void read_drconf_v1_cell(struct drmem_lmb *lmb,

lmb->aa_index = of_read_number(p++, 1);
lmb->flags = of_read_number(p++, 1);
+   lmb->internal_flags = 0;

*prop = p;
 }
@@ -265,6 +266,7 @@ static void __walk_drmem_v2_lmbs(const __be32 *prop, const 
__be32 *usm,

lmb.aa_index = dr_cell.aa_index;
lmb.flags = dr_cell.flags;
+   lmb.internal_flags = 0;

func(&lmb, &usm);
}
@@ -441,6 +443,7 @@ static void init_drmem_v2_lmbs(const __be32 *prop,

lmb->aa_index = dr_cell.aa_index;
lmb->flags = dr_cell.flags;
+   lmb->internal_flags = 0;
}
}
 }



[PATCH v06 5/5] migration/memory: Support 'ibm,dynamic-memory-v2'

2018-10-15 Thread Michael Bringmann
migration/memory: This patch adds recognition for changes to the
associativity of memory blocks described by 'ibm,dynamic-memory-v2'.
If the associativity of an LMB has changed, it should be readded to
the system in order to update local and general kernel data structures.
This patch builds upon previous enhancements that scan the device-tree
"ibm,dynamic-memory" properties using the base LMB array, and a copy
derived from the updated properties.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index dc2aa34..8c08eb2 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -1184,7 +1184,8 @@ static int pseries_memory_notifier(struct notifier_block 
*nb,
err = pseries_remove_mem_node(rd->dn);
break;
case OF_RECONFIG_UPDATE_PROPERTY:
-   if (!strcmp(rd->prop->name, "ibm,dynamic-memory")) {
+   if (!strcmp(rd->prop->name, "ibm,dynamic-memory") ||
+   !strcmp(rd->prop->name, "ibm,dynamic-memory-v2")) {
struct drmem_lmb_info *dinfo =
drmem_lmbs_init(rd->prop);
if (!dinfo)



[PATCH v06 4/5] migration/memory: Evaluate LMB assoc changes

2018-10-15 Thread Michael Bringmann
migration/memory: This patch adds code that recognizes changes to
the associativity of memory blocks described by the device-tree
properties in order to drive equivalent 'hotplug' operations to
update local and general kernel data structures to reflect those
changes.  These differences may include:

* Evaluate 'ibm,dynamic-memory' properties when processing the
  updated device-tree properties of the system during Post Migration
  events (migration_store).  The new functionality looks for changes
  to the aa_index values for each drc_index/LMB to identify any memory
  blocks that should be readded.

* In an LPAR migration scenario, the "ibm,associativity-lookup-arrays"
  property may change.  In the event that a row of the array differs,
  locate all assigned memory blocks with that 'aa_index' and 're-add'
  them to the system memory block data structures.  In the process of
  the 're-add', the system routines will update the corresponding entry
  for the memory in the LMB structures and any other relevant kernel
  data structures.

A number of previous extensions made to the DRMEM code for scanning
device-tree properties and creating LMB arrays are used here to
ensure that the resulting code is simpler and more usable:

* Use new paired list iterator for the DRMEM LMB info arrays to find
  differences in old and new versions of properties.
* Use new iterator for copies of the DRMEM info arrays to evaluate
  completely new structures.
* Combine common code for parsing and evaluating memory description
  properties based on the DRMEM LMB array model to greatly simplify
  extension from the older property 'ibm,dynamic-memory' to the new
  property model of 'ibm,dynamic-memory-v2'.

For support, add a new pseries hotplug action for DLPAR operations,
PSERIES_HP_ELOG_ACTION_READD_MULTIPLE.  It is a variant of the READD
operation which performs the action upon multiple instances of the
resource at one time.  The operation is to be triggered by device-tree
analysis of updates by RTAS events analyzed by 'migration_store' during
post-migration processing.  It will be used for memory updates,
initially.
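
A sketch of the lookup-array handling described above (the helper name
and exact comparison policy are illustrative rather than lifted from
this patch; struct assoc_arrays, for_each_drmem_lmb, DRMEM_LMB_ASSIGNED
and drmem_mark_lmb_update are the pieces the series relies on):

	/* Mark every assigned LMB whose associativity row changed between
	 * the old and new "ibm,associativity-lookup-arrays" properties.
	 * Each row is 'array_sz' cells long; lmb->aa_index selects a row.
	 */
	static void mark_lmbs_for_changed_rows(struct assoc_arrays *old_aa,
					       struct assoc_arrays *new_aa)
	{
		struct drmem_lmb *lmb;
		u32 row;

		for (row = 0; row < old_aa->n_arrays && row < new_aa->n_arrays;
		     row++) {
			const __be32 *o = &old_aa->arrays[row * old_aa->array_sz];
			const __be32 *n = &new_aa->arrays[row * new_aa->array_sz];

			if (old_aa->array_sz == new_aa->array_sz &&
			    !memcmp(o, n, old_aa->array_sz * sizeof(__be32)))
				continue;

			/* this row differs: tag every LMB that uses it */
			for_each_drmem_lmb(lmb) {
				if (lmb->aa_index == row &&
				    (lmb->flags & DRMEM_LMB_ASSIGNED))
					drmem_mark_lmb_update(lmb);
			}
		}
	}

The LMBs tagged this way are then picked up in a single pass by the
READD_MULTIPLE handler added in the previous patch.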

Signed-off-by: Michael Bringmann 
---
Changes in v06:
  -- Rebase to powerpc next branch to account for recent code changes.
  -- Fix prototype problem when CONFIG_MEMORY_HOTPLUG not defined.
Changes in v05:
  -- Move common structure from numa.c + hotplug-memory.c to header file.
  -- Clarify some comments.
  -- Use walk_drmem_lmbs_pairs and callback instead of local loop
Changes in v04:
  -- Move dlpar_memory_readd_multiple() function definition and use
 into previous patch along with action constant definition.
  -- Correct spacing in patch
Changes in v03:
  -- Modify the code that parses the memory affinity attributes to
 mark relevant DRMEM LMB array entries using the internal_flags
     mechanism instead of generating unique hotplug actions for each
 memory block to be readded.  The change is intended to both
 simplify the code, and to require fewer resources on systems
 with huge amounts of memory.
  -- Save up notice about any and all changed LMB entries until the end of the
 'migration_store' operation at which point a single action is
 queued to scan the entire DRMEM array.
  -- Add READD_MULTIPLE function for memory that scans the DRMEM
 array to identify multiple entries that were marked previously.
 The corresponding memory blocks are to be readded to the system
 to update relevant data structures outside of the powerpc-
 specific code.
  -- Change dlpar_memory_pmt_changes_action to directly queue worker
 to pseries work queue.
---
 arch/powerpc/include/asm/topology.h |7 +
 arch/powerpc/mm/numa.c  |6 -
 arch/powerpc/platforms/pseries/hotplug-memory.c |  207 +++
 arch/powerpc/platforms/pseries/mobility.c   |3 
 arch/powerpc/platforms/pseries/pseries.h|8 +
 5 files changed, 186 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index a4a718d..fbe03df 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -135,5 +135,12 @@ static inline void shared_proc_topology_init(void) {}
 #endif
 #endif

+
+struct assoc_arrays {
+   u32 n_arrays;
+   u32 array_sz;
+   const __be32 *arrays;
+};
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 693ae1c..f1e7287 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -368,12 +368,6 @@ static unsigned long read_n_cells(int n, const __be32 
**buf)
return result;
 }

-struct assoc_arrays {
-   u32 n_arrays;
-   u32 array_sz;
-   const __be32 *arrays;
-};
-
 /*
  * Retrieve and validate the list of associativity arrays for drconf
  * memory from the ibm,associativity-lookup-arrays property of the
diff --git a/ar

[PATCH v06 3/5] migration/memory: Add hotplug READD_MULTIPLE

2018-10-15 Thread Michael Bringmann
migration/memory: This patch adds a new pseries hotplug action
for CPU and memory operations, PSERIES_HP_ELOG_ACTION_READD_MULTIPLE.
This is a variant of the READD operation which performs the action
upon multiple instances of the resource at one time.  The operation
is to be triggered by device-tree analysis of updates by RTAS events
analyzed by 'migration_store' during post-migration processing.  It
will be used for memory updates, initially.

Signed-off-by: Michael Bringmann 
---
Changes in v05:
  -- Provide dlpar_memory_readd_helper routine to compress some common code
Changes in v04:
  -- Move init of 'lmb->internal_flags' in init_drmem_v2_lmbs to
 previous patch.
  -- Pull in implementation of dlpar_memory_readd_multiple() to go
 with operation flag.
---
 arch/powerpc/include/asm/rtas.h |1 +
 arch/powerpc/platforms/pseries/hotplug-memory.c |   44 ---
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 0183e95..cc00451 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -333,6 +333,7 @@ struct pseries_hp_errorlog {
 #define PSERIES_HP_ELOG_ACTION_ADD 1
 #define PSERIES_HP_ELOG_ACTION_REMOVE  2
 #define PSERIES_HP_ELOG_ACTION_READD   3
+#define PSERIES_HP_ELOG_ACTION_READD_MULTIPLE  4

 #define PSERIES_HP_ELOG_ID_DRC_NAME1
 #define PSERIES_HP_ELOG_ID_DRC_INDEX   2
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 2b796da..9c76345 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -507,6 +507,19 @@ static int dlpar_memory_remove_by_index(u32 drc_index)
return rc;
 }

+static int dlpar_memory_readd_helper(struct drmem_lmb *lmb)
+{
+   int rc;
+
+   rc = dlpar_remove_lmb(lmb);
+   if (!rc) {
+   rc = dlpar_add_lmb(lmb);
+   if (rc)
+   dlpar_release_drc(lmb->drc_index);
+   }
+   return rc;
+}
+
 static int dlpar_memory_readd_by_index(u32 drc_index)
 {
struct drmem_lmb *lmb;
@@ -519,12 +532,7 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
for_each_drmem_lmb(lmb) {
if (lmb->drc_index == drc_index) {
lmb_found = 1;
-   rc = dlpar_remove_lmb(lmb);
-   if (!rc) {
-   rc = dlpar_add_lmb(lmb);
-   if (rc)
-   dlpar_release_drc(lmb->drc_index);
-   }
+   rc = dlpar_memory_readd_helper(lmb);
break;
}
}
@@ -541,6 +549,23 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
return rc;
 }

+static int dlpar_memory_readd_multiple(void)
+{
+   struct drmem_lmb *lmb;
+   int rc;
+
+   pr_info("Attempting to update multiple LMBs\n");
+
+   for_each_drmem_lmb(lmb) {
+   if (drmem_lmb_update(lmb)) {
+   rc = dlpar_memory_readd_helper(lmb);
+   drmem_remove_lmb_update(lmb);
+   }
+   }
+
+   return rc;
+}
+
 static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 {
struct drmem_lmb *lmb, *start_lmb, *end_lmb;
@@ -641,6 +666,10 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
 {
return -EOPNOTSUPP;
 }
+static int dlpar_memory_readd_multiple(void)
+{
+   return -EOPNOTSUPP;
+}

 static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 {
@@ -918,6 +947,9 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_readd_by_index(drc_index);
break;
+   case PSERIES_HP_ELOG_ACTION_READD_MULTIPLE:
+   rc = dlpar_memory_readd_multiple();
+   break;
default:
pr_err("Invalid action (%d) specified\n", hp_elog->action);
rc = -EINVAL;



Fwd: [PATCH v06 2/5] powerpc/drmem: Add internal_flags feature

2018-10-15 Thread Michael Bringmann
powerpc/drmem: Add internal_flags field to each LMB to allow
marking of kernel software-specific operations that need not
be exported to other users.  For instance, if information about
selected LMBs needs to be maintained for subsequent passes
through the system, it can be encoded into the LMB array itself
without requiring the allocation and maintenance of additional
data structures.

Signed-off-by: Michael Bringmann 
---
Changes in v04:
  -- Add another initialization of 'lmb->internal_flags' to
 init_drmem_v2_lmbs.
---
 arch/powerpc/include/asm/drmem.h |   18 ++
 arch/powerpc/mm/drmem.c  |3 +++
 2 files changed, 21 insertions(+)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index cfe8598..dbb3e6c 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -17,6 +17,7 @@ struct drmem_lmb {
u32 drc_index;
u32 aa_index;
u32 flags;
+   u32 internal_flags;
 };

 struct drmem_lmb_info {
@@ -94,6 +95,23 @@ static inline bool drmem_lmb_reserved(struct drmem_lmb *lmb)
return lmb->flags & DRMEM_LMB_RESERVED;
 }

+#define DRMEM_LMBINT_UPDATE0x0001
+
+static inline void drmem_mark_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags |= DRMEM_LMBINT_UPDATE;
+}
+
+static inline void drmem_remove_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags &= ~DRMEM_LMBINT_UPDATE;
+}
+
+static inline bool drmem_lmb_update(struct drmem_lmb *lmb)
+{
+   return lmb->internal_flags & DRMEM_LMBINT_UPDATE;
+}
+
 u64 drmem_lmb_memory_max(void);
 void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index ded9dbf..f199fe5 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -207,6 +207,7 @@ static void read_drconf_v1_cell(struct drmem_lmb *lmb,

lmb->aa_index = of_read_number(p++, 1);
lmb->flags = of_read_number(p++, 1);
+   lmb->internal_flags = 0;

*prop = p;
 }
@@ -265,6 +266,7 @@ static void __walk_drmem_v2_lmbs(const __be32 *prop, const 
__be32 *usm,

lmb.aa_index = dr_cell.aa_index;
lmb.flags = dr_cell.flags;
+   lmb.internal_flags = 0;

func(&lmb, &usm);
}
@@ -441,6 +443,7 @@ static void init_drmem_v2_lmbs(const __be32 *prop,

lmb->aa_index = dr_cell.aa_index;
lmb->flags = dr_cell.flags;
+   lmb->internal_flags = 0;
}
}
 }



[PATCH v06 1/5] powerpc/drmem: Export 'dynamic-memory' loader

2018-10-15 Thread Michael Bringmann
powerpc/drmem: Export many of the functions of DRMEM to parse
"ibm,dynamic-memory" and "ibm,dynamic-memory-v2" during hotplug
operations and for Post Migration events.

Also modify the DRMEM initialization code to allow it to,

* Be called after system initialization
* Provide a separate user copy of the LMB array that it produces
* Free the user copy upon request

In addition, a couple of changes were made to make the creation
of additional copies of the LMB array more useful including,

* Add iterator function to work through a pair of drmem_info arrays
  with a callback function to apply specific tests.
* Modify DRMEM code to replace usages of dt_root_addr_cells, and
  dt_mem_next_cell, as these are only available at first boot.
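
One way the runtime replacement could look (a sketch only; the hunks
below do not show how this series actually seeds n_root_addr_cells, so
the initialization here is an assumption, using the generic OF helpers):

	/* Lazily determine the root node's "#address-cells" value instead
	 * of relying on dt_root_addr_cells, which is __initdata.
	 */
	if (n_root_addr_cells == 0) {
		u32 cells = 2;	/* reasonable 64-bit default */

		of_property_read_u32(of_root, "#address-cells", &cells);
		n_root_addr_cells = cells;
	}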

Signed-off-by: Michael Bringmann 
---
Changes in v05:
  -- Add walk_drmem_lmbs_pairs to replace macro for_each_pair_lmb
---
 arch/powerpc/include/asm/drmem.h |   13 +
 arch/powerpc/mm/drmem.c  |   96 ++
 2 files changed, 89 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 7c1d8e7..cfe8598 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -35,6 +35,11 @@ struct drmem_lmb_info {
&drmem_info->lmbs[0],   \
&drmem_info->lmbs[drmem_info->n_lmbs - 1])

+#define for_each_dinfo_lmb(dinfo, lmb) \
+   for_each_drmem_lmb_in_range((lmb),  \
+   &dinfo->lmbs[0],\
+   &dinfo->lmbs[dinfo->n_lmbs - 1])
+
 /*
  * The of_drconf_cell_v1 struct defines the layout of the LMB data
  * specified in the ibm,dynamic-memory device tree property.
@@ -94,6 +99,14 @@ void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
 int drmem_update_dt(void);

+struct drmem_lmb_info *drmem_lmbs_init(struct property *prop);
+void drmem_lmbs_free(struct drmem_lmb_info *dinfo);
+int walk_drmem_lmbs_pairs(struct drmem_lmb_info *dinfo_oth,
+ int (*func)(struct drmem_lmb *cnt,
+   struct drmem_lmb *oth,
+   void *data),
+ void *data);
+
 #ifdef CONFIG_PPC_PSERIES
 void __init walk_drmem_lmbs_early(unsigned long node,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 3f18036..ded9dbf 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -20,6 +20,7 @@

 static struct drmem_lmb_info __drmem_info;
 struct drmem_lmb_info *drmem_info = &__drmem_info;
+static int n_root_addr_cells;

 u64 drmem_lmb_memory_max(void)
 {
@@ -193,12 +194,13 @@ int drmem_update_dt(void)
return rc;
 }

-static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
+static void read_drconf_v1_cell(struct drmem_lmb *lmb,
   const __be32 **prop)
 {
const __be32 *p = *prop;

-   lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+   lmb->base_addr = of_read_number(p, n_root_addr_cells);
+   p += n_root_addr_cells;
lmb->drc_index = of_read_number(p++, 1);

p++; /* skip reserved field */
@@ -209,7 +211,7 @@ static void __init read_drconf_v1_cell(struct drmem_lmb 
*lmb,
*prop = p;
 }

-static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
 {
struct drmem_lmb lmb;
@@ -225,13 +227,14 @@ static void __init __walk_drmem_v1_lmbs(const __be32 
*prop, const __be32 *usm,
}
 }

-static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
+static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
   const __be32 **prop)
 {
const __be32 *p = *prop;

dr_cell->seq_lmbs = of_read_number(p++, 1);
-   dr_cell->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+   dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
+   p += n_root_addr_cells;
dr_cell->drc_index = of_read_number(p++, 1);
dr_cell->aa_index = of_read_number(p++, 1);
dr_cell->flags = of_read_number(p++, 1);
@@ -239,7 +242,7 @@ static void __init read_drconf_v2_cell(struct 
of_drconf_cell_v2 *dr_cell,
*prop = p;
 }

-static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
 {
struct of_drconf_cell_v2 dr_cell;
@@ -275,6 +278,9 @@ void __init walk_drmem_lmbs_early(

[PATCH v06 0/5] powerpc/migration: Affinity fix for memory

2018-10-15 Thread Michael Bringmann
The migration of LPARs across Power systems affects many attributes
including that of the associativity of memory blocks.  The patches
in this set execute when a system is coming up fresh upon a migration
target.  They are intended to,

* Recognize changes to the associativity of memory recorded in
  internal data structures when compared to the latest copies in
  the device tree (e.g. ibm,dynamic-memory, ibm,dynamic-memory-v2).
* Recognize changes to the associativity mapping (e.g. ibm,
  associativity-lookup-arrays), locate all assigned memory blocks
  corresponding to each changed row, and readd all such blocks.
* Generate calls to other code layers to reset the data structures
  related to associativity of memory.
* Re-register the 'changed' entities into the target system.
  Re-registration of memory blocks mostly entails acting as if they
  have been newly hot-added into the target system.

This code builds upon features introduced in a previous patch set
that updates CPUs for affinity changes that may occur during LPM.

Signed-off-by: Michael Bringmann 

Michael Bringmann (5):
  powerpc/drmem: Export 'dynamic-memory' loader
  powerpc/drmem: Add internal_flags feature
  migration/memory: Add hotplug flags READD_MULTIPLE
  migration/memory: Evaluate LMB assoc changes
  migration/memory: Support 'ibm,dynamic-memory-v2'
---
Changes in v06:
  -- Rebase to powerpc next branch to account for recent code changes.
  -- Fix prototype problem when CONFIG_MEMORY_HOTPLUG not defined.
Changes in v05:
  -- Add walk_drmem_lmbs_pairs to replace macro for_each_pair_lmb
  -- Use walk_drmem_lmbs_pairs and callback instead of local loop
  -- Provide dlpar_memory_readd_helper routine to compress some common code
  -- Move common structure from numa.c + hotplug-memory.c to header file.
  -- Clarify some comments.
Changes in v04:
  -- Move dlpar_memory_readd_multiple() to patch with new ACTION
 constant.
  -- Move init of 'lmb->internal_flags' in init_drmem_v2_lmbs to
 patch with other references to flag.
  -- Correct spacing in one of the patches
Changes in v03:
  -- Change operation to tag changed LMBs in DRMEM array instead of
 queuing a potentially huge number of structures.
  -- Added another hotplug queue event for CPU/memory operations
  -- Added internal_flags feature to DRMEM
  -- Improve the patch description language for the patch set.
  -- Revise patch set to queue worker for memory association
 updates directly to pseries worker queue.



[PATCH] hotplug/cpu: Extend start/stop cpumap lock scope

2018-10-13 Thread Michael Bringmann
The PPC mobility code may receive DLPAR CPU add/remove requests
to perform CPU changes at any time, including during LPAR migration
or RTAS requests or SMT changes.  When the operations are received
concurrently, there is an opportunity for DLPAR CPU remove requests
and other requests to overlap, and for one of the requests to be
interrupted after some shared state has been modified.  This patch
changes the duration for which cpu map updates are suppressed
during DLPAR CPU operations made by 'drmgr', to reduce the period
in which changes to shared state may occur.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c |   10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 2f8e621..fce46c56 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -356,7 +356,6 @@ static int dlpar_online_cpu(struct device_node *dn)
 
nthreads = len / sizeof(u32);
 
-   cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
thread = be32_to_cpu(intserv[i]);
for_each_present_cpu(cpu) {
@@ -378,7 +377,6 @@ static int dlpar_online_cpu(struct device_node *dn)
printk(KERN_WARNING "Could not find cpu to online "
   "with physical id 0x%x\n", thread);
}
-   cpu_maps_update_done();
 
 out:
return rc;
@@ -523,7 +521,6 @@ static int dlpar_offline_cpu(struct device_node *dn)
 
nthreads = len / sizeof(u32);
 
-   cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
thread = be32_to_cpu(intserv[i]);
for_each_present_cpu(cpu) {
@@ -559,7 +556,6 @@ static int dlpar_offline_cpu(struct device_node *dn)
if (cpu == num_possible_cpus())
printk(KERN_WARNING "Could not find cpu to offline with 
physical id 0x%x\n", thread);
}
-   cpu_maps_update_done();
 
 out:
return rc;
@@ -811,6 +807,7 @@ int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
drc_index = hp_elog->_drc_u.drc_index;
 
lock_device_hotplug();
+   cpu_maps_update_begin();
 
switch (hp_elog->action) {
case PSERIES_HP_ELOG_ACTION_REMOVE:
@@ -835,6 +832,7 @@ int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
break;
}
 
+   cpu_maps_update_done();
unlock_device_hotplug();
return rc;
 }
@@ -850,7 +848,9 @@ static ssize_t dlpar_cpu_probe(const char *buf, size_t 
count)
if (rc)
return -EINVAL;
 
+   cpu_maps_update_begin();
rc = dlpar_cpu_add(drc_index);
+   cpu_maps_update_done();
 
return rc ? rc : count;
 }
@@ -871,7 +871,9 @@ static ssize_t dlpar_cpu_release(const char *buf, size_t 
count)
return -EINVAL;
}
 
+   cpu_maps_update_begin();
rc = dlpar_cpu_remove(dn, drc_index);
+   cpu_maps_update_done();
of_node_put(dn);
 
return rc ? rc : count;



[PATCH v05 5/5] migration/memory: Support 'ibm,dynamic-memory-v2'

2018-10-13 Thread Michael Bringmann
migration/memory: This patch adds recognition for changes to the
associativity of memory blocks described by 'ibm,dynamic-memory-v2'.
If the associativity of an LMB has changed, it should be readded to
the system in order to update local and general kernel data structures.
This patch builds upon previous enhancements that scan the device-tree
"ibm,dynamic-memory" properties using the base LMB array, and a copy
derived from the updated properties.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 6df5722..75c2118 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -1189,7 +1189,8 @@ static int pseries_memory_notifier(struct notifier_block 
*nb,
err = pseries_remove_mem_node(rd->dn);
break;
case OF_RECONFIG_UPDATE_PROPERTY:
-   if (!strcmp(rd->prop->name, "ibm,dynamic-memory")) {
+   if (!strcmp(rd->prop->name, "ibm,dynamic-memory") ||
+   !strcmp(rd->prop->name, "ibm,dynamic-memory-v2")) {
struct drmem_lmb_info *dinfo =
drmem_lmbs_init(rd->prop);
if (!dinfo)



[PATCH v05 4/5] migration/memory: Evaluate LMB assoc changes

2018-10-13 Thread Michael Bringmann
migration/memory: This patch adds code that recognizes changes to
the associativity of memory blocks described by the device-tree
properties in order to drive equivalent 'hotplug' operations to
update local and general kernel data structures to reflect those
changes.  These differences may include:

* Evaluate 'ibm,dynamic-memory' properties when processing the
  updated device-tree properties of the system during Post Migration
  events (migration_store).  The new functionality looks for changes
  to the aa_index values for each drc_index/LMB to identify any memory
  blocks that should be readded.

* In an LPAR migration scenario, the "ibm,associativity-lookup-arrays"
  property may change.  In the event that a row of the array differs,
  locate all assigned memory blocks with that 'aa_index' and 're-add'
  them to the system memory block data structures.  In the process of
  the 're-add', the system routines will update the corresponding entry
  for the memory in the LMB structures and any other relevant kernel
  data structures.

A number of previous extensions made to the DRMEM code for scanning
device-tree properties and creating LMB arrays are used here to
ensure that the resulting code is simpler and more usable:

* Use new paired list iterator for the DRMEM LMB info arrays to find
  differences in old and new versions of properties.
* Use new iterator for copies of the DRMEM info arrays to evaluate
  completely new structures.
* Combine common code for parsing and evaluating memory description
  properties based on the DRMEM LMB array model to greatly simplify
  extension from the older property 'ibm,dynamic-memory' to the new
  property model of 'ibm,dynamic-memory-v2'.

For support, add a new pseries hotplug action for DLPAR operations,
PSERIES_HP_ELOG_ACTION_READD_MULTIPLE.  It is a variant of the READD
operation which performs the action upon multiple instances of the
resource at one time.  The operation is to be triggered by device-tree
analysis of updates by RTAS events analyzed by 'migration_store' during
post-migration processing.  It will be used for memory updates,
initially.

Signed-off-by: Michael Bringmann 
---
Changes in v05:
  -- Move common structure from numa.c + hotplug-memory.c to header file.
  -- Clarify some comments.
  -- Use walk_drmem_lmbs_pairs and callback instead of local loop
Changes in v04:
  -- Move dlpar_memory_readd_multiple() function definition and use
 into previous patch along with action constant definition.
  -- Correct spacing in patch
Changes in v03:
  -- Modify the code that parses the memory affinity attributes to
 mark relevant DRMEM LMB array entries using the internal_flags
     mechanism instead of generating unique hotplug actions for each
 memory block to be readded.  The change is intended to both
 simplify the code, and to require fewer resources on systems
 with huge amounts of memory.
  -- Save up notice about any and all changed LMB entries until the end of the
 'migration_store' operation at which point a single action is
 queued to scan the entire DRMEM array.
  -- Add READD_MULTIPLE function for memory that scans the DRMEM
 array to identify multiple entries that were marked previously.
 The corresponding memory blocks are to be readded to the system
 to update relevant data structures outside of the powerpc-
 specific code.
  -- Change dlpar_memory_pmt_changes_action to directly queue worker
 to pseries work queue.
---
 arch/powerpc/include/asm/topology.h |7 +
 arch/powerpc/mm/numa.c  |6 -
 arch/powerpc/platforms/pseries/hotplug-memory.c |  207 +++
 arch/powerpc/platforms/pseries/mobility.c   |4 
 arch/powerpc/platforms/pseries/pseries.h|4 
 5 files changed, 183 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index a4a718d..fbe03df 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -135,5 +135,12 @@ static inline void shared_proc_topology_init(void) {}
 #endif
 #endif
 
+
+struct assoc_arrays {
+   u32 n_arrays;
+   u32 array_sz;
+   const __be32 *arrays;
+};
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b5a71ba..ab881e3 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -368,12 +368,6 @@ static unsigned long read_n_cells(int n, const __be32 
**buf)
return result;
 }
 
-struct assoc_arrays {
-   u32 n_arrays;
-   u32 array_sz;
-   const __be32 *arrays;
-};
-
 /*
  * Retrieve and validate the list of associativity arrays for drconf
  * memory from the ibm,associativity-lookup-arrays property of the
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 2b12deb..6df5722 100644
--- a/arch/powerpc/platfor

[PATCH v05 3/5] migration/memory: Add hotplug READD_MULTIPLE

2018-10-13 Thread Michael Bringmann
migration/memory: This patch adds a new pseries hotplug action
for CPU and memory operations, PSERIES_HP_ELOG_ACTION_READD_MULTIPLE.
This is a variant of the READD operation which performs the action
upon multiple instances of the resource at one time.  The operation
is to be triggered by device-tree analysis of updates by RTAS events
analyzed by 'migration_store' during post-migration processing.  It
will be used for memory updates, initially.

Signed-off-by: Michael Bringmann 
---
Changes in v05:
  -- Provide dlpar_memory_readd_helper routine to compress some common code
Changes in v04:
  -- Move init of 'lmb->internal_flags' in init_drmem_v2_lmbs to
 previous patch.
  -- Pull in implementation of dlpar_memory_readd_multiple() to go
 with operation flag.
---
 arch/powerpc/include/asm/rtas.h |1 +
 arch/powerpc/platforms/pseries/hotplug-memory.c |   44 ---
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 0183e95..cc00451 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -333,6 +333,7 @@ struct pseries_hp_errorlog {
 #define PSERIES_HP_ELOG_ACTION_ADD 1
 #define PSERIES_HP_ELOG_ACTION_REMOVE  2
 #define PSERIES_HP_ELOG_ACTION_READD   3
+#define PSERIES_HP_ELOG_ACTION_READD_MULTIPLE  4
 
 #define PSERIES_HP_ELOG_ID_DRC_NAME1
 #define PSERIES_HP_ELOG_ID_DRC_INDEX   2
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 9a15d39..2b12deb 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -512,6 +512,19 @@ static int dlpar_memory_remove_by_index(u32 drc_index)
return rc;
 }
 
+static int dlpar_memory_readd_helper(struct drmem_lmb *lmb)
+{
+   int rc;
+
+   rc = dlpar_remove_lmb(lmb);
+   if (!rc) {
+   rc = dlpar_add_lmb(lmb);
+   if (rc)
+   dlpar_release_drc(lmb->drc_index);
+   }
+   return rc;
+}
+
 static int dlpar_memory_readd_by_index(u32 drc_index)
 {
struct drmem_lmb *lmb;
@@ -524,12 +537,7 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
for_each_drmem_lmb(lmb) {
if (lmb->drc_index == drc_index) {
lmb_found = 1;
-   rc = dlpar_remove_lmb(lmb);
-   if (!rc) {
-   rc = dlpar_add_lmb(lmb);
-   if (rc)
-   dlpar_release_drc(lmb->drc_index);
-   }
+   rc = dlpar_memory_readd_helper(lmb);
break;
}
}
@@ -546,6 +554,23 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
return rc;
 }
 
+static int dlpar_memory_readd_multiple(void)
+{
+   struct drmem_lmb *lmb;
+   int rc;
+
+   pr_info("Attempting to update multiple LMBs\n");
+
+   for_each_drmem_lmb(lmb) {
+   if (drmem_lmb_update(lmb)) {
+   rc = dlpar_memory_readd_helper(lmb);
+   drmem_remove_lmb_update(lmb);
+   }
+   }
+
+   return rc;
+}
+
 static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 {
struct drmem_lmb *lmb, *start_lmb, *end_lmb;
@@ -646,6 +671,10 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
 {
return -EOPNOTSUPP;
 }
+static int dlpar_memory_readd_multiple(void)
+{
+   return -EOPNOTSUPP;
+}
 
 static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 {
@@ -923,6 +952,9 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_readd_by_index(drc_index);
break;
+   case PSERIES_HP_ELOG_ACTION_READD_MULTIPLE:
+   rc = dlpar_memory_readd_multiple();
+   break;
default:
pr_err("Invalid action (%d) specified\n", hp_elog->action);
rc = -EINVAL;



[PATCH v05 2/5] powerpc/drmem: Add internal_flags feature

2018-10-13 Thread Michael Bringmann
powerpc/drmem: Add internal_flags field to each LMB to allow
marking of kernel software-specific operations that need not
be exported to other users.  For instance, if information about
selected LMBs needs to be maintained for subsequent passes
through the system, it can be encoded into the LMB array itself
without requiring the allocation and maintenance of additional
data structures.

Signed-off-by: Michael Bringmann 
---
Changes in v04:
  -- Add another initialization of 'lmb->internal_flags' to
 init_drmem_v2_lmbs.
---
 arch/powerpc/include/asm/drmem.h |   18 ++
 arch/powerpc/mm/drmem.c  |3 +++
 2 files changed, 21 insertions(+)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index cfe8598..dbb3e6c 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -17,6 +17,7 @@ struct drmem_lmb {
u32 drc_index;
u32 aa_index;
u32 flags;
+   u32 internal_flags;
 };
 
 struct drmem_lmb_info {
@@ -94,6 +95,23 @@ static inline bool drmem_lmb_reserved(struct drmem_lmb *lmb)
return lmb->flags & DRMEM_LMB_RESERVED;
 }
 
+#define DRMEM_LMBINT_UPDATE0x0001
+
+static inline void drmem_mark_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags |= DRMEM_LMBINT_UPDATE;
+}
+
+static inline void drmem_remove_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags &= ~DRMEM_LMBINT_UPDATE;
+}
+
+static inline bool drmem_lmb_update(struct drmem_lmb *lmb)
+{
+   return lmb->internal_flags & DRMEM_LMBINT_UPDATE;
+}
+
 u64 drmem_lmb_memory_max(void);
 void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index ded9dbf..f199fe5 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -207,6 +207,7 @@ static void read_drconf_v1_cell(struct drmem_lmb *lmb,
 
lmb->aa_index = of_read_number(p++, 1);
lmb->flags = of_read_number(p++, 1);
+   lmb->internal_flags = 0;
 
*prop = p;
 }
@@ -265,6 +266,7 @@ static void __walk_drmem_v2_lmbs(const __be32 *prop, const 
__be32 *usm,
 
lmb.aa_index = dr_cell.aa_index;
lmb.flags = dr_cell.flags;
+   lmb.internal_flags = 0;
 
func(&lmb, &usm);
}
@@ -441,6 +443,7 @@ static void init_drmem_v2_lmbs(const __be32 *prop,
 
lmb->aa_index = dr_cell.aa_index;
lmb->flags = dr_cell.flags;
+   lmb->internal_flags = 0;
}
}
 }



[PATCH v05 1/5] powerpc/drmem: Export 'dynamic-memory' loader

2018-10-13 Thread Michael Bringmann
powerpc/drmem: Export many of the functions of DRMEM to parse
"ibm,dynamic-memory" and "ibm,dynamic-memory-v2" during hotplug
operations and for Post Migration events.

Also modify the DRMEM initialization code to allow it to,

* Be called after system initialization
* Provide a separate user copy of the LMB array that it produces
* Free the user copy upon request

In addition, a couple of changes were made to make the creation
of additional copies of the LMB array more useful including,

* Add iterator function to work through a pair of drmem_info arrays
  with a callback function to apply specific tests.
* Modify DRMEM code to replace usages of dt_root_addr_cells, and
  dt_mem_next_cell, as these are only available at first boot.

Signed-off-by: Michael Bringmann 
---
Changes in v05:
  -- Add walk_drmem_lmbs_pairs to replace macro for_each_pair_lmb
---
 arch/powerpc/include/asm/drmem.h |   13 +
 arch/powerpc/mm/drmem.c  |   96 ++
 2 files changed, 89 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 7c1d8e7..cfe8598 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -35,6 +35,11 @@ struct drmem_lmb_info {
&drmem_info->lmbs[0],   \
&drmem_info->lmbs[drmem_info->n_lmbs - 1])
 
+#define for_each_dinfo_lmb(dinfo, lmb) \
+   for_each_drmem_lmb_in_range((lmb),  \
+   &dinfo->lmbs[0],\
+   &dinfo->lmbs[dinfo->n_lmbs - 1])
+
 /*
  * The of_drconf_cell_v1 struct defines the layout of the LMB data
  * specified in the ibm,dynamic-memory device tree property.
@@ -94,6 +99,14 @@ void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
 int drmem_update_dt(void);
 
+struct drmem_lmb_info *drmem_lmbs_init(struct property *prop);
+void drmem_lmbs_free(struct drmem_lmb_info *dinfo);
+int walk_drmem_lmbs_pairs(struct drmem_lmb_info *dinfo_oth,
+ int (*func)(struct drmem_lmb *cnt,
+   struct drmem_lmb *oth,
+   void *data),
+ void *data);
+
 #ifdef CONFIG_PPC_PSERIES
 void __init walk_drmem_lmbs_early(unsigned long node,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 3f18036..ded9dbf 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -20,6 +20,7 @@
 
 static struct drmem_lmb_info __drmem_info;
 struct drmem_lmb_info *drmem_info = &__drmem_info;
+static int n_root_addr_cells;
 
 u64 drmem_lmb_memory_max(void)
 {
@@ -193,12 +194,13 @@ int drmem_update_dt(void)
return rc;
 }
 
-static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
+static void read_drconf_v1_cell(struct drmem_lmb *lmb,
   const __be32 **prop)
 {
const __be32 *p = *prop;
 
-   lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+   lmb->base_addr = of_read_number(p, n_root_addr_cells);
+   p += n_root_addr_cells;
lmb->drc_index = of_read_number(p++, 1);
 
p++; /* skip reserved field */
@@ -209,7 +211,7 @@ static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
*prop = p;
 }
 
-static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
 {
struct drmem_lmb lmb;
@@ -225,13 +227,14 @@ static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
}
 }
 
-static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
+static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
   const __be32 **prop)
 {
const __be32 *p = *prop;
 
dr_cell->seq_lmbs = of_read_number(p++, 1);
-   dr_cell->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+   dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
+   p += n_root_addr_cells;
dr_cell->drc_index = of_read_number(p++, 1);
dr_cell->aa_index = of_read_number(p++, 1);
dr_cell->flags = of_read_number(p++, 1);
@@ -239,7 +242,7 @@ static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
*prop = p;
 }
 
-static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
 {
struct of_drconf_cell_v2 dr_cell;
@@ -275,6 +278,9 @@ void __init walk_drmem_lmbs_early(

[PATCH v05 0/5] powerpc/migration: Affinity fix for memory

2018-10-13 Thread Michael Bringmann
The migration of LPARs across Power systems affects many attributes,
including the associativity of memory blocks.  The patches in this set
run when a system first comes up on a migration target.  They are
intended to,

* Recognize changes to the associativity of memory recorded in
  internal data structures when compared to the latest copies in
  the device tree (e.g. ibm,dynamic-memory, ibm,dynamic-memory-v2).
* Recognize changes to the associativity mapping (e.g.
  ibm,associativity-lookup-arrays), locate all assigned memory blocks
  corresponding to each changed row, and readd all such blocks.
* Generate calls to other code layers to reset the data structures
  related to associativity of memory.
* Re-register the 'changed' entities into the target system.
  Re-registration of memory blocks mostly entails acting as if they
  have been newly hot-added into the target system.

This code builds upon features introduced in a previous patch set
that updates CPUs for affinity changes that may occur during LPM.

Signed-off-by: Michael Bringmann 

Michael Bringmann (5):
  powerpc/drmem: Export 'dynamic-memory' loader
  powerpc/drmem: Add internal_flags feature
  migration/memory: Add hotplug flags READD_MULTIPLE
  migration/memory: Evaluate LMB assoc changes
  migration/memory: Support 'ibm,dynamic-memory-v2'
---
Changes in v05:
  -- Add walk_drmem_lmbs_pairs to replace macro for_each_pair_lmb
  -- Use walk_drmem_lmbs_pairs and callback instead of local loop
  -- Provide dlpar_memory_readd_helper routine to compress some common code
  -- Move common structure from numa.c + hotplug-memory.c to header file.
  -- Clarify some comments.
Changes in v04:
  -- Move dlpar_memory_readd_multiple() to patch with new ACTION
 constant.
  -- Move init of 'lmb->internal_flags' in init_drmem_v2_lmbs to
 patch with other references to flag.
  -- Correct spacing in one of the patches
Changes in v03:
  -- Change operation to tag changed LMBs in DRMEM array instead of
 queuing a potentially huge number of structures.
  -- Added another hotplug queue event for CPU/memory operations
  -- Added internal_flags feature to DRMEM
  -- Improve the patch description language for the patch set.
  -- Revise patch set to queue worker for memory association
 updates directly to pseries worker queue.



Re: [PATCH v04 1/5] powerpc/drmem: Export 'dynamic-memory' loader

2018-10-11 Thread Michael Bringmann
Checked my notes.  I changed read_drconf_v1_cell/read_drconf_v2_cell
to use of_read_number in place of dt_mem_next_cell because
dt_mem_next_cell is marked __init and is no longer present in memory
after boot, so calling it after a migration crashes the system.  So we
need that modification, unless we also add some changes to
drivers/of/fdt.c to the mix.
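
For reference, the replacement parsing looks like this (mirroring the
hunks quoted below).  The one-time initialization of n_root_addr_cells
shown in the comment is an assumption, since the hunk that sets it is not
quoted here:

	/* Sketch: non-__init parsing of one v1 LMB cell.  of_read_number()
	 * stays resident, unlike dt_mem_next_cell()/dt_root_addr_cells,
	 * which are __init/__initdata and gone once boot is complete. */
	const __be32 *p = *prop;

	/* assumed one-time runtime setup, e.g.:
	 *	n_root_addr_cells = of_n_addr_cells(of_root);
	 */
	lmb->base_addr = of_read_number(p, n_root_addr_cells);
	p += n_root_addr_cells;
	lmb->drc_index = of_read_number(p++, 1);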

On 10/10/2018 12:34 PM, Michael Bringmann wrote:
> On 10/10/2018 11:54 AM, Nathan Fontenot wrote:
>> On 10/09/2018 03:36 PM, Michael Bringmann wrote:
>>> powerpc/drmem: Export many of the functions of DRMEM to parse
>>> "ibm,dynamic-memory" and "ibm,dynamic-memory-v2" during hotplug
>>> operations and for Post Migration events.
>>>
>>> Also modify the DRMEM initialization code to allow it to,
>>>
>>> * Be called after system initialization
>>> * Provide a separate user copy of the LMB array that it produces
>>> * Free the user copy upon request
>>>
>>> In addition, a couple of changes were made to make the creation
>>> of additional copies of the LMB array more useful including,
>>>
>>> * Add new iterator to work through a pair of drmem_info arrays.
>>> * Modify DRMEM code to replace usages of dt_root_addr_cells, and
>>>   dt_mem_next_cell, as these are only available at first boot.
>>>
>>> Signed-off-by: Michael Bringmann 
>>> ---
>>>  arch/powerpc/include/asm/drmem.h |   15 
>>>  arch/powerpc/mm/drmem.c  |   75 
>>> --
>>>  2 files changed, 70 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/arch/powerpc/include/asm/drmem.h 
>>> b/arch/powerpc/include/asm/drmem.h
>>> index 7c1d8e7..1fbb684 100644
>>> --- a/arch/powerpc/include/asm/drmem.h
>>> +++ b/arch/powerpc/include/asm/drmem.h
>>> @@ -35,6 +35,18 @@ struct drmem_lmb_info {
>>> &drmem_info->lmbs[0],   \
>>> &drmem_info->lmbs[drmem_info->n_lmbs - 1])
>>>
>>> +#define for_each_dinfo_lmb(dinfo, lmb) \
>>> +   for_each_drmem_lmb_in_range((lmb),  \
>>> +   &dinfo->lmbs[0],\
>>> +   &dinfo->lmbs[dinfo->n_lmbs - 1])
>>> +
>>> +#define for_each_pair_dinfo_lmb(dinfo1, lmb1, dinfo2, lmb2)\
>>> +   for ((lmb1) = (&dinfo1->lmbs[0]),   \
>>> +(lmb2) = (&dinfo2->lmbs[0]);   \
>>> +((lmb1) <= (&dinfo1->lmbs[dinfo1->n_lmbs - 1])) && \
>>> +((lmb2) <= (&dinfo2->lmbs[dinfo2->n_lmbs - 1]));   \
>>> +(lmb1)++, (lmb2)++)
>>> +
>>
>> The macros for traversing seem to be getting a bit unwieldy with these
>> updates. I wonder if we should move to just using a walk routine
>> for all traversing of the drmem lmbs.
> 
> We can do that.  One new routine + one API - several macros + 2 files changed.
> 
>>
>>>  /*
>>>   * The of_drconf_cell_v1 struct defines the layout of the LMB data
>>>   * specified in the ibm,dynamic-memory device tree property.
>>> @@ -94,6 +106,9 @@ void __init walk_drmem_lmbs(struct device_node *dn,
>>> void (*func)(struct drmem_lmb *, const __be32 **));
>>>  int drmem_update_dt(void);
>>>
>>> +struct drmem_lmb_info *drmem_lmbs_init(struct property *prop);
>>> +void drmem_lmbs_free(struct drmem_lmb_info *dinfo);
>>> +
>>>  #ifdef CONFIG_PPC_PSERIES
>>>  void __init walk_drmem_lmbs_early(unsigned long node,
>>> void (*func)(struct drmem_lmb *, const __be32 **));
>>> diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
>>> index 3f18036..13d2abb 100644
>>> --- a/arch/powerpc/mm/drmem.c
>>> +++ b/arch/powerpc/mm/drmem.c
>>> @@ -20,6 +20,7 @@
>>>
>>>  static struct drmem_lmb_info __drmem_info;
>>>  struct drmem_lmb_info *drmem_info = &__drmem_info;
>>> +static int n_root_addr_cells;
>>>
>>>  u64 drmem_lmb_memory_max(void)
>>>  {
>>> @@ -193,12 +194,13 @@ int drmem_update_dt(void)
>>> return rc;
>>>  }
>>>
>>> -static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
>>> +static void read_drconf_v1_cell(struct drmem_lmb *lmb,
>>>const __be32 **prop)
>>>  {
>>> const __be32 *p = *prop;
>>>
>>> -lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);

Re: [PATCH v04 3/4] migration/memory: Evaluate LMB assoc changes

2018-10-10 Thread Michael Bringmann
On 10/10/2018 12:24 PM, Nathan Fontenot wrote:
> On 10/09/2018 03:37 PM, Michael Bringmann wrote:

>
>> +static void pseries_update_ala_memory_aai(int aa_index)
>> +{
>> +struct drmem_lmb *lmb;
>> +
>> +/* Readd all LMBs which were previously using the
>> + * specified aa_index value.
>> + */
>> +for_each_drmem_lmb(lmb) {
>> +if ((lmb->aa_index == aa_index) &&
>> +(lmb->flags & DRCONF_MEM_ASSIGNED)) {
>> +drmem_mark_lmb_update(lmb);
>> +dlpar_memory_pmt_changes_set();
>> +}
>> +}
>> +}
>> +
>> +struct assoc_arrays {
>> +u32 n_arrays;
>> +u32 array_sz;
>> +const __be32 *arrays;
>> +};
> 
> This struct is also defined in arch/powerpc/mm/numa.c. It may be a good
> idea to move the definition to a common place.

Moving to topology.h in arch/powerpc/include/asm.
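
For reference, the shared definition could then live in the header as
something like this (a sketch only; final placement and guards are still
to be decided):

/* arch/powerpc/include/asm/topology.h (sketch) */
struct assoc_arrays {
	u32		n_arrays;
	u32		array_sz;
	const __be32	*arrays;
};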

> 
>> +
>> +static int pseries_update_ala_memory(struct of_reconfig_data *pr)
>> +{
>> +struct assoc_arrays new_ala, old_ala;
>> +__be32 *p;
>> +int i, lim;
>> +
>> +if (rtas_hp_event)
>> +return 0;
>> +
>> +/*
>> + * The layout of the ibm,associativity-lookup-arrays
>> + * property is a number N indicating the number of
>> + * associativity arrays, followed by a number M
>> + * indicating the size of each associativity array,
>> + * followed by a list of N associativity arrays.
>> + */
>> +
>> +p = (__be32 *) pr->old_prop->value;
>> +if (!p)
>> +return -EINVAL;
>> +old_ala.n_arrays = of_read_number(p++, 1);
>> +old_ala.array_sz = of_read_number(p++, 1);
>> +old_ala.arrays = p;
>> +
>> +p = (__be32 *) pr->prop->value;
>> +if (!p)
>> +return -EINVAL;
>> +new_ala.n_arrays = of_read_number(p++, 1);
>> +new_ala.array_sz = of_read_number(p++, 1);
>> +new_ala.arrays = p;
>> +
>> +lim = (new_ala.n_arrays > old_ala.n_arrays) ? old_ala.n_arrays :
>> +new_ala.n_arrays;
>> +
>> +if (old_ala.array_sz == new_ala.array_sz) {
>> +
>> +/* Reset any entries where the old and new rows
>> + * the array have changed.
> 
> Small nit, the wording in that comment could be clearer.

Right.

> 
> -Nathan

Michael

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [PATCH v04 3/5] migration/memory: Add hotplug READD_MULTIPLE

2018-10-10 Thread Michael Bringmann
On 10/10/2018 11:59 AM, Nathan Fontenot wrote:
> On 10/09/2018 03:36 PM, Michael Bringmann wrote:
>> migration/memory: This patch adds a new pseries hotplug action
>> for CPU and memory operations, PSERIES_HP_ELOG_ACTION_READD_MULTIPLE.
>> This is a variant of the READD operation which performs the action
>> upon multiple instances of the resource at one time.  The operation
>> is to be triggered by the analysis of device-tree updates from RTAS
>> events that 'migration_store' performs during post-migration
>> processing.  It will be used for memory updates, initially.
>>
>> Signed-off-by: Michael Bringmann 
>> ---
>> Changes in v04:
>>   -- Move init of 'lmb->internal_flags' in init_drmem_v2_lmbs to
>>  previous patch.
>>   -- Pull in implementation of dlpar_memory_readd_multiple() to go
>>  with operation flag.
>> ---
>>  arch/powerpc/include/asm/rtas.h |1 +
>>  arch/powerpc/platforms/pseries/hotplug-memory.c |   31 
>> +++
>>  2 files changed, 32 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/rtas.h 
>> b/arch/powerpc/include/asm/rtas.h
>> index 0183e95..cc00451 100644
>> --- a/arch/powerpc/include/asm/rtas.h
>> +++ b/arch/powerpc/include/asm/rtas.h
>> @@ -333,6 +333,7 @@ struct pseries_hp_errorlog {
>>  #define PSERIES_HP_ELOG_ACTION_ADD  1
>>  #define PSERIES_HP_ELOG_ACTION_REMOVE   2
>>  #define PSERIES_HP_ELOG_ACTION_READD    3
>> +#define PSERIES_HP_ELOG_ACTION_READD_MULTIPLE   4
>>
>>  #define PSERIES_HP_ELOG_ID_DRC_NAME 1
>>  #define PSERIES_HP_ELOG_ID_DRC_INDEX    2
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
>> b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> index 9a15d39..bf2420a 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> @@ -546,6 +546,30 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
>>  return rc;
>>  }
>>
>> +static int dlpar_memory_readd_multiple(void)
>> +{
>> +struct drmem_lmb *lmb;
>> +int rc;
>> +
>> +pr_info("Attempting to update multiple LMBs\n");
>> +
>> +for_each_drmem_lmb(lmb) {
>> +if (drmem_lmb_update(lmb)) {
>> +rc = dlpar_remove_lmb(lmb);
>> +
>> +if (!rc) {
>> +rc = dlpar_add_lmb(lmb);
>> +if (rc)
>> +dlpar_release_drc(lmb->drc_index);
>> +}
> 
> The work you're doing here is essentially the same as what is done in
> dlpar_memory_readd_by_index(). Perhaps pull the common bits of both
> routines into a helper routine. This could include the success/failure
> messages in dlpar_memory_readd_by_index().

Really, only the interior of the loop is common to the two functions.
Creating a helper that incorporated the loop would mean either several
helper functions customized to each path (and a lot more code), or a
common helper function that does everything for both paths and would
be harder to understand/maintain.

It would be a lot cleaner to put the common loop interior into a helper
function, and retain the other two functions with their unique loop +
test + extra operations.  I will update with this method.
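
A minimal sketch of that helper, factoring out the loop interior quoted
above (the name dlpar_memory_readd_helper is the one mentioned in the v05
change log; error reporting in the final patch may differ):

/* Sketch: the remove + re-add sequence shared by
 * dlpar_memory_readd_by_index() and dlpar_memory_readd_multiple(). */
static int dlpar_memory_readd_helper(struct drmem_lmb *lmb)
{
	int rc;

	rc = dlpar_remove_lmb(lmb);
	if (!rc) {
		rc = dlpar_add_lmb(lmb);
		if (rc)
			dlpar_release_drc(lmb->drc_index);
	}

	return rc;
}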

> 
> -Nathan

Michael

> 
>> +
>> +drmem_remove_lmb_update(lmb);
>> +}
>> +}
>> +
>> +return rc;
>> +}
>> +
>>  static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
>>  {
>>  struct drmem_lmb *lmb, *start_lmb, *end_lmb;
>> @@ -646,6 +670,10 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
>>  {
>>  return -EOPNOTSUPP;
>>  }
>> +static int dlpar_memory_readd_multiple(void)
>> +{
>> +return -EOPNOTSUPP;
>> +}
>>
>>  static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
>>  {
>> @@ -923,6 +951,9 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
>>  drc_index = hp_elog->_drc_u.drc_index;
>>  rc = dlpar_memory_readd_by_index(drc_index);
>>  break;
>> +case PSERIES_HP_ELOG_ACTION_READD_MULTIPLE:
>> +rc = dlpar_memory_readd_multiple();
>> +break;
>>  default:
>>  pr_err("Invalid action (%d) specified\n", hp_elog->action);
>>  rc = -EINVAL;
>>
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [PATCH v04 1/5] powerpc/drmem: Export 'dynamic-memory' loader

2018-10-10 Thread Michael Bringmann
On 10/10/2018 11:54 AM, Nathan Fontenot wrote:
> On 10/09/2018 03:36 PM, Michael Bringmann wrote:
>> powerpc/drmem: Export many of the functions of DRMEM to parse
>> "ibm,dynamic-memory" and "ibm,dynamic-memory-v2" during hotplug
>> operations and for Post Migration events.
>>
>> Also modify the DRMEM initialization code to allow it to,
>>
>> * Be called after system initialization
>> * Provide a separate user copy of the LMB array that it produces
>> * Free the user copy upon request
>>
>> In addition, a couple of changes were made to make the creation
>> of additional copies of the LMB array more useful including,
>>
>> * Add new iterator to work through a pair of drmem_info arrays.
>> * Modify DRMEM code to replace usages of dt_root_addr_cells, and
>>   dt_mem_next_cell, as these are only available at first boot.
>>
>> Signed-off-by: Michael Bringmann 
>> ---
>>  arch/powerpc/include/asm/drmem.h |   15 
>>  arch/powerpc/mm/drmem.c  |   75 
>> --
>>  2 files changed, 70 insertions(+), 20 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/drmem.h 
>> b/arch/powerpc/include/asm/drmem.h
>> index 7c1d8e7..1fbb684 100644
>> --- a/arch/powerpc/include/asm/drmem.h
>> +++ b/arch/powerpc/include/asm/drmem.h
>> @@ -35,6 +35,18 @@ struct drmem_lmb_info {
>>  &drmem_info->lmbs[0],   \
>>  &drmem_info->lmbs[drmem_info->n_lmbs - 1])
>>
>> +#define for_each_dinfo_lmb(dinfo, lmb)  \
>> +for_each_drmem_lmb_in_range((lmb),  \
>> +&dinfo->lmbs[0],\
>> +&dinfo->lmbs[dinfo->n_lmbs - 1])
>> +
>> +#define for_each_pair_dinfo_lmb(dinfo1, lmb1, dinfo2, lmb2) \
>> +for ((lmb1) = (&dinfo1->lmbs[0]),   \
>> + (lmb2) = (&dinfo2->lmbs[0]);   \
>> + ((lmb1) <= (&dinfo1->lmbs[dinfo1->n_lmbs - 1])) && \
>> + ((lmb2) <= (&dinfo2->lmbs[dinfo2->n_lmbs - 1]));   \
>> + (lmb1)++, (lmb2)++)
>> +
> 
> The macros for traversing seem to be getting a bit unwieldy with these
> updates. I wonder if we should move to just using a walk routine
> for all traversing of the drmem lmbs.

We can do that.  One new routine + one API - several macros + 2 files changed.

> 
>>  /*
>>   * The of_drconf_cell_v1 struct defines the layout of the LMB data
>>   * specified in the ibm,dynamic-memory device tree property.
>> @@ -94,6 +106,9 @@ void __init walk_drmem_lmbs(struct device_node *dn,
>>  void (*func)(struct drmem_lmb *, const __be32 **));
>>  int drmem_update_dt(void);
>>
>> +struct drmem_lmb_info *drmem_lmbs_init(struct property *prop);
>> +void drmem_lmbs_free(struct drmem_lmb_info *dinfo);
>> +
>>  #ifdef CONFIG_PPC_PSERIES
>>  void __init walk_drmem_lmbs_early(unsigned long node,
>>  void (*func)(struct drmem_lmb *, const __be32 **));
>> diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
>> index 3f18036..13d2abb 100644
>> --- a/arch/powerpc/mm/drmem.c
>> +++ b/arch/powerpc/mm/drmem.c
>> @@ -20,6 +20,7 @@
>>
>>  static struct drmem_lmb_info __drmem_info;
>>  struct drmem_lmb_info *drmem_info = &__drmem_info;
>> +static int n_root_addr_cells;
>>
>>  u64 drmem_lmb_memory_max(void)
>>  {
>> @@ -193,12 +194,13 @@ int drmem_update_dt(void)
>>  return rc;
>>  }
>>
>> -static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
>> +static void read_drconf_v1_cell(struct drmem_lmb *lmb,
>> const __be32 **prop)
>>  {
>>  const __be32 *p = *prop;
>>
>> -lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
>> +lmb->base_addr = of_read_number(p, n_root_addr_cells);
>> +p += n_root_addr_cells;
> 
> Any reason this can't just be
>   lmb->base_addr = dt_mem_next_cell(n_root_addr_cells, &p);

Probably not.  I will rebuild/retest with this.

> 
>>  lmb->drc_index = of_read_number(p++, 1);
>>
>>  p++; /* skip reserved field */
>> @@ -209,7 +211,7 @@ static void __init read_drconf_v1_cell(struct drmem_lmb 
>> *lmb,
>>  *prop = p;
>>  }
>>
>> -static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 
>> *usm,
>> +static void __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
>>  void (*func)(struct 
