Re: [PATCH 1/3] core/device: Add function to return child node using name at substring "@"

2023-09-18 Thread Reza Arbab

On Thu, Sep 14, 2023 at 10:02:04PM +0530, Athira Rajeev wrote:

Add a function dt_find_by_name_before_addr() that returns the child node if
its name matches the given name up to the first occurrence of "@", otherwise
NULL. This is helpful for node names of the form "name@addr". In scenarios
where nodes are added in "name@addr" format and the value of "addr" is not
known, such a node can't be matched by its full name or address. Matching on
the substring before "@" returns the expected result. The patch adds the
dt_find_by_name_before_addr() function and a testcase for it in
core/test/run-device.c.


Series applied to skiboot master with the fixup we discussed.

--
Reza Arbab


Re: [PATCH 1/3] core/device: Add function to return child node using name at substring "@"

2023-09-15 Thread Reza Arbab

Hi Athira,

On Thu, Sep 14, 2023 at 10:02:04PM +0530, Athira Rajeev wrote: 

+struct dt_node *dt_find_by_name_before_addr(struct dt_node *root, const char 
*name)
+{
+   struct dt_node *child, *match;
+   char *child_node = NULL;
+
+   list_for_each(&root->children, child, list) {
+   child_node = strdup(child->name);
+   if (!child_node)
+   goto err;
+   child_node = strtok(child_node, "@");
+   if (!strcmp(child_node, name)) {
+   free(child_node);
+   return child;
+   }
+
+   match = dt_find_by_name_before_addr(child, name);
+   if (match)
+   return match;


When the function returns on this line, child_node is not freed.


+   }
+
+   free(child_node);
+err:
+   return NULL;
+}


I took a stab at moving free(child_node) inside the loop, and ended up 
with this:


struct dt_node *dt_find_by_name_before_addr(struct dt_node *root, const char 
*name)
{
struct dt_node *child, *match = NULL;
char *child_name = NULL;

list_for_each(&root->children, child, list) {
child_name = strdup(child->name);
if (!child_name)
return NULL;

child_name = strtok(child_name, "@");
if (!strcmp(child_name, name))
match = child;
else
match = dt_find_by_name_before_addr(child, name);

free(child_name);
if (match)
return match;
}

return NULL;
}

Does this seem okay to you? If you agree, no need to send another 
revision, I can just fixup during commit. Let me know.



diff --git a/core/test/run-device.c b/core/test/run-device.c
index 4a12382bb..fb7a7d2c0 100644
--- a/core/test/run-device.c
+++ b/core/test/run-device.c
@@ -466,6 +466,20 @@ int main(void)
new_prop_ph = dt_prop_get_u32(ut2, "something");
assert(!(new_prop_ph == ev1_ph));
dt_free(subtree);
+
+   /* Test dt_find_by_name_before_addr */
+   root = dt_new_root("");
+   addr1 = dt_new_addr(root, "node", 0x1);
+   addr2 = dt_new_addr(root, "node0_1", 0x2);
+   assert(dt_find_by_name(root, "node@1") == addr1);
+   assert(dt_find_by_name(root, "node0_1@2") == addr2);
+   assert(dt_find_by_name_before_addr(root, "node") == addr1);
+   assert(dt_find_by_name_before_addr(root, "node0_") == NULL);


This line appears twice. As above, I can fix it during commit, so no need for 
a new patch.



+   assert(dt_find_by_name_before_addr(root, "node0_1") == addr2);
+   assert(dt_find_by_name_before_addr(root, "node0") == NULL);
+   assert(dt_find_by_name_before_addr(root, "node0_") == NULL);
+   dt_free(root);
+
return 0;
}



--
Reza Arbab


Re: [PATCH V5 3/3] skiboot: Update IMC PMU node names for power10

2023-08-09 Thread Reza Arbab

On Mon, Jul 17, 2023 at 08:54:31AM +0530, Athira Rajeev wrote:

@@ -408,14 +469,129 @@ static void disable_unavailable_units(struct dt_node 
*dev)
avl_vec = (0xffULL) << 56;
}

-   for (i = 0; i < ARRAY_SIZE(nest_pmus); i++) {
-   if (!(PPC_BITMASK(i, i) & avl_vec)) {
-   /* Check if the device node exists */
-   target = dt_find_by_name_before_addr(dev, nest_pmus[i]);
-   if (!target)
-   continue;
-   /* Remove the device node */
-   dt_free(target);
+   if (proc_gen == proc_gen_p9) {
+   for (i = 0; i < ARRAY_SIZE(nest_pmus_p9); i++) {
+   if (!(PPC_BITMASK(i, i) & avl_vec)) {


I think all these PPC_BITMASK(i, i) can be changed to PPC_BIT(i).
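
For reference, a minimal sketch of that change against the quoted loop (same
behaviour, since PPC_BITMASK(i, i) is just the single-bit mask PPC_BIT(i)):

		if (!(PPC_BIT(i) & avl_vec)) {	/* was PPC_BITMASK(i, i) */
			/* Check if the device node exists */
			target = dt_find_by_name_before_addr(dev, nest_pmus_p9[i]);
			if (!target)
				continue;
			/* Remove the device node */
			dt_free(target);
		}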


+   /* Check if the device node exists */
+   target = dt_find_by_name_before_addr(dev, 
nest_pmus_p9[i]);
+   if (!target)
+   continue;
+   /* Remove the device node */
+   dt_free(target);
+   }
+   }
+   } else if (proc_gen == proc_gen_p10) {
+   int val;
+   char name[8];
+
+   for (i = 0; i < 11; i++) {
+   if (!(PPC_BITMASK(i, i) & avl_vec)) {
+   /* Check if the device node exists */
+   target = dt_find_by_name_before_addr(dev, 
nest_pmus_p10[i]);
+   if (!target)
+   continue;
+   /* Remove the device node */
+   dt_free(target);
+   }
+   }
+
+   for (i = 35; i < 41; i++) {
+   if (!(PPC_BITMASK(i, i) & avl_vec)) {
+   /* Check if the device node exists for phb */
+   for (j = 0; j < 3; j++) {
+   snprintf(name, sizeof(name), 
"phb%d_%d", (i-35), j);
+   target = 
dt_find_by_name_before_addr(dev, name);
+   if (!target)
+   continue;
+   /* Remove the device node */
+   dt_free(target);
+   }
+   }
+   }
+
+   for (i = 41; i < 58; i++) {
+   if (!(PPC_BITMASK(i, i) & avl_vec)) {
+   /* Check if the device node exists */
+   target = dt_find_by_name_before_addr(dev, 
nest_pmus_p10[i]);
+   if (!target)
+   continue;
+   /* Remove the device node */
+   dt_free(target);
+   }
+   }


--
Reza Arbab


Re: [PATCH V5 1/3] core/device: Add function to return child node using name at substring "@"

2023-08-09 Thread Reza Arbab

Hi Athira,

I still have a couple of the same questions I asked in v4.

On Mon, Jul 17, 2023 at 08:54:29AM +0530, Athira Rajeev wrote:

Add a function dt_find_by_name_before_addr() that returns the child node if
its name matches the given name up to the first occurrence of "@", otherwise NULL.


Given this summary, I don't understand the following:


+   assert(dt_find_by_name(root, "node@1") == addr1);
+   assert(dt_find_by_name(root, "node0_1@2") == addr2);


Is this behavior required? I don't think it makes sense to call this 
function with a second argument containing '@', so I wouldn't expect it 
to match anything in these cases. The function seems to specifically 
enable it:



+struct dt_node *dt_find_by_name_before_addr(struct dt_node *root, const char 
*name)
+{

[snip]

+   node = strdup(name);
+   if (!node)
+   return NULL;
+   node = strtok(node, "@");


Seems like you could get rid of this and just use name as-is.

I was curious about something else; say we have 'node@1' and 'node@2'.  
Is there an expectation of which it should match?


addr1 = dt_new_addr(root, "node", 0x1);
addr2 = dt_new_addr(root, "node", 0x2);
assert(dt_find_by_name_substr(root, "node") == ???);
   ^^^

--
Reza Arbab


Re: [PATCH v3 1/2] powerpc/mm: Cleanup memory block size probing

2023-07-31 Thread Reza Arbab

On Sat, Jul 29, 2023 at 08:58:57PM +0530, Aneesh Kumar K V wrote:
Thanks for correcting the right device tree node and testing the 
changes. Can I add


Co-authored-by: Reza Arbab 


Sure, that's fine.

Signed-off-by: Reza Arbab 

--
Reza Arbab


Re: [PATCH v3 1/2] powerpc/mm: Cleanup memory block size probing

2023-07-28 Thread Reza Arbab

On Fri, Jul 28, 2023 at 04:05:55PM +0530, Aneesh Kumar K.V wrote:

--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c

[snip]

+   /*
+* "ibm,coherent-device-memory with linux,usable-memory = 0
+* Force 256MiB block size. Work around for GPUs on P9 PowerNV
+* linux,usable-memory == 0 implies driver managed memory and
+* we can't use large memory block size due to hotplug/unplug
+* limitations.
+*/
+   compatible = of_get_flat_dt_prop(node, "compatible", NULL);
+   if (compatible && !strcmp(compatible, "ibm,coherent-device-memory")) {
+   int len = 0;
+   const __be32 *usm;
+
+   usm = of_get_flat_dt_prop(node, "linux,drconf-usable-memory", &len);


I think this should be "linux,usable-memory".


+   if (usm && !len) {
+   *block_size = SZ_256M;
+   return 1;
+   }


This isn't quite right. The criteria is not that the property itself has 
no registers, it's that the base/size combo has size zero.


If you fold in the patch appended to the end of this mail, things worked 
for me.



+   }
+
+   reg = of_get_flat_dt_prop(node, "reg", &l);
+   endp = reg + (l / sizeof(__be32));
+
+   while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
+   u64 base, size;
+
+   base = dt_mem_next_cell(dt_root_addr_cells, &reg);
+   size = dt_mem_next_cell(dt_root_size_cells, &reg);
+
+   if (size == 0)
+   continue;
+
+   update_memory_block_size(block_size, size);
+   }
+   /* continue looking for other memory device types */
+   return 0;
+}
+
+/*
+ * start with 1G memory block size. Early init will
+ * fix this with correct value.
+ */
+unsigned long memory_block_size __ro_after_init = 1UL << 30;


Could use SZ_1G here.
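
For example, a one-line sketch of that change (SZ_1G is 1UL << 30, from
<linux/sizes.h>):

unsigned long memory_block_size __ro_after_init = SZ_1G;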

With the following fixup, I got 256MiB blocks on a system with
"ibm,coherent-device-memory" nodes.

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index dbed37d6cffb..1ac58e72a885 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -487,7 +487,6 @@ static int __init probe_memory_block_size(unsigned long 
node, const char *uname,
  depth, void *data)
 {
const char *type;
-   const char *compatible;
unsigned long *block_size = (unsigned long *)data;
const __be32 *reg, *endp;
int l;
@@ -532,38 +531,38 @@ static int __init probe_memory_block_size(unsigned long 
node, const char *uname,
if (type == NULL || strcmp(type, "memory") != 0)
return 0;
 
-	/*

-* "ibm,coherent-device-memory with linux,usable-memory = 0
-* Force 256MiB block size. Work around for GPUs on P9 PowerNV
-* linux,usable-memory == 0 implies driver managed memory and
-* we can't use large memory block size due to hotplug/unplug
-* limitations.
-*/
-   compatible = of_get_flat_dt_prop(node, "compatible", NULL);
-   if (compatible && !strcmp(compatible, "ibm,coherent-device-memory")) {
-   int len = 0;
-   const __be32 *usm;
-
-   usm = of_get_flat_dt_prop(node, "linux,drconf-usable-memory", &len);
-   if (usm && !len) {
-   *block_size = SZ_256M;
-   return 1;
-   }
-   }
+   reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l);
+   if (!reg)
+   reg = of_get_flat_dt_prop(node, "reg", &l);
+   if (!reg)
+   return 0;
 
-	reg = of_get_flat_dt_prop(node, "reg", &l);

endp = reg + (l / sizeof(__be32));
 
 	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {

+   const char *compatible;
u64 base, size;
 
 		base = dt_mem_next_cell(dt_root_addr_cells, &reg);

 		size = dt_mem_next_cell(dt_root_size_cells, &reg);
 
-		if (size == 0)

+   if (size) {
+   update_memory_block_size(block_size, size);
continue;
+   }
 
-		update_memory_block_size(block_size, size);

+   /*
+* ibm,coherent-device-memory with linux,usable-memory = 0
+* Force 256MiB block size. Work around for GPUs on P9 PowerNV
+* linux,usable-memory == 0 implies driver managed memory and
+* we can't use large memory block size due to hotplug/unplug
+* limitations.
+*/
+   compatible = of_get_flat_dt_prop(node, "compatible", NULL);
+   if (compatible && !strcmp(compatible, 
"ibm,coherent-device-memory")) {
+   *block_size = SZ_256M;
+   return 1;
+   }
}
/* continue looking for other memory device types */
return 0;
--
Reza Arbab


Re: [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter

2023-06-13 Thread Reza Arbab

On Fri, Jun 09, 2023 at 11:38:51AM +0530, Aneesh Kumar K.V wrote:

Certain devices can possess non-standard memory capacities, not constrained
to multiples of 1GB. Provide a kernel parameter so that we can map the
device memory completely on memory hotplug.


Case in point: the memory block size determined at boot is 1GB, but we 
know that 15.75GB of device memory will be hotplugged during runtime.
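For example, 15.75GB is 63 x 256MB, so a 256MB block size can map that device 
memory exactly, while 1GB blocks cannot.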


Reviewed-by: Reza Arbab 

--
Reza Arbab


Re: [PATCH v2 1/2] powerpc/mm: Cleanup memory block size probing

2023-06-13 Thread Reza Arbab

On Fri, Jun 09, 2023 at 11:38:50AM +0530, Aneesh Kumar K.V wrote:

Parse the device tree in early init to find the memory block size to be
used by the kernel. Consolidate the memory block size device tree parsing
to one helper and use that on both powernv and pseries. We still want to
use a machine-specific callback because on all machine types other than
powernv and pseries we continue to return MIN_MEMORY_BLOCK_SIZE.

pseries_memory_block_size used to look for the second memory
block (memory@x) to determine the memory_block_size value. This patch
changed that to look at all memory blocks and make sure we can map them all
correctly using the computed memory block size value.


Reviewed-by: Reza Arbab 

--
Reza Arbab


Re: [Skiboot] [PATCH V4 3/3] skiboot: Update IMC PMU node names for power10

2023-03-22 Thread Reza Arbab

On Mon, Mar 06, 2023 at 09:09:13AM +0530, Athira Rajeev wrote:

+   } else if (proc_gen == proc_gen_p10) {
+   int val;
+   char al[8], xl[8], otl[8], phb[8];


Are four different variables needed here? I think you could just reuse 
one temporary variable.


char name[8];
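
For instance, the first case of the quoted switch below could become (sketch
only, reusing the single buffer):

			case 0x5: /* xlink configured and functional */
				snprintf(name, sizeof(name), "alink%1d", (7-i));
				target = dt_find_by_name_substr(dev, name);
				if (target)
					dt_free(target);

				snprintf(name, sizeof(name), "otl%1d_0", (7-i));
				target = dt_find_by_name_substr(dev, name);
				if (target)
					dt_free(target);

				snprintf(name, sizeof(name), "otl%1d_1", (7-i));
				target = dt_find_by_name_substr(dev, name);
				if (target)
					dt_free(target);

				break;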


+   for (i=0; i<8; i++) {
+   val = ((avl_vec & (0x7ULL << (29 + (3 * i)))) >> (29 + (3 * i)));
+   switch (val) {
+   case 0x5: //xlink configured and functional
+
+   snprintf(al, sizeof(al), "alink%1d",(7-i));
+   target = dt_find_by_name_substr(dev, al);
+   if (target)
+   dt_free(target);
+
+   snprintf(otl, sizeof(otl),"otl%1d_0",(7-i));
+   target = dt_find_by_name_substr(dev, otl);
+   if (target)
+   dt_free(target);
+
+   snprintf(otl,sizeof(otl),"otl%1d_1",(7-i));
+   target = dt_find_by_name_substr(dev, otl);
+   if (target)
+   dt_free(target);
+
+   break;
+   case 0x6: //alink configured and functional
+
+   snprintf(xl,sizeof(xl),"xlink%1d",(7-i));
+   target = dt_find_by_name_substr(dev, xl);
+   if (target)
+   dt_free(target);
+
+   snprintf(otl,sizeof(otl),"otl%1d_0",(7-i));
+   target = dt_find_by_name_substr(dev, otl);
+   if (target)
+   dt_free(target);
+
+   snprintf(otl,sizeof(otl),"otl%1d_1",(7-i));
+   target = dt_find_by_name_substr(dev, otl);
+   if (target)
+   dt_free(target);
+   break;
+
+   case 0x7: //CAPI configured and functional
+   snprintf(al,sizeof(al),"alink%1d",(7-i));
+   target = dt_find_by_name_substr(dev, al);
+   if (target)
+   dt_free(target);
+
+   snprintf(xl,sizeof(xl),"xlink%1d",(7-i));
+   target = dt_find_by_name_substr(dev, xl);
+   if (target)
+   dt_free(target);
+   break;
+   default:
+   snprintf(xl,sizeof(xl),"xlink%1d",(7-i));
+   target = dt_find_by_name_substr(dev, xl);
+   if (target)
+   dt_free(target);
+
+   snprintf(al,sizeof(al),"alink%1d",(7-i));
+   target = dt_find_by_name_substr(dev, al);
+   if (target)
+   dt_free(target);
+
+   snprintf(otl,sizeof(otl),"otl%1d_0",(7-i));
+   target = dt_find_by_name_substr(dev, otl);
+   if (target)
+   dt_free(target);
+
+   snprintf(otl,sizeof(otl),"otl%1d_1",(7-i));
+   target = dt_find_by_name_substr(dev, otl);
+   if (target)
+   dt_free(target);
+   break;


As far as I know skiboot follows the kernel coding style. Would you mind 
fixing up the minor style nits checkpatch.pl reports for this patch?


--
Reza Arbab


Re: [Skiboot] [PATCH V4 1/3] core/device: Add function to return child node using name at substring "@"

2023-03-22 Thread Reza Arbab

Hi Athira,

On Mon, Mar 06, 2023 at 09:09:11AM +0530, Athira Rajeev wrote:

Add a function dt_find_by_name_substr() that returns the child node if
its name matches the given name up to the first occurrence of "@", otherwise NULL.


Given this summary, I don't understand the following:


+   assert(dt_find_by_name_substr(root, "node@1") == addr1);
+   assert(dt_find_by_name_substr(root, "node0_1@2") == addr2);


Is this behavior required? I don't think it makes sense to call this 
function with a second argument containing '@', so I wouldn't expect it 
to match anything in these cases. The function seems to specifically 
enable it:



+struct dt_node *dt_find_by_name_substr(struct dt_node *root, const char *name)
+{

[snip]

+  node = strdup(name);
+  if (!node)
+  return NULL;
+  node = strtok(node, "@");


Seems like you could get rid of this and just use name as-is.

I was curious about something else; say we have 'node@1' and 'node@2'.  
Is there an expectation of which it should match?


addr1 = dt_new_addr(root, "node", 0x1);
addr2 = dt_new_addr(root, "node", 0x2);
assert(dt_find_by_name_substr(root, "node") == ???);
   ^^^


+/* Find a child node by name and substring */
+struct dt_node *dt_find_by_name_substr(struct dt_node *root, const char *name);


I think this name fit better in previous versions of the patch, but 
since you're specifically looking for '@' now, maybe call it something 
like dt_find_by_name_before_addr?


--
Reza Arbab


[PATCH] powerpc: Remove TM XER[SO] bug workaround on POWER9 v2.3

2023-03-08 Thread Reza Arbab via B4 Relay
From: Reza Arbab 

When creating the CPU feature bits for DD2.3, I should not have carried
forward CPU_FTR_P9_TM_XER_SO_BUG. That bug is fixed in DD2.3, so remove
the flag.

Fixes: 26b78c81e84c ("powerpc: Enable the DAWR on POWER9 DD2.3 and above")
Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/cputable.h | 1 -
 arch/powerpc/kernel/dt_cpu_ftrs.c   | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index 757dbded11dc..5dc6906498ef 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -439,7 +439,6 @@ static inline void cpu_feature_keys_init(void) { }
   CPU_FTR_P9_TM_XER_SO_BUG)
 #define CPU_FTRS_POWER9_DD2_3 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \
   CPU_FTR_P9_TM_HV_ASSIST | \
-  CPU_FTR_P9_TM_XER_SO_BUG | \
   CPU_FTR_DAWR)
 #define CPU_FTRS_POWER10 (CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index c3fb9fdf5bd7..afcdbeed8b44 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -782,7 +782,6 @@ static __init void cpufeatures_cpu_quirks(void)
cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR);
} else if ((version & 0xefff) == 0x004e0203) {
cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST;
-   cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG;
cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1;
} else if ((version & 0x) == 0x004e) {
/* DD2.1 and up have DD2_1 */

---
base-commit: fe15c26ee26efa11741a7b632e9f23b01aca4cc6
change-id: 20230308-cpu_ftr_p9_tm_xer_so_bug-ec58b00a9716

Best regards,
-- 
Reza Arbab



[PATCH] powerpc: Enable the DAWR on POWER9 DD2.3 and above

2022-05-03 Thread Reza Arbab
The hardware bug in POWER9 preventing use of the DAWR was fixed in
DD2.3. Set the CPU_FTR_DAWR feature bit on these newer systems to start
using it again, and update the documentation accordingly.

The CPU features for DD2.3 are currently determined by "DD2.2 or later"
logic. In adding DD2.3 as a discrete case for the first time here, I'm
carrying the quirks of DD2.2 forward to keep all behavior outside of
this DAWR change the same. This leaves the assessment and potential
removal of those quirks on DD2.3 for later.

Signed-off-by: Reza Arbab 
---
 Documentation/powerpc/dawr-power9.rst | 26 +-
 arch/powerpc/include/asm/cputable.h   | 10 --
 arch/powerpc/kernel/cputable.c| 22 --
 arch/powerpc/kernel/dt_cpu_ftrs.c |  8 +++-
 4 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/Documentation/powerpc/dawr-power9.rst 
b/Documentation/powerpc/dawr-power9.rst
index e55ac6a24b97..310f2e0cea81 100644
--- a/Documentation/powerpc/dawr-power9.rst
+++ b/Documentation/powerpc/dawr-power9.rst
@@ -2,15 +2,23 @@
 DAWR issues on POWER9
 =====================
 
-On POWER9 the Data Address Watchpoint Register (DAWR) can cause a checkstop
-if it points to cache inhibited (CI) memory. Currently Linux has no way to
-distinguish CI memory when configuring the DAWR, so (for now) the DAWR is
-disabled by this commit::
-
-commit 9654153158d3e0684a1bdb76dbababdb7111d5a0
-Author: Michael Neuling 
-Date:   Tue Mar 27 15:37:24 2018 +1100
-powerpc: Disable DAWR in the base POWER9 CPU features
+On older POWER9 processors, the Data Address Watchpoint Register (DAWR) can
+cause a checkstop if it points to cache inhibited (CI) memory. Currently Linux
+has no way to distinguish CI memory when configuring the DAWR, so on affected
+systems, the DAWR is disabled.
+
+Affected processor revisions
+
+
+This issue is only present on processors prior to v2.3. The revision can be
+found in /proc/cpuinfo::
+
+processor   : 0
+cpu : POWER9, altivec supported
+clock   : 3800.00MHz
+revision: 2.3 (pvr 004e 1203)
+
+On a system with the issue, the DAWR is disabled as detailed below.
 
 Technical Details:
 ==================
diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index e85c849214a2..978a4450d190 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -440,6 +440,10 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \
   CPU_FTR_P9_TM_HV_ASSIST | \
   CPU_FTR_P9_TM_XER_SO_BUG)
+#define CPU_FTRS_POWER9_DD2_3 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \
+  CPU_FTR_P9_TM_HV_ASSIST | \
+  CPU_FTR_P9_TM_XER_SO_BUG | \
+  CPU_FTR_DAWR)
 #define CPU_FTRS_POWER10 (CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -469,14 +473,16 @@ static inline void cpu_feature_keys_init(void) { }
 #define CPU_FTRS_POSSIBLE  \
(CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | CPU_FTRS_POWER8 | \
 CPU_FTR_ALTIVEC_COMP | CPU_FTR_VSX_COMP | CPU_FTRS_POWER9 | \
-CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10)
+CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | \
+CPU_FTRS_POWER9_DD2_3 | CPU_FTRS_POWER10)
 #else
 #define CPU_FTRS_POSSIBLE  \
(CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
 CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
 CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | \
 CPU_FTR_VSX_COMP | CPU_FTR_ALTIVEC_COMP | CPU_FTRS_POWER9 | \
-CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10)
+CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | \
+CPU_FTRS_POWER9_DD2_3 | CPU_FTRS_POWER10)
 #endif /* CONFIG_CPU_LITTLE_ENDIAN */
 #endif
 #else
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index ae0fdef0ac11..0134249a4ca8 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -487,11 +487,29 @@ static struct cpu_spec __initdata cpu_specs[] = {
.machine_check_early= __machine_check_early_realmode_p9,
.platform   = "power9",
},
-   {   /* Power9 DD2.2 or later */
+   {   /* Power9 DD2.2 */
+   .pvr_mask   = 0xefff,
+   .pvr_value  = 0x004e0202,
+   .cpu_name   = "POWER9 (raw)",
+   .cpu_features   = CPU_FTRS_POWER9_DD2_2,
+   .cpu_user_features  = COMMON_USER_POWER9,
+

Re: [PATCH v2 2/4] powerpc/mm/radix: Free PUD table when freeing pagetable

2020-07-08 Thread Reza Arbab

On Thu, Jun 25, 2020 at 12:15:45PM +0530, Aneesh Kumar K.V wrote:

remove_pagetable() isn't freeing PUD table. This causes memory
leak during memory unplug. Fix this.


This has come up before:
https://lore.kernel.org/linuxppc-dev/20190731061920.ga18...@in.ibm.com/

tl;dr, x86 intentionally does not free, and it wasn't quite clear if 
their motivation also applies to us. Probably not, but I thought it was 
worth mentioning again.


--
Reza Arbab


Re: [PATCH 00/11] powerpc/powernv: Restore pnv_npu_try_dma_set_bypass()

2019-10-30 Thread Reza Arbab

On Wed, Oct 30, 2019 at 07:13:59PM +0100, Christoph Hellwig wrote:

On Wed, Oct 30, 2019 at 01:08:51PM -0500, Reza Arbab wrote:

On Wed, Oct 30, 2019 at 06:53:41PM +0100, Christoph Hellwig wrote:

How do you even use this code?  Nothing in the kernel even calls
dma_set_mask for NPU devices, as we only support vfio pass through.


You use it by calling dma_set_mask() for the *GPU* device. The purpose of
pnv_npu_try_dma_set_bypass() is to then propagate the same bypass
configuration to all the NPU devices associated with that GPU.


Which in-kernel driver, which PCI ID?


Aha, it's this again. Didn't catch your meaning at first. Point taken.

--
Reza Arbab


Re: [PATCH 11/11] powerpc/powernv: Add pnv_pci_ioda_dma_set_mask()

2019-10-30 Thread Reza Arbab

On Wed, Oct 30, 2019 at 06:55:18PM +0100, Christoph Hellwig wrote:

On Wed, Oct 30, 2019 at 12:00:00PM -0500, Reza Arbab wrote:

Change pnv_pci_ioda_iommu_bypass_supported() to have no side effects, by
separating the part of the function that determines if bypass is
supported from the part that actually attempts to configure it.

Move the latter to a controller-specific dma_set_mask() callback.


Nak, the dma_set_mask overrides are going away.  But as said in the
reply to the cover letter I don't even see how you could end up calling
this code.


Okay. As mentioned in the cover letter these last few patches could be 
omitted if that's the case.


--
Reza Arbab


Re: [PATCH 00/11] powerpc/powernv: Restore pnv_npu_try_dma_set_bypass()

2019-10-30 Thread Reza Arbab

On Wed, Oct 30, 2019 at 06:53:41PM +0100, Christoph Hellwig wrote:

How do you even use this code?  Nothing in the kernel even calls
dma_set_mask for NPU devices, as we only support vfio pass through.


You use it by calling dma_set_mask() for the *GPU* device. The purpose 
of pnv_npu_try_dma_set_bypass() is to then propagate the same bypass 
configuration to all the NPU devices associated with that GPU.
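
For context, a minimal hypothetical sketch of the calling side (dma_set_mask()
is the generic kernel API; the function name here is illustrative):

#include <linux/dma-mapping.h>
#include <linux/pci.h>

/* In the GPU driver's probe path: ask for 64-bit DMA on the GPU itself. */
static int gpu_setup_dma(struct pci_dev *gpdev)
{
	/*
	 * On powernv, the platform's bypass decision for the GPU is then
	 * propagated by pnv_npu_try_dma_set_bypass() to the linked NPU
	 * devices.
	 */
	return dma_set_mask(&gpdev->dev, DMA_BIT_MASK(64));
}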


--
Reza Arbab


[PATCH 03/11] powerpc/powernv/npu: Change pnv_npu_try_dma_set_bypass() argument

2019-10-30 Thread Reza Arbab
To enable simpler calling code, change this function to find the value
of bypass instead of taking it as an argument.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 12 +---
 arch/powerpc/platforms/powernv/pci.h |  2 +-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 5a8313654033..a6b8c7ad36e4 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -258,13 +258,21 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
return rc;
 }
 
-void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
+void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, u64 mask)
 {
+   struct pnv_ioda_pe *gpe = pnv_ioda_get_pe(gpdev);
int i;
struct pnv_phb *phb;
struct pci_dn *pdn;
struct pnv_ioda_pe *npe;
struct pci_dev *npdev;
+   bool bypass;
+
+   if (!gpe)
+   return;
+
+   /* We only do bypass if it's enabled on the linked device */
+   bypass = pnv_ioda_pe_iommu_bypass_supported(gpe, mask);
 
for (i = 0; ; ++i) {
npdev = pnv_pci_get_npu_dev(gpdev, i);
@@ -277,8 +285,6 @@ void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool 
bypass)
return;
 
phb = pci_bus_to_host(npdev->bus)->private_data;
-
-   /* We only do bypass if it's enabled on the linked device */
npe = &phb->ioda.pe_array[pdn->pe_number];
 
if (bypass) {
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index 41f7dec3aee5..21db0f4cfb11 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -211,7 +211,7 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, 
const char *level,
pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
 
 /* Nvlink functions */
-extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
+extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, u64 mask);
 extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
 extern struct iommu_table_group *pnv_try_setup_npu_table_group(
-- 
1.8.3.1



[PATCH 00/11] powerpc/powernv: Restore pnv_npu_try_dma_set_bypass()

2019-10-30 Thread Reza Arbab
With recent kernels, TCE tables for NPU devices are no longer being
configured. That task was performed by pnv_npu_try_dma_set_bypass(), a
function that was swept away in the recent overhaul of the DMA code.

Patches 1-4 here bring the lost function back and reintegrate it with
the updated generic iommu bypass infrastructure.

Patch 5 fixes a regression in behavior when a requested dma mask can not
be fulfilled.

Patches 6-8 are cleanup. I put these later in the set because they
aren't bisectable until after the restored code is wired back in.

Patches 9-11 refactor pnv_pci_ioda_iommu_bypass_supported(). It seems
wrong for a boolean *_supported() function to have side effects. They
reintroduce a pci controller based dma_set_mask() hook. If that's
undesirable, these last three patches can be dropped.

Reza Arbab (11):
  Revert "powerpc/powernv: Remove unused pnv_npu_try_dma_set_bypass()
function"
  powerpc/powernv: Add pnv_ioda_pe_iommu_bypass_supported()
  powerpc/powernv/npu: Change pnv_npu_try_dma_set_bypass() argument
  powerpc/powernv/npu: Wire up pnv_npu_try_dma_set_bypass()
  powerpc/powernv: Return failure for some uses of dma_set_mask()
  powerpc/powernv: Remove intermediate variable
  powerpc/powernv/npu: Simplify pnv_npu_try_dma_set_bypass() loop
  powerpc/powernv: Replace open coded pnv_ioda_get_pe()s
  Revert "powerpc/pci: remove the dma_set_mask pci_controller ops
methods"
  powerpc/powernv: Add pnv_phb3_iommu_bypass_supported()
  powerpc/powernv: Add pnv_pci_ioda_dma_set_mask()

 arch/powerpc/include/asm/pci-bridge.h |   2 +
 arch/powerpc/kernel/dma-iommu.c   |  19 --
 arch/powerpc/kernel/dma-mask.c|   9 +++
 arch/powerpc/platforms/powernv/Kconfig|   1 +
 arch/powerpc/platforms/powernv/npu-dma.c  | 106 +++---
 arch/powerpc/platforms/powernv/pci-ioda.c |  71 
 arch/powerpc/platforms/powernv/pci.h  |  10 ++-
 7 files changed, 177 insertions(+), 41 deletions(-)

-- 
1.8.3.1



[PATCH 09/11] Revert "powerpc/pci: remove the dma_set_mask pci_controller ops methods"

2019-10-30 Thread Reza Arbab
Bring back the pci controller based hook in dma_set_mask(), as it will
have a user again.

This reverts commit 662acad4067a ("powerpc/pci: remove the dma_set_mask
pci_controller ops methods"). The callback signature has been adjusted
with void return to fit its caller.

Signed-off-by: Reza Arbab 
Cc: Christoph Hellwig 
---
 arch/powerpc/include/asm/pci-bridge.h | 2 ++
 arch/powerpc/kernel/dma-mask.c| 9 +
 2 files changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index ea6ec65970ef..8512dcd053c5 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -43,6 +43,8 @@ struct pci_controller_ops {
void(*teardown_msi_irqs)(struct pci_dev *pdev);
 #endif
 
+   void(*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask);
+
void(*shutdown)(struct pci_controller *hose);
 };
 
diff --git a/arch/powerpc/kernel/dma-mask.c b/arch/powerpc/kernel/dma-mask.c
index ffbbbc432612..35b5fd1b03a6 100644
--- a/arch/powerpc/kernel/dma-mask.c
+++ b/arch/powerpc/kernel/dma-mask.c
@@ -2,11 +2,20 @@
 
 #include 
 #include 
+#include 
 #include 
 
 void arch_dma_set_mask(struct device *dev, u64 dma_mask)
 {
if (ppc_md.dma_set_mask)
ppc_md.dma_set_mask(dev, dma_mask);
+
+   if (dev_is_pci(dev)) {
+   struct pci_dev *pdev = to_pci_dev(dev);
+   struct pci_controller *phb = pci_bus_to_host(pdev->bus);
+
+   if (phb->controller_ops.dma_set_mask)
+   phb->controller_ops.dma_set_mask(pdev, dma_mask);
+   }
 }
 EXPORT_SYMBOL(arch_dma_set_mask);
-- 
1.8.3.1



[PATCH 02/11] powerpc/powernv: Add pnv_ioda_pe_iommu_bypass_supported()

2019-10-30 Thread Reza Arbab
This little calculation will be needed in other places. Move it to a
convenience function.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 8 +++-
 arch/powerpc/platforms/powernv/pci.h  | 8 
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index c28d0d9b7ee0..8849218187d7 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1838,11 +1838,9 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct 
pci_dev *pdev,
return false;
 
pe = &phb->ioda.pe_array[pdn->pe_number];
-   if (pe->tce_bypass_enabled) {
-   u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
-   if (dma_mask >= top)
-   return true;
-   }
+
+   if (pnv_ioda_pe_iommu_bypass_supported(pe, dma_mask))
+   return true;
 
/*
 * If the device can't set the TCE bypass bit but still wants
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index f914f0b14e4e..41f7dec3aee5 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -4,6 +4,7 @@
 
 #include /* for __printf */
 #include 
+#include 
 #include 
 #include 
 
@@ -247,4 +248,11 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table 
*tbl,
void *tce_mem, u64 tce_size,
u64 dma_offset, unsigned int page_shift);
 
+static inline bool pnv_ioda_pe_iommu_bypass_supported(struct pnv_ioda_pe *pe,
+ u64 mask)
+{
+   return pe->tce_bypass_enabled &&
+  mask >= pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+}
+
 #endif /* __POWERNV_PCI_H */
-- 
1.8.3.1



[PATCH 10/11] powerpc/powernv: Add pnv_phb3_iommu_bypass_supported()

2019-10-30 Thread Reza Arbab
Move this code to its own function for reuse. As a side benefit,
rearrange the comments and spread things out for readability.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 37 +--
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 6b932cfc0deb..57e6a43d9a3a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1826,6 +1826,30 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct 
pnv_ioda_pe *pe)
return -EIO;
 }
 
+/*
+ * If the device can't set the TCE bypass bit but still wants
+ * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
+ * bypass the 32-bit region and be usable for 64-bit DMAs.
+ */
+static bool pnv_phb3_iommu_bypass_supported(struct pnv_ioda_pe *pe, u64 mask)
+{
+   if (pe->phb->model != PNV_PHB_MODEL_PHB3)
+   return false;
+
+   /* pe->pdev should be set if it's a single device, pe->pbus if not */
+   if (pe->pbus && pe->device_count != 1)
+   return false;
+
+   if (!(mask >> 32))
+   return false;
+
+   /* The device needs to be able to address all of this space. */
+   if (mask <= memory_hotplug_max() + (1ULL << 32))
+   return false;
+
+   return true;
+}
+
 static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
u64 dma_mask)
 {
@@ -1837,18 +1861,7 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct 
pci_dev *pdev,
 
bypass = pnv_ioda_pe_iommu_bypass_supported(pe, dma_mask);
 
-   /*
-* If the device can't set the TCE bypass bit but still wants
-* to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
-* bypass the 32-bit region and be usable for 64-bit DMAs.
-* The device needs to be able to address all of this space.
-*/
-   if (!bypass &&
-   dma_mask >> 32 &&
-   dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
-   /* pe->pdev should be set if it's a single device, pe->pbus if not 
*/
-   (pe->device_count == 1 || !pe->pbus) &&
-   pe->phb->model == PNV_PHB_MODEL_PHB3) {
+   if (!bypass && pnv_phb3_iommu_bypass_supported(pe, dma_mask)) {
/* Configure the bypass mode */
if (pnv_pci_ioda_dma_64bit_bypass(pe))
return false;
-- 
1.8.3.1



[PATCH 07/11] powerpc/powernv/npu: Simplify pnv_npu_try_dma_set_bypass() loop

2019-10-30 Thread Reza Arbab
Write this loop more compactly to improve readability.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index a6b8c7ad36e4..a77ce7d71634 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -261,12 +261,12 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
 void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, u64 mask)
 {
struct pnv_ioda_pe *gpe = pnv_ioda_get_pe(gpdev);
-   int i;
struct pnv_phb *phb;
struct pci_dn *pdn;
struct pnv_ioda_pe *npe;
struct pci_dev *npdev;
bool bypass;
+   int i = 0;
 
if (!gpe)
return;
@@ -274,12 +274,7 @@ void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, u64 
mask)
/* We only do bypass if it's enabled on the linked device */
bypass = pnv_ioda_pe_iommu_bypass_supported(gpe, mask);
 
-   for (i = 0; ; ++i) {
-   npdev = pnv_pci_get_npu_dev(gpdev, i);
-
-   if (!npdev)
-   break;
-
+   while ((npdev = pnv_pci_get_npu_dev(gpdev, i++))) {
pdn = pci_get_pdn(npdev);
if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
return;
-- 
1.8.3.1



[PATCH 01/11] Revert "powerpc/powernv: Remove unused pnv_npu_try_dma_set_bypass() function"

2019-10-30 Thread Reza Arbab
Revert commit b4d37a7b6934 ("powerpc/powernv: Remove unused
pnv_npu_try_dma_set_bypass() function") so that this function can be
reintegrated.

Fixes: 2d6ad41b2c21 ("powerpc/powernv: use the generic iommu bypass code")
Signed-off-by: Reza Arbab 
Cc: Christoph Hellwig 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 99 
 1 file changed, 99 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index b95b9e3c4c98..5a8313654033 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -193,6 +193,105 @@ static long pnv_npu_unset_window(struct iommu_table_group 
*table_group, int num)
return 0;
 }
 
+/*
+ * Enables 32 bit DMA on NPU.
+ */
+static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
+{
+   struct pci_dev *gpdev;
+   struct pnv_ioda_pe *gpe;
+   int64_t rc;
+
+   /*
+* Find the associated PCI devices and get the dma window
+* information from there.
+*/
+   if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
+   return;
+
+   gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
+   if (!gpe)
+   return;
+
+   rc = pnv_npu_set_window(&npe->table_group, 0,
+   gpe->table_group.tables[0]);
+
+   /*
+* NVLink devices use the same TCE table configuration as
+* their parent device so drivers shouldn't be doing DMA
+* operations directly on these devices.
+*/
+   set_dma_ops(&npe->pdev->dev, &dma_dummy_ops);
+}
+
+/*
+ * Enables bypass mode on the NPU. The NPU only supports one
+ * window per link, so bypass needs to be explicitly enabled or
+ * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
+ * active at the same time.
+ */
+static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
+{
+   struct pnv_phb *phb = npe->phb;
+   int64_t rc = 0;
+   phys_addr_t top = memblock_end_of_DRAM();
+
+   if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
+   return -EINVAL;
+
+   rc = pnv_npu_unset_window(&npe->table_group, 0);
+   if (rc != OPAL_SUCCESS)
+   return rc;
+
+   /* Enable the bypass window */
+
+   top = roundup_pow_of_two(top);
+   dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
+   npe->pe_number);
+   rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+   npe->pe_number, npe->pe_number,
+   0 /* bypass base */, top);
+
+   if (rc == OPAL_SUCCESS)
+   pnv_pci_ioda2_tce_invalidate_entire(phb, false);
+
+   return rc;
+}
+
+void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
+{
+   int i;
+   struct pnv_phb *phb;
+   struct pci_dn *pdn;
+   struct pnv_ioda_pe *npe;
+   struct pci_dev *npdev;
+
+   for (i = 0; ; ++i) {
+   npdev = pnv_pci_get_npu_dev(gpdev, i);
+
+   if (!npdev)
+   break;
+
+   pdn = pci_get_pdn(npdev);
+   if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+   return;
+
+   phb = pci_bus_to_host(npdev->bus)->private_data;
+
+   /* We only do bypass if it's enabled on the linked device */
+   npe = >ioda.pe_array[pdn->pe_number];
+
+   if (bypass) {
+   dev_info(&npdev->dev,
+   "Using 64-bit DMA iommu bypass\n");
+   pnv_npu_dma_set_bypass(npe);
+   } else {
+   dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
+   pnv_npu_dma_set_32(npe);
+   }
+   }
+}
+
 /* Switch ownership from platform code to external user (e.g. VFIO) */
 static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
 {
-- 
1.8.3.1



[PATCH 08/11] powerpc/powernv: Replace open coded pnv_ioda_get_pe()s

2019-10-30 Thread Reza Arbab
Collapse several open coded instances of pnv_ioda_get_pe().

Signed-off-by: Reza Arbab 
---
 arch/powerpc/platforms/powernv/npu-dma.c  | 22 +-
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 +++---
 2 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index a77ce7d71634..0010b21d45b8 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -97,24 +97,17 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, 
int index)
 static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
  struct pci_dev **gpdev)
 {
-   struct pnv_phb *phb;
-   struct pci_controller *hose;
struct pci_dev *pdev;
struct pnv_ioda_pe *pe;
-   struct pci_dn *pdn;
 
pdev = pnv_pci_get_gpu_dev(npe->pdev);
if (!pdev)
return NULL;
 
-   pdn = pci_get_pdn(pdev);
-   if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+   pe = pnv_ioda_get_pe(pdev);
+   if (WARN_ON(!pe))
return NULL;
 
-   hose = pci_bus_to_host(pdev->bus);
-   phb = hose->private_data;
-   pe = &phb->ioda.pe_array[pdn->pe_number];
-
if (gpdev)
*gpdev = pdev;
 
@@ -261,9 +254,6 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
 void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, u64 mask)
 {
struct pnv_ioda_pe *gpe = pnv_ioda_get_pe(gpdev);
-   struct pnv_phb *phb;
-   struct pci_dn *pdn;
-   struct pnv_ioda_pe *npe;
struct pci_dev *npdev;
bool bypass;
int i = 0;
@@ -275,12 +265,10 @@ void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, 
u64 mask)
bypass = pnv_ioda_pe_iommu_bypass_supported(gpe, mask);
 
while ((npdev = pnv_pci_get_npu_dev(gpdev, i++))) {
-   pdn = pci_get_pdn(npdev);
-   if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-   return;
+   struct pnv_ioda_pe *npe = pnv_ioda_get_pe(npdev);
 
-   phb = pci_bus_to_host(npdev->bus)->private_data;
-   npe = &phb->ioda.pe_array[pdn->pe_number];
+   if (WARN_ON(!npe))
+   return;
 
if (bypass) {
dev_info(&npdev->dev,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 319152d30bc3..6b932cfc0deb 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1829,16 +1829,12 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct 
pnv_ioda_pe *pe)
 static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
u64 dma_mask)
 {
-   struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-   struct pnv_phb *phb = hose->private_data;
-   struct pci_dn *pdn = pci_get_pdn(pdev);
-   struct pnv_ioda_pe *pe;
+   struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
bool bypass;
 
-   if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+   if (WARN_ON(!pe))
return false;
 
-   pe = &phb->ioda.pe_array[pdn->pe_number];
bypass = pnv_ioda_pe_iommu_bypass_supported(pe, dma_mask);
 
/*
@@ -1852,7 +1848,7 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct 
pci_dev *pdev,
dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
/* pe->pdev should be set if it's a single device, pe->pbus if not 
*/
(pe->device_count == 1 || !pe->pbus) &&
-   phb->model == PNV_PHB_MODEL_PHB3) {
+   pe->phb->model == PNV_PHB_MODEL_PHB3) {
/* Configure the bypass mode */
if (pnv_pci_ioda_dma_64bit_bypass(pe))
return false;
-- 
1.8.3.1



[PATCH 11/11] powerpc/powernv: Add pnv_pci_ioda_dma_set_mask()

2019-10-30 Thread Reza Arbab
Change pnv_pci_ioda_iommu_bypass_supported() to have no side effects, by
separating the part of the function that determines if bypass is
supported from the part that actually attempts to configure it.

Move the latter to a controller-specific dma_set_mask() callback.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/platforms/powernv/Kconfig|  1 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 30 --
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/Kconfig 
b/arch/powerpc/platforms/powernv/Kconfig
index 938803eab0ad..6e6e27841764 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -17,6 +17,7 @@ config PPC_POWERNV
select PPC_DOORBELL
select MMU_NOTIFIER
select FORCE_SMP
+   select ARCH_HAS_DMA_SET_MASK
default y
 
 config OPAL_PRD
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 57e6a43d9a3a..5291464930ed 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1854,32 +1854,33 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct 
pci_dev *pdev,
u64 dma_mask)
 {
struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
-   bool bypass;
 
if (WARN_ON(!pe))
return false;
 
-   bypass = pnv_ioda_pe_iommu_bypass_supported(pe, dma_mask);
+   return pnv_ioda_pe_iommu_bypass_supported(pe, dma_mask) ||
+  pnv_phb3_iommu_bypass_supported(pe, dma_mask);
+}
+
+static void pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 mask)
+{
+   struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
+
+   if (!pe)
+   return;
 
-   if (!bypass && pnv_phb3_iommu_bypass_supported(pe, dma_mask)) {
+   if (!pnv_ioda_pe_iommu_bypass_supported(pe, mask) &&
+   pnv_phb3_iommu_bypass_supported(pe, mask)) {
/* Configure the bypass mode */
if (pnv_pci_ioda_dma_64bit_bypass(pe))
-   return false;
+   return;
 
/* 4GB offset bypasses 32-bit space */
pdev->dev.archdata.dma_offset = (1ULL << 32);
-
-   bypass = true;
}
 
-   /*
-* Update peer npu devices. We also do this for the special case where
-* a 64-bit dma mask can't be fulfilled and falls back to default.
-*/
-   if (bypass || !(dma_mask >> 32) || dma_mask == DMA_BIT_MASK(64))
-   pnv_npu_try_dma_set_bypass(pdev, dma_mask);
-
-   return bypass;
+   /* Update peer npu devices */
+   pnv_npu_try_dma_set_bypass(pdev, mask);
 }
 
 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
@@ -3612,6 +3613,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller 
*hose)
 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
.dma_dev_setup  = pnv_pci_dma_dev_setup,
.dma_bus_setup  = pnv_pci_dma_bus_setup,
+   .dma_set_mask   = pnv_pci_ioda_dma_set_mask,
.iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
.setup_msi_irqs = pnv_setup_msi_irqs,
.teardown_msi_irqs  = pnv_teardown_msi_irqs,
-- 
1.8.3.1



[PATCH 05/11] powerpc/powernv: Return failure for some uses of dma_set_mask()

2019-10-30 Thread Reza Arbab
Rework of pnv_pci_ioda_dma_set_mask() effectively reverted commit
253fd51e2f53 ("powerpc/powernv/pci: Return failure for some uses of
dma_set_mask()").

Reintroduce the desired behavior that an unfulfilled request for a DMA
mask between 32 and 64 bits will return error instead of silently
falling back to 32 bits.

Fixes: 2d6ad41b2c21 ("powerpc/powernv: use the generic iommu bypass code")
Signed-off-by: Reza Arbab 
Cc: Christoph Hellwig 
---
 arch/powerpc/kernel/dma-iommu.c   | 19 +++
 arch/powerpc/platforms/powernv/pci-ioda.c |  8 ++--
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index e486d1d78de2..e1693760654b 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -122,10 +122,21 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
 {
struct iommu_table *tbl = get_iommu_table_base(dev);
 
-   if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
-   dev->archdata.iommu_bypass = true;
-   dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
-   return 1;
+   if (dev_is_pci(dev)) {
+   if (dma_iommu_bypass_supported(dev, mask)) {
+   dev->archdata.iommu_bypass = true;
+   dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
+   return 1;
+   }
+
+   if (mask >> 32) {
+   dev_err(dev, "Warning: IOMMU dma bypass not supported: 
mask 0x%016llx\n",
+   mask);
+
+   /* A 64-bit request falls back to default ops */
+   if (mask != DMA_BIT_MASK(64))
+   return 0;
+   }
}
 
if (!tbl) {
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 70e834635971..b78b5e81f941 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1863,8 +1863,12 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct 
pci_dev *pdev,
bypass = true;
}
 
-   /* Update peer npu devices */
-   pnv_npu_try_dma_set_bypass(pdev, dma_mask);
+   /*
+* Update peer npu devices. We also do this for the special case where
+* a 64-bit dma mask can't be fulfilled and falls back to default.
+*/
+   if (bypass || !(dma_mask >> 32) || dma_mask == DMA_BIT_MASK(64))
+   pnv_npu_try_dma_set_bypass(pdev, dma_mask);
 
return bypass;
 }
-- 
1.8.3.1



[PATCH 04/11] powerpc/powernv/npu: Wire up pnv_npu_try_dma_set_bypass()

2019-10-30 Thread Reza Arbab
Rework of pnv_pci_ioda_iommu_bypass_supported() dropped a call to
pnv_npu_try_dma_set_bypass(). Reintroduce this call, so that the DMA
bypass configuration of a GPU device is propagated to its corresponding
NPU devices.

Fixes: 2d6ad41b2c21 ("powerpc/powernv: use the generic iommu bypass code")
Signed-off-by: Reza Arbab 
Cc: Christoph Hellwig 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8849218187d7..70e834635971 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1833,14 +1833,13 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct 
pci_dev *pdev,
struct pnv_phb *phb = hose->private_data;
struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *pe;
+   bool bypass;
 
if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
return false;
 
pe = &phb->ioda.pe_array[pdn->pe_number];
-
-   if (pnv_ioda_pe_iommu_bypass_supported(pe, dma_mask))
-   return true;
+   bypass = pnv_ioda_pe_iommu_bypass_supported(pe, dma_mask);
 
/*
 * If the device can't set the TCE bypass bit but still wants
@@ -1848,7 +1847,8 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct 
pci_dev *pdev,
 * bypass the 32-bit region and be usable for 64-bit DMAs.
 * The device needs to be able to address all of this space.
 */
-   if (dma_mask >> 32 &&
+   if (!bypass &&
+   dma_mask >> 32 &&
dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
/* pe->pdev should be set if it's a single device, pe->pbus if not 
*/
(pe->device_count == 1 || !pe->pbus) &&
@@ -1859,10 +1859,14 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct 
pci_dev *pdev,
return false;
/* 4GB offset bypasses 32-bit space */
pdev->dev.archdata.dma_offset = (1ULL << 32);
-   return true;
+
+   bypass = true;
}
 
-   return false;
+   /* Update peer npu devices */
+   pnv_npu_try_dma_set_bypass(pdev, dma_mask);
+
+   return bypass;
 }
 
 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
-- 
1.8.3.1



[PATCH 06/11] powerpc/powernv: Remove intermediate variable

2019-10-30 Thread Reza Arbab
Trim the pointless temporary variable.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index b78b5e81f941..319152d30bc3 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1854,9 +1854,9 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct 
pci_dev *pdev,
(pe->device_count == 1 || !pe->pbus) &&
phb->model == PNV_PHB_MODEL_PHB3) {
/* Configure the bypass mode */
-   s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
-   if (rc)
+   if (pnv_pci_ioda_dma_64bit_bypass(pe))
return false;
+
/* 4GB offset bypasses 32-bit space */
pdev->dev.archdata.dma_offset = (1ULL << 32);
 
-- 
1.8.3.1



Re: [PATCH v3 2/2] powerpc/mm/radix: Free PUD table when freeing pagetable

2019-07-30 Thread Reza Arbab

On Fri, Jul 26, 2019 at 10:34:40AM +0530, Bharata B Rao wrote:

remove_pagetable() isn't freeing PUD table. This causes memory
leak during memory unplug. Fix this.


On x86, this is intentional. See

af2cf278ef4f ("x86/mm/hotplug: Don't remove PGD entries in remove_pagetable()")
98fe3633c5a4 ("x86/mm/hotplug: Fix BUG_ON() after hot-remove by not freeing 
PUD")

Does their reasoning apply to powerpc as well?

--
Reza Arbab


Re: [v2 09/12] powerpc/mce: Enable MCE notifiers in external modules

2019-07-08 Thread Reza Arbab

On Fri, Jul 05, 2019 at 03:29:39PM +1000, Nicholas Piggin wrote:

Okay. It might be possible to save the address in the kernel and
then notify the driver afterward. For user-mode and any non-atomic
user copy AFAIK the irq_work should practically run synchronously
after the machine check returns so it might be enough to have a
notifier in the irq work processing.


We can pick up this thread later, but if I remember correctly the 
sticking point we ran into was that we never got that far. Instead of 
returning from the MCE, we went down the fatal codepath.


--
Reza Arbab


Re: [v3 7/7] powerpc/64s: save r13 in MCE handler (simulator workaroud)

2019-07-08 Thread Reza Arbab

On Sat, Jul 06, 2019 at 07:56:39PM +1000, Nicholas Piggin wrote:

Santosh Sivaraj's on July 6, 2019 7:26 am:

From: Reza Arbab 

Testing my memcpy_mcsafe() work in progress with an injected UE, I get
an error like this immediately after the function returns:

BUG: Unable to handle kernel data access at 0x7fff84dec8f8
Faulting instruction address: 0xc008009c00b0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in: mce(O+) vmx_crypto crc32c_vpmsum
CPU: 0 PID: 1375 Comm: modprobe Tainted: G   O  5.1.0-rc6 #267
NIP:  c008009c00b0 LR: c008009c00a8 CTR: c0095f90
REGS: c000ee197790 TRAP: 0300   Tainted: G   O   (5.1.0-rc6)
MSR:  9280b033   CR: 88002826  XER: 
0004
CFAR: c0095f8c DAR: 7fff84dec8f8 DSISR: 4000 IRQMASK: 0
GPR00: 6c6c6568 c000ee197a20 c008009c8400 fff2
GPR04: c008009c02e0 0006  c3c834c8
GPR08: 0080 776a6681b7fb5100  c008009c01c8
GPR12: c0095f90 7fff84debc00 4d071440 
GPR16: 00010601 c008009e c0c98dd8 c0c98d98
GPR20: c3bba970 c008009c04d0 c008009c0618 c01e5820
GPR24:  0100 0001 c3bba958
GPR28: c008009c02e8 c008009c0318 c008009c02e0 
NIP [c008009c00b0] cause_ue+0xa8/0xe8 [mce]
LR [c008009c00a8] cause_ue+0xa0/0xe8 [mce]

After debugging we see that the first instruction at vector 200 is skipped by
the simulator, due to which r13 is not saved. Adding a nop at 0x200 fixes the
issue.

(This commit is needed for testing this series. This should not be taken
into the tree)


Would be good if this was testable in simulator upstream, did you
report it? What does cause_ue do? exc_mce in mambo seems to do the
right thing AFAIKS.


I think I posted this earlier, but cause_ue() is just a test function 
telling me where to set up the error injection:


static noinline void cause_ue(void)
{
static const char src[] = "hello";
char dst[10];
int rc;

/* During the pause, break into mambo and run the following */
pr_info("inject_mce_ue_on_addr 0x%px\n", src);
pause(10);

rc = memcpy_mcsafe(dst, src, sizeof(src));
pr_info("memcpy_mcsafe() returns %d\n", rc);
if (!rc)
pr_info("dst=\"%s\"\n", dst);
}

Can't speak for the others, but I haven't chased this upstream. I didn't 
know it was a simulator issue.


--
Reza Arbab



Re: [v2 09/12] powerpc/mce: Enable MCE notifiers in external modules

2019-07-04 Thread Reza Arbab

On Thu, Jul 04, 2019 at 12:36:18PM +1000, Nicholas Piggin wrote:

Reza Arbab's on July 4, 2019 3:20 am:

Since the notifier chain is actually part of the decision between (1)
and (2), it's a hard limitation then that callbacks be in real address
space. Is there any way to structure things so that's not the case?


If we tested for KVM guest first, and went through and marked (maybe
in a paca flag) everywhere else that put the MMU into a bad / non-host
state, and had the notifiers use the machine check stack, then it
would be possible to enable MMU here.

Hmm, testing for IR|DR after testing for KVM guest might actually be
enough without requiring changes outside the machine check handler...
Actually no that may not quite work because the handler could take a
SLB miss and it might have been triggered inside the SLB miss handler.

All in all I'm pretty against turning on MMU in the MCE handler
anywhere.


Hey, fair enough. Just making sure there really isn't any room to make 
things work the way I was trying.



Luckily this patch isn't really necessary for memcpy_mcsafe(), but we
have a couple of other potential users of the notifier from external
modules (so their callbacks would require virtual mode).


What users are there? Do they do any significant amount of logic that
can not be moved to vmlinux?


One I had in mind was the NVIDIA driver. When taking a UE from defective 
GPU memory, it could use the notifier to save the bad address to a 
blacklist in their nvram. Not so much recovering the machine check, just 
logging before the system reboots.


The other user is a prototype driver for the IBM Research project we had 
a talk about offline a while back.


We can make this patchset work for memcpy_mcsafe(), but I think it's 
back to the drawing board for the others.


--
Reza Arbab



Re: [v2 09/12] powerpc/mce: Enable MCE notifiers in external modules

2019-07-03 Thread Reza Arbab

On Tue, Jul 02, 2019 at 04:17:11PM +1000, Nicholas Piggin wrote:

Santosh Sivaraj's on July 2, 2019 3:19 pm:

--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -458,6 +458,12 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
bl  machine_check_early
std r3,RESULT(r1)   /* Save result */

+   /* Notifiers may be in a module, so enable virtual addressing. */
+   mfmsr   r11
+   ori r11,r11,MSR_IR
+   ori r11,r11,MSR_DR
+   mtmsr   r11


Can't do this, we could take a machine check somewhere the MMU is
not sane (in fact the guest early mce handling that was added recently
should not be enabling virtual mode either, which needs to be fixed).


Rats. So in machine_check_handle_early() there are two options; either 

1. The mc is unhandled/unrecoverable. Stay in real mode, proceed to 
unrecover_mce(), the fatal path of no return (panic, reboot, etc).


2. The mc is handled/recovered. Return from MCE where any further action 
can be done by processing the machine check event workqueue. Am I  
understanding you correctly that this is the absolute earliest we can 
get back to virtual mode?


Since the notifier chain is actually part of the decision between (1) 
and (2), it's a hard limitation then that callbacks be in real address 
space. Is there any way to structure things so that's not the case?


Luckily this patch isn't really necessary for memcpy_mcsafe(), but we 
have a couple of other potential users of the notifier from external 
modules (so their callbacks would require virtual mode).


--
Reza Arbab



Re: [v2 03/12] powerpc/mce: Add MCE notification chain

2019-07-02 Thread Reza Arbab

On Tue, Jul 02, 2019 at 10:49:23AM +0530, Santosh Sivaraj wrote:

+static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);


Mahesh suggested using an atomic notifier chain instead of blocking, 
since we are in an interrupt.

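For reference, a minimal sketch of what that change could look like (not
tested; it assumes the list and the register/unregister helpers keep the
names used in this series):

	#include <linux/notifier.h>

	/* atomic chain: callable from interrupt context, but registered
	 * callbacks must not sleep
	 */
	static ATOMIC_NOTIFIER_HEAD(mce_notifier_list);

	int mce_register_notifier(struct notifier_block *nb)
	{
		return atomic_notifier_chain_register(&mce_notifier_list, nb);
	}

	int mce_unregister_notifier(struct notifier_block *nb)
	{
		return atomic_notifier_chain_unregister(&mce_notifier_list, nb);
	}

with the blocking_notifier_call_chain() call site switched to
atomic_notifier_call_chain(), same arguments.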

--
Reza Arbab



Re: [PATCH 11/13] powerpc/64s: Save r13 in machine_check_common_early

2019-06-22 Thread Reza Arbab

On Sat, Jun 22, 2019 at 09:21:55AM +1000, Nicholas Piggin wrote:

Yep, from the stack trace, r13 is corrupted. So r13 must have got
corrupted before the machine check and this just happens to have
corrected it.

How does cause_ue work? It or memcpy_mcsafe must be corrupting
r13.


Well, cause_ue() is just my little testcase using inject_mce_ue_on_addr 
from skiboot/external/mambo/mambo_utils.tcl:


static noinline void cause_ue(void)
{
static const char src[] = "hello";
char dst[10];
int rc;

/* During the pause, break into mambo and run the following */
pr_info("inject_mce_ue_on_addr 0x%px\n", src);
pause(10);

rc = memcpy_mcsafe(dst, src, sizeof(src));
pr_info("memcpy_mcsafe() returns %d\n", rc);
if (!rc)
pr_info("dst=\"%s\"\n", dst);
}

I can't see how memcpy_mcsafe() would be causing it. I tried changing it 
to save/restore r13 where it already does r14-r22, but that didn't make 
a difference. Any ideas?


--
Reza Arbab


Re: [PATCH 05/13] powerpc/mce: Allow notifier callback to handle MCE

2019-06-22 Thread Reza Arbab

Hi Mahesh,

On Fri, Jun 21, 2019 at 12:35:08PM +0530, Mahesh Jagannath Salgaonkar wrote:

On 6/21/19 6:27 AM, Santosh Sivaraj wrote:

-   blocking_notifier_call_chain(&mce_notifier_list, 0, &evt);
+   rc = blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
+   if (rc & NOTIFY_STOP_MASK) {
+   evt->disposition = MCE_DISPOSITION_RECOVERED;
+   regs->msr |= MSR_RI;


What is the reason for setting MSR_RI ? I don't think this is a good
idea. MSR_RI = 0 means system got MCE interrupt when SRR0 and SRR1
contents were live and was overwritten by MCE interrupt. Hence this
interrupt is unrecoverable irrespective of whether machine check handler
recovers from it or not.


Good catch! I think this is an artifact from when I was first trying to 
get all this working.


Instead of setting MSR_RI, we should probably just check for it. Ie,

if ((rc & NOTIFY_STOP_MASK) && (regs->msr & MSR_RI)) {
evt->disposition = MCE_DISPOSITION_RECOVERED;

--
Reza Arbab



Re: [PATCH] powerpc/mm: Add _PAGE_SAO to _PAGE_CACHE_CTL mask

2019-01-29 Thread Reza Arbab

On Tue, Jan 29, 2019 at 08:37:28PM +0530, Aneesh Kumar K.V wrote:

Not sure what the fix is about. We set the related hash pte flags via

if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT)
rflags |= HPTE_R_I;
else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)
rflags |= (HPTE_R_I | HPTE_R_G);
else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M);


Again, nothing broken here, just a code readability thing. As Alexey 
(and Charlie) noted, given the above it is a little confusing to define 
_PAGE_CACHE_CTL this way:


 #define _PAGE_CACHE_CTL  (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT)

I like Alexey's idea, maybe just use a literal?

 #define _PAGE_CACHE_CTL 0x30

--
Reza Arbab



[PATCH] powerpc/mm: Add _PAGE_SAO to _PAGE_CACHE_CTL mask

2019-01-28 Thread Reza Arbab
In htab_convert_pte_flags(), _PAGE_CACHE_CTL is used to check for the
_PAGE_SAO flag:

  else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
  rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M);

But, it isn't defined to include that flag:

  #define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT)

This happens to work, but only because of the flag values:

  #define _PAGE_SAO   0x00010 /* Strong access order */
  #define _PAGE_NON_IDEMPOTENT0x00020 /* non idempotent memory */
  #define _PAGE_TOLERANT  0x00030 /* tolerant memory, cache inhibited */

To prevent any issues if these particulars ever change, add _PAGE_SAO to
the mask.

Suggested-by: Charles Johns 
Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 2e6ada2..1d97a28 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -811,7 +811,7 @@ static inline void __set_pte_at(struct mm_struct *mm, 
unsigned long addr,
return hash__set_pte_at(mm, addr, ptep, pte, percpu);
 }
 
-#define _PAGE_CACHE_CTL(_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT)
+#define _PAGE_CACHE_CTL(_PAGE_SAO | _PAGE_NON_IDEMPOTENT | 
_PAGE_TOLERANT)
 
 #define pgprot_noncached pgprot_noncached
 static inline pgprot_t pgprot_noncached(pgprot_t prot)
-- 
1.8.3.1



Re: [PATCH] Correct PowerPC VDSO call frame info

2018-09-13 Thread Reza Arbab

On Fri, Sep 14, 2018 at 08:57:04AM +0930, Alan Modra wrote:

There is control flow in __kernel_clock_gettime that reaches label 99
without saving lr in r12.  CFI info however is interpreted by the
unwinder without reference to control flow: It's a simple matter of
"Execute all the CFI opcodes up to the current address".  That means
the unwinder thinks r12 contains the return address at label 99.
Disabuse it of that notion by resetting CFI for the return address at
label 99.


Thanks for this! It looks like v2 will just be a commit log change, so 
feel free to carry over my


Tested-by: Reza Arbab 

--
Reza Arbab



Re: [PATCH v2 3/3] powerpc/mce: Handle memcpy_mcsafe

2018-08-13 Thread Reza Arbab
& NOTIFY_STOP_MASK) {
+   entry = search_exception_tables(regs->nip);
+   if (entry != NULL) {
+   mce->u.ue_error.error_return = 1;
+   regs->nip = extable_fixup(entry);
+   } else
+   machine_check_ue_event(mce);
+   } else
+   machine_check_ue_event(mce);
}
}
return;


With the above, this logic would be simplified. So,

rc = blocking_notifier_call_chain(&mce_notifier_list,
(unsigned long)mce, NULL);
if (rc & NOTIFY_STOP_MASK) {
entry = search_exception_tables(regs->nip);
if (entry != NULL) {
mce->u.ue_error.error_return = 1;
regs->nip = extable_fixup(entry);
}
}

if (!mce->u.ue_error.error_return)
machine_check_ue_event(mce);


@@ -208,7 +278,6 @@ void release_mce_event(void)
get_mce_event(NULL, true);
}

-
/*
 * Queue up the MCE event which then can be handled later.
 */
@@ -239,6 +308,10 @@ void machine_check_queue_event(void)
if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;

+   if (evt.error_type == MCE_ERROR_TYPE_UE &&
+   evt.u.ue_error.error_return == 1)
+   return;
+
index = __this_cpu_inc_return(mce_queue_count) - 1;
/* If queue is full, just return for now. */
    if (index >= MAX_MC_EVT) {
--
2.13.6



--
Reza Arbab



[PATCH] powerpc/powernv: Add support for NPU2 relaxed-ordering mode

2018-08-07 Thread Reza Arbab
From: Alistair Popple 

Some device drivers support out of order access to GPU memory. This does
not affect the CPU view of memory but it does affect the GPU view, so it
should only be enabled once the GPU driver has requested it. Add APIs
allowing a driver to do so.

Signed-off-by: Alistair Popple 
[ar...@linux.ibm.com: Rebase, add commit log]
Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/opal-api.h|  4 ++-
 arch/powerpc/include/asm/opal.h|  3 ++
 arch/powerpc/include/asm/powernv.h | 12 
 arch/powerpc/platforms/powernv/npu-dma.c   | 39 ++
 arch/powerpc/platforms/powernv/opal-wrappers.S |  2 ++
 5 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 3bab299..be6fe23e 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -208,7 +208,9 @@
 #define OPAL_SENSOR_READ_U64   162
 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR   164
 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR   165
-#define OPAL_LAST  165
+#define OPAL_NPU_SET_RELAXED_ORDER 168
+#define OPAL_NPU_GET_RELAXED_ORDER 169
+#define OPAL_LAST  169
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index e1b2910..48bea30 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -43,6 +43,9 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t 
bdfn,
uint64_t PE_handle);
 int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap,
uint64_t rate_phys, uint32_t size);
+int64_t opal_npu_set_relaxed_order(uint64_t phb_id, uint16_t bdfn,
+  bool request_enabled);
+int64_t opal_npu_get_relaxed_order(uint64_t phb_id, uint16_t bdfn);
 int64_t opal_console_write(int64_t term_number, __be64 *length,
   const uint8_t *buffer);
 int64_t opal_console_read(int64_t term_number, __be64 *length,
diff --git a/arch/powerpc/include/asm/powernv.h 
b/arch/powerpc/include/asm/powernv.h
index 2f3ff7a..874ec6d 100644
--- a/arch/powerpc/include/asm/powernv.h
+++ b/arch/powerpc/include/asm/powernv.h
@@ -22,6 +22,8 @@ extern void pnv_npu2_destroy_context(struct npu_context 
*context,
 extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
unsigned long *flags, unsigned long *status,
int count);
+int pnv_npu2_request_relaxed_ordering(struct pci_dev *pdev, bool enable);
+int pnv_npu2_get_relaxed_ordering(struct pci_dev *pdev);
 
 void pnv_tm_init(void);
 #else
@@ -39,6 +41,16 @@ static inline int pnv_npu2_handle_fault(struct npu_context 
*context,
return -ENODEV;
 }
 
+static int pnv_npu2_request_relaxed_ordering(struct pci_dev *pdev, bool enable)
+{
+   return -ENODEV;
+}
+
+static int pnv_npu2_get_relaxed_ordering(struct pci_dev *pdev)
+{
+   return -ENODEV;
+}
+
 static inline void pnv_tm_init(void) { }
 static inline void pnv_power9_force_smt4(void) { }
 #endif
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 8cdf91f..038dc1e 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "powernv.h"
 #include "pci.h"
@@ -988,3 +989,41 @@ int pnv_npu2_init(struct pnv_phb *phb)
 
return 0;
 }
+
+/*
+ * Request relaxed ordering be enabled or disabled for the given PCI device.
+ * This function may or may not actually enable relaxed ordering depending on
+ * the exact system configuration. Use pnv_npu2_get_relaxed_ordering() below to
+ * determine the current state of relaxed ordering.
+ */
+int pnv_npu2_request_relaxed_ordering(struct pci_dev *pdev, bool enable)
+{
+   struct pci_controller *hose;
+   struct pnv_phb *phb;
+   int rc;
+
+   hose = pci_bus_to_host(pdev->bus);
+   phb = hose->private_data;
+
+   rc = opal_npu_set_relaxed_order(phb->opal_id,
+   PCI_DEVID(pdev->bus->number, pdev->devfn),
+   enable);
+   if (rc != OPAL_SUCCESS && rc != OPAL_CONSTRAINED)
+   return -EPERM;
+
+   return 0;
+}
+EXPORT_SYMBOL(pnv_npu2_request_relaxed_ordering);
+
+int pnv_npu2_get_relaxed_ordering(struct pci_dev *pdev)
+{
+   struct pci_controller *hose;
+   struct pnv_phb *phb;
+
+   hose = pci_bus_to_host(pdev->bus);
+   phb = hose->private_data;
+
+   return opal_npu_get_relaxed_order(phb->opal_id,
+ 

[PATCH] powerpc/powernv: Fix concurrency issue with npu->mmio_atsd_usage

2018-08-02 Thread Reza Arbab
We've encountered a performance issue when multiple processors stress
{get,put}_mmio_atsd_reg(). These functions contend for mmio_atsd_usage,
an unsigned long used as a bitmask.

The accesses to mmio_atsd_usage are done using test_and_set_bit_lock()
and clear_bit_unlock(). As implemented, both of these will require a
(successful) stwcx to that same cache line.

What we end up with is thread A, attempting to unlock, being slowed by
other threads repeatedly attempting to lock. A's stwcx instructions fail
and retry because the memory reservation is lost every time a different
thread beats it to the punch.

There may be a long-term way to fix this at a larger scale, but for now
resolve the immediate problem by gating our call to
test_and_set_bit_lock() with one to test_bit(), which is obviously
implemented without using a store.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 8cdf91f..c773465 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -437,8 +437,9 @@ static int get_mmio_atsd_reg(struct npu *npu)
int i;
 
for (i = 0; i < npu->mmio_atsd_count; i++) {
-   if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
-   return i;
+   if (!test_bit(i, &npu->mmio_atsd_usage))
+   if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
+   return i;
}
 
return -ENOSPC;
-- 
1.8.3.1



Re: [PATCH 1/2] powerpc/mm: Flush cache on memory hot(un)plug

2018-04-09 Thread Reza Arbab

On Fri, Apr 06, 2018 at 03:24:23PM +1000, Balbir Singh wrote:

This patch adds support for flushing potentially dirty
cache lines when memory is hot-plugged/hot-un-plugged.


Acked-by: Reza Arbab <ar...@linux.ibm.com>

--
Reza Arbab



Re: [PATCH] powerpc/powernv: Increase memory block size to 1GB on radix

2017-09-07 Thread Reza Arbab

On Thu, Sep 07, 2017 at 05:17:41AM +, Anton Blanchard wrote:

But all of memory on PowerNV should be able to be hot unplugged, so
there are two options as I see it - either increase the memory block
size, or map everything with 2MB pages.


I may be misunderstanding this, but what if we did something like x86 
does? When trying to unplug a region smaller than the mapping, they fill 
that part of the pagetable with 0xFD instead of freeing the whole thing.  
Once the whole thing is 0xFD, free it.


See arch/x86/mm/init_64.c:remove_{pte,pmd,pud}_table()

---%<---
memset((void *)addr, PAGE_INUSE, next - addr);

page_addr = page_address(pte_page(*pte));
if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
...
pte_clear(&init_mm, addr, pte);
...
}
---%<---

--
Reza Arbab



Re: [FIX PATCH v0] powerpc: Fix memory unplug failure on radix guest

2017-08-11 Thread Reza Arbab

On Fri, Aug 11, 2017 at 02:07:51PM +0530, Aneesh Kumar K.V wrote:

Reza Arbab <ar...@linux.vnet.ibm.com> writes:


On Thu, Aug 10, 2017 at 02:53:48PM +0530, Bharata B Rao wrote:

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f830562..24ecf53 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -524,6 +524,7 @@ static int __init early_init_dt_scan_drconf_memory(unsigned 
long node)
size = 0x8000ul - base;
}
memblock_add(base, size);
+   memblock_mark_hotplug(base, size);
} while (--rngs);
}
memblock_dump_all();


Doing this has the effect of putting all the affected memory into
ZONE_MOVABLE. See find_zone_movable_pfns_for_nodes(). This means no
kernel allocations can occur there. Is that okay?



So the thinking here is any memory identified via ibm,dynamic-memory can
be hot removed later. Hence the need to add them lmb size, because our
hotplug framework remove them in lmb size. If we want to support
hotunplug, then we will have to make sure kernel allocation doesn't
happen in that region right ?


Yes, the net result is that this memory can now be hotremoved. I just 
wanted to point out that the patch doesn't only change the granularity 
of addition--it also causes the memory to end up in a different zone 
(when using movable_node).



With the above i would consider not marking it hotplug was a bug before
?


Sure, that's reasonable.

--
Reza Arbab



Re: [FIX PATCH v0] powerpc: Fix memory unplug failure on radix guest

2017-08-10 Thread Reza Arbab

On Thu, Aug 10, 2017 at 11:50:19AM -0500, Reza Arbab wrote:

On Thu, Aug 10, 2017 at 02:53:48PM +0530, Bharata B Rao wrote:

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f830562..24ecf53 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -524,6 +524,7 @@ static int __init early_init_dt_scan_drconf_memory(unsigned 
long node)
size = 0x8000ul - base;
}
memblock_add(base, size);
+   memblock_mark_hotplug(base, size);
} while (--rngs);
}
memblock_dump_all();


Doing this has the effect of putting all the affected memory into 
ZONE_MOVABLE. See find_zone_movable_pfns_for_nodes(). This means no 
kernel allocations can occur there. Is that okay?


I should clarify. The behavior change I mention applies when 
movable_node_is_enabled().


--
Reza Arbab



Re: [FIX PATCH v0] powerpc: Fix memory unplug failure on radix guest

2017-08-10 Thread Reza Arbab

On Thu, Aug 10, 2017 at 02:53:48PM +0530, Bharata B Rao wrote:

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f830562..24ecf53 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -524,6 +524,7 @@ static int __init early_init_dt_scan_drconf_memory(unsigned 
long node)
size = 0x8000ul - base;
}
memblock_add(base, size);
+   memblock_mark_hotplug(base, size);
} while (--rngs);
}
memblock_dump_all();


Doing this has the effect of putting all the affected memory into 
ZONE_MOVABLE. See find_zone_movable_pfns_for_nodes(). This means no 
kernel allocations can occur there. Is that okay?



diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 671a45d..180d25a 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -12,6 +12,7 @@
#include 
#include 
#include 
+#include 

#include 
#include 
@@ -255,15 +256,25 @@ static void __init radix_init_pgtable(void)
{
unsigned long rts_field;
struct memblock_region *reg;
+   phys_addr_t addr;
+   u64 lmb_size = memory_block_size_bytes();

/* We don't support slb for radix */
mmu_slb_size = 0;
/*
 * Create the linear mapping, using standard page size for now
 */
-   for_each_memblock(memory, reg)
-   WARN_ON(create_physical_mapping(reg->base,
-   reg->base + reg->size));
+   for_each_memblock(memory, reg) {
+   if (memblock_is_hotpluggable(reg)) {
+   for (addr = reg->base; addr < (reg->base + reg->size);
+   addr += lmb_size)
+   WARN_ON(create_physical_mapping(addr,
+   addr + lmb_size));
+   } else {
+   WARN_ON(create_physical_mapping(reg->base,
+   reg->base + reg->size));
+   }
+   }

/* Find out how many PID bits are supported */
if (cpu_has_feature(CPU_FTR_HVMODE)) {
--
2.7.4



--
Reza Arbab



Re: [Patch 2/2]: powerpc/hotplug/mm: Fix hot-add memory node assoc

2017-06-01 Thread Reza Arbab

On Thu, Jun 01, 2017 at 07:36:31PM +1000, Michael Ellerman wrote:
I don't think that's what the patch does. It just marks 32 (!?) nodes 
as online. Or if you're talking about reverting 3af229f2071f that 
leaves you with 256 possible nodes. Both of which are wasteful.


To be clear, with Balbir's set the latter is no longer wasteful.

The right fix is to make sure any nodes which are present at boot 
remain in the possible map, even if they don't have memory/CPUs 
assigned at boot.


I'm still hoping 3af229f2071f could indeed be reverted some day, but 
until then the following would follow your suggestion for our GPU nodes.  
What do you think?


--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -895,6 +895,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, 
u64 end_pfn)
void __init initmem_init(void)
{
int nid, cpu;
+   struct device_node *dn;

max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
max_pfn = max_low_pfn;
@@ -911,6 +912,18 @@ void __init initmem_init(void)
 */
nodes_and(node_possible_map, node_possible_map, node_online_map);

+   /*
+* Consider an ibm,coherent-device-memory node possible. Even though
+* it is not online at boot, it may be hotplugged later.
+*/
+   for_each_compatible_node(dn, NULL, "ibm,coherent-device-memory") {
+   nid = of_node_to_nid_single(dn);
+   if (nid < 0)
+   continue;
+
+   node_set(nid, node_possible_map);
+   }
+
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn;


--
Reza Arbab



Re: [Patch 2/2]: powerpc/hotplug/mm: Fix hot-add memory node assoc

2017-05-26 Thread Reza Arbab

On Fri, May 26, 2017 at 01:46:58PM +1000, Michael Ellerman wrote:

Reza Arbab <ar...@linux.vnet.ibm.com> writes:


On Thu, May 25, 2017 at 04:19:53PM +1000, Michael Ellerman wrote:

The commit message for 3af229f2071f says:

   In practice, we never see a system with 256 NUMA nodes, and in fact, we
   do not support node hotplug on power in the first place, so the nodes
   ^^^
   that are online when we come up are the nodes that will be present for
   the lifetime of this kernel.

Is that no longer true?


I don't know what the reasoning behind that statement was at the time,
but as far as I can tell, the only thing missing for node hotplug now is
Balbir's patchset [1]. He fixes the resource issue which motivated
3af229f2071f and reverts it.

With that set, I can instantiate a new numa node just by doing
add_memory(nid, ...) where nid doesn't currently exist.


But does that actually happen on any real system?


I don't know if anything currently tries to do this. My interest in 
having this working is so that in the future, our coherent gpu memory 
could be added as a distinct node by the device driver.


--
Reza Arbab



Re: [Patch 2/2]: powerpc/hotplug/mm: Fix hot-add memory node assoc

2017-05-25 Thread Reza Arbab

On Thu, May 25, 2017 at 04:19:53PM +1000, Michael Ellerman wrote:

The commit message for 3af229f2071f says:

   In practice, we never see a system with 256 NUMA nodes, and in fact, we
   do not support node hotplug on power in the first place, so the nodes
   ^^^
   that are online when we come up are the nodes that will be present for
   the lifetime of this kernel.

Is that no longer true?


I don't know what the reasoning behind that statement was at the time, 
but as far as I can tell, the only thing missing for node hotplug now is 
Balbir's patchset [1]. He fixes the resource issue which motivated 
3af229f2071f and reverts it.


With that set, I can instantiate a new numa node just by doing 
add_memory(nid, ...) where nid doesn't currently exist.

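For illustration only, a sketch of that kind of call with made-up values
(nid/base are hypothetical; the range must be memory-block aligned, and
this uses the add_memory() signature from this timeframe):

	#include <linux/memory.h>		/* memory_block_size_bytes() */
	#include <linux/memory_hotplug.h>	/* add_memory() */

	/* hot-add one memory block into a node with no boot-time memory */
	static int example_hot_add(int nid, u64 base)
	{
		return add_memory(nid, base, memory_block_size_bytes());
	}

	/* e.g. example_hot_add(250, 0x20000000000ULL); */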

[1] 
https://lkml.kernel.org/r/1479253501-26261-1-git-send-email-bsinghar...@gmail.com

--
Reza Arbab



Re: [Patch 2/2]: powerpc/hotplug/mm: Fix hot-add memory node assoc

2017-05-24 Thread Reza Arbab

On Tue, May 23, 2017 at 05:44:23PM -0500, Michael Bringmann wrote:

On 05/23/2017 04:49 PM, Reza Arbab wrote:

On Tue, May 23, 2017 at 03:05:08PM -0500, Michael Bringmann wrote:

On 05/23/2017 10:52 AM, Reza Arbab wrote:

On Tue, May 23, 2017 at 10:15:44AM -0500, Michael Bringmann wrote:

+static void setup_nodes(void)
+{
+int i, l = 32 /* MAX_NUMNODES */;
+
+for (i = 0; i < l; i++) {
+if (!node_possible(i)) {
+setup_node_data(i, 0, 0);
+node_set(i, node_possible_map);
+}
+}
+}


This seems to be a workaround for 3af229f2071f ("powerpc/numa: Reset 
node_possible_map to only node_online_map").


They may be related, but that commit is not a replacement.  The above patch 
ensures that
there are enough of the nodes initialized at startup to allow for memory 
hot-add into a
node that was not used at boot.  (See 'setup_node_data' function in 'numa.c'.)  
That and
recording that the node was initialized.


Is it really necessary to preinitialize these empty nodes using 
setup_node_data()? When you do memory hotadd into a node that was not used at 
boot, the node data already gets set up by

add_memory
 add_memory_resource
   hotadd_new_pgdat
 arch_alloc_nodedata <-- allocs the pg_data_t
 ...
 free_area_init_node <-- sets NODE_DATA(nid)->node_id, etc.

Removing setup_node_data() from that loop leaves only the call to node_set(). 
If 3af229f2071f (which reduces node_possible_map) was reverted, you wouldn't 
need to do that either.


With or without 3af229f2071f, we would still need to add something, somewhere 
to add new
bits to the 'node_possible_map'.  That is not being done.


Without 3af229f2071f, those bits would already BE set in 
node_possible_map. You wouldn't have to do anything.


--
Reza Arbab



Re: [Patch 2/2]: powerpc/hotplug/mm: Fix hot-add memory node assoc

2017-05-23 Thread Reza Arbab

On Tue, May 23, 2017 at 03:05:08PM -0500, Michael Bringmann wrote:

On 05/23/2017 10:52 AM, Reza Arbab wrote:

On Tue, May 23, 2017 at 10:15:44AM -0500, Michael Bringmann wrote:

+static void setup_nodes(void)
+{
+int i, l = 32 /* MAX_NUMNODES */;
+
+for (i = 0; i < l; i++) {
+if (!node_possible(i)) {
+setup_node_data(i, 0, 0);
+node_set(i, node_possible_map);
+}
+}
+}


This seems to be a workaround for 3af229f2071f ("powerpc/numa: Reset 
node_possible_map to only node_online_map").


They may be related, but that commit is not a replacement.  The above patch 
ensures that
there are enough of the nodes initialized at startup to allow for memory 
hot-add into a
node that was not used at boot.  (See 'setup_node_data' function in 'numa.c'.)  
That and
recording that the node was initialized.


Is it really necessary to preinitialize these empty nodes using 
setup_node_data()? When you do memory hotadd into a node that was not 
used at boot, the node data already gets set up by


add_memory
 add_memory_resource
   hotadd_new_pgdat
 arch_alloc_nodedata <-- allocs the pg_data_t
 ...
 free_area_init_node <-- sets NODE_DATA(nid)->node_id, etc.

Removing setup_node_data() from that loop leaves only the call to 
node_set(). If 3af229f2071f (which reduces node_possible_map) was 
reverted, you wouldn't need to do that either.

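(For what it's worth, roughly all that would remain of that loop is the
sketch below -- and with the revert, even this becomes unnecessary:)

	for (i = 0; i < MAX_NUMNODES; i++)
		if (!node_possible(i))
			node_set(i, node_possible_map);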


I didn't see where any part of commit 3af229f2071f would touch the 
'node_possible_map'
which is needed by 'numa.c' and 'workqueue.c'.  The nodemask created and 
updated by
'mem_cgroup_may_update_nodemask()' does not appear to be the same mask.


Are you sure you're looking at 3af229f2071f? It only adds one line of 
code; the reduction of node_possible_map.


--
Reza Arbab



Re: [Patch 2/2]: powerpc/hotplug/mm: Fix hot-add memory node assoc

2017-05-23 Thread Reza Arbab

On Tue, May 23, 2017 at 10:15:44AM -0500, Michael Bringmann wrote:

+static void setup_nodes(void)
+{
+   int i, l = 32 /* MAX_NUMNODES */;
+
+   for (i = 0; i < l; i++) {
+   if (!node_possible(i)) {
+   setup_node_data(i, 0, 0);
+   node_set(i, node_possible_map);
+   }
+   }
+}


This seems to be a workaround for 3af229f2071f ("powerpc/numa: Reset 
node_possible_map to only node_online_map").


Balbir, you have a patchset which reverts it. Do you think that will be 
getting merged?


http://lkml.kernel.org/r/1479253501-26261-1-git-send-email-bsinghar...@gmail.com
(see patch 3/3)

--
Reza Arbab



Re: ioctl structs differ from x86_64?

2017-03-15 Thread Reza Arbab

On Wed, Mar 15, 2017 at 01:11:19PM -0500, Reza Arbab wrote:

https://groups.google.com/forum/#!topic/golang-nuts/K5NoG8slez0


Oops.

https://groups.google.com/d/msg/golang-nuts/K5NoG8slez0/mixUse17iaMJ

--
Reza Arbab



Re: ioctl structs differ from x86_64?

2017-03-15 Thread Reza Arbab

On Tue, Mar 14, 2017 at 10:37:44AM +, Harshal Patil wrote:
Our guess is the ioctls on ppc64le differ from x86_64, and that's why 
the code which is clearing the ONLCR bit 
([4]https://github.com/opencontainers/runc/commit/eea28f480db435dbef4a275de9776b9934818b8c#diff-5f5c07d0cab3ce2086437d3d43c0d25fR164)

is failing on ppc64le but works fine on x86_64. 
 
Any pointers on the possible solution would be really helpful. 


This looks like a bug in Go.

The syscall.TCGETS and syscall.TCSETS constants have the wrong values. 
They were generated using the glibc termios struct instead of the kernel 
termios struct. It's the issue described here:


https://groups.google.com/forum/#!topic/golang-nuts/K5NoG8slez0

Things work if you replace syscall.TCGETS with 0x402c7413 and 
syscall.TCSETS with 0x802c7414, the correct values on ppc64le.

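If it helps, here is a quick standalone C check for a ppc64le box. It
deliberately uses the kernel uapi termbits.h rather than glibc's
termios.h, since generating the constants against the glibc struct is
what produced the wrong values:

	#include <stdio.h>
	#include <asm/termbits.h>	/* kernel struct termios */
	#include <asm/ioctls.h>		/* TCGETS/TCSETS sized from it */

	int main(void)
	{
		/* prints 0x402c7413 and 0x802c7414 on ppc64le */
		printf("TCGETS = %#x\n", (unsigned int)TCGETS);
		printf("TCSETS = %#x\n", (unsigned int)TCSETS);
		return 0;
	}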

--
Reza Arbab



Re: [PATCH v5 1/4] powerpc/mm: refactor radix physical page mapping

2017-01-30 Thread Reza Arbab

On Mon, Jan 30, 2017 at 07:38:18PM +1100, Michael Ellerman wrote:

Doesn't build.

In file included from ../include/linux/kernel.h:13:0,
from ../include/linux/sched.h:17,
from ../arch/powerpc/mm/pgtable-radix.c:11:
../arch/powerpc/mm/pgtable-radix.c: In function ‘create_physical_mapping’:
../include/linux/printk.h:299:2: error: ‘mapping_size’ may be used 
uninitialized in this function [-Werror=maybe-uninitialized]
 printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
 ^~
../arch/powerpc/mm/pgtable-radix.c:123:22: note: ‘mapping_size’ was declared 
here
 unsigned long addr, mapping_size;


Doh. Could you please do the following for now?

-   unsigned long addr, mapping_size;
+   unsigned long addr, mapping_size = 0;

I'd like to delay spinning v6 with this until I see any input you might 
have on the rest of the set.


And for future reference, how are you ending up with 
-Werror=maybe-uninitialized? On powerpc/next, with pseries_le_defconfig, 
I get -Wno-maybe-uninitialized.


--
Reza Arbab



[PATCH] powerpc/mm: use the correct pointer when setting a 2M pte

2017-01-25 Thread Reza Arbab
When setting a 2M pte, radix__map_kernel_page() is using the address

ptep = (pte_t *)pudp;

Fix this conversion to use pmdp instead. Use pmdp_ptep() to do this
instead of casting the pointer.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/pgtable-radix.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index cfa53cc..34f1a0d 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -65,7 +65,7 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa,
if (!pmdp)
return -ENOMEM;
if (map_page_size == PMD_SIZE) {
-   ptep = (pte_t *)pudp;
+   ptep = pmdp_ptep(pmdp);
goto set_the_pte;
}
ptep = pte_alloc_kernel(pmdp, ea);
@@ -90,7 +90,7 @@ int radix__map_kernel_page(unsigned long ea, unsigned long pa,
}
pmdp = pmd_offset(pudp, ea);
if (map_page_size == PMD_SIZE) {
-   ptep = (pte_t *)pudp;
+   ptep = pmdp_ptep(pmdp);
goto set_the_pte;
}
if (!pmd_present(*pmdp)) {
-- 
1.8.3.1



Re: [PATCH v5 4/4] powerpc/mm: unstub radix__vmemmap_remove_mapping()

2017-01-17 Thread Reza Arbab

On Tue, Jan 17, 2017 at 12:55:13PM +0530, Balbir Singh wrote:

On Mon, Jan 16, 2017 at 01:07:46PM -0600, Reza Arbab wrote:

Use remove_pagetable() and friends for radix vmemmap removal.

We do not require the special-case handling of vmemmap done in the x86
versions of these functions. This is because vmemmap_free() has already
freed the mapped pages, and calls us with an aligned address range.

So, add a few failsafe WARNs, but otherwise the code to remove physical
mappings is already sufficient for vmemmap.


I wonder if we really need them?


Not sure what the guideline is for a "this shouldn't happen" WARN. It 
could save us some grief, should our vmemmap code ever start calling 
with an unaligned range, like it does on x86.


--
Reza Arbab



Re: [PATCH v5 3/4] powerpc/mm: add radix__remove_section_mapping()

2017-01-17 Thread Reza Arbab

On Tue, Jan 17, 2017 at 12:52:51PM +0530, Balbir Singh wrote:

Shouldn't most of these functions have __meminit?


I don't think so. The mapping functions are __meminit, but the unmapping 
functions are completely within #ifdef CONFIG_MEMORY_HOTPLUG already.



On Mon, Jan 16, 2017 at 01:07:45PM -0600, Reza Arbab wrote:

 #ifdef CONFIG_MEMORY_HOTPLUG
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+   pte_t *pte;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (!pte_none(*pte))
+   return;


If !pte_none() we fail the hotplug? Or silently
leave the allocated pte's around. I guess this is
the same as x86


The latter--it's not a failure. If you provided remove_pagetable() an 
unaligned address range, there could be a pte left unremoved at either 
end.



+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pte_t *pte_base;
+   pmd_t *pmd;
+
+   pmd = pmd_start + pmd_index(addr);
+   for (; addr < end; addr = next, pmd++) {
+   next = pmd_addr_end(addr, end);
+
+   if (!pmd_present(*pmd))
+   continue;
+
+   if (pmd_huge(*pmd)) {
+   pte_clear(&init_mm, addr, (pte_t *)pmd);


pmd_clear()?


I used pte_clear() to mirror what happens in radix__map_kernel_page():

if (map_page_size == PMD_SIZE) {
ptep = (pte_t *)pmdp;
goto set_the_pte;
}

[...]

set_the_pte:
set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, 
flags));

Would pmd_clear() be equivalent, since the pointer got set like a pte?


+static void remove_pagetable(unsigned long start, unsigned long end)
+{
+   unsigned long addr, next;
+   pud_t *pud_base;
+   pgd_t *pgd;
+
+   spin_lock(&init_mm.page_table_lock);
+


x86 does more granular lock acquisition only during
clearing the relevant entries. I suppose we don't have
to worry about it since its not fast path and frequent.


Yep. Ben thought the locking in remove_pte_table() was actually too 
granular, and Aneesh questioned what was being protected in the first 
place. So I left one lock/unlock in the outermost function for now.


--
Reza Arbab



Re: [PATCH v5 1/4] powerpc/mm: refactor radix physical page mapping

2017-01-17 Thread Reza Arbab

Thanks for your review!

On Tue, Jan 17, 2017 at 12:16:35PM +0530, Balbir Singh wrote:

On Mon, Jan 16, 2017 at 01:07:43PM -0600, Reza Arbab wrote:

--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -107,54 +107,66 @@ int radix__map_kernel_page(unsigned long ea, unsigned 
long pa,
return 0;
 }

+static inline void __meminit print_mapping(unsigned long start,
+  unsigned long end,
+  unsigned long size)
+{
+   if (end <= start)
+   return;


Should we pr_err for start > end?


I think that would be overkill. The way this little inline is called, 
start > end is not possible. The real point is not to print anything if 
start == end. Using <= just seemed better in context.



+static int __meminit create_physical_mapping(unsigned long start,
+unsigned long end)
+{
+   unsigned long addr, mapping_size;
+
+   start = _ALIGN_UP(start, PAGE_SIZE);
+   for (addr = start; addr < end; addr += mapping_size) {
+   unsigned long gap, previous_size;
+   int rc;
+
+   gap = end - addr;
+   previous_size = mapping_size;
+
+   if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
+   mmu_psize_defs[MMU_PAGE_1G].shift)
+   mapping_size = PUD_SIZE;
+   else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
+mmu_psize_defs[MMU_PAGE_2M].shift)
+   mapping_size = PMD_SIZE;
+   else
+   mapping_size = PAGE_SIZE;
+
+   if (mapping_size != previous_size) {
+   print_mapping(start, addr, previous_size);
+   start = addr;
+   }
+
+   rc = radix__map_kernel_page((unsigned long)__va(addr), addr,
+   PAGE_KERNEL_X, mapping_size);
+   if (rc)
+   return rc;


Should we try a lower page size if map_kernel_page fails for this mapping_size?


The only way map_kernel_page can fail is -ENOMEM. If that's the case,
there's no way we're going to be able to map this range at all. Better 
to fail fast here, I would think.


--
Reza Arbab



Re: Normal service will resume shortly ...

2017-01-16 Thread Reza Arbab

On Mon, Jan 16, 2017 at 08:33:09PM +1100, Michael Ellerman wrote:

If you've sent a fix that should go into 4.10 that hasn't been merged
yet, please feel free to reply to this message giving me a pointer to
it.


Welcome back!

I think this should go into 4.10:
https://lkml.kernel.org/r/1483475991-16999-1-git-send-email-ar...@linux.vnet.ibm.com

(aka http://patchwork.ozlabs.org/patch/710629/)

--
Reza Arbab



[PATCH v5 3/4] powerpc/mm: add radix__remove_section_mapping()

2017-01-16 Thread Reza Arbab
Tear down and free the four-level page tables of physical mappings
during memory hotremove.

Borrow the basic structure of remove_pagetable() and friends from the
identically-named x86 functions. Reduce the frequency of tlb flushes and
page_table_lock spinlocks by only doing them in the outermost function.
There was some question as to whether the locking is needed at all.
Leave it for now, but we could consider dropping it.

Memory must be offline to be removed, thus not in use. So there
shouldn't be the sort of concurrent page walking activity here that
might prompt us to use RCU.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h |   1 +
 arch/powerpc/mm/pgtable-book3s64.c |   2 +-
 arch/powerpc/mm/pgtable-radix.c| 133 +
 3 files changed, 135 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 43c2571..0032b66 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -294,6 +294,7 @@ static inline unsigned long radix__get_tree_size(void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int radix__create_section_mapping(unsigned long start, unsigned long end);
+int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 2b13f6b..b798ff6 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -139,7 +139,7 @@ int create_section_mapping(unsigned long start, unsigned 
long end)
 int remove_section_mapping(unsigned long start, unsigned long end)
 {
if (radix_enabled())
-   return -ENODEV;
+   return radix__remove_section_mapping(start, end);
 
return hash__remove_section_mapping(start, end);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 075b4ec..cfba666 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -476,10 +476,143 @@ void radix__setup_initial_memory_limit(phys_addr_t 
first_memblock_base,
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+   pte_t *pte;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (!pte_none(*pte))
+   return;
+   }
+
+   pte_free_kernel(&init_mm, pte_start);
+   pmd_clear(pmd);
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+   pmd_t *pmd;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (!pmd_none(*pmd))
+   return;
+   }
+
+   pmd_free(&init_mm, pmd_start);
+   pud_clear(pud);
+}
+
+static void remove_pte_table(pte_t *pte_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pte_t *pte;
+
+   pte = pte_start + pte_index(addr);
+   for (; addr < end; addr = next, pte++) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   if (next > end)
+   next = end;
+
+   if (!pte_present(*pte))
+   continue;
+
+   pte_clear(&init_mm, addr, pte);
+   }
+}
+
+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pte_t *pte_base;
+   pmd_t *pmd;
+
+   pmd = pmd_start + pmd_index(addr);
+   for (; addr < end; addr = next, pmd++) {
+   next = pmd_addr_end(addr, end);
+
+   if (!pmd_present(*pmd))
+   continue;
+
+   if (pmd_huge(*pmd)) {
+   pte_clear(&init_mm, addr, (pte_t *)pmd);
+   continue;
+   }
+
+   pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+   remove_pte_table(pte_base, addr, next);
+   free_pte_table(pte_base, pmd);
+   }
+}
+
+static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pmd_t *pmd_base;
+   pud_t *pud;
+
+   pud = pud_start + pud_index(addr);
+   for (; addr < end; addr = next, pud++) {
+   next = pud_addr_end(addr, end);
+
+   if (!pud_present(*pud))
+   continue;
+
+   if (pud_huge(*pud)) {
+   pte_clear(&init_mm, addr, (pte_t *)pud);
+   continue;
+   }
+
+   pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+   remove_pmd_table(pmd_base, addr, next);
+   free_pmd_table(pmd_base, pud);
+   }
+}
+
+

[PATCH v5 4/4] powerpc/mm: unstub radix__vmemmap_remove_mapping()

2017-01-16 Thread Reza Arbab
Use remove_pagetable() and friends for radix vmemmap removal.

We do not require the special-case handling of vmemmap done in the x86
versions of these functions. This is because vmemmap_free() has already
freed the mapped pages, and calls us with an aligned address range.

So, add a few failsafe WARNs, but otherwise the code to remove physical
mappings is already sufficient for vmemmap.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/pgtable-radix.c | 29 -
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index cfba666..6d06171 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -521,6 +521,15 @@ static void remove_pte_table(pte_t *pte_start, unsigned 
long addr,
if (!pte_present(*pte))
continue;
 
+   if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
+   /*
+* The vmemmap_free() and remove_section_mapping()
+* codepaths call us with aligned addresses.
+*/
+   WARN_ONCE(1, "%s: unaligned range\n", __func__);
+   continue;
+   }
+
pte_clear(&init_mm, addr, pte);
}
 }
@@ -540,6 +549,12 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned 
long addr,
continue;
 
if (pmd_huge(*pmd)) {
+   if (!IS_ALIGNED(addr, PMD_SIZE) ||
+   !IS_ALIGNED(next, PMD_SIZE)) {
+   WARN_ONCE(1, "%s: unaligned range\n", __func__);
+   continue;
+   }
+
pte_clear(&init_mm, addr, (pte_t *)pmd);
continue;
}
@@ -565,6 +580,12 @@ static void remove_pud_table(pud_t *pud_start, unsigned 
long addr,
continue;
 
if (pud_huge(*pud)) {
+   if (!IS_ALIGNED(addr, PUD_SIZE) ||
+   !IS_ALIGNED(next, PUD_SIZE)) {
+   WARN_ONCE(1, "%s: unaligned range\n", __func__);
+   continue;
+   }
+
pte_clear(&init_mm, addr, (pte_t *)pud);
continue;
}
@@ -591,6 +612,12 @@ static void remove_pagetable(unsigned long start, unsigned 
long end)
continue;
 
if (pgd_huge(*pgd)) {
+   if (!IS_ALIGNED(addr, PGDIR_SIZE) ||
+   !IS_ALIGNED(next, PGDIR_SIZE)) {
+   WARN_ONCE(1, "%s: unaligned range\n", __func__);
+   continue;
+   }
+
pte_clear(&init_mm, addr, (pte_t *)pgd);
continue;
}
@@ -630,7 +657,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long 
start,
 #ifdef CONFIG_MEMORY_HOTPLUG
 void radix__vmemmap_remove_mapping(unsigned long start, unsigned long 
page_size)
 {
-   /* FIXME!! intel does more. We should free page tables mapping vmemmap 
? */
+   remove_pagetable(start, start + page_size);
 }
 #endif
 #endif
-- 
1.8.3.1



[PATCH v5 2/4] powerpc/mm: add radix__create_section_mapping()

2017-01-16 Thread Reza Arbab
Wire up memory hotplug page mapping for radix. Share the mapping
function already used by radix_init_pgtable().

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h | 4 
 arch/powerpc/mm/pgtable-book3s64.c | 2 +-
 arch/powerpc/mm/pgtable-radix.c| 7 +++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index b4d1302..43c2571 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -291,5 +291,9 @@ static inline unsigned long radix__get_tree_size(void)
}
return rts_field;
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int radix__create_section_mapping(unsigned long start, unsigned long end);
+#endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 653ff6c..2b13f6b 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -131,7 +131,7 @@ void mmu_cleanup_all(void)
 int create_section_mapping(unsigned long start, unsigned long end)
 {
if (radix_enabled())
-   return -ENODEV;
+   return radix__create_section_mapping(start, end);
 
return hash__create_section_mapping(start, end);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 2ce1354..075b4ec 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -475,6 +475,13 @@ void radix__setup_initial_memory_limit(phys_addr_t 
first_memblock_base,
memblock_set_current_limit(first_memblock_base + first_memblock_size);
 }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+int __ref radix__create_section_mapping(unsigned long start, unsigned long end)
+{
+   return create_physical_mapping(start, end);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 int __meminit radix__vmemmap_create_mapping(unsigned long start,
  unsigned long page_size,
-- 
1.8.3.1



[PATCH v5 1/4] powerpc/mm: refactor radix physical page mapping

2017-01-16 Thread Reza Arbab
Move the page mapping code in radix_init_pgtable() into a separate
function that will also be used for memory hotplug.

The current goto loop progressively decreases its mapping size as it
covers the tail of a range whose end is unaligned. Change this to a for
loop which can do the same for both ends of the range.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/pgtable-radix.c | 88 +++--
 1 file changed, 50 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 623a0dc..2ce1354 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -107,54 +107,66 @@ int radix__map_kernel_page(unsigned long ea, unsigned 
long pa,
return 0;
 }
 
+static inline void __meminit print_mapping(unsigned long start,
+  unsigned long end,
+  unsigned long size)
+{
+   if (end <= start)
+   return;
+
+   pr_info("Mapped range 0x%lx - 0x%lx with 0x%lx\n", start, end, size);
+}
+
+static int __meminit create_physical_mapping(unsigned long start,
+unsigned long end)
+{
+   unsigned long addr, mapping_size;
+
+   start = _ALIGN_UP(start, PAGE_SIZE);
+   for (addr = start; addr < end; addr += mapping_size) {
+   unsigned long gap, previous_size;
+   int rc;
+
+   gap = end - addr;
+   previous_size = mapping_size;
+
+   if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
+   mmu_psize_defs[MMU_PAGE_1G].shift)
+   mapping_size = PUD_SIZE;
+   else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
+mmu_psize_defs[MMU_PAGE_2M].shift)
+   mapping_size = PMD_SIZE;
+   else
+   mapping_size = PAGE_SIZE;
+
+   if (mapping_size != previous_size) {
+   print_mapping(start, addr, previous_size);
+   start = addr;
+   }
+
+   rc = radix__map_kernel_page((unsigned long)__va(addr), addr,
+   PAGE_KERNEL_X, mapping_size);
+   if (rc)
+   return rc;
+   }
+
+   print_mapping(start, addr, mapping_size);
+   return 0;
+}
+
 static void __init radix_init_pgtable(void)
 {
-   int loop_count;
-   u64 base, end, start_addr;
unsigned long rts_field;
struct memblock_region *reg;
-   unsigned long linear_page_size;
 
/* We don't support slb for radix */
mmu_slb_size = 0;
/*
 * Create the linear mapping, using standard page size for now
 */
-   loop_count = 0;
-   for_each_memblock(memory, reg) {
-
-   start_addr = reg->base;
-
-redo:
-   if (loop_count < 1 && mmu_psize_defs[MMU_PAGE_1G].shift)
-   linear_page_size = PUD_SIZE;
-   else if (loop_count < 2 && mmu_psize_defs[MMU_PAGE_2M].shift)
-   linear_page_size = PMD_SIZE;
-   else
-   linear_page_size = PAGE_SIZE;
-
-   base = _ALIGN_UP(start_addr, linear_page_size);
-   end = _ALIGN_DOWN(reg->base + reg->size, linear_page_size);
-
-   pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n",
-   (unsigned long)base, (unsigned long)end,
-   linear_page_size);
-
-   while (base < end) {
-   radix__map_kernel_page((unsigned long)__va(base),
- base, PAGE_KERNEL_X,
- linear_page_size);
-   base += linear_page_size;
-   }
-   /*
-* map the rest using lower page size
-*/
-   if (end < reg->base + reg->size) {
-   start_addr = end;
-   loop_count++;
-   goto redo;
-   }
-   }
+   for_each_memblock(memory, reg)
+   WARN_ON(create_physical_mapping(reg->base,
+   reg->base + reg->size));
/*
 * Allocate Partition table and process table for the
 * host.
-- 
1.8.3.1



[PATCH v5 0/4] powerpc/mm: enable memory hotplug on radix

2017-01-16 Thread Reza Arbab
Do the plumbing needed for memory hotplug on systems using radix pagetables,
borrowing from existing vmemmap and x86 code.

This passes basic verification of plugging and removing memory, but this 
stuff is tricky and I'd appreciate extra scrutiny of the series for 
correctness--in particular, the adaptation of remove_pagetable() from x86.

Former patch 1 is now a separate fix. This set still depends on it:
https://lkml.kernel.org/r/1483475991-16999-1-git-send-email-ar...@linux.vnet.ibm.com

/* changelog */

v5:
* Retain pr_info() of the size used to map each address range.

* flush_tlb_kernel_range() -> radix__flush_tlb_kernel_range()

v4:
* 
https://lkml.kernel.org/r/1483476218-17271-1-git-send-email-ar...@linux.vnet.ibm.com

* Sent patch 1 as a standalone fix.

* Extract a common function that can be used by both radix_init_pgtable() and
  radix__create_section_mapping().

* Reduce tlb flushing to one flush_tlb_kernel_range() per section, and do
  less granular locking of init_mm->page_table_lock.

v3:
* 
https://lkml.kernel.org/r/1481831443-22761-1-git-send-email-ar...@linux.vnet.ibm.com

* Port remove_pagetable() et al. from x86 for unmapping.

* [RFC] -> [PATCH]

v2:
* 
https://lkml.kernel.org/r/1471449083-15931-1-git-send-email-ar...@linux.vnet.ibm.com

* Do not simply fall through to vmemmap_{create,remove}_mapping(). As Aneesh
  and Michael pointed out, they are tied to CONFIG_SPARSEMEM_VMEMMAP and only
  did what I needed by luck anyway.

v1:
* 
https://lkml.kernel.org/r/1466699962-22412-1-git-send-email-ar...@linux.vnet.ibm.com

Reza Arbab (4):
  powerpc/mm: refactor radix physical page mapping
  powerpc/mm: add radix__create_section_mapping()
  powerpc/mm: add radix__remove_section_mapping()
  powerpc/mm: unstub radix__vmemmap_remove_mapping()

 arch/powerpc/include/asm/book3s/64/radix.h |   5 +
 arch/powerpc/mm/pgtable-book3s64.c |   4 +-
 arch/powerpc/mm/pgtable-radix.c| 257 -
 3 files changed, 225 insertions(+), 41 deletions(-)

-- 
1.8.3.1



Re: [PATCH v4 3/4] powerpc/mm: add radix__remove_section_mapping()

2017-01-04 Thread Reza Arbab

On Wed, Jan 04, 2017 at 10:37:58AM +0530, Aneesh Kumar K.V wrote:

Reza Arbab <ar...@linux.vnet.ibm.com> writes:

+static void remove_pagetable(unsigned long start, unsigned long end)
+{
+   unsigned long addr, next;
+   pud_t *pud_base;
+   pgd_t *pgd;
+
+   spin_lock(&init_mm.page_table_lock);
+
+   for (addr = start; addr < end; addr = next) {
+   next = pgd_addr_end(addr, end);
+
+   pgd = pgd_offset_k(addr);
+   if (!pgd_present(*pgd))
+   continue;
+
+   if (pgd_huge(*pgd)) {
+   pte_clear(&init_mm, addr, (pte_t *)pgd);
+   continue;
+   }
+
+   pud_base = (pud_t *)pgd_page_vaddr(*pgd);
+   remove_pud_table(pud_base, addr, next);
+   free_pud_table(pud_base, pgd);
+   }
+
+   spin_unlock(&init_mm.page_table_lock);


What is this lock protecting ?


The more I look into it, I'm not sure. This is still an artifact from 
the x86 functions, where they lock/unlock aggressively, as you and Ben 
noted. I can take it out. 


+   flush_tlb_kernel_range(start, end);


We can use radix__flush_tlb_kernel_range avoiding an if
(radix_enabled()) conditional ?


Yes, good idea.


(radix_enabled()) conditional ? Also if needed we could make all the
above take a radix__ prefix ?


You mean rename all these new functions? We could, but I don't really 
see why. These functions are static to pgtable-radix.c, there aren't 
hash__ versions to differentiate from, and it seemed helpful to mirror 
the x86 names.


--
Reza Arbab



Re: [PATCH v4 1/4] powerpc/mm: refactor radix physical page mapping

2017-01-04 Thread Reza Arbab

On Wed, Jan 04, 2017 at 10:34:25AM +0530, Aneesh Kumar K.V wrote:

We lost the below in the change.

pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n",
(unsigned long)base, (unsigned long)end,
linear_page_size);


Is there a way to dump the range and the size with which we mapped that
range ?


Sure. It's a little more difficult than before, because the mapping size 
is now reselected in each iteration of the loop, but a similar print can 
be done.


--
Reza Arbab



[PATCH v4 4/4] powerpc/mm: unstub radix__vmemmap_remove_mapping()

2017-01-03 Thread Reza Arbab
Use remove_pagetable() and friends for radix vmemmap removal.

We do not require the special-case handling of vmemmap done in the x86
versions of these functions. This is because vmemmap_free() has already
freed the mapped pages, and calls us with an aligned address range.

So, add a few failsafe WARNs, but otherwise the code to remove physical
mappings is already sufficient for vmemmap.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/pgtable-radix.c | 29 -
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index f7a8e625..bada0d9 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -517,6 +517,15 @@ static void remove_pte_table(pte_t *pte_start, unsigned 
long addr,
if (!pte_present(*pte))
continue;
 
+   if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
+   /*
+* The vmemmap_free() and remove_section_mapping()
+* codepaths call us with aligned addresses.
+*/
+   WARN_ONCE(1, "%s: unaligned range\n", __func__);
+   continue;
+   }
+
pte_clear(&init_mm, addr, pte);
}
 }
@@ -536,6 +545,12 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned 
long addr,
continue;
 
if (pmd_huge(*pmd)) {
+   if (!IS_ALIGNED(addr, PMD_SIZE) ||
+   !IS_ALIGNED(next, PMD_SIZE)) {
+   WARN_ONCE(1, "%s: unaligned range\n", __func__);
+   continue;
+   }
+
pte_clear(&init_mm, addr, (pte_t *)pmd);
continue;
}
@@ -561,6 +576,12 @@ static void remove_pud_table(pud_t *pud_start, unsigned 
long addr,
continue;
 
if (pud_huge(*pud)) {
+   if (!IS_ALIGNED(addr, PUD_SIZE) ||
+   !IS_ALIGNED(next, PUD_SIZE)) {
+   WARN_ONCE(1, "%s: unaligned range\n", __func__);
+   continue;
+   }
+
pte_clear(&init_mm, addr, (pte_t *)pud);
continue;
}
@@ -587,6 +608,12 @@ static void remove_pagetable(unsigned long start, unsigned 
long end)
continue;
 
if (pgd_huge(*pgd)) {
+   if (!IS_ALIGNED(addr, PGDIR_SIZE) ||
+   !IS_ALIGNED(next, PGDIR_SIZE)) {
+   WARN_ONCE(1, "%s: unaligned range\n", __func__);
+   continue;
+   }
+
pte_clear(&init_mm, addr, (pte_t *)pgd);
continue;
}
@@ -627,7 +654,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long 
start,
 #ifdef CONFIG_MEMORY_HOTPLUG
 void radix__vmemmap_remove_mapping(unsigned long start, unsigned long 
page_size)
 {
-   /* FIXME!! intel does more. We should free page tables mapping vmemmap 
? */
+   remove_pagetable(start, start + page_size);
 }
 #endif
 #endif
-- 
1.8.3.1



[PATCH v4 1/4] powerpc/mm: refactor radix physical page mapping

2017-01-03 Thread Reza Arbab
Move the page mapping code in radix_init_pgtable() into a separate
function that will also be used for memory hotplug.

The current goto loop progressively decreases its mapping size as it
covers the tail of a range whose end is unaligned. Change this to a for
loop which can do the same for both ends of the range.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/pgtable-radix.c | 69 ++---
 1 file changed, 31 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 623a0dc..5cee6d1 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -107,54 +107,47 @@ int radix__map_kernel_page(unsigned long ea, unsigned 
long pa,
return 0;
 }
 
+static int __meminit create_physical_mapping(unsigned long start,
+unsigned long end)
+{
+   unsigned long mapping_size;
+
+   start = _ALIGN_UP(start, PAGE_SIZE);
+   for (; start < end; start += mapping_size) {
+   unsigned long gap = end - start;
+   int rc;
+
+   if (IS_ALIGNED(start, PUD_SIZE) && gap >= PUD_SIZE &&
+   mmu_psize_defs[MMU_PAGE_1G].shift)
+   mapping_size = PUD_SIZE;
+   else if (IS_ALIGNED(start, PMD_SIZE) && gap >= PMD_SIZE &&
+mmu_psize_defs[MMU_PAGE_2M].shift)
+   mapping_size = PMD_SIZE;
+   else
+   mapping_size = PAGE_SIZE;
+
+   rc = radix__map_kernel_page((unsigned long)__va(start), start,
+   PAGE_KERNEL_X, mapping_size);
+   if (rc)
+   return rc;
+   }
+
+   return 0;
+}
+
 static void __init radix_init_pgtable(void)
 {
-   int loop_count;
-   u64 base, end, start_addr;
unsigned long rts_field;
struct memblock_region *reg;
-   unsigned long linear_page_size;
 
/* We don't support slb for radix */
mmu_slb_size = 0;
/*
 * Create the linear mapping, using standard page size for now
 */
-   loop_count = 0;
-   for_each_memblock(memory, reg) {
-
-   start_addr = reg->base;
-
-redo:
-   if (loop_count < 1 && mmu_psize_defs[MMU_PAGE_1G].shift)
-   linear_page_size = PUD_SIZE;
-   else if (loop_count < 2 && mmu_psize_defs[MMU_PAGE_2M].shift)
-   linear_page_size = PMD_SIZE;
-   else
-   linear_page_size = PAGE_SIZE;
-
-   base = _ALIGN_UP(start_addr, linear_page_size);
-   end = _ALIGN_DOWN(reg->base + reg->size, linear_page_size);
-
-   pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n",
-   (unsigned long)base, (unsigned long)end,
-   linear_page_size);
-
-   while (base < end) {
-   radix__map_kernel_page((unsigned long)__va(base),
- base, PAGE_KERNEL_X,
- linear_page_size);
-   base += linear_page_size;
-   }
-   /*
-* map the rest using lower page size
-*/
-   if (end < reg->base + reg->size) {
-   start_addr = end;
-   loop_count++;
-   goto redo;
-   }
-   }
+   for_each_memblock(memory, reg)
+   WARN_ON(create_physical_mapping(reg->base,
+   reg->base + reg->size));
/*
 * Allocate Partition table and process table for the
 * host.
-- 
1.8.3.1



[PATCH v4 0/4] powerpc/mm: enable memory hotplug on radix

2017-01-03 Thread Reza Arbab
Do the plumbing needed for memory hotplug on systems using radix pagetables,
borrowing from existing vmemmap and x86 code.

This passes basic verification of plugging and removing memory, but this 
stuff is tricky and I'd appreciate extra scrutiny of the series for 
correctness--in particular, the adaptation of remove_pagetable() from x86.

/* changelog */

v4:
* Sent patch 1 as a standalone fix. This set still depends on it:
  
https://lkml.kernel.org/r/1483475991-16999-1-git-send-email-ar...@linux.vnet.ibm.com

* Extract a common function that can be used by both radix_init_pgtable() and
  radix__create_section_mapping().

* Reduce tlb flushing to one flush_tlb_kernel_range() per section, and do
  less granular locking of init_mm->page_table_lock.

v3:
* 
https://lkml.kernel.org/r/1481831443-22761-1-git-send-email-ar...@linux.vnet.ibm.com

* Port remove_pagetable() et al. from x86 for unmapping.

* [RFC] -> [PATCH]

v2:
* 
https://lkml.kernel.org/r/1471449083-15931-1-git-send-email-ar...@linux.vnet.ibm.com

* Do not simply fall through to vmemmap_{create,remove}_mapping(). As Aneesh
  and Michael pointed out, they are tied to CONFIG_SPARSEMEM_VMEMMAP and only
  did what I needed by luck anyway.

v1:
* 
https://lkml.kernel.org/r/1466699962-22412-1-git-send-email-ar...@linux.vnet.ibm.com

Reza Arbab (4):
  powerpc/mm: refactor radix physical page mapping
  powerpc/mm: add radix__create_section_mapping()
  powerpc/mm: add radix__remove_section_mapping()
  powerpc/mm: unstub radix__vmemmap_remove_mapping()

 arch/powerpc/include/asm/book3s/64/radix.h |   5 +
 arch/powerpc/mm/pgtable-book3s64.c |   4 +-
 arch/powerpc/mm/pgtable-radix.c| 254 -
 3 files changed, 222 insertions(+), 41 deletions(-)

-- 
1.8.3.1



[PATCH v4 3/4] powerpc/mm: add radix__remove_section_mapping()

2017-01-03 Thread Reza Arbab
Tear down and free the four-level page tables of physical mappings
during memory hotremove.

Borrow the basic structure of remove_pagetable() and friends from the
identically-named x86 functions. Simplify things a bit so locking and
tlb flushing are only done in the outermost function.

Memory must be offline to be removed, thus not in use. So there
shouldn't be the sort of concurrent page walking activity here that
might prompt us to use RCU.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h |   1 +
 arch/powerpc/mm/pgtable-book3s64.c |   2 +-
 arch/powerpc/mm/pgtable-radix.c| 149 +
 3 files changed, 151 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 43c2571..0032b66 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -294,6 +294,7 @@ static inline unsigned long radix__get_tree_size(void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int radix__create_section_mapping(unsigned long start, unsigned long end);
+int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 2b13f6b..b798ff6 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -139,7 +139,7 @@ int create_section_mapping(unsigned long start, unsigned 
long end)
 int remove_section_mapping(unsigned long start, unsigned long end)
 {
if (radix_enabled())
-   return -ENODEV;
+   return radix__remove_section_mapping(start, end);
 
return hash__remove_section_mapping(start, end);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 3588895..f7a8e625 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -457,10 +457,159 @@ void radix__setup_initial_memory_limit(phys_addr_t 
first_memblock_base,
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+   pte_t *pte;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (!pte_none(*pte))
+   return;
+   }
+
+   pte_free_kernel(&init_mm, pte_start);
+   pmd_clear(pmd);
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+   pmd_t *pmd;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (!pmd_none(*pmd))
+   return;
+   }
+
+   pmd_free(&init_mm, pmd_start);
+   pud_clear(pud);
+}
+
+static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+   pud_t *pud;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PUD; i++) {
+   pud = pud_start + i;
+   if (!pud_none(*pud))
+   return;
+   }
+
+   pud_free(&init_mm, pud_start);
+   pgd_clear(pgd);
+}
+
+static void remove_pte_table(pte_t *pte_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pte_t *pte;
+
+   pte = pte_start + pte_index(addr);
+   for (; addr < end; addr = next, pte++) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   if (next > end)
+   next = end;
+
+   if (!pte_present(*pte))
+   continue;
+
+   pte_clear(&init_mm, addr, pte);
+   }
+}
+
+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pte_t *pte_base;
+   pmd_t *pmd;
+
+   pmd = pmd_start + pmd_index(addr);
+   for (; addr < end; addr = next, pmd++) {
+   next = pmd_addr_end(addr, end);
+
+   if (!pmd_present(*pmd))
+   continue;
+
+   if (pmd_huge(*pmd)) {
+   pte_clear(&init_mm, addr, (pte_t *)pmd);
+   continue;
+   }
+
+   pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+   remove_pte_table(pte_base, addr, next);
+   free_pte_table(pte_base, pmd);
+   }
+}
+
+static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pmd_t *pmd_base;
+   pud_t *pud;
+
+   pud = pud_start + pud_index(addr);
+   for (; addr < end; addr = next, pud++) {
+   next = pud_addr_end(addr, end);
+
+   if (!pud_present(*pud))
+   continue;
+
+   if (pud_huge(*pud)) {
+   pte_clear(&init_mm, addr, (pte_t *)pud);
+   continue;
+

[PATCH v4 2/4] powerpc/mm: add radix__create_section_mapping()

2017-01-03 Thread Reza Arbab
Wire up memory hotplug page mapping for radix. Share the mapping
function already used by radix_init_pgtable().

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h | 4 
 arch/powerpc/mm/pgtable-book3s64.c | 2 +-
 arch/powerpc/mm/pgtable-radix.c| 7 +++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index b4d1302..43c2571 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -291,5 +291,9 @@ static inline unsigned long radix__get_tree_size(void)
}
return rts_field;
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int radix__create_section_mapping(unsigned long start, unsigned long end);
+#endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 653ff6c..2b13f6b 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -131,7 +131,7 @@ void mmu_cleanup_all(void)
 int create_section_mapping(unsigned long start, unsigned long end)
 {
if (radix_enabled())
-   return -ENODEV;
+   return radix__create_section_mapping(start, end);
 
return hash__create_section_mapping(start, end);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 5cee6d1..3588895 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -456,6 +456,13 @@ void radix__setup_initial_memory_limit(phys_addr_t 
first_memblock_base,
memblock_set_current_limit(first_memblock_base + first_memblock_size);
 }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+int __ref radix__create_section_mapping(unsigned long start, unsigned long end)
+{
+   return create_physical_mapping(start, end);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 int __meminit radix__vmemmap_create_mapping(unsigned long start,
  unsigned long page_size,
-- 
1.8.3.1



[PATCH] powerpc/mm: fix memory hotplug BUG() on radix

2017-01-03 Thread Reza Arbab
Memory hotplug is leading to hash page table calls, even on radix:

...
arch_add_memory
create_section_mapping
htab_bolt_mapping
BUG_ON(!ppc_md.hpte_insert);

To fix, refactor {create,remove}_section_mapping() into hash__ and
radix__ variants. Leave the radix versions stubbed for now.

Reviewed-by: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>
Acked-by: Balbir Singh <bsinghar...@gmail.com>
Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
It was suggested that this fix be separated from the rest of the
set which implements the radix page mapping/unmapping.

 arch/powerpc/include/asm/book3s/64/hash.h |  5 +
 arch/powerpc/mm/hash_utils_64.c   |  4 ++--
 arch/powerpc/mm/pgtable-book3s64.c| 18 ++
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index f61cad3..dd90574 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -201,6 +201,11 @@ extern int __meminit hash__vmemmap_create_mapping(unsigned 
long start,
  unsigned long phys);
 extern void hash__vmemmap_remove_mapping(unsigned long start,
 unsigned long page_size);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int hash__create_section_mapping(unsigned long start, unsigned long end);
+int hash__remove_section_mapping(unsigned long start, unsigned long end);
+#endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index b9a062f..96a4fb7 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -743,7 +743,7 @@ static unsigned long __init htab_get_table_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int create_section_mapping(unsigned long start, unsigned long end)
+int hash__create_section_mapping(unsigned long start, unsigned long end)
 {
int rc = htab_bolt_mapping(start, end, __pa(start),
   pgprot_val(PAGE_KERNEL), mmu_linear_psize,
@@ -757,7 +757,7 @@ int create_section_mapping(unsigned long start, unsigned 
long end)
return rc;
 }
 
-int remove_section_mapping(unsigned long start, unsigned long end)
+int hash__remove_section_mapping(unsigned long start, unsigned long end)
 {
int rc = htab_remove_mapping(start, end, mmu_linear_psize,
 mmu_kernel_ssize);
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index ebf9782..653ff6c 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -126,3 +126,21 @@ void mmu_cleanup_all(void)
else if (mmu_hash_ops.hpte_clear_all)
mmu_hash_ops.hpte_clear_all();
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int create_section_mapping(unsigned long start, unsigned long end)
+{
+   if (radix_enabled())
+   return -ENODEV;
+
+   return hash__create_section_mapping(start, end);
+}
+
+int remove_section_mapping(unsigned long start, unsigned long end)
+{
+   if (radix_enabled())
+   return -ENODEV;
+
+   return hash__remove_section_mapping(start, end);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
-- 
1.8.3.1



Re: [PATCH v3 3/5] powerpc/mm: add radix__create_section_mapping()

2016-12-20 Thread Reza Arbab

On Tue, Dec 20, 2016 at 05:28:40PM +1100, Balbir Singh wrote:

+#ifdef CONFIG_MEMORY_HOTPLUG
+int radix__create_section_mapping(unsigned long start, unsigned long end)
+{
+   unsigned long page_size = 1 << mmu_psize_defs[mmu_linear_psize].shift;


Can we refactor bits from radix_init_pgtable() and reuse?


Yes, that's my plan for v4.

--
Reza Arbab



Re: [PATCH v3 4/5] powerpc/mm: add radix__remove_section_mapping()

2016-12-19 Thread Reza Arbab

On Mon, Dec 19, 2016 at 03:18:07PM +0530, Aneesh Kumar K.V wrote:

Reza Arbab <ar...@linux.vnet.ibm.com> writes:

+static void remove_pte_table(pte_t *pte_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pte_t *pte;
+
+   pte = pte_start + pte_index(addr);
+   for (; addr < end; addr = next, pte++) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   if (next > end)
+   next = end;
+
+   if (!pte_present(*pte))
+   continue;
+
+   spin_lock(&init_mm.page_table_lock);
+   pte_clear(&init_mm, addr, pte);
+   spin_unlock(&init_mm.page_table_lock);
+   }
+
+   flush_tlb_mm(&init_mm);


Why call a flush here. we do that at the end of remove_page_table .
Isn't that sufficient ?


This was carried over from the x86 version of the function, where they 
do flush_tlb_all(). I can experiment to make sure things work without 
it.



+static void remove_pagetable(unsigned long start, unsigned long end,
+unsigned long map_page_size)
+{
+   unsigned long next;
+   unsigned long addr;
+   pgd_t *pgd;
+   pud_t *pud;
+
+   for (addr = start; addr < end; addr = next) {
+   next = pgd_addr_end(addr, end);
+
+   pgd = pgd_offset_k(addr);
+   if (!pgd_present(*pgd))
+   continue;
+
+   pud = (pud_t *)pgd_page_vaddr(*pgd);
+   remove_pud_table(pud, addr, next, map_page_size);
+   free_pud_table(pud, pgd);
+   }
+
+   flush_tlb_mm(&init_mm);



So we want to flush the full kernel tlb when we do a hotplug ?
May be check using flush_tlb_kernel_range(). Also that flush_tlb_mm() do
check for mm_is_thread_local(). Do we update init_mm correct to handle
that check ? I assume we want a tlbie() here instead of tlbiel() ?


I'll try using flush_tlb_kernel_range() instead. That sure does seem 
more appropriate.
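
Roughly, that would mean dropping the per-level flush_tlb_mm() calls and
ending remove_pagetable() with a single ranged flush (sketch only, not the
final hunk):

	/* at the end of remove_pagetable(), after the pgd loop */
	flush_tlb_kernel_range(start, end);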


--
Reza Arbab



Re: [PATCH v3 3/5] powerpc/mm: add radix__create_section_mapping()

2016-12-19 Thread Reza Arbab

On Mon, Dec 19, 2016 at 02:34:13PM +0530, Aneesh Kumar K.V wrote:

Reza Arbab <ar...@linux.vnet.ibm.com> writes:
Add the linear page mapping function for radix, used by memory 
hotplug. This is similar to vmemmap_populate().




Ok with this patch your first patch becomes useful. Can you merge that
with this and rename mmu_linear_psize to mmu_hotplug_psize or even use
mmu_virtual_psize. The linear naming is confusing.


Thanks for pointing out radix_init_pgtable(). I think the right thing to 
do here is create these mappings the same way it does. We can probably 
factor out a common function.


--
Reza Arbab



Re: [PATCH v3 2/5] powerpc/mm: refactor {create,remove}_section_mapping()

2016-12-19 Thread Reza Arbab

On Mon, Dec 19, 2016 at 02:30:28PM +0530, Aneesh Kumar K.V wrote:

Reza Arbab <ar...@linux.vnet.ibm.com> writes:
Change {create,remove}_section_mapping() to be wrappers around 
functions prefixed with "hash__".


This is preparation for the addition of their "radix__" variants. No
functional change.



I think this can go upstream now ? To fixup broken hotplug with radix ?


Yes, this one might be worth separating as a fix for the BUG() on radix.

--
Reza Arbab



Re: [PATCH v3 0/5] powerpc/mm: enable memory hotplug on radix

2016-12-19 Thread Reza Arbab

On Sat, Dec 17, 2016 at 01:38:40AM +1100, Balbir Singh wrote:

Do we care about alt maps yet?


Good question. I'll try to see if/how altmaps might need special 
consideration here.


--
Reza Arbab



[PATCH v3 0/5] powerpc/mm: enable memory hotplug on radix

2016-12-15 Thread Reza Arbab
Memory hotplug is leading to hash page table calls, even on radix:

...
arch_add_memory
create_section_mapping
htab_bolt_mapping
BUG_ON(!ppc_md.hpte_insert);

To fix, refactor {create,remove}_section_mapping() into hash__ and radix__
variants. Implement the radix versions by borrowing from existing vmemmap
and x86 code.

This passes basic verification of plugging and removing memory, but this 
stuff is tricky and I'd appreciate extra scrutiny of the series for 
correctness--in particular, the adaptation of remove_pagetable() from x86.

/* changelog */

v3:
* Port remove_pagetable() et al. from x86 for unmapping.

* [RFC] -> [PATCH]

v2:
* 
https://lkml.kernel.org/r/1471449083-15931-1-git-send-email-ar...@linux.vnet.ibm.com

* Do not simply fall through to vmemmap_{create,remove}_mapping(). As Aneesh
  and Michael pointed out, they are tied to CONFIG_SPARSEMEM_VMEMMAP and only
  did what I needed by luck anyway.

v1:
* 
https://lkml.kernel.org/r/1466699962-22412-1-git-send-email-ar...@linux.vnet.ibm.com

Reza Arbab (5):
  powerpc/mm: set the radix linear page mapping size
  powerpc/mm: refactor {create,remove}_section_mapping()
  powerpc/mm: add radix__create_section_mapping()
  powerpc/mm: add radix__remove_section_mapping()
  powerpc/mm: unstub radix__vmemmap_remove_mapping()

 arch/powerpc/include/asm/book3s/64/hash.h  |   5 +
 arch/powerpc/include/asm/book3s/64/radix.h |   5 +
 arch/powerpc/mm/hash_utils_64.c|   4 +-
 arch/powerpc/mm/pgtable-book3s64.c |  18 +++
 arch/powerpc/mm/pgtable-radix.c| 207 -
 5 files changed, 236 insertions(+), 3 deletions(-)

-- 
1.8.3.1



[PATCH v3 3/5] powerpc/mm: add radix__create_section_mapping()

2016-12-15 Thread Reza Arbab
Add the linear page mapping function for radix, used by memory hotplug.
This is similar to vmemmap_populate().

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h |  4 
 arch/powerpc/mm/pgtable-book3s64.c |  2 +-
 arch/powerpc/mm/pgtable-radix.c| 19 +++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index b4d1302..43c2571 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -291,5 +291,9 @@ static inline unsigned long radix__get_tree_size(void)
}
return rts_field;
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int radix__create_section_mapping(unsigned long start, unsigned long end);
+#endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 653ff6c..2b13f6b 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -131,7 +131,7 @@ void mmu_cleanup_all(void)
 int create_section_mapping(unsigned long start, unsigned long end)
 {
if (radix_enabled())
-   return -ENODEV;
+   return radix__create_section_mapping(start, end);
 
return hash__create_section_mapping(start, end);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 54bd70e..8201d1f 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -465,6 +465,25 @@ void radix__setup_initial_memory_limit(phys_addr_t 
first_memblock_base,
memblock_set_current_limit(first_memblock_base + first_memblock_size);
 }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+int radix__create_section_mapping(unsigned long start, unsigned long end)
+{
+   unsigned long page_size = 1 << mmu_psize_defs[mmu_linear_psize].shift;
+
+   /* Align to the page size of the linear mapping. */
+   start = _ALIGN_DOWN(start, page_size);
+
+   for (; start < end; start += page_size) {
+   int rc = radix__map_kernel_page(start, __pa(start),
+   PAGE_KERNEL, page_size);
+   if (rc)
+   return rc;
+   }
+
+   return 0;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 int __meminit radix__vmemmap_create_mapping(unsigned long start,
  unsigned long page_size,
-- 
1.8.3.1



[PATCH v3 4/5] powerpc/mm: add radix__remove_section_mapping()

2016-12-15 Thread Reza Arbab
Tear down and free the four-level page tables of the linear mapping
during memory hotremove.

We borrow the basic structure of remove_pagetable() and friends from the
identically-named x86 functions.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h |   1 +
 arch/powerpc/mm/pgtable-book3s64.c |   2 +-
 arch/powerpc/mm/pgtable-radix.c| 163 +
 3 files changed, 165 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 43c2571..0032b66 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -294,6 +294,7 @@ static inline unsigned long radix__get_tree_size(void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int radix__create_section_mapping(unsigned long start, unsigned long end);
+int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 2b13f6b..b798ff6 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -139,7 +139,7 @@ int create_section_mapping(unsigned long start, unsigned 
long end)
 int remove_section_mapping(unsigned long start, unsigned long end)
 {
if (radix_enabled())
-   return -ENODEV;
+   return radix__remove_section_mapping(start, end);
 
return hash__remove_section_mapping(start, end);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 8201d1f..315237c 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -466,6 +466,159 @@ void radix__setup_initial_memory_limit(phys_addr_t 
first_memblock_base,
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+   pte_t *pte;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   pte = pte_start + i;
+   if (!pte_none(*pte))
+   return;
+   }
+
+   pte_free_kernel(&init_mm, pte_start);
+   spin_lock(&init_mm.page_table_lock);
+   pmd_clear(pmd);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+   pmd_t *pmd;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PMD; i++) {
+   pmd = pmd_start + i;
+   if (!pmd_none(*pmd))
+   return;
+   }
+
+   pmd_free(&init_mm, pmd_start);
+   spin_lock(&init_mm.page_table_lock);
+   pud_clear(pud);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+   pud_t *pud;
+   int i;
+
+   for (i = 0; i < PTRS_PER_PUD; i++) {
+   pud = pud_start + i;
+   if (!pud_none(*pud))
+   return;
+   }
+
+   pud_free(&init_mm, pud_start);
+   spin_lock(&init_mm.page_table_lock);
+   pgd_clear(pgd);
+   spin_unlock(&init_mm.page_table_lock);
+}
+
+static void remove_pte_table(pte_t *pte_start, unsigned long addr,
+unsigned long end)
+{
+   unsigned long next;
+   pte_t *pte;
+
+   pte = pte_start + pte_index(addr);
+   for (; addr < end; addr = next, pte++) {
+   next = (addr + PAGE_SIZE) & PAGE_MASK;
+   if (next > end)
+   next = end;
+
+   if (!pte_present(*pte))
+   continue;
+
+   spin_lock(&init_mm.page_table_lock);
+   pte_clear(&init_mm, addr, pte);
+   spin_unlock(&init_mm.page_table_lock);
+   }
+
+   flush_tlb_mm(&init_mm);
+}
+
+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+unsigned long end, unsigned long map_page_size)
+{
+   unsigned long next;
+   pte_t *pte_base;
+   pmd_t *pmd;
+
+   pmd = pmd_start + pmd_index(addr);
+   for (; addr < end; addr = next, pmd++) {
+   next = pmd_addr_end(addr, end);
+
+   if (!pmd_present(*pmd))
+   continue;
+
+   if (map_page_size == PMD_SIZE) {
+   spin_lock(&init_mm.page_table_lock);
+   pte_clear(&init_mm, addr, (pte_t *)pmd);
+   spin_unlock(&init_mm.page_table_lock);
+
+   continue;
+   }
+
+   pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+   remove_pte_table(pte_base, addr, next);
+   free_pte_table(pte_base, pmd);
+   }
+}
+
+static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+unsigned long end, unsigned long map_page_size)
+{
+   unsigned long next;
+   pmd_t *pmd_base;
+   pud_t *pud;
+
+   pud = pud_start + pud_index(addr);
+   for (; 

[PATCH v3 1/5] powerpc/mm: set the radix linear page mapping size

2016-12-15 Thread Reza Arbab
This was defaulting to 4K, regardless of PAGE_SIZE.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/pgtable-radix.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 623a0dc..54bd70e 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -351,8 +351,10 @@ void __init radix__early_init_mmu(void)
 #ifdef CONFIG_PPC_64K_PAGES
/* PAGE_SIZE mappings */
mmu_virtual_psize = MMU_PAGE_64K;
+   mmu_linear_psize = MMU_PAGE_64K;
 #else
mmu_virtual_psize = MMU_PAGE_4K;
+   mmu_linear_psize = MMU_PAGE_4K;
 #endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-- 
1.8.3.1



[PATCH v3 5/5] powerpc/mm: unstub radix__vmemmap_remove_mapping()

2016-12-15 Thread Reza Arbab
Use remove_pagetable() and friends for radix vmemmap removal.

We do not require the special-case handling of vmemmap done in the x86
versions of these functions. This is because vmemmap_free() has already
freed the mapped pages, and calls us with an aligned address range.

So, add a few failsafe WARNs, but otherwise the code to remove linear
mappings is already sufficient for vmemmap.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/pgtable-radix.c | 23 ++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 315237c..9d1d51e 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -532,6 +532,15 @@ static void remove_pte_table(pte_t *pte_start, unsigned 
long addr,
if (!pte_present(*pte))
continue;
 
+   if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
+   /*
+* The vmemmap_free() and remove_section_mapping()
+* codepaths call us with aligned addresses.
+*/
+   WARN_ONCE(1, "%s: unaligned range\n", __func__);
+   continue;
+   }
+
spin_lock(&init_mm.page_table_lock);
pte_clear(&init_mm, addr, pte);
spin_unlock(&init_mm.page_table_lock);
@@ -555,6 +564,12 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned 
long addr,
continue;
 
if (map_page_size == PMD_SIZE) {
+   if (!IS_ALIGNED(addr, PMD_SIZE) ||
+   !IS_ALIGNED(next, PMD_SIZE)) {
+   WARN_ONCE(1, "%s: unaligned range\n", __func__);
+   continue;
+   }
+
spin_lock(&init_mm.page_table_lock);
pte_clear(&init_mm, addr, (pte_t *)pmd);
spin_unlock(&init_mm.page_table_lock);
@@ -583,6 +598,12 @@ static void remove_pud_table(pud_t *pud_start, unsigned 
long addr,
continue;
 
if (map_page_size == PUD_SIZE) {
+   if (!IS_ALIGNED(addr, PUD_SIZE) ||
+   !IS_ALIGNED(next, PUD_SIZE)) {
+   WARN_ONCE(1, "%s: unaligned range\n", __func__);
+   continue;
+   }
+
spin_lock(&init_mm.page_table_lock);
pte_clear(&init_mm, addr, (pte_t *)pud);
spin_unlock(&init_mm.page_table_lock);
@@ -662,7 +683,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long 
start,
 #ifdef CONFIG_MEMORY_HOTPLUG
 void radix__vmemmap_remove_mapping(unsigned long start, unsigned long 
page_size)
 {
-   /* FIXME!! intel does more. We should free page tables mapping vmemmap 
? */
+   remove_pagetable(start, start + page_size, page_size);
 }
 #endif
 #endif
-- 
1.8.3.1



[PATCH v3 2/5] powerpc/mm: refactor {create, remove}_section_mapping()

2016-12-15 Thread Reza Arbab
Change {create,remove}_section_mapping() to be wrappers around functions
prefixed with "hash__".

This is preparation for the addition of their "radix__" variants. No
functional change.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/hash.h |  5 +
 arch/powerpc/mm/hash_utils_64.c   |  4 ++--
 arch/powerpc/mm/pgtable-book3s64.c| 18 ++
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index f61cad3..dd90574 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -201,6 +201,11 @@ extern int __meminit hash__vmemmap_create_mapping(unsigned 
long start,
  unsigned long phys);
 extern void hash__vmemmap_remove_mapping(unsigned long start,
 unsigned long page_size);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int hash__create_section_mapping(unsigned long start, unsigned long end);
+int hash__remove_section_mapping(unsigned long start, unsigned long end);
+#endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index b9a062f..96a4fb7 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -743,7 +743,7 @@ static unsigned long __init htab_get_table_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int create_section_mapping(unsigned long start, unsigned long end)
+int hash__create_section_mapping(unsigned long start, unsigned long end)
 {
int rc = htab_bolt_mapping(start, end, __pa(start),
   pgprot_val(PAGE_KERNEL), mmu_linear_psize,
@@ -757,7 +757,7 @@ int create_section_mapping(unsigned long start, unsigned 
long end)
return rc;
 }
 
-int remove_section_mapping(unsigned long start, unsigned long end)
+int hash__remove_section_mapping(unsigned long start, unsigned long end)
 {
int rc = htab_remove_mapping(start, end, mmu_linear_psize,
 mmu_kernel_ssize);
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index ebf9782..653ff6c 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -126,3 +126,21 @@ void mmu_cleanup_all(void)
else if (mmu_hash_ops.hpte_clear_all)
mmu_hash_ops.hpte_clear_all();
 }
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int create_section_mapping(unsigned long start, unsigned long end)
+{
+   if (radix_enabled())
+   return -ENODEV;
+
+   return hash__create_section_mapping(start, end);
+}
+
+int remove_section_mapping(unsigned long start, unsigned long end)
+{
+   if (radix_enabled())
+   return -ENODEV;
+
+   return hash__remove_section_mapping(start, end);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
-- 
1.8.3.1



[PATCH] powerpc/mm: allow memory hotplug into an offline node

2016-11-16 Thread Reza Arbab
Relax the check preventing us from hotplugging into an offline node.

This limitation was added in commit 482ec7c403d2 ("[PATCH] powerpc numa:
Support sparse online node map") to prevent adding resources to an
uninitialized node.

These days, there is no harm in doing so. The addition will actually
cause the node to be initialized and onlined; add_memory_resource()
calls hotadd_new_pgdat() (if necessary) and node_set_online().

Cc: Balbir Singh <bsinghar...@gmail.com>
Cc: Nathan Fontenot <nf...@linux.vnet.ibm.com>
Cc: John Allen <jal...@linux.vnet.ibm.com>
Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
This applies on top of "powerpc/mm: allow memory hotplug into a
memoryless node", currently in the -mm tree:
http://lkml.kernel.org/r/1479160961-25840-2-git-send-email-ar...@linux.vnet.ibm.com

 arch/powerpc/mm/numa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index d69f6f6..07620c9 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1091,7 +1091,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
nid = hot_add_node_scn_to_nid(scn_addr);
}
 
-   if (nid < 0 || !node_online(nid))
+   if (nid < 0 || !node_possible(nid))
nid = first_online_node;
 
return nid;
-- 
1.8.3.1



Re: [RESEND] [PATCH v1 3/3] powerpc: fix node_possible_map limitations

2016-11-16 Thread Reza Arbab

On Wed, Nov 16, 2016 at 10:45:01AM +1100, Balbir Singh wrote:

Reverts: commit 3af229f2071f
("powerpc/numa: Reset node_possible_map to only node_online_map")


Nice! With this limitation going away, I have a small patch to enable 
onlining new nodes via memory hotplug. Incoming.


--
Reza Arbab



Re: [PATCH v7 2/5] mm: remove x86-only restriction of movable_node

2016-11-15 Thread Reza Arbab

On Tue, Nov 15, 2016 at 12:35:42PM +0530, Aneesh Kumar K.V wrote:

Considering that we now can mark memblock hotpluggable, do we need to
enable the bottom up allocation for ppc64 also ?


No, we don't, because early_init_dt_scan_memory() marks the memblocks 
hotpluggable immediately when they are added. There is no gap between 
the addition and the marking, as there is on x86, during which an 
allocation might accidentally occur in a movable node.


--
Reza Arbab



[PATCH v7 4/5] of/fdt: mark hotpluggable memory

2016-11-14 Thread Reza Arbab
When movable nodes are enabled, any node containing only hotpluggable
memory is made movable at boot time.

On x86, hotpluggable memory is discovered by parsing the ACPI SRAT,
making corresponding calls to memblock_mark_hotplug().

If we introduce a dt property to describe memory as hotpluggable,
configs supporting early fdt may then also do this marking and use
movable nodes.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
Tested-by: Balbir Singh <bsinghar...@gmail.com>
---
 drivers/of/fdt.c   | 19 +++
 include/linux/of_fdt.h |  1 +
 mm/Kconfig |  2 +-
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index c89d5d2..c9b5cac 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -1015,6 +1015,7 @@ int __init early_init_dt_scan_memory(unsigned long node, 
const char *uname,
const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
const __be32 *reg, *endp;
int l;
+   bool hotpluggable;
 
/* We are scanning "memory" nodes only */
if (type == NULL) {
@@ -1034,6 +1035,7 @@ int __init early_init_dt_scan_memory(unsigned long node, 
const char *uname,
return 0;
 
endp = reg + (l / sizeof(__be32));
+   hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL);
 
pr_debug("memory scan node %s, reg size %d,\n", uname, l);
 
@@ -1049,6 +1051,13 @@ int __init early_init_dt_scan_memory(unsigned long node, 
const char *uname,
(unsigned long long)size);
 
early_init_dt_add_memory_arch(base, size);
+
+   if (!hotpluggable)
+   continue;
+
+   if (early_init_dt_mark_hotplug_memory_arch(base, size))
+   pr_warn("failed to mark hotplug range 0x%llx - 
0x%llx\n",
+   base, base + size);
}
 
return 0;
@@ -1146,6 +1155,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 
base, u64 size)
memblock_add(base, size);
 }
 
+int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size)
+{
+   return memblock_mark_hotplug(base, size);
+}
+
 int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base,
phys_addr_t size, bool nomap)
 {
@@ -1168,6 +1182,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 
base, u64 size)
WARN_ON(1);
 }
 
+int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size)
+{
+   return -ENOSYS;
+}
+
 int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base,
phys_addr_t size, bool nomap)
 {
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 4341f32..271b3fd 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -71,6 +71,7 @@ extern int early_init_dt_scan_memory(unsigned long node, 
const char *uname,
 extern void early_init_fdt_scan_reserved_mem(void);
 extern void early_init_fdt_reserve_self(void);
 extern void early_init_dt_add_memory_arch(u64 base, u64 size);
+extern int early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size);
 extern int early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t 
size,
 bool no_map);
 extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align);
diff --git a/mm/Kconfig b/mm/Kconfig
index 061b46b..33a9b06 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,7 +153,7 @@ config MOVABLE_NODE
bool "Enable to assign a node which has only movable memory"
depends on HAVE_MEMBLOCK
depends on NO_BOOTMEM
-   depends on X86_64 || MEMORY_HOTPLUG
+   depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG
depends on NUMA
default n
help
-- 
1.8.3.1



[PATCH v7 1/5] powerpc/mm: allow memory hotplug into a memoryless node

2016-11-14 Thread Reza Arbab
Remove the check which prevents us from hotplugging into an empty node.

The original commit b226e4621245 ("[PATCH] powerpc: don't add memory to
empty node/zone"), states that this was intended to be a temporary measure.
It is a workaround for an oops which no longer occurs.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>
Acked-by: Balbir Singh <bsinghar...@gmail.com>
Acked-by: Michael Ellerman <m...@ellerman.id.au>
Cc: Nathan Fontenot <nf...@linux.vnet.ibm.com>
Cc: Bharata B Rao <bhar...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/numa.c | 13 +
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index a51c188..0cb6bd8 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1085,7 +1085,7 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr)
 int hot_add_scn_to_nid(unsigned long scn_addr)
 {
struct device_node *memory = NULL;
-   int nid, found = 0;
+   int nid;
 
if (!numa_enabled || (min_common_depth < 0))
return first_online_node;
@@ -1101,17 +1101,6 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
if (nid < 0 || !node_online(nid))
nid = first_online_node;
 
-   if (NODE_DATA(nid)->node_spanned_pages)
-   return nid;
-
-   for_each_online_node(nid) {
-   if (NODE_DATA(nid)->node_spanned_pages) {
-   found = 1;
-   break;
-   }
-   }
-
-   BUG_ON(!found);
return nid;
 }
 
-- 
1.8.3.1



[PATCH v7 2/5] mm: remove x86-only restriction of movable_node

2016-11-14 Thread Reza Arbab
In commit c5320926e370 ("mem-hotplug: introduce movable_node boot
option"), the memblock allocation direction is changed to bottom-up and
then back to top-down like this:

1. memblock_set_bottom_up(true), called by cmdline_parse_movable_node().
2. memblock_set_bottom_up(false), called by x86's numa_init().

Even though (1) occurs in generic mm code, it is wrapped by #ifdef
CONFIG_MOVABLE_NODE, which depends on X86_64.

This means that when we extend CONFIG_MOVABLE_NODE to non-x86 arches,
things will be unbalanced. (1) will happen for them, but (2) will not.

This toggle was added in the first place because x86 has a delay between
adding memblocks and marking them as hotpluggable. Since other arches do
this marking either immediately or not at all, they do not require the
bottom-up toggle.

So, resolve things by moving (1) from cmdline_parse_movable_node() to
x86's setup_arch(), immediately after the movable_node parameter has
been parsed.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 Documentation/kernel-parameters.txt |  2 +-
 arch/x86/kernel/setup.c | 24 
 mm/memory_hotplug.c | 20 
 3 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 37babf9..adcccd5 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2401,7 +2401,7 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
that the amount of memory usable for all allocations
is not too small.
 
-   movable_node[KNL,X86] Boot-time switch to enable the effects
+   movable_node[KNL] Boot-time switch to enable the effects
of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
 
MTD_Partition=  [MTD]
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9c337b0..4cfba94 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -985,6 +985,30 @@ void __init setup_arch(char **cmdline_p)
 
parse_early_param();
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+   /*
+* Memory used by the kernel cannot be hot-removed because Linux
+* cannot migrate the kernel pages. When memory hotplug is
+* enabled, we should prevent memblock from allocating memory
+* for the kernel.
+*
+* ACPI SRAT records all hotpluggable memory ranges. But before
+* SRAT is parsed, we don't know about it.
+*
+* The kernel image is loaded into memory at very early time. We
+* cannot prevent this anyway. So on NUMA system, we set any
+* node the kernel resides in as un-hotpluggable.
+*
+* Since on modern servers, one node could have double-digit
+* gigabytes memory, we can assume the memory around the kernel
+* image is also un-hotpluggable. So before SRAT is parsed, just
+* allocate memory near the kernel image to try the best to keep
+* the kernel away from hotpluggable memory.
+*/
+   if (movable_node_is_enabled())
+   memblock_set_bottom_up(true);
+#endif
+
x86_report_nx();
 
/* after early param, so could get panic from serial */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index cad4b91..e43142c1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1727,26 +1727,6 @@ static bool can_offline_normal(struct zone *zone, 
unsigned long nr_pages)
 static int __init cmdline_parse_movable_node(char *p)
 {
 #ifdef CONFIG_MOVABLE_NODE
-   /*
-* Memory used by the kernel cannot be hot-removed because Linux
-* cannot migrate the kernel pages. When memory hotplug is
-* enabled, we should prevent memblock from allocating memory
-* for the kernel.
-*
-* ACPI SRAT records all hotpluggable memory ranges. But before
-* SRAT is parsed, we don't know about it.
-*
-* The kernel image is loaded into memory at very early time. We
-* cannot prevent this anyway. So on NUMA system, we set any
-* node the kernel resides in as un-hotpluggable.
-*
-* Since on modern servers, one node could have double-digit
-* gigabytes memory, we can assume the memory around the kernel
-* image is also un-hotpluggable. So before SRAT is parsed, just
-* allocate memory near the kernel image to try the best to keep
-* the kernel away from hotpluggable memory.
-*/
-   memblock_set_bottom_up(true);
movable_node_enabled = true;
 #else
pr_warn("movable_node option not supported\n");
-- 
1.8.3.1



[PATCH v7 3/5] mm: enable CONFIG_MOVABLE_NODE on non-x86 arches

2016-11-14 Thread Reza Arbab
To support movable memory nodes (CONFIG_MOVABLE_NODE), at least one of
the following must be true:

1. This config has the capability to identify movable nodes at boot.
   Right now, only x86 can do this.

2. Our config supports memory hotplug, which means that a movable node
   can be created by hotplugging all of its memory into ZONE_MOVABLE.

Fix the Kconfig definition of CONFIG_MOVABLE_NODE, which currently
recognizes (1), but not (2).

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>
Acked-by: Balbir Singh <bsinghar...@gmail.com>
---
 mm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 86e3e0e..061b46b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,7 +153,7 @@ config MOVABLE_NODE
bool "Enable to assign a node which has only movable memory"
depends on HAVE_MEMBLOCK
depends on NO_BOOTMEM
-   depends on X86_64
+   depends on X86_64 || MEMORY_HOTPLUG
depends on NUMA
default n
help
-- 
1.8.3.1



[PATCH v7 0/5] enable movable nodes on non-x86 configs

2016-11-14 Thread Reza Arbab
This patchset allows more configs to make use of movable nodes. When
CONFIG_MOVABLE_NODE is selected, there are two ways to introduce such
nodes into the system:

1. Discover movable nodes at boot. Currently this is only possible on
   x86, but we will enable configs supporting fdt to do the same.

2. Hotplug and online all of a node's memory using online_movable. This
   is already possible on any config supporting memory hotplug, not
   just x86, but the Kconfig doesn't say so. We will fix that.

We'll also remove some cruft on power which would prevent (2).

/* changelog */

v7:
* Fix error when !CONFIG_HAVE_MEMBLOCK, found by the kbuild test robot.

* Remove the prefix of "linux,hotpluggable". Document the property's purpose.

v6:
* 
http://lkml.kernel.org/r/1478562276-25539-1-git-send-email-ar...@linux.vnet.ibm.com

* Add a patch enabling the fdt to describe hotpluggable memory.

v5:
* 
http://lkml.kernel.org/r/1477339089-5455-1-git-send-email-ar...@linux.vnet.ibm.com

* Drop the patches which recognize the "status" property of dt memory
  nodes. Firmware can set the size of "linux,usable-memory" to zero instead.

v4:
* 
http://lkml.kernel.org/r/1475778995-1420-1-git-send-email-ar...@linux.vnet.ibm.com

* Rename of_fdt_is_available() to of_fdt_device_is_available().
  Rename of_flat_dt_is_available() to of_flat_dt_device_is_available().

* Instead of restoring top-down allocation, ensure it never goes
  bottom-up in the first place, by making movable_node arch-specific.

* Use MEMORY_HOTPLUG instead of PPC64 in the mm/Kconfig patch.

v3:
* 
http://lkml.kernel.org/r/1474828616-16608-1-git-send-email-ar...@linux.vnet.ibm.com

* Use Rob Herring's suggestions to improve the node availability check.

* More verbose commit log in the patch enabling CONFIG_MOVABLE_NODE.

* Add a patch to restore top-down allocation the way x86 does.

v2:
* 
http://lkml.kernel.org/r/1473883618-14998-1-git-send-email-ar...@linux.vnet.ibm.com

* Use the "status" property of standard dt memory nodes instead of
  introducing a new "ibm,hotplug-aperture" compatible id.

* Remove the patch which explicitly creates a memoryless node. This set
  no longer has any bearing on whether the pgdat is created at boot or
  at the time of memory addition.

v1:
* 
http://lkml.kernel.org/r/1470680843-28702-1-git-send-email-ar...@linux.vnet.ibm.com

Reza Arbab (5):
  powerpc/mm: allow memory hotplug into a memoryless node
  mm: remove x86-only restriction of movable_node
  mm: enable CONFIG_MOVABLE_NODE on non-x86 arches
  of/fdt: mark hotpluggable memory
  dt: add documentation of "hotpluggable" memory property

 Documentation/devicetree/booting-without-of.txt |  7 +++
 Documentation/kernel-parameters.txt |  2 +-
 arch/powerpc/mm/numa.c  | 13 +
 arch/x86/kernel/setup.c | 24 
 drivers/of/fdt.c| 19 +++
 include/linux/of_fdt.h  |  1 +
 mm/Kconfig  |  2 +-
 mm/memory_hotplug.c | 20 
 8 files changed, 54 insertions(+), 34 deletions(-)

-- 
1.8.3.1



[PATCH v7 5/5] dt: add documentation of "hotpluggable" memory property

2016-11-14 Thread Reza Arbab
Summarize the "hotpluggable" property of dt memory nodes.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 Documentation/devicetree/booting-without-of.txt | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/devicetree/booting-without-of.txt 
b/Documentation/devicetree/booting-without-of.txt
index 3f1437f..280d283 100644
--- a/Documentation/devicetree/booting-without-of.txt
+++ b/Documentation/devicetree/booting-without-of.txt
@@ -974,6 +974,13 @@ compatibility.
   4Gb. Some vendors prefer splitting those ranges into smaller
   segments, but the kernel doesn't care.
 
+  Additional properties:
+
+- hotpluggable : The presence of this property provides an explicit
+  hint to the operating system that this memory may potentially be
+  removed later. The kernel can take this into consideration when
+  doing nonmovable allocations and when laying out memory zones.
+
   e) The /chosen node
 
   This node is a bit "special". Normally, that's where Open Firmware
-- 
1.8.3.1



Re: [PATCH v6 4/4] of/fdt: mark hotpluggable memory

2016-11-14 Thread Reza Arbab

On Mon, Nov 14, 2016 at 10:59:43PM +1100, Michael Ellerman wrote:

So I'm not opposed to this, but it is a little vague.

What does the "hotpluggable" property really mean?

Is it just a hint to the operating system? (which may or may not be
Linux).

Or is it a direction, "this memory must be able to be hotunplugged"?

I think you're intending the former, ie. a hint, which is probably OK.
But it needs to be documented clearly.


Yes, you've got it right. It's just a hint, not a mandate.

I'm about to send v7 which adds a description of "hotpluggable" in the 
documentation. Hopefully I've explained it well enough there.


--
Reza Arbab



Re: [PATCH v6 4/4] of/fdt: mark hotpluggable memory

2016-11-10 Thread Reza Arbab

On Thu, Nov 10, 2016 at 11:56:02AM +1100, Balbir Singh wrote:
Have you tested this across all combinations of skiboot/kexec/SLOF 
boots?


I've tested it under qemu/grub, simics/skiboot, and via kexec.

--
Reza Arbab



Re: [PATCH v6 4/4] of/fdt: mark hotpluggable memory

2016-11-09 Thread Reza Arbab

On Wed, Nov 09, 2016 at 12:12:55PM -0600, Rob Herring wrote:

On Mon, Nov 7, 2016 at 5:44 PM, Reza Arbab <ar...@linux.vnet.ibm.com> wrote:

+   hotpluggable = of_get_flat_dt_prop(node, "linux,hotpluggable", NULL);


Memory being hotpluggable doesn't seem like a linux property to me.
I'd drop the linux prefix. Also, this needs to be documented.


Sure, that makes sense. I'll do both in v7.

--
Reza Arbab



Re: [PATCH v6 4/4] of/fdt: mark hotpluggable memory

2016-11-08 Thread Reza Arbab

On Tue, Nov 08, 2016 at 09:59:26AM +0800, kbuild test robot wrote:

All errors (new ones prefixed by >>):

  drivers/of/fdt.c: In function 'early_init_dt_scan_memory':

drivers/of/fdt.c:1064:3: error: implicit declaration of function 
'memblock_mark_hotplug'

  cc1: some warnings being treated as errors

vim +/memblock_mark_hotplug +1064 drivers/of/fdt.c

 1058   continue;
 1059   pr_debug(" - %llx ,  %llx\n", (unsigned long long)base,
 1060   (unsigned long long)size);
 1061   
 1062   early_init_dt_add_memory_arch(base, size);
 1063   

1064if (hotpluggable && memblock_mark_hotplug(base, size))

 1065   pr_warn("failed to mark hotplug range 0x%llx - 
0x%llx\n",
 1066   base, base + size);
 1067   }


Ah, I need to adjust for !CONFIG_HAVE_MEMBLOCK. Will correct in v7.
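
Roughly, the fix will be to give the new hook a stub in the
!CONFIG_HAVE_MEMBLOCK branch of drivers/of/fdt.c, next to the other
fallback definitions there (sketch of the idea only):

	/* no memblock support: accept the hook, report it unimplemented */
	int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size)
	{
		return -ENOSYS;
	}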

--
Reza Arbab



[PATCH v6 1/4] powerpc/mm: allow memory hotplug into a memoryless node

2016-11-07 Thread Reza Arbab
Remove the check which prevents us from hotplugging into an empty node.

The original commit b226e4621245 ("[PATCH] powerpc: don't add memory to
empty node/zone"), states that this was intended to be a temporary measure.
It is a workaround for an oops which no longer occurs.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>
Acked-by: Balbir Singh <bsinghar...@gmail.com>
Cc: Nathan Fontenot <nf...@linux.vnet.ibm.com>
Cc: Bharata B Rao <bhar...@linux.vnet.ibm.com>
---
 arch/powerpc/mm/numa.c | 13 +
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index a51c188..0cb6bd8 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1085,7 +1085,7 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr)
 int hot_add_scn_to_nid(unsigned long scn_addr)
 {
struct device_node *memory = NULL;
-   int nid, found = 0;
+   int nid;
 
if (!numa_enabled || (min_common_depth < 0))
return first_online_node;
@@ -1101,17 +1101,6 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
if (nid < 0 || !node_online(nid))
nid = first_online_node;
 
-   if (NODE_DATA(nid)->node_spanned_pages)
-   return nid;
-
-   for_each_online_node(nid) {
-   if (NODE_DATA(nid)->node_spanned_pages) {
-   found = 1;
-   break;
-   }
-   }
-
-   BUG_ON(!found);
return nid;
 }
 
-- 
1.8.3.1



[PATCH v6 2/4] mm: remove x86-only restriction of movable_node

2016-11-07 Thread Reza Arbab
In commit c5320926e370 ("mem-hotplug: introduce movable_node boot
option"), the memblock allocation direction is changed to bottom-up and
then back to top-down like this:

1. memblock_set_bottom_up(true), called by cmdline_parse_movable_node().
2. memblock_set_bottom_up(false), called by x86's numa_init().

Even though (1) occurs in generic mm code, it is wrapped by #ifdef
CONFIG_MOVABLE_NODE, which depends on X86_64.

This means that when we extend CONFIG_MOVABLE_NODE to non-x86 arches,
things will be unbalanced. (1) will happen for them, but (2) will not.

This toggle was added in the first place because x86 has a delay between
adding memblocks and marking them as hotpluggable. Since other arches do
this marking either immediately or not at all, they do not require the
bottom-up toggle.

So, resolve things by moving (1) from cmdline_parse_movable_node() to
x86's setup_arch(), immediately after the movable_node parameter has
been parsed.

Signed-off-by: Reza Arbab <ar...@linux.vnet.ibm.com>
---
 Documentation/kernel-parameters.txt |  2 +-
 arch/x86/kernel/setup.c | 24 
 mm/memory_hotplug.c | 20 
 3 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 37babf9..adcccd5 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2401,7 +2401,7 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
that the amount of memory usable for all allocations
is not too small.
 
-   movable_node[KNL,X86] Boot-time switch to enable the effects
+   movable_node[KNL] Boot-time switch to enable the effects
of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
 
MTD_Partition=  [MTD]
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9c337b0..4cfba94 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -985,6 +985,30 @@ void __init setup_arch(char **cmdline_p)
 
parse_early_param();
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+   /*
+* Memory used by the kernel cannot be hot-removed because Linux
+* cannot migrate the kernel pages. When memory hotplug is
+* enabled, we should prevent memblock from allocating memory
+* for the kernel.
+*
+* ACPI SRAT records all hotpluggable memory ranges. But before
+* SRAT is parsed, we don't know about it.
+*
+* The kernel image is loaded into memory at very early time. We
+* cannot prevent this anyway. So on NUMA system, we set any
+* node the kernel resides in as un-hotpluggable.
+*
+* Since on modern servers, one node could have double-digit
+* gigabytes memory, we can assume the memory around the kernel
+* image is also un-hotpluggable. So before SRAT is parsed, just
+* allocate memory near the kernel image to try the best to keep
+* the kernel away from hotpluggable memory.
+*/
+   if (movable_node_is_enabled())
+   memblock_set_bottom_up(true);
+#endif
+
x86_report_nx();
 
/* after early param, so could get panic from serial */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index cad4b91..e43142c1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1727,26 +1727,6 @@ static bool can_offline_normal(struct zone *zone, 
unsigned long nr_pages)
 static int __init cmdline_parse_movable_node(char *p)
 {
 #ifdef CONFIG_MOVABLE_NODE
-   /*
-* Memory used by the kernel cannot be hot-removed because Linux
-* cannot migrate the kernel pages. When memory hotplug is
-* enabled, we should prevent memblock from allocating memory
-* for the kernel.
-*
-* ACPI SRAT records all hotpluggable memory ranges. But before
-* SRAT is parsed, we don't know about it.
-*
-* The kernel image is loaded into memory at very early time. We
-* cannot prevent this anyway. So on NUMA system, we set any
-* node the kernel resides in as un-hotpluggable.
-*
-* Since on modern servers, one node could have double-digit
-* gigabytes memory, we can assume the memory around the kernel
-* image is also un-hotpluggable. So before SRAT is parsed, just
-* allocate memory near the kernel image to try the best to keep
-* the kernel away from hotpluggable memory.
-*/
-   memblock_set_bottom_up(true);
movable_node_enabled = true;
 #else
pr_warn("movable_node option not supported\n");
-- 
1.8.3.1


