[PATCH] powerpc/powernv: Rename pe_level_printk to pe_printk and embed KERN_LEVEL in format

2019-06-20 Thread Joe Perches
Remove the separate KERN_ from each pe_level_printk and
instead add the KERN_ to the format.

pfix in pe_level_printk could also be used uninitialized so
add a new else and set pfix to the hex value of pe->flags.

Rename pe_level_printk to pe_printk and update the pe_
macros.

Signed-off-by: Joe Perches 
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 14 --
 arch/powerpc/platforms/powernv/pci.h  | 11 +--
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 10cc42b9e541..60fc36ae626a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -50,15 +50,23 @@
 static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
  "NPU_OCAPI" };
 
-void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
-   const char *fmt, ...)
+void pe_printk(const struct pnv_ioda_pe *pe, const char *fmt, ...)
 {
struct va_format vaf;
va_list args;
char pfix[32];
+   char level[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
 
va_start(args, fmt);
 
+   while (printk_get_level(fmt)) {
+   size_t size = printk_skip_level(fmt) - fmt;
+
+   memcpy(level, fmt,  size);
+   level[size] = '\0';
+   fmt += size;
+   }
+
vaf.fmt = fmt;
	vaf.va = &args;
 
@@ -74,6 +82,8 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char 
*level,
(pe->rid & 0xff00) >> 8,
PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
 #endif /* CONFIG_PCI_IOV*/
+   else
+   sprintf(pfix, "(flags: 0x%lx)", pe->flags);
 
printk("%spci %s: [PE# %.2x] %pV",
   level, pfix, pe->pe_number, &vaf);
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index be26ab3d99e0..870b21f55b3f 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -205,15 +205,14 @@ extern unsigned long pnv_pci_ioda2_get_table_size(__u32 
page_shift,
__u64 window_size, __u32 levels);
 extern int pnv_eeh_post_init(void);
 
-__printf(3, 4)
-extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
-   const char *fmt, ...);
+__printf(2, 3)
+extern void pe_printk(const struct pnv_ioda_pe *pe, const char *fmt, ...);
 #define pe_err(pe, fmt, ...)   \
-   pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
+   pe_printk(pe, KERN_ERR fmt, ##__VA_ARGS__)
 #define pe_warn(pe, fmt, ...)  \
-   pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
+   pe_printk(pe, KERN_WARNING fmt, ##__VA_ARGS__)
 #define pe_info(pe, fmt, ...)  \
-   pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
+   pe_printk(pe, KERN_INFO fmt, ##__VA_ARGS__)
 
 /* Nvlink functions */
 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);




[PATCH kernel] powerpc/of/pci: Rewrite pci_parse_of_flags

2019-06-20 Thread Alexey Kardashevskiy
The existing code uses bunch of hardcoded values from the PCI Bus Binding
to IEEE Std 1275 spec; and it does so in quite non-obvious way.

This defines fields from the cell#0 of the "reg" property of a PCI device
and uses them for parsing.

This should cause no behavioral change.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/kernel/pci_of_scan.c | 53 ++-
 1 file changed, 45 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/pci_of_scan.c 
b/arch/powerpc/kernel/pci_of_scan.c
index 8078bce89bec..fc55ee710eb3 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -42,13 +42,50 @@ unsigned int pci_parse_of_flags(u32 addr0, int bridge)
 {
unsigned int flags = 0;
 
-   if (addr0 & 0x0200) {
+/*
+ * PCI Bus Binding to IEEE Std 1275-1994
+ *
+ * Bit#3322  1100 
+ * 10987654 32109876 54321098 76543210
+ * phys.hi cell:   npt000ss  dfff 
+ * phys.mid cell:     
+ * phys.lo cell:      
+ *
+ * where:
+ * nis 0 if the address is relocatable, 1 otherwise
+ * pis 1 if the addressable region is "prefetchable", 0 otherwise
+ * tis 1 if the address is aliased (for non-relocatable I/O),
+ *  below 1 MB (for Memory),or below 64 KB (for relocatable I/O).
+ * ss   is the space code, denoting the address space
+ *  is the 8-bit Bus Number
+ * dis the 5-bit Device Number
+ * fff  is the 3-bit Function Number
+ *  is the 8-bit Register Number
+ */
+#define OF_PCI_ADDR0_SPACE_CODE(ss)(((ss)&0x3UL)<<24)
+#define OF_PCI_ADDR0_SPACE_CFG OF_PCI_ADDR0_SPACE_CODE(0)
+#define OF_PCI_ADDR0_SPACE_IO  OF_PCI_ADDR0_SPACE_CODE(1)
+#define OF_PCI_ADDR0_SPACE_MMIO32  OF_PCI_ADDR0_SPACE_CODE(2)
+#define OF_PCI_ADDR0_SPACE_MMIO64  OF_PCI_ADDR0_SPACE_CODE(3)
+#define OF_PCI_ADDR0_SPACE_MMIO(OF_PCI_ADDR0_SPACE_MMIO32 | \
+   OF_PCI_ADDR0_SPACE_MMIO64)
+#define OF_PCI_ADDR0_RELOC (1UL<<31)
+#define OF_PCI_ADDR0_PREFETCH  (1UL<<30)
+#define OF_PCI_ADDR0_ALIAS (1UL<<29)
+#define OF_PCI_ADDR0_BUS   0x00FFUL
+#define OF_PCI_ADDR0_DEV   0xF800UL
+#define OF_PCI_ADDR0_FN0x0700UL
+#define OF_PCI_ADDR0_BARREG0x00FFUL
+
+   if (addr0 & OF_PCI_ADDR0_SPACE_MMIO) {
flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY;
-   flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64;
-   if (flags & PCI_BASE_ADDRESS_MEM_TYPE_64)
-   flags |= IORESOURCE_MEM_64;
-   flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M;
-   if (addr0 & 0x4000)
+   if ((addr0 & OF_PCI_ADDR0_SPACE_MMIO) ==
+   OF_PCI_ADDR0_SPACE_MMIO64)
+   flags |= PCI_BASE_ADDRESS_MEM_TYPE_64 |
+   IORESOURCE_MEM_64;
+   if (addr0 & OF_PCI_ADDR0_ALIAS)
+   flags |= PCI_BASE_ADDRESS_MEM_TYPE_1M;
+   if (addr0 & OF_PCI_ADDR0_PREFETCH)
flags |= IORESOURCE_PREFETCH
 | PCI_BASE_ADDRESS_MEM_PREFETCH;
/* Note: We don't know whether the ROM has been left enabled
@@ -56,9 +93,9 @@ unsigned int pci_parse_of_flags(u32 addr0, int bridge)
 * not set the IORESOURCE_ROM_ENABLE flag) for now rather than
 * do a config space read, it will be force-enabled if needed
 */
-   if (!bridge && (addr0 & 0xff) == 0x30)
+   if (!bridge && (addr0 & OF_PCI_ADDR0_BARREG) == PCI_ROM_ADDRESS)
flags |= IORESOURCE_READONLY;
-   } else if (addr0 & 0x0100)
+   } else if (addr0 & OF_PCI_ADDR0_SPACE_IO)
flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO;
if (flags)
flags |= IORESOURCE_SIZEALIGN;
-- 
2.17.1



[PATCH 13/13] powerpc: add machine check safe copy_to_user

2019-06-20 Thread Santosh Sivaraj
Use  memcpy_mcsafe() implementation to define copy_to_user_mcsafe()

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/uaccess.h | 12 
 2 files changed, 13 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8c1c636308c8..a173b392c272 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -134,6 +134,7 @@ config PPC
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!RELOCATABLE && !HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE  if PPC64
+   select ARCH_HAS_UACCESS_MCSAFE  if PPC64
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 76f34346b642..f8fcaab4c5bc 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -386,6 +386,18 @@ static inline unsigned long raw_copy_to_user(void __user 
*to,
return ret;
 }
 
+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+   if (likely(check_copy_size(from, n, true))) {
+   allow_write_to_user(to, n);
+   n = memcpy_mcsafe(to, from, n);
+   prevent_write_to_user(to, n);
+   }
+
+   return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
-- 
2.20.1



[PATCH 12/13] powerpc/memcpy_mcsafe: return remaining bytes

2019-06-20 Thread Santosh Sivaraj
memcpy_mcsafe currently return -EFAULT on a machine check exception, change
it to return the remaining bytes that needs to be copied, so that machine
check safe copy_to_user can maintain the same behavior as copy_to_user.

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/lib/memcpy_mcsafe_64.S | 129 +++-
 1 file changed, 70 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
index 50f865db0338..566c664aa640 100644
--- a/arch/powerpc/lib/memcpy_mcsafe_64.S
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -30,11 +30,12 @@
ld  r14,STK_REG(R14)(r1)
addir1,r1,STACKFRAMESIZE
 .Ldo_err1:
-   li  r3,-EFAULT
+   mr  r3,r7
blr
 
 
 _GLOBAL(memcpy_mcsafe)
+   mr  r7,r5
cmpldi  r5,16
blt .Lshort_copy
 
@@ -49,18 +50,21 @@ err1;   lbz r0,0(r4)
addir4,r4,1
 err1;  stb r0,0(r3)
addir3,r3,1
+   subir7,r7,1
 
 1: bf  cr7*4+2,2f
 err1;  lhz r0,0(r4)
addir4,r4,2
 err1;  sth r0,0(r3)
addir3,r3,2
+   subir7,r7,2
 
 2: bf  cr7*4+1,3f
 err1;  lwz r0,0(r4)
addir4,r4,4
 err1;  stw r0,0(r3)
addir3,r3,4
+   subir7,r7,4
 
 3: sub r5,r5,r6
cmpldi  r5,128
@@ -87,43 +91,69 @@ err1;   stw r0,0(r3)
 4:
 err2;  ld  r0,0(r4)
 err2;  ld  r6,8(r4)
-err2;  ld  r7,16(r4)
-err2;  ld  r8,24(r4)
-err2;  ld  r9,32(r4)
-err2;  ld  r10,40(r4)
-err2;  ld  r11,48(r4)
-err2;  ld  r12,56(r4)
-err2;  ld  r14,64(r4)
-err2;  ld  r15,72(r4)
-err2;  ld  r16,80(r4)
-err2;  ld  r17,88(r4)
-err2;  ld  r18,96(r4)
-err2;  ld  r19,104(r4)
-err2;  ld  r20,112(r4)
-err2;  ld  r21,120(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+err2;  ld  r15,64(r4)
+err2;  ld  r16,72(r4)
+err2;  ld  r17,80(r4)
+err2;  ld  r18,88(r4)
+err2;  ld  r19,96(r4)
+err2;  ld  r20,104(r4)
+err2;  ld  r21,112(r4)
+err2;  ld  r22,120(r4)
addir4,r4,128
 err2;  std r0,0(r3)
 err2;  std r6,8(r3)
-err2;  std r7,16(r3)
-err2;  std r8,24(r3)
-err2;  std r9,32(r3)
-err2;  std r10,40(r3)
-err2;  std r11,48(r3)
-err2;  std r12,56(r3)
-err2;  std r14,64(r3)
-err2;  std r15,72(r3)
-err2;  std r16,80(r3)
-err2;  std r17,88(r3)
-err2;  std r18,96(r3)
-err2;  std r19,104(r3)
-err2;  std r20,112(r3)
-err2;  std r21,120(r3)
+err2;  std r8,16(r3)
+err2;  std r9,24(r3)
+err2;  std r10,32(r3)
+err2;  std r11,40(r3)
+err2;  std r12,48(r3)
+err2;  std r14,56(r3)
+err2;  std r15,64(r3)
+err2;  std r16,72(r3)
+err2;  std r17,80(r3)
+err2;  std r18,88(r3)
+err2;  std r19,96(r3)
+err2;  std r20,104(r3)
+err2;  std r21,112(r3)
+err2;  std r22,120(r3)
addir3,r3,128
+   subir7,r7,128
bdnz4b
 
clrldi  r5,r5,(64-7)
 
-   ld  r14,STK_REG(R14)(r1)
+   /* Up to 127B to go */
+5: srdir6,r5,4
+   mtocrf  0x01,r6
+
+6: bf  cr7*4+1,7f
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+   addir4,r4,64
+err2;  std r0,0(r3)
+err2;  std r6,8(r3)
+err2;  std r8,16(r3)
+err2;  std r9,24(r3)
+err2;  std r10,32(r3)
+err2;  std r11,40(r3)
+err2;  std r12,48(r3)
+err2;  std r14,56(r3)
+   addir3,r3,64
+   subir7,r7,64
+
+7: ld  r14,STK_REG(R14)(r1)
ld  r15,STK_REG(R15)(r1)
ld  r16,STK_REG(R16)(r1)
ld  r17,STK_REG(R17)(r1)
@@ -134,42 +164,19 @@ err2; std r21,120(r3)
ld  r22,STK_REG(R22)(r1)
addir1,r1,STACKFRAMESIZE
 
-   /* Up to 127B to go */
-5: srdir6,r5,4
-   mtocrf  0x01,r6
-
-6: bf  cr7*4+1,7f
-err1;  ld  r0,0(r4)
-err1;  ld  r6,8(r4)
-err1;  ld  r7,16(r4)
-err1;  ld  r8,24(r4)
-err1;  ld  r9,32(r4)
-err1;  ld  r10,40(r4)
-err1;  ld  r11,48(r4)
-err1;  ld  r12,56(r4)
-   addir4,r4,64
-err1;  std r0,0(r3)
-err1;  std r6,8(r3)
-err1;  std r7,16(r3)
-err1;  std r8,24(r3)
-err1;  std r9,32(r3)
-err1;  std r10,40(r3)
-err1;  std r11,48(r3)
-err1;  std r12,56(r3)
-   addir3,r3,64
-
/* Up to 63B to go */
-7: bf  cr7*4+2,8f
+   bf  cr7*4+2,8f
 err1;  ld  r0,0(r4)
 err1;  ld  r6,8(r4)
-err1;  ld  r7,16(r4)
-err1;  ld  r8,24(r4)
+err1;  ld  r8,16(r4)
+err1;  ld  r9,24(r4)
addir4,r4,32
 err1;  std r0,0(r3)
 err1;  std r6,8(r3)
-err1;  std r7,16(r3)

[PATCH 11/13] powerpc/64s: Save r13 in machine_check_common_early

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Testing my memcpy_mcsafe() work in progress with an injected UE, I get
an error like this immediately after the function returns:

BUG: Unable to handle kernel data access at 0x7fff84dec8f8
Faulting instruction address: 0xc008009c00b0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in: mce(O+) vmx_crypto crc32c_vpmsum
CPU: 0 PID: 1375 Comm: modprobe Tainted: G   O  5.1.0-rc6 #267
NIP:  c008009c00b0 LR: c008009c00a8 CTR: c0095f90
REGS: c000ee197790 TRAP: 0300   Tainted: G   O   (5.1.0-rc6)
MSR:  9280b033   CR: 88002826  XER: 
0004
CFAR: c0095f8c DAR: 7fff84dec8f8 DSISR: 4000 IRQMASK: 0
GPR00: 6c6c6568 c000ee197a20 c008009c8400 fff2
GPR04: c008009c02e0 0006  c3c834c8
GPR08: 0080 776a6681b7fb5100  c008009c01c8
GPR12: c0095f90 7fff84debc00 4d071440 
GPR16: 00010601 c008009e c0c98dd8 c0c98d98
GPR20: c3bba970 c008009c04d0 c008009c0618 c01e5820
GPR24:  0100 0001 c3bba958
GPR28: c008009c02e8 c008009c0318 c008009c02e0 
NIP [c008009c00b0] cause_ue+0xa8/0xe8 [mce]
LR [c008009c00a8] cause_ue+0xa0/0xe8 [mce]

To fix, ensure that r13 is properly restored after an MCE.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/exceptions-64s.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 311f1392a2ec..932d8d05892c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -265,6 +265,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 EXC_REAL_END(machine_check, 0x200, 0x100)
 EXC_VIRT_NONE(0x4200, 0x100)
 TRAMP_REAL_BEGIN(machine_check_common_early)
+   SET_SCRATCH0(r13)   /* save r13 */
EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
/*
 * Register contents:
-- 
2.20.1



[PATCH 10/13] powerpc/mce: Enable MCE notifiers in external modules

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/exceptions-64s.S | 6 ++
 arch/powerpc/kernel/mce.c| 2 ++
 2 files changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index c83e38a403fd..311f1392a2ec 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -458,6 +458,12 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
bl  machine_check_early
std r3,RESULT(r1)   /* Save result */
 
+   /* Notifiers may be in a module, so enable virtual addressing. */
+   mfmsr   r11
+   ori r11,r11,MSR_IR
+   ori r11,r11,MSR_DR
+   mtmsr   r11
+
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_notify
ld  r11,RESULT(r1)
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 9cb5a731377b..413f7866a9c4 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -50,11 +50,13 @@ int mce_register_notifier(struct notifier_block *nb)
 {
	return blocking_notifier_chain_register(&mce_notifier_list, nb);
 }
+EXPORT_SYMBOL_GPL(mce_register_notifier);
 
 int mce_unregister_notifier(struct notifier_block *nb)
 {
	return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
 }
+EXPORT_SYMBOL_GPL(mce_unregister_notifier);
 
 static int check_memcpy_mcsafe(struct notifier_block *nb, unsigned long val,
   void *data)
-- 
2.20.1



[PATCH 09/13] powerpc/mce: Handle memcpy_mcsafe()

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Add an mce notifier intended to service memcpy_mcsafe().

The notifier uses this heuristic; if a UE occurs when accessing device
memory, and the faulting instruction had a fixup entry, the callback
will return NOTIFY_STOP.

This causes the notification mechanism to consider the MCE handled and
continue execution at the fixup address, which returns -EFAULT from the
memcpy_mcsafe() call.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/mce.c | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 8afda1ab7358..9cb5a731377b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -56,6 +56,40 @@ int mce_unregister_notifier(struct notifier_block *nb)
return blocking_notifier_chain_unregister(_notifier_list, nb);
 }
 
+static int check_memcpy_mcsafe(struct notifier_block *nb, unsigned long val,
+  void *data)
+{
+   struct machine_check_event *evt = data;
+   unsigned long pfn;
+   struct page *page;
+
+   if (evt->error_type != MCE_ERROR_TYPE_UE ||
+   !evt->u.ue_error.physical_address_provided)
+   return NOTIFY_DONE;
+
+   pfn = evt->u.ue_error.physical_address >> PAGE_SHIFT;
+   page = pfn_to_page(pfn);
+   if (!page)
+   return NOTIFY_DONE;
+
+   /* HMM and PMEM */
+   if (is_zone_device_page(page) && evt->u.ue_error.fixup_address_provided)
+   return NOTIFY_STOP;
+
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block memcpy_mcsafe_nb = {
+   .notifier_call = check_memcpy_mcsafe
+};
+
+static int __init mce_mcsafe_register(void)
+{
+   mce_register_notifier(&memcpy_mcsafe_nb);
+   return 0;
+}
+arch_initcall(mce_mcsafe_register);
+
 static void mce_set_error_info(struct machine_check_event *mce,
   struct mce_error_info *mce_err)
 {
-- 
2.20.1



[PATCH 08/13] powerpc/memcpy: Add memcpy_mcsafe for pmem

2019-06-20 Thread Santosh Sivaraj
From: Balbir Singh 

The pmem infrastructure uses memcpy_mcsafe in the pmem
layer so as to convert machine check exceptions into
a return value on failure in case a machine check
exception is encountered during the memcpy.

This patch largely borrows from the copyuser_power7
logic and does not add the VMX optimizations, largely
to keep the patch simple. If needed those optimizations
can be folded in.

Signed-off-by: Balbir Singh 
Acked-by: Nicholas Piggin 
[ar...@linux.ibm.com: Added symbol export]
---
 arch/powerpc/include/asm/string.h   |   2 +
 arch/powerpc/lib/Makefile   |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S | 215 
 3 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t 
n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE
 
+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index c55f9c27bf79..529d6536eb4a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
copypage_power7.o \
   memcpy_power7.o
 
 obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-  memcpy_64.o pmem.o
+  memcpy_64.o pmem.o memcpy_mcsafe_64.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index ..50f865db0338
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) IBM Corporation, 2011
+ * Derived from copyuser_power7.s by Anton Blanchard 
+ * Author - Balbir Singh 
+ */
+#include 
+#include 
+#include 
+
+   .macro err1
+100:
+   EX_TABLE(100b,.Ldo_err1)
+   .endm
+
+   .macro err2
+200:
+   EX_TABLE(200b,.Ldo_err2)
+   .endm
+
+.Ldo_err2:
+   ld  r22,STK_REG(R22)(r1)
+   ld  r21,STK_REG(R21)(r1)
+   ld  r20,STK_REG(R20)(r1)
+   ld  r19,STK_REG(R19)(r1)
+   ld  r18,STK_REG(R18)(r1)
+   ld  r17,STK_REG(R17)(r1)
+   ld  r16,STK_REG(R16)(r1)
+   ld  r15,STK_REG(R15)(r1)
+   ld  r14,STK_REG(R14)(r1)
+   addir1,r1,STACKFRAMESIZE
+.Ldo_err1:
+   li  r3,-EFAULT
+   blr
+
+
+_GLOBAL(memcpy_mcsafe)
+   cmpldi  r5,16
+   blt .Lshort_copy
+
+.Lcopy:
+   /* Get the source 8B aligned */
+   neg r6,r4
+   mtocrf  0x01,r6
+   clrldi  r6,r6,(64-3)
+
+   bf  cr7*4+3,1f
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+
+1: bf  cr7*4+2,2f
+err1;  lhz r0,0(r4)
+   addir4,r4,2
+err1;  sth r0,0(r3)
+   addir3,r3,2
+
+2: bf  cr7*4+1,3f
+err1;  lwz r0,0(r4)
+   addir4,r4,4
+err1;  stw r0,0(r3)
+   addir3,r3,4
+
+3: sub r5,r5,r6
+   cmpldi  r5,128
+   blt 5f
+
+   mflrr0
+   stdur1,-STACKFRAMESIZE(r1)
+   std r14,STK_REG(R14)(r1)
+   std r15,STK_REG(R15)(r1)
+   std r16,STK_REG(R16)(r1)
+   std r17,STK_REG(R17)(r1)
+   std r18,STK_REG(R18)(r1)
+   std r19,STK_REG(R19)(r1)
+   std r20,STK_REG(R20)(r1)
+   std r21,STK_REG(R21)(r1)
+   std r22,STK_REG(R22)(r1)
+   std r0,STACKFRAMESIZE+16(r1)
+
+   srdir6,r5,7
+   mtctr   r6
+
+   /* Now do cacheline (128B) sized loads and stores. */
+   .align  5
+4:
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r7,16(r4)
+err2;  ld  r8,24(r4)
+err2;  ld  r9,32(r4)
+err2;  ld  r10,40(r4)
+err2;  ld  r11,48(r4)
+err2;  ld  r12,56(r4)
+err2;  ld  r14,64(r4)
+err2;  ld  r15,72(r4)
+err2;  ld  r16,80(r4)
+err2;  ld  r17,88(r4)
+err2;  ld  r18,96(r4)
+err2;  ld  r19,104(r4)
+err2;  ld  r20,112(r4)
+err2;  ld  r21,120(r4)
+   addir4,r4,128
+err2;  std r0,0(r3)
+err2;  std r6,8(r3)
+err2;  std r7,16(r3)
+err2;  std r8,24(r3)
+err2;  std r9,32(r3)
+err2;  std r10,40(r3)
+err2;  std r11,48(r3)
+err2;  std r12,56(r3)
+err2;  std r14,64(r3)
+err2;  std r15,72(r3)
+err2;  std r16,80(r3)
+err2;  std 

[PATCH 07/13] powerpc/mce: Add fixup address to UE events

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

If the instruction causing a UE has an exception table entry with fixup
address, save it in the machine_check_event struct.

If a machine check notifier callback returns NOTIFY_STOP to indicate it
has handled the error, set nip to continue execution from the fixup
address.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/mce.h |  5 +++--
 arch/powerpc/kernel/mce.c  | 16 +++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 240dd1fdfe35..9d9661747adf 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,11 +122,12 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
+   u8  fixup_address_provided;
u8  process_event;
-   u8  reserved_1[4];
+   u8  reserved_1[3];
u64 effective_address;
u64 physical_address;
-   u8  reserved_2[8];
+   u64 fixup_address;
} ue_error;
 
struct {
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 2616f1f71734..8afda1ab7358 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -15,10 +15,12 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
+#include 
 
 static DEFINE_PER_CPU(int, mce_nest_count);
 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
@@ -151,6 +153,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.link_error.effective_address_provided = true;
mce->u.link_error.effective_address = addr;
} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
+   const struct exception_table_entry *entry;
+
mce->u.ue_error.effective_address_provided = true;
mce->u.ue_error.effective_address = addr;
if (phys_addr != ULONG_MAX) {
@@ -158,6 +162,12 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.ue_error.physical_address = phys_addr;
}
 
+   entry = search_exception_tables(regs->nip);
+   if (entry) {
+   mce->u.ue_error.fixup_address_provided = true;
+   mce->u.ue_error.fixup_address = extable_fixup(entry);
+   }
+
mce->u.ue_error.process_event = true;
}
return;
@@ -666,8 +676,12 @@ long machine_check_notify(struct pt_regs *regs)
 
	rc = blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
if (rc & NOTIFY_STOP_MASK) {
-   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   if (evt->error_type == MCE_ERROR_TYPE_UE) {
+   if (evt->u.ue_error.fixup_address_provided)
+   regs->nip = evt->u.ue_error.fixup_address;
+
evt->u.ue_error.process_event = false;
+   }
 
evt->disposition = MCE_DISPOSITION_RECOVERED;
regs->msr |= MSR_RI;
-- 
2.20.1



[PATCH 06/13] powerpc/mce: Do not process notifier-handled UE events

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/mce.h | 3 ++-
 arch/powerpc/kernel/mce.c  | 9 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 948bef579086..240dd1fdfe35 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,7 +122,8 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
-   u8  reserved_1[5];
+   u8  process_event;
+   u8  reserved_1[4];
u64 effective_address;
u64 physical_address;
u8  reserved_2[8];
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 912efe58e0b1..2616f1f71734 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -157,6 +157,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
}
+
+   mce->u.ue_error.process_event = true;
}
return;
 }
@@ -241,6 +243,10 @@ void machine_check_queue_event(void)
	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;
 
+   if (evt.error_type == MCE_ERROR_TYPE_UE &&
+   !evt.u.ue_error.process_event)
+   return;
+
index = __this_cpu_inc_return(mce_queue_count) - 1;
/* If queue is full, just return for now. */
if (index >= MAX_MC_EVT) {
@@ -660,6 +666,9 @@ long machine_check_notify(struct pt_regs *regs)
 
	rc = blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
if (rc & NOTIFY_STOP_MASK) {
+   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   evt->u.ue_error.process_event = false;
+
evt->disposition = MCE_DISPOSITION_RECOVERED;
regs->msr |= MSR_RI;
 
-- 
2.20.1



[PATCH 05/13] powerpc/mce: Allow notifier callback to handle MCE

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

If a notifier returns NOTIFY_STOP, consider the MCE handled, just as we
do when machine_check_early() returns 1.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/asm-prototypes.h |  2 +-
 arch/powerpc/kernel/exceptions-64s.S  |  3 +++
 arch/powerpc/kernel/mce.c | 28 ---
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index f66f26ef3ce0..49ee8f08de2a 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -72,7 +72,7 @@ void machine_check_exception(struct pt_regs *regs);
 void emulation_assist_interrupt(struct pt_regs *regs);
 long do_slb_fault(struct pt_regs *regs, unsigned long ea);
 void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err);
-void machine_check_notify(struct pt_regs *regs);
+long machine_check_notify(struct pt_regs *regs);
 
 /* signals, syscalls and interrupts */
 long sys_swapcontext(struct ucontext __user *old_ctx,
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 2e56014fca21..c83e38a403fd 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -460,6 +460,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_notify
+   ld  r11,RESULT(r1)
+   or  r3,r3,r11
+   std r3,RESULT(r1)
 
ld  r12,_MSR(r1)
 BEGIN_FTR_SECTION
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 0ab171b41ede..912efe58e0b1 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -647,16 +647,28 @@ long hmi_exception_realmode(struct pt_regs *regs)
return 1;
 }
 
-void machine_check_notify(struct pt_regs *regs)
+long machine_check_notify(struct pt_regs *regs)
 {
-   struct machine_check_event evt;
+   int index = __this_cpu_read(mce_nest_count) - 1;
+   struct machine_check_event *evt;
+   int rc;
 
-   if (!get_mce_event(&evt, MCE_EVENT_DONTRELEASE))
-   return;
+   if (index < 0 || index >= MAX_MC_EVT)
+   return 0;
+
+   evt = this_cpu_ptr(&mce_event[index]);
 
-   blocking_notifier_call_chain(&mce_notifier_list, 0, &evt);
+   rc = blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
+   if (rc & NOTIFY_STOP_MASK) {
+   evt->disposition = MCE_DISPOSITION_RECOVERED;
+   regs->msr |= MSR_RI;
 
-   if (evt.error_type == MCE_ERROR_TYPE_UE &&
-   evt.u.ue_error.physical_address_provided)
-   machine_check_ue_event(&evt);
+   return 1;
+   }
+
+   if (evt->error_type == MCE_ERROR_TYPE_UE &&
+   evt->u.ue_error.physical_address_provided)
+   machine_check_ue_event(evt);
+
+   return 0;
 }
-- 
2.20.1



[PATCH 04/13] powerpc/mce: Move machine_check_ue_event() call

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Move the call site of machine_check_ue_event() slightly later in the MCE
codepath. No functional change intended--this is prep for a later patch
to conditionally skip the call.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/mce.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 24d350a934e4..0ab171b41ede 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -156,7 +156,6 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   machine_check_ue_event(mce);
}
}
return;
@@ -656,4 +655,8 @@ void machine_check_notify(struct pt_regs *regs)
return;
 
	blocking_notifier_call_chain(&mce_notifier_list, 0, &evt);
+
+   if (evt.error_type == MCE_ERROR_TYPE_UE &&
+   evt.u.ue_error.physical_address_provided)
+   machine_check_ue_event(&evt);
 }
-- 
2.20.1



[PATCH 03/13] powerpc/mce: Add MCE notification chain

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/asm-prototypes.h |  1 +
 arch/powerpc/include/asm/mce.h|  4 
 arch/powerpc/kernel/exceptions-64s.S  |  4 
 arch/powerpc/kernel/mce.c | 22 ++
 4 files changed, 31 insertions(+)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index ec1c97a8e8cb..f66f26ef3ce0 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -72,6 +72,7 @@ void machine_check_exception(struct pt_regs *regs);
 void emulation_assist_interrupt(struct pt_regs *regs);
 long do_slb_fault(struct pt_regs *regs, unsigned long ea);
 void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err);
+void machine_check_notify(struct pt_regs *regs);
 
 /* signals, syscalls and interrupts */
 long sys_swapcontext(struct ucontext __user *old_ctx,
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 94888a7025b3..948bef579086 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -214,4 +214,8 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr,
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
+
+int mce_register_notifier(struct notifier_block *nb);
+int mce_unregister_notifier(struct notifier_block *nb);
+
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 6b86055e5251..2e56014fca21 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -457,6 +457,10 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_early
std r3,RESULT(r1)   /* Save result */
+
+   addir3,r1,STACK_FRAME_OVERHEAD
+   bl  machine_check_notify
+
ld  r12,_MSR(r1)
 BEGIN_FTR_SECTION
b   4f
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index e78c4f18ea0a..24d350a934e4 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -42,6 +42,18 @@ static struct irq_work mce_event_process_work = {
 
 DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
+static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
+
+int mce_register_notifier(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_register(&mce_notifier_list, nb);
+}
+
+int mce_unregister_notifier(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
+}
+
 static void mce_set_error_info(struct machine_check_event *mce,
   struct mce_error_info *mce_err)
 {
@@ -635,3 +647,13 @@ long hmi_exception_realmode(struct pt_regs *regs)
 
return 1;
 }
+
+void machine_check_notify(struct pt_regs *regs)
+{
+   struct machine_check_event evt;
+
+   if (!get_mce_event(&evt, MCE_EVENT_DONTRELEASE))
+   return;
+
+   blocking_notifier_call_chain(&mce_notifier_list, 0, &evt);
+}
-- 
2.20.1



[PATCH 02/13] powerpc/mce: Bug fixes for MCE handling in kernel space

2019-06-20 Thread Santosh Sivaraj
From: Balbir Singh 

The code currently assumes PAGE_SHIFT as the shift value of
the pfn, this works correctly (mostly) for user space pages,
but the correct thing to do is

1. Extract the shift value returned via the pte-walk API's
2. Use the shift value to access the instruction address.

Note, the final physical address still use PAGE_SHIFT for
computation. handle_ierror() is not modified and handle_derror()
is modified just for extracting the correct instruction
address.

This is largely due to __find_linux_pte() returning pfn's
shifted by pdshift. The code is much more generic and can
handle shift values returned.

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Fixup pseries_do_memory_failure()]
---
 arch/powerpc/include/asm/mce.h   |  3 ++-
 arch/powerpc/kernel/mce_power.c  | 26 --
 arch/powerpc/platforms/pseries/ras.c |  6 --
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..94888a7025b3 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -209,7 +209,8 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index e39536aad30d..04666c0b40a8 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -23,7 +23,8 @@
  * Convert an address related to an mm to a PFN. NOTE: we are in real
  * mode, we could potentially race with page table updates.
  */
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift)
 {
pte_t *ptep;
unsigned long flags;
@@ -36,13 +37,15 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr)
 
local_irq_save(flags);
if (mm == current->mm)
-   ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
+   ptep = find_current_mm_pte(mm->pgd, addr, NULL, shift);
else
-   ptep = find_init_mm_pte(addr, NULL);
+   ptep = find_init_mm_pte(addr, shift);
local_irq_restore(flags);
if (!ptep || pte_special(*ptep))
return ULONG_MAX;
-   return pte_pfn(*ptep);
+   if (!*shift)
+   *shift = PAGE_SHIFT;
+   return (pte_val(*ptep) & PTE_RPN_MASK) >> *shift;
 }
 
 /* flush SLBs and reload */
@@ -358,15 +361,16 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs 
*regs, uint64_t *addr,
unsigned long pfn, instr_addr;
struct instruction_op op;
struct pt_regs tmp = *regs;
+   unsigned int shift;
 
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip, &shift);
if (pfn != ULONG_MAX) {
-   instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+   instr_addr = (pfn << shift) + (regs->nip & ((1 << shift) - 1));
instr = *(unsigned int *)(instr_addr);
if (!analyse_instr(, , instr)) {
-   pfn = addr_to_pfn(regs, op.ea);
+   pfn = addr_to_pfn(regs, op.ea, &shift);
*addr = op.ea;
-   *phys_addr = (pfn << PAGE_SHIFT);
+   *phys_addr = (pfn << shift);
return 0;
}
/*
@@ -442,12 +446,14 @@ static int mce_handle_ierror(struct pt_regs *regs,
if (mce_err->sync_error &&
table[i].error_type == MCE_ERROR_TYPE_UE) {
unsigned long pfn;
+   unsigned int shift;
 
if (get_paca()->in_mce < MAX_MCE_DEPTH) {
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip,
+ &shift);
if (pfn != ULONG_MAX) {
*phys_addr =
-   (pfn << PAGE_SHIFT);
+   (pfn << shift);
}
}
}
diff --git 

[PATCH 00/13] powerpc: implement machine check safe memcpy

2019-06-20 Thread Santosh Sivaraj
During a memcpy from a pmem device, if a machine check exception is
generated we end up in a panic. In case of fsdax read, this should
only result in a -EIO. Avoid MCE by implementing memcpy_mcsafe.

Before this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[ 7621.714094] Disabling lock debugging due to kernel taint
[ 7621.714099] MCE: CPU0: machine check (Severe) Host UE Load/Store [Not 
recovered]
[ 7621.714104] MCE: CPU0: NIP: [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714107] MCE: CPU0: Hardware error
[ 7621.714112] opal: Hardware platform error: Unrecoverable Machine Check 
exception
[ 7621.714118] CPU: 0 PID: 1368 Comm: mount Tainted: G   M  
5.2.0-rc5-00239-g241e39004581 #50
[ 7621.714123] NIP:  c0088978 LR: c08e16f8 CTR: 01de
[ 7621.714129] REGS: c000fffbfd70 TRAP: 0200   Tainted: G   M   
(5.2.0-rc5-00239-g241e39004581)
[ 7621.714131] MSR:  92209033   CR: 
24428840  XER: 0004
[ 7621.714160] CFAR: c00889a8 DAR: deadbeefdeadbeef DSISR: 8000 
IRQMASK: 0
[ 7621.714171] GPR00: 0e00 c000f0b8b1e0 c12cf100 
c000ed8e1100 
[ 7621.714186] GPR04: c2001100 0001 0200 
03fff1272000 
[ 7621.714201] GPR08: 8000 0010 0020 
0030 
[ 7621.714216] GPR12: 0040 7fffb8c6d390 0050 
0060 
[ 7621.714232] GPR16: 0070  0001 
c000f0b8b960 
[ 7621.714247] GPR20: 0001 c000f0b8b940 0001 
0001 
[ 7621.714262] GPR24: c1382560 c00c003b6380 c00c003b6380 
0001 
[ 7621.714277] GPR28:  0001 c200 
0001 
[ 7621.714294] NIP [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714298] LR [c08e16f8] pmem_do_bvec+0xf8/0x430
...  ...
```

After this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[25302.883978] Buffer I/O error on dev pmem0, logical block 0, async page read
[25303.020816] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.021236] EXT4-fs (pmem0): Can't read superblock on 2nd try
[25303.152515] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.284031] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25304.084100] UDF-fs: bad mount option "dax" or missing value
mount: /mnt/pmem: wrong fs type, bad option, bad superblock on /dev/pmem0, 
missing codepage or helper program, or other error.
```

MCE is injected on a pmem address using mambo.
---

Balbir Singh (2):
  powerpc/mce: Bug fixes for MCE handling in kernel space
  powerpc/memcpy: Add memcpy_mcsafe for pmem

Reza Arbab (9):
  powerpc/mce: Make machine_check_ue_event() static
  powerpc/mce: Add MCE notification chain
  powerpc/mce: Move machine_check_ue_event() call
  powerpc/mce: Allow notifier callback to handle MCE
  powerpc/mce: Do not process notifier-handled UE events
  powerpc/mce: Add fixup address to UE events
  powerpc/mce: Handle memcpy_mcsafe()
  powerpc/mce: Enable MCE notifiers in external modules
  powerpc/64s: Save r13 in machine_check_common_early

Santosh Sivaraj (2):
  powerpc/memcpy_mcsafe: return remaining bytes
  powerpc: add machine check safe copy_to_user

 arch/powerpc/Kconfig  |   1 +
 arch/powerpc/include/asm/asm-prototypes.h |   1 +
 arch/powerpc/include/asm/mce.h|  13 +-
 arch/powerpc/include/asm/string.h |   2 +
 arch/powerpc/include/asm/uaccess.h|  12 ++
 arch/powerpc/kernel/exceptions-64s.S  |  14 ++
 arch/powerpc/kernel/mce.c | 102 +-
 arch/powerpc/kernel/mce_power.c   |  26 ++-
 arch/powerpc/lib/Makefile |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S   | 226 ++
 arch/powerpc/platforms/pseries/ras.c  |   6 +-
 11 files changed, 386 insertions(+), 19 deletions(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

-- 
2.20.1



[PATCH 01/13] powerpc/mce: Make machine_check_ue_event() static

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..e78c4f18ea0a 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,7 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -203,7 +203,7 @@ void release_mce_event(void)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
int index;
 
-- 
2.20.1



Re: [PATCH 2/4] powerpc/powernv: remove the unused tunneling exports

2019-06-20 Thread Oliver O'Halloran
On Thu, May 23, 2019 at 5:51 PM Christoph Hellwig  wrote:
>
> These have been unused ever since they've been added to the kernel.
>
> Signed-off-by: Christoph Hellwig 
> ---
>  arch/powerpc/include/asm/pnv-pci.h|  4 --
>  arch/powerpc/platforms/powernv/pci-ioda.c |  4 +-
>  arch/powerpc/platforms/powernv/pci.c  | 71 ---
>  arch/powerpc/platforms/powernv/pci.h  |  1 -
>  4 files changed, 3 insertions(+), 77 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/pnv-pci.h 
> b/arch/powerpc/include/asm/pnv-pci.h
> index 9fcb0bc462c6..1ab4b0111abc 100644
> --- a/arch/powerpc/include/asm/pnv-pci.h
> +++ b/arch/powerpc/include/asm/pnv-pci.h
> @@ -27,12 +27,8 @@ extern int pnv_pci_get_power_state(uint64_t id, uint8_t 
> *state);
>  extern int pnv_pci_set_power_state(uint64_t id, uint8_t state,
>struct opal_msg *msg);
>
> -extern int pnv_pci_enable_tunnel(struct pci_dev *dev, uint64_t *asnind);
> -extern int pnv_pci_disable_tunnel(struct pci_dev *dev);
>  extern int pnv_pci_set_tunnel_bar(struct pci_dev *dev, uint64_t addr,
>   int enable);
> -extern int pnv_pci_get_as_notify_info(struct task_struct *task, u32 *lpid,
> - u32 *pid, u32 *tid);

IIRC as-notify was for CAPI which has an in-tree driver (cxl). Fred or
Andrew (+cc), what's going on with this? Will it ever see the light of
day?

>  int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode);
>  int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
>unsigned int virq);
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 126602b4e399..6b0caa2d0425 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -54,6 +54,8 @@
>  static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
>   "NPU_OCAPI" };
>
> +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
> +
>  void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
> const char *fmt, ...)
>  {
> @@ -2360,7 +2362,7 @@ static long pnv_pci_ioda2_set_window(struct 
> iommu_table_group *table_group,
> return 0;
>  }
>
> -void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
> +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
>  {
> uint16_t window_id = (pe->pe_number << 1 ) + 1;
> int64_t rc;
> diff --git a/arch/powerpc/platforms/powernv/pci.c 
> b/arch/powerpc/platforms/powernv/pci.c
> index 8d28f2932c3b..fc69f5611020 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -868,54 +868,6 @@ struct device_node *pnv_pci_get_phb_node(struct pci_dev 
> *dev)
>  }
>  EXPORT_SYMBOL(pnv_pci_get_phb_node);
>
> -int pnv_pci_enable_tunnel(struct pci_dev *dev, u64 *asnind)
> -{
> -   struct device_node *np;
> -   const __be32 *prop;
> -   struct pnv_ioda_pe *pe;
> -   uint16_t window_id;
> -   int rc;
> -
> -   if (!radix_enabled())
> -   return -ENXIO;
> -
> -   if (!(np = pnv_pci_get_phb_node(dev)))
> -   return -ENXIO;
> -
> -   prop = of_get_property(np, "ibm,phb-indications", NULL);
> -   of_node_put(np);
> -
> -   if (!prop || !prop[1])
> -   return -ENXIO;
> -
> -   *asnind = (u64)be32_to_cpu(prop[1]);
> -   pe = pnv_ioda_get_pe(dev);
> -   if (!pe)
> -   return -ENODEV;
> -
> -   /* Increase real window size to accept as_notify messages. */
> -   window_id = (pe->pe_number << 1 ) + 1;
> -   rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, pe->pe_number,
> -window_id, pe->tce_bypass_base,
> -(uint64_t)1 << 48);
> -   return opal_error_code(rc);
> -}
> -EXPORT_SYMBOL_GPL(pnv_pci_enable_tunnel);
> -
> -int pnv_pci_disable_tunnel(struct pci_dev *dev)
> -{
> -   struct pnv_ioda_pe *pe;
> -
> -   pe = pnv_ioda_get_pe(dev);
> -   if (!pe)
> -   return -ENODEV;
> -
> -   /* Restore default real window size. */
> -   pnv_pci_ioda2_set_bypass(pe, true);
> -   return 0;
> -}
> -EXPORT_SYMBOL_GPL(pnv_pci_disable_tunnel);
> -
>  int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
>  {
> __be64 val;
> @@ -970,29 +922,6 @@ int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 
> addr, int enable)
>  }
>  EXPORT_SYMBOL_GPL(pnv_pci_set_tunnel_bar);
>
> -#ifdef CONFIG_PPC64/* for thread.tidr */
> -int pnv_pci_get_as_notify_info(struct task_struct *task, u32 *lpid, u32 *pid,
> -  u32 *tid)
> -{
> -   struct mm_struct *mm = NULL;
> -
> -   if (task == NULL)
> -   return -EINVAL;
> -
> -   

Re: [PATCH 4/4] powerpc/powernv: remove the unused vas_win_paste_addr and vas_win_id functions

2019-06-20 Thread Oliver O'Halloran
On Thu, May 23, 2019 at 5:56 PM Christoph Hellwig  wrote:
>
> These two function have never been used since they were added to the
> kernel.
>
> Signed-off-by: Christoph Hellwig 
> ---
>  arch/powerpc/include/asm/vas.h  | 10 --
>  arch/powerpc/platforms/powernv/vas-window.c | 19 ---
>  arch/powerpc/platforms/powernv/vas.h| 20 
>  3 files changed, 49 deletions(-)

Sukadev (+cc), what's the reason this is not being used?

IIRC the VAS hardware on P9 had some issues, but I don't know any of
the details.

> diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
> index 771456227496..9b5b7261df7b 100644
> --- a/arch/powerpc/include/asm/vas.h
> +++ b/arch/powerpc/include/asm/vas.h
> @@ -167,14 +167,4 @@ int vas_copy_crb(void *crb, int offset);
>   */
>  int vas_paste_crb(struct vas_window *win, int offset, bool re);
>
> -/*
> - * Return a system-wide unique id for the VAS window @win.
> - */
> -extern u32 vas_win_id(struct vas_window *win);
> -
> -/*
> - * Return the power bus paste address associated with @win so the caller
> - * can map that address into their address space.
> - */
> -extern u64 vas_win_paste_addr(struct vas_window *win);
>  #endif /* __ASM_POWERPC_VAS_H */
> diff --git a/arch/powerpc/platforms/powernv/vas-window.c 
> b/arch/powerpc/platforms/powernv/vas-window.c
> index e59e0e60e5b5..e48c44cb3a16 100644
> --- a/arch/powerpc/platforms/powernv/vas-window.c
> +++ b/arch/powerpc/platforms/powernv/vas-window.c
> @@ -44,16 +44,6 @@ static void compute_paste_address(struct vas_window 
> *window, u64 *addr, int *len
> pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
>  }
>
> -u64 vas_win_paste_addr(struct vas_window *win)
> -{
> -   u64 addr;
> -
> -   compute_paste_address(win, &addr, NULL);
> -
> -   return addr;
> -}
> -EXPORT_SYMBOL(vas_win_paste_addr);
> -
>  static inline void get_hvwc_mmio_bar(struct vas_window *window,
> u64 *start, int *len)
>  {
> @@ -1268,12 +1258,3 @@ int vas_win_close(struct vas_window *window)
> return 0;
>  }
>  EXPORT_SYMBOL_GPL(vas_win_close);
> -
> -/*
> - * Return a system-wide unique window id for the window @win.
> - */
> -u32 vas_win_id(struct vas_window *win)
> -{
> -   return encode_pswid(win->vinst->vas_id, win->winid);
> -}
> -EXPORT_SYMBOL_GPL(vas_win_id);
> diff --git a/arch/powerpc/platforms/powernv/vas.h 
> b/arch/powerpc/platforms/powernv/vas.h
> index f5493dbdd7ff..551affaddd59 100644
> --- a/arch/powerpc/platforms/powernv/vas.h
> +++ b/arch/powerpc/platforms/powernv/vas.h
> @@ -448,26 +448,6 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
> return in_be64(win->hvwc_map+reg);
>  }
>
> -/*
> - * Encode/decode the Partition Send Window ID (PSWID) for a window in
> - * a way that we can uniquely identify any window in the system. i.e.
> - * we should be able to locate the 'struct vas_window' given the PSWID.
> - *
> - * BitsUsage
> - * 0:7 VAS id (8 bits)
> - * 8:15Unused, 0 (3 bits)
> - * 16:31   Window id (16 bits)
> - */
> -static inline u32 encode_pswid(int vasid, int winid)
> -{
> -   u32 pswid = 0;
> -
> -   pswid |= vasid << (31 - 7);
> -   pswid |= winid;
> -
> -   return pswid;
> -}
> -
>  static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
>  {
> if (vasid)
> --
> 2.20.1
>


Re: [PATCH v2] ocxl: Allow contexts to be attached with a NULL mm

2019-06-20 Thread Andrew Donnellan

On 20/6/19 2:12 pm, Alastair D'Silva wrote:

From: Alastair D'Silva 

If an OpenCAPI context is to be used directly by a kernel driver, there
may not be a suitable mm to use.

The patch makes the mm parameter to ocxl_context_attach optional.

Signed-off-by: Alastair D'Silva 


Acked-by: Andrew Donnellan 


---
  arch/powerpc/mm/book3s64/radix_tlb.c |  5 +
  drivers/misc/ocxl/context.c  |  9 ++---
  drivers/misc/ocxl/link.c | 28 
  3 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index bb9835681315..ce8a77fae6a7 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -666,6 +666,11 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
  #define radix__flush_all_mm radix__local_flush_all_mm
  #endif /* CONFIG_SMP */
  
+/*

+ * If kernel TLBIs ever become local rather than global, then
+ * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it
+ * assumes kernel TLBIs are global.
+ */
  void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
  {
_tlbie_pid(0, RIC_FLUSH_ALL);
diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c
index bab9c9364184..994563a078eb 100644
--- a/drivers/misc/ocxl/context.c
+++ b/drivers/misc/ocxl/context.c
@@ -69,6 +69,7 @@ static void xsl_fault_error(void *data, u64 addr, u64 dsisr)
  int ocxl_context_attach(struct ocxl_context *ctx, u64 amr, struct mm_struct 
*mm)
  {
int rc;
+   unsigned long pidr = 0;
  
  	// Locks both status & tidr

mutex_lock(&ctx->status_mutex);
@@ -77,9 +78,11 @@ int ocxl_context_attach(struct ocxl_context *ctx, u64 amr, 
struct mm_struct *mm)
goto out;
}
  
-	rc = ocxl_link_add_pe(ctx->afu->fn->link, ctx->pasid,

-   mm->context.id, ctx->tidr, amr, mm,
-   xsl_fault_error, ctx);
+   if (mm)
+   pidr = mm->context.id;
+
+   rc = ocxl_link_add_pe(ctx->afu->fn->link, ctx->pasid, pidr, ctx->tidr,
+ amr, mm, xsl_fault_error, ctx);
if (rc)
goto out;
  
diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c

index cce5b0d64505..58d111afd9f6 100644
--- a/drivers/misc/ocxl/link.c
+++ b/drivers/misc/ocxl/link.c
@@ -224,6 +224,17 @@ static irqreturn_t xsl_fault_handler(int irq, void *data)
ack_irq(spa, ADDRESS_ERROR);
return IRQ_HANDLED;
}
+
+   if (!pe_data->mm) {
+   /*
+* translation fault from a kernel context - an OpenCAPI
+* device tried to access a bad kernel address
+*/
+   rcu_read_unlock();
+   pr_warn("Unresolved OpenCAPI xsl fault in kernel context\n");
+   ack_irq(spa, ADDRESS_ERROR);
+   return IRQ_HANDLED;
+   }
WARN_ON(pe_data->mm->context.id != pid);
  
  	if (mmget_not_zero(pe_data->mm)) {

@@ -523,7 +534,13 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 
pidr, u32 tidr,
pe->amr = cpu_to_be64(amr);
pe->software_state = cpu_to_be32(SPA_PE_VALID);
  
-	mm_context_add_copro(mm);

+   /*
+* For user contexts, register a copro so that TLBIs are seen
+* by the nest MMU. If we have a kernel context, TLBIs are
+* already global.
+*/
+   if (mm)
+   mm_context_add_copro(mm);
/*
 * Barrier is to make sure PE is visible in the SPA before it
 * is used by the device. It also helps with the global TLBI
@@ -546,7 +563,8 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 
pidr, u32 tidr,
 * have a reference on mm_users. Incrementing mm_count solves
 * the problem.
 */
-   mmgrab(mm);
+   if (mm)
+   mmgrab(mm);
trace_ocxl_context_add(current->pid, spa->spa_mem, pasid, pidr, tidr);
  unlock:
mutex_unlock(&spa->spa_lock);
@@ -652,8 +670,10 @@ int ocxl_link_remove_pe(void *link_handle, int pasid)
if (!pe_data) {
WARN(1, "Couldn't find pe data when removing PE\n");
} else {
-   mm_context_remove_copro(pe_data->mm);
-   mmdrop(pe_data->mm);
+   if (pe_data->mm) {
+   mm_context_remove_copro(pe_data->mm);
+   mmdrop(pe_data->mm);
+   }
kfree_rcu(pe_data, rcu);
}
  unlock:



--
Andrew Donnellan  OzLabs, ADL Canberra
a...@linux.ibm.com IBM Australia Limited



[PATCH v3 6/6] drivers/base/memory.c: Get rid of find_memory_block_hinted()

2019-06-20 Thread David Hildenbrand
No longer needed, let's remove it. Also, drop the "hint" parameter
completely from "find_memory_block_by_id", as nobody needs it anymore.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Cc: Andrew Morton 
Cc: Stephen Rothwell 
Cc: Pavel Tatashin 
Cc: "mike.tra...@hpe.com" 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c  | 37 +++--
 include/linux/memory.h |  2 --
 2 files changed, 11 insertions(+), 28 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 0204384b4d1d..195dbcb8e8a8 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -588,30 +588,13 @@ int __weak arch_get_memory_phys_device(unsigned long 
start_pfn)
return 0;
 }
 
-/*
- * A reference for the returned object is held and the reference for the
- * hinted object is released.
- */
-static struct memory_block *find_memory_block_by_id(unsigned long block_id,
-   struct memory_block *hint)
+/* A reference for the returned memory block device is acquired. */
+static struct memory_block *find_memory_block_by_id(unsigned long block_id)
 {
-   struct device *hintdev = hint ? &hint->dev : NULL;
struct device *dev;
 
-   dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev);
-   if (hint)
-   put_device(&hint->dev);
-   if (!dev)
-   return NULL;
-   return to_memory_block(dev);
-}
-
-struct memory_block *find_memory_block_hinted(struct mem_section *section,
- struct memory_block *hint)
-{
-   unsigned long block_id = base_memory_block_id(__section_nr(section));
-
-   return find_memory_block_by_id(block_id, hint);
+   dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL);
+   return dev ? to_memory_block(dev) : NULL;
 }
 
 /*
@@ -624,7 +607,9 @@ struct memory_block *find_memory_block_hinted(struct 
mem_section *section,
  */
 struct memory_block *find_memory_block(struct mem_section *section)
 {
-   return find_memory_block_hinted(section, NULL);
+   unsigned long block_id = base_memory_block_id(__section_nr(section));
+
+   return find_memory_block_by_id(block_id);
 }
 
 static struct attribute *memory_memblk_attrs[] = {
@@ -675,7 +660,7 @@ static int init_memory_block(struct memory_block **memory,
unsigned long start_pfn;
int ret = 0;
 
-   mem = find_memory_block_by_id(block_id, NULL);
+   mem = find_memory_block_by_id(block_id);
if (mem) {
put_device(&mem->dev);
return -EEXIST;
@@ -755,7 +740,7 @@ int create_memory_block_devices(unsigned long start, 
unsigned long size)
end_block_id = block_id;
for (block_id = start_block_id; block_id != end_block_id;
 block_id++) {
-   mem = find_memory_block_by_id(block_id, NULL);
+   mem = find_memory_block_by_id(block_id);
mem->section_count = 0;
unregister_memory(mem);
}
@@ -782,7 +767,7 @@ void remove_memory_block_devices(unsigned long start, 
unsigned long size)
 
mutex_lock(_sysfs_mutex);
for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-   mem = find_memory_block_by_id(block_id, NULL);
+   mem = find_memory_block_by_id(block_id);
if (WARN_ON_ONCE(!mem))
continue;
mem->section_count = 0;
@@ -882,7 +867,7 @@ int walk_memory_blocks(unsigned long start, unsigned long 
size,
int ret = 0;
 
for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
-   mem = find_memory_block_by_id(block_id, NULL);
+   mem = find_memory_block_by_id(block_id);
if (!mem)
continue;
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index b3b388775a30..02e633f3ede0 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -116,8 +116,6 @@ void remove_memory_block_devices(unsigned long start, 
unsigned long size);
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
 extern int memory_isolate_notify(unsigned long val, void *v);
-extern struct memory_block *find_memory_block_hinted(struct mem_section *,
-   struct memory_block *);
 extern struct memory_block *find_memory_block(struct mem_section *);
 typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
 extern int walk_memory_blocks(unsigned long start, unsigned long size,
-- 
2.21.0



[PATCH v3 5/6] mm/memory_hotplug: Move and simplify walk_memory_blocks()

2019-06-20 Thread David Hildenbrand
Let's move walk_memory_blocks() to the place where memory block logic
resides and simplify it. While at it, add a type for the callback function.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Cc: David Hildenbrand 
Cc: Andrew Morton 
Cc: Stephen Rothwell 
Cc: Pavel Tatashin 
Cc: Andrew Banman 
Cc: "mike.tra...@hpe.com" 
Cc: Oscar Salvador 
Cc: Michal Hocko 
Cc: Wei Yang 
Cc: Arun KS 
Cc: Qian Cai 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c  | 42 ++
 include/linux/memory.h |  3 ++
 include/linux/memory_hotplug.h |  2 --
 mm/memory_hotplug.c| 55 --
 4 files changed, 45 insertions(+), 57 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c54e80fd25a8..0204384b4d1d 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -44,6 +44,11 @@ static inline unsigned long pfn_to_block_id(unsigned long 
pfn)
return base_memory_block_id(pfn_to_section_nr(pfn));
 }
 
+static inline unsigned long phys_to_block_id(unsigned long phys)
+{
+   return pfn_to_block_id(PFN_DOWN(phys));
+}
+
 static int memory_subsys_online(struct device *dev);
 static int memory_subsys_offline(struct device *dev);
 
@@ -851,3 +856,40 @@ int __init memory_dev_init(void)
printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
return ret;
 }
+
+/**
+ * walk_memory_blocks - walk through all present memory blocks overlapped
+ * by the range [start, start + size)
+ *
+ * @start: start address of the memory range
+ * @size: size of the memory range
+ * @arg: argument passed to func
+ * @func: callback for each memory section walked
+ *
+ * This function walks through all present memory blocks overlapped by the
+ * range [start, start + size), calling func on each memory block.
+ *
+ * In case func() returns an error, walking is aborted and the error is
+ * returned.
+ */
+int walk_memory_blocks(unsigned long start, unsigned long size,
+  void *arg, walk_memory_blocks_func_t func)
+{
+   const unsigned long start_block_id = phys_to_block_id(start);
+   const unsigned long end_block_id = phys_to_block_id(start + size - 1);
+   struct memory_block *mem;
+   unsigned long block_id;
+   int ret = 0;
+
+   for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
+   mem = find_memory_block_by_id(block_id, NULL);
+   if (!mem)
+   continue;
+
+   ret = func(mem, arg);
+   put_device(&mem->dev);
+   if (ret)
+   break;
+   }
+   return ret;
+}
diff --git a/include/linux/memory.h b/include/linux/memory.h
index f26a5417ec5d..b3b388775a30 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -119,6 +119,9 @@ extern int memory_isolate_notify(unsigned long val, void 
*v);
 extern struct memory_block *find_memory_block_hinted(struct mem_section *,
struct memory_block *);
 extern struct memory_block *find_memory_block(struct mem_section *);
+typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
+extern int walk_memory_blocks(unsigned long start, unsigned long size,
+ void *arg, walk_memory_blocks_func_t func);
 #define CONFIG_MEM_BLOCK_SIZE  (PAGES_PER_SECTION<= mem->start_section_nr) &&
-   (section_nr <= mem->end_section_nr))
-   continue;
-
-   mem = find_memory_block_hinted(section, mem);
-   if (!mem)
-   continue;
-
-   ret = func(mem, arg);
-   if (ret) {
-   kobject_put(&mem->dev.kobj);
-   return ret;
-   }
-   }
-
-   if (mem)
-   kobject_put(&mem->dev.kobj);
-
-   return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 {
int ret = !is_memblock_offlined(mem);
-- 
2.21.0



[PATCH v3 4/6] mm/memory_hotplug: Rename walk_memory_range() and pass start+size instead of pfns

2019-06-20 Thread David Hildenbrand
walk_memory_range() was once used to iterate over sections. Now, it
iterates over memory blocks. Rename the function, fixup the
documentation. Also, pass start+size instead of PFNs, which is what most
callers already have at hand. (we'll rework link_mem_sections() most
probably soon)

Follow-up patches wil rework, simplify, and move walk_memory_blocks() to
drivers/base/memory.c.

Note: walk_memory_blocks() only works correctly right now if the
start_pfn is aligned to a section start. This is the case right now,
but we'll generalize the function in a follow up patch so the semantics
match the documentation.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Greg Kroah-Hartman 
Cc: David Hildenbrand 
Cc: Rashmica Gupta 
Cc: Andrew Morton 
Cc: Pavel Tatashin 
Cc: Anshuman Khandual 
Cc: Michael Neuling 
Cc: Thomas Gleixner 
Cc: Oscar Salvador 
Cc: Michal Hocko 
Cc: Wei Yang 
Cc: Juergen Gross 
Cc: Qian Cai 
Cc: Arun KS 
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c | 23 +++---
 drivers/acpi/acpi_memhotplug.c| 19 --
 drivers/base/node.c   |  5 +++--
 include/linux/memory_hotplug.h|  2 +-
 mm/memory_hotplug.c   | 24 ---
 5 files changed, 32 insertions(+), 41 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
b/arch/powerpc/platforms/powernv/memtrace.c
index 5e53c1392d3b..eb2e75dac369 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -70,23 +70,23 @@ static int change_memblock_state(struct memory_block *mem, 
void *arg)
 /* called with device_hotplug_lock held */
 static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
 {
-   u64 end_pfn = start_pfn + nr_pages - 1;
+   const unsigned long start = PFN_PHYS(start_pfn);
+   const unsigned long size = PFN_PHYS(nr_pages);
 
-   if (walk_memory_range(start_pfn, end_pfn, NULL,
-   check_memblock_online))
+   if (walk_memory_blocks(start, size, NULL, check_memblock_online))
return false;
 
-   walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
- change_memblock_state);
+   walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
+  change_memblock_state);
 
if (offline_pages(start_pfn, nr_pages)) {
-   walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
- change_memblock_state);
+   walk_memory_blocks(start, size, (void *)MEM_ONLINE,
+  change_memblock_state);
return false;
}
 
-   walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
- change_memblock_state);
+   walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
+  change_memblock_state);
 
 
return true;
@@ -242,9 +242,8 @@ static int memtrace_online(void)
 */
if (!memhp_auto_online) {
lock_device_hotplug();
-   walk_memory_range(PFN_DOWN(ent->start),
- PFN_UP(ent->start + ent->size - 1),
- NULL, online_mem_block);
+   walk_memory_blocks(ent->start, ent->size, NULL,
+  online_mem_block);
unlock_device_hotplug();
}
 
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index db013dc21c02..e294f44a7850 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -155,16 +155,6 @@ static int acpi_memory_check_device(struct 
acpi_memory_device *mem_device)
return 0;
 }
 
-static unsigned long acpi_meminfo_start_pfn(struct acpi_memory_info *info)
-{
-   return PFN_DOWN(info->start_addr);
-}
-
-static unsigned long acpi_meminfo_end_pfn(struct acpi_memory_info *info)
-{
-   return PFN_UP(info->start_addr + info->length-1);
-}
-
 static int acpi_bind_memblk(struct memory_block *mem, void *arg)
 {
return acpi_bind_one(>dev, arg);
@@ -173,9 +163,8 @@ static int acpi_bind_memblk(struct memory_block *mem, void 
*arg)
 static int acpi_bind_memory_blocks(struct acpi_memory_info *info,
   struct acpi_device *adev)
 {
-   return walk_memory_range(acpi_meminfo_start_pfn(info),
-acpi_meminfo_end_pfn(info), adev,
-acpi_bind_memblk);
+   return walk_memory_blocks(info->start_addr, info->length, adev,
+ acpi_bind_memblk);
 }
 
 static int acpi_unbind_memblk(struct memory_block *mem, void *arg)
@@ -186,8 +175,8 @@ static int acpi_unbind_memblk(struct memory_block *mem, 
void 

[PATCH v3 0/6] mm: Further memory block device cleanups

2019-06-20 Thread David Hildenbrand
@Andrew: Only patch 1, 4 and 6 changed compared to v1.

Some further cleanups around memory block devices. Especially, clean up
and simplify walk_memory_range(). Including some other minor cleanups.

Compiled + tested on x86 with DIMMs under QEMU. Compile-tested on ppc64.

v2 -> v3:
- "mm/memory_hotplug: Rename walk_memory_range() and pass start+size .."
-- Avoid warning on ppc.
- "drivers/base/memory.c: Get rid of find_memory_block_hinted()"
-- Fixup a comment regarding hinted devices.

v1 -> v2:
- "mm: Section numbers use the type "unsigned long""
-- "unsigned long i" -> "unsigned long nr", in one case -> "int i"
- "drivers/base/memory.c: Get rid of find_memory_block_hinted()"
-- Fix compilation error
-- Get rid of the "hint" parameter completely

David Hildenbrand (6):
  mm: Section numbers use the type "unsigned long"
  drivers/base/memory: Use "unsigned long" for block ids
  mm: Make register_mem_sect_under_node() static
  mm/memory_hotplug: Rename walk_memory_range() and pass start+size
instead of pfns
  mm/memory_hotplug: Move and simplify walk_memory_blocks()
  drivers/base/memory.c: Get rid of find_memory_block_hinted()

 arch/powerpc/platforms/powernv/memtrace.c |  23 ++---
 drivers/acpi/acpi_memhotplug.c|  19 +---
 drivers/base/memory.c | 120 +-
 drivers/base/node.c   |   8 +-
 include/linux/memory.h|   5 +-
 include/linux/memory_hotplug.h|   2 -
 include/linux/mmzone.h|   4 +-
 include/linux/node.h  |   7 --
 mm/memory_hotplug.c   |  57 +-
 mm/sparse.c   |  12 +--
 10 files changed, 106 insertions(+), 151 deletions(-)

-- 
2.21.0



[PATCH v3 3/6] mm: Make register_mem_sect_under_node() static

2019-06-20 Thread David Hildenbrand
It is only used internally.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Cc: Andrew Morton 
Cc: Keith Busch 
Cc: Oscar Salvador 
Signed-off-by: David Hildenbrand 
---
 drivers/base/node.c  | 3 ++-
 include/linux/node.h | 7 ---
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 9be88fd05147..e6364e3e3e31 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -752,7 +752,8 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
 }
 
 /* register memory section under specified node if it spans that node */
-int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
+static int register_mem_sect_under_node(struct memory_block *mem_blk,
+void *arg)
 {
int ret, nid = *(int *)arg;
unsigned long pfn, sect_start_pfn, sect_end_pfn;
diff --git a/include/linux/node.h b/include/linux/node.h
index 548c226966a2..4866f32a02d8 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -137,8 +137,6 @@ static inline int register_one_node(int nid)
 extern void unregister_one_node(int nid);
 extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
-extern int register_mem_sect_under_node(struct memory_block *mem_blk,
-   void *arg);
 extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
 
 extern int register_memory_node_under_compute_node(unsigned int mem_nid,
@@ -170,11 +168,6 @@ static inline int unregister_cpu_under_node(unsigned int 
cpu, unsigned int nid)
 {
return 0;
 }
-static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
-   void *arg)
-{
-   return 0;
-}
 static inline void unregister_memory_block_under_nodes(struct memory_block 
*mem_blk)
 {
 }
-- 
2.21.0



[PATCH v3 2/6] drivers/base/memory: Use "unsigned long" for block ids

2019-06-20 Thread David Hildenbrand
Block ids are just shifted section numbers, so let's also use
"unsigned long" for them, too.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 5947b5a5686d..c54e80fd25a8 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -34,12 +34,12 @@ static DEFINE_MUTEX(mem_sysfs_mutex);
 
 static int sections_per_block;
 
-static inline int base_memory_block_id(unsigned long section_nr)
+static inline unsigned long base_memory_block_id(unsigned long section_nr)
 {
return section_nr / sections_per_block;
 }
 
-static inline int pfn_to_block_id(unsigned long pfn)
+static inline unsigned long pfn_to_block_id(unsigned long pfn)
 {
return base_memory_block_id(pfn_to_section_nr(pfn));
 }
@@ -587,7 +587,7 @@ int __weak arch_get_memory_phys_device(unsigned long 
start_pfn)
  * A reference for the returned object is held and the reference for the
  * hinted object is released.
  */
-static struct memory_block *find_memory_block_by_id(int block_id,
+static struct memory_block *find_memory_block_by_id(unsigned long block_id,
struct memory_block *hint)
 {
struct device *hintdev = hint ? >dev : NULL;
@@ -604,7 +604,7 @@ static struct memory_block *find_memory_block_by_id(int 
block_id,
 struct memory_block *find_memory_block_hinted(struct mem_section *section,
  struct memory_block *hint)
 {
-   int block_id = base_memory_block_id(__section_nr(section));
+   unsigned long block_id = base_memory_block_id(__section_nr(section));
 
return find_memory_block_by_id(block_id, hint);
 }
@@ -663,8 +663,8 @@ int register_memory(struct memory_block *memory)
return ret;
 }
 
-static int init_memory_block(struct memory_block **memory, int block_id,
-unsigned long state)
+static int init_memory_block(struct memory_block **memory,
+unsigned long block_id, unsigned long state)
 {
struct memory_block *mem;
unsigned long start_pfn;
@@ -729,8 +729,8 @@ static void unregister_memory(struct memory_block *memory)
  */
 int create_memory_block_devices(unsigned long start, unsigned long size)
 {
-   const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
-   int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
+   const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
+   unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
struct memory_block *mem;
unsigned long block_id;
int ret = 0;
@@ -766,10 +766,10 @@ int create_memory_block_devices(unsigned long start, 
unsigned long size)
  */
 void remove_memory_block_devices(unsigned long start, unsigned long size)
 {
-   const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
-   const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
+   const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
+   const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + 
size));
struct memory_block *mem;
-   int block_id;
+   unsigned long block_id;
 
if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
 !IS_ALIGNED(size, memory_block_size_bytes(
-- 
2.21.0



[PATCH v3 1/6] mm: Section numbers use the type "unsigned long"

2019-06-20 Thread David Hildenbrand
We are using a mixture of "int" and "unsigned long". Let's make this
consistent by using "unsigned long" everywhere. We'll do the same with
memory block ids next.

While at it, turn the "unsigned long i" in removable_show() into an
int - sections_per_block is an int.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Cc: Andrew Morton 
Cc: Vlastimil Babka 
Cc: Michal Hocko 
Cc: Dan Williams 
Cc: Mel Gorman 
Cc: Wei Yang 
Cc: Johannes Weiner 
Cc: Arun KS 
Cc: Pavel Tatashin 
Cc: Oscar Salvador 
Cc: Stephen Rothwell 
Cc: Mike Rapoport 
Cc: Baoquan He 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c  | 27 +--
 include/linux/mmzone.h |  4 ++--
 mm/sparse.c| 12 ++--
 3 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 826dd76f662e..5947b5a5686d 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -34,7 +34,7 @@ static DEFINE_MUTEX(mem_sysfs_mutex);
 
 static int sections_per_block;
 
-static inline int base_memory_block_id(int section_nr)
+static inline int base_memory_block_id(unsigned long section_nr)
 {
return section_nr / sections_per_block;
 }
@@ -131,9 +131,9 @@ static ssize_t phys_index_show(struct device *dev,
 static ssize_t removable_show(struct device *dev, struct device_attribute 
*attr,
  char *buf)
 {
-   unsigned long i, pfn;
-   int ret = 1;
struct memory_block *mem = to_memory_block(dev);
+   unsigned long pfn;
+   int ret = 1, i;
 
if (mem->state != MEM_ONLINE)
goto out;
@@ -691,15 +691,15 @@ static int init_memory_block(struct memory_block 
**memory, int block_id,
return ret;
 }
 
-static int add_memory_block(int base_section_nr)
+static int add_memory_block(unsigned long base_section_nr)
 {
+   int ret, section_count = 0;
struct memory_block *mem;
-   int i, ret, section_count = 0;
+   unsigned long nr;
 
-   for (i = base_section_nr;
-i < base_section_nr + sections_per_block;
-i++)
-   if (present_section_nr(i))
+   for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
+nr++)
+   if (present_section_nr(nr))
section_count++;
 
if (section_count == 0)
@@ -822,10 +822,9 @@ static const struct attribute_group 
*memory_root_attr_groups[] = {
  */
 int __init memory_dev_init(void)
 {
-   unsigned int i;
int ret;
int err;
-   unsigned long block_sz;
+   unsigned long block_sz, nr;
 
ret = subsys_system_register(_subsys, memory_root_attr_groups);
if (ret)
@@ -839,9 +838,9 @@ int __init memory_dev_init(void)
 * during boot and have been initialized
 */
mutex_lock(_sysfs_mutex);
-   for (i = 0; i <= __highest_present_section_nr;
-   i += sections_per_block) {
-   err = add_memory_block(i);
+   for (nr = 0; nr <= __highest_present_section_nr;
+nr += sections_per_block) {
+   err = add_memory_block(nr);
if (!ret)
ret = err;
}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 427b79c39b3c..83b6aae16f13 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1220,7 +1220,7 @@ static inline struct mem_section 
*__nr_to_section(unsigned long nr)
return NULL;
return _section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
-extern int __section_nr(struct mem_section* ms);
+extern unsigned long __section_nr(struct mem_section *ms);
 extern unsigned long usemap_size(void);
 
 /*
@@ -1292,7 +1292,7 @@ static inline struct mem_section 
*__pfn_to_section(unsigned long pfn)
return __nr_to_section(pfn_to_section_nr(pfn));
 }
 
-extern int __highest_present_section_nr;
+extern unsigned long __highest_present_section_nr;
 
 #ifndef CONFIG_HAVE_ARCH_PFN_VALID
 static inline int pfn_valid(unsigned long pfn)
diff --git a/mm/sparse.c b/mm/sparse.c
index 1552c855d62a..e8c57e039be8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -102,7 +102,7 @@ static inline int sparse_index_init(unsigned long 
section_nr, int nid)
 #endif
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
 {
unsigned long root_nr;
struct mem_section *root = NULL;
@@ -121,9 +121,9 @@ int __section_nr(struct mem_section* ms)
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
 }
 #else
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
 {
-   return (int)(ms - mem_section[0]);
+   return (unsigned long)(ms - mem_section[0]);
 }
 #endif
 
@@ -178,10 +178,10 @@ void __meminit mminit_validate_memmodel_limits(unsigned 
long *start_pfn,
  * Keeping track of this gives us an easy way to break out of
  * those loops early.
  

Re: [PATCH v2 6/6] drivers/base/memory.c: Get rid of find_memory_block_hinted()

2019-06-20 Thread David Hildenbrand
On 20.06.19 12:35, David Hildenbrand wrote:
> No longer needed, let's remove it. Also, drop the "hint" parameter
> completely from "find_memory_block_by_id", as nobody needs it anymore.
> 
> Cc: Greg Kroah-Hartman 
> Cc: "Rafael J. Wysocki" 
> Cc: Andrew Morton 
> Cc: Stephen Rothwell 
> Cc: Pavel Tatashin 
> Cc: "mike.tra...@hpe.com" 
> Signed-off-by: David Hildenbrand 
> ---
>  drivers/base/memory.c  | 32 ++--
>  include/linux/memory.h |  2 --
>  2 files changed, 10 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 0204384b4d1d..fefb64d3588e 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -592,26 +592,12 @@ int __weak arch_get_memory_phys_device(unsigned long 
> start_pfn)
>   * A reference for the returned object is held and the reference for the
>   * hinted object is released.
>   */

I'll fixup this comment as well (yes, I desperately need vacation :) )


-- 

Thanks,

David / dhildenb


Re: [PATCH 16/16] mm: pass get_user_pages_fast iterator arguments in a structure

2019-06-20 Thread Linus Torvalds
On Thu, Jun 20, 2019 at 5:19 AM Nicholas Piggin  wrote:
>
> The processor aliasing problem happens because the struct will
> be initialised with stores using one base register (e.g., stack
> register), and then same memory is loaded using a different
> register (e.g., parameter register).

Hmm. Honestly, I've never seen anything like that in any kernel profiles.

Compared to the problems I _do_ see (which is usually the obvious
cache misses, and locking), it must either be in the noise or it's
some problem specific to whatever CPU you are doing performance work
on?

I've occasionally seen pipeline hiccups in profiles, but it's usually
been either some serious glass jaw of the core, or it's been something
really stupid we did (or occasionally that the compiler did: one in
particular I remember was how there was a time when gcc would narrow
stores when it could, so if you set a bit in a word, it would do it
with a byte store, and then when you read the whole word afterwards
you'd get a major pipeline stall and it happened to show up in some
really hot paths).

Linus


Re: [PATCH v2 4/6] mm/memory_hotplug: Rename walk_memory_range() and pass start+size instead of pfns

2019-06-20 Thread David Hildenbrand
On 20.06.19 18:05, Nathan Chancellor wrote:
> On Thu, Jun 20, 2019 at 12:35:18PM +0200, David Hildenbrand wrote:
>> walk_memory_range() was once used to iterate over sections. Now, it
>> iterates over memory blocks. Rename the function, fixup the
>> documentation. Also, pass start+size instead of PFNs, which is what most
>> callers already have at hand. (we'll rework link_mem_sections() most
>> probably soon)
>>
>> Follow-up patches wil rework, simplify, and move walk_memory_blocks() to
>> drivers/base/memory.c.
>>
>> Note: walk_memory_blocks() only works correctly right now if the
>> start_pfn is aligned to a section start. This is the case right now,
>> but we'll generalize the function in a follow up patch so the semantics
>> match the documentation.
>>
>> Cc: Benjamin Herrenschmidt 
>> Cc: Paul Mackerras 
>> Cc: Michael Ellerman 
>> Cc: "Rafael J. Wysocki" 
>> Cc: Len Brown 
>> Cc: Greg Kroah-Hartman 
>> Cc: David Hildenbrand 
>> Cc: Rashmica Gupta 
>> Cc: Andrew Morton 
>> Cc: Pavel Tatashin 
>> Cc: Anshuman Khandual 
>> Cc: Michael Neuling 
>> Cc: Thomas Gleixner 
>> Cc: Oscar Salvador 
>> Cc: Michal Hocko 
>> Cc: Wei Yang 
>> Cc: Juergen Gross 
>> Cc: Qian Cai 
>> Cc: Arun KS 
>> Signed-off-by: David Hildenbrand 
>> ---
>>  arch/powerpc/platforms/powernv/memtrace.c | 22 ++---
>>  drivers/acpi/acpi_memhotplug.c| 19 --
>>  drivers/base/node.c   |  5 +++--
>>  include/linux/memory_hotplug.h|  2 +-
>>  mm/memory_hotplug.c   | 24 ---
>>  5 files changed, 32 insertions(+), 40 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
>> b/arch/powerpc/platforms/powernv/memtrace.c
>> index 5e53c1392d3b..8c82c041afe6 100644
>> --- a/arch/powerpc/platforms/powernv/memtrace.c
>> +++ b/arch/powerpc/platforms/powernv/memtrace.c
>> @@ -70,23 +70,24 @@ static int change_memblock_state(struct memory_block 
>> *mem, void *arg)
>>  /* called with device_hotplug_lock held */
>>  static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
>>  {
>> +const unsigned long start = PFN_PHYS(start_pfn);
>> +const unsigned long size = PFN_PHYS(nr_pages);
>>  u64 end_pfn = start_pfn + nr_pages - 1;
> 
> This variable should be removed:
> 
> arch/powerpc/platforms/powernv/memtrace.c:75:6: warning: unused variable 
> 'end_pfn' [-Wunused-variable]
> u64 end_pfn = start_pfn + nr_pages - 1;
> ^
> 1 warning generated.
> 
> https://travis-ci.com/ClangBuiltLinux/continuous-integration/jobs/209576737
> 
> Cheers,
> Nathan
> 

Indeed, thanks!


-- 

Thanks,

David / dhildenb


Re: [PATCH v2 4/6] mm/memory_hotplug: Rename walk_memory_range() and pass start+size instead of pfns

2019-06-20 Thread Nathan Chancellor
On Thu, Jun 20, 2019 at 12:35:18PM +0200, David Hildenbrand wrote:
> walk_memory_range() was once used to iterate over sections. Now, it
> iterates over memory blocks. Rename the function, fixup the
> documentation. Also, pass start+size instead of PFNs, which is what most
> callers already have at hand. (we'll rework link_mem_sections() most
> probably soon)
> 
> Follow-up patches wil rework, simplify, and move walk_memory_blocks() to
> drivers/base/memory.c.
> 
> Note: walk_memory_blocks() only works correctly right now if the
> start_pfn is aligned to a section start. This is the case right now,
> but we'll generalize the function in a follow up patch so the semantics
> match the documentation.
> 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: "Rafael J. Wysocki" 
> Cc: Len Brown 
> Cc: Greg Kroah-Hartman 
> Cc: David Hildenbrand 
> Cc: Rashmica Gupta 
> Cc: Andrew Morton 
> Cc: Pavel Tatashin 
> Cc: Anshuman Khandual 
> Cc: Michael Neuling 
> Cc: Thomas Gleixner 
> Cc: Oscar Salvador 
> Cc: Michal Hocko 
> Cc: Wei Yang 
> Cc: Juergen Gross 
> Cc: Qian Cai 
> Cc: Arun KS 
> Signed-off-by: David Hildenbrand 
> ---
>  arch/powerpc/platforms/powernv/memtrace.c | 22 ++---
>  drivers/acpi/acpi_memhotplug.c| 19 --
>  drivers/base/node.c   |  5 +++--
>  include/linux/memory_hotplug.h|  2 +-
>  mm/memory_hotplug.c   | 24 ---
>  5 files changed, 32 insertions(+), 40 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
> b/arch/powerpc/platforms/powernv/memtrace.c
> index 5e53c1392d3b..8c82c041afe6 100644
> --- a/arch/powerpc/platforms/powernv/memtrace.c
> +++ b/arch/powerpc/platforms/powernv/memtrace.c
> @@ -70,23 +70,24 @@ static int change_memblock_state(struct memory_block 
> *mem, void *arg)
>  /* called with device_hotplug_lock held */
>  static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
>  {
> + const unsigned long start = PFN_PHYS(start_pfn);
> + const unsigned long size = PFN_PHYS(nr_pages);
>   u64 end_pfn = start_pfn + nr_pages - 1;

This variable should be removed:

arch/powerpc/platforms/powernv/memtrace.c:75:6: warning: unused variable 
'end_pfn' [-Wunused-variable]
u64 end_pfn = start_pfn + nr_pages - 1;
^
1 warning generated.

https://travis-ci.com/ClangBuiltLinux/continuous-integration/jobs/209576737

Cheers,
Nathan

>  
> - if (walk_memory_range(start_pfn, end_pfn, NULL,
> - check_memblock_online))
> + if (walk_memory_blocks(start, size, NULL, check_memblock_online))
>   return false;
>  
> - walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
> -   change_memblock_state);
> + walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
> +change_memblock_state);
>  
>   if (offline_pages(start_pfn, nr_pages)) {
> - walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
> -   change_memblock_state);
> + walk_memory_blocks(start, size, (void *)MEM_ONLINE,
> +change_memblock_state);
>   return false;
>   }
>  
> - walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
> -   change_memblock_state);
> + walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
> +change_memblock_state);
>  
>  
>   return true;
> @@ -242,9 +243,8 @@ static int memtrace_online(void)
>*/
>   if (!memhp_auto_online) {
>   lock_device_hotplug();
> - walk_memory_range(PFN_DOWN(ent->start),
> -   PFN_UP(ent->start + ent->size - 1),
> -   NULL, online_mem_block);
> + walk_memory_blocks(ent->start, ent->size, NULL,
> +online_mem_block);
>   unlock_device_hotplug();
>   }
>  
> diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
> index db013dc21c02..e294f44a7850 100644
> --- a/drivers/acpi/acpi_memhotplug.c
> +++ b/drivers/acpi/acpi_memhotplug.c
> @@ -155,16 +155,6 @@ static int acpi_memory_check_device(struct 
> acpi_memory_device *mem_device)
>   return 0;
>  }
>  
> -static unsigned long acpi_meminfo_start_pfn(struct acpi_memory_info *info)
> -{
> - return PFN_DOWN(info->start_addr);
> -}
> -
> -static unsigned long acpi_meminfo_end_pfn(struct acpi_memory_info *info)
> -{
> - return PFN_UP(info->start_addr + info->length-1);
> -}
> -
>  static int acpi_bind_memblk(struct memory_block *mem, void *arg)
>  {
>   return acpi_bind_one(>dev, arg);
> @@ -173,9 +163,8 @@ static int acpi_bind_memblk(struct memory_block *mem, 
> void *arg)
>  static int acpi_bind_memory_blocks(struct 

Re: [PATCH] powerpc: enable a 30-bit ZONE_DMA for 32-bit pmac

2019-06-20 Thread Michael Ellerman
Benjamin Herrenschmidt  writes:
> On Wed, 2019-06-19 at 22:32 +1000, Michael Ellerman wrote:
>> Christoph Hellwig  writes:
>> > Any chance this could get picked up to fix the regression?
>> 
>> Was hoping Ben would Ack it. He's still powermac maintainer :)
>> 
>> I guess he OK'ed it in the other thread, will add it to my queue.
>
> Yeah ack. If I had written it myself, I would have made the DMA bits a
> variable and only set it down to 30 if I see that device in the DT
> early on, but I can't be bothered now, if it works, ship it :-)

OK, we can do that next release if someone's motivated.

> Note: The patch affects all ppc32, though I don't think it will cause
> any significant issue on those who don't need it.

Yeah. We could always hide it behind CONFIG_PPC_PMAC if it becomes a problem.

cheers


Re: [PATCH 16/16] mm: pass get_user_pages_fast iterator arguments in a structure

2019-06-20 Thread Nicholas Piggin
Linus Torvalds's on June 12, 2019 11:09 am:
> On Tue, Jun 11, 2019 at 2:55 PM Nicholas Piggin  wrote:
>>
>> What does this do for performance? I've found this pattern can be
>> bad for store aliasing detection.
> 
> I wouldn't expect it to be noticeable, and the lack of argument
> reloading etc should make up for it. Plus inlining makes it a
> non-issue when that happens.

Maybe in isolation. Just seems like a strange pattern to sprinkle
around randomly, I wouldn't like it to proliferate.

I understand in some cases where a big set of parameters or
basically state gets sent around through a lot of interfaces.
Within one file to make lines a bit shorter or save a few bytes
isn't such a strong case.

> 
> But I guess we could also at least look at using "restrict", if that
> ends up helping. Unlike the completely bogus type-based aliasing rules
> (that we disable because I think the C people were on some bad bad
> drugs when they came up with them), restricted pointers are a real
> thing that makes sense.
> 
> That said, we haven't traditionally used it, and I don't know how much
> it helps gcc. Maybe gcc ignores it entirely? S

Ahh, it's not compiler store alias analysis I'm talking about, but
processor (but you raise an interesting point about compiler too,
would be nice if we could improve that in general).

The processor aliasing problem happens because the struct will
be initialised with stores using one base register (e.g., stack
register), and then same memory is loaded using a different
register (e.g., parameter register). Processor's static heuristics
for determining a load doesn't alias with an earlier store doesn't
do so well in that case.

Just about everywhere I've seen those kind of misspeculation and
flushes in the kernel has been this pattern, so I'm wary of it in
performance critical code.

Thanks,
Nick


Re: switch the remaining architectures to use generic GUP v3

2019-06-20 Thread Christoph Hellwig
I just noticed I didn't have Andrew explicitly on the receipents
list, so adding him.

Is everyone happy enough to give this a spin in -mm and linux-next?


Re: [PATCH v2 43/52] powerpc/64s/exception: machine check early only runs in HV mode

2019-06-20 Thread Mahesh Jagannath Salgaonkar
On 6/20/19 3:46 PM, Nicholas Piggin wrote:
> Mahesh J Salgaonkar's on June 20, 2019 7:53 pm:
>> On 2019-06-20 15:14:50 Thu, Nicholas Piggin wrote:
>>> machine_check_common_early and machine_check_handle_early only run in
>>> HVMODE. Remove dead code.
>>
>> That's not true. For pseries guest with FWNMI enabled hypervisor,
>> machine_check_common_early gets called in non-HV mode as well.
>>
>>machine_check_fwnmi
>>  machine_check_common_early
>>machine_check_handle_early
>>  machine_check_early
>>pseries_machine_check_realmode
> 
> Yep, yep I was confused by the earlier patch. So we're only doing the
> early machine check path for the FWNMI case?

yes.

> 
> Thanks,
> Nick
> 



[PATCH v2 6/6] drivers/base/memory.c: Get rid of find_memory_block_hinted()

2019-06-20 Thread David Hildenbrand
No longer needed, let's remove it. Also, drop the "hint" parameter
completely from "find_memory_block_by_id", as nobody needs it anymore.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Cc: Andrew Morton 
Cc: Stephen Rothwell 
Cc: Pavel Tatashin 
Cc: "mike.tra...@hpe.com" 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c  | 32 ++--
 include/linux/memory.h |  2 --
 2 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 0204384b4d1d..fefb64d3588e 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -592,26 +592,12 @@ int __weak arch_get_memory_phys_device(unsigned long 
start_pfn)
  * A reference for the returned object is held and the reference for the
  * hinted object is released.
  */
-static struct memory_block *find_memory_block_by_id(unsigned long block_id,
-   struct memory_block *hint)
+static struct memory_block *find_memory_block_by_id(unsigned long block_id)
 {
-   struct device *hintdev = hint ? >dev : NULL;
struct device *dev;
 
-   dev = subsys_find_device_by_id(_subsys, block_id, hintdev);
-   if (hint)
-   put_device(>dev);
-   if (!dev)
-   return NULL;
-   return to_memory_block(dev);
-}
-
-struct memory_block *find_memory_block_hinted(struct mem_section *section,
- struct memory_block *hint)
-{
-   unsigned long block_id = base_memory_block_id(__section_nr(section));
-
-   return find_memory_block_by_id(block_id, hint);
+   dev = subsys_find_device_by_id(_subsys, block_id, NULL);
+   return dev ? to_memory_block(dev) : NULL;
 }
 
 /*
@@ -624,7 +610,9 @@ struct memory_block *find_memory_block_hinted(struct 
mem_section *section,
  */
 struct memory_block *find_memory_block(struct mem_section *section)
 {
-   return find_memory_block_hinted(section, NULL);
+   unsigned long block_id = base_memory_block_id(__section_nr(section));
+
+   return find_memory_block_by_id(block_id);
 }
 
 static struct attribute *memory_memblk_attrs[] = {
@@ -675,7 +663,7 @@ static int init_memory_block(struct memory_block **memory,
unsigned long start_pfn;
int ret = 0;
 
-   mem = find_memory_block_by_id(block_id, NULL);
+   mem = find_memory_block_by_id(block_id);
if (mem) {
put_device(>dev);
return -EEXIST;
@@ -755,7 +743,7 @@ int create_memory_block_devices(unsigned long start, 
unsigned long size)
end_block_id = block_id;
for (block_id = start_block_id; block_id != end_block_id;
 block_id++) {
-   mem = find_memory_block_by_id(block_id, NULL);
+   mem = find_memory_block_by_id(block_id);
mem->section_count = 0;
unregister_memory(mem);
}
@@ -782,7 +770,7 @@ void remove_memory_block_devices(unsigned long start, 
unsigned long size)
 
mutex_lock(_sysfs_mutex);
for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-   mem = find_memory_block_by_id(block_id, NULL);
+   mem = find_memory_block_by_id(block_id);
if (WARN_ON_ONCE(!mem))
continue;
mem->section_count = 0;
@@ -882,7 +870,7 @@ int walk_memory_blocks(unsigned long start, unsigned long 
size,
int ret = 0;
 
for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
-   mem = find_memory_block_by_id(block_id, NULL);
+   mem = find_memory_block_by_id(block_id);
if (!mem)
continue;
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index b3b388775a30..02e633f3ede0 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -116,8 +116,6 @@ void remove_memory_block_devices(unsigned long start, 
unsigned long size);
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
 extern int memory_isolate_notify(unsigned long val, void *v);
-extern struct memory_block *find_memory_block_hinted(struct mem_section *,
-   struct memory_block *);
 extern struct memory_block *find_memory_block(struct mem_section *);
 typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
 extern int walk_memory_blocks(unsigned long start, unsigned long size,
-- 
2.21.0



[PATCH v2 4/6] mm/memory_hotplug: Rename walk_memory_range() and pass start+size instead of pfns

2019-06-20 Thread David Hildenbrand
walk_memory_range() was once used to iterate over sections. Now, it
iterates over memory blocks. Rename the function, fixup the
documentation. Also, pass start+size instead of PFNs, which is what most
callers already have at hand. (we'll rework link_mem_sections() most
probably soon)

Follow-up patches will rework, simplify, and move walk_memory_blocks() to
drivers/base/memory.c.

Note: walk_memory_blocks() only works correctly right now if the
start_pfn is aligned to a section start. This is the case right now,
but we'll generalize the function in a follow up patch so the semantics
match the documentation.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Greg Kroah-Hartman 
Cc: David Hildenbrand 
Cc: Rashmica Gupta 
Cc: Andrew Morton 
Cc: Pavel Tatashin 
Cc: Anshuman Khandual 
Cc: Michael Neuling 
Cc: Thomas Gleixner 
Cc: Oscar Salvador 
Cc: Michal Hocko 
Cc: Wei Yang 
Cc: Juergen Gross 
Cc: Qian Cai 
Cc: Arun KS 
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c | 22 ++---
 drivers/acpi/acpi_memhotplug.c| 19 --
 drivers/base/node.c   |  5 +++--
 include/linux/memory_hotplug.h|  2 +-
 mm/memory_hotplug.c   | 24 ---
 5 files changed, 32 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
b/arch/powerpc/platforms/powernv/memtrace.c
index 5e53c1392d3b..8c82c041afe6 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -70,23 +70,24 @@ static int change_memblock_state(struct memory_block *mem, 
void *arg)
 /* called with device_hotplug_lock held */
 static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
 {
+   const unsigned long start = PFN_PHYS(start_pfn);
+   const unsigned long size = PFN_PHYS(nr_pages);
u64 end_pfn = start_pfn + nr_pages - 1;
 
-   if (walk_memory_range(start_pfn, end_pfn, NULL,
-   check_memblock_online))
+   if (walk_memory_blocks(start, size, NULL, check_memblock_online))
return false;
 
-   walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
- change_memblock_state);
+   walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
+  change_memblock_state);
 
if (offline_pages(start_pfn, nr_pages)) {
-   walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
- change_memblock_state);
+   walk_memory_blocks(start, size, (void *)MEM_ONLINE,
+  change_memblock_state);
return false;
}
 
-   walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
- change_memblock_state);
+   walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
+  change_memblock_state);
 
 
return true;
@@ -242,9 +243,8 @@ static int memtrace_online(void)
 */
if (!memhp_auto_online) {
lock_device_hotplug();
-   walk_memory_range(PFN_DOWN(ent->start),
- PFN_UP(ent->start + ent->size - 1),
- NULL, online_mem_block);
+   walk_memory_blocks(ent->start, ent->size, NULL,
+  online_mem_block);
unlock_device_hotplug();
}
 
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index db013dc21c02..e294f44a7850 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -155,16 +155,6 @@ static int acpi_memory_check_device(struct 
acpi_memory_device *mem_device)
return 0;
 }
 
-static unsigned long acpi_meminfo_start_pfn(struct acpi_memory_info *info)
-{
-   return PFN_DOWN(info->start_addr);
-}
-
-static unsigned long acpi_meminfo_end_pfn(struct acpi_memory_info *info)
-{
-   return PFN_UP(info->start_addr + info->length-1);
-}
-
 static int acpi_bind_memblk(struct memory_block *mem, void *arg)
 {
return acpi_bind_one(>dev, arg);
@@ -173,9 +163,8 @@ static int acpi_bind_memblk(struct memory_block *mem, void 
*arg)
 static int acpi_bind_memory_blocks(struct acpi_memory_info *info,
   struct acpi_device *adev)
 {
-   return walk_memory_range(acpi_meminfo_start_pfn(info),
-acpi_meminfo_end_pfn(info), adev,
-acpi_bind_memblk);
+   return walk_memory_blocks(info->start_addr, info->length, adev,
+ acpi_bind_memblk);
 }
 
 static int acpi_unbind_memblk(struct memory_block *mem, void *arg)
@@ -186,8 +175,8 @@ static int acpi_unbind_memblk(struct memory_block *mem, 
void 

[PATCH v2 5/6] mm/memory_hotplug: Move and simplify walk_memory_blocks()

2019-06-20 Thread David Hildenbrand
Let's move walk_memory_blocks() to the place where memory block logic
resides and simplify it. While at it, add a type for the callback function.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Cc: David Hildenbrand 
Cc: Andrew Morton 
Cc: Stephen Rothwell 
Cc: Pavel Tatashin 
Cc: Andrew Banman 
Cc: "mike.tra...@hpe.com" 
Cc: Oscar Salvador 
Cc: Michal Hocko 
Cc: Wei Yang 
Cc: Arun KS 
Cc: Qian Cai 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c  | 42 ++
 include/linux/memory.h |  3 ++
 include/linux/memory_hotplug.h |  2 --
 mm/memory_hotplug.c| 55 --
 4 files changed, 45 insertions(+), 57 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c54e80fd25a8..0204384b4d1d 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -44,6 +44,11 @@ static inline unsigned long pfn_to_block_id(unsigned long 
pfn)
return base_memory_block_id(pfn_to_section_nr(pfn));
 }
 
+static inline unsigned long phys_to_block_id(unsigned long phys)
+{
+   return pfn_to_block_id(PFN_DOWN(phys));
+}
+
 static int memory_subsys_online(struct device *dev);
 static int memory_subsys_offline(struct device *dev);
 
@@ -851,3 +856,40 @@ int __init memory_dev_init(void)
printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
return ret;
 }
+
+/**
+ * walk_memory_blocks - walk through all present memory blocks overlapped
+ * by the range [start, start + size)
+ *
+ * @start: start address of the memory range
+ * @size: size of the memory range
+ * @arg: argument passed to func
+ * @func: callback for each memory section walked
+ *
+ * This function walks through all present memory blocks overlapped by the
+ * range [start, start + size), calling func on each memory block.
+ *
+ * In case func() returns an error, walking is aborted and the error is
+ * returned.
+ */
+int walk_memory_blocks(unsigned long start, unsigned long size,
+  void *arg, walk_memory_blocks_func_t func)
+{
+   const unsigned long start_block_id = phys_to_block_id(start);
+   const unsigned long end_block_id = phys_to_block_id(start + size - 1);
+   struct memory_block *mem;
+   unsigned long block_id;
+   int ret = 0;
+
+   for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
+   mem = find_memory_block_by_id(block_id, NULL);
+   if (!mem)
+   continue;
+
+   ret = func(mem, arg);
+   put_device(>dev);
+   if (ret)
+   break;
+   }
+   return ret;
+}
diff --git a/include/linux/memory.h b/include/linux/memory.h
index f26a5417ec5d..b3b388775a30 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -119,6 +119,9 @@ extern int memory_isolate_notify(unsigned long val, void 
*v);
 extern struct memory_block *find_memory_block_hinted(struct mem_section *,
struct memory_block *);
 extern struct memory_block *find_memory_block(struct mem_section *);
+typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
+extern int walk_memory_blocks(unsigned long start, unsigned long size,
+ void *arg, walk_memory_blocks_func_t func);
 #define CONFIG_MEM_BLOCK_SIZE  (PAGES_PER_SECTION<= mem->start_section_nr) &&
-   (section_nr <= mem->end_section_nr))
-   continue;
-
-   mem = find_memory_block_hinted(section, mem);
-   if (!mem)
-   continue;
-
-   ret = func(mem, arg);
-   if (ret) {
-   kobject_put(>dev.kobj);
-   return ret;
-   }
-   }
-
-   if (mem)
-   kobject_put(>dev.kobj);
-
-   return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 {
int ret = !is_memblock_offlined(mem);
-- 
2.21.0



[PATCH v2 3/6] mm: Make register_mem_sect_under_node() static

2019-06-20 Thread David Hildenbrand
It is only used internally.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Cc: Andrew Morton 
Cc: Keith Busch 
Cc: Oscar Salvador 
Signed-off-by: David Hildenbrand 
---
 drivers/base/node.c  | 3 ++-
 include/linux/node.h | 7 ---
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 9be88fd05147..e6364e3e3e31 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -752,7 +752,8 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
 }
 
 /* register memory section under specified node if it spans that node */
-int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
+static int register_mem_sect_under_node(struct memory_block *mem_blk,
+void *arg)
 {
int ret, nid = *(int *)arg;
unsigned long pfn, sect_start_pfn, sect_end_pfn;
diff --git a/include/linux/node.h b/include/linux/node.h
index 548c226966a2..4866f32a02d8 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -137,8 +137,6 @@ static inline int register_one_node(int nid)
 extern void unregister_one_node(int nid);
 extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
-extern int register_mem_sect_under_node(struct memory_block *mem_blk,
-   void *arg);
 extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk);
 
 extern int register_memory_node_under_compute_node(unsigned int mem_nid,
@@ -170,11 +168,6 @@ static inline int unregister_cpu_under_node(unsigned int 
cpu, unsigned int nid)
 {
return 0;
 }
-static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
-   void *arg)
-{
-   return 0;
-}
 static inline void unregister_memory_block_under_nodes(struct memory_block 
*mem_blk)
 {
 }
-- 
2.21.0



[PATCH v2 1/6] mm: Section numbers use the type "unsigned long"

2019-06-20 Thread David Hildenbrand
We are using a mixture of "int" and "unsigned long". Let's make this
consistent by using "unsigned long" everywhere. We'll do the same with
memory block ids next.

While at it, turn the "unsigned long i" in removable_show() into an
int - sections_per_block is an int.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Cc: Andrew Morton 
Cc: Vlastimil Babka 
Cc: Michal Hocko 
Cc: Dan Williams 
Cc: Mel Gorman 
Cc: Wei Yang 
Cc: Johannes Weiner 
Cc: Arun KS 
Cc: Pavel Tatashin 
Cc: Oscar Salvador 
Cc: Stephen Rothwell 
Cc: Mike Rapoport 
Cc: Baoquan He 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c  | 27 +--
 include/linux/mmzone.h |  4 ++--
 mm/sparse.c| 12 ++--
 3 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 826dd76f662e..5947b5a5686d 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -34,7 +34,7 @@ static DEFINE_MUTEX(mem_sysfs_mutex);
 
 static int sections_per_block;
 
-static inline int base_memory_block_id(int section_nr)
+static inline int base_memory_block_id(unsigned long section_nr)
 {
return section_nr / sections_per_block;
 }
@@ -131,9 +131,9 @@ static ssize_t phys_index_show(struct device *dev,
 static ssize_t removable_show(struct device *dev, struct device_attribute 
*attr,
  char *buf)
 {
-   unsigned long i, pfn;
-   int ret = 1;
struct memory_block *mem = to_memory_block(dev);
+   unsigned long pfn;
+   int ret = 1, i;
 
if (mem->state != MEM_ONLINE)
goto out;
@@ -691,15 +691,15 @@ static int init_memory_block(struct memory_block 
**memory, int block_id,
return ret;
 }
 
-static int add_memory_block(int base_section_nr)
+static int add_memory_block(unsigned long base_section_nr)
 {
+   int ret, section_count = 0;
struct memory_block *mem;
-   int i, ret, section_count = 0;
+   unsigned long nr;
 
-   for (i = base_section_nr;
-i < base_section_nr + sections_per_block;
-i++)
-   if (present_section_nr(i))
+   for (nr = base_section_nr; nr < base_section_nr + sections_per_block;
+nr++)
+   if (present_section_nr(nr))
section_count++;
 
if (section_count == 0)
@@ -822,10 +822,9 @@ static const struct attribute_group 
*memory_root_attr_groups[] = {
  */
 int __init memory_dev_init(void)
 {
-   unsigned int i;
int ret;
int err;
-   unsigned long block_sz;
+   unsigned long block_sz, nr;
 
ret = subsys_system_register(_subsys, memory_root_attr_groups);
if (ret)
@@ -839,9 +838,9 @@ int __init memory_dev_init(void)
 * during boot and have been initialized
 */
mutex_lock(_sysfs_mutex);
-   for (i = 0; i <= __highest_present_section_nr;
-   i += sections_per_block) {
-   err = add_memory_block(i);
+   for (nr = 0; nr <= __highest_present_section_nr;
+nr += sections_per_block) {
+   err = add_memory_block(nr);
if (!ret)
ret = err;
}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 427b79c39b3c..83b6aae16f13 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1220,7 +1220,7 @@ static inline struct mem_section 
*__nr_to_section(unsigned long nr)
return NULL;
return _section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
-extern int __section_nr(struct mem_section* ms);
+extern unsigned long __section_nr(struct mem_section *ms);
 extern unsigned long usemap_size(void);
 
 /*
@@ -1292,7 +1292,7 @@ static inline struct mem_section 
*__pfn_to_section(unsigned long pfn)
return __nr_to_section(pfn_to_section_nr(pfn));
 }
 
-extern int __highest_present_section_nr;
+extern unsigned long __highest_present_section_nr;
 
 #ifndef CONFIG_HAVE_ARCH_PFN_VALID
 static inline int pfn_valid(unsigned long pfn)
diff --git a/mm/sparse.c b/mm/sparse.c
index 1552c855d62a..e8c57e039be8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -102,7 +102,7 @@ static inline int sparse_index_init(unsigned long 
section_nr, int nid)
 #endif
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
 {
unsigned long root_nr;
struct mem_section *root = NULL;
@@ -121,9 +121,9 @@ int __section_nr(struct mem_section* ms)
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
 }
 #else
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
 {
-   return (int)(ms - mem_section[0]);
+   return (unsigned long)(ms - mem_section[0]);
 }
 #endif
 
@@ -178,10 +178,10 @@ void __meminit mminit_validate_memmodel_limits(unsigned 
long *start_pfn,
  * Keeping track of this gives us an easy way to break out of
  * those loops early.
  

[PATCH v2 0/6] mm: Further memory block device cleanups

2019-06-20 Thread David Hildenbrand
@Andrew: Only patch 1 and 6 changed. The patches are based on the
same state as the previous patches (replace the old ones if possible).

Some further cleanups around memory block devices. Especially, clean up
and simplify walk_memory_range(). Including some other minor cleanups.

Compiled + tested on x86 with DIMMs under QEMU.

v1 -> v2:
- "mm: Section numbers use the type "unsigned long""
-- "unsigned long i" -> "unsigned long nr", in one case -> "int i"
- "drivers/base/memory.c: Get rid of find_memory_block_hinted()"
-- Fix compilation error
-- Get rid of the "hint" parameter completely

David Hildenbrand (6):
  mm: Section numbers use the type "unsigned long"
  drivers/base/memory: Use "unsigned long" for block ids
  mm: Make register_mem_sect_under_node() static
  mm/memory_hotplug: Rename walk_memory_range() and pass start+size
instead of pfns
  mm/memory_hotplug: Move and simplify walk_memory_blocks()
  drivers/base/memory.c: Get rid of find_memory_block_hinted()

 arch/powerpc/platforms/powernv/memtrace.c |  22 ++---
 drivers/acpi/acpi_memhotplug.c|  19 +---
 drivers/base/memory.c | 115 ++
 drivers/base/node.c   |   8 +-
 include/linux/memory.h|   5 +-
 include/linux/memory_hotplug.h|   2 -
 include/linux/mmzone.h|   4 +-
 include/linux/node.h  |   7 --
 mm/memory_hotplug.c   |  57 +--
 mm/sparse.c   |  12 +--
 10 files changed, 105 insertions(+), 146 deletions(-)

-- 
2.21.0



[PATCH v2 2/6] drivers/base/memory: Use "unsigned long" for block ids

2019-06-20 Thread David Hildenbrand
Block ids are just shifted section numbers, so let's also use
"unsigned long" for them, too.

Cc: Greg Kroah-Hartman 
Cc: "Rafael J. Wysocki" 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 5947b5a5686d..c54e80fd25a8 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -34,12 +34,12 @@ static DEFINE_MUTEX(mem_sysfs_mutex);
 
 static int sections_per_block;
 
-static inline int base_memory_block_id(unsigned long section_nr)
+static inline unsigned long base_memory_block_id(unsigned long section_nr)
 {
return section_nr / sections_per_block;
 }
 
-static inline int pfn_to_block_id(unsigned long pfn)
+static inline unsigned long pfn_to_block_id(unsigned long pfn)
 {
return base_memory_block_id(pfn_to_section_nr(pfn));
 }
@@ -587,7 +587,7 @@ int __weak arch_get_memory_phys_device(unsigned long 
start_pfn)
  * A reference for the returned object is held and the reference for the
  * hinted object is released.
  */
-static struct memory_block *find_memory_block_by_id(int block_id,
+static struct memory_block *find_memory_block_by_id(unsigned long block_id,
struct memory_block *hint)
 {
struct device *hintdev = hint ? >dev : NULL;
@@ -604,7 +604,7 @@ static struct memory_block *find_memory_block_by_id(int 
block_id,
 struct memory_block *find_memory_block_hinted(struct mem_section *section,
  struct memory_block *hint)
 {
-   int block_id = base_memory_block_id(__section_nr(section));
+   unsigned long block_id = base_memory_block_id(__section_nr(section));
 
return find_memory_block_by_id(block_id, hint);
 }
@@ -663,8 +663,8 @@ int register_memory(struct memory_block *memory)
return ret;
 }
 
-static int init_memory_block(struct memory_block **memory, int block_id,
-unsigned long state)
+static int init_memory_block(struct memory_block **memory,
+unsigned long block_id, unsigned long state)
 {
struct memory_block *mem;
unsigned long start_pfn;
@@ -729,8 +729,8 @@ static void unregister_memory(struct memory_block *memory)
  */
 int create_memory_block_devices(unsigned long start, unsigned long size)
 {
-   const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
-   int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
+   const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
+   unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
struct memory_block *mem;
unsigned long block_id;
int ret = 0;
@@ -766,10 +766,10 @@ int create_memory_block_devices(unsigned long start, 
unsigned long size)
  */
 void remove_memory_block_devices(unsigned long start, unsigned long size)
 {
-   const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
-   const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
+   const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
+   const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + 
size));
struct memory_block *mem;
-   int block_id;
+   unsigned long block_id;
 
if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
 !IS_ALIGNED(size, memory_block_size_bytes(
-- 
2.21.0



Re: [PATCH v2 43/52] powerpc/64s/exception: machine check early only runs in HV mode

2019-06-20 Thread Nicholas Piggin
Mahesh J Salgaonkar's on June 20, 2019 7:53 pm:
> On 2019-06-20 15:14:50 Thu, Nicholas Piggin wrote:
>> machine_check_common_early and machine_check_handle_early only run in
>> HVMODE. Remove dead code.
> 
> That's not true. For pseries guest with FWNMI enabled hypervisor,
> machine_check_common_early gets called in non-HV mode as well.
> 
>machine_check_fwnmi
>  machine_check_common_early
>machine_check_handle_early
>  machine_check_early
>pseries_machine_check_realmode

Yep, yep I was confused by the earlier patch. So we're only doing the
early machine check path for the FWNMI case?

Thanks,
Nick


Re: [PATCH v2 43/52] powerpc/64s/exception: machine check early only runs in HV mode

2019-06-20 Thread Mahesh J Salgaonkar
On 2019-06-20 15:14:50 Thu, Nicholas Piggin wrote:
> machine_check_common_early and machine_check_handle_early only run in
> HVMODE. Remove dead code.

That's not true. For pseries guest with FWNMI enabled hypervisor,
machine_check_common_early gets called in non-HV mode as well.

   machine_check_fwnmi
 machine_check_common_early
   machine_check_handle_early
 machine_check_early
   pseries_machine_check_realmode

Thanks,
-Mahesh.

> 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/kernel/exceptions-64s.S | 38 +---
>  1 file changed, 6 insertions(+), 32 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> b/arch/powerpc/kernel/exceptions-64s.S
> index b12755a4f884..f3362adc99e6 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -1014,10 +1014,7 @@ TRAMP_REAL_BEGIN(machine_check_common_early)
>   /* Save r9 through r13 from EXMC save area to stack frame. */
>   EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
>   mfmsr   r11 /* get MSR value */
> -BEGIN_FTR_SECTION
> - ori r11,r11,MSR_ME  /* turn on ME bit */
> -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
> - ori r11,r11,MSR_RI  /* turn on RI bit */
> + ori r11,r11,MSR_ME|MSR_RI   /* turn on ME, RI */
>   LOAD_HANDLER(r12, machine_check_handle_early)
>  1:   mtspr   SPRN_SRR0,r12
>   mtspr   SPRN_SRR1,r11
> @@ -1124,11 +1121,8 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
>   bl  machine_check_early
>   std r3,RESULT(r1)   /* Save result */
>   ld  r12,_MSR(r1)
> -BEGIN_FTR_SECTION
> - b   4f
> -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
> 
> -#ifdef   CONFIG_PPC_P7_NAP
> +#ifdef CONFIG_PPC_P7_NAP
>   /*
>* Check if thread was in power saving mode. We come here when any
>* of the following is true:
> @@ -1141,7 +1135,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
>  BEGIN_FTR_SECTION
>   rlwinm. r11,r12,47-31,30,31
>   bne machine_check_idle_common
> -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
> +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
>  #endif
> 
>   /*
> @@ -1150,12 +1144,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | 
> CPU_FTR_ARCH_206)
>*/
>   rldicl. r11,r12,4,63/* See if MC hit while in HV mode. */
>   beq 5f
> -4:   andi.   r11,r12,MSR_PR  /* See if coming from user. */
> + andi.   r11,r12,MSR_PR  /* See if coming from user. */
>   bne 9f  /* continue in V mode if we are. */
> 
>  5:
>  #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> -BEGIN_FTR_SECTION
>   /*
>* We are coming from kernel context. Check if we are coming from
>* guest. if yes, then we can continue. We will fall through
> @@ -1164,7 +1157,6 @@ BEGIN_FTR_SECTION
>   lbz r11,HSTATE_IN_GUEST(r13)
>   cmpwi   r11,0   /* Check if coming from guest */
>   bne 9f  /* continue if we are. */
> -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
>  #endif
>   /*
>* At this point we are not sure about what context we come from.
> @@ -1199,7 +1191,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
>   cmpdi   r3,0/* see if we handled MCE successfully */
> 
>   beq 1b  /* if !handled then panic */
> -BEGIN_FTR_SECTION
> +
>   /*
>* Return from MC interrupt.
>* Queue up the MCE event so that we can log it later, while
> @@ -1208,18 +1200,7 @@ BEGIN_FTR_SECTION
>   bl  machine_check_queue_event
>   MACHINE_CHECK_HANDLER_WINDUP
>   RFI_TO_USER_OR_KERNEL
> -FTR_SECTION_ELSE
> - /*
> -  * pSeries: Return from MC interrupt. Before that stay on emergency
> -  * stack and call machine_check_exception to log the MCE event.
> -  */
> - LOAD_HANDLER(r10,mce_return)
> - mtspr   SPRN_SRR0,r10
> - ld  r10,PACAKMSR(r13)
> - mtspr   SPRN_SRR1,r10
> - RFI_TO_KERNEL
> - b   .
> -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
> +
>  9:
>   /* Deliver the machine check to host kernel in V mode. */
>   MACHINE_CHECK_HANDLER_WINDUP
> @@ -1238,13 +1219,6 @@ EXC_COMMON_BEGIN(unrecover_mce)
>   bl  unrecoverable_exception
>   b   1b
> 
> -EXC_COMMON_BEGIN(mce_return)
> - /* Invoke machine_check_exception to print MCE event and return. */
> - addir3,r1,STACK_FRAME_OVERHEAD
> - bl  machine_check_exception
> - MACHINE_CHECK_HANDLER_WINDUP
> - RFI_TO_KERNEL
> - b   .
> 
>  EXC_REAL_BEGIN(data_access, 0x300, 0x80)
>   EXCEPTION_PROLOG_0 PACA_EXGEN
> -- 
> 2.20.1
> 

-- 
Mahesh J Salgaonkar



Re: [PATCH v2 42/52] powerpc/64s/exception: machine check fwnmi does not trigger when in HV mode

2019-06-20 Thread Nicholas Piggin
Mahesh Jagannath Salgaonkar's on June 20, 2019 7:26 pm:
> On 6/20/19 10:44 AM, Nicholas Piggin wrote:
>> Remove dead code.
>> 
>> Signed-off-by: Nicholas Piggin 
>> ---
>>  arch/powerpc/kernel/exceptions-64s.S | 3 ---
>>  1 file changed, 3 deletions(-)
>> 
>> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
>> b/arch/powerpc/kernel/exceptions-64s.S
>> index 286bd5670d60..b12755a4f884 100644
>> --- a/arch/powerpc/kernel/exceptions-64s.S
>> +++ b/arch/powerpc/kernel/exceptions-64s.S
>> @@ -1040,9 +1040,6 @@ TRAMP_REAL_BEGIN(machine_check_pSeries)
>>  .globl machine_check_fwnmi
>>  machine_check_fwnmi:
>>  EXCEPTION_PROLOG_0 PACA_EXMC
>> -BEGIN_FTR_SECTION
>> -b   machine_check_common_early
>> -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
> 
> Didn't we add that to handle SLB/ERAT errors in real mode for pseries ?
> Are we taking that off ?

Oh that's a stupid mistake, as patch title says I mistook it for
IFSET. Good catch, thanks, that probably messes up a later patch
too, I'll fix.

Thanks,
Nick


Re: [PATCH v2 42/52] powerpc/64s/exception: machine check fwnmi does not trigger when in HV mode

2019-06-20 Thread Mahesh Jagannath Salgaonkar
On 6/20/19 10:44 AM, Nicholas Piggin wrote:
> Remove dead code.
> 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/kernel/exceptions-64s.S | 3 ---
>  1 file changed, 3 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> b/arch/powerpc/kernel/exceptions-64s.S
> index 286bd5670d60..b12755a4f884 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -1040,9 +1040,6 @@ TRAMP_REAL_BEGIN(machine_check_pSeries)
>   .globl machine_check_fwnmi
>  machine_check_fwnmi:
>   EXCEPTION_PROLOG_0 PACA_EXMC
> -BEGIN_FTR_SECTION
> - b   machine_check_common_early
> -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)

Didn't we add that to handle SLB/ERAT errors in real mode for pseries ?
Are we taking that off ?

>  machine_check_pSeries_0:
>   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
>   /*
> 




Re: [PATCH v2] ocxl: Allow contexts to be attached with a NULL mm

2019-06-20 Thread Frederic Barrat




Le 20/06/2019 à 06:12, Alastair D'Silva a écrit :

From: Alastair D'Silva 

If an OpenCAPI context is to be used directly by a kernel driver, there
may not be a suitable mm to use.

The patch makes the mm parameter to ocxl_context_attach optional.

Signed-off-by: Alastair D'Silva 
---


Thanks for the update.
Acked-by: Frederic Barrat 




  arch/powerpc/mm/book3s64/radix_tlb.c |  5 +
  drivers/misc/ocxl/context.c  |  9 ++---
  drivers/misc/ocxl/link.c | 28 
  3 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index bb9835681315..ce8a77fae6a7 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -666,6 +666,11 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
  #define radix__flush_all_mm radix__local_flush_all_mm
  #endif /* CONFIG_SMP */
  
+/*

+ * If kernel TLBIs ever become local rather than global, then
+ * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it
+ * assumes kernel TLBIs are global.
+ */
  void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
  {
_tlbie_pid(0, RIC_FLUSH_ALL);
diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c
index bab9c9364184..994563a078eb 100644
--- a/drivers/misc/ocxl/context.c
+++ b/drivers/misc/ocxl/context.c
@@ -69,6 +69,7 @@ static void xsl_fault_error(void *data, u64 addr, u64 dsisr)
  int ocxl_context_attach(struct ocxl_context *ctx, u64 amr, struct mm_struct 
*mm)
  {
int rc;
+   unsigned long pidr = 0;
  
  	// Locks both status & tidr

mutex_lock(>status_mutex);
@@ -77,9 +78,11 @@ int ocxl_context_attach(struct ocxl_context *ctx, u64 amr, 
struct mm_struct *mm)
goto out;
}
  
-	rc = ocxl_link_add_pe(ctx->afu->fn->link, ctx->pasid,

-   mm->context.id, ctx->tidr, amr, mm,
-   xsl_fault_error, ctx);
+   if (mm)
+   pidr = mm->context.id;
+
+   rc = ocxl_link_add_pe(ctx->afu->fn->link, ctx->pasid, pidr, ctx->tidr,
+ amr, mm, xsl_fault_error, ctx);
if (rc)
goto out;
  
diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c

index cce5b0d64505..58d111afd9f6 100644
--- a/drivers/misc/ocxl/link.c
+++ b/drivers/misc/ocxl/link.c
@@ -224,6 +224,17 @@ static irqreturn_t xsl_fault_handler(int irq, void *data)
ack_irq(spa, ADDRESS_ERROR);
return IRQ_HANDLED;
}
+
+   if (!pe_data->mm) {
+   /*
+* translation fault from a kernel context - an OpenCAPI
+* device tried to access a bad kernel address
+*/
+   rcu_read_unlock();
+   pr_warn("Unresolved OpenCAPI xsl fault in kernel context\n");
+   ack_irq(spa, ADDRESS_ERROR);
+   return IRQ_HANDLED;
+   }
WARN_ON(pe_data->mm->context.id != pid);
  
  	if (mmget_not_zero(pe_data->mm)) {

@@ -523,7 +534,13 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 
pidr, u32 tidr,
pe->amr = cpu_to_be64(amr);
pe->software_state = cpu_to_be32(SPA_PE_VALID);
  
-	mm_context_add_copro(mm);

+   /*
+* For user contexts, register a copro so that TLBIs are seen
+* by the nest MMU. If we have a kernel context, TLBIs are
+* already global.
+*/
+   if (mm)
+   mm_context_add_copro(mm);
/*
 * Barrier is to make sure PE is visible in the SPA before it
 * is used by the device. It also helps with the global TLBI
@@ -546,7 +563,8 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 
pidr, u32 tidr,
 * have a reference on mm_users. Incrementing mm_count solves
 * the problem.
 */
-   mmgrab(mm);
+   if (mm)
+   mmgrab(mm);
trace_ocxl_context_add(current->pid, spa->spa_mem, pasid, pidr, tidr);
  unlock:
mutex_unlock(>spa_lock);
@@ -652,8 +670,10 @@ int ocxl_link_remove_pe(void *link_handle, int pasid)
if (!pe_data) {
WARN(1, "Couldn't find pe data when removing PE\n");
} else {
-   mm_context_remove_copro(pe_data->mm);
-   mmdrop(pe_data->mm);
+   if (pe_data->mm) {
+   mm_context_remove_copro(pe_data->mm);
+   mmdrop(pe_data->mm);
+   }
kfree_rcu(pe_data, rcu);
}
  unlock:





[PATCH v4 6/6] mm/nvdimm: Fix endian conversion issues 

2019-06-20 Thread Aneesh Kumar K.V
nd_label->dpa issue was observed when trying to enable the namespace created
with little-endian kernel on a big-endian kernel. That made me run
`sparse` on the rest of the code and other changes are the result of that.

Fixes: d9b83c756953 ("libnvdimm, btt: rework error clearing")
Fixes: 9dedc73a4658 ("libnvdimm/btt: Fix LBA masking during 'free list' 
population")

Reviewed-by: Vishal Verma 
Signed-off-by: Aneesh Kumar K.V 
---
 drivers/nvdimm/btt.c| 8 
 drivers/nvdimm/namespace_devs.c | 7 ---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index a8d56887ec88..3e9f45aec8d1 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -392,9 +392,9 @@ static int btt_flog_write(struct arena_info *arena, u32 
lane, u32 sub,
arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
if (++(arena->freelist[lane].seq) == 4)
arena->freelist[lane].seq = 1;
-   if (ent_e_flag(ent->old_map))
+   if (ent_e_flag(le32_to_cpu(ent->old_map)))
arena->freelist[lane].has_err = 1;
-   arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map));
+   arena->freelist[lane].block = ent_lba(le32_to_cpu(ent->old_map));
 
return ret;
 }
@@ -560,8 +560,8 @@ static int btt_freelist_init(struct arena_info *arena)
 * FIXME: if error clearing fails during init, we want to make
 * the BTT read-only
 */
-   if (ent_e_flag(log_new.old_map) &&
-   !ent_normal(log_new.old_map)) {
+   if (ent_e_flag(le32_to_cpu(log_new.old_map)) &&
+   !ent_normal(le32_to_cpu(log_new.old_map))) {
arena->freelist[i].has_err = 1;
ret = arena_clear_freelist_error(arena, i);
if (ret)
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 007027202542..839da9e43572 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1987,7 +1987,7 @@ static struct device *create_namespace_pmem(struct 
nd_region *nd_region,
nd_mapping = _region->mapping[i];
label_ent = list_first_entry_or_null(_mapping->labels,
typeof(*label_ent), list);
-   label0 = label_ent ? label_ent->label : 0;
+   label0 = label_ent ? label_ent->label : NULL;
 
if (!label0) {
WARN_ON(1);
@@ -2322,8 +2322,9 @@ static struct device **scan_labels(struct nd_region 
*nd_region)
continue;
 
/* skip labels that describe extents outside of the region */
-   if (nd_label->dpa < nd_mapping->start || nd_label->dpa > 
map_end)
-   continue;
+   if (__le64_to_cpu(nd_label->dpa) < nd_mapping->start ||
+   __le64_to_cpu(nd_label->dpa) > map_end)
+   continue;
 
i = add_namespace_resource(nd_region, nd_label, devs, count);
if (i < 0)
-- 
2.21.0



[PATCH v4 5/6] mm/nvdimm: Use correct alignment when looking at first pfn from a region

2019-06-20 Thread Aneesh Kumar K.V
vmem_altmap_offset() adjusts the section-aligned base_pfn offset,
so we need to make sure we account for the same when computing base_pfn.

ie, for altmap_valid case, our pfn_first should be:

pfn_first = altmap->base_pfn + vmem_altmap_offset(altmap);

Signed-off-by: Aneesh Kumar K.V 
---
 kernel/memremap.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/memremap.c b/kernel/memremap.c
index a0e5f6b91b04..63800128844b 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -58,9 +58,11 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap)
struct vmem_altmap *altmap = >altmap;
unsigned long pfn;
 
-   pfn = PHYS_PFN(res->start);
-   if (pgmap->altmap_valid)
-   pfn += vmem_altmap_offset(altmap);
+   if (pgmap->altmap_valid) {
+   pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+   } else
+   pfn = PHYS_PFN(res->start);
+
return pfn;
 }
 
-- 
2.21.0



[PATCH v4 2/6] mm/nvdimm: Add page size and struct page size to pfn superblock

2019-06-20 Thread Aneesh Kumar K.V
This is needed so that we don't wrongly initialize a namespace
which doesn't have enough space reserved for holding struct pages
with the current kernel.

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/nvdimm/pfn.h  |  5 -
 drivers/nvdimm/pfn_devs.c | 27 ++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index 7381673b7b70..acb19517f678 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -29,7 +29,10 @@ struct nd_pfn_sb {
/* minor-version-2 record the base alignment of the mapping */
__le32 align;
/* minor-version-3 guarantee the padding and flags are zero */
-   u8 padding[4000];
+   /* minor-version-4 record the page size and struct page size */
+   __le32 page_size;
+   __le16 page_struct_size;
+   u8 padding[3994];
__le64 checksum;
 };
 
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 2537aa338bd0..cd722de0ae03 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -460,6 +460,15 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
if (__le16_to_cpu(pfn_sb->version_minor) < 2)
pfn_sb->align = 0;
 
+   if (__le16_to_cpu(pfn_sb->version_minor) < 4) {
+   /*
+* For a large part we use PAGE_SIZE. But we
+* do have some accounting code using SZ_4K.
+*/
+   pfn_sb->page_struct_size = cpu_to_le16(64);
+   pfn_sb->page_size = cpu_to_le32(PAGE_SIZE);
+   }
+
switch (le32_to_cpu(pfn_sb->mode)) {
case PFN_MODE_RAM:
case PFN_MODE_PMEM:
@@ -475,6 +484,20 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
align = 1UL << ilog2(offset);
mode = le32_to_cpu(pfn_sb->mode);
 
+   if (le32_to_cpu(pfn_sb->page_size) != PAGE_SIZE) {
+   dev_err(_pfn->dev,
+   "init failed, page size mismatch %d\n",
+   le32_to_cpu(pfn_sb->page_size));
+   return -EOPNOTSUPP;
+   }
+
+   if (le16_to_cpu(pfn_sb->page_struct_size) != sizeof(struct page)) {
+   dev_err(_pfn->dev,
+   "init failed, struct page size mismatch %d\n",
+   le16_to_cpu(pfn_sb->page_struct_size));
+   return -EOPNOTSUPP;
+   }
+
if (!nd_pfn->uuid) {
/*
 * When probing a namepace via nd_pfn_probe() the uuid
@@ -723,8 +746,10 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(>dev), 16);
pfn_sb->version_major = cpu_to_le16(1);
-   pfn_sb->version_minor = cpu_to_le16(3);
+   pfn_sb->version_minor = cpu_to_le16(4);
pfn_sb->align = cpu_to_le32(nd_pfn->align);
+   pfn_sb->page_struct_size = cpu_to_le16(sizeof(struct page));
+   pfn_sb->page_size = cpu_to_le32(PAGE_SIZE);
checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
pfn_sb->checksum = cpu_to_le64(checksum);
 
-- 
2.21.0



[PATCH v4 3/6] mm/nvdimm: Use correct #defines instead of open coding

2019-06-20 Thread Aneesh Kumar K.V
Use PAGE_SIZE instead of SZ_4K and sizeof(struct page) instead of 64.
If we have a kernel built with different struct page size the previous
patch should handle marking the namespace disabled.

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/nvdimm/label.c  | 2 +-
 drivers/nvdimm/namespace_devs.c | 6 +++---
 drivers/nvdimm/pfn_devs.c   | 3 ++-
 drivers/nvdimm/region_devs.c| 8 
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 73e197babc2f..7ee037063be7 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -355,7 +355,7 @@ static bool slot_valid(struct nvdimm_drvdata *ndd,
 
/* check that DPA allocations are page aligned */
if ((__le64_to_cpu(nd_label->dpa)
-   | __le64_to_cpu(nd_label->rawsize)) % SZ_4K)
+   | __le64_to_cpu(nd_label->rawsize)) % PAGE_SIZE)
return false;
 
/* check checksum */
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index a434a5964cb9..007027202542 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1006,10 +1006,10 @@ static ssize_t __size_store(struct device *dev, 
unsigned long long val)
return -ENXIO;
}
 
-   div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, );
+   div_u64_rem(val, PAGE_SIZE * nd_region->ndr_mappings, );
if (remainder) {
-   dev_dbg(dev, "%llu is not %dK aligned\n", val,
-   (SZ_4K * nd_region->ndr_mappings) / SZ_1K);
+   dev_dbg(dev, "%llu is not %ldK aligned\n", val,
+   (PAGE_SIZE * nd_region->ndr_mappings) / SZ_1K);
return -EINVAL;
}
 
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index cd722de0ae03..9410d2692913 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -726,7 +726,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 * when populating the vmemmap. This *should* be equal to
 * PMD_SIZE for most architectures.
 */
-   offset = ALIGN(start + SZ_8K + 64 * npfns, align) - start;
+   offset = ALIGN(start + SZ_8K + sizeof(struct page) * npfns,
+  align) - start;
} else if (nd_pfn->mode == PFN_MODE_RAM)
offset = ALIGN(start + SZ_8K, align) - start;
else
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 1e74a1c9fdac..b9992499a035 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -997,10 +997,10 @@ static struct nd_region *nd_region_create(struct 
nvdimm_bus *nvdimm_bus,
struct nd_mapping_desc *mapping = _desc->mapping[i];
struct nvdimm *nvdimm = mapping->nvdimm;
 
-   if ((mapping->start | mapping->size) % SZ_4K) {
-   dev_err(_bus->dev, "%s: %s mapping%d is not 4K 
aligned\n",
-   caller, dev_name(>dev), i);
-
+   if ((mapping->start | mapping->size) % PAGE_SIZE) {
+   dev_err(_bus->dev,
+   "%s: %s mapping%d is not %ld aligned\n",
+   caller, dev_name(>dev), i, PAGE_SIZE);
return NULL;
}
 
-- 
2.21.0



[PATCH v4 4/6] mm/nvdimm: Pick the right alignment default when creating dax devices

2019-06-20 Thread Aneesh Kumar K.V
Allow the arch to provide the supported alignments and use hugepage alignment only
if we support hugepage. Right now we depend on compile-time configs, whereas this
patch switches this to runtime discovery.

Architectures like ppc64 can have THP enabled in code, but then can have
hugepage size disabled by the hypervisor. This allows us to create dax devices
with PAGE_SIZE alignment in this case.

Existing dax namespace with alignment larger than PAGE_SIZE will fail to
initialize in this specific case. We still allow fsdax namespace initialization.

With respect to identifying whether to enable hugepage fault for a dax device,
if THP is enabled during compile, we default to taking hugepage fault and in dax
fault handler if we find the fault size > alignment we retry with PAGE_SIZE
fault size.

This also addresses the below failure scenario on ppc64

ndctl create-namespace --mode=devdax  | grep align
 "align":16777216,
 "align":16777216

cat /sys/devices/ndbus0/region0/dax0.0/supported_alignments
 65536 16777216

daxio.static-debug  -z -o /dev/dax0.0
  Bus error (core dumped)

  $ dmesg | tail
   lpar: Failed hash pte insert with error -4
   hash-mmu: mm: Hashing failure ! EA=0x7fff1700 access=0x8006 
current=daxio
   hash-mmu: trap=0x300 vsid=0x22cb7a3 ssize=1 base psize=2 psize 10 
pte=0xc00501002b86
   daxio[3860]: bus error (7) at 7fff1700 nip 7fff973c007c lr 7fff973bff34 
code 2 in libpmem.so.1.0.0[7fff973b+2]
   daxio[3860]: code: 792945e4 7d494b78 e95f0098 7d494b78 f93f00a0 4800012c 
e93f0088 f93f0120
   daxio[3860]: code: e93f00a0 f93f0128 e93f0120 e95f0128  e93f0088 
39290008 f93f0110

The failure was due to guest kernel using wrong page size.

The namespaces created with 16M alignment will appear as below on a config with
16M page size disabled.

$ ndctl list -Ni
[
  {
"dev":"namespace0.1",
"mode":"fsdax",
"map":"dev",
"size":5351931904,
"uuid":"fc6e9667-461a-4718-82b4-69b24570bddb",
"align":16777216,
"blockdev":"pmem0.1",
"supported_alignments":[
  65536
]
  },
  {
"dev":"namespace0.0",
"mode":"fsdax",< devdax 16M alignment marked disabled.
"map":"mem",
"size":5368709120,
"uuid":"a4bdf81a-f2ee-4bc6-91db-7b87eddd0484",
"state":"disabled"
  }
]

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/libnvdimm.h |  9 
 arch/powerpc/mm/Makefile |  1 +
 arch/powerpc/mm/nvdimm.c | 34 
 arch/x86/include/asm/libnvdimm.h | 19 
 drivers/nvdimm/nd.h  |  6 -
 drivers/nvdimm/pfn_devs.c| 32 +-
 include/linux/huge_mm.h  |  7 +-
 7 files changed, 100 insertions(+), 8 deletions(-)
 create mode 100644 arch/powerpc/include/asm/libnvdimm.h
 create mode 100644 arch/powerpc/mm/nvdimm.c
 create mode 100644 arch/x86/include/asm/libnvdimm.h

diff --git a/arch/powerpc/include/asm/libnvdimm.h 
b/arch/powerpc/include/asm/libnvdimm.h
new file mode 100644
index ..d35fd7f48603
--- /dev/null
+++ b/arch/powerpc/include/asm/libnvdimm.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_LIBNVDIMM_H
+#define _ASM_POWERPC_LIBNVDIMM_H
+
+#define nd_pfn_supported_alignments nd_pfn_supported_alignments
+extern unsigned long *nd_pfn_supported_alignments(void);
+extern unsigned long nd_pfn_default_alignment(void);
+
+#endif
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0f499db315d6..42e4a399ba5d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_HIGHMEM) += highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)   += copro_fault.o
 obj-$(CONFIG_PPC_PTDUMP)   += ptdump/
 obj-$(CONFIG_KASAN)+= kasan/
+obj-$(CONFIG_NVDIMM_PFN)   += nvdimm.o
diff --git a/arch/powerpc/mm/nvdimm.c b/arch/powerpc/mm/nvdimm.c
new file mode 100644
index ..a29a4510715e
--- /dev/null
+++ b/arch/powerpc/mm/nvdimm.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+
+#include 
+/*
+ * We support only pte and pmd mappings for now.
+ */
+const unsigned long *nd_pfn_supported_alignments(void)
+{
+   static unsigned long supported_alignments[3];
+
+   supported_alignments[0] = PAGE_SIZE;
+
+   if (has_transparent_hugepage())
+   supported_alignments[1] = HPAGE_PMD_SIZE;
+   else
+   supported_alignments[1] = 0;
+
+   supported_alignments[2] = 0;
+   return supported_alignments;
+}
+
+/*
+ * Use pmd mapping if supported as default alignment
+ */
+unsigned long nd_pfn_default_alignment(void)
+{
+
+   if (has_transparent_hugepage())
+   return HPAGE_PMD_SIZE;
+   return PAGE_SIZE;
+}
diff --git a/arch/x86/include/asm/libnvdimm.h b/arch/x86/include/asm/libnvdimm.h
new file mode 100644
index ..3d5361db9164
--- /dev/null
+++ b/arch/x86/include/asm/libnvdimm.h
@@ -0,0 

[PATCH v4 1/6] nvdimm: Consider probe return -EOPNOTSUPP as success

2019-06-20 Thread Aneesh Kumar K.V
This patch adds -EOPNOTSUPP as a return value from the probe callback to
indicate we were not able to initialize a namespace due to a pfn superblock
feature/version mismatch. We want to consider this a probe success so that
we can create a new namespace seed and thereby avoid marking the failed
namespace as the seed namespace.

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/nvdimm/bus.c |  4 ++--
 drivers/nvdimm/nd-core.h |  3 ++-
 drivers/nvdimm/pmem.c| 26 ++
 drivers/nvdimm/region_devs.c | 19 +++
 4 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 2dca3034fee0..3b8ffb3966ab 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -92,8 +92,8 @@ static int nvdimm_bus_probe(struct device *dev)
 
nvdimm_bus_probe_start(nvdimm_bus);
rc = nd_drv->probe(dev);
-   if (rc == 0)
-   nd_region_probe_success(nvdimm_bus, dev);
+   if (rc == 0 || rc == -EOPNOTSUPP)
+   nd_region_probe_success(nvdimm_bus, dev, rc);
else
nd_region_disable(nvdimm_bus, dev);
nvdimm_bus_probe_end(nvdimm_bus);
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 391e88de3a29..4e6ffa0d89bb 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -126,7 +126,8 @@ int __init nvdimm_bus_init(void);
 void nvdimm_bus_exit(void);
 void nvdimm_devs_exit(void);
 void nd_region_devs_exit(void);
-void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device 
*dev);
+void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus,
+struct device *dev, int ret);
 struct nd_region;
 void nd_region_create_ns_seed(struct nd_region *nd_region);
 void nd_region_create_btt_seed(struct nd_region *nd_region);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 24d7fe7c74ed..422b11c01301 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -497,6 +497,7 @@ static int pmem_attach_disk(struct device *dev,
 
 static int nd_pmem_probe(struct device *dev)
 {
+   int ret;
struct nd_namespace_common *ndns;
 
ndns = nvdimm_namespace_common_probe(dev);
@@ -512,12 +513,29 @@ static int nd_pmem_probe(struct device *dev)
if (is_nd_pfn(dev))
return pmem_attach_disk(dev, ndns);
 
-   /* if we find a valid info-block we'll come back as that personality */
-   if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
-   || nd_dax_probe(dev, ndns) == 0)
+   ret = nd_btt_probe(dev, ndns);
+   if (ret == 0)
return -ENXIO;
+   else if (ret == -EOPNOTSUPP)
+   return ret;
 
-   /* ...otherwise we're just a raw pmem device */
+   ret = nd_pfn_probe(dev, ndns);
+   if (ret == 0)
+   return -ENXIO;
+   else if (ret == -EOPNOTSUPP)
+   return ret;
+
+   ret = nd_dax_probe(dev, ndns);
+   if (ret == 0)
+   return -ENXIO;
+   else if (ret == -EOPNOTSUPP)
+   return ret;
+   /*
+* We have two failure conditions here, there is no
+* info reserver block or we found a valid info reserve block
+* but failed to initialize the pfn superblock.
+* Don't create a raw pmem disk for the second case.
+*/
return pmem_attach_disk(dev, ndns);
 }
 
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 4fed9ce9c2fe..1e74a1c9fdac 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -715,7 +715,7 @@ void nd_mapping_free_labels(struct nd_mapping *nd_mapping)
  * disable the region.
  */
 static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
-   struct device *dev, bool probe)
+  struct device *dev, bool probe, int 
ret)
 {
struct nd_region *nd_region;
 
@@ -745,6 +745,16 @@ static void nd_region_notify_driver_action(struct 
nvdimm_bus *nvdimm_bus,
nd_region_create_ns_seed(nd_region);
nvdimm_bus_unlock(dev);
}
+
+   if (dev->parent && is_nd_region(dev->parent) &&
+   !probe && (ret == -EOPNOTSUPP)) {
+   nd_region = to_nd_region(dev->parent);
+   nvdimm_bus_lock(dev);
+   if (nd_region->ns_seed == dev)
+   nd_region_create_ns_seed(nd_region);
+   nvdimm_bus_unlock(dev);
+   }
+
if (is_nd_btt(dev) && probe) {
struct nd_btt *nd_btt = to_nd_btt(dev);
 
@@ -780,14 +790,15 @@ static void nd_region_notify_driver_action(struct 
nvdimm_bus *nvdimm_bus,
}
 }
 
-void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
+void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus,
+struct device *dev, int ret)
 {
-   

[PATCH v4 0/6] Fixes related namespace alignment/page size/big endian

2019-06-20 Thread Aneesh Kumar K.V
This series handles configs where hugepage support is not enabled by default.
Also, we update some of the information messages to make sure we use PAGE_SIZE
instead of SZ_4K. We now store the page size and struct page size in pfn_sb and do an
extra check before enabling the namespace. There is also an endianness fix.

The patch series is on top of subsection v10 patchset

http://lore.kernel.org/linux-mm/156092349300.979959.17603710711957735135.st...@dwillia2-desk3.amr.corp.intel.com

Changes from V3:
* Dropped the change related PFN_MIN_VERSION
* for pfn_sb minor version < 4, we default page_size to PAGE_SIZE instead of 
SZ_4k.

Aneesh Kumar K.V (6):
  nvdimm: Consider probe return -EOPNOTSUPP as success
  mm/nvdimm: Add page size and struct page size to pfn superblock
  mm/nvdimm: Use correct #defines instead of open coding
  mm/nvdimm: Pick the right alignment default when creating dax devices
  mm/nvdimm: Use correct alignment when looking at first pfn from a
region
  mm/nvdimm: Fix endian conversion issues 

 arch/powerpc/include/asm/libnvdimm.h |  9 
 arch/powerpc/mm/Makefile |  1 +
 arch/powerpc/mm/nvdimm.c | 34 +++
 arch/x86/include/asm/libnvdimm.h | 19 +
 drivers/nvdimm/btt.c |  8 ++--
 drivers/nvdimm/bus.c |  4 +-
 drivers/nvdimm/label.c   |  2 +-
 drivers/nvdimm/namespace_devs.c  | 13 +++---
 drivers/nvdimm/nd-core.h |  3 +-
 drivers/nvdimm/nd.h  |  6 ---
 drivers/nvdimm/pfn.h |  5 ++-
 drivers/nvdimm/pfn_devs.c| 62 ++--
 drivers/nvdimm/pmem.c| 26 ++--
 drivers/nvdimm/region_devs.c | 27 
 include/linux/huge_mm.h  |  7 +++-
 kernel/memremap.c|  8 ++--
 16 files changed, 194 insertions(+), 40 deletions(-)
 create mode 100644 arch/powerpc/include/asm/libnvdimm.h
 create mode 100644 arch/powerpc/mm/nvdimm.c
 create mode 100644 arch/x86/include/asm/libnvdimm.h

-- 
2.21.0



Re: [PATCH 3/3] KVM: PPC: Book3S HV: Clear pending decr exceptions on nested guest entry

2019-06-20 Thread Cédric Le Goater
On 20/06/2019 09:57, Laurent Vivier wrote:
> On 20/06/2019 03:46, Suraj Jitindar Singh wrote:
>> If we enter an L1 guest with a pending decrementer exception then this
>> is cleared on guest exit if the guest has writtien a positive value into
>> the decrementer (indicating that it handled the decrementer exception)
>> since there is no other way to detect that the guest has handled the
>> pending exception and that it should be dequeued. In the event that the
>> L1 guest tries to run a nested (L2) guest immediately after this and the
>> L2 guest decrementer is negative (which is loaded by L1 before making
>> the H_ENTER_NESTED hcall), then the pending decrementer exception
>> isn't cleared and the L2 entry is blocked since L1 has a pending
>> exception, even though L1 may have already handled the exception and
>> written a positive value for it's decrementer. This results in a loop of
>> L1 trying to enter the L2 guest and L0 blocking the entry since L1 has
>> an interrupt pending with the outcome being that L2 never gets to run
>> and hangs.
>>
>> Fix this by clearing any pending decrementer exceptions when L1 makes
>> the H_ENTER_NESTED hcall since it won't do this if it's decrementer has
>> gone negative, and anyway it's decrementer has been communicated to L0
>> in the hdec_expires field and L0 will return control to L1 when this
>> goes negative by delivering an H_DECREMENTER exception.
>>
>> Fixes: 95a6432ce903 "KVM: PPC: Book3S HV: Streamlined guest entry/exit path 
>> on P9 for radix guests"
>>
>> Signed-off-by: Suraj Jitindar Singh 
>> ---
>>  arch/powerpc/kvm/book3s_hv.c | 11 +--
>>  1 file changed, 9 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>> index 719fd2529eec..4a5eb29b952f 100644
>> --- a/arch/powerpc/kvm/book3s_hv.c
>> +++ b/arch/powerpc/kvm/book3s_hv.c
>> @@ -4128,8 +4128,15 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
>>  
>>  preempt_enable();
>>  
>> -/* cancel pending decrementer exception if DEC is now positive */
>> -if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
>> +/*
>> + * cancel pending decrementer exception if DEC is now positive, or if
>> + * entering a nested guest in which case the decrementer is now owned
>> + * by L2 and the L1 decrementer is provided in hdec_expires
>> + */
>> +if (kvmppc_core_pending_dec(vcpu) &&
>> +((get_tb() < vcpu->arch.dec_expires) ||
>> + (trap == BOOK3S_INTERRUPT_SYSCALL &&
>> +  kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
>>  kvmppc_core_dequeue_dec(vcpu);
>>  
>>  trace_kvm_guest_exit(vcpu);
>>
> 
> Patches 2 and 3: tested I can boot and run an L2 nested guest with qemu
> v4.0.0 and caps-large-decr=on in the case we have had a hang previously.
> 
> Tested-by: Laurent Vivier 

You beat me to it. All works fine on L0, L1, L2.

  Tested-by: Cédric Le Goater 

With a QEMU-4.1. In this configuration, L2 runs with the XIVE (emulated) 
interrupt mode by default now (kernel_irqchip=allowed, ic-mode=dual).

Thanks,

C.




Re: [PATCH v12 00/31] Speculative page faults

2019-06-20 Thread Haiyan Song
Hi Laurent,

I downloaded your script and run it on Intel 2s skylake platform with spf-v12 
patch
serials.

Here attached the output results of this script.

The following comparison result is statistics from the script outputs.

a). Enable THP
SPF_0  change   SPF_1
will-it-scale.page_fault2.per_thread_ops2664190.8  -11.7%   
2353637.6  
will-it-scale.page_fault3.per_thread_ops4480027.2  -14.7%   
3819331.9 


b). Disable THP
SPF_0   change  SPF_1
will-it-scale.page_fault2.per_thread_ops2653260.7   -10%
2385165.8
will-it-scale.page_fault3.per_thread_ops4436330.1   -12.4%  
3886734.2 


Thanks,
Haiyan Song


On Fri, Jun 14, 2019 at 10:44:47AM +0200, Laurent Dufour wrote:
> Le 14/06/2019 à 10:37, Laurent Dufour a écrit :
> > Please find attached the script I run to get these numbers.
> > This would be nice if you could give it a try on your victim node and share 
> > the result.
> 
> Sounds that the Intel mail fitering system doesn't like the attached shell 
> script.
> Please find it there: 
> https://gist.github.com/ldu4/a5cc1a93f293108ea387d43d5d5e7f44
> 
> Thanks,
> Laurent.
> 
 THP always
 SPF 0
average:2628818
average:2732209
average:2728392
average:2550695
average:2689873
average:2691963
average:2627612
average:2558295
average:2707877
average:2726174
 SPF 1
average:2426260
average:2145674
average:2117769
average:2292502
average:2350403
average:2483327
average:2467324
average:2335393
average:2437859
average:2479865
 THP never
 SPF 0
average:2712575
average:2711447
average:2672362
average:2701981
average:2668073
average:2579296
average:2662048
average:2637422
average:2579143
average:2608260
 SPF 1
average:2348782
average:2203349
average:2312960
average:2402995
average:2318914
average:2543129
average:2390337
average:2490178
average:2416798
average:2424216
 THP always
 SPF 0
average:4370143
average:4245754
average:4678884
average:4665759
average:4665809
average:4639132
average:4210755
average:4330552
average:4290469
average:4703015
 SPF 1
average:3810608
average:3918890
average:3758003
average:3965024
average:3578151
average:3822748
average:3687293
average:3998701
average:3915771
average:3738130
 THP never
 SPF 0
average:4505598
average:4672023
average:4701787
average:4355885
average:4338397
average:4446350
average:4360811
average:4653767
average:4016352
average:4312331
 SPF 1
average:3685383
average:4029413
average:4051615
average:3747588
average:4058557
average:4042340
average:3971295
average:3752943
average:3750626
average:3777582


Re: [PATCH v2] crypto: nx: no need to check return value of debugfs_create functions

2019-06-20 Thread Herbert Xu
Greg Kroah-Hartman  wrote:
> When calling debugfs functions, there is no need to ever check the
> return value.  The function can work or not, but the code logic should
> never do something different based on this.
> 
> Also, there is no need to store the individual debugfs file names,
> especially as the whole directiry is deleted at once, so remove the
> unneeded structure entirely.
> 
> Cc: "Breno Leitão" 
> Cc: Nayna Jain 
> Cc: Paulo Flabiano Smorigo 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: Herbert Xu 
> Cc: "David S. Miller" 
> Cc: linux-cry...@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Greg Kroah-Hartman 
> ---
> v2: fixed build error found by kbuild
> 
> drivers/crypto/nx/nx.c |  4 +-
> drivers/crypto/nx/nx.h | 12 +-
> drivers/crypto/nx/nx_debugfs.c | 71 +++---
> 3 files changed, 26 insertions(+), 61 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] crypto: talitos - fix max key size for sha384 and sha512

2019-06-20 Thread Herbert Xu
On Wed, Jun 12, 2019 at 05:49:50AM +, Christophe Leroy wrote:
> Below commit came with a typo in the CONFIG_ symbol, leading
> to a permanently reduced max key size regarless of the driver
> capabilities.
> 
> Reported-by: Horia Geantă 
> Fixes: b8fbdc2bc4e7 ("crypto: talitos - reduce max key size for SEC1")
> Signed-off-by: Christophe Leroy 
> ---
>  drivers/crypto/talitos.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH] crypto: vmx - Document CTR mode counter width quirks

2019-06-20 Thread Herbert Xu
On Tue, Jun 11, 2019 at 11:54:31AM +1000, Daniel Axtens wrote:
> The CTR code comes from OpenSSL, where it does a 32-bit counter.
> The kernel has a 128-bit counter. This difference has lead to
> issues.
> 
> Document it.
> 
> Signed-off-by: Daniel Axtens 
> ---
>  drivers/crypto/vmx/aesp8-ppc.pl | 22 --
>  1 file changed, 20 insertions(+), 2 deletions(-)

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 3/3] KVM: PPC: Book3S HV: Clear pending decr exceptions on nested guest entry

2019-06-20 Thread Laurent Vivier
On 20/06/2019 03:46, Suraj Jitindar Singh wrote:
> If we enter an L1 guest with a pending decrementer exception then this
> is cleared on guest exit if the guest has written a positive value into
> the decrementer (indicating that it handled the decrementer exception)
> since there is no other way to detect that the guest has handled the
> pending exception and that it should be dequeued. In the event that the
> L1 guest tries to run a nested (L2) guest immediately after this and the
> L2 guest decrementer is negative (which is loaded by L1 before making
> the H_ENTER_NESTED hcall), then the pending decrementer exception
> isn't cleared and the L2 entry is blocked since L1 has a pending
> exception, even though L1 may have already handled the exception and
> written a positive value for it's decrementer. This results in a loop of
> L1 trying to enter the L2 guest and L0 blocking the entry since L1 has
> an interrupt pending with the outcome being that L2 never gets to run
> and hangs.
> 
> Fix this by clearing any pending decrementer exceptions when L1 makes
> the H_ENTER_NESTED hcall since it won't do this if it's decrementer has
> gone negative, and anyway it's decrementer has been communicated to L0
> in the hdec_expires field and L0 will return control to L1 when this
> goes negative by delivering an H_DECREMENTER exception.
> 
> Fixes: 95a6432ce903 "KVM: PPC: Book3S HV: Streamlined guest entry/exit path 
> on P9 for radix guests"
> 
> Signed-off-by: Suraj Jitindar Singh 
> ---
>  arch/powerpc/kvm/book3s_hv.c | 11 +--
>  1 file changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 719fd2529eec..4a5eb29b952f 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4128,8 +4128,15 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
>  
>   preempt_enable();
>  
> - /* cancel pending decrementer exception if DEC is now positive */
> - if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
> + /*
> +  * cancel pending decrementer exception if DEC is now positive, or if
> +  * entering a nested guest in which case the decrementer is now owned
> +  * by L2 and the L1 decrementer is provided in hdec_expires
> +  */
> + if (kvmppc_core_pending_dec(vcpu) &&
> + ((get_tb() < vcpu->arch.dec_expires) ||
> +  (trap == BOOK3S_INTERRUPT_SYSCALL &&
> +   kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
>   kvmppc_core_dequeue_dec(vcpu);
>  
>   trace_kvm_guest_exit(vcpu);
> 

Patches 2 and 3: tested I can boot and run an L2 nested guest with qemu
v4.0.0 and caps-large-decr=on in the case we have had a hang previously.

Tested-by: Laurent Vivier 



Re: [PATCH 2/3] KVM: PPC: Book3S HV: Signed extend decrementer value if not using large decr

2019-06-20 Thread Laurent Vivier
On 20/06/2019 03:46, Suraj Jitindar Singh wrote:
> On POWER9 the decrementer can operate in large decrementer mode where
> the decrementer is 56 bits and signed extended to 64 bits. When not
> operating in this mode the decrementer behaves as a 32 bit decrementer
> which is NOT signed extended (as on POWER8).
> 
> Currently when reading a guest decrementer value we don't take into
> account whether the large decrementer is enabled or not, and this means
> the value will be incorrect when the guest is not using the large
> decrementer. Fix this by sign extending the value read when the guest
> isn't using the large decrementer.
> 
> Fixes: 95a6432ce903 "KVM: PPC: Book3S HV: Streamlined guest entry/exit path 
> on P9 for radix guests"
> 
> Signed-off-by: Suraj Jitindar Singh 
> ---
>  arch/powerpc/kvm/book3s_hv.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index d3684509da35..719fd2529eec 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3607,6 +3607,8 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 
> time_limit,
>  
>   vcpu->arch.slb_max = 0;
>   dec = mfspr(SPRN_DEC);
> + if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
> + dec = (s32) dec;
>   tb = mftb();
>   vcpu->arch.dec_expires = dec + tb;
>   vcpu->cpu = -1;
> 

Patches 2 and 3: tested I can boot and run an L2 nested guest with qemu
v4.0.0 and caps-large-decr=on in the case we have had a hang previously.

Tested-by: Laurent Vivier 


Re: [PATCH 3/4] powerpc/powernv: remove dead NPU DMA code

2019-06-20 Thread Alexey Kardashevskiy



On 20/06/2019 16:21, Christoph Hellwig wrote:
> On Thu, Jun 20, 2019 at 04:20:08PM +1000, Alexey Kardashevskiy wrote:
>>
>>
>> On 20/06/2019 16:03, Christoph Hellwig wrote:
>>> Hi Linus,
>>>
>>> this goes back to the discussion at last years kernel summit, where
>>> we had the discussion on removing code never used by any in-kernel
>>> user and no prospects of one.  The IBM folks are unfortunately still
>>> dragging their feet on the powerpc side.  Can we revise this discussion?
>>>
>>> The use case here is a IBM specific bus for which they only have an
>>> out of tree driver that their partner doesn't want to submit for mainline,
>>> but keep insisting on keeping the code around (which is also built
>>> unconditionally for the platform).
>>
>>
>> I personally keep insisting on correct commit logs, i.e. not calling
>> working code dead and providing actual reasons for the change. Thanks,
> 
> If that is the only thing you are complaining about I can clarify it
> a little of course.

Please do so. For me the problem is that if a maintainer decides to pull
that then so be it but I want that person to know exactly what is
happening. As it is now - the code may seem dead as nobody complains so
- I complained.


> But it didn't sound like that was the actual
> problem.

I'd like to see some formal statement in a written form about where we
stand in regard to the out-of-tree drivers support^wacceptability
(cannot pick the right word). Thanks,


-- 
Alexey


Re: [PATCH 3/4] powerpc/powernv: remove dead NPU DMA code

2019-06-20 Thread Christoph Hellwig
On Thu, Jun 20, 2019 at 04:20:08PM +1000, Alexey Kardashevskiy wrote:
> 
> 
> On 20/06/2019 16:03, Christoph Hellwig wrote:
> > Hi Linus,
> > 
> > this goes back to the discussion at last years kernel summit, where
> > we had the discussion on removing code never used by any in-kernel
> > user and no prospects of one.  The IBM folks are unfortunately still
> > dragging their feet on the powerpc side.  Can we revise this discussion?
> > 
> > The use case here is a IBM specific bus for which they only have an
> > out of tree driver that their partner doesn't want to submit for mainline,
> > but keep insisting on keeping the code around (which is also built
> > unconditionally for the platform).
> 
> 
> I personally keep insisting on correct commit logs, i.e. not calling
> working code dead and providing actual reasons for the change. Thanks,

If that is the only thing you are complaining about I can clarify it
a little of course.  But it didn't sound like that was the actual
problem.


Re: [PATCH 3/4] powerpc/powernv: remove dead NPU DMA code

2019-06-20 Thread Alexey Kardashevskiy



On 20/06/2019 16:03, Christoph Hellwig wrote:
> Hi Linus,
> 
> this goes back to the discussion at last years kernel summit, where
> we had the discussion on removing code never used by any in-kernel
> user and no prospects of one.  The IBM folks are unfortunately still
> dragging their feet on the powerpc side.  Can we revise this discussion?
> 
> The use case here is a IBM specific bus for which they only have an
> out of tree driver that their partner doesn't want to submit for mainline,
> but keep insisting on keeping the code around (which is also built
> unconditionally for the platform).


I personally keep insisting on correct commit logs, i.e. not calling
working code dead and providing actual reasons for the change. Thanks,


> 
> I hope we had settled that argument back then, but it seems like Big
> Blue insists they are special.
> 
> On Thu, Jun 20, 2019 at 11:45:42AM +1000, Alexey Kardashevskiy wrote:
>>
>>
>> On 19/06/2019 17:28, Christoph Hellwig wrote:
>>> On Wed, Jun 19, 2019 at 10:34:54AM +1000, Alexey Kardashevskiy wrote:


 On 23/05/2019 17:49, Christoph Hellwig wrote:
> None of these routines were ever used since they were added to the
> kernel.


 It is still being used exactly in the way as it was explained before in
 previous respins. Thanks.
>>>
>>> Please point to the in-kernel user, because that is the only relevant
>>> one.  This is not just my opinion but we had a clear discussion on that
>>> at last year's kernel summit.
>>
>>
>> There is no in-kernel user which still does not mean that the code is
>> dead. If it is irrelevant - put this to the commit log instead of saying
>> it is dead; also if there was a clear outcome from that discussion, then
>> please point me to that, I do not get to attend these discussions. Thanks,


-- 
Alexey


Re: [PATCH 3/4] powerpc/powernv: remove dead NPU DMA code

2019-06-20 Thread Christoph Hellwig
Hi Linus,

this goes back to the discussion at last years kernel summit, where
we had the discussion on removing code never used by any in-kernel
user and no prospects of one.  The IBM folks are unfortunately still
dragging their feet on the powerpc side.  Can we revise this discussion?

The use case here is a IBM specific bus for which they only have an
out of tree driver that their partner doesn't want to submit for mainline,
but keep insisting on keeping the code around (which is also built
unconditionally for the platform).

I hope we had settled that argument back then, but it seems like Big
Blue insists they are special.

On Thu, Jun 20, 2019 at 11:45:42AM +1000, Alexey Kardashevskiy wrote:
> 
> 
> On 19/06/2019 17:28, Christoph Hellwig wrote:
> > On Wed, Jun 19, 2019 at 10:34:54AM +1000, Alexey Kardashevskiy wrote:
> >>
> >>
> >> On 23/05/2019 17:49, Christoph Hellwig wrote:
> >>> None of these routines were ever used since they were added to the
> >>> kernel.
> >>
> >>
> >> It is still being used exactly in the way as it was explained before in
> >> previous respins. Thanks.
> > 
> > Please point to the in-kernel user, because that is the only relevant
> > one.  This is not just my opinion but we had a clear discussion on that
> > at last year's kernel summit.
> 
> 
> There is no in-kernel user which still does not mean that the code is
> dead. If it is irrelevant - put this to the commit log instead of saying
> it is dead; also if there was a clear outcome from that discussion, then
> please point me to that, I do not get to attend these discussions. Thanks,
> 
> 
> -- 
> Alexey
---end quoted text---


Re: [PATCH v4 1/4] lib/scatterlist: Fix mapping iterator when sg->offset is greater than PAGE_SIZE

2019-06-20 Thread Herbert Xu
On Mon, Jun 17, 2019 at 09:15:02PM +, Christophe Leroy wrote:
> All mapping iterator logic is based on the assumption that sg->offset
> is always lower than PAGE_SIZE.
> 
> But there are situations where sg->offset is such that the SG item
> is on the second page. In that case sg_copy_to_buffer() fails
> properly copying the data into the buffer. One of the reason is
> that the data will be outside the kmapped area used to access that
> data.
> 
> This patch fixes the issue by adjusting the mapping iterator
> offset and pgoffset fields such that offset is always lower than
> PAGE_SIZE.
> 
> Signed-off-by: Christophe Leroy 
> Fixes: 4225fc8555a9 ("lib/scatterlist: use page iterator in the mapping 
> iterator")
> Cc: sta...@vger.kernel.org
> ---
>  lib/scatterlist.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)

Good catch.

> @@ -686,7 +686,12 @@ static bool sg_miter_get_next_page(struct 
> sg_mapping_iter *miter)
>   sg = miter->piter.sg;
>   pgoffset = miter->piter.sg_pgoffset;
>  
> - miter->__offset = pgoffset ? 0 : sg->offset;
> + offset = pgoffset ? 0 : sg->offset;
> + while (offset >= PAGE_SIZE) {
> + miter->piter.sg_pgoffset = ++pgoffset;
> + offset -= PAGE_SIZE;
> + }

How about

miter->piter.sg_pgoffset += offset >> PAGE_SHIFT;
offset &= PAGE_SIZE - 1;

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH] KVM: PPC: Book3S HV: Fix CR0 setting in TM emulation

2019-06-20 Thread Michael Neuling
When emulating tsr, treclaim and trechkpt, we incorrectly set CR0. The
code currently sets:
CR0 <- 00 || MSR[TS]
but according to the ISA it should be:
CR0 <-  0 || MSR[TS] || 0

This fixes the bit shift to put the bits in the correct location.

Tested-by: Suraj Jitindar Singh 
Signed-off-by: Michael Neuling 
---
 arch/powerpc/kvm/book3s_hv_tm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c
index 888e2609e3..31cd0f327c 100644
--- a/arch/powerpc/kvm/book3s_hv_tm.c
+++ b/arch/powerpc/kvm/book3s_hv_tm.c
@@ -131,7 +131,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
}
/* Set CR0 to indicate previous transactional state */
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fff) |
-   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
/* L=1 => tresume, L=0 => tsuspend */
if (instr & (1 << 21)) {
if (MSR_TM_SUSPENDED(msr))
@@ -175,7 +175,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 
/* Set CR0 to indicate previous transactional state */
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fff) |
-   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
return RESUME_GUEST;
 
@@ -205,7 +205,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 
/* Set CR0 to indicate previous transactional state */
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fff) |
-   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
vcpu->arch.shregs.msr = msr | MSR_TS_S;
return RESUME_GUEST;
}
-- 
2.21.0



Re: [PATCH v2 30/52] powerpc/64s/exception: optimise system_reset for idle, clean up non-idle case

2019-06-20 Thread Nicholas Piggin
Nicholas Piggin's on June 20, 2019 3:14 pm:
> The idle wake up code in the system reset interrupt is not very
> optimal. There are two requirements: perform idle wake up quickly;
> and save everything including CFAR for non-idle interrupts, with
> no performance requirement.
> 
> The problem with placing the idle test in the middle of the handler
> and using the normal handler code to save CFAR, is that it's quite
> costly (e.g., mfcfar is serialising, speculative workarounds get
> applied, SRR1 has to be reloaded, etc). It also prevents the standard
> interrupt handler boilerplate being used.
> 
> This pain can be avoided by using a dedicated idle interrupt handler
> at the start of the interrupt handler, which restores all registers
> back to the way they were in case it was not an idle wake up. CFAR
> is preserved without saving it before the non-idle case by making that
> the fall-through, and idle is a taken branch.
> 
> Performance seems to be in the noise, but possibly around 0.5% faster,
> the executed instructions certainly look better. The bigger benefit is
> being able to drop in standard interrupt handlers after the idle code,
> which helps with subsequent cleanup and consolidation.
> 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/kernel/exceptions-64s.S | 89 ++--
>  1 file changed, 44 insertions(+), 45 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> b/arch/powerpc/kernel/exceptions-64s.S
> index e0492912ea79..f582ae30f3f7 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -241,7 +241,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
>   * load KBASE for a slight optimisation.
>   */
>  #define BRANCH_TO_C000(reg, label)   \
> - __LOAD_HANDLER(reg, label); \
> + __LOAD_FAR_HANDLER(reg, label); \
>   mtctr   reg;\
>   bctr
>  
> @@ -784,16 +784,6 @@ EXC_VIRT_NONE(0x4000, 0x100)
>  
>  
>  EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
> - SET_SCRATCH0(r13)
> - EXCEPTION_PROLOG_0 PACA_EXNMI
> -
> - /* This is EXCEPTION_PROLOG_1 with the idle feature section added */
> - OPT_SAVE_REG_TO_PACA(PACA_EXNMI+EX_PPR, r9, CPU_FTR_HAS_PPR)
> - OPT_SAVE_REG_TO_PACA(PACA_EXNMI+EX_CFAR, r10, CPU_FTR_CFAR)
> - INTERRUPT_TO_KERNEL
> - SAVE_CTR(r10, PACA_EXNMI)
> - mfcrr9
> -
>  #ifdef CONFIG_PPC_P7_NAP
>   /*
>* If running native on arch 2.06 or later, check if we are waking up
> @@ -801,45 +791,67 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
>* bits 46:47. A non-0 value indicates that we are coming from a power
>* saving state. The idle wakeup handler initially runs in real mode,
>* but we branch to the 0xc000... address so we can turn on relocation
> -  * with mtmsr.
> +  * with mtmsrd later, after SPRs are restored.
> +  *
> +  * Careful to minimise cost for the fast path (idle wakeup) while
> +  * also avoiding clobbering CFAR for the non-idle case. Once we know
> +  * it is an idle wake, volatiles don't matter, which is why we use
> +  * those here, and then re-do the entry in case of non-idle (without
> +  * branching for the non-idle case, to keep CFAR).
>*/
>  BEGIN_FTR_SECTION
> - mfspr   r10,SPRN_SRR1
> - rlwinm. r10,r10,47-31,30,31
> - beq-1f
> - cmpwi   cr1,r10,2
> + SET_SCRATCH0(r13)
> + GET_PACA(r13)
> + std r3,PACA_EXNMI+0*8(r13)
> + std r4,PACA_EXNMI+1*8(r13)
> + std r5,PACA_EXNMI+2*8(r13)
>   mfspr   r3,SPRN_SRR1
> - bltlr   cr1 /* no state loss, return to idle caller */
> - BRANCH_TO_C000(r10, system_reset_idle_common)
> -1:
> + mfocrf  r4,0x80
> + rlwinm. r5,r3,47-31,30,31
> + bne+system_reset_idle_wake
> + /* Not powersave wakeup. Restore regs for regular interrupt handler. */
> + mtocrf  0x80,r4
> + ld  r12,PACA_EXNMI+0*8(r13)
> + ld  r4,PACA_EXNMI+1*8(r13)
> + ld  r5,PACA_EXNMI+2*8(r13)
> + GET_SCRATCH0(r13)

For the love of... that should be 'ld r3', not 'ld r12', sorry.

Thanks,
Nick


Re: [PATCH v2] ocxl: Allow contexts to be attached with a NULL mm

2019-06-20 Thread Nicholas Piggin
Alastair D'Silva's on June 20, 2019 2:12 pm:
> From: Alastair D'Silva 
> 
> If an OpenCAPI context is to be used directly by a kernel driver, there
> may not be a suitable mm to use.
> 
> The patch makes the mm parameter to ocxl_context_attach optional.
> 
> Signed-off-by: Alastair D'Silva 

Yeah I don't think you need to manage a kernel context explicitly
because it will always be flushed with tlbie, comment helps. For
the powerpc/mm bit,

Acked-by: Nicholas Piggin 



[PATCH v2 52/52] powerpc/64s/exception: add missing branch to self after RFI

2019-06-20 Thread Nicholas Piggin
For consistency. These may not be required on modern processors,
and they don't quite fit with the RFI_TO macros, but they should
be all removed in that case.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index e21bf047156d..cdf7d7ef0c0e 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -944,6 +944,7 @@ EXC_COMMON_BEGIN(system_reset_common)
 
EXCEPTION_RESTORE_REGS EXC_STD
RFI_TO_USER_OR_KERNEL
+   b   .
 
 
 EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
@@ -,6 +1112,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
bl  machine_check_queue_event
MACHINE_CHECK_HANDLER_WINDUP
RFI_TO_USER_OR_KERNEL
+   b   .
 
 deliver_mce:
/* Deliver the machine check to host kernel in V mode. */
@@ -1686,6 +1688,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
 
EXCEPTION_RESTORE_REGS EXC_HV
HRFI_TO_USER_OR_KERNEL
+   b   .
 
 1:
/*
-- 
2.20.1



[PATCH v2 51/52] powerpc/64s/exception: machine check improve branch labels

2019-06-20 Thread Nicholas Piggin
Short forward and backward branches can be given number labels,
but larger, significant divergences in code path are more readable
if they're given descriptive names.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 793d611fa937..e21bf047156d 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1071,11 +1071,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 * continue in host kernel in V mode to deliver the MC event.
 */
rldicl. r11,r12,4,63/* See if MC hit while in HV mode. */
-   beq 5f
+   beq 1f
andi.   r11,r12,MSR_PR  /* See if coming from user. */
-   bne 9f  /* continue in V mode if we are. */
+   bne deliver_mce /* continue in V mode if we are. */
+1:
 
-5:
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
/*
 * We are coming from kernel context. Check if we are coming from
@@ -1084,7 +1084,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 */
lbz r11,HSTATE_IN_GUEST(r13)
cmpwi   r11,0   /* Check if coming from guest */
-   bne 9f  /* continue if we are. */
+   bne deliver_mce /* continue if we are. */
 #endif
/*
 * At this point we are not sure about what context we come from.
@@ -1112,7 +1112,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
MACHINE_CHECK_HANDLER_WINDUP
RFI_TO_USER_OR_KERNEL
 
-9:
+deliver_mce:
/* Deliver the machine check to host kernel in V mode. */
 BEGIN_FTR_SECTION
ld  r10,ORIG_GPR3(r1)
-- 
2.20.1



[PATCH v2 50/52] powerpc/64s/exception: untangle early machine check handler

2019-06-20 Thread Nicholas Piggin
machine_check_early_common now branches to machine_check_handle_early
which is its only caller, and they're separated by a bunch of other
code which makes no sense.

This patch moves that other code out of the way, and removes the
branch instruction.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 129 +--
 1 file changed, 62 insertions(+), 67 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index e8f644d6f310..793d611fa937 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -980,6 +980,16 @@ machine_check_pseries:
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
 
+#define MACHINE_CHECK_HANDLER_WINDUP   \
+   /* Clear MSR_RI before setting SRR0 and SRR1. */\
+   li  r9,0;   \
+   mtmsrd  r9,1;   /* Clear MSR_RI */  \
+   /* Decrement paca->in_mce now RI is clear. */   \
+   lhz r12,PACA_IN_MCE(r13);   \
+   subir12,r12,1;  \
+   sth r12,PACA_IN_MCE(r13);   \
+   EXCEPTION_RESTORE_REGS EXC_STD
+
 EXC_COMMON_BEGIN(machine_check_early_common)
mtctr   r10 /* Restore ctr */
mfspr   r11,SPRN_SRR0
@@ -1033,74 +1043,7 @@ EXC_COMMON_BEGIN(machine_check_early_common)
li  r10,MSR_RI
mtmsrd  r10,1
bl  enable_machine_check
-   b   machine_check_handle_early
 
-EXC_COMMON_BEGIN(machine_check_common)
-   /*
-* Machine check is different because we use a different
-* save area: PACA_EXMC instead of PACA_EXGEN.
-*/
-   EXCEPTION_COMMON(PACA_EXMC, 0x200)
-   FINISH_NAP
-   RECONCILE_IRQ_STATE(r10, r11)
-   ld  r3,PACA_EXMC+EX_DAR(r13)
-   lwz r4,PACA_EXMC+EX_DSISR(r13)
-   /* Enable MSR_RI when finished with PACA_EXMC */
-   li  r10,MSR_RI
-   mtmsrd  r10,1
-   std r3,_DAR(r1)
-   std r4,_DSISR(r1)
-   bl  save_nvgprs
-   addir3,r1,STACK_FRAME_OVERHEAD
-   bl  machine_check_exception
-   b   ret_from_except
-
-#define MACHINE_CHECK_HANDLER_WINDUP   \
-   /* Clear MSR_RI before setting SRR0 and SRR1. */\
-   li  r9,0;   \
-   mtmsrd  r9,1;   /* Clear MSR_RI */  \
-   /* Decrement paca->in_mce now RI is clear. */   \
-   lhz r12,PACA_IN_MCE(r13);   \
-   subir12,r12,1;  \
-   sth r12,PACA_IN_MCE(r13);   \
-   EXCEPTION_RESTORE_REGS EXC_STD
-
-#ifdef CONFIG_PPC_P7_NAP
-/*
- * This is an idle wakeup. Low level machine check has already been
- * done. Queue the event then call the idle code to do the wake up.
- */
-EXC_COMMON_BEGIN(machine_check_idle_common)
-   bl  machine_check_queue_event
-
-   /*
-* We have not used any non-volatile GPRs here, and as a rule
-* most exception code including machine check does not.
-* Therefore PACA_NAPSTATELOST does not need to be set. Idle
-* wakeup will restore volatile registers.
-*
-* Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
-*
-* Then decrement MCE nesting after finishing with the stack.
-*/
-   ld  r3,_MSR(r1)
-   ld  r4,_LINK(r1)
-
-   lhz r11,PACA_IN_MCE(r13)
-   subir11,r11,1
-   sth r11,PACA_IN_MCE(r13)
-
-   mtlrr4
-   rlwinm  r10,r3,47-31,30,31
-   cmpwi   cr1,r10,2
-   bltlr   cr1 /* no state loss, return to idle caller */
-   b   idle_return_gpr_loss
-#endif
-   /*
-* Handle machine check early in real mode. We come here with
-* ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
-*/
-EXC_COMMON_BEGIN(machine_check_handle_early)
bl  save_nvgprs
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_early
@@ -1180,6 +1123,58 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
 
+EXC_COMMON_BEGIN(machine_check_common)
+   /*
+* Machine check is different because we use a different
+* save area: PACA_EXMC instead of PACA_EXGEN.
+*/
+   EXCEPTION_COMMON(PACA_EXMC, 0x200)
+   FINISH_NAP
+   RECONCILE_IRQ_STATE(r10, r11)
+   ld  r3,PACA_EXMC+EX_DAR(r13)
+   lwz r4,PACA_EXMC+EX_DSISR(r13)
+   /* Enable MSR_RI when finished with PACA_EXMC */
+   li  r10,MSR_RI
+   mtmsrd  r10,1
+   std r3,_DAR(r1)
+   std r4,_DSISR(r1)
+   bl  save_nvgprs
+   addir3,r1,STACK_FRAME_OVERHEAD
+   bl  machine_check_exception
+   b   ret_from_except
+
+#ifdef CONFIG_PPC_P7_NAP

[PATCH v2 49/52] powerpc/64s/exceptions: machine check move unrecoverable handling out of line

2019-06-20 Thread Nicholas Piggin
Similarly to the previous patch, move unrecoverable handling out of
line, which makes the regular path less cluttered and easier to
follow.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 83 +---
 1 file changed, 39 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index be83a4e71814..e8f644d6f310 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1016,9 +1016,9 @@ EXC_COMMON_BEGIN(machine_check_early_common)
bne 1f
/* First machine check entry */
ld  r1,PACAMCEMERGSP(r13)   /* Use MC emergency stack */
-1: subir1,r1,INT_FRAME_SIZE/* alloc stack frame */
-   /* Limit nested MCE to level 4 to avoid stack overflow */
-   bge cr1,2f  /* Check if we hit limit of 4 */
+1: /* Limit nested MCE to level 4 to avoid stack overflow */
+   bgt cr1,unrecoverable_mce   /* Check if we hit limit of 4 */
+   subir1,r1,INT_FRAME_SIZE/* alloc stack frame */
 
EXCEPTION_PROLOG_COMMON_1()
/* We don't touch AMR here, we never go to virtual mode */
@@ -1032,25 +1032,9 @@ EXC_COMMON_BEGIN(machine_check_early_common)
 
li  r10,MSR_RI
mtmsrd  r10,1
-
bl  enable_machine_check
b   machine_check_handle_early
 
-2:
-   /* Stack overflow. Stay on emergency stack and panic.
-* Keep the ME bit off while panic-ing, so that if we hit
-* another machine check we checkstop.
-*/
-   addir1,r1,INT_FRAME_SIZE/* go back to previous stack frame */
-   ld  r11,PACAKMSR(r13)
-   LOAD_HANDLER(r12, unrecover_mce)
-   li  r10,MSR_ME
-   andcr11,r11,r10 /* Turn off MSR_ME */
-   mtspr   SPRN_SRR0,r12
-   mtspr   SPRN_SRR1,r11
-   RFI_TO_KERNEL
-   b   .   /* prevent speculative execution */
-
 EXC_COMMON_BEGIN(machine_check_common)
/*
 * Machine check is different because we use a different
@@ -1166,32 +1150,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 * If yes, then stay on emergency stack and panic.
 */
andi.   r11,r12,MSR_RI
-   bne 2f
-1: mfspr   r11,SPRN_SRR0
-   LOAD_HANDLER(r10,unrecover_mce)
-   mtspr   SPRN_SRR0,r10
-   ld  r10,PACAKMSR(r13)
-   /*
-* We are going down. But there are chances that we might get hit by
-* another MCE during panic path and we may run into unstable state
-* with no way out. Hence, turn ME bit off while going down, so that
-* when another MCE is hit during panic path, system will checkstop
-* and hypervisor will get restarted cleanly by SP.
-*/
-   li  r3,MSR_ME
-   andcr10,r10,r3  /* Turn off MSR_ME */
-   mtspr   SPRN_SRR1,r10
-   RFI_TO_KERNEL
-   b   .
-2:
+   beq unrecoverable_mce
+
/*
 * Check if we have successfully handled/recovered from error, if not
 * then stay on emergency stack and panic.
 */
ld  r3,RESULT(r1)   /* Load result */
cmpdi   r3,0/* see if we handled MCE successfully */
-
-   beq 1b  /* if !handled then panic */
+   beq unrecoverable_mce /* if !handled then panic */
 
/*
 * Return from MC interrupt.
@@ -1213,17 +1180,31 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
 
-EXC_COMMON_BEGIN(unrecover_mce)
+EXC_COMMON_BEGIN(unrecoverable_mce)
+   /*
+* We are going down. But there are chances that we might get hit by
+* another MCE during panic path and we may run into unstable state
+* with no way out. Hence, turn ME bit off while going down, so that
+* when another MCE is hit during panic path, system will checkstop
+* and hypervisor will get restarted cleanly by SP.
+*/
+   bl  disable_machine_check
+   ld  r10,PACAKMSR(r13)
+   li  r3,MSR_ME
+   andcr10,r10,r3
+   mtmsrd  r10
+
/* Invoke machine_check_exception to print MCE event and panic. */
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_exception
+
/*
-* We will not reach here. Even if we did, there is no way out. Call
-* unrecoverable_exception and die.
+* We will not reach here. Even if we did, there is no way out.
+* Call unrecoverable_exception and die.
 */
-1: addir3,r1,STACK_FRAME_OVERHEAD
+   addir3,r1,STACK_FRAME_OVERHEAD
bl  unrecoverable_exception
-   b   1b
+   b   .
 
 
 EXC_REAL_BEGIN(data_access, 0x300, 0x80)
@@ -2297,6 +2278,20 @@ enable_machine_check:
 1: mtlrr0
blr
 

[PATCH v2 48/52] powerpc/64s/exception: simplify machine check early path

2019-06-20 Thread Nicholas Piggin
machine_check_handle_early_common can reach machine_check_handle_early
directly now that it runs at the relocated address. The only reason to
do the rfi sequence is to enable MSR[ME]. Move that into a helper
function to make the normal code path a bit easier to read.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 30 
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 384f591ef078..be83a4e71814 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1030,13 +1030,12 @@ EXC_COMMON_BEGIN(machine_check_early_common)
std r3,_DAR(r1)
std r4,_DSISR(r1)
 
-   mfmsr   r11 /* get MSR value */
-   ori r11,r11,MSR_ME|MSR_RI   /* turn on ME, RI */
-   LOAD_HANDLER(r12, machine_check_handle_early)
-1: mtspr   SPRN_SRR0,r12
-   mtspr   SPRN_SRR1,r11
-   RFI_TO_KERNEL
-   b   .   /* prevent speculative execution */
+   li  r10,MSR_RI
+   mtmsrd  r10,1
+
+   bl  enable_machine_check
+   b   machine_check_handle_early
+
 2:
/* Stack overflow. Stay on emergency stack and panic.
 * Keep the ME bit off while panic-ing, so that if we hit
@@ -1047,7 +1046,9 @@ EXC_COMMON_BEGIN(machine_check_early_common)
LOAD_HANDLER(r12, unrecover_mce)
li  r10,MSR_ME
andcr11,r11,r10 /* Turn off MSR_ME */
-   b   1b
+   mtspr   SPRN_SRR0,r12
+   mtspr   SPRN_SRR1,r11
+   RFI_TO_KERNEL
b   .   /* prevent speculative execution */
 
 EXC_COMMON_BEGIN(machine_check_common)
@@ -2283,6 +2284,19 @@ CLOSE_FIXED_SECTION(virt_trampolines);
 
 USE_TEXT_SECTION()
 
+enable_machine_check:
+   mflrr0
+   bcl 20,31,$+4
+0: mflrr3
+   addir3,r3,(1f - 0b)
+   mtspr   SPRN_SRR0,r3
+   mfmsr   r3
+   ori r3,r3,MSR_ME
+   mtspr   SPRN_SRR1,r3
+   RFI_TO_KERNEL
+1: mtlrr0
+   blr
+
 /*
  * Hash table stuff
  */
-- 
2.20.1



[PATCH v2 47/52] powerpc/64s/exception: machine check restructure handler to be more regular

2019-06-20 Thread Nicholas Piggin
Follow the pattern of sreset and HMI handlers more closely, in using
EXCEPTION_PROLOG_COMMON_1 rather than open-coding it. Run the handler
at the relocated location.

This will help with simplification and code sharing.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 86 ++--
 1 file changed, 42 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 8ed787dc579c..384f591ef078 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -958,17 +958,34 @@ BEGIN_FTR_SECTION
b   machine_check_pseries
 END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
 #endif
-   b   machine_check_common_early
+   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0
+   mfctr   r10 /* save ctr */
+   BRANCH_TO_C000(r11, machine_check_early_common)
+   /*
+* MSR_RI is not enabled, because PACA_EXMC is being used, so a
+* nested machine check corrupts it. machine_check_common enables
+* MSR_RI.
+*/
 EXC_REAL_END(machine_check, 0x200, 0x100)
 EXC_VIRT_NONE(0x4200, 0x100)
-TRAMP_REAL_BEGIN(machine_check_common_early)
-   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 0, 0, 0
+
+#ifdef CONFIG_PPC_PSERIES
+TRAMP_REAL_BEGIN(machine_check_fwnmi)
+   /* See comment at machine_check exception, don't turn on RI */
+   EXCEPTION_PROLOG_0 PACA_EXMC
+machine_check_pseries:
+   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
+   EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
+#endif
+
+TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
+
+EXC_COMMON_BEGIN(machine_check_early_common)
+   mtctr   r10 /* Restore ctr */
+   mfspr   r11,SPRN_SRR0
+   mfspr   r12,SPRN_SRR1
+
/*
-* Register contents:
-* R13  = PACA
-* R9   = CR
-* Original R9 to R13 is saved on PACA_EXMC
-*
 * Switch to mc_emergency stack and handle re-entrancy (we limit
 * the nested MCE upto level 4 to avoid stack overflow).
 * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1
@@ -989,32 +1006,30 @@ TRAMP_REAL_BEGIN(machine_check_common_early)
 * the machine check is handled then the idle wakeup code is called
 * to restore state.
 */
-   mr  r11,r1  /* Save r1 */
lhz r10,PACA_IN_MCE(r13)
cmpwi   r10,0   /* Are we in nested machine check */
-   bne 0f  /* Yes, we are. */
-   /* First machine check entry */
-   ld  r1,PACAMCEMERGSP(r13)   /* Use MC emergency stack */
-0: subir1,r1,INT_FRAME_SIZE/* alloc stack frame */
+   cmpwi   cr1,r10,MAX_MCE_DEPTH   /* Are we at maximum nesting */
addir10,r10,1   /* increment paca->in_mce */
sth r10,PACA_IN_MCE(r13)
+
+   mr  r10,r1  /* Save r1 */
+   bne 1f
+   /* First machine check entry */
+   ld  r1,PACAMCEMERGSP(r13)   /* Use MC emergency stack */
+1: subir1,r1,INT_FRAME_SIZE/* alloc stack frame */
/* Limit nested MCE to level 4 to avoid stack overflow */
-   cmpwi   r10,MAX_MCE_DEPTH
-   bgt 2f  /* Check if we hit limit of 4 */
-   std r11,GPR1(r1)/* Save r1 on the stack. */
-   std r11,0(r1)   /* make stack chain pointer */
-   mfspr   r11,SPRN_SRR0   /* Save SRR0 */
-   std r11,_NIP(r1)
-   mfspr   r11,SPRN_SRR1   /* Save SRR1 */
-   std r11,_MSR(r1)
-   mfspr   r11,SPRN_DAR/* Save DAR */
-   std r11,_DAR(r1)
-   mfspr   r11,SPRN_DSISR  /* Save DSISR */
-   std r11,_DSISR(r1)
-   std r9,_CCR(r1) /* Save CR in stackframe */
+   bge cr1,2f  /* Check if we hit limit of 4 */
+
+   EXCEPTION_PROLOG_COMMON_1()
/* We don't touch AMR here, we never go to virtual mode */
-   /* Save r9 through r13 from EXMC save area to stack frame. */
EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
+   EXCEPTION_PROLOG_COMMON_3(0x200)
+
+   ld  r3,PACA_EXMC+EX_DAR(r13)
+   lwz r4,PACA_EXMC+EX_DSISR(r13)
+   std r3,_DAR(r1)
+   std r4,_DSISR(r1)
+
mfmsr   r11 /* get MSR value */
ori r11,r11,MSR_ME|MSR_RI   /* turn on ME, RI */
LOAD_HANDLER(r12, machine_check_handle_early)
@@ -1035,21 +1050,6 @@ TRAMP_REAL_BEGIN(machine_check_common_early)
b   1b
b   .   /* prevent speculative execution */
 
-#ifdef CONFIG_PPC_PSERIES
-TRAMP_REAL_BEGIN(machine_check_fwnmi)
-   EXCEPTION_PROLOG_0 PACA_EXMC
-machine_check_pseries:
-   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
-   EXCEPTION_PROLOG_2_REAL 

[PATCH v2 46/52] powerpc/64s/exception: fix machine check early should not set AMR

2019-06-20 Thread Nicholas Piggin
The early machine check runs in real mode, so locking is unnecessary.
Worse, the windup does not restore AMR, so this can result in a false
KUAP fault after a recoverable machine check hits inside a user copy
operation.

Fix this similarly to HMI by just avoiding the kuap lock in the
early machine check handler (it will be set by the late handler that
runs in virtual mode if that runs).

Fixes: 890274c2dc4c0 ("powerpc/64s: Implement KUAP for Radix MMU")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index ab22af2509d8..8ed787dc579c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1012,7 +1012,7 @@ TRAMP_REAL_BEGIN(machine_check_common_early)
mfspr   r11,SPRN_DSISR  /* Save DSISR */
std r11,_DSISR(r1)
std r9,_CCR(r1) /* Save CR in stackframe */
-   kuap_save_amr_and_lock r9, r10, cr1
+   /* We don't touch AMR here, we never go to virtual mode */
/* Save r9 through r13 from EXMC save area to stack frame. */
EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
mfmsr   r11 /* get MSR value */
-- 
2.20.1



[PATCH v2 45/52] powerpc/64s/exception: machine check windup restore cfar for host delivery

2019-06-20 Thread Nicholas Piggin
Bare metal machine checks run an "early" handler in real mode which
potentially flushes faulting translation structures, among other
things, before running the main handler which reports the event.

The main handler runs as a normal interrupt handler, after a "windup"
that sets registers back as they were at interrupt entry. CFAR does
not get restored by the windup code, so add that. The current handler
does not appear to use CFAR anywhere, because the main handler is not
run if the MCE happens in kernel-mode and the user-mode message is not
a register trace. However it may be useful in some cases or future
changes (xmon, panic on mce, etc).

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 61c96502d2a8..ab22af2509d8 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1205,6 +1205,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 9:
/* Deliver the machine check to host kernel in V mode. */
+BEGIN_FTR_SECTION
+   ld  r10,ORIG_GPR3(r1)
+   mtspr   SPRN_CFAR,r10
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
MACHINE_CHECK_HANDLER_WINDUP
EXCEPTION_PROLOG_0 PACA_EXMC
EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
-- 
2.20.1



[PATCH v2 44/52] powerpc/64s/exception: separate pseries and powernv mce delivery paths

2019-06-20 Thread Nicholas Piggin
This will allow standardised interrupt entry macros to be used in
future. These paths may be de-duplicated again after that if code
allows.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 21 -
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index f3362adc99e6..61c96502d2a8 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -952,11 +952,13 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
 * vector
 */
EXCEPTION_PROLOG_0 PACA_EXMC
+#ifdef CONFIG_PPC_PSERIES
 BEGIN_FTR_SECTION
+   /* Some hypervisors inject directly to 0x200 if FWNMI is not enabled */
+   b   machine_check_pseries
+END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
+#endif
b   machine_check_common_early
-FTR_SECTION_ELSE
-   b   machine_check_pSeries_0
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 EXC_REAL_END(machine_check, 0x200, 0x100)
 EXC_VIRT_NONE(0x4200, 0x100)
 TRAMP_REAL_BEGIN(machine_check_common_early)
@@ -1033,18 +1035,18 @@ TRAMP_REAL_BEGIN(machine_check_common_early)
b   1b
b   .   /* prevent speculative execution */
 
-TRAMP_REAL_BEGIN(machine_check_pSeries)
-   .globl machine_check_fwnmi
-machine_check_fwnmi:
+#ifdef CONFIG_PPC_PSERIES
+TRAMP_REAL_BEGIN(machine_check_fwnmi)
EXCEPTION_PROLOG_0 PACA_EXMC
-machine_check_pSeries_0:
+machine_check_pseries:
EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
+   EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
/*
 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
 * nested machine check corrupts it. machine_check_common enables
 * MSR_RI.
 */
-   EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
+#endif
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
 
@@ -1205,7 +1207,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
/* Deliver the machine check to host kernel in V mode. */
MACHINE_CHECK_HANDLER_WINDUP
EXCEPTION_PROLOG_0 PACA_EXMC
-   b   machine_check_pSeries_0
+   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
+   EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0
 
 EXC_COMMON_BEGIN(unrecover_mce)
/* Invoke machine_check_exception to print MCE event and panic. */
-- 
2.20.1



[PATCH v2 43/52] powerpc/64s/exception: machine check early only runs in HV mode

2019-06-20 Thread Nicholas Piggin
machine_check_common_early and machine_check_handle_early only run in
HVMODE. Remove dead code.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 38 +---
 1 file changed, 6 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index b12755a4f884..f3362adc99e6 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1014,10 +1014,7 @@ TRAMP_REAL_BEGIN(machine_check_common_early)
/* Save r9 through r13 from EXMC save area to stack frame. */
EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
mfmsr   r11 /* get MSR value */
-BEGIN_FTR_SECTION
-   ori r11,r11,MSR_ME  /* turn on ME bit */
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
-   ori r11,r11,MSR_RI  /* turn on RI bit */
+   ori r11,r11,MSR_ME|MSR_RI   /* turn on ME, RI */
LOAD_HANDLER(r12, machine_check_handle_early)
 1: mtspr   SPRN_SRR0,r12
mtspr   SPRN_SRR1,r11
@@ -1124,11 +1121,8 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
bl  machine_check_early
std r3,RESULT(r1)   /* Save result */
ld  r12,_MSR(r1)
-BEGIN_FTR_SECTION
-   b   4f
-END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
 
-#ifdef CONFIG_PPC_P7_NAP
+#ifdef CONFIG_PPC_P7_NAP
/*
 * Check if thread was in power saving mode. We come here when any
 * of the following is true:
@@ -1141,7 +1135,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
 BEGIN_FTR_SECTION
rlwinm. r11,r12,47-31,30,31
bne machine_check_idle_common
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 #endif
 
/*
@@ -1150,12 +1144,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 */
rldicl. r11,r12,4,63/* See if MC hit while in HV mode. */
beq 5f
-4: andi.   r11,r12,MSR_PR  /* See if coming from user. */
+   andi.   r11,r12,MSR_PR  /* See if coming from user. */
bne 9f  /* continue in V mode if we are. */
 
 5:
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-BEGIN_FTR_SECTION
/*
 * We are coming from kernel context. Check if we are coming from
 * guest. if yes, then we can continue. We will fall through
@@ -1164,7 +1157,6 @@ BEGIN_FTR_SECTION
lbz r11,HSTATE_IN_GUEST(r13)
cmpwi   r11,0   /* Check if coming from guest */
bne 9f  /* continue if we are. */
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 #endif
/*
 * At this point we are not sure about what context we come from.
@@ -1199,7 +1191,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
cmpdi   r3,0/* see if we handled MCE successfully */
 
beq 1b  /* if !handled then panic */
-BEGIN_FTR_SECTION
+
/*
 * Return from MC interrupt.
 * Queue up the MCE event so that we can log it later, while
@@ -1208,18 +1200,7 @@ BEGIN_FTR_SECTION
bl  machine_check_queue_event
MACHINE_CHECK_HANDLER_WINDUP
RFI_TO_USER_OR_KERNEL
-FTR_SECTION_ELSE
-   /*
-* pSeries: Return from MC interrupt. Before that stay on emergency
-* stack and call machine_check_exception to log the MCE event.
-*/
-   LOAD_HANDLER(r10,mce_return)
-   mtspr   SPRN_SRR0,r10
-   ld  r10,PACAKMSR(r13)
-   mtspr   SPRN_SRR1,r10
-   RFI_TO_KERNEL
-   b   .
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+
 9:
/* Deliver the machine check to host kernel in V mode. */
MACHINE_CHECK_HANDLER_WINDUP
@@ -1238,13 +1219,6 @@ EXC_COMMON_BEGIN(unrecover_mce)
bl  unrecoverable_exception
b   1b
 
-EXC_COMMON_BEGIN(mce_return)
-   /* Invoke machine_check_exception to print MCE event and return. */
-   addir3,r1,STACK_FRAME_OVERHEAD
-   bl  machine_check_exception
-   MACHINE_CHECK_HANDLER_WINDUP
-   RFI_TO_KERNEL
-   b   .
 
 EXC_REAL_BEGIN(data_access, 0x300, 0x80)
EXCEPTION_PROLOG_0 PACA_EXGEN
-- 
2.20.1



[PATCH v2 42/52] powerpc/64s/exception: machine check fwnmi does not trigger when in HV mode

2019-06-20 Thread Nicholas Piggin
Remove dead code.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 286bd5670d60..b12755a4f884 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1040,9 +1040,6 @@ TRAMP_REAL_BEGIN(machine_check_pSeries)
.globl machine_check_fwnmi
 machine_check_fwnmi:
EXCEPTION_PROLOG_0 PACA_EXMC
-BEGIN_FTR_SECTION
-   b   machine_check_common_early
-END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
 machine_check_pSeries_0:
EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
/*
-- 
2.20.1



[PATCH v2 41/52] powerpc/tm: update comment about interrupt re-entrancy

2019-06-20 Thread Nicholas Piggin
Since the system reset interrupt began to use its own stack, and
machine check interrupts have done so for some time, r1 can be
changed without clearing MSR[RI], provided no other interrupts
(including SLB misses) are taken.

MSR[RI] does have to be cleared when using SCRATCH0, however.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/tm.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 9fabdce255cd..6ba0fdd1e7f8 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -148,7 +148,7 @@ _GLOBAL(tm_reclaim)
/* Stash the stack pointer away for use after reclaim */
std r1, PACAR1(r13)
 
-   /* Clear MSR RI since we are about to change r1, EE is already off. */
+   /* Clear MSR RI since we are about to use SCRATCH0, EE is already off */
li  r5, 0
mtmsrd  r5, 1
 
@@ -474,7 +474,7 @@ restore_gprs:
 
REST_GPR(7, r7)
 
-   /* Clear MSR RI since we are about to change r1. EE is already off */
+   /* Clear MSR RI since we are about to use SCRATCH0. EE is already off */
li  r5, 0
mtmsrd  r5, 1
 
-- 
2.20.1



[PATCH v2 40/52] powerpc/64s/exception: move SET_SCRATCH0 into EXCEPTION_PROLOG_0

2019-06-20 Thread Nicholas Piggin
No generated code change. The only file change is in bug table line numbers.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 25 +
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 99de397a1cd9..286bd5670d60 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -128,6 +128,7 @@ BEGIN_FTR_SECTION_NESTED(943)   
\
 END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 .macro EXCEPTION_PROLOG_0 area
+   SET_SCRATCH0(r13)   /* save r13 */
GET_PACA(r13)
std r9,\area\()+EX_R9(r13)  /* save r9 */
OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR)
@@ -540,7 +541,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 
 #define __EXC_REAL(name, start, size, area)\
EXC_REAL_BEGIN(name, start, size);  \
-   SET_SCRATCH0(r13);  /* save r13 */  \
EXCEPTION_PROLOG_0 area ;   \
EXCEPTION_PROLOG_1 EXC_STD, area, 1, start, 0, 0, 0 ;   \
EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \
@@ -551,7 +551,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 
 #define __EXC_VIRT(name, start, size, realvec, area)   \
EXC_VIRT_BEGIN(name, start, size);  \
-   SET_SCRATCH0(r13);/* save r13 */\
EXCEPTION_PROLOG_0 area ;   \
EXCEPTION_PROLOG_1 EXC_STD, area, 0, realvec, 0, 0, 0;  \
EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ;\
@@ -562,7 +561,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 
 #define EXC_REAL_MASKABLE(name, start, size, bitmask)  \
EXC_REAL_BEGIN(name, start, size);  \
-   SET_SCRATCH0(r13);/* save r13 */\
EXCEPTION_PROLOG_0 PACA_EXGEN ; \
EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, start, 0, 0, bitmask ; \
EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \
@@ -570,7 +568,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 
 #define EXC_VIRT_MASKABLE(name, start, size, realvec, bitmask) \
EXC_VIRT_BEGIN(name, start, size);  \
-   SET_SCRATCH0(r13);/* save r13 */\
EXCEPTION_PROLOG_0 PACA_EXGEN ; \
EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \
EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ;\
@@ -578,7 +575,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 
 #define EXC_REAL_HV(name, start, size) \
EXC_REAL_BEGIN(name, start, size);  \
-   SET_SCRATCH0(r13);  /* save r13 */  \
EXCEPTION_PROLOG_0 PACA_EXGEN;  \
EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, start, 0, 0, 0 ;  \
EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 ;  \
@@ -586,7 +582,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 
 #define EXC_VIRT_HV(name, start, size, realvec)
\
EXC_VIRT_BEGIN(name, start, size);  \
-   SET_SCRATCH0(r13);  /* save r13 */  \
EXCEPTION_PROLOG_0 PACA_EXGEN;  \
EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, 0 ;\
EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV ; \
@@ -594,7 +589,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 
 #define __EXC_REAL_OOL(name, start, size)  \
EXC_REAL_BEGIN(name, start, size);  \
-   SET_SCRATCH0(r13);  \
EXCEPTION_PROLOG_0 PACA_EXGEN ; \
b   tramp_real_##name ; \
EXC_REAL_END(name, start, size)
@@ -622,7 +616,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 
 #define __EXC_REAL_OOL_HV_DIRECT(name, start, size, handler)   \
EXC_REAL_BEGIN(name, start, size);  \
-   SET_SCRATCH0(r13);  \
EXCEPTION_PROLOG_0 PACA_EXGEN ; \
b   handler;\
EXC_REAL_END(name, start, size)
@@ -653,7 +646,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
 
 #define __EXC_VIRT_OOL(name, start, size)  \
EXC_VIRT_BEGIN(name, start, size);

[PATCH v2 39/52] powerpc/64s/exception: denorm handler use standard scratch save macro

2019-06-20 Thread Nicholas Piggin
Although the 0x1500 interrupt only applies to bare metal, it is better
to just use the standard macro for scratch save.

Runtime code path remains unchanged (due to instruction patching).

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 437f91179537..99de397a1cd9 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1920,7 +1920,7 @@ EXC_REAL_NONE(0x1400, 0x100)
 EXC_VIRT_NONE(0x5400, 0x100)
 
 EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100)
-   mtspr   SPRN_SPRG_HSCRATCH0,r13
+   SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0 PACA_EXGEN
EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 0, 0x1500, 0, 0, 0
 
-- 
2.20.1



[PATCH v2 38/52] powerpc/64s/exception: machine check use standard macros to save dar/dsisr

2019-06-20 Thread Nicholas Piggin
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 16d5ea1c86bb..437f91179537 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1056,7 +1056,7 @@ BEGIN_FTR_SECTION
b   machine_check_common_early
 END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
 machine_check_pSeries_0:
-   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 0, 0, 0
+   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0
/*
 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
 * nested machine check corrupts it. machine_check_common enables
@@ -1071,10 +1071,6 @@ EXC_COMMON_BEGIN(machine_check_common)
 * Machine check is different because we use a different
 * save area: PACA_EXMC instead of PACA_EXGEN.
 */
-   mfspr   r10,SPRN_DAR
-   std r10,PACA_EXMC+EX_DAR(r13)
-   mfspr   r10,SPRN_DSISR
-   stw r10,PACA_EXMC+EX_DSISR(r13)
EXCEPTION_COMMON(PACA_EXMC, 0x200)
FINISH_NAP
RECONCILE_IRQ_STATE(r10, r11)
-- 
2.20.1



[PATCH v2 37/52] powerpc/64s/exception: add dar and dsisr options to exception macro

2019-06-20 Thread Nicholas Piggin
Some exception entry requires DAR and/or DSISR to be saved into the
paca exception save area. Add options to the standard exception
macros for these.

Generated code changes slightly due to code structure.

- 554:  a6 02 72 7d mfdsisr r11
- 558:  a8 00 4d f9 std r10,168(r13)
- 55c:  b0 00 6d 91 stw r11,176(r13)
+ 554:  a8 00 4d f9 std r10,168(r13)
+ 558:  a6 02 52 7d mfdsisr r10
+ 55c:  b0 00 4d 91 stw r10,176(r13)

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 103 ---
 1 file changed, 46 insertions(+), 57 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 2412b5269e25..16d5ea1c86bb 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -136,7 +136,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
 .endm
 
-.macro EXCEPTION_PROLOG_1 hsrr, area, kvm, vec, bitmask
+.macro EXCEPTION_PROLOG_1 hsrr, area, kvm, vec, dar, dsisr, bitmask
OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR)
OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR)
INTERRUPT_TO_KERNEL
@@ -172,8 +172,22 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
std r11,\area\()+EX_R11(r13)
std r12,\area\()+EX_R12(r13)
+
+   /*
+* DAR/DSISR, SCRATCH0 must be read before setting MSR[RI],
+* because a d-side MCE will clobber those registers so is
+* not recoverable if they are live.
+*/
GET_SCRATCH0(r10)
std r10,\area\()+EX_R13(r13)
+   .if \dar
+   mfspr   r10,SPRN_DAR
+   std r10,\area\()+EX_DAR(r13)
+   .endif
+   .if \dsisr
+   mfspr   r10,SPRN_DSISR
+   stw r10,\area\()+EX_DSISR(r13)
+   .endif
 .endm
 
 .macro EXCEPTION_PROLOG_2_REAL label, hsrr, set_ri
@@ -528,7 +542,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
EXC_REAL_BEGIN(name, start, size);  \
SET_SCRATCH0(r13);  /* save r13 */  \
EXCEPTION_PROLOG_0 area ;   \
-   EXCEPTION_PROLOG_1 EXC_STD, area, 1, start, 0 ; \
+   EXCEPTION_PROLOG_1 EXC_STD, area, 1, start, 0, 0, 0 ;   \
EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \
EXC_REAL_END(name, start, size)
 
@@ -539,7 +553,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
EXC_VIRT_BEGIN(name, start, size);  \
SET_SCRATCH0(r13);/* save r13 */\
EXCEPTION_PROLOG_0 area ;   \
-   EXCEPTION_PROLOG_1 EXC_STD, area, 0, realvec, 0;\
+   EXCEPTION_PROLOG_1 EXC_STD, area, 0, realvec, 0, 0, 0;  \
EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ;\
EXC_VIRT_END(name, start, size)
 
@@ -550,7 +564,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
EXC_REAL_BEGIN(name, start, size);  \
SET_SCRATCH0(r13);/* save r13 */\
EXCEPTION_PROLOG_0 PACA_EXGEN ; \
-   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, start, bitmask ; \
+   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, start, 0, 0, bitmask ; \
EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \
EXC_REAL_END(name, start, size)
 
@@ -558,7 +572,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
EXC_VIRT_BEGIN(name, start, size);  \
SET_SCRATCH0(r13);/* save r13 */\
EXCEPTION_PROLOG_0 PACA_EXGEN ; \
-   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, bitmask ;   \
+   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \
EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ;\
EXC_VIRT_END(name, start, size)
 
@@ -566,7 +580,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
EXC_REAL_BEGIN(name, start, size);  \
SET_SCRATCH0(r13);  /* save r13 */  \
EXCEPTION_PROLOG_0 PACA_EXGEN;  \
-   EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, start, 0 ;\
+   EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, start, 0, 0, 0 ;  \
EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 ;  \
EXC_REAL_END(name, start, size)
 
@@ -574,7 +588,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
EXC_VIRT_BEGIN(name, start, size);  \
SET_SCRATCH0(r13);  /* save r13 */  \
EXCEPTION_PROLOG_0 PACA_EXGEN;  \
-   

[PATCH v2 36/52] powerpc/64s/exception: use common macro for windup

2019-06-20 Thread Nicholas Piggin
No generated code change. The only file change is in bug table line numbers.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 112 +--
 1 file changed, 36 insertions(+), 76 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index cce75adf2095..2412b5269e25 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -417,6 +417,38 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);
   \
EXCEPTION_PROLOG_COMMON_2(area);\
EXCEPTION_PROLOG_COMMON_3(trap)
 
+/*
+ * Restore all registers including H/SRR0/1 saved in a stack frame of a
+ * standard exception.
+ */
+.macro EXCEPTION_RESTORE_REGS hsrr
+   /* Move original SRR0 and SRR1 into the respective regs */
+   ld  r9,_MSR(r1)
+   .if \hsrr
+   mtspr   SPRN_HSRR1,r9
+   .else
+   mtspr   SPRN_SRR1,r9
+   .endif
+   ld  r9,_NIP(r1)
+   .if \hsrr
+   mtspr   SPRN_HSRR0,r9
+   .else
+   mtspr   SPRN_SRR0,r9
+   .endif
+   ld  r9,_CTR(r1)
+   mtctr   r9
+   ld  r9,_XER(r1)
+   mtxer   r9
+   ld  r9,_LINK(r1)
+   mtlrr9
+   ld  r9,_CCR(r1)
+   mtcrr9
+   REST_8GPRS(2, r1)
+   REST_4GPRS(10, r1)
+   REST_GPR(0, r1)
+   /* restore original r1. */
+   ld  r1,GPR1(r1)
+.endm
 
 #define RUNLATCH_ON\
 BEGIN_FTR_SECTION  \
@@ -906,29 +938,7 @@ EXC_COMMON_BEGIN(system_reset_common)
ld  r10,SOFTE(r1)
stb r10,PACAIRQSOFTMASK(r13)
 
-   /*
-* Keep below code in synch with MACHINE_CHECK_HANDLER_WINDUP.
-* Should share common bits...
-*/
-
-   /* Move original SRR0 and SRR1 into the respective regs */
-   ld  r9,_MSR(r1)
-   mtspr   SPRN_SRR1,r9
-   ld  r9,_NIP(r1)
-   mtspr   SPRN_SRR0,r9
-   ld  r9,_CTR(r1)
-   mtctr   r9
-   ld  r9,_XER(r1)
-   mtxer   r9
-   ld  r9,_LINK(r1)
-   mtlrr9
-   ld  r9,_CCR(r1)
-   mtcrr9
-   REST_8GPRS(2, r1)
-   REST_4GPRS(10, r1)
-   REST_GPR(0, r1)
-   /* restore original r1. */
-   ld  r1,GPR1(r1)
+   EXCEPTION_RESTORE_REGS EXC_STD
RFI_TO_USER_OR_KERNEL
 
 
@@ -1074,24 +1084,7 @@ EXC_COMMON_BEGIN(machine_check_common)
lhz r12,PACA_IN_MCE(r13);   \
subir12,r12,1;  \
sth r12,PACA_IN_MCE(r13);   \
-   /* Move original SRR0 and SRR1 into the respective regs */  \
-   ld  r9,_MSR(r1);\
-   mtspr   SPRN_SRR1,r9;   \
-   ld  r9,_NIP(r1);\
-   mtspr   SPRN_SRR0,r9;   \
-   ld  r9,_CTR(r1);\
-   mtctr   r9; \
-   ld  r9,_XER(r1);\
-   mtxer   r9; \
-   ld  r9,_LINK(r1);   \
-   mtlrr9; \
-   ld  r9,_CCR(r1);\
-   mtcrr9; \
-   REST_8GPRS(2, r1);  \
-   REST_4GPRS(10, r1); \
-   REST_GPR(0, r1);\
-   /* restore original r1. */  \
-   ld  r1,GPR1(r1)
+   EXCEPTION_RESTORE_REGS EXC_STD
 
 #ifdef CONFIG_PPC_P7_NAP
 /*
@@ -1774,48 +1767,15 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
cmpdi   cr0,r3,0
bne 1f
 
-   /* Windup the stack. */
-   /* Move original HSRR0 and HSRR1 into the respective regs */
-   ld  r9,_MSR(r1)
-   mtspr   SPRN_HSRR1,r9
-   ld  r9,_NIP(r1)
-   mtspr   SPRN_HSRR0,r9
-   ld  r9,_CTR(r1)
-   mtctr   r9
-   ld  r9,_XER(r1)
-   mtxer   r9
-   ld  r9,_LINK(r1)
-   mtlrr9
-   ld  r9,_CCR(r1)
-   mtcrr9
-   REST_8GPRS(2, r1)
-   REST_4GPRS(10, r1)
-   REST_GPR(0, r1)
-   ld  r1,GPR1(r1)
+   EXCEPTION_RESTORE_REGS EXC_HV
HRFI_TO_USER_OR_KERNEL
 
 1:
-   ld  r9,_MSR(r1)
-   mtspr   SPRN_HSRR1,r9
-   ld  r9,_NIP(r1)
-   mtspr   SPRN_HSRR0,r9
-   ld  r9,_CTR(r1)
-   mtctr   r9
-   ld  r9,_XER(r1)
-   mtxer   r9
-   ld  r9,_LINK(r1)
-   mtlrr9
-   ld  r9,_CCR(r1)
-   mtcrr9
-   REST_8GPRS(2, r1)
-   REST_4GPRS(10, r1)
-   REST_GPR(0, r1)
-   ld  r1,GPR1(r1)
-
/*
 * Go to virtual mode and pull the HMI event information from
 * firmware.
 */
+   

[PATCH v2 35/52] powerpc/64s/exception: shuffle windup code around

2019-06-20 Thread Nicholas Piggin
Restore all SPRs and CR up-front, these are longer latency
instructions. Move register restore around to maximise pairs of
adjacent loads (e.g., restore r0 next to r1).

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 40 +++-
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 3476cffa21b8..cce75adf2095 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -922,13 +922,11 @@ EXC_COMMON_BEGIN(system_reset_common)
mtxer   r9
ld  r9,_LINK(r1)
mtlrr9
-   REST_GPR(0, r1)
+   ld  r9,_CCR(r1)
+   mtcrr9
REST_8GPRS(2, r1)
-   REST_GPR(10, r1)
-   ld  r11,_CCR(r1)
-   mtcrr11
-   REST_GPR(11, r1)
-   REST_2GPRS(12, r1)
+   REST_4GPRS(10, r1)
+   REST_GPR(0, r1)
/* restore original r1. */
ld  r1,GPR1(r1)
RFI_TO_USER_OR_KERNEL
@@ -1087,13 +1085,11 @@ EXC_COMMON_BEGIN(machine_check_common)
mtxer   r9; \
ld  r9,_LINK(r1);   \
mtlrr9; \
-   REST_GPR(0, r1);\
+   ld  r9,_CCR(r1);\
+   mtcrr9; \
REST_8GPRS(2, r1);  \
-   REST_GPR(10, r1);   \
-   ld  r11,_CCR(r1);   \
-   mtcrr11;\
-   REST_GPR(11, r1);   \
-   REST_2GPRS(12, r1); \
+   REST_4GPRS(10, r1); \
+   REST_GPR(0, r1);\
/* restore original r1. */  \
ld  r1,GPR1(r1)
 
@@ -1790,13 +1786,11 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
mtxer   r9
ld  r9,_LINK(r1)
mtlrr9
-   REST_GPR(0, r1)
+   ld  r9,_CCR(r1)
+   mtcrr9
REST_8GPRS(2, r1)
-   REST_GPR(10, r1)
-   ld  r11,_CCR(r1)
-   REST_2GPRS(12, r1)
-   mtcrr11
-   REST_GPR(11, r1)
+   REST_4GPRS(10, r1)
+   REST_GPR(0, r1)
ld  r1,GPR1(r1)
HRFI_TO_USER_OR_KERNEL
 
@@ -1811,13 +1805,11 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
mtxer   r9
ld  r9,_LINK(r1)
mtlrr9
-   REST_GPR(0, r1)
+   ld  r9,_CCR(r1)
+   mtcrr9
REST_8GPRS(2, r1)
-   REST_GPR(10, r1)
-   ld  r11,_CCR(r1)
-   REST_2GPRS(12, r1)
-   mtcrr11
-   REST_GPR(11, r1)
+   REST_4GPRS(10, r1)
+   REST_GPR(0, r1)
ld  r1,GPR1(r1)
 
/*
-- 
2.20.1



[PATCH v2 34/52] powerpc/64s/exception: simplify hmi windup code

2019-06-20 Thread Nicholas Piggin
Duplicate the hmi windup code for both cases, rather than putting a
special-case branch in the middle of it. Remove the unused label. This
helps with later code consolidation.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index cf89d728720a..3476cffa21b8 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1776,6 +1776,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
addir3,r1,STACK_FRAME_OVERHEAD
BRANCH_LINK_TO_FAR(DOTSYM(hmi_exception_realmode)) /* Function call ABI 
*/
cmpdi   cr0,r3,0
+   bne 1f
 
/* Windup the stack. */
/* Move original HSRR0 and HSRR1 into the respective regs */
@@ -1794,13 +1795,28 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
REST_GPR(10, r1)
ld  r11,_CCR(r1)
REST_2GPRS(12, r1)
-   bne 1f
mtcrr11
REST_GPR(11, r1)
ld  r1,GPR1(r1)
HRFI_TO_USER_OR_KERNEL
 
-1: mtcrr11
+1:
+   ld  r9,_MSR(r1)
+   mtspr   SPRN_HSRR1,r9
+   ld  r9,_NIP(r1)
+   mtspr   SPRN_HSRR0,r9
+   ld  r9,_CTR(r1)
+   mtctr   r9
+   ld  r9,_XER(r1)
+   mtxer   r9
+   ld  r9,_LINK(r1)
+   mtlrr9
+   REST_GPR(0, r1)
+   REST_8GPRS(2, r1)
+   REST_GPR(10, r1)
+   ld  r11,_CCR(r1)
+   REST_2GPRS(12, r1)
+   mtcrr11
REST_GPR(11, r1)
ld  r1,GPR1(r1)
 
@@ -1808,8 +1824,6 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
 * Go to virtual mode and pull the HMI event information from
 * firmware.
 */
-   .globl hmi_exception_after_realmode
-hmi_exception_after_realmode:
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0 PACA_EXGEN
b   tramp_real_hmi_exception
-- 
2.20.1



[PATCH v2 33/52] powerpc/64s/exception: move machine check windup in_mce handling

2019-06-20 Thread Nicholas Piggin
Move in_mce decrement earlier before registers are restored (but
still after RI=0). This helps with later consolidation.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 804438669454..cf89d728720a 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1072,6 +1072,10 @@ EXC_COMMON_BEGIN(machine_check_common)
/* Clear MSR_RI before setting SRR0 and SRR1. */\
li  r9,0;   \
mtmsrd  r9,1;   /* Clear MSR_RI */  \
+   /* Decrement paca->in_mce now RI is clear. */   \
+   lhz r12,PACA_IN_MCE(r13);   \
+   subir12,r12,1;  \
+   sth r12,PACA_IN_MCE(r13);   \
/* Move original SRR0 and SRR1 into the respective regs */  \
ld  r9,_MSR(r1);\
mtspr   SPRN_SRR1,r9;   \
@@ -1088,10 +1092,6 @@ EXC_COMMON_BEGIN(machine_check_common)
REST_GPR(10, r1);   \
ld  r11,_CCR(r1);   \
mtcrr11;\
-   /* Decrement paca->in_mce. */   \
-   lhz r12,PACA_IN_MCE(r13);   \
-   subir12,r12,1;  \
-   sth r12,PACA_IN_MCE(r13);   \
REST_GPR(11, r1);   \
REST_2GPRS(12, r1); \
/* restore original r1. */  \
-- 
2.20.1



[PATCH v2 32/52] powerpc/64s/exception: windup use r9 consistently to restore SPRs

2019-06-20 Thread Nicholas Piggin
Trivial code change, r3->r9.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 539bb1b83d90..804438669454 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -914,8 +914,8 @@ EXC_COMMON_BEGIN(system_reset_common)
/* Move original SRR0 and SRR1 into the respective regs */
ld  r9,_MSR(r1)
mtspr   SPRN_SRR1,r9
-   ld  r3,_NIP(r1)
-   mtspr   SPRN_SRR0,r3
+   ld  r9,_NIP(r1)
+   mtspr   SPRN_SRR0,r9
ld  r9,_CTR(r1)
mtctr   r9
ld  r9,_XER(r1)
@@ -1075,8 +1075,8 @@ EXC_COMMON_BEGIN(machine_check_common)
/* Move original SRR0 and SRR1 into the respective regs */  \
ld  r9,_MSR(r1);\
mtspr   SPRN_SRR1,r9;   \
-   ld  r3,_NIP(r1);\
-   mtspr   SPRN_SRR0,r3;   \
+   ld  r9,_NIP(r1);\
+   mtspr   SPRN_SRR0,r9;   \
ld  r9,_CTR(r1);\
mtctr   r9; \
ld  r9,_XER(r1);\
@@ -1781,8 +1781,8 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
/* Move original HSRR0 and HSRR1 into the respective regs */
ld  r9,_MSR(r1)
mtspr   SPRN_HSRR1,r9
-   ld  r3,_NIP(r1)
-   mtspr   SPRN_HSRR0,r3
+   ld  r9,_NIP(r1)
+   mtspr   SPRN_HSRR0,r9
ld  r9,_CTR(r1)
mtctr   r9
ld  r9,_XER(r1)
-- 
2.20.1



[PATCH v2 31/52] powerpc/64s/exception: mtmsrd L=1 cleanup

2019-06-20 Thread Nicholas Piggin
All supported 64s CPUs support mtmsrd L=1 instruction, so a cleanup
can be made in sreset and mce handlers.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index f582ae30f3f7..539bb1b83d90 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -887,11 +887,8 @@ EXC_COMMON_BEGIN(system_reset_common)
addir3,r1,STACK_FRAME_OVERHEAD
bl  system_reset_exception
 
-   /* This (and MCE) can be simplified with mtmsrd L=1 */
/* Clear MSR_RI before setting SRR0 and SRR1. */
-   li  r0,MSR_RI
-   mfmsr   r9
-   andcr9,r9,r0
+   li  r9,0
mtmsrd  r9,1
 
/*
@@ -1073,9 +1070,7 @@ EXC_COMMON_BEGIN(machine_check_common)
 
 #define MACHINE_CHECK_HANDLER_WINDUP   \
/* Clear MSR_RI before setting SRR0 and SRR1. */\
-   li  r0,MSR_RI;  \
-   mfmsr   r9; /* get MSR value */ \
-   andcr9,r9,r0;   \
+   li  r9,0;   \
mtmsrd  r9,1;   /* Clear MSR_RI */  \
/* Move original SRR0 and SRR1 into the respective regs */  \
ld  r9,_MSR(r1);\
-- 
2.20.1



[PATCH v2 30/52] powerpc/64s/exception: optimise system_reset for idle, clean up non-idle case

2019-06-20 Thread Nicholas Piggin
The idle wake up code in the system reset interrupt is not very
optimal. There are two requirements: perform idle wake up quickly;
and save everything including CFAR for non-idle interrupts, with
no performance requirement.

The problem with placing the idle test in the middle of the handler
and using the normal handler code to save CFAR, is that it's quite
costly (e.g., mfcfar is serialising, speculative workarounds get
applied, SRR1 has to be reloaded, etc). It also prevents the standard
interrupt handler boilerplate being used.

This pain can be avoided by using a dedicated idle interrupt handler
at the start of the interrupt handler, which restores all registers
back to the way they were in case it was not an idle wake up. CFAR
is preserved without saving it before the non-idle case by making that
the fall-through, and idle is a taken branch.

Performance seems to be in the noise, but possibly around 0.5% faster,
the executed instructions certainly look better. The bigger benefit is
being able to drop in standard interrupt handlers after the idle code,
which helps with subsequent cleanup and consolidation.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 89 ++--
 1 file changed, 44 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index e0492912ea79..f582ae30f3f7 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -241,7 +241,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
  * load KBASE for a slight optimisation.
  */
 #define BRANCH_TO_C000(reg, label) \
-   __LOAD_HANDLER(reg, label); \
+   __LOAD_FAR_HANDLER(reg, label); \
mtctr   reg;\
bctr
 
@@ -784,16 +784,6 @@ EXC_VIRT_NONE(0x4000, 0x100)
 
 
 EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
-   SET_SCRATCH0(r13)
-   EXCEPTION_PROLOG_0 PACA_EXNMI
-
-   /* This is EXCEPTION_PROLOG_1 with the idle feature section added */
-   OPT_SAVE_REG_TO_PACA(PACA_EXNMI+EX_PPR, r9, CPU_FTR_HAS_PPR)
-   OPT_SAVE_REG_TO_PACA(PACA_EXNMI+EX_CFAR, r10, CPU_FTR_CFAR)
-   INTERRUPT_TO_KERNEL
-   SAVE_CTR(r10, PACA_EXNMI)
-   mfcrr9
-
 #ifdef CONFIG_PPC_P7_NAP
/*
 * If running native on arch 2.06 or later, check if we are waking up
@@ -801,45 +791,67 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
 * bits 46:47. A non-0 value indicates that we are coming from a power
 * saving state. The idle wakeup handler initially runs in real mode,
 * but we branch to the 0xc000... address so we can turn on relocation
-* with mtmsr.
+* with mtmsrd later, after SPRs are restored.
+*
+* Careful to minimise cost for the fast path (idle wakeup) while
+* also avoiding clobbering CFAR for the non-idle case. Once we know
+* it is an idle wake, volatiles don't matter, which is why we use
+* those here, and then re-do the entry in case of non-idle (without
+* branching for the non-idle case, to keep CFAR).
 */
 BEGIN_FTR_SECTION
-   mfspr   r10,SPRN_SRR1
-   rlwinm. r10,r10,47-31,30,31
-   beq-1f
-   cmpwi   cr1,r10,2
+   SET_SCRATCH0(r13)
+   GET_PACA(r13)
+   std r3,PACA_EXNMI+0*8(r13)
+   std r4,PACA_EXNMI+1*8(r13)
+   std r5,PACA_EXNMI+2*8(r13)
mfspr   r3,SPRN_SRR1
-   bltlr   cr1 /* no state loss, return to idle caller */
-   BRANCH_TO_C000(r10, system_reset_idle_common)
-1:
+   mfocrf  r4,0x80
+   rlwinm. r5,r3,47-31,30,31
+   bne+system_reset_idle_wake
+   /* Not powersave wakeup. Restore regs for regular interrupt handler. */
+   mtocrf  0x80,r4
+   ld  r12,PACA_EXNMI+0*8(r13)
+   ld  r4,PACA_EXNMI+1*8(r13)
+   ld  r5,PACA_EXNMI+2*8(r13)
+   GET_SCRATCH0(r13)
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif
 
-   KVMTEST EXC_STD 0x100
-   std r11,PACA_EXNMI+EX_R11(r13)
-   std r12,PACA_EXNMI+EX_R12(r13)
-   GET_SCRATCH0(r10)
-   std r10,PACA_EXNMI+EX_R13(r13)
-
+   SET_SCRATCH0(r13)   /* save r13 */
+   EXCEPTION_PROLOG_0 PACA_EXNMI
+   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXNMI, 1, 0x100, 0
EXCEPTION_PROLOG_2_REAL system_reset_common, EXC_STD, 0
/*
 * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is
 * being used, so a nested NMI exception would corrupt it.
 */
-
 EXC_REAL_END(system_reset, 0x100, 0x100)
+
 EXC_VIRT_NONE(0x4100, 0x100)
 TRAMP_KVM(PACA_EXNMI, 0x100)
 
 #ifdef CONFIG_PPC_P7_NAP
-EXC_COMMON_BEGIN(system_reset_idle_common)
-   /*
-* This must be a direct branch (without linker branch stub) because
-* we can not use TOC at this point as 

[PATCH v2 29/52] powerpc/64s/exception: avoid SPR RAW scoreboard stall in real mode entry

2019-06-20 Thread Nicholas Piggin
Move SPR reads ahead of writes. Real mode entry that is not a KVM
guest is rare these days, but bad practice propagates.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 62f7e9ad23c6..e0492912ea79 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -183,19 +183,19 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
.endif
.if \hsrr
mfspr   r11,SPRN_HSRR0  /* save HSRR0 */
+   mfspr   r12,SPRN_HSRR1  /* and HSRR1 */
+   mtspr   SPRN_HSRR1,r10
.else
mfspr   r11,SPRN_SRR0   /* save SRR0 */
+   mfspr   r12,SPRN_SRR1   /* and SRR1 */
+   mtspr   SPRN_SRR1,r10
.endif
-   LOAD_HANDLER(r12, \label\())
+   LOAD_HANDLER(r10, \label\())
.if \hsrr
-   mtspr   SPRN_HSRR0,r12
-   mfspr   r12,SPRN_HSRR1  /* and HSRR1 */
-   mtspr   SPRN_HSRR1,r10
+   mtspr   SPRN_HSRR0,r10
HRFI_TO_KERNEL
.else
-   mtspr   SPRN_SRR0,r12
-   mfspr   r12,SPRN_SRR1   /* and SRR1 */
-   mtspr   SPRN_SRR1,r10
+   mtspr   SPRN_SRR0,r10
RFI_TO_KERNEL
.endif
b   .   /* prevent speculative execution */
-- 
2.20.1



[PATCH v2 28/52] powerpc/64s/exception: clean up system call entry

2019-06-20 Thread Nicholas Piggin
syscall / hcall entry unnecessarily differs between KVM and non-KVM
builds. Move the SMT priority instruction to the same location
(after INTERRUPT_TO_KERNEL).

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 25 +++--
 1 file changed, 7 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index d6de0ce1f0f2..62f7e9ad23c6 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1635,10 +1635,8 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
std r10,PACA_EXGEN+EX_R10(r13)
INTERRUPT_TO_KERNEL
KVMTEST EXC_STD 0xc00 /* uses r10, branch to do_kvm_0xc00_system_call */
-   HMT_MEDIUM
mfctr   r9
 #else
-   HMT_MEDIUM
mr  r9,r13
GET_PACA(r13)
INTERRUPT_TO_KERNEL
@@ -1650,11 +1648,13 @@ BEGIN_FTR_SECTION
beq-1f
 END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
 #endif
-   /* We reach here with PACA in r13, r13 in r9, and HMT_MEDIUM. */
-
-   .if \real
+   /* We reach here with PACA in r13, r13 in r9. */
mfspr   r11,SPRN_SRR0
mfspr   r12,SPRN_SRR1
+
+   HMT_MEDIUM
+
+   .if \real
__LOAD_HANDLER(r10, system_call_common)
mtspr   SPRN_SRR0,r10
ld  r10,PACAKMSR(r13)
@@ -1662,24 +1662,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
RFI_TO_KERNEL
b   .   /* prevent speculative execution */
.else
+   li  r10,MSR_RI
+   mtmsrd  r10,1   /* Set RI (EE=0) */
 #ifdef CONFIG_RELOCATABLE
-   /*
-* We can't branch directly so we do it via the CTR which
-* is volatile across system calls.
-*/
__LOAD_HANDLER(r10, system_call_common)
mtctr   r10
-   mfspr   r11,SPRN_SRR0
-   mfspr   r12,SPRN_SRR1
-   li  r10,MSR_RI
-   mtmsrd  r10,1
bctr
 #else
-   /* We can branch directly */
-   mfspr   r11,SPRN_SRR0
-   mfspr   r12,SPRN_SRR1
-   li  r10,MSR_RI
-   mtmsrd  r10,1   /* Set RI (EE=0) */
b   system_call_common
 #endif
.endif
-- 
2.20.1



[PATCH v2 27/52] powerpc/64s/exception: move paca save area offsets into exception-64s.S

2019-06-20 Thread Nicholas Piggin
No generated code change. The only file change is in bug table line numbers.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/exception-64s.h | 17 +++--
 arch/powerpc/kernel/exceptions-64s.S | 22 ++
 2 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h 
b/arch/powerpc/include/asm/exception-64s.h
index 79e5ac87c029..33f4f72eb035 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -32,22 +32,11 @@
  */
 #include 
 
-/* PACA save area offsets (exgen, exmc, etc) */
-#define EX_R9  0
-#define EX_R10 8
-#define EX_R11 16
-#define EX_R12 24
-#define EX_R13 32
-#define EX_DAR 40
-#define EX_DSISR   48
-#define EX_CCR 52
-#define EX_CFAR56
-#define EX_PPR 64
+/* PACA save area size in u64 units (exgen, exmc, etc) */
 #if defined(CONFIG_RELOCATABLE)
-#define EX_CTR 72
-#define EX_SIZE10  /* size in u64 units */
+#define EX_SIZE10
 #else
-#define EX_SIZE9   /* size in u64 units */
+#define EX_SIZE9
 #endif
 
 /*
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 4b4bb8f43f55..d6de0ce1f0f2 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -21,6 +21,28 @@
 #include 
 #include 
 
+/* PACA save area offsets (exgen, exmc, etc) */
+#define EX_R9  0
+#define EX_R10 8
+#define EX_R11 16
+#define EX_R12 24
+#define EX_R13 32
+#define EX_DAR 40
+#define EX_DSISR   48
+#define EX_CCR 52
+#define EX_CFAR56
+#define EX_PPR 64
+#if defined(CONFIG_RELOCATABLE)
+#define EX_CTR 72
+.if EX_SIZE != 10
+   .error "EX_SIZE is wrong"
+.endif
+#else
+.if EX_SIZE != 9
+   .error "EX_SIZE is wrong"
+.endif
+#endif
+
 /*
  * We're short on space and time in the exception prolog, so we can't
  * use the normal LOAD_REG_IMMEDIATE macro to load the address of label.
-- 
2.20.1



[PATCH v2 26/52] powerpc/64s/exception: remove pointless EXCEPTION_PROLOG macro indirection

2019-06-20 Thread Nicholas Piggin
No generated code change. The only file change is in bug table line numbers.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 97 +---
 1 file changed, 45 insertions(+), 52 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 6c0321e128da..4b4bb8f43f55 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -326,34 +326,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
std r0,GPR0(r1);/* save r0 in stackframe*/ \
std r10,GPR1(r1);   /* save r1 in stackframe*/ \
 
-
-/*
- * The common exception prolog is used for all except a few exceptions
- * such as a segment miss on a kernel address.  We have to be prepared
- * to take another exception from the point where we first touch the
- * kernel stack onwards.
- *
- * On entry r13 points to the paca, r9-r13 are saved in the paca,
- * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and
- * SRR1, and relocation is on.
- */
-#define EXCEPTION_PROLOG_COMMON(n, area)  \
-   andi.   r10,r12,MSR_PR; /* See if coming from user  */ \
-   mr  r10,r1; /* Save r1  */ \
-   subir1,r1,INT_FRAME_SIZE;   /* alloc frame on kernel stack  */ \
-   beq-1f;\
-   ld  r1,PACAKSAVE(r13);  /* kernel stack to use  */ \
-1: tdgei   r1,-INT_FRAME_SIZE; /* trap if r1 is in userspace   */ \
-   EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; \
-3: EXCEPTION_PROLOG_COMMON_1();   \
-   kuap_save_amr_and_lock r9, r10, cr1, cr0;  \
-   beq 4f; /* if from kernel mode  */ \
-   ACCOUNT_CPU_USER_ENTRY(r13, r9, r10);  \
-   SAVE_PPR(area, r9);\
-4: EXCEPTION_PROLOG_COMMON_2(area)\
-   EXCEPTION_PROLOG_COMMON_3(n)   \
-   ACCOUNT_STOLEN_TIME
-
 /* Save original regs values from save area to stack frame. */
 #define EXCEPTION_PROLOG_COMMON_2(area)
   \
ld  r9,area+EX_R9(r13); /* move r9, r10 to stackframe   */ \
@@ -373,7 +345,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); 
   \
GET_CTR(r10, area);\
std r10,_CTR(r1);
 
-#define EXCEPTION_PROLOG_COMMON_3(n)  \
+#define EXCEPTION_PROLOG_COMMON_3(trap)
   \
std r2,GPR2(r1);/* save r2 in stackframe*/ \
SAVE_4GPRS(3, r1);  /* save r3 - r6 in stackframe   */ \
SAVE_2GPRS(7, r1);  /* save r7, r8 in stackframe*/ \
@@ -384,26 +356,38 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);   
   \
mfspr   r11,SPRN_XER;   /* save XER in stackframe   */ \
std r10,SOFTE(r1); \
std r11,_XER(r1);  \
-   li  r9,(n)+1;  \
+   li  r9,(trap)+1;   \
std r9,_TRAP(r1);   /* set trap number  */ \
li  r10,0; \
ld  r11,exception_marker@toc(r2);  \
std r10,RESULT(r1); /* clear regs->result   */ \
std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame  */
 
-#define RUNLATCH_ON\
-BEGIN_FTR_SECTION  \
-   ld  r3, PACA_THREAD_INFO(r13);  \
-   ld  r4,TI_LOCAL_FLAGS(r3);  \
-   andi.   r0,r4,_TLF_RUNLATCH;\
-   beqlppc64_runlatch_on_trampoline;   \
-END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
-
-#define EXCEPTION_COMMON(area, trap)   \
-   EXCEPTION_PROLOG_COMMON(trap, area);\
+/*
+ * On entry r13 points to the paca, r9-r13 are saved in the paca,
+ * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and
+ * SRR1, and relocation is on.
+ */
+#define EXCEPTION_COMMON(area, trap)  \
+   andi.   r10,r12,MSR_PR; /* See if coming from user  */ \
+   mr  r10,r1; /* Save r1  */ \
+   subir1,r1,INT_FRAME_SIZE;   /* alloc frame on kernel stack  */ \
+   beq-1f;   

[PATCH v2 25/52] powerpc/64s/exception: remove bad stack branch

2019-06-20 Thread Nicholas Piggin
The bad stack test in interrupt handlers has a few problems. For
performance it is taken in the common case, which is a fetch bubble
and a waste of i-cache.

For code development and maintenance, it requires yet another stack
frame setup routine, and that constrains all exception handlers to
follow the same register save pattern which inhibits future
optimisation.

Remove the test/branch and replace it with a trap. Teach the program
check handler to use the emergency stack for this case.

This does not result in quite so nice a message, however the SRR0 and
SRR1 of the crashed interrupt can be seen in r11 and r12, as is the
original r1 (adjusted by INT_FRAME_SIZE). These are the most important
parts to debugging the issue.

The original r9-12 and cr0 is lost, which is the main downside.

  kernel BUG at linux/arch/powerpc/kernel/exceptions-64s.S:847!
  Oops: Exception in kernel mode, sig: 5 [#1]
  BE SMP NR_CPUS=2048 NUMA PowerNV
  Modules linked in:
  CPU: 0 PID: 1 Comm: swapper/0 Not tainted
  NIP:  c0009108 LR: c0cadbcc CTR: c00090f0
  REGS: c000fffcbd70 TRAP: 0700   Not tainted
  MSR:  90021032   CR: 28222448  XER: 2004
  CFAR: c0009100 IRQMASK: 0
  GPR00: 003d fd00 c18cfb00 c000f02b3166
  GPR04: fffd 0007 fffb 0030
  GPR08: 0037 28222448  c0ca8de0
  GPR12: 92009032 c1ae c0010a00 
  GPR16:    
  GPR20: c000f00322c0 c0f85200 0004 
  GPR24: fffe   000a
  GPR28:   c000f02b391c c000f02b3167
  NIP [c0009108] decrementer_common+0x18/0x160
  LR [c0cadbcc] .vsnprintf+0x3ec/0x4f0
  Call Trace:
  Instruction dump:
  996d098a 994d098b 38610070 480246ed 48005518 6000 3820 718a4000
  7c2a0b78 3821fd00 41c20008 e82d0970 <0981fd00> f92101a0 f9610170 f9810178

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/exception-64s.h |  7 --
 arch/powerpc/include/asm/paca.h  |  2 +
 arch/powerpc/kernel/asm-offsets.c|  2 +
 arch/powerpc/kernel/exceptions-64s.S | 95 
 arch/powerpc/xmon/xmon.c |  2 +
 5 files changed, 22 insertions(+), 86 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h 
b/arch/powerpc/include/asm/exception-64s.h
index dc6a5ccac965..79e5ac87c029 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -55,13 +55,6 @@
  */
 #define MAX_MCE_DEPTH  4
 
-/*
- * EX_R3 is only used by the bad_stack handler. bad_stack reloads and
- * saves DAR from SPRN_DAR, and EX_DAR is not used. So EX_R3 can overlap
- * with EX_DAR.
- */
-#define EX_R3  EX_DAR
-
 #ifdef __ASSEMBLY__
 
 #define STF_ENTRY_BARRIER_SLOT \
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 9bd2326bef6f..e3cc9eb9204d 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -166,7 +166,9 @@ struct paca_struct {
u64 kstack; /* Saved Kernel stack addr */
u64 saved_r1;   /* r1 save for RTAS calls or PM or EE=0 
*/
u64 saved_msr;  /* MSR saved here by enter_rtas */
+#ifdef CONFIG_PPC_BOOK3E
u16 trap_save;  /* Used when bad stack is encountered */
+#endif
u8 irq_soft_mask;   /* mask for irq soft masking */
u8 irq_happened;/* irq happened while soft-disabled */
u8 irq_work_pending;/* IRQ_WORK interrupt while 
soft-disable */
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 31dc7e64cbfc..4ccb6b3a7fbd 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -266,7 +266,9 @@ int main(void)
OFFSET(ACCOUNT_STARTTIME_USER, paca_struct, accounting.starttime_user);
OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime);
OFFSET(ACCOUNT_SYSTEM_TIME, paca_struct, accounting.stime);
+#ifdef CONFIG_PPC_BOOK3E
OFFSET(PACA_TRAP_SAVE, paca_struct, trap_save);
+#endif
OFFSET(PACA_SPRG_VDSO, paca_struct, sprg_vdso);
 #else /* CONFIG_PPC64 */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 0a2b4e8b02b0..6c0321e128da 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -343,14 +343,8 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
subir1,r1,INT_FRAME_SIZE;   /* alloc frame on kernel stack  */ \
beq-1f;\

[PATCH v2 24/52] powerpc/64s/exception: generate regs clear instructions using .rept

2019-06-20 Thread Nicholas Piggin
No generated code change.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 29 +++-
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 02b4722b7c64..0a2b4e8b02b0 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -2010,12 +2010,11 @@ BEGIN_FTR_SECTION
mtmsrd  r10
sync
 
-#define FMR2(n)  fmr (n), (n) ; fmr n+1, n+1
-#define FMR4(n)  FMR2(n) ; FMR2(n+2)
-#define FMR8(n)  FMR4(n) ; FMR4(n+4)
-#define FMR16(n) FMR8(n) ; FMR8(n+8)
-#define FMR32(n) FMR16(n) ; FMR16(n+16)
-   FMR32(0)
+   .Lreg=0
+   .rept 32
+   fmr .Lreg,.Lreg
+   .Lreg=.Lreg+1
+   .endr
 
 FTR_SECTION_ELSE
 /*
@@ -2027,12 +2026,11 @@ FTR_SECTION_ELSE
mtmsrd  r10
sync
 
-#define XVCPSGNDP2(n) XVCPSGNDP(n,n,n) ; XVCPSGNDP(n+1,n+1,n+1)
-#define XVCPSGNDP4(n) XVCPSGNDP2(n) ; XVCPSGNDP2(n+2)
-#define XVCPSGNDP8(n) XVCPSGNDP4(n) ; XVCPSGNDP4(n+4)
-#define XVCPSGNDP16(n) XVCPSGNDP8(n) ; XVCPSGNDP8(n+8)
-#define XVCPSGNDP32(n) XVCPSGNDP16(n) ; XVCPSGNDP16(n+16)
-   XVCPSGNDP32(0)
+   .Lreg=0
+   .rept 32
+   XVCPSGNDP(.Lreg,.Lreg,.Lreg)
+   .Lreg=.Lreg+1
+   .endr
 
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_206)
 
@@ -2043,7 +2041,12 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
  * To denormalise we need to move a copy of the register to itself.
  * For POWER8 we need to do that for all 64 VSX registers
  */
-   XVCPSGNDP32(32)
+   .Lreg=32
+   .rept 32
+   XVCPSGNDP(.Lreg,.Lreg,.Lreg)
+   .Lreg=.Lreg+1
+   .endr
+
 denorm_done:
mfspr   r11,SPRN_HSRR0
subir11,r11,4
-- 
2.20.1



[PATCH v2 23/52] powerpc/64s/exception: fix indenting irregularities

2019-06-20 Thread Nicholas Piggin
Generally, macros that result in instructions being expanded are
indented by a tab, and those that don't have no indent. Fix the
obvious cases that go contrary to style.

No generated code change.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 92 ++--
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index f7b6634bcc75..02b4722b7c64 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -261,16 +261,16 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
cmpwi   r10,KVM_GUEST_MODE_SKIP
beq 89f
.else
-   BEGIN_FTR_SECTION_NESTED(947)
+BEGIN_FTR_SECTION_NESTED(947)
ld  r10,\area+EX_CFAR(r13)
std r10,HSTATE_CFAR(r13)
-   END_FTR_SECTION_NESTED(CPU_FTR_CFAR,CPU_FTR_CFAR,947)
+END_FTR_SECTION_NESTED(CPU_FTR_CFAR,CPU_FTR_CFAR,947)
.endif
 
-   BEGIN_FTR_SECTION_NESTED(948)
+BEGIN_FTR_SECTION_NESTED(948)
ld  r10,\area+EX_PPR(r13)
std r10,HSTATE_PPR(r13)
-   END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
+END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948)
ld  r10,\area+EX_R10(r13)
std r12,HSTATE_SCRATCH0(r13)
sldir12,r9,32
@@ -372,10 +372,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
std r9,GPR11(r1);  \
std r10,GPR12(r1); \
std r11,GPR13(r1); \
-   BEGIN_FTR_SECTION_NESTED(66);  \
+BEGIN_FTR_SECTION_NESTED(66); \
ld  r10,area+EX_CFAR(r13); \
std r10,ORIG_GPR3(r1); \
-   END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);\
+END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);
   \
GET_CTR(r10, area);\
std r10,_CTR(r1);
 
@@ -794,7 +794,7 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
 * but we branch to the 0xc000... address so we can turn on relocation
 * with mtmsr.
 */
-   BEGIN_FTR_SECTION
+BEGIN_FTR_SECTION
mfspr   r10,SPRN_SRR1
rlwinm. r10,r10,47-31,30,31
beq-1f
@@ -803,7 +803,7 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
bltlr   cr1 /* no state loss, return to idle caller */
BRANCH_TO_C000(r10, system_reset_idle_common)
 1:
-   END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif
 
KVMTEST EXC_STD 0x100
@@ -1151,10 +1151,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
 *
 * Go back to nap/sleep/winkle mode again if (b) is true.
 */
-   BEGIN_FTR_SECTION
+BEGIN_FTR_SECTION
rlwinm. r11,r12,47-31,30,31
bne machine_check_idle_common
-   END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif
 
/*
@@ -1261,13 +1261,13 @@ EXC_COMMON_BEGIN(mce_return)
b   .
 
 EXC_REAL_BEGIN(data_access, 0x300, 0x80)
-SET_SCRATCH0(r13)  /* save r13 */
-EXCEPTION_PROLOG_0 PACA_EXGEN
+   SET_SCRATCH0(r13)   /* save r13 */
+   EXCEPTION_PROLOG_0 PACA_EXGEN
b   tramp_real_data_access
 EXC_REAL_END(data_access, 0x300, 0x80)
 
 TRAMP_REAL_BEGIN(tramp_real_data_access)
-EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x300, 0
+   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x300, 0
/*
 * DAR/DSISR must be read before setting MSR[RI], because
 * a d-side MCE will clobber those registers so is not
@@ -1280,9 +1280,9 @@ EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x300, 0
 EXCEPTION_PROLOG_2_REAL data_access_common, EXC_STD, 1
 
 EXC_VIRT_BEGIN(data_access, 0x4300, 0x80)
-SET_SCRATCH0(r13)  /* save r13 */
-EXCEPTION_PROLOG_0 PACA_EXGEN
-EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x300, 0
+   SET_SCRATCH0(r13)   /* save r13 */
+   EXCEPTION_PROLOG_0 PACA_EXGEN
+   EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x300, 0
mfspr   r10,SPRN_DAR
mfspr   r11,SPRN_DSISR
std r10,PACA_EXGEN+EX_DAR(r13)
@@ -1315,24 +1315,24 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
 
 EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80)
-SET_SCRATCH0(r13)  /* save r13 */
-EXCEPTION_PROLOG_0 PACA_EXSLB
+   SET_SCRATCH0(r13)   /* save r13 */
+   EXCEPTION_PROLOG_0 PACA_EXSLB
b   tramp_real_data_access_slb
 EXC_REAL_END(data_access_slb, 0x380, 0x80)
 
 TRAMP_REAL_BEGIN(tramp_real_data_access_slb)
-EXCEPTION_PROLOG_1 EXC_STD,