[PATCH v2] powerpc/64s: Make boot look nicer

2019-07-01 Thread Nicholas Piggin
Radix boot looks like this:

 -
 phys_mem_size = 0x2
 dcache_bsize  = 0x80
 icache_bsize  = 0x80
 cpu_features  = 0xc06f8f5fb1a7
   possible= 0xfbffcf5fb1a7
   always  = 0x0003800081a1
 cpu_user_features = 0xdc0065c2 0xaee0
 mmu_features  = 0xbc006041
 firmware_features = 0x1000
 hash-mmu: ppc64_pft_size= 0x0
 hash-mmu: kernel vmalloc start   = 0xc008
 hash-mmu: kernel IO start= 0xc00a
 hash-mmu: kernel vmemmap start   = 0xc00c
 -

It's misleading to see the hash-mmu prefix there when booting with radix. After the fix:

 -
 phys_mem_size = 0x2
 dcache_bsize  = 0x80
 icache_bsize  = 0x80
 cpu_features  = 0xc06f8f5fb1a7
   possible= 0xfbffcf5fb1a7
   always  = 0x0003800081a1
 cpu_user_features = 0xdc0065c2 0xaee0
 mmu_features  = 0xbc006041
 firmware_features = 0x1000
 vmalloc start = 0xc008
 IO start  = 0xc00a
 vmemmap start = 0xc00c
 -

The hash MMU output has the same text misalignment problem, which is also corrected.

Signed-off-by: Nicholas Piggin 
---
v2: avoid adding an ifdef to generic code.
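
For reference, a minimal sketch (not part of the patch) of why the
#undef/#define dance works: pr_fmt() is expanded at each pr_info() call
site, so redefining it mid-file changes the prefix only for the calls
that follow.

  /* Illustrative only; mirrors the mechanism used in the patch. */
  #define pr_fmt(fmt) "hash-mmu: " fmt          /* file-wide prefix */
  #include <linux/printk.h>

  void example(void)
  {
          pr_info("ppc64_pft_size = 0x%llx\n", 0ULL);  /* "hash-mmu: ..." */

  #undef pr_fmt
  #define pr_fmt(fmt) fmt                       /* drop the prefix from here on */
          pr_info("vmalloc start = 0x%lx\n", 0UL);     /* printed without prefix */
  }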

 arch/powerpc/mm/book3s64/hash_utils.c | 21 +++--
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 28ced26f2a00..54f2d6acf6da 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1946,11 +1946,20 @@ machine_device_initcall(pseries, hash64_debugfs);
 
 void __init print_system_hash_info(void)
 {
-   pr_info("ppc64_pft_size= 0x%llx\n", ppc64_pft_size);
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
 
-   if (htab_hash_mask)
-   pr_info("htab_hash_mask= 0x%lx\n", htab_hash_mask);
-   pr_info("kernel vmalloc start   = 0x%lx\n", KERN_VIRT_START);
-   pr_info("kernel IO start= 0x%lx\n", KERN_IO_START);
-   pr_info("kernel vmemmap start   = 0x%lx\n", (unsigned long)vmemmap);
+   if (!early_radix_enabled()) {
+   pr_info("ppc64_pft_size= 0x%llx\n", ppc64_pft_size);
+
+   if (htab_hash_mask)
+   pr_info("htab_hash_mask= 0x%lx\n", htab_hash_mask);
+   }
+
+   pr_info("vmalloc start = 0x%lx\n", KERN_VIRT_START);
+   pr_info("IO start  = 0x%lx\n", KERN_IO_START);
+   pr_info("vmemmap start = 0x%lx\n", (unsigned long)vmemmap);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "hash-mmu: " fmt
 }
-- 
2.20.1



[PATCH v2 2/2] Powerpc/Watchpoint: Rewrite ptrace-hwbreak.c selftest

2019-07-01 Thread Ravi Bangoria
The ptrace-hwbreak.c selftest is logically broken. On powerpc, when a
watchpoint is created with ptrace, the signal is generated before the
instruction executes, and the user has to manually single-step over the
instruction with the watchpoint disabled. The selftest never does this,
so it keeps getting the signal at the same instruction. If we fix that,
the selftest still fails because the logical connection between the
tracer (parent) and tracee (child) is also broken. Rewrite the selftest
and add new tests for unaligned access.
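
To make the described protocol concrete, here is a simplified, assumed
sketch of the tracer side (illustrative only, not taken from the
selftest): the tracer must disable the watchpoint, single-step the
faulting instruction, and then re-arm it.

  #include <sys/ptrace.h>
  #include <sys/types.h>
  #include <sys/wait.h>

  /*
   * Hypothetical helper, for illustration only.  PTRACE_SET_DEBUGREG is
   * powerpc-specific (from asm/ptrace.h if the libc headers lack it).
   */
  static void step_over_watchpoint(pid_t child, unsigned long watch_addr)
  {
          int status;

          waitpid(child, &status, 0);                 /* child stopped on the access */
          ptrace(PTRACE_SET_DEBUGREG, child, 0, 0);   /* disable the watchpoint */
          ptrace(PTRACE_SINGLESTEP, child, 0, 0);     /* let the access complete */
          waitpid(child, &status, 0);
          ptrace(PTRACE_SET_DEBUGREG, child, 0, watch_addr);  /* re-arm for the next hit */
  }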

With patch:
  $ ./tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak
  test: ptrace-hwbreak
  tags: git_version:v5.2-rc2-33-ga247a75f90a9-dirty
  PTRACE_SET_DEBUGREG, WO, len: 1: Ok
  PTRACE_SET_DEBUGREG, WO, len: 2: Ok
  PTRACE_SET_DEBUGREG, WO, len: 4: Ok
  PTRACE_SET_DEBUGREG, WO, len: 8: Ok
  PTRACE_SET_DEBUGREG, RO, len: 1: Ok
  PTRACE_SET_DEBUGREG, RO, len: 2: Ok
  PTRACE_SET_DEBUGREG, RO, len: 4: Ok
  PTRACE_SET_DEBUGREG, RO, len: 8: Ok
  PTRACE_SET_DEBUGREG, RW, len: 1: Ok
  PTRACE_SET_DEBUGREG, RW, len: 2: Ok
  PTRACE_SET_DEBUGREG, RW, len: 4: Ok
  PTRACE_SET_DEBUGREG, RW, len: 8: Ok
  PPC_PTRACE_SETHWDEBUG, MODE_EXACT, WO, len: 1: Ok
  PPC_PTRACE_SETHWDEBUG, MODE_EXACT, RO, len: 1: Ok
  PPC_PTRACE_SETHWDEBUG, MODE_EXACT, RW, len: 1: Ok
  PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, WO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, RO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, RW, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, WO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, RO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, RW, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG, DAWR_MAX_LEN, RW, len: 512: Ok
  success: ptrace-hwbreak

Signed-off-by: Ravi Bangoria 
---
 .../selftests/powerpc/ptrace/ptrace-hwbreak.c | 535 +++---
 1 file changed, 325 insertions(+), 210 deletions(-)

diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c 
b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
index 3066d310f32b..fb1e05d7f77c 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
@@ -22,318 +22,433 @@
 #include 
 #include "ptrace.h"
 
-/* Breakpoint access modes */
-enum {
-   BP_X = 1,
-   BP_RW = 2,
-   BP_W = 4,
-};
-
-static pid_t child_pid;
-static struct ppc_debug_info dbginfo;
-
-static void get_dbginfo(void)
-{
-   int ret;
-
-   ret = ptrace(PPC_PTRACE_GETHWDBGINFO, child_pid, NULL, &dbginfo);
-   if (ret) {
-   perror("Can't get breakpoint info\n");
-   exit(-1);
-   }
-}
-
-static bool hwbreak_present(void)
-{
-   return (dbginfo.num_data_bps != 0);
-}
+/*
+ * Use volatile on all global var so that compiler doesn't
+ * optimise their load/stores. Otherwise selftest can fail.
+ */
+static volatile __u64 glvar;
 
-static bool dawr_present(void)
-{
-   return !!(dbginfo.features & PPC_DEBUG_FEATURE_DATA_BP_DAWR);
-}
+#define DAWR_MAX_LEN 512
+static volatile __u8 big_var[DAWR_MAX_LEN] __attribute__((aligned(512)));
 
-static void set_breakpoint_addr(void *addr)
-{
-   int ret;
+#define A_LEN 6
+#define B_LEN 6
+struct gstruct {
+   __u8 a[A_LEN]; /* double word aligned */
+   __u8 b[B_LEN]; /* double word unaligned */
+};
+static volatile struct gstruct gstruct __attribute__((aligned(512)));
 
-   ret = ptrace(PTRACE_SET_DEBUGREG, child_pid, 0, addr);
-   if (ret) {
-   perror("Can't set breakpoint addr\n");
-   exit(-1);
-   }
-}
 
-static int set_hwbreakpoint_addr(void *addr, int range)
+static void get_dbginfo(pid_t child_pid, struct ppc_debug_info *dbginfo)
 {
-   int ret;
-
-   struct ppc_hw_breakpoint info;
-
-   info.version = 1;
-   info.trigger_type = PPC_BREAKPOINT_TRIGGER_RW;
-   info.addr_mode = PPC_BREAKPOINT_MODE_EXACT;
-   if (range > 0)
-   info.addr_mode = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE;
-   info.condition_mode = PPC_BREAKPOINT_CONDITION_NONE;
-   info.addr = (__u64)addr;
-   info.addr2 = (__u64)addr + range;
-   info.condition_value = 0;
-
-   ret = ptrace(PPC_PTRACE_SETHWDEBUG, child_pid, 0, &info);
-   if (ret < 0) {
-   perror("Can't set breakpoint\n");
+   if (ptrace(PPC_PTRACE_GETHWDBGINFO, child_pid, NULL, dbginfo)) {
+   perror("Can't get breakpoint info");
exit(-1);
}
-   return ret;
 }
 
-static int del_hwbreakpoint_addr(int watchpoint_handle)
+static bool dawr_present(struct ppc_debug_info *dbginfo)
 {
-   int ret;
-
-   ret = ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, watchpoint_handle);
-   if (ret < 0) {
-   perror("Can't delete hw breakpoint\n");
-   exit(-1);
-   }
-   return ret;
+   return !!(dbginfo->features & PPC_DEBUG_FEATURE_DATA_BP_DAWR);
 }
 
-#define DAWR_LENGTH_MAX 512
-
-/* Dummy variables to test read/write accesses */

[PATCH v2 1/2] Powerpc/Watchpoint: Fix length calculation for unaligned target

2019-07-01 Thread Ravi Bangoria
The watchpoint match range is always doubleword (8 byte) aligned on
powerpc. If the given range crosses a doubleword boundary, we need to
increase the length so that the next doubleword also gets covered. For
example:

          address   (len = 6 bytes)
                |=========>
   |---------------|---------------|
   | | | | | | |v|v|v|v|v|v| | | | |
   |---------------|---------------|
   <----8 bytes---->

In such a case, the current code configures the hardware as:
  start_addr = address & ~HW_BREAKPOINT_ALIGN
  len = 8 bytes

Thus reads/writes in the last 4 bytes of the given range are ignored.
Fix this by including the next doubleword in the length. The watchpoint
exception handler already ignores extraneous exceptions, so no changes
are required there.
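
In other words, the hardware-visible length has to be rounded out to
doubleword boundaries on both sides. A minimal sketch of that
calculation (illustrative only; the names below are assumptions, not
the patch's exact code):

  #define HW_BP_ALIGN 8UL   /* doubleword */

  static unsigned long hw_watch_len(unsigned long addr, unsigned long len)
  {
          unsigned long start = addr & ~(HW_BP_ALIGN - 1);                  /* round down */
          unsigned long end = (addr + len + HW_BP_ALIGN - 1) & ~(HW_BP_ALIGN - 1); /* round up */

          return end - start;   /* e.g. addr % 8 == 6, len == 6  =>  16 bytes */
  }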

Signed-off-by: Ravi Bangoria 
---
v1->v2:
 - v1: https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-June/192162.html 
 - No cosmetic patches.
 - v1 fixed the unaligned issue in core hw-breakpoint code only, but
   ptrace was still messing up the address/len. Fix that in v2.

 arch/powerpc/include/asm/debug.h |  1 +
 arch/powerpc/include/asm/hw_breakpoint.h |  9 -
 arch/powerpc/kernel/hw_breakpoint.c  | 26 +++-
 arch/powerpc/kernel/process.c| 50 +++-
 arch/powerpc/kernel/ptrace.c | 37 ++
 arch/powerpc/xmon/xmon.c |  3 +-
 6 files changed, 83 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index 7756026b95ca..9c1b4aaa374b 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -45,6 +45,7 @@ static inline int debugger_break_match(struct pt_regs *regs) 
{ return 0; }
 static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
 #endif
 
+int hw_breakpoint_validate_len(struct arch_hw_breakpoint *hw);
 void __set_breakpoint(struct arch_hw_breakpoint *brk);
 bool ppc_breakpoint_available(void);
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h 
b/arch/powerpc/include/asm/hw_breakpoint.h
index 0fe8c1e46bbc..ca2877be7f79 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -28,6 +28,7 @@ struct arch_hw_breakpoint {
unsigned long   address;
u16 type;
u16 len; /* length of the target data symbol */
+   u16 hw_len; /* length programmed in hw */
 };
 
 /* Note: Don't change the the first 6 bits below as they are in the same order
@@ -47,6 +48,11 @@ struct arch_hw_breakpoint {
 #define HW_BRK_TYPE_PRIV_ALL   (HW_BRK_TYPE_USER | HW_BRK_TYPE_KERNEL | \
 HW_BRK_TYPE_HYP)
 
+#define HW_BREAKPOINT_ALIGN 0x7
+
+#define DABR_MAX_LEN   8
+#define DAWR_MAX_LEN   512
+
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 #include 
 #include 
@@ -58,8 +64,6 @@ struct pmu;
 struct perf_sample_data;
 struct task_struct;
 
-#define HW_BREAKPOINT_ALIGN 0x7
-
 extern int hw_breakpoint_slots(int type);
 extern int arch_bp_generic_fields(int type, int *gen_bp_type);
 extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw);
@@ -84,6 +88,7 @@ static inline void hw_breakpoint_disable(void)
brk.address = 0;
brk.type = 0;
brk.len = 0;
+   brk.hw_len = 0;
if (ppc_breakpoint_available())
__set_breakpoint(&brk);
 }
diff --git a/arch/powerpc/kernel/hw_breakpoint.c 
b/arch/powerpc/kernel/hw_breakpoint.c
index da307dd93ee3..a61a1266089a 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -147,9 +147,9 @@ int hw_breakpoint_arch_parse(struct perf_event *bp,
 const struct perf_event_attr *attr,
 struct arch_hw_breakpoint *hw)
 {
-   int ret = -EINVAL, length_max;
+   int ret = -EINVAL;
 
-   if (!bp)
+   if (!bp || !attr->bp_len)
return ret;
 
hw->type = HW_BRK_TYPE_TRANSLATE;
@@ -169,26 +169,10 @@ int hw_breakpoint_arch_parse(struct perf_event *bp,
hw->address = attr->bp_addr;
hw->len = attr->bp_len;
 
-   /*
-* Since breakpoint length can be a maximum of HW_BREAKPOINT_LEN(8)
-* and breakpoint addresses are aligned to nearest double-word
-* HW_BREAKPOINT_ALIGN by rounding off to the lower address, the
-* 'symbolsize' should satisfy the check below.
-*/
if (!ppc_breakpoint_available())
return -ENODEV;
-   length_max = 8; /* DABR */
-   if (dawr_enabled()) {
-   length_max = 512 ; /* 64 doublewords */
-   /* DAWR region can't cross 512 boundary */
-   if ((attr->bp_addr >> 9) !=
-   ((attr->bp_addr + attr->bp_len - 1) >> 9))
-   return -EINVAL;
-   }
-   if (hw->len >
-   (length_max - (hw->address & HW_BREAKPOINT_ALIGN)))
-   return -EINVAL;
-   return 0;
+
+   

[v2 12/12] powerpc/64s: Save r13 in machine_check_common_early

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

Testing my memcpy_mcsafe() work in progress with an injected UE, I get
an error like this immediately after the function returns:

BUG: Unable to handle kernel data access at 0x7fff84dec8f8
Faulting instruction address: 0xc008009c00b0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in: mce(O+) vmx_crypto crc32c_vpmsum
CPU: 0 PID: 1375 Comm: modprobe Tainted: G   O  5.1.0-rc6 #267
NIP:  c008009c00b0 LR: c008009c00a8 CTR: c0095f90
REGS: c000ee197790 TRAP: 0300   Tainted: G   O   (5.1.0-rc6)
MSR:  9280b033   CR: 88002826  XER: 
0004
CFAR: c0095f8c DAR: 7fff84dec8f8 DSISR: 4000 IRQMASK: 0
GPR00: 6c6c6568 c000ee197a20 c008009c8400 fff2
GPR04: c008009c02e0 0006  c3c834c8
GPR08: 0080 776a6681b7fb5100  c008009c01c8
GPR12: c0095f90 7fff84debc00 4d071440 
GPR16: 00010601 c008009e c0c98dd8 c0c98d98
GPR20: c3bba970 c008009c04d0 c008009c0618 c01e5820
GPR24:  0100 0001 c3bba958
GPR28: c008009c02e8 c008009c0318 c008009c02e0 
NIP [c008009c00b0] cause_ue+0xa8/0xe8 [mce]
LR [c008009c00a8] cause_ue+0xa0/0xe8 [mce]

To fix, ensure that r13 is properly restored after an MCE.

This commit is needed for testing this series; the underlying issue may
be a simulator bug.
---
 arch/powerpc/kernel/exceptions-64s.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 311f1392a2ec..932d8d05892c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -265,6 +265,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 EXC_REAL_END(machine_check, 0x200, 0x100)
 EXC_VIRT_NONE(0x4200, 0x100)
 TRAMP_REAL_BEGIN(machine_check_common_early)
+   SET_SCRATCH0(r13)   /* save r13 */
EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
/*
 * Register contents:
-- 
2.20.1



[v2 11/12] powerpc: add machine check safe copy_to_user

2019-07-01 Thread Santosh Sivaraj
Use the memcpy_mcsafe() implementation to define copy_to_user_mcsafe().

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/uaccess.h | 12 
 2 files changed, 13 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8c1c636308c8..a173b392c272 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -134,6 +134,7 @@ config PPC
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!RELOCATABLE && !HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE  if PPC64
+   select ARCH_HAS_UACCESS_MCSAFE  if PPC64
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 76f34346b642..f8fcaab4c5bc 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -386,6 +386,18 @@ static inline unsigned long raw_copy_to_user(void __user 
*to,
return ret;
 }
 
+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+   if (likely(check_copy_size(from, n, true))) {
+   allow_write_to_user(to, n);
+   n = memcpy_mcsafe(to, from, n);
+   prevent_write_to_user(to, n);
+   }
+
+   return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
-- 
2.20.1



[v2 10/12] powerpc/memcpy_mcsafe: return remaining bytes

2019-07-01 Thread Santosh Sivaraj
memcpy_mcsafe() currently returns -EFAULT on a machine check exception.
Change it to return the number of remaining bytes that need to be copied,
so that the machine-check-safe copy_to_user() can maintain the same
behavior as copy_to_user().
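
A hypothetical caller, only to illustrate the new return convention
(0 on success, otherwise the number of bytes left uncopied when the
machine check hit); this is not code from the series:

  static int copy_block_mcsafe(void *dst, const void *src, size_t len)
  {
          unsigned long rem = memcpy_mcsafe(dst, src, len);  /* bytes not copied */

          if (rem)
                  return -EFAULT;   /* len - rem bytes made it across */
          return 0;
  }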

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/lib/memcpy_mcsafe_64.S | 129 +++-
 1 file changed, 70 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
index 50f865db0338..566c664aa640 100644
--- a/arch/powerpc/lib/memcpy_mcsafe_64.S
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -30,11 +30,12 @@
ld  r14,STK_REG(R14)(r1)
addir1,r1,STACKFRAMESIZE
 .Ldo_err1:
-   li  r3,-EFAULT
+   mr  r3,r7
blr
 
 
 _GLOBAL(memcpy_mcsafe)
+   mr  r7,r5
cmpldi  r5,16
blt .Lshort_copy
 
@@ -49,18 +50,21 @@ err1;   lbz r0,0(r4)
addir4,r4,1
 err1;  stb r0,0(r3)
addir3,r3,1
+   subir7,r7,1
 
 1: bf  cr7*4+2,2f
 err1;  lhz r0,0(r4)
addir4,r4,2
 err1;  sth r0,0(r3)
addir3,r3,2
+   subir7,r7,2
 
 2: bf  cr7*4+1,3f
 err1;  lwz r0,0(r4)
addir4,r4,4
 err1;  stw r0,0(r3)
addir3,r3,4
+   subir7,r7,4
 
 3: sub r5,r5,r6
cmpldi  r5,128
@@ -87,43 +91,69 @@ err1;   stw r0,0(r3)
 4:
 err2;  ld  r0,0(r4)
 err2;  ld  r6,8(r4)
-err2;  ld  r7,16(r4)
-err2;  ld  r8,24(r4)
-err2;  ld  r9,32(r4)
-err2;  ld  r10,40(r4)
-err2;  ld  r11,48(r4)
-err2;  ld  r12,56(r4)
-err2;  ld  r14,64(r4)
-err2;  ld  r15,72(r4)
-err2;  ld  r16,80(r4)
-err2;  ld  r17,88(r4)
-err2;  ld  r18,96(r4)
-err2;  ld  r19,104(r4)
-err2;  ld  r20,112(r4)
-err2;  ld  r21,120(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+err2;  ld  r15,64(r4)
+err2;  ld  r16,72(r4)
+err2;  ld  r17,80(r4)
+err2;  ld  r18,88(r4)
+err2;  ld  r19,96(r4)
+err2;  ld  r20,104(r4)
+err2;  ld  r21,112(r4)
+err2;  ld  r22,120(r4)
addir4,r4,128
 err2;  std r0,0(r3)
 err2;  std r6,8(r3)
-err2;  std r7,16(r3)
-err2;  std r8,24(r3)
-err2;  std r9,32(r3)
-err2;  std r10,40(r3)
-err2;  std r11,48(r3)
-err2;  std r12,56(r3)
-err2;  std r14,64(r3)
-err2;  std r15,72(r3)
-err2;  std r16,80(r3)
-err2;  std r17,88(r3)
-err2;  std r18,96(r3)
-err2;  std r19,104(r3)
-err2;  std r20,112(r3)
-err2;  std r21,120(r3)
+err2;  std r8,16(r3)
+err2;  std r9,24(r3)
+err2;  std r10,32(r3)
+err2;  std r11,40(r3)
+err2;  std r12,48(r3)
+err2;  std r14,56(r3)
+err2;  std r15,64(r3)
+err2;  std r16,72(r3)
+err2;  std r17,80(r3)
+err2;  std r18,88(r3)
+err2;  std r19,96(r3)
+err2;  std r20,104(r3)
+err2;  std r21,112(r3)
+err2;  std r22,120(r3)
addir3,r3,128
+   subir7,r7,128
bdnz4b
 
clrldi  r5,r5,(64-7)
 
-   ld  r14,STK_REG(R14)(r1)
+   /* Up to 127B to go */
+5: srdir6,r5,4
+   mtocrf  0x01,r6
+
+6: bf  cr7*4+1,7f
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+   addir4,r4,64
+err2;  std r0,0(r3)
+err2;  std r6,8(r3)
+err2;  std r8,16(r3)
+err2;  std r9,24(r3)
+err2;  std r10,32(r3)
+err2;  std r11,40(r3)
+err2;  std r12,48(r3)
+err2;  std r14,56(r3)
+   addir3,r3,64
+   subir7,r7,64
+
+7: ld  r14,STK_REG(R14)(r1)
ld  r15,STK_REG(R15)(r1)
ld  r16,STK_REG(R16)(r1)
ld  r17,STK_REG(R17)(r1)
@@ -134,42 +164,19 @@ err2; std r21,120(r3)
ld  r22,STK_REG(R22)(r1)
addir1,r1,STACKFRAMESIZE
 
-   /* Up to 127B to go */
-5: srdir6,r5,4
-   mtocrf  0x01,r6
-
-6: bf  cr7*4+1,7f
-err1;  ld  r0,0(r4)
-err1;  ld  r6,8(r4)
-err1;  ld  r7,16(r4)
-err1;  ld  r8,24(r4)
-err1;  ld  r9,32(r4)
-err1;  ld  r10,40(r4)
-err1;  ld  r11,48(r4)
-err1;  ld  r12,56(r4)
-   addir4,r4,64
-err1;  std r0,0(r3)
-err1;  std r6,8(r3)
-err1;  std r7,16(r3)
-err1;  std r8,24(r3)
-err1;  std r9,32(r3)
-err1;  std r10,40(r3)
-err1;  std r11,48(r3)
-err1;  std r12,56(r3)
-   addir3,r3,64
-
/* Up to 63B to go */
-7: bf  cr7*4+2,8f
+   bf  cr7*4+2,8f
 err1;  ld  r0,0(r4)
 err1;  ld  r6,8(r4)
-err1;  ld  r7,16(r4)
-err1;  ld  r8,24(r4)
+err1;  ld  r8,16(r4)
+err1;  ld  r9,24(r4)
addir4,r4,32
 err1;  std r0,0(r3)
 err1;  std r6,8(r3)
-err1;  std r7,16(r3)

[v2 09/12] powerpc/mce: Enable MCE notifiers in external modules

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/exceptions-64s.S | 6 ++
 arch/powerpc/kernel/mce.c| 2 ++
 2 files changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index c83e38a403fd..311f1392a2ec 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -458,6 +458,12 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
bl  machine_check_early
std r3,RESULT(r1)   /* Save result */
 
+   /* Notifiers may be in a module, so enable virtual addressing. */
+   mfmsr   r11
+   ori r11,r11,MSR_IR
+   ori r11,r11,MSR_DR
+   mtmsr   r11
+
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_notify
ld  r11,RESULT(r1)
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index a8348a9bea5b..9e4d497837d8 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -50,11 +50,13 @@ int mce_register_notifier(struct notifier_block *nb)
 {
return blocking_notifier_chain_register(&mce_notifier_list, nb);
 }
+EXPORT_SYMBOL_GPL(mce_register_notifier);
 
 int mce_unregister_notifier(struct notifier_block *nb)
 {
return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
 }
+EXPORT_SYMBOL_GPL(mce_unregister_notifier);
 
 static int check_memcpy_mcsafe(struct notifier_block *nb, unsigned long val,
   void *data)
-- 
2.20.1



[v2 08/12] powerpc/mce: Handle memcpy_mcsafe()

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

Add an mce notifier intended to service memcpy_mcsafe().

The notifier uses this heuristic: if a UE occurs when accessing device
memory, and the faulting instruction has a fixup entry, the callback
will return NOTIFY_STOP.

This causes the notification mechanism to consider the MCE handled and
continue execution at the fixup address, which returns -EFAULT from the
memcpy_mcsafe() call.
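
For reference, a minimal module-side sketch of the notifier contract
this relies on (assumed example, not part of the patch;
mce_register_notifier() is exported for modules by a later patch in
the series):

  #include <linux/notifier.h>
  #include <asm/mce.h>

  static int my_ue_notifier(struct notifier_block *nb, unsigned long val, void *data)
  {
          struct machine_check_event *evt = data;

          if (evt->error_type == MCE_ERROR_TYPE_UE &&
              evt->u.ue_error.fixup_address_provided)
                  return NOTIFY_STOP;   /* handled: resume at the fixup address */

          return NOTIFY_DONE;           /* fall through to the normal MCE path */
  }

  static struct notifier_block my_ue_nb = { .notifier_call = my_ue_notifier };
  /* mce_register_notifier(&my_ue_nb) at init, mce_unregister_notifier(&my_ue_nb) at exit */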

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/mce.c | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 0233c0ee45ab..a8348a9bea5b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -56,6 +56,40 @@ int mce_unregister_notifier(struct notifier_block *nb)
return blocking_notifier_chain_unregister(_notifier_list, nb);
 }
 
+static int check_memcpy_mcsafe(struct notifier_block *nb, unsigned long val,
+  void *data)
+{
+   struct machine_check_event *evt = data;
+   unsigned long pfn;
+   struct page *page;
+
+   if (evt->error_type != MCE_ERROR_TYPE_UE ||
+   !evt->u.ue_error.physical_address_provided)
+   return NOTIFY_DONE;
+
+   pfn = evt->u.ue_error.physical_address >> PAGE_SHIFT;
+   page = pfn_to_page(pfn);
+   if (!page)
+   return NOTIFY_DONE;
+
+   /* HMM and PMEM */
+   if (is_zone_device_page(page) && evt->u.ue_error.fixup_address_provided)
+   return NOTIFY_STOP;
+
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block memcpy_mcsafe_nb = {
+   .notifier_call = check_memcpy_mcsafe
+};
+
+static int __init mce_mcsafe_register(void)
+{
+   mce_register_notifier(&memcpy_mcsafe_nb);
+   return 0;
+}
+arch_initcall(mce_mcsafe_register);
+
 static void mce_set_error_info(struct machine_check_event *mce,
   struct mce_error_info *mce_err)
 {
-- 
2.20.1



[v2 07/12] powerpc/memcpy: Add memcpy_mcsafe for pmem

2019-07-01 Thread Santosh Sivaraj
From: Balbir Singh 

The pmem infrastructure uses memcpy_mcsafe() in the pmem layer to
convert a machine check exception encountered during the memcpy into
a return value on failure.

This patch borrows heavily from the copyuser_power7 logic and does not
add the VMX optimizations, in order to keep the patch simple. If
needed, those optimizations can be folded in later.

Signed-off-by: Balbir Singh 
Acked-by: Nicholas Piggin 
[ar...@linux.ibm.com: Added symbol export]
---
 arch/powerpc/include/asm/string.h   |   2 +
 arch/powerpc/lib/Makefile   |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S | 215 
 3 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t 
n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE
 
+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index c55f9c27bf79..529d6536eb4a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
copypage_power7.o \
   memcpy_power7.o
 
 obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-  memcpy_64.o pmem.o
+  memcpy_64.o pmem.o memcpy_mcsafe_64.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index ..50f865db0338
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) IBM Corporation, 2011
+ * Derived from copyuser_power7.s by Anton Blanchard 
+ * Author - Balbir Singh 
+ */
+#include 
+#include 
+#include 
+
+   .macro err1
+100:
+   EX_TABLE(100b,.Ldo_err1)
+   .endm
+
+   .macro err2
+200:
+   EX_TABLE(200b,.Ldo_err2)
+   .endm
+
+.Ldo_err2:
+   ld  r22,STK_REG(R22)(r1)
+   ld  r21,STK_REG(R21)(r1)
+   ld  r20,STK_REG(R20)(r1)
+   ld  r19,STK_REG(R19)(r1)
+   ld  r18,STK_REG(R18)(r1)
+   ld  r17,STK_REG(R17)(r1)
+   ld  r16,STK_REG(R16)(r1)
+   ld  r15,STK_REG(R15)(r1)
+   ld  r14,STK_REG(R14)(r1)
+   addir1,r1,STACKFRAMESIZE
+.Ldo_err1:
+   li  r3,-EFAULT
+   blr
+
+
+_GLOBAL(memcpy_mcsafe)
+   cmpldi  r5,16
+   blt .Lshort_copy
+
+.Lcopy:
+   /* Get the source 8B aligned */
+   neg r6,r4
+   mtocrf  0x01,r6
+   clrldi  r6,r6,(64-3)
+
+   bf  cr7*4+3,1f
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+
+1: bf  cr7*4+2,2f
+err1;  lhz r0,0(r4)
+   addir4,r4,2
+err1;  sth r0,0(r3)
+   addir3,r3,2
+
+2: bf  cr7*4+1,3f
+err1;  lwz r0,0(r4)
+   addir4,r4,4
+err1;  stw r0,0(r3)
+   addir3,r3,4
+
+3: sub r5,r5,r6
+   cmpldi  r5,128
+   blt 5f
+
+   mflrr0
+   stdur1,-STACKFRAMESIZE(r1)
+   std r14,STK_REG(R14)(r1)
+   std r15,STK_REG(R15)(r1)
+   std r16,STK_REG(R16)(r1)
+   std r17,STK_REG(R17)(r1)
+   std r18,STK_REG(R18)(r1)
+   std r19,STK_REG(R19)(r1)
+   std r20,STK_REG(R20)(r1)
+   std r21,STK_REG(R21)(r1)
+   std r22,STK_REG(R22)(r1)
+   std r0,STACKFRAMESIZE+16(r1)
+
+   srdir6,r5,7
+   mtctr   r6
+
+   /* Now do cacheline (128B) sized loads and stores. */
+   .align  5
+4:
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r7,16(r4)
+err2;  ld  r8,24(r4)
+err2;  ld  r9,32(r4)
+err2;  ld  r10,40(r4)
+err2;  ld  r11,48(r4)
+err2;  ld  r12,56(r4)
+err2;  ld  r14,64(r4)
+err2;  ld  r15,72(r4)
+err2;  ld  r16,80(r4)
+err2;  ld  r17,88(r4)
+err2;  ld  r18,96(r4)
+err2;  ld  r19,104(r4)
+err2;  ld  r20,112(r4)
+err2;  ld  r21,120(r4)
+   addir4,r4,128
+err2;  std r0,0(r3)
+err2;  std r6,8(r3)
+err2;  std r7,16(r3)
+err2;  std r8,24(r3)
+err2;  std r9,32(r3)
+err2;  std r10,40(r3)
+err2;  std r11,48(r3)
+err2;  std r12,56(r3)
+err2;  std r14,64(r3)
+err2;  std r15,72(r3)
+err2;  std r16,80(r3)
+err2;  std 

[v2 06/12] powerpc/mce: Add fixup address to UE events

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

If the instruction causing a UE has an exception table entry with a fixup
address, save it in the machine_check_event struct.

If a machine check notifier callback returns NOTIFY_STOP to indicate it
has handled the error, set nip to continue execution from the fixup
address.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/mce.h |  5 +++--
 arch/powerpc/kernel/mce.c  | 16 +++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 240dd1fdfe35..9d9661747adf 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,11 +122,12 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
+   u8  fixup_address_provided;
u8  process_event;
-   u8  reserved_1[4];
+   u8  reserved_1[3];
u64 effective_address;
u64 physical_address;
-   u8  reserved_2[8];
+   u64 fixup_address;
} ue_error;
 
struct {
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 4a37928ab30e..0233c0ee45ab 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -15,10 +15,12 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
+#include 
 
 static DEFINE_PER_CPU(int, mce_nest_count);
 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
@@ -151,6 +153,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.link_error.effective_address_provided = true;
mce->u.link_error.effective_address = addr;
} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
+   const struct exception_table_entry *entry;
+
mce->u.ue_error.effective_address_provided = true;
mce->u.ue_error.effective_address = addr;
if (phys_addr != ULONG_MAX) {
@@ -158,6 +162,12 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.ue_error.physical_address = phys_addr;
}
 
+   entry = search_exception_tables(regs->nip);
+   if (entry) {
+   mce->u.ue_error.fixup_address_provided = true;
+   mce->u.ue_error.fixup_address = extable_fixup(entry);
+   }
+
mce->u.ue_error.process_event = true;
}
return;
@@ -666,8 +676,12 @@ long machine_check_notify(struct pt_regs *regs)
 
rc = blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
if (rc & NOTIFY_STOP_MASK) {
-   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   if (evt->error_type == MCE_ERROR_TYPE_UE) {
+   if (evt->u.ue_error.fixup_address_provided)
+   regs->nip = evt->u.ue_error.fixup_address;
+
evt->u.ue_error.process_event = false;
+   }
 
if ((rc & NOTIFY_STOP_MASK) && (regs->msr & MSR_RI))
evt->disposition = MCE_DISPOSITION_RECOVERED;
-- 
2.20.1



[v2 05/12] powerpc/mce: Allow notifier callback to handle MCE

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

If a notifier returns NOTIFY_STOP, consider the MCE handled, just as we
do when machine_check_early() returns 1.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/asm-prototypes.h |  2 +-
 arch/powerpc/include/asm/mce.h|  3 +-
 arch/powerpc/kernel/exceptions-64s.S  |  3 ++
 arch/powerpc/kernel/mce.c | 37 ++-
 4 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index f66f26ef3ce0..49ee8f08de2a 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -72,7 +72,7 @@ void machine_check_exception(struct pt_regs *regs);
 void emulation_assist_interrupt(struct pt_regs *regs);
 long do_slb_fault(struct pt_regs *regs, unsigned long ea);
 void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err);
-void machine_check_notify(struct pt_regs *regs);
+long machine_check_notify(struct pt_regs *regs);
 
 /* signals, syscalls and interrupts */
 long sys_swapcontext(struct ucontext __user *old_ctx,
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 948bef579086..240dd1fdfe35 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,7 +122,8 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
-   u8  reserved_1[5];
+   u8  process_event;
+   u8  reserved_1[4];
u64 effective_address;
u64 physical_address;
u8  reserved_2[8];
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 2e56014fca21..c83e38a403fd 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -460,6 +460,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_notify
+   ld  r11,RESULT(r1)
+   or  r3,r3,r11
+   std r3,RESULT(r1)
 
ld  r12,_MSR(r1)
 BEGIN_FTR_SECTION
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 0ab171b41ede..4a37928ab30e 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -157,6 +157,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
}
+
+   mce->u.ue_error.process_event = true;
}
return;
 }
@@ -241,6 +243,10 @@ void machine_check_queue_event(void)
if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;
 
+   if (evt.error_type == MCE_ERROR_TYPE_UE &&
+   !evt.u.ue_error.process_event)
+   return;
+
index = __this_cpu_inc_return(mce_queue_count) - 1;
/* If queue is full, just return for now. */
if (index >= MAX_MC_EVT) {
@@ -647,16 +653,31 @@ long hmi_exception_realmode(struct pt_regs *regs)
return 1;
 }
 
-void machine_check_notify(struct pt_regs *regs)
+long machine_check_notify(struct pt_regs *regs)
 {
-   struct machine_check_event evt;
+   int index = __this_cpu_read(mce_nest_count) - 1;
+   struct machine_check_event *evt;
+   int rc;
 
-   if (!get_mce_event(&evt, MCE_EVENT_DONTRELEASE))
-   return;
+   if (index < 0 || index >= MAX_MC_EVT)
+   return 0;
 
-   blocking_notifier_call_chain(&mce_notifier_list, 0, &evt);
+   evt = this_cpu_ptr(&mce_event[index]);
 
-   if (evt.error_type == MCE_ERROR_TYPE_UE &&
-   evt.u.ue_error.physical_address_provided)
-   machine_check_ue_event(&evt);
+   rc = blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
+   if (rc & NOTIFY_STOP_MASK) {
+   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   evt->u.ue_error.process_event = false;
+
+   if ((rc & NOTIFY_STOP_MASK) && (regs->msr & MSR_RI))
+   evt->disposition = MCE_DISPOSITION_RECOVERED;
+
+   return 1;
+   }
+
+   if (evt->error_type == MCE_ERROR_TYPE_UE &&
+   evt->u.ue_error.physical_address_provided)
+   machine_check_ue_event(evt);
+
+   return 0;
 }
-- 
2.20.1



[v2 01/12] powerpc/mce: Make machine_check_ue_event() static

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..e78c4f18ea0a 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,7 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -203,7 +203,7 @@ void release_mce_event(void)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
int index;
 
-- 
2.20.1



Re: [PATCH] mm/nvdimm: Add is_ioremap_addr and use that to check ioremap address

2019-07-01 Thread Aneesh Kumar K.V
Andrew Morton  writes:

> On Mon,  1 Jul 2019 19:10:38 +0530 "Aneesh Kumar K.V" 
>  wrote:
>
>> Architectures like powerpc use different address range to map ioremap
>> and vmalloc range. The memunmap() check used by the nvdimm layer was
>> wrongly using is_vmalloc_addr() to check for ioremap range which fails for
>> ppc64. This results in ppc64 not freeing the ioremap mapping. The side effect
>> of this is an unbind failure during module unload with papr_scm nvdimm driver
>
> The patch applies to 5.1.  Does it need a Fixes: and a Cc:stable?

Actually, we want it to be backported to an older kernel, possibly the
one that added the papr_scm driver, b5beae5e224f ("powerpc/pseries: Add
driver for PAPR SCM regions"). But that doesn't apply easily. It does
apply without conflicts to 5.0.

-aneesh



Re: Re: [PATCH 1/3] arm64: mm: Add p?d_large() definitions

2019-07-01 Thread Nicholas Piggin
Will Deacon's on July 1, 2019 8:15 pm:
> On Mon, Jul 01, 2019 at 11:03:51AM +0100, Steven Price wrote:
>> On 01/07/2019 10:27, Will Deacon wrote:
>> > On Sun, Jun 23, 2019 at 07:44:44PM +1000, Nicholas Piggin wrote:
>> >> walk_page_range() is going to be allowed to walk page tables other than
>> >> those of user space. For this it needs to know when it has reached a
>> >> 'leaf' entry in the page tables. This information will be provided by the
>> >> p?d_large() functions/macros.
>> > 
>> > I can't remember whether or not I asked this before, but why not call
>> > this macro p?d_leaf() if that's what it's identifying? "Large" and "huge"
>> > are usually synonymous, so I find this naming needlessly confusing based
>> > on this patch in isolation.

Those page table macro names are horrible. Large, huge, leaf, wtf?
They could do with a sensible renaming. But this series just follows
naming that's already there on x86.

Thanks,
Nick


Re: [PATCH v2 1/3] arm64: mm: Add p?d_large() definitions

2019-07-01 Thread Nicholas Piggin
Steven Price's on July 1, 2019 7:57 pm:
> On 01/07/2019 07:40, Nicholas Piggin wrote:
>> walk_page_range() is going to be allowed to walk page tables other than
>> those of user space. For this it needs to know when it has reached a
>> 'leaf' entry in the page tables. This information will be provided by the
>> p?d_large() functions/macros.
>> 
>> For arm64, we already have p?d_sect() macros which we can reuse for
>> p?d_large().
>> 
>> pud_sect() is defined as a dummy function when CONFIG_PGTABLE_LEVELS < 3
>> or CONFIG_ARM64_64K_PAGES is defined. However when the kernel is
>> configured this way then architecturally it isn't allowed to have a
>> large page that this level, and any code using these page walking macros
>> is implicitly relying on the page size/number of levels being the same as
>> the kernel. So it is safe to reuse this for p?d_large() as it is an
>> architectural restriction.
>> 
>> Cc: Catalin Marinas 
>> Cc: Will Deacon 
>> Signed-off-by: Steven Price 
> 
> Hi Nicholas,
> 
> This appears to be my patch, which I originally posted as part of converting
> x86/arm64 to use generic page walk code [1].

Hey, yeah it is, I'd intended to mark you as the author but must have
forgot to change it in git.

> I'm not sure that this
> patch makes much sense on its own, in particular it was working up to
> having a generic macro[2] which means the _large() macros could be used
> across all architectures.

It goes with this series which makes _large macros usable for archs
that define HUGE_VMAP. I posted the same thing earlier and Anshuman
noted you'd done it too so I deferred to yours (I thought it would
go via arm64 tree and that this would just allow Andrew to easily
reconcile the merge).

If your series is not going upstream this time then the changelog
probably doesn't make so much sense, so I could just send my version
to the arm64 tree.

Thanks,
Nick



Re: ["RFC PATCH" 1/2] powerpc/mm: Fix node look up with numa=off boot

2019-07-01 Thread Aneesh Kumar K.V

On 7/1/19 10:12 PM, Nathan Lynch wrote:

"Aneesh Kumar K.V"  writes:

I guess we should have here.

modified   arch/powerpc/mm/numa.c
@@ -416,12 +416,11 @@ static int of_get_assoc_arrays(struct assoc_arrays
*aa)
   static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
   {
struct assoc_arrays aa = { .arrays = NULL };
-   /* is that correct? */
int default_nid = 0;
int nid = default_nid;
int rc, index;

-   if (!numa_enabled)
+   if ((min_common_depth < 0) || !numa_enabled)
return NUMA_NO_NODE;

rc = of_get_assoc_arrays(&aa);


Nathan,

Can you check this?


Looks like it would do the right thing.

Just checking: do people still need numa=off? Seems like it's a
maintenance burden :-)



That is used in the kdump kernel.

-aneesh



Re: Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware?

2019-07-01 Thread Michael Ellerman
Christophe Leroy  writes:
> Le 01/07/2019 à 15:39, Turritopsis Dohrnii Teo En Ming a écrit :
>> Good evening from Singapore,
>
> Good evening afternoon from Paris,
>
>> 
>> Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware, 
>> for example, AMD Ryzen 9 3950X, with 16 CPU cores and 32 threads?
>
> Yes you can
>
>> 
>> Is it called cross-compiling?
>
> Yes it is, you can get cross compilers at 
> https://mirrors.edge.kernel.org/pub/tools/crosstool/

There's also some info here:

  https://github.com/linuxppc/wiki/wiki/Building-powerpc-kernels


cheers


Re: [PATCH] powerpc/rtas: retry when cpu offline races with suspend/migration

2019-07-01 Thread Michael Ellerman
Nathan Lynch  writes:
> Hi Michael,
>
> Nathan Lynch  writes:
>> The protocol for suspending or migrating an LPAR requires all present
>> processor threads to enter H_JOIN. So if we have threads offline, we
>> have to temporarily bring them up. This can race with administrator
>> actions such as SMT state changes. As of dfd718a2ed1f ("powerpc/rtas:
>> Fix a potential race between CPU-Offline & Migration"),
>
> snowpatch/checkpatch flagged an error in my commit message here:
>
>   ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit
>   <12+ chars of sha1> ("")' - ie: 'commit dfd718a2ed1f
>   ("powerpc/rtas: Fix a potential race between CPU-Offline &
>   Migration")'
>
> I see this is in your next-test branch though. Should I fix the commit
> message and resend?

No that's fine.

You have a Fixes tag which is formatted correctly and that's what
matters IMO.

Ideally we could control that check to not complain about it being split
across lines when there's a fixes tag as well.

cheers


Re: [PATCH] mm/nvdimm: Add is_ioremap_addr and use that to check ioremap address

2019-07-01 Thread Andrew Morton
On Mon,  1 Jul 2019 19:10:38 +0530 "Aneesh Kumar K.V" 
 wrote:

> Architectures like powerpc use different address range to map ioremap
> and vmalloc range. The memunmap() check used by the nvdimm layer was
> wrongly using is_vmalloc_addr() to check for ioremap range which fails for
> ppc64. This results in ppc64 not freeing the ioremap mapping. The side effect
> of this is an unbind failure during module unload with papr_scm nvdimm driver

The patch applies to 5.1.  Does it need a Fixes: and a Cc:stable?



Re: [PATCH] powerpc/rtas: retry when cpu offline races with suspend/migration

2019-07-01 Thread Nathan Lynch
Hi Michael,

Nathan Lynch  writes:
> The protocol for suspending or migrating an LPAR requires all present
> processor threads to enter H_JOIN. So if we have threads offline, we
> have to temporarily bring them up. This can race with administrator
> actions such as SMT state changes. As of dfd718a2ed1f ("powerpc/rtas:
> Fix a potential race between CPU-Offline & Migration"),

snowpatch/checkpatch flagged an error in my commit message here:

  ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit
  <12+ chars of sha1> ("")' - ie: 'commit dfd718a2ed1f
  ("powerpc/rtas: Fix a potential race between CPU-Offline &
  Migration")'

I see this is in your next-test branch though. Should I fix the commit
message and resend?



Re: [PATCH RFC] generic ELF support for kexec

2019-07-01 Thread Sven Schnelle
Hi Philipp,

On Mon, Jul 01, 2019 at 02:31:20PM +0200, Philipp Rudo wrote:
> Sven Schnelle  wrote:
> 
> > I'm attaching the patch to this Mail. What do you think about that change?
> > s390 also uses ELF files, and (maybe?) could also switch to this 
> > implementation.
> > But i don't know anything about S/390 and don't have one in my basement. So
> > i'll leave s390 to the IBM folks.
> 
> I'm afraid there isn't much code here s390 can reuse. I see multiple problems
> in kexec_elf_load:
> 
> * while loading the phdrs we also need to setup some data structures to pass
>   to the next kernel
> * the s390 kernel needs to be loaded to a fixed address
> * there is no support to load a crash kernel
> 
> Of course that could all be fixed/worked around by introducing some arch 
> hooks.
> But when you take into account that the whole elf loader on s390 is ~100 lines
> of code, I don't think it is worth it.

That's fine. I didn't really look into the S/390 Loader, and just wanted to let
the IBM people know.

> More comments below.
>  
> [...]
> 
> > diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> > index b9b1bc5f9669..49b23b425f84 100644
> > --- a/include/linux/kexec.h
> > +++ b/include/linux/kexec.h
> > @@ -216,6 +216,41 @@ extern int crash_prepare_elf64_headers(struct 
> > crash_mem *mem, int kernel_map,
> >void **addr, unsigned long *sz);
> >  #endif /* CONFIG_KEXEC_FILE */
> >  
> > +#ifdef CONFIG_KEXEC_FILE_ELF
> > +
> > +struct kexec_elf_info {
> > +   /*
> > +* Where the ELF binary contents are kept.
> > +* Memory managed by the user of the struct.
> > +*/
> > +   const char *buffer;
> > +
> > +   const struct elfhdr *ehdr;
> > +   const struct elf_phdr *proghdrs;
> > +   struct elf_shdr *sechdrs;
> > +};
> 
> Do I understand this right? elf_info->buffer contains the full ELF file and
> elf_info->ehdr is an (endianness-translated) copy of the file's ehdr?
> 
> If so ...
> 
> > +void kexec_free_elf_info(struct kexec_elf_info *elf_info);
> > +
> > +int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr,
> > + struct kexec_elf_info *elf_info);
> > +
> > +int kexec_elf_kernel_load(struct kimage *image, struct kexec_buf *kbuf,
> > + char *kernel_buf, unsigned long kernel_len,
> > + unsigned long *kernel_load_addr);
> > +
> > +int kexec_elf_probe(const char *buf, unsigned long len);
> > +
> > +int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
> > +struct kexec_elf_info *elf_info,
> > +struct kexec_buf *kbuf,
> > +unsigned long *lowest_load_addr);
> > +
> > +int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
> > +struct kexec_elf_info *elf_info,
> > +struct kexec_buf *kbuf,
> > +unsigned long *lowest_load_addr);
> 
> ... you could simplify the arguments by dropping the ehdr argument. The
> elf_info should contain all the information needed. Furthermore the kexec_buf
> also contains a pointer to its kimage. So you can drop that argument as well.
> 
> Another thing is that you kzalloc the memory needed for proghdrs and sechdrs
> but expect the user of those functions to provide the memory needed for ehdr.
> Wouldn't it be more consistent to also kzalloc the ehdr?
> 

Good point. I'll think about it. I would like to do that in an extra patch,
as it is not a small style change.

> 
> > diff --git a/kernel/kexec_file_elf.c b/kernel/kexec_file_elf.c
> > new file mode 100644
> > index ..bb966c93492c
> > --- /dev/null
> > +++ b/kernel/kexec_file_elf.c
> > @@ -0,0 +1,574 @@
> 
> [...]
> 
> > +static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value)
> > +{
> > +   if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
> > +   value = le64_to_cpu(value);
> > +   else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
> > +   value = be64_to_cpu(value);
> > +
> > +   return value;
> > +}
> > +
> > +static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value)
> > +{
> > +   if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
> > +   value = le16_to_cpu(value);
> > +   else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
> > +   value = be16_to_cpu(value);
> > +
> > +   return value;
> > +}
> > +
> > +static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value)
> > +{
> > +   if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
> > +   value = le32_to_cpu(value);
> > +   else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
> > +   value = be32_to_cpu(value);
> > +
> > +   return value;
> > +}
> 
> What are the elf*_to_cpu good for? In general I'd assume that kexec loads a
> kernel for the same architecture it is running on. So the new kernel should
> also have the same endianness as the one that loads it. Is this a
> ppcle/ppcbe issue?

Don't know. I would agree, but I'm not a powerpc expert.

> 

Re: [PATCH RFC] generic ELF support for kexec

2019-07-01 Thread Sven Schnelle
Hi Michael,

On Fri, Jun 28, 2019 at 12:04:11PM +1000, Michael Ellerman wrote:
> Sven Schnelle  writes:
>   https://github.com/linuxppc/wiki/wiki/Booting-with-Qemu
> 
> But I'm not sure where you get a version of kexec that uses kexec_file().

kexec-tools HEAD supports it, so that's not a problem.

> > If that change is acceptable i would finish the patch and submit it. I think
> > best would be to push this change through Helge's parisc tree, so we don't
> > have any dependencies to sort out.
> 
> That will work for you but could cause us problems if we have any
> changes that touch that code.
> 
> It's easy enough to create a topic branch with just that patch that both
> of us merge.

What should be the base branch for that patch? Christophe suggested the
powerpc/merge branch?

> >  #include 
> >  #include 
> >  #include 
> > @@ -31,540 +29,6 @@
> >  #include 
> >  #include 
> >  
> > -#define PURGATORY_STACK_SIZE   (16 * 1024)
> 
> This is unused AFAICS. We should probably remove it explicitly rather
> than as part of this patch.

I have one patch right now. If wanted, I can split up all the changes
suggested during the review into smaller pieces, whatever you prefer.

> Or that.
> 
> > +#include 
> > +#include 
> > +
> > +#define elf_addr_to_cpuelf64_to_cpu
> 
> Why are we doing that rather than just using elf64_to_cpu directly?
> 
> > +#ifndef Elf_Rel
> > +#define Elf_RelElf64_Rel
> > +#endif /* Elf_Rel */
> 
> And that?

Don't know - ask the PPC people :-)

Regards
Sven


Re: ["RFC PATCH" 1/2] powerpc/mm: Fix node look up with numa=off boot

2019-07-01 Thread Nathan Lynch
"Aneesh Kumar K.V"  writes:
> I guess we should have here.
>
> modified   arch/powerpc/mm/numa.c
> @@ -416,12 +416,11 @@ static int of_get_assoc_arrays(struct assoc_arrays 
> *aa)
>   static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>   {
>   struct assoc_arrays aa = { .arrays = NULL };
> - /* is that correct? */
>   int default_nid = 0;
>   int nid = default_nid;
>   int rc, index;
>
> - if (!numa_enabled)
> + if ((min_common_depth < 0) || !numa_enabled)
>   return NUMA_NO_NODE;
>
>   rc = of_get_assoc_arrays(&aa);
>
>
> Nathan,
>
> Can you check this?

Looks like it would do the right thing.

Just checking: do people still need numa=off? Seems like it's a
maintenance burden :-)


Re: Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware?

2019-07-01 Thread Christophe Leroy




Le 01/07/2019 à 15:39, Turritopsis Dohrnii Teo En Ming a écrit :

Good evening from Singapore,


Good evening afternoon from Paris,



Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware, for 
example, AMD Ryzen 9 3950X, with 16 CPU cores and 32 threads?


Yes you can



Is it called cross-compiling?



Yes it is, you can get cross compilers at 
https://mirrors.edge.kernel.org/pub/tools/crosstool/


Thanks
Christophe



Thank you.




Re: remove asm-generic/ptrace.h v3

2019-07-01 Thread Arnd Bergmann
On Mon, Jun 24, 2019 at 9:23 AM Thomas Gleixner  wrote:
>
> On Mon, 24 Jun 2019, Christoph Hellwig wrote:
> >
> > asm-generic/ptrace.h is a little weird in that it doesn't actually
> > implement any functionality, but it provided multiple layers of macros
> > that just implement trivial inline functions.  We implement those
> > directly in the few architectures and be off with a much simpler
> > design.
> >
> > I'm not sure which tree is the right place, but may this can go through
> > the asm-generic tree since it removes an asm-generic header?
>
> Makes sense.

Applied and pushed to asm-generic.git/master now, sorry for the delay.

 Arnd


Re: [PATCH v12 01/11] MODSIGN: Export module signature definitions

2019-07-01 Thread Jessica Yu

+++ Thiago Jung Bauermann [27/06/19 23:19 -0300]:

IMA will use the module_signature format for append signatures, so export
the relevant definitions and factor out the code which verifies that the
appended signature trailer is valid.

Also, create a CONFIG_MODULE_SIG_FORMAT option so that IMA can select it
and be able to use mod_check_sig() without having to depend on either
CONFIG_MODULE_SIG or CONFIG_MODULES.
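
For context, a rough sketch of how an appended signature is located,
under the assumption of the usual layout (payload, then signature data,
then struct module_signature, then MODULE_SIG_STRING at the very end);
see the patch's kernel/module_signature.c for the real checks:

  #include <linux/module_signature.h>
  #include <linux/string.h>
  #include <linux/errno.h>

  static int find_appended_sig(const void *buf, size_t len)
  {
          const size_t markerlen = sizeof(MODULE_SIG_STRING) - 1;
          const struct module_signature *ms;

          if (len < markerlen + sizeof(*ms))
                  return -ENODATA;
          if (memcmp(buf + len - markerlen, MODULE_SIG_STRING, markerlen))
                  return -ENODATA;              /* no appended signature */

          ms = buf + len - markerlen - sizeof(*ms);
          if (be32_to_cpu(ms->sig_len) > len - markerlen - sizeof(*ms))
                  return -EBADMSG;              /* signature length is bogus */

          return 0;   /* signature data precedes *ms, sig_len bytes long */
  }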

Signed-off-by: Thiago Jung Bauermann 
Reviewed-by: Mimi Zohar 
Cc: Jessica Yu 
---
include/linux/module.h   |  3 --
include/linux/module_signature.h | 44 +
init/Kconfig |  6 +++-
kernel/Makefile  |  1 +
kernel/module.c  |  1 +
kernel/module_signature.c| 46 ++
kernel/module_signing.c  | 56 +---
scripts/Makefile |  2 +-
8 files changed, 106 insertions(+), 53 deletions(-)

diff --git a/include/linux/module.h b/include/linux/module.h
index 188998d3dca9..aa56f531cf1e 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -25,9 +25,6 @@
#include 
#include 

-/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */
-#define MODULE_SIG_STRING "~Module signature appended~\n"
-


Hi Thiago, apologies for the delay.

It looks like arch/s390/kernel/machine_kexec_file.c also relies on
MODULE_SIG_STRING being defined, so module_signature.h will need to be
included there too, otherwise we'll run into a compilation error.

Other than that, the module-related changes look good to me:

Acked-by: Jessica Yu 

Thanks!

Jessica


/* Not Yet Implemented */
#define MODULE_SUPPORTED_DEVICE(name)

diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h
new file mode 100644
index ..523617fc5b6a
--- /dev/null
+++ b/include/linux/module_signature.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Module signature handling.
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowe...@redhat.com)
+ */
+
+#ifndef _LINUX_MODULE_SIGNATURE_H
+#define _LINUX_MODULE_SIGNATURE_H
+
+/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */
+#define MODULE_SIG_STRING "~Module signature appended~\n"
+
+enum pkey_id_type {
+   PKEY_ID_PGP,/* OpenPGP generated key ID */
+   PKEY_ID_X509,   /* X.509 arbitrary subjectKeyIdentifier */
+   PKEY_ID_PKCS7,  /* Signature in PKCS#7 message */
+};
+
+/*
+ * Module signature information block.
+ *
+ * The constituents of the signature section are, in order:
+ *
+ * - Signer's name
+ * - Key identifier
+ * - Signature data
+ * - Information block
+ */
+struct module_signature {
+   u8  algo;   /* Public-key crypto algorithm [0] */
+   u8  hash;   /* Digest algorithm [0] */
+   u8  id_type;/* Key identifier type [PKEY_ID_PKCS7] */
+   u8  signer_len; /* Length of signer's name [0] */
+   u8  key_id_len; /* Length of key identifier [0] */
+   u8  __pad[3];
+   __be32  sig_len;/* Length of signature data */
+};
+
+int mod_check_sig(const struct module_signature *ms, size_t file_len,
+ const char *name);
+
+#endif /* _LINUX_MODULE_SIGNATURE_H */
diff --git a/init/Kconfig b/init/Kconfig
index 8b9ffe236e4f..c2286a3c74c5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1852,6 +1852,10 @@ config BASE_SMALL
default 0 if BASE_FULL
default 1 if !BASE_FULL

+config MODULE_SIG_FORMAT
+   def_bool n
+   select SYSTEM_DATA_VERIFICATION
+
menuconfig MODULES
bool "Enable loadable module support"
option modules
@@ -1929,7 +1933,7 @@ config MODULE_SRCVERSION_ALL
config MODULE_SIG
bool "Module signature verification"
depends on MODULES
-   select SYSTEM_DATA_VERIFICATION
+   select MODULE_SIG_FORMAT
help
  Check modules for valid signatures upon load: the signature
  is simply appended to the module. For more information see
diff --git a/kernel/Makefile b/kernel/Makefile
index 33824f0385b3..f29ae2997a43 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,6 +58,7 @@ endif
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
+obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_CRASH_CORE) += crash_core.o
diff --git a/kernel/module.c b/kernel/module.c
index 6e6712b3aaf5..2712f4d217f5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -19,6 +19,7 @@
#include 
#include 
#include 
+#include 
#include 
#include 
#include 
diff --git a/kernel/module_signature.c b/kernel/module_signature.c
new file mode 100644
index ..4224a1086b7d
--- /dev/null
+++ b/kernel/module_signature.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: 

Re: [PATCH v12 00/11] Appended signatures support for IMA appraisal

2019-07-01 Thread Mimi Zohar
On Thu, 2019-06-27 at 23:19 -0300, Thiago Jung Bauermann wrote:
> Hello,
> 
> This version is essentially identical to the last one.
> 
> It is only a rebase on top of today's linux-integrity/next-queued-testing,
> prompted by conflicts with Prakhar Srivastava's patches to measure the
> kernel command line. It also drops two patches that are already present in
> that branch.

Thanks, Thiago.  These patches are now in next-queued-testing waiting
for some additional reviews/acks.

Mimi



[PATCH] powerpc: Remove unused variable declaration

2019-07-01 Thread Aneesh Kumar K.V
__kernel_virt_size is not used anymore.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 7dede2e34b70..a3eab10f5a27 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -275,7 +275,6 @@ extern unsigned long __vmalloc_end;
 #define VMALLOC_END__vmalloc_end
 
 extern unsigned long __kernel_virt_start;
-extern unsigned long __kernel_virt_size;
 extern unsigned long __kernel_io_start;
 extern unsigned long __kernel_io_end;
 #define KERN_VIRT_START __kernel_virt_start
-- 
2.21.0



[PATCH v2 3/3] powerpc/mm: Consolidate numa_enable check and min_common_depth check

2019-07-01 Thread Aneesh Kumar K.V
If we fail to parse min_common_depth from the device tree, we boot with
NUMA disabled. Reflect that by updating the numa_enabled variable
to false. Also, switch all min_common_depth failure checks to
if (!numa_enabled) checks.

This helps us to avoid checking for both in different code paths.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 27f792c0df68..848b4663c7ad 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -212,7 +212,7 @@ static int associativity_to_nid(const __be32 *associativity)
 {
int nid = NUMA_NO_NODE;
 
-   if (min_common_depth == -1 || !numa_enabled)
+   if (!numa_enabled)
goto out;
 
if (of_read_number(associativity, 1) >= min_common_depth)
@@ -628,8 +628,14 @@ static int __init parse_numa_properties(void)
 
min_common_depth = find_min_common_depth();
 
-   if (min_common_depth < 0)
+   if (min_common_depth < 0) {
+   /*
+* If we fail to parse min_common_depth from the device tree,
+* mark NUMA as disabled and boot with NUMA disabled.
+*/
+   numa_enabled = false;
return min_common_depth;
+   }
 
dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
 
@@ -745,7 +751,7 @@ void __init dump_numa_cpu_topology(void)
unsigned int node;
unsigned int cpu, count;
 
-   if (min_common_depth == -1 || !numa_enabled)
+   if (!numa_enabled)
return;
 
for_each_online_node(node) {
@@ -810,7 +816,7 @@ static void __init find_possible_nodes(void)
struct device_node *rtas;
u32 numnodes, i;
 
-   if (min_common_depth <= 0 || !numa_enabled)
+   if (!numa_enabled)
return;
 
rtas = of_find_node_by_path("/rtas");
@@ -1012,7 +1018,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
struct device_node *memory = NULL;
int nid;
 
-   if (!numa_enabled || (min_common_depth < 0))
+   if (!numa_enabled)
return first_online_node;
 
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-- 
2.21.0



[PATCH v2 2/3] powerpc/mm: Fix node look up with numa=off boot

2019-07-01 Thread Aneesh Kumar K.V
If we boot with numa=off, we need to make sure we return NUMA_NO_NODE when
looking up the associativity details of resources. Without this, we hit a crash
like the one below:

BUG: Unable to handle kernel data access at 0x408
Faulting instruction address: 0xc8f31704
cpu 0x1b: Vector: 380 (Data SLB Access) at [cb9bb320]
pc: c8f31704: _raw_spin_lock+0x14/0x100
lr: c83f41fc: cache_alloc_node+0x5c/0x290
sp: cb9bb5b0
   msr: 80010280b033
   dar: 408
  current = 0xcb9a2700
  paca= 0xca740c00   irqmask: 0x03   irq_happened: 0x01
pid   = 1, comm = swapper/27
Linux version 5.2.0-rc4-00925-g74e188c620b1 (root@linux-d8ip) (gcc version 
7.4.1 20190424 [gcc-7-branch revision 270538] (SUSE Linux)) #34 SMP Sat Jun 29 
00:41:02 EDT 2019
enter ? for help
[link register   ] c83f41fc cache_alloc_node+0x5c/0x290
[cb9bb5b0] 0dc0 (unreliable)
[cb9bb5f0] c83f48c8 kmem_cache_alloc_node_trace+0x138/0x360
[cb9bb670] c8aa789c devres_alloc_node+0x4c/0xa0
[cb9bb6a0] c8337218 devm_memremap+0x58/0x130
[cb9bb6f0] c8aed00c devm_nsio_enable+0xdc/0x170
[cb9bb780] c8af3b6c nd_pmem_probe+0x4c/0x180
[cb9bb7b0] c8ad84cc nvdimm_bus_probe+0xac/0x260
[cb9bb840] c8aa0628 really_probe+0x148/0x500
[cb9bb8d0] c8aa0d7c driver_probe_device+0x19c/0x1d0
[cb9bb950] c8aa11bc device_driver_attach+0xcc/0x100
[cb9bb990] c8aa12ec __driver_attach+0xfc/0x1e0
[cb9bba10] c8a9d0a4 bus_for_each_dev+0xb4/0x130
[cb9bba70] c8a9fc04 driver_attach+0x34/0x50
[cb9bba90] c8a9f118 bus_add_driver+0x1d8/0x300
[cb9bbb20] c8aa2358 driver_register+0x98/0x1a0
[cb9bbb90] c8ad7e6c __nd_driver_register+0x5c/0x100
[cb9bbbf0] c93efbac nd_pmem_driver_init+0x34/0x48
[cb9bbc10] c80106c0 do_one_initcall+0x60/0x2d0
[cb9bbce0] c938463c kernel_init_freeable+0x384/0x48c
[cb9bbdb0] c8010a5c kernel_init+0x2c/0x160
[cb9bbe20] c800ba54 ret_from_kernel_thread+0x5c/0x68

Reported-and-debugged-by: Vaibhav Jain 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index aee718509085..27f792c0df68 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -212,7 +212,7 @@ static int associativity_to_nid(const __be32 *associativity)
 {
int nid = NUMA_NO_NODE;
 
-   if (min_common_depth == -1)
+   if (min_common_depth == -1 || !numa_enabled)
goto out;
 
if (of_read_number(associativity, 1) >= min_common_depth)
@@ -420,7 +420,7 @@ static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
int nid = default_nid;
int rc, index;
 
-   if (min_common_depth < 0)
+   if ((min_common_depth < 0) || !numa_enabled)
return default_nid;
 
rc = of_get_assoc_arrays(&aa);
@@ -810,7 +810,7 @@ static void __init find_possible_nodes(void)
struct device_node *rtas;
u32 numnodes, i;
 
-   if (min_common_depth <= 0)
+   if (min_common_depth <= 0 || !numa_enabled)
return;
 
rtas = of_find_node_by_path("/rtas");
-- 
2.21.0



[PATCH v2 1/3] powerpc/mm/drconf: Use NUMA_NO_NODE on failures instead of node 0

2019-07-01 Thread Aneesh Kumar K.V
If we fail to parse the associativity array, we should default to
NUMA_NO_NODE instead of node 0. The rest of the code falls back to the
right default if it finds the NUMA node value NUMA_NO_NODE.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/numa.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 917904d2fe97..aee718509085 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -416,17 +416,19 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
 static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
 {
struct assoc_arrays aa = { .arrays = NULL };
-   int default_nid = 0;
+   int default_nid = NUMA_NO_NODE;
int nid = default_nid;
int rc, index;
 
+   if (min_common_depth < 0)
+   return default_nid;
+
rc = of_get_assoc_arrays(&aa);
if (rc)
return default_nid;
 
-   if (min_common_depth > 0 && min_common_depth <= aa.array_sz &&
-   !(lmb->flags & DRCONF_MEM_AI_INVALID) &&
-   lmb->aa_index < aa.n_arrays) {
+   if (min_common_depth <= aa.array_sz &&
+   !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
aa.n_arrays) {
index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
nid = of_read_number([index], 1);
 
-- 
2.21.0



[PATCH v2 2/2] powerpc/mm/radix: Use the right page size for vmemmap mapping

2019-07-01 Thread Aneesh Kumar K.V
We use mmu_vmemmap_psize to find the page size for mapping the vmemmap area.
With radix translation, we are suboptimally setting this value to PAGE_SIZE.

We do check for 2M page size support and update mmu_vmemmap_psize to use
the hugepage size, but we then reset the value to PAGE_SIZE in
radix__early_init_mmu(). This resulted in always mapping the vmemmap area
with the 64K page size.

Fixes: 2bfd65e45e87 ("powerpc/mm/radix: Add radix callbacks for early init 
routines")
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 273ae66a9a45..8deb432c2975 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -515,14 +515,6 @@ void __init radix__early_init_devtree(void)
mmu_psize_defs[MMU_PAGE_64K].shift = 16;
mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
 found:
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-   if (mmu_psize_defs[MMU_PAGE_2M].shift) {
-   /*
-* map vmemmap using 2M if available
-*/
-   mmu_vmemmap_psize = MMU_PAGE_2M;
-   }
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
return;
 }
 
@@ -587,7 +579,13 @@ void __init radix__early_init_mmu(void)
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
/* vmemmap mapping */
-   mmu_vmemmap_psize = mmu_virtual_psize;
+   if (mmu_psize_defs[MMU_PAGE_2M].shift) {
+   /*
+* map vmemmap using 2M if available
+*/
+   mmu_vmemmap_psize = MMU_PAGE_2M;
+   } else
+   mmu_vmemmap_psize = mmu_virtual_psize;
 #endif
/*
 * initialize page table size
-- 
2.21.0



[PATCH v2 1/2] powerpc/mm/hash/4k: Don't use 64K page size for vmemmap with 4K pagesize

2019-07-01 Thread Aneesh Kumar K.V
With hash translation and 4K PAGE_SIZE config, we need to make sure we don't
use 64K page size for vmemmap.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 28ced26f2a00..a8cc40d2a9ed 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -684,10 +684,8 @@ static void __init htab_init_page_sizes(void)
if (mmu_psize_defs[MMU_PAGE_16M].shift &&
memblock_phys_mem_size() >= 0x4000)
mmu_vmemmap_psize = MMU_PAGE_16M;
-   else if (mmu_psize_defs[MMU_PAGE_64K].shift)
-   mmu_vmemmap_psize = MMU_PAGE_64K;
else
-   mmu_vmemmap_psize = MMU_PAGE_4K;
+   mmu_vmemmap_psize = mmu_virtual_psize;
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
printk(KERN_DEBUG "Page orders: linear mapping = %d, "
-- 
2.21.0



[PATCH v2] powerpc/mm/nvdimm: Add an informative message if we fail to allocate altmap block

2019-07-01 Thread Aneesh Kumar K.V
Allocation from the altmap area can fail depending on the vmemmap page size
used. Add a kernel info message to indicate the failure. That allows the user
to identify whether they are really using the persistent memory reserved
space for per-page metadata.

The message looks like:
[  136.587212] altmap block allocation failed, falling back to system memory

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/init_64.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a4e17a979e45..f3b64f49082b 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -194,8 +194,12 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node,
 * fail due to alignment issues when using 16MB hugepages, so
 * fall back to system memory if the altmap allocation fail.
 */
-   if (altmap)
+   if (altmap) {
p = altmap_alloc_block_buf(page_size, altmap);
+   if (!p)
+   pr_debug("altmap block allocation failed, " \
+   "falling back to system memory");
+   }
if (!p)
p = vmemmap_alloc_block_buf(page_size, node);
if (!p)
-- 
2.21.0



Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted

2019-07-01 Thread Michael S. Tsirkin
On Thu, Jun 27, 2019 at 10:58:40PM -0300, Thiago Jung Bauermann wrote:
> 
> Michael S. Tsirkin  writes:
> 
> > On Mon, Jun 03, 2019 at 10:13:59PM -0300, Thiago Jung Bauermann wrote:
> >>
> >>
> >> Michael S. Tsirkin  writes:
> >>
> >> > On Wed, Apr 17, 2019 at 06:42:00PM -0300, Thiago Jung Bauermann wrote:
> >> >> I rephrased it in terms of address translation. What do you think of
> >> >> this version? The flag name is slightly different too:
> >> >>
> >> >>
> >> >> VIRTIO_F_ACCESS_PLATFORM_NO_TRANSLATION This feature has the same
> >> >> meaning as VIRTIO_F_ACCESS_PLATFORM both when set and when not set,
> >> >> with the exception that address translation is guaranteed to be
> >> >> unnecessary when accessing memory addresses supplied to the device
> >> >> by the driver. Which is to say, the device will always use physical
> >> >> addresses matching addresses used by the driver (typically meaning
> >> >> physical addresses used by the CPU) and not translated further. This
> >> >> flag should be set by the guest if offered, but to allow for
> >> >> backward-compatibility device implementations allow for it to be
> >> >> left unset by the guest. It is an error to set both this flag and
> >> >> VIRTIO_F_ACCESS_PLATFORM.
> >> >
> >> >
> >> >
> >> >
> >> > OK so VIRTIO_F_ACCESS_PLATFORM is designed to allow unprivileged
> >> > drivers. This is why devices fail when it's not negotiated.
> >>
> >> Just to clarify, what do you mean by unprivileged drivers? Is it drivers
> >> implemented in guest userspace such as with VFIO? Or unprivileged in
> >> some other sense such as needing to use bounce buffers for some reason?
> >
> > I had drivers in guest userspace in mind.
> 
> Great. Thanks for clarifying.
> 
> I don't think this flag would work for guest userspace drivers. Should I
> add a note about that in the flag definition?

I think you need to clarify access protection rules. Is it only
translation that is bypassed or is any platform-specific
protection mechanism bypassed too?

> >> > This confuses me.
> >> > If the driver is unprivileged then what happens with this flag?
> >> > It can supply any address it wants. Will that corrupt kernel
> >> > memory?
> >>
> >> Not needing address translation doesn't necessarily mean that there's no
> >> IOMMU. On powerpc we don't use VIRTIO_F_ACCESS_PLATFORM but there's
> >> always an IOMMU present. And we also support VFIO drivers. The VFIO API
> >> for pseries (sPAPR section in Documentation/vfio.txt) has extra ioctls
> >> to program the IOMMU.
> >>
> >> For our use case, we don't need address translation because we set up an
> >> identity mapping in the IOMMU so that the device can use guest physical
> >> addresses.

OK so I think I am beginning to see it in a different light. Right now the
specific platform creates an identity mapping. That in turn means the DMA API
can be fast - it does not need to do anything. What you are looking for is a
way to tell the host it's an identity mapping - just as an optimization.

Is that right?  So this is what I would call this option:

VIRTIO_F_ACCESS_PLATFORM_IDENTITY_ADDRESS

and the explanation should state that all device
addresses are translated by the platform to identical
addresses.

In fact this option then becomes more, not less restrictive
than VIRTIO_F_ACCESS_PLATFORM - it's a promise
by guest to only create identity mappings,
and only before driver_ok is set.
This option then would always be negotiated together with
VIRTIO_F_ACCESS_PLATFORM.

Host then must verify that:
1. full 1:1 mappings are created before driver_ok -
   or can we make sure this happens before features_ok?
   that would be ideal as we could require that features_ok fails
2. mappings are not modified between driver_ok and reset -
   I guess attempts to change them will fail,
   possibly by causing a guest crash
   or some other kind of platform-specific error

So far so good, but now a question:

how are we handling guest address width limitations?
Is VIRTIO_F_ACCESS_PLATFORM_IDENTITY_ADDRESS subject to
guest address width limitations?
I am guessing we can make them so ...
This needs to be documented.
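
As a concrete illustration, the device-side check at feature-negotiation time
could be as simple as the sketch below. ACCESS_PLATFORM is bit 33 in the virtio
spec; the IDENTITY_ADDRESS bit number is a placeholder that has not been
allocated, and the helper itself is purely illustrative:

#include <stdbool.h>
#include <stdint.h>

#define VIRTIO_F_ACCESS_PLATFORM			33
/* Hypothetical bit number for the proposed flag, not allocated in the spec. */
#define VIRTIO_F_ACCESS_PLATFORM_IDENTITY_ADDRESS	38

/* Device/host side: reject FEATURES_OK if the guest asks for identity
 * mappings without also accepting ACCESS_PLATFORM. */
static bool features_ok(uint64_t guest_features)
{
	bool platform = guest_features & (1ULL << VIRTIO_F_ACCESS_PLATFORM);
	bool identity = guest_features &
			(1ULL << VIRTIO_F_ACCESS_PLATFORM_IDENTITY_ADDRESS);

	if (identity && !platform)
		return false;	/* must be negotiated together */
	return true;
}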




> >
> > And can it access any guest physical address?
> 
> Sorry, I was mistaken. We do support VFIO in guests but not for virtio
> devices, only for regular PCI devices. In which case they will use
> address translation.

Not sure how this answers the question.


> >> If the guest kernel is concerned that an unprivileged driver could
> >> jeopardize its integrity it should not negotiate this feature flag.
> >
> > Unfortunately flag negotiation is done through config space
> > and so can be overwritten by the driver.
> 
> Ok, so the guest kernel has to forbid VFIO access on devices where this
> flag is advertised.

That's possible in theory but in practice we did not yet teach VFIO not
to attach to legacy devices without VIRTIO_F_ACCESS_PLATFORM.  So all
security relies on host denying driver_ok without

Re: [PATCH v3 3/9] powerpc: Introduce FW_FEATURE_ULTRAVISOR

2019-07-01 Thread Claudio Carvalho


On 6/15/19 4:36 AM, Paul Mackerras wrote:
> On Thu, Jun 06, 2019 at 02:36:08PM -0300, Claudio Carvalho wrote:
>> This feature tells if the ultravisor firmware is available to handle
>> ucalls.
> Everything in this patch that depends on CONFIG_PPC_UV should just
> depend on CONFIG_PPC_POWERNV instead.  The reason is that every host
> kernel needs to be able to do the ultracall to set partition table
> entry 0, in case it ends up being run on a machine with an ultravisor.
> Otherwise we will have the situation where a host kernel may crash
> early in boot just because the machine it's booted on happens to have
> an ultravisor running.  The crash will be a particularly nasty one
> because it will happen before we have probed the machine type and
> initialized the console; therefore it will just look like the machine
> hangs for no discernable reason.

> We also need to think about how to provide a way for petitboot to know
> whether the kernel it is booting knows how to do a ucall to set its
> partition table entry.  One suggestion would be to modify
> vmlinux.lds.S to add a new PT_NOTE entry in the program header of the
> binary with (say) a 64-bit doubleword which is a bitmap indicating
> capabilities of the binary.  We would define the first bit as
> indicating that the kernel knows how to run under an ultravisor.
> When running under an ultravisor, petitboot could then look for the
> PT_NOTE and the ultravisor-capable bit in it, and if the PT_NOTE is
> not there or the bit is zero, put up a dialog warning the user that
> the kernel will probably crash early in boot, and asking for explicit
> confirmation that the user wants to proceed.


I just posted a separate RFC patch for the ELF note.

Thanks, Claudio.


>
> Paul.
>


[RFC PATCH] powerpc: Add the ppc_capabilities ELF note

2019-07-01 Thread Claudio Carvalho
Add the ppc_capabilities ELF note to the powerpc kernel binary. It is a
bitmap that can be used to advertise kernel capabilities to userland.

This patch also defines PPCCAP_ULTRAVISOR_BIT as bit zero.

Suggested-by: Paul Mackerras 
Signed-off-by: Claudio Carvalho 
---
 arch/powerpc/kernel/Makefile |  2 +-
 arch/powerpc/kernel/note.S   | 36 
 2 files changed, 37 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/note.S

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 0ea6c4aa3a20..4ec36fe4325b 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -49,7 +49,7 @@ obj-y := cputable.o ptrace.o 
syscalls.o \
   signal.o sysfs.o cacheinfo.o time.o \
   prom.o traps.o setup-common.o \
   udbg.o misc.o io.o misc_$(BITS).o \
-  of_platform.o prom_parse.o
+  of_platform.o prom_parse.o note.o
 obj-$(CONFIG_PPC64)+= setup_64.o sys_ppc32.o \
   signal_64.o ptrace32.o \
   paca.o nvram_64.o firmware.o
diff --git a/arch/powerpc/kernel/note.S b/arch/powerpc/kernel/note.S
new file mode 100644
index ..721bf8ce9eb7
--- /dev/null
+++ b/arch/powerpc/kernel/note.S
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PowerPC ELF notes.
+ *
+ * Copyright 2019, IBM Corporation
+ */
+#include <linux/elfnote.h>
+
+/*
+ * Ultravisor-capable bit (PowerNV only).
+ *
+ * Indicate that the powerpc kernel binary knows how to run in an
+ * ultravisor-enabled system.
+ *
+ * In an ultravisor-enabled system, some machine resources are now controlled
+ * by the ultravisor. If the kernel is not ultravisor-capable, but it ends up
+ * being run on a machine with ultravisor, the kernel will probably crash
+ * trying to access ultravisor resources. For instance, it may crash in early
+ * boot trying to set the partition table entry 0.
+ *
+ * In an ultravisor-enabled system, petitboot can warn the user or prevent the
+ * kernel from being run if the ppc_capabilities doesn't exist or the
+ * Ultravisor-capable bit is not set.
+ */
+#if defined(CONFIG_PPC_POWERNV)
+#define PPCCAP_ULTRAVISOR_BIT  (1 << 0)
+#else
+#define PPCCAP_ULTRAVISOR_BIT  0
+#endif
+
+/*
+ * Add the ppc_capabilities ELF note to the powerpc kernel binary. It is a
+ * bitmap that can be used to advertise kernel capabilities to userland.
+ */
+ELFNOTE(ppc_capabilities, 3,
+   .long PPCCAP_ULTRAVISOR_BIT)
-- 
2.20.1
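
On the consumer side (petitboot or any other userspace loader), reading the
note back is ordinary ELF note parsing: walk the PT_NOTE program headers and
match the note name. A rough sketch, with error handling and file I/O omitted
and the helper name made up; the note type could additionally be checked:

#include <elf.h>
#include <string.h>

/* Returns the 32-bit capability bitmap from the "ppc_capabilities" note,
 * or 0 if the note is absent.  'ehdr' points at a fully mapped vmlinux. */
static unsigned int read_ppc_capabilities(const Elf64_Ehdr *ehdr)
{
	const Elf64_Phdr *phdr = (const void *)ehdr + ehdr->e_phoff;
	int i;

	for (i = 0; i < ehdr->e_phnum; i++) {
		const void *p, *end;

		if (phdr[i].p_type != PT_NOTE)
			continue;

		p = (const void *)ehdr + phdr[i].p_offset;
		end = p + phdr[i].p_filesz;

		while (p + sizeof(Elf64_Nhdr) < end) {
			const Elf64_Nhdr *nhdr = p;
			const char *name = p + sizeof(*nhdr);
			/* note name and descriptor are 4-byte aligned */
			const void *desc = name + ((nhdr->n_namesz + 3) & ~3);

			if (nhdr->n_namesz == sizeof("ppc_capabilities") &&
			    !strcmp(name, "ppc_capabilities") &&
			    nhdr->n_descsz >= sizeof(unsigned int))
				return *(const unsigned int *)desc;

			p = desc + ((nhdr->n_descsz + 3) & ~3);
		}
	}
	return 0;
}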



[PATCH] mm/nvdimm: Add is_ioremap_addr and use that to check ioremap address

2019-07-01 Thread Aneesh Kumar K.V
Architectures like powerpc use a different address range to map the ioremap
and vmalloc ranges. The memunmap() check used by the nvdimm layer was
wrongly using is_vmalloc_addr() to check for the ioremap range, which fails on
ppc64. This results in ppc64 not freeing the ioremap mapping. The side effect
of this is an unbind failure during module unload with the papr_scm nvdimm
driver.
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/pgtable.h | 14 ++
 include/linux/mm.h |  5 +
 kernel/iomem.c |  2 +-
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 3f53be60fb01..64145751b2fd 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -140,6 +140,20 @@ static inline void pte_frag_set(mm_context_t *ctx, void *p)
 }
 #endif
 
+#ifdef CONFIG_PPC64
+#define is_ioremap_addr is_ioremap_addr
+static inline bool is_ioremap_addr(const void *x)
+{
+#ifdef CONFIG_MMU
+   unsigned long addr = (unsigned long)x;
+
+   return addr >= IOREMAP_BASE && addr < IOREMAP_END;
+#else
+   return false;
+#endif
+}
+#endif /* CONFIG_PPC64 */
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 973ebf71f7b6..65b2eb6c9f0a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -633,6 +633,11 @@ static inline bool is_vmalloc_addr(const void *x)
return false;
 #endif
 }
+
+#ifndef is_ioremap_addr
+#define is_ioremap_addr(x) is_vmalloc_addr(x)
+#endif
+
 #ifdef CONFIG_MMU
 extern int is_vmalloc_or_module_addr(const void *x);
 #else
diff --git a/kernel/iomem.c b/kernel/iomem.c
index 93c26510..62c92e43aa0d 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap);
 
 void memunmap(void *addr)
 {
-   if (is_vmalloc_addr(addr))
+   if (is_ioremap_addr(addr))
iounmap((void __iomem *) addr);
 }
 EXPORT_SYMBOL(memunmap);
-- 
2.21.0



Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware?

2019-07-01 Thread Turritopsis Dohrnii Teo En Ming
Good evening from Singapore,

Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware, for 
example, AMD Ryzen 9 3950X, with 16 CPU cores and 32 threads?

Is it called cross-compiling?

Thank you.

-BEGIN EMAIL SIGNATURE-

The Gospel for all Targeted Individuals (TIs):

[The New York Times] Microwave Weapons Are Prime Suspect in Ills of
U.S. Embassy Workers

Link: 
https://www.nytimes.com/2018/09/01/science/sonic-attack-cuba-microwave.html



Singaporean Mr. Turritopsis Dohrnii Teo En Ming's Academic
Qualifications as at 14 Feb 2019

[1] https://tdtemcerts.wordpress.com/

[2] https://tdtemcerts.blogspot.sg/

[3] https://www.scribd.com/user/270125049/Teo-En-Ming

-END EMAIL SIGNATURE-



Re: [PATCH RFC] generic ELF support for kexec

2019-07-01 Thread Philipp Rudo
Hi Sven,

On Tue, 25 Jun 2019 20:54:34 +0200
Sven Schnelle  wrote:

> Hi List,
> 
> I recently started working on kexec for PA-RISC. While doing so, I figured
> that powerpc already has support for reading ELF images inside the kernel.
> My first attempt was to steal the source code and modify it for PA-RISC, but
> it turned out that I didn't have to change much. Only ARM-specific stuff like
> fdt blob fetching had to be removed.
> 
> So instead of duplicating the code, i thought about moving the ELF stuff to
> the core kexec code, and exposing several function to use that code from the
> arch specific code.

That's always the right approach. Well done.

> I'm attaching the patch to this mail. What do you think about that change?
> s390 also uses ELF files, and (maybe?) could also switch to this
> implementation.
> But I don't know anything about S/390 and don't have one in my basement. So
> I'll leave s390 to the IBM folks.

I'm afraid there isn't much code here s390 can reuse. I see multiple problems
in kexec_elf_load:

* while loading the phdrs we also need to setup some data structures to pass
  to the next kernel
* the s390 kernel needs to be loaded to a fixed address
* there is no support to load a crash kernel

Of course that could all be fixed/worked around by introducing some arch hooks.
But when you take into account that the whole elf loader on s390 is ~100 lines
of code, I don't think it is worth it.

More comments below.
 
[...]

> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index b9b1bc5f9669..49b23b425f84 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -216,6 +216,41 @@ extern int crash_prepare_elf64_headers(struct crash_mem 
> *mem, int kernel_map,
>  void **addr, unsigned long *sz);
>  #endif /* CONFIG_KEXEC_FILE */
>  
> +#ifdef CONFIG_KEXEC_FILE_ELF
> +
> +struct kexec_elf_info {
> + /*
> +  * Where the ELF binary contents are kept.
> +  * Memory managed by the user of the struct.
> +  */
> + const char *buffer;
> +
> + const struct elfhdr *ehdr;
> + const struct elf_phdr *proghdrs;
> + struct elf_shdr *sechdrs;
> +};

Do I understand this right? elf_info->buffer contains the full ELF file and
elf_info->ehdr is an (endianness-translated) copy of the file's ehdr?

If so ...

> +void kexec_free_elf_info(struct kexec_elf_info *elf_info);
> +
> +int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr,
> +   struct kexec_elf_info *elf_info);
> +
> +int kexec_elf_kernel_load(struct kimage *image, struct kexec_buf *kbuf,
> +   char *kernel_buf, unsigned long kernel_len,
> +   unsigned long *kernel_load_addr);
> +
> +int kexec_elf_probe(const char *buf, unsigned long len);
> +
> +int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
> +  struct kexec_elf_info *elf_info,
> +  struct kexec_buf *kbuf,
> +  unsigned long *lowest_load_addr);
> +

... you could simplify the arguments by dropping the ehdr argument. The
elf_info should contain all the information needed. Furthermore the kexec_buf
also contains a pointer to its kimage. So you can drop that argument as well.

Another thing is that you kzalloc the memory needed for proghdrs and sechdrs
but expect the user of those functions to provide the memory needed for ehdr.
Wouldn't it be more consistent to also kzalloc the ehdr?
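
Something along these lines, purely as a sketch of the suggested interface:

/* elf_info carries the translated ehdr, and kbuf already points at its
 * kimage, so neither needs to be passed separately. */
int kexec_build_elf_info(const char *buf, size_t len,
			 struct kexec_elf_info *elf_info);

int kexec_elf_load(struct kexec_buf *kbuf,
		   struct kexec_elf_info *elf_info,
		   unsigned long *lowest_load_addr);

void kexec_free_elf_info(struct kexec_elf_info *elf_info);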

[...]

> diff --git a/kernel/kexec_file_elf.c b/kernel/kexec_file_elf.c
> new file mode 100644
> index ..bb966c93492c
> --- /dev/null
> +++ b/kernel/kexec_file_elf.c
> @@ -0,0 +1,574 @@

[...]

> +static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value)
> +{
> + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
> + value = le64_to_cpu(value);
> + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
> + value = be64_to_cpu(value);
> +
> + return value;
> +}
> +
> +static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value)
> +{
> + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
> + value = le16_to_cpu(value);
> + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
> + value = be16_to_cpu(value);
> +
> + return value;
> +}
> +
> +static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value)
> +{
> + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
> + value = le32_to_cpu(value);
> + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
> + value = be32_to_cpu(value);
> +
> + return value;
> +}

What are the elf*_to_cpu helpers good for? In general I'd assume that kexec loads a
kernel for the same architecture it is running on. So 

Re: [PATCH v3 06/11] mm/memory_hotplug: Allow arch_remove_pages() without CONFIG_MEMORY_HOTREMOVE

2019-07-01 Thread Michal Hocko
On Mon 01-07-19 10:01:41, Michal Hocko wrote:
> On Mon 27-05-19 13:11:47, David Hildenbrand wrote:
> > We want to improve error handling while adding memory by allowing
> > to use arch_remove_memory() and __remove_pages() even if
> > CONFIG_MEMORY_HOTREMOVE is not set to e.g., implement something like:
> > 
> > arch_add_memory()
> > rc = do_something();
> > if (rc) {
> > arch_remove_memory();
> > }
> > 
> > We won't get rid of CONFIG_MEMORY_HOTREMOVE for now, as it will require
> > quite some dependencies for memory offlining.
> 
> If we cannot really remove CONFIG_MEMORY_HOTREMOVE altogether then why
> not simply add an empty placeholder for arch_remove_memory when the
> config is disabled?

In other words, can we replace this by something as simple as:

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ae892eef8b82..0329027fe740 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -128,6 +128,20 @@ extern void arch_remove_memory(int nid, u64 start, u64 
size,
   struct vmem_altmap *altmap);
 extern void __remove_pages(struct zone *zone, unsigned long start_pfn,
   unsigned long nr_pages, struct vmem_altmap *altmap);
+#else
+/*
+ * Allow code using
+ * arch_add_memory();
+ * rc = do_something();
+ * if (rc)
+ * arch_remove_memory();
+ *
+ * without ifdefery.
+ */
+static inline void arch_remove_memory(int nid, u64 start, u64 size,
+  struct vmem_altmap *altmap)
+{
+}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /*
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 04/11] arm64/mm: Add temporary arch_remove_memory() implementation

2019-07-01 Thread Michal Hocko
On Mon 27-05-19 13:11:45, David Hildenbrand wrote:
> A proper arch_remove_memory() implementation is on its way, which also
> cleanly removes page tables in arch_add_memory() in case something goes
> wrong.
> 
> As we want to use arch_remove_memory() in case something goes wrong
> during memory hotplug after arch_add_memory() finished, let's add
> a temporary hack that is sufficient until we get a proper
> implementation that cleans up page table entries.
> 
> We will remove CONFIG_MEMORY_HOTREMOVE around this code in follow up
> patches.

I would drop this one as well (like s390 counterpart).
 
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Cc: Mark Rutland 
> Cc: Andrew Morton 
> Cc: Ard Biesheuvel 
> Cc: Chintan Pandya 
> Cc: Mike Rapoport 
> Cc: Jun Yao 
> Cc: Yu Zhao 
> Cc: Robin Murphy 
> Cc: Anshuman Khandual 
> Signed-off-by: David Hildenbrand 
> ---
>  arch/arm64/mm/mmu.c | 19 +++
>  1 file changed, 19 insertions(+)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index a1bfc4413982..e569a543c384 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -1084,4 +1084,23 @@ int arch_add_memory(int nid, u64 start, u64 size,
>   return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>  restrictions);
>  }
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +void arch_remove_memory(int nid, u64 start, u64 size,
> + struct vmem_altmap *altmap)
> +{
> + unsigned long start_pfn = start >> PAGE_SHIFT;
> + unsigned long nr_pages = size >> PAGE_SHIFT;
> + struct zone *zone;
> +
> + /*
> +  * FIXME: Cleanup page tables (also in arch_add_memory() in case
> +  * adding fails). Until then, this function should only be used
> +  * during memory hotplug (adding memory), not for memory
> +  * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be
> +  * unlocked yet.
> +  */
> + zone = page_zone(pfn_to_page(start_pfn));
> + __remove_pages(zone, start_pfn, nr_pages, altmap);
> +}
> +#endif
>  #endif
> -- 
> 2.20.1

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 03/11] s390x/mm: Implement arch_remove_memory()

2019-07-01 Thread Michal Hocko
On Mon 01-07-19 09:45:03, Michal Hocko wrote:
> On Mon 27-05-19 13:11:44, David Hildenbrand wrote:
> > Will come in handy when wanting to handle errors after
> > arch_add_memory().
> 
> I do not understand this. Why do you add code for something that is
> not possible on this HW (based on the comment - is it still valid btw?)

Same as the previous patch (drop it).

> > Cc: Martin Schwidefsky 
> > Cc: Heiko Carstens 
> > Cc: Andrew Morton 
> > Cc: Michal Hocko 
> > Cc: Mike Rapoport 
> > Cc: David Hildenbrand 
> > Cc: Vasily Gorbik 
> > Cc: Oscar Salvador 
> > Signed-off-by: David Hildenbrand 
> > ---
> >  arch/s390/mm/init.c | 13 +++--
> >  1 file changed, 7 insertions(+), 6 deletions(-)
> > 
> > diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> > index d552e330fbcc..14955e0a9fcf 100644
> > --- a/arch/s390/mm/init.c
> > +++ b/arch/s390/mm/init.c
> > @@ -243,12 +243,13 @@ int arch_add_memory(int nid, u64 start, u64 size,
> >  void arch_remove_memory(int nid, u64 start, u64 size,
> > struct vmem_altmap *altmap)
> >  {
> > -   /*
> > -* There is no hardware or firmware interface which could trigger a
> > -* hot memory remove on s390. So there is nothing that needs to be
> > -* implemented.
> > -*/
> > -   BUG();
> > +   unsigned long start_pfn = start >> PAGE_SHIFT;
> > +   unsigned long nr_pages = size >> PAGE_SHIFT;
> > +   struct zone *zone;
> > +
> > +   zone = page_zone(pfn_to_page(start_pfn));
> > +   __remove_pages(zone, start_pfn, nr_pages, altmap);
> > +   vmem_remove_mapping(start, size);
> >  }
> >  #endif
> >  #endif /* CONFIG_MEMORY_HOTPLUG */
> > -- 
> > 2.20.1
> > 
> 
> -- 
> Michal Hocko
> SUSE Labs

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 02/11] s390x/mm: Fail when an altmap is used for arch_add_memory()

2019-07-01 Thread Michal Hocko
On Mon 01-07-19 09:43:06, Michal Hocko wrote:
> On Mon 27-05-19 13:11:43, David Hildenbrand wrote:
> > ZONE_DEVICE is not yet supported, fail if an altmap is passed, so we
> > don't forget arch_add_memory()/arch_remove_memory() when unlocking
> > support.
> 
> Why do we need this? Sure, ZONE_DEVICE is not supported for s390, and the
> same might be the case for other arches which support hotplug. I do not see
> much point in adding a warning to each of them.

I would drop this one. If there is a strong reason to have something
like that it should come with a better explanation and it can be done on
top.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH] powerpc/mm/nvdimm: Add an informative message if we fail to allocate altmap block

2019-07-01 Thread Aneesh Kumar K.V
"Oliver O'Halloran"  writes:

> On Sat, Jun 29, 2019 at 5:39 PM Aneesh Kumar K.V
>  wrote:
>>
>> Allocation from altmap area can fail based on vmemmap page size used. Add 
>> kernel
>> info message to indicate the failure. That allows the user to identify 
>> whether they
>> are really using persistent memory reserved space for per-page metadata.
>>
>> The message looks like:
>> [  136.587212] altmap block allocation failed, falling back to system memory
>>
>> Signed-off-by: Aneesh Kumar K.V 
>> ---
>>  arch/powerpc/mm/init_64.c | 6 +-
>>  1 file changed, 5 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
>> index a4e17a979e45..57c0573650dc 100644
>> --- a/arch/powerpc/mm/init_64.c
>> +++ b/arch/powerpc/mm/init_64.c
>> @@ -194,8 +194,12 @@ int __meminit vmemmap_populate(unsigned long start, 
>> unsigned long end, int node,
>>  * fail due to alignment issues when using 16MB hugepages, so
>>  * fall back to system memory if the altmap allocation fail.
>>  */
>> -   if (altmap)
>> +   if (altmap) {
>> p = altmap_alloc_block_buf(page_size, altmap);
>> +   if (!p)
>
>> +   pr_info("altmap block allocation failed, " \
>> +   "falling back to system memory");
>
> I think this is kind of misleading. If you're mapping a large amount
> of memory you can have most of the vmemmap backing allocated from the
> altmap and one extra block allocated from normal memory. E.g. If you
> have 32MB of altmap space, one 16MB block will be allocated from the
> altmap, but the 2nd 16MB block is probably unusable due to the
> reserved pages at the start of the altmap. Maybe this should be a
> pr_debug() so it's only printed along with the "vmemmap_populate ..."
> message above?

Will switch to pr_debug. What I really wanted was an indication of which
pfn device failed to allocate per-page metadata in the device. But
we really don't have the device details here, and we don't end up calling
this function if there is already a 16MB mapping in DRAM for this area.

>
> Also, isn't kernel style to keep printf()s, even long ones, on one line?

I was not sure. It does print to the kernel log in one line.

-aneesh



Re: Re: [PATCH 1/3] arm64: mm: Add p?d_large() definitions

2019-07-01 Thread Will Deacon
On Mon, Jul 01, 2019 at 11:03:51AM +0100, Steven Price wrote:
> On 01/07/2019 10:27, Will Deacon wrote:
> > On Sun, Jun 23, 2019 at 07:44:44PM +1000, Nicholas Piggin wrote:
> >> walk_page_range() is going to be allowed to walk page tables other than
> >> those of user space. For this it needs to know when it has reached a
> >> 'leaf' entry in the page tables. This information will be provided by the
> >> p?d_large() functions/macros.
> > 
> > I can't remember whether or not I asked this before, but why not call
> > this macro p?d_leaf() if that's what it's identifying? "Large" and "huge"
> > are usually synonymous, so I find this naming needlessly confusing based
> > on this patch in isolation.
> 
> You replied to my posting of this patch before[1], to which you said:
> 
> > I've have thought p?d_leaf() might match better with your description
> > above, but I'm not going to quibble on naming.

That explains the sense of deja vu.

> Have you changed your mind about quibbling? ;)

Ha, I suppose I have! If it's not loads of effort to use p?d_leaf() instead
of p?d_large, then I'd certainly prefer that.

Will


Re: [PATCH 1/3] arm64: mm: Add p?d_large() definitions

2019-07-01 Thread Will Deacon
Hi Nick,

On Sun, Jun 23, 2019 at 07:44:44PM +1000, Nicholas Piggin wrote:
> walk_page_range() is going to be allowed to walk page tables other than
> those of user space. For this it needs to know when it has reached a
> 'leaf' entry in the page tables. This information will be provided by the
> p?d_large() functions/macros.

I can't remember whether or not I asked this before, but why not call
this macro p?d_leaf() if that's what it's identifying? "Large" and "huge"
are usually synonymous, so I find this naming needlessly confusing based
on this patch in isolation.

Will


Re: [PATCH v3 10/11] mm/memory_hotplug: Make unregister_memory_block_under_nodes() never fail

2019-07-01 Thread Michal Hocko
On Mon 01-07-19 11:36:44, Oscar Salvador wrote:
> On Mon, Jul 01, 2019 at 10:51:44AM +0200, Michal Hocko wrote:
> > Yeah, we do not allow offlining multi-zone (node) ranges, so the current
> > code seems to be over-engineered.
> > 
> > Anyway, I am wondering why we have to strictly check for already
> > removed node links. Is the sysfs code going to complain when we try to
> > remove them again?
> 
> No, sysfs will silently "fail" if the symlink has already been removed.
> At least that is what I saw last time I played with it.
> 
> I guess the question is what happens if sysfs handling changes in the future
> and starts emitting warnings when trying to remove a symlink that is not there.
> Maybe that is unlikely to happen?

And maybe we handle it then rather than have a static allocation that
everybody with hotremove configured has to pay for.
-- 
Michal Hocko
SUSE Labs


Re: Re: [PATCH 1/3] arm64: mm: Add p?d_large() definitions

2019-07-01 Thread Steven Price
On 01/07/2019 10:27, Will Deacon wrote:
> Hi Nick,
> 
> On Sun, Jun 23, 2019 at 07:44:44PM +1000, Nicholas Piggin wrote:
>> walk_page_range() is going to be allowed to walk page tables other than
>> those of user space. For this it needs to know when it has reached a
>> 'leaf' entry in the page tables. This information will be provided by the
>> p?d_large() functions/macros.
> 
> I can't remember whether or not I asked this before, but why not call
> this macro p?d_leaf() if that's what it's identifying? "Large" and "huge"
> are usually synonymous, so I find this naming needlessly confusing based
> on this patch in isolation.

Hi Will,

You replied to my posting of this patch before[1], to which you said:

> I've have thought p?d_leaf() might match better with your description
> above, but I'm not going to quibble on naming.

Have you changed your mind about quibbling? ;)

Steve

[1]
https://lore.kernel.org/lkml/20190611153650.gb4...@fuggles.cambridge.arm.com/


Re: [PATCH v2 3/3] mm/vmalloc: fix vmalloc_to_page for huge vmap mappings

2019-07-01 Thread Anshuman Khandual


On 07/01/2019 12:10 PM, Nicholas Piggin wrote:
> vmalloc_to_page returns NULL for addresses mapped by larger pages[*].
> Whether or not a vmap is huge depends on the architecture details,
> alignments, boot options, etc., which the caller can not be expected
> to know. Therefore HUGE_VMAP is a regression for vmalloc_to_page.
> 
> This change teaches vmalloc_to_page about larger pages, and returns
> the struct page that corresponds to the offset within the large page.
> This makes the API agnostic to mapping implementation details.
> 
> [*] As explained by commit 029c54b095995 ("mm/vmalloc.c: huge-vmap:
> fail gracefully on unexpected huge vmap mappings")
> 
> Signed-off-by: Nicholas Piggin 

Reviewed-by: Anshuman Khandual 
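
Since the diff itself is not quoted here, the core of the change at the pmd
level is roughly the following sketch (using the p?d_large() helpers from
patch 1; the exact code in the patch may differ):

/* Inside vmalloc_to_page(), once the walk reaches a pmd that maps a
 * large page, return the struct page for the offset within it instead
 * of descending to a (non-existent) pte level. */
if (pmd_large(*pmd))
	return pfn_to_page(pmd_pfn(*pmd) +
			   ((addr & ~PMD_MASK) >> PAGE_SHIFT));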


Re: [PATCH v2 1/3] arm64: mm: Add p?d_large() definitions

2019-07-01 Thread Steven Price
On 01/07/2019 07:40, Nicholas Piggin wrote:
> walk_page_range() is going to be allowed to walk page tables other than
> those of user space. For this it needs to know when it has reached a
> 'leaf' entry in the page tables. This information will be provided by the
> p?d_large() functions/macros.
> 
> For arm64, we already have p?d_sect() macros which we can reuse for
> p?d_large().
> 
> pud_sect() is defined as a dummy function when CONFIG_PGTABLE_LEVELS < 3
> or CONFIG_ARM64_64K_PAGES is defined. However when the kernel is
> configured this way then architecturally it isn't allowed to have a
> large page at this level, and any code using these page walking macros
> is implicitly relying on the page size/number of levels being the same as
> the kernel. So it is safe to reuse this for p?d_large() as it is an
> architectural restriction.
> 
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Signed-off-by: Steven Price 

Hi Nicholas,

This appears to be my patch, which I originally posted as part of converting
x86/arm64 to use generic page walk code[1]. I'm not sure that this
patch makes much sense on its own; in particular, it was working up to
having a generic macro[2] which means the _large() macros could be used
across all architectures.

Also, as a matter of courtesy, please can you ensure the authorship
information is preserved when posting other people's patches (there
should be a From: line with my name on it). You should also include your
own Signed-off-by: line (see submitting-patches[3]) after mine.

Apologies to anyone who has been following my patch series; I've been
on holiday so not actively working on it. My aim is to combine the
series with a generic ptdump implementation[4], which should improve the
diff state. I should be able to post that in the next few weeks.

Thanks,

Steve

[1] Series: https://patchwork.kernel.org/cover/10883885/
Last posting of this patch:
https://patchwork.kernel.org/patch/10883899/

[2] https://patchwork.kernel.org/patch/10883965/

[3]
https://www.kernel.org/doc/html/latest/process/submitting-patches.html#developer-s-certificate-of-origin-1-1

[4] RFC version here:
https://lore.kernel.org/lkml/20190417143423.26665-1-steven.pr...@arm.com/

> ---
>  arch/arm64/include/asm/pgtable.h | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h 
> b/arch/arm64/include/asm/pgtable.h
> index fca26759081a..0e973201bc16 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -417,6 +417,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, 
> unsigned long pfn,
>PMD_TYPE_TABLE)
>  #define pmd_sect(pmd)((pmd_val(pmd) & PMD_TYPE_MASK) == \
>PMD_TYPE_SECT)
> +#define pmd_large(pmd)   pmd_sect(pmd)
>  
>  #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
>  #define pud_sect(pud)(0)
> @@ -499,6 +500,7 @@ static inline void pte_unmap(pte_t *pte) { }
>  #define pud_none(pud)(!pud_val(pud))
>  #define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT))
>  #define pud_present(pud) pte_present(pud_pte(pud))
> +#define pud_large(pud)   pud_sect(pud)
>  #define pud_valid(pud)   pte_valid(pud_pte(pud))
>  
>  static inline void set_pud(pud_t *pudp, pud_t pud)
> 



Re: [PATCH v2 1/3] arm64: mm: Add p?d_large() definitions

2019-07-01 Thread Catalin Marinas
On Mon, Jul 01, 2019 at 04:40:24PM +1000, Nicholas Piggin wrote:
> walk_page_range() is going to be allowed to walk page tables other than
> those of user space. For this it needs to know when it has reached a
> 'leaf' entry in the page tables. This information will be provided by the
> p?d_large() functions/macros.
> 
> For arm64, we already have p?d_sect() macros which we can reuse for
> p?d_large().
> 
> pud_sect() is defined as a dummy function when CONFIG_PGTABLE_LEVELS < 3
> or CONFIG_ARM64_64K_PAGES is defined. However when the kernel is
> configured this way then architecturally it isn't allowed to have a
> large page at this level, and any code using these page walking macros
> is implicitly relying on the page size/number of levels being the same as
> the kernel. So it is safe to reuse this for p?d_large() as it is an
> architectural restriction.
> 
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Signed-off-by: Steven Price 

Acked-by: Catalin Marinas 


Re: [PATCH v3 10/11] mm/memory_hotplug: Make unregister_memory_block_under_nodes() never fail

2019-07-01 Thread Oscar Salvador
On Mon, Jul 01, 2019 at 10:51:44AM +0200, Michal Hocko wrote:
> Yeah, we do not allow offlining multi-zone (node) ranges, so the current
> code seems to be over-engineered.
> 
> Anyway, I am wondering why we have to strictly check for already
> removed node links. Is the sysfs code going to complain when we try to
> remove them again?

No, sysfs will silently "fail" if the symlink has already been removed.
At least that is what I saw last time I played with it.

I guess the question is what happens if sysfs handling changes in the future
and starts emitting warnings when trying to remove a symlink that is not there.
Maybe that is unlikely to happen?

-- 
Oscar Salvador
SUSE L3


Re: [PATCH v3 11/11] mm/memory_hotplug: Remove "zone" parameter from sparse_remove_one_section

2019-07-01 Thread Michal Hocko
On Mon 27-05-19 13:11:52, David Hildenbrand wrote:
> The parameter is unused, so let's drop it. Memory removal paths should
> never care about zones. This is the job of memory offlining and will
> require more refactorings.
> 
> Reviewed-by: Dan Williams 
> Signed-off-by: David Hildenbrand 

Acked-by: Michal Hocko 

> ---
>  include/linux/memory_hotplug.h | 2 +-
>  mm/memory_hotplug.c| 2 +-
>  mm/sparse.c| 4 ++--
>  3 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 2f1f87e13baa..1a4257c5f74c 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -346,7 +346,7 @@ extern void move_pfn_range_to_zone(struct zone *zone, 
> unsigned long start_pfn,
>  extern bool is_memblock_offlined(struct memory_block *mem);
>  extern int sparse_add_one_section(int nid, unsigned long start_pfn,
> struct vmem_altmap *altmap);
> -extern void sparse_remove_one_section(struct zone *zone, struct mem_section 
> *ms,
> +extern void sparse_remove_one_section(struct mem_section *ms,
>   unsigned long map_offset, struct vmem_altmap *altmap);
>  extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
> unsigned long pnum);
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 82136c5b4c5f..e48ec7b9dee2 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -524,7 +524,7 @@ static void __remove_section(struct zone *zone, struct 
> mem_section *ms,
>   start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
>   __remove_zone(zone, start_pfn);
>  
> - sparse_remove_one_section(zone, ms, map_offset, altmap);
> + sparse_remove_one_section(ms, map_offset, altmap);
>  }
>  
>  /**
> diff --git a/mm/sparse.c b/mm/sparse.c
> index d1d5e05f5b8d..1552c855d62a 100644
> --- a/mm/sparse.c
> +++ b/mm/sparse.c
> @@ -800,8 +800,8 @@ static void free_section_usemap(struct page *memmap, 
> unsigned long *usemap,
>   free_map_bootmem(memmap);
>  }
>  
> -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
> - unsigned long map_offset, struct vmem_altmap *altmap)
> +void sparse_remove_one_section(struct mem_section *ms, unsigned long 
> map_offset,
> +struct vmem_altmap *altmap)
>  {
>   struct page *memmap = NULL;
>   unsigned long *usemap = NULL;
> -- 
> 2.20.1

-- 
Michal Hocko
SUSE Labs


Re: [PATCH] powerpc: remove device_to_mask

2019-07-01 Thread Michael Ellerman
Alexey Kardashevskiy  writes:
> On 29/06/2019 18:03, Christoph Hellwig wrote:
>> Use the dma_get_mask helper from dma-mapping.h instead.
>> 
>> Signed-off-by: Christoph Hellwig 
>
> Reviewed-by: Alexey Kardashevskiy 

I'll add to the change log "because they are functionally identical."

cheers


Re: [PATCH v3 10/11] mm/memory_hotplug: Make unregister_memory_block_under_nodes() never fail

2019-07-01 Thread Michal Hocko
On Mon 27-05-19 13:11:51, David Hildenbrand wrote:
> We really don't want anything during memory hotunplug to fail.
> We always pass a valid memory block device, so that check can go. Avoid
> allocating memory and eventually failing. As we are always called under
> lock, we can use a static piece of memory. This avoids having to put
> the structure onto the stack, having to guess about the stack size
> of callers.
> 
> Patch inspired by a patch from Oscar Salvador.
> 
> In the future, there might be no need to iterate over nodes at all.
> mem->nid should tell us exactly what to remove. Memory block devices
> with mixed nodes (added during boot) should be properly fenced off and never
> removed.

Yeah, we do not allow offlining multi-zone (node) ranges, so the current
code seems to be over-engineered.

Anyway, I am wondering why we have to strictly check for already
removed node links. Is the sysfs code going to complain when we try to
remove them again?
 
> Cc: Greg Kroah-Hartman 
> Cc: "Rafael J. Wysocki" 
> Cc: Alex Deucher 
> Cc: "David S. Miller" 
> Cc: Mark Brown 
> Cc: Chris Wilson 
> Cc: David Hildenbrand 
> Cc: Oscar Salvador 
> Cc: Andrew Morton 
> Cc: Jonathan Cameron 
> Signed-off-by: David Hildenbrand 

Anyway
Acked-by: Michal Hocko 

> ---
>  drivers/base/node.c  | 18 +-
>  include/linux/node.h |  5 ++---
>  2 files changed, 7 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 04fdfa99b8bc..9be88fd05147 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -803,20 +803,14 @@ int register_mem_sect_under_node(struct memory_block 
> *mem_blk, void *arg)
>  
>  /*
>   * Unregister memory block device under all nodes that it spans.
> + * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes).
>   */
> -int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
> +void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
>  {
> - NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
>   unsigned long pfn, sect_start_pfn, sect_end_pfn;
> + static nodemask_t unlinked_nodes;
>  
> - if (!mem_blk) {
> - NODEMASK_FREE(unlinked_nodes);
> - return -EFAULT;
> - }
> - if (!unlinked_nodes)
> - return -ENOMEM;
> - nodes_clear(*unlinked_nodes);
> -
> + nodes_clear(unlinked_nodes);
>   sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
>   sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
>   for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
> @@ -827,15 +821,13 @@ int unregister_memory_block_under_nodes(struct 
> memory_block *mem_blk)
>   continue;
>   if (!node_online(nid))
>   continue;
> - if (node_test_and_set(nid, *unlinked_nodes))
> + if (node_test_and_set(nid, unlinked_nodes))
>   continue;
>   sysfs_remove_link(&node_devices[nid]->dev.kobj,
>      kobject_name(&mem_blk->dev.kobj));
>   sysfs_remove_link(&mem_blk->dev.kobj,
>      kobject_name(&node_devices[nid]->dev.kobj));
>   }
> - NODEMASK_FREE(unlinked_nodes);
> - return 0;
>  }
>  
>  int link_mem_sections(int nid, unsigned long start_pfn, unsigned long 
> end_pfn)
> diff --git a/include/linux/node.h b/include/linux/node.h
> index 02a29e71b175..548c226966a2 100644
> --- a/include/linux/node.h
> +++ b/include/linux/node.h
> @@ -139,7 +139,7 @@ extern int register_cpu_under_node(unsigned int cpu, 
> unsigned int nid);
>  extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
>  extern int register_mem_sect_under_node(struct memory_block *mem_blk,
>   void *arg);
> -extern int unregister_memory_block_under_nodes(struct memory_block *mem_blk);
> +extern void unregister_memory_block_under_nodes(struct memory_block 
> *mem_blk);
>  
>  extern int register_memory_node_under_compute_node(unsigned int mem_nid,
>  unsigned int cpu_nid,
> @@ -175,9 +175,8 @@ static inline int register_mem_sect_under_node(struct 
> memory_block *mem_blk,
>  {
>   return 0;
>  }
> -static inline int unregister_memory_block_under_nodes(struct memory_block 
> *mem_blk)
> +static inline void unregister_memory_block_under_nodes(struct memory_block 
> *mem_blk)
>  {
> - return 0;
>  }
>  
>  static inline void register_hugetlbfs_with_node(node_registration_func_t reg,
> -- 
> 2.20.1

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v2 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel

2019-07-01 Thread Naveen N. Rao

Steven Rostedt wrote:

On Thu, 27 Jun 2019 20:58:20 +0530
"Naveen N. Rao"  wrote:



> But interesting, I don't see a synchronize_rcu_tasks() call
> there.  

We felt we don't need it in this case. We patch the branch to ftrace 
with a nop first. Other cpus should see that first. But, now that I 
think about it, should we add a memory barrier to ensure the writes get 
ordered properly?


Do you send an ipi to the other CPUs. I would just to be safe.







We are handling this through ftrace_replace_code() and 
__ftrace_make_call_prep() below. For FTRACE_UPDATE_MAKE_CALL, we patch 
in the mflr, followed by smp_call_function(isync) and 
synchronize_rcu_tasks() before we proceed to patch the branch to ftrace.
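
Roughly, the ordering for FTRACE_UPDATE_MAKE_CALL described above looks like
the sketch below. do_isync() and ftrace_target are placeholders; the real
code lives in ftrace_replace_code()/__ftrace_make_call_prep() in the patch:

/* Sketch of enabling one -mprofile-kernel call site:
 *   [ ip - 4: mflr r0 (being restored) ] [ ip: nop -> bl _mcount ] */
static int enable_ftrace_call_site(unsigned long ip, unsigned long ftrace_target)
{
	/* 1) Put the 'mflr r0' back in place first. */
	patch_instruction((unsigned int *)(ip - MCOUNT_INSN_SIZE), PPC_INST_MFLR);

	/* 2) Make every CPU execute an isync so the mflr is visible, and
	 *    wait for tasks that might be preempted between the two
	 *    instructions. */
	smp_call_function(do_isync, NULL, 1);
	synchronize_rcu_tasks();

	/* 3) Only now convert the nop at 'ip' into the branch to ftrace. */
	return patch_branch((unsigned int *)ip, ftrace_target, BRANCH_SET_LINK);
}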


I don't see any other scenario where we end up in 
__ftrace_make_nop_kernel() without going through ftrace_replace_code().  
For kernel modules, this can happen during module load/init and so, I 
patch out both instructions in __ftrace_make_call() above without any 
synchronization.


Am I missing anything?



No, I think I got confused ;-), it's the patch out that I was worried
about, but when I was going through the scenario, I somehow turned it
into the patching in (which I already audited :-p). I was going to
reply with just the top part of this email, but then the confusion
started :-/

OK, yes, patching out should be fine, and you already covered the
patching in. Sorry for the noise.

Just to confirm and totally remove the confusion, the patch does:

:
mflrr0 <-- preempt here
bl  _mcount

:
mflrr0
nop

And this is fine regardless.

OK, Reviewed-by: Steven Rostedt (VMware) 


Thanks for confirming! We do need an IPI to be sure, as you pointed out 
above. I will have the patching out take the same path to simplify 
things.



- Naveen




Re: [PATCH v3 09/11] mm/memory_hotplug: Remove memory block devices before arch_remove_memory()

2019-07-01 Thread Michal Hocko
On Mon 27-05-19 13:11:50, David Hildenbrand wrote:
> Let's factor out removing of memory block devices, which is only
> necessary for memory added via add_memory() and friends that created
> memory block devices. Remove the devices before calling
> arch_remove_memory().
> 
> This finishes factoring out memory block device handling from
> arch_add_memory() and arch_remove_memory().

OK, this makes sense again. Just a nit: calling find_memory_block_by_id
for each memory block looks a bit suboptimal, especially when we are
removing consecutive physical memblocks. I have to confess that I do not
know how expensive the search is, and I also expect that there won't be
that many memblocks in the removed range anyway, as large setups have
large memblocks.

> Cc: Greg Kroah-Hartman 
> Cc: "Rafael J. Wysocki" 
> Cc: David Hildenbrand 
> Cc: "mike.tra...@hpe.com" 
> Cc: Andrew Morton 
> Cc: Andrew Banman 
> Cc: Ingo Molnar 
> Cc: Alex Deucher 
> Cc: "David S. Miller" 
> Cc: Mark Brown 
> Cc: Chris Wilson 
> Cc: Oscar Salvador 
> Cc: Jonathan Cameron 
> Cc: Michal Hocko 
> Cc: Pavel Tatashin 
> Cc: Arun KS 
> Cc: Mathieu Malaterre 
> Reviewed-by: Dan Williams 
> Signed-off-by: David Hildenbrand 

Other than that looks good to me.
Acked-by: Michal Hocko 

> ---
>  drivers/base/memory.c  | 37 ++---
>  drivers/base/node.c| 11 ++-
>  include/linux/memory.h |  2 +-
>  include/linux/node.h   |  6 ++
>  mm/memory_hotplug.c|  5 +++--
>  5 files changed, 30 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 5a0370f0c506..f28efb0bf5c7 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -763,32 +763,31 @@ int create_memory_block_devices(unsigned long start, 
> unsigned long size)
>   return ret;
>  }
>  
> -void unregister_memory_section(struct mem_section *section)
> +/*
> + * Remove memory block devices for the given memory area. Start and size
> + * have to be aligned to memory block granularity. Memory block devices
> + * have to be offline.
> + */
> +void remove_memory_block_devices(unsigned long start, unsigned long size)
>  {
> + const int start_block_id = pfn_to_block_id(PFN_DOWN(start));
> + const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size));
>   struct memory_block *mem;
> + int block_id;
>  
> - if (WARN_ON_ONCE(!present_section(section)))
> + if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) ||
> +		 !IS_ALIGNED(size, memory_block_size_bytes())))
>   return;
>  
> 	mutex_lock(&mem_sysfs_mutex);
> -
> - /*
> -  * Some users of the memory hotplug do not want/need memblock to
> -  * track all sections. Skip over those.
> -  */
> - mem = find_memory_block(section);
> - if (!mem)
> - goto out_unlock;
> -
> - unregister_mem_sect_under_nodes(mem, __section_nr(section));
> -
> - mem->section_count--;
> - if (mem->section_count == 0)
> + for (block_id = start_block_id; block_id != end_block_id; block_id++) {
> + mem = find_memory_block_by_id(block_id, NULL);
> + if (WARN_ON_ONCE(!mem))
> + continue;
> + mem->section_count = 0;
> + unregister_memory_block_under_nodes(mem);
>   unregister_memory(mem);
> - else
> -		put_device(&mem->dev);
> -
> -out_unlock:
> + }
> 	mutex_unlock(&mem_sysfs_mutex);
>  }
>  
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 8598fcbd2a17..04fdfa99b8bc 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -801,9 +801,10 @@ int register_mem_sect_under_node(struct memory_block 
> *mem_blk, void *arg)
>   return 0;
>  }
>  
> -/* unregister memory section under all nodes that it spans */
> -int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
> - unsigned long phys_index)
> +/*
> + * Unregister memory block device under all nodes that it spans.
> + */
> +int unregister_memory_block_under_nodes(struct memory_block *mem_blk)
>  {
>   NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
>   unsigned long pfn, sect_start_pfn, sect_end_pfn;
> @@ -816,8 +817,8 @@ int unregister_mem_sect_under_nodes(struct memory_block 
> *mem_blk,
>   return -ENOMEM;
>   nodes_clear(*unlinked_nodes);
>  
> - sect_start_pfn = section_nr_to_pfn(phys_index);
> - sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
> + sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
> + sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
>   for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
>   int nid;
>  
> diff --git a/include/linux/memory.h b/include/linux/memory.h
> index db3e8567f900..f26a5417ec5d 100644
> --- a/include/linux/memory.h
> +++ b/include/linux/memory.h
> @@ -112,7 +112,7 @@ extern void 

Re: [PATCH v3 08/11] mm/memory_hotplug: Drop MHP_MEMBLOCK_API

2019-07-01 Thread Michal Hocko
On Mon 27-05-19 13:11:49, David Hildenbrand wrote:
> No longer needed, the callers of arch_add_memory() can handle this
> manually.
> 
> Cc: Andrew Morton 
> Cc: David Hildenbrand 
> Cc: Michal Hocko 
> Cc: Oscar Salvador 
> Cc: Pavel Tatashin 
> Cc: Wei Yang 
> Cc: Joonsoo Kim 
> Cc: Qian Cai 
> Cc: Arun KS 
> Cc: Mathieu Malaterre 
> Signed-off-by: David Hildenbrand 

Acked-by: Michal Hocko 

> ---
>  include/linux/memory_hotplug.h | 8 
>  mm/memory_hotplug.c| 9 +++--
>  2 files changed, 3 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 2d4de313926d..2f1f87e13baa 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -128,14 +128,6 @@ extern void arch_remove_memory(int nid, u64 start, u64 
> size,
>  extern void __remove_pages(struct zone *zone, unsigned long start_pfn,
>  unsigned long nr_pages, struct vmem_altmap *altmap);
>  
> -/*
> - * Do we want sysfs memblock files created. This will allow userspace to 
> online
> - * and offline memory explicitly. Lack of this bit means that the caller has 
> to
> - * call move_pfn_range_to_zone to finish the initialization.
> - */
> -
> -#define MHP_MEMBLOCK_API   (1<<0)
> -
>  /* reasonably generic interface to expand the physical pages */
>  extern int __add_pages(int nid, unsigned long start_pfn, unsigned long 
> nr_pages,
>  struct mhp_restrictions *restrictions);
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index b1fde90bbf19..9a92549ef23b 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -251,7 +251,7 @@ void __init register_page_bootmem_info_node(struct 
> pglist_data *pgdat)
>  #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
>  
>  static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
> - struct vmem_altmap *altmap, bool want_memblock)
> +struct vmem_altmap *altmap)
>  {
>   int ret;
>  
> @@ -294,8 +294,7 @@ int __ref __add_pages(int nid, unsigned long 
> phys_start_pfn,
>   }
>  
>   for (i = start_sec; i <= end_sec; i++) {
> - err = __add_section(nid, section_nr_to_pfn(i), altmap,
> - restrictions->flags & MHP_MEMBLOCK_API);
> + err = __add_section(nid, section_nr_to_pfn(i), altmap);
>  
>   /*
>* EEXIST is finally dealt with by ioresource collision
> @@ -1067,9 +1066,7 @@ static int online_memory_block(struct memory_block 
> *mem, void *arg)
>   */
>  int __ref add_memory_resource(int nid, struct resource *res)
>  {
> - struct mhp_restrictions restrictions = {
> - .flags = MHP_MEMBLOCK_API,
> - };
> + struct mhp_restrictions restrictions = {};
>   u64 start, size;
>   bool new_node = false;
>   int ret;
> -- 
> 2.20.1
> 

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 07/11] mm/memory_hotplug: Create memory block devices after arch_add_memory()

2019-07-01 Thread Michal Hocko
On Mon 27-05-19 13:11:48, David Hildenbrand wrote:
> Only memory to be added to the buddy and to be onlined/offlined by
> user space using /sys/devices/system/memory/... needs (and should have!)
> memory block devices.
> 
> Factor out creation of memory block devices. Create all devices after
> arch_add_memory() succeeded. We can later drop the want_memblock parameter,
> because it is now effectively stale.
> 
> Only after memory block devices have been added, memory can be onlined
> by user space. This implies, that memory is not visible to user space at
> all before arch_add_memory() succeeded.

I like seeing the memblock API go away from the low-level hotplug handling.
The current implementation is just too convoluted, and I remember fighting
with subtle expectations wired deep in call chains when touching that code
in the past (some memblocks didn't get created, etc.). Maybe those have
been addressed in the meantime.

> While at it
> - use WARN_ON_ONCE instead of BUG_ON in moved unregister_memory()

This would be better as a separate patch with an explanation.

> - introduce find_memory_block_by_id() to search via block id
> - Use find_memory_block_by_id() in init_memory_block() to catch
>   duplicates
> 
> Cc: Greg Kroah-Hartman 
> Cc: "Rafael J. Wysocki" 
> Cc: David Hildenbrand 
> Cc: "mike.tra...@hpe.com" 
> Cc: Andrew Morton 
> Cc: Ingo Molnar 
> Cc: Andrew Banman 
> Cc: Oscar Salvador 
> Cc: Michal Hocko 
> Cc: Pavel Tatashin 
> Cc: Qian Cai 
> Cc: Wei Yang 
> Cc: Arun KS 
> Cc: Mathieu Malaterre 
> Signed-off-by: David Hildenbrand 

Other than that looks good to me.
Acked-by: Michal Hocko 

> ---
>  drivers/base/memory.c  | 82 +++---
>  include/linux/memory.h |  2 +-
>  mm/memory_hotplug.c| 15 
>  3 files changed, 63 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index ac17c95a5f28..5a0370f0c506 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -39,6 +39,11 @@ static inline int base_memory_block_id(int section_nr)
>   return section_nr / sections_per_block;
>  }
>  
> +static inline int pfn_to_block_id(unsigned long pfn)
> +{
> + return base_memory_block_id(pfn_to_section_nr(pfn));
> +}
> +
>  static int memory_subsys_online(struct device *dev);
>  static int memory_subsys_offline(struct device *dev);
>  
> @@ -582,10 +587,9 @@ int __weak arch_get_memory_phys_device(unsigned long 
> start_pfn)
>   * A reference for the returned object is held and the reference for the
>   * hinted object is released.
>   */
> -struct memory_block *find_memory_block_hinted(struct mem_section *section,
> -   struct memory_block *hint)
> +static struct memory_block *find_memory_block_by_id(int block_id,
> + struct memory_block *hint)
>  {
> - int block_id = base_memory_block_id(__section_nr(section));
> 	struct device *hintdev = hint ? &hint->dev : NULL;
>   struct device *dev;
>  
> @@ -597,6 +601,14 @@ struct memory_block *find_memory_block_hinted(struct 
> mem_section *section,
>   return to_memory_block(dev);
>  }
>  
> +struct memory_block *find_memory_block_hinted(struct mem_section *section,
> +   struct memory_block *hint)
> +{
> + int block_id = base_memory_block_id(__section_nr(section));
> +
> + return find_memory_block_by_id(block_id, hint);
> +}
> +
>  /*
>   * For now, we have a linear search to go find the appropriate
>   * memory_block corresponding to a particular phys_index. If
> @@ -658,6 +670,11 @@ static int init_memory_block(struct memory_block 
> **memory, int block_id,
>   unsigned long start_pfn;
>   int ret = 0;
>  
> + mem = find_memory_block_by_id(block_id, NULL);
> + if (mem) {
> +		put_device(&mem->dev);
> + return -EEXIST;
> + }
>   mem = kzalloc(sizeof(*mem), GFP_KERNEL);
>   if (!mem)
>   return -ENOMEM;
> @@ -699,44 +716,53 @@ static int add_memory_block(int base_section_nr)
>   return 0;
>  }
>  
> +static void unregister_memory(struct memory_block *memory)
> +{
> +	if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
> + return;
> +
> + /* drop the ref. we got via find_memory_block() */
> +	put_device(&memory->dev);
> +	device_unregister(&memory->dev);
> +}
> +
>  /*
> - * need an interface for the VM to add new memory regions,
> - * but without onlining it.
> + * Create memory block devices for the given memory area. Start and size
> + * have to be aligned to memory block granularity. Memory block devices
> + * will be initialized as offline.
>   */
> -int hotplug_memory_register(int nid, struct mem_section *section)
> +int create_memory_block_devices(unsigned long start, unsigned long size)
>  {
> - int block_id = base_memory_block_id(__section_nr(section));
> - int ret = 0;
> + const int start_block_id = 

Re: [PATCH v3 06/11] mm/memory_hotplug: Allow arch_remove_pages() without CONFIG_MEMORY_HOTREMOVE

2019-07-01 Thread Michal Hocko
On Mon 27-05-19 13:11:47, David Hildenbrand wrote:
> We want to improve error handling while adding memory by allowing
> to use arch_remove_memory() and __remove_pages() even if
> CONFIG_MEMORY_HOTREMOVE is not set to e.g., implement something like:
> 
>   arch_add_memory()
>   rc = do_something();
>   if (rc) {
>   arch_remove_memory();
>   }
> 
> We won't get rid of CONFIG_MEMORY_HOTREMOVE for now, as it will require
> quite some dependencies for memory offlining.

If we cannot really remove CONFIG_MEMORY_HOTREMOVE altogether then why
not simply add an empty placeholder for arch_remove_memory when the
config is disabled?
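
Something like the following minimal sketch, say in the header that declares
arch_remove_memory() (illustrative only, not part of the series):

#ifndef CONFIG_MEMORY_HOTREMOVE
/* Illustrative stub: removal should never be reached without hot-remove support. */
static inline void arch_remove_memory(int nid, u64 start, u64 size,
				       struct vmem_altmap *altmap)
{
	WARN_ON_ONCE(1);
}
#endif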
 
> Cc: Tony Luck 
> Cc: Fenghua Yu 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: Martin Schwidefsky 
> Cc: Heiko Carstens 
> Cc: Yoshinori Sato 
> Cc: Rich Felker 
> Cc: Dave Hansen 
> Cc: Andy Lutomirski 
> Cc: Peter Zijlstra 
> Cc: Thomas Gleixner 
> Cc: Ingo Molnar 
> Cc: Borislav Petkov 
> Cc: "H. Peter Anvin" 
> Cc: Greg Kroah-Hartman 
> Cc: "Rafael J. Wysocki" 
> Cc: Andrew Morton 
> Cc: Michal Hocko 
> Cc: Mike Rapoport 
> Cc: David Hildenbrand 
> Cc: Oscar Salvador 
> Cc: "Kirill A. Shutemov" 
> Cc: Alex Deucher 
> Cc: "David S. Miller" 
> Cc: Mark Brown 
> Cc: Chris Wilson 
> Cc: Christophe Leroy 
> Cc: Nicholas Piggin 
> Cc: Vasily Gorbik 
> Cc: Rob Herring 
> Cc: Masahiro Yamada 
> Cc: "mike.tra...@hpe.com" 
> Cc: Andrew Banman 
> Cc: Pavel Tatashin 
> Cc: Wei Yang 
> Cc: Arun KS 
> Cc: Qian Cai 
> Cc: Mathieu Malaterre 
> Cc: Baoquan He 
> Cc: Logan Gunthorpe 
> Cc: Anshuman Khandual 
> Signed-off-by: David Hildenbrand 
> ---
>  arch/arm64/mm/mmu.c| 2 --
>  arch/ia64/mm/init.c| 2 --
>  arch/powerpc/mm/mem.c  | 2 --
>  arch/s390/mm/init.c| 2 --
>  arch/sh/mm/init.c  | 2 --
>  arch/x86/mm/init_32.c  | 2 --
>  arch/x86/mm/init_64.c  | 2 --
>  drivers/base/memory.c  | 2 --
>  include/linux/memory.h | 2 --
>  include/linux/memory_hotplug.h | 2 --
>  mm/memory_hotplug.c| 2 --
>  mm/sparse.c| 6 --
>  12 files changed, 28 deletions(-)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index e569a543c384..9ccd7539f2d4 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -1084,7 +1084,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
>   return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>  restrictions);
>  }
> -#ifdef CONFIG_MEMORY_HOTREMOVE
>  void arch_remove_memory(int nid, u64 start, u64 size,
>   struct vmem_altmap *altmap)
>  {
> @@ -1103,4 +1102,3 @@ void arch_remove_memory(int nid, u64 start, u64 size,
>   __remove_pages(zone, start_pfn, nr_pages, altmap);
>  }
>  #endif
> -#endif
> diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
> index d28e29103bdb..aae75fd7b810 100644
> --- a/arch/ia64/mm/init.c
> +++ b/arch/ia64/mm/init.c
> @@ -681,7 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
>   return ret;
>  }
>  
> -#ifdef CONFIG_MEMORY_HOTREMOVE
>  void arch_remove_memory(int nid, u64 start, u64 size,
>   struct vmem_altmap *altmap)
>  {
> @@ -693,4 +692,3 @@ void arch_remove_memory(int nid, u64 start, u64 size,
>   __remove_pages(zone, start_pfn, nr_pages, altmap);
>  }
>  #endif
> -#endif
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index e885fe2aafcc..e4bc2dc3f593 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -130,7 +130,6 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
>   return __add_pages(nid, start_pfn, nr_pages, restrictions);
>  }
>  
> -#ifdef CONFIG_MEMORY_HOTREMOVE
>  void __ref arch_remove_memory(int nid, u64 start, u64 size,
>struct vmem_altmap *altmap)
>  {
> @@ -164,7 +163,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 
> size,
>   pr_warn("Hash collision while resizing HPT\n");
>  }
>  #endif
> -#endif /* CONFIG_MEMORY_HOTPLUG */
>  
>  #ifndef CONFIG_NEED_MULTIPLE_NODES
>  void __init mem_topology_setup(void)
> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> index 14955e0a9fcf..ffb81fe95c77 100644
> --- a/arch/s390/mm/init.c
> +++ b/arch/s390/mm/init.c
> @@ -239,7 +239,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
>   return rc;
>  }
>  
> -#ifdef CONFIG_MEMORY_HOTREMOVE
>  void arch_remove_memory(int nid, u64 start, u64 size,
>   struct vmem_altmap *altmap)
>  {
> @@ -251,5 +250,4 @@ void arch_remove_memory(int nid, u64 start, u64 size,
>   __remove_pages(zone, start_pfn, nr_pages, altmap);
>   vmem_remove_mapping(start, size);
>  }
> -#endif
>  #endif /* CONFIG_MEMORY_HOTPLUG */
> diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
> index 13c6a6bb5fd9..dfdbaa50946e 100644
> --- a/arch/sh/mm/init.c
> +++ b/arch/sh/mm/init.c
> @@ -429,7 +429,6 @@ int 

Re: [PATCH v3 05/11] drivers/base/memory: Pass a block_id to init_memory_block()

2019-07-01 Thread Michal Hocko
On Mon 27-05-19 13:11:46, David Hildenbrand wrote:
> We'll rework hotplug_memory_register() shortly, so it no longer consumes
> a section.
> 
> Cc: Greg Kroah-Hartman 
> Cc: "Rafael J. Wysocki" 
> Signed-off-by: David Hildenbrand 

Acked-by: Michal Hocko 

> ---
>  drivers/base/memory.c | 15 +++
>  1 file changed, 7 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index f180427e48f4..f914fa6fe350 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -651,21 +651,18 @@ int register_memory(struct memory_block *memory)
>   return ret;
>  }
>  
> -static int init_memory_block(struct memory_block **memory,
> -  struct mem_section *section, unsigned long state)
> +static int init_memory_block(struct memory_block **memory, int block_id,
> +  unsigned long state)
>  {
>   struct memory_block *mem;
>   unsigned long start_pfn;
> - int scn_nr;
>   int ret = 0;
>  
>   mem = kzalloc(sizeof(*mem), GFP_KERNEL);
>   if (!mem)
>   return -ENOMEM;
>  
> - scn_nr = __section_nr(section);
> - mem->start_section_nr =
> - base_memory_block_id(scn_nr) * sections_per_block;
> + mem->start_section_nr = block_id * sections_per_block;
>   mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
>   mem->state = state;
>   start_pfn = section_nr_to_pfn(mem->start_section_nr);
> @@ -694,7 +691,8 @@ static int add_memory_block(int base_section_nr)
>  
>   if (section_count == 0)
>   return 0;
> -	ret = init_memory_block(&mem, __nr_to_section(section_nr), MEM_ONLINE);
> +	ret = init_memory_block(&mem, base_memory_block_id(base_section_nr),
> + MEM_ONLINE);
>   if (ret)
>   return ret;
>   mem->section_count = section_count;
> @@ -707,6 +705,7 @@ static int add_memory_block(int base_section_nr)
>   */
>  int hotplug_memory_register(int nid, struct mem_section *section)
>  {
> + int block_id = base_memory_block_id(__section_nr(section));
>   int ret = 0;
>   struct memory_block *mem;
>  
> @@ -717,7 +716,7 @@ int hotplug_memory_register(int nid, struct mem_section 
> *section)
>   mem->section_count++;
> 		put_device(&mem->dev);
>   } else {
> -		ret = init_memory_block(&mem, section, MEM_OFFLINE);
> +		ret = init_memory_block(&mem, block_id, MEM_OFFLINE);
>   if (ret)
>   goto out;
>   mem->section_count++;
> -- 
> 2.20.1

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 03/11] s390x/mm: Implement arch_remove_memory()

2019-07-01 Thread Michal Hocko
On Mon 27-05-19 13:11:44, David Hildenbrand wrote:
> Will come in handy when wanting to handle errors after
> arch_add_memory().

I do not understand this. Why do you add code for something that is
not possible on this HW (based on the comment - is it still valid, btw)?

> Cc: Martin Schwidefsky 
> Cc: Heiko Carstens 
> Cc: Andrew Morton 
> Cc: Michal Hocko 
> Cc: Mike Rapoport 
> Cc: David Hildenbrand 
> Cc: Vasily Gorbik 
> Cc: Oscar Salvador 
> Signed-off-by: David Hildenbrand 
> ---
>  arch/s390/mm/init.c | 13 +++--
>  1 file changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> index d552e330fbcc..14955e0a9fcf 100644
> --- a/arch/s390/mm/init.c
> +++ b/arch/s390/mm/init.c
> @@ -243,12 +243,13 @@ int arch_add_memory(int nid, u64 start, u64 size,
>  void arch_remove_memory(int nid, u64 start, u64 size,
>   struct vmem_altmap *altmap)
>  {
> - /*
> -  * There is no hardware or firmware interface which could trigger a
> -  * hot memory remove on s390. So there is nothing that needs to be
> -  * implemented.
> -  */
> - BUG();
> + unsigned long start_pfn = start >> PAGE_SHIFT;
> + unsigned long nr_pages = size >> PAGE_SHIFT;
> + struct zone *zone;
> +
> + zone = page_zone(pfn_to_page(start_pfn));
> + __remove_pages(zone, start_pfn, nr_pages, altmap);
> + vmem_remove_mapping(start, size);
>  }
>  #endif
>  #endif /* CONFIG_MEMORY_HOTPLUG */
> -- 
> 2.20.1
> 

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 02/11] s390x/mm: Fail when an altmap is used for arch_add_memory()

2019-07-01 Thread Michal Hocko
On Mon 27-05-19 13:11:43, David Hildenbrand wrote:
> ZONE_DEVICE is not yet supported; fail if an altmap is passed, so we
> don't forget to update arch_add_memory()/arch_remove_memory() when unlocking
> support.

Why do we need this? Sure, ZONE_DEVICE is not supported for s390, and the
same might be the case for other arches which support hotplug. I do not see
much point in adding a warning to each of them.

> Cc: Martin Schwidefsky 
> Cc: Heiko Carstens 
> Cc: Andrew Morton 
> Cc: Michal Hocko 
> Cc: Mike Rapoport 
> Cc: David Hildenbrand 
> Cc: Vasily Gorbik 
> Cc: Oscar Salvador 
> Suggested-by: Dan Williams 
> Signed-off-by: David Hildenbrand 
> ---
>  arch/s390/mm/init.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> index 14d1eae9fe43..d552e330fbcc 100644
> --- a/arch/s390/mm/init.c
> +++ b/arch/s390/mm/init.c
> @@ -226,6 +226,9 @@ int arch_add_memory(int nid, u64 start, u64 size,
>   unsigned long size_pages = PFN_DOWN(size);
>   int rc;
>  
> + if (WARN_ON_ONCE(restrictions->altmap))
> + return -EINVAL;
> +
>   rc = vmem_add_mapping(start, size);
>   if (rc)
>   return rc;
> -- 
> 2.20.1
> 

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 01/11] mm/memory_hotplug: Simplify and fix check_hotplug_memory_range()

2019-07-01 Thread Michal Hocko
[Sorry for a really late response]

On Mon 27-05-19 13:11:42, David Hildenbrand wrote:
> By converting start and size to page granularity, we actually ignore
> unaligned parts within a page instead of properly bailing out with an
> error.

I do not expect any code path would ever provide an unaligned address,
and even if it did, rounding that to a pfn doesn't sound like a terrible
thing to do. Anyway, this removes a few lines, so why not.
> 
> Cc: Andrew Morton 
> Cc: Oscar Salvador 
> Cc: Michal Hocko 
> Cc: David Hildenbrand 
> Cc: Pavel Tatashin 
> Cc: Qian Cai 
> Cc: Wei Yang 
> Cc: Arun KS 
> Cc: Mathieu Malaterre 
> Reviewed-by: Dan Williams 
> Reviewed-by: Wei Yang 
> Signed-off-by: David Hildenbrand 

Acked-by: Michal Hocko 

> ---
>  mm/memory_hotplug.c | 11 +++
>  1 file changed, 3 insertions(+), 8 deletions(-)
> 
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index e096c987d261..762887b2358b 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1051,16 +1051,11 @@ int try_online_node(int nid)
>  
>  static int check_hotplug_memory_range(u64 start, u64 size)
>  {
> - unsigned long block_sz = memory_block_size_bytes();
> - u64 block_nr_pages = block_sz >> PAGE_SHIFT;
> - u64 nr_pages = size >> PAGE_SHIFT;
> - u64 start_pfn = PFN_DOWN(start);
> -
>   /* memory range must be block size aligned */
> - if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) ||
> - !IS_ALIGNED(nr_pages, block_nr_pages)) {
> + if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
> + !IS_ALIGNED(size, memory_block_size_bytes())) {
>   pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, 
> size %#llx",
> -block_sz, start, size);
> +memory_block_size_bytes(), start, size);
>   return -EINVAL;
>   }
>  
> -- 
> 2.20.1
> 

-- 
Michal Hocko
SUSE Labs


Re: Re: [PATCH v4 6/8] KVM: PPC: Ultravisor: Restrict LDBAR access

2019-07-01 Thread Ram Pai
On Mon, Jul 01, 2019 at 04:30:55PM +1000, Alexey Kardashevskiy wrote:
> 
> 
> On 01/07/2019 16:17, maddy wrote:
> > 
> > On 01/07/19 11:24 AM, Alexey Kardashevskiy wrote:
> >>
> >> On 29/06/2019 06:08, Claudio Carvalho wrote:
> >>> When the ultravisor firmware is available, it takes control over the
> >>> LDBAR register. In this case, thread-imc updates and save/restore
> >>> operations on the LDBAR register are handled by ultravisor.
> >> What does LDBAR do? "Power ISA™ Version 3.0 B" or "User’s Manual POWER9
> >> Processor" do not tell.
> > LDBAR is a per-thread SPR used by the thread-imc pmu to dump the counter
> > data into memory.
> > LDBAR contains the memory address along with a few other configuration bits
> > (it is populated by the thread-imc pmu driver). It is populated and enabled
> > only when any of the thread-imc pmu events are monitored.
> 
> 
> I was actually looking for a spec for this register, what is the
> document name?

  It's not an architected register. It's documented in the POWER9
  workbook.

RP



[PATCH v2 3/3] mm/vmalloc: fix vmalloc_to_page for huge vmap mappings

2019-07-01 Thread Nicholas Piggin
vmalloc_to_page returns NULL for addresses mapped by larger pages[*].
Whether or not a vmap is huge depends on the architecture details,
alignments, boot options, etc., which the caller cannot be expected
to know. Therefore HUGE_VMAP is a regression for vmalloc_to_page.

This change teaches vmalloc_to_page about larger pages, and returns
the struct page that corresponds to the offset within the large page.
This makes the API agnostic to mapping implementation details.

[*] As explained by commit 029c54b095995 ("mm/vmalloc.c: huge-vmap:
fail gracefully on unexpected huge vmap mappings")

Signed-off-by: Nicholas Piggin 
---
 include/asm-generic/4level-fixup.h |  1 +
 include/asm-generic/5level-fixup.h |  1 +
 mm/vmalloc.c   | 37 +++---
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/asm-generic/4level-fixup.h 
b/include/asm-generic/4level-fixup.h
index e3667c9a33a5..3cc65a4dd093 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -20,6 +20,7 @@
 #define pud_none(pud)  0
 #define pud_bad(pud)   0
 #define pud_present(pud)   1
+#define pud_large(pud) 0
 #define pud_ERROR(pud) do { } while (0)
 #define pud_clear(pud) pgd_clear(pud)
 #define pud_val(pud)   pgd_val(pud)
diff --git a/include/asm-generic/5level-fixup.h 
b/include/asm-generic/5level-fixup.h
index bb6cb347018c..c4377db09a4f 100644
--- a/include/asm-generic/5level-fixup.h
+++ b/include/asm-generic/5level-fixup.h
@@ -22,6 +22,7 @@
 #define p4d_none(p4d)  0
 #define p4d_bad(p4d)   0
 #define p4d_present(p4d)   1
+#define p4d_large(p4d) 0
 #define p4d_ERROR(p4d) do { } while (0)
 #define p4d_clear(p4d) pgd_clear(p4d)
 #define p4d_val(p4d)   pgd_val(p4d)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0f76cca32a1c..09a283866368 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -36,6 +36,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 
@@ -284,25 +285,35 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 
if (pgd_none(*pgd))
return NULL;
+
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d))
return NULL;
-   pud = pud_offset(p4d, addr);
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+   if (p4d_large(*p4d))
+   return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+#endif
+   if (WARN_ON_ONCE(p4d_bad(*p4d)))
+   return NULL;
 
-   /*
-* Don't dereference bad PUD or PMD (below) entries. This will also
-* identify huge mappings, which we may encounter on architectures
-* that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
-* identified as vmalloc addresses by is_vmalloc_addr(), but are
-* not [unambiguously] associated with a struct page, so there is
-* no correct value to return for them.
-*/
-   WARN_ON_ONCE(pud_bad(*pud));
-   if (pud_none(*pud) || pud_bad(*pud))
+   pud = pud_offset(p4d, addr);
+   if (pud_none(*pud))
+   return NULL;
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+   if (pud_large(*pud))
+   return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+#endif
+   if (WARN_ON_ONCE(pud_bad(*pud)))
return NULL;
+
pmd = pmd_offset(pud, addr);
-   WARN_ON_ONCE(pmd_bad(*pmd));
-   if (pmd_none(*pmd) || pmd_bad(*pmd))
+   if (pmd_none(*pmd))
+   return NULL;
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+   if (pmd_large(*pmd))
+   return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+#endif
+   if (WARN_ON_ONCE(pmd_bad(*pmd)))
return NULL;
 
ptep = pte_offset_map(pmd, addr);
-- 
2.20.1
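
For illustration, the user-visible effect on a caller could look like the
hypothetical fragment below. Whether the region is actually huge-mapped
depends on the arch, alignment and boot options as noted in the changelog;
the function and allocation here are illustrative only.

#include <linux/vmalloc.h>
#include <linux/mm.h>

static struct page *example_lookup(void)
{
	/* may be mapped with huge pages on CONFIG_HAVE_ARCH_HUGE_VMAP archs
	 * (e.g. with the huge vmap vmalloc series this is broken out from) */
	void *buf = vmalloc(16UL << 20);

	if (!buf)
		return NULL;
	/* with this patch the lookup works for huge and small mappings alike;
	 * previously it could return NULL for addresses inside a huge vmap */
	return vmalloc_to_page(buf + 5 * PAGE_SIZE);
}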



[PATCH v2 2/3] powerpc/64s: Add p?d_large definitions

2019-07-01 Thread Nicholas Piggin
The subsequent patch to fix vmalloc_to_page with huge vmap requires
HUGE_VMAP archs to provide p?d_large definitions for the non-pgd page
table levels they support.

Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 24 
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index ccf00a8b98c6..c19c8396a1bd 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -915,6 +915,11 @@ static inline int pud_present(pud_t pud)
return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PRESENT));
 }
 
+static inline int pud_large(pud_t pud)
+{
+   return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
+}
+
 extern struct page *pud_page(pud_t pud);
 extern struct page *pmd_page(pmd_t pmd);
 static inline pte_t pud_pte(pud_t pud)
@@ -958,6 +963,11 @@ static inline int pgd_present(pgd_t pgd)
return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
 }
 
+static inline int pgd_large(pgd_t pgd)
+{
+   return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
+}
+
 static inline pte_t pgd_pte(pgd_t pgd)
 {
return __pte_raw(pgd_raw(pgd));
@@ -1083,6 +1093,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
 #define pmd_mk_savedwrite(pmd) pte_pmd(pte_mk_savedwrite(pmd_pte(pmd)))
 #define pmd_clear_savedwrite(pmd)  
pte_pmd(pte_clear_savedwrite(pmd_pte(pmd)))
 
+static inline int pmd_large(pmd_t pmd)
+{
+   return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
+}
+
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 #define pmd_soft_dirty(pmd)pte_soft_dirty(pmd_pte(pmd))
 #define pmd_mksoft_dirty(pmd)  pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)))
@@ -1151,15 +1166,6 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long 
addr, pmd_t *pmdp,
return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set);
 }
 
-/*
- * returns true for pmd migration entries, THP, devmap, hugetlb
- * But compile time dependent on THP config
- */
-static inline int pmd_large(pmd_t pmd)
-{
-   return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
-}
-
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
-- 
2.20.1



[PATCH v2 1/3] arm64: mm: Add p?d_large() definitions

2019-07-01 Thread Nicholas Piggin
walk_page_range() is going to be allowed to walk page tables other than
those of user space. For this it needs to know when it has reached a
'leaf' entry in the page tables. This information will be provided by the
p?d_large() functions/macros.

For arm64, we already have p?d_sect() macros which we can reuse for
p?d_large().

pud_sect() is defined as a dummy function when CONFIG_PGTABLE_LEVELS < 3
or CONFIG_ARM64_64K_PAGES is defined. However, when the kernel is
configured this way, it is architecturally not allowed to have a
large page at this level, and any code using these page walking macros
is implicitly relying on the page size/number of levels being the same as
the kernel's. So it is safe to reuse this for p?d_large(), as it is an
architectural restriction.

Cc: Catalin Marinas 
Cc: Will Deacon 
Signed-off-by: Steven Price 
---
 arch/arm64/include/asm/pgtable.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index fca26759081a..0e973201bc16 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -417,6 +417,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, 
unsigned long pfn,
 PMD_TYPE_TABLE)
 #define pmd_sect(pmd)  ((pmd_val(pmd) & PMD_TYPE_MASK) == \
 PMD_TYPE_SECT)
+#define pmd_large(pmd) pmd_sect(pmd)
 
 #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
 #define pud_sect(pud)  (0)
@@ -499,6 +500,7 @@ static inline void pte_unmap(pte_t *pte) { }
 #define pud_none(pud)  (!pud_val(pud))
 #define pud_bad(pud)   (!(pud_val(pud) & PUD_TABLE_BIT))
 #define pud_present(pud)   pte_present(pud_pte(pud))
+#define pud_large(pud) pud_sect(pud)
 #define pud_valid(pud) pte_valid(pud_pte(pud))
 
 static inline void set_pud(pud_t *pudp, pud_t pud)
-- 
2.20.1
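
A rough sketch of how a page-table walker can use the new predicate to stop
at leaf entries (a hypothetical fragment, mirroring what patch 3 does in
vmalloc_to_page(); the helper name is made up):

#include <linux/mm.h>

static struct page *pud_leaf_page(pud_t *pud, unsigned long addr)
{
	if (pud_none(*pud))
		return NULL;
	if (pud_large(*pud))	/* leaf entry: no pmd table below it */
		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pud_bad(*pud)))
		return NULL;
	/* otherwise descend to the pmd level ... */
	return NULL;
}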



[PATCH v2 0/3] fix vmalloc_to_page for huge vmap mappings

2019-07-01 Thread Nicholas Piggin
This is a change broken out from the huge vmap vmalloc series as
requested. There is a little bit of dependency juggling across trees,
but the patches are pretty trivial. Ideally, if Andrew accepts this
patch and queues it up for next, the arch patches would be merged
through those trees and then patch 3 gets sent by Andrew.

I've tested this with other powerpc and vmalloc patches, with code
that explicitly tests vmalloc_to_page on vmalloced memory and
results look fine.

v2: change the order of testing pxx_large and pxx_bad, to avoid issues
with arm64

Thanks,
Nick

Nicholas Piggin (3):
  arm64: mm: Add p?d_large() definitions
  powerpc/64s: Add p?d_large definitions
  mm/vmalloc: fix vmalloc_to_page for huge vmap mappings

 arch/arm64/include/asm/pgtable.h |  2 ++
 arch/powerpc/include/asm/book3s/64/pgtable.h | 24 -
 include/asm-generic/4level-fixup.h   |  1 +
 include/asm-generic/5level-fixup.h   |  1 +
 mm/vmalloc.c | 37 +---
 5 files changed, 43 insertions(+), 22 deletions(-)

-- 
2.20.1



Re: [PATCH v4 6/8] KVM: PPC: Ultravisor: Restrict LDBAR access

2019-07-01 Thread Alexey Kardashevskiy



On 01/07/2019 16:17, maddy wrote:
> 
> On 01/07/19 11:24 AM, Alexey Kardashevskiy wrote:
>>
>> On 29/06/2019 06:08, Claudio Carvalho wrote:
>>> When the ultravisor firmware is available, it takes control over the
>>> LDBAR register. In this case, thread-imc updates and save/restore
>>> operations on the LDBAR register are handled by ultravisor.
>> What does LDBAR do? "Power ISA™ Version 3.0 B" or "User’s Manual POWER9
>> Processor" do not tell.
> LDBAR is a per-thread SPR used by the thread-imc pmu to dump the counter
> data into memory.
> LDBAR contains the memory address along with a few other configuration bits
> (it is populated by the thread-imc pmu driver). It is populated and enabled
> only when any of the thread-imc pmu events are monitored.


I was actually looking for a spec for this register, what is the
document name?


> 
> Maddy
>>
>>
>>> Signed-off-by: Claudio Carvalho 
>>> Reviewed-by: Ram Pai 
>>> Reviewed-by: Ryan Grimm 
>>> Acked-by: Madhavan Srinivasan 
>>> Acked-by: Paul Mackerras 
>>> ---
>>>   arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
>>>   arch/powerpc/platforms/powernv/idle.c | 6 --
>>>   arch/powerpc/platforms/powernv/opal-imc.c | 4 
>>>   3 files changed, 10 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>> b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>> index f9b2620fbecd..cffb365d9d02 100644
>>> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>> @@ -375,8 +375,10 @@ BEGIN_FTR_SECTION
>>>   mtspr    SPRN_RPR, r0
>>>   ld    r0, KVM_SPLIT_PMMAR(r6)
>>>   mtspr    SPRN_PMMAR, r0
>>> +BEGIN_FW_FTR_SECTION_NESTED(70)
>>>   ld    r0, KVM_SPLIT_LDBAR(r6)
>>>   mtspr    SPRN_LDBAR, r0
>>> +END_FW_FTR_SECTION_NESTED(FW_FEATURE_ULTRAVISOR, 0, 70)
>>>   isync
>>>   FTR_SECTION_ELSE
>>>   /* On P9 we use the split_info for coordinating LPCR changes */
>>> diff --git a/arch/powerpc/platforms/powernv/idle.c
>>> b/arch/powerpc/platforms/powernv/idle.c
>>> index 77f2e0a4ee37..5593a2d55959 100644
>>> --- a/arch/powerpc/platforms/powernv/idle.c
>>> +++ b/arch/powerpc/platforms/powernv/idle.c
>>> @@ -679,7 +679,8 @@ static unsigned long power9_idle_stop(unsigned
>>> long psscr, bool mmu_on)
>>>   sprs.ptcr    = mfspr(SPRN_PTCR);
>>>   sprs.rpr    = mfspr(SPRN_RPR);
>>>   sprs.tscr    = mfspr(SPRN_TSCR);
>>> -    sprs.ldbar    = mfspr(SPRN_LDBAR);
>>> +    if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
>>> +    sprs.ldbar    = mfspr(SPRN_LDBAR);
>>>     sprs_saved = true;
>>>   @@ -762,7 +763,8 @@ static unsigned long power9_idle_stop(unsigned
>>> long psscr, bool mmu_on)
>>>   mtspr(SPRN_PTCR,    sprs.ptcr);
>>>   mtspr(SPRN_RPR,    sprs.rpr);
>>>   mtspr(SPRN_TSCR,    sprs.tscr);
>>> -    mtspr(SPRN_LDBAR,    sprs.ldbar);
>>> +    if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
>>> +    mtspr(SPRN_LDBAR,    sprs.ldbar);
>>>     if (pls >= pnv_first_tb_loss_level) {
>>>   /* TB loss */
>>> diff --git a/arch/powerpc/platforms/powernv/opal-imc.c
>>> b/arch/powerpc/platforms/powernv/opal-imc.c
>>> index 1b6932890a73..5fe2d4526cbc 100644
>>> --- a/arch/powerpc/platforms/powernv/opal-imc.c
>>> +++ b/arch/powerpc/platforms/powernv/opal-imc.c
>>> @@ -254,6 +254,10 @@ static int opal_imc_counters_probe(struct
>>> platform_device *pdev)
>>>   bool core_imc_reg = false, thread_imc_reg = false;
>>>   u32 type;
>>>   +    /* Disable IMC devices, when Ultravisor is enabled. */
>>> +    if (firmware_has_feature(FW_FEATURE_ULTRAVISOR))
>>> +    return -EACCES;
>>> +
>>>   /*
>>>    * Check whether this is kdump kernel. If yes, force the
>>> engines to
>>>    * stop and return.
>>>
> 

-- 
Alexey


Re: [PATCH v4 6/8] KVM: PPC: Ultravisor: Restrict LDBAR access

2019-07-01 Thread maddy



On 01/07/19 11:24 AM, Alexey Kardashevskiy wrote:


On 29/06/2019 06:08, Claudio Carvalho wrote:

When the ultravisor firmware is available, it takes control over the
LDBAR register. In this case, thread-imc updates and save/restore
operations on the LDBAR register are handled by ultravisor.

What does LDBAR do? "Power ISA™ Version 3.0 B" or "User’s Manual POWER9
Processor" do not tell.
LDBAR is a per-thread SPR used by the thread-imc pmu to dump the counter
data into memory.
LDBAR contains the memory address along with a few other configuration bits
(it is populated by the thread-imc pmu driver). It is populated and enabled
only when any of the thread-imc pmu events are monitored.

Maddy
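
For illustration, the usage described above amounts to something like the
sketch below. The mask and enable values are hypothetical placeholders
(the real layout is in the POWER9 workbook mentioned above); only
mtspr(SPRN_LDBAR, ...) is the real interface.

#include <asm/reg.h>

/* Hypothetical bit definitions, for illustration only. */
#define LDBAR_SKETCH_ENABLE	0x1UL
#define LDBAR_SKETCH_ADDR_MASK	(~0xffffUL)

/* Sketch: thread-imc programs LDBAR only while one of its events is
 * being monitored on this hardware thread. */
static void thread_imc_ldbar_enable_sketch(u64 counter_mem_phys)
{
	u64 ldbar = (counter_mem_phys & LDBAR_SKETCH_ADDR_MASK) |
		    LDBAR_SKETCH_ENABLE;

	mtspr(SPRN_LDBAR, ldbar);
}

static void thread_imc_ldbar_disable_sketch(void)
{
	mtspr(SPRN_LDBAR, 0);	/* cleared when no thread-imc event is active */
}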




Signed-off-by: Claudio Carvalho 
Reviewed-by: Ram Pai 
Reviewed-by: Ryan Grimm 
Acked-by: Madhavan Srinivasan 
Acked-by: Paul Mackerras 
---
  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
  arch/powerpc/platforms/powernv/idle.c | 6 --
  arch/powerpc/platforms/powernv/opal-imc.c | 4 
  3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index f9b2620fbecd..cffb365d9d02 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -375,8 +375,10 @@ BEGIN_FTR_SECTION
mtspr   SPRN_RPR, r0
ld  r0, KVM_SPLIT_PMMAR(r6)
mtspr   SPRN_PMMAR, r0
+BEGIN_FW_FTR_SECTION_NESTED(70)
ld  r0, KVM_SPLIT_LDBAR(r6)
mtspr   SPRN_LDBAR, r0
+END_FW_FTR_SECTION_NESTED(FW_FEATURE_ULTRAVISOR, 0, 70)
isync
  FTR_SECTION_ELSE
/* On P9 we use the split_info for coordinating LPCR changes */
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 77f2e0a4ee37..5593a2d55959 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -679,7 +679,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
sprs.ptcr   = mfspr(SPRN_PTCR);
sprs.rpr= mfspr(SPRN_RPR);
sprs.tscr   = mfspr(SPRN_TSCR);
-   sprs.ldbar  = mfspr(SPRN_LDBAR);
+   if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+   sprs.ldbar  = mfspr(SPRN_LDBAR);
  
  		sprs_saved = true;
  
@@ -762,7 +763,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)

mtspr(SPRN_PTCR,sprs.ptcr);
mtspr(SPRN_RPR, sprs.rpr);
mtspr(SPRN_TSCR,sprs.tscr);
-   mtspr(SPRN_LDBAR,   sprs.ldbar);
+   if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+   mtspr(SPRN_LDBAR,   sprs.ldbar);
  
  	if (pls >= pnv_first_tb_loss_level) {

/* TB loss */
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index 1b6932890a73..5fe2d4526cbc 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -254,6 +254,10 @@ static int opal_imc_counters_probe(struct platform_device 
*pdev)
bool core_imc_reg = false, thread_imc_reg = false;
u32 type;
  
+	/* Disable IMC devices, when Ultravisor is enabled. */

+   if (firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+   return -EACCES;
+
/*
 * Check whether this is kdump kernel. If yes, force the engines to
 * stop and return.