[PATCH v2] powerpc/64s: Make boot look nicer
Radix boot looks like this: - phys_mem_size = 0x2 dcache_bsize = 0x80 icache_bsize = 0x80 cpu_features = 0xc06f8f5fb1a7 possible= 0xfbffcf5fb1a7 always = 0x0003800081a1 cpu_user_features = 0xdc0065c2 0xaee0 mmu_features = 0xbc006041 firmware_features = 0x1000 hash-mmu: ppc64_pft_size= 0x0 hash-mmu: kernel vmalloc start = 0xc008 hash-mmu: kernel IO start= 0xc00a hash-mmu: kernel vmemmap start = 0xc00c - It's misleading to see hash-mmu there. After fix: - phys_mem_size = 0x2 dcache_bsize = 0x80 icache_bsize = 0x80 cpu_features = 0xc06f8f5fb1a7 possible= 0xfbffcf5fb1a7 always = 0x0003800081a1 cpu_user_features = 0xdc0065c2 0xaee0 mmu_features = 0xbc006041 firmware_features = 0x1000 vmalloc start = 0xc008 IO start = 0xc00a vmemmap start = 0xc00c - Hash has the same text misalignment problem which is also corrected. Signed-off-by: Nicholas Piggin --- v2: avoid adding an ifdef to generic code. arch/powerpc/mm/book3s64/hash_utils.c | 21 +++-- 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 28ced26f2a00..54f2d6acf6da 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1946,11 +1946,20 @@ machine_device_initcall(pseries, hash64_debugfs); void __init print_system_hash_info(void) { - pr_info("ppc64_pft_size= 0x%llx\n", ppc64_pft_size); +#undef pr_fmt +#define pr_fmt(fmt) fmt - if (htab_hash_mask) - pr_info("htab_hash_mask= 0x%lx\n", htab_hash_mask); - pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START); - pr_info("kernel IO start= 0x%lx\n", KERN_IO_START); - pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap); + if (!early_radix_enabled()) { + pr_info("ppc64_pft_size= 0x%llx\n", ppc64_pft_size); + + if (htab_hash_mask) + pr_info("htab_hash_mask= 0x%lx\n", htab_hash_mask); + } + + pr_info("vmalloc start = 0x%lx\n", KERN_VIRT_START); + pr_info("IO start = 0x%lx\n", KERN_IO_START); + pr_info("vmemmap start = 
0x%lx\n", (unsigned long)vmemmap); + +#undef pr_fmt +#define pr_fmt(fmt) "hash-mmu: " fmt } -- 2.20.1
[PATCH v2 2/2] Powerpc/Watchpoint: Rewrite ptrace-hwbreak.c selftest
ptrace-hwbreak.c selftest is logically broken. On powerpc, when watchpoint is created with ptrace, signals are generated before executing the instruction and user has to manually singlestep the instruction with watchpoint disabled, which selftest never does and thus it keeps on getting the signal at the same instruction. If we fix it, selftest fails because the logical connection between tracer(parent) and tracee(child) is also broken. Rewrite the selftest and add new tests for unaligned access. With patch: $ ./tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak test: ptrace-hwbreak tags: git_version:v5.2-rc2-33-ga247a75f90a9-dirty PTRACE_SET_DEBUGREG, WO, len: 1: Ok PTRACE_SET_DEBUGREG, WO, len: 2: Ok PTRACE_SET_DEBUGREG, WO, len: 4: Ok PTRACE_SET_DEBUGREG, WO, len: 8: Ok PTRACE_SET_DEBUGREG, RO, len: 1: Ok PTRACE_SET_DEBUGREG, RO, len: 2: Ok PTRACE_SET_DEBUGREG, RO, len: 4: Ok PTRACE_SET_DEBUGREG, RO, len: 8: Ok PTRACE_SET_DEBUGREG, RW, len: 1: Ok PTRACE_SET_DEBUGREG, RW, len: 2: Ok PTRACE_SET_DEBUGREG, RW, len: 4: Ok PTRACE_SET_DEBUGREG, RW, len: 8: Ok PPC_PTRACE_SETHWDEBUG, MODE_EXACT, WO, len: 1: Ok PPC_PTRACE_SETHWDEBUG, MODE_EXACT, RO, len: 1: Ok PPC_PTRACE_SETHWDEBUG, MODE_EXACT, RW, len: 1: Ok PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, WO, len: 6: Ok PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, RO, len: 6: Ok PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, RW, len: 6: Ok PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, WO, len: 6: Ok PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, RO, len: 6: Ok PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, RW, len: 6: Ok PPC_PTRACE_SETHWDEBUG, DAWR_MAX_LEN, RW, len: 512: Ok success: ptrace-hwbreak Signed-off-by: Ravi Bangoria --- .../selftests/powerpc/ptrace/ptrace-hwbreak.c | 535 +++--- 1 file changed, 325 insertions(+), 210 deletions(-) diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c index 3066d310f32b..fb1e05d7f77c 100644 --- 
a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c @@ -22,318 +22,433 @@ #include #include "ptrace.h" -/* Breakpoint access modes */ -enum { - BP_X = 1, - BP_RW = 2, - BP_W = 4, -}; - -static pid_t child_pid; -static struct ppc_debug_info dbginfo; - -static void get_dbginfo(void) -{ - int ret; - - ret = ptrace(PPC_PTRACE_GETHWDBGINFO, child_pid, NULL, ); - if (ret) { - perror("Can't get breakpoint info\n"); - exit(-1); - } -} - -static bool hwbreak_present(void) -{ - return (dbginfo.num_data_bps != 0); -} +/* + * Use volatile on all global var so that compiler doesn't + * optimise their load/stores. Otherwise selftest can fail. + */ +static volatile __u64 glvar; -static bool dawr_present(void) -{ - return !!(dbginfo.features & PPC_DEBUG_FEATURE_DATA_BP_DAWR); -} +#define DAWR_MAX_LEN 512 +static volatile __u8 big_var[DAWR_MAX_LEN] __attribute__((aligned(512))); -static void set_breakpoint_addr(void *addr) -{ - int ret; +#define A_LEN 6 +#define B_LEN 6 +struct gstruct { + __u8 a[A_LEN]; /* double word aligned */ + __u8 b[B_LEN]; /* double word unaligned */ +}; +static volatile struct gstruct gstruct __attribute__((aligned(512))); - ret = ptrace(PTRACE_SET_DEBUGREG, child_pid, 0, addr); - if (ret) { - perror("Can't set breakpoint addr\n"); - exit(-1); - } -} -static int set_hwbreakpoint_addr(void *addr, int range) +static void get_dbginfo(pid_t child_pid, struct ppc_debug_info *dbginfo) { - int ret; - - struct ppc_hw_breakpoint info; - - info.version = 1; - info.trigger_type = PPC_BREAKPOINT_TRIGGER_RW; - info.addr_mode = PPC_BREAKPOINT_MODE_EXACT; - if (range > 0) - info.addr_mode = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE; - info.condition_mode = PPC_BREAKPOINT_CONDITION_NONE; - info.addr = (__u64)addr; - info.addr2 = (__u64)addr + range; - info.condition_value = 0; - - ret = ptrace(PPC_PTRACE_SETHWDEBUG, child_pid, 0, ); - if (ret < 0) { - perror("Can't set breakpoint\n"); + if 
(ptrace(PPC_PTRACE_GETHWDBGINFO, child_pid, NULL, dbginfo)) { + perror("Can't get breakpoint info"); exit(-1); } - return ret; } -static int del_hwbreakpoint_addr(int watchpoint_handle) +static bool dawr_present(struct ppc_debug_info *dbginfo) { - int ret; - - ret = ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, watchpoint_handle); - if (ret < 0) { - perror("Can't delete hw breakpoint\n"); - exit(-1); - } - return ret; + return !!(dbginfo->features & PPC_DEBUG_FEATURE_DATA_BP_DAWR); } -#define DAWR_LENGTH_MAX 512 - -/* Dummy variables to test read/write accesses */
[PATCH v2 1/2] Powerpc/Watchpoint: Fix length calculation for unaligned target
Watchpoint match range is always doubleword(8 bytes) aligned on powerpc. If the given range is crossing doubleword boundary, we need to increase the length such that next doubleword also get covered. Ex, address len = 6 bytes |=. |v--|--v| | | | | | | | | | | | | | | | | | |---|---| <---8 bytes---> In such case, current code configures hw as: start_addr = address & ~HW_BREAKPOINT_ALIGN len = 8 bytes And thus read/write in last 4 bytes of the given range is ignored. Fix this by including next doubleword in the length. Watchpoint exception handler already ignores extraneous exceptions, so no changes required for that. Signed-off-by: Ravi Bangoria --- v1->v2: - v1: https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-June/192162.html - No cosmetic patches. - v1 fixed unaligned issue in core hw-breakpoint code only. But ptrace was still messing up with address/len. Fix that in v2. arch/powerpc/include/asm/debug.h | 1 + arch/powerpc/include/asm/hw_breakpoint.h | 9 - arch/powerpc/kernel/hw_breakpoint.c | 26 +++- arch/powerpc/kernel/process.c| 50 +++- arch/powerpc/kernel/ptrace.c | 37 ++ arch/powerpc/xmon/xmon.c | 3 +- 6 files changed, 83 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h index 7756026b95ca..9c1b4aaa374b 100644 --- a/arch/powerpc/include/asm/debug.h +++ b/arch/powerpc/include/asm/debug.h @@ -45,6 +45,7 @@ static inline int debugger_break_match(struct pt_regs *regs) { return 0; } static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; } #endif +int hw_breakpoint_validate_len(struct arch_hw_breakpoint *hw); void __set_breakpoint(struct arch_hw_breakpoint *brk); bool ppc_breakpoint_available(void); #ifdef CONFIG_PPC_ADV_DEBUG_REGS diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h index 0fe8c1e46bbc..ca2877be7f79 100644 --- a/arch/powerpc/include/asm/hw_breakpoint.h +++ b/arch/powerpc/include/asm/hw_breakpoint.h @@ -28,6 +28,7 @@ 
struct arch_hw_breakpoint { unsigned long address; u16 type; u16 len; /* length of the target data symbol */ + u16 hw_len; /* length programmed in hw */ }; /* Note: Don't change the the first 6 bits below as they are in the same order @@ -47,6 +48,11 @@ struct arch_hw_breakpoint { #define HW_BRK_TYPE_PRIV_ALL (HW_BRK_TYPE_USER | HW_BRK_TYPE_KERNEL | \ HW_BRK_TYPE_HYP) +#define HW_BREAKPOINT_ALIGN 0x7 + +#define DABR_MAX_LEN 8 +#define DAWR_MAX_LEN 512 + #ifdef CONFIG_HAVE_HW_BREAKPOINT #include #include @@ -58,8 +64,6 @@ struct pmu; struct perf_sample_data; struct task_struct; -#define HW_BREAKPOINT_ALIGN 0x7 - extern int hw_breakpoint_slots(int type); extern int arch_bp_generic_fields(int type, int *gen_bp_type); extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); @@ -84,6 +88,7 @@ static inline void hw_breakpoint_disable(void) brk.address = 0; brk.type = 0; brk.len = 0; + brk.hw_len = 0; if (ppc_breakpoint_available()) __set_breakpoint(); } diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index da307dd93ee3..a61a1266089a 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -147,9 +147,9 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { - int ret = -EINVAL, length_max; + int ret = -EINVAL; - if (!bp) + if (!bp || !attr->bp_len) return ret; hw->type = HW_BRK_TYPE_TRANSLATE; @@ -169,26 +169,10 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, hw->address = attr->bp_addr; hw->len = attr->bp_len; - /* -* Since breakpoint length can be a maximum of HW_BREAKPOINT_LEN(8) -* and breakpoint addresses are aligned to nearest double-word -* HW_BREAKPOINT_ALIGN by rounding off to the lower address, the -* 'symbolsize' should satisfy the check below. 
-*/ if (!ppc_breakpoint_available()) return -ENODEV; - length_max = 8; /* DABR */ - if (dawr_enabled()) { - length_max = 512 ; /* 64 doublewords */ - /* DAWR region can't cross 512 boundary */ - if ((attr->bp_addr >> 9) != - ((attr->bp_addr + attr->bp_len - 1) >> 9)) - return -EINVAL; - } - if (hw->len > - (length_max - (hw->address & HW_BREAKPOINT_ALIGN))) - return -EINVAL; - return 0; + +
[v2 12/12] powerpc/64s: Save r13 in machine_check_common_early
From: Reza Arbab Testing my memcpy_mcsafe() work in progress with an injected UE, I get an error like this immediately after the function returns: BUG: Unable to handle kernel data access at 0x7fff84dec8f8 Faulting instruction address: 0xc008009c00b0 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: mce(O+) vmx_crypto crc32c_vpmsum CPU: 0 PID: 1375 Comm: modprobe Tainted: G O 5.1.0-rc6 #267 NIP: c008009c00b0 LR: c008009c00a8 CTR: c0095f90 REGS: c000ee197790 TRAP: 0300 Tainted: G O (5.1.0-rc6) MSR: 9280b033 CR: 88002826 XER: 0004 CFAR: c0095f8c DAR: 7fff84dec8f8 DSISR: 4000 IRQMASK: 0 GPR00: 6c6c6568 c000ee197a20 c008009c8400 fff2 GPR04: c008009c02e0 0006 c3c834c8 GPR08: 0080 776a6681b7fb5100 c008009c01c8 GPR12: c0095f90 7fff84debc00 4d071440 GPR16: 00010601 c008009e c0c98dd8 c0c98d98 GPR20: c3bba970 c008009c04d0 c008009c0618 c01e5820 GPR24: 0100 0001 c3bba958 GPR28: c008009c02e8 c008009c0318 c008009c02e0 NIP [c008009c00b0] cause_ue+0xa8/0xe8 [mce] LR [c008009c00a8] cause_ue+0xa0/0xe8 [mce] To fix, ensure that r13 is properly restored after an MCE. This commit is needed for testing this series, this is a possible simulator bug. --- arch/powerpc/kernel/exceptions-64s.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 311f1392a2ec..932d8d05892c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -265,6 +265,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) TRAMP_REAL_BEGIN(machine_check_common_early) + SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) /* * Register contents: -- 2.20.1
[v2 11/12] powerpc: add machine check safe copy_to_user
Use memcpy_mcsafe() implementation to define copy_to_user_mcsafe() Signed-off-by: Santosh Sivaraj --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/uaccess.h | 12 2 files changed, 13 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8c1c636308c8..a173b392c272 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -134,6 +134,7 @@ config PPC select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION) select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE if PPC64 + select ARCH_HAS_UACCESS_MCSAFE if PPC64 select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 76f34346b642..f8fcaab4c5bc 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -386,6 +386,18 @@ static inline unsigned long raw_copy_to_user(void __user *to, return ret; } +static __always_inline unsigned long __must_check +copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n) +{ + if (likely(check_copy_size(from, n, true))) { + allow_write_to_user(to, n); + n = memcpy_mcsafe(to, from, n); + prevent_write_to_user(to, n); + } + + return n; +} + extern unsigned long __clear_user(void __user *addr, unsigned long size); static inline unsigned long clear_user(void __user *addr, unsigned long size) -- 2.20.1
[v2 10/12] powerpc/memcpy_mcsafe: return remaining bytes
memcpy_mcsafe currently returns -EFAULT on a machine check exception; change it to return the remaining bytes that need to be copied,
r18,96(r3) -err2; std r19,104(r3) -err2; std r20,112(r3) -err2; std r21,120(r3) +err2; std r8,16(r3) +err2; std r9,24(r3) +err2; std r10,32(r3) +err2; std r11,40(r3) +err2; std r12,48(r3) +err2; std r14,56(r3) +err2; std r15,64(r3) +err2; std r16,72(r3) +err2; std r17,80(r3) +err2; std r18,88(r3) +err2; std r19,96(r3) +err2; std r20,104(r3) +err2; std r21,112(r3) +err2; std r22,120(r3) addir3,r3,128 + subir7,r7,128 bdnz4b clrldi r5,r5,(64-7) - ld r14,STK_REG(R14)(r1) + /* Up to 127B to go */ +5: srdir6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r8,16(r4) +err2; ld r9,24(r4) +err2; ld r10,32(r4) +err2; ld r11,40(r4) +err2; ld r12,48(r4) +err2; ld r14,56(r4) + addir4,r4,64 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r8,16(r3) +err2; std r9,24(r3) +err2; std r10,32(r3) +err2; std r11,40(r3) +err2; std r12,48(r3) +err2; std r14,56(r3) + addir3,r3,64 + subir7,r7,64 + +7: ld r14,STK_REG(R14)(r1) ld r15,STK_REG(R15)(r1) ld r16,STK_REG(R16)(r1) ld r17,STK_REG(R17)(r1) @@ -134,42 +164,19 @@ err2; std r21,120(r3) ld r22,STK_REG(R22)(r1) addir1,r1,STACKFRAMESIZE - /* Up to 127B to go */ -5: srdir6,r5,4 - mtocrf 0x01,r6 - -6: bf cr7*4+1,7f -err1; ld r0,0(r4) -err1; ld r6,8(r4) -err1; ld r7,16(r4) -err1; ld r8,24(r4) -err1; ld r9,32(r4) -err1; ld r10,40(r4) -err1; ld r11,48(r4) -err1; ld r12,56(r4) - addir4,r4,64 -err1; std r0,0(r3) -err1; std r6,8(r3) -err1; std r7,16(r3) -err1; std r8,24(r3) -err1; std r9,32(r3) -err1; std r10,40(r3) -err1; std r11,48(r3) -err1; std r12,56(r3) - addir3,r3,64 - /* Up to 63B to go */ -7: bf cr7*4+2,8f + bf cr7*4+2,8f err1; ld r0,0(r4) err1; ld r6,8(r4) -err1; ld r7,16(r4) -err1; ld r8,24(r4) +err1; ld r8,16(r4) +err1; ld r9,24(r4) addir4,r4,32 err1; std r0,0(r3) err1; std r6,8(r3) -err1; std r7,16(r3)
[v2 09/12] powerpc/mce: Enable MCE notifiers in external modules
From: Reza Arbab Signed-off-by: Reza Arbab --- arch/powerpc/kernel/exceptions-64s.S | 6 ++ arch/powerpc/kernel/mce.c| 2 ++ 2 files changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index c83e38a403fd..311f1392a2ec 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -458,6 +458,12 @@ EXC_COMMON_BEGIN(machine_check_handle_early) bl machine_check_early std r3,RESULT(r1) /* Save result */ + /* Notifiers may be in a module, so enable virtual addressing. */ + mfmsr r11 + ori r11,r11,MSR_IR + ori r11,r11,MSR_DR + mtmsr r11 + addir3,r1,STACK_FRAME_OVERHEAD bl machine_check_notify ld r11,RESULT(r1) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index a8348a9bea5b..9e4d497837d8 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -50,11 +50,13 @@ int mce_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(_notifier_list, nb); } +EXPORT_SYMBOL_GPL(mce_register_notifier); int mce_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(_notifier_list, nb); } +EXPORT_SYMBOL_GPL(mce_unregister_notifier); static int check_memcpy_mcsafe(struct notifier_block *nb, unsigned long val, void *data) -- 2.20.1
[v2 08/12] powerpc/mce: Handle memcpy_mcsafe()
From: Reza Arbab Add an mce notifier intended to service memcpy_mcsafe(). The notifier uses this heuristic; if a UE occurs when accessing device memory, and the faulting instruction had a fixup entry, the callback will return NOTIFY_STOP. This causes the notification mechanism to consider the MCE handled and continue execution at the fixup address, which returns -EFAULT from the memcpy_mcsafe() call. Signed-off-by: Reza Arbab --- arch/powerpc/kernel/mce.c | 34 ++ 1 file changed, 34 insertions(+) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 0233c0ee45ab..a8348a9bea5b 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -56,6 +56,40 @@ int mce_unregister_notifier(struct notifier_block *nb) return blocking_notifier_chain_unregister(_notifier_list, nb); } +static int check_memcpy_mcsafe(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct machine_check_event *evt = data; + unsigned long pfn; + struct page *page; + + if (evt->error_type != MCE_ERROR_TYPE_UE || + !evt->u.ue_error.physical_address_provided) + return NOTIFY_DONE; + + pfn = evt->u.ue_error.physical_address >> PAGE_SHIFT; + page = pfn_to_page(pfn); + if (!page) + return NOTIFY_DONE; + + /* HMM and PMEM */ + if (is_zone_device_page(page) && evt->u.ue_error.fixup_address_provided) + return NOTIFY_STOP; + + return NOTIFY_DONE; +} + +static struct notifier_block memcpy_mcsafe_nb = { + .notifier_call = check_memcpy_mcsafe +}; + +static int __init mce_mcsafe_register(void) +{ + mce_register_notifier(_mcsafe_nb); + return 0; +} +arch_initcall(mce_mcsafe_register); + static void mce_set_error_info(struct machine_check_event *mce, struct mce_error_info *mce_err) { -- 2.20.1
[v2 07/12] powerpc/memcpy: Add memcpy_mcsafe for pmem
From: Balbir Singh The pmem infrastructure uses memcpy_mcsafe in the pmem layer so as to convert machine check exceptions into a return value on failure in case a machine check exception is encountered during the memcpy. This patch largely borrows from the copyuser_power7 logic and does not add the VMX optimizations, largely to keep the patch simple. If needed those optimizations can be folded in. Signed-off-by: Balbir Singh Acked-by: Nicholas Piggin [ar...@linux.ibm.com: Added symbol export] --- arch/powerpc/include/asm/string.h | 2 + arch/powerpc/lib/Makefile | 2 +- arch/powerpc/lib/memcpy_mcsafe_64.S | 215 3 files changed, 218 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index 9bf6dffb4090..b72692702f35 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t n); #ifndef CONFIG_KASAN #define __HAVE_ARCH_MEMSET32 #define __HAVE_ARCH_MEMSET64 +#define __HAVE_ARCH_MEMCPY_MCSAFE +extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz); extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t); extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t); diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index c55f9c27bf79..529d6536eb4a 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \ memcpy_power7.o obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ - memcpy_64.o pmem.o + memcpy_64.o pmem.o memcpy_mcsafe_64.o obj64-$(CONFIG_SMP)+= locks.o obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S b/arch/powerpc/lib/memcpy_mcsafe_64.S new file mode 100644 index ..50f865db0338 --- 
/dev/null +++ b/arch/powerpc/lib/memcpy_mcsafe_64.S @@ -0,0 +1,215 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) IBM Corporation, 2011 + * Derived from copyuser_power7.s by Anton Blanchard + * Author - Balbir Singh + */ +#include +#include +#include + + .macro err1 +100: + EX_TABLE(100b,.Ldo_err1) + .endm + + .macro err2 +200: + EX_TABLE(200b,.Ldo_err2) + .endm + +.Ldo_err2: + ld r22,STK_REG(R22)(r1) + ld r21,STK_REG(R21)(r1) + ld r20,STK_REG(R20)(r1) + ld r19,STK_REG(R19)(r1) + ld r18,STK_REG(R18)(r1) + ld r17,STK_REG(R17)(r1) + ld r16,STK_REG(R16)(r1) + ld r15,STK_REG(R15)(r1) + ld r14,STK_REG(R14)(r1) + addir1,r1,STACKFRAMESIZE +.Ldo_err1: + li r3,-EFAULT + blr + + +_GLOBAL(memcpy_mcsafe) + cmpldi r5,16 + blt .Lshort_copy + +.Lcopy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f +err1; lbz r0,0(r4) + addir4,r4,1 +err1; stb r0,0(r3) + addir3,r3,1 + +1: bf cr7*4+2,2f +err1; lhz r0,0(r4) + addir4,r4,2 +err1; sth r0,0(r3) + addir3,r3,2 + +2: bf cr7*4+1,3f +err1; lwz r0,0(r4) + addir4,r4,4 +err1; stw r0,0(r3) + addir3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflrr0 + stdur1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdir6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. 
*/ + .align 5 +4: +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r7,16(r4) +err2; ld r8,24(r4) +err2; ld r9,32(r4) +err2; ld r10,40(r4) +err2; ld r11,48(r4) +err2; ld r12,56(r4) +err2; ld r14,64(r4) +err2; ld r15,72(r4) +err2; ld r16,80(r4) +err2; ld r17,88(r4) +err2; ld r18,96(r4) +err2; ld r19,104(r4) +err2; ld r20,112(r4) +err2; ld r21,120(r4) + addir4,r4,128 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r7,16(r3) +err2; std r8,24(r3) +err2; std r9,32(r3) +err2; std r10,40(r3) +err2; std r11,48(r3) +err2; std r12,56(r3) +err2; std r14,64(r3) +err2; std r15,72(r3) +err2; std r16,80(r3) +err2; std
[v2 06/12] powerpc/mce: Add fixup address to UE events
From: Reza Arbab If the instruction causing a UE has an exception table entry with fixup address, save it in the machine_check_event struct. If a machine check notifier callback returns NOTIFY_STOP to indicate it has handled the error, set nip to continue execution from the fixup address. Signed-off-by: Reza Arbab --- arch/powerpc/include/asm/mce.h | 5 +++-- arch/powerpc/kernel/mce.c | 16 +++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 240dd1fdfe35..9d9661747adf 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -122,11 +122,12 @@ struct machine_check_event { enum MCE_UeErrorType ue_error_type:8; u8 effective_address_provided; u8 physical_address_provided; + u8 fixup_address_provided; u8 process_event; - u8 reserved_1[4]; + u8 reserved_1[3]; u64 effective_address; u64 physical_address; - u8 reserved_2[8]; + u64 fixup_address; } ue_error; struct { diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 4a37928ab30e..0233c0ee45ab 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -15,10 +15,12 @@ #include #include #include +#include #include #include #include +#include static DEFINE_PER_CPU(int, mce_nest_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); @@ -151,6 +153,8 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->u.link_error.effective_address_provided = true; mce->u.link_error.effective_address = addr; } else if (mce->error_type == MCE_ERROR_TYPE_UE) { + const struct exception_table_entry *entry; + mce->u.ue_error.effective_address_provided = true; mce->u.ue_error.effective_address = addr; if (phys_addr != ULONG_MAX) { @@ -158,6 +162,12 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->u.ue_error.physical_address = phys_addr; } + entry = search_exception_tables(regs->nip); + if (entry) { + mce->u.ue_error.fixup_address_provided = true; + 
mce->u.ue_error.fixup_address = extable_fixup(entry); + } + mce->u.ue_error.process_event = true; } return; @@ -666,8 +676,12 @@ long machine_check_notify(struct pt_regs *regs) rc = blocking_notifier_call_chain(_notifier_list, 0, evt); if (rc & NOTIFY_STOP_MASK) { - if (evt->error_type == MCE_ERROR_TYPE_UE) + if (evt->error_type == MCE_ERROR_TYPE_UE) { + if (evt->u.ue_error.fixup_address_provided) + regs->nip = evt->u.ue_error.fixup_address; + evt->u.ue_error.process_event = false; + } if ((rc & NOTIFY_STOP_MASK) && (regs->msr & MSR_RI)) evt->disposition = MCE_DISPOSITION_RECOVERED; -- 2.20.1
[v2 05/12] powerpc/mce: Allow notifier callback to handle MCE
From: Reza Arbab If a notifier returns NOTIFY_STOP, consider the MCE handled, just as we do when machine_check_early() returns 1. Signed-off-by: Reza Arbab --- arch/powerpc/include/asm/asm-prototypes.h | 2 +- arch/powerpc/include/asm/mce.h| 3 +- arch/powerpc/kernel/exceptions-64s.S | 3 ++ arch/powerpc/kernel/mce.c | 37 ++- 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index f66f26ef3ce0..49ee8f08de2a 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -72,7 +72,7 @@ void machine_check_exception(struct pt_regs *regs); void emulation_assist_interrupt(struct pt_regs *regs); long do_slb_fault(struct pt_regs *regs, unsigned long ea); void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err); -void machine_check_notify(struct pt_regs *regs); +long machine_check_notify(struct pt_regs *regs); /* signals, syscalls and interrupts */ long sys_swapcontext(struct ucontext __user *old_ctx, diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 948bef579086..240dd1fdfe35 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -122,7 +122,8 @@ struct machine_check_event { enum MCE_UeErrorType ue_error_type:8; u8 effective_address_provided; u8 physical_address_provided; - u8 reserved_1[5]; + u8 process_event; + u8 reserved_1[4]; u64 effective_address; u64 physical_address; u8 reserved_2[8]; diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2e56014fca21..c83e38a403fd 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -460,6 +460,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early) addir3,r1,STACK_FRAME_OVERHEAD bl machine_check_notify + ld r11,RESULT(r1) + or r3,r3,r11 + std r3,RESULT(r1) ld r12,_MSR(r1) BEGIN_FTR_SECTION diff --git a/arch/powerpc/kernel/mce.c 
b/arch/powerpc/kernel/mce.c index 0ab171b41ede..4a37928ab30e 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -157,6 +157,8 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->u.ue_error.physical_address_provided = true; mce->u.ue_error.physical_address = phys_addr; } + + mce->u.ue_error.process_event = true; } return; } @@ -241,6 +243,10 @@ void machine_check_queue_event(void) if (!get_mce_event(, MCE_EVENT_RELEASE)) return; + if (evt.error_type == MCE_ERROR_TYPE_UE && + !evt.u.ue_error.process_event) + return; + index = __this_cpu_inc_return(mce_queue_count) - 1; /* If queue is full, just return for now. */ if (index >= MAX_MC_EVT) { @@ -647,16 +653,31 @@ long hmi_exception_realmode(struct pt_regs *regs) return 1; } -void machine_check_notify(struct pt_regs *regs) +long machine_check_notify(struct pt_regs *regs) { - struct machine_check_event evt; + int index = __this_cpu_read(mce_nest_count) - 1; + struct machine_check_event *evt; + int rc; - if (!get_mce_event(, MCE_EVENT_DONTRELEASE)) - return; + if (index < 0 || index >= MAX_MC_EVT) + return 0; - blocking_notifier_call_chain(_notifier_list, 0, ); + evt = this_cpu_ptr(_event[index]); - if (evt.error_type == MCE_ERROR_TYPE_UE && - evt.u.ue_error.physical_address_provided) - machine_check_ue_event(); + rc = blocking_notifier_call_chain(_notifier_list, 0, evt); + if (rc & NOTIFY_STOP_MASK) { + if (evt->error_type == MCE_ERROR_TYPE_UE) + evt->u.ue_error.process_event = false; + + if ((rc & NOTIFY_STOP_MASK) && (regs->msr & MSR_RI)) + evt->disposition = MCE_DISPOSITION_RECOVERED; + + return 1; + } + + if (evt->error_type == MCE_ERROR_TYPE_UE && + evt->u.ue_error.physical_address_provided) + machine_check_ue_event(evt); + + return 0; } -- 2.20.1
[v2 01/12] powerpc/mce: Make machine_check_ue_event() static
From: Reza Arbab The function doesn't get used outside this file, so make it static. Signed-off-by: Reza Arbab --- arch/powerpc/kernel/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index b18df633eae9..e78c4f18ea0a 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -33,7 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_ue_event_queue); static void machine_check_process_queued_event(struct irq_work *work); -void machine_check_ue_event(struct machine_check_event *evt); +static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); static struct irq_work mce_event_process_work = { @@ -203,7 +203,7 @@ void release_mce_event(void) /* * Queue up the MCE event which then can be handled later. */ -void machine_check_ue_event(struct machine_check_event *evt) +static void machine_check_ue_event(struct machine_check_event *evt) { int index; -- 2.20.1
Re: [PATCH] mm/nvdimm: Add is_ioremap_addr and use that to check ioremap address
Andrew Morton writes: > On Mon, 1 Jul 2019 19:10:38 +0530 "Aneesh Kumar K.V" > wrote: > >> Architectures like powerpc use different address range to map ioremap >> and vmalloc range. The memunmap() check used by the nvdimm layer was >> wrongly using is_vmalloc_addr() to check for ioremap range which fails for >> ppc64. This result in ppc64 not freeing the ioremap mapping. The side effect >> of this is an unbind failure during module unload with papr_scm nvdimm driver > > The patch applies to 5.1. Does it need a Fixes: and a Cc:stable? Actually, we want it to be backported to an older kernel possibly one that added papr-scm driver, b5beae5e224f ("powerpc/pseries: Add driver for PAPR SCM regions"). But that doesn't apply easily. It does apply without conflicts to 5.0 -aneesh
Re: Re: [PATCH 1/3] arm64: mm: Add p?d_large() definitions
Will Deacon's on July 1, 2019 8:15 pm: > On Mon, Jul 01, 2019 at 11:03:51AM +0100, Steven Price wrote: >> On 01/07/2019 10:27, Will Deacon wrote: >> > On Sun, Jun 23, 2019 at 07:44:44PM +1000, Nicholas Piggin wrote: >> >> walk_page_range() is going to be allowed to walk page tables other than >> >> those of user space. For this it needs to know when it has reached a >> >> 'leaf' entry in the page tables. This information will be provided by the >> >> p?d_large() functions/macros. >> > >> > I can't remember whether or not I asked this before, but why not call >> > this macro p?d_leaf() if that's what it's identifying? "Large" and "huge" >> > are usually synonymous, so I find this naming needlessly confusing based >> > on this patch in isolation. Those page table macro names are horrible. Large, huge, leaf, wtf? They could do with a sensible renaming. But this series just follows naming that's alreay there on x86. Thanks, Nick
Re: [PATCH v2 1/3] arm64: mm: Add p?d_large() definitions
Steven Price's on July 1, 2019 7:57 pm: > On 01/07/2019 07:40, Nicholas Piggin wrote: >> walk_page_range() is going to be allowed to walk page tables other than >> those of user space. For this it needs to know when it has reached a >> 'leaf' entry in the page tables. This information will be provided by the >> p?d_large() functions/macros. >> >> For arm64, we already have p?d_sect() macros which we can reuse for >> p?d_large(). >> >> pud_sect() is defined as a dummy function when CONFIG_PGTABLE_LEVELS < 3 >> or CONFIG_ARM64_64K_PAGES is defined. However when the kernel is >> configured this way then architecturally it isn't allowed to have a >> large page that this level, and any code using these page walking macros >> is implicitly relying on the page size/number of levels being the same as >> the kernel. So it is safe to reuse this for p?d_large() as it is an >> architectural restriction. >> >> Cc: Catalin Marinas >> Cc: Will Deacon >> Signed-off-by: Steven Price > > Hi Nicolas, > > This appears to my patch which I originally posted as part of converting > x86/arm64 to use a generic page walk code[1]. Hey, yeah it is, I'd intended to mark you as the author but must have forgot to change it in git. > I'm not sure that this > patch makes much sense on its own, in particular it was working up to > having a generic macro[2] which means the _large() macros could be used > across all architectures. It goes with this series which makes _large macros usable for archs that define HUGE_VMAP. I posted the same thing earlier and Anshuman noted you'd done it too so I deferred to yours (I thought it would go via arm64 tree and that this would just allow Andrew to easily reconcile the merge). If your series is not going upstream this time then the changelog probably doesn't make so much sense, so I could just send my version to the arm64 tree. Thanks, Nick
Re: ["RFC PATCH" 1/2] powerpc/mm: Fix node look up with numa=off boot
On 7/1/19 10:12 PM, Nathan Lynch wrote: "Aneesh Kumar K.V" writes: I guess we should have here. modified arch/powerpc/mm/numa.c @@ -416,12 +416,11 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa) static int of_drconf_to_nid_single(struct drmem_lmb *lmb) { struct assoc_arrays aa = { .arrays = NULL }; - /* is that correct? */ int default_nid = 0; int nid = default_nid; int rc, index; - if (!numa_enabled) + if ((min_common_depth < 0) || !numa_enabled) return NUMA_NO_NODE; rc = of_get_assoc_arrays(&aa); Nathan, Can you check this? Looks like it would do the right thing. Just checking: do people still need numa=off? Seems like it's a maintenance burden :-) That is used in kdump kernel. -aneesh
Re: Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware?
Christophe Leroy writes: > Le 01/07/2019 à 15:39, Turritopsis Dohrnii Teo En Ming a écrit : >> Good evening from Singapore, > > Good evening afternoon from Paris, > >> >> Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware, >> for example, AMD Ryzen 9 3950X, with 16 CPU cores and 32 threads? > > Yes you can > >> >> Is it called cross-compiling? > > Yes it is, you can get cross compilers at > https://mirrors.edge.kernel.org/pub/tools/crosstool/ There's also some info here: https://github.com/linuxppc/wiki/wiki/Building-powerpc-kernels cheers
Re: [PATCH] powerpc/rtas: retry when cpu offline races with suspend/migration
Nathan Lynch writes: > Hi Michael, > > Nathan Lynch writes: >> The protocol for suspending or migrating an LPAR requires all present >> processor threads to enter H_JOIN. So if we have threads offline, we >> have to temporarily bring them up. This can race with administrator >> actions such as SMT state changes. As of dfd718a2ed1f ("powerpc/rtas: >> Fix a potential race between CPU-Offline & Migration"), > > snowpatch/checkpatch flagged an error in my commit message here: > > ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit > <12+ chars of sha1> ("")' - ie: 'commit dfd718a2ed1f > ("powerpc/rtas: Fix a potential race between CPU-Offline & > Migration")' > > I see this is in your next-test branch though. Should I fix the commit > message and resend? No that's fine. You have a Fixes tag which is formatted correctly and that's what matters IMO. Ideally we could control that check to not complain about it being split across lines when there's a fixes tag as well. cheers
Re: [PATCH] mm/nvdimm: Add is_ioremap_addr and use that to check ioremap address
On Mon, 1 Jul 2019 19:10:38 +0530 "Aneesh Kumar K.V" wrote: > Architectures like powerpc use different address range to map ioremap > and vmalloc range. The memunmap() check used by the nvdimm layer was > wrongly using is_vmalloc_addr() to check for ioremap range which fails for > ppc64. This result in ppc64 not freeing the ioremap mapping. The side effect > of this is an unbind failure during module unload with papr_scm nvdimm driver The patch applies to 5.1. Does it need a Fixes: and a Cc:stable?
Re: [PATCH] powerpc/rtas: retry when cpu offline races with suspend/migration
Hi Michael, Nathan Lynch writes: > The protocol for suspending or migrating an LPAR requires all present > processor threads to enter H_JOIN. So if we have threads offline, we > have to temporarily bring them up. This can race with administrator > actions such as SMT state changes. As of dfd718a2ed1f ("powerpc/rtas: > Fix a potential race between CPU-Offline & Migration"), snowpatch/checkpatch flagged an error in my commit message here: ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit <12+ chars of sha1> ("")' - ie: 'commit dfd718a2ed1f ("powerpc/rtas: Fix a potential race between CPU-Offline & Migration")' I see this is in your next-test branch though. Should I fix the commit message and resend?
Re: [PATCH RFC] generic ELF support for kexec
Hi Philipp, On Mon, Jul 01, 2019 at 02:31:20PM +0200, Philipp Rudo wrote: > Sven Schnelle wrote: > > > I'm attaching the patch to this Mail. What do you think about that change? > > s390 also uses ELF files, and (maybe?) could also switch to this > > implementation. > > But i don't know anything about S/390 and don't have one in my basement. So > > i'll leave s390 to the IBM folks. > > I'm afraid there isn't much code here s390 can reuse. I see multiple problems > in kexec_elf_load: > > * while loading the phdrs we also need to setup some data structures to pass > to the next kernel > * the s390 kernel needs to be loaded to a fixed address > * there is no support to load a crash kernel > > Of course that could all be fixed/worked around by introducing some arch > hooks. > But when you take into account that the whole elf loader on s390 is ~100 lines > of code, I don't think it is worth it. That's fine. I didn't really look into the S/390 Loader, and just wanted to let the IBM people know. > More comments below. > > [...] > > > diff --git a/include/linux/kexec.h b/include/linux/kexec.h > > index b9b1bc5f9669..49b23b425f84 100644 > > --- a/include/linux/kexec.h > > +++ b/include/linux/kexec.h > > @@ -216,6 +216,41 @@ extern int crash_prepare_elf64_headers(struct > > crash_mem *mem, int kernel_map, > >void **addr, unsigned long *sz); > > #endif /* CONFIG_KEXEC_FILE */ > > > > +#ifdef CONFIG_KEXEC_FILE_ELF > > + > > +struct kexec_elf_info { > > + /* > > +* Where the ELF binary contents are kept. > > +* Memory managed by the user of the struct. > > +*/ > > + const char *buffer; > > + > > + const struct elfhdr *ehdr; > > + const struct elf_phdr *proghdrs; > > + struct elf_shdr *sechdrs; > > +}; > > Do i understand this right? elf_info->buffer contains the full elf file and > elf_info->ehdr is a (endianness translated) copy of the files ehdr? > > If so ... 
> > > +void kexec_free_elf_info(struct kexec_elf_info *elf_info); > > + > > +int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr, > > + struct kexec_elf_info *elf_info); > > + > > +int kexec_elf_kernel_load(struct kimage *image, struct kexec_buf *kbuf, > > + char *kernel_buf, unsigned long kernel_len, > > + unsigned long *kernel_load_addr); > > + > > +int kexec_elf_probe(const char *buf, unsigned long len); > > + > > +int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, > > +struct kexec_elf_info *elf_info, > > +struct kexec_buf *kbuf, > > +unsigned long *lowest_load_addr); > > + > > +int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, > > +struct kexec_elf_info *elf_info, > > +struct kexec_buf *kbuf, > > +unsigned long *lowest_load_addr); > > ... you could simplify the arguments by dropping the ehdr argument. The > elf_info should contain all the information needed. Furthermore the kexec_buf > also contains a pointer to its kimage. So you can drop that argument as well. > > An other thing is that you kzalloc the memory needed for proghdrs and sechdrs > but expect the user of those functions to provide the memory needed for ehdr. > Wouldn't it be more consistent to also kzalloc the ehdr? > Good point. I'll think about it. I would like to do that in an extra patch, as it is not a small style change. > > > diff --git a/kernel/kexec_file_elf.c b/kernel/kexec_file_elf.c > > new file mode 100644 > > index ..bb966c93492c > > --- /dev/null > > +++ b/kernel/kexec_file_elf.c > > @@ -0,0 +1,574 @@ > > [...] 
> > > +static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value) > > +{ > > + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) > > + value = le64_to_cpu(value); > > + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) > > + value = be64_to_cpu(value); > > + > > + return value; > > +} > > + > > +static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value) > > +{ > > + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) > > + value = le16_to_cpu(value); > > + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) > > + value = be16_to_cpu(value); > > + > > + return value; > > +} > > + > > +static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value) > > +{ > > + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) > > + value = le32_to_cpu(value); > > + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) > > + value = be32_to_cpu(value); > > + > > + return value; > > +} > > What are the elf*_to_cpu good for? In general I'd assume that kexec loads a > kernel for the same architecture it is running on. So the new kernel should > also have the same endianness like the one which loads it. Is this a > ppcle/ppcbe issue? Don't know. I would agree, but i'm not an powerpc expert. >
Re: [PATCH RFC] generic ELF support for kexec
Hi Michael, On Fri, Jun 28, 2019 at 12:04:11PM +1000, Michael Ellerman wrote: > Sven Schnelle writes: > https://github.com/linuxppc/wiki/wiki/Booting-with-Qemu > > But I'm not sure where you get a version of kexec that uses kexec_file(). kexec-tools HEAD supports it, so that's not a problem. > > If that change is acceptable i would finish the patch and submit it. I think > > best would be to push this change through Helge's parisc tree, so we don't > > have any dependencies to sort out. > > That will work for you but could cause us problems if we have any > changes that touch that code. > > It's easy enough to create a topic branch with just that patch that both > of us merge. What should be the base branch for that patch? Christophe suggested the powerpc/merge branch? > > #include > > #include > > #include > > @@ -31,540 +29,6 @@ > > #include > > #include > > > > -#define PURGATORY_STACK_SIZE (16 * 1024) > > This is unused AFAICS. We should probably remove it explicitly rather > than as part of this patch. I have one patch right now. If wanted i can split up all the changes suggested during the review into smaller pieces, whatever you prefer. > Or that. > > > +#include > > +#include > > + > > +#define elf_addr_to_cpuelf64_to_cpu > > Why are we doing that rather than just using elf64_to_cpu directly? > > > +#ifndef Elf_Rel > > +#define Elf_RelElf64_Rel > > +#endif /* Elf_Rel */ > > And that? Don't know - ask the PPC people :-) Regards Sven
Re: ["RFC PATCH" 1/2] powerpc/mm: Fix node look up with numa=off boot
"Aneesh Kumar K.V" writes: > I guess we should have here. > > modified arch/powerpc/mm/numa.c > @@ -416,12 +416,11 @@ static int of_get_assoc_arrays(struct assoc_arrays > *aa) > static int of_drconf_to_nid_single(struct drmem_lmb *lmb) > { > struct assoc_arrays aa = { .arrays = NULL }; > - /* is that correct? */ > int default_nid = 0; > int nid = default_nid; > int rc, index; > > - if (!numa_enabled) > + if ((min_common_depth < 0) || !numa_enabled) > return NUMA_NO_NODE; > > rc = of_get_assoc_arrays(&aa); > > > Nathan, > > Can you check this? Looks like it would do the right thing. Just checking: do people still need numa=off? Seems like it's a maintenance burden :-)
Re: Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware?
Le 01/07/2019 à 15:39, Turritopsis Dohrnii Teo En Ming a écrit : Good evening from Singapore, Good evening afternoon from Paris, Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware, for example, AMD Ryzen 9 3950X, with 16 CPU cores and 32 threads? Yes you can Is it called cross-compiling? Yes it is, you can get cross compilers at https://mirrors.edge.kernel.org/pub/tools/crosstool/ Thanks Christophe Thank you. -BEGIN EMAIL SIGNATURE- The Gospel for all Targeted Individuals (TIs): [The New York Times] Microwave Weapons Are Prime Suspect in Ills of U.S. Embassy Workers Link: https://www.nytimes.com/2018/09/01/science/sonic-attack-cuba-microwave.html Singaporean Mr. Turritopsis Dohrnii Teo En Ming's Academic Qualifications as at 14 Feb 2019 [1] https://tdtemcerts.wordpress.com/ [2] https://tdtemcerts.blogspot.sg/ [3] https://www.scribd.com/user/270125049/Teo-En-Ming -END EMAIL SIGNATURE-
Re: remove asm-generic/ptrace.h v3
On Mon, Jun 24, 2019 at 9:23 AM Thomas Gleixner wrote: > > On Mon, 24 Jun 2019, Christoph Hellwig wrote: > > > > asm-generic/ptrace.h is a little weird in that it doesn't actually > > implement any functionality, but it provided multiple layers of macros > > that just implement trivial inline functions. We implement those > > directly in the few architectures and be off with a much simpler > > design. > > > > I'm not sure which tree is the right place, but may this can go through > > the asm-generic tree since it removes an asm-generic header? > > Makes sense. Applied and pushed to asm-generic.git/master now, sorry for the delay. Arnd
Re: [PATCH v12 01/11] MODSIGN: Export module signature definitions
+++ Thiago Jung Bauermann [27/06/19 23:19 -0300]: IMA will use the module_signature format for append signatures, so export the relevant definitions and factor out the code which verifies that the appended signature trailer is valid. Also, create a CONFIG_MODULE_SIG_FORMAT option so that IMA can select it and be able to use mod_check_sig() without having to depend on either CONFIG_MODULE_SIG or CONFIG_MODULES. Signed-off-by: Thiago Jung Bauermann Reviewed-by: Mimi Zohar Cc: Jessica Yu --- include/linux/module.h | 3 -- include/linux/module_signature.h | 44 + init/Kconfig | 6 +++- kernel/Makefile | 1 + kernel/module.c | 1 + kernel/module_signature.c| 46 ++ kernel/module_signing.c | 56 +--- scripts/Makefile | 2 +- 8 files changed, 106 insertions(+), 53 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index 188998d3dca9..aa56f531cf1e 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -25,9 +25,6 @@ #include #include -/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ -#define MODULE_SIG_STRING "~Module signature appended~\n" - Hi Thiago, apologies for the delay. It looks like arch/s390/kernel/machine_kexec_file.c also relies on MODULE_SIG_STRING being defined, so module_signature.h will need to be included there too, otherwise we'll run into a compilation error. Other than that, the module-related changes look good to me: Acked-by: Jessica Yu Thanks! Jessica /* Not Yet Implemented */ #define MODULE_SUPPORTED_DEVICE(name) diff --git a/include/linux/module_signature.h b/include/linux/module_signature.h new file mode 100644 index ..523617fc5b6a --- /dev/null +++ b/include/linux/module_signature.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Module signature handling. + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowe...@redhat.com) + */ + +#ifndef _LINUX_MODULE_SIGNATURE_H +#define _LINUX_MODULE_SIGNATURE_H + +/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */ +#define MODULE_SIG_STRING "~Module signature appended~\n" + +enum pkey_id_type { + PKEY_ID_PGP,/* OpenPGP generated key ID */ + PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ + PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ +}; + +/* + * Module signature information block. + * + * The constituents of the signature section are, in order: + * + * - Signer's name + * - Key identifier + * - Signature data + * - Information block + */ +struct module_signature { + u8 algo; /* Public-key crypto algorithm [0] */ + u8 hash; /* Digest algorithm [0] */ + u8 id_type;/* Key identifier type [PKEY_ID_PKCS7] */ + u8 signer_len; /* Length of signer's name [0] */ + u8 key_id_len; /* Length of key identifier [0] */ + u8 __pad[3]; + __be32 sig_len;/* Length of signature data */ +}; + +int mod_check_sig(const struct module_signature *ms, size_t file_len, + const char *name); + +#endif /* _LINUX_MODULE_SIGNATURE_H */ diff --git a/init/Kconfig b/init/Kconfig index 8b9ffe236e4f..c2286a3c74c5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1852,6 +1852,10 @@ config BASE_SMALL default 0 if BASE_FULL default 1 if !BASE_FULL +config MODULE_SIG_FORMAT + def_bool n + select SYSTEM_DATA_VERIFICATION + menuconfig MODULES bool "Enable loadable module support" option modules @@ -1929,7 +1933,7 @@ config MODULE_SRCVERSION_ALL config MODULE_SIG bool "Module signature verification" depends on MODULES - select SYSTEM_DATA_VERIFICATION + select MODULE_SIG_FORMAT help Check modules for valid signatures upon load: the signature is simply appended to the module. 
For more information see diff --git a/kernel/Makefile b/kernel/Makefile index 33824f0385b3..f29ae2997a43 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,6 +58,7 @@ endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o +obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o diff --git a/kernel/module.c b/kernel/module.c index 6e6712b3aaf5..2712f4d217f5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/module_signature.c b/kernel/module_signature.c new file mode 100644 index ..4224a1086b7d --- /dev/null +++ b/kernel/module_signature.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier:
Re: [PATCH v12 00/11] Appended signatures support for IMA appraisal
On Thu, 2019-06-27 at 23:19 -0300, Thiago Jung Bauermann wrote: > Hello, > > This version is essentially identical to the last one. > > It is only a rebase on top of today's linux-integrity/next-queued-testing, > prompted by conflicts with Prakhar Srivastava's patches to measure the > kernel command line. It also drops two patches that are already present in > that branch. Thanks, Thiago. These patches are now in next-queued-testing waiting for some additional reviews/acks. Mimi
[PATCH] powerpc: Remove unused variable declaration
__kernel_virt_size is not used anymore. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 7dede2e34b70..a3eab10f5a27 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -275,7 +275,6 @@ extern unsigned long __vmalloc_end; #define VMALLOC_END__vmalloc_end extern unsigned long __kernel_virt_start; -extern unsigned long __kernel_virt_size; extern unsigned long __kernel_io_start; extern unsigned long __kernel_io_end; #define KERN_VIRT_START __kernel_virt_start -- 2.21.0
[PATCH v2 3/3] powerpc/mm: Consolidate numa_enable check and min_common_depth check
If we fail to parse min_common_depth from device tree we boot with numa disabled. Reflect the same by updating numa_enabled variable to false. Also, switch all min_common_depth failure check to if (!numa_enabled) check. This helps us to avoid checking for both in different code paths. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/numa.c | 16 +++- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 27f792c0df68..848b4663c7ad 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -212,7 +212,7 @@ static int associativity_to_nid(const __be32 *associativity) { int nid = NUMA_NO_NODE; - if (min_common_depth == -1 || !numa_enabled) + if (!numa_enabled) goto out; if (of_read_number(associativity, 1) >= min_common_depth) @@ -628,8 +628,14 @@ static int __init parse_numa_properties(void) min_common_depth = find_min_common_depth(); - if (min_common_depth < 0) + if (min_common_depth < 0) { + /* +* if we fail to parse min_common_depth from device tree +* mark the numa disabled, boot with numa disabled. +*/ + numa_enabled = false; return min_common_depth; + } dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); @@ -745,7 +751,7 @@ void __init dump_numa_cpu_topology(void) unsigned int node; unsigned int cpu, count; - if (min_common_depth == -1 || !numa_enabled) + if (!numa_enabled) return; for_each_online_node(node) { @@ -810,7 +816,7 @@ static void __init find_possible_nodes(void) struct device_node *rtas; u32 numnodes, i; - if (min_common_depth <= 0 || !numa_enabled) + if (!numa_enabled) return; rtas = of_find_node_by_path("/rtas"); @@ -1012,7 +1018,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr) struct device_node *memory = NULL; int nid; - if (!numa_enabled || (min_common_depth < 0)) + if (!numa_enabled) return first_online_node; memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); -- 2.21.0
[PATCH v2 2/3] powerpc/mm: Fix node look up with numa=off boot
If we boot with numa=off, we need to make sure we return NUMA_NO_NODE when looking up associativity details of resources. Without this, we hit crash like below BUG: Unable to handle kernel data access at 0x408 Faulting instruction address: 0xc8f31704 cpu 0x1b: Vector: 380 (Data SLB Access) at [cb9bb320] pc: c8f31704: _raw_spin_lock+0x14/0x100 lr: c83f41fc: cache_alloc_node+0x5c/0x290 sp: cb9bb5b0 msr: 80010280b033 dar: 408 current = 0xcb9a2700 paca= 0xca740c00 irqmask: 0x03 irq_happened: 0x01 pid = 1, comm = swapper/27 Linux version 5.2.0-rc4-00925-g74e188c620b1 (root@linux-d8ip) (gcc version 7.4.1 20190424 [gcc-7-branch revision 270538] (SUSE Linux)) #34 SMP Sat Jun 29 00:41:02 EDT 2019 enter ? for help [link register ] c83f41fc cache_alloc_node+0x5c/0x290 [cb9bb5b0] 0dc0 (unreliable) [cb9bb5f0] c83f48c8 kmem_cache_alloc_node_trace+0x138/0x360 [cb9bb670] c8aa789c devres_alloc_node+0x4c/0xa0 [cb9bb6a0] c8337218 devm_memremap+0x58/0x130 [cb9bb6f0] c8aed00c devm_nsio_enable+0xdc/0x170 [cb9bb780] c8af3b6c nd_pmem_probe+0x4c/0x180 [cb9bb7b0] c8ad84cc nvdimm_bus_probe+0xac/0x260 [cb9bb840] c8aa0628 really_probe+0x148/0x500 [cb9bb8d0] c8aa0d7c driver_probe_device+0x19c/0x1d0 [cb9bb950] c8aa11bc device_driver_attach+0xcc/0x100 [cb9bb990] c8aa12ec __driver_attach+0xfc/0x1e0 [cb9bba10] c8a9d0a4 bus_for_each_dev+0xb4/0x130 [cb9bba70] c8a9fc04 driver_attach+0x34/0x50 [cb9bba90] c8a9f118 bus_add_driver+0x1d8/0x300 [cb9bbb20] c8aa2358 driver_register+0x98/0x1a0 [cb9bbb90] c8ad7e6c __nd_driver_register+0x5c/0x100 [cb9bbbf0] c93efbac nd_pmem_driver_init+0x34/0x48 [cb9bbc10] c80106c0 do_one_initcall+0x60/0x2d0 [cb9bbce0] c938463c kernel_init_freeable+0x384/0x48c [cb9bbdb0] c8010a5c kernel_init+0x2c/0x160 [cb9bbe20] c800ba54 ret_from_kernel_thread+0x5c/0x68 Reported-and-debugged-by: Vaibhav Jain Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/numa.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 
aee718509085..27f792c0df68 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -212,7 +212,7 @@ static int associativity_to_nid(const __be32 *associativity) { int nid = NUMA_NO_NODE; - if (min_common_depth == -1) + if (min_common_depth == -1 || !numa_enabled) goto out; if (of_read_number(associativity, 1) >= min_common_depth) @@ -420,7 +420,7 @@ static int of_drconf_to_nid_single(struct drmem_lmb *lmb) int nid = default_nid; int rc, index; - if (min_common_depth < 0) + if ((min_common_depth < 0) || !numa_enabled) return default_nid; rc = of_get_assoc_arrays(); @@ -810,7 +810,7 @@ static void __init find_possible_nodes(void) struct device_node *rtas; u32 numnodes, i; - if (min_common_depth <= 0) + if (min_common_depth <= 0 || !numa_enabled) return; rtas = of_find_node_by_path("/rtas"); -- 2.21.0
[PATCH v2 1/3] powerpc/mm/drconf: Use NUMA_NO_NODE on failures instead of node 0
If we fail to parse the associativity array we should default to NUMA_NO_NODE instead of NODE 0. Rest of the code fallback to the right default if we find the numa node value NUMA_NO_NODE. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/numa.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 917904d2fe97..aee718509085 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -416,17 +416,19 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa) static int of_drconf_to_nid_single(struct drmem_lmb *lmb) { struct assoc_arrays aa = { .arrays = NULL }; - int default_nid = 0; + int default_nid = NUMA_NO_NODE; int nid = default_nid; int rc, index; + if (min_common_depth < 0) + return default_nid; + rc = of_get_assoc_arrays(); if (rc) return default_nid; - if (min_common_depth > 0 && min_common_depth <= aa.array_sz && - !(lmb->flags & DRCONF_MEM_AI_INVALID) && - lmb->aa_index < aa.n_arrays) { + if (min_common_depth <= aa.array_sz && + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { index = lmb->aa_index * aa.array_sz + min_common_depth - 1; nid = of_read_number([index], 1); -- 2.21.0
[PATCH v2 2/2] powerpc/mm/radix: Use the right page size for vmemmap mapping
We use mmu_vmemmap_psize to find the page size for mapping the vmmemap area. With radix translation, we are suboptimally setting this value to PAGE_SIZE. We do check for 2M page size support and update mmu_vmemap_psize to use hugepage size but we suboptimally reset the value to PAGE_SIZE in radix__early_init_mmu(). This resulted in always mapping vmemmap area with 64K page size. Fixes: 2bfd65e45e87 ("powerpc/mm/radix: Add radix callbacks for early init routines") Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/book3s64/radix_pgtable.c | 16 +++- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 273ae66a9a45..8deb432c2975 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -515,14 +515,6 @@ void __init radix__early_init_devtree(void) mmu_psize_defs[MMU_PAGE_64K].shift = 16; mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; found: -#ifdef CONFIG_SPARSEMEM_VMEMMAP - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* -* map vmemmap using 2M if available -*/ - mmu_vmemmap_psize = MMU_PAGE_2M; - } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ return; } @@ -587,7 +579,13 @@ void __init radix__early_init_mmu(void) #ifdef CONFIG_SPARSEMEM_VMEMMAP /* vmemmap mapping */ - mmu_vmemmap_psize = mmu_virtual_psize; + if (mmu_psize_defs[MMU_PAGE_2M].shift) { + /* +* map vmemmap using 2M if available +*/ + mmu_vmemmap_psize = MMU_PAGE_2M; + } else + mmu_vmemmap_psize = mmu_virtual_psize; #endif /* * initialize page table size -- 2.21.0
[PATCH v2 1/2] powerpc/mm/hash/4k: Don't use 64K page size for vmemmap with 4K pagesize
With hash translation and 4K PAGE_SIZE config, we need to make sure we don't use 64K page size for vmemmap. Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/book3s64/hash_utils.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 28ced26f2a00..a8cc40d2a9ed 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -684,10 +684,8 @@ static void __init htab_init_page_sizes(void) if (mmu_psize_defs[MMU_PAGE_16M].shift && memblock_phys_mem_size() >= 0x4000) mmu_vmemmap_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_64K].shift) - mmu_vmemmap_psize = MMU_PAGE_64K; else - mmu_vmemmap_psize = MMU_PAGE_4K; + mmu_vmemmap_psize = mmu_virtual_psize; #endif /* CONFIG_SPARSEMEM_VMEMMAP */ printk(KERN_DEBUG "Page orders: linear mapping = %d, " -- 2.21.0
[PATCH v2] powerpc/mm/nvdimm: Add an informative message if we fail to allocate altmap block
Allocation from altmap area can fail based on vmemmap page size used. Add kernel info message to indicate the failure. That allows the user to identify whether they are really using persistent memory reserved space for per-page metadata. The message looks like: [ 136.587212] altmap block allocation failed, falling back to system memory Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/mm/init_64.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a4e17a979e45..f3b64f49082b 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -194,8 +194,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, * fail due to alignment issues when using 16MB hugepages, so * fall back to system memory if the altmap allocation fail. */ - if (altmap) + if (altmap) { p = altmap_alloc_block_buf(page_size, altmap); + if (!p) + pr_debug("altmap block allocation failed, " \ + "falling back to system memory"); + } if (!p) p = vmemmap_alloc_block_buf(page_size, node); if (!p) -- 2.21.0
Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted
On Thu, Jun 27, 2019 at 10:58:40PM -0300, Thiago Jung Bauermann wrote: > > Michael S. Tsirkin writes: > > > On Mon, Jun 03, 2019 at 10:13:59PM -0300, Thiago Jung Bauermann wrote: > >> > >> > >> Michael S. Tsirkin writes: > >> > >> > On Wed, Apr 17, 2019 at 06:42:00PM -0300, Thiago Jung Bauermann wrote: > >> >> I rephrased it in terms of address translation. What do you think of > >> >> this version? The flag name is slightly different too: > >> >> > >> >> > >> >> VIRTIO_F_ACCESS_PLATFORM_NO_TRANSLATION This feature has the same > >> >> meaning as VIRTIO_F_ACCESS_PLATFORM both when set and when not set, > >> >> with the exception that address translation is guaranteed to be > >> >> unnecessary when accessing memory addresses supplied to the device > >> >> by the driver. Which is to say, the device will always use physical > >> >> addresses matching addresses used by the driver (typically meaning > >> >> physical addresses used by the CPU) and not translated further. This > >> >> flag should be set by the guest if offered, but to allow for > >> >> backward-compatibility device implementations allow for it to be > >> >> left unset by the guest. It is an error to set both this flag and > >> >> VIRTIO_F_ACCESS_PLATFORM. > >> > > >> > > >> > > >> > > >> > OK so VIRTIO_F_ACCESS_PLATFORM is designed to allow unpriveledged > >> > drivers. This is why devices fail when it's not negotiated. > >> > >> Just to clarify, what do you mean by unprivileged drivers? Is it drivers > >> implemented in guest userspace such as with VFIO? Or unprivileged in > >> some other sense such as needing to use bounce buffers for some reason? > > > > I had drivers in guest userspace in mind. > > Great. Thanks for clarifying. > > I don't think this flag would work for guest userspace drivers. Should I > add a note about that in the flag definition? I think you need to clarify access protection rules. 
Is it only translation that is bypassed or is any platform-specific protection mechanism bypassed too? > >> > This confuses me. > >> > If driver is unpriveledged then what happens with this flag? > >> > It can supply any address it wants. Will that corrupt kernel > >> > memory? > >> > >> Not needing address translation doesn't necessarily mean that there's no > >> IOMMU. On powerpc we don't use VIRTIO_F_ACCESS_PLATFORM but there's > >> always an IOMMU present. And we also support VFIO drivers. The VFIO API > >> for pseries (sPAPR section in Documentation/vfio.txt) has extra ioctls > >> to program the IOMMU. > >> > >> For our use case, we don't need address translation because we set up an > >> identity mapping in the IOMMU so that the device can use guest physical > >> addresses. OK so I think I am beginning to see it in a different light. Right now the specific platform creates an identity mapping. That in turn means DMA API can be fast - it does not need to do anything. What you are looking for is a way to tell host it's an identity mapping - just as an optimization. Is that right? So this is what I would call this option: VIRTIO_F_ACCESS_PLATFORM_IDENTITY_ADDRESS and the explanation should state that all device addresses are translated by the platform to identical addresses. In fact this option then becomes more, not less restrictive than VIRTIO_F_ACCESS_PLATFORM - it's a promise by guest to only create identity mappings, and only before driver_ok is set. This option then would always be negotiated together with VIRTIO_F_ACCESS_PLATFORM. Host then must verify that 1. full 1:1 mappings are created before driver_ok or can we make sure this happens before features_ok? that would be ideal as we could require that features_ok fails 2. 
mappings are not modified between driver_ok and reset i guess attempts to change them will fail - possibly by causing a guest crash or some other kind of platform-specific error So far so good, but now a question: how are we handling guest address width limitations? Is VIRTIO_F_ACCESS_PLATFORM_IDENTITY_ADDRESS subject to guest address width limitations? I am guessing we can make them so ... This needs to be documented. > > > > And can it access any guest physical address? > > Sorry, I was mistaken. We do support VFIO in guests but not for virtio > devices, only for regular PCI devices. In which case they will use > address translation. Not sure how this answers the question. > >> If the guest kernel is concerned that an unprivileged driver could > >> jeopardize its integrity it should not negotiate this feature flag. > > > > Unfortunately flag negotiation is done through config space > > and so can be overwritten by the driver. > > Ok, so the guest kernel has to forbid VFIO access on devices where this > flag is advertised. That's possible in theory but in practice we did not yet teach VFIO not to attach to legacy devices without VIRTIO_F_ACCESS_PLATFORM. So all security relies on host denying driver_ok without
Re: [PATCH v3 3/9] powerpc: Introduce FW_FEATURE_ULTRAVISOR
On 6/15/19 4:36 AM, Paul Mackerras wrote: > On Thu, Jun 06, 2019 at 02:36:08PM -0300, Claudio Carvalho wrote: >> This feature tells if the ultravisor firmware is available to handle >> ucalls. > Everything in this patch that depends on CONFIG_PPC_UV should just > depend on CONFIG_PPC_POWERNV instead. The reason is that every host > kernel needs to be able to do the ultracall to set partition table > entry 0, in case it ends up being run on a machine with an ultravisor. > Otherwise we will have the situation where a host kernel may crash > early in boot just because the machine it's booted on happens to have > an ultravisor running. The crash will be a particularly nasty one > because it will happen before we have probed the machine type and > initialized the console; therefore it will just look like the machine > hangs for no discernable reason. > We also need to think about how to provide a way for petitboot to know > whether the kernel it is booting knows how to do a ucall to set its > partition table entry. One suggestion would be to modify > vmlinux.lds.S to add a new PT_NOTE entry in the program header of the > binary with (say) a 64-bit doubleword which is a bitmap indicating > capabilities of the binary. We would define the first bit as > indicating that the kernel knows how to run under an ultravisor. > When running under an ultravisor, petitboot could then look for the > PT_NOTE and the ultravisor-capable bit in it, and if the PT_NOTE is > not there or the bit is zero, put up a dialog warning the user that > the kernel will probably crash early in boot, and asking for explicit > confirmation that the user wants to proceed. I just posted a separated RFC patch for the ELF note. Thanks, Claudio. > > Paul. >
[RFC PATCH] powerpc: Add the ppc_capabilities ELF note
Add the ppc_capabilities ELF note to the powerpc kernel binary. It is a bitmap that can be used to advertise kernel capabilities to userland. This patch also defines PPCCAP_ULTRAVISOR_BIT as being the bit zero. Suggested-by: Paul Mackerras Signed-off-by: Claudio Carvalho --- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/note.S | 36 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/note.S diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 0ea6c4aa3a20..4ec36fe4325b 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -49,7 +49,7 @@ obj-y := cputable.o ptrace.o syscalls.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ - of_platform.o prom_parse.o + of_platform.o prom_parse.o note.o obj-$(CONFIG_PPC64)+= setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o diff --git a/arch/powerpc/kernel/note.S b/arch/powerpc/kernel/note.S new file mode 100644 index ..721bf8ce9eb7 --- /dev/null +++ b/arch/powerpc/kernel/note.S @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PowerPC ELF notes. + * + * Copyright 2019, IBM Corporation + */ +#include + +/* + * Ultravisor-capable bit (PowerNV only). + * + * Indicate that the powerpc kernel binary knows how to run in an + * ultravisor-enabled system. + * + * In an ultravisor-enabled system, some machine resources are now controlled + * by the ultravisor. If the kernel is not ultravisor-capable, but it ends up + * being run on a machine with ultravisor, the kernel will probably crash + * trying to access ultravisor resources. For instance, it may crash in early + * boot trying to set the partition table entry 0. + * + * In an ultravisor-enabled system, petitboot can warn the user or prevent the + * kernel from being run if the ppc_capabilities doesn't exist or the + * Ultravisor-capable bit is not set. 
+ */ +#if defined(CONFIG_PPC_POWERNV) +#define PPCCAP_ULTRAVISOR_BIT (1 << 0) +#else +#define PPCCAP_ULTRAVISOR_BIT 0 +#endif + +/* + * Add the ppc_capabilities ELF note to the powerpc kernel binary. It is a + * bitmap that can be used to advertise kernel capabilities to userland. + */ +ELFNOTE(ppc_capabilities, 3, + .long PPCCAP_ULTRAVISOR_BIT) -- 2.20.1
[PATCH] mm/nvdimm: Add is_ioremap_addr and use that to check ioremap address
Architectures like powerpc use different address range to map ioremap and vmalloc range. The memunmap() check used by the nvdimm layer was wrongly using is_vmalloc_addr() to check for ioremap range which fails for ppc64. This result in ppc64 not freeing the ioremap mapping. The side effect of this is an unbind failure during module unload with papr_scm nvdimm driver Signed-off-by: Aneesh Kumar K.V --- arch/powerpc/include/asm/pgtable.h | 14 ++ include/linux/mm.h | 5 + kernel/iomem.c | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 3f53be60fb01..64145751b2fd 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -140,6 +140,20 @@ static inline void pte_frag_set(mm_context_t *ctx, void *p) } #endif +#ifdef CONFIG_PPC64 +#define is_ioremap_addr is_ioremap_addr +static inline bool is_ioremap_addr(const void *x) +{ +#ifdef CONFIG_MMU + unsigned long addr = (unsigned long)x; + + return addr >= IOREMAP_BASE && addr < IOREMAP_END; +#else + return false; +#endif +} +#endif /* CONFIG_PPC64 */ + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 973ebf71f7b6..65b2eb6c9f0a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -633,6 +633,11 @@ static inline bool is_vmalloc_addr(const void *x) return false; #endif } + +#ifndef is_ioremap_addr +#define is_ioremap_addr(x) is_vmalloc_addr(x) +#endif + #ifdef CONFIG_MMU extern int is_vmalloc_or_module_addr(const void *x); #else diff --git a/kernel/iomem.c b/kernel/iomem.c index 93c26510..62c92e43aa0d 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap); void memunmap(void *addr) { - if (is_vmalloc_addr(addr)) + if (is_ioremap_addr(addr)) iounmap((void __iomem *) addr); } EXPORT_SYMBOL(memunmap); -- 2.21.0
Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware?
Good evening from Singapore, Can I compile Linux Kernel 5.2-rc7 for PowerPC on Intel/AMD x86 hardware, for example, AMD Ryzen 9 3950X, with 16 CPU cores and 32 threads? Is it called cross-compiling? Thank you. -BEGIN EMAIL SIGNATURE- The Gospel for all Targeted Individuals (TIs): [The New York Times] Microwave Weapons Are Prime Suspect in Ills of U.S. Embassy Workers Link: https://www.nytimes.com/2018/09/01/science/sonic-attack-cuba-microwave.html Singaporean Mr. Turritopsis Dohrnii Teo En Ming's Academic Qualifications as at 14 Feb 2019 [1] https://tdtemcerts.wordpress.com/ [2] https://tdtemcerts.blogspot.sg/ [3] https://www.scribd.com/user/270125049/Teo-En-Ming -END EMAIL SIGNATURE-
Re: [PATCH RFC] generic ELF support for kexec
Hi Sven, On Tue, 25 Jun 2019 20:54:34 +0200 Sven Schnelle wrote: > Hi List, > > i recently started working on kexec for PA-RISC. While doing so, i figured > that powerpc already has support for reading ELF images inside of the Kernel. > My first attempt was to steal the source code and modify it for PA-RISC, but > it turned out that i didn't had to change much. Only ARM specific stuff like > fdt blob fetching had to be removed. > > So instead of duplicating the code, i thought about moving the ELF stuff to > the core kexec code, and exposing several function to use that code from the > arch specific code. That's always the right approach. Well done. > I'm attaching the patch to this Mail. What do you think about that change? > s390 also uses ELF files, and (maybe?) could also switch to this > implementation. > But i don't know anything about S/390 and don't have one in my basement. So > i'll leave s390 to the IBM folks. I'm afraid there isn't much code here s390 can reuse. I see multiple problems in kexec_elf_load: * while loading the phdrs we also need to setup some data structures to pass to the next kernel * the s390 kernel needs to be loaded to a fixed address * there is no support to load a crash kernel Of course that could all be fixed/worked around by introducing some arch hooks. But when you take into account that the whole elf loader on s390 is ~100 lines of code, I don't think it is worth it. More comments below. [...] > diff --git a/include/linux/kexec.h b/include/linux/kexec.h > index b9b1bc5f9669..49b23b425f84 100644 > --- a/include/linux/kexec.h > +++ b/include/linux/kexec.h > @@ -216,6 +216,41 @@ extern int crash_prepare_elf64_headers(struct crash_mem > *mem, int kernel_map, > void **addr, unsigned long *sz); > #endif /* CONFIG_KEXEC_FILE */ > > +#ifdef CONFIG_KEXEC_FILE_ELF > + > +struct kexec_elf_info { > + /* > + * Where the ELF binary contents are kept. > + * Memory managed by the user of the struct. 
> + */ > + const char *buffer; > + > + const struct elfhdr *ehdr; > + const struct elf_phdr *proghdrs; > + struct elf_shdr *sechdrs; > +}; Do i understand this right? elf_info->buffer contains the full elf file and elf_info->ehdr is a (endianness translated) copy of the files ehdr? If so ... > +void kexec_free_elf_info(struct kexec_elf_info *elf_info); > + > +int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr, > + struct kexec_elf_info *elf_info); > + > +int kexec_elf_kernel_load(struct kimage *image, struct kexec_buf *kbuf, > + char *kernel_buf, unsigned long kernel_len, > + unsigned long *kernel_load_addr); > + > +int kexec_elf_probe(const char *buf, unsigned long len); > + > +int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, > + struct kexec_elf_info *elf_info, > + struct kexec_buf *kbuf, > + unsigned long *lowest_load_addr); > + > +int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, > + struct kexec_elf_info *elf_info, > + struct kexec_buf *kbuf, > + unsigned long *lowest_load_addr); ... you could simplify the arguments by dropping the ehdr argument. The elf_info should contain all the information needed. Furthermore the kexec_buf also contains a pointer to its kimage. So you can drop that argument as well. An other thing is that you kzalloc the memory needed for proghdrs and sechdrs but expect the user of those functions to provide the memory needed for ehdr. Wouldn't it be more consistent to also kzalloc the ehdr? [...] > diff --git a/kernel/kexec_file_elf.c b/kernel/kexec_file_elf.c > new file mode 100644 > index ..bb966c93492c > --- /dev/null > +++ b/kernel/kexec_file_elf.c > @@ -0,0 +1,574 @@ [...] 
> +static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value) > +{ > + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) > + value = le64_to_cpu(value); > + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) > + value = be64_to_cpu(value); > + > + return value; > +} > + > +static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value) > +{ > + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) > + value = le16_to_cpu(value); > + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) > + value = be16_to_cpu(value); > + > + return value; > +} > + > +static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value) > +{ > + if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) > + value = le32_to_cpu(value); > + else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) > + value = be32_to_cpu(value); > + > + return value; > +} What are the elf*_to_cpu good for? In general I'd assume that kexec loads a kernel for the same architecture it is running on. So
Re: [PATCH v3 06/11] mm/memory_hotplug: Allow arch_remove_pages() without CONFIG_MEMORY_HOTREMOVE
On Mon 01-07-19 10:01:41, Michal Hocko wrote: > On Mon 27-05-19 13:11:47, David Hildenbrand wrote: > > We want to improve error handling while adding memory by allowing > > to use arch_remove_memory() and __remove_pages() even if > > CONFIG_MEMORY_HOTREMOVE is not set to e.g., implement something like: > > > > arch_add_memory() > > rc = do_something(); > > if (rc) { > > arch_remove_memory(); > > } > > > > We won't get rid of CONFIG_MEMORY_HOTREMOVE for now, as it will require > > quite some dependencies for memory offlining. > > If we cannot really remove CONFIG_MEMORY_HOTREMOVE altogether then why > not simply add an empty placeholder for arch_remove_memory when the > config is disabled? In other words, can we replace this by something as simple as: diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ae892eef8b82..0329027fe740 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -128,6 +128,20 @@ extern void arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap); extern void __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); +#else +/* + * Allow code using + * arch_add_memory(); + * rc = do_something(); + * if (rc) + * arch_remove_memory(); + * + * without ifdefery. + */ +static inline void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) +{ +} #endif /* CONFIG_MEMORY_HOTREMOVE */ /* -- Michal Hocko SUSE Labs
Re: [PATCH v3 04/11] arm64/mm: Add temporary arch_remove_memory() implementation
On Mon 27-05-19 13:11:45, David Hildenbrand wrote: > A proper arch_remove_memory() implementation is on its way, which also > cleanly removes page tables in arch_add_memory() in case something goes > wrong. > > As we want to use arch_remove_memory() in case something goes wrong > during memory hotplug after arch_add_memory() finished, let's add > a temporary hack that is sufficient enough until we get a proper > implementation that cleans up page table entries. > > We will remove CONFIG_MEMORY_HOTREMOVE around this code in follow up > patches. I would drop this one as well (like s390 counterpart). > Cc: Catalin Marinas > Cc: Will Deacon > Cc: Mark Rutland > Cc: Andrew Morton > Cc: Ard Biesheuvel > Cc: Chintan Pandya > Cc: Mike Rapoport > Cc: Jun Yao > Cc: Yu Zhao > Cc: Robin Murphy > Cc: Anshuman Khandual > Signed-off-by: David Hildenbrand > --- > arch/arm64/mm/mmu.c | 19 +++ > 1 file changed, 19 insertions(+) > > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c > index a1bfc4413982..e569a543c384 100644 > --- a/arch/arm64/mm/mmu.c > +++ b/arch/arm64/mm/mmu.c > @@ -1084,4 +1084,23 @@ int arch_add_memory(int nid, u64 start, u64 size, > return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, > restrictions); > } > +#ifdef CONFIG_MEMORY_HOTREMOVE > +void arch_remove_memory(int nid, u64 start, u64 size, > + struct vmem_altmap *altmap) > +{ > + unsigned long start_pfn = start >> PAGE_SHIFT; > + unsigned long nr_pages = size >> PAGE_SHIFT; > + struct zone *zone; > + > + /* > + * FIXME: Cleanup page tables (also in arch_add_memory() in case > + * adding fails). Until then, this function should only be used > + * during memory hotplug (adding memory), not for memory > + * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be > + * unlocked yet. > + */ > + zone = page_zone(pfn_to_page(start_pfn)); > + __remove_pages(zone, start_pfn, nr_pages, altmap); > +} > +#endif > #endif > -- > 2.20.1 -- Michal Hocko SUSE Labs
Re: [PATCH v3 03/11] s390x/mm: Implement arch_remove_memory()
On Mon 01-07-19 09:45:03, Michal Hocko wrote: > On Mon 27-05-19 13:11:44, David Hildenbrand wrote: > > Will come in handy when wanting to handle errors after > > arch_add_memory(). > > I do not understand this. Why do you add a code for something that is > not possible on this HW (based on the comment - is it still valid btw?) Same as the previous patch (drop it). > > Cc: Martin Schwidefsky > > Cc: Heiko Carstens > > Cc: Andrew Morton > > Cc: Michal Hocko > > Cc: Mike Rapoport > > Cc: David Hildenbrand > > Cc: Vasily Gorbik > > Cc: Oscar Salvador > > Signed-off-by: David Hildenbrand > > --- > > arch/s390/mm/init.c | 13 +++-- > > 1 file changed, 7 insertions(+), 6 deletions(-) > > > > diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c > > index d552e330fbcc..14955e0a9fcf 100644 > > --- a/arch/s390/mm/init.c > > +++ b/arch/s390/mm/init.c > > @@ -243,12 +243,13 @@ int arch_add_memory(int nid, u64 start, u64 size, > > void arch_remove_memory(int nid, u64 start, u64 size, > > struct vmem_altmap *altmap) > > { > > - /* > > -* There is no hardware or firmware interface which could trigger a > > -* hot memory remove on s390. So there is nothing that needs to be > > -* implemented. > > -*/ > > - BUG(); > > + unsigned long start_pfn = start >> PAGE_SHIFT; > > + unsigned long nr_pages = size >> PAGE_SHIFT; > > + struct zone *zone; > > + > > + zone = page_zone(pfn_to_page(start_pfn)); > > + __remove_pages(zone, start_pfn, nr_pages, altmap); > > + vmem_remove_mapping(start, size); > > } > > #endif > > #endif /* CONFIG_MEMORY_HOTPLUG */ > > -- > > 2.20.1 > > > > -- > Michal Hocko > SUSE Labs -- Michal Hocko SUSE Labs
Re: [PATCH v3 02/11] s390x/mm: Fail when an altmap is used for arch_add_memory()
On Mon 01-07-19 09:43:06, Michal Hocko wrote: > On Mon 27-05-19 13:11:43, David Hildenbrand wrote: > > ZONE_DEVICE is not yet supported, fail if an altmap is passed, so we > > don't forget arch_add_memory()/arch_remove_memory() when unlocking > > support. > > Why do we need this? Sure ZONE_DEVICE is not supported for s390 and so > might be the case for other arches which support hotplug. I do not see > much point in adding warning to each of them. I would drop this one. If there is a strong reason to have something like that it should come with a better explanation and it can be done on top. -- Michal Hocko SUSE Labs
Re: [PATCH] powerpc/mm/nvdimm: Add an informative message if we fail to allocate altmap block
"Oliver O'Halloran" writes: > On Sat, Jun 29, 2019 at 5:39 PM Aneesh Kumar K.V > wrote: >> >> Allocation from altmap area can fail based on vmemmap page size used. Add >> kernel >> info message to indicate the failure. That allows the user to identify >> whether they >> are really using persistent memory reserved space for per-page metadata. >> >> The message looks like: >> [ 136.587212] altmap block allocation failed, falling back to system memory >> >> Signed-off-by: Aneesh Kumar K.V >> --- >> arch/powerpc/mm/init_64.c | 6 +- >> 1 file changed, 5 insertions(+), 1 deletion(-) >> >> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c >> index a4e17a979e45..57c0573650dc 100644 >> --- a/arch/powerpc/mm/init_64.c >> +++ b/arch/powerpc/mm/init_64.c >> @@ -194,8 +194,12 @@ int __meminit vmemmap_populate(unsigned long start, >> unsigned long end, int node, >> * fail due to alignment issues when using 16MB hugepages, so >> * fall back to system memory if the altmap allocation fail. >> */ >> - if (altmap) >> + if (altmap) { >> p = altmap_alloc_block_buf(page_size, altmap); >> + if (!p) > >> + pr_info("altmap block allocation failed, " \ >> + "falling back to system memory"); > > I think this is kind of misleading. If you're mapping a large amount > of memory you can have most of the vmemmap backing allocated from the > altmap and one extra block allocated from normal memory. E.g. If you > have 32MB of altmap space, one 16MB block will be allocated from the > altmap, but the 2nd 16MB block is probably unusable due to the > reserved pages at the start of the altmap. Maybe this should be a > pr_debug() so it's only printed along with the "vmemmap_populate ..." > message above? Will switch to pr_debug. What I really wanted was an indication of which pfn device failed to allocate per page meata data in the device. But we really don't have device details here and we don't end up calling this function if there is already a 16MB mapping in DRAM for this area. 
> > Also, isn't kernel style to keep printf()s, even long ones, on one line? I was not sure. It does print to the kernel log in one line. -aneesh
Re: Re: [PATCH 1/3] arm64: mm: Add p?d_large() definitions
On Mon, Jul 01, 2019 at 11:03:51AM +0100, Steven Price wrote: > On 01/07/2019 10:27, Will Deacon wrote: > > On Sun, Jun 23, 2019 at 07:44:44PM +1000, Nicholas Piggin wrote: > >> walk_page_range() is going to be allowed to walk page tables other than > >> those of user space. For this it needs to know when it has reached a > >> 'leaf' entry in the page tables. This information will be provided by the > >> p?d_large() functions/macros. > > > > I can't remember whether or not I asked this before, but why not call > > this macro p?d_leaf() if that's what it's identifying? "Large" and "huge" > > are usually synonymous, so I find this naming needlessly confusing based > > on this patch in isolation. > > You replied to my posting of this patch before[1], to which you said: > > > I've have thought p?d_leaf() might match better with your description > > above, but I'm not going to quibble on naming. That explains the sense of deja vu. > Have you changed your mind about quibbling? ;) Ha, I suppose I have! If it's not loads of effort to use p?d_leaf() instead of p?d_large, then I'd certainly prefer that. Will
Re: [PATCH 1/3] arm64: mm: Add p?d_large() definitions
Hi Nick, On Sun, Jun 23, 2019 at 07:44:44PM +1000, Nicholas Piggin wrote: > walk_page_range() is going to be allowed to walk page tables other than > those of user space. For this it needs to know when it has reached a > 'leaf' entry in the page tables. This information will be provided by the > p?d_large() functions/macros. I can't remember whether or not I asked this before, but why not call this macro p?d_leaf() if that's what it's identifying? "Large" and "huge" are usually synonymous, so I find this naming needlessly confusing based on this patch in isolation. Will
Re: [PATCH v3 10/11] mm/memory_hotplug: Make unregister_memory_block_under_nodes() never fail
On Mon 01-07-19 11:36:44, Oscar Salvador wrote: > On Mon, Jul 01, 2019 at 10:51:44AM +0200, Michal Hocko wrote: > > Yeah, we do not allow to offline multi zone (node) ranges so the current > > code seems to be over engineered. > > > > Anyway, I am wondering why do we have to strictly check for already > > removed nodes links. Is the sysfs code going to complain we we try to > > remove again? > > No, sysfs will silently "fail" if the symlink has already been removed. > At least that is what I saw last time I played with it. > > I guess the question is what if sysfs handling changes in the future > and starts dropping warnings when trying to remove a symlink is not there. > Maybe that is unlikely to happen? And maybe we handle it then rather than have a static allocation that everybody with hotremove configured has to pay for. -- Michal Hocko SUSE Labs
Re: Re: [PATCH 1/3] arm64: mm: Add p?d_large() definitions
On 01/07/2019 10:27, Will Deacon wrote: > Hi Nick, > > On Sun, Jun 23, 2019 at 07:44:44PM +1000, Nicholas Piggin wrote: >> walk_page_range() is going to be allowed to walk page tables other than >> those of user space. For this it needs to know when it has reached a >> 'leaf' entry in the page tables. This information will be provided by the >> p?d_large() functions/macros. > > I can't remember whether or not I asked this before, but why not call > this macro p?d_leaf() if that's what it's identifying? "Large" and "huge" > are usually synonymous, so I find this naming needlessly confusing based > on this patch in isolation. Hi Will, You replied to my posting of this patch before[1], to which you said: > I've have thought p?d_leaf() might match better with your description > above, but I'm not going to quibble on naming. Have you changed your mind about quibbling? ;) Steve [1] https://lore.kernel.org/lkml/20190611153650.gb4...@fuggles.cambridge.arm.com/
Re: [PATCH v2 3/3] mm/vmalloc: fix vmalloc_to_page for huge vmap mappings
On 07/01/2019 12:10 PM, Nicholas Piggin wrote: > vmalloc_to_page returns NULL for addresses mapped by larger pages[*]. > Whether or not a vmap is huge depends on the architecture details, > alignments, boot options, etc., which the caller can not be expected > to know. Therefore HUGE_VMAP is a regression for vmalloc_to_page. > > This change teaches vmalloc_to_page about larger pages, and returns > the struct page that corresponds to the offset within the large page. > This makes the API agnostic to mapping implementation details. > > [*] As explained by commit 029c54b095995 ("mm/vmalloc.c: huge-vmap: > fail gracefully on unexpected huge vmap mappings") > > Signed-off-by: Nicholas Piggin Reviewed-by: Anshuman Khandual
Re: [PATCH v2 1/3] arm64: mm: Add p?d_large() definitions
On 01/07/2019 07:40, Nicholas Piggin wrote: > walk_page_range() is going to be allowed to walk page tables other than > those of user space. For this it needs to know when it has reached a > 'leaf' entry in the page tables. This information will be provided by the > p?d_large() functions/macros. > > For arm64, we already have p?d_sect() macros which we can reuse for > p?d_large(). > > pud_sect() is defined as a dummy function when CONFIG_PGTABLE_LEVELS < 3 > or CONFIG_ARM64_64K_PAGES is defined. However when the kernel is > configured this way then architecturally it isn't allowed to have a > large page that this level, and any code using these page walking macros > is implicitly relying on the page size/number of levels being the same as > the kernel. So it is safe to reuse this for p?d_large() as it is an > architectural restriction. > > Cc: Catalin Marinas > Cc: Will Deacon > Signed-off-by: Steven Price Hi Nicholas, This appears to be my patch which I originally posted as part of converting x86/arm64 to use a generic page walk code[1]. I'm not sure that this patch makes much sense on its own, in particular it was working up to having a generic macro[2] which means the _large() macros could be used across all architectures. Also as a matter of courtesy please can you ensure the authorship information is preserved when posting other people's patches (there should be a From: line with my name on). You should also include your own Signed-off-by: line (see submitting-patches[3]) after mine. Apologies to anyone that has been following my patch series, I've been on holiday so not actively working on it. My aim is to combine the series with a generic ptdump implementation[4] which should improve the diff state. I should be able to post that in the next few weeks. 
Thanks, Steve [1] Series: https://patchwork.kernel.org/cover/10883885/ Last posting of this patch: https://patchwork.kernel.org/patch/10883899/ [2] https://patchwork.kernel.org/patch/10883965/ [3] https://www.kernel.org/doc/html/latest/process/submitting-patches.html#developer-s-certificate-of-origin-1-1 [4] RFC version here: https://lore.kernel.org/lkml/20190417143423.26665-1-steven.pr...@arm.com/ > --- > arch/arm64/include/asm/pgtable.h | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/arch/arm64/include/asm/pgtable.h > b/arch/arm64/include/asm/pgtable.h > index fca26759081a..0e973201bc16 100644 > --- a/arch/arm64/include/asm/pgtable.h > +++ b/arch/arm64/include/asm/pgtable.h > @@ -417,6 +417,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, > unsigned long pfn, >PMD_TYPE_TABLE) > #define pmd_sect(pmd)((pmd_val(pmd) & PMD_TYPE_MASK) == \ >PMD_TYPE_SECT) > +#define pmd_large(pmd) pmd_sect(pmd) > > #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 > #define pud_sect(pud)(0) > @@ -499,6 +500,7 @@ static inline void pte_unmap(pte_t *pte) { } > #define pud_none(pud)(!pud_val(pud)) > #define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) > #define pud_present(pud) pte_present(pud_pte(pud)) > +#define pud_large(pud) pud_sect(pud) > #define pud_valid(pud) pte_valid(pud_pte(pud)) > > static inline void set_pud(pud_t *pudp, pud_t pud) >
Re: [PATCH v2 1/3] arm64: mm: Add p?d_large() definitions
On Mon, Jul 01, 2019 at 04:40:24PM +1000, Nicholas Piggin wrote: > walk_page_range() is going to be allowed to walk page tables other than > those of user space. For this it needs to know when it has reached a > 'leaf' entry in the page tables. This information will be provided by the > p?d_large() functions/macros. > > For arm64, we already have p?d_sect() macros which we can reuse for > p?d_large(). > > pud_sect() is defined as a dummy function when CONFIG_PGTABLE_LEVELS < 3 > or CONFIG_ARM64_64K_PAGES is defined. However when the kernel is > configured this way then architecturally it isn't allowed to have a > large page at this level, and any code using these page walking macros > is implicitly relying on the page size/number of levels being the same as > the kernel. So it is safe to reuse this for p?d_large() as it is an > architectural restriction. > > Cc: Catalin Marinas > Cc: Will Deacon > Signed-off-by: Steven Price Acked-by: Catalin Marinas
Re: [PATCH v3 10/11] mm/memory_hotplug: Make unregister_memory_block_under_nodes() never fail
On Mon, Jul 01, 2019 at 10:51:44AM +0200, Michal Hocko wrote: > Yeah, we do not allow to offline multi zone (node) ranges so the current > code seems to be over engineered. > > Anyway, I am wondering why do we have to strictly check for already > removed nodes links. Is the sysfs code going to complain when we try to > remove again? No, sysfs will silently "fail" if the symlink has already been removed. At least that is what I saw last time I played with it. I guess the question is what if sysfs handling changes in the future and starts dropping warnings when trying to remove a symlink that is not there. Maybe that is unlikely to happen? -- Oscar Salvador SUSE L3
Re: [PATCH v3 11/11] mm/memory_hotplug: Remove "zone" parameter from sparse_remove_one_section
On Mon 27-05-19 13:11:52, David Hildenbrand wrote: > The parameter is unused, so let's drop it. Memory removal paths should > never care about zones. This is the job of memory offlining and will > require more refactorings. > > Reviewed-by: Dan Williams > Signed-off-by: David Hildenbrand Acked-by: Michal Hocko > --- > include/linux/memory_hotplug.h | 2 +- > mm/memory_hotplug.c| 2 +- > mm/sparse.c| 4 ++-- > 3 files changed, 4 insertions(+), 4 deletions(-) > > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h > index 2f1f87e13baa..1a4257c5f74c 100644 > --- a/include/linux/memory_hotplug.h > +++ b/include/linux/memory_hotplug.h > @@ -346,7 +346,7 @@ extern void move_pfn_range_to_zone(struct zone *zone, > unsigned long start_pfn, > extern bool is_memblock_offlined(struct memory_block *mem); > extern int sparse_add_one_section(int nid, unsigned long start_pfn, > struct vmem_altmap *altmap); > -extern void sparse_remove_one_section(struct zone *zone, struct mem_section > *ms, > +extern void sparse_remove_one_section(struct mem_section *ms, > unsigned long map_offset, struct vmem_altmap *altmap); > extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, > unsigned long pnum); > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c > index 82136c5b4c5f..e48ec7b9dee2 100644 > --- a/mm/memory_hotplug.c > +++ b/mm/memory_hotplug.c > @@ -524,7 +524,7 @@ static void __remove_section(struct zone *zone, struct > mem_section *ms, > start_pfn = section_nr_to_pfn((unsigned long)scn_nr); > __remove_zone(zone, start_pfn); > > - sparse_remove_one_section(zone, ms, map_offset, altmap); > + sparse_remove_one_section(ms, map_offset, altmap); > } > > /** > diff --git a/mm/sparse.c b/mm/sparse.c > index d1d5e05f5b8d..1552c855d62a 100644 > --- a/mm/sparse.c > +++ b/mm/sparse.c > @@ -800,8 +800,8 @@ static void free_section_usemap(struct page *memmap, > unsigned long *usemap, > free_map_bootmem(memmap); > } > > -void sparse_remove_one_section(struct 
zone *zone, struct mem_section *ms, > - unsigned long map_offset, struct vmem_altmap *altmap) > +void sparse_remove_one_section(struct mem_section *ms, unsigned long > map_offset, > +struct vmem_altmap *altmap) > { > struct page *memmap = NULL; > unsigned long *usemap = NULL; > -- > 2.20.1 -- Michal Hocko SUSE Labs
Re: [PATCH] powerpc: remove device_to_mask
Alexey Kardashevskiy writes: > On 29/06/2019 18:03, Christoph Hellwig wrote: >> Use the dma_get_mask helper from dma-mapping.h instead. >> >> Signed-off-by: Christoph Hellwig > > Reviewed-by: Alexey Kardashevskiy I'll add to the change log "because they are functionally identical." cheers
Re: [PATCH v3 10/11] mm/memory_hotplug: Make unregister_memory_block_under_nodes() never fail
On Mon 27-05-19 13:11:51, David Hildenbrand wrote: > We really don't want anything during memory hotunplug to fail. > We always pass a valid memory block device, that check can go. Avoid > allocating memory and eventually failing. As we are always called under > lock, we can use a static piece of memory. This avoids having to put > the structure onto the stack, having to guess about the stack size > of callers. > > Patch inspired by a patch from Oscar Salvador. > > In the future, there might be no need to iterate over nodes at all. > mem->nid should tell us exactly what to remove. Memory block devices > with mixed nodes (added during boot) should properly fenced off and never > removed. Yeah, we do not allow to offline multi zone (node) ranges so the current code seems to be over engineered. Anyway, I am wondering why do we have to strictly check for already removed nodes links. Is the sysfs code going to complain we we try to remove again? > Cc: Greg Kroah-Hartman > Cc: "Rafael J. Wysocki" > Cc: Alex Deucher > Cc: "David S. Miller" > Cc: Mark Brown > Cc: Chris Wilson > Cc: David Hildenbrand > Cc: Oscar Salvador > Cc: Andrew Morton > Cc: Jonathan Cameron > Signed-off-by: David Hildenbrand Anyway Acked-by: Michal Hocko > --- > drivers/base/node.c | 18 +- > include/linux/node.h | 5 ++--- > 2 files changed, 7 insertions(+), 16 deletions(-) > > diff --git a/drivers/base/node.c b/drivers/base/node.c > index 04fdfa99b8bc..9be88fd05147 100644 > --- a/drivers/base/node.c > +++ b/drivers/base/node.c > @@ -803,20 +803,14 @@ int register_mem_sect_under_node(struct memory_block > *mem_blk, void *arg) > > /* > * Unregister memory block device under all nodes that it spans. > + * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes). 
> */ > -int unregister_memory_block_under_nodes(struct memory_block *mem_blk) > +void unregister_memory_block_under_nodes(struct memory_block *mem_blk) > { > - NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL); > unsigned long pfn, sect_start_pfn, sect_end_pfn; > + static nodemask_t unlinked_nodes; > > - if (!mem_blk) { > - NODEMASK_FREE(unlinked_nodes); > - return -EFAULT; > - } > - if (!unlinked_nodes) > - return -ENOMEM; > - nodes_clear(*unlinked_nodes); > - > + nodes_clear(unlinked_nodes); > sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); > sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); > for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { > @@ -827,15 +821,13 @@ int unregister_memory_block_under_nodes(struct > memory_block *mem_blk) > continue; > if (!node_online(nid)) > continue; > - if (node_test_and_set(nid, *unlinked_nodes)) > + if (node_test_and_set(nid, unlinked_nodes)) > continue; > sysfs_remove_link(_devices[nid]->dev.kobj, >kobject_name(_blk->dev.kobj)); > sysfs_remove_link(_blk->dev.kobj, >kobject_name(_devices[nid]->dev.kobj)); > } > - NODEMASK_FREE(unlinked_nodes); > - return 0; > } > > int link_mem_sections(int nid, unsigned long start_pfn, unsigned long > end_pfn) > diff --git a/include/linux/node.h b/include/linux/node.h > index 02a29e71b175..548c226966a2 100644 > --- a/include/linux/node.h > +++ b/include/linux/node.h > @@ -139,7 +139,7 @@ extern int register_cpu_under_node(unsigned int cpu, > unsigned int nid); > extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); > extern int register_mem_sect_under_node(struct memory_block *mem_blk, > void *arg); > -extern int unregister_memory_block_under_nodes(struct memory_block *mem_blk); > +extern void unregister_memory_block_under_nodes(struct memory_block > *mem_blk); > > extern int register_memory_node_under_compute_node(unsigned int mem_nid, > unsigned int cpu_nid, > @@ -175,9 +175,8 @@ static inline int register_mem_sect_under_node(struct > 
memory_block *mem_blk, > { > return 0; > } > -static inline int unregister_memory_block_under_nodes(struct memory_block > *mem_blk) > +static inline void unregister_memory_block_under_nodes(struct memory_block > *mem_blk) > { > - return 0; > } > > static inline void register_hugetlbfs_with_node(node_registration_func_t reg, > -- > 2.20.1 -- Michal Hocko SUSE Labs
Re: [PATCH v2 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
Steven Rostedt wrote: On Thu, 27 Jun 2019 20:58:20 +0530 "Naveen N. Rao" wrote: > But interesting, I don't see a synchronize_rcu_tasks() call > there. We felt we don't need it in this case. We patch the branch to ftrace with a nop first. Other cpus should see that first. But, now that I think about it, should we add a memory barrier to ensure the writes get ordered properly? Do you send an ipi to the other CPUs. I would just to be safe. We are handling this through ftrace_replace_code() and __ftrace_make_call_prep() below. For FTRACE_UPDATE_MAKE_CALL, we patch in the mflr, followed by smp_call_function(isync) and synchronize_rcu_tasks() before we proceed to patch the branch to ftrace. I don't see any other scenario where we end up in __ftrace_make_nop_kernel() without going through ftrace_replace_code(). For kernel modules, this can happen during module load/init and so, I patch out both instructions in __ftrace_make_call() above without any synchronization. Am I missing anything? No, I think I got confused ;-), it's the patch out that I was worried about, but when I was going through the scenario, I somehow turned it into the patching in (which I already audited :-p). I was going to reply with just the top part of this email, but then the confusion started :-/ OK, yes, patching out should be fine, and you already covered the patching in. Sorry for the noise. Just to confirm and totally remove the confusion, the patch does: : mflrr0 <-- preempt here bl _mcount : mflrr0 nop And this is fine regardless. OK, Reviewed-by: Steven Rostedt (VMware) Thanks for confirming! We do need an IPI to be sure, as you pointed out above. I will have the patching out take the same path to simplify things. - Naveen
Re: [PATCH v3 09/11] mm/memory_hotplug: Remove memory block devices before arch_remove_memory()
On Mon 27-05-19 13:11:50, David Hildenbrand wrote: > Let's factor out removing of memory block devices, which is only > necessary for memory added via add_memory() and friends that created > memory block devices. Remove the devices before calling > arch_remove_memory(). > > This finishes factoring out memory block device handling from > arch_add_memory() and arch_remove_memory(). OK, this makes sense again. Just a nit. Calling find_memory_block_by_id for each memory block looks a bit suboptimal, especially when we are removing consequent physical memblocks. I have to confess that I do not know how expensive is the search and I also expect that there won't be that many memblocks in the removed range anyway as large setups have large memblocks. > Cc: Greg Kroah-Hartman > Cc: "Rafael J. Wysocki" > Cc: David Hildenbrand > Cc: "mike.tra...@hpe.com" > Cc: Andrew Morton > Cc: Andrew Banman > Cc: Ingo Molnar > Cc: Alex Deucher > Cc: "David S. Miller" > Cc: Mark Brown > Cc: Chris Wilson > Cc: Oscar Salvador > Cc: Jonathan Cameron > Cc: Michal Hocko > Cc: Pavel Tatashin > Cc: Arun KS > Cc: Mathieu Malaterre > Reviewed-by: Dan Williams > Signed-off-by: David Hildenbrand Other than that looks good to me. Acked-by: Michal Hocko > --- > drivers/base/memory.c | 37 ++--- > drivers/base/node.c| 11 ++- > include/linux/memory.h | 2 +- > include/linux/node.h | 6 ++ > mm/memory_hotplug.c| 5 +++-- > 5 files changed, 30 insertions(+), 31 deletions(-) > > diff --git a/drivers/base/memory.c b/drivers/base/memory.c > index 5a0370f0c506..f28efb0bf5c7 100644 > --- a/drivers/base/memory.c > +++ b/drivers/base/memory.c > @@ -763,32 +763,31 @@ int create_memory_block_devices(unsigned long start, > unsigned long size) > return ret; > } > > -void unregister_memory_section(struct mem_section *section) > +/* > + * Remove memory block devices for the given memory area. Start and size > + * have to be aligned to memory block granularity. Memory block devices > + * have to be offline. 
> + */ > +void remove_memory_block_devices(unsigned long start, unsigned long size) > { > + const int start_block_id = pfn_to_block_id(PFN_DOWN(start)); > + const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); > struct memory_block *mem; > + int block_id; > > - if (WARN_ON_ONCE(!present_section(section))) > + if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) || > + !IS_ALIGNED(size, memory_block_size_bytes( > return; > > mutex_lock(_sysfs_mutex); > - > - /* > - * Some users of the memory hotplug do not want/need memblock to > - * track all sections. Skip over those. > - */ > - mem = find_memory_block(section); > - if (!mem) > - goto out_unlock; > - > - unregister_mem_sect_under_nodes(mem, __section_nr(section)); > - > - mem->section_count--; > - if (mem->section_count == 0) > + for (block_id = start_block_id; block_id != end_block_id; block_id++) { > + mem = find_memory_block_by_id(block_id, NULL); > + if (WARN_ON_ONCE(!mem)) > + continue; > + mem->section_count = 0; > + unregister_memory_block_under_nodes(mem); > unregister_memory(mem); > - else > - put_device(>dev); > - > -out_unlock: > + } > mutex_unlock(_sysfs_mutex); > } > > diff --git a/drivers/base/node.c b/drivers/base/node.c > index 8598fcbd2a17..04fdfa99b8bc 100644 > --- a/drivers/base/node.c > +++ b/drivers/base/node.c > @@ -801,9 +801,10 @@ int register_mem_sect_under_node(struct memory_block > *mem_blk, void *arg) > return 0; > } > > -/* unregister memory section under all nodes that it spans */ > -int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, > - unsigned long phys_index) > +/* > + * Unregister memory block device under all nodes that it spans. 
> + */ > +int unregister_memory_block_under_nodes(struct memory_block *mem_blk) > { > NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL); > unsigned long pfn, sect_start_pfn, sect_end_pfn; > @@ -816,8 +817,8 @@ int unregister_mem_sect_under_nodes(struct memory_block > *mem_blk, > return -ENOMEM; > nodes_clear(*unlinked_nodes); > > - sect_start_pfn = section_nr_to_pfn(phys_index); > - sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1; > + sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); > + sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); > for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { > int nid; > > diff --git a/include/linux/memory.h b/include/linux/memory.h > index db3e8567f900..f26a5417ec5d 100644 > --- a/include/linux/memory.h > +++ b/include/linux/memory.h > @@ -112,7 +112,7 @@ extern void
Re: [PATCH v3 08/11] mm/memory_hotplug: Drop MHP_MEMBLOCK_API
On Mon 27-05-19 13:11:49, David Hildenbrand wrote: > No longer needed, the callers of arch_add_memory() can handle this > manually. > > Cc: Andrew Morton > Cc: David Hildenbrand > Cc: Michal Hocko > Cc: Oscar Salvador > Cc: Pavel Tatashin > Cc: Wei Yang > Cc: Joonsoo Kim > Cc: Qian Cai > Cc: Arun KS > Cc: Mathieu Malaterre > Signed-off-by: David Hildenbrand Acked-by: Michal Hocko > --- > include/linux/memory_hotplug.h | 8 > mm/memory_hotplug.c| 9 +++-- > 2 files changed, 3 insertions(+), 14 deletions(-) > > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h > index 2d4de313926d..2f1f87e13baa 100644 > --- a/include/linux/memory_hotplug.h > +++ b/include/linux/memory_hotplug.h > @@ -128,14 +128,6 @@ extern void arch_remove_memory(int nid, u64 start, u64 > size, > extern void __remove_pages(struct zone *zone, unsigned long start_pfn, > unsigned long nr_pages, struct vmem_altmap *altmap); > > -/* > - * Do we want sysfs memblock files created. This will allow userspace to > online > - * and offline memory explicitly. Lack of this bit means that the caller has > to > - * call move_pfn_range_to_zone to finish the initialization. 
> - */ > - > -#define MHP_MEMBLOCK_API (1<<0) > - > /* reasonably generic interface to expand the physical pages */ > extern int __add_pages(int nid, unsigned long start_pfn, unsigned long > nr_pages, > struct mhp_restrictions *restrictions); > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c > index b1fde90bbf19..9a92549ef23b 100644 > --- a/mm/memory_hotplug.c > +++ b/mm/memory_hotplug.c > @@ -251,7 +251,7 @@ void __init register_page_bootmem_info_node(struct > pglist_data *pgdat) > #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ > > static int __meminit __add_section(int nid, unsigned long phys_start_pfn, > - struct vmem_altmap *altmap, bool want_memblock) > +struct vmem_altmap *altmap) > { > int ret; > > @@ -294,8 +294,7 @@ int __ref __add_pages(int nid, unsigned long > phys_start_pfn, > } > > for (i = start_sec; i <= end_sec; i++) { > - err = __add_section(nid, section_nr_to_pfn(i), altmap, > - restrictions->flags & MHP_MEMBLOCK_API); > + err = __add_section(nid, section_nr_to_pfn(i), altmap); > > /* >* EEXIST is finally dealt with by ioresource collision > @@ -1067,9 +1066,7 @@ static int online_memory_block(struct memory_block > *mem, void *arg) > */ > int __ref add_memory_resource(int nid, struct resource *res) > { > - struct mhp_restrictions restrictions = { > - .flags = MHP_MEMBLOCK_API, > - }; > + struct mhp_restrictions restrictions = {}; > u64 start, size; > bool new_node = false; > int ret; > -- > 2.20.1 > -- Michal Hocko SUSE Labs
Re: [PATCH v3 07/11] mm/memory_hotplug: Create memory block devices after arch_add_memory()
On Mon 27-05-19 13:11:48, David Hildenbrand wrote: > Only memory to be added to the buddy and to be onlined/offlined by > user space using /sys/devices/system/memory/... needs (and should have!) > memory block devices. > > Factor out creation of memory block devices. Create all devices after > arch_add_memory() succeeded. We can later drop the want_memblock parameter, > because it is now effectively stale. > > Only after memory block devices have been added, memory can be onlined > by user space. This implies, that memory is not visible to user space at > all before arch_add_memory() succeeded. I like the memblock API to go away from the low level hotplug handling. The current implementation is just too convoluted and I remember I was fighting with subtle expectations wired deep in call chains when touching that code in the past (some memblocks didn't get created etc.). Maybe those have been addressed in the meantime. > While at it > - use WARN_ON_ONCE instead of BUG_ON in moved unregister_memory() This would better be a separate patch with an explanation > - introduce find_memory_block_by_id() to search via block id > - Use find_memory_block_by_id() in init_memory_block() to catch > duplicates > > Cc: Greg Kroah-Hartman > Cc: "Rafael J. Wysocki" > Cc: David Hildenbrand > Cc: "mike.tra...@hpe.com" > Cc: Andrew Morton > Cc: Ingo Molnar > Cc: Andrew Banman > Cc: Oscar Salvador > Cc: Michal Hocko > Cc: Pavel Tatashin > Cc: Qian Cai > Cc: Wei Yang > Cc: Arun KS > Cc: Mathieu Malaterre > Signed-off-by: David Hildenbrand Other than that looks good to me. 
Acked-by: Michal Hocko > --- > drivers/base/memory.c | 82 +++--- > include/linux/memory.h | 2 +- > mm/memory_hotplug.c| 15 > 3 files changed, 63 insertions(+), 36 deletions(-) > > diff --git a/drivers/base/memory.c b/drivers/base/memory.c > index ac17c95a5f28..5a0370f0c506 100644 > --- a/drivers/base/memory.c > +++ b/drivers/base/memory.c > @@ -39,6 +39,11 @@ static inline int base_memory_block_id(int section_nr) > return section_nr / sections_per_block; > } > > +static inline int pfn_to_block_id(unsigned long pfn) > +{ > + return base_memory_block_id(pfn_to_section_nr(pfn)); > +} > + > static int memory_subsys_online(struct device *dev); > static int memory_subsys_offline(struct device *dev); > > @@ -582,10 +587,9 @@ int __weak arch_get_memory_phys_device(unsigned long > start_pfn) > * A reference for the returned object is held and the reference for the > * hinted object is released. > */ > -struct memory_block *find_memory_block_hinted(struct mem_section *section, > - struct memory_block *hint) > +static struct memory_block *find_memory_block_by_id(int block_id, > + struct memory_block *hint) > { > - int block_id = base_memory_block_id(__section_nr(section)); > struct device *hintdev = hint ? >dev : NULL; > struct device *dev; > > @@ -597,6 +601,14 @@ struct memory_block *find_memory_block_hinted(struct > mem_section *section, > return to_memory_block(dev); > } > > +struct memory_block *find_memory_block_hinted(struct mem_section *section, > + struct memory_block *hint) > +{ > + int block_id = base_memory_block_id(__section_nr(section)); > + > + return find_memory_block_by_id(block_id, hint); > +} > + > /* > * For now, we have a linear search to go find the appropriate > * memory_block corresponding to a particular phys_index. 
If > @@ -658,6 +670,11 @@ static int init_memory_block(struct memory_block > **memory, int block_id, > unsigned long start_pfn; > int ret = 0; > > + mem = find_memory_block_by_id(block_id, NULL); > + if (mem) { > + put_device(>dev); > + return -EEXIST; > + } > mem = kzalloc(sizeof(*mem), GFP_KERNEL); > if (!mem) > return -ENOMEM; > @@ -699,44 +716,53 @@ static int add_memory_block(int base_section_nr) > return 0; > } > > +static void unregister_memory(struct memory_block *memory) > +{ > + if (WARN_ON_ONCE(memory->dev.bus != _subsys)) > + return; > + > + /* drop the ref. we got via find_memory_block() */ > + put_device(>dev); > + device_unregister(>dev); > +} > + > /* > - * need an interface for the VM to add new memory regions, > - * but without onlining it. > + * Create memory block devices for the given memory area. Start and size > + * have to be aligned to memory block granularity. Memory block devices > + * will be initialized as offline. > */ > -int hotplug_memory_register(int nid, struct mem_section *section) > +int create_memory_block_devices(unsigned long start, unsigned long size) > { > - int block_id = base_memory_block_id(__section_nr(section)); > - int ret = 0; > + const int start_block_id =
Re: [PATCH v3 06/11] mm/memory_hotplug: Allow arch_remove_pages() without CONFIG_MEMORY_HOTREMOVE
On Mon 27-05-19 13:11:47, David Hildenbrand wrote: > We want to improve error handling while adding memory by allowing > to use arch_remove_memory() and __remove_pages() even if > CONFIG_MEMORY_HOTREMOVE is not set to e.g., implement something like: > > arch_add_memory() > rc = do_something(); > if (rc) { > arch_remove_memory(); > } > > We won't get rid of CONFIG_MEMORY_HOTREMOVE for now, as it will require > quite some dependencies for memory offlining. If we cannot really remove CONFIG_MEMORY_HOTREMOVE altogether then why not simply add an empty placeholder for arch_remove_memory when the config is disabled? > Cc: Tony Luck > Cc: Fenghua Yu > Cc: Benjamin Herrenschmidt > Cc: Paul Mackerras > Cc: Michael Ellerman > Cc: Martin Schwidefsky > Cc: Heiko Carstens > Cc: Yoshinori Sato > Cc: Rich Felker > Cc: Dave Hansen > Cc: Andy Lutomirski > Cc: Peter Zijlstra > Cc: Thomas Gleixner > Cc: Ingo Molnar > Cc: Borislav Petkov > Cc: "H. Peter Anvin" > Cc: Greg Kroah-Hartman > Cc: "Rafael J. Wysocki" > Cc: Andrew Morton > Cc: Michal Hocko > Cc: Mike Rapoport > Cc: David Hildenbrand > Cc: Oscar Salvador > Cc: "Kirill A. Shutemov" > Cc: Alex Deucher > Cc: "David S. 
Miller" > Cc: Mark Brown > Cc: Chris Wilson > Cc: Christophe Leroy > Cc: Nicholas Piggin > Cc: Vasily Gorbik > Cc: Rob Herring > Cc: Masahiro Yamada > Cc: "mike.tra...@hpe.com" > Cc: Andrew Banman > Cc: Pavel Tatashin > Cc: Wei Yang > Cc: Arun KS > Cc: Qian Cai > Cc: Mathieu Malaterre > Cc: Baoquan He > Cc: Logan Gunthorpe > Cc: Anshuman Khandual > Signed-off-by: David Hildenbrand > --- > arch/arm64/mm/mmu.c| 2 -- > arch/ia64/mm/init.c| 2 -- > arch/powerpc/mm/mem.c | 2 -- > arch/s390/mm/init.c| 2 -- > arch/sh/mm/init.c | 2 -- > arch/x86/mm/init_32.c | 2 -- > arch/x86/mm/init_64.c | 2 -- > drivers/base/memory.c | 2 -- > include/linux/memory.h | 2 -- > include/linux/memory_hotplug.h | 2 -- > mm/memory_hotplug.c| 2 -- > mm/sparse.c| 6 -- > 12 files changed, 28 deletions(-) > > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c > index e569a543c384..9ccd7539f2d4 100644 > --- a/arch/arm64/mm/mmu.c > +++ b/arch/arm64/mm/mmu.c > @@ -1084,7 +1084,6 @@ int arch_add_memory(int nid, u64 start, u64 size, > return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, > restrictions); > } > -#ifdef CONFIG_MEMORY_HOTREMOVE > void arch_remove_memory(int nid, u64 start, u64 size, > struct vmem_altmap *altmap) > { > @@ -1103,4 +1102,3 @@ void arch_remove_memory(int nid, u64 start, u64 size, > __remove_pages(zone, start_pfn, nr_pages, altmap); > } > #endif > -#endif > diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c > index d28e29103bdb..aae75fd7b810 100644 > --- a/arch/ia64/mm/init.c > +++ b/arch/ia64/mm/init.c > @@ -681,7 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size, > return ret; > } > > -#ifdef CONFIG_MEMORY_HOTREMOVE > void arch_remove_memory(int nid, u64 start, u64 size, > struct vmem_altmap *altmap) > { > @@ -693,4 +692,3 @@ void arch_remove_memory(int nid, u64 start, u64 size, > __remove_pages(zone, start_pfn, nr_pages, altmap); > } > #endif > -#endif > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c > index e885fe2aafcc..e4bc2dc3f593 
100644 > --- a/arch/powerpc/mm/mem.c > +++ b/arch/powerpc/mm/mem.c > @@ -130,7 +130,6 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, > return __add_pages(nid, start_pfn, nr_pages, restrictions); > } > > -#ifdef CONFIG_MEMORY_HOTREMOVE > void __ref arch_remove_memory(int nid, u64 start, u64 size, >struct vmem_altmap *altmap) > { > @@ -164,7 +163,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 > size, > pr_warn("Hash collision while resizing HPT\n"); > } > #endif > -#endif /* CONFIG_MEMORY_HOTPLUG */ > > #ifndef CONFIG_NEED_MULTIPLE_NODES > void __init mem_topology_setup(void) > diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c > index 14955e0a9fcf..ffb81fe95c77 100644 > --- a/arch/s390/mm/init.c > +++ b/arch/s390/mm/init.c > @@ -239,7 +239,6 @@ int arch_add_memory(int nid, u64 start, u64 size, > return rc; > } > > -#ifdef CONFIG_MEMORY_HOTREMOVE > void arch_remove_memory(int nid, u64 start, u64 size, > struct vmem_altmap *altmap) > { > @@ -251,5 +250,4 @@ void arch_remove_memory(int nid, u64 start, u64 size, > __remove_pages(zone, start_pfn, nr_pages, altmap); > vmem_remove_mapping(start, size); > } > -#endif > #endif /* CONFIG_MEMORY_HOTPLUG */ > diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c > index 13c6a6bb5fd9..dfdbaa50946e 100644 > --- a/arch/sh/mm/init.c > +++ b/arch/sh/mm/init.c > @@ -429,7 +429,6 @@ int
Re: [PATCH v3 05/11] drivers/base/memory: Pass a block_id to init_memory_block()
On Mon 27-05-19 13:11:46, David Hildenbrand wrote: > We'll rework hotplug_memory_register() shortly, so it no longer consumes > pass a section. > > Cc: Greg Kroah-Hartman > Cc: "Rafael J. Wysocki" > Signed-off-by: David Hildenbrand Acked-by: Michal Hocko > --- > drivers/base/memory.c | 15 +++ > 1 file changed, 7 insertions(+), 8 deletions(-) > > diff --git a/drivers/base/memory.c b/drivers/base/memory.c > index f180427e48f4..f914fa6fe350 100644 > --- a/drivers/base/memory.c > +++ b/drivers/base/memory.c > @@ -651,21 +651,18 @@ int register_memory(struct memory_block *memory) > return ret; > } > > -static int init_memory_block(struct memory_block **memory, > - struct mem_section *section, unsigned long state) > +static int init_memory_block(struct memory_block **memory, int block_id, > + unsigned long state) > { > struct memory_block *mem; > unsigned long start_pfn; > - int scn_nr; > int ret = 0; > > mem = kzalloc(sizeof(*mem), GFP_KERNEL); > if (!mem) > return -ENOMEM; > > - scn_nr = __section_nr(section); > - mem->start_section_nr = > - base_memory_block_id(scn_nr) * sections_per_block; > + mem->start_section_nr = block_id * sections_per_block; > mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; > mem->state = state; > start_pfn = section_nr_to_pfn(mem->start_section_nr); > @@ -694,7 +691,8 @@ static int add_memory_block(int base_section_nr) > > if (section_count == 0) > return 0; > - ret = init_memory_block(, __nr_to_section(section_nr), MEM_ONLINE); > + ret = init_memory_block(, base_memory_block_id(base_section_nr), > + MEM_ONLINE); > if (ret) > return ret; > mem->section_count = section_count; > @@ -707,6 +705,7 @@ static int add_memory_block(int base_section_nr) > */ > int hotplug_memory_register(int nid, struct mem_section *section) > { > + int block_id = base_memory_block_id(__section_nr(section)); > int ret = 0; > struct memory_block *mem; > > @@ -717,7 +716,7 @@ int hotplug_memory_register(int nid, struct mem_section > *section) > 
mem->section_count++; > put_device(>dev); > } else { > - ret = init_memory_block(, section, MEM_OFFLINE); > + ret = init_memory_block(, block_id, MEM_OFFLINE); > if (ret) > goto out; > mem->section_count++; > -- > 2.20.1 -- Michal Hocko SUSE Labs
Re: [PATCH v3 03/11] s390x/mm: Implement arch_remove_memory()
On Mon 27-05-19 13:11:44, David Hildenbrand wrote: > Will come in handy when wanting to handle errors after > arch_add_memory(). I do not understand this. Why do you add a code for something that is not possible on this HW (based on the comment - is it still valid btw?) > Cc: Martin Schwidefsky > Cc: Heiko Carstens > Cc: Andrew Morton > Cc: Michal Hocko > Cc: Mike Rapoport > Cc: David Hildenbrand > Cc: Vasily Gorbik > Cc: Oscar Salvador > Signed-off-by: David Hildenbrand > --- > arch/s390/mm/init.c | 13 +++-- > 1 file changed, 7 insertions(+), 6 deletions(-) > > diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c > index d552e330fbcc..14955e0a9fcf 100644 > --- a/arch/s390/mm/init.c > +++ b/arch/s390/mm/init.c > @@ -243,12 +243,13 @@ int arch_add_memory(int nid, u64 start, u64 size, > void arch_remove_memory(int nid, u64 start, u64 size, > struct vmem_altmap *altmap) > { > - /* > - * There is no hardware or firmware interface which could trigger a > - * hot memory remove on s390. So there is nothing that needs to be > - * implemented. > - */ > - BUG(); > + unsigned long start_pfn = start >> PAGE_SHIFT; > + unsigned long nr_pages = size >> PAGE_SHIFT; > + struct zone *zone; > + > + zone = page_zone(pfn_to_page(start_pfn)); > + __remove_pages(zone, start_pfn, nr_pages, altmap); > + vmem_remove_mapping(start, size); > } > #endif > #endif /* CONFIG_MEMORY_HOTPLUG */ > -- > 2.20.1 > -- Michal Hocko SUSE Labs
Re: [PATCH v3 02/11] s390x/mm: Fail when an altmap is used for arch_add_memory()
On Mon 27-05-19 13:11:43, David Hildenbrand wrote: > ZONE_DEVICE is not yet supported, fail if an altmap is passed, so we > don't forget arch_add_memory()/arch_remove_memory() when unlocking > support. Why do we need this? Sure ZONE_DEVICE is not supported for s390 and so might be the case for other arches which support hotplug. I do not see much point in adding warning to each of them. > Cc: Martin Schwidefsky > Cc: Heiko Carstens > Cc: Andrew Morton > Cc: Michal Hocko > Cc: Mike Rapoport > Cc: David Hildenbrand > Cc: Vasily Gorbik > Cc: Oscar Salvador > Suggested-by: Dan Williams > Signed-off-by: David Hildenbrand > --- > arch/s390/mm/init.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c > index 14d1eae9fe43..d552e330fbcc 100644 > --- a/arch/s390/mm/init.c > +++ b/arch/s390/mm/init.c > @@ -226,6 +226,9 @@ int arch_add_memory(int nid, u64 start, u64 size, > unsigned long size_pages = PFN_DOWN(size); > int rc; > > + if (WARN_ON_ONCE(restrictions->altmap)) > + return -EINVAL; > + > rc = vmem_add_mapping(start, size); > if (rc) > return rc; > -- > 2.20.1 > -- Michal Hocko SUSE Labs
Re: [PATCH v3 01/11] mm/memory_hotplug: Simplify and fix check_hotplug_memory_range()
[Sorry for a really late response] On Mon 27-05-19 13:11:42, David Hildenbrand wrote: > By converting start and size to page granularity, we actually ignore > unaligned parts within a page instead of properly bailing out with an > error. I do not expect any code path would ever provide an unaligned address and even if it did then rounding that to a pfn doesn't sound like a terrible thing to do. Anyway this removes few lines so why not. > > Cc: Andrew Morton > Cc: Oscar Salvador > Cc: Michal Hocko > Cc: David Hildenbrand > Cc: Pavel Tatashin > Cc: Qian Cai > Cc: Wei Yang > Cc: Arun KS > Cc: Mathieu Malaterre > Reviewed-by: Dan Williams > Reviewed-by: Wei Yang > Signed-off-by: David Hildenbrand Acked-by: Michal Hocko > --- > mm/memory_hotplug.c | 11 +++ > 1 file changed, 3 insertions(+), 8 deletions(-) > > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c > index e096c987d261..762887b2358b 100644 > --- a/mm/memory_hotplug.c > +++ b/mm/memory_hotplug.c > @@ -1051,16 +1051,11 @@ int try_online_node(int nid) > > static int check_hotplug_memory_range(u64 start, u64 size) > { > - unsigned long block_sz = memory_block_size_bytes(); > - u64 block_nr_pages = block_sz >> PAGE_SHIFT; > - u64 nr_pages = size >> PAGE_SHIFT; > - u64 start_pfn = PFN_DOWN(start); > - > /* memory range must be block size aligned */ > - if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) || > - !IS_ALIGNED(nr_pages, block_nr_pages)) { > + if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) || > + !IS_ALIGNED(size, memory_block_size_bytes())) { > pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, > size %#llx", > -block_sz, start, size); > +memory_block_size_bytes(), start, size); > return -EINVAL; > } > > -- > 2.20.1 > -- Michal Hocko SUSE Labs
Re: Re: [PATCH v4 6/8] KVM: PPC: Ultravisor: Restrict LDBAR access
On Mon, Jul 01, 2019 at 04:30:55PM +1000, Alexey Kardashevskiy wrote: > > > On 01/07/2019 16:17, maddy wrote: > > > > On 01/07/19 11:24 AM, Alexey Kardashevskiy wrote: > >> > >> On 29/06/2019 06:08, Claudio Carvalho wrote: > >>> When the ultravisor firmware is available, it takes control over the > >>> LDBAR register. In this case, thread-imc updates and save/restore > >>> operations on the LDBAR register are handled by ultravisor. > >> What does LDBAR do? "Power ISA™ Version 3.0 B" or "User’s Manual POWER9 > >> Processor" do not tell. > > LDBAR is a per-thread SPR used by thread-imc pmu to dump the counter > > data into memory. > > LDBAR contains memory address along with few other configuration bits > > (it is populated > > by the thread-imc pmu driver). It is populated and enabled only when any > > of the thread > > imc pmu events are monitored. > > > I was actually looking for a spec for this register, what is the > document name? Its not a architected register. Its documented in the Power9 workbook. RP
[PATCH v2 3/3] mm/vmalloc: fix vmalloc_to_page for huge vmap mappings
vmalloc_to_page returns NULL for addresses mapped by larger pages[*]. Whether or not a vmap is huge depends on the architecture details, alignments, boot options, etc., which the caller can not be expected to know. Therefore HUGE_VMAP is a regression for vmalloc_to_page. This change teaches vmalloc_to_page about larger pages, and returns the struct page that corresponds to the offset within the large page. This makes the API agnostic to mapping implementation details. [*] As explained by commit 029c54b095995 ("mm/vmalloc.c: huge-vmap: fail gracefully on unexpected huge vmap mappings") Signed-off-by: Nicholas Piggin --- include/asm-generic/4level-fixup.h | 1 + include/asm-generic/5level-fixup.h | 1 + mm/vmalloc.c | 37 +++--- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h index e3667c9a33a5..3cc65a4dd093 100644 --- a/include/asm-generic/4level-fixup.h +++ b/include/asm-generic/4level-fixup.h @@ -20,6 +20,7 @@ #define pud_none(pud) 0 #define pud_bad(pud) 0 #define pud_present(pud) 1 +#define pud_large(pud) 0 #define pud_ERROR(pud) do { } while (0) #define pud_clear(pud) pgd_clear(pud) #define pud_val(pud) pgd_val(pud) diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index bb6cb347018c..c4377db09a4f 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -22,6 +22,7 @@ #define p4d_none(p4d) 0 #define p4d_bad(p4d) 0 #define p4d_present(p4d) 1 +#define p4d_large(p4d) 0 #define p4d_ERROR(p4d) do { } while (0) #define p4d_clear(p4d) pgd_clear(p4d) #define p4d_val(p4d) pgd_val(p4d) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f76cca32a1c..09a283866368 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -36,6 +36,7 @@ #include #include +#include #include #include @@ -284,25 +285,35 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (pgd_none(*pgd)) return NULL; + p4d = p4d_offset(pgd, addr); if 
(p4d_none(*p4d)) return NULL; - pud = pud_offset(p4d, addr); +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP + if (p4d_large(*p4d)) + return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); +#endif + if (WARN_ON_ONCE(p4d_bad(*p4d))) + return NULL; - /* -* Don't dereference bad PUD or PMD (below) entries. This will also -* identify huge mappings, which we may encounter on architectures -* that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be -* identified as vmalloc addresses by is_vmalloc_addr(), but are -* not [unambiguously] associated with a struct page, so there is -* no correct value to return for them. -*/ - WARN_ON_ONCE(pud_bad(*pud)); - if (pud_none(*pud) || pud_bad(*pud)) + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return NULL; +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP + if (pud_large(*pud)) + return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); +#endif + if (WARN_ON_ONCE(pud_bad(*pud))) return NULL; + pmd = pmd_offset(pud, addr); - WARN_ON_ONCE(pmd_bad(*pmd)); - if (pmd_none(*pmd) || pmd_bad(*pmd)) + if (pmd_none(*pmd)) + return NULL; +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP + if (pmd_large(*pmd)) + return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); +#endif + if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL; ptep = pte_offset_map(pmd, addr); -- 2.20.1
[PATCH v2 2/3] powerpc/64s: Add p?d_large definitions
The subsequent patch to fix vmalloc_to_page with huge vmap requires HUGE_VMAP archs to provide p?d_large definitions for the non-pgd page table levels they support. Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/book3s/64/pgtable.h | 24 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index ccf00a8b98c6..c19c8396a1bd 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -915,6 +915,11 @@ static inline int pud_present(pud_t pud) return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PRESENT)); } +static inline int pud_large(pud_t pud) +{ + return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); +} + extern struct page *pud_page(pud_t pud); extern struct page *pmd_page(pmd_t pmd); static inline pte_t pud_pte(pud_t pud) @@ -958,6 +963,11 @@ static inline int pgd_present(pgd_t pgd) return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT)); } +static inline int pgd_large(pgd_t pgd) +{ + return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE)); +} + static inline pte_t pgd_pte(pgd_t pgd) { return __pte_raw(pgd_raw(pgd)); @@ -1083,6 +1093,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) #define pmd_mk_savedwrite(pmd) pte_pmd(pte_mk_savedwrite(pmd_pte(pmd))) #define pmd_clear_savedwrite(pmd) pte_pmd(pte_clear_savedwrite(pmd_pte(pmd))) +static inline int pmd_large(pmd_t pmd) +{ + return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); +} + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #define pmd_soft_dirty(pmd)pte_soft_dirty(pmd_pte(pmd)) #define pmd_mksoft_dirty(pmd) pte_pmd(pte_mksoft_dirty(pmd_pte(pmd))) @@ -1151,15 +1166,6 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); } -/* - * returns true for pmd migration entries, THP, devmap, hugetlb - * But compile time dependent on THP config - */ -static inline int 
pmd_large(pmd_t pmd) -{ - return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); -} - static inline pmd_t pmd_mknotpresent(pmd_t pmd) { return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); -- 2.20.1
[PATCH v2 1/3] arm64: mm: Add p?d_large() definitions
walk_page_range() is going to be allowed to walk page tables other than those of user space. For this it needs to know when it has reached a 'leaf' entry in the page tables. This information will be provided by the p?d_large() functions/macros. For arm64, we already have p?d_sect() macros which we can reuse for p?d_large(). pud_sect() is defined as a dummy function when CONFIG_PGTABLE_LEVELS < 3 or CONFIG_ARM64_64K_PAGES is defined. However when the kernel is configured this way then architecturally it isn't allowed to have a large page that this level, and any code using these page walking macros is implicitly relying on the page size/number of levels being the same as the kernel. So it is safe to reuse this for p?d_large() as it is an architectural restriction. Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Steven Price --- arch/arm64/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index fca26759081a..0e973201bc16 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -417,6 +417,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) +#define pmd_large(pmd) pmd_sect(pmd) #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 #define pud_sect(pud) (0) @@ -499,6 +500,7 @@ static inline void pte_unmap(pte_t *pte) { } #define pud_none(pud) (!pud_val(pud)) #define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) #define pud_present(pud) pte_present(pud_pte(pud)) +#define pud_large(pud) pud_sect(pud) #define pud_valid(pud) pte_valid(pud_pte(pud)) static inline void set_pud(pud_t *pudp, pud_t pud) -- 2.20.1
[PATCH v2 0/3] fix vmalloc_to_page for huge vmap mappings
This is a change broken out from the huge vmap vmalloc series as requested. There is a little bit of dependency juggling across trees, but patches are pretty trivial. Ideally if Andrew accepts this patch and queues it up for next, then the arch patches would be merged through those trees then patch 3 gets sent by Andrew. I've tested this with other powerpc and vmalloc patches, with code that explicitly tests vmalloc_to_page on vmalloced memory and results look fine. v2: change the order of testing pxx_large and pxx_bad, to avoid issues with arm64 Thanks, Nick Nicholas Piggin (3): arm64: mm: Add p?d_large() definitions powerpc/64s: Add p?d_large definitions mm/vmalloc: fix vmalloc_to_page for huge vmap mappings arch/arm64/include/asm/pgtable.h | 2 ++ arch/powerpc/include/asm/book3s/64/pgtable.h | 24 - include/asm-generic/4level-fixup.h | 1 + include/asm-generic/5level-fixup.h | 1 + mm/vmalloc.c | 37 +--- 5 files changed, 43 insertions(+), 22 deletions(-) -- 2.20.1
Re: [PATCH v4 6/8] KVM: PPC: Ultravisor: Restrict LDBAR access
On 01/07/2019 16:17, maddy wrote: > > On 01/07/19 11:24 AM, Alexey Kardashevskiy wrote: >> >> On 29/06/2019 06:08, Claudio Carvalho wrote: >>> When the ultravisor firmware is available, it takes control over the >>> LDBAR register. In this case, thread-imc updates and save/restore >>> operations on the LDBAR register are handled by ultravisor. >> What does LDBAR do? "Power ISA™ Version 3.0 B" or "User’s Manual POWER9 >> Processor" do not tell. > LDBAR is a per-thread SPR used by thread-imc pmu to dump the counter > data into memory. > LDBAR contains memory address along with few other configuration bits > (it is populated > by the thread-imc pmu driver). It is populated and enabled only when any > of the thread > imc pmu events are monitored. I was actually looking for a spec for this register, what is the document name? > > Maddy >> >> >>> Signed-off-by: Claudio Carvalho >>> Reviewed-by: Ram Pai >>> Reviewed-by: Ryan Grimm >>> Acked-by: Madhavan Srinivasan >>> Acked-by: Paul Mackerras >>> --- >>> arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 ++ >>> arch/powerpc/platforms/powernv/idle.c | 6 -- >>> arch/powerpc/platforms/powernv/opal-imc.c | 4 >>> 3 files changed, 10 insertions(+), 2 deletions(-) >>> >>> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S >>> b/arch/powerpc/kvm/book3s_hv_rmhandlers.S >>> index f9b2620fbecd..cffb365d9d02 100644 >>> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S >>> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S >>> @@ -375,8 +375,10 @@ BEGIN_FTR_SECTION >>> mtspr SPRN_RPR, r0 >>> ld r0, KVM_SPLIT_PMMAR(r6) >>> mtspr SPRN_PMMAR, r0 >>> +BEGIN_FW_FTR_SECTION_NESTED(70) >>> ld r0, KVM_SPLIT_LDBAR(r6) >>> mtspr SPRN_LDBAR, r0 >>> +END_FW_FTR_SECTION_NESTED(FW_FEATURE_ULTRAVISOR, 0, 70) >>> isync >>> FTR_SECTION_ELSE >>> /* On P9 we use the split_info for coordinating LPCR changes */ >>> diff --git a/arch/powerpc/platforms/powernv/idle.c >>> b/arch/powerpc/platforms/powernv/idle.c >>> index 77f2e0a4ee37..5593a2d55959 100644 >>> --- 
a/arch/powerpc/platforms/powernv/idle.c >>> +++ b/arch/powerpc/platforms/powernv/idle.c >>> @@ -679,7 +679,8 @@ static unsigned long power9_idle_stop(unsigned >>> long psscr, bool mmu_on) >>> sprs.ptcr = mfspr(SPRN_PTCR); >>> sprs.rpr = mfspr(SPRN_RPR); >>> sprs.tscr = mfspr(SPRN_TSCR); >>> - sprs.ldbar = mfspr(SPRN_LDBAR); >>> + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) >>> + sprs.ldbar = mfspr(SPRN_LDBAR); >>> sprs_saved = true; >>> @@ -762,7 +763,8 @@ static unsigned long power9_idle_stop(unsigned >>> long psscr, bool mmu_on) >>> mtspr(SPRN_PTCR, sprs.ptcr); >>> mtspr(SPRN_RPR, sprs.rpr); >>> mtspr(SPRN_TSCR, sprs.tscr); >>> - mtspr(SPRN_LDBAR, sprs.ldbar); >>> + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) >>> + mtspr(SPRN_LDBAR, sprs.ldbar); >>> if (pls >= pnv_first_tb_loss_level) { >>> /* TB loss */ >>> diff --git a/arch/powerpc/platforms/powernv/opal-imc.c >>> b/arch/powerpc/platforms/powernv/opal-imc.c >>> index 1b6932890a73..5fe2d4526cbc 100644 >>> --- a/arch/powerpc/platforms/powernv/opal-imc.c >>> +++ b/arch/powerpc/platforms/powernv/opal-imc.c >>> @@ -254,6 +254,10 @@ static int opal_imc_counters_probe(struct >>> platform_device *pdev) >>> bool core_imc_reg = false, thread_imc_reg = false; >>> u32 type; >>> + /* Disable IMC devices, when Ultravisor is enabled. */ >>> + if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) >>> + return -EACCES; >>> + >>> /* >>> * Check whether this is kdump kernel. If yes, force the >>> engines to >>> * stop and return. >>> > -- Alexey
Re: [PATCH v4 6/8] KVM: PPC: Ultravisor: Restrict LDBAR access
On 01/07/19 11:24 AM, Alexey Kardashevskiy wrote: On 29/06/2019 06:08, Claudio Carvalho wrote: When the ultravisor firmware is available, it takes control over the LDBAR register. In this case, thread-imc updates and save/restore operations on the LDBAR register are handled by ultravisor. What does LDBAR do? "Power ISA™ Version 3.0 B" or "User’s Manual POWER9 Processor" do not tell. LDBAR is a per-thread SPR used by thread-imc pmu to dump the counter data into memory. LDBAR contains memory address along with few other configuration bits (it is populated by the thread-imc pmu driver). It is populated and enabled only when any of the thread imc pmu events are monitored. Maddy Signed-off-by: Claudio Carvalho Reviewed-by: Ram Pai Reviewed-by: Ryan Grimm Acked-by: Madhavan Srinivasan Acked-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 ++ arch/powerpc/platforms/powernv/idle.c | 6 -- arch/powerpc/platforms/powernv/opal-imc.c | 4 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index f9b2620fbecd..cffb365d9d02 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -375,8 +375,10 @@ BEGIN_FTR_SECTION mtspr SPRN_RPR, r0 ld r0, KVM_SPLIT_PMMAR(r6) mtspr SPRN_PMMAR, r0 +BEGIN_FW_FTR_SECTION_NESTED(70) ld r0, KVM_SPLIT_LDBAR(r6) mtspr SPRN_LDBAR, r0 +END_FW_FTR_SECTION_NESTED(FW_FEATURE_ULTRAVISOR, 0, 70) isync FTR_SECTION_ELSE /* On P9 we use the split_info for coordinating LPCR changes */ diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 77f2e0a4ee37..5593a2d55959 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -679,7 +679,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) sprs.ptcr = mfspr(SPRN_PTCR); sprs.rpr= mfspr(SPRN_RPR); sprs.tscr = mfspr(SPRN_TSCR); - sprs.ldbar = mfspr(SPRN_LDBAR); + 
if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + sprs.ldbar = mfspr(SPRN_LDBAR); sprs_saved = true; @@ -762,7 +763,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) mtspr(SPRN_PTCR,sprs.ptcr); mtspr(SPRN_RPR, sprs.rpr); mtspr(SPRN_TSCR,sprs.tscr); - mtspr(SPRN_LDBAR, sprs.ldbar); + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + mtspr(SPRN_LDBAR, sprs.ldbar); if (pls >= pnv_first_tb_loss_level) { /* TB loss */ diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 1b6932890a73..5fe2d4526cbc 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -254,6 +254,10 @@ static int opal_imc_counters_probe(struct platform_device *pdev) bool core_imc_reg = false, thread_imc_reg = false; u32 type; + /* Disable IMC devices, when Ultravisor is enabled. */ + if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + return -EACCES; + /* * Check whether this is kdump kernel. If yes, force the engines to * stop and return.