[PATCH v5 6/8] powerpc: Rework and improve STRICT_KERNEL_RWX patching
Rework code-patching with STRICT_KERNEL_RWX to prepare for the next patch which uses a temporary mm for patching under the Book3s64 Radix MMU. Make improvements by adding a WARN_ON when the patchsite doesn't match after patching and return the error from __patch_instruction() properly. Signed-off-by: Christopher M. Riedl --- v5: * New to series. --- arch/powerpc/lib/code-patching.c | 51 +--- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 3122d8e4cc013..9f2eba9b70ee4 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -102,11 +102,12 @@ static inline void unuse_temporary_mm(struct temp_mm *temp_mm) } static DEFINE_PER_CPU(struct vm_struct *, text_poke_area); +static DEFINE_PER_CPU(unsigned long, cpu_patching_addr); #if IS_BUILTIN(CONFIG_LKDTM) unsigned long read_cpu_patching_addr(unsigned int cpu) { - return (unsigned long)(per_cpu(text_poke_area, cpu))->addr; + return per_cpu(cpu_patching_addr, cpu); } #endif @@ -121,6 +122,7 @@ static int text_area_cpu_up(unsigned int cpu) return -1; } this_cpu_write(text_poke_area, area); + this_cpu_write(cpu_patching_addr, (unsigned long)area->addr); return 0; } @@ -146,7 +148,7 @@ void __init poking_init(void) /* * This can be called for kernel text or a module. 
*/ -static int map_patch_area(void *addr, unsigned long text_poke_addr) +static int map_patch_area(void *addr) { unsigned long pfn; int err; @@ -156,17 +158,20 @@ static int map_patch_area(void *addr, unsigned long text_poke_addr) else pfn = __pa_symbol(addr) >> PAGE_SHIFT; - err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL); + err = map_kernel_page(__this_cpu_read(cpu_patching_addr), + (pfn << PAGE_SHIFT), PAGE_KERNEL); - pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err); + pr_devel("Mapped addr %lx with pfn %lx:%d\n", +__this_cpu_read(cpu_patching_addr), pfn, err); if (err) return -1; return 0; } -static inline int unmap_patch_area(unsigned long addr) +static inline int unmap_patch_area(void) { + unsigned long addr = __this_cpu_read(cpu_patching_addr); pte_t *ptep; pmd_t *pmdp; pud_t *pudp; @@ -175,23 +180,23 @@ static inline int unmap_patch_area(unsigned long addr) pgdp = pgd_offset_k(addr); if (unlikely(!pgdp)) - return -EINVAL; + goto out_err; p4dp = p4d_offset(pgdp, addr); if (unlikely(!p4dp)) - return -EINVAL; + goto out_err; pudp = pud_offset(p4dp, addr); if (unlikely(!pudp)) - return -EINVAL; + goto out_err; pmdp = pmd_offset(pudp, addr); if (unlikely(!pmdp)) - return -EINVAL; + goto out_err; ptep = pte_offset_kernel(pmdp, addr); if (unlikely(!ptep)) - return -EINVAL; + goto out_err; pr_devel("clearing mm %p, pte %p, addr %lx\n", _mm, ptep, addr); @@ -202,15 +207,17 @@ static inline int unmap_patch_area(unsigned long addr) flush_tlb_kernel_range(addr, addr + PAGE_SIZE); return 0; + +out_err: + pr_warn("failed to unmap %lx\n", addr); + return -EINVAL; } static int do_patch_instruction(u32 *addr, struct ppc_inst instr) { - int err; + int err, rc = 0; u32 *patch_addr = NULL; unsigned long flags; - unsigned long text_poke_addr; - unsigned long kaddr = (unsigned long)addr; /* * During early early boot patch_instruction is called @@ -222,24 +229,20 @@ static int do_patch_instruction(u32 *addr, struct ppc_inst instr) 
local_irq_save(flags); - text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr; - if (map_patch_area(addr, text_poke_addr)) { - err = -1; + err = map_patch_area(addr); + if (err) goto out; - } - - patch_addr = (u32 *)(text_poke_addr + (kaddr & ~PAGE_MASK)); - __patch_instruction(addr, instr, patch_addr); + patch_addr = (u32 *)(__this_cpu_read(cpu_patching_addr) | offset_in_page(addr)); + rc = __patch_instruction(addr, instr, patch_addr); - err = unmap_patch_area(text_poke_addr); - if (err) - pr_warn("failed to unmap %lx\n", text_poke_addr); + err = unmap_patch_area(); out: local_irq_restore(flags); + WARN_ON(!ppc_inst_equal(ppc_inst_read(addr), instr)); - return err; + return rc ? rc : err; } #else /* !CONFIG_STRICT_KERNEL_RWX */ -- 2.26.1
[PATCH v5 5/8] powerpc/64s: Introduce temporary mm for Radix MMU
x86 supports the notion of a temporary mm which restricts access to temporary PTEs to a single CPU. A temporary mm is useful for situations where a CPU needs to perform sensitive operations (such as patching a STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing said mappings to other CPUs. Another benefit is that other CPU TLBs do not need to be flushed when the temporary mm is torn down. Mappings in the temporary mm can be set in the userspace portion of the address-space. Interrupts must be disabled while the temporary mm is in use. HW breakpoints, which may have been set by userspace as watchpoints on addresses now within the temporary mm, are saved and disabled when loading the temporary mm. The HW breakpoints are restored when unloading the temporary mm. All HW breakpoints are indiscriminately disabled while the temporary mm is in use. Based on x86 implementation: commit cefa929c034e ("x86/mm: Introduce temporary mm structs") Signed-off-by: Christopher M. Riedl --- v5: * Drop support for using a temporary mm on Book3s64 Hash MMU. v4: * Pass the prev mm instead of NULL to switch_mm_irqs_off() when using/unusing the temp mm as suggested by Jann Horn to keep the context.active counter in-sync on mm/nohash. * Disable SLB preload in the temporary mm when initializing the temp_mm struct. * Include asm/debug.h header to fix build issue with ppc44x_defconfig. 
--- arch/powerpc/include/asm/debug.h | 1 + arch/powerpc/kernel/process.c| 5 +++ arch/powerpc/lib/code-patching.c | 56 3 files changed, 62 insertions(+) diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h index 86a14736c76c3..dfd82635ea8b3 100644 --- a/arch/powerpc/include/asm/debug.h +++ b/arch/powerpc/include/asm/debug.h @@ -46,6 +46,7 @@ static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; } #endif void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk); +void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk); bool ppc_breakpoint_available(void); #ifdef CONFIG_PPC_ADV_DEBUG_REGS extern void do_send_trap(struct pt_regs *regs, unsigned long address, diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 185beb2905801..a0776200772e8 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -865,6 +865,11 @@ static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk) return 0; } +void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk) +{ + memcpy(brk, this_cpu_ptr(_brk[nr]), sizeof(*brk)); +} + void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk) { memcpy(this_cpu_ptr(_brk[nr]), brk, sizeof(*brk)); diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 54b6157d44e95..3122d8e4cc013 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include static int __patch_instruction(u32 *exec_addr, struct ppc_inst instr, u32 *patch_addr) { @@ -45,6 +48,59 @@ int raw_patch_instruction(u32 *addr, struct ppc_inst instr) } #ifdef CONFIG_STRICT_KERNEL_RWX + +struct temp_mm { + struct mm_struct *temp; + struct mm_struct *prev; + struct arch_hw_breakpoint brk[HBP_NUM_MAX]; +}; + +static inline void init_temp_mm(struct temp_mm *temp_mm, struct mm_struct *mm) +{ + /* We currently only support temporary mm on the Book3s64 
Radix MMU */ + WARN_ON(!radix_enabled()); + + temp_mm->temp = mm; + temp_mm->prev = NULL; + memset(_mm->brk, 0, sizeof(temp_mm->brk)); +} + +static inline void use_temporary_mm(struct temp_mm *temp_mm) +{ + lockdep_assert_irqs_disabled(); + + temp_mm->prev = current->active_mm; + switch_mm_irqs_off(temp_mm->prev, temp_mm->temp, current); + + WARN_ON(!mm_is_thread_local(temp_mm->temp)); + + if (ppc_breakpoint_available()) { + struct arch_hw_breakpoint null_brk = {0}; + int i = 0; + + for (; i < nr_wp_slots(); ++i) { + __get_breakpoint(i, _mm->brk[i]); + if (temp_mm->brk[i].type != 0) + __set_breakpoint(i, _brk); + } + } +} + +static inline void unuse_temporary_mm(struct temp_mm *temp_mm) +{ + lockdep_assert_irqs_disabled(); + + switch_mm_irqs_off(temp_mm->temp, temp_mm->prev, current); + + if (ppc_breakpoint_available()) { + int i = 0; + + for (; i < nr_wp_slots(); ++i) + if (temp_mm->brk[i].type != 0) + __set_breakpoint(i, _mm->brk[i]); + } +} + static DEFINE_PER_CPU(struct vm_struct *, text_poke_area); #if IS_BUILTIN(CONFIG_LKDTM) -- 2.26.1
[PATCH v5 8/8] lkdtm/powerpc: Fix code patching hijack test
Code patching on powerpc with STRICT_KERNEL_RWX uses a userspace address in a temporary mm on Radix now. Use __put_user() to avoid write failures due to KUAP when attempting a "hijack" on the patching address. __put_user() also works with the non-userspace, vmalloc-based patching address on non-Radix MMUs. Signed-off-by: Christopher M. Riedl --- drivers/misc/lkdtm/perms.c | 9 - 1 file changed, 9 deletions(-) diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c index 41e87e5f9cc86..da6a34a0a49fb 100644 --- a/drivers/misc/lkdtm/perms.c +++ b/drivers/misc/lkdtm/perms.c @@ -262,16 +262,7 @@ static inline u32 lkdtm_read_patch_site(void) /* Returns True if the write succeeds */ static inline bool lkdtm_try_write(u32 data, u32 *addr) { -#ifdef CONFIG_PPC - __put_kernel_nofault(addr, , u32, err); - return true; - -err: - return false; -#endif -#ifdef CONFIG_X86_64 return !__put_user(data, addr); -#endif } static int lkdtm_patching_cpu(void *data) -- 2.26.1
[PATCH v5 1/8] powerpc: Add LKDTM accessor for patching addr
When live patching with STRICT_KERNEL_RWX a mapping is installed at a "patching address" with temporary write permissions. Provide a LKDTM-only accessor function for this address in preparation for a LKDTM test which attempts to "hijack" this mapping by writing to it from another CPU. Signed-off-by: Christopher M. Riedl --- arch/powerpc/include/asm/code-patching.h | 4 arch/powerpc/lib/code-patching.c | 7 +++ 2 files changed, 11 insertions(+) diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index a95f63788c6b1..16fbc58a4932f 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -184,4 +184,8 @@ static inline unsigned long ppc_kallsyms_lookup_name(const char *name) #define PPC_INST_STD_LRPPC_RAW_STD(_R0, _R1, PPC_LR_STKOFF) #endif /* CONFIG_PPC64 */ +#if IS_BUILTIN(CONFIG_LKDTM) && IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) +unsigned long read_cpu_patching_addr(unsigned int cpu); +#endif + #endif /* _ASM_POWERPC_CODE_PATCHING_H */ diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index f9a3019e37b43..54b6157d44e95 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -47,6 +47,13 @@ int raw_patch_instruction(u32 *addr, struct ppc_inst instr) #ifdef CONFIG_STRICT_KERNEL_RWX static DEFINE_PER_CPU(struct vm_struct *, text_poke_area); +#if IS_BUILTIN(CONFIG_LKDTM) +unsigned long read_cpu_patching_addr(unsigned int cpu) +{ + return (unsigned long)(per_cpu(text_poke_area, cpu))->addr; +} +#endif + static int text_area_cpu_up(unsigned int cpu) { struct vm_struct *area; -- 2.26.1
[PATCH v5 7/8] powerpc/64s: Initialize and use a temporary mm for patching on Radix
When code patching a STRICT_KERNEL_RWX kernel the page containing the address to be patched is temporarily mapped as writeable. Currently, a per-cpu vmalloc patch area is used for this purpose. While the patch area is per-cpu, the temporary page mapping is inserted into the kernel page tables for the duration of patching. The mapping is exposed to CPUs other than the patching CPU - this is undesirable from a hardening perspective. Use a temporary mm instead which keeps the mapping local to the CPU doing the patching. Use the `poking_init` init hook to prepare a temporary mm and patching address. Initialize the temporary mm by copying the init mm. Choose a randomized patching address inside the temporary mm userspace address space. The patching address is randomized between PAGE_SIZE and DEFAULT_MAP_WINDOW-PAGE_SIZE. Bits of entropy with 64K page size on BOOK3S_64: bits of entropy = log2(DEFAULT_MAP_WINDOW_USER64 / PAGE_SIZE) PAGE_SIZE=64K, DEFAULT_MAP_WINDOW_USER64=128TB bits of entropy = log2(128TB / 64K) bits of entropy = 31 The upper limit is DEFAULT_MAP_WINDOW due to how the Book3s64 Hash MMU operates - by default the space above DEFAULT_MAP_WINDOW is not available. Currently the Hash MMU does not use a temporary mm so technically this upper limit isn't necessary; however, a larger randomization range does not further "harden" this overall approach and future work may introduce patching with a temporary mm on Hash as well. Randomization occurs only once during initialization at boot for each possible CPU in the system. Introduce two new functions, map_patch() and unmap_patch(), to respectively create and remove the temporary mapping with write permissions at patching_addr. Map the page with PAGE_KERNEL to set EAA[0] for the PTE which ignores the AMR (so no need to unlock/lock KUAP) according to PowerISA v3.0b Figure 35 on Radix. 
Based on x86 implementation: commit 4fc19708b165 ("x86/alternatives: Initialize temporary mm for patching") and: commit b3fd8e83ada0 ("x86/alternatives: Use temporary mm for text poking") Signed-off-by: Christopher M. Riedl --- v5: * Only support Book3s64 Radix MMU for now. * Use a per-cpu datastructure to hold the patching_addr and patching_mm to avoid the need for a synchronization lock/mutex. v4: * In the previous series this was two separate patches: one to init the temporary mm in poking_init() (unused in powerpc at the time) and the other to use it for patching (which removed all the per-cpu vmalloc code). Now that we use poking_init() in the existing per-cpu vmalloc approach, that separation doesn't work as nicely anymore so I just merged the two patches into one. * Preload the SLB entry and hash the page for the patching_addr when using Hash on book3s64 to avoid taking an SLB and Hash fault during patching. The previous implementation was a hack which changed current->mm to allow the SLB and Hash fault handlers to work with the temporary mm since both of those code-paths always assume mm == current->mm. * Also (hmm - seeing a trend here) with the book3s64 Hash MMU we have to manage the mm->context.active_cpus counter and mm cpumask since they determine (via mm_is_thread_local()) if the TLB flush in pte_clear() is local or not - it should always be local when we're using the temporary mm. On book3s64's Radix MMU we can just call local_flush_tlb_mm(). * Use HPTE_USE_KERNEL_KEY on Hash to avoid costly lock/unlock of KUAP. 
--- arch/powerpc/lib/code-patching.c | 132 +-- 1 file changed, 125 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 9f2eba9b70ee4..027dabd42b8dd 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -103,6 +104,7 @@ static inline void unuse_temporary_mm(struct temp_mm *temp_mm) static DEFINE_PER_CPU(struct vm_struct *, text_poke_area); static DEFINE_PER_CPU(unsigned long, cpu_patching_addr); +static DEFINE_PER_CPU(struct mm_struct *, cpu_patching_mm); #if IS_BUILTIN(CONFIG_LKDTM) unsigned long read_cpu_patching_addr(unsigned int cpu) @@ -133,6 +135,51 @@ static int text_area_cpu_down(unsigned int cpu) return 0; } +static __always_inline void __poking_init_temp_mm(void) +{ + int cpu; + spinlock_t *ptl; /* for protecting pte table */ + pte_t *ptep; + struct mm_struct *patching_mm; + unsigned long patching_addr; + + for_each_possible_cpu(cpu) { + /* +* Some parts of the kernel (static keys for example) depend on +* successful code patching. Code patching under +* STRICT_KERNEL_RWX requires this setup - otherwise we cannot +*
[PATCH v5 4/8] lkdtm/x86_64: Add test to hijack a patch mapping
A previous commit implemented an LKDTM test on powerpc to exploit the temporary mapping established when patching code with STRICT_KERNEL_RWX enabled. Extend the test to work on x86_64 as well. Signed-off-by: Christopher M. Riedl --- drivers/misc/lkdtm/perms.c | 26 ++ 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c index 39e7456852229..41e87e5f9cc86 100644 --- a/drivers/misc/lkdtm/perms.c +++ b/drivers/misc/lkdtm/perms.c @@ -224,7 +224,7 @@ void lkdtm_ACCESS_NULL(void) } #if (IS_BUILTIN(CONFIG_LKDTM) && defined(CONFIG_STRICT_KERNEL_RWX) && \ - defined(CONFIG_PPC)) + (defined(CONFIG_PPC) || defined(CONFIG_X86_64))) /* * This is just a dummy location to patch-over. */ @@ -233,12 +233,25 @@ static void patching_target(void) return; } -#include const u32 *patch_site = (const u32 *)_target; +#ifdef CONFIG_PPC +#include +#endif + +#ifdef CONFIG_X86_64 +#include +#endif + static inline int lkdtm_do_patch(u32 data) { +#ifdef CONFIG_PPC return patch_instruction((u32 *)patch_site, ppc_inst(data)); +#endif +#ifdef CONFIG_X86_64 + text_poke((void *)patch_site, , sizeof(u32)); + return 0; +#endif } static inline u32 lkdtm_read_patch_site(void) @@ -249,11 +262,16 @@ static inline u32 lkdtm_read_patch_site(void) /* Returns True if the write succeeds */ static inline bool lkdtm_try_write(u32 data, u32 *addr) { +#ifdef CONFIG_PPC __put_kernel_nofault(addr, , u32, err); return true; err: return false; +#endif +#ifdef CONFIG_X86_64 + return !__put_user(data, addr); +#endif } static int lkdtm_patching_cpu(void *data) @@ -346,8 +364,8 @@ void lkdtm_HIJACK_PATCH(void) void lkdtm_HIJACK_PATCH(void) { - if (!IS_ENABLED(CONFIG_PPC)) - pr_err("XFAIL: this test only runs on powerpc\n"); + if (!IS_ENABLED(CONFIG_PPC) && !IS_ENABLED(CONFIG_X86_64)) + pr_err("XFAIL: this test only runs on powerpc and x86_64\n"); if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) pr_err("XFAIL: this test requires CONFIG_STRICT_KERNEL_RWX\n"); if 
(!IS_BUILTIN(CONFIG_LKDTM)) -- 2.26.1
[PATCH v5 0/8] Use per-CPU temporary mappings for patching on Radix MMU
When compiled with CONFIG_STRICT_KERNEL_RWX, the kernel must create temporary mappings when patching itself. These mappings temporarily override the strict RWX text protections to permit a write. Currently, powerpc allocates a per-CPU VM area for patching. Patching occurs as follows: 1. Map page in per-CPU VM area w/ PAGE_KERNEL protection 2. Patch text 3. Remove the temporary mapping While the VM area is per-CPU, the mapping is actually inserted into the kernel page tables. Presumably, this could allow another CPU to access the normally write-protected text - either maliciously or accidentally - via this same mapping if the address of the VM area is known. Ideally, the mapping should be kept local to the CPU doing the patching [0]. x86 introduced "temporary mm" structs which allow the creation of mappings local to a particular CPU [1]. This series intends to bring the notion of a temporary mm to powerpc's Book3s64 Radix MMU and harden it by using such a mapping for patching a kernel with strict RWX permissions. The first four patches implement an LKDTM test "proof-of-concept" which exploits the potential vulnerability (ie. the temporary mapping during patching is exposed in the kernel page tables and accessible by other CPUs) using a simple brute-force approach. This test is implemented for both powerpc and x86_64. The test passes on powerpc Radix with this new series, fails on upstream powerpc, passes on upstream x86_64, and fails on an older (ancient) x86_64 tree without the x86_64 temporary mm patches. The remaining patches add support for and use a temporary mm for code patching on powerpc with the Radix MMU. 
Tested boot, ftrace, and repeated LKDTM "hijack": - QEMU+KVM (host: POWER9 Blackbird): Radix MMU w/ KUAP - QEMU+KVM (host: POWER9 Blackbird): Hash MMU Tested repeated LKDTM "hijack": - QEMU+KVM (host: AMD desktop): x86_64 upstream - QEMU+KVM (host: AMD desktop): x86_64 w/o percpu temp mm to verify the LKDTM "hijack" test fails Tested boot and ftrace: - QEMU+TCG: ppc44x (bamboo) - QEMU+TCG: g5 (mac99) I also tested with various extra config options enabled as suggested in section 12) in Documentation/process/submit-checklist.rst. v5: * Only support Book3s64 Radix MMU for now. There are some issues with the previous implementation on the Hash MMU as pointed out by Nick Piggin. Fixing these is not trivial so we only support the Radix MMU for now. I tried using realmode (no data translation) to patch with Hash to at least avoid exposing the patch mapping to other CPUs but this doesn't reliably work either since we cannot access vmalloc'd space in realmode. * Use percpu variables for the patching_mm and patching_addr. This avoids the need for synchronization mechanisms entirely. Thanks to Peter Zijlstra for pointing out text_mutex which unfortunately didn't work out without larger code changes in powerpc. Also thanks to Daniel Axtens for comments about using percpu variables for the *percpu* temp mm things off list. v4: * It's time to revisit this series again since @jpn and @mpe fixed our known STRICT_*_RWX bugs on powerpc/64s. * Rebase on linuxppc/next: commit ee1bc694fbaec ("powerpc/kvm: Fix build error when PPC_MEM_KEYS/PPC_PSERIES=n") * Completely rework how map_patch() works on book3s64 Hash MMU * Split the LKDTM x86_64 and powerpc bits into separate patches * Annotate commit messages with changes from v3 instead of listing them here completely out-of context... 
v3: * Rebase on linuxppc/next: commit 9123e3a74ec7 ("Linux 5.9-rc1") * Move temporary mm implementation into code-patching.c where it belongs * Implement LKDTM hijacker test on x86_64 (on IBM time oof) * Do not use address zero for the patching address in the temporary mm (thanks @dja for pointing this out!) * Wrap the LKDTM test w/ CONFIG_SMP as suggested by Christophe Leroy * Comments to clarify PTE pre-allocation and patching addr selection v2: * Rebase on linuxppc/next: commit 105fb38124a4 ("powerpc/8xx: Modify ptep_get()") * Always dirty pte when mapping patch * Use `ppc_inst_len` instead of `sizeof` on instructions * Declare LKDTM patching addr accessor in header where it belongs v1: * Rebase on linuxppc/next (4336b9337824) * Save and restore second hw watchpoint * Use new ppc_inst_* functions for patching check and in LKDTM test rfc-v2: * Many fixes and improvements mostly based on extensive feedback and testing by Christophe Leroy (thanks!). * Make patching_mm and patching_addr static and move '__ro_after_init' to after the variable name (more common in other parts
[PATCH v5 2/8] lkdtm/powerpc: Add test to hijack a patch mapping
When live patching with STRICT_KERNEL_RWX the CPU doing the patching must temporarily remap the page(s) containing the patch site with +W permissions. While this temporary mapping is in use, another CPU could write to the same mapping and maliciously alter kernel text. Implement a LKDTM test to attempt to exploit such an opening during code patching. The test is implemented on powerpc and requires LKDTM built into the kernel (building LKDTM as a module is insufficient). The LKDTM "hijack" test works as follows: 1. A CPU executes an infinite loop to patch an instruction. This is the "patching" CPU. 2. Another CPU attempts to write to the address of the temporary mapping used by the "patching" CPU. This other CPU is the "hijacker" CPU. The hijack either fails with a fault/error or succeeds, in which case some kernel text is now overwritten. The virtual address of the temporary patch mapping is provided via an LKDTM-specific accessor to the hijacker CPU. This test assumes a hypothetical situation where this address was leaked previously. How to run the test: mount -t debugfs none /sys/kernel/debug (echo HIJACK_PATCH > /sys/kernel/debug/provoke-crash/DIRECT) A passing test indicates that it is not possible to overwrite kernel text from another CPU by using the temporary mapping established by a CPU for patching. Signed-off-by: Christopher M. Riedl --- v5: * Use `u32*` instead of `struct ppc_inst*` based on new series in upstream. v4: * Separate the powerpc and x86_64 bits into individual patches. * Use __put_kernel_nofault() when attempting to hijack the mapping * Use raw_smp_processor_id() to avoid triggering the BUG() when calling smp_processor_id() in preemptible code - the only thing that matters is that one of the threads is bound to a different CPU - we are not using smp_processor_id() to access any per-cpu data or similar where preemption should be disabled. 
* Rework the patching_cpu() kthread stop condition to avoid: https://lwn.net/Articles/628628/ --- drivers/misc/lkdtm/core.c | 1 + drivers/misc/lkdtm/lkdtm.h | 1 + drivers/misc/lkdtm/perms.c | 134 + 3 files changed, 136 insertions(+) diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c index 8024b6a5cc7fc..fbcb95eda337b 100644 --- a/drivers/misc/lkdtm/core.c +++ b/drivers/misc/lkdtm/core.c @@ -147,6 +147,7 @@ static const struct crashtype crashtypes[] = { CRASHTYPE(WRITE_RO), CRASHTYPE(WRITE_RO_AFTER_INIT), CRASHTYPE(WRITE_KERN), + CRASHTYPE(HIJACK_PATCH), CRASHTYPE(REFCOUNT_INC_OVERFLOW), CRASHTYPE(REFCOUNT_ADD_OVERFLOW), CRASHTYPE(REFCOUNT_INC_NOT_ZERO_OVERFLOW), diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h index 99f90d3e5e9cb..87e7e6136d962 100644 --- a/drivers/misc/lkdtm/lkdtm.h +++ b/drivers/misc/lkdtm/lkdtm.h @@ -62,6 +62,7 @@ void lkdtm_EXEC_USERSPACE(void); void lkdtm_EXEC_NULL(void); void lkdtm_ACCESS_USERSPACE(void); void lkdtm_ACCESS_NULL(void); +void lkdtm_HIJACK_PATCH(void); /* refcount.c */ void lkdtm_REFCOUNT_INC_OVERFLOW(void); diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c index 2dede2ef658f3..39e7456852229 100644 --- a/drivers/misc/lkdtm/perms.c +++ b/drivers/misc/lkdtm/perms.c @@ -9,6 +9,7 @@ #include #include #include +#include #include /* Whether or not to fill the target memory area with do_nothing(). */ @@ -222,6 +223,139 @@ void lkdtm_ACCESS_NULL(void) pr_err("FAIL: survived bad write\n"); } +#if (IS_BUILTIN(CONFIG_LKDTM) && defined(CONFIG_STRICT_KERNEL_RWX) && \ + defined(CONFIG_PPC)) +/* + * This is just a dummy location to patch-over. 
+ */ +static void patching_target(void) +{ + return; +} + +#include +const u32 *patch_site = (const u32 *)_target; + +static inline int lkdtm_do_patch(u32 data) +{ + return patch_instruction((u32 *)patch_site, ppc_inst(data)); +} + +static inline u32 lkdtm_read_patch_site(void) +{ + return READ_ONCE(*patch_site); +} + +/* Returns True if the write succeeds */ +static inline bool lkdtm_try_write(u32 data, u32 *addr) +{ + __put_kernel_nofault(addr, , u32, err); + return true; + +err: + return false; +} + +static int lkdtm_patching_cpu(void *data) +{ + int err = 0; + u32 val = 0xdeadbeef; + + pr_info("starting patching_cpu=%d\n", raw_smp_processor_id()); + + do { + err = lkdtm_do_patch(val); + } while (lkdtm_read_patch_site() == val && !err && !kthread_should_stop()); + + if (err) + pr_warn("XFAIL: patch_instruction returned error: %d\n", err); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + + return err; +} + +void lkdtm_HIJACK_PATCH(void) +{ + struct task_struct *patching_kthrd; + int
[PATCH v5 3/8] x86_64: Add LKDTM accessor for patching addr
When live patching with STRICT_KERNEL_RWX a mapping is installed at a "patching address" with temporary write permissions. Provide a LKDTM-only accessor function for this address in preparation for a LKDTM test which attempts to "hijack" this mapping by writing to it from another CPU. Signed-off-by: Christopher M. Riedl --- arch/x86/include/asm/text-patching.h | 4 arch/x86/kernel/alternative.c| 7 +++ 2 files changed, 11 insertions(+) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index b7421780e4e92..f0caf9ee13bd8 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -167,4 +167,8 @@ void int3_emulate_ret(struct pt_regs *regs) } #endif /* !CONFIG_UML_X86 */ +#if IS_BUILTIN(CONFIG_LKDTM) +unsigned long read_cpu_patching_addr(unsigned int cpu); +#endif + #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e9da3dc712541..28bb92b695639 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -773,6 +773,13 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state) __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; +#if IS_BUILTIN(CONFIG_LKDTM) +unsigned long read_cpu_patching_addr(unsigned int cpu) +{ + return poking_addr; +} +#endif + static void *__text_poke(void *addr, const void *opcode, size_t len) { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; -- 2.26.1
[RFC 2/2] kernel/idle: Update and use idle-hint in VPA region
In the guest, set idle_hint to 1 when the prev_cpu of a vCPU goes into idle state, similarly set idle_hint to 0 when exiting an idle state. Since the idle_hint is in VPA region, the available_idle_cpu() in guest can read this region every time to find if a vCPU can be scheduled instantly by the hypervisor or not. Signed-off-by: Parth Shah --- arch/powerpc/include/asm/paravirt.h | 12 ++-- kernel/sched/idle.c | 3 +++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h index bcb7b5f917be..b20e4d25bd00 100644 --- a/arch/powerpc/include/asm/paravirt.h +++ b/arch/powerpc/include/asm/paravirt.h @@ -21,6 +21,12 @@ static inline bool is_shared_processor(void) return static_branch_unlikely(_processor); } +static inline int idle_hint_of(int cpu) +{ + __be32 idle_hint = READ_ONCE(lppaca_of(cpu).idle_hint); + return be32_to_cpu(idle_hint); +} + /* If bit 0 is set, the cpu has been preempted */ static inline u32 yield_count_of(int cpu) { @@ -109,8 +115,10 @@ static inline bool vcpu_is_preempted(int cpu) } #endif - if (yield_count_of(cpu) & 1) - return true; + if (yield_count_of(cpu) & 1) { + if (idle_hint_of(cpu) == 0) + return true; + } return false; } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 7ca3d3d86c2a..fdc8d1d474f0 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ *tasks which are handled in sched/fair.c ) */ #include "sched.h" +#include #include @@ -290,6 +291,7 @@ static void do_idle(void) arch_cpu_idle_dead(); } + set_idle_hint(cpu, 1); arch_cpu_idle_enter(); rcu_nocb_flush_deferred_wakeup(); @@ -306,6 +308,7 @@ static void do_idle(void) cpuidle_idle_call(); } arch_cpu_idle_exit(); + set_idle_hint(cpu, 0); } /* -- 2.26.3
[RFC 1/2] powerpc/book3s_hv: Add new idle-hint attribute in VPA region
In lppaca region, add a new attribute idle_hint which can allow guest scheduler for better cpu selection. Hypervisor can update idle_hint attribute based on the prediction that if vCPU needs to be scheduled then can it be scheduled instantly or not. Signed-off-by: Parth Shah --- arch/powerpc/include/asm/idle_hint.h | 28 +++ arch/powerpc/include/asm/lppaca.h| 3 ++- arch/powerpc/kvm/book3s.h| 2 ++ arch/powerpc/kvm/book3s_hv.c | 34 4 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/include/asm/idle_hint.h diff --git a/arch/powerpc/include/asm/idle_hint.h b/arch/powerpc/include/asm/idle_hint.h new file mode 100644 index ..165d65c0275b --- /dev/null +++ b/arch/powerpc/include/asm/idle_hint.h @@ -0,0 +1,28 @@ +#ifndef _ASM_POWERPC_IDLEHINT_H +#define _ASM_POWERPC_IDLEHINT_H + +#include + +extern void kvmppc_idle_hint_set(struct kvm_vcpu *vcpu, int idle_hint); + +extern int idle_hint_is_active; + +extern void set_idle_hint(int cpu, int value); + +static inline int prev_cpu_of_kvm(struct kvm_vcpu *vcpu) +{ + struct pid *pid; + struct task_struct *task = NULL; + + rcu_read_lock(); + pid = rcu_dereference(vcpu->pid); + if (pid) + task = get_pid_task(pid, PIDTYPE_PID); + rcu_read_unlock(); + + if (!task) + return -1; + + return task_cpu(task); +} +#endif diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h index c390ec377bae..ee790a566036 100644 --- a/arch/powerpc/include/asm/lppaca.h +++ b/arch/powerpc/include/asm/lppaca.h @@ -111,7 +111,8 @@ struct lppaca { __be32 page_ins; /* CMO Hint - # page ins by OS */ u8 reserved11[148]; volatile __be64 dtl_idx;/* Dispatch Trace Log head index */ - u8 reserved12[96]; + volatile __be32 idle_hint; /* Can vCPU be scheduled instantly? 
*/ + u8 reserved12[92]; } cacheline_aligned; #define lppaca_of(cpu) (*paca_ptrs[cpu]->lppaca_ptr) diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 740e51def5a5..61b0741c139a 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -7,6 +7,8 @@ #ifndef __POWERPC_KVM_BOOK3S_H__ #define __POWERPC_KVM_BOOK3S_H__ +#include + extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, struct kvm_memory_slot *memslot); extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bc0813644666..c008be20294d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -447,6 +447,7 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) { vpa->__old_status |= LPPACA_OLD_SHARED_PROC; vpa->yield_count = cpu_to_be32(1); + vpa->idle_hint = cpu_to_be32(0); } static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, @@ -911,6 +912,17 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target) return kvm_vcpu_yield_to(target); } +void kvmppc_idle_hint_set(struct kvm_vcpu *vcpu, int idle_hint) +{ + struct lppaca *lppaca; + + if (!vcpu) return; + + lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr; + if (lppaca) + lppaca->idle_hint = cpu_to_be32(idle_hint); +} + static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu) { int yield_count = 0; @@ -2803,6 +2815,28 @@ static int on_primary_thread(void) return 1; } +void set_idle_hint_for_kvm(struct kvm *kvm, int cpu, int value) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (cpu == prev_cpu_of_kvm(vcpu)) { + kvmppc_idle_hint_set(vcpu, value); + } + } +} + +void set_idle_hint(int cpu, int value) +{ + struct kvm *kvm; + struct kvm *tmp; + + list_for_each_entry_safe(kvm, tmp, _list, vm_list) { + set_idle_hint_for_kvm(kvm, cpu, value); + } +} + /* * A list of virtual cores for each physical CPU. 
* These are vcores that could run but their runner VCPU tasks are -- 2.26.3
[RFC 0/2] Paravirtualize idle CPU wakeup optimization
This patch-set is a revision over HCALL based implementation which can be found at: https://lore.kernel.org/linuxppc-dev/20210401115922.1524705-1-pa...@linux.ibm.com/ But since the overhead of HCALL is huge, this patch-set uses lppaca region to update idle-hint, where hypervisor keeps changing the newly added idle_hint attribute in the VPA region of each vCPUs of all KVMs, and guest have to just look at this attribute. This implementation is not aimed for full fledged solution, but is rather a demonstration of para-virtualizing task scheduler. Hypervisor can provided better idle-hints about vCPU scheduling and guest can use it for better scheduling decisions. Abstract: = The Linux task scheduler searches for an idle cpu for task wakeup in-order to lower the wakeup latency as much as possible. The process of determining if a cpu is idle or not has evolved over time. Currently, in available_idle_cpu(), a cpu is considered idle if - there are no task running or enqueued to the runqueue of the cpu and - the cpu is not-preempted, i.e. while running inside a guest, a cpu is not yielded (determined via vcpu_is_preempted()) While inside the guest, there is no way to deterministically predict if a vCPU that has been yielded/ceded to the hypervisor can be gotten back. Hence currently the scheduler considers such CEDEd vCPU as not "available" idle and would instead pick other busy CPUs for waking up the wakee task. In this patch-set we try to further classify idle cpus as instantly available or not. This is achieved by taking hint from the hypervisor by quering if the vCPU will be scheduled instantly or not. In most cases, scheduler prefers prev_cpu of a waking task unless it is busy. In this patchset, the hypervisor uses this information to figure out if the prev_cpu used by the task (of the corresponding vCPU) is idle or not, and passes this information to the guest. 
Interface: === This patchset introduces a new attribute in lppaca structure which is shared by both the hypervisor and the guest. The new attribute, i.e. idle_hint is updated regularly by the hypervisor. When a particular cpu goes into idle-state, it updates the idle_hint of all the vCPUs of all existing KVMs whose prev_cpu == smp_processor_id(). It similarly revert backs the update when coming out of the idle-state. Internal working: The code-flow of the current implementation is as follow: - In do_idle(), when entering an idle-state, walk through all vCPUs of all KVM guests and find whose prev_cpu of vCPU task was same as the caller's cpu, and mark the idle_hint=1 in the lppaca region of such vCPUs. - Similarly, mark idle_hint=0 for such vCPUs when exiting idle state. - Guest OS scheduler searches for idle cpu using `avaialable_idle_cpu()` which also looks if a vcpu_is_preempted() to see if vCPU is yielded or not. - If vCPU is yielded, then the GuestOS will additionally see if idle_hint is marked as 1 or not. If idle_hint==1 then consider the vCPU as non-preempted and use it for scheduling a task. The patch-set is based on v5.13 kernel. Results: - Baseline kernel = v5.13 - Patched kernel = v5.13 + this patch-set Setup: All the results are taken on IBM POWER9 baremetal system running patched kernel. This system consists of 2 NUMA nodes, 22 cores per socket with SMT-4 mode. Each KVM guest have identical cpu topology with total 40 CPUs, which are 10 cores with SMT-4 support. Scenarios: -- 1. Under-commit case: Only one KVM is active at a time. 
- Baseline kernel: $> schbench -m 20 -t 2 -r 30 Latency percentiles (usec) 50.th: 86 75.th: 406 90.th: 497 95.th: 541 *99.th: 2572 <- 99.5000th: 3724 99.9000th: 6904 min=0, max=10007 - With Patched kernel: $> schbench -m 20 -t 2 -r 30 Latency percentiles (usec) 50.th: 386 75.th: 470 90.th: 529 95.th: 589 *99.th: 741 (-71%) <- 99.5000th: 841 99.9000th: 1522 min=0, max=6488 We see a significant reduction in the tail latencies due to being able to schedule on an yielded/ceded idle CPU with the patchset instead of waking up the task on a busy CPU. 2. Over-commit case: Multiple KVM guests sharing same set of CPUs. Two KVMs running baseline kernel is used for creating noise using `schbench -m 10 -t 2 -r 3000` while only the other KVM is benchmarked. - Baseline kernel: $> schbench -m 20 -t 2 -r 30 Latency percentiles (usec) 50.th: 289 75.th: 1074 90.th: 7288 95.th: 11248 *99.th: 17760 99.5000th: 21088 99.9000th: 28896 min=0, max=36640 - With Patched kernel: $> schbench -m 20 -t 2 -r 30 Latency percentiles (usec) 50.th: 281 75.th: 445 90.th: 4344 95.th: 9168 *99.th: 15824 99.5000th: 19296 99.9000th: 26656
Re: [PATCH v4 07/11] powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new helper
Hello Alexey, On Fri, 2021-06-18 at 19:26 -0300, Leonardo Brás wrote: > > > > > + unsigned long liobn, > > > unsigned long win_addr, > > > + unsigned long > > > window_size, > > > unsigned long page_shift, > > > + unsigned long base, > > > struct > > > iommu_table_ops *table_ops) > > > > > > iommu_table_setparms() rather than passing 0 around. > > > > The same comment about "liobn" - set it in > > iommu_table_setparms_lpar(). > > The reviewer will see what field matters in what situation imho. > > > > The idea here was to keep all tbl parameters setting in > _iommu_table_setparms (or iommu_table_setparms_common). > > I understand the idea that each one of those is optional in the other > case, but should we keep whatever value is present in the other > variable (not zeroing the other variable), or do something like: > > tbl->it_index = 0; > tbl->it_base = basep; > (in iommu_table_setparms) > > tbl->it_index = liobn; > tbl->it_base = 0; > (in iommu_table_setparms_lpar) > This one is supposed to be a question, but I missed the question mark. Sorry about that. I would like to get your opinion on this :)
Re: [PATCH v4 10/11] powerpc/pseries/iommu: Make use of DDW for indirect mapping
On Tue, 2021-05-11 at 17:57 +1000, Alexey Kardashevskiy wrote: > > > On 01/05/2021 02:31, Leonardo Bras wrote: > > [...] > > pmem_present = dn != NULL; > > @@ -1218,8 +1224,12 @@ static bool enable_ddw(struct pci_dev *dev, > > struct device_node *pdn) > > > > mutex_lock(_window_init_mutex); > > > > - if (find_existing_ddw(pdn, >dev.archdata.dma_offset, > > )) > > - goto out_unlock; > > + if (find_existing_ddw(pdn, >dev.archdata.dma_offset, > > )) { > > + direct_mapping = (len >= max_ram_len); > > + > > + mutex_unlock(_window_init_mutex); > > + return direct_mapping; > > Does not this break the existing case when direct_mapping==true by > skipping setting dev->dev.bus_dma_limit before returning? > Yes, it does. Good catch! I changed it to use a flag instead of win64 for return, and now I can use the same success exit path for both the new config and the config found in list. (out_unlock) > > > > + } > > > > /* > > * If we already went through this for a previous function of > > @@ -1298,7 +1308,6 @@ static bool enable_ddw(struct pci_dev *dev, > > struct device_node *pdn) > > goto out_failed; > > } > > /* verify the window * number of ptes will map the partition > > */ > > - /* check largest block * page size > max memory hotplug addr > > */ > > /* > > * The "ibm,pmemory" can appear anywhere in the address > > space. 
> > * Assuming it is still backed by page structs, try > > MAX_PHYSMEM_BITS > > @@ -1320,6 +1329,17 @@ static bool enable_ddw(struct pci_dev *dev, > > struct device_node *pdn) > > 1ULL << len, > > query.largest_available_block, > > 1ULL << page_shift); > > + > > + len = order_base_2(query.largest_available_block << > > page_shift); > > + win_name = DMA64_PROPNAME; > > [1] > > > > + } else { > > + direct_mapping = true; > > + win_name = DIRECT64_PROPNAME; > > + } > > + > > + /* DDW + IOMMU on single window may fail if there is any > > allocation */ > > + if (default_win_removed && !direct_mapping && > > iommu_table_in_use(tbl)) { > > + dev_dbg(>dev, "current IOMMU table in use, can't > > be replaced.\n"); > > > ... remove !direct_mapping and move to [1]? sure, done! > > > > goto out_failed; > > } > > > > @@ -1331,8 +1351,7 @@ static bool enable_ddw(struct pci_dev *dev, > > struct device_node *pdn) > > create.liobn, dn); > > > > win_addr = ((u64)create.addr_hi << 32) | create.addr_lo; > > - win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, > > win_addr, > > - page_shift, len); > > + win64 = ddw_property_create(win_name, create.liobn, win_addr, > > page_shift, len); > > if (!win64) { > > dev_info(>dev, > > "couldn't allocate property, property name, > > or value\n"); > > @@ -1350,12 +1369,47 @@ static bool enable_ddw(struct pci_dev *dev, > > struct device_node *pdn) > > if (!window) > > goto out_del_prop; > > > > - ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> > > PAGE_SHIFT, > > - win64->value, > > tce_setrange_multi_pSeriesLP_walk); > > - if (ret) { > > - dev_info(>dev, "failed to map direct window for > > %pOF: %d\n", > > - dn, ret); > > - goto out_del_list; > > + if (direct_mapping) { > > + /* DDW maps the whole partition, so enable direct DMA > > mapping */ > > + ret = walk_system_ram_range(0, memblock_end_of_DRAM() > > >> PAGE_SHIFT, > > + win64->value, > > tce_setrange_multi_pSeriesLP_walk); > > + if (ret) { > > + dev_info(>dev, "failed to 
map direct > > window for %pOF: %d\n", > > + dn, ret); > > + goto out_del_list; > > + } > > + } else { > > + struct iommu_table *newtbl; > > + int i; > > + > > + /* New table for using DDW instead of the default DMA > > window */ > > + newtbl = iommu_pseries_alloc_table(pci->phb->node); > > + if (!newtbl) { > > + dev_dbg(>dev, "couldn't create new IOMMU > > table\n"); > > + goto out_del_list; > > + } > > + > > + for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); > > i++) { > > + const unsigned long mask = IORESOURCE_MEM_64 > > | IORESOURCE_MEM; > > + > > + /* Look for MMIO32 */ > > + if ((pci->phb->mem_resources[i].flags &
Re: [PATCH v3] lockdown,selinux: fix wrong subject in some SELinux lockdown checks
On Sat, Jun 19, 2021 at 1:00 PM Thomas Gleixner wrote: > On Wed, Jun 16 2021 at 10:51, Ondrej Mosnacek wrote: > > diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c > > index bda73cb7a044..c43a13241ae8 100644 > > --- a/arch/x86/mm/testmmiotrace.c > > +++ b/arch/x86/mm/testmmiotrace.c > > @@ -116,7 +116,7 @@ static void do_test_bulk_ioremapping(void) > > static int __init init(void) > > { > > unsigned long size = (read_far) ? (8 << 20) : (16 << 10); > > - int ret = security_locked_down(LOCKDOWN_MMIOTRACE); > > + int ret = security_locked_down(current_cred(), LOCKDOWN_MMIOTRACE); > > I have no real objection to those patches, but it strikes me odd that > out of the 62 changed places 58 have 'current_cred()' and 4 have NULL as > argument. > > I can't see why this would ever end up with anything else than > current_cred() or NULL and NULL being the 'special' case. So why not > having security_locked_down_no_cred() and make current_cred() implicit > for security_locked_down() which avoids most of the churn and just makes > the special cases special. I might be missing something though. Unfortunately it is not uncommon for kernel subsystems to add, move, or otherwise play around with LSM hooks without checking with the LSM folks; generally this is okay, but there have been a few problems in the past and I try to keep that in mind when we are introducing new hooks or modifying existing ones. If we have two LSM hooks for roughly the same control point it has the potential to cause confusion, e.g. do I use the "normal" or the "no_cred" version? What if I don't want to pass a credential, can I just use "no_cred"? My thinking with the single, always-pass-a-cred function is that callers don't have to worry about choosing from multiple, similar hooks and they know they need to pass a cred which hopefully gets them thinking about what cred is appropriate. It's not foolproof, but I believe the single hook approach will be less prone to accidents ... 
or so I hope :) -- paul moore www.paul-moore.com
[PATCH 1/2] PCI: Use pcie_reset_state_t type in function arguments
The pcie_reset_state_t type has been introduced in the commit f7bdd12d234d ("pci: New PCI-E reset API") along with the enum pcie_reset_state, but it has never been used for anything else other than to define the members of the enumeration set in the enum pcie_reset_state. Thus, replace the direct use of enum pcie_reset_state in function arguments and replace it with pcie_reset_state_t type so that the argument type matches the type used in enum pcie_reset_state. Signed-off-by: Krzysztof Wilczyński --- drivers/pci/pci.c | 4 ++-- include/linux/pci.h | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index aacf575c15cf..5c3386a73eb1 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2194,7 +2194,7 @@ EXPORT_SYMBOL(pci_disable_device); * implementation. Architecture implementations can override this. */ int __weak pcibios_set_pcie_reset_state(struct pci_dev *dev, - enum pcie_reset_state state) + pcie_reset_state_t state) { return -EINVAL; } @@ -2206,7 +2206,7 @@ int __weak pcibios_set_pcie_reset_state(struct pci_dev *dev, * * Sets the PCI reset state for the device. 
*/ -int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) +int pci_set_pcie_reset_state(struct pci_dev *dev, pcie_reset_state_t state) { return pcibios_set_pcie_reset_state(dev, state); } diff --git a/include/linux/pci.h b/include/linux/pci.h index 540b377ca8f6..15f93de69e6a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -191,7 +191,6 @@ enum { }; typedef unsigned int __bitwise pcie_reset_state_t; - enum pcie_reset_state { /* Reset is NOT asserted (Use to deassert reset) */ pcie_deassert_reset = (__force pcie_reset_state_t) 1, @@ -1205,7 +1204,7 @@ extern unsigned int pcibios_max_latency; void pci_set_master(struct pci_dev *dev); void pci_clear_master(struct pci_dev *dev); -int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state); +int pci_set_pcie_reset_state(struct pci_dev *dev, pcie_reset_state_t state); int pci_set_cacheline_size(struct pci_dev *dev); int __must_check pci_set_mwi(struct pci_dev *dev); int __must_check pcim_set_mwi(struct pci_dev *dev); @@ -2079,7 +2078,7 @@ extern u8 pci_cache_line_size; void pcibios_disable_device(struct pci_dev *dev); void pcibios_set_master(struct pci_dev *dev); int pcibios_set_pcie_reset_state(struct pci_dev *dev, -enum pcie_reset_state state); +pcie_reset_state_t state); int pcibios_add_device(struct pci_dev *dev); void pcibios_release_device(struct pci_dev *dev); #ifdef CONFIG_PCI -- 2.32.0
[PATCH 2/2] powerpc/eeh: Use pcie_reset_state_t type in function arguments
The pcie_reset_state_t type has been introduced in the commit f7bdd12d234d ("pci: New PCI-E reset API") along with the enum pcie_reset_state, but it has never been used for anything else other than to define the members of the enumeration set in the enum pcie_reset_state. Thus, replace the direct use of enum pcie_reset_state in function arguments and replace it with pcie_reset_state_t type so that the argument type matches the type used in enum pcie_reset_state. Signed-off-by: Krzysztof Wilczyński --- arch/powerpc/kernel/eeh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 3bbdcc86d01b..15485abb89ff 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -714,7 +714,7 @@ static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) * Return value: * 0 if success */ -int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) +int pcibios_set_pcie_reset_state(struct pci_dev *dev, pcie_reset_state_t state) { struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); struct eeh_pe *pe = eeh_dev_to_pe(edev); -- 2.32.0
[PATCH] replace if with min
Replace if with min in order to make the code cleaner. Signed-off-by: Salah Triki --- drivers/crypto/nx/nx-842.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c index 2ab90ec10e61..0d1d5a463899 100644 --- a/drivers/crypto/nx/nx-842.c +++ b/drivers/crypto/nx/nx-842.c @@ -134,8 +134,7 @@ EXPORT_SYMBOL_GPL(nx842_crypto_exit); static void check_constraints(struct nx842_constraints *c) { /* limit maximum, to always have enough bounce buffer to decompress */ - if (c->maximum > BOUNCE_BUFFER_SIZE) - c->maximum = BOUNCE_BUFFER_SIZE; + c->maximum = min(c->maximum, BOUNCE_BUFFER_SIZE); } static int nx842_crypto_add_header(struct nx842_crypto_header *hdr, u8 *buf) -- 2.25.1
[RFC PATCH] KVM: PPC: BookE: Load FP and Altivec state before soft enabling IRQs
The kvmppc_fix_ee_before_entry function sets the IRQ soft mask to IRQS_ENABLED. This function is called right before loading the guest FP and Altivec states at kvmppc_handle_exit. This triggers a WARN_ON(preemptible()) at enable_kernel_fp/altivec when running with CONFIG_PREEMPT_COUNT=y: WARNING: CPU: 1 PID: 6585 at .enable_kernel_fp+0x30/0x78 Modules linked in: r8153_ecm cdc_ether usbnet r8152 uio_pdrv_genirq uio CPU: 1 PID: 6585 Comm: qemu-system-ppc Tainted: GW 5.12.10_e6500 #1 NIP: c0003ec0 LR: c004fb00 CTR: 0004 REGS: c000b38ab440 TRAP: 0700 Tainted: GW (5.12.10_e6500) MSR: 82023002 CR: 24000208 XER: IRQMASK: 0 GPR00: c004fb00 c000b38ab6e0 c1a4e300 c000b3878000 GPR04: 0010 8003 GPR08: fe662000 0001 0001 GPR12: 24000208 c00038c0 c000b3878000 c183eb60 GPR16: c20a8d80 0001 GPR20: c1ab1a70 c20a8d80 c20a8d80 GPR24: c183ed48 c17c8ec0 c183eec0 c00b80e0 GPR28: 000b80e0 c000b3878000 NIP [c0003ec0] .enable_kernel_fp+0x30/0x78 LR [c004fb00] .kvmppc_load_guest_fp+0x2c/0x80 Call Trace: [c000b38ab6e0] [c183ed48] rcu_state+0x248/0x400 (unreliable) [c000b38ab750] [c004fb00] .kvmppc_load_guest_fp+0x2c/0x80 [c000b38ab7d0] [c0050f48] .kvmppc_handle_exit+0x5cc/0x5d8 [c000b38ab870] [c0053e64] .kvmppc_resume_host+0xcc/0x120 Instruction dump: 7c0802a6 f8010010 f821ff91 e92d0658 8149 3920 2c0a 40c20014 892d067a 552907fe 7d290034 5529d97e <0b09> 38602000 4bfffe79 e86d0658 I'm assuming this was an oversight while introducing the call to kvmppc_load_guest_fp and kvmppc_load_guest_altivec functions from kvmppc_handle_exit. So this patch moves kvmppc_fix_ee_before_entry to be again the last thing before exiting kvmppc_handle_exit. Compile tested only since I don't have a BookE machine. 
Fixes: 3efc7da61f6c ("KVM: PPC: Book3E: Increase FPU laziness") Fixes: 95d80a294b1e ("KVM: PPC: Book3e: Add AltiVec support") Reported-by: Signed-off-by: Fabiano Rosas --- arch/powerpc/kvm/booke.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 551b30d84aee..38179c45eb90 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1387,9 +1387,9 @@ int kvmppc_handle_exit(struct kvm_vcpu *vcpu, unsigned int exit_nr) r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); else { /* interrupts now hard-disabled */ - kvmppc_fix_ee_before_entry(); kvmppc_load_guest_fp(vcpu); kvmppc_load_guest_altivec(vcpu); + kvmppc_fix_ee_before_entry(); } } -- 2.29.2
Re: [PATCH v3 1/1] powerpc/pseries: Interface to represent PAPR firmware attributes
"Pratik R. Sampat" writes: Hi, have you seen Documentation/core-api/kobject.rst, particularly the part that says: "When you see a sysfs directory full of other directories, generally each of those directories corresponds to a kobject in the same kset." Taking a look at samples/kobject/kset-example.c, it seems to provide an overall structure that is closer to what other modules do when creating sysfs entries. It uses less dynamic allocations and deals a bit better with cleaning up the state afterwards. > Adds a generic interface to represent the energy and frequency related > PAPR attributes on the system using the new H_CALL > "H_GET_ENERGY_SCALE_INFO". > > H_GET_EM_PARMS H_CALL was previously responsible for exporting this > information in the lparcfg, however the H_GET_EM_PARMS H_CALL > will be deprecated P10 onwards. > > The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format: > hcall( > uint64 H_GET_ENERGY_SCALE_INFO, // Get energy scale info > uint64 flags, // Per the flag request > uint64 firstAttributeId,// The attribute id > uint64 bufferAddress, // Guest physical address of the output buffer > uint64 bufferSize // The size in bytes of the output buffer > ); > > This H_CALL can query either all the attributes at once with > firstAttributeId = 0, flags = 0 as well as query only one attribute > at a time with firstAttributeId = id, flags = 1. > > The output buffer consists of the following > 1. number of attributes - 8 bytes > 2. array offset to the data location - 8 bytes > 3. version info - 1 byte > 4. A data array of size num attributes, which contains the following: > a. attribute ID - 8 bytes > b. attribute value in number - 8 bytes > c. attribute name in string - 64 bytes > d. 
attribute value in string - 64 bytes > > The new H_CALL exports information in direct string value format, hence > a new interface has been introduced in > /sys/firmware/papr/energy_scale_info to export this information to > userspace in an extensible pass-through format. > > The H_CALL returns the name, numeric value and string value (if exists) > > The format of exposing the sysfs information is as follows: > /sys/firmware/papr/energy_scale_info/ >|-- / > |-- desc > |-- value > |-- value_desc (if exists) >|-- / > |-- desc > |-- value > |-- value_desc (if exists) > ... > > The energy information that is exported is useful for userspace tools > such as powerpc-utils. Currently these tools infer the > "power_mode_data" value in the lparcfg, which in turn is obtained from > the to be deprecated H_GET_EM_PARMS H_CALL. > On future platforms, such userspace utilities will have to look at the > data returned from the new H_CALL being populated in this new sysfs > interface and report this information directly without the need of > interpretation. > > Signed-off-by: Pratik R. Sampat > Reviewed-by: Gautham R. 
Shenoy > --- > .../sysfs-firmware-papr-energy-scale-info | 26 ++ > arch/powerpc/include/asm/hvcall.h | 24 +- > arch/powerpc/kvm/trace_hv.h | 1 + > arch/powerpc/platforms/pseries/Makefile | 3 +- > .../pseries/papr_platform_attributes.c| 320 ++ > 5 files changed, 372 insertions(+), 2 deletions(-) > create mode 100644 > Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info > create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c > > diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info > b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info > new file mode 100644 > index ..fd82f2bfafe5 > --- /dev/null > +++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info > @@ -0,0 +1,26 @@ > +What:/sys/firmware/papr/energy_scale_info > +Date:June 2021 > +Contact: Linux for PowerPC mailing list > +Description: Directory hosting a set of platform attributes like > + energy/frequency on Linux running as a PAPR guest. > + > + Each file in a directory contains a platform > + attribute hierarchy pertaining to performance/ > + energy-savings mode and processor frequency. > + > +What:/sys/firmware/papr/energy_scale_info/ > + /sys/firmware/papr/energy_scale_info//desc > + /sys/firmware/papr/energy_scale_info//value > + /sys/firmware/papr/energy_scale_info//value_desc > +Date:June 2021 > +Contact: Linux for PowerPC mailing list > +Description: Energy, frequency attributes directory for POWERVM servers > + > + This directory provides energy, erequency, folding information. > It s/erequency/frequency/ > + contains below sysfs attributes: > + > + - desc: String description of the attribute > + > + - value: Numeric value of attribute >
Re: [PATCH] powerpc/64s/hash: Fix SLB preload cache vs kthread_use_mm
Excerpts from Nicholas Piggin's message of July 8, 2021 7:05 pm: > It's possible for kernel threads to pick up SLB preload entries if > they are accessing userspace with kthread_use_mm. If the kthread > later is context switched while using a different mm, when it is > switched back it could preload SLBs belonging to the previous mm. > > This could lead to data corruption, leaks, SLB multi hits, etc. > > In the absence of a usable hook to clear preloads when unusing an > mm, fix it by keeping track of the mm that the preloads belong to. > > Adjust the isync() comment to be clear it can't be skipped if we > had no preloads. I should note that this patch is wrong, and so I withdraw it. The supposed bug is not actually a bug, because the SLB preload only records the ESID/EA to preload, not the VA. So this cross-mm "leak" can happen, but the worst it will do is preload some addresses used in the previous mm that are not likely to be accessed in the new mm. There is an idea that this is the "correct" thing to be doing as performance goes, but on the other hand it should be pretty rare to happen so it may not be worth the extra logic. At least it should be submitted as a performance thing not bugfix if we did do it. 
Thanks, Nick > > Fixes: 5434ae74629a ("powerpc/64s/hash: Add a SLB preload cache") > Signed-off-by: Nicholas Piggin > --- > arch/powerpc/include/asm/thread_info.h | 1 + > arch/powerpc/mm/book3s64/slb.c | 36 ++ > 2 files changed, 26 insertions(+), 11 deletions(-) > > diff --git a/arch/powerpc/include/asm/thread_info.h > b/arch/powerpc/include/asm/thread_info.h > index b4ec6c7dd72e..c3de13dde2af 100644 > --- a/arch/powerpc/include/asm/thread_info.h > +++ b/arch/powerpc/include/asm/thread_info.h > @@ -54,6 +54,7 @@ struct thread_info { > #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC32) > struct cpu_accounting_data accounting; > #endif > + struct mm_struct *slb_preload_mm; > unsigned char slb_preload_nr; > unsigned char slb_preload_tail; > u32 slb_preload_esid[SLB_PRELOAD_NR]; > diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c > index c91bd85eb90e..4f9dbce0dd84 100644 > --- a/arch/powerpc/mm/book3s64/slb.c > +++ b/arch/powerpc/mm/book3s64/slb.c > @@ -294,11 +294,20 @@ static bool preload_hit(struct thread_info *ti, > unsigned long esid) > return false; > } > > -static bool preload_add(struct thread_info *ti, unsigned long ea) > +static bool preload_add(struct thread_info *ti, struct mm_struct *mm, > unsigned long ea) > { > unsigned char idx; > unsigned long esid; > > + if (unlikely(ti->slb_preload_mm != mm)) { > + /* > + * kthread_use_mm or other temporary mm switching can > + * change the mm being used by a particular thread. > + */ > + ti->slb_preload_nr = 0; > + ti->slb_preload_mm = mm; > + } > + > if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { > /* EAs are stored >> 28 so 256MB segments don't need clearing */ > if (ea & ESID_MASK_1T) > @@ -362,13 +371,13 @@ void slb_setup_new_exec(void) >* 0x1000 so it makes sense to preload this segment. >*/ > if (!is_kernel_addr(exec)) { > - if (preload_add(ti, exec)) > + if (preload_add(ti, mm, exec)) > slb_allocate_user(mm, exec); > } > > /* Libraries and mmaps. 
*/ > if (!is_kernel_addr(mm->mmap_base)) { > - if (preload_add(ti, mm->mmap_base)) > + if (preload_add(ti, mm, mm->mmap_base)) > slb_allocate_user(mm, mm->mmap_base); > } > > @@ -394,19 +403,19 @@ void preload_new_slb_context(unsigned long start, > unsigned long sp) > > /* Userspace entry address. */ > if (!is_kernel_addr(start)) { > - if (preload_add(ti, start)) > + if (preload_add(ti, mm, start)) > slb_allocate_user(mm, start); > } > > /* Top of stack, grows down. */ > if (!is_kernel_addr(sp)) { > - if (preload_add(ti, sp)) > + if (preload_add(ti, mm, sp)) > slb_allocate_user(mm, sp); > } > > /* Bottom of heap, grows up. */ > if (heap && !is_kernel_addr(heap)) { > - if (preload_add(ti, heap)) > + if (preload_add(ti, mm, heap)) > slb_allocate_user(mm, heap); > } > > @@ -502,6 +511,11 @@ void switch_slb(struct task_struct *tsk, struct > mm_struct *mm) > > copy_mm_to_paca(mm); > > + if (unlikely(ti->slb_preload_mm != mm)) { > + ti->slb_preload_nr = 0; > + ti->slb_preload_mm = mm; > + } > + > /* >* We gradually age out SLBs after a number of context switches to >* reduce reload overhead of unused entries (like we do with FP/VEC > @@ -513,7
Re: [RFC PATCH 11/43] KVM: PPC: Book3S HV P9: Implement PMU save/restore in C
> On 12-Jul-2021, at 8:19 AM, Nicholas Piggin wrote: > > Excerpts from Athira Rajeev's message of July 10, 2021 12:47 pm: >> >> >>> On 22-Jun-2021, at 4:27 PM, Nicholas Piggin wrote: >>> >>> Implement the P9 path PMU save/restore code in C, and remove the >>> POWER9/10 code from the P7/8 path assembly. >>> >>> -449 cycles (8533) POWER9 virt-mode NULL hcall >>> >>> Signed-off-by: Nicholas Piggin >>> --- >>> arch/powerpc/include/asm/asm-prototypes.h | 5 - >>> arch/powerpc/kvm/book3s_hv.c | 205 -- >>> arch/powerpc/kvm/book3s_hv_interrupts.S | 13 +- >>> arch/powerpc/kvm/book3s_hv_rmhandlers.S | 43 + >>> 4 files changed, 200 insertions(+), 66 deletions(-) >>> >>> diff --git a/arch/powerpc/include/asm/asm-prototypes.h >>> b/arch/powerpc/include/asm/asm-prototypes.h >>> index 02ee6f5ac9fe..928db8ef9a5a 100644 >>> --- a/arch/powerpc/include/asm/asm-prototypes.h >>> +++ b/arch/powerpc/include/asm/asm-prototypes.h >>> @@ -136,11 +136,6 @@ static inline void kvmppc_restore_tm_hv(struct >>> kvm_vcpu *vcpu, u64 msr, >>> bool preserve_nv) { } >>> #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ >>> >>> -void kvmhv_save_host_pmu(void); >>> -void kvmhv_load_host_pmu(void); >>> -void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use); >>> -void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu); >>> - >>> void kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu); >>> >>> long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr); >>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c >>> index f7349d150828..b1b94b3563b7 100644 >>> --- a/arch/powerpc/kvm/book3s_hv.c >>> +++ b/arch/powerpc/kvm/book3s_hv.c >>> @@ -3635,6 +3635,188 @@ static noinline void kvmppc_run_core(struct >>> kvmppc_vcore *vc) >>> trace_kvmppc_run_core(vc, 1); >>> } >>> >>> +/* >>> + * Privileged (non-hypervisor) host registers to save. 
>>> + */ >>> +struct p9_host_os_sprs { >>> + unsigned long dscr; >>> + unsigned long tidr; >>> + unsigned long iamr; >>> + unsigned long amr; >>> + unsigned long fscr; >>> + >>> + unsigned int pmc1; >>> + unsigned int pmc2; >>> + unsigned int pmc3; >>> + unsigned int pmc4; >>> + unsigned int pmc5; >>> + unsigned int pmc6; >>> + unsigned long mmcr0; >>> + unsigned long mmcr1; >>> + unsigned long mmcr2; >>> + unsigned long mmcr3; >>> + unsigned long mmcra; >>> + unsigned long siar; >>> + unsigned long sier1; >>> + unsigned long sier2; >>> + unsigned long sier3; >>> + unsigned long sdar; >>> +}; >>> + >>> +static void freeze_pmu(unsigned long mmcr0, unsigned long mmcra) >>> +{ >>> + if (!(mmcr0 & MMCR0_FC)) >>> + goto do_freeze; >>> + if (mmcra & MMCRA_SAMPLE_ENABLE) >>> + goto do_freeze; >>> + if (cpu_has_feature(CPU_FTR_ARCH_31)) { >>> + if (!(mmcr0 & MMCR0_PMCCEXT)) >>> + goto do_freeze; >>> + if (!(mmcra & MMCRA_BHRB_DISABLE)) >>> + goto do_freeze; >>> + } >>> + return; >> >> >> Hi Nick >> >> When freezing the PMU, do we need to also set pmcregs_in_use to zero ? > > Not immediately, we still need to save out the values of the PMU > registers. If we clear pmcregs_in_use, then our hypervisor can discard > the contents of those registers at any time. > >> Also, why we need these above conditions like MMCRA_SAMPLE_ENABLE, >> MMCR0_PMCCEXT checks also before freezing ? > > Basically just because that's the condition we wnat to set the PMU to > before entering the guest if the guest does not have its own PMU > registers in use. > > I'm not entirely sure this is correct / optimal for perf though, so we > should double check that. I think some of this stuff should be wrapped > up and put into perf/ subdirectory as I've said before. KVM shouldn't > need to know about exactly how PMU is to be set up and managed by > perf, we should just be able to ask perf to save/restore/switch state. Hi Nick, Agree to your point. 
It makes sense that some of this perf code should be moved under the perf/ directory. Athira > > Thanks, > Nick
Re: [PATCH v15 06/12] swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing
On Tue, Jul 06, 2021 at 12:59:57PM -0400, Konrad Rzeszutek Wilk wrote: > On Tue, Jul 06, 2021 at 05:57:21PM +0100, Will Deacon wrote: > > On Tue, Jul 06, 2021 at 10:46:07AM -0400, Konrad Rzeszutek Wilk wrote: > > > On Tue, Jul 06, 2021 at 04:05:13PM +0200, Christoph Hellwig wrote: > > > > On Tue, Jul 06, 2021 at 03:01:04PM +0100, Robin Murphy wrote: > > > > > FWIW I was pondering the question of whether to do something along > > > > > those > > > > > lines or just scrap the default assignment entirely, so since I > > > > > hadn't got > > > > > round to saying that I've gone ahead and hacked up the alternative > > > > > (similarly untested) for comparison :) > > > > > > > > > > TBH I'm still not sure which one I prefer... > > > > > > > > Claire did implement something like your suggestion originally, but > > > > I don't really like it as it doesn't scale for adding multiple global > > > > pools, e.g. for the 64-bit addressable one for the various encrypted > > > > secure guest schemes. > > > > > > Couple of things: > > > - I am not pushing to Linus the Claire's patchset until we have a > > >resolution on this. I hope you all agree that is a sensible way > > >forward as much as I hate doing that. > > > > Sure, it's a pity but we could clearly use a bit more time to get these > > just right and we've run out of time for 5.14. > > > > I think the main question I have is how would you like to see patches for > > 5.15? i.e. as patches on top of devel/for-linus-5.14 or something else? > > Yes that would be perfect. If there are any dependencies on the rc1, I > can rebase it on top of that. Yes, please, rebasing would be very helpful. The broader rework of 'io_tlb_default_mem' is going to conflict quite badly otherwise. Cheers, Will
[PATCH v1 2/4] mm/memory_hotplug: remove nid parameter from arch_remove_memory()
The parameter is unused, let's remove it. Acked-by: Catalin Marinas Acked-by: Michael Ellerman (powerpc) Acked-by: Heiko Carstens (s390) Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: x...@kernel.org Cc: "H. Peter Anvin" Cc: Andrew Morton Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Pavel Tatashin Cc: Baoquan He Cc: Laurent Dufour Cc: Sergei Trofimovich Cc: Kefeng Wang Cc: Michel Lespinasse Cc: Christophe Leroy Cc: "Aneesh Kumar K.V" Cc: Thiago Jung Bauermann Cc: Joe Perches Cc: Pierre Morel Cc: Jia He Cc: linux-arm-ker...@lists.infradead.org Cc: linux-i...@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s...@vger.kernel.org Cc: linux...@vger.kernel.org Signed-off-by: David Hildenbrand --- arch/arm64/mm/mmu.c| 3 +-- arch/ia64/mm/init.c| 3 +-- arch/powerpc/mm/mem.c | 3 +-- arch/s390/mm/init.c| 3 +-- arch/sh/mm/init.c | 3 +-- arch/x86/mm/init_32.c | 3 +-- arch/x86/mm/init_64.c | 3 +-- include/linux/memory_hotplug.h | 3 +-- mm/memory_hotplug.c| 4 ++-- mm/memremap.c | 5 + 10 files changed, 11 insertions(+), 22 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d74586508448..af8ab553a268 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1506,8 +1506,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 064a967a7b6e..5c6da8d83c1a 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -484,8 
+484,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ad198b439222..c3c4e31462ec 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -119,8 +119,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, return rc; } -void __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8ac710de1ab1..d85bd7f5d8dc 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -306,8 +306,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return rc; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index ce26c7f8950a..506784702430 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -414,8 +414,7 @@ int arch_add_memory(int nid, u64 start, u64 size, return ret; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 74b78840182d..bd90b8fe81e4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -801,8 +801,7 @@ int arch_add_memory(int nid, u64 
start, u64 size, return __add_pages(nid, start_pfn, nr_pages, params); } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ddeaba947eb3..a6e11763763f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1255,8 +1255,7 @@
Re: [RFC PATCH 10/43] powerpc/64s: Always set PMU control registers to frozen/disabled when not in use
On 12-Jul-2021, at 8:11 AM, Nicholas Piggin wrote:Excerpts from Athira Rajeev's message of July 10, 2021 12:50 pm:On 22-Jun-2021, at 4:27 PM, Nicholas Piggin wrote:KVM PMU management code looks for particular frozen/disabled bits inthe PMU registers so it knows whether it must clear them when comingout of a guest or not. Setting this up helps KVM make these optimisationswithout getting confused. Longer term the better approach might be tomove guest/host PMU switching to the perf subsystem.Signed-off-by: Nicholas Piggin ---arch/powerpc/kernel/cpu_setup_power.c | 4 ++--arch/powerpc/kernel/dt_cpu_ftrs.c | 6 +++---arch/powerpc/kvm/book3s_hv.c | 5 +arch/powerpc/perf/core-book3s.c | 7 +++4 files changed, 17 insertions(+), 5 deletions(-)diff --git a/arch/powerpc/kernel/cpu_setup_power.c b/arch/powerpc/kernel/cpu_setup_power.cindex a29dc8326622..3dc61e203f37 100644--- a/arch/powerpc/kernel/cpu_setup_power.c+++ b/arch/powerpc/kernel/cpu_setup_power.c@@ -109,7 +109,7 @@ static void init_PMU_HV_ISA207(void)static void init_PMU(void){ mtspr(SPRN_MMCRA, 0);- mtspr(SPRN_MMCR0, 0);+ mtspr(SPRN_MMCR0, MMCR0_FC); mtspr(SPRN_MMCR1, 0); mtspr(SPRN_MMCR2, 0);}@@ -123,7 +123,7 @@ static void init_PMU_ISA31(void){ mtspr(SPRN_MMCR3, 0); mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);- mtspr(SPRN_MMCR0, MMCR0_PMCCEXT);+ mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);}/*diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.cindex 0a6b36b4bda8..06a089fbeaa7 100644--- a/arch/powerpc/kernel/dt_cpu_ftrs.c+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c@@ -353,7 +353,7 @@ static void init_pmu_power8(void) } mtspr(SPRN_MMCRA, 0);- mtspr(SPRN_MMCR0, 0);+ mtspr(SPRN_MMCR0, MMCR0_FC); mtspr(SPRN_MMCR1, 0); mtspr(SPRN_MMCR2, 0); mtspr(SPRN_MMCRS, 0);@@ -392,7 +392,7 @@ static void init_pmu_power9(void) mtspr(SPRN_MMCRC, 0); mtspr(SPRN_MMCRA, 0);- mtspr(SPRN_MMCR0, 0);+ mtspr(SPRN_MMCR0, MMCR0_FC); mtspr(SPRN_MMCR1, 0); mtspr(SPRN_MMCR2, 0);}@@ -428,7 +428,7 @@ static void 
init_pmu_power10(void) mtspr(SPRN_MMCR3, 0); mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);- mtspr(SPRN_MMCR0, MMCR0_PMCCEXT);+ mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);}static int __init feat_enable_pmu_power10(struct dt_cpu_feature *f)diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.cindex 1f30f98b09d1..f7349d150828 100644--- a/arch/powerpc/kvm/book3s_hv.c+++ b/arch/powerpc/kvm/book3s_hv.c@@ -2593,6 +2593,11 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)#endif#endif vcpu->arch.mmcr[0] = MMCR0_FC;+ if (cpu_has_feature(CPU_FTR_ARCH_31)) {+ vcpu->arch.mmcr[0] |= MMCR0_PMCCEXT;+ vcpu->arch.mmcra = MMCRA_BHRB_DISABLE;+ }+ vcpu->arch.ctrl = CTRL_RUNLATCH; /* default to host PVR, since we can't spoof it */ kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.cindex 51622411a7cc..e33b29ec1a65 100644--- a/arch/powerpc/perf/core-book3s.c+++ b/arch/powerpc/perf/core-book3s.c@@ -1361,6 +1361,13 @@ static void power_pmu_enable(struct pmu *pmu) goto out; if (cpuhw->n_events == 0) {+ if (cpu_has_feature(CPU_FTR_ARCH_31)) {+ mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);+ mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);+ } else {+ mtspr(SPRN_MMCRA, 0);+ mtspr(SPRN_MMCR0, MMCR0_FC);+ }Hi Nick,We are setting these bits in “power_pmu_disable” function. And disable will be called before any event gets deleted/stopped. Can you please help to understand why this is needed in power_pmu_enable path also ?I'll have to go back and check what I needed it for.Basically what I'm trying to achieve is that when the guest and host are not running perf, then the registers should match. This way the host can test them with mfspr but does not need to fix them with mtspr.It's not too important for performance after TM facility demand faultingand the nested guest PMU fix means you can usually just skip that entirely, but I think it's cleaner. 
I'll double check my perf/ changes — it's possible that part should be split out or is unnecessary. Sure Nick. power_pmu_disable already sets MMCRA_BHRB_DISABLE/MMCR0_FC/MMCR0_PMCCEXT bits. Hence trying to understand in which scenario we found this was needed for power_pmu_enable. Thanks, Athira. Thanks, Nick
Re: [PATCH v2 1/2] sched/topology: Skip updating masks for non-online nodes
Hi Valentin, > On 01/07/21 09:45, Srikar Dronamraju wrote: > > @@ -1891,12 +1894,30 @@ void sched_init_numa(void) > > void sched_domains_numa_masks_set(unsigned int cpu) > > { > > Hmph, so we're playing games with masks of offline nodes - is that really > necessary? Your modification of sched_init_numa() still scans all of the > nodes (regardless of their online status) to build the distance map, and > that is never updated (sched_init_numa() is pretty much an __init > function). > > So AFAICT this is all to cope with topology_span_sane() not applying > 'cpu_map' to its masks. That seemed fine to me back when I wrote it, but in > light of having bogus distance values for offline nodes, not so much... > > What about the below instead? > > --- > diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c > index b77ad49dc14f..c2d9caad4aa6 100644 > --- a/kernel/sched/topology.c > +++ b/kernel/sched/topology.c > @@ -2075,6 +2075,7 @@ static struct sched_domain *build_sched_domain(struct > sched_domain_topology_leve > static bool topology_span_sane(struct sched_domain_topology_level *tl, > const struct cpumask *cpu_map, int cpu) > { > + struct cpumask *intersect = sched_domains_tmpmask; > int i; > > /* NUMA levels are allowed to overlap */ > @@ -2090,14 +2091,17 @@ static bool topology_span_sane(struct > sched_domain_topology_level *tl, > for_each_cpu(i, cpu_map) { > if (i == cpu) > continue; > + > /* > - * We should 'and' all those masks with 'cpu_map' to exactly > - * match the topology we're about to build, but that can only > - * remove CPUs, which only lessens our ability to detect > - * overlaps > + * We shouldn't have to bother with cpu_map here, unfortunately > + * some architectures (powerpc says hello) have to deal with > + * offline NUMA nodes reporting bogus distance values. This can > + * lead to funky NODE domain spans, but since those are offline > + * we can mask them out. 
>*/ > + cpumask_and(intersect, tl->mask(cpu), tl->mask(i)); > if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && > - cpumask_intersects(tl->mask(cpu), tl->mask(i))) > + cpumask_intersects(intersect, cpu_map)) > return false; > } > Unfortunately this is not helping. I tried this patch alone and also with 2/2 patch of this series where we update/fill fake topology numbers. However both cases are still failing. -- Thanks and Regards Srikar Dronamraju
[PATCH v1 3/4] mm/memory_hotplug: remove nid parameter from remove_memory() and friends
There is only a single user remaining. We can simply lookup the nid only used for node offlining purposes when walking our memory blocks. We don't expect to remove multi-nid ranges; and if we'd ever do, we most probably don't care about removing multi-nid ranges that actually result in empty nodes. If ever required, we can detect the "multi-nid" scenario and simply try offlining all online nodes. Acked-by: Michael Ellerman (powerpc) Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Andrew Morton Cc: Nathan Lynch Cc: Laurent Dufour Cc: "Aneesh Kumar K.V" Cc: Scott Cheloha Cc: Anton Blanchard Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-a...@vger.kernel.org Cc: nvd...@lists.linux.dev Signed-off-by: David Hildenbrand --- .../platforms/pseries/hotplug-memory.c| 9 +++--- drivers/acpi/acpi_memhotplug.c| 7 + drivers/dax/kmem.c| 3 +- drivers/virtio/virtio_mem.c | 4 +-- include/linux/memory_hotplug.h| 10 +++ mm/memory_hotplug.c | 28 +++ 6 files changed, 30 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 377d852f5a9a..ef5c24b42cf1 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -286,7 +286,7 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si { unsigned long block_sz, start_pfn; int sections_per_block; - int i, nid; + int i; start_pfn = base >> PAGE_SHIFT; @@ -297,10 +297,9 @@ static int pseries_remove_memblock(unsigned long base, unsigned long memblock_si block_sz = pseries_memory_block_size(); sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - nid = memory_add_physaddr_to_nid(base); for (i = 0; i < sections_per_block; i++) { - __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE); + __remove_memory(base, MIN_MEMORY_BLOCK_SIZE); 
base += MIN_MEMORY_BLOCK_SIZE; } @@ -387,7 +386,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) block_sz = pseries_memory_block_size(); - __remove_memory(mem_block->nid, lmb->base_addr, block_sz); + __remove_memory(lmb->base_addr, block_sz); put_device(_block->dev); /* Update memory regions for memory remove */ @@ -660,7 +659,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) rc = dlpar_online_lmb(lmb); if (rc) { - __remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 8cc195c4c861..1d01d9414c40 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -239,19 +239,14 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) static void acpi_memory_remove_memory(struct acpi_memory_device *mem_device) { - acpi_handle handle = mem_device->device->handle; struct acpi_memory_info *info, *n; - int nid = acpi_get_node(handle); list_for_each_entry_safe(info, n, _device->res_list, list) { if (!info->enabled) continue; - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(info->start_addr); - acpi_unbind_memory_blocks(info); - __remove_memory(nid, info->start_addr, info->length); + __remove_memory(info->start_addr, info->length); list_del(>list); kfree(info); } diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index ac231cc36359..99e0f60c4c26 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -156,8 +156,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax) if (rc) continue; - rc = remove_memory(dev_dax->target_node, range.start, - range_len()); + rc = remove_memory(range.start, range_len()); if (rc == 0) { release_resource(data->res[i]); kfree(data->res[i]); diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 09ed55de07d7..774986695dc4 100644 --- 
a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -677,7 +677,7 @@ static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, dev_dbg(>vdev->dev, "removing memory: 0x%llx - 0x%llx\n",
[PATCH v3 1/1] powerpc/pseries: Interface to represent PAPR firmware attributes
Adds a generic interface to represent the energy and frequency related PAPR attributes on the system using the new H_CALL "H_GET_ENERGY_SCALE_INFO". H_GET_EM_PARMS H_CALL was previously responsible for exporting this information in the lparcfg, however the H_GET_EM_PARMS H_CALL will be deprecated P10 onwards. The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format: hcall( uint64 H_GET_ENERGY_SCALE_INFO, // Get energy scale info uint64 flags, // Per the flag request uint64 firstAttributeId,// The attribute id uint64 bufferAddress, // Guest physical address of the output buffer uint64 bufferSize // The size in bytes of the output buffer ); This H_CALL can query either all the attributes at once with firstAttributeId = 0, flags = 0 as well as query only one attribute at a time with firstAttributeId = id, flags = 1. The output buffer consists of the following 1. number of attributes - 8 bytes 2. array offset to the data location - 8 bytes 3. version info - 1 byte 4. A data array of size num attributes, which contains the following: a. attribute ID - 8 bytes b. attribute value in number - 8 bytes c. attribute name in string - 64 bytes d. attribute value in string - 64 bytes The new H_CALL exports information in direct string value format, hence a new interface has been introduced in /sys/firmware/papr/energy_scale_info to export this information to userspace in an extensible pass-through format. The H_CALL returns the name, numeric value and string value (if exists) The format of exposing the sysfs information is as follows: /sys/firmware/papr/energy_scale_info/ |-- / |-- desc |-- value |-- value_desc (if exists) |-- / |-- desc |-- value |-- value_desc (if exists) ... The energy information that is exported is useful for userspace tools such as powerpc-utils. Currently these tools infer the "power_mode_data" value in the lparcfg, which in turn is obtained from the to be deprecated H_GET_EM_PARMS H_CALL. 
On future platforms, such userspace utilities will have to look at the data returned from the new H_CALL being populated in this new sysfs interface and report this information directly without the need of interpretation. Signed-off-by: Pratik R. Sampat Reviewed-by: Gautham R. Shenoy --- .../sysfs-firmware-papr-energy-scale-info | 26 ++ arch/powerpc/include/asm/hvcall.h | 24 +- arch/powerpc/kvm/trace_hv.h | 1 + arch/powerpc/platforms/pseries/Makefile | 3 +- .../pseries/papr_platform_attributes.c| 320 ++ 5 files changed, 372 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info new file mode 100644 index ..fd82f2bfafe5 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info @@ -0,0 +1,26 @@ +What: /sys/firmware/papr/energy_scale_info +Date: June 2021 +Contact: Linux for PowerPC mailing list +Description: Directory hosting a set of platform attributes like + energy/frequency on Linux running as a PAPR guest. + + Each file in a directory contains a platform + attribute hierarchy pertaining to performance/ + energy-savings mode and processor frequency. + +What: /sys/firmware/papr/energy_scale_info/ + /sys/firmware/papr/energy_scale_info//desc + /sys/firmware/papr/energy_scale_info//value + /sys/firmware/papr/energy_scale_info//value_desc +Date: June 2021 +Contact: Linux for PowerPC mailing list +Description: Energy, frequency attributes directory for POWERVM servers + + This directory provides energy, erequency, folding information. 
It + contains below sysfs attributes: + + - desc: String description of the attribute + + - value: Numeric value of attribute + + - value_desc: String value of attribute diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index e3b29eda8074..c91714ea6719 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -316,7 +316,8 @@ #define H_SCM_PERFORMANCE_STATS 0x418 #define H_RPT_INVALIDATE 0x448 #define H_SCM_FLUSH0x44C -#define MAX_HCALL_OPCODE H_SCM_FLUSH +#define H_GET_ENERGY_SCALE_INFO0x450 +#define MAX_HCALL_OPCODE H_GET_ENERGY_SCALE_INFO /* Scope args for H_SCM_UNBIND_ALL */ #define H_UNBIND_SCOPE_ALL (0x1) @@ -631,6 +632,27 @@ struct hv_gpci_request_buffer { uint8_t bytes[HGPCI_MAX_DATA_BYTES]; } __packed;
[PATCH v3 0/1] Interface to represent PAPR firmware attributes
RFC: https://lkml.org/lkml/2021/6/4/791 PATCH v1: https://lkml.org/lkml/2021/6/16/805 PATCH v2: https://lkml.org/lkml/2021/7/6/138 Changelog v2 --> v3 Based on a comment from Gautham: 1. Added a versioning check after the H_CALL is made to bail out when the version from the firmware is inconsistent with that in the kernel Also, have implemented a POC using this interface for the powerpc-utils' ppc64_cpu --frequency command-line tool to utilize this information in userspace. The POC for the new interface has been hosted here: https://github.com/pratiksampat/powerpc-utils/tree/H_GET_ENERGY_SCALE_INFO_v2 Sample output from the powerpc-utils tool is as follows: # ppc64_cpu --frequency Power and Performance Mode: Idle Power Saver Status : Processor Folding Status : --> Printed if Idle power save status is supported Platform reported frequencies --> Frequencies reported from the platform's H_CALL i.e PAPR interface min: GHz max: GHz static : GHz Tool Computed frequencies min: GHz (cpu XX) max: GHz (cpu XX) avg: GHz Pratik R. Sampat (1): powerpc/pseries: Interface to represent PAPR firmware attributes .../sysfs-firmware-papr-energy-scale-info | 26 ++ arch/powerpc/include/asm/hvcall.h | 24 +- arch/powerpc/kvm/trace_hv.h | 1 + arch/powerpc/platforms/pseries/Makefile | 3 +- .../pseries/papr_platform_attributes.c| 320 ++ 5 files changed, 372 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c -- 2.31.1
Re: [PATCH] ASoC: fsl_xcvr: Omit superfluous error message in fsl_xcvr_probe()
On Thu, 24 Jun 2021 18:45:05 +0800, Tang Bin wrote: > In the function fsl_xcvr_probe(), when get irq failed, > the function platform_get_irq() logs an error message, so remove > redundant message here. Applied to https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next Thanks! [1/1] ASoC: fsl_xcvr: Omit superfluous error message in fsl_xcvr_probe() commit: 8620c40002db9679279546cc3be2aceb8c5e5e76 All being well this means that it will be integrated into the linux-next tree (usually sometime in the next 24 hours) and sent to Linus during the next merge window (or sooner if it is a bug fix), however if problems are discovered then the patch may be dropped or reverted. You may get further e-mails resulting from automated or manual testing and review of the tree, please engage with people reporting problems and send followup patches addressing any issues that are reported if needed. If any updates are required or you are submitting further changes they should be sent as incremental updates against current git, existing patches will not be replaced. Please add any relevant lists and maintainers to the CCs when replying to this mail. Thanks, Mark
Re: [PATCH] powerpc/perf: Fix cycles/instructions as PM_CYC/PM_INST_CMPL in power10
> On 08-Jul-2021, at 9:13 PM, Paul A. Clarke wrote: > > On Thu, Jul 08, 2021 at 10:56:57PM +1000, Nicholas Piggin wrote: >> Excerpts from Athira Rajeev's message of July 7, 2021 4:39 pm: >>> From: Athira Rajeev >>> >>> Power10 performance monitoring unit (PMU) driver uses performance >>> monitor counter 5 (PMC5) and performance monitor counter 6 (PMC6) >>> for counting instructions and cycles. Event used for cycles is >>> PM_RUN_CYC and instructions is PM_RUN_INST_CMPL. But counting of these >>> events in wait state is controlled by the CC56RUN bit setting in >>> Monitor Mode Control Register0 (MMCR0). If the CC56RUN bit is not >>> set, PMC5/6 will not count when CTRL[RUN] is zero. >> >> What's the actual bug here, can you explain a bit more? I thought >> PM_RUN_CYC is supposed to be gated by the runlatch. > > Would this renaming break compatibility with existing tools that > presume PM_RUN_CYC and PM_RUN_INST_CMPL exist generically? Hi Paul, Thanks for checking the patch. No, this does not break compatibility with existing tools. Since the change is only for PMC5 and PMC6. Events PM_RUN_CYC and PM_RUN_INST_CMPL still behave the same way since they are programmed in different PMC’s. Thanks Athira > > PC
Re: [PATCH] powerpc/perf: Fix cycles/instructions as PM_CYC/PM_INST_CMPL in power10
> On 08-Jul-2021, at 6:26 PM, Nicholas Piggin wrote: > > Excerpts from Athira Rajeev's message of July 7, 2021 4:39 pm: >> From: Athira Rajeev >> >> Power10 performance monitoring unit (PMU) driver uses performance >> monitor counter 5 (PMC5) and performance monitor counter 6 (PMC6) >> for counting instructions and cycles. Event used for cycles is >> PM_RUN_CYC and instructions is PM_RUN_INST_CMPL. But counting of these >> events in wait state is controlled by the CC56RUN bit setting in >> Monitor Mode Control Register0 (MMCR0). If the CC56RUN bit is not >> set, PMC5/6 will not count when CTRL[RUN] is zero. > > What's the acutal bug here, can you explain a bit more? I thought > PM_RUN_CYC is supposed to be gated by the runlatch. > > POWER9 code looks similar, it doesn't have the same problem? > > Thanks, > Nick Hi Nick, Thanks for the review. In power9 and before, the default event used for counting "cycles" - PM_CYC (0x0001e) and for counting instruction - PM_INST_CMPL (0x2) . These events, uses two programmable PMC to count cycles and instructions (this causes multiplexing for basic set of events supported by perf stat). And PM_CYC/PM_INST_CMPL both by default count irrespective of the run latch state. So in power10, we decided to use PMC5 and PMC6 for counting instructions and cycles respectively. But PMC5 and PMC6 by default counts only when runlatch is enabled, this can cause issues in case of system wide profiling. So in order to make PMC5/6 behave as PM_CYC and PM_INST_CMPL, we are enabling MMCR0[CC56RUN]] bit. Thanks Athira > >> >> Patch sets the CC56RUN bit in MMCR0 for power10 which makes PMC5 and >> PMC6 count instructions and cycles regardless of the run bit. With this >> change, these events are also now renamed to PM_CYC and PM_INST_CMPL >> rather than PM_RUN_CYC and PM_RUN_INST_CMPL. 
>> >> Fixes: a64e697cef23 ("powerpc/perf: power10 Performance Monitoring support") >> Signed-off-by: Athira Rajeev >> Reviewed-by: Madhavan Srinivasan >> --- >> Notes on testing done for this change: >> Tested this patch change with a kernel module that >> turns off and turns on the runlatch. kernel module also >> reads the counter values for PMC5 and PMC6 during the >> period when runlatch is off. >> - Started PMU counters via "perf stat" and loaded the >> test module. >> - Checked the counter values captured from module during >> the runlatch off period. >> - Verified that counters were frozen without the patch and >> with the patch, observed counters were incrementing. >> >> arch/powerpc/perf/power10-events-list.h | 8 +++--- >> arch/powerpc/perf/power10-pmu.c | 44 >> +++-- >> 2 files changed, 35 insertions(+), 17 deletions(-) >> >> diff --git a/arch/powerpc/perf/power10-events-list.h >> b/arch/powerpc/perf/power10-events-list.h >> index 93be719..564f1409 100644 >> --- a/arch/powerpc/perf/power10-events-list.h >> +++ b/arch/powerpc/perf/power10-events-list.h >> @@ -9,10 +9,10 @@ >> /* >> * Power10 event codes. 
>> */ >> -EVENT(PM_RUN_CYC, 0x600f4); >> +EVENT(PM_CYC, 0x600f4); >> EVENT(PM_DISP_STALL_CYC, 0x100f8); >> EVENT(PM_EXEC_STALL, 0x30008); >> -EVENT(PM_RUN_INST_CMPL, 0x500fa); >> +EVENT(PM_INST_CMPL, 0x500fa); >> EVENT(PM_BR_CMPL, 0x4d05e); >> EVENT(PM_BR_MPRED_CMPL, 0x400f6); >> EVENT(PM_BR_FIN, 0x2f04a); >> @@ -50,8 +50,8 @@ >> /* ITLB Reloaded */ >> EVENT(PM_ITLB_MISS, 0x400fc); >> >> -EVENT(PM_RUN_CYC_ALT, 0x0001e); >> -EVENT(PM_RUN_INST_CMPL_ALT, 0x2); >> +EVENT(PM_CYC_ALT, 0x0001e); >> +EVENT(PM_INST_CMPL_ALT, 0x2); >> >> /* >> * Memory Access Events >> diff --git a/arch/powerpc/perf/power10-pmu.c >> b/arch/powerpc/perf/power10-pmu.c >> index f9d64c6..9dd75f3 100644 >> --- a/arch/powerpc/perf/power10-pmu.c >> +++ b/arch/powerpc/perf/power10-pmu.c >> @@ -91,8 +91,8 @@ >> >> /* Table of alternatives, sorted by column 0 */ >> static const unsigned int power10_event_alternatives[][MAX_ALT] = { >> -{ PM_RUN_CYC_ALT, PM_RUN_CYC }, >> -{ PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, >> +{ PM_CYC_ALT, PM_CYC }, >> +{ PM_INST_CMPL_ALT, PM_INST_CMPL }, >> }; >> >> static int power10_get_alternatives(u64 event, unsigned int flags, u64 alt[]) >> @@ -118,8 +118,8 @@ static int power10_check_attr_config(struct perf_event >> *ev) >> return 0; >> } >> >> -GENERIC_EVENT_ATTR(cpu-cycles, PM_RUN_CYC); >> -GENERIC_EVENT_ATTR(instructions,PM_RUN_INST_CMPL); >> +GENERIC_EVENT_ATTR(cpu-cycles, PM_CYC); >> +GENERIC_EVENT_ATTR(instructions,
Re: [PATCH V3 1/1] powerpc/perf: Fix PMU callbacks to clear pending PMI before resetting an overflown PMC
> On 12-Jul-2021, at 8:07 AM, Nicholas Piggin wrote: > > Excerpts from Athira Rajeev's message of July 11, 2021 1:58 am: >> Running perf fuzzer showed below in dmesg logs: >> "Can't find PMC that caused IRQ" >> >> This means a PMU exception happened, but none of the PMC's (Performance >> Monitor Counter) were found to be overflown. There are some corner cases >> that clears the PMCs after PMI gets masked. In such cases, the perf >> interrupt handler will not find the active PMC values that had caused >> the overflow and thus leads to this message while replaying. >> >> Case 1: PMU Interrupt happens during replay of other interrupts and >> counter values gets cleared by PMU callbacks before replay: >> >> During replay of interrupts like timer, __do_irq and doorbell exception, we >> conditionally enable interrupts via may_hard_irq_enable(). This could >> potentially create a window to generate a PMI. Since irq soft mask is set >> to ALL_DISABLED, the PMI will get masked here. We could get IPIs run before >> perf interrupt is replayed and the PMU events could deleted or stopped. >> This will change the PMU SPR values and resets the counters. Snippet of >> ftrace log showing PMU callbacks invoked in "__do_irq": >> >> -0 [051] dns. 132025441306354: __do_irq <-call_do_irq >> -0 [051] dns. 132025441306430: irq_enter <-__do_irq >> -0 [051] dns. 132025441306503: irq_enter_rcu <-__do_irq >> -0 [051] dnH. 132025441306599: xive_get_irq <-__do_irq >> <<>> >> -0 [051] dnH. 132025441307770: >> generic_smp_call_function_single_interrupt <-smp_ipi_demux_relaxed >> -0 [051] dnH. 132025441307839: flush_smp_call_function_queue >> <-smp_ipi_demux_relaxed >> -0 [051] dnH. 132025441308057: _raw_spin_lock <-event_function >> -0 [051] dnH. 132025441308206: power_pmu_disable <-perf_pmu_disable >> -0 [051] dnH. 132025441308337: power_pmu_del <-event_sched_out >> -0 [051] dnH. 132025441308407: power_pmu_read <-power_pmu_del >> -0 [051] dnH. 
132025441308477: read_pmc <-power_pmu_read >> -0 [051] dnH. 132025441308590: isa207_disable_pmc <-power_pmu_del >> -0 [051] dnH. 132025441308663: write_pmc <-power_pmu_del >> -0 [051] dnH. 132025441308787: power_pmu_event_idx >> <-perf_event_update_userpage >> -0 [051] dnH. 132025441308859: rcu_read_unlock_strict >> <-perf_event_update_userpage >> -0 [051] dnH. 132025441308975: power_pmu_enable <-perf_pmu_enable >> <<>> >> -0 [051] dnH. 132025441311108: irq_exit <-__do_irq >> -0 [051] dns. 132025441311319: performance_monitor_exception >> <-replay_soft_interrupts >> >> Case 2: PMI's masked during local_* operations, example local_add. >> If the local_add operation happens within a local_irq_save, replay of >> PMI will be during local_irq_restore. Similar to case 1, this could >> also create a window before replay where PMU events gets deleted or >> stopped. >> >> Patch adds a fix to update the PMU callback function 'power_pmu_disable' to >> check for pending perf interrupt. If there is an overflown PMC and pending >> perf interrupt indicated in Paca, clear the PMI bit in paca to drop that >> sample. Clearing of PMI bit is done in 'power_pmu_disable' since disable is >> invoked before any event gets deleted/stopped. With this fix, if there are >> more than one event running in the PMU, there is a chance that we clear the >> PMI bit for the event which is not getting deleted/stopped. The other >> events may still remain active. Hence to make sure we don't drop valid >> sample in such cases, another check is added in power_pmu_enable. This >> checks if there is an overflown PMC found among the active events and if >> so enable back the PMI bit. Two new helper functions are introduced to >> clear/set the PMI, ie 'clear_pmi_irq_pending' and 'set_pmi_irq_pending'. >> >> Also there are corner cases which results in performance monitor interrupts >> getting triggered during power_pmu_disable. 
This happens since PMXE bit is >> not cleared along with disabling of other MMCR0 bits in the pmu_disable. >> Such PMI's could leave the PMU running and could trigger PMI again which >> will set MMCR0 PMAO bit. This could lead to spurious interrupts in some >> corner cases. Example, a timer after power_pmu_del which will re-enable >> interrupts and triggers a PMI again since PMAO bit is still set. But fails >> to find valid overflow since PMC get cleared in power_pmu_del. Patch >> fixes this by disabling PMXE along with disabling of other MMCR0 bits >> in power_pmu_disable. >> >> We can't just replay PMI any time. Hence this approach is preferred rather >> than replaying PMI before resetting overflown PMC. Patch also documents >> core-book3s on a race condition which can trigger these PMC messages during >> idle path in PowerNV. >> >> Fixes: f442d004806e ("powerpc/64s: Add support to mask perf interrupts and >> replay them") >> Reported-by: Nageswara R Sastry >> Suggested-by: Nicholas Piggin >> Suggested-by: Madhavan Srinivasan >> Signed-off-by: Athira Rajeev >> --- >>
[PATCH] powerpc/papr_scm: Implement initial support for injecting smart errors
Presently PAPR doesn't support injecting smart errors on an NVDIMM. This makes testing the NVDIMM health reporting functionality difficult as simulating NVDIMM health related events need a hacked up qemu version. To solve this problem this patch proposes simulating certain set of NVDIMM health related events in papr_scm. Specifically 'fatal' health state and 'dirty' shutdown state. These error can be injected via the user-space 'ndctl-inject-smart(1)' command. With the proposed patch and corresponding ndctl patches following command flow is expected: $ sudo ndctl list -DH -d nmem0 ... "health_state":"ok", "shutdown_state":"clean", ... # inject unsafe shutdown and fatal health error $ sudo ndctl inject-smart nmem0 -Uf ... "health_state":"fatal", "shutdown_state":"dirty", ... # uninject all errors $ sudo ndctl inject-smart nmem0 -N ... "health_state":"ok", "shutdown_state":"clean", ... The patch adds two members 'health_bitmap_mask' and 'health_bitmap_override' inside struct papr_scm_priv which are then bit blt'ed[1] to the health bitmaps fetched from the hypervisor. In case we are not able to fetch health information from the hypervisor we service the health bitmap from these two members. These members are accessible from sysfs at nmemX/papr/health_bitmap_override A new PDSM named 'SMART_INJECT' is proposed that accepts newly introduced 'struct nd_papr_pdsm_smart_inject' as payload thats exchanged between libndctl and papr_scm to indicate the requested smart-error states. When the processing the PDSM 'SMART_INJECT', papr_pdsm_smart_inject() constructs a pair or 'mask' and 'override' bitmaps from the payload and bit-blt it to the 'health_bitmap_{mask, override}' members. This ensures the after being fetched from the hypervisor, the health_bitmap reflects requested smart-error states. The patch is based on [2] "powerpc/papr_scm: Move duplicate definitions to common header files". 
[1] : https://en.wikipedia.org/wiki/Bit_blit [2] : https://lore.kernel.org/nvdimm/162505488483.72147.12741153746322191381.stgit@56e104a48989 Signed-off-by: Vaibhav Jain Signed-off-by: Shivaprasad G Bhat --- arch/powerpc/platforms/pseries/papr_scm.c | 94 ++- include/uapi/linux/papr_pdsm.h| 18 + 2 files changed, 109 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 0c56db5a1427..b7437c61a270 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -29,6 +29,10 @@ (1ul << ND_CMD_SET_CONFIG_DATA) | \ (1ul << ND_CMD_CALL)) +/* Use bitblt method to override specific bits in the '_bitmap_' */ +#define BITBLT_BITMAP(_bitmap_, _mask_, _override_)\ + (((_bitmap_) & ~(_mask_)) | ((_mask_) & (_override_))) + /* Struct holding a single performance metric */ struct papr_scm_perf_stat { u8 stat_id[8]; @@ -81,6 +85,12 @@ struct papr_scm_priv { /* length of the stat buffer as expected by phyp */ size_t stat_buffer_len; + + /* The bits which needs to be overridden */ + u64 health_bitmap_mask; + + /* The overridden values for the bits having the masks set */ + u64 health_bitmap_override; }; static int papr_scm_pmem_flush(struct nd_region *nd_region, @@ -308,19 +318,28 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p, static int __drc_pmem_query_health(struct papr_scm_priv *p) { unsigned long ret[PLPAR_HCALL_BUFSIZE]; + u64 bitmap = 0; long rc; /* issue the hcall */ rc = plpar_hcall(H_SCM_HEALTH, ret, p->drc_index); - if (rc != H_SUCCESS) { + if (rc == H_SUCCESS) + bitmap = ret[0] & ret[1]; + else if (rc == H_FUNCTION) + dev_info_once(&p->pdev->dev, + "Hcall H_SCM_HEALTH not implemented, assuming empty health bitmap"); + else { + dev_err(&p->pdev->dev, "Failed to query health information, Err:%ld\n", rc); return -ENXIO; } p->lasthealth_jiffies = jiffies; - p->health_bitmap = ret[0] & ret[1]; - + /* Allow overriding specific health bits via bit 
blt. */ + bitmap = BITBLT_BITMAP(bitmap, p->health_bitmap_mask, + p->health_bitmap_override); + WRITE_ONCE(p->health_bitmap, bitmap); dev_dbg(&p->pdev->dev, "Queried dimm health info. Bitmap:0x%016lx Mask:0x%016lx\n", ret[0], ret[1]); @@ -630,6 +649,54 @@ static int papr_pdsm_health(struct papr_scm_priv *p, return rc; } +/* Inject a smart error Add the dirty-shutdown-counter value to the pdsm */ +static int papr_pdsm_smart_inject(struct papr_scm_priv *p, + union nd_pdsm_payload *payload) +{ + int rc; + u32 supported_flags = 0; + u64 mask = 0, override
Re: [PATCH] memblock: make for_each_mem_range() traverse MEMBLOCK_HOTPLUG regions
On Mon, 12 Jul 2021 10:11:32 +0300 Mike Rapoport wrote: > From: Mike Rapoport > > Commit b10d6bca8720 ("arch, drivers: replace for_each_membock() with > for_each_mem_range()") didn't take into account that when there is > movable_node parameter in the kernel command line, for_each_mem_range() > would skip ranges marked with MEMBLOCK_HOTPLUG. > > The page table setup code in POWER uses for_each_mem_range() to create the > linear mapping of the physical memory and since the regions marked as > MEMORY_HOTPLUG are skipped, they never make it to the linear map. > > A later access to the memory in those ranges will fail: > > [2.271743] BUG: Unable to handle kernel data access on write at > 0xc004 > [2.271984] Faulting instruction address: 0xc008a3c0 > [2.272568] Oops: Kernel access of bad area, sig: 11 [#1] > [2.272683] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries > [2.273063] Modules linked in: > [2.273435] CPU: 0 PID: 53 Comm: kworker/u2:0 Not tainted 5.13.0 #7 > [2.273832] NIP: c008a3c0 LR: c03c1ed8 CTR: > 0040 > [2.273918] REGS: c8a57770 TRAP: 0300 Not tainted (5.13.0) > [2.274036] MSR: 82009033 CR: > 8402 XER: 2004 > [2.274454] CFAR: c03c1ed4 DAR: c004 DSISR: 4200 > IRQMASK: 0 > [2.274454] GPR00: c03c1ed8 c8a57a10 c19da700 > c004 > [2.274454] GPR04: 0280 0180 0400 > 0200 > [2.274454] GPR08: 0100 0080 0040 > 0300 > [2.274454] GPR12: 0380 c1bc c01660c8 > c6337e00 > [2.274454] GPR16: > > [2.274454] GPR20: 4000 2000 c1a81990 > c8c3 > [2.274454] GPR24: c8c2 c1a81998 000f > c1a819a0 > [2.274454] GPR28: c1a81908 c00c0100 c8c4 > c8a64680 > [2.275520] NIP [c008a3c0] clear_user_page+0x50/0x80 > [2.276333] LR [c03c1ed8] __handle_mm_fault+0xc88/0x1910 > [2.276688] Call Trace: > [2.276839] [c8a57a10] [c03c1e94] > __handle_mm_fault+0xc44/0x1910 (unreliable) > [2.277142] [c8a57af0] [c03c2c90] > handle_mm_fault+0x130/0x2a0 > [2.277331] [c8a57b40] [c03b5f08] > __get_user_pages+0x248/0x610 > [2.277541] [c8a57c40] [c03b848c] > __get_user_pages_remote+0x12c/0x3e0 > 
[2.277768] [c8a57cd0] [c0473f24] get_arg_page+0x54/0xf0 > [2.277959] [c8a57d10] [c0474a7c] > copy_string_kernel+0x11c/0x210 > [2.278159] [c8a57d80] [c047663c] kernel_execve+0x16c/0x220 > [2.278361] [c8a57dd0] [c0166270] > call_usermodehelper_exec_async+0x1b0/0x2f0 > [2.278543] [c8a57e10] [c000d5ec] > ret_from_kernel_thread+0x5c/0x70 > [2.278870] Instruction dump: > [2.279214] 79280fa4 79271764 79261f24 794ae8e2 7ca94214 7d683a14 7c893a14 > 7d893050 > [2.279416] 7d4903a6 6000 6000 6000 <7c001fec> 7c091fec > 7c081fec 7c051fec > [2.280193] ---[ end trace 490b8c67e6075e09 ]--- > > Making for_each_mem_range() include MEMBLOCK_HOTPLUG regions in the > traversal fixes this issue. > > Link: https://bugzilla.redhat.com/show_bug.cgi?id=1976100 > Fixes: b10d6bca8720 ("arch, drivers: replace for_each_membock() with > for_each_mem_range()") > Signed-off-by: Mike Rapoport > --- This fixes the issue I was observing with both radix and hash. Thanks ! Tested-by: Greg Kurz Cc'ing linuxppc-dev so that POWER folks know about the fix and stable. 
Cc: stable@vger.kernel.org # v5.10 > include/linux/memblock.h | 4 ++-- > mm/memblock.c| 3 ++- > 2 files changed, 4 insertions(+), 3 deletions(-) > > diff --git a/include/linux/memblock.h b/include/linux/memblock.h > index cbf46f56d105..4a53c3ca86bd 100644 > --- a/include/linux/memblock.h > +++ b/include/linux/memblock.h > @@ -209,7 +209,7 @@ static inline void __next_physmem_range(u64 *idx, struct > memblock_type *type, > */ > #define for_each_mem_range(i, p_start, p_end) \ > __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, \ > - MEMBLOCK_NONE, p_start, p_end, NULL) > + MEMBLOCK_HOTPLUG, p_start, p_end, NULL) > > /** > * for_each_mem_range_rev - reverse iterate through memblock areas from > @@ -220,7 +220,7 @@ static inline void __next_physmem_range(u64 *idx, struct > memblock_type *type, > */ > #define for_each_mem_range_rev(i, p_start, p_end)\ > __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \ > - MEMBLOCK_NONE, p_start, p_end, NULL) > + MEMBLOCK_HOTPLUG, p_start,
Re: [PATCH v2 1/1] powerpc/pseries: Interface to represent PAPR firmware attributes
On 08/07/21 4:05 pm, Gautham R Shenoy wrote: Hello Pratik, On Tue, Jul 06, 2021 at 01:54:00PM +0530, Pratik R. Sampat wrote: Adds a generic interface to represent the energy and frequency related PAPR attributes on the system using the new H_CALL "H_GET_ENERGY_SCALE_INFO". H_GET_EM_PARMS H_CALL was previously responsible for exporting this information in the lparcfg, however the H_GET_EM_PARMS H_CALL will be deprecated P10 onwards. The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format: hcall( uint64 H_GET_ENERGY_SCALE_INFO, // Get energy scale info uint64 flags, // Per the flag request uint64 firstAttributeId,// The attribute id uint64 bufferAddress, // Guest physical address of the output buffer uint64 bufferSize // The size in bytes of the output buffer ); This H_CALL can query either all the attributes at once with firstAttributeId = 0, flags = 0 as well as query only one attribute at a time with firstAttributeId = id, flags = 1. The output buffer consists of the following 1. number of attributes - 8 bytes 2. array offset to the data location - 8 bytes 3. version info - 1 byte 4. A data array of size num attributes, which contains the following: a. attribute ID - 8 bytes b. attribute value in number - 8 bytes c. attribute name in string - 64 bytes d. attribute value in string - 64 bytes The new H_CALL exports information in direct string value format, hence a new interface has been introduced in /sys/firmware/papr/energy_scale_info to export this information to userspace in an extensible pass-through format. The H_CALL returns the name, numeric value and string value (if exists) The format of exposing the sysfs information is as follows: /sys/firmware/papr/energy_scale_info/ |-- / |-- desc |-- value |-- value_desc (if exists) |-- / |-- desc |-- value |-- value_desc (if exists) ... I like this implementation. Only one comment w.r.t versioning below: [..snip..] 
@@ -631,6 +632,26 @@ struct hv_gpci_request_buffer { uint8_t bytes[HGPCI_MAX_DATA_BYTES]; } __packed; +#define MAX_ESI_ATTRS 10 +#define MAX_BUF_SZ (sizeof(struct h_energy_scale_info_hdr) + \ + (sizeof(struct energy_scale_attribute) * MAX_ESI_ATTRS)) + +struct energy_scale_attribute { + __be64 id; + __be64 value; + unsigned char desc[64]; + unsigned char value_desc[64]; +} __packed; + +struct h_energy_scale_info_hdr { + __be64 num_attrs; + __be64 array_offset; + __u8 data_header_version; +} __packed; + [..snip..] +static int __init papr_init(void) +{ + uint64_t num_attrs; + int ret, idx, i; + char *esi_buf; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return -ENXIO; + + esi_buf = kmalloc(MAX_BUF_SZ, GFP_KERNEL); + if (esi_buf == NULL) + return -ENOMEM; + /* +* hcall( +* uint64 H_GET_ENERGY_SCALE_INFO, // Get energy scale info +* uint64 flags,// Per the flag request +* uint64 firstAttributeId, // The attribute id +* uint64 bufferAddress,// Guest physical address of the output buffer +* uint64 bufferSize); // The size in bytes of the output buffer +*/ + ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_ALL, 0, +virt_to_phys(esi_buf), MAX_BUF_SZ); It is good that you are passing a character buffer here and interpreting it later. This helps in the cases where the header has more fields than what we are aware of changed. I think even a future platform adds newer fields to the header, those fields will be appended after the existing fields, so we should still be able to interpret the first three fields of the header that we are currently aware of. + if (ret != H_SUCCESS) { + pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO"); + goto out; + } + + esi_hdr = (struct h_energy_scale_info_hdr *) esi_buf; + num_attrs = be64_to_cpu(esi_hdr->num_attrs); Shouldn't we check for the esi_hdr->data_header_version here? Currently we are only aware of the version 1. 
If we happen to run this kernel code on a future platform which supports a different version, wouldn't it be safer to bail out here? Otherwise this patch looks good to me. Reviewed-by: Gautham R. Shenoy Thanks for the review, Gautham. Sure, I'll make use of the header version to bail out. That just seems like the safest approach. I'll add that and spin a new version here. Thanks, Pratik -- Thanks and Regards gautham.