[PATCH v5 6/8] powerpc: Rework and improve STRICT_KERNEL_RWX patching

2021-07-12 Thread Christopher M. Riedl
Rework code-patching with STRICT_KERNEL_RWX to prepare for the next
patch which uses a temporary mm for patching under the Book3s64 Radix
MMU. Improve robustness by adding a WARN_ON when the patch site does not
match the expected instruction after patching, and by properly returning
the error from __patch_instruction().

Signed-off-by: Christopher M. Riedl 

---

v5:  * New to series.
---
 arch/powerpc/lib/code-patching.c | 51 +---
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 3122d8e4cc013..9f2eba9b70ee4 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -102,11 +102,12 @@ static inline void unuse_temporary_mm(struct temp_mm *temp_mm)
 }
 
 static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
+static DEFINE_PER_CPU(unsigned long, cpu_patching_addr);
 
 #if IS_BUILTIN(CONFIG_LKDTM)
 unsigned long read_cpu_patching_addr(unsigned int cpu)
 {
-   return (unsigned long)(per_cpu(text_poke_area, cpu))->addr;
+   return per_cpu(cpu_patching_addr, cpu);
 }
 #endif
 
@@ -121,6 +122,7 @@ static int text_area_cpu_up(unsigned int cpu)
return -1;
}
this_cpu_write(text_poke_area, area);
+   this_cpu_write(cpu_patching_addr, (unsigned long)area->addr);
 
return 0;
 }
@@ -146,7 +148,7 @@ void __init poking_init(void)
 /*
  * This can be called for kernel text or a module.
  */
-static int map_patch_area(void *addr, unsigned long text_poke_addr)
+static int map_patch_area(void *addr)
 {
unsigned long pfn;
int err;
@@ -156,17 +158,20 @@ static int map_patch_area(void *addr, unsigned long text_poke_addr)
else
pfn = __pa_symbol(addr) >> PAGE_SHIFT;
 
-   err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
+   err = map_kernel_page(__this_cpu_read(cpu_patching_addr),
+ (pfn << PAGE_SHIFT), PAGE_KERNEL);
 
-   pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err);
+   pr_devel("Mapped addr %lx with pfn %lx:%d\n",
+__this_cpu_read(cpu_patching_addr), pfn, err);
if (err)
return -1;
 
return 0;
 }
 
-static inline int unmap_patch_area(unsigned long addr)
+static inline int unmap_patch_area(void)
 {
+   unsigned long addr = __this_cpu_read(cpu_patching_addr);
pte_t *ptep;
pmd_t *pmdp;
pud_t *pudp;
@@ -175,23 +180,23 @@ static inline int unmap_patch_area(unsigned long addr)
 
pgdp = pgd_offset_k(addr);
if (unlikely(!pgdp))
-   return -EINVAL;
+   goto out_err;
 
p4dp = p4d_offset(pgdp, addr);
if (unlikely(!p4dp))
-   return -EINVAL;
+   goto out_err;
 
pudp = pud_offset(p4dp, addr);
if (unlikely(!pudp))
-   return -EINVAL;
+   goto out_err;
 
pmdp = pmd_offset(pudp, addr);
if (unlikely(!pmdp))
-   return -EINVAL;
+   goto out_err;
 
ptep = pte_offset_kernel(pmdp, addr);
if (unlikely(!ptep))
-   return -EINVAL;
+   goto out_err;
 
pr_devel("clearing mm %p, pte %p, addr %lx\n", _mm, ptep, addr);
 
@@ -202,15 +207,17 @@ static inline int unmap_patch_area(unsigned long addr)
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 
return 0;
+
+out_err:
+   pr_warn("failed to unmap %lx\n", addr);
+   return -EINVAL;
 }
 
 static int do_patch_instruction(u32 *addr, struct ppc_inst instr)
 {
-   int err;
+   int err, rc = 0;
u32 *patch_addr = NULL;
unsigned long flags;
-   unsigned long text_poke_addr;
-   unsigned long kaddr = (unsigned long)addr;
 
/*
 * During early early boot patch_instruction is called
@@ -222,24 +229,20 @@ static int do_patch_instruction(u32 *addr, struct ppc_inst instr)
 
local_irq_save(flags);
 
-   text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
-   if (map_patch_area(addr, text_poke_addr)) {
-   err = -1;
+   err = map_patch_area(addr);
+   if (err)
goto out;
-   }
-
-   patch_addr = (u32 *)(text_poke_addr + (kaddr & ~PAGE_MASK));
 
-   __patch_instruction(addr, instr, patch_addr);
+   patch_addr = (u32 *)(__this_cpu_read(cpu_patching_addr) | offset_in_page(addr));
+   rc = __patch_instruction(addr, instr, patch_addr);
 
-   err = unmap_patch_area(text_poke_addr);
-   if (err)
-   pr_warn("failed to unmap %lx\n", text_poke_addr);
+   err = unmap_patch_area();
 
 out:
local_irq_restore(flags);
+   WARN_ON(!ppc_inst_equal(ppc_inst_read(addr), instr));
 
-   return err;
+   return rc ? rc : err;
 }
 #else /* !CONFIG_STRICT_KERNEL_RWX */
 
-- 
2.26.1



[PATCH v5 5/8] powerpc/64s: Introduce temporary mm for Radix MMU

2021-07-12 Thread Christopher M. Riedl
x86 supports the notion of a temporary mm which restricts access to
temporary PTEs to a single CPU. A temporary mm is useful for situations
where a CPU needs to perform sensitive operations (such as patching a
STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing
said mappings to other CPUs. Another benefit is that other CPU TLBs do
not need to be flushed when the temporary mm is torn down.

Mappings in the temporary mm can be set in the userspace portion of the
address-space.

Interrupts must be disabled while the temporary mm is in use. HW
breakpoints, which may have been set by userspace as watchpoints on
addresses now within the temporary mm, are saved and disabled when
loading the temporary mm. The HW breakpoints are restored when unloading
the temporary mm. All HW breakpoints are indiscriminately disabled while
the temporary mm is in use.
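
The intended usage pattern, as a rough sketch (the caller side shown here is
illustrative only; the code that actually drives this is added by a later
patch in the series):

	struct temp_mm temp_mm;

	init_temp_mm(&temp_mm, patching_mm);	/* patching_mm prepared elsewhere */

	local_irq_save(flags);
	use_temporary_mm(&temp_mm);	/* switch mm, save + disable HW breakpoints */
	/* ... access mappings that exist only in the temporary mm ... */
	unuse_temporary_mm(&temp_mm);	/* restore previous mm and HW breakpoints */
	local_irq_restore(flags);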

Based on x86 implementation:

commit cefa929c034e
("x86/mm: Introduce temporary mm structs")

Signed-off-by: Christopher M. Riedl 

---

v5:  * Drop support for using a temporary mm on Book3s64 Hash MMU.

v4:  * Pass the prev mm instead of NULL to switch_mm_irqs_off() when
   using/unusing the temp mm as suggested by Jann Horn to keep
   the context.active counter in-sync on mm/nohash.
 * Disable SLB preload in the temporary mm when initializing the
   temp_mm struct.
 * Include asm/debug.h header to fix build issue with
   ppc44x_defconfig.
---
 arch/powerpc/include/asm/debug.h |  1 +
 arch/powerpc/kernel/process.c|  5 +++
 arch/powerpc/lib/code-patching.c | 56 
 3 files changed, 62 insertions(+)

diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index 86a14736c76c3..dfd82635ea8b3 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -46,6 +46,7 @@ static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
 #endif
 
 void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk);
+void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk);
 bool ppc_breakpoint_available(void);
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 extern void do_send_trap(struct pt_regs *regs, unsigned long address,
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 185beb2905801..a0776200772e8 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -865,6 +865,11 @@ static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk)
return 0;
 }
 
+void __get_breakpoint(int nr, struct arch_hw_breakpoint *brk)
+{
+   memcpy(brk, this_cpu_ptr(&current_brk[nr]), sizeof(*brk));
+}
+
 void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk)
 {
	memcpy(this_cpu_ptr(&current_brk[nr]), brk, sizeof(*brk));
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 54b6157d44e95..3122d8e4cc013 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -17,6 +17,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 static int __patch_instruction(u32 *exec_addr, struct ppc_inst instr, u32 *patch_addr)
 {
@@ -45,6 +48,59 @@ int raw_patch_instruction(u32 *addr, struct ppc_inst instr)
 }
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
+
+struct temp_mm {
+   struct mm_struct *temp;
+   struct mm_struct *prev;
+   struct arch_hw_breakpoint brk[HBP_NUM_MAX];
+};
+
+static inline void init_temp_mm(struct temp_mm *temp_mm, struct mm_struct *mm)
+{
+   /* We currently only support temporary mm on the Book3s64 Radix MMU */
+   WARN_ON(!radix_enabled());
+
+   temp_mm->temp = mm;
+   temp_mm->prev = NULL;
+   memset(&temp_mm->brk, 0, sizeof(temp_mm->brk));
+}
+
+static inline void use_temporary_mm(struct temp_mm *temp_mm)
+{
+   lockdep_assert_irqs_disabled();
+
+   temp_mm->prev = current->active_mm;
+   switch_mm_irqs_off(temp_mm->prev, temp_mm->temp, current);
+
+   WARN_ON(!mm_is_thread_local(temp_mm->temp));
+
+   if (ppc_breakpoint_available()) {
+   struct arch_hw_breakpoint null_brk = {0};
+   int i = 0;
+
+   for (; i < nr_wp_slots(); ++i) {
+   __get_breakpoint(i, &temp_mm->brk[i]);
+   if (temp_mm->brk[i].type != 0)
+   __set_breakpoint(i, &null_brk);
+   }
+   }
+}
+
+static inline void unuse_temporary_mm(struct temp_mm *temp_mm)
+{
+   lockdep_assert_irqs_disabled();
+
+   switch_mm_irqs_off(temp_mm->temp, temp_mm->prev, current);
+
+   if (ppc_breakpoint_available()) {
+   int i = 0;
+
+   for (; i < nr_wp_slots(); ++i)
+   if (temp_mm->brk[i].type != 0)
+   __set_breakpoint(i, &temp_mm->brk[i]);
+   }
+}
+
 static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
 
 #if IS_BUILTIN(CONFIG_LKDTM)
-- 
2.26.1



[PATCH v5 8/8] lkdtm/powerpc: Fix code patching hijack test

2021-07-12 Thread Christopher M. Riedl
Code patching on powerpc with a STRICT_KERNEL_RWX kernel now uses a
userspace address in a temporary mm on Radix. Use __put_user() to avoid
write failures due to KUAP when attempting a "hijack" on the patching
address. __put_user() also works with the non-userspace, vmalloc-based
patching address on non-Radix MMUs.

Signed-off-by: Christopher M. Riedl 
---
 drivers/misc/lkdtm/perms.c | 9 -
 1 file changed, 9 deletions(-)

diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
index 41e87e5f9cc86..da6a34a0a49fb 100644
--- a/drivers/misc/lkdtm/perms.c
+++ b/drivers/misc/lkdtm/perms.c
@@ -262,16 +262,7 @@ static inline u32 lkdtm_read_patch_site(void)
 /* Returns True if the write succeeds */
 static inline bool lkdtm_try_write(u32 data, u32 *addr)
 {
-#ifdef CONFIG_PPC
-   __put_kernel_nofault(addr, &data, u32, err);
-   return true;
-
-err:
-   return false;
-#endif
-#ifdef CONFIG_X86_64
return !__put_user(data, addr);
-#endif
 }
 
 static int lkdtm_patching_cpu(void *data)
-- 
2.26.1



[PATCH v5 1/8] powerpc: Add LKDTM accessor for patching addr

2021-07-12 Thread Christopher M. Riedl
When live patching with STRICT_KERNEL_RWX a mapping is installed at a
"patching address" with temporary write permissions. Provide a
LKDTM-only accessor function for this address in preparation for a LKDTM
test which attempts to "hijack" this mapping by writing to it from
another CPU.

Signed-off-by: Christopher M. Riedl 
---
 arch/powerpc/include/asm/code-patching.h | 4 
 arch/powerpc/lib/code-patching.c | 7 +++
 2 files changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index a95f63788c6b1..16fbc58a4932f 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -184,4 +184,8 @@ static inline unsigned long ppc_kallsyms_lookup_name(const char *name)
 #define PPC_INST_STD_LR		PPC_RAW_STD(_R0, _R1, PPC_LR_STKOFF)
 #endif /* CONFIG_PPC64 */
 
+#if IS_BUILTIN(CONFIG_LKDTM) && IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)
+unsigned long read_cpu_patching_addr(unsigned int cpu);
+#endif
+
 #endif /* _ASM_POWERPC_CODE_PATCHING_H */
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index f9a3019e37b43..54b6157d44e95 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -47,6 +47,13 @@ int raw_patch_instruction(u32 *addr, struct ppc_inst instr)
 #ifdef CONFIG_STRICT_KERNEL_RWX
 static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
 
+#if IS_BUILTIN(CONFIG_LKDTM)
+unsigned long read_cpu_patching_addr(unsigned int cpu)
+{
+   return (unsigned long)(per_cpu(text_poke_area, cpu))->addr;
+}
+#endif
+
 static int text_area_cpu_up(unsigned int cpu)
 {
struct vm_struct *area;
-- 
2.26.1



[PATCH v5 7/8] powerpc/64s: Initialize and use a temporary mm for patching on Radix

2021-07-12 Thread Christopher M. Riedl
When code patching a STRICT_KERNEL_RWX kernel the page containing the
address to be patched is temporarily mapped as writeable. Currently, a
per-cpu vmalloc patch area is used for this purpose. While the patch
area is per-cpu, the temporary page mapping is inserted into the kernel
page tables for the duration of patching. The mapping is exposed to CPUs
other than the patching CPU - this is undesirable from a hardening
perspective. Use a temporary mm instead which keeps the mapping local to
the CPU doing the patching.

Use the `poking_init` init hook to prepare a temporary mm and patching
address. Initialize the temporary mm by copying the init mm. Choose a
randomized patching address inside the temporary mm userspace address
space. The patching address is randomized between PAGE_SIZE and
DEFAULT_MAP_WINDOW-PAGE_SIZE.

Bits of entropy with 64K page size on BOOK3S_64:

bits of entropy = log2(DEFAULT_MAP_WINDOW_USER64 / PAGE_SIZE)

PAGE_SIZE=64K, DEFAULT_MAP_WINDOW_USER64=128TB
bits of entropy = log2(128TB / 64K)
bits of entropy = 31

The upper limit is DEFAULT_MAP_WINDOW due to how the Book3s64 Hash MMU
operates - by default the space above DEFAULT_MAP_WINDOW is not
available. Currently the Hash MMU does not use a temporary mm so
technically this upper limit isn't necessary; however, a larger
randomization range does not further "harden" this overall approach and
future work may introduce patching with a temporary mm on Hash as well.

Randomization occurs only once during initialization at boot for each
possible CPU in the system.
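
As a rough sketch, the address selection described above amounts to something
like the following (helper names and exact rounding are illustrative, not
necessarily what the patch implements):

	/* Random page-aligned address in (PAGE_SIZE, DEFAULT_MAP_WINDOW - PAGE_SIZE) */
	patching_addr = (1 + (get_random_long() %
			      (DEFAULT_MAP_WINDOW / PAGE_SIZE - 2))) << PAGE_SHIFT;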

Introduce two new functions, map_patch() and unmap_patch(), to
respectively create and remove the temporary mapping with write
permissions at patching_addr. Map the page with PAGE_KERNEL to set
EAA[0] for the PTE which ignores the AMR (so no need to unlock/lock
KUAP) according to PowerISA v3.0b Figure 35 on Radix.
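
Conceptually the mapping step boils down to something like this sketch (a
simplification; the real helper also has to deal with module addresses,
locking, and failure cases):

	/* Install a writable PTE for the target page at patching_addr */
	pte = mk_pte(virt_to_page((unsigned long)addr & PAGE_MASK), PAGE_KERNEL);
	set_pte_at(patching_mm, patching_addr, ptep, pte);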

Based on x86 implementation:

commit 4fc19708b165
("x86/alternatives: Initialize temporary mm for patching")

and:

commit b3fd8e83ada0
("x86/alternatives: Use temporary mm for text poking")

Signed-off-by: Christopher M. Riedl 

---

v5:  * Only support Book3s64 Radix MMU for now.
 * Use a per-cpu data structure to hold the patching_addr and
   patching_mm to avoid the need for a synchronization lock/mutex.

v4:  * In the previous series this was two separate patches: one to init
   the temporary mm in poking_init() (unused in powerpc at the time)
   and the other to use it for patching (which removed all the
   per-cpu vmalloc code). Now that we use poking_init() in the
   existing per-cpu vmalloc approach, that separation doesn't work
   as nicely anymore so I just merged the two patches into one.
 * Preload the SLB entry and hash the page for the patching_addr
   when using Hash on book3s64 to avoid taking an SLB and Hash fault
   during patching. The previous implementation was a hack which
   changed current->mm to allow the SLB and Hash fault handlers to
   work with the temporary mm since both of those code-paths always
   assume mm == current->mm.
 * Also (hmm - seeing a trend here) with the book3s64 Hash MMU we
   have to manage the mm->context.active_cpus counter and mm cpumask
   since they determine (via mm_is_thread_local()) if the TLB flush
   in pte_clear() is local or not - it should always be local when
   we're using the temporary mm. On book3s64's Radix MMU we can
   just call local_flush_tlb_mm().
 * Use HPTE_USE_KERNEL_KEY on Hash to avoid costly lock/unlock of
   KUAP.
---
 arch/powerpc/lib/code-patching.c | 132 +--
 1 file changed, 125 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 9f2eba9b70ee4..027dabd42b8dd 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -103,6 +104,7 @@ static inline void unuse_temporary_mm(struct temp_mm *temp_mm)
 
 static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
 static DEFINE_PER_CPU(unsigned long, cpu_patching_addr);
+static DEFINE_PER_CPU(struct mm_struct *, cpu_patching_mm);
 
 #if IS_BUILTIN(CONFIG_LKDTM)
 unsigned long read_cpu_patching_addr(unsigned int cpu)
@@ -133,6 +135,51 @@ static int text_area_cpu_down(unsigned int cpu)
return 0;
 }
 
+static __always_inline void __poking_init_temp_mm(void)
+{
+   int cpu;
+   spinlock_t *ptl; /* for protecting pte table */
+   pte_t *ptep;
+   struct mm_struct *patching_mm;
+   unsigned long patching_addr;
+
+   for_each_possible_cpu(cpu) {
+   /*
+* Some parts of the kernel (static keys for example) depend on
+* successful code patching. Code patching under
+* STRICT_KERNEL_RWX requires this setup - otherwise we cannot
+* 

[PATCH v5 4/8] lkdtm/x86_64: Add test to hijack a patch mapping

2021-07-12 Thread Christopher M. Riedl
A previous commit implemented an LKDTM test on powerpc to exploit the
temporary mapping established when patching code with STRICT_KERNEL_RWX
enabled. Extend the test to work on x86_64 as well.

Signed-off-by: Christopher M. Riedl 
---
 drivers/misc/lkdtm/perms.c | 26 ++
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
index 39e7456852229..41e87e5f9cc86 100644
--- a/drivers/misc/lkdtm/perms.c
+++ b/drivers/misc/lkdtm/perms.c
@@ -224,7 +224,7 @@ void lkdtm_ACCESS_NULL(void)
 }
 
 #if (IS_BUILTIN(CONFIG_LKDTM) && defined(CONFIG_STRICT_KERNEL_RWX) && \
-   defined(CONFIG_PPC))
+   (defined(CONFIG_PPC) || defined(CONFIG_X86_64)))
 /*
  * This is just a dummy location to patch-over.
  */
@@ -233,12 +233,25 @@ static void patching_target(void)
return;
 }
 
-#include 
 const u32 *patch_site = (const u32 *)&patching_target;
 
+#ifdef CONFIG_PPC
+#include 
+#endif
+
+#ifdef CONFIG_X86_64
+#include 
+#endif
+
 static inline int lkdtm_do_patch(u32 data)
 {
+#ifdef CONFIG_PPC
return patch_instruction((u32 *)patch_site, ppc_inst(data));
+#endif
+#ifdef CONFIG_X86_64
+   text_poke((void *)patch_site, &data, sizeof(u32));
+   return 0;
+#endif
 }
 
 static inline u32 lkdtm_read_patch_site(void)
@@ -249,11 +262,16 @@ static inline u32 lkdtm_read_patch_site(void)
 /* Returns True if the write succeeds */
 static inline bool lkdtm_try_write(u32 data, u32 *addr)
 {
+#ifdef CONFIG_PPC
	__put_kernel_nofault(addr, &data, u32, err);
return true;
 
 err:
return false;
+#endif
+#ifdef CONFIG_X86_64
+   return !__put_user(data, addr);
+#endif
 }
 
 static int lkdtm_patching_cpu(void *data)
@@ -346,8 +364,8 @@ void lkdtm_HIJACK_PATCH(void)
 
 void lkdtm_HIJACK_PATCH(void)
 {
-   if (!IS_ENABLED(CONFIG_PPC))
-   pr_err("XFAIL: this test only runs on powerpc\n");
+   if (!IS_ENABLED(CONFIG_PPC) && !IS_ENABLED(CONFIG_X86_64))
+   pr_err("XFAIL: this test only runs on powerpc and x86_64\n");
if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
pr_err("XFAIL: this test requires CONFIG_STRICT_KERNEL_RWX\n");
if (!IS_BUILTIN(CONFIG_LKDTM))
-- 
2.26.1



[PATCH v5 0/8] Use per-CPU temporary mappings for patching on Radix MMU

2021-07-12 Thread Christopher M. Riedl
When compiled with CONFIG_STRICT_KERNEL_RWX, the kernel must create
temporary mappings when patching itself. These mappings temporarily
override the strict RWX text protections to permit a write. Currently,
powerpc allocates a per-CPU VM area for patching. Patching occurs as
follows:

1. Map page in per-CPU VM area w/ PAGE_KERNEL protection
2. Patch text
3. Remove the temporary mapping

While the VM area is per-CPU, the mapping is actually inserted into the
kernel page tables. Presumably, this could allow another CPU to access
the normally write-protected text - either maliciously or accidentally -
via this same mapping if the address of the VM area is known. Ideally,
the mapping should be kept local to the CPU doing the patching [0].
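
In code, the existing flow is roughly the following (a simplified sketch of
do_patch_instruction() in arch/powerpc/lib/code-patching.c; error handling
omitted):

	local_irq_save(flags);
	map_patch_area(addr, text_poke_addr);		/* 1. temporary PTE in the kernel page tables */
	patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
	__patch_instruction(addr, instr, patch_addr);	/* 2. write via the temporary mapping */
	unmap_patch_area(text_poke_addr);		/* 3. clear the PTE, flush the TLB */
	local_irq_restore(flags);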

x86 introduced "temporary mm" structs which allow the creation of mappings
local to a particular CPU [1]. This series intends to bring the notion of a
temporary mm to powerpc's Book3s64 Radix MMU and harden it by using such a
mapping for patching a kernel with strict RWX permissions.

The first four patches implement an LKDTM test "proof-of-concept" which
exploits the potential vulnerability (ie. the temporary mapping during patching
is exposed in the kernel page tables and accessible by other CPUs) using a
simple brute-force approach. This test is implemented for both powerpc and
x86_64. The test passes on powerpc Radix with this new series, fails on
upstream powerpc, passes on upstream x86_64, and fails on an older (ancient)
x86_64 tree without the x86_64 temporary mm patches. The remaining patches add
support for and use a temporary mm for code patching on powerpc with the Radix
MMU.

Tested boot, ftrace, and repeated LKDTM "hijack":
- QEMU+KVM (host: POWER9 Blackbird): Radix MMU w/ KUAP
- QEMU+KVM (host: POWER9 Blackbird): Hash MMU

Tested repeated LKDTM "hijack":
- QEMU+KVM (host: AMD desktop): x86_64 upstream
- QEMU+KVM (host: AMD desktop): x86_64 w/o percpu temp mm to
  verify the LKDTM "hijack" test fails

Tested boot and ftrace:
- QEMU+TCG: ppc44x (bamboo)
- QEMU+TCG: g5 (mac99)

I also tested with various extra config options enabled as suggested in
section 12) in Documentation/process/submit-checklist.rst.

v5: * Only support Book3s64 Radix MMU for now. There are some issues with
  the previous implementation on the Hash MMU as pointed out by Nick
  Piggin. Fixing these is not trivial so we only support the Radix MMU
  for now. I tried using realmode (no data translation) to patch with
  Hash to at least avoid exposing the patch mapping to other CPUs but
  this doesn't reliably work either since we cannot access vmalloc'd
  space in realmode.
* Use percpu variables for the patching_mm and patching_addr. This
  avoids the need for synchronization mechanisms entirely. Thanks to
  Peter Zijlstra for pointing out text_mutex which unfortunately didn't
  work out without larger code changes in powerpc. Also thanks to Daniel
  Axtens for comments about using percpu variables for the *percpu* temp
  mm things off list.

v4: * It's time to revisit this series again since @jpn and @mpe fixed
  our known STRICT_*_RWX bugs on powerpc/64s.
* Rebase on linuxppc/next:
  commit ee1bc694fbaec ("powerpc/kvm: Fix build error when 
PPC_MEM_KEYS/PPC_PSERIES=n")
* Completely rework how map_patch() works on book3s64 Hash MMU
* Split the LKDTM x86_64 and powerpc bits into separate patches
* Annotate commit messages with changes from v3 instead of
  listing them here completely out-of context...

v3: * Rebase on linuxppc/next: commit 9123e3a74ec7 ("Linux 5.9-rc1")
* Move temporary mm implementation into code-patching.c where it
  belongs
* Implement LKDTM hijacker test on x86_64 (on IBM time oof)
* Do not use address zero for the patching address in the
  temporary mm (thanks @dja for pointing this out!)
* Wrap the LKDTM test w/ CONFIG_SMP as suggested by Christophe
  Leroy
* Comments to clarify PTE pre-allocation and patching addr
  selection

v2: * Rebase on linuxppc/next:
  commit 105fb38124a4 ("powerpc/8xx: Modify ptep_get()")
* Always dirty pte when mapping patch
* Use `ppc_inst_len` instead of `sizeof` on instructions
* Declare LKDTM patching addr accessor in header where it belongs   

v1: * Rebase on linuxppc/next (4336b9337824)
* Save and restore second hw watchpoint
* Use new ppc_inst_* functions for patching check and in LKDTM test

rfc-v2: * Many fixes and improvements mostly based on extensive feedback
  and testing by Christophe Leroy (thanks!).
* Make patching_mm and patching_addr static and move
  '__ro_after_init' to after the variable name (more common in
  other parts 

[PATCH v5 2/8] lkdtm/powerpc: Add test to hijack a patch mapping

2021-07-12 Thread Christopher M. Riedl
When live patching with STRICT_KERNEL_RWX the CPU doing the patching
must temporarily remap the page(s) containing the patch site with +W
permissions. While this temporary mapping is in use, another CPU could
write to the same mapping and maliciously alter kernel text. Implement a
LKDTM test to attempt to exploit such an opening during code patching.
The test is implemented on powerpc and requires LKDTM built into the
kernel (building LKDTM as a module is insufficient).

The LKDTM "hijack" test works as follows:

  1. A CPU executes an infinite loop to patch an instruction. This is
 the "patching" CPU.
  2. Another CPU attempts to write to the address of the temporary
 mapping used by the "patching" CPU. This other CPU is the
 "hijacker" CPU. The hijack either fails with a fault/error or
 succeeds, in which case some kernel text is now overwritten.

The virtual address of the temporary patch mapping is provided via an
LKDTM-specific accessor to the hijacker CPU. This test assumes a
hypothetical situation where this address was leaked previously.
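
The hijacker side boils down to a loop like this sketch (simplified from the
patch below; the value written is illustrative):

	/* hijacker CPU: repeatedly try to write through the patching CPU's mapping */
	patch_addr = (u32 *)read_cpu_patching_addr(patching_cpu);
	do {
		hijacked = lkdtm_try_write(0xbadc0ded, patch_addr);
	} while (!hijacked && !kthread_should_stop());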

How to run the test:

mount -t debugfs none /sys/kernel/debug
(echo HIJACK_PATCH > /sys/kernel/debug/provoke-crash/DIRECT)

A passing test indicates that it is not possible to overwrite kernel
text from another CPU by using the temporary mapping established by
a CPU for patching.

Signed-off-by: Christopher M. Riedl 

---

v5:  * Use `u32*` instead of `struct ppc_inst*` based on new series in
   upstream.

v4:  * Separate the powerpc and x86_64 bits into individual patches.
 * Use __put_kernel_nofault() when attempting to hijack the mapping
 * Use raw_smp_processor_id() to avoid triggering the BUG() when
   calling smp_processor_id() in preemptible code - the only thing
   that matters is that one of the threads is bound to a different
   CPU - we are not using smp_processor_id() to access any per-cpu
   data or similar where preemption should be disabled.
 * Rework the patching_cpu() kthread stop condition to avoid:
   https://lwn.net/Articles/628628/
---
 drivers/misc/lkdtm/core.c  |   1 +
 drivers/misc/lkdtm/lkdtm.h |   1 +
 drivers/misc/lkdtm/perms.c | 134 +
 3 files changed, 136 insertions(+)

diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
index 8024b6a5cc7fc..fbcb95eda337b 100644
--- a/drivers/misc/lkdtm/core.c
+++ b/drivers/misc/lkdtm/core.c
@@ -147,6 +147,7 @@ static const struct crashtype crashtypes[] = {
CRASHTYPE(WRITE_RO),
CRASHTYPE(WRITE_RO_AFTER_INIT),
CRASHTYPE(WRITE_KERN),
+   CRASHTYPE(HIJACK_PATCH),
CRASHTYPE(REFCOUNT_INC_OVERFLOW),
CRASHTYPE(REFCOUNT_ADD_OVERFLOW),
CRASHTYPE(REFCOUNT_INC_NOT_ZERO_OVERFLOW),
diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
index 99f90d3e5e9cb..87e7e6136d962 100644
--- a/drivers/misc/lkdtm/lkdtm.h
+++ b/drivers/misc/lkdtm/lkdtm.h
@@ -62,6 +62,7 @@ void lkdtm_EXEC_USERSPACE(void);
 void lkdtm_EXEC_NULL(void);
 void lkdtm_ACCESS_USERSPACE(void);
 void lkdtm_ACCESS_NULL(void);
+void lkdtm_HIJACK_PATCH(void);
 
 /* refcount.c */
 void lkdtm_REFCOUNT_INC_OVERFLOW(void);
diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
index 2dede2ef658f3..39e7456852229 100644
--- a/drivers/misc/lkdtm/perms.c
+++ b/drivers/misc/lkdtm/perms.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* Whether or not to fill the target memory area with do_nothing(). */
@@ -222,6 +223,139 @@ void lkdtm_ACCESS_NULL(void)
pr_err("FAIL: survived bad write\n");
 }
 
+#if (IS_BUILTIN(CONFIG_LKDTM) && defined(CONFIG_STRICT_KERNEL_RWX) && \
+   defined(CONFIG_PPC))
+/*
+ * This is just a dummy location to patch-over.
+ */
+static void patching_target(void)
+{
+   return;
+}
+
+#include 
+const u32 *patch_site = (const u32 *)&patching_target;
+
+static inline int lkdtm_do_patch(u32 data)
+{
+   return patch_instruction((u32 *)patch_site, ppc_inst(data));
+}
+
+static inline u32 lkdtm_read_patch_site(void)
+{
+   return READ_ONCE(*patch_site);
+}
+
+/* Returns True if the write succeeds */
+static inline bool lkdtm_try_write(u32 data, u32 *addr)
+{
+   __put_kernel_nofault(addr, &data, u32, err);
+   return true;
+
+err:
+   return false;
+}
+
+static int lkdtm_patching_cpu(void *data)
+{
+   int err = 0;
+   u32 val = 0xdeadbeef;
+
+   pr_info("starting patching_cpu=%d\n", raw_smp_processor_id());
+
+   do {
+   err = lkdtm_do_patch(val);
+   } while (lkdtm_read_patch_site() == val && !err && !kthread_should_stop());
+
+   if (err)
+   pr_warn("XFAIL: patch_instruction returned error: %d\n", err);
+
+   while (!kthread_should_stop()) {
+   set_current_state(TASK_INTERRUPTIBLE);
+   schedule();
+   }
+
+   return err;
+}
+
+void lkdtm_HIJACK_PATCH(void)
+{
+   struct task_struct *patching_kthrd;
+   int 

[PATCH v5 3/8] x86_64: Add LKDTM accessor for patching addr

2021-07-12 Thread Christopher M. Riedl
When live patching with STRICT_KERNEL_RWX a mapping is installed at a
"patching address" with temporary write permissions. Provide a
LKDTM-only accessor function for this address in preparation for a LKDTM
test which attempts to "hijack" this mapping by writing to it from
another CPU.

Signed-off-by: Christopher M. Riedl 
---
 arch/x86/include/asm/text-patching.h | 4 
 arch/x86/kernel/alternative.c| 7 +++
 2 files changed, 11 insertions(+)

diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index b7421780e4e92..f0caf9ee13bd8 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -167,4 +167,8 @@ void int3_emulate_ret(struct pt_regs *regs)
 }
 #endif /* !CONFIG_UML_X86 */
 
+#if IS_BUILTIN(CONFIG_LKDTM)
+unsigned long read_cpu_patching_addr(unsigned int cpu);
+#endif
+
 #endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e9da3dc712541..28bb92b695639 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -773,6 +773,13 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
 __ro_after_init struct mm_struct *poking_mm;
 __ro_after_init unsigned long poking_addr;
 
+#if IS_BUILTIN(CONFIG_LKDTM)
+unsigned long read_cpu_patching_addr(unsigned int cpu)
+{
+   return poking_addr;
+}
+#endif
+
 static void *__text_poke(void *addr, const void *opcode, size_t len)
 {
bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
-- 
2.26.1



[RFC 2/2] kernel/idle: Update and use idle-hint in VPA region

2021-07-12 Thread Parth Shah
In the guest, set idle_hint to 1 when the prev_cpu of a vCPU goes into an
idle state; similarly, set idle_hint to 0 when exiting an idle state.

Since the idle_hint is in the VPA region, available_idle_cpu() in the guest
can read this region every time to find out whether a vCPU can be scheduled
instantly by the hypervisor or not.

Signed-off-by: Parth Shah 
---
 arch/powerpc/include/asm/paravirt.h | 12 ++--
 kernel/sched/idle.c |  3 +++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
index bcb7b5f917be..b20e4d25bd00 100644
--- a/arch/powerpc/include/asm/paravirt.h
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -21,6 +21,12 @@ static inline bool is_shared_processor(void)
	return static_branch_unlikely(&shared_processor);
 }
 
+static inline int idle_hint_of(int cpu)
+{
+   __be32 idle_hint = READ_ONCE(lppaca_of(cpu).idle_hint);
+   return be32_to_cpu(idle_hint);
+}
+
 /* If bit 0 is set, the cpu has been preempted */
 static inline u32 yield_count_of(int cpu)
 {
@@ -109,8 +115,10 @@ static inline bool vcpu_is_preempted(int cpu)
}
 #endif
 
-   if (yield_count_of(cpu) & 1)
-   return true;
+   if (yield_count_of(cpu) & 1) {
+   if (idle_hint_of(cpu) == 0)
+   return true;
+   }
return false;
 }
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7ca3d3d86c2a..fdc8d1d474f0 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
  *tasks which are handled in sched/fair.c )
  */
 #include "sched.h"
+#include 
 
 #include 
 
@@ -290,6 +291,7 @@ static void do_idle(void)
arch_cpu_idle_dead();
}
 
+   set_idle_hint(cpu, 1);
arch_cpu_idle_enter();
rcu_nocb_flush_deferred_wakeup();
 
@@ -306,6 +308,7 @@ static void do_idle(void)
cpuidle_idle_call();
}
arch_cpu_idle_exit();
+   set_idle_hint(cpu, 0);
}
 
/*
-- 
2.26.3



[RFC 1/2] powerpc/book3s_hv: Add new idle-hint attribute in VPA region

2021-07-12 Thread Parth Shah
In the lppaca region, add a new attribute idle_hint which allows the guest
scheduler to make better cpu selections. The hypervisor can update the
idle_hint attribute based on its prediction of whether a vCPU, should it
need to be scheduled, can be scheduled instantly or not.

Signed-off-by: Parth Shah 
---
 arch/powerpc/include/asm/idle_hint.h | 28 +++
 arch/powerpc/include/asm/lppaca.h|  3 ++-
 arch/powerpc/kvm/book3s.h|  2 ++
 arch/powerpc/kvm/book3s_hv.c | 34 
 4 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/idle_hint.h

diff --git a/arch/powerpc/include/asm/idle_hint.h b/arch/powerpc/include/asm/idle_hint.h
new file mode 100644
index ..165d65c0275b
--- /dev/null
+++ b/arch/powerpc/include/asm/idle_hint.h
@@ -0,0 +1,28 @@
+#ifndef _ASM_POWERPC_IDLEHINT_H
+#define _ASM_POWERPC_IDLEHINT_H
+
+#include 
+
+extern void kvmppc_idle_hint_set(struct kvm_vcpu *vcpu, int idle_hint);
+
+extern int idle_hint_is_active;
+
+extern void set_idle_hint(int cpu, int value);
+
+static inline int prev_cpu_of_kvm(struct kvm_vcpu *vcpu)
+{
+   struct pid *pid;
+   struct task_struct *task = NULL;
+
+   rcu_read_lock();
+   pid = rcu_dereference(vcpu->pid);
+   if (pid)
+   task = get_pid_task(pid, PIDTYPE_PID);
+   rcu_read_unlock();
+
+   if (!task)
+   return -1;
+
+   return task_cpu(task);
+}
+#endif
diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h
index c390ec377bae..ee790a566036 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -111,7 +111,8 @@ struct lppaca {
__be32  page_ins;   /* CMO Hint - # page ins by OS */
u8  reserved11[148];
volatile __be64 dtl_idx;/* Dispatch Trace Log head index */
-   u8  reserved12[96];
+   volatile __be32 idle_hint;  /* Can vCPU be scheduled instantly? */
+   u8  reserved12[92];
 } cacheline_aligned;
 
 #define lppaca_of(cpu) (*paca_ptrs[cpu]->lppaca_ptr)
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 740e51def5a5..61b0741c139a 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -7,6 +7,8 @@
 #ifndef __POWERPC_KVM_BOOK3S_H__
 #define __POWERPC_KVM_BOOK3S_H__
 
+#include 
+
 extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 struct kvm_memory_slot *memslot);
 extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index bc0813644666..c008be20294d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -447,6 +447,7 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
 {
vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
vpa->yield_count = cpu_to_be32(1);
+   vpa->idle_hint = cpu_to_be32(0);
 }
 
 static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
@@ -911,6 +912,17 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
return kvm_vcpu_yield_to(target);
 }
 
+void kvmppc_idle_hint_set(struct kvm_vcpu *vcpu, int idle_hint)
+{
+   struct lppaca *lppaca;
+
+   if (!vcpu) return;
+
+   lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
+   if (lppaca)
+   lppaca->idle_hint = cpu_to_be32(idle_hint);
+}
+
 static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
 {
int yield_count = 0;
@@ -2803,6 +2815,28 @@ static int on_primary_thread(void)
return 1;
 }
 
+void set_idle_hint_for_kvm(struct kvm *kvm, int cpu, int value)
+{
+   int i;
+   struct kvm_vcpu *vcpu;
+
+   kvm_for_each_vcpu(i, vcpu, kvm) {
+   if (cpu == prev_cpu_of_kvm(vcpu)) {
+   kvmppc_idle_hint_set(vcpu, value);
+   }
+   }
+}
+
+void set_idle_hint(int cpu, int value)
+{
+   struct kvm *kvm;
+   struct kvm *tmp;
+
+   list_for_each_entry_safe(kvm, tmp, &vm_list, vm_list) {
+   set_idle_hint_for_kvm(kvm, cpu, value);
+   }
+}
+
 /*
  * A list of virtual cores for each physical CPU.
  * These are vcores that could run but their runner VCPU tasks are
-- 
2.26.3



[RFC 0/2] Paravirtualize idle CPU wakeup optimization

2021-07-12 Thread Parth Shah
This patch-set is a revision of the HCALL based implementation which can
be found at:
https://lore.kernel.org/linuxppc-dev/20210401115922.1524705-1-pa...@linux.ibm.com/
Since the overhead of an HCALL is large, this patch-set instead uses the
lppaca region to convey the idle-hint: the hypervisor keeps updating the
newly added idle_hint attribute in the VPA region of every vCPU of all
KVMs, and the guest just has to look at this attribute.

This implementation is not aimed at being a full-fledged solution, but is
rather a demonstration of para-virtualizing the task scheduler. The
hypervisor can provide better idle-hints about vCPU scheduling and the
guest can use them for better scheduling decisions.

Abstract:
=
The Linux task scheduler searches for an idle cpu for task wakeup
in-order to lower the wakeup latency as much as possible. The process of
determining if a cpu is idle or not has evolved over time.
Currently, in available_idle_cpu(), a cpu is considered idle if
- there are no task running or enqueued to the runqueue of the cpu and
- the cpu is not-preempted, i.e. while running inside a guest, a cpu is
  not yielded (determined via vcpu_is_preempted())

While inside the guest, there is no way to deterministically predict
if a vCPU that has been yielded/ceded to the hypervisor can be gotten
back. Hence currently the scheduler considers such CEDEd vCPU as not
"available" idle and would instead pick other busy CPUs for waking up
the wakee task.

In this patch-set we try to further classify idle cpus as instantly
available or not. This is achieved by taking a hint from the hypervisor,
querying whether the vCPU will be scheduled instantly or not. In most
cases the scheduler prefers the prev_cpu of a waking task unless it is
busy. In this patchset, the hypervisor uses this information to figure
out whether the prev_cpu used by the task (of the corresponding vCPU) is
idle or not, and passes this information to the guest.
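
Conceptually, the guest-side check then becomes (a sketch; the actual change
is in the patches of this series):

	/* vcpu_is_preempted(), guest side */
	if (yield_count_of(cpu) & 1) {		/* vCPU has ceded/yielded */
		if (idle_hint_of(cpu) == 0)	/* hypervisor: not instantly runnable */
			return true;		/* treat as preempted */
	}
	return false;				/* otherwise treat as available idle */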

Interface:
===
This patchset introduces a new attribute in lppaca structure which is
shared by both the hypervisor and the guest. The new attribute, i.e.
idle_hint is updated regularly by the hypervisor. When a particular cpu
goes into an idle state, it updates the idle_hint of all the vCPUs of all
existing KVMs whose prev_cpu == smp_processor_id(). It similarly reverts
the update when coming out of the idle state.


Internal working:

The code-flow of the current implementation is as follow:
- In do_idle(), when entering an idle-state, walk through all vCPUs of
  all KVM guests and find whose prev_cpu of vCPU task was same as the
  caller's cpu, and mark the idle_hint=1 in the lppaca region of such
  vCPUs.
- Similarly, mark idle_hint=0 for such vCPUs when exiting idle state.
- The guest OS scheduler searches for an idle cpu using
  `available_idle_cpu()`, which also calls vcpu_is_preempted() to see
  whether the vCPU is yielded or not.
- If the vCPU is yielded, then the guest OS will additionally check
  whether idle_hint is marked as 1 or not. If idle_hint==1 then consider
  the vCPU as non-preempted and use it for scheduling a task.


The patch-set is based on v5.13 kernel.

Results:

- Baseline kernel = v5.13
- Patched kernel = v5.13 + this patch-set

Setup:
All the results are taken on IBM POWER9 baremetal system running patched
kernel. This system consists of 2 NUMA nodes, 22 cores per socket with
SMT-4 mode.

Each KVM guest have identical cpu topology with total 40 CPUs, which are
10 cores with SMT-4 support.

Scenarios:
--
1. Under-commit case: Only one KVM is active at a time.

- Baseline kernel:
$> schbench -m 20 -t 2 -r 30
Latency percentiles (usec)
50.0000th: 86
75.0000th: 406
90.0000th: 497
95.0000th: 541
*99.0000th: 2572 <-
99.5000th: 3724
99.9000th: 6904
min=0, max=10007

- With Patched kernel:
$> schbench -m 20 -t 2 -r 30
Latency percentiles (usec)
50.0000th: 386
75.0000th: 470
90.0000th: 529
95.0000th: 589
*99.0000th: 741 (-71%) <-
99.5000th: 841
99.9000th: 1522
min=0, max=6488

We see a significant reduction in the tail latencies due to being able
to schedule on an yielded/ceded idle CPU with the patchset instead of
waking up the task on a busy CPU.


2. Over-commit case: Multiple KVM guests sharing same set of CPUs.

Two KVMs running baseline kernel is used for creating noise using `schbench
-m 10 -t 2 -r 3000` while only the other KVM is benchmarked.

- Baseline kernel:
$> schbench -m 20 -t 2 -r 30
Latency percentiles (usec)
50.0000th: 289
75.0000th: 1074
90.0000th: 7288
95.0000th: 11248
*99.0000th: 17760
99.5000th: 21088
99.9000th: 28896
min=0, max=36640

- With Patched kernel:
$> schbench -m 20 -t 2 -r 30
Latency percentiles (usec)
50.0000th: 281
75.0000th: 445
90.0000th: 4344
95.0000th: 9168
*99.0000th: 15824
99.5000th: 19296
99.9000th: 26656

Re: [PATCH v4 07/11] powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new helper

2021-07-12 Thread Leonardo Brás
Hello Alexey, 

On Fri, 2021-06-18 at 19:26 -0300, Leonardo Brás wrote:
> > 
> > > +    unsigned long liobn, unsigned long win_addr,
> > > +    unsigned long window_size, unsigned long page_shift,
> > > +    unsigned long base, struct iommu_table_ops *table_ops)
> > 
> > 
> > iommu_table_setparms() rather than passing 0 around.
> > 
> > The same comment about "liobn" - set it in
> > iommu_table_setparms_lpar().
> > The reviewer will see what field matters in what situation imho.
> > 
> 
> The idea here was to keep all tbl parameters setting in
> _iommu_table_setparms (or iommu_table_setparms_common).
> 
> I understand the idea that each one of those is optional in the other
> case, but should we keep whatever value is present in the other
> variable (not zeroing the other variable), or do someting like:
> 
> tbl->it_index = 0;
> tbl->it_base = basep;
> (in iommu_table_setparms)
> 
> tbl->it_index = liobn;
> tbl->it_base = 0;
> (in iommu_table_setparms_lpar)
> 

This one is supposed to be a question, but I missed the question mark.
Sorry about that.

I would like to get your opinion on this :)





Re: [PATCH v4 10/11] powerpc/pseries/iommu: Make use of DDW for indirect mapping

2021-07-12 Thread Leonardo Brás
On Tue, 2021-05-11 at 17:57 +1000, Alexey Kardashevskiy wrote:
> 
> 
> On 01/05/2021 02:31, Leonardo Bras wrote:
> > [...]
> >   pmem_present = dn != NULL;
> > @@ -1218,8 +1224,12 @@ static bool enable_ddw(struct pci_dev *dev,
> > struct device_node *pdn)
> >   
> > mutex_lock(&direct_window_init_mutex);
> >   
> > -   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len))
> > -   goto out_unlock;
> > +   if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
> > +   direct_mapping = (len >= max_ram_len);
> > +
> > +   mutex_unlock(_window_init_mutex);
> > +   return direct_mapping;
> 
> Does not this break the existing case when direct_mapping==true by 
> skipping setting dev->dev.bus_dma_limit before returning?
> 

Yes, it does. Good catch!
I changed it to use a flag instead of win64 for return, and now I can
use the same success exit path for both the new config and the config
found in list. (out_unlock)

> 
> 
> > +   }
> >   
> > /*
> >  * If we already went through this for a previous function of
> > @@ -1298,7 +1308,6 @@ static bool enable_ddw(struct pci_dev *dev,
> > struct device_node *pdn)
> > goto out_failed;
> > }
> > /* verify the window * number of ptes will map the partition
> > */
> > -   /* check largest block * page size > max memory hotplug addr
> > */
> > /*
> >  * The "ibm,pmemory" can appear anywhere in the address
> > space.
> >  * Assuming it is still backed by page structs, try
> > MAX_PHYSMEM_BITS
> > @@ -1320,6 +1329,17 @@ static bool enable_ddw(struct pci_dev *dev,
> > struct device_node *pdn)
> > 1ULL << len,
> > query.largest_available_block,
> > 1ULL << page_shift);
> > +
> > +   len = order_base_2(query.largest_available_block <<
> > page_shift);
> > +   win_name = DMA64_PROPNAME;
> 
> [1] 
> 
> 
> > +   } else {
> > +   direct_mapping = true;
> > +   win_name = DIRECT64_PROPNAME;
> > +   }
> > +
> > +   /* DDW + IOMMU on single window may fail if there is any
> > allocation */
> > +   if (default_win_removed && !direct_mapping &&
> > iommu_table_in_use(tbl)) {
> > +   dev_dbg(&dev->dev, "current IOMMU table in use, can't be replaced.\n");
> 
> 
> ... remove !direct_mapping and move to [1]?


sure, done!

> 
> 
> > goto out_failed;
> > }
> >   
> > @@ -1331,8 +1351,7 @@ static bool enable_ddw(struct pci_dev *dev,
> > struct device_node *pdn)
> >   create.liobn, dn);
> >   
> > win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
> > -   win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn,
> > win_addr,
> > -   page_shift, len);
> > +   win64 = ddw_property_create(win_name, create.liobn, win_addr,
> > page_shift, len);
> > if (!win64) {
> > dev_info(&dev->dev,
> >  "couldn't allocate property, property name,
> > or value\n");
> > @@ -1350,12 +1369,47 @@ static bool enable_ddw(struct pci_dev *dev,
> > struct device_node *pdn)
> > if (!window)
> > goto out_del_prop;
> >   
> > -   ret = walk_system_ram_range(0, memblock_end_of_DRAM() >>
> > PAGE_SHIFT,
> > -   win64->value,
> > tce_setrange_multi_pSeriesLP_walk);
> > -   if (ret) {
> > -   dev_info(&dev->dev, "failed to map direct window for
> > %pOF: %d\n",
> > -    dn, ret);
> > -   goto out_del_list;
> > +   if (direct_mapping) {
> > +   /* DDW maps the whole partition, so enable direct DMA
> > mapping */
> > +   ret = walk_system_ram_range(0, memblock_end_of_DRAM()
> > >> PAGE_SHIFT,
> > +   win64->value,
> > tce_setrange_multi_pSeriesLP_walk);
> > +   if (ret) {
> > +   dev_info(&dev->dev, "failed to map direct
> > window for %pOF: %d\n",
> > +    dn, ret);
> > +   goto out_del_list;
> > +   }
> > +   } else {
> > +   struct iommu_table *newtbl;
> > +   int i;
> > +
> > +   /* New table for using DDW instead of the default DMA
> > window */
> > +   newtbl = iommu_pseries_alloc_table(pci->phb->node);
> > +   if (!newtbl) {
> > +   dev_dbg(&dev->dev, "couldn't create new IOMMU
> > table\n");
> > +   goto out_del_list;
> > +   }
> > +
> > +   for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources);
> > i++) {
> > +   const unsigned long mask = IORESOURCE_MEM_64
> > | IORESOURCE_MEM;
> > +
> > +   /* Look for MMIO32 */
> > +   if ((pci->phb->mem_resources[i].flags & 

Re: [PATCH v3] lockdown,selinux: fix wrong subject in some SELinux lockdown checks

2021-07-12 Thread Paul Moore
On Sat, Jun 19, 2021 at 1:00 PM Thomas Gleixner  wrote:
> On Wed, Jun 16 2021 at 10:51, Ondrej Mosnacek wrote:
> > diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
> > index bda73cb7a044..c43a13241ae8 100644
> > --- a/arch/x86/mm/testmmiotrace.c
> > +++ b/arch/x86/mm/testmmiotrace.c
> > @@ -116,7 +116,7 @@ static void do_test_bulk_ioremapping(void)
> >  static int __init init(void)
> >  {
> >   unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
> > - int ret = security_locked_down(LOCKDOWN_MMIOTRACE);
> > + int ret = security_locked_down(current_cred(), LOCKDOWN_MMIOTRACE);
>
> I have no real objection to those patches, but it strikes me odd that
> out of the 62 changed places 58 have 'current_cred()' and 4 have NULL as
> argument.
>
> I can't see why this would ever end up with anything else than
> current_cred() or NULL and NULL being the 'special' case. So why not
> having security_locked_down_no_cred() and make current_cred() implicit
> for security_locked_down() which avoids most of the churn and just makes
> the special cases special. I might be missing something though.

Unfortunately it is not uncommon for kernel subsystems to add, move,
or otherwise play around with LSM hooks without checking with the LSM
folks; generally this is okay, but there have been a few problems in
the past and I try to keep that in mind when we are introducing new
hooks or modifying existing ones.  If we have two LSM hooks for
roughly the same control point it has the potential to cause
confusion, e.g. do I use the "normal" or the "no_cred" version?  What
if I don't want to pass a credential, can I just use "no_cred"?  My
thinking with the single, always-pass-a-cred function is that callers
don't have to worry about choosing from multiple, similar hooks and
they know they need to pass a cred which hopefully gets them thinking
about what cred is appropriate.  It's not foolproof, but I believe the
single hook approach will be less prone to accidents ... or so I hope
:)
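
For reference, the two shapes being weighed in this thread look roughly like
the following declarations (a sketch of the discussion, not an actual
proposal):

	/* single hook, subject always explicit (what the patch above does) */
	int security_locked_down(const struct cred *cred, enum lockdown_reason what);

	/* alternative sketched above: current_cred() implicit, NULL case split out */
	int security_locked_down(enum lockdown_reason what);
	int security_locked_down_no_cred(enum lockdown_reason what);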

-- 
paul moore
www.paul-moore.com


[PATCH 1/2] PCI: Use pcie_reset_state_t type in function arguments

2021-07-12 Thread Krzysztof Wilczyński
The pcie_reset_state_t type was introduced in commit f7bdd12d234d
("pci: New PCI-E reset API") along with the enum pcie_reset_state, but
it has never been used for anything other than defining the members of
enum pcie_reset_state.

Thus, replace the direct use of enum pcie_reset_state in function
arguments with the pcie_reset_state_t type so that the argument type
matches the type used in enum pcie_reset_state.

Signed-off-by: Krzysztof Wilczyński 
---
 drivers/pci/pci.c   | 4 ++--
 include/linux/pci.h | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index aacf575c15cf..5c3386a73eb1 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2194,7 +2194,7 @@ EXPORT_SYMBOL(pci_disable_device);
  * implementation. Architecture implementations can override this.
  */
 int __weak pcibios_set_pcie_reset_state(struct pci_dev *dev,
-   enum pcie_reset_state state)
+   pcie_reset_state_t state)
 {
return -EINVAL;
 }
@@ -2206,7 +2206,7 @@ int __weak pcibios_set_pcie_reset_state(struct pci_dev *dev,
  *
  * Sets the PCI reset state for the device.
  */
-int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
+int pci_set_pcie_reset_state(struct pci_dev *dev, pcie_reset_state_t state)
 {
return pcibios_set_pcie_reset_state(dev, state);
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 540b377ca8f6..15f93de69e6a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -191,7 +191,6 @@ enum {
 };
 
 typedef unsigned int __bitwise pcie_reset_state_t;
-
 enum pcie_reset_state {
/* Reset is NOT asserted (Use to deassert reset) */
pcie_deassert_reset = (__force pcie_reset_state_t) 1,
@@ -1205,7 +1204,7 @@ extern unsigned int pcibios_max_latency;
 void pci_set_master(struct pci_dev *dev);
 void pci_clear_master(struct pci_dev *dev);
 
-int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state);
+int pci_set_pcie_reset_state(struct pci_dev *dev, pcie_reset_state_t state);
 int pci_set_cacheline_size(struct pci_dev *dev);
 int __must_check pci_set_mwi(struct pci_dev *dev);
 int __must_check pcim_set_mwi(struct pci_dev *dev);
@@ -2079,7 +2078,7 @@ extern u8 pci_cache_line_size;
 void pcibios_disable_device(struct pci_dev *dev);
 void pcibios_set_master(struct pci_dev *dev);
 int pcibios_set_pcie_reset_state(struct pci_dev *dev,
-enum pcie_reset_state state);
+pcie_reset_state_t state);
 int pcibios_add_device(struct pci_dev *dev);
 void pcibios_release_device(struct pci_dev *dev);
 #ifdef CONFIG_PCI
-- 
2.32.0



[PATCH 2/2] powerpc/eeh: Use pcie_reset_state_t type in function arguments

2021-07-12 Thread Krzysztof Wilczyński
The pcie_reset_state_t type was introduced in commit f7bdd12d234d
("pci: New PCI-E reset API") along with the enum pcie_reset_state, but
it has never been used for anything other than defining the members of
enum pcie_reset_state.

Thus, replace the direct use of enum pcie_reset_state in function
arguments with the pcie_reset_state_t type so that the argument type
matches the type used in enum pcie_reset_state.

Signed-off-by: Krzysztof Wilczyński 
---
 arch/powerpc/kernel/eeh.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 3bbdcc86d01b..15485abb89ff 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -714,7 +714,7 @@ static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
  * Return value:
  * 0 if success
  */
-int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
+int pcibios_set_pcie_reset_state(struct pci_dev *dev, pcie_reset_state_t state)
 {
struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
struct eeh_pe *pe = eeh_dev_to_pe(edev);
-- 
2.32.0



[PATCH] replace if with min

2021-07-12 Thread Salah Triki
Replace an if statement with min() in order to make the code cleaner.

Signed-off-by: Salah Triki 
---
 drivers/crypto/nx/nx-842.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
index 2ab90ec10e61..0d1d5a463899 100644
--- a/drivers/crypto/nx/nx-842.c
+++ b/drivers/crypto/nx/nx-842.c
@@ -134,8 +134,7 @@ EXPORT_SYMBOL_GPL(nx842_crypto_exit);
 static void check_constraints(struct nx842_constraints *c)
 {
/* limit maximum, to always have enough bounce buffer to decompress */
-   if (c->maximum > BOUNCE_BUFFER_SIZE)
-   c->maximum = BOUNCE_BUFFER_SIZE;
+   c->maximum = min(c->maximum, BOUNCE_BUFFER_SIZE);
 }
 
 static int nx842_crypto_add_header(struct nx842_crypto_header *hdr, u8 *buf)
-- 
2.25.1



[RFC PATCH] KVM: PPC: BookE: Load FP and Altivec state before soft enabling IRQs

2021-07-12 Thread Fabiano Rosas
The kvmppc_fix_ee_before_entry function sets the IRQ soft mask to
IRQS_ENABLED. This function is called right before loading the guest
FP and Altivec states at kvmppc_handle_exit. This triggers a
WARN_ON(preemptible()) at enable_kernel_fp/altivec when running with
CONFIG_PREEMPT_COUNT=y:

WARNING: CPU: 1 PID: 6585 at .enable_kernel_fp+0x30/0x78
Modules linked in: r8153_ecm cdc_ether usbnet r8152 uio_pdrv_genirq uio
CPU: 1 PID: 6585 Comm: qemu-system-ppc Tainted: GW 
5.12.10_e6500 #1
NIP:  c0003ec0 LR: c004fb00 CTR: 0004
REGS: c000b38ab440 TRAP: 0700   Tainted: GW  (5.12.10_e6500)
MSR:  82023002   CR: 24000208  XER: 
IRQMASK: 0
GPR00: c004fb00 c000b38ab6e0 c1a4e300 c000b3878000
GPR04: 0010 8003  
GPR08: fe662000 0001  0001
GPR12: 24000208 c00038c0 c000b3878000 c183eb60
GPR16:   c20a8d80 0001
GPR20:  c1ab1a70 c20a8d80 c20a8d80
GPR24: c183ed48 c17c8ec0 c183eec0 c00b80e0
GPR28:  000b80e0  c000b3878000
NIP [c0003ec0] .enable_kernel_fp+0x30/0x78
LR [c004fb00] .kvmppc_load_guest_fp+0x2c/0x80
Call Trace:
[c000b38ab6e0] [c183ed48] rcu_state+0x248/0x400 (unreliable)
[c000b38ab750] [c004fb00] .kvmppc_load_guest_fp+0x2c/0x80
[c000b38ab7d0] [c0050f48] .kvmppc_handle_exit+0x5cc/0x5d8
[c000b38ab870] [c0053e64] .kvmppc_resume_host+0xcc/0x120
Instruction dump:
7c0802a6 f8010010 f821ff91 e92d0658 8149 3920 2c0a 40c20014
892d067a 552907fe 7d290034 5529d97e <0b09> 38602000 4bfffe79 e86d0658

I'm assuming this was an oversight while introducing the call to
kvmppc_load_guest_fp and kvmppc_load_guest_altivec functions from
kvmppc_handle_exit. So this patch moves kvmppc_fix_ee_before_entry to
be again the last thing before exiting kvmppc_handle_exit.

Compile tested only since I don't have a BookE machine.

Fixes: 3efc7da61f6c ("KVM: PPC: Book3E: Increase FPU laziness")
Fixes: 95d80a294b1e ("KVM: PPC: Book3e: Add AltiVec support")
Reported-by: 
Signed-off-by: Fabiano Rosas 
---
 arch/powerpc/kvm/booke.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 551b30d84aee..38179c45eb90 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1387,9 +1387,9 @@ int kvmppc_handle_exit(struct kvm_vcpu *vcpu, unsigned int exit_nr)
r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
else {
/* interrupts now hard-disabled */
-   kvmppc_fix_ee_before_entry();
kvmppc_load_guest_fp(vcpu);
kvmppc_load_guest_altivec(vcpu);
+   kvmppc_fix_ee_before_entry();
}
}
 
-- 
2.29.2



Re: [PATCH v3 1/1] powerpc/pseries: Interface to represent PAPR firmware attributes

2021-07-12 Thread Fabiano Rosas
"Pratik R. Sampat"  writes:

Hi, have you seen Documentation/core-api/kobject.rst, particularly the
part that says:

"When you see a sysfs directory full of other directories, generally each
   of those directories corresponds to a kobject in the same kset."

Taking a look at samples/kobject/kset-example.c, it seems to provide an
overall structure that is closer to what other modules do when creating
sysfs entries. It uses less dynamic allocations and deals a bit better
with cleaning up the state afterwards.
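
For illustration, a minimal sketch of what a kset-based layout could look
like (names are assumptions for illustration; papr_kobj stands for the
/sys/firmware/papr kobject the patch creates):

	static struct kset *esi_kset;

	static int __init esi_sysfs_init(void)
	{
		/* /sys/firmware/papr/energy_scale_info as a kset */
		esi_kset = kset_create_and_add("energy_scale_info", NULL, papr_kobj);
		if (!esi_kset)
			return -ENOMEM;

		/* then one kobject per attribute id, each added to esi_kset */
		return 0;
	}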

> Adds a generic interface to represent the energy and frequency related
> PAPR attributes on the system using the new H_CALL
> "H_GET_ENERGY_SCALE_INFO".
>
> H_GET_EM_PARMS H_CALL was previously responsible for exporting this
> information in the lparcfg, however the H_GET_EM_PARMS H_CALL
> will be deprecated P10 onwards.
>
> The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
> hcall(
>   uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
>   uint64 flags,   // Per the flag request
>   uint64 firstAttributeId,// The attribute id
>   uint64 bufferAddress,   // Guest physical address of the output buffer
>   uint64 bufferSize   // The size in bytes of the output buffer
> );
>
> This H_CALL can query either all the attributes at once with
> firstAttributeId = 0, flags = 0 as well as query only one attribute
> at a time with firstAttributeId = id, flags = 1.
>
> The output buffer consists of the following
> 1. number of attributes  - 8 bytes
> 2. array offset to the data location - 8 bytes
> 3. version info  - 1 byte
> 4. A data array of size num attributes, which contains the following:
>   a. attribute ID  - 8 bytes
>   b. attribute value in number - 8 bytes
>   c. attribute name in string  - 64 bytes
>   d. attribute value in string - 64 bytes
>
> The new H_CALL exports information in direct string value format, hence
> a new interface has been introduced in
> /sys/firmware/papr/energy_scale_info to export this information to
> userspace in an extensible pass-through format.
>
> The H_CALL returns the name, numeric value and string value (if exists)
>
> The format of exposing the sysfs information is as follows:
> /sys/firmware/papr/energy_scale_info/
>|-- /
>  |-- desc
>  |-- value
>  |-- value_desc (if exists)
>|-- /
>  |-- desc
>  |-- value
>  |-- value_desc (if exists)
> ...
>
> The energy information that is exported is useful for userspace tools
> such as powerpc-utils. Currently these tools infer the
> "power_mode_data" value in the lparcfg, which in turn is obtained from
> the to be deprecated H_GET_EM_PARMS H_CALL.
> On future platforms, such userspace utilities will have to look at the
> data returned from the new H_CALL being populated in this new sysfs
> interface and report this information directly without the need of
> interpretation.
>
> Signed-off-by: Pratik R. Sampat 
> Reviewed-by: Gautham R. Shenoy 
> ---
>  .../sysfs-firmware-papr-energy-scale-info |  26 ++
>  arch/powerpc/include/asm/hvcall.h |  24 +-
>  arch/powerpc/kvm/trace_hv.h   |   1 +
>  arch/powerpc/platforms/pseries/Makefile   |   3 +-
>  .../pseries/papr_platform_attributes.c| 320 ++
>  5 files changed, 372 insertions(+), 2 deletions(-)
>  create mode 100644 
> Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
>  create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c
>
> diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info 
> b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
> new file mode 100644
> index ..fd82f2bfafe5
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
> @@ -0,0 +1,26 @@
> +What:/sys/firmware/papr/energy_scale_info
> +Date:June 2021
> +Contact: Linux for PowerPC mailing list 
> +Description: Directory hosting a set of platform attributes like
> + energy/frequency on Linux running as a PAPR guest.
> +
> + Each file in a directory contains a platform
> + attribute hierarchy pertaining to performance/
> + energy-savings mode and processor frequency.
> +
> +What:/sys/firmware/papr/energy_scale_info/
> + /sys/firmware/papr/energy_scale_info//desc
> + /sys/firmware/papr/energy_scale_info//value
> + /sys/firmware/papr/energy_scale_info//value_desc
> +Date:June 2021
> +Contact: Linux for PowerPC mailing list 
> +Description: Energy, frequency attributes directory for POWERVM servers
> +
> + This directory provides energy, erequency, folding information. 
> It

s/erequency/frequency/

> + contains below sysfs attributes:
> +
> + - desc: String description of the attribute 
> +
> + - value: Numeric value of attribute 
> 

Re: [PATCH] powerpc/64s/hash: Fix SLB preload cache vs kthread_use_mm

2021-07-12 Thread Nicholas Piggin
Excerpts from Nicholas Piggin's message of July 8, 2021 7:05 pm:
> It's possible for kernel threads to pick up SLB preload entries if
> they are accessing userspace with kthread_use_mm. If the kthread
> later is context switched while using a different mm, when it is
> switched back it could preload SLBs belonging to the previous mm.
> 
> This could lead to data corruption, leaks, SLB multi hits, etc.
> 
> In the absence of a usable hook to clear preloads when unusing an
> mm, fix it by keeping track of the mm that the preloads belong to.
> 
> Adjust the isync() comment to be clear it can't be skipped if we
> had no preloads.

I should note that this patch is wrong, and so I withdraw it. The
supposed bug is not actually a bug, because the SLB preload only
records the ESID/EA to preload, not the VA.

So this cross-mm "leak" can happen, but the worst it will do is
preload some addresses used in the previous mm that are not likely to
be accessed in the new mm.

There is an argument that this is the "correct" thing to do performance-wise,
but on the other hand it should be rare enough that it may not be worth the
extra logic. If we did do it, it should at least be submitted as a performance
improvement, not a bugfix.

Thanks,
Nick

> 
> Fixes: 5434ae74629a ("powerpc/64s/hash: Add a SLB preload cache")
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/include/asm/thread_info.h |  1 +
>  arch/powerpc/mm/book3s64/slb.c | 36 ++
>  2 files changed, 26 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/thread_info.h 
> b/arch/powerpc/include/asm/thread_info.h
> index b4ec6c7dd72e..c3de13dde2af 100644
> --- a/arch/powerpc/include/asm/thread_info.h
> +++ b/arch/powerpc/include/asm/thread_info.h
> @@ -54,6 +54,7 @@ struct thread_info {
>  #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC32)
>   struct cpu_accounting_data accounting;
>  #endif
> + struct mm_struct *slb_preload_mm;
>   unsigned char slb_preload_nr;
>   unsigned char slb_preload_tail;
>   u32 slb_preload_esid[SLB_PRELOAD_NR];
> diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
> index c91bd85eb90e..4f9dbce0dd84 100644
> --- a/arch/powerpc/mm/book3s64/slb.c
> +++ b/arch/powerpc/mm/book3s64/slb.c
> @@ -294,11 +294,20 @@ static bool preload_hit(struct thread_info *ti, 
> unsigned long esid)
>   return false;
>  }
>  
> -static bool preload_add(struct thread_info *ti, unsigned long ea)
> +static bool preload_add(struct thread_info *ti, struct mm_struct *mm, 
> unsigned long ea)
>  {
>   unsigned char idx;
>   unsigned long esid;
>  
> + if (unlikely(ti->slb_preload_mm != mm)) {
> + /*
> +  * kthread_use_mm or other temporary mm switching can
> +  * change the mm being used by a particular thread.
> +  */
> + ti->slb_preload_nr = 0;
> + ti->slb_preload_mm = mm;
> + }
> +
>   if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
>   /* EAs are stored >> 28 so 256MB segments don't need clearing */
>   if (ea & ESID_MASK_1T)
> @@ -362,13 +371,13 @@ void slb_setup_new_exec(void)
>* 0x1000 so it makes sense to preload this segment.
>*/
>   if (!is_kernel_addr(exec)) {
> - if (preload_add(ti, exec))
> + if (preload_add(ti, mm, exec))
>   slb_allocate_user(mm, exec);
>   }
>  
>   /* Libraries and mmaps. */
>   if (!is_kernel_addr(mm->mmap_base)) {
> - if (preload_add(ti, mm->mmap_base))
> + if (preload_add(ti, mm, mm->mmap_base))
>   slb_allocate_user(mm, mm->mmap_base);
>   }
>  
> @@ -394,19 +403,19 @@ void preload_new_slb_context(unsigned long start, 
> unsigned long sp)
>  
>   /* Userspace entry address. */
>   if (!is_kernel_addr(start)) {
> - if (preload_add(ti, start))
> + if (preload_add(ti, mm, start))
>   slb_allocate_user(mm, start);
>   }
>  
>   /* Top of stack, grows down. */
>   if (!is_kernel_addr(sp)) {
> - if (preload_add(ti, sp))
> + if (preload_add(ti, mm, sp))
>   slb_allocate_user(mm, sp);
>   }
>  
>   /* Bottom of heap, grows up. */
>   if (heap && !is_kernel_addr(heap)) {
> - if (preload_add(ti, heap))
> + if (preload_add(ti, mm, heap))
>   slb_allocate_user(mm, heap);
>   }
>  
> @@ -502,6 +511,11 @@ void switch_slb(struct task_struct *tsk, struct 
> mm_struct *mm)
>  
>   copy_mm_to_paca(mm);
>  
> + if (unlikely(ti->slb_preload_mm != mm)) {
> + ti->slb_preload_nr = 0;
> + ti->slb_preload_mm = mm;
> + }
> +
>   /*
>* We gradually age out SLBs after a number of context switches to
>* reduce reload overhead of unused entries (like we do with FP/VEC
> @@ -513,7 

Re: [RFC PATCH 11/43] KVM: PPC: Book3S HV P9: Implement PMU save/restore in C

2021-07-12 Thread Athira Rajeev



> On 12-Jul-2021, at 8:19 AM, Nicholas Piggin  wrote:
> 
> Excerpts from Athira Rajeev's message of July 10, 2021 12:47 pm:
>> 
>> 
>>> On 22-Jun-2021, at 4:27 PM, Nicholas Piggin  wrote:
>>> 
>>> Implement the P9 path PMU save/restore code in C, and remove the
>>> POWER9/10 code from the P7/8 path assembly.
>>> 
>>> -449 cycles (8533) POWER9 virt-mode NULL hcall
>>> 
>>> Signed-off-by: Nicholas Piggin 
>>> ---
>>> arch/powerpc/include/asm/asm-prototypes.h |   5 -
>>> arch/powerpc/kvm/book3s_hv.c  | 205 --
>>> arch/powerpc/kvm/book3s_hv_interrupts.S   |  13 +-
>>> arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  43 +
>>> 4 files changed, 200 insertions(+), 66 deletions(-)
>>> 
>>> diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
>>> b/arch/powerpc/include/asm/asm-prototypes.h
>>> index 02ee6f5ac9fe..928db8ef9a5a 100644
>>> --- a/arch/powerpc/include/asm/asm-prototypes.h
>>> +++ b/arch/powerpc/include/asm/asm-prototypes.h
>>> @@ -136,11 +136,6 @@ static inline void kvmppc_restore_tm_hv(struct 
>>> kvm_vcpu *vcpu, u64 msr,
>>> bool preserve_nv) { }
>>> #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
>>> 
>>> -void kvmhv_save_host_pmu(void);
>>> -void kvmhv_load_host_pmu(void);
>>> -void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
>>> -void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
>>> -
>>> void kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu);
>>> 
>>> long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
>>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>>> index f7349d150828..b1b94b3563b7 100644
>>> --- a/arch/powerpc/kvm/book3s_hv.c
>>> +++ b/arch/powerpc/kvm/book3s_hv.c
>>> @@ -3635,6 +3635,188 @@ static noinline void kvmppc_run_core(struct 
>>> kvmppc_vcore *vc)
>>> trace_kvmppc_run_core(vc, 1);
>>> }
>>> 
>>> +/*
>>> + * Privileged (non-hypervisor) host registers to save.
>>> + */
>>> +struct p9_host_os_sprs {
>>> +   unsigned long dscr;
>>> +   unsigned long tidr;
>>> +   unsigned long iamr;
>>> +   unsigned long amr;
>>> +   unsigned long fscr;
>>> +
>>> +   unsigned int pmc1;
>>> +   unsigned int pmc2;
>>> +   unsigned int pmc3;
>>> +   unsigned int pmc4;
>>> +   unsigned int pmc5;
>>> +   unsigned int pmc6;
>>> +   unsigned long mmcr0;
>>> +   unsigned long mmcr1;
>>> +   unsigned long mmcr2;
>>> +   unsigned long mmcr3;
>>> +   unsigned long mmcra;
>>> +   unsigned long siar;
>>> +   unsigned long sier1;
>>> +   unsigned long sier2;
>>> +   unsigned long sier3;
>>> +   unsigned long sdar;
>>> +};
>>> +
>>> +static void freeze_pmu(unsigned long mmcr0, unsigned long mmcra)
>>> +{
>>> +   if (!(mmcr0 & MMCR0_FC))
>>> +   goto do_freeze;
>>> +   if (mmcra & MMCRA_SAMPLE_ENABLE)
>>> +   goto do_freeze;
>>> +   if (cpu_has_feature(CPU_FTR_ARCH_31)) {
>>> +   if (!(mmcr0 & MMCR0_PMCCEXT))
>>> +   goto do_freeze;
>>> +   if (!(mmcra & MMCRA_BHRB_DISABLE))
>>> +   goto do_freeze;
>>> +   }
>>> +   return;
>> 
>> 
>> Hi Nick
>> 
>> When freezing the PMU, do we need to also set pmcregs_in_use to zero ?
> 
> Not immediately, we still need to save out the values of the PMU 
> registers. If we clear pmcregs_in_use, then our hypervisor can discard 
> the contents of those registers at any time.
> 
>> Also, why we need these above conditions like MMCRA_SAMPLE_ENABLE,  
>> MMCR0_PMCCEXT checks also before freezing ?
> 
> Basically just because that's the condition we want to set the PMU to 
> before entering the guest if the guest does not have its own PMU 
> registers in use.
> 
> I'm not entirely sure this is correct / optimal for perf though, so we
> should double check that. I think some of this stuff should be wrapped 
> up and put into perf/ subdirectory as I've said before. KVM shouldn't 
> need to know about exactly how PMU is to be set up and managed by
> perf, we should just be able to ask perf to save/restore/switch state.

Hi Nick,

Agreed. It makes sense that some of this perf code should be
moved under the perf/ directory.

Athira
> 
> Thanks,
> Nick



Re: [PATCH v15 06/12] swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing

2021-07-12 Thread Will Deacon
On Tue, Jul 06, 2021 at 12:59:57PM -0400, Konrad Rzeszutek Wilk wrote:
> On Tue, Jul 06, 2021 at 05:57:21PM +0100, Will Deacon wrote:
> > On Tue, Jul 06, 2021 at 10:46:07AM -0400, Konrad Rzeszutek Wilk wrote:
> > > On Tue, Jul 06, 2021 at 04:05:13PM +0200, Christoph Hellwig wrote:
> > > > On Tue, Jul 06, 2021 at 03:01:04PM +0100, Robin Murphy wrote:
> > > > > FWIW I was pondering the question of whether to do something along 
> > > > > those 
> > > > > lines or just scrap the default assignment entirely, so since I 
> > > > > hadn't got 
> > > > > round to saying that I've gone ahead and hacked up the alternative 
> > > > > (similarly untested) for comparison :)
> > > > >
> > > > > TBH I'm still not sure which one I prefer...
> > > > 
> > > > Claire did implement something like your suggestion originally, but
> > > > I don't really like it as it doesn't scale for adding multiple global
> > > > pools, e.g. for the 64-bit addressable one for the various encrypted
> > > > secure guest schemes.
> > > 
> > > Couple of things:
> > >  - I am not pushing Claire's patchset to Linus until we have a
> > >resolution on this. I hope you all agree that is a sensible way
> > >forward as much as I hate doing that.
> > 
> > Sure, it's a pity but we could clearly use a bit more time to get these
> > just right and we've run out of time for 5.14.
> > 
> > I think the main question I have is how would you like to see patches for
> > 5.15? i.e. as patches on top of devel/for-linus-5.14 or something else?
> 
> Yes that would be perfect. If there are any dependencies on the rc1, I
> can rebase it on top of that.

Yes, please, rebasing would be very helpful. The broader rework of
'io_tlb_default_mem' is going to conflict quite badly otherwise.

Cheers,

Will


[PATCH v1 2/4] mm/memory_hotplug: remove nid parameter from arch_remove_memory()

2021-07-12 Thread David Hildenbrand
The parameter is unused, let's remove it.

Acked-by: Catalin Marinas 
Acked-by: Michael Ellerman  (powerpc)
Acked-by: Heiko Carstens  (s390)
Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Christian Borntraeger 
Cc: Yoshinori Sato 
Cc: Rich Felker 
Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: x...@kernel.org
Cc: "H. Peter Anvin" 
Cc: Andrew Morton 
Cc: Anshuman Khandual 
Cc: Ard Biesheuvel 
Cc: Mike Rapoport 
Cc: Nicholas Piggin 
Cc: Pavel Tatashin 
Cc: Baoquan He 
Cc: Laurent Dufour 
Cc: Sergei Trofimovich 
Cc: Kefeng Wang 
Cc: Michel Lespinasse 
Cc: Christophe Leroy 
Cc: "Aneesh Kumar K.V" 
Cc: Thiago Jung Bauermann 
Cc: Joe Perches 
Cc: Pierre Morel 
Cc: Jia He 
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-i...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s...@vger.kernel.org
Cc: linux...@vger.kernel.org
Signed-off-by: David Hildenbrand 
---
 arch/arm64/mm/mmu.c| 3 +--
 arch/ia64/mm/init.c| 3 +--
 arch/powerpc/mm/mem.c  | 3 +--
 arch/s390/mm/init.c| 3 +--
 arch/sh/mm/init.c  | 3 +--
 arch/x86/mm/init_32.c  | 3 +--
 arch/x86/mm/init_64.c  | 3 +--
 include/linux/memory_hotplug.h | 3 +--
 mm/memory_hotplug.c| 4 ++--
 mm/memremap.c  | 5 +
 10 files changed, 11 insertions(+), 22 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index d74586508448..af8ab553a268 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1506,8 +1506,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return ret;
 }
 
-void arch_remove_memory(int nid, u64 start, u64 size,
-   struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 064a967a7b6e..5c6da8d83c1a 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -484,8 +484,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return ret;
 }
 
-void arch_remove_memory(int nid, u64 start, u64 size,
-   struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ad198b439222..c3c4e31462ec 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -119,8 +119,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
return rc;
 }
 
-void __ref arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 8ac710de1ab1..d85bd7f5d8dc 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -306,8 +306,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return rc;
 }
 
-void arch_remove_memory(int nid, u64 start, u64 size,
-   struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index ce26c7f8950a..506784702430 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -414,8 +414,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return ret;
 }
 
-void arch_remove_memory(int nid, u64 start, u64 size,
-   struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 74b78840182d..bd90b8fe81e4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -801,8 +801,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return __add_pages(nid, start_pfn, nr_pages, params);
 }
 
-void arch_remove_memory(int nid, u64 start, u64 size,
-   struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ddeaba947eb3..a6e11763763f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1255,8 +1255,7 @@ 

Re: [RFC PATCH 10/43] powerpc/64s: Always set PMU control registers to frozen/disabled when not in use

2021-07-12 Thread Athira Rajeev
> On 12-Jul-2021, at 8:11 AM, Nicholas Piggin  wrote:
> 
> Excerpts from Athira Rajeev's message of July 10, 2021 12:50 pm:
>> 
>>> On 22-Jun-2021, at 4:27 PM, Nicholas Piggin  wrote:
>>> 
>>> KVM PMU management code looks for particular frozen/disabled bits in
>>> the PMU registers so it knows whether it must clear them when coming
>>> out of a guest or not. Setting this up helps KVM make these optimisations
>>> without getting confused. Longer term the better approach might be to
>>> move guest/host PMU switching to the perf subsystem.
>>> 
>>> Signed-off-by: Nicholas Piggin 
>>> ---
>>> arch/powerpc/kernel/cpu_setup_power.c | 4 ++--
>>> arch/powerpc/kernel/dt_cpu_ftrs.c | 6 +++---
>>> arch/powerpc/kvm/book3s_hv.c  | 5 +
>>> arch/powerpc/perf/core-book3s.c   | 7 +++
>>> 4 files changed, 17 insertions(+), 5 deletions(-)
>>> 
>>> diff --git a/arch/powerpc/kernel/cpu_setup_power.c b/arch/powerpc/kernel/cpu_setup_power.c
>>> index a29dc8326622..3dc61e203f37 100644
>>> --- a/arch/powerpc/kernel/cpu_setup_power.c
>>> +++ b/arch/powerpc/kernel/cpu_setup_power.c
>>> @@ -109,7 +109,7 @@ static void init_PMU_HV_ISA207(void)
>>> static void init_PMU(void)
>>> {
>>> 	mtspr(SPRN_MMCRA, 0);
>>> -	mtspr(SPRN_MMCR0, 0);
>>> +	mtspr(SPRN_MMCR0, MMCR0_FC);
>>> 	mtspr(SPRN_MMCR1, 0);
>>> 	mtspr(SPRN_MMCR2, 0);
>>> }
>>> @@ -123,7 +123,7 @@ static void init_PMU_ISA31(void)
>>> {
>>> 	mtspr(SPRN_MMCR3, 0);
>>> 	mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);
>>> -	mtspr(SPRN_MMCR0, MMCR0_PMCCEXT);
>>> +	mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);
>>> }
>>> 
>>> /*
>>> diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
>>> index 0a6b36b4bda8..06a089fbeaa7 100644
>>> --- a/arch/powerpc/kernel/dt_cpu_ftrs.c
>>> +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
>>> @@ -353,7 +353,7 @@ static void init_pmu_power8(void)
>>> 	}
>>> 	mtspr(SPRN_MMCRA, 0);
>>> -	mtspr(SPRN_MMCR0, 0);
>>> +	mtspr(SPRN_MMCR0, MMCR0_FC);
>>> 	mtspr(SPRN_MMCR1, 0);
>>> 	mtspr(SPRN_MMCR2, 0);
>>> 	mtspr(SPRN_MMCRS, 0);
>>> @@ -392,7 +392,7 @@ static void init_pmu_power9(void)
>>> 		mtspr(SPRN_MMCRC, 0);
>>> 	mtspr(SPRN_MMCRA, 0);
>>> -	mtspr(SPRN_MMCR0, 0);
>>> +	mtspr(SPRN_MMCR0, MMCR0_FC);
>>> 	mtspr(SPRN_MMCR1, 0);
>>> 	mtspr(SPRN_MMCR2, 0);
>>> }
>>> @@ -428,7 +428,7 @@ static void init_pmu_power10(void)
>>> 	mtspr(SPRN_MMCR3, 0);
>>> 	mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);
>>> -	mtspr(SPRN_MMCR0, MMCR0_PMCCEXT);
>>> +	mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);
>>> }
>>> 
>>> static int __init feat_enable_pmu_power10(struct dt_cpu_feature *f)
>>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>>> index 1f30f98b09d1..f7349d150828 100644
>>> --- a/arch/powerpc/kvm/book3s_hv.c
>>> +++ b/arch/powerpc/kvm/book3s_hv.c
>>> @@ -2593,6 +2593,11 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
>>> #endif
>>> #endif
>>> 	vcpu->arch.mmcr[0] = MMCR0_FC;
>>> +	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
>>> +		vcpu->arch.mmcr[0] |= MMCR0_PMCCEXT;
>>> +		vcpu->arch.mmcra = MMCRA_BHRB_DISABLE;
>>> +	}
>>> +
>>> 	vcpu->arch.ctrl = CTRL_RUNLATCH;
>>> 	/* default to host PVR, since we can't spoof it */
>>> 	kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
>>> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
>>> index 51622411a7cc..e33b29ec1a65 100644
>>> --- a/arch/powerpc/perf/core-book3s.c
>>> +++ b/arch/powerpc/perf/core-book3s.c
>>> @@ -1361,6 +1361,13 @@ static void power_pmu_enable(struct pmu *pmu)
>>> 		goto out;
>>> 	if (cpuhw->n_events == 0) {
>>> +		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
>>> +			mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);
>>> +			mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);
>>> +		} else {
>>> +			mtspr(SPRN_MMCRA, 0);
>>> +			mtspr(SPRN_MMCR0, MMCR0_FC);
>>> +		}
>> 
>> Hi Nick,
>> 
>> We are setting these bits in “power_pmu_disable” function. And disable
>> will be called before any event gets deleted/stopped.
>> Can you please help to understand why this is needed in
>> power_pmu_enable path also ?
> 
> I'll have to go back and check what I needed it for.
> 
> Basically what I'm trying to achieve is that when the guest and host are
> not running perf, then the registers should match. This way the host can
> test them with mfspr but does not need to fix them with mtspr.
> 
> It's not too important for performance after TM facility demand faulting
> and the nested guest PMU fix means you can usually just skip that
> entirely, but I think it's cleaner. I'll double check my perf/ changes
> it's possible that part should be split out or is unnecessary.

Sure Nick,

power_pmu_disable already sets MMCRA_BHRB_DISABLE/MMCR0_FC/MMCR0_PMCCEXT bits.
Hence trying to understand in which scenario we found this was needed for
power_pmu_enable.

Thanks
Athira

> 
> Thanks,
> Nick

Re: [PATCH v2 1/2] sched/topology: Skip updating masks for non-online nodes

2021-07-12 Thread Srikar Dronamraju
Hi Valentin,

> On 01/07/21 09:45, Srikar Dronamraju wrote:
> > @@ -1891,12 +1894,30 @@ void sched_init_numa(void)
> >  void sched_domains_numa_masks_set(unsigned int cpu)
> >  {
> 
> Hmph, so we're playing games with masks of offline nodes - is that really
> necessary? Your modification of sched_init_numa() still scans all of the
> nodes (regardless of their online status) to build the distance map, and
> that is never updated (sched_init_numa() is pretty much an __init
> function).
> 
> So AFAICT this is all to cope with topology_span_sane() not applying
> 'cpu_map' to its masks. That seemed fine to me back when I wrote it, but in
> light of having bogus distance values for offline nodes, not so much...
> 
> What about the below instead?
> 
> ---
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index b77ad49dc14f..c2d9caad4aa6 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2075,6 +2075,7 @@ static struct sched_domain *build_sched_domain(struct 
> sched_domain_topology_leve
>  static bool topology_span_sane(struct sched_domain_topology_level *tl,
> const struct cpumask *cpu_map, int cpu)
>  {
> + struct cpumask *intersect = sched_domains_tmpmask;
>   int i;
> 
>   /* NUMA levels are allowed to overlap */
> @@ -2090,14 +2091,17 @@ static bool topology_span_sane(struct 
> sched_domain_topology_level *tl,
>   for_each_cpu(i, cpu_map) {
>   if (i == cpu)
>   continue;
> +
>   /*
> -  * We should 'and' all those masks with 'cpu_map' to exactly
> -  * match the topology we're about to build, but that can only
> -  * remove CPUs, which only lessens our ability to detect
> -  * overlaps
> +  * We shouldn't have to bother with cpu_map here, unfortunately
> +  * some architectures (powerpc says hello) have to deal with
> +  * offline NUMA nodes reporting bogus distance values. This can
> +  * lead to funky NODE domain spans, but since those are offline
> +  * we can mask them out.
>*/
> + cpumask_and(intersect, tl->mask(cpu), tl->mask(i));
>   if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
> - cpumask_intersects(tl->mask(cpu), tl->mask(i)))
> + cpumask_intersects(intersect, cpu_map))
>   return false;
>   }
> 

Unfortunately this is not helping.
I tried this patch alone and also together with patch 2/2 of this series, where
we update/fill fake topology numbers. However, both cases are still failing.

-- 
Thanks and Regards
Srikar Dronamraju


[PATCH v1 3/4] mm/memory_hotplug: remove nid parameter from remove_memory() and friends

2021-07-12 Thread David Hildenbrand
There is only a single user remaining. We can simply look up the nid, which
is only used for node offlining purposes, when walking our memory blocks. We
don't expect to remove multi-nid ranges; and if we ever do, we most
probably don't care about removing multi-nid ranges that actually result
in empty nodes.

If ever required, we can detect the "multi-nid" scenario and simply try
offlining all online nodes.
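
For illustration, a hedged sketch of how that nid lookup could be done from
the memory blocks in the range (the actual mm/memory_hotplug.c hunk is not
shown in this excerpt; the names below are invented):

#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/numa.h>

/* Sketch only: derive the nid used for node offlining by walking the memory
 * blocks covering [start, start + size), assuming a single-nid range. */
static int get_nid_cb(struct memory_block *mem, void *arg)
{
	int *nid = arg;

	*nid = mem->nid;
	return 0;
}

static int lookup_removal_nid(unsigned long start, unsigned long size)
{
	int nid = NUMA_NO_NODE;

	walk_memory_blocks(start, size, &nid, get_nid_cb);
	return nid;	/* only needed for try_offline_node() afterwards */
}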

Acked-by: Michael Ellerman  (powerpc)
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Dan Williams 
Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: "Michael S. Tsirkin" 
Cc: Jason Wang 
Cc: Andrew Morton 
Cc: Nathan Lynch 
Cc: Laurent Dufour 
Cc: "Aneesh Kumar K.V" 
Cc: Scott Cheloha 
Cc: Anton Blanchard 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-a...@vger.kernel.org
Cc: nvd...@lists.linux.dev
Signed-off-by: David Hildenbrand 
---
 .../platforms/pseries/hotplug-memory.c|  9 +++---
 drivers/acpi/acpi_memhotplug.c|  7 +
 drivers/dax/kmem.c|  3 +-
 drivers/virtio/virtio_mem.c   |  4 +--
 include/linux/memory_hotplug.h| 10 +++
 mm/memory_hotplug.c   | 28 +++
 6 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 377d852f5a9a..ef5c24b42cf1 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -286,7 +286,7 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned long memblock_si
 {
unsigned long block_sz, start_pfn;
int sections_per_block;
-   int i, nid;
+   int i;
 
start_pfn = base >> PAGE_SHIFT;
 
@@ -297,10 +297,9 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned long memblock_si
 
block_sz = pseries_memory_block_size();
sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
-   nid = memory_add_physaddr_to_nid(base);
 
for (i = 0; i < sections_per_block; i++) {
-   __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
+   __remove_memory(base, MIN_MEMORY_BLOCK_SIZE);
base += MIN_MEMORY_BLOCK_SIZE;
}
 
@@ -387,7 +386,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
 
block_sz = pseries_memory_block_size();
 
-   __remove_memory(mem_block->nid, lmb->base_addr, block_sz);
+   __remove_memory(lmb->base_addr, block_sz);
put_device(&mem_block->dev);
 
/* Update memory regions for memory remove */
@@ -660,7 +659,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
 
rc = dlpar_online_lmb(lmb);
if (rc) {
-   __remove_memory(nid, lmb->base_addr, block_sz);
+   __remove_memory(lmb->base_addr, block_sz);
invalidate_lmb_associativity_index(lmb);
} else {
lmb->flags |= DRCONF_MEM_ASSIGNED;
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 8cc195c4c861..1d01d9414c40 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -239,19 +239,14 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
 
 static void acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 {
-   acpi_handle handle = mem_device->device->handle;
struct acpi_memory_info *info, *n;
-   int nid = acpi_get_node(handle);
 
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
if (!info->enabled)
continue;
 
-   if (nid == NUMA_NO_NODE)
-   nid = memory_add_physaddr_to_nid(info->start_addr);
-
acpi_unbind_memory_blocks(info);
-   __remove_memory(nid, info->start_addr, info->length);
+   __remove_memory(info->start_addr, info->length);
list_del(&info->list);
kfree(info);
}
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index ac231cc36359..99e0f60c4c26 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -156,8 +156,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
if (rc)
continue;
 
-   rc = remove_memory(dev_dax->target_node, range.start,
-   range_len(&range));
+   rc = remove_memory(range.start, range_len(&range));
if (rc == 0) {
release_resource(data->res[i]);
kfree(data->res[i]);
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 09ed55de07d7..774986695dc4 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -677,7 +677,7 @@ static int virtio_mem_remove_memory(struct virtio_mem *vm, 
uint64_t addr,
 
dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", 

[PATCH v3 1/1] powerpc/pseries: Interface to represent PAPR firmware attributes

2021-07-12 Thread Pratik R. Sampat
Adds a generic interface to represent the energy and frequency related
PAPR attributes on the system using the new H_CALL
"H_GET_ENERGY_SCALE_INFO".

H_GET_EM_PARMS H_CALL was previously responsible for exporting this
information in the lparcfg, however the H_GET_EM_PARMS H_CALL
will be deprecated P10 onwards.

The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
hcall(
  uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
  uint64 flags,   // Per the flag request
  uint64 firstAttributeId,// The attribute id
  uint64 bufferAddress,   // Guest physical address of the output buffer
  uint64 bufferSize   // The size in bytes of the output buffer
);

This H_CALL can query either all the attributes at once with
firstAttributeId = 0, flags = 0 as well as query only one attribute
at a time with firstAttributeId = id, flags = 1.
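
For illustration only (not part of the patch; the helper name and error
handling below are invented, while H_GET_ENERGY_SCALE_INFO and MAX_BUF_SZ are
the definitions this patch adds to hvcall.h), the two query modes map onto a
firmware call roughly like this:

#include <asm/hvcall.h>
#include <asm/io.h>

/* Hedged sketch of the two query modes described above. */
static long query_energy_scale_info(u64 flags, u64 first_attr_id,
				    void *buf, size_t size)
{
	return plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, flags,
				  first_attr_id, virt_to_phys(buf), size);
}

static long query_all_vs_one(void *buf)
{
	long rc;

	/* all attributes at once: firstAttributeId = 0, flags = 0 */
	rc = query_energy_scale_info(0, 0, buf, MAX_BUF_SZ);
	if (rc != H_SUCCESS)
		return rc;

	/* a single attribute: firstAttributeId = id, flags = 1 */
	return query_energy_scale_info(1, 42 /* example id */, buf, MAX_BUF_SZ);
}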

The output buffer consists of the following
1. number of attributes  - 8 bytes
2. array offset to the data location - 8 bytes
3. version info  - 1 byte
4. A data array of size num attributes, which contains the following:
  a. attribute ID  - 8 bytes
  b. attribute value in number - 8 bytes
  c. attribute name in string  - 64 bytes
  d. attribute value in string - 64 bytes
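
A rough C view of that layout, for illustration; field names here are
approximate and the patch's own additions to hvcall.h are the authoritative
definitions:

/* Rough C view of the output buffer listed above (illustrative). */
struct energy_scale_attribute {
	__be64 id;			/* a. attribute ID */
	__be64 value;			/* b. attribute value in number */
	unsigned char desc[64];		/* c. attribute name in string */
	unsigned char value_desc[64];	/* d. attribute value in string */
} __packed;

struct h_energy_scale_info_hdr {
	__be64 num_attrs;		/* 1. number of attributes */
	__be64 array_offset;		/* 2. offset to the data array */
	__u8 data_header_version;	/* 3. version info */
} __packed;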

The new H_CALL exports information in direct string value format, hence
a new interface has been introduced in
/sys/firmware/papr/energy_scale_info to export this information to
userspace in an extensible pass-through format.

The H_CALL returns the name, numeric value and string value (if exists)

The format of exposing the sysfs information is as follows:
/sys/firmware/papr/energy_scale_info/
   |-- /
 |-- desc
 |-- value
 |-- value_desc (if exists)
   |-- /
 |-- desc
 |-- value
 |-- value_desc (if exists)
...

The energy information that is exported is useful for userspace tools
such as powerpc-utils. Currently these tools infer the
"power_mode_data" value in the lparcfg, which in turn is obtained from
the to be deprecated H_GET_EM_PARMS H_CALL.
On future platforms, such userspace utilities will have to look at the
data returned from the new H_CALL being populated in this new sysfs
interface and report this information directly without the need of
interpretation.

Signed-off-by: Pratik R. Sampat 
Reviewed-by: Gautham R. Shenoy 
---
 .../sysfs-firmware-papr-energy-scale-info |  26 ++
 arch/powerpc/include/asm/hvcall.h |  24 +-
 arch/powerpc/kvm/trace_hv.h   |   1 +
 arch/powerpc/platforms/pseries/Makefile   |   3 +-
 .../pseries/papr_platform_attributes.c| 320 ++
 5 files changed, 372 insertions(+), 2 deletions(-)
 create mode 100644 
Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
 create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c

diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info 
b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
new file mode 100644
index ..fd82f2bfafe5
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
@@ -0,0 +1,26 @@
+What:  /sys/firmware/papr/energy_scale_info
+Date:  June 2021
+Contact:   Linux for PowerPC mailing list 
+Description:   Directory hosting a set of platform attributes like
+   energy/frequency on Linux running as a PAPR guest.
+
+   Each file in a directory contains a platform
+   attribute hierarchy pertaining to performance/
+   energy-savings mode and processor frequency.
+
+What:  /sys/firmware/papr/energy_scale_info/
+   /sys/firmware/papr/energy_scale_info//desc
+   /sys/firmware/papr/energy_scale_info//value
+   /sys/firmware/papr/energy_scale_info//value_desc
+Date:  June 2021
+Contact:   Linux for PowerPC mailing list 
+Description:   Energy, frequency attributes directory for POWERVM servers
+
+   This directory provides energy, erequency, folding information. 
It
+   contains below sysfs attributes:
+
+   - desc: String description of the attribute 
+
+   - value: Numeric value of attribute 
+
+   - value_desc: String value of attribute 
diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index e3b29eda8074..c91714ea6719 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -316,7 +316,8 @@
 #define H_SCM_PERFORMANCE_STATS 0x418
 #define H_RPT_INVALIDATE   0x448
 #define H_SCM_FLUSH0x44C
-#define MAX_HCALL_OPCODE   H_SCM_FLUSH
+#define H_GET_ENERGY_SCALE_INFO0x450
+#define MAX_HCALL_OPCODE   H_GET_ENERGY_SCALE_INFO
 
 /* Scope args for H_SCM_UNBIND_ALL */
 #define H_UNBIND_SCOPE_ALL (0x1)
@@ -631,6 +632,27 @@ struct hv_gpci_request_buffer {
uint8_t bytes[HGPCI_MAX_DATA_BYTES];
 } __packed;
 

[PATCH v3 0/1] Interface to represent PAPR firmware attributes

2021-07-12 Thread Pratik R. Sampat
RFC: https://lkml.org/lkml/2021/6/4/791
PATCH v1: https://lkml.org/lkml/2021/6/16/805
PATCH v2: https://lkml.org/lkml/2021/7/6/138

Changelog v2 --> v3
Based on a comment from Gautham:
1. Added a versioning check after the H_CALL is made to bail out when
   the version from the firmware is inconsistent with that in the kernel

Also, I have implemented a POC using this interface for the powerpc-utils'
ppc64_cpu --frequency command-line tool to utilize this information
in userspace.
The POC for the new interface has been hosted here:
https://github.com/pratiksampat/powerpc-utils/tree/H_GET_ENERGY_SCALE_INFO_v2

Sample output from the powerpc-utils tool is as follows:

# ppc64_cpu --frequency
Power and Performance Mode: 
Idle Power Saver Status   : 
Processor Folding Status  :  --> Printed if Idle power save status is 
supported

Platform reported frequencies --> Frequencies reported from the platform's 
H_CALL i.e PAPR interface
min: GHz
max: GHz
static : GHz

Tool Computed frequencies
min: GHz (cpu XX)
max: GHz (cpu XX)
avg: GHz

Pratik R. Sampat (1):
  powerpc/pseries: Interface to represent PAPR firmware attributes

 .../sysfs-firmware-papr-energy-scale-info |  26 ++
 arch/powerpc/include/asm/hvcall.h |  24 +-
 arch/powerpc/kvm/trace_hv.h   |   1 +
 arch/powerpc/platforms/pseries/Makefile   |   3 +-
 .../pseries/papr_platform_attributes.c| 320 ++
 5 files changed, 372 insertions(+), 2 deletions(-)
 create mode 100644 
Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
 create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c

-- 
2.31.1



Re: [PATCH] ASoC: fsl_xcvr: Omit superfluous error message in fsl_xcvr_probe()

2021-07-12 Thread Mark Brown
On Thu, 24 Jun 2021 18:45:05 +0800, Tang Bin wrote:
> In the function fsl_xcvr__probe(), when get irq failed,
> the function platform_get_irq() logs an error message, so remove
> redundant message here.

Applied to

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next

Thanks!

[1/1] ASoC: fsl_xcvr: Omit superfluous error message in fsl_xcvr_probe()
  commit: 8620c40002db9679279546cc3be2aceb8c5e5e76

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark


Re: [PATCH] powerpc/perf: Fix cycles/instructions as PM_CYC/PM_INST_CMPL in power10

2021-07-12 Thread Athira Rajeev



> On 08-Jul-2021, at 9:13 PM, Paul A. Clarke  wrote:
> 
> On Thu, Jul 08, 2021 at 10:56:57PM +1000, Nicholas Piggin wrote:
>> Excerpts from Athira Rajeev's message of July 7, 2021 4:39 pm:
>>> From: Athira Rajeev 
>>> 
>>> Power10 performance monitoring unit (PMU) driver uses performance
>>> monitor counter 5 (PMC5) and performance monitor counter 6 (PMC6)
>>> for counting instructions and cycles. Event used for cycles is
>>> PM_RUN_CYC and instructions is PM_RUN_INST_CMPL. But counting of these
>>> events in wait state is controlled by the CC56RUN bit setting in
>>> Monitor Mode Control Register0 (MMCR0). If the CC56RUN bit is not
>>> set, PMC5/6 will not count when CTRL[RUN] is zero.
>> 
> What's the actual bug here, can you explain a bit more? I thought
>> PM_RUN_CYC is supposed to be gated by the runlatch.
> 
> Would this renaming break compatibility with existing tools that
> presume PM_RUN_CYC and PM_RUN_INST_CMPL exist generically?


Hi Paul,

Thanks for checking the patch.

No, this does not break compatibility with existing tools, since the change is 
only for PMC5 and PMC6. The PM_RUN_CYC and PM_RUN_INST_CMPL events still behave 
the same way since they are programmed in different PMCs.

Thanks
Athira
> 
> PC



Re: [PATCH] powerpc/perf: Fix cycles/instructions as PM_CYC/PM_INST_CMPL in power10

2021-07-12 Thread Athira Rajeev



> On 08-Jul-2021, at 6:26 PM, Nicholas Piggin  wrote:
> 
> Excerpts from Athira Rajeev's message of July 7, 2021 4:39 pm:
>> From: Athira Rajeev 
>> 
>> Power10 performance monitoring unit (PMU) driver uses performance
>> monitor counter 5 (PMC5) and performance monitor counter 6 (PMC6)
>> for counting instructions and cycles. Event used for cycles is
>> PM_RUN_CYC and instructions is PM_RUN_INST_CMPL. But counting of these
>> events in wait state is controlled by the CC56RUN bit setting in
>> Monitor Mode Control Register0 (MMCR0). If the CC56RUN bit is not
>> set, PMC5/6 will not count when CTRL[RUN] is zero.
> 
> What's the actual bug here, can you explain a bit more? I thought
> PM_RUN_CYC is supposed to be gated by the runlatch.
> 
> POWER9 code looks similar, it doesn't have the same problem?
> 
> Thanks,
> Nick

Hi Nick,

Thanks for the review.

In power9 and before, the default event used for counting "cycles" is PM_CYC 
(0x0001e) and for counting instructions it is PM_INST_CMPL (0x2). 
These events use two programmable PMCs to count cycles and instructions (which 
causes multiplexing for the basic set of events supported by perf stat). 
And PM_CYC/PM_INST_CMPL both count by default, irrespective of the run latch 
state.

So in power10, we decided to use PMC5 and PMC6 for counting instructions and 
cycles respectively. But PMC5 and PMC6 by default count only when the runlatch 
is enabled, which can cause issues in case of system wide profiling. So in 
order to make PMC5/6 behave as PM_CYC and PM_INST_CMPL, we are enabling the 
MMCR0[CC56RUN] bit.
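
(Illustration only, not the actual hunk, which lives in the Power10 MMCR
computation; in reg.h terms the bit is MMCR0_C56RUN.)

#include <asm/reg.h>

/* Sketch: with C56RUN set, PMC5/6 count regardless of CTRL[RUN]. */
static inline unsigned long mmcr0_count_pmc56_always(unsigned long mmcr0)
{
	return mmcr0 | MMCR0_C56RUN;	/* PMC5/6 no longer gated by runlatch */
}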

Thanks
Athira
> 
>> 
>> Patch sets the CC56RUN bit in MMCR0 for power10 which makes PMC5 and
>> PMC6 count instructions and cycles regardless of the run bit. With this
>> change, these events are also now renamed to PM_CYC and PM_INST_CMPL
>> rather than PM_RUN_CYC and PM_RUN_INST_CMPL.
>> 
>> Fixes: a64e697cef23 ("powerpc/perf: power10 Performance Monitoring support")
>> Signed-off-by: Athira Rajeev 
>> Reviewed-by: Madhavan Srinivasan 
>> ---
>> Notes on testing done for this change:
>> Tested this patch change with a kernel module that
>> turns off and turns on the runlatch. kernel module also
>> reads the counter values for PMC5 and PMC6 during the
>> period when runlatch is off.
>> - Started PMU counters via "perf stat" and loaded the
>>   test module.
>> - Checked the counter values captured from module during
>>   the runlatch off period.
>> - Verified that counters were frozen without the patch and
>>   with the patch, observed counters were incrementing.
>> 
>> arch/powerpc/perf/power10-events-list.h |  8 +++---
>> arch/powerpc/perf/power10-pmu.c | 44 
>> +++--
>> 2 files changed, 35 insertions(+), 17 deletions(-)
>> 
>> diff --git a/arch/powerpc/perf/power10-events-list.h 
>> b/arch/powerpc/perf/power10-events-list.h
>> index 93be719..564f1409 100644
>> --- a/arch/powerpc/perf/power10-events-list.h
>> +++ b/arch/powerpc/perf/power10-events-list.h
>> @@ -9,10 +9,10 @@
>> /*
>>  * Power10 event codes.
>>  */
>> -EVENT(PM_RUN_CYC,   0x600f4);
>> +EVENT(PM_CYC,   0x600f4);
>> EVENT(PM_DISP_STALL_CYC, 0x100f8);
>> EVENT(PM_EXEC_STALL, 0x30008);
>> -EVENT(PM_RUN_INST_CMPL, 0x500fa);
>> +EVENT(PM_INST_CMPL, 0x500fa);
>> EVENT(PM_BR_CMPL,   0x4d05e);
>> EVENT(PM_BR_MPRED_CMPL, 0x400f6);
>> EVENT(PM_BR_FIN, 0x2f04a);
>> @@ -50,8 +50,8 @@
>> /* ITLB Reloaded */
>> EVENT(PM_ITLB_MISS,  0x400fc);
>> 
>> -EVENT(PM_RUN_CYC_ALT,   0x0001e);
>> -EVENT(PM_RUN_INST_CMPL_ALT, 0x2);
>> +EVENT(PM_CYC_ALT,   0x0001e);
>> +EVENT(PM_INST_CMPL_ALT, 0x2);
>> 
>> /*
>>  * Memory Access Events
>> diff --git a/arch/powerpc/perf/power10-pmu.c 
>> b/arch/powerpc/perf/power10-pmu.c
>> index f9d64c6..9dd75f3 100644
>> --- a/arch/powerpc/perf/power10-pmu.c
>> +++ b/arch/powerpc/perf/power10-pmu.c
>> @@ -91,8 +91,8 @@
>> 
>> /* Table of alternatives, sorted by column 0 */
>> static const unsigned int power10_event_alternatives[][MAX_ALT] = {
>> -{ PM_RUN_CYC_ALT,   PM_RUN_CYC },
>> -{ PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL },
>> +{ PM_CYC_ALT,   PM_CYC },
>> +{ PM_INST_CMPL_ALT, PM_INST_CMPL },
>> };
>> 
>> static int power10_get_alternatives(u64 event, unsigned int flags, u64 alt[])
>> @@ -118,8 +118,8 @@ static int power10_check_attr_config(struct perf_event 
>> *ev)
>>  return 0;
>> }
>> 
>> -GENERIC_EVENT_ATTR(cpu-cycles,  PM_RUN_CYC);
>> -GENERIC_EVENT_ATTR(instructions,PM_RUN_INST_CMPL);
>> +GENERIC_EVENT_ATTR(cpu-cycles,  PM_CYC);
>> +GENERIC_EVENT_ATTR(instructions,

Re: [PATCH V3 1/1] powerpc/perf: Fix PMU callbacks to clear pending PMI before resetting an overflown PMC

2021-07-12 Thread Athira Rajeev



> On 12-Jul-2021, at 8:07 AM, Nicholas Piggin  wrote:
> 
> Excerpts from Athira Rajeev's message of July 11, 2021 1:58 am:
>> Running perf fuzzer showed below in dmesg logs:
>> "Can't find PMC that caused IRQ"
>> 
>> This means a PMU exception happened, but none of the PMC's (Performance
>> Monitor Counter) were found to be overflown. There are some corner cases
>> that clears the PMCs after PMI gets masked. In such cases, the perf
>> interrupt handler will not find the active PMC values that had caused
>> the overflow and thus leads to this message while replaying.
>> 
>> Case 1: PMU Interrupt happens during replay of other interrupts and
>> counter values gets cleared by PMU callbacks before replay:
>> 
>> During replay of interrupts like timer, __do_irq and doorbell exception, we
>> conditionally enable interrupts via may_hard_irq_enable(). This could
>> potentially create a window to generate a PMI. Since irq soft mask is set
>> to ALL_DISABLED, the PMI will get masked here. We could get IPIs run before
>> perf interrupt is replayed and the PMU events could deleted or stopped.
>> This will change the PMU SPR values and resets the counters. Snippet of
>> ftrace log showing PMU callbacks invoked in "__do_irq":
>> 
>> -0 [051] dns. 132025441306354: __do_irq <-call_do_irq
>> -0 [051] dns. 132025441306430: irq_enter <-__do_irq
>> -0 [051] dns. 132025441306503: irq_enter_rcu <-__do_irq
>> -0 [051] dnH. 132025441306599: xive_get_irq <-__do_irq
>> <<>>
>> -0 [051] dnH. 132025441307770: 
>> generic_smp_call_function_single_interrupt <-smp_ipi_demux_relaxed
>> -0 [051] dnH. 132025441307839: flush_smp_call_function_queue 
>> <-smp_ipi_demux_relaxed
>> -0 [051] dnH. 132025441308057: _raw_spin_lock <-event_function
>> -0 [051] dnH. 132025441308206: power_pmu_disable <-perf_pmu_disable
>> -0 [051] dnH. 132025441308337: power_pmu_del <-event_sched_out
>> -0 [051] dnH. 132025441308407: power_pmu_read <-power_pmu_del
>> -0 [051] dnH. 132025441308477: read_pmc <-power_pmu_read
>> -0 [051] dnH. 132025441308590: isa207_disable_pmc <-power_pmu_del
>> -0 [051] dnH. 132025441308663: write_pmc <-power_pmu_del
>> -0 [051] dnH. 132025441308787: power_pmu_event_idx 
>> <-perf_event_update_userpage
>> -0 [051] dnH. 132025441308859: rcu_read_unlock_strict 
>> <-perf_event_update_userpage
>> -0 [051] dnH. 132025441308975: power_pmu_enable <-perf_pmu_enable
>> <<>>
>> -0 [051] dnH. 132025441311108: irq_exit <-__do_irq
>> -0 [051] dns. 132025441311319: performance_monitor_exception 
>> <-replay_soft_interrupts
>> 
>> Case 2: PMI's masked during local_* operations, example local_add.
>> If the local_add operation happens within a local_irq_save, replay of
>> PMI will be during local_irq_restore. Similar to case 1, this could
>> also create a window before replay where PMU events gets deleted or
>> stopped.
>> 
>> Patch adds a fix to update the PMU callback function 'power_pmu_disable' to
>> check for pending perf interrupt. If there is an overflown PMC and pending
>> perf interrupt indicated in Paca, clear the PMI bit in paca to drop that
>> sample. Clearing of PMI bit is done in 'power_pmu_disable' since disable is
>> invoked before any event gets deleted/stopped. With this fix, if there are
>> more than one event running in the PMU, there is a chance that we clear the
>> PMI bit for the event which is not getting deleted/stopped. The other
>> events may still remain active. Hence to make sure we don't drop valid
>> sample in such cases, another check is added in power_pmu_enable. This
>> checks if there is an overflown PMC found among the active events and if
>> so enable back the PMI bit. Two new helper functions are introduced to
>> clear/set the PMI, ie 'clear_pmi_irq_pending' and 'set_pmi_irq_pending'.
>> 
>> Also there are corner cases which results in performance monitor interrupts
>> getting triggered during power_pmu_disable. This happens since PMXE bit is
>> not cleared along with disabling of other MMCR0 bits in the pmu_disable.
>> Such PMI's could leave the PMU running and could trigger PMI again which
>> will set MMCR0 PMAO bit. This could lead to spurious interrupts in some
>> corner cases. Example, a timer after power_pmu_del which will re-enable
>> interrupts and triggers a PMI again since PMAO bit is still set. But fails
>> to find valid overflow since PMC get cleared in power_pmu_del. Patch
>> fixes this by disabling PMXE along with disabling of other MMCR0 bits
>> in power_pmu_disable.
>> 
>> We can't just replay PMI any time. Hence this approach is preferred rather
>> than replaying PMI before resetting overflown PMC. Patch also documents
>> core-book3s on a race condition which can trigger these PMC messages during
>> idle path in PowerNV.
>> 
>> Fixes: f442d004806e ("powerpc/64s: Add support to mask perf interrupts and 
>> replay them")
>> Reported-by: Nageswara R Sastry 
>> Suggested-by: Nicholas Piggin 
>> Suggested-by: Madhavan Srinivasan 
>> Signed-off-by: Athira Rajeev 
>> ---
>> 

[PATCH] powerpc/papr_scm: Implement initial support for injecting smart errors

2021-07-12 Thread Vaibhav Jain
Presently PAPR doesn't support injecting smart errors on an
NVDIMM. This makes testing the NVDIMM health reporting functionality
difficult, as simulating NVDIMM health related events needs a hacked-up
qemu version.

To solve this problem, this patch proposes simulating a certain set of
NVDIMM health related events in papr_scm, specifically the 'fatal' health
state and the 'dirty' shutdown state. These errors can be injected via the
user-space 'ndctl-inject-smart(1)' command. With the proposed patch and
corresponding ndctl patches, the following command flow is expected:

$ sudo ndctl list -DH -d nmem0
...
  "health_state":"ok",
  "shutdown_state":"clean",
...
 # inject unsafe shutdown and fatal health error
$ sudo ndctl inject-smart nmem0 -Uf
...
  "health_state":"fatal",
  "shutdown_state":"dirty",
...
 # uninject all errors
$ sudo ndctl inject-smart nmem0 -N
...
  "health_state":"ok",
  "shutdown_state":"clean",
...

The patch adds two members 'health_bitmap_mask' and
'health_bitmap_override' inside struct papr_scm_priv which are then
bit blt'ed[1] to the health bitmaps fetched from the hypervisor. In case
we are not able to fetch health information from the hypervisor we
service the health bitmap from these two members. These members are
accessible from sysfs at nmemX/papr/health_bitmap_override

A new PDSM named 'SMART_INJECT' is proposed that accepts the newly
introduced 'struct nd_papr_pdsm_smart_inject' as the payload that's
exchanged between libndctl and papr_scm to indicate the requested
smart-error states.

When processing the PDSM 'SMART_INJECT', papr_pdsm_smart_inject()
constructs a pair of 'mask' and 'override' bitmaps from the payload
and bit-blts them into the 'health_bitmap_{mask, override}' members. This
ensures that, after being fetched from the hypervisor, the health_bitmap
reflects the requested smart-error states.
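
As a small worked example of that scheme (the bit values are made up for
illustration; the macro is the one this patch adds):

#include <linux/types.h>

/* Same macro as in the patch below, repeated here for a worked example. */
#define BITBLT_BITMAP(_bitmap_, _mask_, _override_) \
	(((_bitmap_) & ~(_mask_)) | ((_mask_) & (_override_)))

static u64 injected_health_bitmap_example(void)
{
	u64 from_phyp = 0x1;	/* bitmap as returned by H_SCM_HEALTH */
	u64 mask      = 0x6;	/* bits the injection wants to control */
	u64 override  = 0x2;	/* forced values for those bits */

	/* bit 0 untouched, bit 1 forced on, bit 2 forced off => 0x3 */
	return BITBLT_BITMAP(from_phyp, mask, override);
}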

The patch is based on [2] "powerpc/papr_scm: Move duplicate
definitions to common header files".

[1] : https://en.wikipedia.org/wiki/Bit_blit
[2] :
https://lore.kernel.org/nvdimm/162505488483.72147.12741153746322191381.stgit@56e104a48989

Signed-off-by: Vaibhav Jain 
Signed-off-by: Shivaprasad G Bhat 
---
 arch/powerpc/platforms/pseries/papr_scm.c | 94 ++-
 include/uapi/linux/papr_pdsm.h| 18 +
 2 files changed, 109 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index 0c56db5a1427..b7437c61a270 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -29,6 +29,10 @@
 (1ul << ND_CMD_SET_CONFIG_DATA) | \
 (1ul << ND_CMD_CALL))
 
+/* Use bitblt method to override specific bits in the '_bitmap_' */
+#define BITBLT_BITMAP(_bitmap_, _mask_, _override_)\
+   (((_bitmap_) & ~(_mask_)) | ((_mask_) & (_override_)))
+
 /* Struct holding a single performance metric */
 struct papr_scm_perf_stat {
u8 stat_id[8];
@@ -81,6 +85,12 @@ struct papr_scm_priv {
 
/* length of the stat buffer as expected by phyp */
size_t stat_buffer_len;
+
+   /* The bits which needs to be overridden */
+   u64 health_bitmap_mask;
+
+   /* The overridden values for the bits having the masks set */
+   u64 health_bitmap_override;
 };
 
 static int papr_scm_pmem_flush(struct nd_region *nd_region,
@@ -308,19 +318,28 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv 
*p,
 static int __drc_pmem_query_health(struct papr_scm_priv *p)
 {
unsigned long ret[PLPAR_HCALL_BUFSIZE];
+   u64 bitmap = 0;
long rc;
 
/* issue the hcall */
rc = plpar_hcall(H_SCM_HEALTH, ret, p->drc_index);
-   if (rc != H_SUCCESS) {
+   if (rc == H_SUCCESS)
+   bitmap = ret[0] & ret[1];
+   else if (rc == H_FUNCTION)
dev_info_once(&p->pdev->dev,
+ "Hcall H_SCM_HEALTH not implemented, assuming 
empty health bitmap");
+   else {
+
dev_err(&p->pdev->dev,
"Failed to query health information, Err:%ld\n", rc);
return -ENXIO;
}
 
p->lasthealth_jiffies = jiffies;
-   p->health_bitmap = ret[0] & ret[1];
-
+   /* Allow overriding specific health bits via bit blt. */
+   bitmap = BITBLT_BITMAP(bitmap, p->health_bitmap_mask,
+  p->health_bitmap_override);
+   WRITE_ONCE(p->health_bitmap, bitmap);
dev_dbg(&p->pdev->dev,
"Queried dimm health info. Bitmap:0x%016lx Mask:0x%016lx\n",
ret[0], ret[1]);
@@ -630,6 +649,54 @@ static int papr_pdsm_health(struct papr_scm_priv *p,
return rc;
 }
 
+/* Inject a smart error Add the dirty-shutdown-counter value to the pdsm */
+static int papr_pdsm_smart_inject(struct papr_scm_priv *p,
+ union nd_pdsm_payload *payload)
+{
+   int rc;
+   u32 supported_flags = 0;
+   u64 mask = 0, override 

Re: [PATCH] memblock: make for_each_mem_range() traverse MEMBLOCK_HOTPLUG regions

2021-07-12 Thread Greg Kurz
On Mon, 12 Jul 2021 10:11:32 +0300
Mike Rapoport  wrote:

> From: Mike Rapoport 
> 
> Commit b10d6bca8720 ("arch, drivers: replace for_each_membock() with
> for_each_mem_range()") didn't take into account that when there is
> movable_node parameter in the kernel command line, for_each_mem_range()
> would skip ranges marked with MEMBLOCK_HOTPLUG.
> 
> The page table setup code in POWER uses for_each_mem_range() to create the
> linear mapping of the physical memory and since the regions marked as
> MEMORY_HOTPLUG are skipped, they never make it to the linear map.
> 
> A later access to the memory in those ranges will fail:
> 
> [2.271743] BUG: Unable to handle kernel data access on write at 
> 0xc004
> [2.271984] Faulting instruction address: 0xc008a3c0
> [2.272568] Oops: Kernel access of bad area, sig: 11 [#1]
> [2.272683] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> [2.273063] Modules linked in:
> [2.273435] CPU: 0 PID: 53 Comm: kworker/u2:0 Not tainted 5.13.0 #7
> [2.273832] NIP:  c008a3c0 LR: c03c1ed8 CTR: 
> 0040
> [2.273918] REGS: c8a57770 TRAP: 0300   Not tainted  (5.13.0)
> [2.274036] MSR:  82009033   CR: 
> 8402  XER: 2004
> [2.274454] CFAR: c03c1ed4 DAR: c004 DSISR: 4200 
> IRQMASK: 0
> [2.274454] GPR00: c03c1ed8 c8a57a10 c19da700 
> c004
> [2.274454] GPR04: 0280 0180 0400 
> 0200
> [2.274454] GPR08: 0100 0080 0040 
> 0300
> [2.274454] GPR12: 0380 c1bc c01660c8 
> c6337e00
> [2.274454] GPR16:    
> 
> [2.274454] GPR20: 4000 2000 c1a81990 
> c8c3
> [2.274454] GPR24: c8c2 c1a81998 000f 
> c1a819a0
> [2.274454] GPR28: c1a81908 c00c0100 c8c4 
> c8a64680
> [2.275520] NIP [c008a3c0] clear_user_page+0x50/0x80
> [2.276333] LR [c03c1ed8] __handle_mm_fault+0xc88/0x1910
> [2.276688] Call Trace:
> [2.276839] [c8a57a10] [c03c1e94] 
> __handle_mm_fault+0xc44/0x1910 (unreliable)
> [2.277142] [c8a57af0] [c03c2c90] 
> handle_mm_fault+0x130/0x2a0
> [2.277331] [c8a57b40] [c03b5f08] 
> __get_user_pages+0x248/0x610
> [2.277541] [c8a57c40] [c03b848c] 
> __get_user_pages_remote+0x12c/0x3e0
> [2.277768] [c8a57cd0] [c0473f24] get_arg_page+0x54/0xf0
> [2.277959] [c8a57d10] [c0474a7c] 
> copy_string_kernel+0x11c/0x210
> [2.278159] [c8a57d80] [c047663c] kernel_execve+0x16c/0x220
> [2.278361] [c8a57dd0] [c0166270] 
> call_usermodehelper_exec_async+0x1b0/0x2f0
> [2.278543] [c8a57e10] [c000d5ec] 
> ret_from_kernel_thread+0x5c/0x70
> [2.278870] Instruction dump:
> [2.279214] 79280fa4 79271764 79261f24 794ae8e2 7ca94214 7d683a14 7c893a14 
> 7d893050
> [2.279416] 7d4903a6 6000 6000 6000 <7c001fec> 7c091fec 
> 7c081fec 7c051fec
> [2.280193] ---[ end trace 490b8c67e6075e09 ]---
> 
> Making for_each_mem_range() include MEMBLOCK_HOTPLUG regions in the
> traversal fixes this issue.
> 
> Link: https://bugzilla.redhat.com/show_bug.cgi?id=1976100
> Fixes: b10d6bca8720 ("arch, drivers: replace for_each_membock() with 
> for_each_mem_range()")
> Signed-off-by: Mike Rapoport 
> ---

This fixes the issue I was observing with both radix and hash.

Thanks !

Tested-by: Greg Kurz 

Cc'ing linuxppc-dev so that POWER folks know about the fix
and stable.

Cc: sta...@vger.kernel.org # v5.10

>  include/linux/memblock.h | 4 ++--
>  mm/memblock.c| 3 ++-
>  2 files changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index cbf46f56d105..4a53c3ca86bd 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -209,7 +209,7 @@ static inline void __next_physmem_range(u64 *idx, struct 
> memblock_type *type,
>   */
>  #define for_each_mem_range(i, p_start, p_end) \
>   __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,   \
> -  MEMBLOCK_NONE, p_start, p_end, NULL)
> +  MEMBLOCK_HOTPLUG, p_start, p_end, NULL)
>  
>  /**
>   * for_each_mem_range_rev - reverse iterate through memblock areas from
> @@ -220,7 +220,7 @@ static inline void __next_physmem_range(u64 *idx, struct 
> memblock_type *type,
>   */
>  #define for_each_mem_range_rev(i, p_start, p_end)\
>   __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \
> -  MEMBLOCK_NONE, p_start, p_end, NULL)
> +  MEMBLOCK_HOTPLUG, p_start, 

Re: [PATCH v2 1/1] powerpc/pseries: Interface to represent PAPR firmware attributes

2021-07-12 Thread Pratik Sampat




On 08/07/21 4:05 pm, Gautham R Shenoy wrote:

Hello Pratik,

On Tue, Jul 06, 2021 at 01:54:00PM +0530, Pratik R. Sampat wrote:

Adds a generic interface to represent the energy and frequency related
PAPR attributes on the system using the new H_CALL
"H_GET_ENERGY_SCALE_INFO".

The H_GET_EM_PARMS H_CALL was previously responsible for exporting this
information in lparcfg; however, it will be deprecated from P10 onwards.

The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
hcall(
   uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
   uint64 flags,   // Per the flag request
   uint64 firstAttributeId,// The attribute id
   uint64 bufferAddress,   // Guest physical address of the output buffer
   uint64 bufferSize   // The size in bytes of the output buffer
);

This H_CALL can either query all the attributes at once, with
firstAttributeId = 0 and flags = 0, or query a single attribute at a
time, with firstAttributeId = id and flags = 1.
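
A minimal sketch of the two modes, assuming the buffer definitions from the
patch below; the flags encoding follows the description above, and only
ESI_FLAGS_ALL actually appears in the posted code:

static long esi_query(char *esi_buf, u64 first_attr_id, bool single)
{
	u64 flags = single ? 1 : 0;	/* 1: one attribute, 0: all attributes */

	return plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, flags,
				  first_attr_id, virt_to_phys(esi_buf),
				  MAX_BUF_SZ);
}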

The output buffer consists of the following
1. number of attributes  - 8 bytes
2. array offset to the data location - 8 bytes
3. version info  - 1 byte
4. A data array of size num attributes, which contains the following:
   a. attribute ID  - 8 bytes
   b. attribute value in number - 8 bytes
   c. attribute name in string  - 64 bytes
   d. attribute value in string - 64 bytes
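
As an illustration (not part of the patch), the buffer can be walked with the
structs defined further down, locating the attribute array via array_offset
and converting the big-endian fields; string fields are assumed to be
NUL-terminated:

static void esi_dump(char *esi_buf)
{
	struct h_energy_scale_info_hdr *hdr = (void *)esi_buf;
	struct energy_scale_attribute *attr;
	u64 i, num_attrs = be64_to_cpu(hdr->num_attrs);

	/* The attribute array starts at array_offset, not right after the header */
	attr = (void *)(esi_buf + be64_to_cpu(hdr->array_offset));

	for (i = 0; i < num_attrs; i++, attr++)
		pr_info("attr %llu: %s = %llu (%s)\n",
			be64_to_cpu(attr->id), attr->desc,
			be64_to_cpu(attr->value), attr->value_desc);
}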

The new H_CALL exports information in direct string value format, hence
a new interface has been introduced in
/sys/firmware/papr/energy_scale_info to export this information to
userspace in an extensible pass-through format.

The H_CALL returns the name, the numeric value, and the string value (if
one exists) for each attribute.

The format of exposing the sysfs information is as follows:
/sys/firmware/papr/energy_scale_info/
|-- <id>/
  |-- desc
  |-- value
  |-- value_desc (if exists)
|-- <id>/
  |-- desc
  |-- value
  |-- value_desc (if exists)
...
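
Not from the patch itself, but as a generic sketch of how such a tree can be
built with the kobject/sysfs API (names like papr_kobj and esi_sysfs_sketch
are illustrative; the real interface creates one directory per attribute):

static struct kobject *papr_kobj, *esi_kobj;

static ssize_t desc_show(struct kobject *kobj, struct kobj_attribute *attr,
			 char *buf)
{
	return sysfs_emit(buf, "%s\n", "example description");
}
static struct kobj_attribute desc_attr = __ATTR_RO(desc);

static int __init esi_sysfs_sketch(void)
{
	papr_kobj = kobject_create_and_add("papr", firmware_kobj);
	if (!papr_kobj)
		return -ENOMEM;

	esi_kobj = kobject_create_and_add("energy_scale_info", papr_kobj);
	if (!esi_kobj)
		return -ENOMEM;

	/* the real interface creates one such file per attribute directory */
	return sysfs_create_file(esi_kobj, &desc_attr.attr);
}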


I like this implementation. Only one comment w.r.t versioning below:

[..snip..]


@@ -631,6 +632,26 @@ struct hv_gpci_request_buffer {
uint8_t bytes[HGPCI_MAX_DATA_BYTES];
  } __packed;

+#define MAX_ESI_ATTRS  10
+#define MAX_BUF_SZ (sizeof(struct h_energy_scale_info_hdr) + \
+   (sizeof(struct energy_scale_attribute) * MAX_ESI_ATTRS))
+
+struct energy_scale_attribute {
+   __be64 id;
+   __be64 value;
+   unsigned char desc[64];
+   unsigned char value_desc[64];
+} __packed;
+
+struct h_energy_scale_info_hdr {
+   __be64 num_attrs;
+   __be64 array_offset;
+   __u8 data_header_version;
+} __packed;
+


[..snip..]


+static int __init papr_init(void)
+{
+   uint64_t num_attrs;
+   int ret, idx, i;
+   char *esi_buf;
+
+   if (!firmware_has_feature(FW_FEATURE_LPAR))
+   return -ENXIO;
+
+   esi_buf = kmalloc(MAX_BUF_SZ, GFP_KERNEL);
+   if (esi_buf == NULL)
+   return -ENOMEM;
+   /*
+* hcall(
+* uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
+* uint64 flags,// Per the flag request
+* uint64 firstAttributeId, // The attribute id
+* uint64 bufferAddress,// Guest physical address of the output buffer
+* uint64 bufferSize);  // The size in bytes of the output buffer
+*/
+   ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_ALL, 0,
+virt_to_phys(esi_buf), MAX_BUF_SZ);


It is good that you are passing a character buffer here and
interpreting it later. This helps in the cases where the header has
more fields than what we are currently aware of. I think even if a
future platform adds newer fields to the header, those fields will be
appended after the existing ones, so we should still be able to
interpret the first three fields of the header that we are currently
aware of.



+   if (ret != H_SUCCESS) {
+   pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO");
+   goto out;
+   }
+
+   esi_hdr = (struct h_energy_scale_info_hdr *) esi_buf;
+   num_attrs = be64_to_cpu(esi_hdr->num_attrs);


Shouldn't we check esi_hdr->data_header_version here?
Currently we are only aware of version 1. If we happen to run this
kernel code on a future platform which supports a different version,
wouldn't it be safer to bail out here?

Otherwise this patch looks good to me.

Reviewed-by: Gautham R. Shenoy 


Thanks for the review, Gautham.
Sure, I'll make use of the header version to bail out.
That just seems like the safest approach.
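
For reference, a minimal sketch of that bail-out as it could sit in
papr_init() right after the hcall returns; ESI_HDR_VERSION is an assumed
constant, not something the posted patch defines:

#define ESI_HDR_VERSION	1	/* only version we currently understand */

	esi_hdr = (struct h_energy_scale_info_hdr *)esi_buf;
	if (esi_hdr->data_header_version != ESI_HDR_VERSION) {
		pr_warn("H_GET_ENERGY_SCALE_INFO: unsupported header version %u\n",
			esi_hdr->data_header_version);
		ret = -ENXIO;
		goto out;
	}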

I'll add that and spin a new version here

Thanks
Pratik


--
Thanks and Regards
gautham.