Re: [PATCH v7 24/28] powerpc: Test prefixed code patching

2020-05-05 Thread Alistair Popple
Reviewed-by: Alistair Popple 

On Friday, 1 May 2020 1:42:16 PM AEST Jordan Niethe wrote:
> Expand the code-patching self-tests to include tests for patching
> prefixed instructions.
> 
> Signed-off-by: Jordan Niethe 
> ---
> v6: New to series
> ---
>  arch/powerpc/lib/Makefile |  2 +-
>  arch/powerpc/lib/code-patching.c  | 21 +
>  arch/powerpc/lib/test_code-patching.S | 19 +++
>  3 files changed, 41 insertions(+), 1 deletion(-)
>  create mode 100644 arch/powerpc/lib/test_code-patching.S
> 
> diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
> index 546591848219..5e994cda8e40 100644
> --- a/arch/powerpc/lib/Makefile
> +++ b/arch/powerpc/lib/Makefile
> @@ -16,7 +16,7 @@ CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING
>  CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING
>  endif
> 
> -obj-y += alloc.o code-patching.o feature-fixups.o pmem.o inst.o
> +obj-y += alloc.o code-patching.o feature-fixups.o pmem.o inst.o
> test_code-patching.o
> 
>  ifndef CONFIG_KASAN
>  obj-y += string.o memcmp_$(BITS).o
> diff --git a/arch/powerpc/lib/code-patching.c
> b/arch/powerpc/lib/code-patching.c index b32fa707725e..7107c6d01261 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -699,6 +699,24 @@ static void __init test_translate_branch(void)
>   vfree(buf);
>  }
> 
> +#ifdef __powerpc64__
> +static void __init test_prefixed_patching(void)
> +{
> + extern unsigned int code_patching_test1[];
> + extern unsigned int code_patching_test1_expected[];
> + extern unsigned int end_code_patching_test1[];
> +
> + __patch_instruction((struct ppc_inst *)code_patching_test1,
> + ppc_inst_prefix(1 << 26, 0x),
> + (struct ppc_inst *)code_patching_test1);
> +
> + check(!memcmp(code_patching_test1,
> +   code_patching_test1_expected,
> +   sizeof(unsigned int) *
> +   (end_code_patching_test1 - code_patching_test1)));
> +}
> +#endif
> +
>  static int __init test_code_patching(void)
>  {
>   printk(KERN_DEBUG "Running code patching self-tests ...\n");
> @@ -707,6 +725,9 @@ static int __init test_code_patching(void)
>   test_branch_bform();
>   test_create_function_call();
>   test_translate_branch();
> +#ifdef __powerpc64__
> + test_prefixed_patching();
> +#endif
> 
>   return 0;
>  }
> diff --git a/arch/powerpc/lib/test_code-patching.S
> b/arch/powerpc/lib/test_code-patching.S new file mode 100644
> index ..91aab208a804
> --- /dev/null
> +++ b/arch/powerpc/lib/test_code-patching.S
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2020 IBM Corporation
> + */
> +
> + .text
> +
> +#define globl(x) \
> + .globl x;   \
> +x:
> +
> +globl(code_patching_test1)
> + nop
> + nop
> +globl(end_code_patching_test1)
> +
> +globl(code_patching_test1_expected)
> + .long 1 << 26
> + .long 0x000






Re: [PATCH v2 2/2] clk: qoriq: add cpufreq platform device

2020-05-05 Thread Stephen Boyd
Quoting Mian Yousaf Kaukab (2020-04-21 01:30:00)
> Add a platform device for the qoriq-cpufreq driver for the compatible
> clockgen blocks.
> 
> Reviewed-by: Yuantian Tang 
> Acked-by: Viresh Kumar 
> Signed-off-by: Mian Yousaf Kaukab 
> ---

Acked-by: Stephen Boyd 


Re: [PATCH v7 23/28] powerpc: Add prefixed instructions to instruction data type

2020-05-05 Thread Alistair Popple
When reviewing earlier patches in this series I assumed the data type would
eventually change size (on PPC64 at least), so I was looking for any possible
side effects this may cause. I didn't notice any, so I think this should be
ok:

Reviewed-by: Alistair Popple 

However I haven't dug deeply enough into the optprobes code to fully 
understand/comment on the changes there (although they look correct afaict).

On Friday, 1 May 2020 1:42:15 PM AEST Jordan Niethe wrote:
> For powerpc64, redefine the ppc_inst type so both word and prefixed
> instructions can be represented. On powerpc32 the type will remain the
> same.  Update places which had assumed instructions to be 4 bytes long.
> 
> Signed-off-by: Jordan Niethe 
> ---
> v4: New to series
> v5:  - Distinguish normal instructions from prefixed instructions with a
>0xff marker for the suffix.
>  - __patch_instruction() using std for prefixed instructions
> v6:  - Return false instead of 0 in ppc_inst_prefixed()
>  - Fix up types for ppc32 so it compiles
>  - remove ppc_inst_write()
>  - __patching_instruction(): move flush out of condition
> ---
>  arch/powerpc/include/asm/inst.h  | 68 +---
>  arch/powerpc/include/asm/kprobes.h   |  2 +-
>  arch/powerpc/include/asm/uaccess.h   | 32 -
>  arch/powerpc/include/asm/uprobes.h   |  2 +-
>  arch/powerpc/kernel/optprobes.c  | 42 +
>  arch/powerpc/kernel/optprobes_head.S |  3 ++
>  arch/powerpc/lib/code-patching.c | 13 --
>  arch/powerpc/lib/feature-fixups.c|  5 +-
>  arch/powerpc/lib/inst.c  | 40 
>  arch/powerpc/lib/sstep.c |  4 +-
>  arch/powerpc/xmon/xmon.c |  4 +-
>  arch/powerpc/xmon/xmon_bpts.S|  2 +
>  12 files changed, 180 insertions(+), 37 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/inst.h
> b/arch/powerpc/include/asm/inst.h index 2f3c9d5bcf7c..1e743635c214 100644
> --- a/arch/powerpc/include/asm/inst.h
> +++ b/arch/powerpc/include/asm/inst.h
> @@ -8,23 +8,72 @@
> 
>  struct ppc_inst {
>   u32 val;
> +#ifdef __powerpc64__
> + u32 suffix;
> +#endif /* __powerpc64__ */
>  } __packed;
> 
> -#define ppc_inst(x) ((struct ppc_inst){ .val = x })
> -
>  static inline u32 ppc_inst_val(struct ppc_inst x)
>  {
>   return x.val;
>  }
> 
> -static inline int ppc_inst_len(struct ppc_inst x)
> +static inline int ppc_inst_primary_opcode(struct ppc_inst x)
>  {
> - return sizeof(struct ppc_inst);
> + return ppc_inst_val(x) >> 26;
>  }
> 
> -static inline int ppc_inst_primary_opcode(struct ppc_inst x)
> +#ifdef __powerpc64__
> +#define ppc_inst(x) ((struct ppc_inst){ .val = (x), .suffix = 0xff })
> +
> +#define ppc_inst_prefix(x, y) ((struct ppc_inst){ .val = (x), .suffix = (y)
> }) +
> +static inline u32 ppc_inst_suffix(struct ppc_inst x)
>  {
> - return ppc_inst_val(x) >> 26;
> + return x.suffix;
> +}
> +
> +static inline bool ppc_inst_prefixed(struct ppc_inst x)
> +{
> + return (ppc_inst_primary_opcode(x) == 1) && ppc_inst_suffix(x) != 0xff;
> +}
> +
> +static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
> +{
> + return ppc_inst_prefix(swab32(ppc_inst_val(x)),
> +swab32(ppc_inst_suffix(x)));
> +}
> +
> +static inline struct ppc_inst ppc_inst_read(const struct ppc_inst *ptr)
> +{
> + u32 val, suffix;
> +
> + val = *(u32 *)ptr;
> + if ((val >> 26) == 1) {
> + suffix = *((u32 *)ptr + 1);
> + return ppc_inst_prefix(val, suffix);
> + } else {
> + return ppc_inst(val);
> + }
> +}
> +
> +static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y)
> +{
> + return *(u64 *)&x == *(u64 *)&y;
> +}
> +
> +#else
> +
> +#define ppc_inst(x) ((struct ppc_inst){ .val = x })
> +
> +static inline bool ppc_inst_prefixed(struct ppc_inst x)
> +{
> + return false;
> +}
> +
> +static inline u32 ppc_inst_suffix(struct ppc_inst x)
> +{
> + return 0;
>  }
> 
>  static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
> @@ -42,6 +91,13 @@ static inline bool ppc_inst_equal(struct ppc_inst x,
> struct ppc_inst y) return ppc_inst_val(x) == ppc_inst_val(y);
>  }
> 
> +#endif /* __powerpc64__ */
> +
> +static inline int ppc_inst_len(struct ppc_inst x)
> +{
> + return (ppc_inst_prefixed(x)) ? 8  : 4;
> +}
> +
>  int probe_user_read_inst(struct ppc_inst *inst,
>struct ppc_inst *nip);
>  int probe_kernel_read_inst(struct ppc_inst *inst,
> diff --git a/arch/powerpc/include/asm/kprobes.h
> b/arch/powerpc/include/asm/kprobes.h index 66b3f2983b22..4fc0e15e23a5
> 100644
> --- a/arch/powerpc/include/asm/kprobes.h
> +++ b/arch/powerpc/include/asm/kprobes.h
> @@ -43,7 +43,7 @@ extern kprobe_opcode_t optprobe_template_ret[];
>  extern kprobe_opcode_t optprobe_template_end[];
> 
>  /* Fixed instruction size for powerpc */
> -#define MAX_INSN_SIZE 1
> +#define MAX_INSN_SIZE 2
>  #define 

Re: [PATCH v7 25/28] powerpc: Test prefixed instructions in feature fixups

2020-05-05 Thread Alistair Popple
Hmm, I was hoping to add a tested-by, but I'm seeing the following failure in
Mambo:

[1.475459] feature-fixups: test failed at line 730

Based on the name of the test it looks like you probably made a copy/paste
error in ftr_fixup_prefix2_expected. I suspect you meant to use the alt fixup:

globl(ftr_fixup_prefix2_expected)
or  1,1,1
.long 0x700
.long 0x001
or  2,2,2

Also for some reason these tests (and one of the code-patching tests) aren't 
passing on big endian.

- Alistair

On Friday, 1 May 2020 1:42:17 PM AEST Jordan Niethe wrote:
> Expand the feature-fixups self-tests to include tests for prefixed
> instructions.
> 
> Signed-off-by: Jordan Niethe 
> ---
> v6: New to series
> ---
>  arch/powerpc/lib/feature-fixups-test.S | 68 +++
>  arch/powerpc/lib/feature-fixups.c  | 74 ++
>  2 files changed, 142 insertions(+)
> 
> diff --git a/arch/powerpc/lib/feature-fixups-test.S
> b/arch/powerpc/lib/feature-fixups-test.S index b12168c2447a..6e2da9123a9b
> 100644
> --- a/arch/powerpc/lib/feature-fixups-test.S
> +++ b/arch/powerpc/lib/feature-fixups-test.S
> @@ -791,3 +791,71 @@ globl(lwsync_fixup_test_expected_SYNC)
>  1:   or  1,1,1
>   sync
> 
> +globl(ftr_fixup_prefix1)
> + or  1,1,1
> + .long 1 << 26
> + .long 0x000
> + or  2,2,2
> +globl(end_ftr_fixup_prefix1)
> +
> +globl(ftr_fixup_prefix1_orig)
> + or  1,1,1
> + .long 1 << 26
> + .long 0x000
> + or  2,2,2
> +
> +globl(ftr_fixup_prefix1_expected)
> + or  1,1,1
> + nop
> + nop
> + or  2,2,2
> +
> +globl(ftr_fixup_prefix2)
> + or  1,1,1
> + .long 1 << 26
> + .long 0x000
> + or  2,2,2
> +globl(end_ftr_fixup_prefix2)
> +
> +globl(ftr_fixup_prefix2_orig)
> + or  1,1,1
> + .long 1 << 26
> + .long 0x000
> + or  2,2,2
> +
> +globl(ftr_fixup_prefix2_alt)
> + .long 0x700
> + .long 0x001
> +
> +globl(ftr_fixup_prefix2_expected)
> + or  1,1,1
> + .long 1 << 26
> + .long 0x001
> + or  2,2,2
> +
> +globl(ftr_fixup_prefix3)
> + or  1,1,1
> + .long 1 << 26
> + .long 0x000
> + or  2,2,2
> + or  3,3,3
> +globl(end_ftr_fixup_prefix3)
> +
> +globl(ftr_fixup_prefix3_orig)
> + or  1,1,1
> + .long 1 << 26
> + .long 0x000
> + or  2,2,2
> + or  3,3,3
> +
> +globl(ftr_fixup_prefix3_alt)
> + .long 1 << 26
> + .long 0x001
> + nop
> +
> +globl(ftr_fixup_prefix3_expected)
> + or  1,1,1
> + .long 1 << 26
> + .long 0x001
> + nop
> + or  3,3,3
> diff --git a/arch/powerpc/lib/feature-fixups.c
> b/arch/powerpc/lib/feature-fixups.c index 243011f85287..6fc499b1d63e 100644
> --- a/arch/powerpc/lib/feature-fixups.c
> +++ b/arch/powerpc/lib/feature-fixups.c
> @@ -687,6 +687,75 @@ static void test_lwsync_macros(void)
>   }
>  }
> 
> +#ifdef __powerpc64__
> +static void __init test_prefix_patching(void)
> +{
> + extern unsigned int ftr_fixup_prefix1[];
> + extern unsigned int end_ftr_fixup_prefix1[];
> + extern unsigned int ftr_fixup_prefix1_orig[];
> + extern unsigned int ftr_fixup_prefix1_expected[];
> + int size = sizeof(unsigned int) * (end_ftr_fixup_prefix1 - ftr_fixup_prefix1);
> +
> + fixup.value = fixup.mask = 8;
> + fixup.start_off = calc_offset(&fixup, ftr_fixup_prefix1 + 1);
> + fixup.end_off = calc_offset(&fixup, ftr_fixup_prefix1 + 3);
> + fixup.alt_start_off = fixup.alt_end_off = 0;
> +
> + /* Sanity check */
> + check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_orig, size) == 0);
> +
> + patch_feature_section(0, &fixup);
> + check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_expected, size) == 0);
> + check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_orig, size) != 0);
> +}
> +
> +static void __init test_prefix_alt_patching(void)
> +{
> + extern unsigned int ftr_fixup_prefix2[];
> + extern unsigned int end_ftr_fixup_prefix2[];
> + extern unsigned int ftr_fixup_prefix2_orig[];
> + extern unsigned int ftr_fixup_prefix2_expected[];
> + extern unsigned int ftr_fixup_prefix2_alt[];
> + int size = sizeof(unsigned int) * (end_ftr_fixup_prefix2 - ftr_fixup_prefix2);
> +
> + fixup.value = fixup.mask = 8;
> + fixup.start_off = calc_offset(&fixup, ftr_fixup_prefix2 + 1);
> + fixup.end_off = calc_offset(&fixup, ftr_fixup_prefix2 + 3);
> + fixup.alt_start_off = calc_offset(&fixup, ftr_fixup_prefix2_alt);
> + fixup.alt_end_off = calc_offset(&fixup, ftr_fixup_prefix2_alt + 2);
> + /* Sanity check */
> + check(memcmp(ftr_fixup_prefix2, ftr_fixup_prefix2_orig, size) == 0);
> +
> + patch_feature_section(0, &fixup);
> + check(memcmp(ftr_fixup_prefix2, ftr_fixup_prefix2_expected, size) == 0);
> + patch_feature_section(0, &fixup);
> + check(memcmp(ftr_fixup_prefix2, ftr_fixup_prefix2_orig, size) != 0);
> +}
> +
> 

[PATCH v4 01/22] powerpc/pkeys: Avoid using lockless page table walk

2020-05-05 Thread Aneesh Kumar K.V
Fetch the pkey from the vma instead of the linux page table. Also document the
fact that in some cases the pkey returned in siginfo won't be the same as the
one on which we took the key fault. Even with a linux page table walk, we can
end up in a similar scenario.

Cc: Ram Pai 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/mmu.h|  9 ---
 arch/powerpc/mm/book3s64/hash_utils.c | 24 
 arch/powerpc/mm/fault.c   | 83 +++
 3 files changed, 60 insertions(+), 56 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 0699cfeeb8c9..cf2a08bfd5cd 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -291,15 +291,6 @@ static inline bool early_radix_enabled(void)
 }
 #endif
 
-#ifdef CONFIG_PPC_MEM_KEYS
-extern u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address);
-#else
-static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
-{
-   return 0;
-}
-#endif /* CONFIG_PPC_MEM_KEYS */
-
 #ifdef CONFIG_STRICT_KERNEL_RWX
 static inline bool strict_kernel_rwx_enabled(void)
 {
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 8ed2411c3f39..e951e87a974d 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1671,30 +1671,6 @@ void update_mmu_cache(struct vm_area_struct *vma, 
unsigned long address,
hash_preload(vma->vm_mm, address, is_exec, trap);
 }
 
-#ifdef CONFIG_PPC_MEM_KEYS
-/*
- * Return the protection key associated with the given address and the
- * mm_struct.
- */
-u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
-{
-   pte_t *ptep;
-   u16 pkey = 0;
-   unsigned long flags;
-
-   if (!mm || !mm->pgd)
-   return 0;
-
-   local_irq_save(flags);
-   ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
-   if (ptep)
-   pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
-   local_irq_restore(flags);
-
-   return pkey;
-}
-#endif /* CONFIG_PPC_MEM_KEYS */
-
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 static inline void tm_flush_hash_page(int local)
 {
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 84af6c8eecf7..8e529e4708e1 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -118,9 +118,34 @@ static noinline int bad_area(struct pt_regs *regs, 
unsigned long address)
return __bad_area(regs, address, SEGV_MAPERR);
 }
 
-static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
-   int pkey)
+#ifdef CONFIG_PPC_MEM_KEYS
+static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long 
address,
+   struct vm_area_struct *vma)
 {
+   struct mm_struct *mm = current->mm;
+   int pkey;
+
+   /*
+* We don't try to fetch the pkey from page table because reading
+* page table without locking doesn't guarantee stable pte value.
+* Hence the pkey value that we return to userspace can be different
+* from the pkey that actually caused access error.
+*
+* It does *not* guarantee that the VMA we find here
+* was the one that we faulted on.
+*
+* 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
+* 2. T1   : set AMR to deny access to pkey=4, touches, page
+* 3. T1   : faults...
+* 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+* 5. T1   : enters fault handler, takes mmap_sem, etc...
+* 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
+*   faulted on a pte with its pkey=4.
+*/
+   pkey = vma_pkey(vma);
+
+   up_read(&mm->mmap_sem);
+
/*
 * If we are in kernel mode, bail out with a SEGV, this will
 * be caught by the assembly which will restore the non-volatile
@@ -133,6 +158,7 @@ static int bad_key_fault_exception(struct pt_regs *regs, 
unsigned long address,
 
return 0;
 }
+#endif
 
 static noinline int bad_access(struct pt_regs *regs, unsigned long address)
 {
@@ -289,8 +315,31 @@ static bool bad_stack_expansion(struct pt_regs *regs, 
unsigned long address,
return false;
 }
 
-static bool access_error(bool is_write, bool is_exec,
-struct vm_area_struct *vma)
+#ifdef CONFIG_PPC_MEM_KEYS
+static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
+ struct vm_area_struct *vma)
+{
+   /*
+* Read or write was blocked by protection keys.  This is
+* always an unconditional error and can never result in
+* a follow-up action to resolve the fault, like a COW.
+*/
+   if (is_pkey)
+   return true;
+
+   /*
+* Make sure to check the VMA so that we do not perform
+* faults just to hit a pkey fault as soon as we fill in a
+* page. Only called for 

[PATCH v4 10/22] powerpc/kvm/nested: Add helper to walk nested shadow linux page table.

2020-05-05 Thread Aneesh Kumar K.V
The locking rules for walking a nested shadow linux page table are different
from those for a process-scoped table. Hence add a helper for the nested page
table walk and also add a check that we are holding the right locks.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_hv_nested.c | 28 +---
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 7f1fc5db13ea..b2cc3eaec618 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -750,6 +750,24 @@ static struct kvm_nested_guest *kvmhv_find_nested(struct 
kvm *kvm, int lpid)
return kvm->arch.nested_guests[lpid];
 }
 
+static pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid,
+   unsigned long ea, unsigned *hshift)
+{
+   struct kvm_nested_guest *gp;
+   pte_t *pte;
+
+   gp = kvmhv_find_nested(kvm, lpid);
+   if (!gp)
+   return NULL;
+
+   VM_WARN(!spin_is_locked(&kvm->mmu_lock),
+   "%s called with kvm mmu_lock not held \n", __func__);
+   pte = __find_linux_pte(gp->shadow_pgtable, ea, NULL, hshift);
+
+   return pte;
+}
+
+
 static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
 {
return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
@@ -792,19 +810,15 @@ static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, 
u64 n_rmap,
  unsigned long clr, unsigned long set,
  unsigned long hpa, unsigned long mask)
 {
-   struct kvm_nested_guest *gp;
unsigned long gpa;
unsigned int shift, lpid;
pte_t *ptep;
 
gpa = n_rmap & RMAP_NESTED_GPA_MASK;
lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
-   gp = kvmhv_find_nested(kvm, lpid);
-   if (!gp)
-   return;
 
/* Find the pte */
-   ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+   ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
/*
 * If the pte is present and the pfn is still the same, update the pte.
 * If the pfn has changed then this is a stale rmap entry, the nested
@@ -854,7 +868,7 @@ static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 
n_rmap,
return;
 
/* Find and invalidate the pte */
-   ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+   ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
/* Don't spuriously invalidate ptes if the pfn has changed */
if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
@@ -921,7 +935,7 @@ static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu 
*vcpu,
int shift;
 
spin_lock(&kvm->mmu_lock);
-   ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+   ptep = find_kvm_nested_guest_pte(kvm, gp->l1_lpid, gpa, &shift);
if (!shift)
shift = PAGE_SHIFT;
if (ptep && pte_present(*ptep)) {
-- 
2.26.2



Re: [PATCH v7 5/5] powerpc/hv-24x7: Update post_mobility_fixup() to handle migration

2020-05-05 Thread Michael Ellerman
kajoljain  writes:
> On 4/29/20 5:07 PM, Michael Ellerman wrote:
>> Kajol Jain  writes:
>>> Function 'read_sys_info_pseries()' is added to get system parameter
>>> values like number of sockets and chips per socket.
>>> and it gets these details via rtas_call with token
>>> "PROCESSOR_MODULE_INFO".
>>>
>>> Incase lpar migrate from one system to another, system
>>> parameter details like chips per sockets or number of sockets might
>>> change. So, it needs to be re-initialized otherwise, these values
>>> corresponds to previous system values.
>>> This patch adds a call to 'read_sys_info_pseries()' from
>>> 'post-mobility_fixup()' to re-init the physsockets and physchips values.
>>>
>>> Signed-off-by: Kajol Jain 
>>> ---
>>>  arch/powerpc/platforms/pseries/mobility.c | 12 
>>>  1 file changed, 12 insertions(+)
>>>
>>> diff --git a/arch/powerpc/platforms/pseries/mobility.c 
>>> b/arch/powerpc/platforms/pseries/mobility.c
>>> index b571285f6c14..226accd6218b 100644
>>> --- a/arch/powerpc/platforms/pseries/mobility.c
>>> +++ b/arch/powerpc/platforms/pseries/mobility.c
>>> @@ -371,6 +371,18 @@ void post_mobility_fixup(void)
>>> /* Possibly switch to a new RFI flush type */
>>> pseries_setup_rfi_flush();
>>>  
>>> +   /*
>>> +* Incase lpar migrate from one system to another, system
>> 
>> In case an LPAR migrates
>> 
>>> +* parameter details like chips per sockets and number of sockets
>>> +* might change. So, it needs to be re-initialized otherwise these
>>  ^   ^
>>  they need   the
>>> +* values corresponds to previous system.
>>   ^
>>   will correspond to the
>> 
>>> +* Here, adding a call to read_sys_info_pseries() declared in
>> 
>> Adding is the wrong tense in a comment. When someone reads the comment
>> the code has already been added. Past tense would be right, but really
>> the comment shouldn't say what you did, it should say why.
>> 
>>> +* platforms/pseries/pseries.h to re-init the physsockets and
>>> +* physchips value.
>> 
>> Call read_sys_info_pseries() to reinitialise the values.
>> 
>>> +*/
>>> +   if (IS_ENABLED(CONFIG_HV_PERF_CTRS) && IS_ENABLED(CONFIG_PPC_RTAS))
>>> +   read_sys_info_pseries();
>> 
>> The RTAS check is not needed. pseries always selects RTAS.
>> 
>> You shouldn't need the IS_ENABLED() check here though, do it with an
>> empty version in the header when CONFIG_HV_PERF_CTRS is not enabled.
>> 
>
> Hi Michael,
>   Thanks for reviewing the patch. Is something like this what you are
> suggesting? Please let me know if my understanding is fine.
>
> +#ifndef CONFIG_HV_PERF_CTRS
> +#define read_sys_info_pseries() 
> +#endif

It should be an empty static inline. So more like:

#ifdef CONFIG_HV_PERF_CTRS
void read_sys_info_pseries(void);
#else
static inline void read_sys_info_pseries(void) { }
#endif

cheers


[PATCH v4 03/22] powerpc/mm/hash64: use _PAGE_PTE when checking for pte_present

2020-05-05 Thread Aneesh Kumar K.V
This makes the pte_present check stricter by also checking for the _PAGE_PTE
bit. A level 1 pte pointer (THP pte) can be switched to a pointer to a level 0
pte page table page by the following two operations.

1) THP split.
2) madvise(MADV_DONTNEED) in parallel to page fault.

A lockless page table walk needs to make sure we can handle such changes
gracefully.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 15 ++-
 arch/powerpc/mm/book3s64/hash_utils.c| 11 +--
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 368b136517e0..03521a8b0292 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -553,6 +553,12 @@ static inline pte_t pte_clear_savedwrite(pte_t pte)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+static inline bool pte_hw_valid(pte_t pte)
+{
+   return (pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE)) ==
+   cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE);
+}
+
 static inline int pte_present(pte_t pte)
 {
/*
@@ -561,12 +567,11 @@ static inline int pte_present(pte_t pte)
 * invalid during ptep_set_access_flags. Hence we look for _PAGE_INVALID
 * if we find _PAGE_PRESENT cleared.
 */
-   return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
-}
 
-static inline bool pte_hw_valid(pte_t pte)
-{
-   return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT));
+   if (pte_hw_valid(pte))
+   return true;
+   return (pte_raw(pte) & cpu_to_be64(_PAGE_INVALID | _PAGE_PTE)) ==
+   cpu_to_be64(_PAGE_INVALID | _PAGE_PTE);
 }
 
 #ifdef CONFIG_PPC_MEM_KEYS
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index e951e87a974d..525eac4ee2c2 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1350,8 +1350,15 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
goto bail;
}
 
-   /* Add _PAGE_PRESENT to the required access perm */
-   access |= _PAGE_PRESENT;
+   /*
+* Add _PAGE_PRESENT to the required access perm. If there are parallel
+* updates to the pte that can possibly clear _PAGE_PTE, catch that too.
+*
+* We can safely use the return pte address in rest of the function
+* because we do set H_PAGE_BUSY which prevents further updates to pte
+* from generic code.
+*/
+   access |= _PAGE_PRESENT | _PAGE_PTE;
 
/*
 * Pre-check access permissions (will be re-checked atomically
-- 
2.26.2



[PATCH v4 04/22] powerpc/hash64: Restrict page table lookup using init_mm with __flush_hash_table_range

2020-05-05 Thread Aneesh Kumar K.V
This is only used with init_mm currently. Walking init_mm is much simpler
because we don't need to handle concurrent page table updates as we do for
other mm contexts.

Signed-off-by: Aneesh Kumar K.V 
---
 .../include/asm/book3s/64/tlbflush-hash.h|  3 +--
 arch/powerpc/kernel/pci_64.c |  2 +-
 arch/powerpc/mm/book3s64/hash_tlb.c  | 16 +++-
 3 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
index 64d02a704bcb..3b95769739c7 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -113,8 +113,7 @@ static inline void hash__flush_tlb_kernel_range(unsigned 
long start,
 struct mmu_gather;
 extern void hash__tlb_flush(struct mmu_gather *tlb);
 /* Private function for use by PCI IO mapping code */
-extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
-unsigned long end);
+extern void __flush_hash_table_range(unsigned long start, unsigned long end);
 extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr);
 #endif /*  _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H */
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index f83d1f69b1dd..30d07fc79dd1 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -100,7 +100,7 @@ int pcibios_unmap_io_space(struct pci_bus *bus)
 pci_name(bus->self));
 
 #ifdef CONFIG_PPC_BOOK3S_64
-   __flush_hash_table_range(&init_mm, res->start + _IO_BASE,
+   __flush_hash_table_range(res->start + _IO_BASE,
 res->end + _IO_BASE + 1);
 #endif
return 0;
diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c 
b/arch/powerpc/mm/book3s64/hash_tlb.c
index 4a70d8dd39cd..1fa2173413b5 100644
--- a/arch/powerpc/mm/book3s64/hash_tlb.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -176,7 +176,6 @@ void hash__tlb_flush(struct mmu_gather *tlb)
  *from the hash table (and the TLB). But keeps
  *the linux PTEs intact.
  *
- * @mm : mm_struct of the target address space (generally init_mm)
  * @start  : starting address
  * @end : ending address (not included in the flush)
  *
@@ -189,17 +188,14 @@ void hash__tlb_flush(struct mmu_gather *tlb)
  * Because of that usage pattern, it is implemented for small size rather
  * than speed.
  */
-void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
- unsigned long end)
+void __flush_hash_table_range(unsigned long start, unsigned long end)
 {
-   bool is_thp;
int hugepage_shift;
unsigned long flags;
 
start = _ALIGN_DOWN(start, PAGE_SIZE);
end = _ALIGN_UP(end, PAGE_SIZE);
 
-   BUG_ON(!mm->pgd);
 
/*
 * Note: Normally, we should only ever use a batch within a
@@ -212,21 +208,15 @@ void __flush_hash_table_range(struct mm_struct *mm, 
unsigned long start,
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
-   pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
- &hugepage_shift);
+   pte_t *ptep = find_init_mm_pte(start, &hugepage_shift);
unsigned long pte;
 
if (ptep == NULL)
continue;
pte = pte_val(*ptep);
-   if (is_thp)
-   trace_hugepage_invalidate(start, pte);
if (!(pte & H_PAGE_HASHPTE))
continue;
-   if (unlikely(is_thp))
-   hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
-   else
-   hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
+   hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift);
}
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
-- 
2.26.2



[PATCH v4 09/22] powerpc/kvm/book3s: Add helper to walk partition scoped linux page table.

2020-05-05 Thread Aneesh Kumar K.V
The locking rules for walking the partition-scoped table are different from
those for the process-scoped table. Hence add a helper for the secondary linux
page table walk and also add a check that we are holding the right locks.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 13 +
 arch/powerpc/kvm/book3s_64_mmu_radix.c   | 12 ++--
 arch/powerpc/kvm/book3s_hv_nested.c  |  2 +-
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 04b2b927bb5a..2c2635967d6e 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_PPC_PSERIES
 static inline bool kvmhv_on_pseries(void)
@@ -634,6 +635,18 @@ extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
unsigned long gpa, unsigned long hpa,
unsigned long nbytes);
 
+static inline pte_t *find_kvm_secondary_pte(struct kvm *kvm, unsigned long ea,
+   unsigned *hshift)
+{
+   pte_t *pte;
+
+   VM_WARN(!spin_is_locked(&kvm->mmu_lock),
+   "%s called with kvm mmu_lock not held \n", __func__);
+   pte = __find_linux_pte(kvm->arch.pgtable, ea, NULL, hshift);
+
+   return pte;
+}
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index aa12cd4078b3..c92d413eeaaf 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -981,11 +981,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct 
kvm_memory_slot *memslot,
return 0;
}
 
-   ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+   ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
 kvm->arch.lpid);
-   return 0;   
+   return 0;
 }
 
 /* Called with kvm->mmu_lock held */
@@ -1001,7 +1001,7 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot 
*memslot,
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
return ref;
 
-   ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+   ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
  gpa, shift);
@@ -1028,7 +1028,7 @@ int kvm_test_age_radix(struct kvm *kvm, struct 
kvm_memory_slot *memslot,
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
return ref;
 
-   ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+   ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep) && pte_young(*ptep))
ref = 1;
return ref;
@@ -1048,7 +1048,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
return ret;
 
-   ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+   ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
ret = 1;
if (shift)
@@ -1109,7 +1109,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm,
gpa = memslot->base_gfn << PAGE_SHIFT;
spin_lock(&kvm->mmu_lock);
for (n = memslot->npages; n; --n) {
-   ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+   ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
if (ptep && pte_present(*ptep))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
 kvm->arch.lpid);
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index dc97e5be76f6..7f1fc5db13ea 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -1362,7 +1362,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_run 
*run,
/* See if can find translation in our partition scoped tables for L1 */
pte = __pte(0);
spin_lock(&kvm->mmu_lock);
-   pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+   pte_p = find_kvm_secondary_pte(kvm, gpa, &shift);
if (!shift)
shift = PAGE_SHIFT;
if (pte_p)
-- 
2.26.2



[PATCH v4 11/22] powerpc/kvm/book3s: Use kvm helpers to walk shadow or secondary table

2020-05-05 Thread Aneesh Kumar K.V
Update kvmppc_hv_handle_set_rc to use find_kvm_nested_guest_pte and
find_kvm_secondary_pte.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/kvm_book3s.h|  2 +-
 arch/powerpc/include/asm/kvm_book3s_64.h |  3 +++
 arch/powerpc/kvm/book3s_64_mmu_radix.c   | 18 +-
 arch/powerpc/kvm/book3s_hv_nested.c  | 13 ++---
 4 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 506e4df2d730..37c8b50cb505 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -198,7 +198,7 @@ extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, 
unsigned long gpa,
unsigned int shift,
const struct kvm_memory_slot *memslot,
unsigned int lpid);
-extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
+extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested,
bool writing, unsigned long gpa,
unsigned int lpid);
 extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 2c2635967d6e..2860521992b6 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -647,6 +647,9 @@ static inline pte_t *find_kvm_secondary_pte(struct kvm 
*kvm, unsigned long ea,
return pte;
 }
 
+extern pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid,
+   unsigned long ea, unsigned *hshift);
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index c92d413eeaaf..70c4025406d8 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -735,7 +735,7 @@ int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, 
pte_t pte,
return ret;
 }
 
-bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
+bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
 unsigned long gpa, unsigned int lpid)
 {
unsigned long pgflags;
@@ -750,12 +750,12 @@ bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t 
*pgtable, bool writing,
pgflags = _PAGE_ACCESSED;
if (writing)
pgflags |= _PAGE_DIRTY;
-   /*
-* We are walking the secondary (partition-scoped) page table here.
-* We can do this without disabling irq because the Linux MM
-* subsystem doesn't do THP splits and collapses on this tree.
-*/
-   ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
+
+   if (nested)
+   ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
+   else
+   ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
+
if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
return true;
@@ -949,8 +949,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
/* Failed to set the reference/change bits */
if (dsisr & DSISR_SET_RC) {
spin_lock(&kvm->mmu_lock);
-   if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
-   writing, gpa, kvm->arch.lpid))
+   if (kvmppc_hv_handle_set_rc(kvm, false, writing,
+   gpa, kvm->arch.lpid))
dsisr &= ~DSISR_SET_RC;
spin_unlock(&kvm->mmu_lock);
 
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index b2cc3eaec618..99011f1b772a 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -750,8 +750,8 @@ static struct kvm_nested_guest *kvmhv_find_nested(struct 
kvm *kvm, int lpid)
return kvm->arch.nested_guests[lpid];
 }
 
-static pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid,
-   unsigned long ea, unsigned *hshift)
+pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid,
+unsigned long ea, unsigned *hshift)
 {
struct kvm_nested_guest *gp;
pte_t *pte;
@@ -767,7 +767,6 @@ static pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, 
unsigned long lpid,
return pte;
 }
 
-
 static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
 {
return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
@@ -1226,16 +1225,16 @@ static long kvmhv_handle_nested_set_rc(struct kvm_vcpu 
*vcpu,
 
spin_lock(&kvm->mmu_lock);
/* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
-   ret = 

[PATCH v4 13/22] powerpc/kvm/book3s: Use find_kvm_host_pte in page fault handler

2020-05-05 Thread Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 2b35f9bcf892..38e934dc1714 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -602,12 +602,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
 * Read the PTE from the process' radix tree and use that
 * so we get the shift and attribute bits.
 */
-   local_irq_disable();
-   ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, );
+   spin_lock(&kvm->mmu_lock);
+   ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
pte = __pte(0);
if (ptep)
-   pte = *ptep;
-   local_irq_enable();
+   pte = READ_ONCE(*ptep);
+   spin_unlock(&kvm->mmu_lock);
/*
 * If the PTE disappeared temporarily due to a THP
 * collapse, just return and let the guest try again.
-- 
2.26.2



Re: [PATCH v7 5/5] powerpc/hv-24x7: Update post_mobility_fixup() to handle migration

2020-05-05 Thread kajoljain



On 4/29/20 5:07 PM, Michael Ellerman wrote:
> Kajol Jain  writes:
>> Function 'read_sys_info_pseries()' is added to get system parameter
>> values like number of sockets and chips per socket.
>> and it gets these details via rtas_call with token
>> "PROCESSOR_MODULE_INFO".
>>
>> Incase lpar migrate from one system to another, system
>> parameter details like chips per sockets or number of sockets might
>> change. So, it needs to be re-initialized otherwise, these values
>> corresponds to previous system values.
>> This patch adds a call to 'read_sys_info_pseries()' from
>> 'post-mobility_fixup()' to re-init the physsockets and physchips values.
>>
>> Signed-off-by: Kajol Jain 
>> ---
>>  arch/powerpc/platforms/pseries/mobility.c | 12 
>>  1 file changed, 12 insertions(+)
>>
>> diff --git a/arch/powerpc/platforms/pseries/mobility.c 
>> b/arch/powerpc/platforms/pseries/mobility.c
>> index b571285f6c14..226accd6218b 100644
>> --- a/arch/powerpc/platforms/pseries/mobility.c
>> +++ b/arch/powerpc/platforms/pseries/mobility.c
>> @@ -371,6 +371,18 @@ void post_mobility_fixup(void)
>>  /* Possibly switch to a new RFI flush type */
>>  pseries_setup_rfi_flush();
>>  
>> +/*
>> + * Incase lpar migrate from one system to another, system
> 
> In case an LPAR migrates
> 
>> + * parameter details like chips per sockets and number of sockets
>> + * might change. So, it needs to be re-initialized otherwise these
>  ^   ^
>  they need   the
>> + * values corresponds to previous system.
>   ^
>   will correspond to the
> 
>> + * Here, adding a call to read_sys_info_pseries() declared in
> 
> Adding is the wrong tense in a comment. When someone reads the comment
> the code has already been added. Past tense would be right, but really
> the comment shouldn't say what you did, it should say why.
> 
>> + * platforms/pseries/pseries.h to re-init the physsockets and
>> + * physchips value.
> 
> Call read_sys_info_pseries() to reinitialise the values.
> 
>> + */
>> +if (IS_ENABLED(CONFIG_HV_PERF_CTRS) && IS_ENABLED(CONFIG_PPC_RTAS))
>> +read_sys_info_pseries();
> 
> The RTAS check is not needed. pseries always selects RTAS.
> 
> You shouldn't need the IS_ENABLED() check here though, do it with an
> empty version in the header when CONFIG_HV_PERF_CTRS is not enabled.
> 

Hi Michael,
Thanks for reviewing the patch. Is something like this what you are
suggesting? Please let me know if my understanding is fine.

+#ifndef CONFIG_HV_PERF_CTRS
+#define read_sys_info_pseries() 
+#endif

Thanks,
Kajol Jain
> cheers
> 


Re: [PATCH v7 04/28] powerpc/xmon: Use bitwise calculations in_breakpoint_table()

2020-05-05 Thread Michael Ellerman
Jordan Niethe  writes:
> A modulo operation is used for calculating the current offset from a
> breakpoint within the breakpoint table. As instruction lengths are
> always a power of 2, this can be replaced with a bitwise 'and'. The
> current check for word alignment can be replaced with checking that the
> lower 2 bits are not set.
>
> Suggested-by: Christophe Leroy 
> Signed-off-by: Jordan Niethe 
> ---
> v6: New to series
> ---
>  arch/powerpc/xmon/xmon.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index bbfea22f4a96..e122f0c8a044 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -857,8 +857,8 @@ static struct bpt *in_breakpoint_table(unsigned long nip, 
> unsigned long *offp)
>   off = nip - (unsigned long) bpt_table;
>   if (off >= sizeof(bpt_table))
>   return NULL;
> - *offp = off % BPT_SIZE;
> - if (*offp != 0 && *offp != 4)
> + *offp = off & (BPT_SIZE - 1);
> + if (off & 3)
>   return NULL;

It would be even better if you didn't hard code the 3, wouldn't it?

eg:

+   *offp = off & (BPT_SIZE - 1);
+   if (off & (BPT_SIZE - 1))
return NULL;

cheers


Re: [PATCH v7 26/28] powerpc: Support prefixed instructions in alignment handler

2020-05-05 Thread Alistair Popple
Reviewed-by: Alistair Popple 

On Friday, 1 May 2020 1:42:18 PM AEST Jordan Niethe wrote:
> If a prefixed instruction results in an alignment exception, the
> SRR1_PREFIXED bit is set. The handler attempts to emulate the
> responsible instruction and then increment the NIP past it. Use
> SRR1_PREFIXED to determine by how much the NIP should be incremented.
> 
> Prefixed instructions are not permitted to cross 64-byte boundaries. If
> they do, the alignment interrupt is invoked with the SRR1 BOUNDARY bit set.
> If this occurs send a SIGBUS to the offending process if in user mode.
> If in kernel mode call bad_page_fault().
> 
> Signed-off-by: Jordan Niethe 
> ---
> v2: - Move __get_user_instr() and __get_user_instr_inatomic() to this
> commit (previously in "powerpc sstep: Prepare to support prefixed
> instructions").
> - Rename sufx to suffix
> - Use a macro for calculating instruction length
> v3: Move __get_user_{instr(), instr_inatomic()} up with the other
> get_user definitions and remove nested if.
> v4: Rolled into "Add prefixed instructions to instruction data type"
> v5: Only one definition of inst_length()
> ---
>  arch/powerpc/kernel/traps.c | 19 ++-
>  1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 493a3fa0ac1a..105242cc2f28 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -583,6 +583,8 @@ static inline int check_io_access(struct pt_regs *regs)
>  #define REASON_ILLEGAL   (ESR_PIL | ESR_PUO)
>  #define REASON_PRIVILEGEDESR_PPR
>  #define REASON_TRAP  ESR_PTR
> +#define REASON_PREFIXED  0
> +#define REASON_BOUNDARY  0
> 
>  /* single-step stuff */
>  #define single_stepping(regs)(current->thread.debug.dbcr0 & DBCR0_IC)
> @@ -597,12 +599,16 @@ static inline int check_io_access(struct pt_regs
> *regs) #define REASON_ILLEGAL SRR1_PROGILL
>  #define REASON_PRIVILEGEDSRR1_PROGPRIV
>  #define REASON_TRAP  SRR1_PROGTRAP
> +#define REASON_PREFIXED  SRR1_PREFIXED
> +#define REASON_BOUNDARY  SRR1_BOUNDARY
> 
>  #define single_stepping(regs)((regs)->msr & MSR_SE)
>  #define clear_single_step(regs)  ((regs)->msr &= ~MSR_SE)
>  #define clear_br_trace(regs) ((regs)->msr &= ~MSR_BE)
>  #endif
> 
> +#define inst_length(reason)  (((reason) & REASON_PREFIXED) ? 8 : 4)
> +
>  #if defined(CONFIG_E500)
>  int machine_check_e500mc(struct pt_regs *regs)
>  {
> @@ -1593,11 +1599,20 @@ void alignment_exception(struct pt_regs *regs)
>  {
>   enum ctx_state prev_state = exception_enter();
>   int sig, code, fixed = 0;
> + unsigned long  reason;
> 
>   /* We restore the interrupt state now */
>   if (!arch_irq_disabled_regs(regs))
>   local_irq_enable();
> 
> + reason = get_reason(regs);
> +
> + if (reason & REASON_BOUNDARY) {
> + sig = SIGBUS;
> + code = BUS_ADRALN;
> + goto bad;
> + }
> +
>   if (tm_abort_check(regs, TM_CAUSE_ALIGNMENT | TM_CAUSE_PERSISTENT))
>   goto bail;
> 
> @@ -1606,7 +1621,8 @@ void alignment_exception(struct pt_regs *regs)
>   fixed = fix_alignment(regs);
> 
>   if (fixed == 1) {
> - regs->nip += 4; /* skip over emulated instruction */
> + /* skip over emulated instruction */
> + regs->nip += inst_length(reason);
>   emulate_single_step(regs);
>   goto bail;
>   }
> @@ -1619,6 +1635,7 @@ void alignment_exception(struct pt_regs *regs)
>   sig = SIGBUS;
>   code = BUS_ADRALN;
>   }
> +bad:
>   if (user_mode(regs))
>   _exception(sig, regs, code, regs->dar);
>   else






[PATCH v4 00/22] Avoid IPI while updating page table entries.

2020-05-05 Thread Aneesh Kumar K.V
Problem Summary:
Slow termination of a KVM guest with a large guest RAM config due to a large
number of IPIs that were caused by clearing level 1 PTE (THP) entries.
This is shown in the stack trace below.


- qemu-system-ppc  [kernel.vmlinux][k] smp_call_function_many
   - smp_call_function_many
  - 36.09% smp_call_function_many
   serialize_against_pte_lookup
   radix__pmdp_huge_get_and_clear
   zap_huge_pmd
   unmap_page_range
   unmap_vmas
   unmap_region
   __do_munmap
   __vm_munmap
   sys_munmap
  system_call
   __munmap
   qemu_ram_munmap
   qemu_anon_ram_free
   reclaim_ramblock
   call_rcu_thread
   qemu_thread_start
   start_thread
   __clone

Why we need to do IPI when clearing PMD entries:
This was added as part of commit: 13bd817bb884 ("powerpc/thp: Serialize pmd 
clear against a linux page table walk")

serialize_against_pte_lookup makes sure that all parallel lockless page table
walks complete before we convert a PMD pte entry to a regular pmd entry.
We end up doing that conversion in the scenarios below (a simplified sketch of
the serialization itself follows the list):

1) __split_huge_zero_page_pmd
2) do_huge_pmd_wp_page_fallback
3) MADV_DONTNEED running parallel to page faults.
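
For readers unfamiliar with it, the serialization is roughly the following
pattern (a simplified C sketch of the idea, not the exact kernel code; the
_sketch suffix and the empty do_nothing() handler are illustrative):

/* Simplified sketch: wait for all lockless walkers of 'mm' to finish. */
static void do_nothing(void *unused)
{
}

static void serialize_against_pte_lookup_sketch(struct mm_struct *mm)
{
	smp_mb();
	/*
	 * Run an empty function synchronously on every CPU that may be
	 * using this mm.  Lockless walkers run with interrupts disabled,
	 * so the IPI cannot be delivered (and this call cannot return)
	 * until each of them has left its walk section.
	 */
	smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
}

This is why every level 1 (THP) PMD clear shows up as an
smp_call_function_many in the profile above.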

local_irq_disable and lockless page table walk:

The lockless page table walk works with the assumption that we can dereference
the page table contents without holding a lock. For this to work, we need to
make sure we read the page table contents atomically and that page table pages
are not freed/released while we are walking the table pages. We can achieve
this by using RCU-based freeing for page table pages or, if the architecture
implements broadcast tlbie, by blocking the IPI as we walk the page table
pages.

To support both of the above, the lockless page table walk is done with
interrupts disabled instead of under rcu_read_lock().
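
For reference, a minimal sketch of the reader side of such a walk (the function
name is made up; the pattern mirrors what the addr_to_pfn() change later in
this series does):

/* Illustrative only: an irq-disabled lockless lookup of the pfn for 'addr'. */
static unsigned long lockless_addr_to_pfn(struct mm_struct *mm, unsigned long addr)
{
	unsigned long flags, pfn = ULONG_MAX;
	unsigned int shift;
	pte_t *ptep, pte;

	local_irq_save(flags);		/* blocks the serializing IPI while we walk */
	ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
	if (ptep) {
		pte = READ_ONCE(*ptep);	/* read the value once */
		if (pte_present(pte))
			pfn = pte_pfn(pte);
	}
	local_irq_restore(flags);

	return pfn;
}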

We have two interfaces for lockless page table walk: gup fast and
__find_linux_pte. This patch series makes the __find_linux_pte table walk safe
against the conversion of a PMD PTE to a regular PMD.

gup fast:

gup fast is already safe against THP split because the kernel now
differentiates between a pmd split and a compound page split. gup fast can run
in parallel to a pmd split, and we prevent gup fast from running in parallel to
a hugepage split by freezing the page refcount and failing the speculative page
ref increment.


Similar to how gup is safe against a parallel pmd split, this patch series
updates the __find_linux_pte callers to be safe against a parallel pmd split.
We do that by enforcing the following rules (a short illustrative fragment
follows the list):

1) Don't reload the pte value, because it can be updated in parallel.
2) Code should be able to work with a stale PTE value and not the most recent
   one; i.e., the pte value that we are looking at may not be the latest value
   in the page table.
3) Before looking at the pte value, check for the _PAGE_PTE bit. We now do this
   as part of the pte_present() check.
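
A hypothetical caller fragment that follows these rules might look like the
below (the function name is made up; pte_present() here is the stricter
version introduced in patch 3 of this series):

/* Illustrative only: query the dirty state of a pte found by a lockless walk. */
static bool example_pte_is_dirty(pte_t *ptep)
{
	pte_t pte = READ_ONCE(*ptep);	/* rule 1: read once, never reload *ptep */

	/*
	 * rule 3: pte_present() also checks _PAGE_PTE, so this bails out if
	 * the entry was converted to a pointer to a level 0 page table page.
	 */
	if (!pte_present(pte))
		return false;

	/* rule 2: 'pte' may already be stale; the caller must tolerate that. */
	return pte_dirty(pte);
}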

Performance:

This speeds up Qemu guest RAM del/unplug time as shown below for a
128 core, 496GB guest:

Without patch:
munmap start: timer = 13162 ms, PID=7684
munmap finish: timer = 95312 ms, PID=7684 - delta = 82150 ms

With patch (up to removing the IPI)
munmap start: timer = 196449 ms, PID=6681
munmap finish: timer = 196488 ms, PID=6681 - delta = 39ms

With patch (with adding the tlb invalidate in pmdp_huge_get_and_clear_full)
munmap start: timer = 196345 ms, PID=6879
munmap finish: timer = 196714 ms, PID=6879 - delta = 369ms

Changes from V3:
* Rebase to latest kernel

Changes from V2:
* Rebase to lastest kernel

Changes from V1:
* Update commit messages
* Qemu Performance numbers


Aneesh Kumar K.V (22):
  powerpc/pkeys: Avoid using lockless page table walk
  powerpc/pkeys: Check vma before returning key fault error to the user
  powerpc/mm/hash64: use _PAGE_PTE when checking for pte_present
  powerpc/hash64: Restrict page table lookup using init_mm with
__flush_hash_table_range
  powerpc/book3s64/hash: Use the pte_t address from the caller
  powerpc/mce: Don't reload pte val in addr_to_pfn
  powerpc/perf/callchain: Use __get_user_pages_fast in
read_user_stack_slow
  powerpc/kvm/book3s: switch from raw_spin_*lock to arch_spin_lock.
  powerpc/kvm/book3s: Add helper to walk partition scoped linux page
table.
  powerpc/kvm/nested: Add helper to walk nested shadow linux page table.
  powerpc/kvm/book3s: Use kvm helpers to walk shadow or secondary table
  powerpc/kvm/book3s: Add helper for host page table walk
  powerpc/kvm/book3s: Use find_kvm_host_pte in page fault handler
  powerpc/kvm/book3s: Use find_kvm_host_pte in h_enter
  powerpc/kvm/book3s: use find_kvm_host_pte in pute_tce functions
  powerpc/kvm/book3s: Avoid using rmap to protect parallel page table
update.
  powerpc/kvm/book3s: use find_kvm_host_pte in
kvmppc_book3s_instantiate_page
  powerpc/kvm/book3s: Use find_kvm_host_pte in kvmppc_get_hpa
  powerpc/kvm/book3s: Use pte_present instead of 

[PATCH v4 02/22] powerpc/pkeys: Check vma before returning key fault error to the user

2020-05-05 Thread Aneesh Kumar K.V
If multiple threads in userspace keep changing the protection keys
mapping a range, there can be a scenario where the kernel takes a key fault
but the pkey value found in the siginfo struct is a permissive one.

This can confuse the userspace as shown in the below test case.

/* use this to control the number of test iterations */

static void pkeyreg_set(int pkey, unsigned long rights)
{
unsigned long reg, shift;

shift = (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
asm volatile("mfspr %0, 0xd" : "=r"(reg));
reg &= ~(((unsigned long) PKEY_BITS_MASK) << shift);
reg |= (rights & PKEY_BITS_MASK) << shift;
asm volatile("mtspr 0xd, %0" : : "r"(reg));
}

static unsigned long pkeyreg_get(void)
{
unsigned long reg;

asm volatile("mfspr %0, 0xd" : "=r"(reg));
return reg;
}

static int sys_pkey_mprotect(void *addr, size_t len, int prot, int pkey)
{
return syscall(SYS_pkey_mprotect, addr, len, prot, pkey);
}

static int sys_pkey_alloc(unsigned long flags, unsigned long access_rights)
{
return syscall(SYS_pkey_alloc, flags, access_rights);
}

static int sys_pkey_free(int pkey)
{
return syscall(SYS_pkey_free, pkey);
}

static int faulting_pkey;
static int permissive_pkey;
static pthread_barrier_t pkey_set_barrier;
static pthread_barrier_t mprotect_barrier;

static void pkey_handle_fault(int signum, siginfo_t *sinfo, void *ctx)
{
unsigned long pkeyreg;

/* FIXME: printf is not signal-safe but for the current purpose,
  it gets the job done. */
printf("pkey: exp = %d, got = %d\n", faulting_pkey, sinfo->si_pkey);
fflush(stdout);

assert(sinfo->si_code == SEGV_PKUERR);
assert(sinfo->si_pkey == faulting_pkey);

/* clear pkey permissions to let the faulting instruction continue */
pkeyreg_set(faulting_pkey, 0x0);
}

static void *do_mprotect_fault(void *p)
{
unsigned long rights, pkeyreg, pgsize;
unsigned int i;
void *region;
int pkey;

srand(time(NULL));
pgsize = sysconf(_SC_PAGESIZE);
rights = PKEY_DISABLE_WRITE;
region = p;

/* allocate key, no permissions */
assert((pkey = sys_pkey_alloc(0, PKEY_DISABLE_ACCESS)) > 0);
pkeyreg_set(4, 0x0);

/* cache the pkey here as the faulting pkey for future reference
   in the signal handler */
faulting_pkey = pkey;
printf("%s: faulting pkey = %d\n", __func__, faulting_pkey);

/* try to allocate, mprotect and free pkeys repeatedly */
for (i = 0; i < NUM_ITERATIONS; i++) {
/* sync up with the other thread here */
pthread_barrier_wait(&pkey_set_barrier);

/* make sure that the pkey used by the non-faulting thread
   is made permissive for this thread's context too so that
   no faults are triggered because it still might have been
   set to a restrictive value */
//  pkeyreg_set(permissive_pkey, 0x0);

/* sync up with the other thread here */
pthread_barrier_wait(&mprotect_barrier);

/* perform mprotect */
assert(!sys_pkey_mprotect(region, pgsize, PROT_READ | 
PROT_WRITE, pkey));

/* choose a random byte from the protected region and
   attempt to write to it, this will generate a fault */
*((char *) region + (rand() % pgsize)) = rand();

/* restore pkey permissions as the signal handler may have
   cleared the bit out for the sake of continuing */
pkeyreg_set(pkey, PKEY_DISABLE_WRITE);
}

/* free pkey */
sys_pkey_free(pkey);

return NULL;
}

static void *do_mprotect_nofault(void *p)
{
unsigned long pgsize;
unsigned int i, j;
void *region;
int pkey;

pgsize = sysconf(_SC_PAGESIZE);
region = p;

/* try to allocate, mprotect and free pkeys repeatedly */
for (i = 0; i < NUM_ITERATIONS; i++) {
/* allocate pkey, all permissions */
assert((pkey = sys_pkey_alloc(0, 0)) > 0);
permissive_pkey = pkey;

/* sync up with the other thread here */
pthread_barrier_wait(&pkey_set_barrier);
pthread_barrier_wait(&mprotect_barrier);

/* perform mprotect on the common page, no faults will
   be triggered as this is most permissive */
assert(!sys_pkey_mprotect(region, pgsize, PROT_READ | 
PROT_WRITE, pkey));

/* free pkey */
assert(!sys_pkey_free(pkey));
}

return NULL;
}

int main(int argc, char **argv)
{
pthread_t fault_thread, nofault_thread;
unsigned long pgsize;
struct sigaction act;
pthread_attr_t attr;
cpu_set_t fault_cpuset, 

[PATCH v4 05/22] powerpc/book3s64/hash: Use the pte_t address from the caller

2020-05-05 Thread Aneesh Kumar K.V
Don't fetch the pte value using a lockless page table walk. Instead use the
value from the caller. hash_preload is called with the ptl lock held, so it is
safe to use the pte_t address directly.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 27 +--
 1 file changed, 5 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 525eac4ee2c2..3d727f73a8db 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1546,14 +1546,11 @@ static bool should_hash_preload(struct mm_struct *mm, 
unsigned long ea)
 }
 #endif
 
-static void hash_preload(struct mm_struct *mm, unsigned long ea,
+static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
 bool is_exec, unsigned long trap)
 {
-   int hugepage_shift;
unsigned long vsid;
pgd_t *pgdir;
-   pte_t *ptep;
-   unsigned long flags;
int rc, ssize, update_flags = 0;
unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? 
_PAGE_EXEC : 0);
 
@@ -1575,30 +1572,18 @@ static void hash_preload(struct mm_struct *mm, unsigned 
long ea,
vsid = get_user_vsid(>context, ea, ssize);
if (!vsid)
return;
-   /*
-* Hash doesn't like irqs. Walking linux page table with irq disabled
-* saves us from holding multiple locks.
-*/
-   local_irq_save(flags);
 
-   /*
-* THP pages use update_mmu_cache_pmd. We don't do
-* hash preload there. Hence can ignore THP here
-*/
-   ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
-   if (!ptep)
-   goto out_exit;
-
-   WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
/* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
 * a 64K kernel), then we don't preload, hash_page() will take
 * care of it once we actually try to access the page.
 * That way we don't have to duplicate all of the logic for segment
 * page size demotion here
+* Called with  PTL held, hence can be sure the value won't change in
+* between.
 */
if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
-   goto out_exit;
+   return;
 #endif /* CONFIG_PPC_64K_PAGES */
 
/* Is that local to this CPU ? */
@@ -1623,8 +1608,6 @@ static void hash_preload(struct mm_struct *mm, unsigned 
long ea,
   mm_ctx_user_psize(&mm->context),
   mm_ctx_user_psize(&mm->context),
   pte_val(*ptep));
-out_exit:
-   local_irq_restore(flags);
 }
 
 /*
@@ -1675,7 +1658,7 @@ void update_mmu_cache(struct vm_area_struct *vma, 
unsigned long address,
return;
}
 
-   hash_preload(vma->vm_mm, address, is_exec, trap);
+   hash_preload(vma->vm_mm, ptep, address, is_exec, trap);
 }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-- 
2.26.2



[PATCH v4 06/22] powerpc/mce: Don't reload pte val in addr_to_pfn

2020-05-05 Thread Aneesh Kumar K.V
A lockless page table walk should be safe against parallel THP collapse, THP
split and madvise(MADV_DONTNEED)/parallel fault. This patch makes sure the
kernel won't reload the pte value when checking for different conditions. The
patch also adds a pte_present() check to make sure the kernel is indeed
operating on a PTE and not on a pointer to a level-0 table page.

The pfn value we find here can be different from the actual pfn on which
machine check happened. This can happen if we raced with a parallel update
of the page table. In such a scenario we end up isolating a wrong pfn. But that
doesn't have any other side effect.
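
For illustration, a stand-alone user-space sketch of the snapshot-once pattern
the patch switches to; every name below (pte_val_t, MY_PRESENT, MY_SPECIAL,
entry_to_pfn) is invented for the example, and only the single read of the
entry mirrors the kernel change:

#include <stdio.h>

typedef unsigned long pte_val_t;

#define MY_PRESENT 0x1UL
#define MY_SPECIAL 0x2UL

static unsigned long entry_to_pfn(pte_val_t v)
{
	return v >> 12;
}

static unsigned long lookup_pfn(volatile pte_val_t *ptep)
{
	/* read the entry exactly once; a concurrent writer may change *ptep
	 * between two reads, so every later test uses this local copy */
	pte_val_t pte = *ptep;

	if (!(pte & MY_PRESENT) || (pte & MY_SPECIAL))
		return ~0UL;
	return entry_to_pfn(pte);
}

int main(void)
{
	pte_val_t entry = (0x1234UL << 12) | MY_PRESENT;

	printf("pfn = 0x%lx\n", lookup_pfn(&entry));
	return 0;
}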

Cc: Mahesh Salgaonkar 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kernel/mce_power.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 067b094bfeff..1d18991f3854 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -27,7 +27,7 @@
  */
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
 {
-   pte_t *ptep;
+   pte_t *ptep, pte;
unsigned int shift;
unsigned long pfn, flags;
struct mm_struct *mm;
@@ -39,19 +39,23 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr)
 
local_irq_save(flags);
ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
+   if (!ptep) {
+   pfn = ULONG_MAX;
+   goto out;
+   }
+   pte = READ_ONCE(*ptep);
 
-   if (!ptep || pte_special(*ptep)) {
+   if (!pte_present(pte) || pte_special(pte)) {
pfn = ULONG_MAX;
goto out;
}
 
if (shift <= PAGE_SHIFT)
-   pfn = pte_pfn(*ptep);
+   pfn = pte_pfn(pte);
else {
unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
-   pfn = pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask)));
+   pfn = pte_pfn(__pte(pte_val(pte) | (addr & rpnmask)));
}
-
 out:
local_irq_restore(flags);
return pfn;
-- 
2.26.2



[PATCH v4 07/22] powerpc/perf/callchain: Use __get_user_pages_fast in read_user_stack_slow

2020-05-05 Thread Aneesh Kumar K.V
read_user_stack_slow() is called with interrupts soft-disabled and copies
contents from the page that is found mapped at a given address. To convert
a userspace address to a pfn, the kernel currently uses a lockless page table
walk.

The kernel needs to make sure the pfn value read remains stable and is not 
released
and reused for another process while the contents are read from the page. This
can only be achieved by holding a page reference.

One of the first approaches I tried was to check the pte value after the kernel
copies the contents from the page. But as shown below we can still get it wrong

CPU0   CPU1
pte = READ_ONCE(*ptep);
   pte_clear(pte);
   put_page(page);
   page = alloc_page();
   memcpy(page_address(page), "secret password", 
nr);
memcpy(buf, kaddr + offset, nb);
   put_page(page);
   handle_mm_fault()
   page = alloc_page();
   set_pte(pte, page);
if (pte_val(pte) != pte_val(*ptep))

Hence switch to __get_user_pages_fast.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/perf/callchain_64.c | 46 ++--
 1 file changed, 14 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c
index df1ffd8b20f2..b63086b663ef 100644
--- a/arch/powerpc/perf/callchain_64.c
+++ b/arch/powerpc/perf/callchain_64.c
@@ -26,43 +26,25 @@
  */
 int read_user_stack_slow(void __user *ptr, void *buf, int nb)
 {
-   int ret = -EFAULT;
-   pgd_t *pgdir;
-   pte_t *ptep, pte;
-   unsigned int shift;
+
unsigned long addr = (unsigned long) ptr;
unsigned long offset;
-   unsigned long pfn, flags;
+   struct page *page;
+   int nrpages;
void *kaddr;
 
-   pgdir = current->mm->pgd;
-   if (!pgdir)
-   return -EFAULT;
+   nrpages = __get_user_pages_fast(addr, 1, 1, &page);
+   if (nrpages == 1) {
+   kaddr = page_address(page);
+
+   /* align address to page boundary */
+   offset = addr & ~PAGE_MASK;
 
-   local_irq_save(flags);
-   ptep = find_current_mm_pte(pgdir, addr, NULL, &shift);
-   if (!ptep)
-   goto err_out;
-   if (!shift)
-   shift = PAGE_SHIFT;
-
-   /* align address to page boundary */
-   offset = addr & ((1UL << shift) - 1);
-
-   pte = READ_ONCE(*ptep);
-   if (!pte_present(pte) || !pte_user(pte))
-   goto err_out;
-   pfn = pte_pfn(pte);
-   if (!page_is_ram(pfn))
-   goto err_out;
-
-   /* no highmem to worry about here */
-   kaddr = pfn_to_kaddr(pfn);
-   memcpy(buf, kaddr + offset, nb);
-   ret = 0;
-err_out:
-   local_irq_restore(flags);
-   return ret;
+   memcpy(buf, kaddr + offset, nb);
+   put_page(page);
+   return 0;
+   }
+   return -EFAULT;
 }
 
 static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
-- 
2.26.2



[PATCH v4 08/22] powerpc/kvm/book3s: switch from raw_spin_*lock to arch_spin_lock.

2020-05-05 Thread Aneesh Kumar K.V
These functions can get called in real mode. Hence use the low-level
arch_spin_lock(), which is safe to call in real mode.
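
As a rough illustration of why the '.rlock.raw_lock' expression reaches the
arch-level lock, here is a stand-alone mock of the nesting. The field names
mirror the kernel's spinlock_t/raw_spinlock_t/arch_spinlock_t layering;
everything else (including the kvm_mock type) is simplified for the example.
arch_spin_lock() operates on the innermost arch_spinlock_t only and skips
lockdep/preempt accounting, which is what makes it usable from real mode:

#include <stdio.h>

typedef struct { volatile unsigned int slock; } arch_spinlock_t;
typedef struct raw_spinlock { arch_spinlock_t raw_lock; } raw_spinlock_t;
typedef struct spinlock { struct raw_spinlock rlock; } spinlock_t;

struct kvm_mock {
	spinlock_t mmu_lock;
};

int main(void)
{
	struct kvm_mock kvm = { { { { 0 } } } };

	/* same expression as in the patch: peel spinlock_t -> raw_spinlock
	 * -> arch_spinlock_t and hand the innermost lock to arch_spin_lock() */
	arch_spinlock_t *arch = &kvm.mmu_lock.rlock.raw_lock;

	printf("arch lock at %p, slock=%u\n", (void *)arch, arch->slock);
	return 0;
}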

Cc: Suraj Jitindar Singh 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 220305454c23..03f8347de48b 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -948,7 +948,7 @@ static long kvmppc_do_h_page_init_zero(struct kvm_vcpu 
*vcpu,
return ret;
 
/* Check if we've been invalidated */
-   raw_spin_lock(&kvm->mmu_lock.rlock);
+   arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
if (mmu_notifier_retry(kvm, mmu_seq)) {
ret = H_TOO_HARD;
goto out_unlock;
@@ -960,7 +960,7 @@ static long kvmppc_do_h_page_init_zero(struct kvm_vcpu 
*vcpu,
kvmppc_update_dirty_map(memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
 
 out_unlock:
-   raw_spin_unlock(&kvm->mmu_lock.rlock);
+   arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
return ret;
 }
 
@@ -984,7 +984,7 @@ static long kvmppc_do_h_page_init_copy(struct kvm_vcpu 
*vcpu,
return ret;
 
/* Check if we've been invalidated */
-   raw_spin_lock(&kvm->mmu_lock.rlock);
+   arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
if (mmu_notifier_retry(kvm, mmu_seq)) {
ret = H_TOO_HARD;
goto out_unlock;
@@ -996,7 +996,7 @@ static long kvmppc_do_h_page_init_copy(struct kvm_vcpu 
*vcpu,
kvmppc_update_dirty_map(dest_memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
 
 out_unlock:
-   raw_spin_unlock(&kvm->mmu_lock.rlock);
+   arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
return ret;
 }
 
-- 
2.26.2



[PATCH v4 12/22] powerpc/kvm/book3s: Add helper for host page table walk

2020-05-05 Thread Aneesh Kumar K.V
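The helper's contract (kvm->mmu_lock held, and an mmu_notifier sequence
snapshot taken before the translation) is easiest to see from the way later
patches in this series call it; a condensed sketch of that calling pattern,
not code from this patch:

static pte_t kvm_host_pte_sketch(struct kvm *kvm, unsigned long hva)
{
	unsigned long mmu_seq;
	unsigned int shift;
	pte_t *ptep, pte = __pte(0);

	mmu_seq = kvm->mmu_notifier_seq;	/* snapshot before the walk */
	smp_rmb();

	spin_lock(&kvm->mmu_lock);
	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
	if (ptep)
		pte = READ_ONCE(*ptep);
	spin_unlock(&kvm->mmu_lock);

	return pte;
}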
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 16 
 1 file changed, 16 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 2860521992b6..1ca1f6495012 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -647,6 +647,22 @@ static inline pte_t *find_kvm_secondary_pte(struct kvm 
*kvm, unsigned long ea,
return pte;
 }
 
+static inline pte_t *find_kvm_host_pte(struct kvm *kvm, unsigned long mmu_seq,
+  unsigned long ea, unsigned *hshift)
+{
+   pte_t *pte;
+
+   VM_WARN(!spin_is_locked(&kvm->mmu_lock),
+   "%s called with kvm mmu_lock not held \n", __func__);
+
+   if (mmu_notifier_retry(kvm, mmu_seq))
+   return NULL;
+
+   pte = __find_linux_pte(kvm->mm->pgd, ea, NULL, hshift);
+
+   return pte;
+}
+
 extern pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid,
unsigned long ea, unsigned *hshift);
 
-- 
2.26.2



[PATCH v4 14/22] powerpc/kvm/book3s: Use find_kvm_host_pte in h_enter

2020-05-05 Thread Aneesh Kumar K.V
Since kvmppc_do_h_enter() can get called in real mode, use the low-level
arch_spin_lock(), which is safe to call in real mode.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c |  5 ++---
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 22 ++
 2 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 38e934dc1714..8d9725445e4f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -281,11 +281,10 @@ static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, 
unsigned long flags,
 {
long ret;
 
-   /* Protect linux PTE lookup from page table destruction */
-   rcu_read_lock_sched();  /* this disables preemption too */
+   preempt_disable();
ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
kvm->mm->pgd, false, pte_idx_ret);
-   rcu_read_unlock_sched();
+   preempt_enable();
if (ret == H_TOO_HARD) {
/* this can't happen */
pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 03f8347de48b..83e987fecf97 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -210,7 +210,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
pte_t *ptep;
unsigned int writing;
unsigned long mmu_seq;
-   unsigned long rcbits, irq_flags = 0;
+   unsigned long rcbits;
 
if (kvm_is_radix(kvm))
return H_FUNCTION;
@@ -248,17 +248,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long 
flags,
 
/* Translate to host virtual address */
hva = __gfn_to_hva_memslot(memslot, gfn);
-   /*
-* If we had a page table table change after lookup, we would
-* retry via mmu_notifier_retry.
-*/
-   if (!realmode)
-   local_irq_save(irq_flags);
-   /*
-* If called in real mode we have MSR_EE = 0. Otherwise
-* we disable irq above.
-*/
-   ptep = __find_linux_pte(pgdir, hva, NULL, &hpage_shift);
+
+   arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
+   ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &hpage_shift);
if (ptep) {
pte_t pte;
unsigned int host_pte_size;
@@ -272,8 +264,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 * to <= host page size, if host is using hugepage
 */
if (host_pte_size < psize) {
-   if (!realmode)
-   local_irq_restore(flags);
+   arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
return H_PARAMETER;
}
pte = kvmppc_read_update_linux_pte(ptep, writing);
@@ -287,8 +278,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
pa |= gpa & ~PAGE_MASK;
}
}
-   if (!realmode)
-   local_irq_restore(irq_flags);
+   arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
 
ptel &= HPTE_R_KEY | HPTE_R_PP0 | (psize-1);
ptel |= pa;
-- 
2.26.2



[PATCH v4 15/22] powerpc/kvm/book3s: use find_kvm_host_pte in pute_tce functions

2020-05-05 Thread Aneesh Kumar K.V
The current code just holds the rmap lock to ensure a parallel page table
update is prevented. That is not sufficient. The kernel should also check
whether an mmu_notifier callback was running in parallel.
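
That check is what mmu_notifier_retry() provides; roughly (simplified from
the generic KVM code, shown only to illustrate the idea) it fails the lookup
if an invalidation is in flight or if the sequence count moved since the
caller sampled it:

static inline int mmu_notifier_retry_sketch(struct kvm *kvm, unsigned long mmu_seq)
{
	if (unlikely(kvm->mmu_notifier_count))
		return 1;	/* an invalidate_range_start() is still running */
	if (kvm->mmu_notifier_seq != mmu_seq)
		return 1;	/* an invalidation completed since the snapshot */
	return 0;
}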

Cc: Alexey Kardashevskiy 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_64_vio_hv.c | 30 +++--
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 6fcaf1fa8e02..acc3ce570be7 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -437,8 +437,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned 
long liobn,
return H_SUCCESS;
 }
 
-static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
-   unsigned long ua, unsigned long *phpa)
+static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq,
+   unsigned long ua, unsigned long *phpa)
 {
pte_t *ptep, pte;
unsigned shift = 0;
@@ -452,10 +452,17 @@ static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
 * to exit which will agains result in the below page table walk
 * to finish.
 */
-   ptep = __find_linux_pte(vcpu->arch.pgdir, ua, NULL, &shift);
-   if (!ptep || !pte_present(*ptep))
+   /* an rmap lock won't make it safe. because that just ensure hash
+* page table entries are removed with rmap lock held. After that
+* mmu notifier returns and we go ahead and removing ptes from Qemu 
page table.
+*/
+   ptep = find_kvm_host_pte(vcpu->kvm, mmu_seq, ua, &shift);
+   if (!ptep)
+   return -ENXIO;
+
+   pte = READ_ONCE(*ptep);
+   if (!pte_present(pte))
return -ENXIO;
-   pte = *ptep;
 
if (!shift)
shift = PAGE_SHIFT;
@@ -477,10 +484,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
unsigned long liobn, unsigned long ioba,
unsigned long tce_list, unsigned long npages)
 {
+   struct kvm *kvm = vcpu->kvm;
struct kvmppc_spapr_tce_table *stt;
long i, ret = H_SUCCESS;
unsigned long tces, entry, ua = 0;
unsigned long *rmap = NULL;
+   unsigned long mmu_seq;
bool prereg = false;
struct kvmppc_spapr_tce_iommu_table *stit;
 
@@ -488,6 +497,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
if (kvm_is_radix(vcpu->kvm))
return H_TOO_HARD;
 
+   /*
+* used to check for invalidations in progress
+*/
+   mmu_seq = kvm->mmu_notifier_seq;
+   smp_rmb();
+
stt = kvmppc_find_table(vcpu->kvm, liobn);
if (!stt)
return H_TOO_HARD;
@@ -547,7 +562,9 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 * real page.
 */
lock_rmap(rmap);
-   if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
+
+   arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
+   if (kvmppc_rm_ua_to_hpa(vcpu, mmu_seq, ua, &tces)) {
ret = H_TOO_HARD;
goto unlock_exit;
}
@@ -593,6 +610,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
if (rmap)
unlock_rmap(rmap);
 
+   arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
return ret;
 }
 
-- 
2.26.2



Re: [PATCH v2 0/5] Statsfs: a new ram-based file sytem for Linux kernel statistics

2020-05-05 Thread Emanuele Giuseppe Esposito




On 5/4/20 11:37 PM, David Rientjes wrote:

On Mon, 4 May 2020, Emanuele Giuseppe Esposito wrote:



In this patch series I introduce statsfs, a synthetic ram-based virtual
filesystem that takes care of gathering and displaying statistics for the
Linux kernel subsystems.



This is exciting, we have been looking in the same area recently.  Adding
Jonathan Adams .

In your diffstat, one thing I notice that is omitted: an update to
Documentation/* :)  Any chance of getting some proposed Documentation/
updates with structure of the fs, the per subsystem breakdown, and best
practices for managing the stats from the kernel level?


Yes, I will write some documentation. Thank you for the suggestion.



Values represent quantites that are gathered by the statsfs user. Examples
of values include the number of vm exits of a given kind, the amount of
memory used by some data structure, the length of the longest hash table
chain, or anything like that. Values are defined with the
statsfs_source_add_values function. Each value is defined by a struct
statsfs_value; the same statsfs_value can be added to many different
sources. A value can be considered "simple" if it fetches data from a
user-provided location, or "aggregate" if it groups all values in the
subordinates sources that include the same statsfs_value.



This seems like it could have a lot of overhead if we wanted to
periodically track the totality of subsystem stats as a form of telemetry
gathering from userspace.  To collect telemetry for 1,000 different stats,
do we need to issue lseek()+read() syscalls for each of them individually
(or, worse, open()+read()+close())?

Any thoughts on how that can be optimized?  A couple of ideas:

  - an interface that allows gathering of all stats for a particular
interface through a single file that would likely be encoded in binary
and the responsibility of userspace to disseminate, or

  - an interface that extends beyond this proposal and allows the reader to
specify which stats they are interested in collecting and then the
kernel will only provide these stats in a well formed structure and
also be binary encoded.


Are you thinking of another file, containing all the stats for the 
directory in binary format?



We've found that the one-file-per-stat method is pretty much a show
stopper from the performance view and we always must execute at least two
syscalls to obtain a single stat.

Since this is becoming a generic API (good!!), maybe we can discuss
possible ways to optimize gathering of stats in mass?


Sure, the idea of a binary format was considered from the beginning in 
[1], and it can be done either together with the current filesystem, or 
as a replacement via different mount options.
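
Purely as an illustration of the kind of thing a binary interface could
expose (none of this is part of the posted stats_fs API; the record layout
and names are invented for the example):

#include <stdint.h>
#include <stdio.h>

/* hypothetical packed record, one entry per statistic, so a single read()
 * can return a whole directory worth of values */
struct stats_fs_bin_record {
	uint32_t name_off;	/* offset of the value name in a string table */
	uint32_t flags;		/* e.g. simple vs aggregate */
	uint64_t value;
};

int main(void)
{
	struct stats_fs_bin_record recs[2] = {
		{ .name_off = 0,  .flags = 0, .value = 42 },
		{ .name_off = 16, .flags = 1, .value = 7  },
	};
	size_t i;

	for (i = 0; i < sizeof(recs) / sizeof(recs[0]); i++)
		printf("record %zu: flags=%u value=%llu\n", i, recs[i].flags,
		       (unsigned long long)recs[i].value);
	return 0;
}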


Thank you,
Emanuele


[1] 
https://lore.kernel.org/kvm/5d6cdcb1-d8ad-7ae6-7351-3544e2fa3...@redhat.com/?fbclid=IwAR18LHJ0PBcXcDaLzILFhHsl3qpT3z2vlG60RnqgbpGYhDv7L43n0ZXJY8M





Signed-off-by: Emanuele Giuseppe Esposito 

v1->v2 remove unnecessary list_foreach_safe loops, fix wrong indentation,
change statsfs in stats_fs

Emanuele Giuseppe Esposito (5):
   refcount, kref: add dec-and-test wrappers for rw_semaphores
   stats_fs API: create, add and remove stats_fs sources and values
   kunit: tests for stats_fs API
   stats_fs fs: virtual fs to show stats to the end-user
   kvm_main: replace debugfs with stats_fs

  MAINTAINERS |7 +
  arch/arm64/kvm/Kconfig  |1 +
  arch/arm64/kvm/guest.c  |2 +-
  arch/mips/kvm/Kconfig   |1 +
  arch/mips/kvm/mips.c|2 +-
  arch/powerpc/kvm/Kconfig|1 +
  arch/powerpc/kvm/book3s.c   |6 +-
  arch/powerpc/kvm/booke.c|8 +-
  arch/s390/kvm/Kconfig   |1 +
  arch/s390/kvm/kvm-s390.c|   16 +-
  arch/x86/include/asm/kvm_host.h |2 +-
  arch/x86/kvm/Kconfig|1 +
  arch/x86/kvm/Makefile   |2 +-
  arch/x86/kvm/debugfs.c  |   64 --
  arch/x86/kvm/stats_fs.c |   56 ++
  arch/x86/kvm/x86.c  |6 +-
  fs/Kconfig  |   12 +
  fs/Makefile |1 +
  fs/stats_fs/Makefile|6 +
  fs/stats_fs/inode.c |  337 ++
  fs/stats_fs/internal.h  |   35 +
  fs/stats_fs/stats_fs-tests.c| 1088 +++
  fs/stats_fs/stats_fs.c  |  773 ++
  include/linux/kref.h|   11 +
  include/linux/kvm_host.h|   39 +-
  include/linux/refcount.h|2 +
  include/linux/stats_fs.h|  304 +
  include/uapi/linux/magic.h  |1 +
  lib/refcount.c  |   32 +
  tools/lib/api/fs/fs.c   |   21 +
  virt/kvm/arm/arm.c  |2 +-
  virt/kvm/kvm_main.c |  314 ++---
  32 files changed, 2772 insertions(+), 382 deletions(-)
  delete mode 100644 arch/x86/kvm/debugfs.c
  create mode 100644 arch/x86/kvm/stats_fs.c
  

[PATCH v4 16/22] powerpc/kvm/book3s: Avoid using rmap to protect parallel page table update.

2020-05-05 Thread Aneesh Kumar K.V
We now depend on kvm->mmu_lock rather than the rmap lock to protect against
parallel page table updates.

Cc: Alexey Kardashevskiy 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_64_vio_hv.c | 38 +++--
 1 file changed, 9 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index acc3ce570be7..167029e57c8f 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -74,8 +74,8 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm 
*kvm,
 EXPORT_SYMBOL_GPL(kvmppc_find_table);
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long tce,
-   unsigned long *ua, unsigned long **prmap)
+static long kvmppc_rm_tce_to_ua(struct kvm *kvm,
+   unsigned long tce, unsigned long *ua)
 {
unsigned long gfn = tce >> PAGE_SHIFT;
struct kvm_memory_slot *memslot;
@@ -87,9 +87,6 @@ static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned 
long tce,
*ua = __gfn_to_hva_memslot(memslot, gfn) |
(tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
 
-   if (prmap)
-   *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
-
return 0;
 }
 
@@ -116,7 +113,7 @@ static long kvmppc_rm_tce_validate(struct 
kvmppc_spapr_tce_table *stt,
if (iommu_tce_check_gpa(stt->page_shift, gpa))
return H_PARAMETER;
 
-   if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua, NULL))
+   if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua))
return H_TOO_HARD;
 
list_for_each_entry_lockless(stit, >iommu_tables, next) {
@@ -411,7 +408,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned 
long liobn,
return ret;
 
dir = iommu_tce_direction(tce);
-   if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
+   if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua))
return H_PARAMETER;
 
entry = ioba >> stt->page_shift;
@@ -488,7 +485,6 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
struct kvmppc_spapr_tce_table *stt;
long i, ret = H_SUCCESS;
unsigned long tces, entry, ua = 0;
-   unsigned long *rmap = NULL;
unsigned long mmu_seq;
bool prereg = false;
struct kvmppc_spapr_tce_iommu_table *stit;
@@ -530,7 +526,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 */
struct mm_iommu_table_group_mem_t *mem;
 
-   if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
+   if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua))
return H_TOO_HARD;
 
mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
@@ -546,23 +542,9 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 * We do not require memory to be preregistered in this case
 * so lock rmap and do __find_linux_pte_or_hugepte().
 */
-   if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
-   return H_TOO_HARD;
-
-   rmap = (void *) vmalloc_to_phys(rmap);
-   if (WARN_ON_ONCE_RM(!rmap))
+   if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua))
return H_TOO_HARD;
 
-   /*
-* Synchronize with the MMU notifier callbacks in
-* book3s_64_mmu_hv.c (kvm_unmap_hva_range_hv etc.).
-* While we have the rmap lock, code running on other CPUs
-* cannot finish unmapping the host real page that backs
-* this guest real page, so we are OK to access the host
-* real page.
-*/
-   lock_rmap(rmap);
-
arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
if (kvmppc_rm_ua_to_hpa(vcpu, mmu_seq, ua, &tces)) {
ret = H_TOO_HARD;
@@ -582,7 +564,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
 
ua = 0;
-   if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
+   if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) {
ret = H_PARAMETER;
goto invalidate_exit;
}
@@ -607,10 +589,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
iommu_tce_kill_rm(stit->tbl, entry, npages);
 
 unlock_exit:
-   if (rmap)
-   unlock_rmap(rmap);
-
-   arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
+   if (!prereg)
+   arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
return ret;
 }
 
-- 
2.26.2



[PATCH v4 17/22] powerpc/kvm/book3s: use find_kvm_host_pte in kvmppc_book3s_instantiate_page

2020-05-05 Thread Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 70c4025406d8..271f1c3d8443 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -813,12 +813,12 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 * Read the PTE from the process' radix tree and use that
 * so we get the shift and attribute bits.
 */
-   local_irq_disable();
-   ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+   spin_lock(&kvm->mmu_lock);
+   ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
pte = __pte(0);
if (ptep)
-   pte = *ptep;
-   local_irq_enable();
+   pte = READ_ONCE(*ptep);
+   spin_unlock(&kvm->mmu_lock);
/*
 * If the PTE disappeared temporarily due to a THP
 * collapse, just return and let the guest try again.
-- 
2.26.2



[PATCH v4 18/22] powerpc/kvm/book3s: Use find_kvm_host_pte in kvmppc_get_hpa

2020-05-05 Thread Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 32 ++---
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 83e987fecf97..3b168c69d503 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -878,8 +878,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned 
long flags,
return ret;
 }
 
-static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long gpa,
- int writing, unsigned long *hpa,
+static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq,
+ unsigned long gpa, int writing, unsigned long *hpa,
  struct kvm_memory_slot **memslot_p)
 {
struct kvm *kvm = vcpu->kvm;
@@ -898,7 +898,7 @@ static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned 
long gpa,
hva = __gfn_to_hva_memslot(memslot, gfn);
 
/* Try to find the host pte for that virtual address */
-   ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+   ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
if (!ptep)
return H_TOO_HARD;
pte = kvmppc_read_update_linux_pte(ptep, writing);
@@ -933,16 +933,11 @@ static long kvmppc_do_h_page_init_zero(struct kvm_vcpu 
*vcpu,
mmu_seq = kvm->mmu_notifier_seq;
smp_rmb();
 
-   ret = kvmppc_get_hpa(vcpu, dest, 1, &pa, &memslot);
-   if (ret != H_SUCCESS)
-   return ret;
-
-   /* Check if we've been invalidated */
arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
-   if (mmu_notifier_retry(kvm, mmu_seq)) {
-   ret = H_TOO_HARD;
+
+   ret = kvmppc_get_hpa(vcpu, mmu_seq, dest, 1, &pa, &memslot);
+   if (ret != H_SUCCESS)
goto out_unlock;
-   }
 
/* Zero the page */
for (i = 0; i < SZ_4K; i += L1_CACHE_BYTES, pa += L1_CACHE_BYTES)
@@ -966,19 +961,14 @@ static long kvmppc_do_h_page_init_copy(struct kvm_vcpu 
*vcpu,
mmu_seq = kvm->mmu_notifier_seq;
smp_rmb();
 
-   ret = kvmppc_get_hpa(vcpu, dest, 1, &dest_pa, &dest_memslot);
-   if (ret != H_SUCCESS)
-   return ret;
-   ret = kvmppc_get_hpa(vcpu, src, 0, &src_pa, NULL);
+   arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
+   ret = kvmppc_get_hpa(vcpu, mmu_seq, dest, 1, &dest_pa, &dest_memslot);
if (ret != H_SUCCESS)
-   return ret;
+   goto out_unlock;
 
-   /* Check if we've been invalidated */
-   arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
-   if (mmu_notifier_retry(kvm, mmu_seq)) {
-   ret = H_TOO_HARD;
+   ret = kvmppc_get_hpa(vcpu, mmu_seq, src, 0, &src_pa, NULL);
+   if (ret != H_SUCCESS)
goto out_unlock;
-   }
 
/* Copy the page */
memcpy((void *)dest_pa, (void *)src_pa, SZ_4K);
-- 
2.26.2



[PATCH v4 19/22] powerpc/kvm/book3s: Use pte_present instead of opencoding _PAGE_PRESENT check

2020-05-05 Thread Aneesh Kumar K.V
This adds a _PAGE_PTE check and makes sure we validate the pte value returned
via find_kvm_host_pte().

NOTE: pte_present() also treats _PAGE_INVALID, the software valid bit, as
present.
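
For reference, on book3s64 pte_present() accepts either the hardware present
bit or the software-invalid bit, roughly as below (simplified; treat the exact
expression as illustrative rather than a copy of the header):

static inline int pte_present_sketch(pte_t pte)
{
	/* _PAGE_INVALID marks a pte that is present but temporarily
	 * invalidated, so it is still treated as present here */
	return !!(pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID));
}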

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 1ca1f6495012..c58e64a0a74f 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -435,7 +435,7 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t 
*ptep, int writing)
continue;
}
/* If pte is not present return None */
-   if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT)))
+   if (unlikely(!pte_present(old_pte)))
return __pte(0);
 
new_pte = pte_mkyoung(old_pte);
-- 
2.26.2



[PATCH v4 21/22] mm: change pmdp_huge_get_and_clear_full take vm_area_struct as arg

2020-05-05 Thread Aneesh Kumar K.V
We will use this in a later patch to do a TLB flush when clearing pmd entries.

Cc: kir...@shutemov.name
Cc: a...@linux-foundation.org
Signed-off-by: Aneesh Kumar K.V 
---
 arch/s390/include/asm/pgtable.h | 4 ++--
 include/asm-generic/pgtable.h   | 4 ++--
 mm/huge_memory.c| 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 6076c8c912d2..e2528e057980 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1560,7 +1560,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct 
mm_struct *mm,
 }
 
 #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
 unsigned long addr,
 pmd_t *pmdp, int full)
 {
@@ -1569,7 +1569,7 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct 
mm_struct *mm,
*pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
return pmd;
}
-   return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
+   return pmdp_xchg_lazy(vma->vm_mm, addr, pmdp, 
__pmd(_SEGMENT_ENTRY_EMPTY));
 }
 
 #define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 329b8c8ca703..d10be362eafa 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -159,11 +159,11 @@ static inline pud_t pudp_huge_get_and_clear(struct 
mm_struct *mm,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
int full)
 {
-   return pmdp_huge_get_and_clear(mm, address, pmdp);
+   return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 }
 #endif
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6ecd1045113b..16f2bd6f1549 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1852,8 +1852,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
 * pgtable_trans_huge_withdraw after finishing pmdp related
 * operations.
 */
-   orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
-   tlb->fullmm);
+   orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
+   tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
-- 
2.26.2



[PATCH v4 22/22] powerpc/mm/book3s64: Fix MADV_DONTNEED and parallel page fault race

2020-05-05 Thread Aneesh Kumar K.V
MADV_DONTNEED holds mmap_sem in read mode, which implies a
parallel page fault is possible and the kernel can end up with a level-1 PTE
entry (THP entry) converted to a level-0 PTE entry without flushing
the THP TLB entry.

Most architectures, including POWER, have issues with the kernel instantiating
a level-0 PTE entry while holding level-1 TLB entries.

The code sequence I am looking at is:

CPU 0 (MADV_DONTNEED)                   CPU 1 (parallel page fault)

down_read(mmap_sem)                     down_read(mmap_sem)

zap_pmd_range()
 zap_huge_pmd()
  pmd lock held
  pmd_cleared
  table details added to mmu_gather
  pmd_unlock()
                                        insert a level 0 PTE entry()

tlb_finish_mmu().

Fix this by forcing a tlb flush before releasing pmd lock if this is
not a fullmm invalidate. We can safely skip this invalidate for
task exit case (fullmm invalidate) because in that case we are sure
there can be no parallel fault handlers.

This does change the QEMU guest RAM del/unplug time as below:

128 core, 496GB guest:

Without patch:
munmap start: timer = 196449 ms, PID=6681
munmap finish: timer = 196488 ms, PID=6681 - delta = 39ms

With patch:
munmap start: timer = 196345 ms, PID=6879
munmap finish: timer = 196714 ms, PID=6879 - delta = 369ms

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  5 +
 arch/powerpc/mm/book3s64/pgtable.c   | 18 ++
 2 files changed, 23 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 03521a8b0292..e1f551159f7d 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1265,6 +1265,11 @@ static inline pmd_t pmdp_collapse_flush(struct 
vm_area_struct *vma,
 }
 #define pmdp_collapse_flush pmdp_collapse_flush
 
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
+  unsigned long addr,
+  pmd_t *pmdp, int full);
+
 #define __HAVE_ARCH_PGTABLE_DEPOSIT
 static inline void pgtable_trans_huge_deposit(struct mm_struct *mm,
  pmd_t *pmdp, pgtable_t pgtable)
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 127325ead505..54b6d6d103ea 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -112,6 +112,24 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned 
long address,
return __pmd(old_pmd);
 }
 
+pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
+  unsigned long addr, pmd_t *pmdp, int full)
+{
+   pmd_t pmd;
+   VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+   VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+  !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
+   pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
+   /*
+* if it not a fullmm flush, then we can possibly end up converting
+* this PMD pte entry to a regular level 0 PTE by a parallel page fault.
+* Make sure we flush the tlb in this case.
+*/
+   if (!full)
+   flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+   return pmd;
+}
+
 static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
 {
return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
-- 
2.26.2



Re: [PATCH v7 04/28] powerpc/xmon: Use bitwise calculations in_breakpoint_table()

2020-05-05 Thread Jordan Niethe
On Tue, May 5, 2020 at 5:08 PM Michael Ellerman  wrote:
>
> Jordan Niethe  writes:
> > A modulo operation is used for calculating the current offset from a
> > breakpoint within the breakpoint table. As instruction lengths are
> > always a power of 2, this can be replaced with a bitwise 'and'. The
> > current check for word alignment can be replaced with checking that the
> > lower 2 bits are not set.
> >
> > Suggested-by: Christophe Leroy 
> > Signed-off-by: Jordan Niethe 
> > ---
> > v6: New to series
> > ---
> >  arch/powerpc/xmon/xmon.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> > index bbfea22f4a96..e122f0c8a044 100644
> > --- a/arch/powerpc/xmon/xmon.c
> > +++ b/arch/powerpc/xmon/xmon.c
> > @@ -857,8 +857,8 @@ static struct bpt *in_breakpoint_table(unsigned long 
> > nip, unsigned long *offp)
> >   off = nip - (unsigned long) bpt_table;
> >   if (off >= sizeof(bpt_table))
> >   return NULL;
> > - *offp = off % BPT_SIZE;
> > - if (*offp != 0 && *offp != 4)
> > + *offp = off & (BPT_SIZE - 1);
> > + if (off & 3)
> >   return NULL;
>
> It would be even better if you didn't hard code the 3 wouldn't it?
>
The three is just checking word alignment, which I think was the
intention of the previous
- if (*offp != 0 && *offp != 4)
But using BPT_SIZE is a different calculation:
BPT_SIZE == 2 * sizeof(unsigned int) == 8,
which would mean the check would return NULL even for the trap
instruction of the breakpoint pair (offset 4).
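
A quick stand-alone check of the two calculations (user-space; BPT_SIZE taken
as two 4-byte instructions, i.e. 8):

#include <assert.h>
#include <stdio.h>

#define BPT_SIZE 8	/* two 4-byte instructions per breakpoint slot */

int main(void)
{
	unsigned long off;

	/* modulo and mask agree because BPT_SIZE is a power of two */
	for (off = 0; off < 4 * BPT_SIZE; off += 4)
		assert(off % BPT_SIZE == (off & (BPT_SIZE - 1)));

	/* word-alignment: offset 4 (the trap of the pair) passes "off & 3"
	 * but would be rejected by "off & (BPT_SIZE - 1)" */
	printf("off=4: off & 3 = %lu, off & (BPT_SIZE - 1) = %lu\n",
	       4UL & 3, 4UL & (BPT_SIZE - 1));
	return 0;
}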

> eg:
>
> +   *offp = off & (BPT_SIZE - 1);
> +   if (off & (BPT_SIZE - 1))
> return NULL;

>
> cheers


Re: [PATCH v7 25/28] powerpc: Test prefixed instructions in feature fixups

2020-05-05 Thread Jordan Niethe
On Tue, May 5, 2020 at 5:15 PM Alistair Popple  wrote:
>
> Hmm, I was hoping to add a tested by but I'm seeing the following failure in
> Mambo:
>
> [1.475459] feature-fixups: test failed at line 730
>
> Based on the name of the test it looks like you probably made a copy/paste
> error in ftr_fixup_prefix2_expected. I suspect you probably meant to use the 
> alt
> fixup:
>
> globl(ftr_fixup_prefix2_expected)
> or  1,1,1
> .long 0x700
> .long 0x001
> or  2,2,2
Thanks, I changed from using 0x700 to 1 << 26 but missed it here.
Changing that fixes this.
>
> Also for some reason these tests (and one of the code-patching tests) aren't
> passing on big endian.
Okay, will fix that.
>
> - Alistair
>
> On Friday, 1 May 2020 1:42:17 PM AEST Jordan Niethe wrote:
> > Expand the feature-fixups self-tests to includes tests for prefixed
> > instructions.
> >
> > Signed-off-by: Jordan Niethe 
> > ---
> > v6: New to series
> > ---
> >  arch/powerpc/lib/feature-fixups-test.S | 68 +++
> >  arch/powerpc/lib/feature-fixups.c  | 74 ++
> >  2 files changed, 142 insertions(+)
> >
> > diff --git a/arch/powerpc/lib/feature-fixups-test.S
> > b/arch/powerpc/lib/feature-fixups-test.S index b12168c2447a..6e2da9123a9b
> > 100644
> > --- a/arch/powerpc/lib/feature-fixups-test.S
> > +++ b/arch/powerpc/lib/feature-fixups-test.S
> > @@ -791,3 +791,71 @@ globl(lwsync_fixup_test_expected_SYNC)
> >  1:   or  1,1,1
> >   sync
> >
> > +globl(ftr_fixup_prefix1)
> > + or  1,1,1
> > + .long 1 << 26
> > + .long 0x000
> > + or  2,2,2
> > +globl(end_ftr_fixup_prefix1)
> > +
> > +globl(ftr_fixup_prefix1_orig)
> > + or  1,1,1
> > + .long 1 << 26
> > + .long 0x000
> > + or  2,2,2
> > +
> > +globl(ftr_fixup_prefix1_expected)
> > + or  1,1,1
> > + nop
> > + nop
> > + or  2,2,2
> > +
> > +globl(ftr_fixup_prefix2)
> > + or  1,1,1
> > + .long 1 << 26
> > + .long 0x000
> > + or  2,2,2
> > +globl(end_ftr_fixup_prefix2)
> > +
> > +globl(ftr_fixup_prefix2_orig)
> > + or  1,1,1
> > + .long 1 << 26
> > + .long 0x000
> > + or  2,2,2
> > +
> > +globl(ftr_fixup_prefix2_alt)
> > + .long 0x700
> > + .long 0x001
> > +
> > +globl(ftr_fixup_prefix2_expected)
> > + or  1,1,1
> > + .long 1 << 26
> > + .long 0x001
> > + or  2,2,2
> > +
> > +globl(ftr_fixup_prefix3)
> > + or  1,1,1
> > + .long 1 << 26
> > + .long 0x000
> > + or  2,2,2
> > + or  3,3,3
> > +globl(end_ftr_fixup_prefix3)
> > +
> > +globl(ftr_fixup_prefix3_orig)
> > + or  1,1,1
> > + .long 1 << 26
> > + .long 0x000
> > + or  2,2,2
> > + or  3,3,3
> > +
> > +globl(ftr_fixup_prefix3_alt)
> > + .long 1 << 26
> > + .long 0x001
> > + nop
> > +
> > +globl(ftr_fixup_prefix3_expected)
> > + or  1,1,1
> > + .long 1 << 26
> > + .long 0x001
> > + nop
> > + or  3,3,3
> > diff --git a/arch/powerpc/lib/feature-fixups.c
> > b/arch/powerpc/lib/feature-fixups.c index 243011f85287..6fc499b1d63e 100644
> > --- a/arch/powerpc/lib/feature-fixups.c
> > +++ b/arch/powerpc/lib/feature-fixups.c
> > @@ -687,6 +687,75 @@ static void test_lwsync_macros(void)
> >   }
> >  }
> >
> > +#ifdef __powerpc64__
> > +static void __init test_prefix_patching(void)
> > +{
> > + extern unsigned int ftr_fixup_prefix1[];
> > + extern unsigned int end_ftr_fixup_prefix1[];
> > + extern unsigned int ftr_fixup_prefix1_orig[];
> > + extern unsigned int ftr_fixup_prefix1_expected[];
> > + int size = sizeof(unsigned int) * (end_ftr_fixup_prefix1 -
> > ftr_fixup_prefix1); +
> > + fixup.value = fixup.mask = 8;
> > + fixup.start_off = calc_offset(, ftr_fixup_prefix1 + 1);
> > + fixup.end_off = calc_offset(, ftr_fixup_prefix1 + 3);
> > + fixup.alt_start_off = fixup.alt_end_off = 0;
> > +
> > + /* Sanity check */
> > + check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_orig, size) == 0);
> > +
> > + patch_feature_section(0, );
> > + check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_expected, size) == 
> > 0);
> > + check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_orig, size) != 0);
> > +}
> > +
> > +static void __init test_prefix_alt_patching(void)
> > +{
> > + extern unsigned int ftr_fixup_prefix2[];
> > + extern unsigned int end_ftr_fixup_prefix2[];
> > + extern unsigned int ftr_fixup_prefix2_orig[];
> > + extern unsigned int ftr_fixup_prefix2_expected[];
> > + extern unsigned int ftr_fixup_prefix2_alt[];
> > + int size = sizeof(unsigned int) * (end_ftr_fixup_prefix2 -
> > ftr_fixup_prefix2); +
> > + fixup.value = fixup.mask = 8;
> > + fixup.start_off = calc_offset(, ftr_fixup_prefix2 + 1);
> > + fixup.end_off = calc_offset(, ftr_fixup_prefix2 + 3);
> > + 

[PATCH v4 20/22] powerpc/mm/book3s64: Avoid sending IPI on clearing PMD

2020-05-05 Thread Aneesh Kumar K.V
Now that all the lockless page table walks are careful w.r.t. the PTE
address returned, we can revert
commit 13bd817bb884 ("powerpc/thp: Serialize pmd clear against a linux page
table walk.")

We also drop the equivalent IPI from the other pte update routines. We still
keep the IPI in hash pmdp collapse, and that is to take care of parallel hash
page table inserts. The radix pmdp collapse flush can possibly be removed once
I am sure generic code doesn't have any expectations around parallel gup
walks.
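
For context, the IPI in question is what serialize_against_pte_lookup() boils
down to, sketched here from the powerpc implementation: run an empty function
on every CPU that may be using the mm, with wait=1, so the caller only returns
once every IRQ-disabled lockless walker has finished.

static void do_nothing(void *unused)
{
}

static void serialize_against_pte_lookup_sketch(struct mm_struct *mm)
{
	smp_mb();
	smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
}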

This speeds up Qemu guest RAM del/unplug time as below

128 core, 496GB guest:

Without patch:
munmap start: timer = 13162 ms, PID=7684
munmap finish: timer = 95312 ms, PID=7684 - delta = 82150 ms

With patch:
munmap start: timer = 196449 ms, PID=6681
munmap finish: timer = 196488 ms, PID=6681 - delta = 39ms

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/hash_pgtable.c  | 11 ---
 arch/powerpc/mm/book3s64/pgtable.c   |  8 
 arch/powerpc/mm/book3s64/radix_pgtable.c | 19 +++
 3 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c 
b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 64733b9cb20a..64ca375278dc 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -363,17 +363,6 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 * hash fault look at them.
 */
memset(pgtable, 0, PTE_FRAG_SIZE);
-   /*
-* Serialize against find_current_mm_pte variants which does lock-less
-* lookup in page tables with local interrupts disabled. For huge pages
-* it casts pmd_t to pte_t. Since format of pte_t is different from
-* pmd_t we want to prevent transit from pmd pointing to page table
-* to pmd pointing to huge page (and back) while interrupts are 
disabled.
-* We clear pmd to possibly replace it with page table pointer in
-* different code paths. So make sure we wait for the parallel
-* find_curren_mm_pte to finish.
-*/
-   serialize_against_pte_lookup(mm);
return old_pmd;
 }
 
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index e0bb69c616e4..127325ead505 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -109,14 +109,6 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned 
long address,
 
old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 
_PAGE_INVALID);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-   /*
-* This ensures that generic code that rely on IRQ disabling
-* to prevent a parallel THP split work as expected.
-*
-* Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires
-* a special case check in pmd_access_permitted.
-*/
-   serialize_against_pte_lookup(vma->vm_mm);
return __pmd(old_pmd);
 }
 
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 8f9edf07063a..dfb9fe92aea8 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -962,7 +962,13 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct 
*vma, unsigned long addre
pmd = *pmdp;
pmd_clear(pmdp);
 
-   /*FIXME!!  Verify whether we need this kick below */
+   /*
+* pmdp collapse_flush need to ensure that there are no parallel gup
+* walk after this call. This is needed so that we can have stable
+* page ref count when collapsing a page. We don't allow a collapse page
+* if we have gup taken on the page. We can ensure that by sending IPI
+* because gup walk happens with IRQ disabled.
+*/
serialize_against_pte_lookup(vma->vm_mm);
 
radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
@@ -1023,17 +1029,6 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct 
*mm,
 
old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
old_pmd = __pmd(old);
-   /*
-* Serialize against find_current_mm_pte which does lock-less
-* lookup in page tables with local interrupts disabled. For huge pages
-* it casts pmd_t to pte_t. Since format of pte_t is different from
-* pmd_t we want to prevent transit from pmd pointing to page table
-* to pmd pointing to huge page (and back) while interrupts are 
disabled.
-* We clear pmd to possibly replace it with page table pointer in
-* different code paths. So make sure we wait for the parallel
-* find_current_mm_pte to finish.
-*/
-   serialize_against_pte_lookup(mm);
return old_pmd;
 }
 
-- 
2.26.2



Re: [PATCH 1/2] powerpc/spufs: fix copy_to_user while atomic

2020-05-05 Thread Michael Ellerman
Christoph Hellwig  writes:
> powerpc maintainers,

There's only one of me.

> are you going to pick this up for the next -rc1?  I'm waiting for it to
> hit upstream before resending the coredump series.

I thought you were going to take it in your series. 

Otherwise you'll be waiting 4 or more weeks before this hits rc1.

I can put it in a topic branch if you're worried about merge conflicts.

There's also the fcheck() RCU fix I need to repost as a proper patch, it
seems to work.

cheers

> On Wed, Apr 29, 2020 at 03:03:02PM +0800, Jeremy Kerr wrote:
>> Currently, we may perform a copy_to_user (through
>> simple_read_from_buffer()) while holding a context's register_lock,
>> while accessing the context save area.
>> 
>> This change uses a temporary buffer for the context save area data,
>> which we then pass to simple_read_from_buffer.
>> 
>> Includes changes from Christoph Hellwig .
>> 
>> Fixes: bf1ab978be23 ("[POWERPC] coredump: Add SPU elf notes to coredump.")
>> Signed-off-by: Jeremy Kerr 
>> Reviewed-by: Arnd Bergmann 
>> Reviewed-by: Christoph Hellwig 
>> ---
>>  arch/powerpc/platforms/cell/spufs/file.c | 113 +++
>>  1 file changed, 75 insertions(+), 38 deletions(-)
>> 
>> diff --git a/arch/powerpc/platforms/cell/spufs/file.c 
>> b/arch/powerpc/platforms/cell/spufs/file.c
>> index c0f950a3f4e1..b4e1ef650b40 100644
>> --- a/arch/powerpc/platforms/cell/spufs/file.c
>> +++ b/arch/powerpc/platforms/cell/spufs/file.c
>> @@ -1978,8 +1978,9 @@ static ssize_t __spufs_mbox_info_read(struct 
>> spu_context *ctx,
>>  static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
>> size_t len, loff_t *pos)
>>  {
>> -int ret;
>>  struct spu_context *ctx = file->private_data;
>> +u32 stat, data;
>> +int ret;
>>  
>>  if (!access_ok(buf, len))
>>  return -EFAULT;
>> @@ -1988,11 +1989,16 @@ static ssize_t spufs_mbox_info_read(struct file 
>> *file, char __user *buf,
>>  if (ret)
>>  return ret;
>>  spin_lock(>csa.register_lock);
>> -ret = __spufs_mbox_info_read(ctx, buf, len, pos);
>> +stat = ctx->csa.prob.mb_stat_R;
>> +data = ctx->csa.prob.pu_mb_R;
>>  spin_unlock(>csa.register_lock);
>>  spu_release_saved(ctx);
>>  
>> -return ret;
>> +/* EOF if there's no entry in the mbox */
>> +if (!(stat & 0xff))
>> +return 0;
>> +
>> +return simple_read_from_buffer(buf, len, pos, , sizeof(data));
>>  }
>>  
>>  static const struct file_operations spufs_mbox_info_fops = {
>> @@ -2019,6 +2025,7 @@ static ssize_t spufs_ibox_info_read(struct file *file, 
>> char __user *buf,
>> size_t len, loff_t *pos)
>>  {
>>  struct spu_context *ctx = file->private_data;
>> +u32 stat, data;
>>  int ret;
>>  
>>  if (!access_ok(buf, len))
>> @@ -2028,11 +2035,16 @@ static ssize_t spufs_ibox_info_read(struct file 
>> *file, char __user *buf,
>>  if (ret)
>>  return ret;
>>  spin_lock(>csa.register_lock);
>> -ret = __spufs_ibox_info_read(ctx, buf, len, pos);
>> +stat = ctx->csa.prob.mb_stat_R;
>> +data = ctx->csa.priv2.puint_mb_R;
>>  spin_unlock(>csa.register_lock);
>>  spu_release_saved(ctx);
>>  
>> -return ret;
>> +/* EOF if there's no entry in the ibox */
>> +if (!(stat & 0xff))
>> +return 0;
>> +
>> +return simple_read_from_buffer(buf, len, pos, , sizeof(data));
>>  }
>>  
>>  static const struct file_operations spufs_ibox_info_fops = {
>> @@ -2041,6 +2053,11 @@ static const struct file_operations 
>> spufs_ibox_info_fops = {
>>  .llseek  = generic_file_llseek,
>>  };
>>  
>> +static size_t spufs_wbox_info_cnt(struct spu_context *ctx)
>> +{
>> +return (4 - ((ctx->csa.prob.mb_stat_R & 0x00ff00) >> 8)) * sizeof(u32);
>> +}
>> +
>>  static ssize_t __spufs_wbox_info_read(struct spu_context *ctx,
>>  char __user *buf, size_t len, loff_t *pos)
>>  {
>> @@ -2049,7 +2066,7 @@ static ssize_t __spufs_wbox_info_read(struct 
>> spu_context *ctx,
>>  u32 wbox_stat;
>>  
>>  wbox_stat = ctx->csa.prob.mb_stat_R;
>> -cnt = 4 - ((wbox_stat & 0x00ff00) >> 8);
>> +cnt = spufs_wbox_info_cnt(ctx);
>>  for (i = 0; i < cnt; i++) {
>>  data[i] = ctx->csa.spu_mailbox_data[i];
>>  }
>> @@ -2062,7 +2079,8 @@ static ssize_t spufs_wbox_info_read(struct file *file, 
>> char __user *buf,
>> size_t len, loff_t *pos)
>>  {
>>  struct spu_context *ctx = file->private_data;
>> -int ret;
>> +u32 data[ARRAY_SIZE(ctx->csa.spu_mailbox_data)];
>> +int ret, count;
>>  
>>  if (!access_ok(buf, len))
>>  return -EFAULT;
>> @@ -2071,11 +2089,13 @@ static ssize_t spufs_wbox_info_read(struct file 
>> *file, char __user *buf,
>>  if (ret)
>>  return ret;
>>  spin_lock(>csa.register_lock);
>> -ret = __spufs_wbox_info_read(ctx, buf, len, pos);
>> +

Re: [PATCH 1/2] powerpc/spufs: fix copy_to_user while atomic

2020-05-05 Thread Christoph Hellwig
On Tue, May 05, 2020 at 05:20:54PM +1000, Michael Ellerman wrote:
> Christoph Hellwig  writes:
> > powerpc maintainers,
> 
> There's only one of me.
> 
> > are you going to pick this up for the next -rc1?  I'm waiting for it to
> > hit upstream before resending the coredump series.
> 
> I thought you were going to take it in your series. 

Ok, I'll pick it up.


Re: [PATCH v4 2/7] KVM: arm64: clean up redundant 'kvm_run' parameters

2020-05-05 Thread Marc Zyngier

Hi Tianjia,

On 2020-04-27 05:35, Tianjia Zhang wrote:
In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu'
structure. For historical reasons, many kvm-related function parameters
retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This
patch does a unified cleanup of these remaining redundant parameters.

Signed-off-by: Tianjia Zhang 


On the face of it, this looks OK, but I haven't tried to run the
resulting kernel. I'm not opposed to taking this patch *if* there
is an agreement across architectures to take the series (I value
consistency over the janitorial exercise).

Another thing is that this is going to conflict with the set of
patches that move the KVM/arm code back where it belongs (arch/arm64/kvm),
so I'd probably cherry-pick that one directly.

Thanks,

M.


--
Jazz is not dead. It just smells funny...


Re: [RFC][PATCH 0/2] Add support for using reserved memory for ima buffer pass

2020-05-05 Thread Mark Rutland
Hi Prakhar,

On Mon, May 04, 2020 at 01:38:27PM -0700, Prakhar Srivastava wrote:
> IMA during kexec (kexec file load) verifies the kernel signature and measures
> the signature of the kernel. The signature in the logs can be used to verify
> the authenticity of the kernel. The logs do not get carried over kexec and
> thus remote attestation cannot verify the signature of the running kernel.
> 
> Introduce an ABI to carry forward the ima logs over kexec.
> Memory reserved via device tree reservation can be used to store and read
> via the of_* functions.

This flow needs to work for:

1) Pure DT
2) DT + EFI memory map
3) ACPI + EFI memory map

... and if this is just for transiently passing the log, I don't think
that a reserved memory region is the right thing to use, since they're
supposed to be more permanent.

This sounds analogous to passing the initrd, and should probably use
properties under the chosen node (which can be used for all three boot
flows above).
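
For comparison, a sketch of what a /chosen-based handover looks like on the
consumer side. The property name follows what powerpc already uses for this
purpose, and the fixed 2+2 cell layout is an assumption made for the example:

#include <linux/errno.h>
#include <linux/of.h>

static int ima_get_kexec_buffer_sketch(phys_addr_t *addr, size_t *size)
{
	struct device_node *np;
	const __be32 *prop;
	int len;

	np = of_find_node_by_path("/chosen");
	if (!np)
		return -ENOENT;

	prop = of_get_property(np, "linux,ima-kexec-buffer", &len);
	of_node_put(np);
	if (!prop || len < (int)(4 * sizeof(__be32)))
		return -ENOENT;

	/* assumes #address-cells = #size-cells = 2 */
	*addr = of_read_number(prop, 2);
	*size = of_read_number(prop + 2, 2);
	return 0;
}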

For reference, how big is the IMA log likely to be? Does it need
physically contiguous space?

Thanks,
Mark.

> 
> Reserved memory stores the size(sizeof(size_t)) of the buffer in the starting
> address, followed by the IMA log contents.
> 
> Tested on:
>   arm64 with Uboot
> 
> Prakhar Srivastava (2):
>   Add a layer of abstraction to use the memory reserved by device tree
> for ima buffer pass.
>   Add support for ima buffer pass using reserved memory for arm64 kexec.
> Update the arch sepcific code path in kexec file load to store the
> ima buffer in the reserved memory. The same reserved memory is read
> on kexec or cold boot.
> 
>  arch/arm64/Kconfig |   1 +
>  arch/arm64/include/asm/ima.h   |  22 
>  arch/arm64/include/asm/kexec.h |   5 +
>  arch/arm64/kernel/Makefile |   1 +
>  arch/arm64/kernel/ima_kexec.c  |  64 ++
>  arch/arm64/kernel/machine_kexec_file.c |   1 +
>  arch/powerpc/include/asm/ima.h |   3 +-
>  arch/powerpc/kexec/ima.c   |  14 ++-
>  drivers/of/Kconfig |   6 +
>  drivers/of/Makefile|   1 +
>  drivers/of/of_ima.c| 165 +
>  include/linux/of.h |  34 +
>  security/integrity/ima/ima_kexec.c |  15 ++-
>  13 files changed, 325 insertions(+), 7 deletions(-)
>  create mode 100644 arch/arm64/include/asm/ima.h
>  create mode 100644 arch/arm64/kernel/ima_kexec.c
>  create mode 100644 drivers/of/of_ima.c
> 
> -- 
> 2.25.1
> 


remove set_fs calls from the coredump code v6

2020-05-05 Thread Christoph Hellwig
Hi all,

this series gets rid of playing with the address limit in the exec and
coredump code.  Most of this was fairly trivial, the biggest changes are
those to the spufs coredump code.

Changes since v5:
 - fix uaccess under spinlock in spufs (Jeremy)
 - remove use of access_ok in spufs

Changes since v4:
 - change some goto names as suggested by Linus

Changes since v3:
 - fix x86 compilation with x32 in the new version of the signal code
 - split the exec patches into a new series

Changes since v2:
 - don't cleanup the compat siginfo calling conventions, use the patch
   variant from Eric with slight coding style fixes instead.

Changes since v1:
 - properly spell NUL
 - properly handle the compat siginfo case in ELF coredumps



[PATCH 1/7] powerpc/spufs: fix copy_to_user while atomic

2020-05-05 Thread Christoph Hellwig
From: Jeremy Kerr 

Currently, we may perform a copy_to_user (through
simple_read_from_buffer()) while holding a context's register_lock,
while accessing the context save area.

This change uses a temporary buffer for the context save area data,
which we then pass to simple_read_from_buffer.

Includes changes from Christoph Hellwig .
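
The shape of the fix, as a stand-alone user-space model (names and sizes are
invented; the point is only that the snapshot is taken under the lock and the
potentially-sleeping copy-out happens after it is dropped):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_spinlock_t register_lock;
static unsigned int save_area[4] = { 1, 2, 3, 4 };	/* stands in for the csa */

static void read_save_area(unsigned int *out, size_t n)
{
	unsigned int snap[4];

	pthread_spin_lock(&register_lock);
	memcpy(snap, save_area, sizeof(snap));	/* no copy-out while locked */
	pthread_spin_unlock(&register_lock);

	/* the copy_to_user()/simple_read_from_buffer() equivalent runs
	 * outside the lock, where sleeping or faulting is harmless */
	memcpy(out, snap, n * sizeof(*out));
}

int main(void)
{
	unsigned int buf[4];

	pthread_spin_init(&register_lock, PTHREAD_PROCESS_PRIVATE);
	read_save_area(buf, 4);
	printf("%u %u %u %u\n", buf[0], buf[1], buf[2], buf[3]);
	return 0;
}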

Fixes: bf1ab978be23 ("[POWERPC] coredump: Add SPU elf notes to coredump.")
Signed-off-by: Jeremy Kerr 
Reviewed-by: Arnd Bergmann 
[hch: renamed to function to avoid ___-prefixes]
Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/platforms/cell/spufs/file.c | 113 +++
 1 file changed, 75 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/file.c 
b/arch/powerpc/platforms/cell/spufs/file.c
index c0f950a3f4e1f..f4a4dfb191e7d 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -1978,8 +1978,9 @@ static ssize_t __spufs_mbox_info_read(struct spu_context 
*ctx,
 static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
   size_t len, loff_t *pos)
 {
-   int ret;
struct spu_context *ctx = file->private_data;
+   u32 stat, data;
+   int ret;
 
if (!access_ok(buf, len))
return -EFAULT;
@@ -1988,11 +1989,16 @@ static ssize_t spufs_mbox_info_read(struct file *file, 
char __user *buf,
if (ret)
return ret;
spin_lock(&ctx->csa.register_lock);
-   ret = __spufs_mbox_info_read(ctx, buf, len, pos);
+   stat = ctx->csa.prob.mb_stat_R;
+   data = ctx->csa.prob.pu_mb_R;
spin_unlock(&ctx->csa.register_lock);
spu_release_saved(ctx);
 
-   return ret;
+   /* EOF if there's no entry in the mbox */
+   if (!(stat & 0xff))
+   return 0;
+
+   return simple_read_from_buffer(buf, len, pos, &data, sizeof(data));
 }
 
 static const struct file_operations spufs_mbox_info_fops = {
@@ -2019,6 +2025,7 @@ static ssize_t spufs_ibox_info_read(struct file *file, 
char __user *buf,
   size_t len, loff_t *pos)
 {
struct spu_context *ctx = file->private_data;
+   u32 stat, data;
int ret;
 
if (!access_ok(buf, len))
@@ -2028,11 +2035,16 @@ static ssize_t spufs_ibox_info_read(struct file *file, 
char __user *buf,
if (ret)
return ret;
spin_lock(&ctx->csa.register_lock);
-   ret = __spufs_ibox_info_read(ctx, buf, len, pos);
+   stat = ctx->csa.prob.mb_stat_R;
+   data = ctx->csa.priv2.puint_mb_R;
spin_unlock(&ctx->csa.register_lock);
spu_release_saved(ctx);
 
-   return ret;
+   /* EOF if there's no entry in the ibox */
+   if (!(stat & 0xff))
+   return 0;
+
+   return simple_read_from_buffer(buf, len, pos, &data, sizeof(data));
 }
 
 static const struct file_operations spufs_ibox_info_fops = {
@@ -2041,6 +2053,11 @@ static const struct file_operations spufs_ibox_info_fops 
= {
.llseek  = generic_file_llseek,
 };
 
+static size_t spufs_wbox_info_cnt(struct spu_context *ctx)
+{
+   return (4 - ((ctx->csa.prob.mb_stat_R & 0x00ff00) >> 8)) * sizeof(u32);
+}
+
 static ssize_t __spufs_wbox_info_read(struct spu_context *ctx,
char __user *buf, size_t len, loff_t *pos)
 {
@@ -2049,7 +2066,7 @@ static ssize_t __spufs_wbox_info_read(struct spu_context 
*ctx,
u32 wbox_stat;
 
wbox_stat = ctx->csa.prob.mb_stat_R;
-   cnt = 4 - ((wbox_stat & 0x00ff00) >> 8);
+   cnt = spufs_wbox_info_cnt(ctx);
for (i = 0; i < cnt; i++) {
data[i] = ctx->csa.spu_mailbox_data[i];
}
@@ -2062,7 +2079,8 @@ static ssize_t spufs_wbox_info_read(struct file *file, 
char __user *buf,
   size_t len, loff_t *pos)
 {
struct spu_context *ctx = file->private_data;
-   int ret;
+   u32 data[ARRAY_SIZE(ctx->csa.spu_mailbox_data)];
+   int ret, count;
 
if (!access_ok(buf, len))
return -EFAULT;
@@ -2071,11 +2089,13 @@ static ssize_t spufs_wbox_info_read(struct file *file, 
char __user *buf,
if (ret)
return ret;
spin_lock(&ctx->csa.register_lock);
-   ret = __spufs_wbox_info_read(ctx, buf, len, pos);
+   count = spufs_wbox_info_cnt(ctx);
+   memcpy(&data, &ctx->csa.spu_mailbox_data, sizeof(data));
spin_unlock(&ctx->csa.register_lock);
spu_release_saved(ctx);
 
-   return ret;
+   return simple_read_from_buffer(buf, len, pos, &data,
+   count * sizeof(u32));
 }
 
 static const struct file_operations spufs_wbox_info_fops = {
@@ -2084,27 +2104,33 @@ static const struct file_operations 
spufs_wbox_info_fops = {
.llseek  = generic_file_llseek,
 };
 
-static ssize_t __spufs_dma_info_read(struct spu_context *ctx,
-   char __user *buf, size_t len, loff_t *pos)
+static void spufs_get_dma_info(struct spu_context *ctx,

Re: [PATCH v2 17/20] mm: free_area_init: allow defining max_zone_pfn in descending order

2020-05-05 Thread Guenter Roeck
On 5/4/20 8:39 AM, Mike Rapoport wrote:
> On Sun, May 03, 2020 at 11:43:00AM -0700, Guenter Roeck wrote:
>> On Sun, May 03, 2020 at 10:41:38AM -0700, Guenter Roeck wrote:
>>> Hi,
>>>
>>> On Wed, Apr 29, 2020 at 03:11:23PM +0300, Mike Rapoport wrote:
 From: Mike Rapoport 

 Some architectures (e.g. ARC) have the ZONE_HIGHMEM zone below the
 ZONE_NORMAL. Allowing free_area_init() parse max_zone_pfn array even it is
 sorted in descending order allows using free_area_init() on such
 architectures.

 Add top -> down traversal of max_zone_pfn array in free_area_init() and use
 the latter in ARC node/zone initialization.

 Signed-off-by: Mike Rapoport 
>>>
>>> This patch causes my microblazeel qemu boot test in linux-next to fail.
>>> Reverting it fixes the problem.
>>>
>> The same problem is seen with s390 emulations.
> 
> Yeah, this patch breaks some others as well :(
> 
> My assumption that max_zone_pfn defines architectural limit for maximal
> PFN that can belong to a zone was over-optimistic. Several arches
> actually do that, but others do
> 
>   max_zone_pfn[ZONE_DMA] = MAX_DMA_PFN;
>   max_zone_pfn[ZONE_NORMAL] = max_pfn;
> 
> where MAX_DMA_PFN is build-time constrain and max_pfn is run time limit
> for the current system.
> 
> So, when max_pfn is lower than MAX_DMA_PFN, the free_init_area() will
> consider max_zone_pfn as descending and will wrongly calculate zone
> extents.
> 
> That said, instead of trying to create a generic way to special case
> ARC, I suggest to simply use the below patch instead.
> 

As a reminder, I reported the problem against s390 and microblazeel
(interestingly enough, microblaze (big endian) works), not against arc.

Guenter

> diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
> index 41eb9be1653c..386959bac3d2 100644
> --- a/arch/arc/mm/init.c
> +++ b/arch/arc/mm/init.c
> @@ -77,6 +77,11 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 
> size)
>   base, TO_MB(size), !in_use ? "Not used":"");
>  }
>  
> +bool arch_has_descending_max_zone_pfns(void)
> +{
> + return true;
> +}
> +
>  /*
>   * First memory setup routine called from setup_arch()
>   * 1. setup swapper's mm @init_mm
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index b990e9734474..114f0e027144 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -7307,6 +7307,15 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
>   }
>  }
>  
> +/*
> + * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
> + * such cases we allow max_zone_pfn sorted in the descending order
> + */
> +bool __weak arch_has_descending_max_zone_pfns(void)
> +{
> + return false;
> +}
> +
>  /**
>   * free_area_init - Initialise all pg_data_t and zone data
>   * @max_zone_pfn: an array of max PFNs for each zone
> @@ -7324,7 +7333,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
>  {
>   unsigned long start_pfn, end_pfn;
>   int i, nid, zone;
> - bool descending = false;
> + bool descending;
>  
>   /* Record where the zone boundaries are */
>   memset(arch_zone_lowest_possible_pfn, 0,
> @@ -7333,14 +7342,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
>   sizeof(arch_zone_highest_possible_pfn));
>  
>   start_pfn = find_min_pfn_with_active_regions();
> -
> - /*
> -  * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below
> -  * ZONE_NORMAL. For such cases we allow max_zone_pfn sorted in the
> -  * descending order
> -  */
> - if (MAX_NR_ZONES > 1 && max_zone_pfn[0] > max_zone_pfn[1])
> - descending = true;
> + descending = arch_has_descending_max_zone_pfns();
>  
>   for (i = 0; i < MAX_NR_ZONES; i++) {
>   if (descending)
> 
>> Guenter
>>
>>> qemu command line:
>>>
>>> qemu-system-microblazeel -M petalogix-ml605 -m 256 \
>>> -kernel arch/microblaze/boot/linux.bin -no-reboot \
>>> -initrd rootfs.cpio \
>>> -append 'panic=-1 slub_debug=FZPUA rdinit=/sbin/init 
>>> console=ttyS0,115200' \
>>> -monitor none -serial stdio -nographic
>>>
>>> initrd:
>>> 
>>> https://github.com/groeck/linux-build-test/blob/master/rootfs/microblazeel/rootfs.cpio.gz
>>> configuration:
>>> 
>>> https://github.com/groeck/linux-build-test/blob/master/rootfs/microblazeel/qemu_microblazeel_ml605_defconfig
>>>
>>> Bisect log is below.
>>>
>>> Guenter
>>>
>>> ---
>>> # bad: [fb9d670f57e3f6478602328bbbf71138be06ca4f] Add linux-next specific 
>>> files for 20200501
>>> # good: [6a8b55ed4056ea5559ebe4f6a4b247f627870d4c] Linux 5.7-rc3
>>> git bisect start 'HEAD' 'v5.7-rc3'
>>> # good: [068b80b68a670f0b17288c8a3d1ee751f35162ab] Merge remote-tracking 
>>> branch 'drm/drm-next'
>>> git bisect good 068b80b68a670f0b17288c8a3d1ee751f35162ab
>>> # good: [46c70fc6a3ac35cd72ddad248dcbe4eee716d2a5] Merge remote-tracking 
>>> branch 'drivers-x86/for-next'
>>> git bisect good 

Re: [PATCH v2 0/5] Statsfs: a new ram-based file sytem for Linux kernel statistics

2020-05-05 Thread Paolo Bonzini
On 05/05/20 18:53, Jim Mattson wrote:
>>> Since this is becoming a generic API (good!!), maybe we can discuss
>>> possible ways to optimize gathering of stats in mass?
>> Sure, the idea of a binary format was considered from the beginning in
>> [1], and it can be done either together with the current filesystem, or
>> as a replacement via different mount options.
> 
> ASCII stats are not scalable. A binary format is definitely the way to go.

I am totally in favor of having a binary format, but it should be
introduced as a separate series on top of this one---and preferably by
someone who has already put some thought into the problem (which
Emanuele and I have not, beyond ensuring that the statsfs concept and
API is flexible enough).

ASCII stats are necessary for quick userspace consumption and for
backwards compatibility with KVM debugfs (which is not an ABI, but it's
damn useful and should not be dropped without providing something as
handy), so this is what this series starts from.

Paolo



Re: [PATCH v2 17/20] mm: free_area_init: allow defining max_zone_pfn in descending order

2020-05-05 Thread Vineet Gupta
On 5/5/20 6:18 AM, Guenter Roeck wrote:
> On 5/4/20 8:39 AM, Mike Rapoport wrote:
>> On Sun, May 03, 2020 at 11:43:00AM -0700, Guenter Roeck wrote:
>>> On Sun, May 03, 2020 at 10:41:38AM -0700, Guenter Roeck wrote:
 Hi,

 On Wed, Apr 29, 2020 at 03:11:23PM +0300, Mike Rapoport wrote:
> From: Mike Rapoport 
>
> Some architectures (e.g. ARC) have the ZONE_HIGHMEM zone below the
> ZONE_NORMAL. Allowing free_area_init() parse max_zone_pfn array even it is
> sorted in descending order allows using free_area_init() on such
> architectures.
>
> Add top -> down traversal of max_zone_pfn array in free_area_init() and 
> use
> the latter in ARC node/zone initialization.
>
> Signed-off-by: Mike Rapoport 
 This patch causes my microblazeel qemu boot test in linux-next to fail.
 Reverting it fixes the problem.

>>> The same problem is seen with s390 emulations.
>> Yeah, this patch breaks some others as well :(
>>
>> My assumption that max_zone_pfn defines architectural limit for maximal
>> PFN that can belong to a zone was over-optimistic. Several arches
>> actually do that, but others do
>>
>>  max_zone_pfn[ZONE_DMA] = MAX_DMA_PFN;
>>  max_zone_pfn[ZONE_NORMAL] = max_pfn;
>>
>> where MAX_DMA_PFN is build-time constrain and max_pfn is run time limit
>> for the current system.
>>
>> So, when max_pfn is lower than MAX_DMA_PFN, the free_init_area() will
>> consider max_zone_pfn as descending and will wrongly calculate zone
>> extents.
>>
>> That said, instead of trying to create a generic way to special case
>> ARC, I suggest to simply use the below patch instead.
>>
> As a reminder, I reported the problem against s390 and microblazeel
> (interestingly enough, microblaze (big endian) works), not against arc.

Understood, and my comment was to point to any other problems in the future.

Thx,
-Vineet


Re: [PATCH v2 0/5] Statsfs: a new ram-based file sytem for Linux kernel statistics

2020-05-05 Thread Paolo Bonzini
On 05/05/20 19:07, David Rientjes wrote:
>> I am totally in favor of having a binary format, but it should be
>> introduced as a separate series on top of this one---and preferably by
>> someone who has already put some thought into the problem (which
>> Emanuele and I have not, beyond ensuring that the statsfs concept and
>> API is flexible enough).
>>
> The concern is that once this series is merged then /sys/kernel/stats 
> could be considered an ABI and there would be a reasonable expectation 
> that it will remain stable, in so far as the stats that userspace is 
> interested in are stable and not obsoleted.
> 
> So is this a suggestion that the binary format becomes complementary to 
> statsfs and provide a means for getting all stats from a single subsystem, 
> or that this series gets converted to such a format before it is merged?

The binary format should be complementary.  The ASCII format should
indeed be considered stable even though individual statistics would come
and go.  It may make sense to allow disabling ASCII files via mount
and/or Kconfig options; but either way, the binary format can and should
be added on top.

I have not put any thought into what the binary format would look like
and what its features would be.  For example these are but the first
questions that come to mind:

* would it be possible to read/clear an arbitrary statistic with
pread/pwrite, or do you have to read all of them?

* if userspace wants to read the schema just once and then read the
statistics many times, how is it informed of schema changes?

* and of course the details of how the schema (names of stat and
subsources) is encoded and what details it should include about the
values (e.g. type or just signedness).

Another possibility is to query stats via BPF.  This could be a third
way to access the stats, or it could be alternative to a binary format.

Paolo



Re: [PATCH v2 0/5] Statsfs: a new ram-based file sytem for Linux kernel statistics

2020-05-05 Thread Christian Borntraeger
Adding Stefan Raspl, who has done a lot of kvm_stat work in the past.

On 05.05.20 19:21, Paolo Bonzini wrote:
> On 05/05/20 19:07, David Rientjes wrote:
>>> I am totally in favor of having a binary format, but it should be
>>> introduced as a separate series on top of this one---and preferably by
>>> someone who has already put some thought into the problem (which
>>> Emanuele and I have not, beyond ensuring that the statsfs concept and
>>> API is flexible enough).
>>>
>> The concern is that once this series is merged then /sys/kernel/stats 
>> could be considered an ABI and there would be a reasonable expectation 
>> that it will remain stable, in so far as the stats that userspace is 
>> interested in are stable and not obsoleted.
>>
>> So is this a suggestion that the binary format becomes complementary to 
>> statsfs and provide a means for getting all stats from a single subsystem, 
>> or that this series gets converted to such a format before it is merged?
> 
> The binary format should be complementary.  The ASCII format should
> indeed be considered stable even though individual statistics would come
> and go.  It may make sense to allow disabling ASCII files via mount
> and/or Kconfig options; but either way, the binary format can and should
> be added on top.
> 
> I have not put any thought into what the binary format would look like
> and what its features would be.  For example these are but the first
> questions that come to mind:
> 
> * would it be possible to read/clear an arbitrary statistic with
> pread/pwrite, or do you have to read all of them?
> 
> * if userspace wants to read the schema just once and then read the
> statistics many times, how is it informed of schema changes?
> 
> * and of course the details of how the schema (names of stat and
> subsources) is encoded and what details it should include about the
> values (e.g. type or just signedness).
> 
> Another possibility is to query stats via BPF.  This could be a third
> way to access the stats, or it could be alternative to a binary format.
> 
> Paolo
> 


Re: remove set_fs calls from the coredump code v6

2020-05-05 Thread Linus Torvalds
On Tue, May 5, 2020 at 3:13 AM Christoph Hellwig  wrote:
>
> this series gets rid of playing with the address limit in the exec and
> coredump code.  Most of this was fairly trivial, the biggest changes are
> those to the spufs coredump code.

Ack, nice, and looks good.

The only part I dislike is how we have that 'struct compat_siginfo' on
the stack, which is a huge waste (most of it is the nasty padding to
128 bytes).

But that's not new, I only reacted to it because the code moved a bit.
We cleaned up the regular siginfo to not have the padding in the
kernel (and by "we" I mean "Eric Biederman did it after some prodding
as part of his siginfo cleanups" - see commit 4ce5f9c9e754 "signal:
Use a smaller struct siginfo in the kernel"),  and I wonder if we
could do something similar with that compat thing.

128 bytes of wasted kernel stack isn't the end of the world, but it's
sad when the *actual* data is only 32 bytes or so.

Linus


Re: [PATCH v3 00/29] Convert files to ReST - part 2

2020-05-05 Thread Jonathan Corbet
On Tue, 28 Apr 2020 13:01:28 -0600
Jonathan Corbet  wrote:

> So I'm happy to merge this set, but there is one thing that worries me a
> bit... 
> 
> >  fs/cachefiles/Kconfig |4 +-
> >  fs/coda/Kconfig   |2 +-
> >  fs/configfs/inode.c   |2 +-
> >  fs/configfs/item.c|2 +-
> >  fs/fscache/Kconfig|8 +-
> >  fs/fscache/cache.c|8 +-
> >  fs/fscache/cookie.c   |2 +-
> >  fs/fscache/object.c   |4 +-
> >  fs/fscache/operation.c|2 +-
> >  fs/locks.c|2 +-
> >  include/linux/configfs.h  |2 +-
> >  include/linux/fs_context.h|2 +-
> >  include/linux/fscache-cache.h |4 +-
> >  include/linux/fscache.h   |   42 +-
> >  include/linux/lsm_hooks.h |2 +-  
> 
> I'd feel a bit better if I could get an ack or two from filesystem folks
> before I venture that far out of my own yard...what say you all?

It's been another week and nobody has complained, so I'm taking that as
assent; the series has been applied.

Thanks,

jon


Re: [PATCH] powerpc/5200: update contact email

2020-05-05 Thread Wolfram Sang

> > My 'pengutronix' address is defunct for years. Merge the entries and use
> > the proper contact address.
> 
> Is there any point adding the new address? It's just likely to bit-rot
> one day too.

At least, this one is a group address, not an individual one, so less
likely.

> I figure the git history is a better source for more up-to-date emails.

But yes, can still be argued. I won't persist if you don't like it.

Thanks!





Re: [PATCH v4 1/2] powerpc/uaccess: Implement unsafe_put_user() using 'asm goto'

2020-05-05 Thread Michael Ellerman
Christophe Leroy  writes:
> unsafe_put_user() is designed to take benefit of 'asm goto'.
>
> Instead of using the standard __put_user() approach and branch
> based on the returned error, use 'asm goto' and make the
> exception code branch directly to the error label. There is
> no code anymore in the fixup section.
>
> This change significantly simplifies functions using
> unsafe_put_user()
>
...
>
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/include/asm/uaccess.h | 61 +-
>  1 file changed, 52 insertions(+), 9 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/uaccess.h 
> b/arch/powerpc/include/asm/uaccess.h
> index 9cc9c106ae2a..9365b59495a2 100644
> --- a/arch/powerpc/include/asm/uaccess.h
> +++ b/arch/powerpc/include/asm/uaccess.h
> @@ -196,6 +193,52 @@ do { 
> \
>  })
>  
>  
> +#define __put_user_asm_goto(x, addr, label, op)  \
> + asm volatile goto(  \
> + "1: " op "%U1%X1 %0,%1  # put_user\n"   \
> + EX_TABLE(1b, %l2)   \
> + :   \
> + : "r" (x), "m<>" (*addr)\

The "m<>" here is breaking GCC 4.6.3, which we allegedly still support.

Plain "m" works, how much does the "<>" affect code gen in practice?

A quick diff here shows no difference from removing "<>".

cheers


Re: [PATCH v4 1/2] powerpc/uaccess: Implement unsafe_put_user() using 'asm goto'

2020-05-05 Thread Segher Boessenkool
On Tue, May 05, 2020 at 05:40:21PM +0200, Christophe Leroy wrote:
> >>+#define __put_user_asm_goto(x, addr, label, op)\
> >>+   asm volatile goto(  \
> >>+   "1: " op "%U1%X1 %0,%1  # put_user\n"   \
> >>+   EX_TABLE(1b, %l2)   \
> >>+   :   \
> >>+   : "r" (x), "m<>" (*addr)\
> >
> >The "m<>" here is breaking GCC 4.6.3, which we allegedly still support.
> >
> >Plain "m" works, how much does the "<>" affect code gen in practice?
> >
> >A quick diff here shows no difference from removing "<>".
> 
> It was recommended by Segher, there has been some discussion about it on 
> v1 of this patch, see 
> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/4fdc2aba6f5e51887d1cd0fee94be0989eada2cd.1586942312.git.christophe.le...@c-s.fr/
> 
> As far as I understood that's mandatory on recent gcc to get the 
> pre-update form of the instruction. With older versions "m" was doing 
> the same, but not anymore.

Yes.  How much that matters depends on the asm.  On older CPUs (6xx/7xx,
say) the update form was just as fast as the non-update form.  On newer
or bigger CPUs it is usually executed just the same as an add followed
by the memory access, so it just saves a bit of code size.
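
For illustration, a bare-bones user-space sketch of what the constraint
buys (the helper name is made up; this is not the kernel macro): with
"m<>" plus the %U modifier GCC may fold a pointer update into the access
and emit an update-form store such as stdu, while plain "m" forces a
separate add plus a normal store.

static inline void store64(unsigned long *p, unsigned long v)
{
	/* "=m<>" + %U0: update-form addressing allowed; "=m": plain form only */
	asm volatile("std%U0%X0 %1,%0" : "=m<>" (*p) : "r" (v));
}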

> Should we ifdef the "m<>" or "m" based on GCC 
> version ?

That will be a lot of churn.  Just make 4.8 minimum?


Segher


Re: [PATCH v2 17/20] mm: free_area_init: allow defining max_zone_pfn in descending order

2020-05-05 Thread Mike Rapoport
On Tue, May 05, 2020 at 06:18:11AM -0700, Guenter Roeck wrote:
> On 5/4/20 8:39 AM, Mike Rapoport wrote:
> > On Sun, May 03, 2020 at 11:43:00AM -0700, Guenter Roeck wrote:
> >> On Sun, May 03, 2020 at 10:41:38AM -0700, Guenter Roeck wrote:
> >>> Hi,
> >>>
> >>> On Wed, Apr 29, 2020 at 03:11:23PM +0300, Mike Rapoport wrote:
>  From: Mike Rapoport 
> 
>  Some architectures (e.g. ARC) have the ZONE_HIGHMEM zone below the
>  ZONE_NORMAL. Allowing free_area_init() parse max_zone_pfn array even it 
>  is
>  sorted in descending order allows using free_area_init() on such
>  architectures.
> 
>  Add top -> down traversal of max_zone_pfn array in free_area_init() and 
>  use
>  the latter in ARC node/zone initialization.
> 
>  Signed-off-by: Mike Rapoport 
> >>>
> >>> This patch causes my microblazeel qemu boot test in linux-next to fail.
> >>> Reverting it fixes the problem.
> >>>
> >> The same problem is seen with s390 emulations.
> > 
> > Yeah, this patch breaks some others as well :(
> > 
> > My assumption that max_zone_pfn defines architectural limit for maximal
> > PFN that can belong to a zone was over-optimistic. Several arches
> > actually do that, but others do
> > 
> > max_zone_pfn[ZONE_DMA] = MAX_DMA_PFN;
> > max_zone_pfn[ZONE_NORMAL] = max_pfn;
> > 
> > where MAX_DMA_PFN is build-time constrain and max_pfn is run time limit
> > for the current system.
> > 
> > So, when max_pfn is lower than MAX_DMA_PFN, the free_init_area() will
> > consider max_zone_pfn as descending and will wrongly calculate zone
> > extents.
> > 
> > That said, instead of trying to create a generic way to special case
> > ARC, I suggest to simply use the below patch instead.
> > 
> 
> As a reminder, I reported the problem against s390 and microblazeel
> (interestingly enough, microblaze (big endian) works), not against arc.

With this fix microblazeel and s390 worked for me, and Christian also
reported that s390 is fixed.

microblaze (big endian) works because its defconfig does not enable HIGHMEM
while little endian does.

ARC is mentioned because it is the only arch that may have ZONE_HIGHMEM
and ZONE_NORMAL and this patch was required to consolidate
free_area_init* variants.

> Guenter
> 
> > diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
> > index 41eb9be1653c..386959bac3d2 100644
> > --- a/arch/arc/mm/init.c
> > +++ b/arch/arc/mm/init.c
> > @@ -77,6 +77,11 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 
> > size)
> > base, TO_MB(size), !in_use ? "Not used":"");
> >  }
> >  
> > +bool arch_has_descending_max_zone_pfns(void)
> > +{
> > +   return true;
> > +}
> > +
> >  /*
> >   * First memory setup routine called from setup_arch()
> >   * 1. setup swapper's mm @init_mm
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index b990e9734474..114f0e027144 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -7307,6 +7307,15 @@ static void check_for_memory(pg_data_t *pgdat, int 
> > nid)
> > }
> >  }
> >  
> > +/*
> > + * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
> > + * such cases we allow max_zone_pfn sorted in the descending order
> > + */
> > +bool __weak arch_has_descending_max_zone_pfns(void)
> > +{
> > +   return false;
> > +}
> > +
> >  /**
> >   * free_area_init - Initialise all pg_data_t and zone data
> >   * @max_zone_pfn: an array of max PFNs for each zone
> > @@ -7324,7 +7333,7 @@ void __init free_area_init(unsigned long 
> > *max_zone_pfn)
> >  {
> > unsigned long start_pfn, end_pfn;
> > int i, nid, zone;
> > -   bool descending = false;
> > +   bool descending;
> >  
> > /* Record where the zone boundaries are */
> > memset(arch_zone_lowest_possible_pfn, 0,
> > @@ -7333,14 +7342,7 @@ void __init free_area_init(unsigned long 
> > *max_zone_pfn)
> > sizeof(arch_zone_highest_possible_pfn));
> >  
> > start_pfn = find_min_pfn_with_active_regions();
> > -
> > -   /*
> > -* Some architecturs, e.g. ARC may have ZONE_HIGHMEM below
> > -* ZONE_NORMAL. For such cases we allow max_zone_pfn sorted in the
> > -* descending order
> > -*/
> > -   if (MAX_NR_ZONES > 1 && max_zone_pfn[0] > max_zone_pfn[1])
> > -   descending = true;
> > +   descending = arch_has_descending_max_zone_pfns();
> >  
> > for (i = 0; i < MAX_NR_ZONES; i++) {
> > if (descending)
> > 
> >> Guenter
> >>
> >>> qemu command line:
> >>>
> >>> qemu-system-microblazeel -M petalogix-ml605 -m 256 \
> >>>   -kernel arch/microblaze/boot/linux.bin -no-reboot \
> >>>   -initrd rootfs.cpio \
> >>>   -append 'panic=-1 slub_debug=FZPUA rdinit=/sbin/init 
> >>> console=ttyS0,115200' \
> >>>   -monitor none -serial stdio -nographic
> >>>
> >>> initrd:
> >>>   
> >>> https://github.com/groeck/linux-build-test/blob/master/rootfs/microblazeel/rootfs.cpio.gz
> >>> configuration:
> >>>   
> >>> 

Re: [PATCH v4 1/2] powerpc/uaccess: Implement unsafe_put_user() using 'asm goto'

2020-05-05 Thread Segher Boessenkool
Hi!

On Wed, May 06, 2020 at 12:27:58AM +1000, Michael Ellerman wrote:
> Christophe Leroy  writes:
> > unsafe_put_user() is designed to take benefit of 'asm goto'.
> >
> > Instead of using the standard __put_user() approach and branch
> > based on the returned error, use 'asm goto' and make the
> > exception code branch directly to the error label. There is
> > no code anymore in the fixup section.
> >
> > This change significantly simplifies functions using
> > unsafe_put_user()
> >
> ...
> >
> > Signed-off-by: Christophe Leroy 
> > ---
> >  arch/powerpc/include/asm/uaccess.h | 61 +-
> >  1 file changed, 52 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/uaccess.h 
> > b/arch/powerpc/include/asm/uaccess.h
> > index 9cc9c106ae2a..9365b59495a2 100644
> > --- a/arch/powerpc/include/asm/uaccess.h
> > +++ b/arch/powerpc/include/asm/uaccess.h
> > @@ -196,6 +193,52 @@ do {   
> > \
> >  })
> >  
> >  
> > +#define __put_user_asm_goto(x, addr, label, op)\
> > +   asm volatile goto(  \
> > +   "1: " op "%U1%X1 %0,%1  # put_user\n"   \
> > +   EX_TABLE(1b, %l2)   \
> > +   :   \
> > +   : "r" (x), "m<>" (*addr)\
> 
> The "m<>" here is breaking GCC 4.6.3, which we allegedly still support.

[ You shouldn't use 4.6.3, 4.6.4 has been out for a while.  And 4.6
  is nine years old now.  Most projects do not support < 4.8 anymore, on
  any architecture.  ]

> Plain "m" works, how much does the "<>" affect code gen in practice?
> 
> A quick diff here shows no difference from removing "<>".

It will make it impossible to use update-form instructions here.  That
probably does not matter much at all, in this case.

If you remove the "<>" constraints, also remove the "%Un" output modifier?


Segher


Re: remove set_fs calls from the coredump code v6

2020-05-05 Thread Eric W. Biederman
Linus Torvalds  writes:

> On Tue, May 5, 2020 at 3:13 AM Christoph Hellwig  wrote:
>>
>> this series gets rid of playing with the address limit in the exec and
>> coredump code.  Most of this was fairly trivial, the biggest changes are
>> those to the spufs coredump code.
>
> Ack, nice, and looks good.
>
> The only part I dislike is how we have that 'struct compat_siginfo' on
> the stack, which is a huge waste (most of it is the nasty padding to
> 128 bytes).
>
> But that's not new, I only reacted to it because the code moved a bit.
> We cleaned up the regular siginfo to not have the padding in the
> kernel (and by "we" I mean "Eric Biederman did it after some prodding
> as part of his siginfo cleanups" - see commit 4ce5f9c9e754 "signal:
> Use a smaller struct siginfo in the kernel"),  and I wonder if we
> could do something similar with that compat thing.
>
> 128 bytes of wasted kernel stack isn't the end of the world, but it's
> sad when the *actual* data is only 32 bytes or so.

We probably can, after introducing a kernel_compat_siginfo that is
the size that userspace actually needs.

It isn't something I want to mess with until this code gets merged, as I
think the set_fs cleanups are more important.


Christoph made some good points about how ugly the #ifdefs are in
the generic copy_siginfo_to_user32 implementation.

I am thinking the right fix is to introduce:
- TS_X32 as a companion to TS_COMPAT in the x86_64.
- Modify in_x32_syscall() to test TS_X32
- Implement x32_copy_siginfo_to_user32 that forces TS_X32 to be
  set. AKA:

x32_copy_siginfo_to_user32()
{
unsigned long state = current_thread_info()->state;
current_thread_info()->state |= TS_X32;
copy_siginfo_to_user32();
current_thread_info()->state = state;
}

That would make the #ifdefs go away, but I don't yet know what the x86
maintainers would say about that scheme.  I think it is a good path as
it would isolate the runtime cost of that weird SIGCHLD siginfo format
to just x32.  Then ia32 in compat mode would not need to pay.

Once I get that it will be easier to introduce yet another helper
of copy_siginfo_to_user32 that generates just the kernel_compat_siginfo
part, and the two visible derivatives can call memset and clear_user
to clear the unset parts.
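
Very roughly, and with purely illustrative names and layout (none of this
is meant to be the final code; it just mirrors what 4ce5f9c9e754 did for
the native siginfo), that could look something like:

/* kernel-internal compat siginfo: stops after the real fields, so most
 * of the 128 bytes of struct compat_siginfo never hit the kernel stack */
struct kernel_compat_siginfo {
	compat_int_t si_signo;
	compat_int_t si_errno;
	compat_int_t si_code;
	union compat_sifields _sifields;	/* hypothetical name, mirroring union __sifields */
};

/* callers convert into the small struct, then do roughly: */
static int push_compat_siginfo(struct compat_siginfo __user *to,
			       const struct kernel_compat_siginfo *from)
{
	if (clear_user(to, sizeof(*to)))	/* zero the padded tail */
		return -EFAULT;
	if (copy_to_user(to, from, sizeof(*from)))
		return -EFAULT;
	return 0;
}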

I am assuming you don't mind having a full siginfo in
elf_note_info that ultimately gets copied into the core dump?

Eric


Re: remove set_fs calls from the coredump code v6

2020-05-05 Thread Al Viro
On Tue, May 05, 2020 at 10:42:58PM +0200, Christoph Hellwig wrote:
> On Tue, May 05, 2020 at 09:34:46PM +0100, Al Viro wrote:
> > Looks good.  Want me to put it into vfs.git?  #work.set_fs-exec, perhaps?
> 
> Sounds good.

Applied, pushed and added into #for-next


Re: [PATCH v4 7/8] powerpc/vdso32: implement clock_getres entirely

2020-05-05 Thread Aurelien Jarno
Hi,

On 2019-12-02 07:57, Christophe Leroy wrote:
> clock_getres returns hrtimer_res for all clocks but coarse ones
> for which it returns KTIME_LOW_RES.
> 
> return EINVAL for unknown clocks.
> 
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/kernel/asm-offsets.c |  3 +++
>  arch/powerpc/kernel/vdso32/gettimeofday.S | 19 +++
>  2 files changed, 14 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/asm-offsets.c 
> b/arch/powerpc/kernel/asm-offsets.c
> index 0013197d89a6..90e53d432f2e 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -413,7 +413,10 @@ int main(void)
>   DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC);
>   DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
>   DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
> + DEFINE(CLOCK_MAX, CLOCK_TAI);
>   DEFINE(NSEC_PER_SEC, NSEC_PER_SEC);
> + DEFINE(EINVAL, EINVAL);
> + DEFINE(KTIME_LOW_RES, KTIME_LOW_RES);
>  
>  #ifdef CONFIG_BUG
>   DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
> diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S 
> b/arch/powerpc/kernel/vdso32/gettimeofday.S
> index 9aafacea9c4a..20ae38f3a5a3 100644
> --- a/arch/powerpc/kernel/vdso32/gettimeofday.S
> +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
> @@ -196,17 +196,20 @@ V_FUNCTION_END(__kernel_clock_gettime)
>  V_FUNCTION_BEGIN(__kernel_clock_getres)
>.cfi_startproc
>   /* Check for supported clock IDs */
> - cmpwi   cr0,r3,CLOCK_REALTIME
> - cmpwi   cr1,r3,CLOCK_MONOTONIC
- cror    cr0*4+eq,cr0*4+eq,cr1*4+eq
> - bne cr0,99f
> + cmplwi  cr0, r3, CLOCK_MAX
> + cmpwi   cr1, r3, CLOCK_REALTIME_COARSE
> + cmpwi   cr7, r3, CLOCK_MONOTONIC_COARSE
> + bgt cr0, 99f
> + LOAD_REG_IMMEDIATE(r5, KTIME_LOW_RES)
> + beq cr1, 1f
> + beq cr7, 1f
>  
  mflr    r12
>.cfi_register lr,r12
  get_datapage    r3, r0
>   lwz r5, CLOCK_HRTIMER_RES(r3)
  mtlr    r12
> - li  r3,0
> +1:   li  r3,0
>   cmpli   cr0,r4,0
>   crclr   cr0*4+so
>   beqlr
> @@ -215,11 +218,11 @@ V_FUNCTION_BEGIN(__kernel_clock_getres)
>   blr
>  
>   /*
> -  * syscall fallback
> +  * invalid clock
>*/
>  99:
> - li  r0,__NR_clock_getres
> - sc
> + li  r3, EINVAL
> + crset   so
>   blr
>.cfi_endproc
>  V_FUNCTION_END(__kernel_clock_getres)

Removing the syscall fallback looks wrong, and broke access to
per-process clocks. With this change a few glibc tests now fail.

This can be reproduced by the simple code below:

| #include <stdio.h>
| #include <string.h>
| #include <errno.h>
| #include <time.h>
| #include <unistd.h>
| #include <sys/types.h>
| 
| int main()
| {
| struct timespec res;
| clockid_t ci;
| int e;
|
| e = clock_getcpuclockid(getpid(), &ci);
| if (e) {
| printf("clock_getcpuclockid returned %d\n", e);
| return e;
| }
| e = clock_getres (ci, &res);
| printf("clock_getres returned %d\n", e);
| if (e) {
| printf("  errno: %d, %s\n", errno, strerror(errno));
| }
|
| return e;
| }

Without this patch or with -m64, it returns:

| clock_getres returned 0

With this patch with -m32 it returns:

| clock_getres returned -1
|   errno: 22, Invalid argument

Regards,
Aurelien

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net


Re: [PATCH v4 1/2] powerpc/uaccess: Implement unsafe_put_user() using 'asm goto'

2020-05-05 Thread Michael Ellerman
Segher Boessenkool  writes:
> On Tue, May 05, 2020 at 05:40:21PM +0200, Christophe Leroy wrote:
>> >>+#define __put_user_asm_goto(x, addr, label, op)   \
>> >>+  asm volatile goto(  \
>> >>+  "1: " op "%U1%X1 %0,%1  # put_user\n"   \
>> >>+  EX_TABLE(1b, %l2)   \
>> >>+  :   \
>> >>+  : "r" (x), "m<>" (*addr)\
>> >
>> >The "m<>" here is breaking GCC 4.6.3, which we allegedly still support.
>> >
>> >Plain "m" works, how much does the "<>" affect code gen in practice?
>> >
>> >A quick diff here shows no difference from removing "<>".
>> 
>> It was recommended by Segher, there has been some discussion about it on 
>> v1 of this patch, see 
>> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/4fdc2aba6f5e51887d1cd0fee94be0989eada2cd.1586942312.git.christophe.le...@c-s.fr/
>> 
>> As far as I understood that's mandatory on recent gcc to get the 
>> pre-update form of the instruction. With older versions "m" was doing 
>> the same, but not anymore.
>
> Yes.  How much that matters depends on the asm.  On older CPUs (6xx/7xx,
> say) the update form was just as fast as the non-update form.  On newer
> or bigger CPUs it is usually executed just the same as an add followed
> by the memory access, so it just saves a bit of code size.

The update forms are stdux, sthux etc., right?

I don't see any change in the number of those with or without the
constraint. That's using GCC 9.3.0.

>> Should we ifdef the "m<>" or "m" based on GCC 
>> version ?
>
> That will be a lot of churn.  Just make 4.8 minimum?

As I said in my other mail, that's not really up to us. We could mandate
a higher minimum for powerpc, but I'd rather not.

I think for now I'm inclined to just drop the "<>", and we can revisit
in a release or two when hopefully GCC 4.8 has become the minimum.

cheers


Re: [PATCH v4 1/2] powerpc/uaccess: Implement unsafe_put_user() using 'asm goto'

2020-05-05 Thread Christophe Leroy

Hi,

Le 05/05/2020 à 16:27, Michael Ellerman a écrit :

> Christophe Leroy  writes:
>> unsafe_put_user() is designed to take benefit of 'asm goto'.
>>
>> Instead of using the standard __put_user() approach and branch
>> based on the returned error, use 'asm goto' and make the
>> exception code branch directly to the error label. There is
>> no code anymore in the fixup section.
>>
>> This change significantly simplifies functions using
>> unsafe_put_user()
>>
> ...
>>
>> Signed-off-by: Christophe Leroy 
>> ---
>>   arch/powerpc/include/asm/uaccess.h | 61 +-
>>   1 file changed, 52 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/uaccess.h 
>> b/arch/powerpc/include/asm/uaccess.h
>> index 9cc9c106ae2a..9365b59495a2 100644
>> --- a/arch/powerpc/include/asm/uaccess.h
>> +++ b/arch/powerpc/include/asm/uaccess.h
>> @@ -196,6 +193,52 @@ do {   
>> \
>>   })
>>
>>
>> +#define __put_user_asm_goto(x, addr, label, op)			\
>> +   asm volatile goto(  \
>> +   "1:" op "%U1%X1 %0,%1# put_user\n"  \
>> +   EX_TABLE(1b, %l2)   \
>> +   :   \
>> +   : "r" (x), "m<>" (*addr)  \
>
> The "m<>" here is breaking GCC 4.6.3, which we allegedly still support.
>
> Plain "m" works, how much does the "<>" affect code gen in practice?
>
> A quick diff here shows no difference from removing "<>".


It was recommended by Segher, there has been some discussion about it on 
v1 of this patch, see 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/4fdc2aba6f5e51887d1cd0fee94be0989eada2cd.1586942312.git.christophe.le...@c-s.fr/


As far as I understood that's mandatory on recent gcc to get the 
pre-update form of the instruction. With older versions "m" was doing 
the same, but not anymore. Should we ifdef the "m<>" or "m" based on GCC 
version ?


Christophe


Re: [PATCH v4 1/2] powerpc/uaccess: Implement unsafe_put_user() using 'asm goto'

2020-05-05 Thread Michael Ellerman
Segher Boessenkool  writes:
> Hi!
>
> On Wed, May 06, 2020 at 12:27:58AM +1000, Michael Ellerman wrote:
>> Christophe Leroy  writes:
>> > unsafe_put_user() is designed to take benefit of 'asm goto'.
>> >
>> > Instead of using the standard __put_user() approach and branch
>> > based on the returned error, use 'asm goto' and make the
>> > exception code branch directly to the error label. There is
>> > no code anymore in the fixup section.
>> >
>> > This change significantly simplifies functions using
>> > unsafe_put_user()
>> >
>> ...
>> >
>> > Signed-off-by: Christophe Leroy 
>> > ---
>> >  arch/powerpc/include/asm/uaccess.h | 61 +-
>> >  1 file changed, 52 insertions(+), 9 deletions(-)
>> >
>> > diff --git a/arch/powerpc/include/asm/uaccess.h 
>> > b/arch/powerpc/include/asm/uaccess.h
>> > index 9cc9c106ae2a..9365b59495a2 100644
>> > --- a/arch/powerpc/include/asm/uaccess.h
>> > +++ b/arch/powerpc/include/asm/uaccess.h
>> > @@ -196,6 +193,52 @@ do {  
>> > \
>> >  })
>> >  
>> >  
>> > +#define __put_user_asm_goto(x, addr, label, op)   \
>> > +  asm volatile goto(  \
>> > +  "1: " op "%U1%X1 %0,%1  # put_user\n"   \
>> > +  EX_TABLE(1b, %l2)   \
>> > +  :   \
>> > +  : "r" (x), "m<>" (*addr)\
>> 
>> The "m<>" here is breaking GCC 4.6.3, which we allegedly still support.
>
> [ You shouldn't use 4.6.3, there has been 4.6.4 since a while.  And 4.6
>   is nine years old now.  Most projects do not support < 4.8 anymore, on
>   any architecture.  ]

Moving up to 4.6.4 wouldn't actually help with this though, would it?

Also I have 4.6.3 compilers already built, and I don't really have time to
rebuild them for 4.6.4.

The kernel has a top-level minimum version, which I'm not in charge of, see:

https://www.kernel.org/doc/html/latest/process/changes.html?highlight=gcc


There were discussions about making 4.8 the minimum, but I'm not sure
where they got to.

>> Plain "m" works, how much does the "<>" affect code gen in practice?
>> 
>> A quick diff here shows no difference from removing "<>".
>
> It will make it impossible to use update-form instructions here.  That
> probably does not matter much at all, in this case.
>
> If you remove the "<>" constraints, also remove the "%Un" output modifier?

So like this?

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 62cc8d7640ec..ca847aed8e45 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -207,10 +207,10 @@ do {  
\
 
 #define __put_user_asm_goto(x, addr, label, op)\
asm volatile goto(  \
-   "1: " op "%U1%X1 %0,%1  # put_user\n"   \
+   "1: " op "%X1 %0,%1 # put_user\n"   \
EX_TABLE(1b, %l2)   \
:   \
-   : "r" (x), "m<>" (*addr)\
+   : "r" (x), "m" (*addr)  \
:   \
: label)
 


cheers


Re: [RFC PATCH 2/2] powerpc/64s: system call support for scv/rfscv instructions

2020-05-05 Thread Nicholas Piggin
Excerpts from Segher Boessenkool's message of May 6, 2020 8:11 am:
> Hi!
> 
> On Thu, Apr 30, 2020 at 02:02:02PM +1000, Nicholas Piggin wrote:
>> Add support for the scv instruction on POWER9 and later CPUs.
> 
> Looks good to me in general :-)

Thanks for taking a look.

>> For now this implements the zeroth scv vector 'scv 0', as identical
>> to 'sc' system calls, with the exception that lr is not preserved, and
>> it is 64-bit only. There may yet be changes made to this ABI, so it's
>> for testing only.
> 
> What does it do with SF=0?  I don't see how it is obviously not a
> security hole currently (but I didn't look too closely).

Oh, that's an outdated comment; I have since decided it's better to keep all the
code common and handle 32-bit compat the same way as the existing sc syscall.

Thanks,
Nick


Re: [PATCH v2 1/2] powerpc/64s/hash: add torture_slb kernel boot option to increase SLB faults

2020-05-05 Thread kbuild test robot
Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on linus/master v5.7-rc4 next-20200505]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:
https://github.com/0day-ci/linux/commits/Nicholas-Piggin/powerpc-64s-hash-add-torture_slb-kernel-boot-option-to-increase-SLB-faults/20200505-053958
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-randconfig-a001-20200503 (attached as .config)
compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project 
9e3549804672c79d64eececab39019f4dfd2b7ea)
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# install powerpc cross compiling tool for clang build
# apt-get install binutils-powerpc-linux-gnu
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot 

All error/warnings (new ones prefixed by >>):

   In file included from arch/powerpc/kernel/asm-offsets.c:14:
   In file included from include/linux/compat.h:10:
   In file included from include/linux/time.h:6:
   In file included from include/linux/seqlock.h:36:
   In file included from include/linux/spinlock.h:51:
   In file included from include/linux/preempt.h:78:
   In file included from ./arch/powerpc/include/generated/asm/preempt.h:1:
   In file included from include/asm-generic/preempt.h:5:
   In file included from include/linux/thread_info.h:21:
   In file included from arch/powerpc/include/asm/current.h:13:
   In file included from arch/powerpc/include/asm/paca.h:17:
   In file included from arch/powerpc/include/asm/lppaca.h:47:
   In file included from arch/powerpc/include/asm/mmu.h:356:
   In file included from arch/powerpc/include/asm/book3s/64/mmu.h:46:
>> arch/powerpc/include/asm/book3s/64/mmu-hash.h:321:1: warning: declaration 
>> specifier missing, defaulting to 'int'
   DECLARE_STATIC_KEY_FALSE(torture_slb_key);
   ^
   int
>> arch/powerpc/include/asm/book3s/64/mmu-hash.h:321:26: error: a parameter 
>> list without types is only allowed in a function definition
   DECLARE_STATIC_KEY_FALSE(torture_slb_key);
^
>> arch/powerpc/include/asm/book3s/64/mmu-hash.h:324:9: error: implicit 
>> declaration of function 'static_branch_unlikely' 
>> [-Werror,-Wimplicit-function-declaration]
    return static_branch_unlikely(&torture_slb_key);
  ^
>> arch/powerpc/include/asm/book3s/64/mmu-hash.h:324:33: error: use of 
>> undeclared identifier 'torture_slb_key'; did you mean 'torture_slb'?
    return static_branch_unlikely(&torture_slb_key);
  ^~~
  torture_slb
   arch/powerpc/include/asm/book3s/64/mmu-hash.h:322:20: note: 'torture_slb' 
declared here
   static inline bool torture_slb(void)
  ^
   In file included from arch/powerpc/kernel/asm-offsets.c:14:
   In file included from include/linux/compat.h:17:
   In file included from include/linux/fs.h:34:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:87:11: warning: array index 3 is past the end of the 
array (which contains 1 element) [-Warray-bounds]
   return (set->sig[3] | set->sig[2] |
   ^~
   arch/powerpc/include/uapi/asm/signal.h:18:2: note: array 'sig' declared here
   unsigned long sig[_NSIG_WORDS];
   ^
   In file included from arch/powerpc/kernel/asm-offsets.c:14:
   In file included from include/linux/compat.h:17:
   In file included from include/linux/fs.h:34:
   In file included from include/linux/percpu-rwsem.h:7:
   In file included from include/linux/rcuwait.h:6:
   In file included from include/linux/sched/signal.h:6:
   include/linux/signal.h:87:25: warning: array index 2 is past the end of the 
array (which contains 1 element) [-Warray-bounds]
   return (set->sig[3] | set->sig[2] |
 ^~
   arch/powerpc/include/uapi/asm/signal.h:18:2: note: array 'sig' declared here
   unsigned long sig[_NSIG_WORDS];
   ^
   In file included from arch/powerpc/kernel/asm-offsets.c:14:
   In file included from include/linux/compat.h:17:
   In file included from include/linux/fs.h:34:
   In file included from include/linux/percpu-rwsem.h:7:
   In file include

Re: [PATCH] powerpc/xive: Enforce load-after-store ordering when StoreEOI is active

2020-05-05 Thread Alistair Popple
I am still slowly wrapping my head around XIVE and its interaction with KVM,
but from what I can see this looks good and is needed so we can enable
StoreEOI support in the future, so:

Reviewed-by: Alistair Popple 

On Thursday, 20 February 2020 7:15:06 PM AEST Cédric Le Goater wrote:
> When an interrupt has been handled, the OS notifies the interrupt
> controller with a EOI sequence. On a POWER9 system using the XIVE
> interrupt controller, this can be done with a load or a store
> operation on the ESB interrupt management page of the interrupt. The
> StoreEOI operation has less latency and improves interrupt handling
> performance but it was deactivated during the POWER9 DD2.0 timeframe
> because of ordering issues. We use the LoadEOI today but we plan to
> reactivate StoreEOI in future architectures.
> 
> There is usually no need to enforce ordering between ESB load and
> store operations as they should lead to the same result. E.g. a store
> trigger and a load EOI can be executed in any order. Assuming the
> interrupt state is PQ=10, a store trigger followed by a load EOI will
> return a Q bit. In the reverse order, it will create a new interrupt
> trigger from HW. In both cases, the handler processing interrupts is
> notified.
> 
> In some cases, the XIVE_ESB_SET_PQ_10 load operation is used to
> disable temporarily the interrupt source (mask/unmask). When the
> source is reenabled, the OS can detect if interrupts were received
> while the source was disabled and reinject them. This process needs
> special care when StoreEOI is activated. The ESB load and store
> operations should be correctly ordered because a XIVE_ESB_STORE_EOI
> operation could leave the source enabled if it has not completed
> before the loads.
> 
> For those cases, we enforce Load-after-Store ordering with a special
> load operation offset. To avoid performance impact, this ordering is
> only enforced when really needed, that is when interrupt sources are
> temporarily disabled with the XIVE_ESB_SET_PQ_10 load. It should not
> be needed for other loads.
> 
> Signed-off-by: Cédric Le Goater 
> ---
>  arch/powerpc/include/asm/xive-regs.h| 8 
>  arch/powerpc/kvm/book3s_xive_native.c   | 6 ++
>  arch/powerpc/kvm/book3s_xive_template.c | 3 +++
>  arch/powerpc/sysdev/xive/common.c   | 3 +++
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 5 +
>  5 files changed, 25 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/xive-regs.h
> b/arch/powerpc/include/asm/xive-regs.h index f2dfcd50a2d3..b1996fbae59a
> 100644
> --- a/arch/powerpc/include/asm/xive-regs.h
> +++ b/arch/powerpc/include/asm/xive-regs.h
> @@ -37,6 +37,14 @@
>  #define XIVE_ESB_SET_PQ_10   0xe00 /* Load */
>  #define XIVE_ESB_SET_PQ_11   0xf00 /* Load */
> 
> +/*
> + * Load-after-store ordering
> + *
> + * Adding this offset to the load address will enforce
> + * load-after-store ordering. This is required to use StoreEOI.
> + */
+#define XIVE_ESB_LD_ST_MO    0x40 /* Load-after-store ordering */
> +
>  #define XIVE_ESB_VAL_P   0x2
>  #define XIVE_ESB_VAL_Q   0x1
> 
> diff --git a/arch/powerpc/kvm/book3s_xive_native.c
> b/arch/powerpc/kvm/book3s_xive_native.c index d83adb1e1490..c80b6a447efd
> 100644
> --- a/arch/powerpc/kvm/book3s_xive_native.c
> +++ b/arch/powerpc/kvm/book3s_xive_native.c
> @@ -31,6 +31,12 @@ static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32
> offset) {
>   u64 val;
> 
> + /*
> +  * The KVM XIVE native device does not use the XIVE_ESB_SET_PQ_10
> +  * load operation, so there is no need to enforce load-after-store
> +  * ordering.
> +  */
> +
>   if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
>   offset |= offset << 4;
> 
> diff --git a/arch/powerpc/kvm/book3s_xive_template.c
> b/arch/powerpc/kvm/book3s_xive_template.c index a8a900ace1e6..4ad3c0279458
> 100644
> --- a/arch/powerpc/kvm/book3s_xive_template.c
> +++ b/arch/powerpc/kvm/book3s_xive_template.c
> @@ -58,6 +58,9 @@ static u8 GLUE(X_PFX,esb_load)(struct xive_irq_data *xd,
> u32 offset) {
>   u64 val;
> 
> + if (offset == XIVE_ESB_SET_PQ_10 && xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
> + offset |= XIVE_ESB_LD_ST_MO;
> +
>   if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
>   offset |= offset << 4;
> 
> diff --git a/arch/powerpc/sysdev/xive/common.c
> b/arch/powerpc/sysdev/xive/common.c index f5fadbd2533a..0dc421bb494f 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -202,6 +202,9 @@ static notrace u8 xive_esb_read(struct xive_irq_data
> *xd, u32 offset) {
>   u64 val;
> 
> + if (offset == XIVE_ESB_SET_PQ_10 && xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
> + offset |= XIVE_ESB_LD_ST_MO;
> +
>   /* Handle HW errata */
>   if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
>   offset |= offset << 4;
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 

Re: remove set_fs calls from the coredump code v6

2020-05-05 Thread Al Viro
On Tue, May 05, 2020 at 12:12:49PM +0200, Christoph Hellwig wrote:
> Hi all,
> 
> this series gets rid of playing with the address limit in the exec and
> coredump code.  Most of this was fairly trivial, the biggest changes are
> those to the spufs coredump code.
> 
> Changes since v5:
>  - fix uaccess under spinlock in spufs (Jeremy)
>  - remove use of access_ok in spufs
> 
> Changes since v4:
>  - change some goto names as suggested by Linus
> 
> Changes since v3:
>  - fix x86 compilation with x32 in the new version of the signal code
>  - split the exec patches into a new series
> 
> Changes since v2:
>  - don't cleanup the compat siginfo calling conventions, use the patch
>variant from Eric with slight coding style fixes instead.
> 
> Changes since v1:
>  - properly spell NUL
>  - properly handle the compat siginfo case in ELF coredumps

Looks good.  Want me to put it into vfs.git?  #work.set_fs-exec, perhaps?


Re: [RFC PATCH 2/2] powerpc/64s: system call support for scv/rfscv instructions

2020-05-05 Thread Segher Boessenkool
Hi!

On Thu, Apr 30, 2020 at 02:02:02PM +1000, Nicholas Piggin wrote:
> Add support for the scv instruction on POWER9 and later CPUs.

Looks good to me in general :-)

> For now this implements the zeroth scv vector 'scv 0', as identical
> to 'sc' system calls, with the exception that lr is not preserved, and
> it is 64-bit only. There may yet be changes made to this ABI, so it's
> for testing only.

What does it do with SF=0?  I don't see how it is obviously not a
security hole currently (but I didn't look too closely).


Segher


Re: [PATCH 3/3] mm/hugetlb: Introduce HAVE_ARCH_CLEAR_HUGEPAGE_FLAGS

2020-05-05 Thread Andrew Morton
On Tue, 5 May 2020 08:21:34 +0530 Anshuman Khandual  
wrote:

> >>> static inline void arch_clear_hugepage_flags(struct page *page)
> >>> {
> >>>   
> >>> }
> >>> #define arch_clear_hugepage_flags arch_clear_hugepage_flags
> >>>
> >>> It's a small difference - mainly to avoid adding two variables to the
> >>> overall namespace where one would do.
> >>
> >> Understood, will change and resend.
> > 
> > That's OK - I've queued up that fix.
> >
> 
> Hello Andrew,
> 
> I might not have searched all the relevant trees or might have just searched
> earlier than required. But I dont see these patches (or your proposed fixes)
> either in mmotm (2020-04-29-23-04) or in next-20200504. Wondering if you are
> waiting on a V2 for this series accommodating the changes you had proposed.

hm.  I think I must have got confused and thought you were referring to
a different patch.  Yes please, let's have v2.


Re: remove set_fs calls from the coredump code v6

2020-05-05 Thread Christoph Hellwig
On Tue, May 05, 2020 at 09:34:46PM +0100, Al Viro wrote:
> Looks good.  Want me to put it into vfs.git?  #work.set_fs-exec, perhaps?

Sounds good.


Re: New powerpc vdso calling convention

2020-05-05 Thread Segher Boessenkool
Hi!

On Wed, Apr 29, 2020 at 12:39:22PM +1000, Nicholas Piggin wrote:
> Excerpts from Adhemerval Zanella's message of April 27, 2020 11:09 pm:
> >> Right, I'm just talking about those comments -- it seems like the kernel 
> >> vdso should contain an .opd section with function descriptors in it for
> >> elfv1 calls, rather than the hack it has now of creating one in the 
> >> caller's .data section.
> >> 
> >> But all that function descriptor code is gated by
> >> 
> >> #if (defined(__PPC64__) || defined(__powerpc64__)) && _CALL_ELF != 2
> >> 
> >> So it seems PPC32 does not use function descriptors but a direct pointer 
> >> to the entry point like PPC64 with ELFv2.
> > 
> > Yes, this hack is only for ELFv1.  The missing ODP has not been an issue 
> > or glibc because it has been using the inline assembly to emulate the 
> > functions call since initial vDSO support (INTERNAL_VSYSCALL_CALL_TYPE).
> > It just has become an issue when I added a ifunc optimization to 
> > gettimeofday so it can bypass the libc.so and make plt branch to vDSO 
> > directly.
> 
> I can't understand if it's actually a problem for you or not.
> 
> Regardless if you can hack around it, it seems to me that if we're going 
> to add sane calling conventions to the vdso, then we should also just 
> have a .opd section for it as well, whether or not a particular libc 
> requires it.

An OPD ("official procedure descriptor") is required for every function,
to have proper C semantics, so that pointers to functions (which are
pointers to descriptors, in fact) are unique.  You can "manually" make
descriptors just fine, and use those to call functions -- but you cannot
(in general) use a pointer to such a "fake" descriptor as the "id" of
the function.

The way the ABIs define the OPDs makes them guaranteed unique.
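
For background, an ELFv1 descriptor is just three doublewords; a rough
sketch with illustrative field names (not any particular kernel or libc
definition):

struct opd_entry {
	unsigned long entry;	/* address of the function's code */
	unsigned long toc;	/* TOC pointer the code expects in r2 */
	unsigned long env;	/* environment pointer, unused by C */
};

/*
 * Code can fill one of these in by hand and call through it, but a
 * pointer to such a hand-made descriptor is not equal to the pointer to
 * the linker-provided one, so it cannot stand in for the function's
 * identity the way a real function pointer must.
 */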


Segher


Re: [PATCH 2/3] ASoC: fsl_esai: Add support for imx8qm

2020-05-05 Thread Shengjiu Wang
Hi

On Fri, May 1, 2020 at 6:23 PM Mark Brown  wrote:
>
> On Fri, May 01, 2020 at 04:12:05PM +0800, Shengjiu Wang wrote:
> > The difference for esai on imx8qm is that DMA device is EDMA.
> >
> > EDMA requires the period size to be multiple of maxburst. Otherwise
> > the remaining bytes are not transferred and thus noise is produced.
>
> If this constraint comes from the DMA controller then normally you'd
> expect the DMA controller integration to be enforcing this - is there no
> information in the DMA API that lets us know that this constraint is
> there?

No, I can't find one API for this.
Do you have a recommendation?

best regards
wang shengjiu


[PATCH v8 04/30] powerpc/xmon: Use bitwise calculations in_breakpoint_table()

2020-05-05 Thread Jordan Niethe
A modulo operation is used for calculating the current offset from a
breakpoint within the breakpoint table. As instruction lengths are
always a power of 2, this can be replaced with a bitwise 'and'. The
current check for word alignment can be replaced with checking that the
lower 2 bits are not set.

Suggested-by: Christophe Leroy 
Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v6: New to series
---
 arch/powerpc/xmon/xmon.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 4ecb7e73b017..c52b117640f2 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -857,8 +857,8 @@ static struct bpt *in_breakpoint_table(unsigned long nip, 
unsigned long *offp)
off = nip - (unsigned long)bpt_table;
if (off >= sizeof(bpt_table))
return NULL;
-   *offp = off % BPT_SIZE;
-   if (*offp != 0 && *offp != 4)
+   *offp = off & (BPT_SIZE - 1);
+   if (off & 3)
return NULL;
return bpts + (off / BPT_SIZE);
 }
-- 
2.17.1



[PATCH v8 10/30] powerpc: Introduce functions for instruction equality

2020-05-05 Thread Jordan Niethe
In preparation for an instruction data type that cannot be directly
used with the '==' operator, use functions for checking equality.

Reviewed-by: Balamuruhan S 
Signed-off-by: Jordan Niethe 
---
v5: Remove ppc_inst_null()
v7: Fix compilation issue in expected_nop_sequence() when no
CONFIG_MPROFILE_KERNEL
---
 arch/powerpc/include/asm/inst.h|  5 +
 arch/powerpc/kernel/trace/ftrace.c | 15 ---
 arch/powerpc/lib/code-patching.c   | 12 ++--
 arch/powerpc/xmon/xmon.c   |  4 ++--
 4 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
index 23fd57a86b03..0c5dc539160a 100644
--- a/arch/powerpc/include/asm/inst.h
+++ b/arch/powerpc/include/asm/inst.h
@@ -23,4 +23,9 @@ static inline u32 ppc_inst_swab(u32 x)
return ppc_inst(swab32(ppc_inst_val(x)));
 }
 
+static inline bool ppc_inst_equal(u32 x, u32 y)
+{
+   return x == y;
+}
+
 #endif /* _ASM_INST_H */
diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index cc23c63f3769..cbb19af4a72a 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -72,7 +72,7 @@ ftrace_modify_code(unsigned long ip, unsigned int old, 
unsigned int new)
return -EFAULT;
 
/* Make sure it is what we expect it to be */
-   if (replaced != old) {
+   if (!ppc_inst_equal(replaced, old)) {
pr_err("%p: replaced (%#x) != old (%#x)",
(void *)ip, ppc_inst_val(replaced), ppc_inst_val(old));
return -EINVAL;
@@ -170,7 +170,8 @@ __ftrace_make_nop(struct module *mod,
}
 
/* We expect either a mflr r0, or a std r0, LRSAVE(r1) */
-   if (op != ppc_inst(PPC_INST_MFLR) && op != ppc_inst(PPC_INST_STD_LR)) {
+   if (!ppc_inst_equal(op, ppc_inst(PPC_INST_MFLR)) &&
+   !ppc_inst_equal(op, ppc_inst(PPC_INST_STD_LR))) {
pr_err("Unexpected instruction %08x around bl _mcount\n",
   ppc_inst_val(op));
return -EINVAL;
@@ -201,7 +202,7 @@ __ftrace_make_nop(struct module *mod,
return -EFAULT;
}
 
-   if (op != ppc_inst(PPC_INST_LD_TOC)) {
+   if (!ppc_inst_equal(op,  ppc_inst(PPC_INST_LD_TOC))) {
pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, 
ppc_inst_val(op));
return -EINVAL;
}
@@ -498,7 +499,7 @@ expected_nop_sequence(void *ip, unsigned int op0, unsigned 
int op1)
 * The load offset is different depending on the ABI. For simplicity
 * just mask it out when doing the compare.
 */
-   if (op0 != ppc_inst(0x48000008) ||
+   if (!ppc_inst_equal(op0, ppc_inst(0x48000008)) ||
(ppc_inst_val(op1) & 0xffff0000) != 0xe8410000)
return 0;
return 1;
@@ -508,7 +509,7 @@ static int
 expected_nop_sequence(void *ip, unsigned int op0, unsigned int op1)
 {
/* look for patched "NOP" on ppc64 with -mprofile-kernel */
-   if (op0 != ppc_inst(PPC_INST_NOP))
+   if (!ppc_inst_equal(op0, ppc_inst(PPC_INST_NOP)))
return 0;
return 1;
 }
@@ -591,7 +592,7 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long 
addr)
return -EFAULT;
 
/* It should be pointing to a nop */
-   if (op != ppc_inst(PPC_INST_NOP)) {
+   if (!ppc_inst_equal(op,  ppc_inst(PPC_INST_NOP))) {
pr_err("Expected NOP but have %x\n", ppc_inst_val(op));
return -EINVAL;
}
@@ -648,7 +649,7 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace 
*rec, unsigned long addr)
return -EFAULT;
}
 
-   if (op != ppc_inst(PPC_INST_NOP)) {
+   if (!ppc_inst_equal(op, ppc_inst(PPC_INST_NOP))) {
pr_err("Unexpected call sequence at %p: %x\n", ip, 
ppc_inst_val(op));
return -EINVAL;
}
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index f5c6dcbac44b..d298bb16936e 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -479,7 +479,7 @@ static void __init test_branch_iform(void)
/* Check flags are masked correctly */
err = create_branch(&instr, &instr, addr, 0xFFFFFFFC);
check(instr_is_branch_to_addr(&instr, addr));
-   check(instr == ppc_inst(0x48000000));
+   check(ppc_inst_equal(instr, ppc_inst(0x48000000)));
 }
 
 static void __init test_create_function_call(void)
@@ -564,7 +564,7 @@ static void __init test_branch_bform(void)
/* Check flags are masked correctly */
err = create_cond_branch(&instr, iptr, addr, 0xFFFFFFFC);
check(instr_is_branch_to_addr(&instr, addr));
-   check(instr == ppc_inst(0x43FF0000));
+   check(ppc_inst_equal(instr, ppc_inst(0x43FF0000)));
 }
 
 static void __init test_translate_branch(void)
@@ -598,7 +598,7 @@ static void __init test_translate_branch(void)

[PATCH v8 12/30] powerpc: Use a function for reading instructions

2020-05-05 Thread Jordan Niethe
Prefixed instructions will mean there are instructions of different
lengths. As a result, dereferencing a pointer to an instruction will not
necessarily give the desired result. Introduce a function for reading
instructions from memory into the instruction data type.
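
A stand-alone sketch of the idea (user-space, with a simplified one-word
ppc_inst used only for the example):

#include <stdio.h>

struct ppc_inst {
	unsigned int val;
};

/* single place that turns bytes in memory into a ppc_inst; a later patch
 * can fetch one or two words here without touching every caller */
static struct ppc_inst ppc_inst_read(const struct ppc_inst *ptr)
{
	return *ptr;
}

int main(void)
{
	struct ppc_inst text[1] = { { 0x60000000 } };	/* nop */
	struct ppc_inst insn = ppc_inst_read(text);

	printf("read 0x%08x\n", insn.val);
	return 0;
}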

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v4: New to series
v5: - Rename read_inst() -> probe_kernel_read_inst()
- No longer modify uprobe probe type in this patch
v6: - feature-fixups.c: do_final_fixups(): Use here
- arch_prepare_kprobe(): patch_instruction(): no longer part of this
  patch
- Move probe_kernel_read_inst() out of this patch
- Use in uprobes
v8: style
---
 arch/powerpc/include/asm/inst.h|  5 +
 arch/powerpc/kernel/kprobes.c  |  6 +++---
 arch/powerpc/kernel/mce_power.c|  2 +-
 arch/powerpc/kernel/optprobes.c|  4 ++--
 arch/powerpc/kernel/trace/ftrace.c |  4 ++--
 arch/powerpc/kernel/uprobes.c  |  2 +-
 arch/powerpc/lib/code-patching.c   | 26 ++
 arch/powerpc/lib/feature-fixups.c  |  4 ++--
 arch/powerpc/xmon/xmon.c   |  6 +++---
 9 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
index 19d8bb7a1c2b..552e953bf04f 100644
--- a/arch/powerpc/include/asm/inst.h
+++ b/arch/powerpc/include/asm/inst.h
@@ -27,6 +27,11 @@ static inline struct ppc_inst ppc_inst_swab(struct ppc_inst 
x)
return ppc_inst(swab32(ppc_inst_val(x)));
 }
 
+static inline struct ppc_inst ppc_inst_read(const struct ppc_inst *ptr)
+{
+   return *ptr;
+}
+
 static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y)
 {
return ppc_inst_val(x) == ppc_inst_val(y);
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index a08ae5803622..f64312dca84f 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -106,7 +106,7 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, 
unsigned int offset)
 int arch_prepare_kprobe(struct kprobe *p)
 {
int ret = 0;
-   struct ppc_inst insn = *(struct ppc_inst *)p->addr;
+   struct ppc_inst insn = ppc_inst_read((struct ppc_inst *)p->addr);
 
if ((unsigned long)p->addr & 0x03) {
printk("Attempt to register kprobe at an unaligned address\n");
@@ -127,7 +127,7 @@ int arch_prepare_kprobe(struct kprobe *p)
if (!ret) {
memcpy(p->ainsn.insn, p->addr,
MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-   p->opcode = *p->addr;
+   p->opcode = ppc_inst_val(insn);
flush_icache_range((unsigned long)p->ainsn.insn,
(unsigned long)p->ainsn.insn + sizeof(kprobe_opcode_t));
}
@@ -217,7 +217,7 @@ NOKPROBE_SYMBOL(arch_prepare_kretprobe);
 static int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
 {
int ret;
-   struct ppc_inst insn = *(struct ppc_inst *)p->ainsn.insn;
+   struct ppc_inst insn = ppc_inst_read((struct ppc_inst *)p->ainsn.insn);
 
/* regs->nip is also adjusted if emulate_step returns 1 */
ret = emulate_step(regs, insn);
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index cd23218c60bb..45c51ba0071b 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -374,7 +374,7 @@ static int mce_find_instr_ea_and_phys(struct pt_regs *regs, 
uint64_t *addr,
pfn = addr_to_pfn(regs, regs->nip);
if (pfn != ULONG_MAX) {
instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
-   instr = *(struct ppc_inst *)(instr_addr);
+   instr = ppc_inst_read((struct ppc_inst *)instr_addr);
if (!analyse_instr(&op, &tmp, instr)) {
pfn = addr_to_pfn(regs, op.ea);
*addr = op.ea;
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 5a71fef71c22..52c1ab3f85aa 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -100,9 +100,9 @@ static unsigned long can_optimize(struct kprobe *p)
 * Ensure that the instruction is not a conditional branch,
 * and that can be emulated.
 */
-   if (!is_conditional_branch(*(struct ppc_inst *)p->ainsn.insn) &&
+   if (!is_conditional_branch(ppc_inst_read((struct ppc_inst 
*)p->ainsn.insn)) &&
analyse_instr(&op, &regs,
- *(struct ppc_inst *)p->ainsn.insn) == 1) {
+ ppc_inst_read((struct ppc_inst *)p->ainsn.insn)) == 
1) {
emulate_update_regs(, );
nip = regs.nip;
}
diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index 3117ed675735..acd5b889815f 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -848,7 +848,7 @@ int ftrace_update_ftrace_func(ftrace_func_t 

[PATCH v8 13/30] powerpc: Add a probe_user_read_inst() function

2020-05-05 Thread Jordan Niethe
Introduce a probe_user_read_inst() function to use in cases where
probe_user_read() is used for getting an instruction. This will be more
useful for prefixed instructions.
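
A stand-alone sketch of the wrapper idea (user-space; the byte-copying
helper below stands in for probe_user_read() and is made up for the
example):

#include <stdio.h>
#include <string.h>

struct ppc_inst { unsigned int val; };

/* stand-in for probe_user_read(): copy bytes, return 0 on success */
static int fake_probe_read(void *dst, const void *src, size_t size)
{
	memcpy(dst, src, size);
	return 0;
}

/* keeps the "how many bytes is an instruction" decision in one place
 * instead of at every call site */
static int probe_user_read_inst(struct ppc_inst *inst, const void *nip)
{
	unsigned int val;
	int err;

	err = fake_probe_read(&val, nip, sizeof(val));
	if (!err)
		inst->val = val;
	return err;
}

int main(void)
{
	unsigned int text = 0x60000000;	/* nop */
	struct ppc_inst i;

	if (!probe_user_read_inst(&i, &text))
		printf("read 0x%08x\n", i.val);
	return 0;
}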

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v6: - New to series
---
 arch/powerpc/include/asm/inst.h |  3 +++
 arch/powerpc/lib/Makefile   |  2 +-
 arch/powerpc/lib/inst.c | 18 ++
 arch/powerpc/mm/fault.c |  2 +-
 4 files changed, 23 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/lib/inst.c

diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
index 552e953bf04f..3e9a58420151 100644
--- a/arch/powerpc/include/asm/inst.h
+++ b/arch/powerpc/include/asm/inst.h
@@ -37,4 +37,7 @@ static inline bool ppc_inst_equal(struct ppc_inst x, struct 
ppc_inst y)
return ppc_inst_val(x) == ppc_inst_val(y);
 }
 
+int probe_user_read_inst(struct ppc_inst *inst,
+struct ppc_inst *nip);
+
 #endif /* _ASM_INST_H */
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index b8de3be10eb4..546591848219 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,7 +16,7 @@ CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING
 CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING
 endif
 
-obj-y += alloc.o code-patching.o feature-fixups.o pmem.o
+obj-y += alloc.o code-patching.o feature-fixups.o pmem.o inst.o
 
 ifndef CONFIG_KASAN
 obj-y  +=  string.o memcmp_$(BITS).o
diff --git a/arch/powerpc/lib/inst.c b/arch/powerpc/lib/inst.c
new file mode 100644
index ..eaf786afad2b
--- /dev/null
+++ b/arch/powerpc/lib/inst.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  Copyright 2020, IBM Corporation.
+ */
+
+#include 
+#include 
+
+int probe_user_read_inst(struct ppc_inst *inst,
+struct ppc_inst *nip)
+{
+   unsigned int val;
+   int err;
+
+   err = probe_user_read(&val, nip, sizeof(val));
+   *inst = ppc_inst(val);
+   return err;
+}
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4a50f125ec18..f3a943eae305 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -281,7 +281,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, 
unsigned long address,
access_ok(nip, sizeof(*nip))) {
struct ppc_inst inst;
 
-   if (!probe_user_read(&inst, nip, sizeof(inst)))
+   if (!probe_user_read_inst(&inst, (struct ppc_inst 
__user *)nip))
return !store_updates_sp(inst);
*must_retry = true;
}
-- 
2.17.1



[PATCH v8 21/30] powerpc: Enable Prefixed Instructions

2020-05-05 Thread Jordan Niethe
From: Alistair Popple 

Prefixed instructions have their own FSCR bit which needs to be enabled via
a CPU feature. The kernel will save the FSCR for problem state but it
needs to be enabled initially.

If prefixed instructions are made unavailable by the [H]FSCR, attempting
to use them will cause a facility unavailable exception. Add "PREFIX" to
the facility_strings[].

Currently there are no prefixed instructions that are actually emulated
by emulate_instruction() within facility_unavailable_exception().
However, when caused by a prefixed instruction the SRR1 PREFIXED bit is
set. Prepare for dealing with emulated prefixed instructions by checking
for this bit.
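
For reference, what the new definitions boil down to (stand-alone sketch;
__MASK() is assumed to expand to a shifted 1, as elsewhere in reg.h):

#include <stdio.h>

#define FSCR_PREFIX_LG	13			/* bit number added by this patch */
#define FSCR_PREFIX	(1ul << FSCR_PREFIX_LG)	/* assumed __MASK() expansion */

int main(void)
{
	unsigned long fscr = 0;

	fscr |= FSCR_PREFIX;	/* enabling the facility amounts to setting this bit */
	printf("FSCR_PREFIX mask: 0x%lx\n", FSCR_PREFIX);
	printf("prefix facility %s\n",
	       (fscr & FSCR_PREFIX) ? "enabled" : "disabled");
	return 0;
}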

Reviewed-by: Nicholas Piggin 
Signed-off-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v4:
- Squash "Check for prefixed instructions in
  facility_unavailable_exception()" here
- Remove dt parts for now
---
 arch/powerpc/include/asm/reg.h | 3 +++
 arch/powerpc/kernel/traps.c| 1 +
 2 files changed, 4 insertions(+)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index da5cab038e25..773f76402392 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -397,6 +397,7 @@
 #define SPRN_RWMR  0x375   /* Region-Weighting Mode Register */
 
 /* HFSCR and FSCR bit numbers are the same */
+#define FSCR_PREFIX_LG 13  /* Enable Prefix Instructions */
 #define FSCR_SCV_LG	12	/* Enable System Call Vectored */
 #define FSCR_MSGP_LG   10  /* Enable MSGP */
 #define FSCR_TAR_LG	8	/* Enable Target Address Register */
@@ -408,11 +409,13 @@
 #define FSCR_VECVSX_LG 1   /* Enable VMX/VSX  */
 #define FSCR_FP_LG 0   /* Enable Floating Point */
 #define SPRN_FSCR  0x099   /* Facility Status & Control Register */
+#define   FSCR_PREFIX  __MASK(FSCR_PREFIX_LG)
 #define   FSCR_SCV __MASK(FSCR_SCV_LG)
 #define   FSCR_TAR __MASK(FSCR_TAR_LG)
 #define   FSCR_EBB __MASK(FSCR_EBB_LG)
 #define   FSCR_DSCR__MASK(FSCR_DSCR_LG)
 #define SPRN_HFSCR 0xbe/* HV=1 Facility Status & Control Register */
+#define   HFSCR_PREFIX __MASK(FSCR_PREFIX_LG)
 #define   HFSCR_MSGP   __MASK(FSCR_MSGP_LG)
 #define   HFSCR_TAR__MASK(FSCR_TAR_LG)
 #define   HFSCR_EBB__MASK(FSCR_EBB_LG)
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 3fca22276bb1..493a3fa0ac1a 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1720,6 +1720,7 @@ void facility_unavailable_exception(struct pt_regs *regs)
[FSCR_TAR_LG] = "TAR",
[FSCR_MSGP_LG] = "MSGP",
[FSCR_SCV_LG] = "SCV",
+   [FSCR_PREFIX_LG] = "PREFIX",
};
char *facility = "unknown";
u64 value;
-- 
2.17.1



[PATCH v8 20/30] powerpc: Make test_translate_branch() independent of instruction length

2020-05-05 Thread Jordan Niethe
test_translate_branch() uses two pointers to instructions within a
buffer, p and q, to test patch_branch(). The pointer arithmetic done on
them assumes a size of 4. This will not work if the instruction length
changes. Instead do the arithmetic relative to the void * to the buffer.

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v4: New to series
---
 arch/powerpc/lib/code-patching.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 435fc8e9f45d..d946f7d6bb32 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -572,7 +572,7 @@ static void __init test_branch_bform(void)
 static void __init test_translate_branch(void)
 {
unsigned long addr;
-   struct ppc_inst *p, *q;
+   void *p, *q;
struct ppc_inst instr;
void *buf;
 
@@ -586,7 +586,7 @@ static void __init test_translate_branch(void)
addr = (unsigned long)p;
patch_branch(p, addr, 0);
check(instr_is_branch_to_addr(p, addr));
-   q = p + 1;
+   q = p + 4;
translate_branch(&instr, q, p);
patch_instruction(q, instr);
check(instr_is_branch_to_addr(q, addr));
@@ -642,7 +642,7 @@ static void __init test_translate_branch(void)
create_cond_branch(&instr, p, addr, 0);
patch_instruction(p, instr);
check(instr_is_branch_to_addr(p, addr));
-   q = p + 1;
+   q = buf + 4;
translate_branch(&instr, q, p);
patch_instruction(q, instr);
check(instr_is_branch_to_addr(q, addr));
-- 
2.17.1



[PATCH v8 23/30] powerpc: Add prefixed instructions to instruction data type

2020-05-05 Thread Jordan Niethe
For powerpc64, redefine the ppc_inst type so both word and prefixed
instructions can be represented. On powerpc32 the type will remain the
same.  Update places which had assumed instructions to be 4 bytes long.
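
A stand-alone sketch of the new representation (user-space; the suffix
values are made up for the example):

#include <stdbool.h>
#include <stdio.h>

#define OP_PREFIX 1

struct ppc_inst {
	unsigned int val;
	unsigned int suffix;	/* 0xff marks "no suffix", i.e. a word instruction */
};

static bool ppc_inst_prefixed(struct ppc_inst x)
{
	return (x.val >> 26) == OP_PREFIX && x.suffix != 0xff;
}

static int ppc_inst_len(struct ppc_inst x)
{
	return ppc_inst_prefixed(x) ? 8 : 4;
}

int main(void)
{
	struct ppc_inst word = { .val = 0x60000000, .suffix = 0xff };
	struct ppc_inst prefixed = { .val = OP_PREFIX << 26, .suffix = 0x38210000 };

	printf("word instruction length: %d\n", ppc_inst_len(word));
	printf("prefixed instruction length: %d\n", ppc_inst_len(prefixed));
	return 0;
}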

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v4: New to series
v5:  - Distinguish normal instructions from prefixed instructions with a
   0xff marker for the suffix.
 - __patch_instruction() using std for prefixed instructions
v6:  - Return false instead of 0 in ppc_inst_prefixed()
 - Fix up types for ppc32 so it compiles
 - remove ppc_inst_write()
 - __patching_instruction(): move flush out of condition
v8:  - style
 - Define and use OP_PREFIX instead of '1' (back from v3)
 - __patch_instruction() fix for big endian
---
 arch/powerpc/include/asm/inst.h   | 69 ---
 arch/powerpc/include/asm/kprobes.h|  2 +-
 arch/powerpc/include/asm/ppc-opcode.h |  3 ++
 arch/powerpc/include/asm/uaccess.h| 40 +++-
 arch/powerpc/include/asm/uprobes.h|  2 +-
 arch/powerpc/kernel/crash_dump.c  |  2 +-
 arch/powerpc/kernel/optprobes.c   | 42 
 arch/powerpc/kernel/optprobes_head.S  |  3 ++
 arch/powerpc/lib/code-patching.c  | 19 ++--
 arch/powerpc/lib/feature-fixups.c |  5 +-
 arch/powerpc/lib/inst.c   | 41 
 arch/powerpc/lib/sstep.c  |  4 +-
 arch/powerpc/xmon/xmon.c  |  4 +-
 arch/powerpc/xmon/xmon_bpts.S |  2 +
 14 files changed, 200 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
index 2f3c9d5bcf7c..7868b80b610e 100644
--- a/arch/powerpc/include/asm/inst.h
+++ b/arch/powerpc/include/asm/inst.h
@@ -2,29 +2,79 @@
 #ifndef _ASM_INST_H
 #define _ASM_INST_H
 
+#include 
 /*
  * Instruction data type for POWER
  */
 
 struct ppc_inst {
u32 val;
+#ifdef __powerpc64__
+   u32 suffix;
+#endif /* __powerpc64__ */
 } __packed;
 
-#define ppc_inst(x) ((struct ppc_inst){ .val = x })
-
 static inline u32 ppc_inst_val(struct ppc_inst x)
 {
return x.val;
 }
 
-static inline int ppc_inst_len(struct ppc_inst x)
+static inline int ppc_inst_primary_opcode(struct ppc_inst x)
 {
-   return sizeof(struct ppc_inst);
+   return ppc_inst_val(x) >> 26;
 }
 
-static inline int ppc_inst_primary_opcode(struct ppc_inst x)
+#ifdef __powerpc64__
+#define ppc_inst(x) ((struct ppc_inst){ .val = (x), .suffix = 0xff })
+
+#define ppc_inst_prefix(x, y) ((struct ppc_inst){ .val = (x), .suffix = (y) })
+
+static inline u32 ppc_inst_suffix(struct ppc_inst x)
 {
-   return ppc_inst_val(x) >> 26;
+   return x.suffix;
+}
+
+static inline bool ppc_inst_prefixed(struct ppc_inst x)
+{
+   return (ppc_inst_primary_opcode(x) == 1) && ppc_inst_suffix(x) != 0xff;
+}
+
+static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
+{
+   return ppc_inst_prefix(swab32(ppc_inst_val(x)),
+  swab32(ppc_inst_suffix(x)));
+}
+
+static inline struct ppc_inst ppc_inst_read(const struct ppc_inst *ptr)
+{
+   u32 val, suffix;
+
+   val = *(u32 *)ptr;
+   if ((val >> 26) == 1) {
+   suffix = *((u32 *)ptr + 1);
+   return ppc_inst_prefix(val, suffix);
+   } else {
+   return ppc_inst(val);
+   }
+}
+
+static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y)
+{
+   return *(u64 *)&x == *(u64 *)&y;
+}
+
+#else
+
+#define ppc_inst(x) ((struct ppc_inst){ .val = x })
+
+static inline bool ppc_inst_prefixed(struct ppc_inst x)
+{
+   return false;
+}
+
+static inline u32 ppc_inst_suffix(struct ppc_inst x)
+{
+   return 0;
 }
 
 static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
@@ -42,6 +92,13 @@ static inline bool ppc_inst_equal(struct ppc_inst x, struct 
ppc_inst y)
return ppc_inst_val(x) == ppc_inst_val(y);
 }
 
+#endif /* __powerpc64__ */
+
+static inline int ppc_inst_len(struct ppc_inst x)
+{
+   return (ppc_inst_prefixed(x)) ? 8  : 4;
+}
+
 int probe_user_read_inst(struct ppc_inst *inst,
 struct ppc_inst *nip);
 int probe_kernel_read_inst(struct ppc_inst *inst,
diff --git a/arch/powerpc/include/asm/kprobes.h 
b/arch/powerpc/include/asm/kprobes.h
index 66b3f2983b22..4fc0e15e23a5 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -43,7 +43,7 @@ extern kprobe_opcode_t optprobe_template_ret[];
 extern kprobe_opcode_t optprobe_template_end[];
 
 /* Fixed instruction size for powerpc */
-#define MAX_INSN_SIZE  1
+#define MAX_INSN_SIZE  2
 #define MAX_OPTIMIZED_LENGTH   sizeof(kprobe_opcode_t) /* 4 bytes */
 #define MAX_OPTINSN_SIZE   (optprobe_template_end - 
optprobe_template_entry)
 #define RELATIVEJUMP_SIZE  sizeof(kprobe_opcode_t) /* 4 bytes */
diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index c1df75edde44..2a39c716c343 

[PATCH v8 30/30] powerpc sstep: Add support for prefixed fixed-point arithmetic

2020-05-05 Thread Jordan Niethe
This adds emulation support for the following prefixed Fixed-Point
Arithmetic instructions:
  * Prefixed Add Immediate (paddi)
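
For reference, what paddi computes (stand-alone sketch of the semantics
this patch emulates; the register and NIP values are made up for the
example):

#include <stdio.h>

/* paddi RT,RA,si34,R: RT = (RA ? GPR[RA] : 0) + si34 when R = 0,
 * or NIP + si34 when R = 1 (R = 1 with RA != 0 is an invalid form) */
static long long paddi_result(int r, int ra, long long gpr_ra,
			      long long nip, long long si34)
{
	if (!r)
		return (ra ? gpr_ra : 0) + si34;
	return nip + si34;
}

int main(void)
{
	printf("%lld\n", paddi_result(0, 3, 100, 0x1000, 42));	/* 142 */
	printf("%lld\n", paddi_result(1, 0, 0, 0x1000, 42));	/* 4138 (0x102a) */
	return 0;
}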

Reviewed-by: Balamuruhan S 
Signed-off-by: Jordan Niethe 
---
v3: Since we moved the prefixed loads/stores into the load/store switch
statement it no longer makes sense to have paddi in there, so move it
out.
---
 arch/powerpc/lib/sstep.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 6794a7672ad5..964fe7bbfce3 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1337,6 +1337,26 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 
switch (opcode) {
 #ifdef __powerpc64__
+   case 1:
+   prefix_r = word & (1ul << 20);
+   ra = (suffix >> 16) & 0x1f;
+   rd = (suffix >> 21) & 0x1f;
+   op->reg = rd;
+   op->val = regs->gpr[rd];
+   suffixopcode = suffix >> 26;
+   prefixtype = (word >> 24) & 0x3;
+   switch (prefixtype) {
+   case 2:
+   if (prefix_r && ra)
+   return 0;
+   switch (suffixopcode) {
+   case 14:/* paddi */
+   op->type = COMPUTE | PREFIXED;
+   op->val = mlsd_8lsd_ea(word, suffix, regs);
+   goto compute_done;
+   }
+   }
+   break;
case 2: /* tdi */
if (rd & trap_compare(regs->gpr[ra], (short) word))
goto trap;
-- 
2.17.1



[PATCH v8 03/30] powerpc/xmon: Move breakpoints to text section

2020-05-05 Thread Jordan Niethe
The instructions for xmon's breakpoints are stored in bpt_table[], which is
in the data section. This is problematic as the data section may be marked
as no execute. Move bpt_table[] to the text section.

Signed-off-by: Jordan Niethe 
---
v6: - New to series. Was part of the previous patch.
- Make BPT_SIZE available in assembly
---
 arch/powerpc/kernel/asm-offsets.c |  8 
 arch/powerpc/xmon/Makefile|  2 +-
 arch/powerpc/xmon/xmon.c  |  6 +-
 arch/powerpc/xmon/xmon_bpts.S |  9 +
 arch/powerpc/xmon/xmon_bpts.h | 14 ++
 5 files changed, 33 insertions(+), 6 deletions(-)
 create mode 100644 arch/powerpc/xmon/xmon_bpts.S
 create mode 100644 arch/powerpc/xmon/xmon_bpts.h

diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index fcf24a365fc0..9b9cde07e396 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -70,6 +70,10 @@
 #include 
 #endif
 
+#ifdef CONFIG_XMON
+#include "../xmon/xmon_bpts.h"
+#endif
+
 #define STACK_PT_REGS_OFFSET(sym, val) \
DEFINE(sym, STACK_FRAME_OVERHEAD + offsetof(struct pt_regs, val))
 
@@ -795,5 +799,9 @@ int main(void)
DEFINE(VIRT_IMMR_BASE, (u64)__fix_to_virt(FIX_IMMR_BASE));
 #endif
 
+#ifdef CONFIG_XMON
+   DEFINE(BPT_SIZE, BPT_SIZE);
+#endif
+
return 0;
 }
diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile
index 6f9cccea54f3..89c76ca35640 100644
--- a/arch/powerpc/xmon/Makefile
+++ b/arch/powerpc/xmon/Makefile
@@ -18,7 +18,7 @@ endif
 
 ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
 
-obj-y  += xmon.o nonstdio.o spr_access.o
+obj-y  += xmon.o nonstdio.o spr_access.o xmon_bpts.o
 
 ifdef CONFIG_XMON_DISASSEMBLY
 obj-y  += ppc-dis.o ppc-opc.o
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 14c578e0383a..4ecb7e73b017 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -62,6 +62,7 @@
 
 #include "nonstdio.h"
 #include "dis-asm.h"
+#include "xmon_bpts.h"
 
 #ifdef CONFIG_SMP
 static cpumask_t cpus_in_xmon = CPU_MASK_NONE;
@@ -109,7 +110,6 @@ struct bpt {
 #define BP_TRAP		2
 #define BP_DABR		4
 
-#define NBPTS  256
 static struct bpt bpts[NBPTS];
 static struct bpt dabr;
 static struct bpt *iabr;
@@ -117,10 +117,6 @@ static unsigned bpinstr = 0x7fe00008;  /* trap */
 
 #define BP_NUM(bp) ((bp) - bpts + 1)
 
-#define BPT_SIZE   (sizeof(unsigned int) * 2)
-#define BPT_WORDS  (BPT_SIZE / sizeof(unsigned int))
-static unsigned int bpt_table[NBPTS * BPT_WORDS];
-
 /* Prototypes */
 static int cmds(struct pt_regs *);
 static int mread(unsigned long, void *, int);
diff --git a/arch/powerpc/xmon/xmon_bpts.S b/arch/powerpc/xmon/xmon_bpts.S
new file mode 100644
index ..f3ad0ab50854
--- /dev/null
+++ b/arch/powerpc/xmon/xmon_bpts.S
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include 
+#include 
+#include 
+#include "xmon_bpts.h"
+
+.global bpt_table
+bpt_table:
+   .space NBPTS * BPT_SIZE
diff --git a/arch/powerpc/xmon/xmon_bpts.h b/arch/powerpc/xmon/xmon_bpts.h
new file mode 100644
index ..b7e94375db86
--- /dev/null
+++ b/arch/powerpc/xmon/xmon_bpts.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef XMON_BPTS_H
+#define XMON_BPTS_H
+
+#define NBPTS  256
+#ifndef __ASSEMBLY__
+#define BPT_SIZE   (sizeof(unsigned int) * 2)
+#define BPT_WORDS  (BPT_SIZE / sizeof(unsigned int))
+
+extern unsigned int bpt_table[NBPTS * BPT_WORDS];
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* XMON_BPTS_H */
-- 
2.17.1



[PATCH v8 02/30] powerpc/xmon: Move breakpoint instructions to own array

2020-05-05 Thread Jordan Niethe
To execute an instruction out of line after a breakpoint, the NIP is set
to the address of struct bpt::instr. Here a copy of the instruction that
was replaced with a breakpoint is kept, along with a trap so normal flow
can be resumed after XOLing. The struct bpt's are located within the
data section. This is problematic as the data section may be marked as
no execute.

Instead of each struct bpt holding the instructions to be XOL'd, make a
new array, bpt_table[], with enough space to hold instructions for the
number of supported breakpoints. A later patch will move this to the
text section.
Make struct bpt::instr a pointer to the instructions in bpt_table[]
associated with that breakpoint. This association is a simple mapping:
bpts[n] -> bpt_table[n * words per breakpoint]. Currently we only need
the copied instruction followed by a trap, so 2 words per breakpoint.
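
A stand-alone sketch of the mapping (user-space; bpt_instr() is a made-up
helper name used only for the example):

#include <stdio.h>

#define NBPTS		256
#define BPT_WORDS	2	/* copied instruction + trap */

static unsigned int bpt_table[NBPTS * BPT_WORDS];

/* breakpoint n owns BPT_WORDS consecutive words in bpt_table */
static unsigned int *bpt_instr(int n)
{
	return bpt_table + n * BPT_WORDS;
}

int main(void)
{
	printf("bpt 0 -> word offset %td\n", bpt_instr(0) - bpt_table);
	printf("bpt 5 -> word offset %td\n", bpt_instr(5) - bpt_table);
	return 0;
}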

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v4: New to series
v5: - Do not use __section(), use a .space directive in .S file
- Simplify in_breakpoint_table() calculation
- Define BPT_SIZE
v6: - Seperate moving to text section
---
 arch/powerpc/xmon/xmon.c | 21 -
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index f91ae2c9adbe..14c578e0383a 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -98,7 +98,7 @@ static long *xmon_fault_jmp[NR_CPUS];
 /* Breakpoint stuff */
 struct bpt {
unsigned long   address;
-   unsigned int	instr[2];
+   unsigned int	*instr;
atomic_t	ref_count;
int enabled;
unsigned long   pad;
@@ -117,6 +117,10 @@ static unsigned bpinstr = 0x7fe00008;  /* trap */
 
 #define BP_NUM(bp) ((bp) - bpts + 1)
 
+#define BPT_SIZE   (sizeof(unsigned int) * 2)
+#define BPT_WORDS  (BPT_SIZE / sizeof(unsigned int))
+static unsigned int bpt_table[NBPTS * BPT_WORDS];
+
 /* Prototypes */
 static int cmds(struct pt_regs *);
 static int mread(unsigned long, void *, int);
@@ -854,15 +858,13 @@ static struct bpt *in_breakpoint_table(unsigned long nip, 
unsigned long *offp)
 {
unsigned long off;
 
-   off = nip - (unsigned long) bpts;
-   if (off >= sizeof(bpts))
+   off = nip - (unsigned long)bpt_table;
+   if (off >= sizeof(bpt_table))
return NULL;
-   off %= sizeof(struct bpt);
-   if (off != offsetof(struct bpt, instr[0])
-   && off != offsetof(struct bpt, instr[1]))
+   *offp = off % BPT_SIZE;
+   if (*offp != 0 && *offp != 4)
return NULL;
-   *offp = off - offsetof(struct bpt, instr[0]);
-   return (struct bpt *) (nip - off);
+   return bpts + (off / BPT_SIZE);
 }
 
 static struct bpt *new_breakpoint(unsigned long a)
@@ -877,7 +879,8 @@ static struct bpt *new_breakpoint(unsigned long a)
for (bp = bpts; bp < &bpts[NBPTS]; ++bp) {
if (!bp->enabled && atomic_read(&bp->ref_count) == 0) {
bp->address = a;
-   patch_instruction(&bp->instr[1], bpinstr);
+   bp->instr = bpt_table + ((bp - bpts) * BPT_WORDS);
+   patch_instruction(bp->instr + 1, bpinstr);
return bp;
}
}
-- 
2.17.1



[PATCH v8 09/30] powerpc: Use a function for byte swapping instructions

2020-05-05 Thread Jordan Niethe
Use a function for byte swapping instructions in preparation for a more
complicated instruction type.

Reviewed-by: Balamuruhan S 
Signed-off-by: Jordan Niethe 
---
 arch/powerpc/include/asm/inst.h | 5 +
 arch/powerpc/kernel/align.c | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
index 442a95f20de7..23fd57a86b03 100644
--- a/arch/powerpc/include/asm/inst.h
+++ b/arch/powerpc/include/asm/inst.h
@@ -18,4 +18,9 @@ static inline int ppc_inst_primary_opcode(u32 x)
return ppc_inst_val(x) >> 26;
 }
 
+static inline u32 ppc_inst_swab(u32 x)
+{
+   return ppc_inst(swab32(ppc_inst_val(x)));
+}
+
 #endif /* _ASM_INST_H */
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 47dbba81a227..a63216da8cf1 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -310,7 +310,7 @@ int fix_alignment(struct pt_regs *regs)
/* We don't handle PPC little-endian any more... */
if (cpu_has_feature(CPU_FTR_PPC_LE))
return -EIO;
-   instr = swab32(instr);
+   instr = ppc_inst_swab(instr);
}
 
 #ifdef CONFIG_SPE
-- 
2.17.1



[PATCH v8 08/30] powerpc: Use a function for getting the instruction op code

2020-05-05 Thread Jordan Niethe
In preparation for using a data type for instructions that can not be
directly used with the '>>' operator use a function for getting the op
code of an instruction.
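
For reference, a stand-alone sketch of what the helper extracts (the
example opcodes are ordinary word instructions):

#include <stdio.h>

/* the primary opcode of a powerpc word instruction is its top 6 bits */
static int ppc_inst_primary_opcode(unsigned int x)
{
	return x >> 26;
}

int main(void)
{
	printf("0x48000000 -> opcode %d\n", ppc_inst_primary_opcode(0x48000000)); /* b */
	printf("0x7fe00008 -> opcode %d\n", ppc_inst_primary_opcode(0x7fe00008)); /* tw */
	return 0;
}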

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v4: New to series
v6: - Rename ppc_inst_primary() to ppc_inst_primary_opcode()
- Use in vecemu.c, fault.c, sstep.c
- Move this patch after the ppc_inst_val() patch
---
 arch/powerpc/include/asm/inst.h  | 5 +
 arch/powerpc/kernel/align.c  | 2 +-
 arch/powerpc/kernel/vecemu.c | 3 ++-
 arch/powerpc/lib/code-patching.c | 4 ++--
 arch/powerpc/lib/sstep.c | 2 +-
 arch/powerpc/mm/fault.c  | 3 ++-
 6 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
index 8a9e73bfbd27..442a95f20de7 100644
--- a/arch/powerpc/include/asm/inst.h
+++ b/arch/powerpc/include/asm/inst.h
@@ -13,4 +13,9 @@ static inline u32 ppc_inst_val(u32 x)
return x;
 }
 
+static inline int ppc_inst_primary_opcode(u32 x)
+{
+   return ppc_inst_val(x) >> 26;
+}
+
 #endif /* _ASM_INST_H */
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 44921001f84a..47dbba81a227 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -314,7 +314,7 @@ int fix_alignment(struct pt_regs *regs)
}
 
 #ifdef CONFIG_SPE
-   if ((ppc_inst_val(instr) >> 26) == 0x4) {
+   if (ppc_inst_primary_opcode(instr) == 0x4) {
int reg = (ppc_inst_val(instr) >> 21) & 0x1f;
PPC_WARN_ALIGNMENT(spe, regs);
return emulate_spe(regs, reg, instr);
diff --git a/arch/powerpc/kernel/vecemu.c b/arch/powerpc/kernel/vecemu.c
index 1f5e3b4c8ae4..a544590b90e5 100644
--- a/arch/powerpc/kernel/vecemu.c
+++ b/arch/powerpc/kernel/vecemu.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* Functions in vector.S */
 extern void vaddfp(vector128 *dst, vector128 *a, vector128 *b);
@@ -268,7 +269,7 @@ int emulate_altivec(struct pt_regs *regs)
return -EFAULT;
 
word = ppc_inst_val(instr);
-   if ((word >> 26) != 4)
+   if (ppc_inst_primary_opcode(instr) != 4)
return -EINVAL; /* not an altivec instruction */
vd = (word >> 21) & 0x1f;
va = (word >> 16) & 0x1f;
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index baa849b1a1f9..f5c6dcbac44b 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -231,7 +231,7 @@ bool is_offset_in_branch_range(long offset)
  */
 bool is_conditional_branch(unsigned int instr)
 {
-   unsigned int opcode = instr >> 26;
+   unsigned int opcode = ppc_inst_primary_opcode(instr);
 
if (opcode == 16)   /* bc, bca, bcl, bcla */
return true;
@@ -289,7 +289,7 @@ int create_cond_branch(unsigned int *instr, const unsigned 
int *addr,
 
 static unsigned int branch_opcode(unsigned int instr)
 {
-   return (instr >> 26) & 0x3F;
+   return ppc_inst_primary_opcode(instr) & 0x3F;
 }
 
 static int instr_is_branch_iform(unsigned int instr)
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 14c93ee4ffc8..7f7be154da7e 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1175,7 +1175,7 @@ int analyse_instr(struct instruction_op *op, const struct 
pt_regs *regs,
word = ppc_inst_val(instr);
op->type = COMPUTE;
 
-   opcode = instr >> 26;
+   opcode = ppc_inst_primary_opcode(instr);
switch (opcode) {
case 16:/* bc */
op->type = BRANCH;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 9364921870df..0e7e145d5cad 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Check whether the instruction inst is a store using
@@ -52,7 +53,7 @@ static bool store_updates_sp(unsigned int inst)
if (((ppc_inst_val(inst) >> 16) & 0x1f) != 1)
return false;
/* check major opcode */
-   switch (inst >> 26) {
+   switch (ppc_inst_primary_opcode(inst)) {
case OP_STWU:
case OP_STBU:
case OP_STHU:
-- 
2.17.1



[PATCH v8 14/30] powerpc: Add a probe_kernel_read_inst() function

2020-05-05 Thread Jordan Niethe
Introduce a probe_kernel_read_inst() function to use in cases where
probe_kernel_read() is used for getting an instruction. This will be
more useful for prefixed instructions.

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v6: - This was previously just in ftrace.c
---
 arch/powerpc/include/asm/inst.h|  2 ++
 arch/powerpc/kernel/trace/ftrace.c | 23 +--
 arch/powerpc/lib/inst.c| 11 +++
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
index 3e9a58420151..0d581b332c20 100644
--- a/arch/powerpc/include/asm/inst.h
+++ b/arch/powerpc/include/asm/inst.h
@@ -39,5 +39,7 @@ static inline bool ppc_inst_equal(struct ppc_inst x, struct 
ppc_inst y)
 
 int probe_user_read_inst(struct ppc_inst *inst,
 struct ppc_inst *nip);
+int probe_kernel_read_inst(struct ppc_inst *inst,
+  struct ppc_inst *src);
 
 #endif /* _ASM_INST_H */
diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index acd5b889815f..5e399628f51a 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -68,7 +68,7 @@ ftrace_modify_code(unsigned long ip, struct ppc_inst old, 
struct ppc_inst new)
 */
 
/* read the text we want to modify */
-   if (probe_kernel_read(&replaced, (void *)ip, MCOUNT_INSN_SIZE))
+   if (probe_kernel_read_inst(&replaced, (void *)ip))
return -EFAULT;
 
/* Make sure it is what we expect it to be */
@@ -130,7 +130,7 @@ __ftrace_make_nop(struct module *mod,
struct ppc_inst op, pop;
 
/* read where this goes */
-   if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
+   if (probe_kernel_read_inst(&op, (void *)ip)) {
pr_err("Fetching opcode failed.\n");
return -EFAULT;
}
@@ -164,7 +164,7 @@ __ftrace_make_nop(struct module *mod,
/* When using -mkernel_profile there is no load to jump over */
pop = ppc_inst(PPC_INST_NOP);
 
-   if (probe_kernel_read(&op, (void *)(ip - 4), 4)) {
+   if (probe_kernel_read_inst(&op, (void *)(ip - 4))) {
pr_err("Fetching instruction at %lx failed.\n", ip - 4);
return -EFAULT;
}
@@ -197,7 +197,7 @@ __ftrace_make_nop(struct module *mod,
 * Check what is in the next instruction. We can see ld r2,40(r1), but
 * on first pass after boot we will see mflr r0.
 */
-   if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE)) {
+   if (probe_kernel_read_inst(&op, (void *)(ip + 4))) {
pr_err("Fetching op failed.\n");
return -EFAULT;
}
@@ -349,7 +349,7 @@ static int setup_mcount_compiler_tramp(unsigned long tramp)
return -1;
 
/* New trampoline -- read where this goes */
-   if (probe_kernel_read(&op, (void *)tramp, sizeof(int))) {
+   if (probe_kernel_read_inst(&op, (void *)tramp)) {
pr_debug("Fetching opcode failed.\n");
return -1;
}
@@ -399,7 +399,7 @@ static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, 
unsigned long addr)
struct ppc_inst op;
 
/* Read where this goes */
-   if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
+   if (probe_kernel_read_inst(&op, (void *)ip)) {
pr_err("Fetching opcode failed.\n");
return -EFAULT;
}
@@ -526,7 +526,10 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long 
addr)
struct module *mod = rec->arch.mod;
 
/* read where this goes */
-   if (probe_kernel_read(op, ip, sizeof(op)))
+   if (probe_kernel_read_inst(op, ip))
+   return -EFAULT;
+
+   if (probe_kernel_read_inst(op + 1, ip + 4))
return -EFAULT;
 
if (!expected_nop_sequence(ip, op[0], op[1])) {
@@ -589,7 +592,7 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long 
addr)
unsigned long ip = rec->ip;
 
/* read where this goes */
-   if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
+   if (probe_kernel_read_inst(&op, (void *)ip))
return -EFAULT;
 
/* It should be pointing to a nop */
@@ -645,7 +648,7 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace 
*rec, unsigned long addr)
}
 
/* Make sure we have a nop */
-   if (probe_kernel_read(&op, ip, sizeof(op))) {
+   if (probe_kernel_read_inst(&op, ip)) {
pr_err("Unable to read ftrace location %p\n", ip);
return -EFAULT;
}
@@ -723,7 +726,7 @@ __ftrace_modify_call(struct dyn_ftrace *rec, unsigned long 
old_addr,
}
 
/* read where this goes */
-   if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
+   if (probe_kernel_read_inst(&op, (void *)ip)) {
pr_err("Fetching opcode failed.\n");
return -EFAULT;
}
diff --git 

[PATCH v8 19/30] powerpc/xmon: Move insertion of breakpoint for xol'ing

2020-05-05 Thread Jordan Niethe
When a new breakpoint is created, the second instruction of that
breakpoint is patched with a trap instruction. This assumes the length
of the instruction is always the same. In preparation for prefixed
instructions, remove this assumption. Insert the trap instruction at the
same time the first instruction is inserted.

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v8: style
---
 arch/powerpc/xmon/xmon.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 7a9cbc6d9b21..4d6980d51456 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -878,7 +878,6 @@ static struct bpt *new_breakpoint(unsigned long a)
if (!bp->enabled && atomic_read(&bp->ref_count) == 0) {
bp->address = a;
bp->instr = (void *)(bpt_table + ((bp - bpts) * 
BPT_WORDS));
-   patch_instruction(bp->instr + 1, ppc_inst(bpinstr));
return bp;
}
}
@@ -910,6 +909,8 @@ static void insert_bpts(void)
continue;
}
patch_instruction(bp->instr, instr);
+   patch_instruction((void *)bp->instr + ppc_inst_len(instr),
+ ppc_inst(bpinstr));
if (bp->enabled & BP_CIABR)
continue;
if (patch_instruction((struct ppc_inst *)bp->address,
-- 
2.17.1



[PATCH v8 24/30] powerpc: Test prefixed code patching

2020-05-05 Thread Jordan Niethe
Expand the code-patching self-tests to include tests for patching
prefixed instructions.

Signed-off-by: Jordan Niethe 
---
v6: New to series
v8: Use OP_PREFIX
---
 arch/powerpc/lib/Makefile |  2 +-
 arch/powerpc/lib/code-patching.c  | 21 +
 arch/powerpc/lib/test_code-patching.S | 20 
 3 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/test_code-patching.S

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 546591848219..5e994cda8e40 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,7 +16,7 @@ CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING
 CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING
 endif
 
-obj-y += alloc.o code-patching.o feature-fixups.o pmem.o inst.o
+obj-y += alloc.o code-patching.o feature-fixups.o pmem.o inst.o 
test_code-patching.o
 
 ifndef CONFIG_KASAN
 obj-y  +=  string.o memcmp_$(BITS).o
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 58b67b62d5d3..aa51a44bc138 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -708,6 +708,24 @@ static void __init test_translate_branch(void)
vfree(buf);
 }
 
+#ifdef __powerpc64__
+static void __init test_prefixed_patching(void)
+{
+   extern unsigned int code_patching_test1[];
+   extern unsigned int code_patching_test1_expected[];
+   extern unsigned int end_code_patching_test1[];
+
+   __patch_instruction((struct ppc_inst *)code_patching_test1,
+   ppc_inst_prefix(OP_PREFIX << 26, 0x00000000),
+   (struct ppc_inst *)code_patching_test1);
+
+   check(!memcmp(code_patching_test1,
+ code_patching_test1_expected,
+ sizeof(unsigned int) *
+ (end_code_patching_test1 - code_patching_test1)));
+}
+#endif
+
 static int __init test_code_patching(void)
 {
printk(KERN_DEBUG "Running code patching self-tests ...\n");
@@ -716,6 +734,9 @@ static int __init test_code_patching(void)
test_branch_bform();
test_create_function_call();
test_translate_branch();
+#ifdef __powerpc64__
+   test_prefixed_patching();
+#endif
 
return 0;
 }
diff --git a/arch/powerpc/lib/test_code-patching.S 
b/arch/powerpc/lib/test_code-patching.S
new file mode 100644
index ..a9be6107844e
--- /dev/null
+++ b/arch/powerpc/lib/test_code-patching.S
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 IBM Corporation
+ */
+#include 
+
+   .text
+
+#define globl(x)   \
+   .globl x;   \
+x:
+
+globl(code_patching_test1)
+   nop
+   nop
+globl(end_code_patching_test1)
+
+globl(code_patching_test1_expected)
+   .long OP_PREFIX << 26
+   .long 0x000
-- 
2.17.1



[PATCH v8 25/30] powerpc: Test prefixed instructions in feature fixups

2020-05-05 Thread Jordan Niethe
Expand the feature-fixups self-tests to include tests for prefixed
instructions.

Signed-off-by: Jordan Niethe 
---
v6: New to series
v8: Use OP_PREFIX
---
 arch/powerpc/lib/feature-fixups-test.S | 69 
 arch/powerpc/lib/feature-fixups.c  | 73 ++
 2 files changed, 142 insertions(+)

diff --git a/arch/powerpc/lib/feature-fixups-test.S 
b/arch/powerpc/lib/feature-fixups-test.S
index b12168c2447a..480172fbd024 100644
--- a/arch/powerpc/lib/feature-fixups-test.S
+++ b/arch/powerpc/lib/feature-fixups-test.S
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 
.text
 
@@ -791,3 +792,71 @@ globl(lwsync_fixup_test_expected_SYNC)
 1: or  1,1,1
sync
 
+globl(ftr_fixup_prefix1)
+   or  1,1,1
+   .long OP_PREFIX << 26
+   .long 0x000
+   or  2,2,2
+globl(end_ftr_fixup_prefix1)
+
+globl(ftr_fixup_prefix1_orig)
+   or  1,1,1
+   .long OP_PREFIX << 26
+   .long 0x000
+   or  2,2,2
+
+globl(ftr_fixup_prefix1_expected)
+   or  1,1,1
+   nop
+   nop
+   or  2,2,2
+
+globl(ftr_fixup_prefix2)
+   or  1,1,1
+   .long OP_PREFIX << 26
+   .long 0x000
+   or  2,2,2
+globl(end_ftr_fixup_prefix2)
+
+globl(ftr_fixup_prefix2_orig)
+   or  1,1,1
+   .long OP_PREFIX << 26
+   .long 0x000
+   or  2,2,2
+
+globl(ftr_fixup_prefix2_alt)
+   .long OP_PREFIX << 26
+   .long 0x001
+
+globl(ftr_fixup_prefix2_expected)
+   or  1,1,1
+   .long OP_PREFIX << 26
+   .long 0x001
+   or  2,2,2
+
+globl(ftr_fixup_prefix3)
+   or  1,1,1
+   .long OP_PREFIX << 26
+   .long 0x000
+   or  2,2,2
+   or  3,3,3
+globl(end_ftr_fixup_prefix3)
+
+globl(ftr_fixup_prefix3_orig)
+   or  1,1,1
+   .long OP_PREFIX << 26
+   .long 0x000
+   or  2,2,2
+   or  3,3,3
+
+globl(ftr_fixup_prefix3_alt)
+   .long OP_PREFIX << 26
+   .long 0x001
+   nop
+
+globl(ftr_fixup_prefix3_expected)
+   or  1,1,1
+   .long OP_PREFIX << 26
+   .long 0x001
+   nop
+   or  3,3,3
diff --git a/arch/powerpc/lib/feature-fixups.c 
b/arch/powerpc/lib/feature-fixups.c
index a8238eff3a31..5144854713e6 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -689,6 +689,74 @@ static void test_lwsync_macros(void)
}
 }
 
+#ifdef __powerpc64__
+static void __init test_prefix_patching(void)
+{
+   extern unsigned int ftr_fixup_prefix1[];
+   extern unsigned int end_ftr_fixup_prefix1[];
+   extern unsigned int ftr_fixup_prefix1_orig[];
+   extern unsigned int ftr_fixup_prefix1_expected[];
+   int size = sizeof(unsigned int) * (end_ftr_fixup_prefix1 - 
ftr_fixup_prefix1);
+
+   fixup.value = fixup.mask = 8;
+   fixup.start_off = calc_offset(&fixup, ftr_fixup_prefix1 + 1);
+   fixup.end_off = calc_offset(&fixup, ftr_fixup_prefix1 + 3);
+   fixup.alt_start_off = fixup.alt_end_off = 0;
+
+   /* Sanity check */
+   check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_orig, size) == 0);
+
+   patch_feature_section(0, &fixup);
+   check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_expected, size) == 0);
+   check(memcmp(ftr_fixup_prefix1, ftr_fixup_prefix1_orig, size) != 0);
+}
+
+static void __init test_prefix_alt_patching(void)
+{
+   extern unsigned int ftr_fixup_prefix2[];
+   extern unsigned int end_ftr_fixup_prefix2[];
+   extern unsigned int ftr_fixup_prefix2_orig[];
+   extern unsigned int ftr_fixup_prefix2_expected[];
+   extern unsigned int ftr_fixup_prefix2_alt[];
+   int size = sizeof(unsigned int) * (end_ftr_fixup_prefix2 - 
ftr_fixup_prefix2);
+
+   fixup.value = fixup.mask = 8;
+   fixup.start_off = calc_offset(&fixup, ftr_fixup_prefix2 + 1);
+   fixup.end_off = calc_offset(&fixup, ftr_fixup_prefix2 + 3);
+   fixup.alt_start_off = calc_offset(&fixup, ftr_fixup_prefix2_alt);
+   fixup.alt_end_off = calc_offset(&fixup, ftr_fixup_prefix2_alt + 2);
+   /* Sanity check */
+   check(memcmp(ftr_fixup_prefix2, ftr_fixup_prefix2_orig, size) == 0);
+
+   patch_feature_section(0, &fixup);
+   check(memcmp(ftr_fixup_prefix2, ftr_fixup_prefix2_expected, size) == 0);
+   check(memcmp(ftr_fixup_prefix2, ftr_fixup_prefix2_orig, size) != 0);
+}
+
+static void __init test_prefix_word_alt_patching(void)
+{
+   extern unsigned int ftr_fixup_prefix3[];
+   extern unsigned int end_ftr_fixup_prefix3[];
+   extern unsigned int ftr_fixup_prefix3_orig[];
+   extern unsigned int ftr_fixup_prefix3_expected[];
+   extern unsigned int ftr_fixup_prefix3_alt[];
+   int size = sizeof(unsigned int) * (end_ftr_fixup_prefix3 - 
ftr_fixup_prefix3);
+
+   fixup.value = fixup.mask = 8;
+   fixup.start_off = calc_offset(&fixup, ftr_fixup_prefix3 + 1);
+   fixup.end_off = calc_offset(&fixup, ftr_fixup_prefix3 + 4);
+   

[PATCH v8 29/30] powerpc sstep: Add support for prefixed load/stores

2020-05-05 Thread Jordan Niethe
This adds emulation support for the following prefixed integer
load/stores:
  * Prefixed Load Byte and Zero (plbz)
  * Prefixed Load Halfword and Zero (plhz)
  * Prefixed Load Halfword Algebraic (plha)
  * Prefixed Load Word and Zero (plwz)
  * Prefixed Load Word Algebraic (plwa)
  * Prefixed Load Doubleword (pld)
  * Prefixed Store Byte (pstb)
  * Prefixed Store Halfword (psth)
  * Prefixed Store Word (pstw)
  * Prefixed Store Doubleword (pstd)
  * Prefixed Load Quadword (plq)
  * Prefixed Store Quadword (pstq)

the following prefixed floating-point load/stores:
  * Prefixed Load Floating-Point Single (plfs)
  * Prefixed Load Floating-Point Double (plfd)
  * Prefixed Store Floating-Point Single (pstfs)
  * Prefixed Store Floating-Point Double (pstfd)

and for the following prefixed VSX load/stores:
  * Prefixed Load VSX Scalar Doubleword (plxsd)
  * Prefixed Load VSX Scalar Single-Precision (plxssp)
  * Prefixed Load VSX Vector [0|1]  (plxv, plxv0, plxv1)
  * Prefixed Store VSX Scalar Doubleword (pstxsd)
  * Prefixed Store VSX Scalar Single-Precision (pstxssp)
  * Prefixed Store VSX Vector [0|1] (pstxv, pstxv0, pstxv1)
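
As a cross-check of the displacement handling shared by these forms, a
stand-alone user-space sketch of how the 34-bit immediate is assembled from
the prefix (d0) and suffix (d1) and sign-extended; the field masks follow
the MLS:D/8LS:D layout used in the code below:

#include <stdio.h>

static long long mlsd_8lsd_disp(unsigned int prefix, unsigned int suffix)
{
	unsigned long long d0 = prefix & 0x3ffff;	/* 18 bits from the prefix */
	unsigned long long d1 = suffix & 0xffff;	/* 16 bits from the suffix */
	unsigned long long d = (d0 << 16) | d1;

	/* sign extend from bit 33 of the 34-bit displacement */
	if (d & (1ull << 33))
		d |= ~((1ull << 34) - 1);
	return (long long)d;
}

int main(void)
{
	printf("%lld\n", mlsd_8lsd_disp(0x0003ffff, 0xffff));	/* all ones -> -1 */
	printf("%lld\n", mlsd_8lsd_disp(0, 0x10));		/* small positive -> 16 */
	return 0;
}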

Reviewed-by: Balamuruhan S 
Signed-off-by: Jordan Niethe 
---
v2: - Combine all load/store patches
- Fix the name of Type 01 instructions
- Remove sign extension flag from pstd/pld
- Rename sufx -> suffix
v3: - Move prefixed loads and stores into the switch statement
v6: - Compile on ppc32
- Add back in + GETLENGTH(op->type)
v8: Use fallthrough; keyword
---
 arch/powerpc/include/asm/sstep.h |   4 +
 arch/powerpc/lib/sstep.c | 163 ++-
 2 files changed, 165 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index c3ce903ac488..9b200a5f8794 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -90,11 +90,15 @@ enum instruction_type {
 #define VSX_LDLEFT 4   /* load VSX register from left */
 #define VSX_CHECK_VEC  8   /* check MSR_VEC not MSR_VSX for reg >= 32 */
 
+/* Prefixed flag, ORed in with type */
+#define PREFIXED   0x800
+
 /* Size field in type word */
 #define SIZE(n)	((n) << 12)
 #define GETSIZE(w) ((w) >> 12)
 
 #define GETTYPE(t) ((t) & INSTR_TYPE_MASK)
+#define GETLENGTH(t)   (((t) & PREFIXED) ? 8 : 4)
 
 #define MKOP(t, f, s)  ((t) | (f) | SIZE(s))
 
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index ecd756c346fd..6794a7672ad5 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -187,6 +187,44 @@ static nokprobe_inline unsigned long xform_ea(unsigned int 
instr,
return ea;
 }
 
+/*
+ * Calculate effective address for a MLS:D-form / 8LS:D-form
+ * prefixed instruction
+ */
+static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned int instr,
+ unsigned int suffix,
+ const struct pt_regs *regs)
+{
+   int ra, prefix_r;
+   unsigned int  dd;
+   unsigned long ea, d0, d1, d;
+
+   prefix_r = instr & (1ul << 20);
+   ra = (suffix >> 16) & 0x1f;
+
+   d0 = instr & 0x3ffff;
+   d1 = suffix & 0xffff;
+   d = (d0 << 16) | d1;
+
+   /*
+* sign extend a 34 bit number
+*/
+   dd = (unsigned int)(d >> 2);
+   ea = (signed int)dd;
+   ea = (ea << 2) | (d & 0x3);
+
+   if (!prefix_r && ra)
+   ea += regs->gpr[ra];
+   else if (!prefix_r && !ra)
+   ; /* Leave ea as is */
+   else if (prefix_r && !ra)
+   ea += regs->nip;
+   else if (prefix_r && ra)
+   ; /* Invalid form. Should already be checked for by caller! */
+
+   return ea;
+}
+
 /*
  * Return the largest power of 2, not greater than sizeof(unsigned long),
  * such that x is a multiple of it.
@@ -1166,6 +1204,9 @@ int analyse_instr(struct instruction_op *op, const struct 
pt_regs *regs,
  struct ppc_inst instr)
 {
unsigned int opcode, ra, rb, rc, rd, spr, u;
+#ifdef __powerpc64__
+   unsigned int suffixopcode, prefixtype, prefix_r;
+#endif
unsigned long int imm;
unsigned long int val, val2;
unsigned int mb, me, sh;
@@ -2652,6 +2693,124 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
break;
}
break;
+   case 1: /* Prefixed instructions */
+   prefix_r = word & (1ul << 20);
+   ra = (suffix >> 16) & 0x1f;
+   op->update_reg = ra;
+   rd = (suffix >> 21) & 0x1f;
+   op->reg = rd;
+   op->val = regs->gpr[rd];
+
+   suffixopcode = suffix >> 26;
+   prefixtype = (word >> 24) & 0x3;
+   switch (prefixtype) {
+   case 0: /* Type 00  Eight-Byte Load/Store */
+   if (prefix_r && ra)
+   

[PATCH v8 28/30] powerpc: Support prefixed instructions in alignment handler

2020-05-05 Thread Jordan Niethe
If a prefixed instruction results in an alignment exception, the
SRR1_PREFIXED bit is set. The handler attempts to emulate the
responsible instruction and then increment the NIP past it. Use
SRR1_PREFIXED to determine by how much the NIP should be incremented.

Prefixed instructions are not permitted to cross 64-byte boundaries. If
they do, the alignment interrupt is invoked with the SRR1 BOUNDARY bit set.
If this occurs send a SIGBUS to the offending process if in user mode.
If in kernel mode call bad_page_fault().
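
The NIP adjustment itself is small; a stand-alone sketch (the
REASON_PREFIXED value here is a stand-in for the example, not the real
SRR1 bit):

#include <stdio.h>

#define REASON_PREFIXED	0x2000	/* stand-in bit for the example */

/* how far to advance the NIP after emulating the faulting instruction */
#define inst_length(reason)	(((reason) & REASON_PREFIXED) ? 8 : 4)

int main(void)
{
	unsigned long nip = 0x1000;

	nip += inst_length(0);			/* ordinary word instruction */
	printf("after word instruction:     0x%lx\n", nip);
	nip += inst_length(REASON_PREFIXED);	/* prefixed instruction */
	printf("after prefixed instruction: 0x%lx\n", nip);
	return 0;
}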

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v2: - Move __get_user_instr() and __get_user_instr_inatomic() to this
commit (previously in "powerpc sstep: Prepare to support prefixed
instructions").
- Rename sufx to suffix
- Use a macro for calculating instruction length
v3: Move __get_user_{instr(), instr_inatomic()} up with the other
get_user definitions and remove nested if.
v4: Rolled into "Add prefixed instructions to instruction data type"
v5: Only one definition of inst_length()
---
 arch/powerpc/kernel/traps.c | 19 ++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 493a3fa0ac1a..105242cc2f28 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -583,6 +583,8 @@ static inline int check_io_access(struct pt_regs *regs)
 #define REASON_ILLEGAL (ESR_PIL | ESR_PUO)
 #define REASON_PRIVILEGED  ESR_PPR
 #define REASON_TRAP		ESR_PTR
+#define REASON_PREFIXED		0
+#define REASON_BOUNDARY		0
 
 /* single-step stuff */
 #define single_stepping(regs)  (current->thread.debug.dbcr0 & DBCR0_IC)
@@ -597,12 +599,16 @@ static inline int check_io_access(struct pt_regs *regs)
 #define REASON_ILLEGAL SRR1_PROGILL
 #define REASON_PRIVILEGED  SRR1_PROGPRIV
 #define REASON_TRAP		SRR1_PROGTRAP
+#define REASON_PREFIXED		SRR1_PREFIXED
+#define REASON_BOUNDARY		SRR1_BOUNDARY
 
 #define single_stepping(regs)  ((regs)->msr & MSR_SE)
 #define clear_single_step(regs)	((regs)->msr &= ~MSR_SE)
 #define clear_br_trace(regs)   ((regs)->msr &= ~MSR_BE)
 #endif
 
+#define inst_length(reason)	(((reason) & REASON_PREFIXED) ? 8 : 4)
+
 #if defined(CONFIG_E500)
 int machine_check_e500mc(struct pt_regs *regs)
 {
@@ -1593,11 +1599,20 @@ void alignment_exception(struct pt_regs *regs)
 {
enum ctx_state prev_state = exception_enter();
int sig, code, fixed = 0;
+   unsigned long  reason;
 
/* We restore the interrupt state now */
if (!arch_irq_disabled_regs(regs))
local_irq_enable();
 
+   reason = get_reason(regs);
+
+   if (reason & REASON_BOUNDARY) {
+   sig = SIGBUS;
+   code = BUS_ADRALN;
+   goto bad;
+   }
+
if (tm_abort_check(regs, TM_CAUSE_ALIGNMENT | TM_CAUSE_PERSISTENT))
goto bail;
 
@@ -1606,7 +1621,8 @@ void alignment_exception(struct pt_regs *regs)
fixed = fix_alignment(regs);
 
if (fixed == 1) {
-   regs->nip += 4; /* skip over emulated instruction */
+   /* skip over emulated instruction */
+   regs->nip += inst_length(reason);
emulate_single_step(regs);
goto bail;
}
@@ -1619,6 +1635,7 @@ void alignment_exception(struct pt_regs *regs)
sig = SIGBUS;
code = BUS_ADRALN;
}
+bad:
if (user_mode(regs))
_exception(sig, regs, code, regs->dar);
else
-- 
2.17.1



Re: [PATCH v3 1/2] powerpc/fadump: use static allocation for reserved memory ranges

2020-05-05 Thread Michael Ellerman
On Mon, 2020-04-20 at 08:56:09 UTC, Hari Bathini wrote:
> At times, memory ranges have to be looked up during early boot, when
> kernel couldn't be initialized for dynamic memory allocation. In fact,
> reserved-ranges look up is needed during FADump memory reservation.
> Without accounting for reserved-ranges in reserving memory for FADump,
> MPIPL boot fails with memory corruption issues. So, extend memory
> ranges handling to support static allocation and populate reserved
> memory ranges during early boot.
> 
> Fixes: dda9dbfeeb7a ("powerpc/fadump: consider reserved ranges while 
> releasing memory")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Hari Bathini 
> Reviewed-by: Mahesh Salgaonkar 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/02c04e374e176ae3a3f64a682f80702f8d2fb65d

cheers


Re: [PATCH v5 1/5] powerpc: Move idle_loop_prolog()/epilog() functions to header file

2020-05-05 Thread Michael Ellerman
On Tue, 2020-04-07 at 08:47:39 UTC, "Gautham R. Shenoy" wrote:
> From: "Gautham R. Shenoy" 
> 
> Currently prior to entering an idle state on a Linux Guest, the
> pseries cpuidle driver implement an idle_loop_prolog() and
> idle_loop_epilog() functions which ensure that idle_purr is correctly
> computed, and the hypervisor is informed that the CPU cycles have been
> donated.
> 
> These prolog and epilog functions are also required in the default
> idle call, i.e pseries_lpar_idle(). Hence move these accessor
> functions to a common header file and call them from
> pseries_lpar_idle(). Since the existing header files such as
> asm/processor.h have enough clutter, create a new header file
> asm/idle.h. Finally rename idle_loop_prolog() and idle_loop_epilog()
> to pseries_idle_prolog() and pseries_idle_epilog() as they are only
> relevant on pseries guests.
> 
> Signed-off-by: Gautham R. Shenoy 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/e4a884cc28fa3f5d8b81de46998ffe29b4ad169e

cheers


Re: [PATCH] powerpc/64: Have MPROFILE_KERNEL depend on FUNCTION_TRACER

2020-05-05 Thread Michael Ellerman
On Wed, 2020-04-22 at 09:26:12 UTC, "Naveen N. Rao" wrote:
> Currently, it is possible to have CONFIG_FUNCTION_TRACER disabled, but
> CONFIG_MPROFILE_KERNEL enabled. Though all existing users of
> MPROFILE_KERNEL are doing the right thing, it is weird to have
> MPROFILE_KERNEL enabled when the function tracer isn't. Fix this by
> making MPROFILE_KERNEL depend on FUNCTION_TRACER.
> 
> Signed-off-by: Naveen N. Rao 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/57b3ed941b5542aaebcd9f59369571bbce9d6dcc

cheers


Re: [PATCH] powerpc/ps3: Move static keyword to the front of declaration

2020-05-05 Thread Michael Ellerman
On Wed, 2020-04-29 at 10:00:48 UTC, Xiongfeng Wang wrote:
> Move the static keyword to the front of declaration of 'vuart_bus_priv',
> and resolve the following compiler warning that can be seen when
> building with warnings enabled (W=1):
> 
> drivers/ps3/ps3-vuart.c:867:1: warning: ‘static’ is not at beginning of 
> declaration [-Wold-style-declaration]
>  } static vuart_bus_priv;
>  ^
> 
> Reported-by: Hulk Robot 
> Signed-off-by: Xiongfeng Wang 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/43c8a496fa37187b54f7df71fb8262acc6bf6200

cheers


[PATCH v8 01/30] powerpc/xmon: Remove store_inst() for patch_instruction()

2020-05-05 Thread Jordan Niethe
For modifying instructions in xmon, patch_instruction() can serve the
same role that store_inst() is performing with the advantage of not
being specific to xmon. In some places patch_instruction() is already
being using followed by store_inst(). In these cases just remove the
store_inst(). Otherwise replace store_inst() with patch_instruction().

Reviewed-by: Nicholas Piggin 
Signed-off-by: Jordan Niethe 
---
v4: Read into a local variable
---
 arch/powerpc/xmon/xmon.c | 18 +-
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 7af840c0fc93..f91ae2c9adbe 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -326,11 +326,6 @@ static inline void sync(void)
asm volatile("sync; isync");
 }
 
-static inline void store_inst(void *p)
-{
-   asm volatile ("dcbst 0,%0; sync; icbi 0,%0; isync" : : "r" (p));
-}
-
 static inline void cflush(void *p)
 {
asm volatile ("dcbf 0,%0; icbi 0,%0" : : "r" (p));
@@ -882,8 +877,7 @@ static struct bpt *new_breakpoint(unsigned long a)
for (bp = bpts; bp < &bpts[NBPTS]; ++bp) {
if (!bp->enabled && atomic_read(&bp->ref_count) == 0) {
bp->address = a;
-   bp->instr[1] = bpinstr;
-   store_inst(>instr[1]);
+   patch_instruction(>instr[1], bpinstr);
return bp;
}
}
@@ -895,25 +889,26 @@ static struct bpt *new_breakpoint(unsigned long a)
 static void insert_bpts(void)
 {
int i;
+   unsigned int instr;
struct bpt *bp;
 
bp = bpts;
for (i = 0; i < NBPTS; ++i, ++bp) {
if ((bp->enabled & (BP_TRAP|BP_CIABR)) == 0)
continue;
-   if (mread(bp->address, &bp->instr[0], 4) != 4) {
+   if (mread(bp->address, &instr, 4) != 4) {
printf("Couldn't read instruction at %lx, "
   "disabling breakpoint there\n", bp->address);
bp->enabled = 0;
continue;
}
-   if (IS_MTMSRD(bp->instr[0]) || IS_RFID(bp->instr[0])) {
+   if (IS_MTMSRD(instr) || IS_RFID(instr)) {
printf("Breakpoint at %lx is on an mtmsrd or rfid "
   "instruction, disabling it\n", bp->address);
bp->enabled = 0;
continue;
}
-   store_inst(&bp->instr[0]);
+   patch_instruction(bp->instr, instr);
if (bp->enabled & BP_CIABR)
continue;
if (patch_instruction((unsigned int *)bp->address,
@@ -923,7 +918,6 @@ static void insert_bpts(void)
bp->enabled &= ~BP_TRAP;
continue;
}
-   store_inst((void *)bp->address);
}
 }
 
@@ -958,8 +952,6 @@ static void remove_bpts(void)
(unsigned int *)bp->address, bp->instr[0]) != 0)
printf("Couldn't remove breakpoint at %lx\n",
   bp->address);
-   else
-   store_inst((void *)bp->address);
}
 }
 
-- 
2.17.1



[PATCH v8 07/30] powerpc: Use an accessor for instructions

2020-05-05 Thread Jordan Niethe
In preparation for introducing a more complicated instruction type to
accommodate prefixed instructions use an accessor for getting an
instruction as a u32.
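
As a rough illustration of the intent (simplified userspace sketch; the
ppc_inst_t typedef and is_mtmsrd() helper are stand-ins, not kernel names),
call sites that mask through the accessor keep working unchanged when the
underlying type later becomes a struct:

#include <stdint.h>

typedef uint32_t ppc_inst_t;	/* stand-in; becomes a struct later in the series */

static inline uint32_t ppc_inst_val(ppc_inst_t x)
{
	return x;
}

static int is_mtmsrd(ppc_inst_t instr)
{
	/* mask and value taken from the IS_MTMSRD() definition below */
	return (ppc_inst_val(instr) & 0xfc0007be) == 0x7c000124;
}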

Signed-off-by: Jordan Niethe 
---
v4: New to series
v5: Remove references to 'word' instructions
v6: - test_emulate_step.c: execute_compute_instr(): Introduce
  ppc_inst_val() here instead of in a later patch
- ftrace.c: __ftrace_make_call(): Introduce adding ppc_inst_val() in
  this patch
- fault.c: store_updates_sp(): Start using ppc_inst_val()
- Move this patch before the ppc_inst_primary_opcode() patch
v8: - Style
- Missed some in ftrace.c: __ftrace_make_nop(), __ftrace_make_call()
---
 arch/powerpc/include/asm/inst.h  |   5 +
 arch/powerpc/include/asm/sstep.h |   6 +-
 arch/powerpc/kernel/align.c  |   6 +-
 arch/powerpc/kernel/kprobes.c|   2 +-
 arch/powerpc/kernel/trace/ftrace.c   |  30 +--
 arch/powerpc/kernel/vecemu.c |  16 +-
 arch/powerpc/lib/code-patching.c |  18 +-
 arch/powerpc/lib/sstep.c | 268 ++-
 arch/powerpc/lib/test_emulate_step.c |   8 +-
 arch/powerpc/mm/fault.c  |   6 +-
 arch/powerpc/xmon/xmon.c |   4 +-
 11 files changed, 190 insertions(+), 179 deletions(-)

diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
index 5298ba33b6e5..8a9e73bfbd27 100644
--- a/arch/powerpc/include/asm/inst.h
+++ b/arch/powerpc/include/asm/inst.h
@@ -8,4 +8,9 @@
 
 #define ppc_inst(x) (x)
 
+static inline u32 ppc_inst_val(u32 x)
+{
+   return x;
+}
+
 #endif /* _ASM_INST_H */
diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 769f055509c9..26d729562fe2 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -15,9 +15,9 @@ struct pt_regs;
  * Note that IS_MTMSRD returns true for both an mtmsr (32-bit)
  * and an mtmsrd (64-bit).
  */
-#define IS_MTMSRD(instr)   (((instr) & 0xfc0007be) == 0x7c000124)
-#define IS_RFID(instr) (((instr) & 0xfc0007fe) == 0x4c000024)
-#define IS_RFI(instr)  (((instr) & 0xfc0007fe) == 0x4c000064)
+#define IS_MTMSRD(instr)   ((ppc_inst_val(instr) & 0xfc0007be) == 0x7c000124)
+#define IS_RFID(instr) ((ppc_inst_val(instr) & 0xfc0007fe) == 0x4c000024)
+#define IS_RFI(instr)  ((ppc_inst_val(instr) & 0xfc0007fe) == 0x4c000064)
 
 enum instruction_type {
COMPUTE,/* arith/logical/CR op, etc. */
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 86e9bf62f18c..44921001f84a 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -314,8 +314,8 @@ int fix_alignment(struct pt_regs *regs)
}
 
 #ifdef CONFIG_SPE
-   if ((instr >> 26) == 0x4) {
-   int reg = (instr >> 21) & 0x1f;
+   if ((ppc_inst_val(instr) >> 26) == 0x4) {
+   int reg = (ppc_inst_val(instr) >> 21) & 0x1f;
PPC_WARN_ALIGNMENT(spe, regs);
return emulate_spe(regs, reg, instr);
}
@@ -332,7 +332,7 @@ int fix_alignment(struct pt_regs *regs)
 * when pasting to a co-processor. Furthermore, paste_last is the
 * synchronisation point for preceding copy/paste sequences.
 */
-   if ((instr & 0xfc0006fe) == (PPC_INST_COPY & 0xfc0006fe))
+   if ((ppc_inst_val(instr) & 0xfc0006fe) == (PPC_INST_COPY & 0xfc0006fe))
return -EIO;
 
r = analyse_instr(, regs, instr);
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 2378a7ed4438..92fa3070d905 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -234,7 +234,7 @@ static int try_to_emulate(struct kprobe *p, struct pt_regs 
*regs)
 * So, we should never get here... but, its still
 * good to catch them, just in case...
 */
-   printk("Can't step on instruction %x\n", insn);
+   printk("Can't step on instruction %x\n", ppc_inst_val(insn));
BUG();
} else {
/*
diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index 00f69b7baa8a..cc23c63f3769 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -74,7 +74,7 @@ ftrace_modify_code(unsigned long ip, unsigned int old, 
unsigned int new)
/* Make sure it is what we expect it to be */
if (replaced != old) {
pr_err("%p: replaced (%#x) != old (%#x)",
-   (void *)ip, replaced, old);
+   (void *)ip, ppc_inst_val(replaced), ppc_inst_val(old));
return -EINVAL;
}
 
@@ -99,19 +99,19 @@ static int test_24bit_addr(unsigned long ip, unsigned long 
addr)
 
 static int is_bl_op(unsigned int op)
 {
-   return (op & 0xfc000003) == 0x48000001;
+   return (ppc_inst_val(op) & 0xfc000003) == 0x48000001;
 }
 
 static int is_b_op(unsigned int 

[PATCH v8 11/30] powerpc: Use a datatype for instructions

2020-05-05 Thread Jordan Niethe
Currently unsigned ints are used to represent instructions on powerpc.
This has worked well as instructions have always been 4 byte words.
However, a future ISA version will introduce some changes to
instructions that mean this scheme will no longer work as well. This
change is Prefixed Instructions. A prefixed instruction is made up of a
word prefix followed by a word suffix to make an 8 byte double word
instruction. No matter the endianness of the system the prefix always
comes first. Prefixed instructions are only planned for powerpc64.

Introduce a ppc_inst type to represent both prefixed and word
instructions on powerpc64 while keeping it possible to exclusively have
word instructions on powerpc32.
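
A rough sketch of where this is heading (illustrative only, not the exact
definition added by this patch; the suffix member only appears later in the
series):

#include <stdint.h>

struct example_inst {
	uint32_t val;
#ifdef __powerpc64__
	uint32_t suffix;	/* only meaningful for prefixed instructions */
#endif
};

static inline uint32_t example_inst_val(struct example_inst x)
{
	return x.val;	/* accessors hide the representation from callers */
}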

Signed-off-by: Jordan Niethe 
---
v4: New to series
v5: Add to epapr_paravirt.c, kgdb.c
v6: - setup_32.c: machine_init(): Use type
- feature-fixups.c: do_final_fixups(): Use type
- optprobes.c: arch_prepare_optimized_kprobe(): change a void * to
  struct ppc_inst *
- fault.c: store_updates_sp(): Use type
- Change ppc_inst_equal() implementation from memcpy()
v7: - Fix compilation issue in early_init_dt_scan_epapr() and
  do_patch_instruction() with CONFIG_STRICT_KERNEL_RWX
v8: - style
- Use in crash_dump.c, mpc86xx_smp.c, smp.c
---
 arch/powerpc/include/asm/code-patching.h  | 32 -
 arch/powerpc/include/asm/inst.h   | 18 +++--
 arch/powerpc/include/asm/sstep.h  |  5 +-
 arch/powerpc/include/asm/uprobes.h|  5 +-
 arch/powerpc/kernel/align.c   |  4 +-
 arch/powerpc/kernel/crash_dump.c  |  2 +-
 arch/powerpc/kernel/epapr_paravirt.c  |  6 +-
 arch/powerpc/kernel/hw_breakpoint.c   |  4 +-
 arch/powerpc/kernel/jump_label.c  |  2 +-
 arch/powerpc/kernel/kgdb.c|  4 +-
 arch/powerpc/kernel/kprobes.c |  8 +--
 arch/powerpc/kernel/mce_power.c   |  5 +-
 arch/powerpc/kernel/optprobes.c   | 64 +
 arch/powerpc/kernel/setup_32.c|  4 +-
 arch/powerpc/kernel/trace/ftrace.c| 83 ---
 arch/powerpc/kernel/vecemu.c  |  5 +-
 arch/powerpc/lib/code-patching.c  | 76 ++---
 arch/powerpc/lib/feature-fixups.c | 60 
 arch/powerpc/lib/sstep.c  |  4 +-
 arch/powerpc/lib/test_emulate_step.c  |  9 +--
 arch/powerpc/mm/fault.c   |  4 +-
 arch/powerpc/perf/core-book3s.c   |  4 +-
 arch/powerpc/platforms/86xx/mpc86xx_smp.c |  4 +-
 arch/powerpc/platforms/powermac/smp.c |  4 +-
 arch/powerpc/xmon/xmon.c  | 22 +++---
 arch/powerpc/xmon/xmon_bpts.h |  6 +-
 26 files changed, 233 insertions(+), 211 deletions(-)

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index 48e021957ee5..eacc9102c251 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -23,33 +23,33 @@
 #define BRANCH_ABSOLUTE0x2
 
 bool is_offset_in_branch_range(long offset);
-int create_branch(unsigned int *instr, const unsigned int *addr,
+int create_branch(struct ppc_inst *instr, const struct ppc_inst *addr,
  unsigned long target, int flags);
-int create_cond_branch(unsigned int *instr, const unsigned int *addr,
+int create_cond_branch(struct ppc_inst *instr, const struct ppc_inst *addr,
   unsigned long target, int flags);
-int patch_branch(unsigned int *addr, unsigned long target, int flags);
-int patch_instruction(unsigned int *addr, unsigned int instr);
-int raw_patch_instruction(unsigned int *addr, unsigned int instr);
+int patch_branch(struct ppc_inst *addr, unsigned long target, int flags);
+int patch_instruction(struct ppc_inst *addr, struct ppc_inst instr);
+int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr);
 
 static inline unsigned long patch_site_addr(s32 *site)
 {
return (unsigned long)site + *site;
 }
 
-static inline int patch_instruction_site(s32 *site, unsigned int instr)
+static inline int patch_instruction_site(s32 *site, struct ppc_inst instr)
 {
-   return patch_instruction((unsigned int *)patch_site_addr(site), instr);
+   return patch_instruction((struct ppc_inst *)patch_site_addr(site), 
instr);
 }
 
 static inline int patch_branch_site(s32 *site, unsigned long target, int flags)
 {
-   return patch_branch((unsigned int *)patch_site_addr(site), target, 
flags);
+   return patch_branch((struct ppc_inst *)patch_site_addr(site), target, 
flags);
 }
 
 static inline int modify_instruction(unsigned int *addr, unsigned int clr,
 unsigned int set)
 {
-   return patch_instruction(addr, ppc_inst((*addr & ~clr) | set));
+   return patch_instruction((struct ppc_inst *)addr, ppc_inst((*addr & 
~clr) | set));
 }
 
 static inline int modify_instruction_site(s32 *site, unsigned int clr, 
unsigned int set)
@@ -57,13 

[PATCH v8 17/30] powerpc: Introduce a function for reporting instruction length

2020-05-05 Thread Jordan Niethe
Currently all instructions have the same length, but in preparation for
prefixed instructions introduce a function for returning instruction
length.
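
To illustrate why callers need this, here is a minimal userspace sketch
(invented names, not kernel code) of walking a code buffer by decoded length
instead of a fixed 4-byte stride:

#include <stdint.h>
#include <stdio.h>

static int insn_len(uint32_t first_word)
{
	return (first_word >> 26) == 1 ? 8 : 4;	/* major opcode 1 => prefix */
}

int main(void)
{
	/* a nop, then a fake prefixed instruction (prefix + suffix), then a nop */
	uint32_t code[] = { 0x60000000, 1u << 26, 0x00000000, 0x60000000 };
	unsigned char *p = (unsigned char *)code;
	unsigned char *end = p + sizeof(code);

	while (p < end) {
		int len = insn_len(*(uint32_t *)p);

		printf("instruction at offset %ld is %d bytes\n",
		       (long)(p - (unsigned char *)code), len);
		p += len;
	}
	return 0;
}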

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v6: - feature-fixups.c: do_final_fixups(): use here
- ppc_inst_len(): change return type from bool to int
- uprobes: Use ppc_inst_read() before calling ppc_inst_len()
---
 arch/powerpc/include/asm/inst.h   |  5 +
 arch/powerpc/kernel/kprobes.c |  6 --
 arch/powerpc/kernel/uprobes.c |  2 +-
 arch/powerpc/lib/feature-fixups.c | 14 +++---
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
index 0d581b332c20..2f3c9d5bcf7c 100644
--- a/arch/powerpc/include/asm/inst.h
+++ b/arch/powerpc/include/asm/inst.h
@@ -17,6 +17,11 @@ static inline u32 ppc_inst_val(struct ppc_inst x)
return x.val;
 }
 
+static inline int ppc_inst_len(struct ppc_inst x)
+{
+   return sizeof(struct ppc_inst);
+}
+
 static inline int ppc_inst_primary_opcode(struct ppc_inst x)
 {
return ppc_inst_val(x) >> 26;
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index a72c8e1a42ad..33d54b091c70 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -462,14 +462,16 @@ NOKPROBE_SYMBOL(trampoline_probe_handler);
  */
 int kprobe_post_handler(struct pt_regs *regs)
 {
+   int len;
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
if (!cur || user_mode(regs))
return 0;
 
+   len = ppc_inst_len(ppc_inst_read((struct ppc_inst *)cur->ainsn.insn));
/* make sure we got here for instruction we have a kprobe on */
-   if (((unsigned long)cur->ainsn.insn + 4) != regs->nip)
+   if (((unsigned long)cur->ainsn.insn + len) != regs->nip)
return 0;
 
if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
@@ -478,7 +480,7 @@ int kprobe_post_handler(struct pt_regs *regs)
}
 
/* Adjust nip to after the single-stepped instruction */
-   regs->nip = (unsigned long)cur->addr + 4;
+   regs->nip = (unsigned long)cur->addr + len;
regs->msr |= kcb->kprobe_saved_msr;
 
/*Restore back the original saved kprobes variables and continue. */
diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
index 6893d40a48c5..83e883e1a42d 100644
--- a/arch/powerpc/kernel/uprobes.c
+++ b/arch/powerpc/kernel/uprobes.c
@@ -112,7 +112,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, 
struct pt_regs *regs)
 * support doesn't exist and have to fix-up the next instruction
 * to be executed.
 */
-   regs->nip = utask->vaddr + MAX_UINSN_BYTES;
+   regs->nip = utask->vaddr + ppc_inst_len(ppc_inst_read(&auprobe->insn));
 
user_disable_single_step(current);
return 0;
diff --git a/arch/powerpc/lib/feature-fixups.c 
b/arch/powerpc/lib/feature-fixups.c
index c0d3ed4efb7e..2bd2b752de4f 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -392,20 +392,20 @@ void do_lwsync_fixups(unsigned long value, void 
*fixup_start, void *fixup_end)
 static void do_final_fixups(void)
 {
 #if defined(CONFIG_PPC64) && defined(CONFIG_RELOCATABLE)
-   struct ppc_inst *src, *dest;
-   unsigned long length;
+   struct ppc_inst inst, *src, *dest, *end;
 
if (PHYSICAL_START == 0)
return;
 
src = (struct ppc_inst *)(KERNELBASE + PHYSICAL_START);
dest = (struct ppc_inst *)KERNELBASE;
-   length = (__end_interrupts - _stext) / sizeof(struct ppc_inst);
+   end = (void *)src + (__end_interrupts - _stext);
 
-   while (length--) {
-   raw_patch_instruction(dest, ppc_inst_read(src));
-   src++;
-   dest++;
+   while (src < end) {
+   inst = ppc_inst_read(src);
+   raw_patch_instruction(dest, inst);
+   src = (void *)src + ppc_inst_len(inst);
+   dest = (void *)dest + ppc_inst_len(inst);
}
 #endif
 }
-- 
2.17.1



[PATCH v8 18/30] powerpc/xmon: Use a function for reading instructions

2020-05-05 Thread Jordan Niethe
Currently in xmon, mread() is used for reading instructions. In
preparation for prefixed instructions, create and use a new function,
mread_instr(), especially for reading instructions.

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v5: New to series, separated from "Add prefixed instructions to
instruction data type"
v6: mread_instr(): correctly return error status
v8: style
---
 arch/powerpc/xmon/xmon.c | 28 
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 68e0b05d9226..7a9cbc6d9b21 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -122,6 +122,7 @@ static unsigned bpinstr = 0x7fe00008;   /* trap */
 static int cmds(struct pt_regs *);
 static int mread(unsigned long, void *, int);
 static int mwrite(unsigned long, void *, int);
+static int mread_instr(unsigned long, struct ppc_inst *);
 static int handle_fault(struct pt_regs *);
 static void byterev(unsigned char *, int);
 static void memex(void);
@@ -896,7 +897,7 @@ static void insert_bpts(void)
for (i = 0; i < NBPTS; ++i, ++bp) {
if ((bp->enabled & (BP_TRAP|BP_CIABR)) == 0)
continue;
-   if (mread(bp->address, &instr, 4) != 4) {
+   if (!mread_instr(bp->address, &instr)) {
printf("Couldn't read instruction at %lx, "
   "disabling breakpoint there\n", bp->address);
bp->enabled = 0;
@@ -946,7 +947,7 @@ static void remove_bpts(void)
for (i = 0; i < NBPTS; ++i, ++bp) {
if ((bp->enabled & (BP_TRAP|BP_CIABR)) != BP_TRAP)
continue;
-   if (mread(bp->address, &instr, 4) == 4
+   if (mread_instr(bp->address, &instr)
&& ppc_inst_equal(instr, ppc_inst(bpinstr))
&& patch_instruction(
(struct ppc_inst *)bp->address, 
ppc_inst_read(bp->instr)) != 0)
@@ -1162,7 +1163,7 @@ static int do_step(struct pt_regs *regs)
force_enable_xmon();
/* check we are in 64-bit kernel mode, translation enabled */
if ((regs->msr & (MSR_64BIT|MSR_PR|MSR_IR)) == (MSR_64BIT|MSR_IR)) {
-   if (mread(regs->nip, &instr, 4) == 4) {
+   if (mread_instr(regs->nip, &instr)) {
stepped = emulate_step(regs, instr);
if (stepped < 0) {
printf("Couldn't single-step %s instruction\n",
@@ -1329,7 +1330,7 @@ static long check_bp_loc(unsigned long addr)
printf("Breakpoints may only be placed at kernel addresses\n");
return 0;
}
-   if (!mread(addr, &instr, sizeof(instr))) {
+   if (!mread_instr(addr, &instr)) {
printf("Can't read instruction at address %lx\n", addr);
return 0;
}
@@ -2122,6 +2123,25 @@ mwrite(unsigned long adrs, void *buf, int size)
return n;
 }
 
+static int
+mread_instr(unsigned long adrs, struct ppc_inst *instr)
+{
+   volatile int n;
+
+   n = 0;
+   if (setjmp(bus_error_jmp) == 0) {
+   catch_memory_errors = 1;
+   sync();
+   *instr = ppc_inst_read((struct ppc_inst *)adrs);
+   sync();
+   /* wait a little while to see if we get a machine check */
+   __delay(200);
+   n = ppc_inst_len(*instr);
+   }
+   catch_memory_errors = 0;
+   return n;
+}
+
 static int fault_type;
 static int fault_except;
 static char *fault_chars[] = { "--", "**", "##" };
-- 
2.17.1



[PATCH v8 26/30] powerpc/xmon: Don't allow breakpoints on suffixes

2020-05-05 Thread Jordan Niethe
Do not allow placing xmon breakpoints on the suffix of a prefix
instruction.
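
The check being added boils down to something like this simplified sketch
(plain C, invented names): an address is a suffix if the word four bytes
before it is a prefix, and if that word has already been replaced by a
breakpoint trap, the instruction saved for that breakpoint must be consulted
instead:

#include <stdint.h>

static int is_prefix(uint32_t word)
{
	return (word >> 26) == 1;	/* all prefixes carry major opcode 1 */
}

/*
 * prev_in_memory: the word at addr - 4 as currently read from memory.
 * prev_is_breakpoint/prev_saved: whether that word is a planted breakpoint
 * trap, and if so the original instruction saved for it.
 */
static int addr_is_suffix(uint32_t prev_in_memory, int prev_is_breakpoint,
			  uint32_t prev_saved)
{
	return is_prefix(prev_is_breakpoint ? prev_saved : prev_in_memory);
}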

Signed-off-by: Jordan Niethe 
---
v8: Add this back from v3
---
 arch/powerpc/xmon/xmon.c | 29 +++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 647b3829c4eb..d082c35c6638 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -889,8 +889,8 @@ static struct bpt *new_breakpoint(unsigned long a)
 static void insert_bpts(void)
 {
int i;
-   struct ppc_inst instr;
-   struct bpt *bp;
+   struct ppc_inst instr, instr2;
+   struct bpt *bp, *bp2;
 
bp = bpts;
for (i = 0; i < NBPTS; ++i, ++bp) {
@@ -908,6 +908,31 @@ static void insert_bpts(void)
bp->enabled = 0;
continue;
}
+   /*
+* Check the address is not a suffix by looking for a prefix in
+* front of it.
+*/
+   if (mread_instr(bp->address - 4, &instr2) == 8) {
+   printf("Breakpoint at %lx is on the second word of a "
+  "prefixed instruction, disabling it\n",
+  bp->address);
+   bp->enabled = 0;
+   continue;
+   }
+   /*
+* We might still be a suffix - if the prefix has already been
+* replaced by a breakpoint we won't catch it with the above
+* test.
+*/
+   bp2 = at_breakpoint(bp->address - 4);
+   if (bp2 && ppc_inst_prefixed(ppc_inst_read(bp2->instr))) {
+   printf("Breakpoint at %lx is on the second word of a "
+  "prefixed instruction, disabling it\n",
+  bp->address);
+   bp->enabled = 0;
+   continue;
+   }
+
patch_instruction(bp->instr, instr);
patch_instruction((void *)bp->instr + ppc_inst_len(instr),
  ppc_inst(bpinstr));
-- 
2.17.1



[PATCH v8 27/30] powerpc/kprobes: Don't allow breakpoints on suffixes

2020-05-05 Thread Jordan Niethe
Do not allow inserting breakpoints on the suffix of a prefix instruction
in kprobes.

Signed-off-by: Jordan Niethe 
---
v8: Add this back from v3
---
 arch/powerpc/kernel/kprobes.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 33d54b091c70..227510df8c55 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -106,7 +106,9 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, 
unsigned int offset)
 int arch_prepare_kprobe(struct kprobe *p)
 {
int ret = 0;
+   struct kprobe *prev;
struct ppc_inst insn = ppc_inst_read((struct ppc_inst *)p->addr);
+   struct ppc_inst prefix = ppc_inst_read((struct ppc_inst *)(p->addr - 
1));
 
if ((unsigned long)p->addr & 0x03) {
printk("Attempt to register kprobe at an unaligned address\n");
@@ -114,6 +116,17 @@ int arch_prepare_kprobe(struct kprobe *p)
} else if (IS_MTMSRD(insn) || IS_RFID(insn) || IS_RFI(insn)) {
printk("Cannot register a kprobe on rfi/rfid or mtmsr[d]\n");
ret = -EINVAL;
+   } else if (ppc_inst_prefixed(prefix)) {
+   printk("Cannot register a kprobe on the second word of prefixed 
instruction\n");
+   ret = -EINVAL;
+   }
+   preempt_disable();
+   prev = get_kprobe(p->addr - 1);
+   preempt_enable_no_resched();
+   if (prev &&
+   ppc_inst_prefixed(ppc_inst_read((struct ppc_inst 
*)prev->ainsn.insn))) {
+   printk("Cannot register a kprobe on the second word of prefixed 
instruction\n");
+   ret = -EINVAL;
}
 
/* insn must be on a special executable page on ppc64.  This is
-- 
2.17.1



Re: [PATCH v3 11/15] powerpc/64s: machine check interrupt update NMI accounting

2020-05-05 Thread Nicholas Piggin
Excerpts from Christophe Leroy's message of April 7, 2020 3:37 pm:
> 
> 
> Le 07/04/2020 à 07:16, Nicholas Piggin a écrit :
>> machine_check_early is taken as an NMI, so nmi_enter is used there.
>> machine_check_exception is no longer taken as an NMI (it's invoked
>> via irq_work in the case a machine check hits in kernel mode), so
>> remove the nmi_enter from that case.
> 
> Euh ... Is that also the case for PPC32 ?
> 
> AFAIK machine_check_exception() is called as an NMI on PPC32.

Sorry I missed your comment.  You're right, I'll make this change
depend on 64S. Thanks for reviewing them.

Thanks,
Nick


[PATCH v8 00/30] Initial Prefixed Instruction support

2020-05-05 Thread Jordan Niethe
A future revision of the ISA will introduce prefixed instructions. A
prefixed instruction is composed of a 4-byte prefix followed by a
4-byte suffix.

All prefixes have the major opcode 1. A prefix will never be a valid
word instruction. A suffix may be an existing word instruction or a
new instruction.

This series enables prefixed instructions and extends the instruction
emulation to support them. Then the places where prefixed instructions
might need to be emulated are updated.
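
As a quick illustration of the format (standalone sketch, not code from the
series; names are invented), a word can be classified as a prefix purely by
its major opcode:

#include <stdint.h>
#include <stdio.h>

static int is_prefix_word(uint32_t word)
{
	return (word >> 26) == 1;	/* major opcode is the top 6 bits */
}

int main(void)
{
	uint32_t nop = 0x60000000;	/* an ordinary word instruction */
	uint32_t prefix = 1u << 26;	/* a minimal prefix word */

	printf("nop is a prefix?    %d\n", is_prefix_word(nop));	/* 0 */
	printf("prefix is a prefix? %d\n", is_prefix_word(prefix));	/* 1 */
	return 0;
}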

v8 incorporates feedback from Alistair Popple and Balamuruhan Suriyakumar.
The major changes:
- Fix some style issues
- Fix __patch_instruction() on big endian
- Reintroduce v3's forbidding breakpoints on second word of prefix
  instructions for kprobes and xmon. Missed this when changing to
  using a data type.
- Use the data type in some places that were missed.

v7 fixes compilation issues for some configs reported by Alistair
Popple.

v6 is based on feedback from Balamuruhan Suriyakumar, Alistair Popple,
Christophe Leroy and Segher Boessenkool.
The major changes:
- Use the instruction type in more places that had been missed before
- Fix issues with ppc32
- Introduce new self tests for code patching and feature fixups

v5 is based on feedback from Nick Piggin, Michael Ellerman, Balamuruhan
Suriyakumar and Alistair Popple.
The major changes:
- The ppc instruction type is now a struct
- Series now just based on next
- ppc_inst_masked() dropped
- Space for xmon breakpoints allocated in an assembly file
- "Add prefixed instructions to instruction data type" patch seperated in
  to smaller patches
- Calling convention for create_branch() is changed
- Some places which had not been updated to use the data type are now 
updated

v4 is based on feedback from Nick Piggin, Christophe Leroy and Daniel Axtens.
The major changes:
- Move xmon breakpoints from data section to text section
- Introduce a data type for instructions on powerpc

v3 is based on feedback from Christophe Leroy. The major changes:
- Completely replacing store_inst() with patch_instruction() in
  xmon
- Improve implementation of mread_instr() to not use mread().
- Base the series on top of
  https://patchwork.ozlabs.org/patch/1232619/ as this will effect
  kprobes.
- Some renaming and simplification of conditionals.

v2 incorporates feedback from Daniel Axtens and Balamuruhan
S. The major changes are:
- Squashing together all commits about SRR1 bits
- Squashing all commits for supporting prefixed load stores
- Changing abbreviated references to sufx/prfx -> suffix/prefix
- Introducing macros for returning the length of an instruction
- Removing sign extension flag from pstd/pld in sstep.c
- Dropping patch  "powerpc/fault: Use analyse_instr() to check for
  store with updates to sp" from the series, it did not really fit
  with prefixed enablement in the first place and as reported by Greg
  Kurz did not work correctly.


Alistair Popple (1):
  powerpc: Enable Prefixed Instructions

Jordan Niethe (29):
  powerpc/xmon: Remove store_inst() for patch_instruction()
  powerpc/xmon: Move breakpoint instructions to own array
  powerpc/xmon: Move breakpoints to text section
  powerpc/xmon: Use bitwise calculations in_breakpoint_table()
  powerpc: Change calling convention for create_branch() et. al.
  powerpc: Use a macro for creating instructions from u32s
  powerpc: Use an accessor for instructions
  powerpc: Use a function for getting the instruction op code
  powerpc: Use a function for byte swapping instructions
  powerpc: Introduce functions for instruction equality
  powerpc: Use a datatype for instructions
  powerpc: Use a function for reading instructions
  powerpc: Add a probe_user_read_inst() function
  powerpc: Add a probe_kernel_read_inst() function
  powerpc/kprobes: Use patch_instruction()
  powerpc: Define and use __get_user_instr{,inatomic}()
  powerpc: Introduce a function for reporting instruction length
  powerpc/xmon: Use a function for reading instructions
  powerpc/xmon: Move insertion of breakpoint for xol'ing
  powerpc: Make test_translate_branch() independent of instruction
length
  powerpc: Define new SRR1 bits for a future ISA version
  powerpc: Add prefixed instructions to instruction data type
  powerpc: Test prefixed code patching
  powerpc: Test prefixed instructions in feature fixups
  powerpc/xmon: Don't allow breakpoints on suffixes
  powerpc/kprobes: Don't allow breakpoints on suffixes
  powerpc: Support prefixed instructions in alignment handler
  powerpc sstep: Add support for prefixed load/stores
  powerpc sstep: Add support for prefixed fixed-point arithmetic

 arch/powerpc/include/asm/code-patching.h  |  37 +-
 arch/powerpc/include/asm/inst.h   | 107 +
 arch/powerpc/include/asm/kprobes.h|   2 +-
 arch/powerpc/include/asm/ppc-opcode.h |   3 +
 

[PATCH v8 06/30] powerpc: Use a macro for creating instructions from u32s

2020-05-05 Thread Jordan Niethe
In preparation for instructions having a more complex data type start
using a macro, ppc_inst(), for making an instruction out of a u32.  A
macro is used so that instructions can be used as initializer elements.
Currently this does nothing, but it will allow for creating a data type
that can represent prefixed instructions.
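
A small illustration of the "initializer elements" point (sketch only; the
table below is made up): with the identity macro the result is still a
compile-time constant, which a static inline function could not provide:

#define ppc_inst(x) (x)

static unsigned int example_table[] = {
	ppc_inst(0x60000000),	/* nop, legal in a static initializer */
	ppc_inst(0x4e800020),	/* blr */
};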

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v4: New to series
v5: - Rename PPC_INST() -> ppc_inst().
- Use on epapr_paravirt.c, kgdb.c
v6: - Use in setup_32.c
- epapr_paravirt.c: early_init_dt_scan_epapr(): move the use of
  ppc_inst() earlier.
v8: - style fixes
- Use in crash_dump.c, 8xx.c, 8xx-pmu.c, mpc86xx_smp.c, smp.c
---
 arch/powerpc/include/asm/code-patching.h  |  3 +-
 arch/powerpc/include/asm/inst.h   | 11 +
 arch/powerpc/kernel/align.c   |  1 +
 arch/powerpc/kernel/crash_dump.c  |  3 +-
 arch/powerpc/kernel/epapr_paravirt.c  |  3 +-
 arch/powerpc/kernel/hw_breakpoint.c   |  3 +-
 arch/powerpc/kernel/jump_label.c  |  3 +-
 arch/powerpc/kernel/kgdb.c|  5 +-
 arch/powerpc/kernel/kprobes.c |  5 +-
 arch/powerpc/kernel/module_64.c   |  3 +-
 arch/powerpc/kernel/optprobes.c   | 32 +++--
 arch/powerpc/kernel/security.c| 12 +++--
 arch/powerpc/kernel/setup_32.c|  2 +-
 arch/powerpc/kernel/trace/ftrace.c| 25 +-
 arch/powerpc/kernel/uprobes.c |  1 +
 arch/powerpc/kvm/emulate_loadstore.c  |  2 +-
 arch/powerpc/lib/code-patching.c  | 57 ---
 arch/powerpc/lib/feature-fixups.c | 39 
 arch/powerpc/lib/test_emulate_step.c  | 39 
 arch/powerpc/mm/nohash/8xx.c  |  5 +-
 arch/powerpc/perf/8xx-pmu.c   |  9 ++--
 arch/powerpc/platforms/86xx/mpc86xx_smp.c |  3 +-
 arch/powerpc/platforms/powermac/smp.c |  3 +-
 arch/powerpc/xmon/xmon.c  |  7 +--
 24 files changed, 156 insertions(+), 120 deletions(-)
 create mode 100644 arch/powerpc/include/asm/inst.h

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index 351dda7215b6..48e021957ee5 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* Flags for create_branch:
  * "b"   == create_branch(addr, target, 0);
@@ -48,7 +49,7 @@ static inline int patch_branch_site(s32 *site, unsigned long 
target, int flags)
 static inline int modify_instruction(unsigned int *addr, unsigned int clr,
 unsigned int set)
 {
-   return patch_instruction(addr, (*addr & ~clr) | set);
+   return patch_instruction(addr, ppc_inst((*addr & ~clr) | set));
 }
 
 static inline int modify_instruction_site(s32 *site, unsigned int clr, 
unsigned int set)
diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
new file mode 100644
index ..5298ba33b6e5
--- /dev/null
+++ b/arch/powerpc/include/asm/inst.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ASM_INST_H
+#define _ASM_INST_H
+
+/*
+ * Instruction data type for POWER
+ */
+
+#define ppc_inst(x) (x)
+
+#endif /* _ASM_INST_H */
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index 92045ed64976..86e9bf62f18c 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct aligninfo {
unsigned char len;
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
index 05745ddbd229..78e556b131db 100644
--- a/arch/powerpc/kernel/crash_dump.c
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef DEBUG
 #include 
@@ -44,7 +45,7 @@ static void __init create_trampoline(unsigned long addr)
 * branch to "addr" we jump to ("addr" + 32 MB). Although it requires
 * two instructions it doesn't require any registers.
 */
-   patch_instruction(p, PPC_INST_NOP);
+   patch_instruction(p, ppc_inst(PPC_INST_NOP));
patch_branch(++p, addr + PHYSICAL_START, 0);
 }
 
diff --git a/arch/powerpc/kernel/epapr_paravirt.c 
b/arch/powerpc/kernel/epapr_paravirt.c
index 9d32158ce36f..e8eb72a65572 100644
--- a/arch/powerpc/kernel/epapr_paravirt.c
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #if !defined(CONFIG_64BIT) || defined(CONFIG_PPC_BOOK3E_64)
 extern void epapr_ev_idle(void);
@@ -36,7 +37,7 @@ static int __init early_init_dt_scan_epapr(unsigned long node,
return -1;
 
for (i = 0; i < (len / 4); i++) {
-   u32 inst = be32_to_cpu(insts[i]);
+   u32 inst = ppc_inst(be32_to_cpu(insts[i]));
patch_instruction(epapr_hypercall_start + i, inst);
 #if 

[PATCH v8 05/30] powerpc: Change calling convention for create_branch() et. al.

2020-05-05 Thread Jordan Niethe
create_branch(), create_cond_branch() and translate_branch() return the
instruction that they create, or return 0 to signal an error. Separate
these concerns in preparation for an instruction type that is not just
an unsigned int. Write the created instruction through a pointer passed as
the first parameter to the function and use a non-zero return value to
signify an error.
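
A sketch of what the new convention looks like at a call site (simplified
userspace code; create_branch_new() and its dummy body are stand-ins, not the
real kernel function):

#include <stdint.h>
#include <stdio.h>

/*
 * Stand-in with the new calling convention: the created instruction is
 * written through 'instr' and the return value only reports success.
 * The body is a dummy, not the real branch encoder.
 */
static int create_branch_new(uint32_t *instr, const uint32_t *addr,
			     unsigned long target, int flags)
{
	(void)addr; (void)target; (void)flags;
	*instr = 0x48000000;	/* dummy encoding, stands in for the real branch */
	return 0;		/* non-zero would mean "could not create branch" */
}

int main(void)
{
	uint32_t site = 0x60000000;	/* pretend code word to be patched */
	uint32_t insn;
	int err;

	/* old style was: insn = create_branch(&site, target, 0); if (!insn) ... */
	err = create_branch_new(&insn, &site, 0x1000, 0);
	if (err)
		return 1;
	site = insn;
	printf("patched site to %#x\n", site);
	return 0;
}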

Reviewed-by: Alistair Popple 
Signed-off-by: Jordan Niethe 
---
v5: New to series
v6: - setup_32.c: machine_init(): change insn to unsigned int
- Fix typo in commit message
- __ftrace_make_call(): test for err not !err
v8: Style fix
---
 arch/powerpc/include/asm/code-patching.h |  12 +-
 arch/powerpc/kernel/optprobes.c  |  24 ++--
 arch/powerpc/kernel/setup_32.c   |   4 +-
 arch/powerpc/kernel/trace/ftrace.c   |  24 ++--
 arch/powerpc/lib/code-patching.c | 134 +--
 arch/powerpc/lib/feature-fixups.c|   5 +-
 6 files changed, 119 insertions(+), 84 deletions(-)

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index 898b54262881..351dda7215b6 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -22,10 +22,10 @@
 #define BRANCH_ABSOLUTE0x2
 
 bool is_offset_in_branch_range(long offset);
-unsigned int create_branch(const unsigned int *addr,
-  unsigned long target, int flags);
-unsigned int create_cond_branch(const unsigned int *addr,
-   unsigned long target, int flags);
+int create_branch(unsigned int *instr, const unsigned int *addr,
+ unsigned long target, int flags);
+int create_cond_branch(unsigned int *instr, const unsigned int *addr,
+  unsigned long target, int flags);
 int patch_branch(unsigned int *addr, unsigned long target, int flags);
 int patch_instruction(unsigned int *addr, unsigned int instr);
 int raw_patch_instruction(unsigned int *addr, unsigned int instr);
@@ -60,8 +60,8 @@ int instr_is_relative_branch(unsigned int instr);
 int instr_is_relative_link_branch(unsigned int instr);
 int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr);
 unsigned long branch_target(const unsigned int *instr);
-unsigned int translate_branch(const unsigned int *dest,
- const unsigned int *src);
+int translate_branch(unsigned int *instr, const unsigned int *dest,
+const unsigned int *src);
 extern bool is_conditional_branch(unsigned int instr);
 #ifdef CONFIG_PPC_BOOK3E_64
 void __patch_exception(int exc, unsigned long addr);
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 024f7aad1952..445b3dad82dc 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -251,15 +251,17 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe 
*op, struct kprobe *p)
goto error;
}
 
-   branch_op_callback = create_branch((unsigned int *)buff + 
TMPL_CALL_HDLR_IDX,
-   (unsigned long)op_callback_addr,
-   BRANCH_SET_LINK);
+   rc = create_branch(&branch_op_callback,
+  (unsigned int *)buff + TMPL_CALL_HDLR_IDX,
+  (unsigned long)op_callback_addr,
+  BRANCH_SET_LINK);
 
-   branch_emulate_step = create_branch((unsigned int *)buff + 
TMPL_EMULATE_IDX,
-   (unsigned long)emulate_step_addr,
-   BRANCH_SET_LINK);
+   rc |= create_branch(&branch_emulate_step,
+   (unsigned int *)buff + TMPL_EMULATE_IDX,
+   (unsigned long)emulate_step_addr,
+   BRANCH_SET_LINK);
 
-   if (!branch_op_callback || !branch_emulate_step)
+   if (rc)
goto error;
 
patch_instruction(buff + TMPL_CALL_HDLR_IDX, branch_op_callback);
@@ -305,6 +307,7 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op)
 
 void arch_optimize_kprobes(struct list_head *oplist)
 {
+   unsigned int instr;
struct optimized_kprobe *op;
struct optimized_kprobe *tmp;
 
@@ -315,9 +318,10 @@ void arch_optimize_kprobes(struct list_head *oplist)
 */
memcpy(op->optinsn.copied_insn, op->kp.addr,
   RELATIVEJUMP_SIZE);
-   patch_instruction(op->kp.addr,
-   create_branch((unsigned int *)op->kp.addr,
- (unsigned long)op->optinsn.insn, 0));
+   create_branch(&instr,
+ (unsigned int *)op->kp.addr,
+ (unsigned long)op->optinsn.insn, 0);
+   patch_instruction(op->kp.addr, instr);
list_del_init(&op->list);
}
 }
diff --git a/arch/powerpc/kernel/setup_32.c 
