Re: [PATCH v8 1/7] perf expr: Add expr_ prefix for parse_ctx and parse_id

2020-04-06 Thread Arnaldo Carvalho de Melo
Em Thu, Apr 02, 2020 at 02:03:34AM +0530, Kajol Jain escreveu:
> From: Jiri Olsa 
> 
> Adding expr_ prefix for parse_ctx and parse_id,
> to straighten out the expr* namespace.
> 
> There's no functional change.

Next time please add your Signed-off-by: as well when pushing 3rd party
patches.
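
(For reference, the expected trailer chain when relaying a third-party patch
looks like this - the relayer's Signed-off-by goes last, after the author's:

    From: Jiri Olsa 
    ...
    Signed-off-by: Jiri Olsa 
    Signed-off-by: Kajol Jain 

Addresses are elided here as everywhere else in this archive.)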

Applied.

- Arnaldo
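
For context, the renamed API is used like this - a minimal sketch based on
the header in the diff below, with error handling elided:

	struct expr_parse_ctx ctx;
	double val;

	expr__ctx_init(&ctx);
	expr__add_id(&ctx, "FOO", 1.0);
	if (expr__parse(&val, &ctx, "FOO + 1") == 0)
		pr_debug("FOO + 1 = %f\n", val);	/* val == 2.0 */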
 
> Signed-off-by: Jiri Olsa 
> ---
>  tools/perf/tests/expr.c   |  4 ++--
>  tools/perf/util/expr.c| 10 +-
>  tools/perf/util/expr.h| 12 ++--
>  tools/perf/util/expr.y|  6 +++---
>  tools/perf/util/stat-shadow.c |  2 +-
>  5 files changed, 17 insertions(+), 17 deletions(-)
> 
> diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c
> index 28313e59d6f6..ea10fc4412c4 100644
> --- a/tools/perf/tests/expr.c
> +++ b/tools/perf/tests/expr.c
> @@ -6,7 +6,7 @@
>  #include 
>  #include 
>  
> -static int test(struct parse_ctx *ctx, const char *e, double val2)
> +static int test(struct expr_parse_ctx *ctx, const char *e, double val2)
>  {
>   double val;
>  
> @@ -22,7 +22,7 @@ int test__expr(struct test *t __maybe_unused, int subtest __maybe_unused)
>   const char **other;
>   double val;
>   int i, ret;
> - struct parse_ctx ctx;
> + struct expr_parse_ctx ctx;
>   int num_other;
>  
>   expr__ctx_init(&ctx);
> diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
> index fd192ddf93c1..c8ccc548a585 100644
> --- a/tools/perf/util/expr.c
> +++ b/tools/perf/util/expr.c
> @@ -11,7 +11,7 @@ extern int expr_debug;
>  #endif
>  
>  /* Caller must make sure id is allocated */
> -void expr__add_id(struct parse_ctx *ctx, const char *name, double val)
> +void expr__add_id(struct expr_parse_ctx *ctx, const char *name, double val)
>  {
>   int idx;
>  
> @@ -21,13 +21,13 @@ void expr__add_id(struct parse_ctx *ctx, const char *name, double val)
>   ctx->ids[idx].val = val;
>  }
>  
> -void expr__ctx_init(struct parse_ctx *ctx)
> +void expr__ctx_init(struct expr_parse_ctx *ctx)
>  {
>   ctx->num_ids = 0;
>  }
>  
>  static int
> -__expr__parse(double *val, struct parse_ctx *ctx, const char *expr,
> +__expr__parse(double *val, struct expr_parse_ctx *ctx, const char *expr,
> int start)
>  {
>   YY_BUFFER_STATE buffer;
> @@ -52,7 +52,7 @@ __expr__parse(double *val, struct parse_ctx *ctx, const char *expr,
>   return ret;
>  }
>  
> -int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr)
> +int expr__parse(double *final_val, struct expr_parse_ctx *ctx, const char *expr)
>  {
>   return __expr__parse(final_val, ctx, expr, EXPR_PARSE) ? -1 : 0;
>  }
> @@ -75,7 +75,7 @@ int expr__find_other(const char *expr, const char *one, const char ***other,
>int *num_other)
>  {
>   int err, i = 0, j = 0;
> - struct parse_ctx ctx;
> + struct expr_parse_ctx ctx;
>  
>   expr__ctx_init(&ctx);
>   err = __expr__parse(NULL, &ctx, expr, EXPR_OTHER);
> diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
> index 9377538f4097..b9e53f2b5844 100644
> --- a/tools/perf/util/expr.h
> +++ b/tools/perf/util/expr.h
> @@ -5,19 +5,19 @@
>  #define EXPR_MAX_OTHER 20
>  #define MAX_PARSE_ID EXPR_MAX_OTHER
>  
> -struct parse_id {
> +struct expr_parse_id {
>   const char *name;
>   double val;
>  };
>  
> -struct parse_ctx {
> +struct expr_parse_ctx {
>   int num_ids;
> - struct parse_id ids[MAX_PARSE_ID];
> + struct expr_parse_id ids[MAX_PARSE_ID];
>  };
>  
> -void expr__ctx_init(struct parse_ctx *ctx);
> -void expr__add_id(struct parse_ctx *ctx, const char *id, double val);
> -int expr__parse(double *final_val, struct parse_ctx *ctx, const char *expr);
> +void expr__ctx_init(struct expr_parse_ctx *ctx);
> +void expr__add_id(struct expr_parse_ctx *ctx, const char *id, double val);
> +int expr__parse(double *final_val, struct expr_parse_ctx *ctx, const char *expr);
>  int expr__find_other(const char *expr, const char *one, const char ***other,
>   int *num_other);
>  
> diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y
> index 4720cbe79357..cd17486c1c5d 100644
> --- a/tools/perf/util/expr.y
> +++ b/tools/perf/util/expr.y
> @@ -15,7 +15,7 @@
>  %define api.pure full
>  
>  %parse-param { double *final_val }
> -%parse-param { struct parse_ctx *ctx }
> +%parse-param { struct expr_parse_ctx *ctx }
>  %parse-param {void *scanner}
>  %lex-param {void* scanner}
>  
> @@ -39,14 +39,14 @@
>  
>  %{
>  static void expr_error(double *final_val __maybe_unused,
> -struct parse_ctx *ctx __maybe_unused,
> +struct expr_parse_ctx *ctx __maybe_unused,
>  void *scanner,
>  const char *s)
>  {
>   pr_debug("%s\n", s);
>  }
>  
> -static int lookup_id(struct parse_ctx *ctx, char *id, double *val)
> +static int lookup_id(struct expr_parse_ctx *ctx, char *id, double *val)
>  {
>   int i;
>  
> diff --git a/tools/perf/util/stat-shadow.c 

Re: [PATCH v5 2/2] powerpc/powernv: Add NULL check after kzalloc in opal_add_one_export

2020-04-06 Thread Markus Elfring
> A NULL check is needed here, as kzalloc may fail, returning NULL.
>
> Issue was found by coccinelle.

* Do you really try to ignore (my) specific patch review comments
  (for a moment)?
  
https://lore.kernel.org/linuxppc-dev/b7d64d4a-74dd-ee21-db7b-018070f12...@web.de/
  https://lore.kernel.org/patchwork/comment/1414845/
  https://lkml.org/lkml/2020/4/6/279

* Would you like to integrate further adjustments with a varying delay?
  (Are you waiting on nicer feedback by any software maintainers?)

Regards,
Markus
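
For readers outside the thread: the change under discussion is the standard
allocation-failure check. A minimal sketch - the enclosing function is
opal_add_one_export(), but the exact error path below is an assumption, not
a quote of the patch:

	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
	if (!attr)
		return;	/* or jump to the function's cleanup label */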


Re: [PATCH v3 1/1] powerpc/kernel: Enables memory hot-remove after reboot on pseries guests

2020-04-06 Thread Leonardo Bras
Hello Bharata,

On Fri, 2020-04-03 at 20:08 +0530, Bharata B Rao wrote:
> The patch would be more complete with the following change that ensures
> that DRCONF_MEM_HOTREMOVABLE flag is set for non-boot-time hotplugged
> memory too. This will ensure that ibm,dynamic-memory-vN property
> reflects the right flags value for memory that gets hotplugged
> post boot.
> 

You just sent that as a separate patchset, so I think it's dealt with.
Do you have any other comments on the present patch?

Best regards,


signature.asc
Description: This is a digitally signed message part


Re: [PATCH 1/2] dma-mapping: add a dma_ops_bypass flag to struct device

2020-04-06 Thread Alexey Kardashevskiy



On 06/04/2020 21:50, Christoph Hellwig wrote:
> On Fri, Apr 03, 2020 at 07:38:11PM +1100, Alexey Kardashevskiy wrote:
>>
>>
>> On 26/03/2020 12:26, Alexey Kardashevskiy wrote:
>>>
>>>
>>> On 25/03/2020 19:37, Christoph Hellwig wrote:
 On Wed, Mar 25, 2020 at 03:51:36PM +1100, Alexey Kardashevskiy wrote:
>>> This is for persistent memory which you can DMA to/from but yet it does
>>> not appear in the system as a normal memory and therefore requires
>>> special handling anyway (O_DIRECT or DAX, I do not know the exact
>>> mechanics). All other devices in the system should just run as usual,
>>> i.e. use 1:1 mapping if possible.
>>
>> On other systems (x86 and arm), pmem, as long as it is page backed, does
>> not require any special handling.  This must be some weird way powerpc
>> fucked up again, and I suspect you'll have to suffer from it.
>
>
> It does not matter if it is backed by pages or not, the problem may also
> appear if we wanted for example p2p PCI via IOMMU (between PHBs) and
> MMIO might be mapped way too high in the system address space and make
> 1:1 impossible.

 How can it be mapped too high for a direct mapping with a 64-bit DMA
 mask?
>>>
>>> The window size is limited and often it is not even sparse. It requires
>>> an 8 byte entry per IOMMU page (which is most commonly 64K max), so a 1TB
>>> limit (a guest RAM size) is quite a real thing: covering 1TB with 64K
>>> pages takes 16M TCEs, i.e. a 128MB table. MMIO is mapped to guest
>>> physical address space outside of this 1TB (on PPC).
>>>
>>>
>>
>> I am trying now this approach on top of yours "dma-bypass.3" (it is
>> "wip", needs an upper boundary check):
>>
>> https://github.com/aik/linux/commit/49d73c7771e3f6054804f6cfa80b4e320111662d
>>
>> Do you see any serious problem with this approach? Thanks!
> 
> Do you have a link to the whole branch?  The github UI is unfortunately
> unusable for that (or I'm missing something).

The UI shows the branch, but since I rebased and force-pushed it, it does
not anymore. Here is the current one:

https://github.com/aik/linux/commits/dma-bypass.3


Thanks,


-- 
Alexey


Re: [PATCH 1/6] powerpc/spufs: simplify spufs core dumping

2020-04-06 Thread Arnd Bergmann
On Mon, Apr 6, 2020 at 2:03 PM Christoph Hellwig  wrote:
>
> Replace the coredump ->read method with a ->dump method that must call
> dump_emit itself.  That way we avoid a buffer allocation and messing with
> set_fs() to call into code that is intended to deal with user buffers.
> For the ->get case we can now use a small on-stack buffer and avoid
> memory allocations as well.

I had no memory of this code at all, but your change looks fine to me.
Amazingly, you even managed to make it smaller and more readable.

Reviewed-by: Arnd Bergmann 


Re: [PATCH v5 1/2] powerpc/powernv: Remove two unnecessary variable initialisations in opal_add_one_export()

2020-04-06 Thread Markus Elfring
> And we can remove the redundant assignments to attr and name.

What do you think about a wording like the following?

   Two local variables will eventually be set to appropriate pointers
   a bit later. Thus omit their explicit initialisation at the beginning.


Regards,
Markus


Re: [PATCH v5 1/2] powerpc/powernv: Return directly after a failed of_property_read_u64_array() in opal_add_one_export()

2020-04-06 Thread Markus Elfring
> We don't need to go to the labal of out …

Please avoid a typo for this change description.


> fails, as there is nothing to do. Just return.

I suggest reconsidering this wording as well:

  Return directly after a call of the function “of_property_read_u64_array”
  failed at the beginning.


> And we can …

Do such words indicate that even this patch approach should be split
into further update steps?

Regards,
Markus


[PATCH v2] powerpc/vio: drop bus_type from parent device

2020-04-06 Thread Thadeu Lima de Souza Cascardo
Commit df44b479654f62b478c18ee4d8bc4e9f897a9844 ("kobject: return error
code if writing /sys/.../uevent fails") started returning failure when
writing to /sys/devices/vio/uevent.

This causes an early udevadm trigger to fail. On some installer versions of
Ubuntu, this will cause init to exit, thus panicking the system very early
during boot.

Removing the bus_type from the parent device will remove some of the extra
empty files from /sys/devices/vio/, but will keep the rest of the layout
for vio devices, keeping them under /sys/devices/vio/.

It has been verified that uevents for vio devices don't change after this
fix; they still contain MODALIAS.

Signed-off-by: Thadeu Lima de Souza Cascardo 
Fixes: df44b479654f ("kobject: return error code if writing /sys/.../uevent fails")
---
 arch/powerpc/platforms/pseries/vio.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 37f1f25ba804..a94dab3972a0 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -36,7 +36,6 @@ static struct vio_dev vio_bus_device  = { /* fake "parent" device */
.name = "vio",
.type = "",
.dev.init_name = "vio",
-   .dev.bus = &vio_bus_type,
 };
 
 #ifdef CONFIG_PPC_SMLPAR
-- 
2.20.1



Re: [PATCH] powerpc/mce: Add MCE notification chain

2020-04-06 Thread Mahesh J Salgaonkar
On 2020-04-06 12:17:22 Mon, Nicholas Piggin wrote:
> Ganesh's on April 4, 2020 11:05 pm:
> > On 4/3/20 7:38 AM, Nicholas Piggin wrote:
> > 
> >> Ganesh Goudar's on March 30, 2020 5:12 pm:
> >>> From: Santosh S 
> >>>
> >>> Introduce a notification chain which lets users know about uncorrected
> >>> memory errors (UE). This would help prospective users in the pmem or
> >>> nvdimm subsystems track bad blocks for better handling of persistent
> >>> memory allocations.
> >>>
> >>> Signed-off-by: Santosh S 
> >>> Signed-off-by: Ganesh Goudar 
> >> Do you have any such users yet? It would be good to refer to an example
> >> user and give a brief description of what it does in its notifier.
> > 
> > Santosh has sent a patch which uses this notification.
> > https://patchwork.ozlabs.org/patch/1265062/
> 
> Okay. So these things are asynchronous after the machine check. I guess
> that's the design of it and memory offlining does something similar by
> the looks, but how do you prevent the memory being allocated for 
> something else before the notifiers run?

We can't. This race even exists today when we call memory_failure(). If
the same memory is allocated again, then we may hit another MCE on the same
address when it is touched, until the subsystem that has registered for
notification has completed handling the notified address.

Thanks,
-Mahesh.

> 
> >>> @@ -263,6 +277,7 @@ static void machine_process_ue_event(struct work_struct *work)
> >>>   while (__this_cpu_read(mce_ue_count) > 0) {
> >>>   index = __this_cpu_read(mce_ue_count) - 1;
> >>>   evt = this_cpu_ptr(&mce_ue_event_queue[index]);
> >>> + blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
> >> Can we really use a blocking notifier here? I'm not sure that we can.
> > 
> > I think we can, do you see any problem?
> 
> No it looks okay after better look, sorry for the noise.
> 
> Thanks,
> Nick

-- 
Mahesh J Salgaonkar
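
For context on the API under discussion, a consumer of the proposed chain
would look roughly like the sketch below. The handler and its bookkeeping
are hypothetical (Santosh's real user is in the patch linked above); only
the mce_notifier_list head and the blocking-notifier calls come from the
patch and the core notifier API:

	/* Hypothetical consumer, e.g. a pmem driver tracking bad blocks. */
	static int pmem_mce_notify(struct notifier_block *nb,
				   unsigned long action, void *data)
	{
		struct machine_check_event *evt = data;

		/* record evt's failing address as a bad block here */
		return NOTIFY_OK;
	}

	static struct notifier_block pmem_mce_nb = {
		.notifier_call = pmem_mce_notify,
	};

	static int __init pmem_mce_init(void)
	{
		return blocking_notifier_chain_register(&mce_notifier_list,
							&pmem_mce_nb);
	}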



[RFC PATCH v3 02/15] powerpc/radix: Make kuap_check_amr() and kuap_restore_amr() generic

2020-04-06 Thread Christophe Leroy
In preparation for porting powerpc32 to C syscall entry/exit,
rename kuap_check_amr() and kuap_restore_amr() to kuap_check()
and kuap_restore(), and move the stubs for when CONFIG_PPC_KUAP is
not selected into the generic asm/kup.h.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/64/kup-radix.h | 12 ++--
 arch/powerpc/include/asm/kup.h |  2 ++
 arch/powerpc/kernel/syscall_64.c   | 10 +-
 3 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h
index 3bcef989a35d..1f2716a0dcd8 100644
--- a/arch/powerpc/include/asm/book3s/64/kup-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h
@@ -60,13 +60,13 @@
 #include 
 #include 
 
-static inline void kuap_restore_amr(struct pt_regs *regs)
+static inline void kuap_restore(struct pt_regs *regs)
 {
if (mmu_has_feature(MMU_FTR_RADIX_KUAP))
mtspr(SPRN_AMR, regs->kuap);
 }
 
-static inline void kuap_check_amr(void)
+static inline void kuap_check(void)
 {
if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_RADIX_KUAP))
WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED);
@@ -141,14 +141,6 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
(regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)),
"Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
 }
-#else /* CONFIG_PPC_KUAP */
-static inline void kuap_restore_amr(struct pt_regs *regs)
-{
-}
-
-static inline void kuap_check_amr(void)
-{
-}
 #endif /* CONFIG_PPC_KUAP */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index 92bcd1a26d73..1100c13b6d9e 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -62,6 +62,8 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
 {
return false;
 }
+static inline void kuap_restore(struct pt_regs *regs) { }
+static inline void kuap_check(void) { }
 #endif /* CONFIG_PPC_KUAP */
 
static inline void allow_read_from_user(const void __user *from, unsigned long size)
diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
index f021db893ec2..4c46f3aefaf8 100644
--- a/arch/powerpc/kernel/syscall_64.c
+++ b/arch/powerpc/kernel/syscall_64.c
@@ -2,7 +2,7 @@
 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -48,7 +48,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
}
 #endif
 
-   kuap_check_amr();
+   kuap_check();
 
/*
 * This is not required for the syscall exit path, but makes the
@@ -220,7 +220,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
local_paca->tm_scratch = regs->msr;
 #endif
 
-   kuap_check_amr();
+   kuap_check();
 
account_cpu_user_exit();
 
@@ -300,7 +300,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned
local_paca->tm_scratch = regs->msr;
 #endif
 
-   kuap_check_amr();
+   kuap_check();
 
account_cpu_user_exit();
 
@@ -370,7 +370,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
 * We don't need to restore AMR on the way back to userspace for KUAP.
 * The value of AMR only matters while we're in the kernel.
 */
-   kuap_restore_amr(regs);
+   kuap_restore(regs);
 
return ret;
 }
-- 
2.25.0



[RFC PATCH v3 08/15] powerpc/syscall: Rename syscall_64.c into syscall.c

2020-04-06 Thread Christophe Leroy
syscall_64.c will be reused almost as-is for PPC32.

Rename it to syscall.c.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/Makefile| 2 +-
 arch/powerpc/kernel/{syscall_64.c => syscall.c} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename arch/powerpc/kernel/{syscall_64.c => syscall.c} (100%)

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 570660efbb3d..8cc3c831dccd 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -49,7 +49,7 @@ obj-y := cputable.o syscalls.o \
 obj-y  += ptrace/
 obj-$(CONFIG_PPC64)+= setup_64.o sys_ppc32.o signal_64.o \
   paca.o nvram_64.o firmware.o note.o \
-  syscall_64.o
+  syscall.o
 obj-$(CONFIG_VDSO32)   += vdso32/
 obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)   += hw_breakpoint.o
diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall.c
similarity index 100%
rename from arch/powerpc/kernel/syscall_64.c
rename to arch/powerpc/kernel/syscall.c
-- 
2.25.0



[RFC PATCH v3 13/15] powerpc/syscall: system call implement entry/exit logic in C for PPC32

2020-04-06 Thread Christophe Leroy
This is a port of the PPC64 C syscall entry/exit logic to PPC32.

Performance-wise:
Before : 311 cycles on null_syscall
After  : 353 cycles on null_syscall

Note: before the patch, when also saving NVGPRS all the time,
null_syscall takes 335 cycles.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/entry_32.S | 259 -
 arch/powerpc/kernel/head_32.h  |  16 +-
 2 files changed, 29 insertions(+), 246 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index a6371fb8f761..103f5158bc44 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -315,162 +315,37 @@ stack_ovf:
RFI
 #endif
 
-#ifdef CONFIG_TRACE_IRQFLAGS
-trace_syscall_entry_irq_off:
-   /*
-* Syscall shouldn't happen while interrupts are disabled,
-* so let's do a warning here.
-*/
-0: trap
-   EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
-   bl  trace_hardirqs_on
-
-   /* Now enable for real */
-   LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE)
-   mtmsr   r10
-
-   REST_GPR(0, r1)
-   REST_4GPRS(3, r1)
-   REST_2GPRS(7, r1)
-   b   DoSyscall
-#endif /* CONFIG_TRACE_IRQFLAGS */
-
.globl  transfer_to_syscall
 transfer_to_syscall:
-#ifdef CONFIG_TRACE_IRQFLAGS
-   andi.   r12,r9,MSR_EE
-   beq-trace_syscall_entry_irq_off
-#endif /* CONFIG_TRACE_IRQFLAGS */
-
-/*
- * Handle a system call.
- */
-   .stabs  "arch/powerpc/kernel/",N_SO,0,0,0f
-   .stabs  "entry_32.S",N_SO,0,0,0f
-0:
-
-_GLOBAL(DoSyscall)
-   stw r3,ORIG_GPR3(r1)
-   li  r12,0
-   stw r12,RESULT(r1)
-#ifdef CONFIG_TRACE_IRQFLAGS
-   /* Make sure interrupts are enabled */
-   mfmsr   r11
-   andi.   r12,r11,MSR_EE
-   /* We came in with interrupts disabled, we WARN and mark them enabled
-* for lockdep now */
-0: tweqi   r12, 0
-   EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
-#endif /* CONFIG_TRACE_IRQFLAGS */
-   lwz r11,TI_FLAGS(r2)
-   andi.   r11,r11,_TIF_SYSCALL_DOTRACE
-   bne-syscall_dotrace
-syscall_dotrace_cont:
-   cmplwi  0,r0,NR_syscalls
-   lis r10,sys_call_table@h
-   ori r10,r10,sys_call_table@l
-   slwir0,r0,2
-   bge-66f
-
-   barrier_nospec_asm
-   /*
-* Prevent the load of the handler below (based on the user-passed
-* system call number) being speculatively executed until the test
-* against NR_syscalls and branch to .66f above has
-* committed.
-*/
-
-   lwzxr10,r10,r0  /* Fetch system call handler [ptr] */
-   mtlrr10
-   addir9,r1,STACK_FRAME_OVERHEAD
-   PPC440EP_ERR42
-   blrl/* Call handler */
-   .globl  ret_from_syscall
+   mr  r9, r0
+   addir10, r1, STACK_FRAME_OVERHEAD
+   bl  system_call_exception
 ret_from_syscall:
-#ifdef CONFIG_DEBUG_RSEQ
-   /* Check whether the syscall is issued inside a restartable sequence */
-   stw r3,GPR3(r1)
-   addir3,r1,STACK_FRAME_OVERHEAD
-   bl  rseq_syscall
-   lwz r3,GPR3(r1)
-#endif
-   mr  r6,r3
-   /* disable interrupts so current_thread_info()->flags can't change */
-   LOAD_REG_IMMEDIATE(r10,MSR_KERNEL)  /* doesn't include MSR_EE */
-   /* Note: We don't bother telling lockdep about it */
-   SYNC
-   mtmsr   r10
-   lwz r9,TI_FLAGS(r2)
-   li  r8,-MAX_ERRNO
-   andi.   
r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
-   bne-syscall_exit_work
-   cmplw   0,r3,r8
-   blt+syscall_exit_cont
-   lwz r11,_CCR(r1)/* Load CR */
-   neg r3,r3
-   orisr11,r11,0x1000  /* Set SO bit in CR */
-   stw r11,_CCR(r1)
-syscall_exit_cont:
-   lwz r8,_MSR(r1)
-#ifdef CONFIG_TRACE_IRQFLAGS
-   /* If we are going to return from the syscall with interrupts
-* off, we trace that here. It shouldn't normally happen.
-*/
-   andi.   r10,r8,MSR_EE
-   bne+1f
-   stw r3,GPR3(r1)
-   bl  trace_hardirqs_off
-   lwz r3,GPR3(r1)
-1:
-#endif /* CONFIG_TRACE_IRQFLAGS */
-#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
-   /* If the process has its own DBCR0 value, load it up.  The internal
-  debug mode bit tells us that dbcr0 should be loaded. */
-   lwz r0,THREAD+THREAD_DBCR0(r2)
-   andis.  r10,r0,DBCR0_IDM@h
-   bnel-   load_dbcr0
-#endif
-#ifdef CONFIG_44x
-BEGIN_MMU_FTR_SECTION
-   lis r4,icache_44x_need_flush@ha
-   lwz r5,icache_44x_need_flush@l(r4)
-   cmplwi  cr0,r5,0
-   bne-2f
-1:
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_47x)
-#endif /* CONFIG_44x */
-BEGIN_FTR_SECTION
-   lwarx   r7,0,r1
-END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
-   stwcx.  

Re: [PATCH 1/2] dma-mapping: add a dma_ops_bypass flag to struct device

2020-04-06 Thread Christoph Hellwig
On Mon, Apr 06, 2020 at 11:25:09PM +1000, Alexey Kardashevskiy wrote:
> >> Do you see any serious problem with this approach? Thanks!
> > 
> > Do you have a link to the whole branch?  The github UI is unfortunately
> > unusable for that (or I'm missing something).
> 
> The UI shows the branch but since I rebased and forcepushed it, it does
> not. Here is the current one with:
> 
> https://github.com/aik/linux/commits/dma-bypass.3

Ok, so we use the core bypass without persistent memory, and then
have another bypass mode on top.  Not great, but I can't think
of anything better.  Note that your checks for the map_sg case
aren't very efficient - for one, it would make sense to calculate
the limit only once, but it would also make sense to reuse the
calculated direct mapping addresses instead of doing another pass
later on in the dma-direct code.


[RFC PATCH v3 15/15] powerpc/kernel: Do not unconditionally save non-volatile registers on system call

2020-04-06 Thread Christophe Leroy
To allow that, syscall_exit_prepare() gets split into three parts.
On PPC32, the three parts are called from entry_32.S.
On PPC64, we keep a syscall_exit_prepare() function which
concatenates the three parts.

One benefit is also that the likely parts of
syscall_exit_prepare_begin() and syscall_exit_prepare_end() are
frameless, whereas there was no way to make the likely part of
syscall_exit_prepare() frameless.

Before : 347 cycles on null_syscall
After  : 307 cycles on null_syscall, i.e. better than before the C porting.
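
To make the split concrete, the PPC64 wrapper presumably composes the three
parts along these lines - an illustrative sketch inferred from the commit
message and the diff below; the exact loop condition and flag handling are
assumptions, not the patch's code:

	notrace unsigned long
	syscall_exit_prepare(unsigned long r3, struct pt_regs *regs)
	{
		unsigned long *ti_flagsp = &current_thread_info()->flags;
		unsigned long ti_flags = READ_ONCE(*ti_flagsp);
		unsigned long ret;

		ret = syscall_exit_prepare_begin(r3, regs, ti_flags);
		while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
			ret = syscall_exit_prepare_loop(ret, regs, ti_flags);
			ti_flags = READ_ONCE(*ti_flagsp);
		}
		return syscall_exit_prepare_end(ret, regs, ti_flags);
	}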

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/asm-prototypes.h | 11 +++
 arch/powerpc/kernel/entry_32.S| 25 ++-
 arch/powerpc/kernel/head_32.h |  3 +-
 arch/powerpc/kernel/syscall.c | 83 +++
 4 files changed, 92 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index 7d81e86a1e5d..eea5133733bb 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -98,6 +98,17 @@ unsigned long __init early_init(unsigned long dt_ptr);
 void __init machine_init(u64 dt_ptr);
 #endif
long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, unsigned long r0, struct pt_regs *regs);
+#ifdef CONFIG_PPC64
+#define static64 static
+#else
+#define static64
+#endif
+static64 notrace unsigned long
+syscall_exit_prepare_begin(unsigned long r3, struct pt_regs *regs, unsigned long ti_flags);
+static64 notrace unsigned long
+syscall_exit_prepare_loop(unsigned long ret, struct pt_regs *regs, unsigned long ti_flags);
+static64 notrace unsigned long
+syscall_exit_prepare_end(unsigned long ret, struct pt_regs *regs, unsigned long ti_flags);
notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs);
notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr);
notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr);
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 103f5158bc44..b9287fd0fcc6 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -315,14 +315,37 @@ stack_ovf:
RFI
 #endif
 
+save_nvgprs:
+   lwz r11, _TRAP(r1)
+   andi.   r12, r11, 1
+   rlwinm  r11, r11, 0, ~1
+   beqlr
+   SAVE_NVGPRS(r1)
+   stw r11, _TRAP(r1)
+   blr
+
.globl  transfer_to_syscall
 transfer_to_syscall:
+   lwz r10, TI_FLAGS(r2)
mr  r9, r0
+   andi.   r10, r10, _TIF_SYSCALL_DOTRACE
addir10, r1, STACK_FRAME_OVERHEAD
+   bnel-   save_nvgprs
bl  system_call_exception
 ret_from_syscall:
+   lwz r5, TI_FLAGS(r2)
addir4, r1, STACK_FRAME_OVERHEAD
-   bl  syscall_exit_prepare
+   andi.   r0, r5, _TIF_SYSCALL_DOTRACE | _TIF_SINGLESTEP | 
_TIF_USER_WORK_MASK
+   bnel-   save_nvgprs
+   bl  syscall_exit_prepare_begin
+1: lwz r5, TI_FLAGS(r2)
+   addir4, r1, STACK_FRAME_OVERHEAD
+   andi.   r0, r5, _TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM
+   beq+1f
+   bl  save_nvgprs
+   bl  syscall_exit_prepare_loop
+   b   1b
+1: bl  syscall_exit_prepare_end
lwz r2, _CCR(r1)
lwz r4, _NIP(r1)
lwz r5, _MSR(r1)
diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index c301d666a3e5..1cc9a67cb42c 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -174,13 +174,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
stw r2,GPR2(r11)
addir10,r10,STACK_FRAME_REGS_MARKER@l
stw r9,_MSR(r11)
-   li  r2, \trapno
+   li  r2, \trapno + 1
stw r10,8(r11)
stw r2,_TRAP(r11)
SAVE_GPR(0, r11)
SAVE_4GPRS(3, r11)
SAVE_2GPRS(7, r11)
-   SAVE_NVGPRS(r11)
addir11,r1,STACK_FRAME_OVERHEAD
addir2,r12,-THREAD
stw r11,PT_REGS(r12)
diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index af449a4a8e8f..b15f19c00ccb 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -37,7 +37,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
if (!IS_ENABLED(CONFIG_PPC_BOOK3E))
BUG_ON(!(regs->msr & MSR_RI));
BUG_ON(IS_ENABLED(CONFIG_PPC64) && !(regs->msr & MSR_PR));
-   BUG_ON(!FULL_REGS(regs));
+   BUG_ON(IS_ENABLED(CONFIG_PPC64) && !FULL_REGS(regs));
BUG_ON(IS_ENABLED(CONFIG_PPC64) && get_softe(regs) != IRQS_ENABLED);
 
account_cpu_user_entry();
@@ -145,11 +145,9 @@ static notrace inline bool prep_irq_for_enabled_exit(void)
  * The function graph tracer can not trace the return side of this function,
  * because RI=0 and soft mask state is "unreconciled", so it is marked 

[RFC PATCH v3 11/15] powerpc/syscall: Save r3 in regs->orig_r3

2020-04-06 Thread Christophe Leroy
Save r3 in regs->orig_r3 in system_call_exception()

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/entry_64.S | 1 -
 arch/powerpc/kernel/syscall.c  | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 63f0a4414618..5ccb65f75712 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -114,7 +114,6 @@ END_BTB_FLUSH_SECTION
std r10,_LINK(r1)
std r11,_TRAP(r1)
std r12,_CCR(r1)
-   std r3,ORIG_GPR3(r1)
addir10,r1,STACK_FRAME_OVERHEAD
ld  r11,exception_marker@toc(r2)
std r11,-16(r10)/* "regshere" marker */
diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index 0ad4250d2ce8..dfd7b28239b8 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -27,6 +27,8 @@ notrace long system_call_exception(long r3, long r4, long r5,
unsigned long ti_flags;
syscall_fn f;
 
+   regs->orig_gpr3 = r3;
+
if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);
 
-- 
2.25.0



Re: [PATCH v3 1/1] ppc/crash: Reset spinlocks during crash

2020-04-06 Thread Leonardo Bras
On Thu, 2020-04-02 at 22:28 +1100, Michael Ellerman wrote:
> Leonardo Bras  
> TBH I think we could just drop that printk() entirely.
> 
> Or we could tell printk() that we're in NMI context so that it uses the
> percpu buffers.
> 
> We should probably do the latter anyway, in case there's any other code
> we call that inadvertently calls printk().

Done:
http://patchwork.ozlabs.org/patch/1266956/

About the rtas-call, I think it will take more time, because I have to
study it properly.

Thank you,


signature.asc
Description: This is a digitally signed message part


[RFC PATCH v3 04/15] powerpc/8xx: Create C version of kuap_restore() and kuap_check()

2020-04-06 Thread Christophe Leroy
In preparation for porting PPC32 to C syscall entry/exit,
create C versions of kuap_restore() and kuap_check() on the 8xx.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/32/kup-8xx.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
index 85ed2390fb99..1918d2e55da3 100644
--- a/arch/powerpc/include/asm/nohash/32/kup-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h
@@ -34,6 +34,19 @@
 
 #include 
 
+static inline void kuap_restore(struct pt_regs *regs)
+{
+   mtspr(SPRN_MD_AP, regs->kuap);
+}
+
+static inline void kuap_check(void)
+{
+   if (!IS_ENABLED(CONFIG_PPC_KUAP_DEBUG))
+   return;
+
+   WARN_ON_ONCE((mfspr(SPRN_MD_AP) & 0xffff0000) != (MD_APG_KUAP & 0xffff0000));
+}
+
 static inline void allow_user_access(void __user *to, const void __user *from,
 unsigned long size, unsigned long dir)
 {
-- 
2.25.0



[RFC PATCH v3 03/15] powerpc/32s: Create C version of kuap_restore() and kuap_check()

2020-04-06 Thread Christophe Leroy
In preparation for porting PPC32 to C syscall entry/exit,
create C versions of kuap_restore() and kuap_check() on book3s/32.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/32/kup.h | 21 +
 1 file changed, 21 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h
index 3c0ba22dc360..c85bc5b56366 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -102,6 +102,27 @@ static inline void kuap_update_sr(u32 sr, u32 addr, u32 end)
isync();/* Context sync required after mtsrin() */
 }
 
+static inline void kuap_restore(struct pt_regs *regs)
+{
+   u32 kuap = current->thread.kuap;
+   u32 addr = kuap & 0xf0000000;
+   u32 end = kuap << 28;
+
+   if (unlikely(!kuap))
+   return;
+
+   current->thread.kuap = 0;
+   kuap_update_sr(mfsrin(addr) & ~SR_KS, addr, end);   /* Clear Ks */
+}
+
+static inline void kuap_check(void)
+{
+   if (!IS_ENABLED(CONFIG_PPC_KUAP_DEBUG))
+   return;
+
+   WARN_ON_ONCE(current->thread.kuap != 0);
+}
+
+static __always_inline void allow_user_access(void __user *to, const void __user *from,
  u32 size, unsigned long dir)
 {
-- 
2.25.0
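
For readers unfamiliar with the 32s KUAP packing assumed by kuap_restore()
above: thread.kuap stores both segment bounds in one word, the start segment
in the top nibble and the (exclusive) end segment in the bottom nibble. A
worked example with an illustrative value:

	/* current->thread.kuap = 0xb000000c   (illustrative value)          */
	/* addr = kuap & 0xf0000000  -> 0xb0000000 (first unlocked segment)  */
	/* end  = kuap << 28         -> 0xc0000000 (first segment past it)   */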



[RFC PATCH v3 09/15] powerpc/syscall: Make syscall_64.c buildable on PPC32

2020-04-06 Thread Christophe Leroy
ifdef out specific PPC64 stuff to allow building
syscall_64.c on PPC32.

Modify Makefile to always build syscall.o

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/Makefile  | 5 ++---
 arch/powerpc/kernel/syscall.c | 9 +
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 8cc3c831dccd..e4be425b7718 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -45,11 +45,10 @@ obj-y   := cputable.o 
syscalls.o \
   signal.o sysfs.o cacheinfo.o time.o \
   prom.o traps.o setup-common.o \
   udbg.o misc.o io.o misc_$(BITS).o \
-  of_platform.o prom_parse.o
+  of_platform.o prom_parse.o syscall.o
 obj-y  += ptrace/
 obj-$(CONFIG_PPC64)+= setup_64.o sys_ppc32.o signal_64.o \
-  paca.o nvram_64.o firmware.o note.o \
-  syscall.o
+  paca.o nvram_64.o firmware.o note.o
 obj-$(CONFIG_VDSO32)   += vdso32/
 obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)   += hw_breakpoint.o
diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index 4c46f3aefaf8..98c98ce12f7d 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -34,7 +34,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
BUG_ON(!(regs->msr & MSR_RI));
BUG_ON(!(regs->msr & MSR_PR));
BUG_ON(!FULL_REGS(regs));
-   BUG_ON(regs->softe != IRQS_ENABLED);
+   BUG_ON(IS_ENABLED(CONFIG_PPC64) && get_softe(regs) != IRQS_ENABLED);
 
account_cpu_user_entry();
 
@@ -56,7 +56,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
 * frame, or if the unwinder was taught the first stack frame always
 * returns to user with IRQS_ENABLED, this store could be avoided!
 */
-   regs->softe = IRQS_ENABLED;
+   set_softe(regs, IRQS_ENABLED);
 
local_irq_enable();
 
@@ -114,6 +114,7 @@ static notrace inline bool prep_irq_for_enabled_exit(void)
 
/* This pattern matches prep_irq_for_idle */
__hard_EE_RI_disable();
+#ifdef CONFIG_PPC64
if (unlikely(lazy_irq_pending())) {
/* Took an interrupt, may have more exit work to do. */
__hard_RI_enable();
@@ -124,7 +125,7 @@ static notrace inline bool prep_irq_for_enabled_exit(void)
}
local_paca->irq_happened = 0;
irq_soft_mask_set(IRQS_ENABLED);
-
+#endif
return true;
 }
 
@@ -227,7 +228,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
return ret;
 }
 
-#ifdef CONFIG_PPC_BOOK3S /* BOOK3E not yet using this */
+#ifdef CONFIG_PPC_BOOK3S_64 /* BOOK3E not yet using this */
 notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, 
unsigned long msr)
 {
 #ifdef CONFIG_PPC_BOOK3E
-- 
2.25.0



[RFC PATCH v3 12/15] powerpc/syscall: Selectively check MSR_RI and MSR_PR on syscall entry

2020-04-06 Thread Christophe Leroy
In system_call_exception(), MSR_RI also needs to be checked on the 8xx.
Only book3e doesn't have MSR_RI.

On PPC32, MSR_PR is checked in real mode to avoid clobbering the
stack, so no need to check and panic in system_call_exception().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/syscall.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index dfd7b28239b8..f9fca9985b0f 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -34,9 +34,9 @@ notrace long system_call_exception(long r3, long r4, long r5,
 
trace_hardirqs_off(); /* finish reconciling */
 
-   if (IS_ENABLED(CONFIG_PPC_BOOK3S))
+   if (!IS_ENABLED(CONFIG_PPC_BOOK3E))
BUG_ON(!(regs->msr & MSR_RI));
-   BUG_ON(!(regs->msr & MSR_PR));
+   BUG_ON(IS_ENABLED(CONFIG_PPC64) && !(regs->msr & MSR_PR));
BUG_ON(!FULL_REGS(regs));
BUG_ON(IS_ENABLED(CONFIG_PPC64) && get_softe(regs) != IRQS_ENABLED);
 
-- 
2.25.0



Re: [RFC PATCH v2 12/13] powerpc/kernel: Do not inconditionally save non volatile registers on system call

2020-04-06 Thread Christophe Leroy




Le 06/04/2020 à 03:25, Nicholas Piggin a écrit :

Christophe Leroy's on April 6, 2020 3:44 am:

Before : 347 cycles on null_syscall
After  : 327 cycles on null_syscall


The problem I had doing this is that signal delivery wants full regs,
and you don't know if you have a signal pending ahead of time if you
have interrupts enabled.

I began to try bailing out back to asm to save nvgprs and call again.
I think that can be made to work, but it is more complication in asm,
and I soon found that 64s CPUs don't care about NVGPRs too much so it's
nice to get rid of the !fullregs state.


I tried a new way in v3, please have a look. I split
syscall_exit_prepare() into three parts, and the result is unexpected: it
is better than before the series (307 cycles now versus 311 cycles with
full ASM syscall entry/exit).




Possibly another approach would be to leave interrupts disabled for the
case where you have no work to do. You could create a small
syscall_exit_prepare_nowork fastpath for that case for 32-bit, perhaps?

Thanks,
Nick



Christophe


Re: [RFC PATCH v2 11/13] powerpc/syscall: Avoid stack frame in likely part of system_call_exception()

2020-04-06 Thread Christophe Leroy




Le 06/04/2020 à 03:29, Nicholas Piggin a écrit :

Christophe Leroy's on April 6, 2020 3:44 am:

When r3 is not modified, reload it from regs->orig_r3 to free up
volatile registers. This avoids a stack frame for the likely part
of system_call_exception().

Before : 353 cycles on null_syscall
After  : 347 cycles on null_syscall



[...]



Signed-off-by: Christophe Leroy 
---
  arch/powerpc/kernel/syscall.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index 69d75fc4a5eb..630c423e089a 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -91,6 +91,8 @@ notrace long system_call_exception(long r3, long r4, long r5,
  
  	} else if (unlikely(r0 >= NR_syscalls)) {

return -ENOSYS;
+   } else {
+   r3 = regs->orig_gpr3;
}


So this just gives enough volatiles to avoid spilling to the stack? I wonder
about various other options here and whether they would cause a spill anyway.

Interesting optimisation, it would definitely need a comment. Would be
nice if we had a way to tell the compiler that a local can be reloaded
from a particular address.


Ok, comment added.

Christophe


[RFC PATCH v3 01/15] powerpc/syscall: Refactorise from Nick

2020-04-06 Thread Christophe Leroy
From: Nicholas Piggin 

Christophe Leroy's on April 6, 2020 3:44 am:
> ifdef out specific PPC64 stuff to allow building
> syscall_64.c on PPC32.
>
> Modify Makefile to always build syscall.o
>
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/kernel/Makefile  |  5 ++---
>  arch/powerpc/kernel/syscall.c | 10 ++
>  2 files changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index 8cc3c831dccd..e4be425b7718 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -45,11 +45,10 @@ obj-y := cputable.o 
> syscalls.o \
>  signal.o sysfs.o cacheinfo.o time.o \
>  prom.o traps.o setup-common.o \
>  udbg.o misc.o io.o misc_$(BITS).o \
> -of_platform.o prom_parse.o
> +of_platform.o prom_parse.o syscall.o
>  obj-y+= ptrace/
>  obj-$(CONFIG_PPC64)  += setup_64.o sys_ppc32.o signal_64.o \
> -paca.o nvram_64.o firmware.o note.o \
> -syscall.o
> +paca.o nvram_64.o firmware.o note.o
>  obj-$(CONFIG_VDSO32) += vdso32/
>  obj-$(CONFIG_PPC_WATCHDOG)   += watchdog.o
>  obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
> diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
> index 72f3d2f0a823..28bd43db8755 100644
> --- a/arch/powerpc/kernel/syscall.c
> +++ b/arch/powerpc/kernel/syscall.c
> @@ -25,8 +25,10 @@ notrace long system_call_exception(long r3, long r4, long 
> r5,
>   unsigned long ti_flags;
>   syscall_fn f;
>
> +#ifdef CONFIG_PPC64
>   if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
>   BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);
> +#endif
>
>   trace_hardirqs_off(); /* finish reconciling */
>
> @@ -34,7 +36,9 @@ notrace long system_call_exception(long r3, long r4, long 
> r5,
>   BUG_ON(!(regs->msr & MSR_RI));
>   BUG_ON(!(regs->msr & MSR_PR));
>   BUG_ON(!FULL_REGS(regs));
> +#ifdef CONFIG_PPC64
>   BUG_ON(regs->softe != IRQS_ENABLED);
> +#endif
>
>   account_cpu_user_entry();
>
> @@ -56,7 +60,9 @@ notrace long system_call_exception(long r3, long r4, long 
> r5,
>* frame, or if the unwinder was taught the first stack frame always
>* returns to user with IRQS_ENABLED, this store could be avoided!
>*/
> +#ifdef CONFIG_PPC64
>   regs->softe = IRQS_ENABLED;
> +#endif
>
>   local_irq_enable();
>
> @@ -148,7 +154,9 @@ notrace unsigned long syscall_exit_prepare(unsigned long 
> r3,
>   ret |= _TIF_RESTOREALL;
>   }
>
> +#ifdef CONFIG_PPC64
>  again:
> +#endif
>   local_irq_disable();
>   ti_flags = READ_ONCE(*ti_flagsp);
>   while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
> @@ -191,6 +199,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long 
> r3,
>
>   /* This pattern matches prep_irq_for_idle */
>   __hard_EE_RI_disable();
> +#ifdef CONFIG_PPC64
>   if (unlikely(lazy_irq_pending())) {
>   __hard_RI_enable();
>   trace_hardirqs_off();
> @@ -201,6 +210,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long 
> r3,
>   }
>   local_paca->irq_happened = 0;
>   irq_soft_mask_set(IRQS_ENABLED);
> +#endif
>
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>   local_paca->tm_scratch = regs->msr;
> --
> 2.25.0
>
>

The #ifdefs disappoint me!

Here is (untested) something that should help 32-bit avoid several ifdefs
in the main part of the function. I should have done this as part of the
merged series, but that's okay I'll submit as a cleanup.

The rest looks okay for now. Maybe we grow some helpers to manage the
soft-mask state, though I'm not really sure it would make sense for
32-bit code to ever call them. Maybe just confined to this file would be
okay but for now the ifdefs are okay.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/syscall_64.c | 58 +++-
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
index cf06eb443a80..f021db893ec2 100644
--- a/arch/powerpc/kernel/syscall_64.c
+++ b/arch/powerpc/kernel/syscall_64.c
@@ -103,6 +103,31 @@ notrace long system_call_exception(long r3, long r4, long 
r5,
return f(r3, r4, r5, r6, r7, r8);
 }
 
+/*
+ * local irqs must be disabled. Returns false if the caller must re-enable
+ * them, check for new work, and try again.
+ */
+static notrace inline bool prep_irq_for_enabled_exit(void)
+{
+   /* This must be done with RI=1 because tracing may touch vmaps */
+   trace_hardirqs_on();
+
+   /* This pattern matches prep_irq_for_idle */
+   __hard_EE_RI_disable();
+   if 

[RFC PATCH v3 06/15] powerpc/irq: Add new helpers to play with MSR_EE and MSR_RI on PPC32

2020-04-06 Thread Christophe Leroy
In preparation of porting PPC32 to C syscall entry/exit,
add PPC32 version of following helpers:
__hard_irq_enable()
__hard_irq_disable()
__hard_EE_RI_disable()
__hard_RI_enable()

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/hw_irq.h | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/powerpc/include/asm/hw_irq.h 
b/arch/powerpc/include/asm/hw_irq.h
index e69466867d5f..8c30a72262fd 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -330,6 +330,16 @@ static inline void arch_local_irq_disable(void)
mtmsr(mfmsr() & ~MSR_EE);
 }
 
+static inline void arch_local_recovery_disable(void)
+{
+   if (IS_ENABLED(CONFIG_BOOKE))
+   wrtee(0);
+   else if (IS_ENABLED(CONFIG_PPC_8xx))
+   wrtspr(SPRN_NRI);
+   else
+   mtmsr(mfmsr() & ~(MSR_EE | MSR_RI));
+}
+
 static inline void arch_local_irq_enable(void)
 {
if (IS_ENABLED(CONFIG_BOOKE))
@@ -352,6 +362,11 @@ static inline bool arch_irqs_disabled(void)
 
 #define hard_irq_disable() arch_local_irq_disable()
 
+#define __hard_irq_enable()arch_local_irq_enable()
+#define __hard_irq_disable()   arch_local_irq_disable()
+#define __hard_EE_RI_disable() arch_local_recovery_disable()
+#define __hard_RI_enable() arch_local_irq_disable()
+
 static inline bool arch_irq_disabled_regs(struct pt_regs *regs)
 {
return !(regs->msr & MSR_EE);
-- 
2.25.0



[RFC PATCH v3 05/15] powerpc/irq: Add helpers to get and set regs->softe

2020-04-06 Thread Christophe Leroy
regs->softe doesn't exist on PPC32.

Add helpers to get and set regs->softe.
Those helpers will be no-ops on PPC32.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/hw_irq.h | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h 
b/arch/powerpc/include/asm/hw_irq.h
index e0e71777961f..e69466867d5f 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -39,6 +39,8 @@
 #define PACA_IRQ_MUST_HARD_MASK(PACA_IRQ_EE)
 #endif
 
+#endif /* CONFIG_PPC64 */
+
 /*
  * flags for paca->irq_soft_mask
  */
@@ -47,8 +49,6 @@
 #define IRQS_PMI_DISABLED  2
 #define IRQS_ALL_DISABLED  (IRQS_DISABLED | IRQS_PMI_DISABLED)
 
-#endif /* CONFIG_PPC64 */
-
 #ifndef __ASSEMBLY__
 
 extern void replay_system_reset(void);
@@ -282,6 +282,15 @@ extern void irq_set_pending_from_srr1(unsigned long srr1);
 
 extern void force_external_irq_replay(void);
 
+static inline unsigned long get_softe(struct pt_regs *regs)
+{
+   return regs->softe;
+}
+
+static inline void set_softe(struct pt_regs *regs, unsigned long val)
+{
+   regs->softe = val;
+}
 #else /* CONFIG_PPC64 */
 
 static inline unsigned long arch_local_save_flags(void)
@@ -350,6 +359,14 @@ static inline bool arch_irq_disabled_regs(struct pt_regs 
*regs)
 
 static inline void may_hard_irq_enable(void) { }
 
+static inline unsigned long get_softe(struct pt_regs *regs)
+{
+   return 0;
+}
+
+static inline void set_softe(struct pt_regs *regs, unsigned long val)
+{
+}
 #endif /* CONFIG_PPC64 */
 
 #define ARCH_IRQ_INIT_FLAGSIRQ_NOREQUEST
-- 
2.25.0



[RFC PATCH v3 10/15] powerpc/syscall: Use is_compat_task()

2020-04-06 Thread Christophe Leroy
Instead of hard-comparing task flags with _TIF_32BIT, use
is_compat_task(). The advantage is that it returns 0 on PPC32,
although _TIF_32BIT is always set there.
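
For reference, is_compat_task() boils down to roughly the following - a
simplified sketch of the definitions in arch/powerpc/include/asm/compat.h
and include/linux/compat.h, not the literal kernel code:

	#ifdef CONFIG_COMPAT
	static inline int is_compat_task(void)
	{
		return test_thread_flag(TIF_32BIT);
	}
	#else
	static inline int is_compat_task(void) { return 0; }
	#endif

Since PPC32 kernels don't set CONFIG_COMPAT, the compiler can discard the
compat branch entirely there.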

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/syscall.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index 98c98ce12f7d..0ad4250d2ce8 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include 
+#include 
+
 #include 
 #include 
 #include 
@@ -86,7 +88,7 @@ notrace long system_call_exception(long r3, long r4, long r5,
/* May be faster to do array_index_nospec? */
barrier_nospec();
 
-   if (unlikely(ti_flags & _TIF_32BIT)) {
+   if (is_compat_task()) {
f = (void *)compat_sys_call_table[r0];
 
r3 &= 0xULL;
-- 
2.25.0



[RFC PATCH v3 07/15] powerpc/irq: Add stub irq_soft_mask_return() for PPC32

2020-04-06 Thread Christophe Leroy
To allow building syscall_64.c smoothly on PPC32, add a stub version
of irq_soft_mask_return().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/hw_irq.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/include/asm/hw_irq.h 
b/arch/powerpc/include/asm/hw_irq.h
index 8c30a72262fd..1c25a84a3159 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -293,6 +293,11 @@ static inline void set_softe(struct pt_regs *regs, 
unsigned long val)
 }
 #else /* CONFIG_PPC64 */
 
+static inline notrace unsigned long irq_soft_mask_return(void)
+{
+   return 0;
+}
+
 static inline unsigned long arch_local_save_flags(void)
 {
return mfmsr();
-- 
2.25.0



[RFC PATCH v3 14/15] powerpc/syscall: Avoid stack frame in likely part of system_call_exception()

2020-04-06 Thread Christophe Leroy
When r3 is not modified, reload it from regs->orig_r3 to free up
volatile registers. This avoids a stack frame for the likely part
of system_call_exception().

Before : 353 cycles on null_syscall
After  : 347 cycles on null_syscall

Before the patch:

c000b4d4 <system_call_exception>:
c000b4d4:   7c 08 02 a6     mflr    r0
c000b4d8:   94 21 ff e0     stwu    r1,-32(r1)
c000b4dc:   93 e1 00 1c     stw     r31,28(r1)
c000b4e0:   90 01 00 24     stw     r0,36(r1)
c000b4e4:   90 6a 00 88     stw     r3,136(r10)
c000b4e8:   81 6a 00 84     lwz     r11,132(r10)
c000b4ec:   69 6b 00 02     xori    r11,r11,2
c000b4f0:   55 6b ff fe     rlwinm  r11,r11,31,31,31
c000b4f4:   0f 0b 00 00     twnei   r11,0
c000b4f8:   81 6a 00 a0     lwz     r11,160(r10)
c000b4fc:   55 6b 07 fe     clrlwi  r11,r11,31
c000b500:   0f 0b 00 00     twnei   r11,0
c000b504:   7c 0c 42 e6     mftb    r0
c000b508:   83 e2 00 08     lwz     r31,8(r2)
c000b50c:   81 82 00 28     lwz     r12,40(r2)
c000b510:   90 02 00 24     stw     r0,36(r2)
c000b514:   7d 8c f8 50     subf    r12,r12,r31
c000b518:   7c 0c 02 14     add     r0,r12,r0
c000b51c:   90 02 00 08     stw     r0,8(r2)
c000b520:   7c 10 13 a6     mtspr   80,r0
c000b524:   81 62 00 70     lwz     r11,112(r2)
c000b528:   71 60 86 91     andi.   r0,r11,34449
c000b52c:   40 82 00 34     bne     c000b560
c000b530:   2b 89 01 b6     cmplwi  cr7,r9,438
c000b534:   41 9d 00 64     bgt     cr7,c000b598

c000b538:   3d 40 c0 5c     lis     r10,-16292
c000b53c:   55 29 10 3a     rlwinm  r9,r9,2,0,29
c000b540:   39 4a 41 e8     addi    r10,r10,16872
c000b544:   80 01 00 24     lwz     r0,36(r1)
c000b548:   7d 2a 48 2e     lwzx    r9,r10,r9
c000b54c:   7c 08 03 a6     mtlr    r0
c000b550:   7d 29 03 a6     mtctr   r9
c000b554:   83 e1 00 1c     lwz     r31,28(r1)
c000b558:   38 21 00 20     addi    r1,r1,32
c000b55c:   4e 80 04 20     bctr

After the patch:

c000b4d4 <system_call_exception>:
c000b4d4:   81 6a 00 84     lwz     r11,132(r10)
c000b4d8:   90 6a 00 88     stw     r3,136(r10)
c000b4dc:   69 6b 00 02     xori    r11,r11,2
c000b4e0:   55 6b ff fe     rlwinm  r11,r11,31,31,31
c000b4e4:   0f 0b 00 00     twnei   r11,0
c000b4e8:   80 6a 00 a0     lwz     r3,160(r10)
c000b4ec:   54 63 07 fe     clrlwi  r3,r3,31
c000b4f0:   0f 03 00 00     twnei   r3,0
c000b4f4:   7d 6c 42 e6     mftb    r11
c000b4f8:   81 82 00 08     lwz     r12,8(r2)
c000b4fc:   80 02 00 28     lwz     r0,40(r2)
c000b500:   91 62 00 24     stw     r11,36(r2)
c000b504:   7c 00 60 50     subf    r0,r0,r12
c000b508:   7d 60 5a 14     add     r11,r0,r11
c000b50c:   91 62 00 08     stw     r11,8(r2)
c000b510:   7c 10 13 a6     mtspr   80,r0
c000b514:   80 62 00 70     lwz     r3,112(r2)
c000b518:   70 6b 86 91     andi.   r11,r3,34449
c000b51c:   40 82 00 28     bne     c000b544
c000b520:   2b 89 01 b6     cmplwi  cr7,r9,438
c000b524:   41 9d 00 84     bgt     cr7,c000b5a8

c000b528:   80 6a 00 88     lwz     r3,136(r10)
c000b52c:   3d 40 c0 5c     lis     r10,-16292
c000b530:   55 29 10 3a     rlwinm  r9,r9,2,0,29
c000b534:   39 4a 41 e4     addi    r10,r10,16868
c000b538:   7d 2a 48 2e     lwzx    r9,r10,r9
c000b53c:   7d 29 03 a6     mtctr   r9
c000b540:   4e 80 04 20     bctr

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/syscall.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index f9fca9985b0f..af449a4a8e8f 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -85,6 +85,9 @@ notrace long system_call_exception(long r3, long r4, long r5,
 
} else if (unlikely(r0 >= NR_syscalls)) {
return -ENOSYS;
+   } else {
+   /* Restore r3 from orig_gpr3 to free up a volatile reg */
+   r3 = regs->orig_gpr3;
}
 
/* May be faster to do array_index_nospec? */
-- 
2.25.0



Re: [RFC PATCH v2 05/13] powerpc/syscall: Rename syscall_64.c into syscall.c

2020-04-06 Thread Christophe Leroy




Le 06/04/2020 à 03:42, Nicholas Piggin a écrit :

Christophe Leroy's on April 6, 2020 3:44 am:

syscall_64.c will be reused almost as is for PPC32.

Rename it syscall.c


Don't mind this, but I wonder if we can rename it to interrupt.c.


Interrupt for me is irq.

Maybe exception.c ?

Exceptions, that's what interrupts and system calls are.

Christophe


[PATCH 1/1] powerpc/crash: Use NMI context for printk after crashing other CPUs

2020-04-06 Thread Leonardo Bras
Currently, if the printk lock (logbuf_lock) is held by another thread
during a crash, there is a chance of deadlocking the crash on the next
printk, blocking a possibly desired kdump.

After sending the IPI to all other CPUs, make printk enter NMI context,
so that it uses per-cpu buffers to store messages, avoiding
logbuf_lock.
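
The mechanism relied on here is small enough to quote in simplified form -
adapted from kernel/printk/printk_safe.c of that era, so treat it as a
sketch rather than the exact source. printk_nmi_enter() just flips a
per-CPU flag, and the printk entry point checks it to route messages into a
lockless per-CPU buffer instead of taking logbuf_lock:

	void notrace printk_nmi_enter(void)
	{
		this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
	}

The buffered messages are flushed to the main log later, once it is safe.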

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kexec/crash.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
index d488311efab1..9b73e3991bf4 100644
--- a/arch/powerpc/kexec/crash.c
+++ b/arch/powerpc/kexec/crash.c
@@ -115,6 +115,7 @@ static void crash_kexec_prepare_cpus(int cpu)
 
crash_send_ipi(crash_ipi_callback);
smp_wmb();
+   printk_nmi_enter();
 
 again:
/*
-- 
2.25.1



Re: [RFC PATCH v2 06/13] powerpc/syscall: Make syscall_64.c buildable on PPC32

2020-04-06 Thread Christophe Leroy




Le 06/04/2020 à 03:52, Nicholas Piggin a écrit :

Christophe Leroy's on April 6, 2020 3:44 am:

ifdef out specific PPC64 stuff to allow building
syscall_64.c on PPC32.

Modify Makefile to always build syscall.o

Signed-off-by: Christophe Leroy 
---
  arch/powerpc/kernel/Makefile  |  5 ++---
  arch/powerpc/kernel/syscall.c | 10 ++
  2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 8cc3c831dccd..e4be425b7718 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -45,11 +45,10 @@ obj-y   := cputable.o 
syscalls.o \
   signal.o sysfs.o cacheinfo.o time.o \
   prom.o traps.o setup-common.o \
   udbg.o misc.o io.o misc_$(BITS).o \
-  of_platform.o prom_parse.o
+  of_platform.o prom_parse.o syscall.o
  obj-y += ptrace/
  obj-$(CONFIG_PPC64)   += setup_64.o sys_ppc32.o signal_64.o \
-  paca.o nvram_64.o firmware.o note.o \
-  syscall.o
+  paca.o nvram_64.o firmware.o note.o
  obj-$(CONFIG_VDSO32)  += vdso32/
  obj-$(CONFIG_PPC_WATCHDOG)+= watchdog.o
  obj-$(CONFIG_HAVE_HW_BREAKPOINT)  += hw_breakpoint.o
diff --git a/arch/powerpc/kernel/syscall.c b/arch/powerpc/kernel/syscall.c
index 72f3d2f0a823..28bd43db8755 100644
--- a/arch/powerpc/kernel/syscall.c
+++ b/arch/powerpc/kernel/syscall.c
@@ -25,8 +25,10 @@ notrace long system_call_exception(long r3, long r4, long r5,
unsigned long ti_flags;
syscall_fn f;
  
+#ifdef CONFIG_PPC64

if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);
+#endif
  
  	trace_hardirqs_off(); /* finish reconciling */
  
@@ -34,7 +36,9 @@ notrace long system_call_exception(long r3, long r4, long r5,

BUG_ON(!(regs->msr & MSR_RI));
BUG_ON(!(regs->msr & MSR_PR));
BUG_ON(!FULL_REGS(regs));
+#ifdef CONFIG_PPC64
BUG_ON(regs->softe != IRQS_ENABLED);
+#endif
  
  	account_cpu_user_entry();
  
@@ -56,7 +60,9 @@ notrace long system_call_exception(long r3, long r4, long r5,

 * frame, or if the unwinder was taught the first stack frame always
 * returns to user with IRQS_ENABLED, this store could be avoided!
 */
+#ifdef CONFIG_PPC64
regs->softe = IRQS_ENABLED;
+#endif
  
  	local_irq_enable();
  
@@ -148,7 +154,9 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,

ret |= _TIF_RESTOREALL;
}
  
+#ifdef CONFIG_PPC64

  again:
+#endif
local_irq_disable();
ti_flags = READ_ONCE(*ti_flagsp);
while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
@@ -191,6 +199,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
  
  	/* This pattern matches prep_irq_for_idle */

__hard_EE_RI_disable();
+#ifdef CONFIG_PPC64
if (unlikely(lazy_irq_pending())) {
__hard_RI_enable();
trace_hardirqs_off();
@@ -201,6 +210,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
}
local_paca->irq_happened = 0;
irq_soft_mask_set(IRQS_ENABLED);
+#endif
  
  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM

local_paca->tm_scratch = regs->msr;
--
2.25.0




The #ifdefs disappoint me!


They disappoint me as well. I tried to remove most of them in v3; I
also put your patch at the front of the series.




Here is (untested) something that should help 32-bit avoid several ifdefs
in the main part of the function. I should have done this as part of the
merged series, but that's okay I'll submit as a cleanup.

The rest looks okay for now. Maybe we grow some helpers to manage the
soft-mask state, though I'm not really sure it would make sense for
32-bit code to ever call them. Maybe just confined to this file would be
okay but for now the ifdefs are okay.


Thanks
Christophe


Re: [PATCH v11 3/8] powerpc/perf: consolidate read_user_stack_32

2020-04-06 Thread Michal Suchánek
On Fri, Apr 03, 2020 at 05:13:25PM +1000, Nicholas Piggin wrote:
> Michal Suchánek's on March 25, 2020 5:38 am:
> > On Tue, Mar 24, 2020 at 06:48:20PM +1000, Nicholas Piggin wrote:
> >> Michal Suchanek's on March 19, 2020 10:19 pm:
> >> > There are two almost identical copies for 32bit and 64bit.
> >> > 
> >> > The function is used only in 32bit code which will be split out in the
> >> > next patch, so consolidate to one function.
> >> > 
> >> > Signed-off-by: Michal Suchanek 
> >> > Reviewed-by: Christophe Leroy 
> >> > ---
> >> > v6:  new patch
> >> > v8:  move the consolidated function out of the ifdef block.
> >> > v11: rebase on top of def0bfdbd603
> >> > ---
> >> >  arch/powerpc/perf/callchain.c | 48 +--
> >> >  1 file changed, 24 insertions(+), 24 deletions(-)
> >> > 
> >> > diff --git a/arch/powerpc/perf/callchain.c 
> >> > b/arch/powerpc/perf/callchain.c
> >> > index cbc251981209..c9a78c6e4361 100644
> >> > --- a/arch/powerpc/perf/callchain.c
> >> > +++ b/arch/powerpc/perf/callchain.c
> >> > @@ -161,18 +161,6 @@ static int read_user_stack_64(unsigned long __user 
> >> > *ptr, unsigned long *ret)
> >> >  return read_user_stack_slow(ptr, ret, 8);
> >> >  }
> >> >  
> >> > -static int read_user_stack_32(unsigned int __user *ptr, unsigned int 
> >> > *ret)
> >> > -{
> >> > -if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
> >> > -((unsigned long)ptr & 3))
> >> > -return -EFAULT;
> >> > -
> >> > -if (!probe_user_read(ret, ptr, sizeof(*ret)))
> >> > -return 0;
> >> > -
> >> > -return read_user_stack_slow(ptr, ret, 4);
> >> > -}
> >> > -
> >> >  static inline int valid_user_sp(unsigned long sp, int is_64)
> >> >  {
> >> >  	if (!sp || (sp & 7) || sp > (is_64 ? TASK_SIZE : 0x100000000UL) - 32)
> >> > @@ -277,19 +265,9 @@ static void perf_callchain_user_64(struct 
> >> > perf_callchain_entry_ctx *entry,
> >> >  }
> >> >  
> >> >  #else  /* CONFIG_PPC64 */
> >> > -/*
> >> > - * On 32-bit we just access the address and let hash_page create a
> >> > - * HPTE if necessary, so there is no need to fall back to reading
> >> > - * the page tables.  Since this is called at interrupt level,
> >> > - * do_page_fault() won't treat a DSI as a page fault.
> >> > - */
> >> > -static int read_user_stack_32(unsigned int __user *ptr, unsigned int 
> >> > *ret)
> >> > +static int read_user_stack_slow(void __user *ptr, void *buf, int nb)
> >> >  {
> >> > -if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
> >> > -((unsigned long)ptr & 3))
> >> > -return -EFAULT;
> >> > -
> >> > -return probe_user_read(ret, ptr, sizeof(*ret));
> >> > +return 0;
> >> >  }
> >> >  
> >> >  static inline void perf_callchain_user_64(struct 
> >> > perf_callchain_entry_ctx *entry,
> >> > @@ -312,6 +290,28 @@ static inline int valid_user_sp(unsigned long sp, 
> >> > int is_64)
> >> >  
> >> >  #endif /* CONFIG_PPC64 */
> >> >  
> >> > +/*
> >> > + * On 32-bit we just access the address and let hash_page create a
> >> > + * HPTE if necessary, so there is no need to fall back to reading
> >> > + * the page tables.  Since this is called at interrupt level,
> >> > + * do_page_fault() won't treat a DSI as a page fault.
> >> > + */
> >> 
> >> The comment is actually probably better to stay in the 32-bit
> >> read_user_stack_slow implementation. Is that function defined
> >> on 32-bit purely so that you can use IS_ENABLED()? In that case
> > It documents the IS_ENABLED() and that's where it is. The 32bit
> > definition is only a technical detail.
> 
> Sorry for the late reply, busy trying to fix bugs in the C rewrite
> series. I don't think it is the right place, it should be in the
> ppc32 implementation detail.
Which does not exist anymore after the 32-bit and 64-bit parts are split.
> ppc64 has an equivalent comment at the top of its read_user_stack functions.
> 
> >> I would prefer to put a BUG() there which makes it self documenting.
> > Which will cause checkpatch complaints about introducing new BUG() which
> > is frowned on.
> 
> It's fine in this case, that warning is about not introducing
> runtime bugs, but this wouldn't be.
> 
> But... I actually don't like adding read_user_stack_slow on 32-bit
> and especially not just to make IS_ENABLED work.
That's to avoid breaking the build at this point. The function is removed later.
> 
> IMO this would be better if you really want to consolidate it
> 
> ---
> 
> diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
> index cbc251981209..ca3a599b3f54 100644
> --- a/arch/powerpc/perf/callchain.c
> +++ b/arch/powerpc/perf/callchain.c
> @@ -108,7 +108,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx 
> *entry, struct pt_regs *re
>   * interrupt context, so if the access faults, we read the page tables
>   * to find which page (if any) is mapped and access it directly.
>   */
> -static int 

[PATCH] powerpc/perf: consolidate perf_callchain_user_64 and perf_callchain_user_32

2020-04-06 Thread Michal Suchanek
perf_callchain_user_64 and perf_callchain_user_32 are nearly identical.
Consolidate into one function with thin wrappers.

Suggested-by: Nicholas Piggin 
Signed-off-by: Michal Suchanek 
---
 arch/powerpc/perf/callchain.h| 24 +++-
 arch/powerpc/perf/callchain_32.c | 21 ++---
 arch/powerpc/perf/callchain_64.c | 14 --
 3 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/perf/callchain.h b/arch/powerpc/perf/callchain.h
index 7a2cb9e1181a..7540bb71cb60 100644
--- a/arch/powerpc/perf/callchain.h
+++ b/arch/powerpc/perf/callchain.h
@@ -2,7 +2,7 @@
 #ifndef _POWERPC_PERF_CALLCHAIN_H
 #define _POWERPC_PERF_CALLCHAIN_H
 
-int read_user_stack_slow(void __user *ptr, void *buf, int nb);
+int read_user_stack_slow(const void __user *ptr, void *buf, int nb);
 void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
struct pt_regs *regs);
 void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
@@ -16,4 +16,26 @@ static inline bool invalid_user_sp(unsigned long sp)
return (!sp || (sp & mask) || (sp > top));
 }
 
+/*
+ * On 32-bit we just access the address and let hash_page create a
+ * HPTE if necessary, so there is no need to fall back to reading
+ * the page tables.  Since this is called at interrupt level,
+ * do_page_fault() won't treat a DSI as a page fault.
+ */
+static inline int __read_user_stack(const void __user *ptr, void *ret,
+   size_t size)
+{
+   int rc;
+
+   if ((unsigned long)ptr > TASK_SIZE - size ||
+   ((unsigned long)ptr & (size - 1)))
+   return -EFAULT;
+   rc = probe_user_read(ret, ptr, size);
+
+   if (rc && IS_ENABLED(CONFIG_PPC64))
+   return read_user_stack_slow(ptr, ret, size);
+
+   return rc;
+}
+
 #endif /* _POWERPC_PERF_CALLCHAIN_H */
diff --git a/arch/powerpc/perf/callchain_32.c b/arch/powerpc/perf/callchain_32.c
index 8aa951003141..1b4621f177e8 100644
--- a/arch/powerpc/perf/callchain_32.c
+++ b/arch/powerpc/perf/callchain_32.c
@@ -31,26 +31,9 @@
 
 #endif /* CONFIG_PPC64 */
 
-/*
- * On 32-bit we just access the address and let hash_page create a
- * HPTE if necessary, so there is no need to fall back to reading
- * the page tables.  Since this is called at interrupt level,
- * do_page_fault() won't treat a DSI as a page fault.
- */
-static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
-static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+static int read_user_stack_32(const unsigned int __user *ptr, unsigned int *ret)
-   int rc;
-
-   if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
-   ((unsigned long)ptr & 3))
-   return -EFAULT;
-
-   rc = probe_user_read(ret, ptr, sizeof(*ret));
-
-   if (IS_ENABLED(CONFIG_PPC64) && rc)
-   return read_user_stack_slow(ptr, ret, 4);
-
-   return rc;
-	return __read_user_stack(ptr, ret, sizeof(*ret));
 }
 
 /*
diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c
index df1ffd8b20f2..55bbc25a54ed 100644
--- a/arch/powerpc/perf/callchain_64.c
+++ b/arch/powerpc/perf/callchain_64.c
@@ -24,7 +24,7 @@
  * interrupt context, so if the access faults, we read the page tables
  * to find which page (if any) is mapped and access it directly.
  */
-int read_user_stack_slow(void __user *ptr, void *buf, int nb)
+int read_user_stack_slow(const void __user *ptr, void *buf, int nb)
 {
int ret = -EFAULT;
pgd_t *pgdir;
@@ -65,16 +65,10 @@ int read_user_stack_slow(void __user *ptr, void *buf, int 
nb)
return ret;
 }
 
-static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
+static int read_user_stack_64(const unsigned long __user *ptr,
+ unsigned long *ret)
 {
-   if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
-   ((unsigned long)ptr & 7))
-   return -EFAULT;
-
-   if (!probe_user_read(ret, ptr, sizeof(*ret)))
-   return 0;
-
-   return read_user_stack_slow(ptr, ret, 8);
+   return __read_user_stack(ptr, ret, sizeof(*ret));
 }
 
 /*
-- 
2.23.0



[Bug 207129] PowerMac G4 DP (5.6.2 debug kernel + inline KASAN) freezes shortly after booting with "do_IRQ: stack overflow: 1760"

2020-04-06 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=207129

--- Comment #4 from Erhard F. (erhar...@mailbox.org) ---
Without CONFIG_VMAP_STACK I had one crash after 2-3 hours of building but the
panic timer kicked in and rebooted the machine. Now it has been building
packages for hours again without any anomalies.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

Re: [PATCH v6 4/7] ASoC: fsl_asrc: Support new property fsl,asrc-format

2020-04-06 Thread Nicolin Chen
Just some small comments.

On Wed, Apr 01, 2020 at 04:45:37PM +0800, Shengjiu Wang wrote:
> In order to align with the new EASRC, we add a new property fsl,asrc-format.
> The fsl,asrc-format can replace fsl,asrc-width; the driver
> can then accept a format from the devicetree and doesn't need to
> derive the format from a width.
> 
> Signed-off-by: Shengjiu Wang 
> ---
>  sound/soc/fsl/fsl_asrc.c | 40 ++--
>  sound/soc/fsl/fsl_asrc.h |  4 ++--
>  sound/soc/fsl/fsl_asrc_dma.c | 15 +++---
>  3 files changed, 39 insertions(+), 20 deletions(-)
> 
> diff --git a/sound/soc/fsl/fsl_asrc.c b/sound/soc/fsl/fsl_asrc.c
> index 4d3e51bfa949..eea19e2b723b 100644
> --- a/sound/soc/fsl/fsl_asrc.c
> +++ b/sound/soc/fsl/fsl_asrc.c
> @@ -1052,16 +1047,31 @@ static int fsl_asrc_probe(struct platform_device 
> *pdev)
>   return ret;
>   }
>  
> -	ret = of_property_read_u32(np, "fsl,asrc-width",
> -				   &asrc->asrc_width);
> +	ret = of_property_read_u32(np, "fsl,asrc-format", &asrc->asrc_format);
>  	if (ret) {
> -		dev_err(&pdev->dev, "failed to get output width\n");
> -		return ret;
> +		ret = of_property_read_u32(np, "fsl,asrc-width", &width);
> +		if (ret) {
> +			dev_err(&pdev->dev, "failed to get output width\n");

Similar to the comments against sound card driver:
"failed to decide output format"

> + return ret;
> + }
> +
> + switch (width) {
> + case 16:
> + asrc->asrc_format = SNDRV_PCM_FORMAT_S16_LE;
> + break;
> + case 24:
> + asrc->asrc_format = SNDRV_PCM_FORMAT_S24_LE;
> + break;
> + default:
> +			dev_warn(&pdev->dev, "unsupported width, switching to 24bit\n");

Should match what the code does after the change:
+			dev_warn(&pdev->dev,
+				 "unsupported width, use default S24_LE\n");

> + asrc->asrc_format = SNDRV_PCM_FORMAT_S24_LE;
> + break;
> + }
>   }
>  
> -	if (asrc->asrc_width != 16 && asrc->asrc_width != 24) {
> -		dev_warn(&pdev->dev, "unsupported width, switching to 24bit\n");
> -		asrc->asrc_width = 24;
> +	if (!(FSL_ASRC_FORMATS & (1ULL << asrc->asrc_format))) {
> +		dev_warn(&pdev->dev, "unsupported format, switching to S24_LE\n");

Could fit 80 characters:
+		dev_warn(&pdev->dev, "unsupported width, use default S24_LE\n");

> diff --git a/sound/soc/fsl/fsl_asrc_dma.c b/sound/soc/fsl/fsl_asrc_dma.c
> index 5fe83aece25b..b15946e03380 100644
> --- a/sound/soc/fsl/fsl_asrc_dma.c
> +++ b/sound/soc/fsl/fsl_asrc_dma.c
> @@ -230,10 +230,19 @@ static int fsl_asrc_dma_hw_params(struct 
> snd_soc_component *component,
>   return -EINVAL;
>   }
>  
> - if (asrc->asrc_width == 16)
> + bits = snd_pcm_format_physical_width(asrc->asrc_format);

Can we just use 'width' to match the function name?
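
That is, a sketch of the suggested rename (the buswidth branches here are
assumed from the surrounding driver code, which is not quoted above):

	width = snd_pcm_format_physical_width(asrc->asrc_format);
	if (width == 16)
		buswidth = DMA_SLAVE_BUSWIDTH_2_BYTES;
	else
		buswidth = DMA_SLAVE_BUSWIDTH_3_BYTES;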


Re: [PATCH] selftests/powerpc: Always build the tm-poison test 64-bit

2020-04-06 Thread Gustavo Romero

On 04/06/2020 10:06 AM, Michael Ellerman wrote:

On Fri, 2020-04-03 at 09:56:56 UTC, Michael Ellerman wrote:

The tm-poison test includes inline asm which is 64-bit only, so the
test must be built 64-bit in order to work.

Otherwise it fails, eg:
   # file tm-poison
   tm-poison: ELF 32-bit MSB executable, PowerPC or cisco 4500, version 1 
(SYSV) ...
   # ./tm-poison
   test: tm_poison_test
   Unknown value 0x1fff71150 leaked into f31!
   Unknown value 0x1fff710c0 leaked into vr31!
   failure: tm_poison_test

Fixes: a003365cab64 ("powerpc/tm: Add tm-poison test")
Signed-off-by: Michael Ellerman 


Applied to powerpc next.

https://git.kernel.org/powerpc/c/6ba4a2d3591039aea1cb45c7c42262d26351a2fa

cheers


Ack.

Thank you, Michael.

Cheers,
Gustavo


[PATCH AUTOSEL 5.4 11/32] soc: fsl: dpio: register dpio irq handlers after dpio create

2020-04-06 Thread Sasha Levin
From: Grigore Popescu 

[ Upstream commit fe8fe7723a3a824790bda681b40efd767e2251a7 ]

The dpio irqs must be registered when you can actually
receive interrupts, i.e. when the dpios are created.
The kernel goes through NULL pointer dereference errors
followed by a kernel panic [1] because the dpio irqs are
enabled before the dpio is created.

[1]
Unable to handle kernel NULL pointer dereference at virtual address 0040
fsl_mc_dpio dpio.14: probed
fsl_mc_dpio dpio.13: Adding to iommu group 11
  ISV = 0, ISS = 0x0004
Unable to handle kernel NULL pointer dereference at virtual address 0040
Mem abort info:
  ESR = 0x9604
  EC = 0x25: DABT (current EL), IL = 32 bits
  SET = 0, FnV = 0
  EA = 0, S1PTW = 0
Data abort info:
  ISV = 0, ISS = 0x0004
  CM = 0, WnR = 0
[0040] user address but active_mm is swapper
Internal error: Oops: 9604 [#1] PREEMPT SMP
Modules linked in:
CPU: 2 PID: 151 Comm: kworker/2:1 Not tainted 5.6.0-rc4-next-20200304 #1
Hardware name: NXP Layerscape LX2160ARDB (DT)
Workqueue: events deferred_probe_work_func
pstate: 0085 (nzcv daIf -PAN -UAO)
pc : dpaa2_io_irq+0x18/0xe0
lr : dpio_irq_handler+0x1c/0x28
sp : 800010013e20
x29: 800010013e20 x28: 0026d9b4c140
x27: a1d38a142018 x26: 0026d2953400
x25: a1d38a142018 x24: a1d38a7ba1d8
x23: 800010013f24 x22: 
x21: 0072 x20: 0026d2953400
x19: 0026d2a68b80 x18: 0001
x17: 2fb37f3d x16: 35eafadd
x15: 0026d9b4c5b8 x14: 
x13: ff00 x12: 0038
x11: 0101010101010101 x10: 0040
x9 : a1d388db11e4 x8 : a1d38a7e40f0
x7 : 0026da414f38 x6 : 
x5 : 0026da414d80 x4 : 5e5353d0c000
x3 : 800010013f60 x2 : a1d388db11c8
x1 : 0026d2a67c00 x0 : 
Call trace:
 dpaa2_io_irq+0x18/0xe0
 dpio_irq_handler+0x1c/0x28
 __handle_irq_event_percpu+0x78/0x2c0
 handle_irq_event_percpu+0x38/0x90
 handle_irq_event+0x4c/0xd0
 handle_fasteoi_irq+0xbc/0x168
 generic_handle_irq+0x2c/0x40
 __handle_domain_irq+0x68/0xc0
 gic_handle_irq+0x64/0x150
 el1_irq+0xb8/0x180
 _raw_spin_unlock_irqrestore+0x14/0x48
 irq_set_affinity_hint+0x6c/0xa0
 dpaa2_dpio_probe+0x2a4/0x518
 fsl_mc_driver_probe+0x28/0x70
 really_probe+0xdc/0x320
 driver_probe_device+0x5c/0xf0
 __device_attach_driver+0x88/0xc0
 bus_for_each_drv+0x7c/0xc8
 __device_attach+0xe4/0x140
 device_initial_probe+0x18/0x20
 bus_probe_device+0x98/0xa0
 device_add+0x41c/0x758
 fsl_mc_device_add+0x184/0x530
 dprc_scan_objects+0x280/0x370
 dprc_probe+0x124/0x3b0
 fsl_mc_driver_probe+0x28/0x70
 really_probe+0xdc/0x320
 driver_probe_device+0x5c/0xf0
 __device_attach_driver+0x88/0xc0
 bus_for_each_drv+0x7c/0xc8
 __device_attach+0xe4/0x140
 device_initial_probe+0x18/0x20
 bus_probe_device+0x98/0xa0
 deferred_probe_work_func+0x74/0xa8
 process_one_work+0x1c8/0x470
 worker_thread+0x1f8/0x428
 kthread+0x124/0x128
 ret_from_fork+0x10/0x18
Code: a9bc7bfd 910003fd a9025bf5 a90363f7 (f9402015)
---[ end trace 38298e1a29e7a570 ]---
Kernel panic - not syncing: Fatal exception in interrupt
SMP: stopping secondary CPUs
Mem abort info:
  ESR = 0x9604
  CM = 0, WnR = 0
  EC = 0x25: DABT (current EL), IL = 32 bits
[0040] user address but active_mm is swapper
  SET = 0, FnV = 0
  EA = 0, S1PTW = 0
Data abort info:
  ISV = 0, ISS = 0x0004
  CM = 0, WnR = 0
[0040] user address but active_mm is swapper
SMP: failed to stop secondary CPUs 0-2
Kernel Offset: 0x21d37860 from 0x80001000
PHYS_OFFSET: 0xe9218000
CPU features: 0x10002,21806008
Memory Limit: none
---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

Signed-off-by: Laurentiu Tudor 
Signed-off-by: Grigore Popescu 
Signed-off-by: Li Yang 
Signed-off-by: Sasha Levin 
---
 drivers/soc/fsl/dpio/dpio-driver.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/soc/fsl/dpio/dpio-driver.c 
b/drivers/soc/fsl/dpio/dpio-driver.c
index 70014ecce2a7e..7b642c330977f 100644
--- a/drivers/soc/fsl/dpio/dpio-driver.c
+++ b/drivers/soc/fsl/dpio/dpio-driver.c
@@ -233,10 +233,6 @@ static int dpaa2_dpio_probe(struct fsl_mc_device *dpio_dev)
goto err_allocate_irqs;
}
 
-   err = register_dpio_irq_handlers(dpio_dev, desc.cpu);
-   if (err)
-   goto err_register_dpio_irq;
-
	priv->io = dpaa2_io_create(&desc, dev);
if (!priv->io) {
dev_err(dev, "dpaa2_io_create failed\n");
@@ -244,6 +240,10 @@ static int dpaa2_dpio_probe(struct fsl_mc_device *dpio_dev)
goto err_dpaa2_io_create;
}
 
+   err = register_dpio_irq_handlers(dpio_dev, desc.cpu);
+   if (err)
+   goto err_register_dpio_irq;
+
dev_info(dev, "probed\n");
dev_dbg(dev, "   receives_notifications = %d\n",
desc.receives_notifications);
-- 
2.20.1



Re: [RFC PATCH v2 05/13] powerpc/syscall: Rename syscall_64.c into syscall.c

2020-04-06 Thread Nicholas Piggin
Christophe Leroy's on April 7, 2020 4:20 am:
> 
> 
> Le 06/04/2020 à 03:42, Nicholas Piggin a écrit :
>> Christophe Leroy's on April 6, 2020 3:44 am:
>>> syscall_64.c will be reused almost as is for PPC32.
>>>
>>> Rename it syscall.c
>> 
>> Don't mind this, but I wonder if we can rename it to interrupt.c.
> 
> Interrupt for me is irq.
> 
> Maybe exception.c ?
> 
> Exceptions, that's what interrupts and system calls are.

It's not the same. An interrupt is caused by an exception, but an 
exception does not always cause an interrupt.

The code here is handling interrupts; it is not handling the exceptions
(those are handled by handlers that are called). We also handle
exceptions without taking an interrupt, for example if we set the
decrementer or use msgclr.

And we often (especially in 64-bit with soft masking) take interrupts
without handling the exception (we set EE=0 and return, leaving the
exception existing).

So I'm trying to fix terminology slowly. We are not "returning" from
an exception for example, that doesn't make sense. An exception is a
condition of processor state which is either satisfied or it is not.
What the software returns from is an interrupt.

I'd like irq to be ~= Linux irq, which in general means an asynchronous 
interrupt that can be blocked by local_irq_disable or sometimes a device 
irq specifically, as distinct from "interrupt" which is powerpc 
architecture definition.

Thanks,
Nick


Re: [RFC PATCH v3 05/15] powerpc/irq: Add helpers to get and set regs->softe

2020-04-06 Thread Nicholas Piggin
Christophe Leroy's on April 7, 2020 4:16 am:
> regs->softe doesn't exist on PPC32.
> 
> Add helpers to get and set regs->softe.
> Those helpers will void on PPC32.
> 
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/include/asm/hw_irq.h | 21 +++--
>  1 file changed, 19 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/hw_irq.h 
> b/arch/powerpc/include/asm/hw_irq.h
> index e0e71777961f..e69466867d5f 100644
> --- a/arch/powerpc/include/asm/hw_irq.h
> +++ b/arch/powerpc/include/asm/hw_irq.h
> @@ -39,6 +39,8 @@
>  #define PACA_IRQ_MUST_HARD_MASK  (PACA_IRQ_EE)
>  #endif
>  
> +#endif /* CONFIG_PPC64 */
> +
>  /*
>   * flags for paca->irq_soft_mask
>   */
> @@ -47,8 +49,6 @@
>  #define IRQS_PMI_DISABLED2
>  #define IRQS_ALL_DISABLED(IRQS_DISABLED | IRQS_PMI_DISABLED)
>  
> -#endif /* CONFIG_PPC64 */
> -
>  #ifndef __ASSEMBLY__
>  
>  extern void replay_system_reset(void);
> @@ -282,6 +282,15 @@ extern void irq_set_pending_from_srr1(unsigned long 
> srr1);
>  
>  extern void force_external_irq_replay(void);
>  
> +static inline unsigned long get_softe(struct pt_regs *regs)
> +{
> + return regs->softe;
> +}
> +
> +static inline void set_softe(struct pt_regs *regs, unsigned long val)
> +{
> + regs->softe = val;
> +}
>  #else /* CONFIG_PPC64 */
>  
>  static inline unsigned long arch_local_save_flags(void)
> @@ -350,6 +359,14 @@ static inline bool arch_irq_disabled_regs(struct pt_regs 
> *regs)
>  
>  static inline void may_hard_irq_enable(void) { }
>  
> +static inline unsigned long get_softe(struct pt_regs *regs)
> +{
> + return 0;
> +}
> +
> +static inline void set_softe(struct pt_regs *regs, unsigned long val)
> +{
> +}

If this goes into a general shared header, I would prefer if we could
do something a bit more general (at least with the name).

I think get_softe() could just be replaced with arch_irq_disabled_regs().

For set, could we call it irq_soft_mask_regs_set_state()? 32 has no soft
mask state in regs, so it's more obvious that it's a no-op. Or you could
make 32-bit version a BUG(), and then always guard it with IS_ENABLED().

Thanks,
Nick
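
For illustration, a minimal sketch of the helper suggested above (untested,
hypothetical code; the 32-bit variant BUG()s so callers must guard with
IS_ENABLED(CONFIG_PPC64)):

#ifdef CONFIG_PPC64
static inline void irq_soft_mask_regs_set_state(struct pt_regs *regs,
						unsigned long val)
{
	regs->softe = val;
}
#else
static inline void irq_soft_mask_regs_set_state(struct pt_regs *regs,
						unsigned long val)
{
	BUG();	/* 32-bit has no soft-mask state in pt_regs */
}
#endif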


Re: [RFC PATCH v3 15/15] powerpc/kernel: Do not inconditionally save non volatile registers on system call

2020-04-06 Thread Nicholas Piggin
Christophe Leroy's on April 7, 2020 4:16 am:
> + ret = syscall_exit_prepare_end(ret, regs, ti_flags);
> + if (unlikely(ret & 0x8000)) {
> + ret &= ~0x8000;

We could just add our own set of defines for these, there's no real
reason to use _TIF_RESTOREALL as I had.

Thanks,
Nick
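
For example, a hedged sketch of such dedicated defines (the name and flag
value here are illustrative only, not taken from the actual series):

/* Private flag bits returned by syscall_exit_prepare*(), unrelated to TIF */
#define SYSCALL_EXIT_RESTOREALL	0x80000000UL

	ret = syscall_exit_prepare_end(ret, regs, ti_flags);
	if (unlikely(ret & SYSCALL_EXIT_RESTOREALL)) {
		ret &= ~SYSCALL_EXIT_RESTOREALL;
		/* ... restore non-volatile registers before returning ... */
	}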


Re: [PATCH v6 3/7] ASoC: fsl-asoc-card: Support new property fsl,asrc-format

2020-04-06 Thread Nicolin Chen
On Wed, Apr 01, 2020 at 04:45:36PM +0800, Shengjiu Wang wrote:
> In order to align with the new EASRC, we add a new property fsl,asrc-format.
> The fsl,asrc-format can replace fsl,asrc-width; the driver
> can then accept a format from the devicetree and doesn't need to
> derive the format from a width.
> 
> Signed-off-by: Shengjiu Wang 

Acked-by: Nicolin Chen 

> ---
>  sound/soc/fsl/fsl-asoc-card.c | 21 -
>  1 file changed, 12 insertions(+), 9 deletions(-)
> 
> diff --git a/sound/soc/fsl/fsl-asoc-card.c b/sound/soc/fsl/fsl-asoc-card.c
> index bb33601fab84..a0f5eb27d61a 100644
> --- a/sound/soc/fsl/fsl-asoc-card.c
> +++ b/sound/soc/fsl/fsl-asoc-card.c
> @@ -680,17 +680,20 @@ static int fsl_asoc_card_probe(struct platform_device 
> *pdev)
>   goto asrc_fail;
>   }
>  
> -	ret = of_property_read_u32(asrc_np, "fsl,asrc-width", &width);
> +	ret = of_property_read_u32(asrc_np, "fsl,asrc-format",
> +				   &priv->asrc_format);
>  	if (ret) {
> -		dev_err(&pdev->dev, "failed to get output rate\n");
> -		ret = -EINVAL;
> -		goto asrc_fail;
> -	}
> +		/* Fallback to old binding; translate to asrc_format */
> +		ret = of_property_read_u32(asrc_np, "fsl,asrc-width",
> +					   &width);
> +		if (ret) {
> +			dev_err(&pdev->dev, "failed to get output width\n");

Should warn 'format' over 'width', since it's preferable.

> + return ret;

Should goto asrc_fail as we did prior to the change.

And some of lines are over 80 characters.

Let's try this:
	ret = of_property_read_u32(asrc_np, "fsl,asrc-format",
				   &priv->asrc_format);
	if (ret) {
		/* Fallback to old binding; translate to asrc_format */
		ret = of_property_read_u32(asrc_np, "fsl,asrc-width",
					   &width);
		if (ret) {
			dev_err(&pdev->dev,
				"failed to decide output format\n");
			goto asrc_fail;
		}

		if (width == 24)
			priv->asrc_format = SNDRV_PCM_FORMAT_S24_LE;
		else
			priv->asrc_format = SNDRV_PCM_FORMAT_S16_LE;
	}
}


Re: [PATCH v6 5/7] ASoC: fsl_asrc: Move common definition to fsl_asrc_common

2020-04-06 Thread Nicolin Chen
On Wed, Apr 01, 2020 at 04:45:38PM +0800, Shengjiu Wang wrote:

>  static int fsl_asrc_probe(struct platform_device *pdev)
>  {
>   struct device_node *np = pdev->dev.of_node;
>   struct fsl_asrc *asrc;
> + struct fsl_asrc_priv *asrc_priv;

Could we move it before "struct fsl_asrc *asrc;"?

> diff --git a/sound/soc/fsl/fsl_asrc_common.h b/sound/soc/fsl/fsl_asrc_common.h
> new file mode 100644
> index ..5c93ccdfc30a
> --- /dev/null
> +++ b/sound/soc/fsl/fsl_asrc_common.h

> +#define PAIR_CTX_NUM  0x4
> +#define PAIR_PRIVAT_SIZE 0x400

"PRIVAT_" => "PRIVATE_"

> +/**
> + * fsl_asrc_pair: ASRC common data

"fsl_asrc_pair" => "fsl_asrc"

> + */
> +struct fsl_asrc {

> diff --git a/sound/soc/fsl/fsl_asrc_dma.c b/sound/soc/fsl/fsl_asrc_dma.c
> index b15946e03380..5cf0468ce6e3 100644
> --- a/sound/soc/fsl/fsl_asrc_dma.c
> +++ b/sound/soc/fsl/fsl_asrc_dma.c

> @@ -311,11 +311,12 @@ static int fsl_asrc_dma_startup(struct 
> snd_soc_component *component,
>   return ret;
>   }
>  
> - pair = kzalloc(sizeof(struct fsl_asrc_pair), GFP_KERNEL);
> +	pair = kzalloc(sizeof(struct fsl_asrc_pair) + PAIR_PRIVAT_SIZE, GFP_KERNEL);

If we only use the PAIR_PRIVATE_SIZE here, maybe we can put the
define in this file too, rather than in the header file.

And could fit 80 characters:

+   pair = kzalloc(sizeof(*pair) + PAIR_PRIVAT_SIZE, GFP_KERNEL);


Re: [PATCH 1/6] powerpc/spufs: simplify spufs core dumping

2020-04-06 Thread Jeremy Kerr
Hi Christoph,

> Replace the coredump ->read method with a ->dump method that must call
> dump_emit itself.  That way we avoid a buffer allocation and messing with
> set_fs() to call into code that is intended to deal with user buffers.
> For the ->get case we can now use a small on-stack buffer and avoid
> memory allocations as well.

That looks much better, thanks!

Reviewed-by: Jeremy Kerr 

However, I no longer have access to hardware to test this on. Michael,
are the coredump tests in spufs-testsuite still alive?

Cheers,


Jeremy



[PATCH AUTOSEL 5.5 12/35] soc: fsl: dpio: register dpio irq handlers after dpio create

2020-04-06 Thread Sasha Levin
From: Grigore Popescu 

[ Upstream commit fe8fe7723a3a824790bda681b40efd767e2251a7 ]

The dpio irqs must be registered when you can actually
receive interrupts, i.e. when the dpios are created.
The kernel goes through NULL pointer dereference errors
followed by a kernel panic [1] because the dpio irqs are
enabled before the dpio is created.

[1]
Unable to handle kernel NULL pointer dereference at virtual address 0040
fsl_mc_dpio dpio.14: probed
fsl_mc_dpio dpio.13: Adding to iommu group 11
  ISV = 0, ISS = 0x0004
Unable to handle kernel NULL pointer dereference at virtual address 0040
Mem abort info:
  ESR = 0x9604
  EC = 0x25: DABT (current EL), IL = 32 bits
  SET = 0, FnV = 0
  EA = 0, S1PTW = 0
Data abort info:
  ISV = 0, ISS = 0x0004
  CM = 0, WnR = 0
[0040] user address but active_mm is swapper
Internal error: Oops: 9604 [#1] PREEMPT SMP
Modules linked in:
CPU: 2 PID: 151 Comm: kworker/2:1 Not tainted 5.6.0-rc4-next-20200304 #1
Hardware name: NXP Layerscape LX2160ARDB (DT)
Workqueue: events deferred_probe_work_func
pstate: 0085 (nzcv daIf -PAN -UAO)
pc : dpaa2_io_irq+0x18/0xe0
lr : dpio_irq_handler+0x1c/0x28
sp : 800010013e20
x29: 800010013e20 x28: 0026d9b4c140
x27: a1d38a142018 x26: 0026d2953400
x25: a1d38a142018 x24: a1d38a7ba1d8
x23: 800010013f24 x22: 
x21: 0072 x20: 0026d2953400
x19: 0026d2a68b80 x18: 0001
x17: 2fb37f3d x16: 35eafadd
x15: 0026d9b4c5b8 x14: 
x13: ff00 x12: 0038
x11: 0101010101010101 x10: 0040
x9 : a1d388db11e4 x8 : a1d38a7e40f0
x7 : 0026da414f38 x6 : 
x5 : 0026da414d80 x4 : 5e5353d0c000
x3 : 800010013f60 x2 : a1d388db11c8
x1 : 0026d2a67c00 x0 : 
Call trace:
 dpaa2_io_irq+0x18/0xe0
 dpio_irq_handler+0x1c/0x28
 __handle_irq_event_percpu+0x78/0x2c0
 handle_irq_event_percpu+0x38/0x90
 handle_irq_event+0x4c/0xd0
 handle_fasteoi_irq+0xbc/0x168
 generic_handle_irq+0x2c/0x40
 __handle_domain_irq+0x68/0xc0
 gic_handle_irq+0x64/0x150
 el1_irq+0xb8/0x180
 _raw_spin_unlock_irqrestore+0x14/0x48
 irq_set_affinity_hint+0x6c/0xa0
 dpaa2_dpio_probe+0x2a4/0x518
 fsl_mc_driver_probe+0x28/0x70
 really_probe+0xdc/0x320
 driver_probe_device+0x5c/0xf0
 __device_attach_driver+0x88/0xc0
 bus_for_each_drv+0x7c/0xc8
 __device_attach+0xe4/0x140
 device_initial_probe+0x18/0x20
 bus_probe_device+0x98/0xa0
 device_add+0x41c/0x758
 fsl_mc_device_add+0x184/0x530
 dprc_scan_objects+0x280/0x370
 dprc_probe+0x124/0x3b0
 fsl_mc_driver_probe+0x28/0x70
 really_probe+0xdc/0x320
 driver_probe_device+0x5c/0xf0
 __device_attach_driver+0x88/0xc0
 bus_for_each_drv+0x7c/0xc8
 __device_attach+0xe4/0x140
 device_initial_probe+0x18/0x20
 bus_probe_device+0x98/0xa0
 deferred_probe_work_func+0x74/0xa8
 process_one_work+0x1c8/0x470
 worker_thread+0x1f8/0x428
 kthread+0x124/0x128
 ret_from_fork+0x10/0x18
Code: a9bc7bfd 910003fd a9025bf5 a90363f7 (f9402015)
---[ end trace 38298e1a29e7a570 ]---
Kernel panic - not syncing: Fatal exception in interrupt
SMP: stopping secondary CPUs
Mem abort info:
  ESR = 0x9604
  CM = 0, WnR = 0
  EC = 0x25: DABT (current EL), IL = 32 bits
[0040] user address but active_mm is swapper
  SET = 0, FnV = 0
  EA = 0, S1PTW = 0
Data abort info:
  ISV = 0, ISS = 0x0004
  CM = 0, WnR = 0
[0040] user address but active_mm is swapper
SMP: failed to stop secondary CPUs 0-2
Kernel Offset: 0x21d37860 from 0x80001000
PHYS_OFFSET: 0xe9218000
CPU features: 0x10002,21806008
Memory Limit: none
---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

Signed-off-by: Laurentiu Tudor 
Signed-off-by: Grigore Popescu 
Signed-off-by: Li Yang 
Signed-off-by: Sasha Levin 
---
 drivers/soc/fsl/dpio/dpio-driver.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/soc/fsl/dpio/dpio-driver.c 
b/drivers/soc/fsl/dpio/dpio-driver.c
index 70014ecce2a7e..7b642c330977f 100644
--- a/drivers/soc/fsl/dpio/dpio-driver.c
+++ b/drivers/soc/fsl/dpio/dpio-driver.c
@@ -233,10 +233,6 @@ static int dpaa2_dpio_probe(struct fsl_mc_device *dpio_dev)
goto err_allocate_irqs;
}
 
-   err = register_dpio_irq_handlers(dpio_dev, desc.cpu);
-   if (err)
-   goto err_register_dpio_irq;
-
	priv->io = dpaa2_io_create(&desc, dev);
if (!priv->io) {
dev_err(dev, "dpaa2_io_create failed\n");
@@ -244,6 +240,10 @@ static int dpaa2_dpio_probe(struct fsl_mc_device *dpio_dev)
goto err_dpaa2_io_create;
}
 
+   err = register_dpio_irq_handlers(dpio_dev, desc.cpu);
+   if (err)
+   goto err_register_dpio_irq;
+
dev_info(dev, "probed\n");
dev_dbg(dev, "   receives_notifications = %d\n",
desc.receives_notifications);
-- 
2.20.1



Re: [RFC/PATCH 2/3] pseries/kvm: Clear PSSCR[ESL|EC] bits before guest entry

2020-04-06 Thread David Gibson
On Fri, Apr 03, 2020 at 03:01:03PM +0530, Gautham R Shenoy wrote:
> On Fri, Apr 03, 2020 at 12:20:26PM +1000, Nicholas Piggin wrote:
> > Gautham R. Shenoy's on March 31, 2020 10:10 pm:
> > > From: "Gautham R. Shenoy" 
> > > 
> > > ISA v3.0 allows the guest to execute a stop instruction. For this, the
> > > PSSCR[ESL|EC] bits need to be cleared by the hypervisor before
> > > scheduling in the guest vCPU.
> > > 
> > > Currently we always schedule in a vCPU with PSSCR[ESL|EC] bits
> > > set. This patch changes the behaviour to enter the guest with
> > > PSSCR[ESL|EC] bits cleared. This is an RFC patch where we
> > > unconditionally clear these bits. Ideally this should be done
> > > conditionally on platforms where the guest stop instruction has no
> > > Bugs (starting POWER9 DD2.3).
> > 
> > How will guests know that they can use this facility safely after your
> > series? You need both DD2.3 and a patched KVM.
> 
> 
> Yes, this is something that isn't addressed in this series (mentioned
> in the cover letter), which is a POC demonstrating that the stop0lite
> state in guest works.
> 
> However, to answer your question, this is the scheme that I had in
> mind :
> 
> OPAL:
>On Procs >= DD2.3 : we publish a dt-cpu-feature "idle-stop-guest"
> 
> Hypervisor Kernel:
> 1. If "idle-stop-guest" dt-cpu-feature is discovered, then
>we set bool enable_guest_stop = true;
> 
> 2. During KVM guest entry, clear PSSCR[ESL|EC] iff
>enable_guest_stop == true.
> 
> 3. In kvm_vm_ioctl_check_extension(), for a new capability
> >KVM_CAP_STOP, return true iff enable_guest_stop == true.
> 
> QEMU:
>Check with the hypervisor if KVM_CAP_STOP is present. If so,
>indicate the presence to the guest via device tree.

Nack.  Presenting different capabilities to the guest depending on
host capabilities (rather than explicit options) is never ok.  It
means that depending on the system you start on you may or may not be
able to migrate to other systems that you're supposed to be able to,

> Guest Kernel:
>Check for the presence of guest stop state support in
>device-tree. If available, enable the stop0lite in the cpuidle
>driver. 
>
> 
> We still have a challenge of migrating a guest which started on a
> hypervisor supporting guest stop state to a hypervisor without it.
> The target hypervisor should at least have Patch 1 of this series, so
> that we don't crash the guest.
> 
> > 
> > > 
> > > Signed-off-by: Gautham R. Shenoy 
> > > ---
> > >  arch/powerpc/kvm/book3s_hv.c|  2 +-
> > >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 25 +
> > >  2 files changed, 14 insertions(+), 13 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > > index cdb7224..36d059a 100644
> > > --- a/arch/powerpc/kvm/book3s_hv.c
> > > +++ b/arch/powerpc/kvm/book3s_hv.c
> > > @@ -3424,7 +3424,7 @@ static int kvmhv_load_hv_regs_and_go(struct 
> > > kvm_vcpu *vcpu, u64 time_limit,
> > >   mtspr(SPRN_IC, vcpu->arch.ic);
> > >   mtspr(SPRN_PID, vcpu->arch.pid);
> > >  
> > > - mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
> > > + mtspr(SPRN_PSSCR, (vcpu->arch.psscr  & ~(PSSCR_EC | PSSCR_ESL)) |
> > > (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
> > >  
> > >   mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
> > > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > index dbc2fec..c2daec3 100644
> > > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > > @@ -823,6 +823,18 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
> > >   mtspr   SPRN_PID, r7
> > >   mtspr   SPRN_WORT, r8
> > >  BEGIN_FTR_SECTION
> > > + /* POWER9-only registers */
> > > + ld  r5, VCPU_TID(r4)
> > > + ld  r6, VCPU_PSSCR(r4)
> > > + lbz r8, HSTATE_FAKE_SUSPEND(r13)
> > > + lis r7, (PSSCR_EC | PSSCR_ESL)@h /* Allow guest to call stop */
> > > + andc	r6, r6, r7
> > > + rldimi  r6, r8, PSSCR_FAKE_SUSPEND_LG, 63 - PSSCR_FAKE_SUSPEND_LG
> > > + ld  r7, VCPU_HFSCR(r4)
> > > + mtspr   SPRN_TIDR, r5
> > > + mtspr   SPRN_PSSCR, r6
> > > + mtspr   SPRN_HFSCR, r7
> > > +FTR_SECTION_ELSE
> > 
> > Why did you move these around? Just because the POWER9 section became
> > larger than the other?
> 
> Yes.
> 
> > 
> > That's a real wart in the instruction patching implementation, I think
> > we can fix it by padding with nops in the macros.
> > 
> > Can you just add the additional required nops to the top branch without
> > changing them around for this patch, so it's easier to see what's going
> > on? The end result will be the same after patching. Actually changing
> > these around can have a slight unintended consequence in that code that
> > runs before features were patched will execute the IF code. Not a
> > problem here, but another reason why the instruction patching 
> > restriction is annoying.
> 
> Sure, I will repost this patch 

Re: [PATCH v6 2/7] ASoC: dt-bindings: fsl_asrc: Add new property fsl,asrc-format

2020-04-06 Thread Nicolin Chen
On Wed, Apr 01, 2020 at 04:45:35PM +0800, Shengjiu Wang wrote:
> In order to support the new EASRC and simplify the code structure,
> we decided to share the common structure between them. This brings
> a problem: EASRC accepts a format directly from the devicetree, but
> ASRC accepts a width from the devicetree.
> 
> In order to align with the new EASRC, we add a new property fsl,asrc-format.
> The fsl,asrc-format can replace fsl,asrc-width; the driver
> can then accept a format from the devicetree and doesn't need to
> derive the format from a width.
> 
> Signed-off-by: Shengjiu Wang 

Once Rob has no objection:
Acked-by: Nicolin Chen 

> ---
>  Documentation/devicetree/bindings/sound/fsl,asrc.txt | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/Documentation/devicetree/bindings/sound/fsl,asrc.txt 
> b/Documentation/devicetree/bindings/sound/fsl,asrc.txt
> index cb9a25165503..998b4c8a7f78 100644
> --- a/Documentation/devicetree/bindings/sound/fsl,asrc.txt
> +++ b/Documentation/devicetree/bindings/sound/fsl,asrc.txt
> @@ -51,6 +51,10 @@ Optional properties:
> will be in use as default. Otherwise, the big endian
> mode will be in use for all the device registers.
>  
> +   - fsl,asrc-format : Defines a mutual sample format used by DPCM Back
> +   Ends, which can replace the fsl,asrc-width.
> +   The value is 2 (S16_LE), or 6 (S24_LE).
> +
>  Example:
>  
>  asrc: asrc@2034000 {
> -- 
> 2.21.0
> 


Re: [PATCH v6 7/7] ASoC: fsl_easrc: Add EASRC ASoC CPU DAI drivers

2020-04-06 Thread Nicolin Chen
On Wed, Apr 01, 2020 at 04:45:40PM +0800, Shengjiu Wang wrote:
> EASRC (Enhanced Asynchronous Sample Rate Converter) is a new IP module
> found on i.MX8MN. It is different with old ASRC module.
> 
> The primary features for the EASRC are as follows:
> - 4 Contexts - groups of channels with an independent time base
> - Fully independent and concurrent context control
> - Simultaneous processing of up to 32 audio channels
> - Programmable filter characteristics for each context
> - 32, 24, 20, and 16-bit fixed point audio sample support
> - 32-bit floating point audio sample support
> - 8kHz to 384kHz sample rate
> - 1/16 to 8x sample rate conversion ratio
> 
> Signed-off-by: Shengjiu Wang 
> Signed-off-by: Cosmin-Gabriel Samoila 

Overall, looks good to me.

Please add:
Acked-by: Nicolin Chen 

> diff --git a/sound/soc/fsl/fsl_easrc.c b/sound/soc/fsl/fsl_easrc.c
> +static int fsl_easrc_normalize_filter(struct fsl_asrc *easrc,

> +  * If exponent is zero (value == 0), or 7ff (value == NaNs)
[...]
> + if (exp == 0 || exp == 0x7ff) {
[...]
> + if ((shift > 0 && exp >= 2047) ||
> + (shift < 0 && exp <= 0)) {

Could fit into one line, and would be probably nicer to re-use
"0x7ff" matching previous places, instead of "2047".


Re: [RFC PATCH v2 12/13] powerpc/kernel: Do not inconditionally save non volatile registers on system call

2020-04-06 Thread Nicholas Piggin
Christophe Leroy's on April 7, 2020 4:18 am:
> 
> 
> Le 06/04/2020 à 03:25, Nicholas Piggin a écrit :
>> Christophe Leroy's on April 6, 2020 3:44 am:
>>> Before : 347 cycles on null_syscall
>>> After  : 327 cycles on null_syscall
>> 
>> The problem I had doing this is that signal delivery wants full regs,
>> and you don't know if you have a signal pending ahead of time if you
>> have interrupts enabled.
>> 
>> I began to try bailing out back to asm to save nvgprs and call again.
>> I think that can be made to work, but it is more complication in asm,
>> and I soon found that 64s CPUs don't care about NVGPRs too much so it's
>> nice to get rid of the !fullregs state.
> 
> I tried a new way in v3, please have a look. I split 
> syscall_exit_prepare() in 3 parts and the result is unexpected: it is 
> better than before the series (307 cycles now versus 311 cycles with 
> full ASM syscall entry/exit).

Great! Well I don't really see a problem with how you changed the C code 
around. I'll have to look at the assembly but I don't think it would 
have caused a problem for 64s.

Thanks,
Nick


Re: [PATCH v5 18/21] powerpc64: Add prefixed instructions to instruction data type

2020-04-06 Thread Alistair Popple
On Monday, 6 April 2020 8:42:57 PM AEST Jordan Niethe wrote:
> On Mon, 6 Apr 2020, 7:52 pm Alistair Popple,  wrote:
> > > diff --git a/arch/powerpc/include/asm/inst.h
> > > b/arch/powerpc/include/asm/inst.h index 70b37a35a91a..7e23e7146c66
> > > 100644
> > > --- a/arch/powerpc/include/asm/inst.h
> > > +++ b/arch/powerpc/include/asm/inst.h
> > > @@ -8,23 +8,67 @@
> > > 
> > >  struct ppc_inst {
> > >  	u32 val;
> > > +#ifdef __powerpc64__
> > > +	u32 suffix;
> > > +#endif /* __powerpc64__ */
> > >  } __packed;
> > > 
> > > -#define ppc_inst(x) ((struct ppc_inst){ .val = x })
> > > +static inline int ppc_inst_opcode(struct ppc_inst x)
> > > +{
> > > + return x.val >> 26;
> > > +}
> > > 
> > >  static inline u32 ppc_inst_val(struct ppc_inst x)
> > >  {
> > >  	return x.val;
> > >  }
> > > 
> > > -static inline bool ppc_inst_len(struct ppc_inst x)
> > > +#ifdef __powerpc64__
> > > +#define ppc_inst(x) ((struct ppc_inst){ .val = (x), .suffix = 0xff })
> > > +
> > > +#define ppc_inst_prefix(x, y) ((struct ppc_inst){ .val = (x), .suffix = (y) })
> > > +
> > > +static inline u32 ppc_inst_suffix(struct ppc_inst x)
> > >  {
> > > -	return sizeof(struct ppc_inst);
> > > +	return x.suffix;
> > >  }
> > > 
> > > -static inline int ppc_inst_opcode(struct ppc_inst x)
> > > +static inline bool ppc_inst_prefixed(struct ppc_inst x) {
> > > +	return ((ppc_inst_val(x) >> 26) == 1) && ppc_inst_suffix(x) != 0xff;
> > > +}
> > > +
> > > +static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
> > >  {
> > > -	return x.val >> 26;
> > > +	return ppc_inst_prefix(swab32(ppc_inst_val(x)),
> > > +			       swab32(ppc_inst_suffix(x)));
> > > +}
> > > +
> > > +static inline struct ppc_inst ppc_inst_read(const struct ppc_inst *ptr)
> > > +{
> > > + u32 val, suffix = 0xff;
> > > + val = *(u32 *)ptr;
> > > + if ((val >> 26) == 1)
> > > + suffix = *((u32 *)ptr + 1);
> > > + return ppc_inst_prefix(val, suffix);
> > > +}
> > > +
> > > +static inline void ppc_inst_write(struct ppc_inst *ptr, struct ppc_inst x)
> > > +{
> > > + if (ppc_inst_prefixed(x)) {
> > > + *(u32 *)ptr = x.val;
> > > + *((u32 *)ptr + 1) = x.suffix;
> > > + } else {
> > > + *(u32 *)ptr = x.val;
> > > + }
> > > +}
> > > +
> > > +#else
> > > +
> > > +#define ppc_inst(x) ((struct ppc_inst){ .val = x })
> > > +
> > > +static inline bool ppc_inst_prefixed(struct ppc_inst x)
> > > +{
> > > + return 0;
> > > 
> > >  }
> > >  
> > >  static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
> > > 
> > > @@ -32,14 +76,31 @@ static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
> > >  	return ppc_inst(swab32(ppc_inst_val(x)));
> > > 
> > >  }
> > > 
> > > +static inline u32 ppc_inst_val(struct ppc_inst x)
> > > +{
> > > + return x.val;
> > > +}
> > > +
> > > 
> > >  static inline struct ppc_inst ppc_inst_read(const struct ppc_inst *ptr)
> > >  {
> > >  	return *ptr;
> > >  }
> > > 
> > > +static inline void ppc_inst_write(struct ppc_inst *ptr, struct ppc_inst x)
> > > +{
> > > + *ptr = x;
> > > +}
> > > +
> > > +#endif /* __powerpc64__ */
> > > +
> > > 
> > >  static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y)
> > >  {
> > >  	return !memcmp(&x, &y, sizeof(struct ppc_inst));
> > >  }
> > 
> > Apologies for not picking this up earlier, I was hoping to get to the
> > bottom of the issue I was seeing before you sent out v5. However the above
> > definition of instruction equality does not seem correct because it does
> > not consider the case when an instruction is not prefixed - a non-prefixed
> > instruction should be considered equal if the first 32-bit opcode/value is
> > the same. Something like:
> > 
> > 	if (ppc_inst_prefixed(x) != ppc_inst_prefixed(y))
> > 		return false;
> > 	else if (ppc_inst_prefixed(x))
> > 		return !memcmp(&x, &y, sizeof(struct ppc_inst));
> > 	else
> > 		return x.val == y.val;
> > 
> > This was causing failures in ftrace_modify_code() as it would falsely
> > detect two non-prefixed instructions as being not equal due to
> > differences in the suffix.
> 
> Hm I was intending that non prefixed instructions would always have the
> suffix set to the same value. If that's not happening, something must be
> wrong with where the instructions are created.
> 

Ok, based on your comment I found the problem. Your last patch series defined
read_inst() in ftrace.c and ended that function with:

	ppc_inst_write(p, inst);
	return 0;

This is called from ftrace_modify_code(), where p is uninitialised. In the
last series the ppc_inst_write() function would only write/initialise 
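
A hedged reconstruction of the failure mode described above (the function
body is an assumption based on this thread, not the actual patch):

static int read_inst(struct ppc_inst *p, void *addr)
{
	struct ppc_inst inst = ppc_inst_read((struct ppc_inst *)addr);

	/*
	 * For a non-prefixed instruction, ppc_inst_write() stores only
	 * .val; if *p started out uninitialised, its .suffix keeps stack
	 * garbage, and a memcmp()-based ppc_inst_equal() can then report
	 * two identical 32-bit instructions as different.
	 */
	ppc_inst_write(p, inst);
	return 0;
}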

Re: [PATCH v4] powerpc/powernv: add NULL check after kzalloc in opal_add_one_export

2020-04-06 Thread Qiujun Huang
On Mon, Apr 6, 2020 at 6:30 PM Michael Ellerman  wrote:
>
> Qiujun Huang  writes:
> > Here needs a NULL check as kzalloc may fail returning NULL.
> >
> > Issue was found by coccinelle.
> > Generated by: scripts/coccinelle/null/kmerr.cocci
> >
> > Signed-off-by: Qiujun Huang 
> > Reviewed-by: Oliver O'Halloran 
> >
> > ---
>
> Thanks for putting up with all the review comments :)
>
> But I think this should actually be two patches now.
>
> The first patch should change the goto after
> of_property_read_u64_array() into a return and drop the redundant
> assignments.
>
> Then the second patch can add the NULL check for attr.

Get that, I'll separate them.

>
> cheers
>
> > v3->v4:
> >   Added the information about coccinelle script.
> >   Added change log.
> >   Added Oliver's Reviewed-by.
> > v2->v3:
> >   Removed redundant assignment to 'attr' and 'name'.
> > v1->v2:
> >   Just return -ENOMEM if attr is NULL.
> > ---
> >  arch/powerpc/platforms/powernv/opal.c | 9 ++---
> >  1 file changed, 6 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/powerpc/platforms/powernv/opal.c 
> > b/arch/powerpc/platforms/powernv/opal.c
> > index 2b3dfd0b6cdd..908d749bcef5 100644
> > --- a/arch/powerpc/platforms/powernv/opal.c
> > +++ b/arch/powerpc/platforms/powernv/opal.c
> > @@ -801,16 +801,19 @@ static ssize_t export_attr_read(struct file *fp, 
> > struct kobject *kobj,
> >  static int opal_add_one_export(struct kobject *parent, const char 
> > *export_name,
> >  struct device_node *np, const char *prop_name)
> >  {
> > - struct bin_attribute *attr = NULL;
> > - const char *name = NULL;
> > + struct bin_attribute *attr;
> > + const char *name;
> >   u64 vals[2];
> >   int rc;
> >
> >   rc = of_property_read_u64_array(np, prop_name, &vals[0], 2);
> >   if (rc)
> > - goto out;
> > + return rc;
> >
> >   attr = kzalloc(sizeof(*attr), GFP_KERNEL);
> > + if (!attr)
> > + return -ENOMEM;
> > +
> >   name = kstrdup(export_name, GFP_KERNEL);
> >   if (!name) {
> >   rc = -ENOMEM;
> > --
> > 2.17.1


Re: [PATCH v4] powerpc/powernv: add NULL check after kzalloc in opal_add_one_export

2020-04-06 Thread Markus Elfring
>>> Here needs a NULL check as kzalloc may fail returning NULL.

I find this wording potentially confusing.

* Such function calls will usually succeed to return a pointer.

* The desired memory allocation can fail.

* Please choose an imperative wording for the change description.


>>> Issue was found by coccinelle.

Please omit this line after the addition for the reference to the SmPL script.


>>> Generated by: scripts/coccinelle/null/kmerr.cocci
…
>>> Reviewed-by: Oliver O'Halloran 

I wonder about this tag because of requested changes for the shown patch 
approach.

I recommend to add the tag “Fixes”.

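For reference, a placeholder sketch of the tag format (the commit id and
subject here are stand-ins, not the real ones):

	Fixes: 123456789abc ("subject of the commit that introduced the issue")
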

>> Thanks for putting up with all the review comments :)

This seems to become challenging here.


>> But I think this should actually be two patches now.
…

> Get that, I'll separate them.

I wonder why it was not directly tried in this patch version.


>>> v3->v4:

I suggest to apply a shorter version numbering format (without an arrow).

Regards,
Markus


[PATCH 4/6] binfmt_elf_fdpic: remove the set_fs(KERNEL_DS) in elf_fdpic_core_dump

2020-04-06 Thread Christoph Hellwig
There is no logic in elf_fdpic_core_dump itself that uses uaccess
routines on kernel pointers; the file writes are nicely encapsulated in
dump_emit, which does its own set_fs.

Signed-off-by: Christoph Hellwig 
---
 fs/binfmt_elf_fdpic.c | 31 ---
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 240f3543..c62c17a5c34a 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1549,7 +1549,6 @@ static int elf_fdpic_core_dump(struct coredump_params 
*cprm)
 {
#define	NUM_NOTES	6
int has_dumped = 0;
-   mm_segment_t fs;
int segs;
int i;
struct vm_area_struct *vma;
@@ -1678,9 +1677,6 @@ static int elf_fdpic_core_dump(struct coredump_params 
*cprm)
  "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
 #endif
 
-   fs = get_fs();
-   set_fs(KERNEL_DS);
-
offset += sizeof(*elf); /* Elf header */
offset += segs * sizeof(struct elf_phdr);   /* Program headers */
 
@@ -1695,7 +1691,7 @@ static int elf_fdpic_core_dump(struct coredump_params 
*cprm)
 
phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL);
if (!phdr4note)
-   goto end_coredump;
+   goto cleanup;
 
fill_elf_note_phdr(phdr4note, sz, offset);
offset += sz;
@@ -1711,17 +1707,17 @@ static int elf_fdpic_core_dump(struct coredump_params 
*cprm)
if (e_phnum == PN_XNUM) {
shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL);
if (!shdr4extnum)
-   goto end_coredump;
+   goto cleanup;
fill_extnum_info(elf, shdr4extnum, e_shoff, segs);
}
 
offset = dataoff;
 
if (!dump_emit(cprm, elf, sizeof(*elf)))
-   goto end_coredump;
+   goto cleanup;
 
if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
-   goto end_coredump;
+   goto cleanup;
 
/* write program headers for segments dump */
for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
@@ -1745,16 +1741,16 @@ static int elf_fdpic_core_dump(struct coredump_params 
*cprm)
phdr.p_align = ELF_EXEC_PAGESIZE;
 
		if (!dump_emit(cprm, &phdr, sizeof(phdr)))
-   goto end_coredump;
+   goto cleanup;
}
 
if (!elf_core_write_extra_phdrs(cprm, offset))
-   goto end_coredump;
+   goto cleanup;
 
/* write out the notes section */
for (i = 0; i < numnote; i++)
if (!writenote(notes + i, cprm))
-   goto end_coredump;
+   goto cleanup;
 
/* write out the thread status notes section */
	list_for_each(t, &thread_list) {
@@ -1763,21 +1759,21 @@ static int elf_fdpic_core_dump(struct coredump_params 
*cprm)
 
for (i = 0; i < tmp->num_notes; i++)
			if (!writenote(&tmp->notes[i], cprm))
-   goto end_coredump;
+   goto cleanup;
}
 
if (!dump_skip(cprm, dataoff - cprm->pos))
-   goto end_coredump;
+   goto cleanup;
 
if (!elf_fdpic_dump_segments(cprm))
-   goto end_coredump;
+   goto cleanup;
 
if (!elf_core_write_extra_data(cprm))
-   goto end_coredump;
+   goto cleanup;
 
if (e_phnum == PN_XNUM) {
if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
-   goto end_coredump;
+   goto cleanup;
}
 
if (cprm->file->f_pos != offset) {
@@ -1787,9 +1783,6 @@ static int elf_fdpic_core_dump(struct coredump_params 
*cprm)
   cprm->file->f_pos, offset);
}
 
-end_coredump:
-   set_fs(fs);
-
 cleanup:
while (!list_empty(_list)) {
struct list_head *tmp = thread_list.next;
-- 
2.25.1



[PATCH v3 00/15] powerpc/64: machine check and system reset fixes

2020-04-06 Thread Nicholas Piggin
There's a bunch of problems we hit bringing up fwnmi sreset and testing
with mce injection on QEMU. Mostly pseries issues.

This series of fixes applies on top of next-test; the machine
check reconcile patch won't apply cleanly to previous kernels but
may want to be backported. We can do that after upstreaming.

This doesn't solve all known problems yet, but fwnmi machine check
and system reset injection in QEMU is significantly better. There
will be more to come but these should be ready for review now.

Thanks,
Nick

v3:
- Addressed comments and fixed compile bugs.
- Didn't convert to pr_err because it's an existing printk that was
  moved. A cleanup to convert printks could go afterward.

v2:
- Added a couple more fixes
- Review comments and tags
- Re-tested with some fixes to my qemu machine check injection patches


Nicholas Piggin (15):
  powerpc/64s/exception: Fix machine check no-loss idle wakeup
  powerpc/64s/exceptions: Fix in_mce accounting in unrecoverable path
  powerpc/64s/exceptions: Change irq reconcile for NMIs from reusing
_DAR to RESULT
  powerpc/64s/exceptions: machine check reconcile irq state
  powerpc/pseries/ras: avoid calling rtas_token in NMI paths
  powerpc/pseries/ras: FWNMI_VALID off by one
  powerpc/pseries/ras: fwnmi avoid modifying r3 in error case
  powerpc/pseries/ras: fwnmi sreset should not interlock
  powerpc/pseries: limit machine check stack to 4GB
  powerpc/pseries: machine check use rtas_call_unlocked with args on
stack
  powerpc/64s: machine check interrupt update NMI accounting
  powerpc: ftrace_enabled helper
  powerpc/64s: machine check do not trace real-mode handler
  powerpc/64s: system reset do not trace
  powerpc: make unrecoverable NMIs die instead of panic

 arch/powerpc/include/asm/firmware.h|  1 +
 arch/powerpc/include/asm/ftrace.h  | 14 ++
 arch/powerpc/kernel/exceptions-64s.S   | 47 +++-
 arch/powerpc/kernel/mce.c  | 16 ++-
 arch/powerpc/kernel/process.c  |  2 +-
 arch/powerpc/kernel/setup_64.c | 15 +--
 arch/powerpc/kernel/traps.c| 24 ---
 arch/powerpc/platforms/pseries/ras.c   | 60 +++---
 arch/powerpc/platforms/pseries/setup.c | 14 --
 9 files changed, 142 insertions(+), 51 deletions(-)

-- 
2.23.0



[PATCH v3 04/15] powerpc/64s/exceptions: machine check reconcile irq state

2020-04-06 Thread Nicholas Piggin
The pseries fwnmi machine check code trips the soft-irq checks in rtas_call
(after the previous patch to remove rtas_token from this call path).
Rather than play whack-a-mole with these and forever having fragile
code, it seems better to have the early machine check handler perform
the same kind of reconcile as the other NMI interrupts.

  WARNING: CPU: 0 PID: 493 at arch/powerpc/kernel/irq.c:343
  CPU: 0 PID: 493 Comm: a Tainted: GW
  NIP:  c001ed2c LR: c0042c40 CTR: 
  REGS: c001fffd38b0 TRAP: 0700   Tainted: GW
  MSR:  80021003   CR: 28000488  XER: 
  CFAR: c001ec90 IRQMASK: 0
  GPR00: c0043820 c001fffd3b40 c12ba300 
  GPR04: 48000488   deadbeef
  GPR08: 0080   1001
  GPR12:  c14a  
  GPR16:    
  GPR20:    
  GPR24:    
  GPR28:  0001 c1360810 
  NIP [c001ed2c] arch_local_irq_restore.part.0+0xac/0x100
  LR [c0042c40] unlock_rtas+0x30/0x90
  Call Trace:
  [c001fffd3b40] [c1360810] 0xc1360810 (unreliable)
  [c001fffd3b60] [c0043820] rtas_call+0x1c0/0x280
  [c001fffd3bb0] [c00dc328] fwnmi_release_errinfo+0x38/0x70
  [c001fffd3c10] [c00dcd8c] 
pseries_machine_check_realmode+0x1dc/0x540
  [c001fffd3cd0] [c003fe04] machine_check_early+0x54/0x70
  [c001fffd3d00] [c0008384] machine_check_early_common+0x134/0x1f0
  --- interrupt: 200 at 0x13f1307c8
  LR = 0x7fff888b8528
  Instruction dump:
  6000 7d2000a6 71298000 41820068 3922 7d210164 4b9c 6000
  6000 7d2000a6 71298000 4c820020 <0fe0> 4e800020 6000 6000

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index a42b73efb1a9..072772803b7c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1116,11 +1116,30 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
li  r10,MSR_RI
mtmsrd  r10,1
 
+   /*
+* Set IRQS_ALL_DISABLED and save PACAIRQHAPPENED (see
+* system_reset_common)
+*/
+   li  r10,IRQS_ALL_DISABLED
+   stb r10,PACAIRQSOFTMASK(r13)
+   lbz r10,PACAIRQHAPPENED(r13)
+   std r10,RESULT(r1)
+   ori r10,r10,PACA_IRQ_HARD_DIS
+   stb r10,PACAIRQHAPPENED(r13)
+
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_early
std r3,RESULT(r1)   /* Save result */
ld  r12,_MSR(r1)
 
+   /*
+* Restore soft mask settings.
+*/
+   ld  r10,RESULT(r1)
+   stb r10,PACAIRQHAPPENED(r13)
+   ld  r10,SOFTE(r1)
+   stb r10,PACAIRQSOFTMASK(r13)
+
 #ifdef CONFIG_PPC_P7_NAP
/*
 * Check if thread was in power saving mode. We come here when any
-- 
2.23.0



[PATCH v3 05/15] powerpc/pseries/ras: avoid calling rtas_token in NMI paths

2020-04-06 Thread Nicholas Piggin
In the interest of reducing code and possible failures in the
machine check and system reset paths, grab the "ibm,nmi-interlock"
token at init time.

Reviewed-by: Mahesh Salgaonkar 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/firmware.h|  1 +
 arch/powerpc/platforms/pseries/ras.c   |  2 +-
 arch/powerpc/platforms/pseries/setup.c | 14 ++
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h 
b/arch/powerpc/include/asm/firmware.h
index ca33f4ef6cb4..6003c2e533a0 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -128,6 +128,7 @@ extern void machine_check_fwnmi(void);
 
 /* This is true if we are using the firmware NMI handler (typically LPAR) */
 extern int fwnmi_active;
+extern int ibm_nmi_interlock_token;
 
 extern unsigned int __start___fw_ftr_fixup, __stop___fw_ftr_fixup;
 
diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index aa6208c8d4f0..972b95ebc867 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -458,7 +458,7 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct 
pt_regs *regs)
  */
 static void fwnmi_release_errinfo(void)
 {
-   int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
+   int ret = rtas_call(ibm_nmi_interlock_token, 0, 1, NULL);
if (ret != 0)
printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
 }
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index 0c8421dd01ab..dd234095ae4f 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -83,6 +83,7 @@ unsigned long CMO_PageSize = (ASM_CONST(1) << 
IOMMU_PAGE_SHIFT_4K);
 EXPORT_SYMBOL(CMO_PageSize);
 
 int fwnmi_active;  /* TRUE if an FWNMI handler is present */
+int ibm_nmi_interlock_token;
 
 static void pSeries_show_cpuinfo(struct seq_file *m)
 {
@@ -113,9 +114,14 @@ static void __init fwnmi_init(void)
struct slb_entry *slb_ptr;
size_t size;
 #endif
+   int ibm_nmi_register_token;
 
-   int ibm_nmi_register = rtas_token("ibm,nmi-register");
-   if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
+   ibm_nmi_register_token = rtas_token("ibm,nmi-register");
+   if (ibm_nmi_register_token == RTAS_UNKNOWN_SERVICE)
+   return;
+
+   ibm_nmi_interlock_token = rtas_token("ibm,nmi-interlock");
+   if (WARN_ON(ibm_nmi_interlock_token == RTAS_UNKNOWN_SERVICE))
return;
 
/* If the kernel's not linked at zero we point the firmware at low
@@ -123,8 +129,8 @@ static void __init fwnmi_init(void)
system_reset_addr  = __pa(system_reset_fwnmi) - PHYSICAL_START;
machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START;
 
-   if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr,
-   machine_check_addr))
+   if (0 == rtas_call(ibm_nmi_register_token, 2, 1, NULL,
+  system_reset_addr, machine_check_addr))
fwnmi_active = 1;
 
/*
-- 
2.23.0



[PATCH v3 03/15] powerpc/64s/exceptions: Change irq reconcile for NMIs from reusing _DAR to RESULT

2020-04-06 Thread Nicholas Piggin
A spare interrupt stack slot is needed to save irq state when
reconciling NMIs (sreset and decrementer soft-nmi). _DAR is used
for this, but we want to reconcile machine checks as well, which
do use _DAR. Switch to using RESULT instead, as it's used by
system calls.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 3322000316ab..a42b73efb1a9 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -939,13 +939,13 @@ EXC_COMMON_BEGIN(system_reset_common)
 * the right thing. We do not want to reconcile because that goes
 * through irq tracing which we don't want in NMI.
 *
-* Save PACAIRQHAPPENED to _DAR (otherwise unused), and set HARD_DIS
+* Save PACAIRQHAPPENED to RESULT (otherwise unused), and set HARD_DIS
 * as we are running with MSR[EE]=0.
 */
li  r10,IRQS_ALL_DISABLED
stb r10,PACAIRQSOFTMASK(r13)
lbz r10,PACAIRQHAPPENED(r13)
-   std r10,_DAR(r1)
+   std r10,RESULT(r1)
ori r10,r10,PACA_IRQ_HARD_DIS
stb r10,PACAIRQHAPPENED(r13)
 
@@ -966,7 +966,7 @@ EXC_COMMON_BEGIN(system_reset_common)
/*
 * Restore soft mask settings.
 */
-   ld  r10,_DAR(r1)
+   ld  r10,RESULT(r1)
stb r10,PACAIRQHAPPENED(r13)
ld  r10,SOFTE(r1)
stb r10,PACAIRQSOFTMASK(r13)
@@ -2743,7 +2743,7 @@ EXC_COMMON_BEGIN(soft_nmi_common)
li  r10,IRQS_ALL_DISABLED
stb r10,PACAIRQSOFTMASK(r13)
lbz r10,PACAIRQHAPPENED(r13)
-   std r10,_DAR(r1)
+   std r10,RESULT(r1)
ori r10,r10,PACA_IRQ_HARD_DIS
stb r10,PACAIRQHAPPENED(r13)
 
@@ -2757,7 +2757,7 @@ EXC_COMMON_BEGIN(soft_nmi_common)
/*
 * Restore soft mask settings.
 */
-   ld  r10,_DAR(r1)
+   ld  r10,RESULT(r1)
stb r10,PACAIRQHAPPENED(r13)
ld  r10,SOFTE(r1)
stb r10,PACAIRQSOFTMASK(r13)
-- 
2.23.0



[PATCH v3 06/15] powerpc/pseries/ras: FWNMI_VALID off by one

2020-04-06 Thread Nicholas Piggin
This was discovered while developing qemu fwnmi sreset support. The
off-by-one bug means the last 16 bytes of the rtas area cannot be
used for a 16-byte save area.

It's not a serious bug, and the QEMU implementation has to retain a
workaround for old kernels regardless, but it's good to tighten the
check.
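
[ Editor's note: a worked instance of the boundary. With the 4kB page at
  0x7000, a save area starting at A = 0x7ff0 occupies bytes 0x7ff0..0x7fff
  and still fits inside the page, yet the old test ((A) < 0x7ff0) rejects
  exactly that last valid start address. The new test ((A) <= 0x8000 - 16)
  accepts it; the rtas.base range is corrected the same way. ]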

Acked-by: Mahesh Salgaonkar 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/ras.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 972b95ebc867..ed43c2e4d4ee 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -395,10 +395,11 @@ static irqreturn_t ras_error_interrupt(int irq, void 
*dev_id)
 /*
  * Some versions of FWNMI place the buffer inside the 4kB page starting at
  * 0x7000. Other versions place it inside the rtas buffer. We check both.
+ * Minimum size of the buffer is 16 bytes.
  */
 #define VALID_FWNMI_BUFFER(A) \
-   ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
-   (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
+   ((((A) >= 0x7000) && ((A) <= 0x8000 - 16)) || \
+   (((A) >= rtas.base) && ((A) <= (rtas.base + rtas.size - 16))))
 
 static inline struct rtas_error_log *fwnmi_get_errlog(void)
 {
-- 
2.23.0



[PATCH v3 07/15] powerpc/pseries/ras: fwnmi avoid modifying r3 in error case

2020-04-06 Thread Nicholas Piggin
If there is some error with the fwnmi save area, r3 has already been
modified by the time we find out, which doesn't help with debugging.

Only update r3 when restoring the saved value, after the save area has
been validated.
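
[ Editor's note: the shape of the fix, condensed from the hunk below --
  validate a local copy first, and only write caller-visible state once
  it is known good:

	savep_ra = regs->gpr[3] & ~(0x3UL << 62);	/* local copy   */
	if (!VALID_FWNMI_BUFFER(savep_ra))
		return NULL;				/* r3 untouched */
	savep = __va(savep_ra);
	regs->gpr[3] = be64_to_cpu(savep[0]);		/* only now restore */

  Nothing here is new API; it restates the diff for readability. ]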

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/ras.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index ed43c2e4d4ee..2c60e2be1bc5 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -423,18 +423,19 @@ static inline struct rtas_error_log 
*fwnmi_get_errlog(void)
  */
 static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
 {
+   unsigned long savep_ra;
unsigned long *savep;
struct rtas_error_log *h;
 
/* Mask top two bits */
-   regs->gpr[3] &= ~(0x3UL << 62);
+   savep_ra = regs->gpr[3] & ~(0x3UL << 62);
 
-   if (!VALID_FWNMI_BUFFER(regs->gpr[3])) {
+   if (!VALID_FWNMI_BUFFER(savep_ra)) {
printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
return NULL;
}
 
-   savep = __va(regs->gpr[3]);
+   savep = __va(savep_ra);
regs->gpr[3] = be64_to_cpu(savep[0]);   /* restore original r3 */
 
	h = (struct rtas_error_log *)&savep[1];
-- 
2.23.0



[PATCH v3 08/15] powerpc/pseries/ras: fwnmi sreset should not interlock

2020-04-06 Thread Nicholas Piggin
PAPR does not specify that fwnmi sreset should be interlocked, and
PowerVM (and therefore now QEMU) do not require it.

These "ibm,nmi-interlock" calls are ignored by firmware, but there
is a possibility that the sreset could have interrupted a machine
check and release the machine check's interlock too early, corrupting
it if another machine check came in.

This is an extremely rare case, but it should be fixed for clarity
and reducing the code executed in the sreset path. Firmware also
does not provide error information for the sreset case to look at, so
remove that comment.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/ras.c | 46 +++-
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 2c60e2be1bc5..d65bc38bcb8f 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -406,6 +406,20 @@ static inline struct rtas_error_log *fwnmi_get_errlog(void)
return (struct rtas_error_log *)local_paca->mce_data_buf;
 }
 
+static unsigned long *fwnmi_get_savep(struct pt_regs *regs)
+{
+   unsigned long savep_ra;
+
+   /* Mask top two bits */
+   savep_ra = regs->gpr[3] & ~(0x3UL << 62);
+   if (!VALID_FWNMI_BUFFER(savep_ra)) {
+   printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
+   return NULL;
+   }
+
+   return __va(savep_ra);
+}
+
 /*
  * Get the error information for errors coming through the
  * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
@@ -423,20 +437,14 @@ static inline struct rtas_error_log 
*fwnmi_get_errlog(void)
  */
 static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
 {
-   unsigned long savep_ra;
unsigned long *savep;
struct rtas_error_log *h;
 
-   /* Mask top two bits */
-   savep_ra = regs->gpr[3] & ~(0x3UL << 62);
-
-   if (!VALID_FWNMI_BUFFER(savep_ra)) {
-   printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
+   savep = fwnmi_get_savep(regs);
+   if (!savep)
return NULL;
-   }
 
-   savep = __va(savep_ra);
-   regs->gpr[3] = be64_to_cpu(savep[0]);   /* restore original r3 */
+   regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */
 
	h = (struct rtas_error_log *)&savep[1];
/* Use the per cpu buffer from paca to store rtas error log */
@@ -483,11 +491,21 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
 #endif
 
if (fwnmi_active) {
-   struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs);
-   if (errhdr) {
-   /* XXX Should look at FWNMI information */
-   }
-   fwnmi_release_errinfo();
+   unsigned long *savep;
+
+   /*
+* Firmware (PowerVM and KVM) saves r3 to a save area like
+* machine check, which is not exactly what PAPR (2.9)
+* suggests but there is no way to detect otherwise, so this
+* is the interface now.
+*
+* System resets do not save any error log or require an
+* "ibm,nmi-interlock" rtas call to release.
+*/
+
+   savep = fwnmi_get_savep(regs);
+   if (savep)
+   regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */
}
 
if (smp_handle_nmi_ipi(regs))
-- 
2.23.0



[PATCH v3 09/15] powerpc/pseries: limit machine check stack to 4GB

2020-04-06 Thread Nicholas Piggin
This allows rtas_args to be put on the machine check stack, which
avoids a lot of complications with re-entrancy deadlocks.
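
[ Editor's note: the 4GB figure comes from the firmware interface -- rtas
  args are passed by real address and, as the comment added below says,
  "have to be under 4GB". Bounding the machine check stack allocation to
  SZ_4G means an on-stack argument block is automatically a valid RTAS
  address:

	struct rtas_args args;	/* on the MC emergency stack, so below 4GB */
	rtas_call_unlocked(&args, token, 0, 1, NULL);

  which is exactly what the next patch in the series does. ]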

Reviewed-by: Mahesh Salgaonkar 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/setup_64.c | 15 ---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 438a9befce41..defe05b6b7a9 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -709,7 +709,7 @@ void __init exc_lvl_early_init(void)
  */
 void __init emergency_stack_init(void)
 {
-   u64 limit;
+   u64 limit, mce_limit;
unsigned int i;
 
/*
@@ -726,7 +726,16 @@ void __init emergency_stack_init(void)
 * initialized in kernel/irq.c. These are initialized here in order
 * to have emergency stacks available as early as possible.
 */
-   limit = min(ppc64_bolted_size(), ppc64_rma_size);
+   limit = mce_limit = min(ppc64_bolted_size(), ppc64_rma_size);
+
+   /*
+* Machine check on pseries calls rtas, but can't use the static
+* rtas_args due to a machine check hitting while the lock is held.
+* rtas args have to be under 4GB, so the machine check stack is
+* limited to 4GB so args can be put on stack.
+*/
+   if (firmware_has_feature(FW_FEATURE_LPAR) && mce_limit > SZ_4G)
+   mce_limit = SZ_4G;
 
for_each_possible_cpu(i) {
paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + 
THREAD_SIZE;
@@ -736,7 +745,7 @@ void __init emergency_stack_init(void)
paca_ptrs[i]->nmi_emergency_sp = alloc_stack(limit, i) + 
THREAD_SIZE;
 
/* emergency stack for machine check exception handling. */
-   paca_ptrs[i]->mc_emergency_sp = alloc_stack(limit, i) + 
THREAD_SIZE;
+   paca_ptrs[i]->mc_emergency_sp = alloc_stack(mce_limit, i) + 
THREAD_SIZE;
 #endif
}
 }
-- 
2.23.0



[PATCH v3 15/15] powerpc: make unrecoverable NMIs die instead of panic

2020-04-06 Thread Nicholas Piggin
System Reset and Machine Check interrupts that are not recoverable due
to being nested or interrupting when RI=0 currently panic. This is
not necessary, and can often just kill the current context and recover.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/traps.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 1beae89bb871..afed3de33a9a 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -513,11 +513,11 @@ void system_reset_exception(struct pt_regs *regs)
 #ifdef CONFIG_PPC_BOOK3S_64
BUG_ON(get_paca()->in_nmi == 0);
if (get_paca()->in_nmi > 1)
-   nmi_panic(regs, "Unrecoverable nested System Reset");
+   die("Unrecoverable nested System Reset", regs, SIGABRT);
 #endif
/* Must die if the interrupt is not recoverable */
if (!(regs->msr & MSR_RI))
-   nmi_panic(regs, "Unrecoverable System Reset");
+   die("Unrecoverable System Reset", regs, SIGABRT);
 
if (saved_hsrrs) {
mtspr(SPRN_HSRR0, hsrr0);
@@ -858,7 +858,7 @@ void machine_check_exception(struct pt_regs *regs)
 bail:
/* Must die if the interrupt is not recoverable */
if (!(regs->msr & MSR_RI))
-   nmi_panic(regs, "Unrecoverable Machine check");
+   die("Unrecoverable Machine check", regs, SIGBUS);
 }
 
 void SMIException(struct pt_regs *regs)
-- 
2.23.0



Re: [PATCH v3 08/15] powerpc/pseries/ras: fwnmi sreset should not interlock

2020-04-06 Thread Christophe Leroy




On 07/04/2020 at 07:16, Nicholas Piggin wrote:

PAPR does not specify that fwnmi sreset should be interlocked, and
PowerVM (and therefore now QEMU) do not require it.

These "ibm,nmi-interlock" calls are ignored by firmware, but there
is a possibility that the sreset could have interrupted a machine
check and release the machine check's interlock too early, corrupting
it if another machine check came in.

This is an extremely rare case, but it should be fixed for clarity
and reducing the code executed in the sreset path. Firmware also
does not provide error information for the sreset case to look at, so
remove that comment.

Signed-off-by: Nicholas Piggin 
---
  arch/powerpc/platforms/pseries/ras.c | 46 +++-
  1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 2c60e2be1bc5..d65bc38bcb8f 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -406,6 +406,20 @@ static inline struct rtas_error_log *fwnmi_get_errlog(void)
return (struct rtas_error_log *)local_paca->mce_data_buf;
  }
  
+static unsigned long *fwnmi_get_savep(struct pt_regs *regs)

+{
+   unsigned long savep_ra;
+
+   /* Mask top two bits */
+   savep_ra = regs->gpr[3] & ~(0x3UL << 62);
+   if (!VALID_FWNMI_BUFFER(savep_ra)) {
+   printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);


pr_err() ?


+   return NULL;
+   }
+
+   return __va(savep_ra);
+}
+
  /*
   * Get the error information for errors coming through the
   * FWNMI vectors.  The pt_regs' r3 will be updated to reflect
@@ -423,20 +437,14 @@ static inline struct rtas_error_log 
*fwnmi_get_errlog(void)
   */
  static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
  {
-   unsigned long savep_ra;
unsigned long *savep;
struct rtas_error_log *h;
  
-	/* Mask top two bits */

-   savep_ra = regs->gpr[3] & ~(0x3UL << 62);
-
-   if (!VALID_FWNMI_BUFFER(savep_ra)) {
-   printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
+   savep = fwnmi_get_savep(regs);
+   if (!savep)
return NULL;
-   }
  
-	savep = __va(savep_ra);

-   regs->gpr[3] = be64_to_cpu(savep[0]);/* restore original r3 */
+   regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */


Not sure the above line change is needed.

  
	h = (struct rtas_error_log *)&savep[1];

/* Use the per cpu buffer from paca to store rtas error log */
@@ -483,11 +491,21 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
  #endif
  
  	if (fwnmi_active) {

-   struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs);
-   if (errhdr) {
-   /* XXX Should look at FWNMI information */
-   }
-   fwnmi_release_errinfo();
+   unsigned long *savep;
+
+   /*
+* Firmware (PowerVM and KVM) saves r3 to a save area like
+* machine check, which is not exactly what PAPR (2.9)
+* suggests but there is no way to detect otherwise, so this
+* is the interface now.
+*
+* System resets do not save any error log or require an
+* "ibm,nmi-interlock" rtas call to release.
+*/
+
+   savep = fwnmi_get_savep(regs);
+   if (savep)
+   regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */
}
  
  	if (smp_handle_nmi_ipi(regs))




Christophe


Re: [PATCH v3 09/15] powerpc/pseries: limit machine check stack to 4GB

2020-04-06 Thread Christophe Leroy




On 07/04/2020 at 07:16, Nicholas Piggin wrote:

This allows rtas_args to be put on the machine check stack, which
avoids a lot of complications with re-entrancy deadlocks.

Reviewed-by: Mahesh Salgaonkar 
Signed-off-by: Nicholas Piggin 


Reviewed-by: Christophe Leroy 


---
  arch/powerpc/kernel/setup_64.c | 15 ---
  1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 438a9befce41..defe05b6b7a9 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -709,7 +709,7 @@ void __init exc_lvl_early_init(void)
   */
  void __init emergency_stack_init(void)
  {
-   u64 limit;
+   u64 limit, mce_limit;
unsigned int i;
  
  	/*

@@ -726,7 +726,16 @@ void __init emergency_stack_init(void)
 * initialized in kernel/irq.c. These are initialized here in order
 * to have emergency stacks available as early as possible.
 */
-   limit = min(ppc64_bolted_size(), ppc64_rma_size);
+   limit = mce_limit = min(ppc64_bolted_size(), ppc64_rma_size);
+
+   /*
+* Machine check on pseries calls rtas, but can't use the static
+* rtas_args due to a machine check hitting while the lock is held.
+* rtas args have to be under 4GB, so the machine check stack is
+* limited to 4GB so args can be put on stack.
+*/
+   if (firmware_has_feature(FW_FEATURE_LPAR) && mce_limit > SZ_4G)
+   mce_limit = SZ_4G;
  
  	for_each_possible_cpu(i) {

paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + 
THREAD_SIZE;
@@ -736,7 +745,7 @@ void __init emergency_stack_init(void)
paca_ptrs[i]->nmi_emergency_sp = alloc_stack(limit, i) + 
THREAD_SIZE;
  
  		/* emergency stack for machine check exception handling. */

-   paca_ptrs[i]->mc_emergency_sp = alloc_stack(limit, i) + 
THREAD_SIZE;
+   paca_ptrs[i]->mc_emergency_sp = alloc_stack(mce_limit, i) + 
THREAD_SIZE;
  #endif
}
  }



Re: [PATCH v3 05/15] powerpc/pseries/ras: avoid calling rtas_token in NMI paths

2020-04-06 Thread Christophe Leroy




On 07/04/2020 at 07:16, Nicholas Piggin wrote:

In the interest of reducing code and possible failures in the
machine check and system reset paths, grab the "ibm,nmi-interlock"
token at init time.

Reviewed-by: Mahesh Salgaonkar 
Signed-off-by: Nicholas Piggin 


Reviewed-by: Christophe Leroy 


---
  arch/powerpc/include/asm/firmware.h|  1 +
  arch/powerpc/platforms/pseries/ras.c   |  2 +-
  arch/powerpc/platforms/pseries/setup.c | 14 ++
  3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h 
b/arch/powerpc/include/asm/firmware.h
index ca33f4ef6cb4..6003c2e533a0 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -128,6 +128,7 @@ extern void machine_check_fwnmi(void);
  
  /* This is true if we are using the firmware NMI handler (typically LPAR) */

  extern int fwnmi_active;
+extern int ibm_nmi_interlock_token;
  
  extern unsigned int __start___fw_ftr_fixup, __stop___fw_ftr_fixup;
  
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c

index aa6208c8d4f0..972b95ebc867 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -458,7 +458,7 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct 
pt_regs *regs)
   */
  static void fwnmi_release_errinfo(void)
  {
-   int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
+   int ret = rtas_call(ibm_nmi_interlock_token, 0, 1, NULL);
if (ret != 0)
printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
  }
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index 0c8421dd01ab..dd234095ae4f 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -83,6 +83,7 @@ unsigned long CMO_PageSize = (ASM_CONST(1) << 
IOMMU_PAGE_SHIFT_4K);
  EXPORT_SYMBOL(CMO_PageSize);
  
  int fwnmi_active;  /* TRUE if an FWNMI handler is present */

+int ibm_nmi_interlock_token;
  
  static void pSeries_show_cpuinfo(struct seq_file *m)

  {
@@ -113,9 +114,14 @@ static void __init fwnmi_init(void)
struct slb_entry *slb_ptr;
size_t size;
  #endif
+   int ibm_nmi_register_token;
  
-	int ibm_nmi_register = rtas_token("ibm,nmi-register");

-   if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
+   ibm_nmi_register_token = rtas_token("ibm,nmi-register");
+   if (ibm_nmi_register_token == RTAS_UNKNOWN_SERVICE)
+   return;
+
+   ibm_nmi_interlock_token = rtas_token("ibm,nmi-interlock");
+   if (WARN_ON(ibm_nmi_interlock_token == RTAS_UNKNOWN_SERVICE))
return;
  
  	/* If the kernel's not linked at zero we point the firmware at low

@@ -123,8 +129,8 @@ static void __init fwnmi_init(void)
system_reset_addr  = __pa(system_reset_fwnmi) - PHYSICAL_START;
machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START;
  
-	if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr,

-   machine_check_addr))
+   if (0 == rtas_call(ibm_nmi_register_token, 2, 1, NULL,
+  system_reset_addr, machine_check_addr))
fwnmi_active = 1;
  
  	/*




[PATCH v3 02/15] powerpc/64s/exceptions: Fix in_mce accounting in unrecoverable path

2020-04-06 Thread Nicholas Piggin
Acked-by: Mahesh Salgaonkar 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index bbf3109c5cba..3322000316ab 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1267,6 +1267,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
andcr10,r10,r3
mtmsrd  r10
 
+   lhz r12,PACA_IN_MCE(r13)
+   subir12,r12,1
+   sth r12,PACA_IN_MCE(r13)
+
/* Invoke machine_check_exception to print MCE event and panic. */
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_exception
-- 
2.23.0



[PATCH v3 01/15] powerpc/64s/exception: Fix machine check no-loss idle wakeup

2020-04-06 Thread Nicholas Piggin
The architecture allows for machine check exceptions to cause idle
wakeups which resume at the 0x200 address which has to return via
the idle wakeup code, but the early machine check handler is run
first.

The case of a no state-loss sleep is broken because the early
handler uses non-volatile register r1, which is needed for the wakeup
protocol, but it is not restored.

Fix this by loading r1 from the MCE exception frame before returning
to the idle wakeup code. Also update the comment which has become
stale since the idle rewrite in C.

Fixes: 10d91611f426d ("powerpc/64s: Reimplement book3s idle code in C")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 728ccb0f560c..bbf3109c5cba 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1224,17 +1224,19 @@ EXC_COMMON_BEGIN(machine_check_idle_common)
bl  machine_check_queue_event
 
/*
-* We have not used any non-volatile GPRs here, and as a rule
-* most exception code including machine check does not.
-* Therefore PACA_NAPSTATELOST does not need to be set. Idle
-* wakeup will restore volatile registers.
+* GPR-loss wakeups are relatively straightforward, because the
+* idle sleep code has saved all non-volatile registers on its
+* own stack, and r1 in PACAR1.
 *
-* Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
+* For no-loss wakeups the r1 and lr registers used by the
+* early machine check handler have to be restored first. r2 is
+* the kernel TOC, so no need to restore it.
 *
 * Then decrement MCE nesting after finishing with the stack.
 */
ld  r3,_MSR(r1)
ld  r4,_LINK(r1)
+   ld  r1,GPR1(r1)
 
lhz r11,PACA_IN_MCE(r13)
subir11,r11,1
@@ -1243,7 +1245,7 @@ EXC_COMMON_BEGIN(machine_check_idle_common)
mtlrr4
rlwinm  r10,r3,47-31,30,31
cmpwi   cr1,r10,2
-   bltlr   cr1 /* no state loss, return to idle caller */
+   bltlr   cr1 /* no state loss, return to idle caller with r3=SRR1 */
b   idle_return_gpr_loss
 #endif
 
-- 
2.23.0



[PATCH v3 10/15] powerpc/pseries: machine check use rtas_call_unlocked with args on stack

2020-04-06 Thread Nicholas Piggin
With the previous patch, machine checks can use rtas_call_unlocked
which avoids the rtas spinlock which would deadlock if a machine
check hits while making an rtas call.

This also avoids the complex rtas error logging which has more rtas calls
and includes kmalloc (which can return memory beyond RMA, which would
also crash).
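
[ Editor's note: the deadlock being avoided, spelled out. rtas_call()
  serializes on a global lock around the shared static rtas_args buffer:

	CPU takes the rtas lock for an ordinary rtas_call()
	  -> machine check hits while the lock is held
	  -> the MCE handler itself calls rtas_call()
	  -> spins on the same lock on the same CPU, forever.

  rtas_call_unlocked() with a private on-stack rtas_args (possible since
  the previous patch bounded the MC stack below 4GB) needs neither the
  lock nor the shared buffer. ]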

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/ras.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index d65bc38bcb8f..4fd138ae26fd 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -468,7 +468,15 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct 
pt_regs *regs)
  */
 static void fwnmi_release_errinfo(void)
 {
-   int ret = rtas_call(ibm_nmi_interlock_token, 0, 1, NULL);
+   struct rtas_args rtas_args;
+   int ret;
+
+   /*
+* On pseries, the machine check stack is limited to under 4GB, so
+* args can be on-stack.
+*/
+   rtas_call_unlocked(&rtas_args, ibm_nmi_interlock_token, 0, 1, NULL);
+   ret = be32_to_cpu(rtas_args.rets[0]);
if (ret != 0)
printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
 }
-- 
2.23.0



[PATCH v3 13/15] powerpc/64s: machine check do not trace real-mode handler

2020-04-06 Thread Nicholas Piggin
Rather than notrace annotations throughout a significant part of the
machine check code across kernel/ pseries/ and powernv/ which can
easily be broken and is infrequently tested, use paca->ftrace_enabled
to blanket-disable tracing of the real-mode non-maskable handler.

Acked-by: Naveen N. Rao 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/mce.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index be7e3f92a7b5..fd90c0eda229 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -571,10 +572,14 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info);
  *
  * regs->nip and regs->msr contains srr0 and ssr1.
  */
-long machine_check_early(struct pt_regs *regs)
+long notrace machine_check_early(struct pt_regs *regs)
 {
long handled = 0;
bool nested = in_nmi();
+   u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
+
+   this_cpu_set_ftrace_enabled(0);
+
if (!nested)
nmi_enter();
 
@@ -589,6 +594,8 @@ long machine_check_early(struct pt_regs *regs)
if (!nested)
nmi_exit();
 
+   this_cpu_set_ftrace_enabled(ftrace_enabled);
+
return handled;
 }
 
-- 
2.23.0



[PATCH v3 11/15] powerpc/64s: machine check interrupt update NMI accounting

2020-04-06 Thread Nicholas Piggin
machine_check_early is taken as an NMI, so nmi_enter is used there.
machine_check_exception is no longer taken as an NMI (it's invoked
via irq_work in the case a machine check hits in kernel mode), so
remove the nmi_enter from that case.

In NMI context, hash faults don't try to refill the hash table, which
can lead to crashes accessing non-pinned kernel pages. System reset
still has this potential problem.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/mce.c |  7 +++
 arch/powerpc/kernel/process.c |  2 +-
 arch/powerpc/kernel/traps.c   | 13 +
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 8077b5fb18a7..be7e3f92a7b5 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -574,6 +574,9 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 long machine_check_early(struct pt_regs *regs)
 {
long handled = 0;
+   bool nested = in_nmi();
+   if (!nested)
+   nmi_enter();
 
hv_nmi_check_nonrecoverable(regs);
 
@@ -582,6 +585,10 @@ long machine_check_early(struct pt_regs *regs)
 */
if (ppc_md.machine_check_early)
handled = ppc_md.machine_check_early(regs);
+
+   if (!nested)
+   nmi_exit();
+
return handled;
 }
 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9c21288f8645..44410dd3029f 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1421,7 +1421,7 @@ void show_regs(struct pt_regs * regs)
pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr);
 #endif
 #ifdef CONFIG_PPC64
-   pr_cont("IRQMASK: %lx ", regs->softe);
+   pr_cont("IRQMASK: %lx IN_NMI:%d IN_MCE:%d", regs->softe, 
(int)get_paca()->in_nmi, (int)get_paca()->in_mce);
 #endif
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (MSR_TM_ACTIVE(regs->msr))
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 3fca22276bb1..9f221772eb73 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -823,9 +823,6 @@ int machine_check_generic(struct pt_regs *regs)
 void machine_check_exception(struct pt_regs *regs)
 {
int recover = 0;
-   bool nested = in_nmi();
-   if (!nested)
-   nmi_enter();
 
__this_cpu_inc(irq_stat.mce_exceptions);
 
@@ -851,20 +848,12 @@ void machine_check_exception(struct pt_regs *regs)
if (check_io_access(regs))
goto bail;
 
-   if (!nested)
-   nmi_exit();
-
die("Machine check", regs, SIGBUS);
 
+bail:
/* Must die if the interrupt is not recoverable */
if (!(regs->msr & MSR_RI))
nmi_panic(regs, "Unrecoverable Machine check");
-
-   return;
-
-bail:
-   if (!nested)
-   nmi_exit();
 }
 
 void SMIException(struct pt_regs *regs)
-- 
2.23.0



[PATCH v3 12/15] powerpc: ftrace_enabled helper

2020-04-06 Thread Nicholas Piggin
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/ftrace.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/ftrace.h 
b/arch/powerpc/include/asm/ftrace.h
index f54a08a2cd70..bc76970b6ee5 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -108,9 +108,23 @@ static inline void this_cpu_enable_ftrace(void)
 {
get_paca()->ftrace_enabled = 1;
 }
+
+/* Disable ftrace on this CPU if possible (may not be implemented) */
+static inline void this_cpu_set_ftrace_enabled(u8 ftrace_enabled)
+{
+   get_paca()->ftrace_enabled = ftrace_enabled;
+}
+
+static inline u8 this_cpu_get_ftrace_enabled(void)
+{
+   return get_paca()->ftrace_enabled;
+}
+
 #else /* CONFIG_PPC64 */
 static inline void this_cpu_disable_ftrace(void) { }
 static inline void this_cpu_enable_ftrace(void) { }
+static inline void this_cpu_set_ftrace_enabled(u8 ftrace_enabled) { }
+static inline u8 this_cpu_get_ftrace_enabled(void) { return 1; }
 #endif /* CONFIG_PPC64 */
 #endif /* !__ASSEMBLY__ */
 
-- 
2.23.0
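
[ Editor's note: the commit message is empty, so a usage sketch may help.
  This is the save/disable/restore pattern the next two patches in the
  series apply with these helpers:

	u8 ftrace_enabled = this_cpu_get_ftrace_enabled();

	this_cpu_set_ftrace_enabled(0);
	/* ... real-mode / NMI work that must not be traced ... */
	this_cpu_set_ftrace_enabled(ftrace_enabled);

  On CONFIG_PPC64 this toggles paca->ftrace_enabled; on other configs the
  helpers compile away to nothing. ]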



[PATCH v3 14/15] powerpc/64s: system reset do not trace

2020-04-06 Thread Nicholas Piggin
Similarly to the previous patch, do not trace system reset. This code
is used when there is a crash or hang, and tracing disturbs the system
more and has been known to crash in the crash handling path.

Acked-by: Naveen N. Rao 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/traps.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 9f221772eb73..1beae89bb871 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -443,6 +443,9 @@ void system_reset_exception(struct pt_regs *regs)
unsigned long hsrr0, hsrr1;
bool nested = in_nmi();
bool saved_hsrrs = false;
+   u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
+
+   this_cpu_set_ftrace_enabled(0);
 
/*
 * Avoid crashes in case of nested NMI exceptions. Recoverability
@@ -524,6 +527,8 @@ void system_reset_exception(struct pt_regs *regs)
if (!nested)
nmi_exit();
 
+   this_cpu_set_ftrace_enabled(ftrace_enabled);
+
/* What should we do here? We could issue a shutdown or hard reset. */
 }
 
-- 
2.23.0



Re: [PATCH] powerpcs: perf: consolidate perf_callchain_user_64 and perf_callchain_user_32

2020-04-06 Thread Christophe Leroy




On 06/04/2020 at 23:00, Michal Suchanek wrote:

perf_callchain_user_64 and perf_callchain_user_32 are nearly identical.
Consolidate into one function with thin wrappers.

Suggested-by: Nicholas Piggin 
Signed-off-by: Michal Suchanek 
---
  arch/powerpc/perf/callchain.h| 24 +++-
  arch/powerpc/perf/callchain_32.c | 21 ++---
  arch/powerpc/perf/callchain_64.c | 14 --
  3 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/perf/callchain.h b/arch/powerpc/perf/callchain.h
index 7a2cb9e1181a..7540bb71cb60 100644
--- a/arch/powerpc/perf/callchain.h
+++ b/arch/powerpc/perf/callchain.h
@@ -2,7 +2,7 @@
  #ifndef _POWERPC_PERF_CALLCHAIN_H
  #define _POWERPC_PERF_CALLCHAIN_H
  
-int read_user_stack_slow(void __user *ptr, void *buf, int nb);

+int read_user_stack_slow(const void __user *ptr, void *buf, int nb);


Does the constification of ptr has to be in this patch ?
Wouldn't it be better to have it as a separate patch ?


  void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
struct pt_regs *regs);
  void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
@@ -16,4 +16,26 @@ static inline bool invalid_user_sp(unsigned long sp)
return (!sp || (sp & mask) || (sp > top));
  }
  
+/*

+ * On 32-bit we just access the address and let hash_page create a
+ * HPTE if necessary, so there is no need to fall back to reading
+ * the page tables.  Since this is called at interrupt level,
+ * do_page_fault() won't treat a DSI as a page fault.
+ */
+static inline int __read_user_stack(const void __user *ptr, void *ret,
+   size_t size)
+{
+   int rc;
+
+   if ((unsigned long)ptr > TASK_SIZE - size ||
+   ((unsigned long)ptr & (size - 1)))
+   return -EFAULT;
+   rc = probe_user_read(ret, ptr, size);
+
+   if (rc && IS_ENABLED(CONFIG_PPC64))


gcc is probably smart enough to deal with it efficiently, but it would
be more correct to test rc after checking CONFIG_PPC64.


+   return read_user_stack_slow(ptr, ret, size);
+
+   return rc;
+}
+
  #endif /* _POWERPC_PERF_CALLCHAIN_H */
diff --git a/arch/powerpc/perf/callchain_32.c b/arch/powerpc/perf/callchain_32.c
index 8aa951003141..1b4621f177e8 100644
--- a/arch/powerpc/perf/callchain_32.c
+++ b/arch/powerpc/perf/callchain_32.c
@@ -31,26 +31,9 @@
  
  #endif /* CONFIG_PPC64 */
  
-/*

- * On 32-bit we just access the address and let hash_page create a
- * HPTE if necessary, so there is no need to fall back to reading
- * the page tables.  Since this is called at interrupt level,
- * do_page_fault() won't treat a DSI as a page fault.
- */
-static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
+static int read_user_stack_32(const unsigned int __user *ptr, unsigned int *ret)
  {
-   int rc;
-
-   if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) ||
-   ((unsigned long)ptr & 3))
-   return -EFAULT;
-
-   rc = probe_user_read(ret, ptr, sizeof(*ret));
-
-   if (IS_ENABLED(CONFIG_PPC64) && rc)
-   return read_user_stack_slow(ptr, ret, 4);
-
-   return rc;
+   return __read_user_stack(ptr, ret, sizeof(*ret));
  }
  
  /*

diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c
index df1ffd8b20f2..55bbc25a54ed 100644
--- a/arch/powerpc/perf/callchain_64.c
+++ b/arch/powerpc/perf/callchain_64.c
@@ -24,7 +24,7 @@
   * interrupt context, so if the access faults, we read the page tables
   * to find which page (if any) is mapped and access it directly.
   */
-int read_user_stack_slow(void __user *ptr, void *buf, int nb)
+int read_user_stack_slow(const void __user *ptr, void *buf, int nb)
  {
int ret = -EFAULT;
pgd_t *pgdir;
@@ -65,16 +65,10 @@ int read_user_stack_slow(void __user *ptr, void *buf, int 
nb)
return ret;
  }
  
-static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)

+static int read_user_stack_64(const unsigned long __user *ptr,
+ unsigned long *ret)
  {
-   if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned long) ||
-   ((unsigned long)ptr & 7))
-   return -EFAULT;
-
-   if (!probe_user_read(ret, ptr, sizeof(*ret)))
-   return 0;
-
-   return read_user_stack_slow(ptr, ret, 8);
+   return __read_user_stack(ptr, ret, sizeof(*ret));
  }
  
  /*




Christophe


Re: [PATCH v3 11/15] powerpc/64s: machine check interrupt update NMI accounting

2020-04-06 Thread Christophe Leroy




On 07/04/2020 at 07:16, Nicholas Piggin wrote:

machine_check_early is taken as an NMI, so nmi_enter is used there.
machine_check_exception is no longer taken as an NMI (it's invoked
via irq_work in the case a machine check hits in kernel mode), so
remove the nmi_enter from that case.


Euh ... Is that also the case for PPC32 ?

AFAIK machine_check_exception() is called as an NMI on PPC32.

Christophe



In NMI context, hash faults don't try to refill the hash table, which
can lead to crashes accessing non-pinned kernel pages. System reset
still has this potential problem.

Signed-off-by: Nicholas Piggin 
---
  arch/powerpc/kernel/mce.c |  7 +++
  arch/powerpc/kernel/process.c |  2 +-
  arch/powerpc/kernel/traps.c   | 13 +
  3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 8077b5fb18a7..be7e3f92a7b5 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -574,6 +574,9 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info);
  long machine_check_early(struct pt_regs *regs)
  {
long handled = 0;
+   bool nested = in_nmi();
+   if (!nested)
+   nmi_enter();
  
  	hv_nmi_check_nonrecoverable(regs);
  
@@ -582,6 +585,10 @@ long machine_check_early(struct pt_regs *regs)

 */
if (ppc_md.machine_check_early)
handled = ppc_md.machine_check_early(regs);
+
+   if (!nested)
+   nmi_exit();
+
return handled;
  }
  
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c

index 9c21288f8645..44410dd3029f 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1421,7 +1421,7 @@ void show_regs(struct pt_regs * regs)
pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr);
  #endif
  #ifdef CONFIG_PPC64
-   pr_cont("IRQMASK: %lx ", regs->softe);
+   pr_cont("IRQMASK: %lx IN_NMI:%d IN_MCE:%d", regs->softe, 
(int)get_paca()->in_nmi, (int)get_paca()->in_mce);
  #endif
  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (MSR_TM_ACTIVE(regs->msr))
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 3fca22276bb1..9f221772eb73 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -823,9 +823,6 @@ int machine_check_generic(struct pt_regs *regs)
  void machine_check_exception(struct pt_regs *regs)
  {
int recover = 0;
-   bool nested = in_nmi();
-   if (!nested)
-   nmi_enter();
  
  	__this_cpu_inc(irq_stat.mce_exceptions);
  
@@ -851,20 +848,12 @@ void machine_check_exception(struct pt_regs *regs)

if (check_io_access(regs))
goto bail;
  
-	if (!nested)

-   nmi_exit();
-
die("Machine check", regs, SIGBUS);
  
+bail:

/* Must die if the interrupt is not recoverable */
if (!(regs->msr & MSR_RI))
nmi_panic(regs, "Unrecoverable Machine check");
-
-   return;
-
-bail:
-   if (!nested)
-   nmi_exit();
  }
  
  void SMIException(struct pt_regs *regs)




Re: [PATCH v3 12/15] powerpc: ftrace_enabled helper

2020-04-06 Thread Christophe Leroy




On 07/04/2020 at 07:16, Nicholas Piggin wrote:

Signed-off-by: Nicholas Piggin 


Reviewed-by: Christophe Leroy 



---
  arch/powerpc/include/asm/ftrace.h | 14 ++
  1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/ftrace.h 
b/arch/powerpc/include/asm/ftrace.h
index f54a08a2cd70..bc76970b6ee5 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -108,9 +108,23 @@ static inline void this_cpu_enable_ftrace(void)
  {
get_paca()->ftrace_enabled = 1;
  }
+
+/* Disable ftrace on this CPU if possible (may not be implemented) */
+static inline void this_cpu_set_ftrace_enabled(u8 ftrace_enabled)
+{
+   get_paca()->ftrace_enabled = ftrace_enabled;
+}
+
+static inline u8 this_cpu_get_ftrace_enabled(void)
+{
+   return get_paca()->ftrace_enabled;
+}
+
  #else /* CONFIG_PPC64 */
  static inline void this_cpu_disable_ftrace(void) { }
  static inline void this_cpu_enable_ftrace(void) { }
+static inline void this_cpu_set_ftrace_enabled(u8 ftrace_enabled) { }
+static inline u8 this_cpu_get_ftrace_enabled(void) { return 1; }
  #endif /* CONFIG_PPC64 */
  #endif /* !__ASSEMBLY__ */
  



Re: [PATCH v3 1/1] powerpc/kernel: Enables memory hot-remove after reboot on pseries guests

2020-04-06 Thread Bharata B Rao
On Mon, Apr 06, 2020 at 12:41:01PM -0300, Leonardo Bras wrote:
> Hello Bharata,
> 
> On Fri, 2020-04-03 at 20:08 +0530, Bharata B Rao wrote:
> > The patch would be more complete with the following change that ensures
> > that DRCONF_MEM_HOTREMOVABLE flag is set for non-boot-time hotplugged
> > memory too. This will ensure that ibm,dynamic-memory-vN property
> > reflects the right flags value for memory that gets hotplugged
> > post boot.
> > 
> 
> You just sent that on a separated patchset, so I think it's dealt with.
> Do you have any other comments on the present patch?

None, thanks.

Regards,
Bharata.



Re: [PATCH v2 08/14] powerpc/pseries/ras: fwnmi sreset should not interlock

2020-04-06 Thread Nicholas Piggin
Christophe Leroy's on April 4, 2020 12:35 am:
> 
> 
> On 03/04/2020 at 15:26, Nicholas Piggin wrote:
>> PAPR does not specify that fwnmi sreset should be interlocked, and
>> PowerVM (and therefore now QEMU) do not require it.
>> 
>> These "ibm,nmi-interlock" calls are ignored by firmware, but there
>> is a possibility that the sreset could have interrupted a machine
>> check and release the machine check's interlock too early, corrupting
>> it if another machine check came in.
>> 
>> This is an extremely rare case, but it should be fixed for clarity
>> and reducing the code executed in the sreset path. Firmware also
>> does not provide error information for the sreset case to look at, so
>> remove that comment.
>> 
>> Signed-off-by: Nicholas Piggin 
>> ---
>>   arch/powerpc/platforms/pseries/ras.c | 48 
>>   1 file changed, 34 insertions(+), 14 deletions(-)
>> 
>> diff --git a/arch/powerpc/platforms/pseries/ras.c 
>> b/arch/powerpc/platforms/pseries/ras.c
>> index a40598e6e525..833ae34b7fec 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -406,6 +406,20 @@ static inline struct rtas_error_log 
>> *fwnmi_get_errlog(void)
>>  return (struct rtas_error_log *)local_paca->mce_data_buf;
>>   }
>>   
>> +static unsigned long *fwnmi_get_savep(struct pt_regs *regs)
>> +{
>> +unsigned long savep_ra;
>> +
>> +/* Mask top two bits */
>> +savep_ra = regs->gpr[3] & ~(0x3UL << 62);
>> +if (!VALID_FWNMI_BUFFER(savep_ra)) {
>> +printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
> 
> Can't you use pr_err() instead ?

I think so.

>> +return NULL;
>> +}
>> +
>> +return __va(savep_ra);
>> +}
>> +
>>   /*
>>* Get the error information for errors coming through the
>>* FWNMI vectors.  The pt_regs' r3 will be updated to reflect
>> @@ -423,20 +437,15 @@ static inline struct rtas_error_log 
>> *fwnmi_get_errlog(void)
>>*/
>>   static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
>>   {
>> -unsigned long savep_ra;
>>  unsigned long *savep;
>>  struct rtas_error_log *h;
>>   
>> -/* Mask top two bits */
>> -savep_ra = regs->gpr[3] & ~(0x3UL << 62);
>> -
>> -if (!VALID_FWNMI_BUFFER(savep_ra)) {
>> -printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
>> +savep = fwnmi_get_savep(regs);
>> +if (!savep)
>>  return NULL;
>> -}
>>   
>> -savep = __va(savep_ra);
>> -regs->gpr[3] = be64_to_cpu(savep[0]);   /* restore original r3 */
>> +/* restore original r3 */
>> +regs->gpr[3] = be64_to_cpu(savep[0]);
> 
> Is it needed to change the location of the comment ?

No, I originally had other changes I think. Will fix.

Thanks,
Nick


Re: [PATCH v2 13/14] powerpc/64s: system reset do not trace

2020-04-06 Thread Nicholas Piggin
Christophe Leroy's on April 4, 2020 12:45 am:
> 
> 
> On 03/04/2020 at 15:26, Nicholas Piggin wrote:
>> Similarly to the previous patch, do not trace system reset. This code
>> is used when there is a crash or hang, and tracing disturbs the system
>> more and has been known to crash in the crash handling path.
>> 
>> Acked-by: Naveen N. Rao 
>> Signed-off-by: Nicholas Piggin 
>> ---
>>   arch/powerpc/kernel/traps.c | 5 +
>>   1 file changed, 5 insertions(+)
>> 
>> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
>> index 1845fd7e161a..ed7b7a6e2dc0 100644
>> --- a/arch/powerpc/kernel/traps.c
>> +++ b/arch/powerpc/kernel/traps.c
>> @@ -443,6 +443,9 @@ void system_reset_exception(struct pt_regs *regs)
>>  unsigned long hsrr0, hsrr1;
>>  bool nested = in_nmi();
>>  bool saved_hsrrs = false;
>> +u8 ftrace_enabled = local_paca->ftrace_enabled;
>> +
>> +local_paca->ftrace_enabled = 0;
> 
> I predict a build failure here in the near future ...

Will fix. Naveen suggested some helper functions for this too.

Thanks,
Nick


Re: [PATCH v2 09/14] powerpc/pseries: limit machine check stack to 4GB

2020-04-06 Thread Nicholas Piggin
Christophe Leroy's on April 4, 2020 12:19 am:
> 
> 
> On 03/04/2020 at 15:26, Nicholas Piggin wrote:
>> This allows rtas_args to be put on the machine check stack, which
>> avoids a lot of complications with re-entrancy deadlocks.
>> 
>> Reviewed-by: Mahesh Salgaonkar 
>> Signed-off-by: Nicholas Piggin 
>> ---
>>   arch/powerpc/kernel/setup_64.c | 17 -
>>   1 file changed, 16 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
>> index e05e6dd67ae6..3a2428aa3d9a 100644
>> --- a/arch/powerpc/kernel/setup_64.c
>> +++ b/arch/powerpc/kernel/setup_64.c
>> @@ -692,6 +692,9 @@ void __init exc_lvl_early_init(void)
>>   void __init emergency_stack_init(void)
>>   {
>>  u64 limit;
>> +#ifdef CONFIG_PPC_BOOK3S_64
> 
> #ifdef not needed, see below
> 
>> +u64 mce_limit;
>> +#endif
>>  unsigned int i;
>>   
>>  /*
>> @@ -710,6 +713,18 @@ void __init emergency_stack_init(void)
>>   */
>>  limit = min(ppc64_bolted_size(), ppc64_rma_size);
>>   
>> +/*
>> + * Machine check on pseries calls rtas, but can't use the static
>> + * rtas_args due to a machine check hitting while the lock is held.
>> + * rtas args have to be under 4GB, so the machine check stack is
>> + * limited to 4GB so args can be put on stack.
>> + */
>> +#ifdef CONFIG_PPC_BOOK3S_64
> 
> This ifdef is not needed. FW_FEATURE_LPAR is only possible on 
> CONFIG_PPC_BOOK3S_64 (indeed only on PSERIES or PS3). On others 
> firmware_has_feature(FW_FEATURE_LPAR) should return 0 at compile time.

Sure I'll remove it.

>> +mce_limit = limit;
>> +if (firmware_has_feature(FW_FEATURE_LPAR) && mce_limit > 
>> 4UL*1024*1024*1024)
>> +mce_limit = 4UL*1024*1024*1024;
> 
> You should use SZ_4G instead of hardcoding.

Will do.

Thanks,
Nick


Linux-next POWER9 NULL pointer NIP since 1st Apr.

2020-04-06 Thread Qian Cai
Ever since 1st Apr, linux-next has been triggering a NULL pointer NIP on
POWER9 (shown below) using this config,

https://raw.githubusercontent.com/cailca/linux-mm/master/powerpc.config

It takes a while to reproduce, so before I bury myself in bisecting, here
is a heads-up to see if anyone spots anything obvious.

[  206.744625][T13224] LTP: starting fallocate04
[  207.601583][T27684] /dev/zero: Can't open blockdev
[  208.674301][T27684] EXT4-fs (loop0): mounting ext3 file system using the 
ext4 subsystem
[  208.680347][T27684] BUG: Unable to handle kernel instruction fetch (NULL 
pointer?)
[  208.680383][T27684] Faulting instruction address: 0x
[  208.680406][T27684] Oops: Kernel access of bad area, sig: 11 [#1]
[  208.680439][T27684] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=256 
DEBUG_PAGEALLOC NUMA PowerNV
[  208.680474][T27684] Modules linked in: ext4 crc16 mbcache jbd2 loop kvm_hv 
kvm ip_tables x_tables xfs sd_mod bnx2x ahci libahci mdio tg3 libata libphy 
firmware_class dm_mirror dm_region_hash dm_log dm_mod
[  208.680576][T27684] CPU: 117 PID: 27684 Comm: fallocate04 Tainted: G
W 5.6.0-next-20200401+ #288
[  208.680614][T27684] NIP:   LR: c008102c0048 CTR: 

[  208.680657][T27684] REGS: c000200361def420 TRAP: 0400   Tainted: GW  
(5.6.0-next-20200401+)
[  208.680700][T27684] MSR:  90004280b033 
  CR: 4208  XER: 2004
[  208.680760][T27684] CFAR: c0081032c494 IRQMASK: 0 
[  208.680760][T27684] GPR00: c05ac3f8 c000200361def6b0 
c165c200 c00020107dae0bd0 
[  208.680760][T27684] GPR04:  0400 
  
[  208.680760][T27684] GPR08: c000200361def6e8 c008102c0040 
7fff c1614e80 
[  208.680760][T27684] GPR12:  c000201fff671280 
 0002 
[  208.680760][T27684] GPR16: 0002 00040001 
c00020030f5a1000 c00020030f5a1548 
[  208.680760][T27684] GPR20: c15fbad8 c168c654 
c000200361def818 c05b4c10 
[  208.680760][T27684] GPR24:  c008103365b8 
c00020107dae0bd0 0400 
[  208.680760][T27684] GPR28: c168c3a8  
  
[  208.681014][T27684] NIP [] 0x0
[  208.681065][T27684] LR [c008102c0048] ext4_iomap_end+0x8/0x30 [ext4]
[  208.681091][T27684] Call Trace:
[  208.681129][T27684] [c000200361def6b0] [c05ac3bc] 
iomap_apply+0x20c/0x920 (unreliable)
iomap_apply at fs/iomap/apply.c:80 (discriminator 4)
[  208.681173][T27684] [c000200361def7f0] [c05b4adc] 
iomap_bmap+0xfc/0x160
iomap_bmap at fs/iomap/fiemap.c:142
[  208.681228][T27684] [c000200361def850] [c008102c2c1c] 
ext4_bmap+0xa4/0x180 [ext4]
ext4_bmap at fs/ext4/inode.c:3213
[  208.681260][T27684] [c000200361def890] [c04f71fc] bmap+0x4c/0x80
[  208.681281][T27684] [c000200361def8c0] [c0080fdb0acc] 
jbd2_journal_init_inode+0x44/0x1a0 [jbd2]
jbd2_journal_init_inode at fs/jbd2/journal.c:1255
[  208.681326][T27684] [c000200361def960] [c0081031c808] 
ext4_load_journal+0x440/0x860 [ext4]
[  208.681371][T27684] [c000200361defa30] [c00810322a14] 
ext4_fill_super+0x342c/0x3ab0 [ext4]
[  208.681414][T27684] [c000200361defba0] [c04cb0bc] 
mount_bdev+0x25c/0x290
[  208.681478][T27684] [c000200361defc40] [c00810310250] 
ext4_mount+0x28/0x50 [ext4]
[  208.681520][T27684] [c000200361defc60] [c053242c] 
legacy_get_tree+0x4c/0xb0
[  208.681556][T27684] [c000200361defc90] [c04c864c] 
vfs_get_tree+0x4c/0x130
[  208.681593][T27684] [c000200361defd00] [c050a1c8] 
do_mount+0xa18/0xc50
[  208.681641][T27684] [c000200361defdd0] [c050a9a8] 
sys_mount+0x158/0x180
[  208.681679][T27684] [c000200361defe20] [c000b3f8] 
system_call+0x5c/0x68
[  208.681726][T27684] Instruction dump:
[  208.681747][T27684]       
  
[  208.681797][T27684]       
  
[  208.681839][T27684] ---[ end trace 4e9e2bab7f1d4048 ]---
[  208.802259][T27684] 
[  209.802373][T27684] Kernel panic - not syncing: Fatal exception

[  215.281666][T16896] LTP: starting chown04_16
[  215.424203][T18297] BUG: Unable to handle kernel instruction fetch (NULL 
pointer?)
[  215.424289][T18297] Faulting instruction address: 0x
[  215.424313][T18297] Oops: Kernel access of bad area, sig: 11 [#1]
[  215.424341][T18297] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=256 
DEBUG_PAGEALLOC NUMA PowerNV
[  215.424383][T18297] Modules linked in: loop kvm_hv kvm ip_tables x_tables 
xfs sd_mod bnx2x mdio tg3 ahci libahci libphy libata firmware_class dm_mirror 
dm_region_hash dm_log dm_mod
[  215.424459][T18297] CPU: 85 PID: 18297 Comm: chown04_16 Tainted: GW  
   5.6.0-next-20200405+ #3
[  215.424489][T18297] NIP:   LR: c0080fbc0408 CTR: 

RE: [EXT] [PATCH 2/2] clk: qoriq: add cpufreq platform device

2020-04-06 Thread Andy Tang

> -Original Message-
> From: Mian Yousaf Kaukab 
> Sent: April 4, 2020 5:21
> To: linux...@vger.kernel.org; Andy Tang ;
> shawn...@kernel.org; Leo Li 
> Cc: viresh.ku...@linaro.org; linux-ker...@vger.kernel.org;
> linux-arm-ker...@lists.infradead.org; linuxppc-dev@lists.ozlabs.org; Mian
> Yousaf Kaukab 
> Subject: [EXT] [PATCH 2/2] clk: qoriq: add cpufreq platform device
> 
> Caution: EXT Email
> 
> Add a platform device for qoirq-cpufreq driver for the compatible clockgen
> blocks.
> 
> Signed-off-by: Mian Yousaf Kaukab 
> ---
>  drivers/clk/clk-qoriq.c | 30 +++---
>  1 file changed, 27 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/clk/clk-qoriq.c b/drivers/clk/clk-qoriq.c index
> d5946f7486d6..374afcab89af 100644
> --- a/drivers/clk/clk-qoriq.c
> +++ b/drivers/clk/clk-qoriq.c
> @@ -95,6 +95,7 @@ struct clockgen {
>  };
> 

For both patches,
Reviewed-by: Yuantian Tang 

BR,
Andy



Re: [PATCH v2 0/2] Don't generate thousands of new warnings when building docs

2020-04-06 Thread Michael Ellerman
Mauro Carvalho Chehab  writes:
> This small series address a regression caused by a new patch at
> docs-next (and at linux-next).
>
> Before this patch, when a cross-reference to a chapter within the
> documentation is needed, we had to add a markup like:
>
>   .. _foo:
>
>   foo
>   ===
>
> This behavior is now different after this patch:
>
>   58ad30cf91f0 ("docs: fix reference to core-api/namespaces.rst")
>
> As a Sphinx extension now creates automatically a reference
> like the above, without requiring any extra markup.
>
> That, however, comes with a price: it is not possible anymore to have
> two sections with the same name within the entire Kernel docs!
>
> This causes thousands of warnings, as we have sections named
> "introduction" on lots of places.
>
> This series solve this regression by doing two changes:
>
> 1) The references are now prefixed by the document name. So,
>    a file named "bar" would have the "foo" reference as "bar:foo".
>
> 2) It will only use the first two levels. The first one is (usually) the
>    name of the document, and the second one the chapter name.
>
> This solves almost all problems we have. Still, there are a few places
> where we have two chapters at the same document with the
> same name. The first patch addresses this problem.

I'm still seeing a lot of warnings. Am I doing something wrong?

cheers

/linux/Documentation/powerpc/cxl.rst:406: WARNING: duplicate label 
powerpc/cxl:open, other instance in /linux/Documentation/powerpc/cxl.rst
/linux/Documentation/powerpc/cxl.rst:412: WARNING: duplicate label 
powerpc/cxl:ioctl, other instance in /linux/Documentation/powerpc/cxl.rst
/linux/Documentation/powerpc/syscall64-abi.rst:86: WARNING: duplicate label 
powerpc/syscall64-abi:parameters and return value, other instance in 
/linux/Documentation/powerpc/syscall64-abi.rst
/linux/Documentation/powerpc/syscall64-abi.rst:90: WARNING: duplicate label 
powerpc/syscall64-abi:stack, other instance in 
/linux/Documentation/powerpc/syscall64-abi.rst
/linux/Documentation/powerpc/syscall64-abi.rst:94: WARNING: duplicate label 
powerpc/syscall64-abi:register preservation rules, other instance in 
/linux/Documentation/powerpc/syscall64-abi.rst
/linux/Documentation/powerpc/syscall64-abi.rst:103: WARNING: duplicate label 
powerpc/syscall64-abi:invocation, other instance in 
/linux/Documentation/powerpc/syscall64-abi.rst
/linux/Documentation/powerpc/syscall64-abi.rst:108: WARNING: duplicate label 
powerpc/syscall64-abi:transactional memory, other instance in 
/linux/Documentation/powerpc/syscall64-abi.rst
/linux/Documentation/powerpc/ultravisor.rst:339: WARNING: duplicate label 
powerpc/ultravisor:syntax, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:351: WARNING: duplicate label 
powerpc/ultravisor:return values, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:365: WARNING: duplicate label 
powerpc/ultravisor:description, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:387: WARNING: duplicate label 
powerpc/ultravisor:use cases, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:406: WARNING: duplicate label 
powerpc/ultravisor:syntax, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:416: WARNING: duplicate label 
powerpc/ultravisor:return values, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:429: WARNING: duplicate label 
powerpc/ultravisor:description, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:438: WARNING: duplicate label 
powerpc/ultravisor:use cases, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:452: WARNING: duplicate label 
powerpc/ultravisor:syntax, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:462: WARNING: duplicate label 
powerpc/ultravisor:return values, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:477: WARNING: duplicate label 
powerpc/ultravisor:description, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:484: WARNING: duplicate label 
powerpc/ultravisor:use cases, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:514: WARNING: duplicate label 
powerpc/ultravisor:syntax, other instance in 
/linux/Documentation/powerpc/ultravisor.rst
/linux/Documentation/powerpc/ultravisor.rst:521: WARNING: duplicate label 
powerpc/ultravisor:return values, other instance in 
/linux/Documentation/powerpc/ultravisor.rst

[PATCH v3] powerpc/uaccess: evaluate macro arguments once, before user access is allowed

2020-04-06 Thread Nicholas Piggin
get/put_user can be called with nontrivial arguments. fs/proc/page.c
has a good example:

if (put_user(stable_page_flags(ppage), out)) {

stable_page_flags is quite a lot of code, including spin locks in the
page allocator.

Ensure these arguments are evaluated before user access is allowed.
This improves security by reducing code with access to userspace, but
it also fixes a PREEMPT bug with KUAP on powerpc/64s:
stable_page_flags is currently called with the AMR set to allow writes;
it ends up calling spin_unlock(), which can call preempt_schedule. But
the task switch code cannot be called with the AMR set (it relies on
interrupts saving the register), so this blows up.

It's fine if the code inside allow_user_access is preemptible, because
a timer or IPI will save the AMR, but it's not okay to explicitly
cause a reschedule.
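
In other words, a simplified sketch of the hazard, with
open_user_access()/close_user_access() standing in for the real KUAP
entry/exit helpers (illustrative names only, not the actual macros):

	/* broken: (x) is evaluated while user access is open */
	#define bad_put_user(x, ptr) ({				\
		open_user_access();				\
		*(ptr) = (x);					\
		close_user_access();				\
	})

	/* fixed: evaluate the argument first, then open the window */
	#define good_put_user(x, ptr) ({			\
		__typeof__(*(ptr)) __val = (x);			\
		open_user_access();				\
		*(ptr) = __val;					\
		close_user_access();				\
	})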

Signed-off-by: Nicholas Piggin 
---

I took this patch out of the series because there is some discussion
about the other patches, and if we enable KUAP over probe_kernel_*
functions then it's technically a change of behaviour so I'm happy to
wait until next merge window or this discussion resolves itself.

This patch should fix the bugs and address the security concern of
running arbitrary argument evaluation with user access allowed.

 arch/powerpc/include/asm/uaccess.h | 49 +-
 1 file changed, 35 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 2f500debae21..0969285996cb 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -166,13 +166,17 @@ do {  
\
 ({ \
long __pu_err;  \
__typeof__(*(ptr)) __user *__pu_addr = (ptr);   \
+   __typeof__(*(ptr)) __pu_val = (x);  \
+   __typeof__(size) __pu_size = (size);\
+   \
if (!is_kernel_addr((unsigned long)__pu_addr))  \
might_fault();  \
-   __chk_user_ptr(ptr);\
+   __chk_user_ptr(__pu_addr);  \
	if (do_allow)								\
-		__put_user_size((x), __pu_addr, (size), __pu_err);		\
+		__put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err);	\
	else									\
-		__put_user_size_allowed((x), __pu_addr, (size), __pu_err);	\
+		__put_user_size_allowed(__pu_val, __pu_addr, __pu_size, __pu_err); \
+   \
__pu_err;   \
 })
 
@@ -180,9 +184,13 @@ do {   
\
 ({ \
long __pu_err = -EFAULT;\
__typeof__(*(ptr)) __user *__pu_addr = (ptr);   \
+   __typeof__(*(ptr)) __pu_val = (x);  \
+   __typeof__(size) __pu_size = (size);\
+   \
might_fault();  \
-   if (access_ok(__pu_addr, size)) \
-   __put_user_size((x), __pu_addr, (size), __pu_err);  \
+   if (access_ok(__pu_addr, __pu_size))\
+   __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \
+   \
__pu_err;   \
 })
 
@@ -190,8 +198,12 @@ do {   
\
 ({ \
long __pu_err;  \
__typeof__(*(ptr)) __user *__pu_addr = (ptr);   \
-   __chk_user_ptr(ptr);\
-   __put_user_size((x), __pu_addr, (size), __pu_err);  \
+   __typeof__(*(ptr)) __pu_val = (x);  \
+   __typeof__(size) __pu_size = (size);\
+   \
+   __chk_user_ptr(__pu_addr);  \
+   __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \
+   \
__pu_err;   \
 })
 
@@ -283,15 +295,18 @@ do {  

Re: [PATCH v2 05/14] powerpc/pseries/ras: avoid calling rtas_token in NMI paths

2020-04-06 Thread Nicholas Piggin
Christophe Leroy's on April 4, 2020 12:30 am:
> 
> 
> Le 03/04/2020 à 15:26, Nicholas Piggin a écrit :
>> In the interest of reducing code and possible failures in the
>> machine check and system reset paths, grab the "ibm,nmi-interlock"
>> token at init time.
>> 
>> Reviewed-by: Mahesh Salgaonkar 
>> Signed-off-by: Nicholas Piggin 
>> ---
>>   arch/powerpc/include/asm/firmware.h|  1 +
>>   arch/powerpc/platforms/pseries/ras.c   |  2 +-
>>   arch/powerpc/platforms/pseries/setup.c | 13 ++---
>>   3 files changed, 12 insertions(+), 4 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/firmware.h 
>> b/arch/powerpc/include/asm/firmware.h
>> index ca33f4ef6cb4..6003c2e533a0 100644
>> --- a/arch/powerpc/include/asm/firmware.h
>> +++ b/arch/powerpc/include/asm/firmware.h
>> @@ -128,6 +128,7 @@ extern void machine_check_fwnmi(void);
>>   
>>   /* This is true if we are using the firmware NMI handler (typically LPAR) 
>> */
>>   extern int fwnmi_active;
>> +extern int ibm_nmi_interlock_token;
>>   
>>   extern unsigned int __start___fw_ftr_fixup, __stop___fw_ftr_fixup;
>>   
>> diff --git a/arch/powerpc/platforms/pseries/ras.c 
>> b/arch/powerpc/platforms/pseries/ras.c
>> index 1d7f973c647b..c74d5e740922 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -458,7 +458,7 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct 
>> pt_regs *regs)
>>*/
>>   static void fwnmi_release_errinfo(void)
>>   {
>> -int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
>> +int ret = rtas_call(ibm_nmi_interlock_token, 0, 1, NULL);
>>  if (ret != 0)
>>  printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
>>   }
>> diff --git a/arch/powerpc/platforms/pseries/setup.c 
>> b/arch/powerpc/platforms/pseries/setup.c
>> index 0c8421dd01ab..b582198be284 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -83,6 +83,7 @@ unsigned long CMO_PageSize = (ASM_CONST(1) << 
>> IOMMU_PAGE_SHIFT_4K);
>>   EXPORT_SYMBOL(CMO_PageSize);
>>   
>>   int fwnmi_active;  /* TRUE if an FWNMI handler is present */
>> +int ibm_nmi_interlock_token;
>>   
>>   static void pSeries_show_cpuinfo(struct seq_file *m)
>>   {
>> @@ -113,9 +114,14 @@ static void __init fwnmi_init(void)
>>  struct slb_entry *slb_ptr;
>>  size_t size;
>>   #endif
>> +int ibm_nmi_register_token;
>>   
>> -int ibm_nmi_register = rtas_token("ibm,nmi-register");
>> -if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
>> +ibm_nmi_register_token = rtas_token("ibm,nmi-register");
>> +if (ibm_nmi_register_token == RTAS_UNKNOWN_SERVICE)
>> +return;
>> +
>> +ibm_nmi_interlock_token = rtas_token("ibm,nmi-interlock");
>> +if (WARN_ON(ibm_nmi_interlock_token == RTAS_UNKNOWN_SERVICE))
>>  return;
>>   
>>  /* If the kernel's not linked at zero we point the firmware at low
>> @@ -123,7 +129,8 @@ static void __init fwnmi_init(void)
>>  system_reset_addr  = __pa(system_reset_fwnmi) - PHYSICAL_START;
>>  machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START;
>>   
>> -if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr,
>> +if (0 == rtas_call(ibm_nmi_register_token, 2, 1, NULL,
>> +system_reset_addr,
>>  machine_check_addr))
> 
> Alignment is wrong.
> And you could put system_reset_addr and 
> machine_check_addr on the same line to limit the number of lines of the if.

I don't really like using spaces to align but I'll put it on the same 
line.

Thanks,
Nick


Re: [PATCH v2 1/4] powerpc/64s: implement probe_kernel_read/write without touching AMR

2020-04-06 Thread Nicholas Piggin
Nicholas Piggin's on April 3, 2020 9:05 pm:
> Christophe Leroy's on April 3, 2020 8:31 pm:
>> 
>> 
>> Le 03/04/2020 à 11:35, Nicholas Piggin a écrit :
>>> There is no need to allow user accesses when probing kernel addresses.
>> 
>> I just discovered the following commit 
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=75a1a607bb7e6d918be3aca11ec2214a275392f4
>> 
>> This commit adds probe_kernel_read_strict() and probe_kernel_write_strict().
>> 
>> When reading the commit log, I understand that probe_kernel_read() may 
>> be used to access some user memory. Which will not work anymore with 
>> your patch.
> 
> Hmm, I looked at _strict but obviously not hard enough. Good catch.
> 
> I don't think probe_kernel_read() should ever access user memory,
> the comment certainly says it doesn't, but that patch sort of implies
> that they do.
> 
> I think it's wrong. The non-_strict variant could perhaps return
> userspace data to you if you did pass a user address? I don't see why
> that shouldn't always just be disallowed, though.
> 
> And if the _strict version is required to be safe, then it seems like a
> bug or security issue to just allow everyone that doesn't explicitly
> override it to use the default implementation.
> 
> Also, the way the weak linkage is done in that patch, means parisc and
> um archs that were previously overriding probe_kernel_read() now get
> the default probe_kernel_read_strict(), which would be wrong for them.

The changelog in commit 75a1a607bb7 makes it a bit clearer. If the
non-_strict variant is used on non-kernel addresses, then it might not 
return -EFAULT or it might cause a kernel warning. The _strict variant 
is supposed to be usable with any address and it will return -EFAULT if 
it was not a valid and mapped kernel address.

The non-_strict variant can not portably access user memory because it
uses KERNEL_DS, and its documentation says its only for kernel pointers.
So powerpc should be fine to run that under KUAP AFAIKS.

I don't know why the _strict behaviour is not just made default, but
the implementation of it does seem to be broken on the archs that
override the non-_strict variant.
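
For what it's worth, the contract as I read it (a sketch; prototypes as
they were in the tree at the time):

	long probe_kernel_read(void *dst, const void *src, size_t size);
	/* src must be a kernel pointer; a user address is not handled
	 * portably and may warn rather than return -EFAULT */

	long probe_kernel_read_strict(void *dst, const void *src, size_t size);
	/* src may be any address; returns -EFAULT unless it is a
	 * valid, mapped kernel address */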

Thanks,
Nick


Re: [PATCH v5 05/21] powerpc: Use a function for getting the instruction op code

2020-04-06 Thread Jordan Niethe
On Mon, Apr 6, 2020 at 6:22 PM Christophe Leroy  wrote:
>
>
>
> Le 06/04/2020 à 10:09, Jordan Niethe a écrit :
> > In preparation for using a data type for instructions that cannot be
> > directly used with the '>>' operator, use a function for getting the
> > op code of an instruction.
> >
> > Signed-off-by: Jordan Niethe 
> > ---
> > v4: New to series
> > ---
> >   arch/powerpc/include/asm/inst.h  | 5 +
> >   arch/powerpc/kernel/align.c  | 2 +-
> >   arch/powerpc/lib/code-patching.c | 4 ++--
>
> What about store_updates_sp() in mm/fault.c ?
True. An early revision of this series used analyse_instr() there,
which ended up causing issues. But it can still use the instruction
data type. I will change that.
>
> Christophe
>
> >   3 files changed, 8 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/inst.h 
> > b/arch/powerpc/include/asm/inst.h
> > index 5298ba33b6e5..93959016fe4b 100644
> > --- a/arch/powerpc/include/asm/inst.h
> > +++ b/arch/powerpc/include/asm/inst.h
> > @@ -8,4 +8,9 @@
> >
> >   #define ppc_inst(x) (x)
> >
> > +static inline int ppc_inst_opcode(u32 x)
> > +{
> > + return x >> 26;
> > +}
> > +
> >   #endif /* _ASM_INST_H */
> > diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
> > index 86e9bf62f18c..691013aa9f3c 100644
> > --- a/arch/powerpc/kernel/align.c
> > +++ b/arch/powerpc/kernel/align.c
> > @@ -314,7 +314,7 @@ int fix_alignment(struct pt_regs *regs)
> >   }
> >
> >   #ifdef CONFIG_SPE
> > - if ((instr >> 26) == 0x4) {
> > + if (ppc_inst_opcode(instr) == 0x4) {
> >   int reg = (instr >> 21) & 0x1f;
> >   PPC_WARN_ALIGNMENT(spe, regs);
> >   return emulate_spe(regs, reg, instr);
> > diff --git a/arch/powerpc/lib/code-patching.c 
> > b/arch/powerpc/lib/code-patching.c
> > index fdf0d6ea3575..099a515202aa 100644
> > --- a/arch/powerpc/lib/code-patching.c
> > +++ b/arch/powerpc/lib/code-patching.c
> > @@ -231,7 +231,7 @@ bool is_offset_in_branch_range(long offset)
> >*/
> >   bool is_conditional_branch(unsigned int instr)
> >   {
> > - unsigned int opcode = instr >> 26;
> > + unsigned int opcode = ppc_inst_opcode(instr);
> >
> >   if (opcode == 16)   /* bc, bca, bcl, bcla */
> >   return true;
> > @@ -289,7 +289,7 @@ int create_cond_branch(unsigned int *instr, const 
> > unsigned int *addr,
> >
> >   static unsigned int branch_opcode(unsigned int instr)
> >   {
> > - return (instr >> 26) & 0x3F;
> > + return ppc_inst_opcode(instr) & 0x3F;
> >   }
> >
> >   static int instr_is_branch_iform(unsigned int instr)
> >


[PATCH v4] powerpc/powernv: add NULL check after kzalloc in opal_add_one_export

2020-04-06 Thread Qiujun Huang
A NULL check is needed here, as kzalloc may fail and return NULL.

Issue was found by coccinelle.
Generated by: scripts/coccinelle/null/kmerr.cocci
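
For reference, reports from this script can usually be reproduced with
something like the following (assuming the standard coccicheck flow):

	make coccicheck MODE=report \
		COCCI=scripts/coccinelle/null/kmerr.cocci \
		M=arch/powerpc/platforms/powernv/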

Signed-off-by: Qiujun Huang 
Reviewed-by: Oliver O'Halloran 

---

v3->v4:
Added the information about coccinelle script.
Added change log.
Added Oliver's Reviewed-by.
v2->v3:
Removed redundant assignment to 'attr' and 'name'.
v1->v2:
Just return -ENOMEM if attr is NULL.
---
 arch/powerpc/platforms/powernv/opal.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal.c 
b/arch/powerpc/platforms/powernv/opal.c
index 2b3dfd0b6cdd..908d749bcef5 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -801,16 +801,19 @@ static ssize_t export_attr_read(struct file *fp, struct 
kobject *kobj,
 static int opal_add_one_export(struct kobject *parent, const char *export_name,
   struct device_node *np, const char *prop_name)
 {
-   struct bin_attribute *attr = NULL;
-   const char *name = NULL;
+   struct bin_attribute *attr;
+   const char *name;
u64 vals[2];
int rc;
 
	rc = of_property_read_u64_array(np, prop_name, &vals[0], 2);
if (rc)
-   goto out;
+   return rc;
 
attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+   if (!attr)
+   return -ENOMEM;
+
name = kstrdup(export_name, GFP_KERNEL);
if (!name) {
rc = -ENOMEM;
-- 
2.17.1



Re: [PATCH 1/7] powerpc/powernv/npu: Clean up compound table group initialisation

2020-04-06 Thread Alexey Kardashevskiy



On 06/04/2020 13:07, Oliver O'Halloran wrote:
> Re-work the control flow a bit so what's going on is a little clearer.
> This also ensures the table_group is only initialised once in the P9
> case. This shouldn't be a functional change since all the GPU PCI
> devices should have the same table_group configuration, but it does
> look strange.
> 
> Cc: Alexey Kardashevskiy 
> Signed-off-by: Oliver O'Halloran 


Reviewed-by: Alexey Kardashevskiy 



> ---
>  arch/powerpc/platforms/powernv/npu-dma.c | 46 +++-
>  1 file changed, 21 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
> b/arch/powerpc/platforms/powernv/npu-dma.c
> index b95b9e3c4c98..de617549c9a3 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -427,7 +427,7 @@ static void pnv_comp_attach_table_group(struct npu_comp 
> *npucomp,
>  
>  struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe 
> *pe)
>  {
> - struct iommu_table_group *table_group;
> + struct iommu_table_group *compound_group;
>   struct npu_comp *npucomp;
>   struct pci_dev *gpdev = NULL;
>   struct pci_controller *hose;
> @@ -446,36 +446,32 @@ struct iommu_table_group 
> *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
>   hose = pci_bus_to_host(npdev->bus);
>  
>   if (hose->npu) {
> - table_group = &hose->npu->npucomp.table_group;
> -
> - if (!table_group->group) {
> - table_group->ops = &pnv_npu_peers_ops;
> - iommu_register_group(table_group,
> - hose->global_number,
> - pe->pe_number);
> - }
> + /* P9 case: compound group is per-NPU (all gpus, all links) */
> + npucomp = &hose->npu->npucomp;
>   } else {
> - /* Create a group for 1 GPU and attached NPUs for POWER8 */
> - pe->npucomp = kzalloc(sizeof(*pe->npucomp), GFP_KERNEL);
> - table_group = &pe->npucomp->table_group;
> - table_group->ops = &pnv_npu_peers_ops;
> - iommu_register_group(table_group, hose->global_number,
> - pe->pe_number);
> + /* P8 case: Compound group is per-GPU (1 gpu, 2 links) */
> + npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL);
>   }
>  
> - /* Steal capabilities from a GPU PE */
> - table_group->max_dynamic_windows_supported =
> - pe->table_group.max_dynamic_windows_supported;
> - table_group->tce32_start = pe->table_group.tce32_start;
> - table_group->tce32_size = pe->table_group.tce32_size;
> - table_group->max_levels = pe->table_group.max_levels;
> - if (!table_group->pgsizes)
> - table_group->pgsizes = pe->table_group.pgsizes;
> + compound_group = &npucomp->table_group;
> + if (!compound_group->group) {
> + compound_group->ops = &pnv_npu_peers_ops;
> + iommu_register_group(compound_group, hose->global_number,
> + pe->pe_number);
> +
> + /* Steal capabilities from a GPU PE */
> + compound_group->max_dynamic_windows_supported =
> + pe->table_group.max_dynamic_windows_supported;
> + compound_group->tce32_start = pe->table_group.tce32_start;
> + compound_group->tce32_size = pe->table_group.tce32_size;
> + compound_group->max_levels = pe->table_group.max_levels;
> + if (!compound_group->pgsizes)
> + compound_group->pgsizes = pe->table_group.pgsizes;
> + }
>  
> - npucomp = container_of(table_group, struct npu_comp, table_group);
>   pnv_comp_attach_table_group(npucomp, pe);
>  
> - return table_group;
> + return compound_group;
>  }
>  
>  struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
> 

-- 
Alexey


Re: [PATCH 2/7] powerpc/powernv/iov: Don't add VFs to iommu group during PE config

2020-04-06 Thread Alexey Kardashevskiy



On 06/04/2020 13:07, Oliver O'Halloran wrote:
> In pnv_ioda_setup_vf_PE() we register an iommu group for the VF PE
> then call pnv_ioda_setup_bus_iommu_group() to add devices to that group.
> However, this function is called before the VFs are scanned, so there
> are no devices to add.
> 
> Signed-off-by: Oliver O'Halloran 



Reviewed-by: Alexey Kardashevskiy 


> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 57d3a6af1d52..2c340504fa77 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1622,7 +1622,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, 
> u16 num_vfs)
>  #ifdef CONFIG_IOMMU_API
> - iommu_register_group(&pe->table_group,
>   pe->phb->hose->global_number, pe->pe_number);
> - pnv_ioda_setup_bus_iommu_group(pe, >table_group, NULL);
>  #endif
>   }
>  }
> 

-- 
Alexey


Re: [PATCH 3/7] powerpc/powernv/pci: Register iommu group at PE DMA setup

2020-04-06 Thread Alexey Kardashevskiy



On 06/04/2020 13:07, Oliver O'Halloran wrote:
> Move the registration of IOMMU groups out of the post-phb init fixup and
> into when we configure DMA for a PE. For most devices this doesn't
> result in any functional changes, but for NVLink attached GPUs it
> requires a bit of care. When the GPU is probed an IOMMU group would be
> created for the PE that contains it. We need to ensure that group is
> removed before we add the PE to the compound group that's used to keep
> the translations seen by the PCIe and NVLink buses the same.
> 
> No functional changes. Probably.
> 
> Cc: Alexey Kardashevskiy 
> Cc: Reza Arbab 
> Cc: Alistair Popple 
> Signed-off-by: Oliver O'Halloran 
> ---
>  arch/powerpc/platforms/powernv/npu-dma.c  |  9 +
>  arch/powerpc/platforms/powernv/pci-ioda.c | 16 ++--
>  2 files changed, 15 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
> b/arch/powerpc/platforms/powernv/npu-dma.c
> index de617549c9a3..4fbbdfa8b327 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -469,6 +469,15 @@ struct iommu_table_group 
> *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
>   compound_group->pgsizes = pe->table_group.pgsizes;
>   }
>  
> +   /*
> + * I'm not sure this is strictly required, but it's probably a good idea
> + * since the table_group for the PE is going to be attached to the


This seems like just the right thing to do, so I suggest changing the
comment from "not sure" to "we are going to put the GPU into a compound
group and won't need the individual group".

Reviewed-by: Alexey Kardashevskiy 



> + * compound table group. If we leave the PE's iommu group active then
> + * we might have the same table_group being modifiable via two separate
> + * iommu groups.
> + */
> + iommu_group_put(pe->table_group.group);
> +
>   pnv_comp_attach_table_group(npucomp, pe);
>  
>   return compound_group;
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 2c340504fa77..cf0aaef1b8fa 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1619,10 +1619,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, 
> u16 num_vfs)
>   }
>  
>   pnv_pci_ioda2_setup_dma_pe(phb, pe);
> -#ifdef CONFIG_IOMMU_API
> - iommu_register_group(&pe->table_group,
> - pe->phb->hose->global_number, pe->pe_number);
> -#endif
>   }
>  }
>  
> @@ -2661,9 +2657,6 @@ static void pnv_pci_ioda_setup_iommu_api(void)
>   continue;
>  
>   table_group = &pe->table_group;
> - iommu_register_group(&pe->table_group,
> - pe->phb->hose->global_number,
> - pe->pe_number);
>   }
>   pnv_ioda_setup_bus_iommu_group(pe, table_group,
>   pe->pbus);
> @@ -2748,14 +2741,17 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
> *phb,
>   IOMMU_TABLE_GROUP_MAX_TABLES;
>   pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
>   pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
> -#ifdef CONFIG_IOMMU_API
> - pe->table_group.ops = &pnv_pci_ioda2_ops;
> -#endif
>  
>   rc = pnv_pci_ioda2_setup_default_config(pe);
>   if (rc)
>   return;
>  
> +#ifdef CONFIG_IOMMU_API
> + pe->table_group.ops = &pnv_pci_ioda2_ops;
> + iommu_register_group(&pe->table_group, phb->hose->global_number,
> +  pe->pe_number);
> +#endif
> +
>   if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
>   pnv_ioda_setup_bus_dma(pe, pe->pbus);
>  }
> 

-- 
Alexey


Re: [PATCH v5 18/21] powerpc64: Add prefixed instructions to instruction data type

2020-04-06 Thread Alistair Popple
> diff --git a/arch/powerpc/include/asm/inst.h
> b/arch/powerpc/include/asm/inst.h index 70b37a35a91a..7e23e7146c66 100644
> --- a/arch/powerpc/include/asm/inst.h
> +++ b/arch/powerpc/include/asm/inst.h
> @@ -8,23 +8,67 @@
> 
>  struct ppc_inst {
>  u32 val;
> +#ifdef __powerpc64__
> +u32 suffix;
> +#endif /* __powerpc64__ */
>  } __packed;
> 
> -#define ppc_inst(x) ((struct ppc_inst){ .val = x })
> +static inline int ppc_inst_opcode(struct ppc_inst x)
> +{
> + return x.val >> 26;
> +}
> 
>  static inline u32 ppc_inst_val(struct ppc_inst x)
>  {
>   return x.val;
>  }
> 
> -static inline bool ppc_inst_len(struct ppc_inst x)
> +#ifdef __powerpc64__
> +#define ppc_inst(x) ((struct ppc_inst){ .val = (x), .suffix = 0xff })
> +
> +#define ppc_inst_prefix(x, y) ((struct ppc_inst){ .val = (x), .suffix = (y)
> }) +
> +static inline u32 ppc_inst_suffix(struct ppc_inst x)
>  {
> - return sizeof(struct ppc_inst);
> + return x.suffix;
>  }
> 
> -static inline int ppc_inst_opcode(struct ppc_inst x)
> +static inline bool ppc_inst_prefixed(struct ppc_inst x) {
> + return ((ppc_inst_val(x) >> 26) == 1) && ppc_inst_suffix(x) != 0xff;
> +}
> +
> +static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
>  {
> - return x.val >> 26;
> + return ppc_inst_prefix(swab32(ppc_inst_val(x)),
> +swab32(ppc_inst_suffix(x)));
> +}
> +
> +static inline struct ppc_inst ppc_inst_read(const struct ppc_inst *ptr)
> +{
> + u32 val, suffix = 0xff;
> + val = *(u32 *)ptr;
> + if ((val >> 26) == 1)
> + suffix = *((u32 *)ptr + 1);
> + return ppc_inst_prefix(val, suffix);
> +}
> +
> +static inline void ppc_inst_write(struct ppc_inst *ptr, struct ppc_inst x)
> +{
> + if (ppc_inst_prefixed(x)) {
> + *(u32 *)ptr = x.val;
> + *((u32 *)ptr + 1) = x.suffix;
> + } else {
> + *(u32 *)ptr = x.val;
> + }
> +}
> +
> +#else
> +
> +#define ppc_inst(x) ((struct ppc_inst){ .val = x })
> +
> +static inline bool ppc_inst_prefixed(struct ppc_inst x)
> +{
> + return 0;
>  }
> 
>  static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
> @@ -32,14 +76,31 @@ static inline struct ppc_inst ppc_inst_swab(struct
> ppc_inst x) return ppc_inst(swab32(ppc_inst_val(x)));
>  }
> 
> +static inline u32 ppc_inst_val(struct ppc_inst x)
> +{
> + return x.val;
> +}
> +
>  static inline struct ppc_inst ppc_inst_read(const struct ppc_inst *ptr)
>  {
>   return *ptr;
>  }
> 
> +static inline void ppc_inst_write(struct ppc_inst *ptr, struct ppc_inst x)
> +{
> + *ptr = x;
> +}
> +
> +#endif /* __powerpc64__ */
> +
>  static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y)
>  {
>   return !memcmp(&x, &y, sizeof(struct ppc_inst));
>  }

Apologies for not picking this up earlier, I was hoping to get to the bottom 
of the issue I was seeing before you sent out v5. However, the above definition 
of instruction equality does not seem correct because it does not consider the 
case when an instruction is not prefixed - non-prefixed instructions should be 
considered equal if their first 32-bit opcode/value is the same. Something like:

static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y)
{
	if (ppc_inst_prefixed(x) != ppc_inst_prefixed(y))
		return false;
	else if (ppc_inst_prefixed(x))
		return !memcmp(&x, &y, sizeof(struct ppc_inst));
	else
		return x.val == y.val;
}

This was causing failures in ftrace_modify_code() as it would falsely detect 
two non-prefixed instructions as being not equal due to differences in the 
suffix.
 
- Alistair

> +static inline int ppc_inst_len(struct ppc_inst x)
> +{
> + return (ppc_inst_prefixed(x)) ? 8  : 4;
> +}
> +
>  #endif /* _ASM_INST_H */
> diff --git a/arch/powerpc/include/asm/kprobes.h
> b/arch/powerpc/include/asm/kprobes.h index 66b3f2983b22..4fc0e15e23a5
> 100644
> --- a/arch/powerpc/include/asm/kprobes.h
> +++ b/arch/powerpc/include/asm/kprobes.h
> @@ -43,7 +43,7 @@ extern kprobe_opcode_t optprobe_template_ret[];
>  extern kprobe_opcode_t optprobe_template_end[];
> 
>  /* Fixed instruction size for powerpc */
> -#define MAX_INSN_SIZE	1
> +#define MAX_INSN_SIZE	2
>  #define MAX_OPTIMIZED_LENGTH sizeof(kprobe_opcode_t) /* 4 bytes */
>  #define MAX_OPTINSN_SIZE (optprobe_template_end - 
> optprobe_template_entry)
>  #define RELATIVEJUMP_SIZEsizeof(kprobe_opcode_t) /* 4 bytes */
> diff --git a/arch/powerpc/include/asm/uaccess.h
> b/arch/powerpc/include/asm/uaccess.h index c0a35e4586a5..5a3f486ddf02
> 100644
> --- a/arch/powerpc/include/asm/uaccess.h
> +++ b/arch/powerpc/include/asm/uaccess.h
> @@ -105,11 +105,34 @@ static inline int __access_ok(unsigned long addr,
> unsigned long size, #define __put_user_inatomic(x, ptr) \
>   __put_user_nosleep((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
> 
> -#define __get_user_instr(x, ptr) \
> - __get_user_nocheck((x).val, (u32 *)(ptr), sizeof(u32), true)
> +#define 

Re: [PATCH 4/7] powerpc/powernv/pci: Add device to iommu group during dma_dev_setup()

2020-04-06 Thread Alexey Kardashevskiy



On 06/04/2020 13:07, Oliver O'Halloran wrote:
> Historically adding devices to their respective iommu group has been
> handled by the post-init phb fixup for most devices. This was done
> because:
> 
> 1) The IOMMU group is tied to the PE (usually) so we can only setup the
>iommu groups after we've done resource allocation since BAR location
>determines the device's PE, and:
> 2) The sysfs directory for the pci_dev needs to be available since
>iommu_add_device() wants to add an attribute for the iommu group.
> 
> However, since commit 30d87ef8b38d ("powerpc/pci: Fix
> pcibios_setup_device() ordering") both conditions are met when
> hose->ops->dma_dev_setup() is called so there's no real need to do
> this in the fixup.
> 
> Moving the call to iommu_add_device() into pnv_pci_ioda_dma_setup_dev()
> is a nice cleanup since it puts all the per-device IOMMU setup into one
> place. It also results in all (non-nvlink) devices getting their iommu
> group via a common path rather than relying on the bus notifier hack
> in pnv_tce_iommu_bus_notifier() to handle adding VFs and hotplugged
> devices to their group.
> 
> Cc: Alexey Kardashevskiy 
> Signed-off-by: Oliver O'Halloran 


Reviewed-by: Alexey Kardashevskiy 


> ---
>  arch/powerpc/platforms/powernv/npu-dma.c  |  8 
>  arch/powerpc/platforms/powernv/pci-ioda.c | 47 +++
>  arch/powerpc/platforms/powernv/pci.c  | 20 --
>  3 files changed, 21 insertions(+), 54 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
> b/arch/powerpc/platforms/powernv/npu-dma.c
> index 4fbbdfa8b327..df27b8d7e78f 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -469,6 +469,12 @@ struct iommu_table_group 
> *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
>   compound_group->pgsizes = pe->table_group.pgsizes;
>   }
>  
> + /*
> +  * The gpu would have been added to the iommu group that's created
> +  * for the PE. Pull it out now.
> +  */
> + iommu_del_device(&gpdev->dev);
> +
> /*
>   * I'm not sure this is strictly required, but it's probably a good idea
>   * since the table_group for the PE is going to be attached to the
> @@ -478,7 +484,9 @@ struct iommu_table_group 
> *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
>   */
>   iommu_group_put(pe->table_group.group);
>  
> + /* now put the GPU into the compound group */
>   pnv_comp_attach_table_group(npucomp, pe);
> + iommu_add_device(compound_group, &gpdev->dev);
>  
>   return compound_group;
>  }
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index cf0aaef1b8fa..9198b7882b57 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1774,12 +1774,10 @@ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev 
> *pdev)
>   WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
>   pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
>   set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
> - /*
> -  * Note: iommu_add_device() will fail here as
> -  * for physical PE: the device is already added by now;
> -  * for virtual PE: sysfs entries are not ready yet and
> -  * tce_iommu_bus_notifier will add the device to a group later.
> -  */
> +
> + /* PEs with a DMA weight of zero won't have a group */
> + if (pe->table_group.group)
> + iommu_add_device(&pe->table_group, &pdev->dev);
>  }
>  
>  /*
> @@ -2628,39 +2626,20 @@ static void pnv_pci_ioda_setup_iommu_api(void)
>   struct pnv_ioda_pe *pe;
>  
>   /*
> -  * There are 4 types of PEs:
> -  * - PNV_IODA_PE_BUS: a downstream port with an adapter,
> -  *   created from pnv_pci_setup_bridge();
> -  * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it,
> -  *   created from pnv_pci_setup_bridge();
> -  * - PNV_IODA_PE_VF: a SRIOV virtual function,
> -  *   created from pnv_pcibios_sriov_enable();
> -  * - PNV_IODA_PE_DEV: an NPU or OCAPI device,
> -  *   created from pnv_pci_ioda_fixup().
> +  * For non-nvlink devices the IOMMU group is registered when the PE is
> +  * configured and devices are added to the group when the per-device
> +  * DMA setup is run. That's done in hose->ops.dma_dev_setup() which is
> +  * only initialised for "normal" IODA PHBs.
>*
> -  * Normally a PE is represented by an IOMMU group, however for
> -  * devices with side channels the groups need to be more strict.
> +  * For NVLink devices we need to ensure the NVLinks and the GPU end up
> +  * in the same IOMMU group, so that's handled here.
>*/
>   list_for_each_entry(hose, &hose_list, list_node) {
>   phb = hose->private_data;
>  
> - if (phb->type == PNV_PHB_NPU_NVLINK ||
> - phb->type == 

Re: [PATCH 5/7] powerpc/powernv/pci: Delete old iommu recursive iommu setup

2020-04-06 Thread Alexey Kardashevskiy



On 06/04/2020 13:07, Oliver O'Halloran wrote:
> No longer used.
> 
> Signed-off-by: Oliver O'Halloran 


Nit: you could fold it into 4/7.


Reviewed-by: Alexey Kardashevskiy 



> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 32 ---
>  1 file changed, 32 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 9198b7882b57..8b45b8e561e9 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1550,11 +1550,6 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
>  
>  static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  struct pnv_ioda_pe *pe);
> -#ifdef CONFIG_IOMMU_API
> -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
> - struct iommu_table_group *table_group, struct pci_bus *bus);
> -
> -#endif
>  static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>  {
>   struct pci_bus*bus;
> @@ -2590,33 +2585,6 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops 
> = {
>   .release_ownership = pnv_ioda2_release_ownership,
>  };
>  
> -static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe 
> *pe,
> - struct iommu_table_group *table_group,
> - struct pci_bus *bus)
> -{
> - struct pci_dev *dev;
> -
> - list_for_each_entry(dev, &bus->devices, bus_list) {
> - iommu_add_device(table_group, &dev->dev);
> -
> - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
> - pnv_ioda_setup_bus_iommu_group_add_devices(pe,
> - table_group, dev->subordinate);
> - }
> -}
> -
> -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
> - struct iommu_table_group *table_group, struct pci_bus *bus)
> -{
> -
> - if (pe->flags & PNV_IODA_PE_DEV)
> - iommu_add_device(table_group, &pe->pdev->dev);
> -
> - if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus)
> - pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group,
> - bus);
> -}
> -
>  static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
>  
>  static void pnv_pci_ioda_setup_iommu_api(void)
> 

-- 
Alexey


Re: [PATCH 6/7] powerpc/powernv/pci: Move tce size parsing to pci-ioda-tce.c

2020-04-06 Thread Alexey Kardashevskiy



On 06/04/2020 13:07, Oliver O'Halloran wrote:
> Move it in with the rest of the TCE wrangling rather than carting around
> a static prototype in pci-ioda.c
> 
> Signed-off-by: Oliver O'Halloran 



Reviewed-by: Alexey Kardashevskiy 


> ---
>  arch/powerpc/platforms/powernv/pci-ioda-tce.c | 28 +
>  arch/powerpc/platforms/powernv/pci-ioda.c | 30 ---
>  arch/powerpc/platforms/powernv/pci.h  |  2 ++
>  3 files changed, 30 insertions(+), 30 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c 
> b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> index 5dc6847d5f4c..f923359d8afc 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> @@ -17,6 +17,34 @@
>  #include 
>  #include "pci.h"
>  
> +unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
> +{
> + struct pci_controller *hose = phb->hose;
> + struct device_node *dn = hose->dn;
> + unsigned long mask = 0;
> + int i, rc, count;
> + u32 val;
> +
> + count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
> + if (count <= 0) {
> + mask = SZ_4K | SZ_64K;
> + /* Add 16M for POWER8 by default */
> + if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
> + !cpu_has_feature(CPU_FTR_ARCH_300))
> + mask |= SZ_16M | SZ_256M;
> + return mask;
> + }
> +
> + for (i = 0; i < count; i++) {
> + rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
> + i, &val);
> + if (rc == 0)
> + mask |= 1ULL << val;
> + }
> +
> + return mask;
> +}
> +
>  void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
>   void *tce_mem, u64 tce_size,
>   u64 dma_offset, unsigned int page_shift)
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 8b45b8e561e9..c020ade3a846 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -2585,8 +2585,6 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = 
> {
>   .release_ownership = pnv_ioda2_release_ownership,
>  };
>  
> -static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
> -
>  static void pnv_pci_ioda_setup_iommu_api(void)
>  {
>   struct pci_controller *hose;
> @@ -2638,34 +2636,6 @@ static void pnv_pci_ioda_setup_iommu_api(void)
>  static void pnv_pci_ioda_setup_iommu_api(void) { };
>  #endif
>  
> -static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
> -{
> - struct pci_controller *hose = phb->hose;
> - struct device_node *dn = hose->dn;
> - unsigned long mask = 0;
> - int i, rc, count;
> - u32 val;
> -
> - count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
> - if (count <= 0) {
> - mask = SZ_4K | SZ_64K;
> - /* Add 16M for POWER8 by default */
> - if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
> - !cpu_has_feature(CPU_FTR_ARCH_300))
> - mask |= SZ_16M | SZ_256M;
> - return mask;
> - }
> -
> - for (i = 0; i < count; i++) {
> - rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
> - i, &val);
> - if (rc == 0)
> - mask |= 1ULL << val;
> - }
> -
> - return mask;
> -}
> -
>  static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  struct pnv_ioda_pe *pe)
>  {
> diff --git a/arch/powerpc/platforms/powernv/pci.h 
> b/arch/powerpc/platforms/powernv/pci.h
> index d3bbdeab3a32..0c5845a1f05d 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -244,4 +244,6 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table 
> *tbl,
>   void *tce_mem, u64 tce_size,
>   u64 dma_offset, unsigned int page_shift);
>  
> +extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
> +
>  #endif /* __POWERNV_PCI_H */
> 

-- 
Alexey


Re: [PATCH 7/7] powerpc/powernv/npu: Move IOMMU group setup into npu-dma.c

2020-04-06 Thread Alexey Kardashevskiy



On 06/04/2020 13:07, Oliver O'Halloran wrote:
> The NVlink IOMMU group setup is only relevant to NVLink devices so move
> it into the NPU containment zone. This lets us remove some prototypes in
> pci.h and make some function definitions static.
> 
> Signed-off-by: Oliver O'Halloran 



Reviewed-by: Alexey Kardashevskiy 


> ---
>  arch/powerpc/platforms/powernv/npu-dma.c  | 54 +++-
>  arch/powerpc/platforms/powernv/pci-ioda.c | 60 +++
>  arch/powerpc/platforms/powernv/pci.h  |  6 +--
>  3 files changed, 60 insertions(+), 60 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
> b/arch/powerpc/platforms/powernv/npu-dma.c
> index df27b8d7e78f..abeaa533b976 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -15,6 +15,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include "pci.h"
> @@ -425,7 +426,8 @@ static void pnv_comp_attach_table_group(struct npu_comp 
> *npucomp,
>   ++npucomp->pe_num;
>  }
>  
> -struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe 
> *pe)
> +static struct iommu_table_group *
> + pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
>  {
>   struct iommu_table_group *compound_group;
>   struct npu_comp *npucomp;
> @@ -491,7 +493,7 @@ struct iommu_table_group 
> *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
>   return compound_group;
>  }
>  
> -struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
> +static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe 
> *pe)
>  {
>   struct iommu_table_group *table_group;
>   struct npu_comp *npucomp;
> @@ -534,6 +536,54 @@ struct iommu_table_group *pnv_npu_compound_attach(struct 
> pnv_ioda_pe *pe)
>  
>   return table_group;
>  }
> +
> +void pnv_pci_npu_setup_iommu_groups(void)
> +{
> + struct pci_controller *hose;
> + struct pnv_phb *phb;
> + struct pnv_ioda_pe *pe;
> +
> + /*
> +  * For non-nvlink devices the IOMMU group is registered when the PE is
> +  * configured and devices are added to the group when the per-device
> +  * DMA setup is run. That's done in hose->ops.dma_dev_setup() which is
> +  * only initialised for "normal" IODA PHBs.
> +  *
> +  * For NVLink devices we need to ensure the NVLinks and the GPU end up
> +  * in the same IOMMU group, so that's handled here.
> +  */
> + list_for_each_entry(hose, &hose_list, list_node) {
> + phb = hose->private_data;
> +
> + if (phb->type == PNV_PHB_IODA2)
> + list_for_each_entry(pe, >ioda.pe_list, list)
> + pnv_try_setup_npu_table_group(pe);
> + }
> +
> + /*
> +  * Now we have all PHBs discovered, time to add NPU devices to
> +  * the corresponding IOMMU groups.
> +  */
> + list_for_each_entry(hose, &hose_list, list_node) {
> + unsigned long  pgsizes;
> +
> + phb = hose->private_data;
> +
> + if (phb->type != PNV_PHB_NPU_NVLINK)
> + continue;
> +
> + pgsizes = pnv_ioda_parse_tce_sizes(phb);
> + list_for_each_entry(pe, >ioda.pe_list, list) {
> + /*
> +  * IODA2 bridges get this set up from
> +  * pci_controller_ops::setup_bridge but NPU bridges
> +  * do not have this hook defined so we do it here.
> +  */
> + pe->table_group.pgsizes = pgsizes;
> + pnv_npu_compound_attach(pe);
> + }
> + }
> +}
>  #endif /* CONFIG_IOMMU_API */
>  
>  int pnv_npu2_init(struct pci_controller *hose)
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index c020ade3a846..dba0c2c09f61 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1288,7 +1288,7 @@ static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
>   pnv_ioda_setup_npu_PE(pdev);
>  }
>  
> -static void pnv_pci_ioda_setup_PEs(void)
> +static void pnv_pci_ioda_setup_nvlink(void)
>  {
>   struct pci_controller *hose;
>   struct pnv_phb *phb;
> @@ -1312,6 +1312,11 @@ static void pnv_pci_ioda_setup_PEs(void)
>   list_for_each_entry(pe, >ioda.pe_list, list)
>   pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
>   }
> +
> +#ifdef CONFIG_IOMMU_API
> + /* setup iommu groups so we can do nvlink pass-thru */
> + pnv_pci_npu_setup_iommu_groups();
> +#endif
>  }
>  
>  #ifdef CONFIG_PCI_IOV
> @@ -2584,56 +2589,6 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops 
> = {
>   .take_ownership = pnv_ioda2_take_ownership,
>   .release_ownership = pnv_ioda2_release_ownership,
>  };
> -
> -static void pnv_pci_ioda_setup_iommu_api(void)
> -{
> - struct pci_controller *hose;
> 

Re: Make PowerNV IOMMU group setup saner (and fix it for hotpug)

2020-04-06 Thread Alexey Kardashevskiy



On 06/04/2020 13:07, Oliver O'Halloran wrote:
> Currently on PowerNV the IOMMU group of a device is initialised in a
> boot-time fixup which runs after devices are probed. Because this is
> only run at boot time, hotplugged devices do not receive an iommu group
> assignment, which prevents them from being passed through to a guest.
> 
> This series fixes that by moving the point where IOMMU groups are
> registered to when we configure DMA for a PE, and moves the point where
> we add a device to the PE's IOMMU group into the per-device DMA setup
> callback for IODA phbs (pnv_pci_ioda_dma_dev_setup()). This change means
> that we'll do group setup for hotplugged devices and that we can remove
> the hack we have for VFs which are currently added to their group
> via a bus notifier.
> 
> With this change there's no longer any per-device setup that needs to
> run in a fixup for ordinary PCI devices. The exception is, as per usual,
> NVLink devices. For those the GPU and any of its NVLink devices need
> to be in a "compound" IOMMU group which keeps the DMA address spaces
> of each device in sync with its attached devices. As a result that
> setup can only be done when both the NVLink devices and the GPU device
> has been probed, so that setup is still done in the fixup. Sucks, but
> it's still an improvement.
> 
> Boot tested on a witherspoon with 6xGPUs and it didn't crash so it must
> be good.

Thanks for cleaning this up!

I tried this with IOV on P8 (garrison2) and witherspoon+GPU+NPU
passthrough; it works as before. IOMMU group numbers change, but we
never relied on those anyway.



-- 
Alexey


Re: [PATCH v3] powerpc/powernv: add NULL check after kzalloc in opal_add_one_export

2020-04-06 Thread Markus Elfring
 Here needs a NULL check.
>> quite obvious?

I suggest considering further fine-tuning of the wording around
such “obvious” programming items.


>>> I find this change description questionable
>>> (despite of a reasonable patch subject).

I got further development concerns.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?id=a10c9c710f9ecea87b9f4bbb837467893b4bef01#n129

* Were changes mixed for different issues according to the diff code?

* I find it safer here to split specific changes into separate update steps
  for a small patch series.

* Will the addition of the desired null pointer check qualify for
  the specification of the tag “Fixes”?


>>> Will a patch change log be helpful here?
>> I realized I should write some change log, and the change log was 
>> meaningless.

Will any more adjustments happen for the discussed update suggestion
after the third patch version?


> The changelog is fine IMO. The point of a changelog is to tell a
> reader doing git archeology why a change happened and this is
> sufficent for that.

We might stumble on a different understanding for the affected “change logs”.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?id=a10c9c710f9ecea87b9f4bbb837467893b4bef01#n751

Would you like to make the patch evolution a bit easier to follow?

Regards,
Markus


Re: [RFC/PATCH 2/3] pseries/kvm: Clear PSSCR[ESL|EC] bits before guest entry

2020-04-06 Thread Gautham R Shenoy
On Fri, Apr 03, 2020 at 12:20:26PM +1000, Nicholas Piggin wrote:
> Gautham R. Shenoy's on March 31, 2020 10:10 pm:
> > From: "Gautham R. Shenoy" 
> > 
> > ISA v3.0 allows the guest to execute a stop instruction. For this, the
> > PSSCR[ESL|EC] bits need to be cleared by the hypervisor before
> > scheduling in the guest vCPU.
> > 
> > Currently we always schedule in a vCPU with PSSCR[ESL|EC] bits
> > set. This patch changes the behaviour to enter the guest with
> > PSSCR[ESL|EC] bits cleared. This is an RFC patch where we
> > unconditionally clear these bits. Ideally this should be done
> > conditionally on platforms where the guest stop instruction has no
> > bugs (starting with POWER9 DD2.3).
> 
> How will guests know that they can use this facility safely after your
> series? You need both DD2.3 and a patched KVM.


Yes, this is something that isn't addressed in this series (as mentioned
in the cover letter), which is a POC demonstrating that the stop0lite
state works in the guest.

However, to answer your question, this is the scheme that I had in
mind:

OPAL:
   On Procs >= DD2.3 : we publish a dt-cpu-feature "idle-stop-guest"

Hypervisor Kernel:
1. If "idle-stop-guest" dt-cpu-feature is discovered, then
   we set bool enable_guest_stop = true;

2. During KVM guest entry, clear PSSCR[ESL|EC] iff
   enable_guest_stop == true.

3. In kvm_vm_ioctl_check_extension(), for a new capability
   KVM_CAP_STOP, return true iff enable_guest_stop == true (see the
   sketch after this list).

QEMU:
   Check with the hypervisor if KVM_CAP_STOP is present. If so,
   indicate the presence to the guest via device tree.

Guest Kernel:
   Check for the presence of guest stop state support in
   device-tree. If available, enable the stop0lite in the cpuidle
   driver. 
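
A rough sketch of steps 2 and 3 above (illustrative only; the names
enable_guest_stop and KVM_CAP_STOP are proposals, not existing code):

	static bool enable_guest_stop;	/* true if "idle-stop-guest" found */

	/* 2. on guest entry, let the guest use stop */
	static inline u64 guest_psscr(u64 psscr)
	{
		if (enable_guest_stop)
			psscr &= ~(PSSCR_EC | PSSCR_ESL);
		return psscr;
	}

	/* 3. in kvm_vm_ioctl_check_extension():
	 *	case KVM_CAP_STOP: r = enable_guest_stop; break;
	 */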
   

We still have a challenge of migrating a guest which started on a
hypervisor supporting guest stop state to a hypervisor without it.
The target hypervisor should at least have Patch 1 of this series, so
that we don't crash the guest.

> 
> > 
> > Signed-off-by: Gautham R. Shenoy 
> > ---
> >  arch/powerpc/kvm/book3s_hv.c|  2 +-
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 25 +
> >  2 files changed, 14 insertions(+), 13 deletions(-)
> > 
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index cdb7224..36d059a 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -3424,7 +3424,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
> > *vcpu, u64 time_limit,
> > mtspr(SPRN_IC, vcpu->arch.ic);
> > mtspr(SPRN_PID, vcpu->arch.pid);
> >  
> > -   mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
> > +   mtspr(SPRN_PSSCR, (vcpu->arch.psscr  & ~(PSSCR_EC | PSSCR_ESL)) |
> >   (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
> >  
> > mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> > b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index dbc2fec..c2daec3 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -823,6 +823,18 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
> > mtspr   SPRN_PID, r7
> > mtspr   SPRN_WORT, r8
> >  BEGIN_FTR_SECTION
> > +   /* POWER9-only registers */
> > +   ld  r5, VCPU_TID(r4)
> > +   ld  r6, VCPU_PSSCR(r4)
> > +   lbz r8, HSTATE_FAKE_SUSPEND(r13)
> > +   lis r7, (PSSCR_EC | PSSCR_ESL)@h /* Allow guest to call stop */
> > +	andc	r6, r6, r7
> > +   rldimi  r6, r8, PSSCR_FAKE_SUSPEND_LG, 63 - PSSCR_FAKE_SUSPEND_LG
> > +   ld  r7, VCPU_HFSCR(r4)
> > +   mtspr   SPRN_TIDR, r5
> > +   mtspr   SPRN_PSSCR, r6
> > +   mtspr   SPRN_HFSCR, r7
> > +FTR_SECTION_ELSE
> 
> Why did you move these around? Just because the POWER9 section became
> larger than the other?

Yes.

> 
> That's a real wart in the instruction patching implementation, I think
> we can fix it by padding with nops in the macros.
> 
> Can you just add the additional required nops to the top branch without
> changing them around for this patch, so it's easier to see what's going
> on? The end result will be the same after patching. Actually changing
> these around can have a slight unintended consequence in that code that
> runs before features were patched will execute the IF code. Not a
> problem here, but another reason why the instruction patching 
> restriction is annoying.

Sure, I will repost this patch with additional nops instead of
moving them around.

> 
> Thanks,
> Nick
> 
> > /* POWER8-only registers */
> > ld  r5, VCPU_TCSCR(r4)
> > ld  r6, VCPU_ACOP(r4)
> > @@ -833,18 +845,7 @@ BEGIN_FTR_SECTION
> > mtspr   SPRN_CSIGR, r7
> > mtspr   SPRN_TACR, r8
> > nop
> > -FTR_SECTION_ELSE
> > -   /* POWER9-only registers */
> > -   ld  r5, VCPU_TID(r4)
> > -   ld  r6, VCPU_PSSCR(r4)
> > -   lbz r8, HSTATE_FAKE_SUSPEND(r13)
> > -	oris	r6, r6, PSSCR_EC@h  /* This makes stop 

[PATCH v5 00/21] Initial Prefixed Instruction support

2020-04-06 Thread Jordan Niethe
A future revision of the ISA will introduce prefixed instructions. A
prefixed instruction is composed of a 4-byte prefix followed by a
4-byte suffix.

All prefixes have the major opcode 1. A prefix will never be a valid
word instruction. A suffix may be an existing word instruction or a
new instruction.
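
To make the encoding concrete, here is a minimal sketch of the basic
test implied above (the series itself adds richer helpers around this):

	/* a 32-bit word is an instruction prefix iff its major opcode is 1 */
	static inline bool is_prefix_word(u32 word)
	{
		return (word >> 26) == 1;
	}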

This series enables prefixed instructions and extends the instruction
emulation to support them. Then the places where prefixed instructions
might need to be emulated are updated.

v5 is based on feedback from Nick Piggin, Michael Ellerman, Balamuruhan
Suriyakumar and Alistair Popple.
The major changes:
- The ppc instruction type is now a struct
- Series now just based on next
- ppc_inst_masked() dropped
- Space for xmon breakpoints allocated in an assembly file
- "Add prefixed instructions to instruction data type" patch seperated in
  to smaller patches
- Calling convention for create_branch() is changed
- Some places which had not been updated to use the data type are now 
updated

v4 is based on feedback from Nick Piggin, Christophe Leroy and Daniel Axtens.
The major changes:
- Move xmon breakpoints from data section to text section
- Introduce a data type for instructions on powerpc

v3 is based on feedback from Christophe Leroy. The major changes:
- Completely replacing store_inst() with patch_instruction() in
  xmon
- Improve implementation of mread_instr() to not use mread().
- Base the series on top of
  https://patchwork.ozlabs.org/patch/1232619/ as this will effect
  kprobes.
- Some renaming and simplification of conditionals.

v2 incorporates feedback from Daniel Axtens and and Balamuruhan
S. The major changes are:
- Squashing together all commits about SRR1 bits
- Squashing all commits for supporting prefixed load stores
- Changing abbreviated references to sufx/prfx -> suffix/prefix
- Introducing macros for returning the length of an instruction
- Removing sign extension flag from pstd/pld in sstep.c
- Dropping patch "powerpc/fault: Use analyse_instr() to check for
  store with updates to sp" from the series; it did not really fit
  with prefixed enablement in the first place and, as reported by Greg
  Kurz, did not work correctly.


Alistair Popple (1):
  powerpc: Enable Prefixed Instructions

Jordan Niethe (20):
  powerpc/xmon: Remove store_inst() for patch_instruction()
  powerpc/xmon: Move out-of-line instructions to text section
  powerpc: Change calling convention for create_branch() et. al.
  powerpc: Use a macro for creating instructions from u32s
  powerpc: Use a function for getting the instruction op code
  powerpc: Use an accessor for instructions
  powerpc: Use a function for byte swapping instructions
  powerpc: Introduce functions for instruction equality
  powerpc: Use a datatype for instructions
  powerpc: Use a function for reading instructions
  powerpc: Define and use __get_user_instr{,inatomic}()
  powerpc: Introduce a function for reporting instruction length
  powerpc/xmon: Use a function for reading instructions
  powerpc/xmon: Move insertion of breakpoint for xol'ing
  powerpc: Make test_translate_branch() independent of instruction
length
  powerpc: Define new SRR1 bits for a future ISA version
  powerpc64: Add prefixed instructions to instruction data type
  powerpc: Support prefixed instructions in alignment handler
  powerpc sstep: Add support for prefixed load/stores
  powerpc sstep: Add support for prefixed fixed-point arithmetic

 arch/powerpc/include/asm/code-patching.h |  37 +-
 arch/powerpc/include/asm/inst.h  | 106 ++
 arch/powerpc/include/asm/kprobes.h   |   2 +-
 arch/powerpc/include/asm/reg.h   |   7 +-
 arch/powerpc/include/asm/sstep.h |  15 +-
 arch/powerpc/include/asm/uaccess.h   |  28 ++
 arch/powerpc/include/asm/uprobes.h   |   7 +-
 arch/powerpc/kernel/align.c  |  13 +-
 arch/powerpc/kernel/epapr_paravirt.c |   5 +-
 arch/powerpc/kernel/hw_breakpoint.c  |   5 +-
 arch/powerpc/kernel/jump_label.c |   5 +-
 arch/powerpc/kernel/kgdb.c   |   9 +-
 arch/powerpc/kernel/kprobes.c|  24 +-
 arch/powerpc/kernel/mce_power.c  |   5 +-
 arch/powerpc/kernel/module_64.c  |   3 +-
 arch/powerpc/kernel/optprobes.c  |  91 +++--
 arch/powerpc/kernel/optprobes_head.S |   3 +
 arch/powerpc/kernel/security.c   |   9 +-
 arch/powerpc/kernel/setup_32.c   |   4 +-
 arch/powerpc/kernel/trace/ftrace.c   | 190 ++
 arch/powerpc/kernel/traps.c  |  20 +-
 arch/powerpc/kernel/uprobes.c|   3 +-
 arch/powerpc/kernel/vecemu.c |  20 +-
 arch/powerpc/kvm/book3s_hv_nested.c  |   2 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  |   2 +-
 arch/powerpc/kvm/emulate_loadstore.c |   2 +-
 arch/powerpc/lib/code-patching.c | 289 +++---
 arch/powerpc/lib/feature-fixups.c|  69 

[PATCH v5 01/21] powerpc/xmon: Remove store_inst() for patch_instruction()

2020-04-06 Thread Jordan Niethe
For modifying instructions in xmon, patch_instruction() can serve the
same role that store_inst() is performing with the advantage of not
being specific to xmon. In some places patch_instruction() is already
being used, followed by store_inst(). In these cases just remove the
store_inst(). Otherwise replace store_inst() with patch_instruction().

Reviewed-by: Nicholas Piggin 
Signed-off-by: Jordan Niethe 
---
v4: Read into a local variable
---
 arch/powerpc/xmon/xmon.c | 18 +-
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index e8c84d265602..02e3bd62cab4 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -325,11 +325,6 @@ static inline void sync(void)
asm volatile("sync; isync");
 }
 
-static inline void store_inst(void *p)
-{
-   asm volatile ("dcbst 0,%0; sync; icbi 0,%0; isync" : : "r" (p));
-}
-
 static inline void cflush(void *p)
 {
asm volatile ("dcbf 0,%0; icbi 0,%0" : : "r" (p));
@@ -881,8 +876,7 @@ static struct bpt *new_breakpoint(unsigned long a)
	for (bp = bpts; bp < &bpts[NBPTS]; ++bp) {
		if (!bp->enabled && atomic_read(&bp->ref_count) == 0) {
			bp->address = a;
-			bp->instr[1] = bpinstr;
-			store_inst(&bp->instr[1]);
+			patch_instruction(&bp->instr[1], bpinstr);
return bp;
}
}
@@ -894,25 +888,26 @@ static struct bpt *new_breakpoint(unsigned long a)
 static void insert_bpts(void)
 {
int i;
+   unsigned int instr;
struct bpt *bp;
 
bp = bpts;
for (i = 0; i < NBPTS; ++i, ++bp) {
if ((bp->enabled & (BP_TRAP|BP_CIABR)) == 0)
continue;
-   if (mread(bp->address, &bp->instr[0], 4) != 4) {
+   if (mread(bp->address, &instr, 4) != 4) {
printf("Couldn't read instruction at %lx, "
   "disabling breakpoint there\n", bp->address);
bp->enabled = 0;
continue;
}
-   if (IS_MTMSRD(bp->instr[0]) || IS_RFID(bp->instr[0])) {
+   if (IS_MTMSRD(instr) || IS_RFID(instr)) {
printf("Breakpoint at %lx is on an mtmsrd or rfid "
   "instruction, disabling it\n", bp->address);
bp->enabled = 0;
continue;
}
-   store_inst(&bp->instr[0]);
+   patch_instruction(bp->instr, instr);
if (bp->enabled & BP_CIABR)
continue;
if (patch_instruction((unsigned int *)bp->address,
@@ -922,7 +917,6 @@ static void insert_bpts(void)
bp->enabled &= ~BP_TRAP;
continue;
}
-   store_inst((void *)bp->address);
}
 }
 
@@ -957,8 +951,6 @@ static void remove_bpts(void)
(unsigned int *)bp->address, bp->instr[0]) != 0)
printf("Couldn't remove breakpoint at %lx\n",
   bp->address);
-   else
-   store_inst((void *)bp->address);
}
 }
 
-- 
2.17.1
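
The replacement is safe because patch_instruction() already performs
the instruction-cache synchronisation that store_inst() open-coded. For
reference, a commented sketch of the coherency sequence removed above;
flush_insn() is a hypothetical name used only for this illustration:

/*
 * Make a freshly written instruction visible to the instruction fetch
 * unit. This is the sequence store_inst() performed, and which
 * patch_instruction() must likewise carry out after its store.
 */
static inline void flush_insn(void *p)
{
	asm volatile("dcbst 0,%0\n\t"	/* write the modified data cache line back */
		     "sync\n\t"		/* order the writeback before the invalidate */
		     "icbi 0,%0\n\t"	/* invalidate the stale instruction cache line */
		     "isync"		/* discard instructions already fetched */
		     : : "r" (p) : "memory");
}

Keeping a single copy of this sequence behind patch_instruction() means
xmon no longer has to get the ordering right itself.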



[PATCH v5 03/21] powerpc: Change calling convention for create_branch() et. al.

2020-04-06 Thread Jordan Niethe
create_branch(), create_cond_branch() and translate_branch() return the
instruction that they create, or return 0 to signal an error. Separate
these concerns in preparation for an instruction type that is not just
an unsigned int. Write the created instruction through a pointer passed
as the first parameter to the function, and use a non-zero return value
to signify an error. (A before/after sketch of a call site follows the
patch.)

Signed-off-by: Jordan Niethe 
---
v5: New to series
---
 arch/powerpc/include/asm/code-patching.h |  12 +-
 arch/powerpc/kernel/optprobes.c  |  24 ++--
 arch/powerpc/kernel/setup_32.c   |   2 +-
 arch/powerpc/kernel/trace/ftrace.c   |  24 ++--
 arch/powerpc/lib/code-patching.c | 133 +--
 arch/powerpc/lib/feature-fixups.c|   5 +-
 6 files changed, 117 insertions(+), 83 deletions(-)

diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 898b54262881..351dda7215b6 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -22,10 +22,10 @@
#define BRANCH_ABSOLUTE 0x2
 
 bool is_offset_in_branch_range(long offset);
-unsigned int create_branch(const unsigned int *addr,
-  unsigned long target, int flags);
-unsigned int create_cond_branch(const unsigned int *addr,
-   unsigned long target, int flags);
+int create_branch(unsigned int *instr, const unsigned int *addr,
+ unsigned long target, int flags);
+int create_cond_branch(unsigned int *instr, const unsigned int *addr,
+  unsigned long target, int flags);
 int patch_branch(unsigned int *addr, unsigned long target, int flags);
 int patch_instruction(unsigned int *addr, unsigned int instr);
 int raw_patch_instruction(unsigned int *addr, unsigned int instr);
@@ -60,8 +60,8 @@ int instr_is_relative_branch(unsigned int instr);
 int instr_is_relative_link_branch(unsigned int instr);
 int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr);
 unsigned long branch_target(const unsigned int *instr);
-unsigned int translate_branch(const unsigned int *dest,
- const unsigned int *src);
+int translate_branch(unsigned int *instr, const unsigned int *dest,
+const unsigned int *src);
 extern bool is_conditional_branch(unsigned int instr);
 #ifdef CONFIG_PPC_BOOK3E_64
 void __patch_exception(int exc, unsigned long addr);
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 024f7aad1952..445b3dad82dc 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -251,15 +251,17 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
goto error;
}
 
-   branch_op_callback = create_branch((unsigned int *)buff + TMPL_CALL_HDLR_IDX,
-   (unsigned long)op_callback_addr,
-   BRANCH_SET_LINK);
+   rc = create_branch(&branch_op_callback,
+  (unsigned int *)buff + TMPL_CALL_HDLR_IDX,
+  (unsigned long)op_callback_addr,
+  BRANCH_SET_LINK);
 
-   branch_emulate_step = create_branch((unsigned int *)buff + TMPL_EMULATE_IDX,
-   (unsigned long)emulate_step_addr,
-   BRANCH_SET_LINK);
+   rc |= create_branch(&branch_emulate_step,
+   (unsigned int *)buff + TMPL_EMULATE_IDX,
+   (unsigned long)emulate_step_addr,
+   BRANCH_SET_LINK);
 
-   if (!branch_op_callback || !branch_emulate_step)
+   if (rc)
goto error;
 
patch_instruction(buff + TMPL_CALL_HDLR_IDX, branch_op_callback);
@@ -305,6 +307,7 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op)
 
 void arch_optimize_kprobes(struct list_head *oplist)
 {
+   unsigned int instr;
struct optimized_kprobe *op;
struct optimized_kprobe *tmp;
 
@@ -315,9 +318,10 @@ void arch_optimize_kprobes(struct list_head *oplist)
 */
memcpy(op->optinsn.copied_insn, op->kp.addr,
   RELATIVEJUMP_SIZE);
-   patch_instruction(op->kp.addr,
-   create_branch((unsigned int *)op->kp.addr,
- (unsigned long)op->optinsn.insn, 0));
+   create_branch(&instr,
+ (unsigned int *)op->kp.addr,
+ (unsigned long)op->optinsn.insn, 0);
+   patch_instruction(op->kp.addr, instr);
list_del_init(&op->list);
}
 }
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 5b49b26eb154..c1bdd462c5c0 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -88,7 +88,7 @@ notrace void __init machine_init(u64 

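The conversion at each call site is mechanical. A minimal sketch with a
hypothetical helper; patch_branch_site() is an invented name, while
create_branch() and patch_instruction() follow the prototypes from the
code-patching.h hunk above:

#include <asm/code-patching.h>

/*
 * Old style:  instr = create_branch(addr, target, 0);  with 0 doubling
 * as the error value. New style: the created instruction comes back
 * through the first parameter and the return value carries only
 * success or failure.
 */
static int patch_branch_site(unsigned int *addr, unsigned long target)
{
	unsigned int instr;
	int err;

	err = create_branch(&instr, addr, target, 0);
	if (err)		/* e.g. target not within branch range */
		return err;

	return patch_instruction(addr, instr);
}

The non-zero return code keeps error reporting intact even once the
instruction type is no longer a plain unsigned int.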