[PATCH v5 12/21] powerpc/64: add context tracking to asynchronous interrupts

2021-01-12 Thread Nicholas Piggin
Previously, context tracking was not done for asynchronous interrupts
(those that run in interrupt context). If such an interrupt caused a
reschedule on exit, the scheduling functions (schedule_user,
preempt_schedule_irq) would call exception_enter/exit to fix this up
and exit user context.

This is a hack we would like to get away from, so do context tracking
for asynchronous interrupts too.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/interrupt.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 7fab54a14152..7c40ce78c4bb 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -42,10 +42,12 @@ static inline void interrupt_exit_prepare(struct pt_regs 
*regs, struct interrupt
 
 static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct 
interrupt_state *state)
 {
+   interrupt_enter_prepare(regs, state);
 }
 
 static inline void interrupt_async_exit_prepare(struct pt_regs *regs, struct 
interrupt_state *state)
 {
+   interrupt_exit_prepare(regs, state);
 }
 
 struct interrupt_nmi_state {
-- 
2.23.0



[PATCH v5 11/21] powerpc/64: context tracking move to interrupt wrappers

2021-01-12 Thread Nicholas Piggin
This moves exception_enter/exit calls to wrapper functions for
synchronous interrupts. More interrupt handlers are covered by
this than previously.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/interrupt.h  |  9 
 arch/powerpc/kernel/traps.c   | 74 ++-
 arch/powerpc/mm/book3s64/hash_utils.c |  3 --
 arch/powerpc/mm/fault.c   |  9 +---
 4 files changed, 27 insertions(+), 68 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index dfa846ebae43..7fab54a14152 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -7,10 +7,16 @@
 #include 
 
 struct interrupt_state {
+#ifdef CONFIG_PPC64
+   enum ctx_state ctx_state;
+#endif
 };
 
 static inline void interrupt_enter_prepare(struct pt_regs *regs, struct 
interrupt_state *state)
 {
+#ifdef CONFIG_PPC64
+   state->ctx_state = exception_enter();
+#endif
 }
 
 /*
@@ -29,6 +35,9 @@ static inline void interrupt_enter_prepare(struct pt_regs 
*regs, struct interrup
  */
 static inline void interrupt_exit_prepare(struct pt_regs *regs, struct 
interrupt_state *state)
 {
+#ifdef CONFIG_PPC64
+   exception_exit(state->ctx_state);
+#endif
 }
 
 static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct 
interrupt_state *state)
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 0b712c40272b..b2c53883580b 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1077,41 +1077,28 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(handle_hmi_exception)
 
 DEFINE_INTERRUPT_HANDLER(unknown_exception)
 {
-   enum ctx_state prev_state = exception_enter();
-
printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n",
   regs->nip, regs->msr, regs->trap);
 
_exception(SIGTRAP, regs, TRAP_UNK, 0);
-
-   exception_exit(prev_state);
 }
 
 DEFINE_INTERRUPT_HANDLER_ASYNC(unknown_async_exception)
 {
-   enum ctx_state prev_state = exception_enter();
-
printk("Bad trap at PC: %lx, SR: %lx, vector=%lx\n",
   regs->nip, regs->msr, regs->trap);
 
_exception(SIGTRAP, regs, TRAP_UNK, 0);
-
-   exception_exit(prev_state);
 }
 
 DEFINE_INTERRUPT_HANDLER(instruction_breakpoint_exception)
 {
-   enum ctx_state prev_state = exception_enter();
-
if (notify_die(DIE_IABR_MATCH, "iabr_match", regs, 5,
5, SIGTRAP) == NOTIFY_STOP)
-   goto bail;
+   return;
if (debugger_iabr_match(regs))
-   goto bail;
+   return;
_exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip);
-
-bail:
-   exception_exit(prev_state);
 }
 
 DEFINE_INTERRUPT_HANDLER(RunModeException)
@@ -1121,8 +1108,6 @@ DEFINE_INTERRUPT_HANDLER(RunModeException)
 
 DEFINE_INTERRUPT_HANDLER(single_step_exception)
 {
-   enum ctx_state prev_state = exception_enter();
-
clear_single_step(regs);
clear_br_trace(regs);
 
@@ -1131,14 +1116,11 @@ DEFINE_INTERRUPT_HANDLER(single_step_exception)
 
if (notify_die(DIE_SSTEP, "single_step", regs, 5,
5, SIGTRAP) == NOTIFY_STOP)
-   goto bail;
+   return;
if (debugger_sstep(regs))
-   goto bail;
+   return;
 
_exception(SIGTRAP, regs, TRAP_TRACE, regs->nip);
-
-bail:
-   exception_exit(prev_state);
 }
 NOKPROBE_SYMBOL(single_step_exception);
 
@@ -1466,7 +1448,6 @@ static inline int emulate_math(struct pt_regs *regs) { 
return -1; }
 
 DEFINE_INTERRUPT_HANDLER(program_check_exception)
 {
-   enum ctx_state prev_state = exception_enter();
unsigned int reason = get_reason(regs);
 
/* We can now get here via a FP Unavailable exception if the core
@@ -1475,22 +1456,22 @@ DEFINE_INTERRUPT_HANDLER(program_check_exception)
if (reason & REASON_FP) {
/* IEEE FP exception */
parse_fpe(regs);
-   goto bail;
+   return;
}
if (reason & REASON_TRAP) {
unsigned long bugaddr;
/* Debugger is first in line to stop recursive faults in
 * rcu_lock, notify_die, or atomic_notifier_call_chain */
if (debugger_bpt(regs))
-   goto bail;
+   return;
 
if (kprobe_handler(regs))
-   goto bail;
+   return;
 
/* trap exception */
if (notify_die(DIE_BPT, "breakpoint", regs, 5, 5, SIGTRAP)
== NOTIFY_STOP)
-   goto bail;
+   return;
 
bugaddr = regs->nip;
/*
@@ -1502,10 +1483,10 @@ DEFINE_INTERRUPT_HANDLER(program_check_exception)
if (!(regs->msr & MSR_PR) &&  /* not user-mode */

[PATCH v5 10/21] powerpc/64s/hash: improve context tracking of hash faults

2021-01-12 Thread Nicholas Piggin
This moves the 64s/hash context tracking from hash_page_mm() to
__do_hash_fault(), so it's no longer called by OCXL / SPU
accelerators, which was certainly the wrong thing to be doing,
because those callers are not low level interrupt handlers, so
they should already be in a tracked kernel context.

Then remain in kernel context for the duration of the fault,
rather than enter/exit for the hash fault then enter/exit for
the page fault, which is pointless.

Even still, calling exception_enter/exit in __do_hash_fault seems
questionable because that's touching per-cpu variables, tracing,
etc., which might have been interrupted by this hash fault or
themselves cause hash faults. But maybe I miss something because
hash_page_mm very deliberately calls trace_hash_fault too, for
example. So for now go with it, it's no worse than before, in this
regard.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/bug.h|  1 +
 arch/powerpc/mm/book3s64/hash_utils.c |  7 ---
 arch/powerpc/mm/fault.c   | 29 ++-
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index 4220789b9a97..e048c820ca02 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -112,6 +112,7 @@
 
 struct pt_regs;
 long do_page_fault(struct pt_regs *);
+long hash__do_page_fault(struct pt_regs *);
 void bad_page_fault(struct pt_regs *, int);
 void __bad_page_fault(struct pt_regs *regs, int sig);
 extern void _exception(int, struct pt_regs *, int, unsigned long);
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 453afb9ae9b4..801d5e94cd2b 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1289,7 +1289,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 unsigned long flags)
 {
bool is_thp;
-   enum ctx_state prev_state = exception_enter();
pgd_t *pgdir;
unsigned long vsid;
pte_t *ptep;
@@ -1491,7 +1490,6 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
DBG_LOW(" -> rc=%d\n", rc);
 
 bail:
-   exception_exit(prev_state);
return rc;
 }
 EXPORT_SYMBOL_GPL(hash_page_mm);
@@ -1515,6 +1513,7 @@ EXPORT_SYMBOL_GPL(hash_page);
 
 DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault)
 {
+   enum ctx_state prev_state = exception_enter();
unsigned long ea = regs->dar;
unsigned long dsisr = regs->dsisr;
unsigned long access = _PAGE_PRESENT | _PAGE_READ;
@@ -1563,9 +1562,11 @@ DEFINE_INTERRUPT_HANDLER_RET(__do_hash_fault)
err = 0;
 
} else if (err) {
-   err = do_page_fault(regs);
+   err = hash__do_page_fault(regs);
}
 
+   exception_exit(prev_state);
+
return err;
 }
 
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index e971712c95c6..495edce9dc51 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -391,7 +391,7 @@ static void sanity_check_fault(bool is_write, bool is_user,
  * The return value is 0 if the fault was handled, or the signal
  * number if this is a kernel fault that can't be handled here.
  */
-static int __do_page_fault(struct pt_regs *regs, unsigned long address,
+static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
   unsigned long error_code)
 {
struct vm_area_struct * vma;
@@ -544,16 +544,15 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
 
return 0;
 }
-NOKPROBE_SYMBOL(__do_page_fault);
+NOKPROBE_SYMBOL(___do_page_fault);
 
-DEFINE_INTERRUPT_HANDLER_RET(do_page_fault)
+static long __do_page_fault(struct pt_regs *regs)
 {
-   enum ctx_state prev_state = exception_enter();
unsigned long address = regs->dar;
unsigned long error_code = regs->dsisr;
long err;
 
-   err = __do_page_fault(regs, address, error_code);
+   err = ___do_page_fault(regs, address, error_code);
if (unlikely(err)) {
const struct exception_table_entry *entry;
 
@@ -580,12 +579,32 @@ DEFINE_INTERRUPT_HANDLER_RET(do_page_fault)
}
 #endif
 
+   return err;
+}
+NOKPROBE_SYMBOL(__do_page_fault);
+
+DEFINE_INTERRUPT_HANDLER_RET(do_page_fault)
+{
+   enum ctx_state prev_state = exception_enter();
+   long err;
+
+   err = __do_page_fault(regs);
+
exception_exit(prev_state);
 
return err;
 }
 NOKPROBE_SYMBOL(do_page_fault);
 
+#ifdef CONFIG_PPC_BOOK3S_64
+/* Same as do_page_fault but interrupt entry has already run in do_hash_fault 
*/
+long hash__do_page_fault(struct pt_regs *regs)
+{
+   return __do_page_fault(regs);
+}
+NOKPROBE_SYMBOL(hash__do_page_fault);
+#endif
+
 /*
  * bad_page_fault is called when we have a bad access from the kernel.
  * It is called from the DSI and ISI handlers in head.S and from some
-- 
2.23.0



[PATCH v5 09/21] powerpc/64: context tracking remove _TIF_NOHZ

2021-01-12 Thread Nicholas Piggin
Add context tracking to the system call handler explicitly, and remove
_TIF_NOHZ.

This saves 35 cycles on gettid system call cost on POWER9 with a
CONFIG_NOHZ_FULL kernel.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/Kconfig   |  1 -
 arch/powerpc/include/asm/thread_info.h |  4 +---
 arch/powerpc/kernel/ptrace/ptrace.c|  4 
 arch/powerpc/kernel/signal.c   |  4 
 arch/powerpc/kernel/syscall_64.c   | 10 ++
 5 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 107bb4319e0e..28d5a1b1510f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -196,7 +196,6 @@ config PPC
select HAVE_STACKPROTECTOR  if PPC64 && 
$(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13)
select HAVE_STACKPROTECTOR  if PPC32 && 
$(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
select HAVE_CONTEXT_TRACKINGif PPC64
-   select HAVE_TIF_NOHZif PPC64
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_DYNAMIC_FTRACE
diff --git a/arch/powerpc/include/asm/thread_info.h 
b/arch/powerpc/include/asm/thread_info.h
index 3d8a47af7a25..386d576673a1 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -94,7 +94,6 @@ void arch_setup_new_exec(void);
 #define TIF_PATCH_PENDING  6   /* pending live patching update */
 #define TIF_SYSCALL_AUDIT  7   /* syscall auditing active */
 #define TIF_SINGLESTEP 8   /* singlestepping active */
-#define TIF_NOHZ   9   /* in adaptive nohz mode */
 #define TIF_SECCOMP10  /* secure computing */
 #define TIF_RESTOREALL 11  /* Restore all regs (implies NOERROR) */
 #define TIF_NOERROR12  /* Force successful syscall return */
@@ -128,11 +127,10 @@ void arch_setup_new_exec(void);
 #define _TIF_UPROBE(1result = r3;
@@ -258,8 +265,11 @@ notrace unsigned long syscall_exit_prepare(unsigned long 
r3,
}
}
 
+   user_enter_irqoff();
+
/* scv need not set RI=0 because SRRs are not used */
if (unlikely(!prep_irq_for_enabled_exit(!scv))) {
+   user_exit_irqoff();
local_irq_enable();
goto again;
}
-- 
2.23.0



[PATCH v5 08/21] powerpc: add interrupt_cond_local_irq_enable helper

2021-01-12 Thread Nicholas Piggin
Add a simple helper for synchronous (i.e., process-context) interrupt
handlers to enable interrupts if the interrupt was taken in an
interrupts-enabled context.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/interrupt.h |  7 +++
 arch/powerpc/kernel/traps.c  | 24 +++-
 arch/powerpc/mm/fault.c  |  4 +---
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 7c72c91c21ce..dfa846ebae43 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -3,6 +3,7 @@
 #define _ASM_POWERPC_INTERRUPT_H
 
 #include 
+#include 
 #include 
 
 struct interrupt_state {
@@ -281,4 +282,10 @@ DECLARE_INTERRUPT_HANDLER_ASYNC(unknown_async_exception);
 void replay_system_reset(void);
 void replay_soft_interrupts(void);
 
+static inline void interrupt_cond_local_irq_enable(struct pt_regs *regs)
+{
+   if (!arch_irq_disabled_regs(regs))
+   local_irq_enable();
+}
+
 #endif /* _ASM_POWERPC_INTERRUPT_H */
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index f4462b481248..0b712c40272b 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -343,8 +343,8 @@ static bool exception_common(int signr, struct pt_regs 
*regs, int code,
 
show_signal_msg(signr, regs, code, addr);
 
-   if (arch_irqs_disabled() && !arch_irq_disabled_regs(regs))
-   local_irq_enable();
+   if (arch_irqs_disabled())
+   interrupt_cond_local_irq_enable(regs);
 
current->thread.trap_nr = code;
 
@@ -1546,9 +1546,7 @@ DEFINE_INTERRUPT_HANDLER(program_check_exception)
if (!user_mode(regs))
goto sigill;
 
-   /* We restore the interrupt state now */
-   if (!arch_irq_disabled_regs(regs))
-   local_irq_enable();
+   interrupt_cond_local_irq_enable(regs);
 
/* (reason & REASON_ILLEGAL) would be the obvious thing here,
 * but there seems to be a hardware bug on the 405GP (RevD)
@@ -1602,9 +1600,7 @@ DEFINE_INTERRUPT_HANDLER(alignment_exception)
int sig, code, fixed = 0;
unsigned long  reason;
 
-   /* We restore the interrupt state now */
-   if (!arch_irq_disabled_regs(regs))
-   local_irq_enable();
+   interrupt_cond_local_irq_enable(regs);
 
reason = get_reason(regs);
 
@@ -1765,9 +1761,7 @@ DEFINE_INTERRUPT_HANDLER(facility_unavailable_exception)
die("Unexpected facility unavailable exception", regs, SIGABRT);
}
 
-   /* We restore the interrupt state now */
-   if (!arch_irq_disabled_regs(regs))
-   local_irq_enable();
+   interrupt_cond_local_irq_enable(regs);
 
if (status == FSCR_DSCR_LG) {
/*
@@ -2147,9 +2141,7 @@ void SPEFloatingPointException(struct pt_regs *regs)
int code = FPE_FLTUNK;
int err;
 
-   /* We restore the interrupt state now */
-   if (!arch_irq_disabled_regs(regs))
-   local_irq_enable();
+   interrupt_cond_local_irq_enable(regs);
 
flush_spe_to_thread(current);
 
@@ -2196,9 +2188,7 @@ void SPEFloatingPointRoundException(struct pt_regs *regs)
extern int speround_handler(struct pt_regs *regs);
int err;
 
-   /* We restore the interrupt state now */
-   if (!arch_irq_disabled_regs(regs))
-   local_irq_enable();
+   interrupt_cond_local_irq_enable(regs);
 
preempt_disable();
if (regs->msr & MSR_SPE)
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 9e1cd74ebb13..e971712c95c6 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -441,9 +441,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
return bad_area_nosemaphore(regs, address);
}
 
-   /* We restore the interrupt state now */
-   if (!arch_irq_disabled_regs(regs))
-   local_irq_enable();
+   interrupt_cond_local_irq_enable(regs);
 
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-- 
2.23.0



[PATCH v5 07/21] powerpc: add interrupt wrapper entry / exit stub functions

2021-01-12 Thread Nicholas Piggin
These will be used by subsequent patches.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/interrupt.h | 66 
 1 file changed, 66 insertions(+)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 60363e5eeffa..7c72c91c21ce 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -5,6 +5,50 @@
 #include 
 #include 
 
+struct interrupt_state {
+};
+
+static inline void interrupt_enter_prepare(struct pt_regs *regs, struct 
interrupt_state *state)
+{
+}
+
+/*
+ * Care should be taken to note that interrupt_exit_prepare and
+ * interrupt_async_exit_prepare do not necessarily return immediately to
+ * regs context (e.g., if regs is usermode, we don't necessarily return to
+ * user mode). Other interrupts might be taken between here and return,
+ * context switch / preemption may occur in the exit path after this, or a
+ * signal may be delivered, etc.
+ *
+ * The real interrupt exit code is platform specific, e.g.,
+ * interrupt_exit_user_prepare / interrupt_exit_kernel_prepare for 64s.
+ *
+ * However interrupt_nmi_exit_prepare does return directly to regs, because
+ * NMIs do not do "exit work" or replay soft-masked interrupts.
+ */
+static inline void interrupt_exit_prepare(struct pt_regs *regs, struct 
interrupt_state *state)
+{
+}
+
+static inline void interrupt_async_enter_prepare(struct pt_regs *regs, struct 
interrupt_state *state)
+{
+}
+
+static inline void interrupt_async_exit_prepare(struct pt_regs *regs, struct 
interrupt_state *state)
+{
+}
+
+struct interrupt_nmi_state {
+};
+
+static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct 
interrupt_nmi_state *state)
+{
+}
+
+static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct 
interrupt_nmi_state *state)
+{
+}
+
 /**
  * DECLARE_INTERRUPT_HANDLER_RAW - Declare raw interrupt handler function
  * @func:  Function name of the entry point
@@ -71,7 +115,13 @@ static __always_inline void ##func(struct pt_regs 
*regs);   \
\
 __visible noinstr void func(struct pt_regs *regs)  \
 {  \
+   struct interrupt_state state;   \
+   \
+   interrupt_enter_prepare(regs, );  \
+   \
##func (regs);  \
+   \
+   interrupt_exit_prepare(regs, );   \
 }  \
\
 static __always_inline void ##func(struct pt_regs *regs)
@@ -99,10 +149,15 @@ static __always_inline long ##func(struct pt_regs 
*regs);  \
\
 __visible noinstr long func(struct pt_regs *regs)  \
 {  \
+   struct interrupt_state state;   \
long ret;   \
\
+   interrupt_enter_prepare(regs, );  \
+   \
ret = ##func (regs);\
\
+   interrupt_exit_prepare(regs, );   \
+   \
return ret; \
 }  \
\
@@ -129,7 +184,13 @@ static __always_inline void ##func(struct pt_regs 
*regs);  \
\
 __visible noinstr void func(struct pt_regs *regs)  \
 {  \
+   struct interrupt_state state;   \
+   \
+   interrupt_async_enter_prepare(regs, );\
+   \
##func (regs);  \
+   \
+   

[PATCH v5 06/21] powerpc: interrupt handler wrapper functions

2021-01-12 Thread Nicholas Piggin
Add wrapper functions (derived from x86 macros) for interrupt handler
functions. This allows interrupt entry code to be written in C.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/asm-prototypes.h |  29 ---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |   1 -
 arch/powerpc/include/asm/hw_irq.h |   9 -
 arch/powerpc/include/asm/interrupt.h  | 218 ++
 arch/powerpc/include/asm/time.h   |   2 +
 arch/powerpc/kernel/dbell.c   |  12 +-
 arch/powerpc/kernel/exceptions-64s.S  |   7 +-
 arch/powerpc/kernel/head_book3s_32.S  |   6 +-
 arch/powerpc/kernel/irq.c |   3 +-
 arch/powerpc/kernel/mce.c |   5 +-
 arch/powerpc/kernel/syscall_64.c  |   1 +
 arch/powerpc/kernel/tau_6xx.c |   2 +-
 arch/powerpc/kernel/time.c|   3 +-
 arch/powerpc/kernel/traps.c   |  90 +---
 arch/powerpc/kernel/watchdog.c|   7 +-
 arch/powerpc/kvm/book3s_hv.c  |   1 +
 arch/powerpc/kvm/book3s_hv_builtin.c  |   1 +
 arch/powerpc/kvm/booke.c  |   1 +
 arch/powerpc/mm/book3s64/hash_utils.c |  57 +++--
 arch/powerpc/mm/book3s64/slb.c|  29 +--
 arch/powerpc/mm/fault.c   |  15 +-
 arch/powerpc/platforms/powernv/idle.c |   1 +
 22 files changed, 374 insertions(+), 126 deletions(-)
 create mode 100644 arch/powerpc/include/asm/interrupt.h

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index 22c9d08fa3a4..939f3c94c8f3 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -56,35 +56,6 @@ int exit_vmx_usercopy(void);
 int enter_vmx_ops(void);
 void *exit_vmx_ops(void *dest);
 
-/* Traps */
-long machine_check_early(struct pt_regs *regs);
-long hmi_exception_realmode(struct pt_regs *regs);
-void SMIException(struct pt_regs *regs);
-void handle_hmi_exception(struct pt_regs *regs);
-void instruction_breakpoint_exception(struct pt_regs *regs);
-void RunModeException(struct pt_regs *regs);
-void single_step_exception(struct pt_regs *regs);
-void program_check_exception(struct pt_regs *regs);
-void alignment_exception(struct pt_regs *regs);
-void StackOverflow(struct pt_regs *regs);
-void stack_overflow_exception(struct pt_regs *regs);
-void kernel_fp_unavailable_exception(struct pt_regs *regs);
-void altivec_unavailable_exception(struct pt_regs *regs);
-void vsx_unavailable_exception(struct pt_regs *regs);
-void fp_unavailable_tm(struct pt_regs *regs);
-void altivec_unavailable_tm(struct pt_regs *regs);
-void vsx_unavailable_tm(struct pt_regs *regs);
-void facility_unavailable_exception(struct pt_regs *regs);
-void TAUException(struct pt_regs *regs);
-void altivec_assist_exception(struct pt_regs *regs);
-void unrecoverable_exception(struct pt_regs *regs);
-void kernel_bad_stack(struct pt_regs *regs);
-void system_reset_exception(struct pt_regs *regs);
-void machine_check_exception(struct pt_regs *regs);
-void emulation_assist_interrupt(struct pt_regs *regs);
-long do_slb_fault(struct pt_regs *regs);
-void do_bad_slb_fault(struct pt_regs *regs);
-
 /* signals, syscalls and interrupts */
 long sys_swapcontext(struct ucontext __user *old_ctx,
struct ucontext __user *new_ctx,
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index b9968e297da2..066b1d34c7bc 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -454,7 +454,6 @@ static inline unsigned long hpt_hash(unsigned long vpn,
 #define HPTE_NOHPTE_UPDATE 0x2
 #define HPTE_USE_KERNEL_KEY0x4
 
-long do_hash_fault(struct pt_regs *regs);
 extern int __hash_page_4K(unsigned long ea, unsigned long access,
  unsigned long vsid, pte_t *ptep, unsigned long trap,
  unsigned long flags, int ssize, int subpage_prot);
diff --git a/arch/powerpc/include/asm/hw_irq.h 
b/arch/powerpc/include/asm/hw_irq.h
index 0363734ff56e..614957f74cee 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -50,15 +50,6 @@
 
 #ifndef __ASSEMBLY__
 
-extern void replay_system_reset(void);
-extern void replay_soft_interrupts(void);
-
-extern void timer_interrupt(struct pt_regs *);
-extern void timer_broadcast_interrupt(void);
-extern void performance_monitor_exception(struct pt_regs *regs);
-extern void WatchdogException(struct pt_regs *regs);
-extern void unknown_exception(struct pt_regs *regs);
-
 #ifdef CONFIG_PPC64
 #include 
 
diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
new file mode 100644
index ..60363e5eeffa
--- /dev/null
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */

[PATCH v5 05/21] powerpc/perf: move perf irq/nmi handling details into traps.c

2021-01-12 Thread Nicholas Piggin
This is required in order to allow more significant differences between
NMI type interrupt handlers and regular asynchronous handlers.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/traps.c  | 31 +++-
 arch/powerpc/perf/core-book3s.c  | 35 ++--
 arch/powerpc/perf/core-fsl-emb.c | 25 ---
 3 files changed, 32 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index f3f6af3141ee..9b5298c016c7 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1890,11 +1890,40 @@ void vsx_unavailable_tm(struct pt_regs *regs)
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
-void performance_monitor_exception(struct pt_regs *regs)
+static void performance_monitor_exception_nmi(struct pt_regs *regs)
+{
+   nmi_enter();
+
+   __this_cpu_inc(irq_stat.pmu_irqs);
+
+   perf_irq(regs);
+
+   nmi_exit();
+}
+
+static void performance_monitor_exception_async(struct pt_regs *regs)
 {
+   irq_enter();
+
__this_cpu_inc(irq_stat.pmu_irqs);
 
perf_irq(regs);
+
+   irq_exit();
+}
+
+void performance_monitor_exception(struct pt_regs *regs)
+{
+   /*
+* On 64-bit, if perf interrupts hit in a local_irq_disable
+* (soft-masked) region, we consider them as NMIs. This is required to
+* prevent hash faults on user addresses when reading callchains (and
+* looks better from an irq tracing perspective).
+*/
+   if (IS_ENABLED(CONFIG_PPC64) && unlikely(arch_irq_disabled_regs(regs)))
+   performance_monitor_exception_nmi(regs);
+   else
+   performance_monitor_exception_async(regs);
 }
 
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 28206b1fe172..9fd06010e8b6 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -110,10 +110,6 @@ static inline void perf_read_regs(struct pt_regs *regs)
 {
regs->result = 0;
 }
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
-   return 0;
-}
 
 static inline int siar_valid(struct pt_regs *regs)
 {
@@ -353,15 +349,6 @@ static inline void perf_read_regs(struct pt_regs *regs)
regs->result = use_siar;
 }
 
-/*
- * If interrupts were soft-disabled when a PMU interrupt occurs, treat
- * it as an NMI.
- */
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
-   return (regs->softe & IRQS_DISABLED);
-}
-
 /*
  * On processors like P7+ that have the SIAR-Valid bit, marked instructions
  * must be sampled only if the SIAR-valid bit is set.
@@ -2279,7 +2266,6 @@ static void __perf_event_interrupt(struct pt_regs *regs)
struct perf_event *event;
unsigned long val[8];
int found, active;
-   int nmi;
 
if (cpuhw->n_limited)
freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
@@ -2287,18 +2273,6 @@ static void __perf_event_interrupt(struct pt_regs *regs)
 
perf_read_regs(regs);
 
-   /*
-* If perf interrupts hit in a local_irq_disable (soft-masked) region,
-* we consider them as NMIs. This is required to prevent hash faults on
-* user addresses when reading callchains. See the NMI test in
-* do_hash_page.
-*/
-   nmi = perf_intr_is_nmi(regs);
-   if (nmi)
-   nmi_enter();
-   else
-   irq_enter();
-
/* Read all the PMCs since we'll need them a bunch of times */
for (i = 0; i < ppmu->n_counter; ++i)
val[i] = read_pmc(i + 1);
@@ -2344,8 +2318,8 @@ static void __perf_event_interrupt(struct pt_regs *regs)
}
}
}
-   if (!found && !nmi && printk_ratelimit())
-   printk(KERN_WARNING "Can't find PMC that caused IRQ\n");
+   if (unlikely(!found) && !arch_irq_disabled_regs(regs))
+   printk_ratelimited(KERN_WARNING "Can't find PMC that caused 
IRQ\n");
 
/*
 * Reset MMCR0 to its normal value.  This will set PMXE and
@@ -2355,11 +2329,6 @@ static void __perf_event_interrupt(struct pt_regs *regs)
 * we get back out of this interrupt.
 */
write_mmcr0(cpuhw, cpuhw->mmcr.mmcr0);
-
-   if (nmi)
-   nmi_exit();
-   else
-   irq_exit();
 }
 
 static void perf_event_interrupt(struct pt_regs *regs)
diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c
index e0e7e276bfd2..ee721f420a7b 100644
--- a/arch/powerpc/perf/core-fsl-emb.c
+++ b/arch/powerpc/perf/core-fsl-emb.c
@@ -31,19 +31,6 @@ static atomic_t num_events;
 /* Used to avoid races in calling reserve/release_pmc_hardware */
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
-/*
- * If interrupts were soft-disabled when a PMU interrupt occurs, treat
- * it as an NMI.
- */
-static inline int perf_intr_is_nmi(struct pt_regs *regs)
-{
-#ifdef 

[PATCH v5 04/21] powerpc: bad_page_fault, do_break get registers from regs

2021-01-12 Thread Nicholas Piggin
Similar to the previous patch this makes interrupt handler function
types more regular so they can be wrapped with the next patch.

bad_page_fault and do_break are not performance critical.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/bug.h |  4 ++--
 arch/powerpc/include/asm/debug.h   |  3 +--
 arch/powerpc/kernel/entry_32.S |  3 +--
 arch/powerpc/kernel/exceptions-64e.S   |  3 +--
 arch/powerpc/kernel/exceptions-64s.S   |  3 +--
 arch/powerpc/kernel/head_8xx.S |  5 ++---
 arch/powerpc/kernel/process.c  |  7 +++
 arch/powerpc/kernel/traps.c|  2 +-
 arch/powerpc/mm/book3s64/hash_utils.c  |  4 ++--
 arch/powerpc/mm/book3s64/slb.c |  2 +-
 arch/powerpc/mm/fault.c| 10 +-
 arch/powerpc/platforms/8xx/machine_check.c |  2 +-
 12 files changed, 21 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index f7827e993196..4220789b9a97 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -112,8 +112,8 @@
 
 struct pt_regs;
 long do_page_fault(struct pt_regs *);
-extern void bad_page_fault(struct pt_regs *, unsigned long, int);
-void __bad_page_fault(struct pt_regs *regs, unsigned long address, int sig);
+void bad_page_fault(struct pt_regs *, int);
+void __bad_page_fault(struct pt_regs *regs, int sig);
 extern void _exception(int, struct pt_regs *, int, unsigned long);
 extern void _exception_pkey(struct pt_regs *, unsigned long, int);
 extern void die(const char *, struct pt_regs *, long);
diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index ec57daf87f40..0550eceab3ca 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -52,8 +52,7 @@ extern void do_send_trap(struct pt_regs *regs, unsigned long 
address,
 unsigned long error_code, int brkpt);
 #else
 
-extern void do_break(struct pt_regs *regs, unsigned long address,
-unsigned long error_code);
+void do_break(struct pt_regs *regs);
 #endif
 
 #endif /* _ASM_POWERPC_DEBUG_H */
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index a32157ce0551..a94127eed56b 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -673,9 +673,8 @@ handle_page_fault:
lwz r0,_TRAP(r1)
clrrwi  r0,r0,1
stw r0,_TRAP(r1)
-   mr  r5,r3
+   mr  r4,r3   /* err arg for bad_page_fault */
addir3,r1,STACK_FRAME_OVERHEAD
-   lwz r4,_DAR(r1)
bl  __bad_page_fault
b   ret_from_except_full
 
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 43e71d86dcbf..52421042a020 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -1018,9 +1018,8 @@ storage_fault_common:
bne-1f
b   ret_from_except_lite
 1: bl  save_nvgprs
-   mr  r5,r3
+   mr  r4,r3
addir3,r1,STACK_FRAME_OVERHEAD
-   ld  r4,_DAR(r1)
bl  __bad_page_fault
b   ret_from_except
 
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 814cff2c649e..36dea2020ec5 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -2136,8 +2136,7 @@ EXC_COMMON_BEGIN(h_data_storage_common)
GEN_COMMON h_data_storage
addir3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
-   ld  r4,_DAR(r1)
-   li  r5,SIGSEGV
+   li  r4,SIGSEGV
bl  bad_page_fault
 MMU_FTR_SECTION_ELSE
bl  unknown_exception
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 0b2c247cfdff..7869db974185 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -364,10 +364,9 @@ do_databreakpoint:
addir3,r1,STACK_FRAME_OVERHEAD
mfspr   r4,SPRN_BAR
stw r4,_DAR(r11)
-#ifdef CONFIG_VMAP_STACK
-   lwz r5,_DSISR(r11)
-#else
+#ifndef CONFIG_VMAP_STACK
mfspr   r5,SPRN_DSISR
+   stw r5,_DSISR(r11)
 #endif
EXC_XFER_STD(0x1c00, do_break)
 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index a66f435dabbf..4f0f81e9420b 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -659,11 +659,10 @@ static void do_break_handler(struct pt_regs *regs)
}
 }
 
-void do_break (struct pt_regs *regs, unsigned long address,
-   unsigned long error_code)
+void do_break(struct pt_regs *regs)
 {
current->thread.trap_nr = TRAP_HWBKPT;
-   if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
+   if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, regs->dsisr,
11, SIGSEGV) == 

[PATCH v5 03/21] powerpc: remove arguments from fault handler functions

2021-01-12 Thread Nicholas Piggin
Make mm fault handlers all just take the pt_regs * argument and load
DAR/DSISR from that. Make those that return a value return long.

This is done to make the function signatures match other handlers, which
will help with a future patch to add wrappers. Explicit arguments could
be added for performance but that would require more wrapper macro
variants.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/asm-prototypes.h |  4 ++--
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  2 +-
 arch/powerpc/include/asm/bug.h|  2 +-
 arch/powerpc/kernel/entry_32.S|  6 +-
 arch/powerpc/kernel/exceptions-64e.S  |  2 --
 arch/powerpc/kernel/exceptions-64s.S  | 14 ++
 arch/powerpc/kernel/head_40x.S| 10 +-
 arch/powerpc/kernel/head_8xx.S|  6 +++---
 arch/powerpc/kernel/head_book3s_32.S  |  5 ++---
 arch/powerpc/kernel/head_booke.h  |  4 +---
 arch/powerpc/mm/book3s64/hash_utils.c |  8 +---
 arch/powerpc/mm/book3s64/slb.c| 11 +++
 arch/powerpc/mm/fault.c   |  7 ---
 13 files changed, 34 insertions(+), 47 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index d0b832cbbec8..22c9d08fa3a4 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -82,8 +82,8 @@ void kernel_bad_stack(struct pt_regs *regs);
 void system_reset_exception(struct pt_regs *regs);
 void machine_check_exception(struct pt_regs *regs);
 void emulation_assist_interrupt(struct pt_regs *regs);
-long do_slb_fault(struct pt_regs *regs, unsigned long ea);
-void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err);
+long do_slb_fault(struct pt_regs *regs);
+void do_bad_slb_fault(struct pt_regs *regs);
 
 /* signals, syscalls and interrupts */
 long sys_swapcontext(struct ucontext __user *old_ctx,
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 60a669379aa0..b9968e297da2 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -454,7 +454,7 @@ static inline unsigned long hpt_hash(unsigned long vpn,
 #define HPTE_NOHPTE_UPDATE 0x2
 #define HPTE_USE_KERNEL_KEY0x4
 
-int do_hash_fault(struct pt_regs *regs, unsigned long ea, unsigned long dsisr);
+long do_hash_fault(struct pt_regs *regs);
 extern int __hash_page_4K(unsigned long ea, unsigned long access,
  unsigned long vsid, pte_t *ptep, unsigned long trap,
  unsigned long flags, int ssize, int subpage_prot);
diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index 464f8ca8a5c9..f7827e993196 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -111,7 +111,7 @@
 #ifndef __ASSEMBLY__
 
 struct pt_regs;
-extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
+long do_page_fault(struct pt_regs *);
 extern void bad_page_fault(struct pt_regs *, unsigned long, int);
 void __bad_page_fault(struct pt_regs *regs, unsigned long address, int sig);
 extern void _exception(int, struct pt_regs *, int, unsigned long);
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 238eacfda7b0..a32157ce0551 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -277,7 +277,7 @@ reenable_mmu:
 * r3 can be different from GPR3(r1) at this point, r9 and r11
 * contains the old MSR and handler address respectively,
 * r4 & r5 can contain page fault arguments that need to be passed
-* along as well. r0, r6-r8, r12, CCR, CTR, XER etc... are left
+* r0, r4-r8, r12, CCR, CTR, XER etc... are left
 * clobbered as they aren't useful past this point.
 */
 
@@ -285,15 +285,11 @@ reenable_mmu:
stw r9,8(r1)
stw r11,12(r1)
stw r3,16(r1)
-   stw r4,20(r1)
-   stw r5,24(r1)
 
/* If we are disabling interrupts (normal case), simply log it with
 * lockdep
 */
 1: bl  trace_hardirqs_off
-   lwz r5,24(r1)
-   lwz r4,20(r1)
lwz r3,16(r1)
lwz r11,12(r1)
lwz r9,8(r1)
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 74d07dc0bb48..43e71d86dcbf 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -1011,8 +1011,6 @@ storage_fault_common:
std r14,_DAR(r1)
std r15,_DSISR(r1)
addir3,r1,STACK_FRAME_OVERHEAD
-   mr  r4,r14
-   mr  r5,r15
ld  r14,PACA_EXGEN+EX_R14(r13)
ld  r15,PACA_EXGEN+EX_R15(r13)
bl  do_page_fault
diff --git a/arch/powerpc/kernel/exceptions-64s.S 

[PATCH v5 02/21] powerpc/64s: move the last of the page fault handling logic to C

2021-01-12 Thread Nicholas Piggin
The page fault handling still has some complex logic particularly around
hash table handling, in asm. Implement this in C instead.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |   1 +
 arch/powerpc/kernel/exceptions-64s.S  | 131 +++---
 arch/powerpc/mm/book3s64/hash_utils.c |  77 ++
 arch/powerpc/mm/fault.c   |  46 --
 4 files changed, 107 insertions(+), 148 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 066b1d34c7bc..60a669379aa0 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -454,6 +454,7 @@ static inline unsigned long hpt_hash(unsigned long vpn,
 #define HPTE_NOHPTE_UPDATE 0x2
 #define HPTE_USE_KERNEL_KEY0x4
 
+int do_hash_fault(struct pt_regs *regs, unsigned long ea, unsigned long dsisr);
 extern int __hash_page_4K(unsigned long ea, unsigned long access,
  unsigned long vsid, pte_t *ptep, unsigned long trap,
  unsigned long flags, int ssize, int subpage_prot);
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 6e53f7638737..bcb5e81d2088 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1401,14 +1401,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
  *
  * Handling:
  * - Hash MMU
- *   Go to do_hash_page first to see if the HPT can be filled from an entry in
- *   the Linux page table. Hash faults can hit in kernel mode in a fairly
+ *   Go to do_hash_fault, which attempts to fill the HPT from an entry in the
+ *   Linux page table. Hash faults can hit in kernel mode in a fairly
  *   arbitrary state (e.g., interrupts disabled, locks held) when accessing
  *   "non-bolted" regions, e.g., vmalloc space. However these should always be
- *   backed by Linux page tables.
+ *   backed by Linux page table entries.
  *
- *   If none is found, do a Linux page fault. Linux page faults can happen in
- *   kernel mode due to user copy operations of course.
+ *   If no entry is found the Linux page fault handler is invoked (by
+ *   do_hash_fault). Linux page faults can happen in kernel mode due to user
+ *   copy operations of course.
  *
  *   KVM: The KVM HDSI handler may perform a load with MSR[DR]=1 in guest
  *   MMU context, which may cause a DSI in the host, which must go to the
@@ -1439,13 +1440,17 @@ EXC_COMMON_BEGIN(data_access_common)
GEN_COMMON data_access
ld  r4,_DAR(r1)
ld  r5,_DSISR(r1)
+   addir3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
-   ld  r6,_MSR(r1)
-   li  r3,0x300
-   b   do_hash_page/* Try to handle as hpte fault */
+   bl  do_hash_fault
 MMU_FTR_SECTION_ELSE
-   b   handle_page_fault
+   bl  do_page_fault
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
+cmpdi  r3,0
+   beq+interrupt_return
+   /* We need to restore NVGPRS */
+   REST_NVGPRS(r1)
+   b   interrupt_return
 
GEN_KVM data_access
 
@@ -1540,13 +1545,17 @@ EXC_COMMON_BEGIN(instruction_access_common)
GEN_COMMON instruction_access
ld  r4,_DAR(r1)
ld  r5,_DSISR(r1)
+   addir3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
-   ld  r6,_MSR(r1)
-   li  r3,0x400
-   b   do_hash_page/* Try to handle as hpte fault */
+   bl  do_hash_fault
 MMU_FTR_SECTION_ELSE
-   b   handle_page_fault
+   bl  do_page_fault
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
+cmpdi  r3,0
+   beq+interrupt_return
+   /* We need to restore NVGPRS */
+   REST_NVGPRS(r1)
+   b   interrupt_return
 
GEN_KVM instruction_access
 
@@ -3221,99 +3230,3 @@ disable_machine_check:
RFI_TO_KERNEL
 1: mtlrr0
blr
-
-/*
- * Hash table stuff
- */
-   .balign IFETCH_ALIGN_BYTES
-do_hash_page:
-#ifdef CONFIG_PPC_BOOK3S_64
-   lis r0,(DSISR_BAD_FAULT_64S | DSISR_DABRMATCH | DSISR_KEYFAULT)@h
-   ori r0,r0,DSISR_BAD_FAULT_64S@l
-   and.r0,r5,r0/* weird error? */
-   bne-handle_page_fault   /* if not, try to insert a HPTE */
-
-   /*
-* If we are in an "NMI" (e.g., an interrupt when soft-disabled), then
-* don't call hash_page, just fail the fault. This is required to
-* prevent re-entrancy problems in the hash code, namely perf
-* interrupts hitting while something holds H_PAGE_BUSY, and taking a
-* hash fault. See the comment in hash_preload().
-*/
-   ld  r11, PACA_THREAD_INFO(r13)
-   lwz r0,TI_PREEMPT(r11)
-   andis.  r0,r0,NMI_MASK@h
-   bne 77f
-
-   /*
-* r3 contains the trap number
-* r4 contains the faulting address
-

[PATCH v5 01/21] powerpc/32s: Do DABR match out of handle_page_fault()

2021-01-12 Thread Nicholas Piggin
From: Christophe Leroy 

handle_page_fault() has some code dedicated to book3s/32 to
call do_break() when the DSI is a DABR match.

On other platforms, do_break() is handled separately.

Do the same for book3s/32, do it earlier in the process of DSI.

This change also avoids doing the test on ISI.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/entry_32.S   | 15 ---
 arch/powerpc/kernel/head_book3s_32.S |  3 +++
 2 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 1c9b0ccc2172..238eacfda7b0 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -670,10 +670,6 @@ ppc_swapcontext:
.globl  handle_page_fault
 handle_page_fault:
addir3,r1,STACK_FRAME_OVERHEAD
-#ifdef CONFIG_PPC_BOOK3S_32
-   andis.  r0,r5,DSISR_DABRMATCH@h
-   bne-handle_dabr_fault
-#endif
bl  do_page_fault
cmpwi   r3,0
beq+ret_from_except
@@ -687,17 +683,6 @@ handle_page_fault:
bl  __bad_page_fault
b   ret_from_except_full
 
-#ifdef CONFIG_PPC_BOOK3S_32
-   /* We have a data breakpoint exception - handle it */
-handle_dabr_fault:
-   SAVE_NVGPRS(r1)
-   lwz r0,_TRAP(r1)
-   clrrwi  r0,r0,1
-   stw r0,_TRAP(r1)
-   bl  do_break
-   b   ret_from_except_full
-#endif
-
 /*
  * This routine switches between two different tasks.  The process
  * state of one is saved on its kernel stack.  Then the state
diff --git a/arch/powerpc/kernel/head_book3s_32.S 
b/arch/powerpc/kernel/head_book3s_32.S
index 349bf3f0c3af..fc9a12768a14 100644
--- a/arch/powerpc/kernel/head_book3s_32.S
+++ b/arch/powerpc/kernel/head_book3s_32.S
@@ -680,7 +680,10 @@ handle_page_fault_tramp_1:
lwz r5, _DSISR(r11)
/* fall through */
 handle_page_fault_tramp_2:
+   andis.  r0, r5, DSISR_DABRMATCH@h
+   bne-1f
EXC_XFER_LITE(0x300, handle_page_fault)
+1: EXC_XFER_STD(0x300, do_break)
 
 #ifdef CONFIG_VMAP_STACK
 .macro save_regs_threadthread
-- 
2.23.0



[PATCH v5 00/21] powerpc: interrupt wrappers

2021-01-12 Thread Nicholas Piggin
This adds interrupt handler wrapper functions, similar to the
generic / x86 code, and moves several common operations into them
from either asm or open coded in the individual handlers.

Since v1:
- Fixed a couple of compile issues
- Fixed perf weirdness (sometimes NMI, sometimes not)
- Also move irq_enter/exit into wrappers

Since v2:
- Rebased upstream
- Took code in patch 3 from Christophe
- Fixed some compile errors from 0day

Since v3:
- Rebased
- Split Christophe's 32s DABR patch into its own patch
- Fixed missing asm from 32s on patch 3 noticed by Christophe.
- Moved changes around, split out one more patch (patch 9) to make
  changes more logical and atomic.
- Add comments explaining _RAW handlers (SLB, HPTE) interrupts better

Since v4:
- Rebased (on top of scv fallback flush fix)
- Rearranged a few changes into different patches from Christophe,
  e.g., the ___do_page_fault change from patch 2 to 10. I didn't
  do everything (e.g., splitting to update __hash_page to drop the
  msr argument before the bulk of patch 2 seemed like churn without
  much improvement), and also other things like removing the new
  ___do_page_fault variant if we can change hash fault context tracking
  I didn't get time to completely investigate and implement. I think
  this shouldn't be a showstopper though we can make more improvements
  as we go.

Thanks,
Nick

Christophe Leroy (1):
  powerpc/32s: Do DABR match out of handle_page_fault()

Nicholas Piggin (20):
  powerpc/64s: move the last of the page fault handling logic to C
  powerpc: remove arguments from fault handler functions
  powerpc: bad_page_fault, do_break get registers from regs
  powerpc/perf: move perf irq/nmi handling details into traps.c
  powerpc: interrupt handler wrapper functions
  powerpc: add interrupt wrapper entry / exit stub functions
  powerpc: add interrupt_cond_local_irq_enable helper
  powerpc/64: context tracking remove _TIF_NOHZ
  powerpc/64s/hash: improve context tracking of hash faults
  powerpc/64: context tracking move to interrupt wrappers
  powerpc/64: add context tracking to asynchronous interrupts
  powerpc: handle irq_enter/irq_exit in interrupt handler wrappers
  powerpc/64s: move context tracking exit to interrupt exit path
  powerpc/64s: reconcile interrupts in C
  powerpc/64: move account_stolen_time into its own function
  powerpc/64: entry cpu time accounting in C
  powerpc: move NMI entry/exit code into wrapper
  powerpc/64s: move NMI soft-mask handling to C
  powerpc/64s: runlatch interrupt handling in C
  powerpc/64s: power4 nap fixup in C

 arch/powerpc/Kconfig   |   1 -
 arch/powerpc/include/asm/asm-prototypes.h  |  29 --
 arch/powerpc/include/asm/bug.h |   7 +-
 arch/powerpc/include/asm/cputime.h |  15 +
 arch/powerpc/include/asm/debug.h   |   3 +-
 arch/powerpc/include/asm/hw_irq.h  |   9 -
 arch/powerpc/include/asm/interrupt.h   | 418 +
 arch/powerpc/include/asm/ppc_asm.h |  24 --
 arch/powerpc/include/asm/processor.h   |   1 +
 arch/powerpc/include/asm/thread_info.h |  10 +-
 arch/powerpc/include/asm/time.h|   2 +
 arch/powerpc/kernel/dbell.c|  15 +-
 arch/powerpc/kernel/entry_32.S |  24 +-
 arch/powerpc/kernel/exceptions-64e.S   |   6 +-
 arch/powerpc/kernel/exceptions-64s.S   | 307 ++-
 arch/powerpc/kernel/head_40x.S |  10 +-
 arch/powerpc/kernel/head_8xx.S |  11 +-
 arch/powerpc/kernel/head_book3s_32.S   |  14 +-
 arch/powerpc/kernel/head_booke.h   |   4 +-
 arch/powerpc/kernel/idle_book3s.S  |   4 +
 arch/powerpc/kernel/irq.c  |   7 +-
 arch/powerpc/kernel/mce.c  |  16 +-
 arch/powerpc/kernel/process.c  |   7 +-
 arch/powerpc/kernel/ptrace/ptrace.c|   4 -
 arch/powerpc/kernel/signal.c   |   4 -
 arch/powerpc/kernel/syscall_64.c   |  30 +-
 arch/powerpc/kernel/tau_6xx.c  |   5 +-
 arch/powerpc/kernel/time.c |   7 +-
 arch/powerpc/kernel/traps.c| 231 ++--
 arch/powerpc/kernel/watchdog.c |  15 +-
 arch/powerpc/kvm/book3s_hv.c   |   1 +
 arch/powerpc/kvm/book3s_hv_builtin.c   |   1 +
 arch/powerpc/kvm/booke.c   |   1 +
 arch/powerpc/mm/book3s64/hash_utils.c  |  92 +++--
 arch/powerpc/mm/book3s64/slb.c |  36 +-
 arch/powerpc/mm/fault.c|  92 +++--
 arch/powerpc/perf/core-book3s.c|  35 +-
 arch/powerpc/perf/core-fsl-emb.c   |  25 --
 arch/powerpc/platforms/8xx/machine_check.c |   2 +-
 arch/powerpc/platforms/powernv/idle.c  |   1 +
 40 files changed, 813 insertions(+), 713 deletions(-)
 create mode 100644 arch/powerpc/include/asm/interrupt.h

-- 
2.23.0



[PATCH for 4.9/4.14] powerpc: Fix incorrect stw{, ux, u, x} instructions in __set_pte_at

2021-01-12 Thread Christophe Leroy
From: Mathieu Desnoyers 

Backport for 4.9 and 4.14

(cherry picked from commit d85be8a49e733dcd23674aa6202870d54bf5600d)

The placeholder for instruction selection should use the second
argument's operand, which is %1, not %0. This could generate incorrect
assembly code if the memory addressing of operand %0 is a different
form from that of operand %1.

Also remove the %Un placeholder because having %Un placeholders
for two operands which are based on the same local var (ptep) doesn't
make much sense. By the way, it doesn't change the current behaviour
because "<>" constraint is missing for the associated "=m".

[chleroy: revised commit log iaw segher's comments and removed %U0]

Fixes: 9bf2b5cdc5fe ("powerpc: Fixes for CONFIG_PTE_64BIT for SMP support")
Cc:  # v2.6.28+
Signed-off-by: Mathieu Desnoyers 
Signed-off-by: Christophe Leroy 
Acked-by: Segher Boessenkool 
Signed-off-by: Michael Ellerman 
Link: 
https://lore.kernel.org/r/96354bd77977a6a933fe9020da57629007fdb920.1603358942.git.christophe.le...@csgroup.eu
---
 arch/powerpc/include/asm/book3s/32/pgtable.h | 4 ++--
 arch/powerpc/include/asm/nohash/pgtable.h| 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 016579ef16d3..ec98abca0df0 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -414,9 +414,9 @@ static inline void __set_pte_at(struct mm_struct *mm, 
unsigned long addr,
if (pte_val(*ptep) & _PAGE_HASHPTE)
flush_hash_entry(mm, ptep, addr);
__asm__ __volatile__("\
-   stw%U0%X0 %2,%0\n\
+   stw%X0 %2,%0\n\
eieio\n\
-   stw%U0%X0 %L2,%1"
+   stw%X1 %L2,%1"
: "=m" (*ptep), "=m" (*((unsigned char *)ptep+4))
: "r" (pte) : "memory");
 
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
b/arch/powerpc/include/asm/nohash/pgtable.h
index 5c68f4a59f75..e9171b8242e4 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -157,9 +157,9 @@ static inline void __set_pte_at(struct mm_struct *mm, 
unsigned long addr,
flush_hash_entry(mm, ptep, addr);
 #endif
__asm__ __volatile__("\
-   stw%U0%X0 %2,%0\n\
+   stw%X0 %2,%0\n\
eieio\n\
-   stw%U0%X0 %L2,%1"
+   stw%X1 %L2,%1"
: "=m" (*ptep), "=m" (*((unsigned char *)ptep+4))
: "r" (pte) : "memory");
 
-- 
2.25.0



[PATCH for 4.4] powerpc: Fix incorrect stw{, ux, u, x} instructions in __set_pte_at

2021-01-12 Thread Christophe Leroy
From: Mathieu Desnoyers 

Backport for 4.4

(cherry picked from commit d85be8a49e733dcd23674aa6202870d54bf5600d)

The placeholder for instruction selection should use the second
argument's operand, which is %1, not %0. This could generate incorrect
assembly code if the memory addressing of operand %0 is a different
form from that of operand %1.

Also remove the %Un placeholder because having %Un placeholders
for two operands which are based on the same local var (ptep) doesn't
make much sense. By the way, it doesn't change the current behaviour
because "<>" constraint is missing for the associated "=m".

[chleroy: revised commit log iaw segher's comments and removed %U0]

Fixes: 9bf2b5cdc5fe ("powerpc: Fixes for CONFIG_PTE_64BIT for SMP support")
Cc:  # v2.6.28+
Signed-off-by: Mathieu Desnoyers 
Signed-off-by: Christophe Leroy 
Acked-by: Segher Boessenkool 
Signed-off-by: Michael Ellerman 
Link: 
https://lore.kernel.org/r/96354bd77977a6a933fe9020da57629007fdb920.1603358942.git.christophe.le...@csgroup.eu
---
 arch/powerpc/include/asm/pgtable.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index b64b4212b71f..408f9e1fa24a 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -149,9 +149,9 @@ static inline void __set_pte_at(struct mm_struct *mm, 
unsigned long addr,
flush_hash_entry(mm, ptep, addr);
 #endif
__asm__ __volatile__("\
-   stw%U0%X0 %2,%0\n\
+   stw%X0 %2,%0\n\
eieio\n\
-   stw%U0%X0 %L2,%1"
+   stw%X1 %L2,%1"
: "=m" (*ptep), "=m" (*((unsigned char *)ptep+4))
: "r" (pte) : "memory");
 
-- 
2.25.0



Re: [PATCH] tools/perf: Fix powerpc gap between kernel end and module start

2021-01-12 Thread Athira Rajeev



> On 12-Jan-2021, at 3:08 PM, Jiri Olsa  wrote:
> 
> On Mon, Dec 28, 2020 at 09:14:14PM -0500, Athira Rajeev wrote:
> 
> SNIP
> 
>> c2799370 b backtrace_flag
>> c2799378 B radix_tree_node_cachep
>> c2799380 B __bss_stop
>> c27a B _end
>> c0080389 t icmp_checkentry  [ip_tables]
>> c00803890038 t ipt_alloc_initial_table  [ip_tables]
>> c00803890468 T ipt_do_table [ip_tables]
>> c00803890de8 T ipt_unregister_table_pre_exit[ip_tables]
>> ...
>> 
>> Perf calls function symbols__fixup_end() which sets the end of symbol
>> to 0xc0080389, which is the next address and this is the start
>> address of first module (icmp_checkentry in above) which will make the
>> huge symbol size of 0x8010f.
>> 
>> After symbols__fixup_end:
>> symbols__fixup_end: sym->name: _end, sym->start: 0xc27a,
>> sym->end: 0xc0080389
>> 
>> On powerpc, kernel text segment is located at 0xc000
>> whereas the modules are located at very high memory addresses,
>> 0xc0080xxx. Since the gap between end of kernel text segment
>> and beginning of first module's address is high, histogram allocation
>> using calloc fails.
>> 
>> Fix this by detecting the kernel's last symbol and limiting
>> the range of last kernel symbol to pagesize.
>> 
>> Signed-off-by: Athira Rajeev
> 
> I can't test, but since the same approach works for arm and s390,
> this also looks ok
> 
> Acked-by: Jiri Olsa 
> 
> thanks,
> jirka

Thanks Jiri for reviewing the patch,

Athira
> 
>> ---
>> tools/perf/arch/powerpc/util/Build |  1 +
>> tools/perf/arch/powerpc/util/machine.c | 24 
>> 2 files changed, 25 insertions(+)
>> create mode 100644 tools/perf/arch/powerpc/util/machine.c
>> 
>> diff --git a/tools/perf/arch/powerpc/util/Build 
>> b/tools/perf/arch/powerpc/util/Build
>> index e86e210bf514..b7945e5a543b 100644
>> --- a/tools/perf/arch/powerpc/util/Build
>> +++ b/tools/perf/arch/powerpc/util/Build
>> @@ -1,4 +1,5 @@
>> perf-y += header.o
>> +perf-y += machine.o
>> perf-y += kvm-stat.o
>> perf-y += perf_regs.o
>> perf-y += mem-events.o
>> diff --git a/tools/perf/arch/powerpc/util/machine.c 
>> b/tools/perf/arch/powerpc/util/machine.c
>> new file mode 100644
>> index ..c30e5cc88c16
>> --- /dev/null
>> +++ b/tools/perf/arch/powerpc/util/machine.c
>> @@ -0,0 +1,24 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +#include 
>> +#include 
>> +#include  // page_size
>> +#include "debug.h"
>> +#include "symbol.h"
>> +
>> +/* On powerpc kernel text segment start at memory addresses, 
>> 0xc000
>> + * whereas the modules are located at very high memory addresses,
>> + * for example 0xc0080xxx. The gap between end of kernel text 
>> segment
>> + * and beginning of first module's text segment is very high.
>> + * Therefore do not fill this gap and do not assign it to the kernel dso 
>> map.
>> + */
>> +
>> +void arch__symbols__fixup_end(struct symbol *p, struct symbol *c)
>> +{
>> +if (strchr(p->name, '[') == NULL && strchr(c->name, '['))
>> +/* Limit the range of last kernel symbol */
>> +p->end += page_size;
>> +else
>> +p->end = c->start;
>> +pr_debug4("%s sym:%s end:%#lx\n", __func__, p->name, p->end);
>> +}
>> -- 
>> 1.8.3.1



[PATCH for 5.10] powerpc/32s: Fix RTAS machine check with VMAP stack

2021-01-12 Thread Christophe Leroy
This is backport for 5.10

(cherry picked from commit 98bf2d3f4970179c702ef64db658e0553bc6ef3a)

When we have VMAP stack, exception prolog 1 sets r1, not r11.

When it is not an RTAS machine check, don't trash r1 because it is
needed by prolog 1.

Fixes: da7bb43ab9da ("powerpc/32: Fix vmap stack - Properly set r1 before 
activating MMU")
Cc: sta...@vger.kernel.org # v5.10+
Signed-off-by: Christophe Leroy 
[mpe: Squash in fixup for RTAS machine check from Christophe]
Signed-off-by: Michael Ellerman 
Link: 
https://lore.kernel.org/r/bc77d61d1c18940e456a2dee464f1e2eda65a3f0.1608621048.git.christophe.le...@csgroup.eu
---
 arch/powerpc/kernel/head_book3s_32.S | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/kernel/head_book3s_32.S 
b/arch/powerpc/kernel/head_book3s_32.S
index a0dda2a1f2df..d66da35f2e8d 100644
--- a/arch/powerpc/kernel/head_book3s_32.S
+++ b/arch/powerpc/kernel/head_book3s_32.S
@@ -262,10 +262,19 @@ __secondary_hold_acknowledge:
 MachineCheck:
EXCEPTION_PROLOG_0
 #ifdef CONFIG_PPC_CHRP
+#ifdef CONFIG_VMAP_STACK
+   mr  r11, r1
+   mfspr   r1, SPRN_SPRG_THREAD
+   lwz r1, RTAS_SP(r1)
+   cmpwi   cr1, r1, 0
+   bne cr1, 7f
+   mr  r1, r11
+#else
mfspr   r11, SPRN_SPRG_THREAD
lwz r11, RTAS_SP(r11)
cmpwi   cr1, r11, 0
bne cr1, 7f
+#endif
 #endif /* CONFIG_PPC_CHRP */
EXCEPTION_PROLOG_1 for_rtas=1
 7: EXCEPTION_PROLOG_2
-- 
2.25.0



Re: [PATCH v2 0/5] ibmvfc: MQ preparatory locking work

2021-01-12 Thread Martin K. Petersen
On Wed, 6 Jan 2021 14:18:30 -0600, Tyrel Datwyler wrote:

> The ibmvfc driver in its current form relies heavily on the host_lock. This
> patchset introduces a generic queue with its own queue lock and sent/free event
> list locks. This generic queue allows the driver to decouple the primary queue
> and future subordinate queues from the host lock reducing lock contention 
> while
> also relaxing locking for submissions and completions to simply use the list lock 
> of
> the queue in question.
> 
> [...]

Applied to 5.12/scsi-queue, thanks!

[1/5] ibmvfc: define generic queue structure for CRQs
  https://git.kernel.org/mkp/scsi/c/f8968665af28
[2/5] ibmvfc: make command event pool queue specific
  https://git.kernel.org/mkp/scsi/c/e4b26f3db864
[3/5] ibmvfc: define per-queue state/list locks
  https://git.kernel.org/mkp/scsi/c/57e80e0bc108
[4/5] ibmvfc: complete commands outside the host/queue lock
  https://git.kernel.org/mkp/scsi/c/1f4a4a19508d
[5/5] ibmvfc: relax locking around ibmvfc_queuecommand
  https://git.kernel.org/mkp/scsi/c/654080d02edb

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [RFC PATCH v3 0/6] Restricted DMA

2021-01-12 Thread Florian Fainelli



On 1/12/2021 8:25 PM, Tomasz Figa wrote:
> On Wed, Jan 13, 2021 at 12:56 PM Florian Fainelli  
> wrote:
>>
>>
>>
>> On 1/12/2021 6:29 PM, Tomasz Figa wrote:
>>> Hi Florian,
>>>
>>> On Wed, Jan 13, 2021 at 3:01 AM Florian Fainelli  
>>> wrote:

 On 1/11/21 11:48 PM, Claire Chang wrote:
> On Fri, Jan 8, 2021 at 1:59 AM Florian Fainelli  
> wrote:
>>
>> On 1/7/21 9:42 AM, Claire Chang wrote:
>>
 Can you explain how ATF gets involved and to what extent it does help,
 besides enforcing a secure region from the ARM CPU's perspective? Does
 the PCIe root complex not have an IOMMU but can somehow be denied 
 access
 to a region that is marked NS=0 in the ARM CPU's MMU? If so, that is
 still some sort of basic protection that the HW enforces, right?
>>>
>>> We need the ATF support for memory MPU (memory protection unit).
>>> Restricted DMA (with reserved-memory in dts) makes sure the predefined 
>>> memory
>>> region is for PCIe DMA only, but we still need MPU to lock down PCIe 
>>> access to
>>> those specific regions.
>>
>> OK so you do have a protection unit of some sort to enforce which region
>> in DRAM the PCIE bridge is allowed to access, that makes sense,
>> otherwise the restricted DMA region would only be a hint but nothing you
>> can really enforce. This is almost entirely analogous to our systems 
>> then.
>
> Here is the example of setting the MPU:
> https://github.com/ARM-software/arm-trusted-firmware/blob/master/plat/mediatek/mt8183/drivers/emi_mpu/emi_mpu.c#L132
>
>>
>> There may be some value in standardizing on an ARM SMCCC call then since
>> you already support two different SoC vendors.
>>
>>>

 On Broadcom STB SoCs we have had something similar for a while however
 and while we don't have an IOMMU for the PCIe bridge, we do have a
 basic protection mechanism whereby we can configure a region in DRAM to
 be PCIe read/write and CPU read/write which then gets used as the PCIe
 inbound region for the PCIe EP. By default the PCIe bridge is not
 allowed access to DRAM so we must call into a security agent to allow
 the PCIe bridge to access the designated DRAM region.

 We have done this using a private CMA area region assigned via Device
 Tree, assigned with a and requiring the PCIe EP driver to use
 dma_alloc_from_contiguous() in order to allocate from this device
 private CMA area. The only drawback with that approach is that it
 requires knowing how much memory you need up front for buffers and DMA
 descriptors that the PCIe EP will need to process. The problem is that
 it requires driver modifications and that does not scale over the 
 number
 of PCIe EP drivers, some we absolutely do not control, but there is no
 need to bounce buffer. Your approach scales better across PCIe EP
 drivers however it does require bounce buffering which could be a
 performance hit.
>>>
>>> Only the streaming DMA (map/unmap) needs bounce buffering.
>>
>> True, and typically only on transmit since you don't really control
>> where the sk_buff are allocated from, right? On RX since you need to
>> hand buffer addresses to the WLAN chip prior to DMA, you can allocate
>> them from a pool that already falls within the restricted DMA region, 
>> right?
>>
>
> Right, but applying bounce buffering to RX will make it more secure.
> The device won't be able to modify the content after unmap. Just like what
> iommu_unmap does.

 Sure, however the goals of using bounce buffering equally applies to RX
 and TX in that this is the only layer sitting between a stack (block,
 networking, USB, etc.) and the underlying device driver that scales well
 in order to massage a dma_addr_t to be within a particular physical range.

 There is however room for improvement if the drivers are willing to
 change their buffer allocation strategy. When you receive Wi-Fi frames
 you need to allocate buffers for the Wi-Fi device to DMA into, and that
 happens ahead of the DMA transfers by the Wi-Fi device. At buffer
 allocation time you could very well allocate these frames from the
 restricted DMA region without having to bounce buffer them since the
 host CPU is in control over where and when to DMA into.

>>>
>>> That is, however, still a trade-off between saving that one copy and
>>> protection from the DMA tampering with the packet contents when the
>>> kernel is reading them. Notice how the copy effectively makes a
>>> snapshot of the contents, guaranteeing that the kernel has a
>>> consistent view of the packet, which is not true if the DMA could
>>> modify the buffer contents in the middle of CPU accesses.

Re: [RFC PATCH v3 0/6] Restricted DMA

2021-01-12 Thread Tomasz Figa
On Wed, Jan 13, 2021 at 12:56 PM Florian Fainelli  wrote:
>
>
>
> On 1/12/2021 6:29 PM, Tomasz Figa wrote:
> > Hi Florian,
> >
> > On Wed, Jan 13, 2021 at 3:01 AM Florian Fainelli  
> > wrote:
> >>
> >> On 1/11/21 11:48 PM, Claire Chang wrote:
> >>> On Fri, Jan 8, 2021 at 1:59 AM Florian Fainelli  
> >>> wrote:
> 
>  On 1/7/21 9:42 AM, Claire Chang wrote:
> 
> >> Can you explain how ATF gets involved and to what extent it does help,
> >> besides enforcing a secure region from the ARM CPU's perspective? Does
> >> the PCIe root complex not have an IOMMU but can somehow be denied 
> >> access
> >> to a region that is marked NS=0 in the ARM CPU's MMU? If so, that is
> >> still some sort of basic protection that the HW enforces, right?
> >
> > We need the ATF support for memory MPU (memory protection unit).
> > Restricted DMA (with reserved-memory in dts) makes sure the predefined 
> > memory
> > region is for PCIe DMA only, but we still need MPU to lock down PCIe 
> > access to
> > those specific regions.
> 
>  OK so you do have a protection unit of some sort to enforce which region
>  in DRAM the PCIE bridge is allowed to access, that makes sense,
>  otherwise the restricted DMA region would only be a hint but nothing you
>  can really enforce. This is almost entirely analogous to our systems 
>  then.
> >>>
> >>> Here is the example of setting the MPU:
> >>> https://github.com/ARM-software/arm-trusted-firmware/blob/master/plat/mediatek/mt8183/drivers/emi_mpu/emi_mpu.c#L132
> >>>
> 
>  There may be some value in standardizing on an ARM SMCCC call then since
>  you already support two different SoC vendors.
> 
> >
> >>
> >> On Broadcom STB SoCs we have had something similar for a while however
> >> and while we don't have an IOMMU for the PCIe bridge, we do have a
> >> basic protection mechanism whereby we can configure a region in DRAM to
> >> be PCIe read/write and CPU read/write which then gets used as the PCIe
> >> inbound region for the PCIe EP. By default the PCIe bridge is not
> >> allowed access to DRAM so we must call into a security agent to allow
> >> the PCIe bridge to access the designated DRAM region.
> >>
> >> We have done this using a private CMA area region assigned via Device
> >> Tree, assigned with a and requiring the PCIe EP driver to use
> >> dma_alloc_from_contiguous() in order to allocate from this device
> >> private CMA area. The only drawback with that approach is that it
> >> requires knowing how much memory you need up front for buffers and DMA
> >> descriptors that the PCIe EP will need to process. The problem is that
> >> it requires driver modifications and that does not scale over the 
> >> number
> >> of PCIe EP drivers, some we absolutely do not control, but there is no
> >> need to bounce buffer. Your approach scales better across PCIe EP
> >> drivers however it does require bounce buffering which could be a
> >> performance hit.
> >
> > Only the streaming DMA (map/unmap) needs bounce buffering.
> 
>  True, and typically only on transmit since you don't really control
>  where the sk_buff are allocated from, right? On RX since you need to
>  hand buffer addresses to the WLAN chip prior to DMA, you can allocate
>  them from a pool that already falls within the restricted DMA region, 
>  right?
> 
> >>>
> >>> Right, but applying bounce buffering to RX will make it more secure.
> >>> The device won't be able to modify the content after unmap. Just like what
> >>> iommu_unmap does.
> >>
> >> Sure, however the goals of using bounce buffering equally applies to RX
> >> and TX in that this is the only layer sitting between a stack (block,
> >> networking, USB, etc.) and the underlying device driver that scales well
> >> in order to massage a dma_addr_t to be within a particular physical range.
> >>
> >> There is however room for improvement if the drivers are willing to
> >> change their buffer allocation strategy. When you receive Wi-Fi frames
> >> you need to allocate buffers for the Wi-Fi device to DMA into, and that
> >> happens ahead of the DMA transfers by the Wi-Fi device. At buffer
> >> allocation time you could very well allocate these frames from the
> >> restricted DMA region without having to bounce buffer them since the
> >> host CPU is in control over where and when to DMA into.
> >>
> >
> > That is, however, still a trade-off between saving that one copy and
> > protection from the DMA tampering with the packet contents when the
> > kernel is reading them. Notice how the copy effectively makes a
> > snapshot of the contents, guaranteeing that the kernel has a
> > consistent view of the packet, which is not true if the DMA could
> > modify the buffer contents in the middle of CPU accesses.
>
> I would say that the window just became so much 

Re: [RFC PATCH v3 0/6] Restricted DMA

2021-01-12 Thread Florian Fainelli



On 1/12/2021 6:29 PM, Tomasz Figa wrote:
> Hi Florian,
> 
> On Wed, Jan 13, 2021 at 3:01 AM Florian Fainelli  wrote:
>>
>> On 1/11/21 11:48 PM, Claire Chang wrote:
>>> On Fri, Jan 8, 2021 at 1:59 AM Florian Fainelli  
>>> wrote:

 On 1/7/21 9:42 AM, Claire Chang wrote:

>> Can you explain how ATF gets involved and to what extent it does help,
>> besides enforcing a secure region from the ARM CPU's perspective? Does
>> the PCIe root complex not have an IOMMU but can somehow be denied access
>> to a region that is marked NS=0 in the ARM CPU's MMU? If so, that is
>> still some sort of basic protection that the HW enforces, right?
>
> We need the ATF support for memory MPU (memory protection unit).
> Restricted DMA (with reserved-memory in dts) makes sure the predefined 
> memory
> region is for PCIe DMA only, but we still need MPU to lock down PCIe 
> access to
> those specific regions.

 OK so you do have a protection unit of some sort to enforce which region
 in DRAM the PCIE bridge is allowed to access, that makes sense,
 otherwise the restricted DMA region would only be a hint but nothing you
 can really enforce. This is almost entirely analogous to our systems then.
>>>
>>> Here is the example of setting the MPU:
>>> https://github.com/ARM-software/arm-trusted-firmware/blob/master/plat/mediatek/mt8183/drivers/emi_mpu/emi_mpu.c#L132
>>>

 There may be some value in standardizing on an ARM SMCCC call then since
 you already support two different SoC vendors.

>
>>
>> On Broadcom STB SoCs we have had something similar for a while however
>> and while we don't have an IOMMU for the PCIe bridge, we do have a
>> basic protection mechanism whereby we can configure a region in DRAM to
>> be PCIe read/write and CPU read/write which then gets used as the PCIe
>> inbound region for the PCIe EP. By default the PCIe bridge is not
>> allowed access to DRAM so we must call into a security agent to allow
>> the PCIe bridge to access the designated DRAM region.
>>
>> We have done this using a private CMA area region assigned via Device
>> Tree, assigned with a and requiring the PCIe EP driver to use
>> dma_alloc_from_contiguous() in order to allocate from this device
>> private CMA area. The only drawback with that approach is that it
>> requires knowing how much memory you need up front for buffers and DMA
>> descriptors that the PCIe EP will need to process. The problem is that
>> it requires driver modifications and that does not scale over the number
>> of PCIe EP drivers, some we absolutely do not control, but there is no
>> need to bounce buffer. Your approach scales better across PCIe EP
>> drivers however it does require bounce buffering which could be a
>> performance hit.
>
> Only the streaming DMA (map/unmap) needs bounce buffering.

 True, and typically only on transmit since you don't really control
 where the sk_buff are allocated from, right? On RX since you need to
 hand buffer addresses to the WLAN chip prior to DMA, you can allocate
 them from a pool that already falls within the restricted DMA region, 
 right?

>>>
>>> Right, but applying bounce buffering to RX will make it more secure.
>>> The device won't be able to modify the content after unmap. Just like what
>>> iommu_unmap does.
>>
>> Sure, however the goals of using bounce buffering equally applies to RX
>> and TX in that this is the only layer sitting between a stack (block,
>> networking, USB, etc.) and the underlying device driver that scales well
>> in order to massage a dma_addr_t to be within a particular physical range.
>>
>> There is however room for improvement if the drivers are willing to
>> change their buffer allocation strategy. When you receive Wi-Fi frames
>> you need to allocate buffers for the Wi-Fi device to DMA into, and that
>> happens ahead of the DMA transfers by the Wi-Fi device. At buffer
>> allocation time you could very well allocate these frames from the
>> restricted DMA region without having to bounce buffer them since the
>> host CPU is in control over where and when to DMA into.
>>
> 
> That is, however, still a trade-off between saving that one copy and
> protection from the DMA tampering with the packet contents when the
> kernel is reading them. Notice how the copy effectively makes a
> snapshot of the contents, guaranteeing that the kernel has a
> consistent view of the packet, which is not true if the DMA could
> modify the buffer contents in the middle of CPU accesses.

I would say that the window just became so much narrower for the PCIe
end-point to overwrite contents with the copy because it would have to
happen within the dma_unmap_{page,single} time and before the copy is
finished to the bounce buffer.
-- 
Florian


Re: [RFC PATCH v3 0/6] Restricted DMA

2021-01-12 Thread Tomasz Figa
Hi Florian,

On Wed, Jan 13, 2021 at 3:01 AM Florian Fainelli  wrote:
>
> On 1/11/21 11:48 PM, Claire Chang wrote:
> > On Fri, Jan 8, 2021 at 1:59 AM Florian Fainelli  
> > wrote:
> >>
> >> On 1/7/21 9:42 AM, Claire Chang wrote:
> >>
>  Can you explain how ATF gets involved and to what extent it does help,
>  besides enforcing a secure region from the ARM CPU's perspective? Does
>  the PCIe root complex not have an IOMMU but can somehow be denied access
>  to a region that is marked NS=0 in the ARM CPU's MMU? If so, that is
>  still some sort of basic protection that the HW enforces, right?
> >>>
> >>> We need the ATF support for memory MPU (memory protection unit).
> >>> Restricted DMA (with reserved-memory in dts) makes sure the predefined 
> >>> memory
> >>> region is for PCIe DMA only, but we still need MPU to lock down PCIe 
> >>> access to
> >>> those specific regions.
> >>
> >> OK so you do have a protection unit of some sort to enforce which region
> >> in DRAM the PCIE bridge is allowed to access, that makes sense,
> >> otherwise the restricted DMA region would only be a hint but nothing you
> >> can really enforce. This is almost entirely analogous to our systems then.
> >
> > Here is the example of setting the MPU:
> > https://github.com/ARM-software/arm-trusted-firmware/blob/master/plat/mediatek/mt8183/drivers/emi_mpu/emi_mpu.c#L132
> >
> >>
> >> There may be some value in standardizing on an ARM SMCCC call then since
> >> you already support two different SoC vendors.
> >>
> >>>
> 
>  On Broadcom STB SoCs we have had something similar for a while however
>  and while we don't have an IOMMU for the PCIe bridge, we do have a
>  basic protection mechanism whereby we can configure a region in DRAM to
>  be PCIe read/write and CPU read/write which then gets used as the PCIe
>  inbound region for the PCIe EP. By default the PCIe bridge is not
>  allowed access to DRAM so we must call into a security agent to allow
>  the PCIe bridge to access the designated DRAM region.
> 
>  We have done this using a private CMA area region assigned via Device
>  Tree, assigned with a and requiring the PCIe EP driver to use
>  dma_alloc_from_contiguous() in order to allocate from this device
>  private CMA area. The only drawback with that approach is that it
>  requires knowing how much memory you need up front for buffers and DMA
>  descriptors that the PCIe EP will need to process. The problem is that
>  it requires driver modifications and that does not scale over the number
>  of PCIe EP drivers, some we absolutely do not control, but there is no
>  need to bounce buffer. Your approach scales better across PCIe EP
>  drivers however it does require bounce buffering which could be a
>  performance hit.
> >>>
> >>> Only the streaming DMA (map/unmap) needs bounce buffering.
> >>
> >> True, and typically only on transmit since you don't really control
> >> where the sk_buff are allocated from, right? On RX since you need to
> >> hand buffer addresses to the WLAN chip prior to DMA, you can allocate
> >> them from a pool that already falls within the restricted DMA region, 
> >> right?
> >>
> >
> > Right, but applying bounce buffering to RX will make it more secure.
> > The device won't be able to modify the content after unmap. Just like what
> > iommu_unmap does.
>
> Sure, however the goals of using bounce buffering equally applies to RX
> and TX in that this is the only layer sitting between a stack (block,
> networking, USB, etc.) and the underlying device driver that scales well
> in order to massage a dma_addr_t to be within a particular physical range.
>
> There is however room for improvement if the drivers are willing to
> change their buffer allocation strategy. When you receive Wi-Fi frames
> you need to allocate buffers for the Wi-Fi device to DMA into, and that
> happens ahead of the DMA transfers by the Wi-Fi device. At buffer
> allocation time you could very well allocate these frames from the
> restricted DMA region without having to bounce buffer them since the
> host CPU is in control over where and when to DMA into.
>

That is, however, still a trade-off between saving that one copy and
protection from the DMA tampering with the packet contents when the
kernel is reading them. Notice how the copy effectively makes a
snapshot of the contents, guaranteeing that the kernel has a
consistent view of the packet, which is not true if the DMA could
modify the buffer contents in the middle of CPU accesses.

Best regards,
Tomasz

> The issue is that each network driver may implement its own buffer
> allocation strategy, some may simply call netdev_alloc_skb() which gives
> zero control over where the buffer comes from unless you play tricks
> with NUMA node allocations and somehow declare that your restricted DMA
> region is a different NUMA node. If the driver allocates pages and then
> attaches a SKB to 

Re: [RFC PATCH v3 2/6] swiotlb: Add restricted DMA pool

2021-01-12 Thread Robin Murphy

On 2021-01-07 17:57, Konrad Rzeszutek Wilk wrote:

On Fri, Jan 08, 2021 at 01:39:18AM +0800, Claire Chang wrote:

Hi Greg and Konrad,

This change is intended to be non-arch specific. Any arch that lacks DMA access
control and has devices not behind an IOMMU can make use of it. Could you share
why you think this should be arch specific?


The idea behind non-arch specific code is it to be generic. The devicetree
is specific to PowerPC, Sparc, and ARM, and not to x86 - hence it should
be in arch specific code.


Sorry, but that's an absurd argument. By the same token you'd equally 
have to claim that bits of, say, the Broadcom WiFi driver (not to 
mention dozens of others) should be split out into arch code, since not 
all platforms use the devicetree parts, nor the ACPI parts, nor the PCI 
parts...


There is nothing architecture-specific about using devicetree as a 
system description - AFAIK there *are* a handful of x86 platforms that 
use it, besides even more architectures than you've listed above. It has 
long been the policy that devicetree-related code for a particular 
subsystem should just live within that subsystem. Sometimes if there's 
enough of it it gets collected together into its own file - e.g. 
drivers/pci/of.c - otherwise it tends to just get #ifdef'ed - e.g. 
of_spi_parse_dt(), or the other DMA reserved-memory consumers that 
already exist as Florian points out.


Besides, there are far more platforms that enable CONFIG_OF than enable 
CONFIG_SWIOTLB, so by that metric the whole of the SWIOTLB code itself 
is even less "generic" than any DT parsing :P


Robin.


Re: [PATCH v4 01/21] ibmvfc: add vhost fields and defaults for MQ enablement

2021-01-12 Thread Tyrel Datwyler
On 1/12/21 2:54 PM, Brian King wrote:
> On 1/11/21 5:12 PM, Tyrel Datwyler wrote:
>> Introduce several new vhost fields for managing MQ state of the adapter
>> as well as initial defaults for MQ enablement.
>>
>> Signed-off-by: Tyrel Datwyler 
>> ---
>>  drivers/scsi/ibmvscsi/ibmvfc.c | 8 
>>  drivers/scsi/ibmvscsi/ibmvfc.h | 9 +
>>  2 files changed, 17 insertions(+)
>>
>> diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
>> index ba95438a8912..9200fe49c57e 100644
>> --- a/drivers/scsi/ibmvscsi/ibmvfc.c
>> +++ b/drivers/scsi/ibmvscsi/ibmvfc.c
>> @@ -3302,6 +3302,7 @@ static struct scsi_host_template driver_template = {
>>  .max_sectors = IBMVFC_MAX_SECTORS,
>>  .shost_attrs = ibmvfc_attrs,
>>  .track_queue_depth = 1,
>> +.host_tagset = 1,
> 
> This doesn't seem right. You are setting host_tagset, which means you want a
> shared, host wide, tag set for commands. It also means that the total
> queue depth for the host is can_queue. However, it looks like you are 
> allocating
> max_requests events for each sub crq, which means you are over allocating 
> memory.

With the shared tagset yes the queue depth for the host is can_queue, but this
also implies that the max queue depth for each hw queue is also can_queue. So,
in the worst case that all commands are queued down the same hw queue we need an
event pool with can_queue commands.

> 
> Looking at this closer, we might have bigger problems. There is a host wide
> max number of commands that the VFC host supports, which gets returned on
> NPIV Login. This value can change across a live migration event.

From what I understand, the max commands can only become less.

> 
> The ibmvfc driver, which does the same thing the lpfc driver does, modifies
> can_queue on the scsi_host *after* the tag set has been allocated. This looks
> to be a concern with ibmvfc, not sure about lpfc, as it doesn't look like
> we look at can_queue once the tag set is setup, and I'm not seeing a good way
> to dynamically change the host queue depth once the tag set is setup. 
> 
> Unless I'm missing something, our best options appear to either be to 
> implement
> our own host wide busy reference counting, which doesn't sound very good, or
> we need to add some API to block / scsi that allows us to dynamically change
> can_queue.

Changing can_queue won't do us any good with the shared tagset because each
queue still needs to be able to queue can_queue number of commands in the worst
case.

The complaint before was not using shared tags increases the tag memory
allocation because they are statically allocated for each queue. The question is
what claims more memory our event pool allocation or the tagset per queue
allocation.

If we chose to not use the shared tagset then the queue depth for each hw queue
becomes (can_queue / nr_hw_queues).

-Tyrel

> 
> Thanks,
> 
> Brian
> 
> 



Re: [RFC PATCH v3 2/6] swiotlb: Add restricted DMA pool

2021-01-12 Thread Florian Fainelli
On 1/5/21 7:41 PM, Claire Chang wrote:
> Add the initialization function to create restricted DMA pools from
> matching reserved-memory nodes in the device tree.
> 
> Signed-off-by: Claire Chang 
> ---
>  include/linux/device.h  |   4 ++
>  include/linux/swiotlb.h |   7 +-
>  kernel/dma/Kconfig  |   1 +
>  kernel/dma/swiotlb.c| 144 ++--
>  4 files changed, 131 insertions(+), 25 deletions(-)
> 
> diff --git a/include/linux/device.h b/include/linux/device.h
> index 89bb8b84173e..ca6f71ec8871 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -413,6 +413,7 @@ struct dev_links_info {
>   * @dma_pools:   Dma pools (if dma'ble device).
>   * @dma_mem: Internal for coherent mem override.
>   * @cma_area:Contiguous memory area for dma allocations
> + * @dma_io_tlb_mem: Internal for swiotlb io_tlb_mem override.
>   * @archdata:For arch-specific additions.
>   * @of_node: Associated device tree node.
>   * @fwnode:  Associated device node supplied by platform firmware.
> @@ -515,6 +516,9 @@ struct device {
>  #ifdef CONFIG_DMA_CMA
>   struct cma *cma_area;   /* contiguous memory area for dma
>  allocations */
> +#endif
> +#ifdef CONFIG_SWIOTLB
> + struct io_tlb_mem   *dma_io_tlb_mem;
>  #endif
>   /* arch specific additions */
>   struct dev_archdata archdata;
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index dd8eb57cbb8f..a1bbd775 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -76,12 +76,13 @@ extern enum swiotlb_force swiotlb_force;
>   *
>   * @start:   The start address of the swiotlb memory pool. Used to do a quick
>   *   range check to see if the memory was in fact allocated by this
> - *   API.
> + *   API. For restricted DMA pool, this is device tree adjustable.

Maybe write it as this is "firmware adjustable" such that when/if ACPI
needs something like this, the description does not need updating.

[snip]

> +static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
> + struct device *dev)
> +{
> + struct io_tlb_mem *mem = rmem->priv;
> + int ret;
> +
> + if (dev->dma_io_tlb_mem)
> + return -EBUSY;
> +
> + if (!mem) {
> + mem = kzalloc(sizeof(*mem), GFP_KERNEL);
> + if (!mem)
> + return -ENOMEM;
> +
> + if (!memremap(rmem->base, rmem->size, MEMREMAP_WB)) {

MEMREMAP_WB sounds appropriate as a default.
Documentation/devicetree/bindings/reserved-memory/ramoops.txt does
define an "unbuffered" property which in premise could be applied to the
generic reserved memory binding as well and that we may have to be
honoring here, if we were to make it more generic. Oh well, this does
not need to be addressed right now I guess.
-- 
Florian


Re: [RFC PATCH v3 2/6] swiotlb: Add restricted DMA pool

2021-01-12 Thread Florian Fainelli
On 1/7/21 1:19 PM, Konrad Rzeszutek Wilk wrote:
> On Thu, Jan 07, 2021 at 10:09:14AM -0800, Florian Fainelli wrote:
>> On 1/7/21 9:57 AM, Konrad Rzeszutek Wilk wrote:
>>> On Fri, Jan 08, 2021 at 01:39:18AM +0800, Claire Chang wrote:
 Hi Greg and Konrad,

 This change is intended to be non-arch specific. Any arch that lacks DMA 
 access
 control and has devices not behind an IOMMU can make use of it. Could you 
 share
 why you think this should be arch specific?
>>>
>>> The idea behind non-arch specific code is it to be generic. The devicetree
>>> is specific to PowerPC, Sparc, and ARM, and not to x86 - hence it should
>>> be in arch specific code.
>>
>> In premise the same code could be used with an ACPI enabled system with
>> an appropriate service to identify the restricted DMA regions and unlock
>> them.
> 
> Which this patchset is not.

ACPI is not included, but the comment about Device Tree being specific
to PowerPC, SPARC and ARM, and not x86, is not quite correct. There is an
architecture specific part to obtaining where the Device Tree lives in
memory, but the implementation itself is architecture agnostic (with
some early SPARC/OpenFirmware shenanigans), and x86 does, or rather did
support Device Tree to a very small extent with the CE4100 platform.

Would you prefer that an swiotlb_of.c file be created instead or
something along those lines to better encapsulate where the OF specific
code lives?

> 
>>
>> More than 1 architecture requiring this function (ARM and ARM64 are the
>> two I can think of needing this immediately) sort of calls for making
>> the code architecture agnostic since past 2, you need something that scales.
> 
> I believe the use-case is for ARM64 at this moment.

For the platforms that Claire uses, certainly for the ones we use, ARM
and ARM64 are in scope.
-- 
Florian


Re: [RFC PATCH v3 6/6] of: Add plumbing for restricted DMA pool

2021-01-12 Thread Florian Fainelli
On 1/5/21 7:41 PM, Claire Chang wrote:
> If a device is not behind an IOMMU, we look up the device node and set
> up the restricted DMA when the restricted-dma-pool is presented.
> 
> Signed-off-by: Claire Chang 
> ---

[snip]

> +int of_dma_set_restricted_buffer(struct device *dev)
> +{
> + struct device_node *node;
> + int count, i;
> +
> + if (!dev->of_node)
> + return 0;
> +
> + count = of_property_count_elems_of_size(dev->of_node, "memory-region",
> + sizeof(phandle));

You could have an early check for count < 0, along with an error
message, if that is deemed useful.

> + for (i = 0; i < count; i++) {
> + node = of_parse_phandle(dev->of_node, "memory-region", i);
> + if (of_device_is_compatible(node, "restricted-dma-pool"))

And you may want to add here an of_device_is_available(node). A platform
that provides the Device Tree firmware and try to support multiple
different SoCs may try to determine if an IOMMU is present, and if it
is, it could be marking the restricted-dma-pool region with a 'status =
"disabled"' property, or any variant of that scheme.

> + return of_reserved_mem_device_init_by_idx(
> + dev, dev->of_node, i);

This does not seem to be supporting more than one memory region, did not
you want something like instead:

ret = of_reserved_mem_device_init_by_idx(...);
if (ret)
return ret;

> + }
> +
> + return 0;
> +}
> diff --git a/drivers/of/device.c b/drivers/of/device.c
> index aedfaaafd3e7..e2c7409956ab 100644
> --- a/drivers/of/device.c
> +++ b/drivers/of/device.c
> @@ -182,6 +182,10 @@ int of_dma_configure_id(struct device *dev, struct 
> device_node *np,
>   arch_setup_dma_ops(dev, dma_start, size, iommu, coherent);
>  
>   dev->dma_range_map = map;
> +
> + if (!iommu)
> + return of_dma_set_restricted_buffer(dev);
> +
>   return 0;
>  }
>  EXPORT_SYMBOL_GPL(of_dma_configure_id);
> diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
> index d9e6a324de0a..28a2dfa197ba 100644
> --- a/drivers/of/of_private.h
> +++ b/drivers/of/of_private.h
> @@ -161,12 +161,17 @@ struct bus_dma_region;
>  #if defined(CONFIG_OF_ADDRESS) && defined(CONFIG_HAS_DMA)
>  int of_dma_get_range(struct device_node *np,
>   const struct bus_dma_region **map);
> +int of_dma_set_restricted_buffer(struct device *dev);
>  #else
>  static inline int of_dma_get_range(struct device_node *np,
>   const struct bus_dma_region **map)
>  {
>   return -ENODEV;
>  }
> +static inline int of_dma_get_restricted_buffer(struct device *dev)
> +{
> + return -ENODEV;
> +}
>  #endif
>  
>  #endif /* _LINUX_OF_PRIVATE_H */
> 


-- 
Florian


Re: [RFC PATCH v3 4/6] swiotlb: Add restricted DMA alloc/free support.

2021-01-12 Thread Florian Fainelli
On 1/5/21 7:41 PM, Claire Chang wrote:
> Add the functions, swiotlb_alloc and swiotlb_free to support the
> memory allocation from restricted DMA pool.
> 
> Signed-off-by: Claire Chang 
> ---

[snip]

> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 30ccbc08e229..126e9b3354d6 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -137,6 +137,11 @@ void *dma_direct_alloc(struct device *dev, size_t size,
>   void *ret;
>   int err;
>  
> +#ifdef CONFIG_SWIOTLB
> + if (unlikely(dev->dma_io_tlb_mem))
> + return swiotlb_alloc(dev, size, dma_handle, attrs);
> +#endif

While this is potentially a hot path, I am not sure if the unlikely is
warranted, maybe best left as a plain conditional.
-- 
Florian


Re: [RFC PATCH v3 3/6] swiotlb: Use restricted DMA pool if available

2021-01-12 Thread Florian Fainelli
On 1/5/21 7:41 PM, Claire Chang wrote:
> Regardless of swiotlb setting, the restricted DMA pool is preferred if
> available.
> 
> The restricted DMA pools provide a basic level of protection against
> the DMA overwriting buffer contents at unexpected times. However, to
> protect against general data leakage and system memory corruption, the
> system needs to provide a way to restrict the DMA to a predefined memory
> region.
> 
> Signed-off-by: Claire Chang 

You could probably split this patch into two:

- one that introduces the get_io_tlb_mem() getter, updates all callers
of is_swiotlb_buffer() to gain a 'struct device' argument
- another one that does add support for a non-default swiotlb pool and
adds dev->dma_io_tlb_mem

Other than that, LGTM!
-- 
Florian


Re: [PATCH v4 18/21] ibmvfc: send Cancel MAD down each hw scsi channel

2021-01-12 Thread Brian King
On 1/11/21 5:12 PM, Tyrel Datwyler wrote:
> diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
> index b0b0212344f3..24e1278acfeb 100644
> --- a/drivers/scsi/ibmvscsi/ibmvfc.c
> +++ b/drivers/scsi/ibmvscsi/ibmvfc.c
> @@ -2418,18 +2418,79 @@ static struct ibmvfc_event *ibmvfc_init_tmf(struct 
> ibmvfc_queue *queue,
>   return evt;
>  }
>  
> -/**
> - * ibmvfc_cancel_all - Cancel all outstanding commands to the device
> - * @sdev:scsi device to cancel commands
> - * @type:type of error recovery being performed
> - *
> - * This sends a cancel to the VIOS for the specified device. This does
> - * NOT send any abort to the actual device. That must be done separately.
> - *
> - * Returns:
> - *   0 on success / other on failure
> - **/
> -static int ibmvfc_cancel_all(struct scsi_device *sdev, int type)
> +static int ibmvfc_cancel_all_mq(struct scsi_device *sdev, int type)
> +{
> + struct ibmvfc_host *vhost = shost_priv(sdev->host);
> + struct ibmvfc_event *evt, *found_evt, *temp;
> + struct ibmvfc_queue *queues = vhost->scsi_scrqs.scrqs;
> + unsigned long flags;
> + int num_hwq, i;
> + LIST_HEAD(cancelq);
> + u16 status;
> +
> + ENTER;
> + spin_lock_irqsave(vhost->host->host_lock, flags);
> + num_hwq = vhost->scsi_scrqs.active_queues;
> + for (i = 0; i < num_hwq; i++) {
> + spin_lock(queues[i].q_lock);
> + spin_lock([i].l_lock);
> + found_evt = NULL;
> + list_for_each_entry(evt, [i].sent, queue_list) {
> + if (evt->cmnd && evt->cmnd->device == sdev) {
> + found_evt = evt;
> + break;
> + }
> + }
> + spin_unlock([i].l_lock);
> +

I really don't like the way the first for loop grabs all the q_locks, and then
there is a second for loop that drops all these locks... I think this code would
be cleaner if it looked like:

if (found_evt && vhost->logged_in) {
evt = ibmvfc_init_tmf([i], sdev, type);
evt->sync_iu = [i].cancel_rsp;
ibmvfc_send_event(evt, vhost, default_timeout);
list_add_tail(>cancel, );
}

spin_unlock(queues[i].q_lock);

}

> + if (!found_evt)
> + continue;
> +
> + if (vhost->logged_in) {
> + evt = ibmvfc_init_tmf([i], sdev, type);
> + evt->sync_iu = [i].cancel_rsp;
> + ibmvfc_send_event(evt, vhost, default_timeout);
> + list_add_tail(>cancel, );
> + }
> + }
> +
> + for (i = 0; i < num_hwq; i++)
> + spin_unlock(queues[i].q_lock);
> + spin_unlock_irqrestore(vhost->host->host_lock, flags);
> +
> + if (list_empty()) {
> + if (vhost->log_level > IBMVFC_DEFAULT_LOG_LEVEL)
> + sdev_printk(KERN_INFO, sdev, "No events found to 
> cancel\n");
> + return 0;
> + }
> +
> + sdev_printk(KERN_INFO, sdev, "Cancelling outstanding commands.\n");
> +
> + list_for_each_entry_safe(evt, temp, , cancel) {
> + wait_for_completion(>comp);
> + status = be16_to_cpu(evt->queue->cancel_rsp.mad_common.status);

You probably want a list_del(>cancel) here.

> + ibmvfc_free_event(evt);
> +
> + if (status != IBMVFC_MAD_SUCCESS) {
> + sdev_printk(KERN_WARNING, sdev, "Cancel failed with 
> rc=%x\n", status);
> + switch (status) {
> + case IBMVFC_MAD_DRIVER_FAILED:
> + case IBMVFC_MAD_CRQ_ERROR:
> + /* Host adapter most likely going through reset, return 
> success to
> +  * the caller will wait for the command being cancelled 
> to get returned
> +  */
> + break;
> + default:
> + break;

I think this default break needs to be a return -EIO.

> + }
> + }
> + }
> +
> + sdev_printk(KERN_INFO, sdev, "Successfully cancelled outstanding 
> commands\n");
> + return 0;
> +}
> +
> +static int ibmvfc_cancel_all_sq(struct scsi_device *sdev, int type)
>  {
>   struct ibmvfc_host *vhost = shost_priv(sdev->host);
>   struct ibmvfc_event *evt, *found_evt;
> @@ -2498,6 +2559,27 @@ static int ibmvfc_cancel_all(struct scsi_device *sdev, 
> int type)
>   return 0;
>  }
>  



-- 
Brian King
Power Linux I/O
IBM Linux Technology Center



Re: [PATCH v4 01/21] ibmvfc: add vhost fields and defaults for MQ enablement

2021-01-12 Thread Brian King
On 1/11/21 5:12 PM, Tyrel Datwyler wrote:
> Introduce several new vhost fields for managing MQ state of the adapter
> as well as initial defaults for MQ enablement.
> 
> Signed-off-by: Tyrel Datwyler 
> ---
>  drivers/scsi/ibmvscsi/ibmvfc.c | 8 
>  drivers/scsi/ibmvscsi/ibmvfc.h | 9 +
>  2 files changed, 17 insertions(+)
> 
> diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
> index ba95438a8912..9200fe49c57e 100644
> --- a/drivers/scsi/ibmvscsi/ibmvfc.c
> +++ b/drivers/scsi/ibmvscsi/ibmvfc.c
> @@ -3302,6 +3302,7 @@ static struct scsi_host_template driver_template = {
>   .max_sectors = IBMVFC_MAX_SECTORS,
>   .shost_attrs = ibmvfc_attrs,
>   .track_queue_depth = 1,
> + .host_tagset = 1,

This doesn't seem right. You are setting host_tagset, which means you want a
shared, host wide, tag set for commands. It also means that the total
queue depth for the host is can_queue. However, it looks like you are allocating
max_requests events for each sub crq, which means you are over allocating 
memory.

Looking at this closer, we might have bigger problems. There is a host wide
max number of commands that the VFC host supports, which gets returned on
NPIV Login. This value can change across a live migration event. 

The ibmvfc driver, which does the same thing the lpfc driver does, modifies
can_queue on the scsi_host *after* the tag set has been allocated. This looks
to be a concern with ibmvfc, not sure about lpfc, as it doesn't look like
we look at can_queue once the tag set is setup, and I'm not seeing a good way
to dynamically change the host queue depth once the tag set is setup. 

Unless I'm missing something, our best options appear to either be to implement
our own host wide busy reference counting, which doesn't sound very good, or
we need to add some API to block / scsi that allows us to dynamically change
can_queue.

Thanks,

Brian


-- 
Brian King
Power Linux I/O
IBM Linux Technology Center



Re: [PATCH] tty: serial: cpm_uart: Add udbg support for enabling xmon

2021-01-12 Thread Christophe Leroy




Le 23/12/2020 à 10:38, Christophe Leroy a écrit :

In order to use xmon with powerpc 8xx, the serial driver
must provide udbg_putc() and udbg_getc().

Provide them via cpm_put_poll_char() and cpm_get_poll_char().

This requires CONFIG_CONSOLE_POLL.

Signed-off-by: Christophe Leroy 


This patch has been merged in tty-next, it is visible in linux-next

Christophe


---
  drivers/tty/serial/cpm_uart/cpm_uart_core.c | 40 -
  1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/tty/serial/cpm_uart/cpm_uart_core.c 
b/drivers/tty/serial/cpm_uart/cpm_uart_core.c
index ba14ec5b9bc4..2920b9b602b3 100644
--- a/drivers/tty/serial/cpm_uart/cpm_uart_core.c
+++ b/drivers/tty/serial/cpm_uart/cpm_uart_core.c
@@ -1145,6 +1145,32 @@ static void cpm_put_poll_char(struct uart_port *port,
ch[0] = (char)c;
cpm_uart_early_write(pinfo, ch, 1, false);
  }
+
+static struct uart_port *udbg_port;
+
+static void udbg_cpm_putc(char c)
+{
+   if (c == '\n')
+   cpm_put_poll_char(udbg_port, '\r');
+   cpm_put_poll_char(udbg_port, c);
+}
+
+static int udbg_cpm_getc_poll(void)
+{
+   int c = cpm_get_poll_char(udbg_port);
+
+   return c == NO_POLL_CHAR ? -1 : c;
+}
+
+static int udbg_cpm_getc(void)
+{
+   int c;
+
+   while ((c = udbg_cpm_getc_poll()) == -1)
+   cpu_relax();
+   return c;
+}
+
  #endif /* CONFIG_CONSOLE_POLL */
  
  static const struct uart_ops cpm_uart_pops = {

@@ -1251,7 +1277,10 @@ static int cpm_uart_init_port(struct device_node *np,
pinfo->gpios[i] = NULL;
  
  #ifdef CONFIG_PPC_EARLY_DEBUG_CPM

-   udbg_putc = NULL;
+#ifdef CONFIG_CONSOLE_POLL
+   if (!udbg_port)
+#endif
+   udbg_putc = NULL;
  #endif
  
  	return cpm_uart_request_port(>port);

@@ -1370,6 +1399,15 @@ static int __init cpm_uart_console_setup(struct console 
*co, char *options)
uart_set_options(port, co, baud, parity, bits, flow);
cpm_line_cr_cmd(pinfo, CPM_CR_RESTART_TX);
  
+#ifdef CONFIG_CONSOLE_POLL

+   if (!udbg_port) {
+   udbg_port = >port;
+   udbg_putc = udbg_cpm_putc;
+   udbg_getc = udbg_cpm_getc;
+   udbg_getc_poll = udbg_cpm_getc_poll;
+   }
+#endif
+
return 0;
  }
  



Re: [RFC PATCH v3 0/6] Restricted DMA

2021-01-12 Thread Florian Fainelli
On 1/11/21 11:48 PM, Claire Chang wrote:
> On Fri, Jan 8, 2021 at 1:59 AM Florian Fainelli  wrote:
>>
>> On 1/7/21 9:42 AM, Claire Chang wrote:
>>
 Can you explain how ATF gets involved and to what extent it does help,
 besides enforcing a secure region from the ARM CPU's perpsective? Does
 the PCIe root complex not have an IOMMU but can somehow be denied access
 to a region that is marked NS=0 in the ARM CPU's MMU? If so, that is
 still some sort of basic protection that the HW enforces, right?
>>>
>>> We need the ATF support for memory MPU (memory protection unit).
>>> Restricted DMA (with reserved-memory in dts) makes sure the predefined 
>>> memory
>>> region is for PCIe DMA only, but we still need MPU to locks down PCIe 
>>> access to
>>> that specific regions.
>>
>> OK so you do have a protection unit of some sort to enforce which region
>> in DRAM the PCIE bridge is allowed to access, that makes sense,
>> otherwise the restricted DMA region would only be a hint but nothing you
>> can really enforce. This is almost entirely analogous to our systems then.
> 
> Here is the example of setting the MPU:
> https://github.com/ARM-software/arm-trusted-firmware/blob/master/plat/mediatek/mt8183/drivers/emi_mpu/emi_mpu.c#L132
> 
>>
>> There may be some value in standardizing on an ARM SMCCC call then since
>> you already support two different SoC vendors.
>>
>>>

 On Broadcom STB SoCs we have had something similar for a while however
 and while we don't have an IOMMU for the PCIe bridge, we do have a a
 basic protection mechanism whereby we can configure a region in DRAM to
 be PCIe read/write and CPU read/write which then gets used as the PCIe
 inbound region for the PCIe EP. By default the PCIe bridge is not
 allowed access to DRAM so we must call into a security agent to allow
 the PCIe bridge to access the designated DRAM region.

 We have done this using a private CMA area region assigned via Device
 Tree, assigned with a and requiring the PCIe EP driver to use
 dma_alloc_from_contiguous() in order to allocate from this device
 private CMA area. The only drawback with that approach is that it
 requires knowing how much memory you need up front for buffers and DMA
 descriptors that the PCIe EP will need to process. The problem is that
 it requires driver modifications and that does not scale over the number
 of PCIe EP drivers, some we absolutely do not control, but there is no
 need to bounce buffer. Your approach scales better across PCIe EP
 drivers however it does require bounce buffering which could be a
 performance hit.
>>>
>>> Only the streaming DMA (map/unmap) needs bounce buffering.
>>
>> True, and typically only on transmit since you don't really control
>> where the sk_buff are allocated from, right? On RX since you need to
>> hand buffer addresses to the WLAN chip prior to DMA, you can allocate
>> them from a pool that already falls within the restricted DMA region, right?
>>
> 
> Right, but applying bounce buffering to RX will make it more secure.
> The device won't be able to modify the content after unmap. Just like what
> iommu_unmap does.

Sure, however the goals of using bounce buffering equally applies to RX
and TX in that this is the only layer sitting between a stack (block,
networking, USB, etc.) and the underlying device driver that scales well
in order to massage a dma_addr_t to be within a particular physical range.

There is however room for improvement if the drivers are willing to
change their buffer allocation strategy. When you receive Wi-Fi frames
you need to allocate buffers for the Wi-Fi device to DMA into, and that
happens ahead of the DMA transfers by the Wi-Fi device. At buffer
allocation time you could very well allocate these frames from the
restricted DMA region without having to bounce buffer them since the
host CPU is in control over where and when to DMA into.

The issue is that each network driver may implement its own buffer
allocation strategy, some may simply call netdev_alloc_skb() which gives
zero control over where the buffer comes from unless you play tricks
with NUMA node allocations and somehow declare that your restricted DMA
region is a different NUMA node. If the driver allocates pages and then
attaches a SKB to that page using build_skb(), then you have much more
control over where that page comes from, and this is where using a
device private CMA are helps, because you can just do
dma_alloc_from_contiguous() and that will ensure that the pages are
coming from your specific CMA area.

Few questions on the implementation:

- is there any warning or error being printed if the restricted DMA
region is outside of a device's DMA addressable range?

- are there any helpful statistics that could be shown to indicate
that the restricted DMA region was sized too small, e.g.: that
allocation of a DMA buffer failed because we ran out of space in the
swiotlb?

[patch 4/4] powerpc/mm/highmem: Use __set_pte_at() for kmap_local()

2021-01-12 Thread Thomas Gleixner
The original PowerPC highmem mapping function used __set_pte_at() to denote
that the mapping is per CPU. This got lost with the conversion to the
generic implementation.

Override the default map function.

Fixes: 47da42b27a56 ("powerpc/mm/highmem: Switch to generic kmap atomic")
Signed-off-by: Thomas Gleixner 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/include/asm/highmem.h |2 ++
 1 file changed, 2 insertions(+)

--- a/arch/powerpc/include/asm/highmem.h
+++ b/arch/powerpc/include/asm/highmem.h
@@ -58,6 +58,8 @@ extern pte_t *pkmap_page_table;
 
 #define flush_cache_kmaps()flush_cache_all()
 
+#define arch_kmap_local_set_pte(mm, vaddr, ptep, ptev) \
+   __set_pte_at(mm, vaddr, ptep, ptev, 1)
 #define arch_kmap_local_post_map(vaddr, pteval)\
local_flush_tlb_page(NULL, vaddr)
 #define arch_kmap_local_post_unmap(vaddr)  \



[patch 0/4] mm/highmem: Fix fallout from generic kmap_local conversions

2021-01-12 Thread Thomas Gleixner
The kmap_local conversion wreckaged sparc, mips and powerpc as it missed
some of the details in the original implementation.

The following series addresses that.

Thanks,

tglx
---
 arch/mips/include/asm/highmem.h  |1 +
 arch/sparc/include/asm/highmem.h |9 +
 b/arch/powerpc/include/asm/highmem.h |2 ++
 mm/highmem.c |7 ++-
 4 files changed, 14 insertions(+), 5 deletions(-)




[patch 3/4] mips/mm/highmem: Use set_pte() for kmap_local()

2021-01-12 Thread Thomas Gleixner
set_pte_at() on MIPS invokes update_cache() which might recurse into
kmap_local(). Use set_pte() like the original MIPS highmem implementation
did.

Fixes: a4c33e83bca1 ("mips/mm/highmem: Switch to generic kmap atomic")
Reported-by: Paul Cercueil 
Reported-by: Thomas Bogendoerfer 
Signed-off-by: Thomas Gleixner 
---
 arch/mips/include/asm/highmem.h |1 +
 1 file changed, 1 insertion(+)

--- a/arch/mips/include/asm/highmem.h
+++ b/arch/mips/include/asm/highmem.h
@@ -51,6 +51,7 @@ extern void kmap_flush_tlb(unsigned long
 
 #define flush_cache_kmaps()BUG_ON(cpu_has_dc_aliases)
 
+#define arch_kmap_local_set_pte(mm, vaddr, ptep, ptev) set_pte(ptep, ptev)
 #define arch_kmap_local_post_map(vaddr, pteval)
local_flush_tlb_one(vaddr)
 #define arch_kmap_local_post_unmap(vaddr)  local_flush_tlb_one(vaddr)
 



[patch 1/4] sparc/mm/highmem: Flush cache and TLB

2021-01-12 Thread Thomas Gleixner
The recent conversion to the generic kmap_local infrastructure failed to
assign the proper pre/post map/unmap flush operations for sparc.

Sparc requires cache flush before map/unmap and tlb flush afterwards.

Fixes: 3293efa97807 ("sparc/mm/highmem: Switch to generic kmap atomic")
Reported-by: Andreas Larsson 
Signed-off-by: Thomas Gleixner 
Cc: "David S. Miller" 
Cc: sparcli...@vger.kernel.org
---
 arch/sparc/include/asm/highmem.h |9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

--- a/arch/sparc/include/asm/highmem.h
+++ b/arch/sparc/include/asm/highmem.h
@@ -50,10 +50,11 @@ extern pte_t *pkmap_page_table;
 
 #define flush_cache_kmaps()flush_cache_all()
 
-/* FIXME: Use __flush_tlb_one(vaddr) instead of flush_cache_all() -- Anton */
-#define arch_kmap_local_post_map(vaddr, pteval)flush_cache_all()
-#define arch_kmap_local_post_unmap(vaddr)  flush_cache_all()
-
+/* FIXME: Use __flush_*_one(vaddr) instead of flush_*_all() -- Anton */
+#define arch_kmap_local_pre_map(vaddr, pteval) flush_cache_all()
+#define arch_kmap_local_pre_unmap(vaddr)   flush_cache_all()
+#define arch_kmap_local_post_map(vaddr, pteval)flush_tlb_all()
+#define arch_kmap_local_post_unmap(vaddr)  flush_tlb_all()
 
 #endif /* __KERNEL__ */
 



[patch 2/4] mm/highmem: Prepare for overriding set_pte_at()

2021-01-12 Thread Thomas Gleixner
The generic kmap_local() map function uses set_pte_at(), but MIPS requires
set_pte() and PowerPC wants __set_pte_at().

Provide arch_kmap_local_set_pte() and default it to set_pte_at().

Signed-off-by: Thomas Gleixner 
---
 mm/highmem.c |7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -473,6 +473,11 @@ static inline void *arch_kmap_local_high
 }
 #endif
 
+#ifndef arch_kmap_local_set_pte
+#define arch_kmap_local_set_pte(mm, vaddr, ptep, ptev) \
+   set_pte_at(mm, vaddr, ptep, ptev)
+#endif
+
 /* Unmap a local mapping which was obtained by kmap_high_get() */
 static inline bool kmap_high_unmap_local(unsigned long vaddr)
 {
@@ -515,7 +520,7 @@ void *__kmap_local_pfn_prot(unsigned lon
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte - idx)));
pteval = pfn_pte(pfn, prot);
-   set_pte_at(_mm, vaddr, kmap_pte - idx, pteval);
+   arch_kmap_local_set_pte(_mm, vaddr, kmap_pte - idx, pteval);
arch_kmap_local_post_map(vaddr, pteval);
current->kmap_ctrl.pteval[kmap_local_idx()] = pteval;
preempt_enable();



Re: [PATCH v2] arch: consolidate pm_power_off callback

2021-01-12 Thread Will Deacon
On Sun, Dec 27, 2020 at 03:01:28PM +0100, Enrico Weigelt, metux IT consult 
wrote:
> Move the pm_power_off callback into one global place and also add an
> function for conditionally calling it (when not NULL), in order to remove
> code duplication in all individual archs.
> 
> Reported-by: kernel test robot 
> Signed-off-by: Enrico Weigelt, metux IT consult 

[...]

> diff --git a/kernel/reboot.c b/kernel/reboot.c
> index eb1b15850761..ec4cd66dd1ae 100644
> --- a/kernel/reboot.c
> +++ b/kernel/reboot.c
> @@ -53,6 +53,16 @@ int reboot_force;
>  void (*pm_power_off_prepare)(void);
>  EXPORT_SYMBOL_GPL(pm_power_off_prepare);
>  
> +void (*pm_power_off)(void);
> +EXPORT_SYMBOL_GPL(pm_power_off);
> +
> +void do_power_off(void)
> +{
> + if (pm_power_off)
> + pm_power_off();
> +}
> +EXPORT_SYMBOL_GPL(do_power_off);

Could this just live as a static inline in pm.h to avoid having to export
the extra symbol?

Will


[PATCH AUTOSEL 4.4 6/8] net: ethernet: fs_enet: Add missing MODULE_LICENSE

2021-01-12 Thread Sasha Levin
From: Michael Ellerman 

[ Upstream commit 445c6198fe7be03b7d38e66fe8d4b3187bc251d4 ]

Since commit 1d6cd3929360 ("modpost: turn missing MODULE_LICENSE()
into error") the ppc32_allmodconfig build fails with:

  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-fec.o
  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-bitbang.o

Add the missing MODULE_LICENSEs to fix the build. Both files include a
copyright header indicating they are GPL v2.

Signed-off-by: Michael Ellerman 
Reviewed-by: Andrew Lunn 
Signed-off-by: David S. Miller 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c | 1 +
 drivers/net/ethernet/freescale/fs_enet/mii-fec.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
index 68a428de0bc0e..cfae74d8e6590 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
@@ -231,3 +231,4 @@ static struct platform_driver fs_enet_bb_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_bb_mdio_driver);
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
index 2be383e6d2585..3b6232a6a56d6 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
@@ -232,3 +232,4 @@ static struct platform_driver fs_enet_fec_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_fec_mdio_driver);
+MODULE_LICENSE("GPL");
-- 
2.27.0



[PATCH AUTOSEL 4.4 2/8] ethernet: ucc_geth: fix definition and size of ucc_geth_tx_global_pram

2021-01-12 Thread Sasha Levin
From: Rasmus Villemoes 

[ Upstream commit 887078de2a23689e29d6fa1b75d7cbc544c280be ]

Table 8-53 in the QUICC Engine Reference manual shows definitions of
fields up to a size of 192 bytes, not just 128. But in table 8-111,
one does find the text

  Base Address of the Global Transmitter Parameter RAM Page. [...]
  The user needs to allocate 128 bytes for this page. The address must
  be aligned to the page size.

I've checked both rev. 7 (11/2015) and rev. 9 (05/2018) of the manual;
they both have this inconsistency (and the table numbers are the
same).

Adding a bit of debug printing, on my board the struct
ucc_geth_tx_global_pram is allocated at offset 0x880, while
the (opaque) ucc_geth_thread_data_tx gets allocated immediately
afterwards, at 0x900. So whatever the engine writes into the thread
data overlaps with the tail of the global tx pram (and devmem says
that something does get written during a simple ping).

I haven't observed any failure that could be attributed to this, but
it seems to be the kind of thing that would be extremely hard to
debug. So extend the struct definition so that we do allocate 192
bytes.

Signed-off-by: Rasmus Villemoes 
Signed-off-by: Jakub Kicinski 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/ucc_geth.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/ucc_geth.h 
b/drivers/net/ethernet/freescale/ucc_geth.h
index 75f337163ce3c..1a40a5f11081b 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.h
+++ b/drivers/net/ethernet/freescale/ucc_geth.h
@@ -580,7 +580,14 @@ struct ucc_geth_tx_global_pram {
u32 vtagtable[0x8]; /* 8 4-byte VLAN tags */
u32 tqptr;  /* a base pointer to the Tx Queues Memory
   Region */
-   u8 res2[0x80 - 0x74];
+   u8 res2[0x78 - 0x74];
+   u64 snums_en;
+   u32 l2l3baseptr;/* top byte consists of a few other bit fields 
*/
+
+   u16 mtu[8];
+   u8 res3[0xa8 - 0x94];
+   u32 wrrtablebase;   /* top byte is reserved */
+   u8 res4[0xc0 - 0xac];
 } __packed;
 
 /* structure representing Extended Filtering Global Parameters in PRAM */
-- 
2.27.0



[PATCH AUTOSEL 4.9 6/8] net: ethernet: fs_enet: Add missing MODULE_LICENSE

2021-01-12 Thread Sasha Levin
From: Michael Ellerman 

[ Upstream commit 445c6198fe7be03b7d38e66fe8d4b3187bc251d4 ]

Since commit 1d6cd3929360 ("modpost: turn missing MODULE_LICENSE()
into error") the ppc32_allmodconfig build fails with:

  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-fec.o
  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-bitbang.o

Add the missing MODULE_LICENSEs to fix the build. Both files include a
copyright header indicating they are GPL v2.

Signed-off-by: Michael Ellerman 
Reviewed-by: Andrew Lunn 
Signed-off-by: David S. Miller 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c | 1 +
 drivers/net/ethernet/freescale/fs_enet/mii-fec.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
index 1f015edcca227..3c6fc61597f7e 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
@@ -223,3 +223,4 @@ static struct platform_driver fs_enet_bb_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_bb_mdio_driver);
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
index a89267b94352a..fb2b0586469b2 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
@@ -224,3 +224,4 @@ static struct platform_driver fs_enet_fec_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_fec_mdio_driver);
+MODULE_LICENSE("GPL");
-- 
2.27.0



[PATCH AUTOSEL 4.9 2/8] ethernet: ucc_geth: fix definition and size of ucc_geth_tx_global_pram

2021-01-12 Thread Sasha Levin
From: Rasmus Villemoes 

[ Upstream commit 887078de2a23689e29d6fa1b75d7cbc544c280be ]

Table 8-53 in the QUICC Engine Reference manual shows definitions of
fields up to a size of 192 bytes, not just 128. But in table 8-111,
one does find the text

  Base Address of the Global Transmitter Parameter RAM Page. [...]
  The user needs to allocate 128 bytes for this page. The address must
  be aligned to the page size.

I've checked both rev. 7 (11/2015) and rev. 9 (05/2018) of the manual;
they both have this inconsistency (and the table numbers are the
same).

Adding a bit of debug printing, on my board the struct
ucc_geth_tx_global_pram is allocated at offset 0x880, while
the (opaque) ucc_geth_thread_data_tx gets allocated immediately
afterwards, at 0x900. So whatever the engine writes into the thread
data overlaps with the tail of the global tx pram (and devmem says
that something does get written during a simple ping).

I haven't observed any failure that could be attributed to this, but
it seems to be the kind of thing that would be extremely hard to
debug. So extend the struct definition so that we do allocate 192
bytes.

Signed-off-by: Rasmus Villemoes 
Signed-off-by: Jakub Kicinski 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/ucc_geth.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/ucc_geth.h 
b/drivers/net/ethernet/freescale/ucc_geth.h
index 5da19b440a6a8..bf25e49d4fe34 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.h
+++ b/drivers/net/ethernet/freescale/ucc_geth.h
@@ -580,7 +580,14 @@ struct ucc_geth_tx_global_pram {
u32 vtagtable[0x8]; /* 8 4-byte VLAN tags */
u32 tqptr;  /* a base pointer to the Tx Queues Memory
   Region */
-   u8 res2[0x80 - 0x74];
+   u8 res2[0x78 - 0x74];
+   u64 snums_en;
+   u32 l2l3baseptr;/* top byte consists of a few other bit fields 
*/
+
+   u16 mtu[8];
+   u8 res3[0xa8 - 0x94];
+   u32 wrrtablebase;   /* top byte is reserved */
+   u8 res4[0xc0 - 0xac];
 } __packed;
 
 /* structure representing Extended Filtering Global Parameters in PRAM */
-- 
2.27.0



[PATCH AUTOSEL 4.14 11/13] net: ethernet: fs_enet: Add missing MODULE_LICENSE

2021-01-12 Thread Sasha Levin
From: Michael Ellerman 

[ Upstream commit 445c6198fe7be03b7d38e66fe8d4b3187bc251d4 ]

Since commit 1d6cd3929360 ("modpost: turn missing MODULE_LICENSE()
into error") the ppc32_allmodconfig build fails with:

  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-fec.o
  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-bitbang.o

Add the missing MODULE_LICENSEs to fix the build. Both files include a
copyright header indicating they are GPL v2.

Signed-off-by: Michael Ellerman 
Reviewed-by: Andrew Lunn 
Signed-off-by: David S. Miller 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c | 1 +
 drivers/net/ethernet/freescale/fs_enet/mii-fec.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
index c8e5d889bd81f..21de56345503f 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
@@ -223,3 +223,4 @@ static struct platform_driver fs_enet_bb_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_bb_mdio_driver);
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
index 1582d82483eca..4e6a9c5d8af55 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
@@ -224,3 +224,4 @@ static struct platform_driver fs_enet_fec_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_fec_mdio_driver);
+MODULE_LICENSE("GPL");
-- 
2.27.0



[PATCH AUTOSEL 4.14 05/13] ethernet: ucc_geth: fix definition and size of ucc_geth_tx_global_pram

2021-01-12 Thread Sasha Levin
From: Rasmus Villemoes 

[ Upstream commit 887078de2a23689e29d6fa1b75d7cbc544c280be ]

Table 8-53 in the QUICC Engine Reference manual shows definitions of
fields up to a size of 192 bytes, not just 128. But in table 8-111,
one does find the text

  Base Address of the Global Transmitter Parameter RAM Page. [...]
  The user needs to allocate 128 bytes for this page. The address must
  be aligned to the page size.

I've checked both rev. 7 (11/2015) and rev. 9 (05/2018) of the manual;
they both have this inconsistency (and the table numbers are the
same).

Adding a bit of debug printing, on my board the struct
ucc_geth_tx_global_pram is allocated at offset 0x880, while
the (opaque) ucc_geth_thread_data_tx gets allocated immediately
afterwards, at 0x900. So whatever the engine writes into the thread
data overlaps with the tail of the global tx pram (and devmem says
that something does get written during a simple ping).

I haven't observed any failure that could be attributed to this, but
it seems to be the kind of thing that would be extremely hard to
debug. So extend the struct definition so that we do allocate 192
bytes.

Signed-off-by: Rasmus Villemoes 
Signed-off-by: Jakub Kicinski 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/ucc_geth.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/ucc_geth.h 
b/drivers/net/ethernet/freescale/ucc_geth.h
index 5da19b440a6a8..bf25e49d4fe34 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.h
+++ b/drivers/net/ethernet/freescale/ucc_geth.h
@@ -580,7 +580,14 @@ struct ucc_geth_tx_global_pram {
u32 vtagtable[0x8]; /* 8 4-byte VLAN tags */
u32 tqptr;  /* a base pointer to the Tx Queues Memory
   Region */
-   u8 res2[0x80 - 0x74];
+   u8 res2[0x78 - 0x74];
+   u64 snums_en;
+   u32 l2l3baseptr;/* top byte consists of a few other bit fields 
*/
+
+   u16 mtu[8];
+   u8 res3[0xa8 - 0x94];
+   u32 wrrtablebase;   /* top byte is reserved */
+   u8 res4[0xc0 - 0xac];
 } __packed;
 
 /* structure representing Extended Filtering Global Parameters in PRAM */
-- 
2.27.0



[PATCH AUTOSEL 4.19 13/16] net: ethernet: fs_enet: Add missing MODULE_LICENSE

2021-01-12 Thread Sasha Levin
From: Michael Ellerman 

[ Upstream commit 445c6198fe7be03b7d38e66fe8d4b3187bc251d4 ]

Since commit 1d6cd3929360 ("modpost: turn missing MODULE_LICENSE()
into error") the ppc32_allmodconfig build fails with:

  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-fec.o
  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-bitbang.o

Add the missing MODULE_LICENSEs to fix the build. Both files include a
copyright header indicating they are GPL v2.

Signed-off-by: Michael Ellerman 
Reviewed-by: Andrew Lunn 
Signed-off-by: David S. Miller 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c | 1 +
 drivers/net/ethernet/freescale/fs_enet/mii-fec.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
index c8e5d889bd81f..21de56345503f 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
@@ -223,3 +223,4 @@ static struct platform_driver fs_enet_bb_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_bb_mdio_driver);
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
index 1582d82483eca..4e6a9c5d8af55 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
@@ -224,3 +224,4 @@ static struct platform_driver fs_enet_fec_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_fec_mdio_driver);
+MODULE_LICENSE("GPL");
-- 
2.27.0



[PATCH AUTOSEL 4.19 05/16] ethernet: ucc_geth: fix definition and size of ucc_geth_tx_global_pram

2021-01-12 Thread Sasha Levin
From: Rasmus Villemoes 

[ Upstream commit 887078de2a23689e29d6fa1b75d7cbc544c280be ]

Table 8-53 in the QUICC Engine Reference manual shows definitions of
fields up to a size of 192 bytes, not just 128. But in table 8-111,
one does find the text

  Base Address of the Global Transmitter Parameter RAM Page. [...]
  The user needs to allocate 128 bytes for this page. The address must
  be aligned to the page size.

I've checked both rev. 7 (11/2015) and rev. 9 (05/2018) of the manual;
they both have this inconsistency (and the table numbers are the
same).

Adding a bit of debug printing, on my board the struct
ucc_geth_tx_global_pram is allocated at offset 0x880, while
the (opaque) ucc_geth_thread_data_tx gets allocated immediately
afterwards, at 0x900. So whatever the engine writes into the thread
data overlaps with the tail of the global tx pram (and devmem says
that something does get written during a simple ping).

I haven't observed any failure that could be attributed to this, but
it seems to be the kind of thing that would be extremely hard to
debug. So extend the struct definition so that we do allocate 192
bytes.

Signed-off-by: Rasmus Villemoes 
Signed-off-by: Jakub Kicinski 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/ucc_geth.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/ucc_geth.h 
b/drivers/net/ethernet/freescale/ucc_geth.h
index 5da19b440a6a8..bf25e49d4fe34 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.h
+++ b/drivers/net/ethernet/freescale/ucc_geth.h
@@ -580,7 +580,14 @@ struct ucc_geth_tx_global_pram {
u32 vtagtable[0x8]; /* 8 4-byte VLAN tags */
u32 tqptr;  /* a base pointer to the Tx Queues Memory
   Region */
-   u8 res2[0x80 - 0x74];
+   u8 res2[0x78 - 0x74];
+   u64 snums_en;
+   u32 l2l3baseptr;/* top byte consists of a few other bit fields 
*/
+
+   u16 mtu[8];
+   u8 res3[0xa8 - 0x94];
+   u32 wrrtablebase;   /* top byte is reserved */
+   u8 res4[0xc0 - 0xac];
 } __packed;
 
 /* structure representing Extended Filtering Global Parameters in PRAM */
-- 
2.27.0



[PATCH AUTOSEL 5.4 20/28] net: ethernet: fs_enet: Add missing MODULE_LICENSE

2021-01-12 Thread Sasha Levin
From: Michael Ellerman 

[ Upstream commit 445c6198fe7be03b7d38e66fe8d4b3187bc251d4 ]

Since commit 1d6cd3929360 ("modpost: turn missing MODULE_LICENSE()
into error") the ppc32_allmodconfig build fails with:

  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-fec.o
  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-bitbang.o

Add the missing MODULE_LICENSEs to fix the build. Both files include a
copyright header indicating they are GPL v2.

Signed-off-by: Michael Ellerman 
Reviewed-by: Andrew Lunn 
Signed-off-by: David S. Miller 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c | 1 +
 drivers/net/ethernet/freescale/fs_enet/mii-fec.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
index c8e5d889bd81f..21de56345503f 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
@@ -223,3 +223,4 @@ static struct platform_driver fs_enet_bb_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_bb_mdio_driver);
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
index 1582d82483eca..4e6a9c5d8af55 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
@@ -224,3 +224,4 @@ static struct platform_driver fs_enet_fec_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_fec_mdio_driver);
+MODULE_LICENSE("GPL");
-- 
2.27.0



[PATCH AUTOSEL 5.4 08/28] ethernet: ucc_geth: fix definition and size of ucc_geth_tx_global_pram

2021-01-12 Thread Sasha Levin
From: Rasmus Villemoes 

[ Upstream commit 887078de2a23689e29d6fa1b75d7cbc544c280be ]

Table 8-53 in the QUICC Engine Reference manual shows definitions of
fields up to a size of 192 bytes, not just 128. But in table 8-111,
one does find the text

  Base Address of the Global Transmitter Parameter RAM Page. [...]
  The user needs to allocate 128 bytes for this page. The address must
  be aligned to the page size.

I've checked both rev. 7 (11/2015) and rev. 9 (05/2018) of the manual;
they both have this inconsistency (and the table numbers are the
same).

Adding a bit of debug printing, on my board the struct
ucc_geth_tx_global_pram is allocated at offset 0x880, while
the (opaque) ucc_geth_thread_data_tx gets allocated immediately
afterwards, at 0x900. So whatever the engine writes into the thread
data overlaps with the tail of the global tx pram (and devmem says
that something does get written during a simple ping).

I haven't observed any failure that could be attributed to this, but
it seems to be the kind of thing that would be extremely hard to
debug. So extend the struct definition so that we do allocate 192
bytes.

Signed-off-by: Rasmus Villemoes 
Signed-off-by: Jakub Kicinski 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/ucc_geth.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/ucc_geth.h 
b/drivers/net/ethernet/freescale/ucc_geth.h
index a86a42131fc71..b00fbef612cfe 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.h
+++ b/drivers/net/ethernet/freescale/ucc_geth.h
@@ -576,7 +576,14 @@ struct ucc_geth_tx_global_pram {
u32 vtagtable[0x8]; /* 8 4-byte VLAN tags */
u32 tqptr;  /* a base pointer to the Tx Queues Memory
   Region */
-   u8 res2[0x80 - 0x74];
+   u8 res2[0x78 - 0x74];
+   u64 snums_en;
+   u32 l2l3baseptr;/* top byte consists of a few other bit fields 
*/
+
+   u16 mtu[8];
+   u8 res3[0xa8 - 0x94];
+   u32 wrrtablebase;   /* top byte is reserved */
+   u8 res4[0xc0 - 0xac];
 } __packed;
 
 /* structure representing Extended Filtering Global Parameters in PRAM */
-- 
2.27.0



[PATCH AUTOSEL 5.10 36/51] net: ethernet: fs_enet: Add missing MODULE_LICENSE

2021-01-12 Thread Sasha Levin
From: Michael Ellerman 

[ Upstream commit 445c6198fe7be03b7d38e66fe8d4b3187bc251d4 ]

Since commit 1d6cd3929360 ("modpost: turn missing MODULE_LICENSE()
into error") the ppc32_allmodconfig build fails with:

  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-fec.o
  ERROR: modpost: missing MODULE_LICENSE() in 
drivers/net/ethernet/freescale/fs_enet/mii-bitbang.o

Add the missing MODULE_LICENSEs to fix the build. Both files include a
copyright header indicating they are GPL v2.

Signed-off-by: Michael Ellerman 
Reviewed-by: Andrew Lunn 
Signed-off-by: David S. Miller 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c | 1 +
 drivers/net/ethernet/freescale/fs_enet/mii-fec.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
index c8e5d889bd81f..21de56345503f 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c
@@ -223,3 +223,4 @@ static struct platform_driver fs_enet_bb_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_bb_mdio_driver);
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c 
b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
index 8b51ee142fa3c..152f4d83765aa 100644
--- a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
+++ b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c
@@ -224,3 +224,4 @@ static struct platform_driver fs_enet_fec_mdio_driver = {
 };
 
 module_platform_driver(fs_enet_fec_mdio_driver);
+MODULE_LICENSE("GPL");
-- 
2.27.0



[PATCH AUTOSEL 5.10 14/51] ethernet: ucc_geth: fix definition and size of ucc_geth_tx_global_pram

2021-01-12 Thread Sasha Levin
From: Rasmus Villemoes 

[ Upstream commit 887078de2a23689e29d6fa1b75d7cbc544c280be ]

Table 8-53 in the QUICC Engine Reference manual shows definitions of
fields up to a size of 192 bytes, not just 128. But in table 8-111,
one does find the text

  Base Address of the Global Transmitter Parameter RAM Page. [...]
  The user needs to allocate 128 bytes for this page. The address must
  be aligned to the page size.

I've checked both rev. 7 (11/2015) and rev. 9 (05/2018) of the manual;
they both have this inconsistency (and the table numbers are the
same).

Adding a bit of debug printing, on my board the struct
ucc_geth_tx_global_pram is allocated at offset 0x880, while
the (opaque) ucc_geth_thread_data_tx gets allocated immediately
afterwards, at 0x900. So whatever the engine writes into the thread
data overlaps with the tail of the global tx pram (and devmem says
that something does get written during a simple ping).

I haven't observed any failure that could be attributed to this, but
it seems to be the kind of thing that would be extremely hard to
debug. So extend the struct definition so that we do allocate 192
bytes.

Signed-off-by: Rasmus Villemoes 
Signed-off-by: Jakub Kicinski 
Signed-off-by: Sasha Levin 
---
 drivers/net/ethernet/freescale/ucc_geth.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/ucc_geth.h 
b/drivers/net/ethernet/freescale/ucc_geth.h
index 3fe9039721952..c80bed2c995c1 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.h
+++ b/drivers/net/ethernet/freescale/ucc_geth.h
@@ -575,7 +575,14 @@ struct ucc_geth_tx_global_pram {
u32 vtagtable[0x8]; /* 8 4-byte VLAN tags */
u32 tqptr;  /* a base pointer to the Tx Queues Memory
   Region */
-   u8 res2[0x80 - 0x74];
+   u8 res2[0x78 - 0x74];
+   u64 snums_en;
+   u32 l2l3baseptr;/* top byte consists of a few other bit fields 
*/
+
+   u16 mtu[8];
+   u8 res3[0xa8 - 0x94];
+   u32 wrrtablebase;   /* top byte is reserved */
+   u8 res4[0xc0 - 0xac];
 } __packed;
 
 /* structure representing Extended Filtering Global Parameters in PRAM */
-- 
2.27.0



Re: [PATCH v2] powerpc/vdso: fix clock_gettime_fallback for vdso32

2021-01-12 Thread Christophe Leroy




Le 12/01/2021 à 02:13, Michael Ellerman a écrit :

Christophe Leroy  writes:

From: Andreas Schwab 

The second argument of __kernel_clock_gettime64 points to a struct
__kernel_timespec, with 64-bit time_t, so use the clock_gettime64 syscall
in the fallback function for the 32-bit vdso.  Similarly,
clock_getres_fallback should use the clock_getres_time64 syscall, though
it isn't yet called from the 32-bit vdso.

Signed-off-by: Andreas Schwab 
[chleroy: Moved into the #ifdef CONFIG_VDSO32 block]


That doesn't build for 64-bit with compat VDSO. Should I just take
Andreas' version, or do you want to send a v3?


Yes, that #ifdef CONFIG_VDSO32 is crazy, should use __powerpc64__ instead, or __VDSO32__ but that 
one is defined in asflags-y so not sure we have it in C.


I sent v3 just before lunch.

Christophe



cheers


Fixes: d0e3fc69d00d ("powerpc/vdso: Provide __kernel_clock_gettime64() on 
vdso32")
Signed-off-by: Christophe Leroy 
---
  arch/powerpc/include/asm/vdso/gettimeofday.h | 27 +++-
  1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h 
b/arch/powerpc/include/asm/vdso/gettimeofday.h
index 7a215cc5da77..3ecddd9c6302 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -102,22 +102,22 @@ int gettimeofday_fallback(struct __kernel_old_timeval 
*_tv, struct timezone *_tz
return do_syscall_2(__NR_gettimeofday, (unsigned long)_tv, (unsigned 
long)_tz);
  }
  
+#ifdef CONFIG_VDSO32

+
+#define BUILD_VDSO32   1
+
  static __always_inline
  int clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
  {
-   return do_syscall_2(__NR_clock_gettime, _clkid, (unsigned long)_ts);
+   return do_syscall_2(__NR_clock_gettime64, _clkid, (unsigned long)_ts);
  }
  
  static __always_inline

  int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
  {
-   return do_syscall_2(__NR_clock_getres, _clkid, (unsigned long)_ts);
+   return do_syscall_2(__NR_clock_getres_time64, _clkid, (unsigned 
long)_ts);
  }
  
-#ifdef CONFIG_VDSO32

-
-#define BUILD_VDSO32   1
-
  static __always_inline
  int clock_gettime32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
  {
@@ -129,6 +129,21 @@ int clock_getres32_fallback(clockid_t _clkid, struct 
old_timespec32 *_ts)
  {
return do_syscall_2(__NR_clock_getres, _clkid, (unsigned long)_ts);
  }
+
+#else
+
+static __always_inline
+int clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+   return do_syscall_2(__NR_clock_gettime, _clkid, (unsigned long)_ts);
+}
+
+static __always_inline
+int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+   return do_syscall_2(__NR_clock_getres, _clkid, (unsigned long)_ts);
+}
+
  #endif
  
  static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,

--
2.25.0


Re: [PATCH 1/2] dmaengine: fsldma: Fix a resource leak in the remove function

2021-01-12 Thread Vinod Koul
On 12-12-20, 17:05, Christophe JAILLET wrote:
> A 'irq_dispose_mapping()' call is missing in the remove function.
> Add it.
> 
> This is needed to undo the 'irq_of_parse_and_map() call from the probe
> function and already part of the error handling path of the probe function.
> 
> It was added in the probe function only in commit d3f620b2c4fe ("fsldma:
> simplify IRQ probing and handling")

Applied both, thanks

-- 
~Vinod


[PATCH v3] powerpc/vdso: fix clock_gettime_fallback for vdso32

2021-01-12 Thread Christophe Leroy
From: Andreas Schwab 

The second argument of __kernel_clock_gettime64 points to a struct
__kernel_timespec, with 64-bit time_t, so use the clock_gettime64 syscall
in the fallback function for the 32-bit vdso.  Similarly,
clock_getres_fallback should use the clock_getres_time64 syscall, though
it isn't yet called from the 32-bit vdso.

Signed-off-by: Andreas Schwab 
[chleroy: Moved into a single #ifdef __powerpc64__ block]
Fixes: d0e3fc69d00d ("powerpc/vdso: Provide __kernel_clock_gettime64() on 
vdso32")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/vdso/gettimeofday.h | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h 
b/arch/powerpc/include/asm/vdso/gettimeofday.h
index 7a215cc5da77..d453e725c79f 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -102,6 +102,8 @@ int gettimeofday_fallback(struct __kernel_old_timeval *_tv, 
struct timezone *_tz
return do_syscall_2(__NR_gettimeofday, (unsigned long)_tv, (unsigned 
long)_tz);
 }
 
+#ifdef __powerpc64__
+
 static __always_inline
 int clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
 {
@@ -114,10 +116,22 @@ int clock_getres_fallback(clockid_t _clkid, struct 
__kernel_timespec *_ts)
return do_syscall_2(__NR_clock_getres, _clkid, (unsigned long)_ts);
 }
 
-#ifdef CONFIG_VDSO32
+#else
 
 #define BUILD_VDSO32   1
 
+static __always_inline
+int clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+   return do_syscall_2(__NR_clock_gettime64, _clkid, (unsigned long)_ts);
+}
+
+static __always_inline
+int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+   return do_syscall_2(__NR_clock_getres_time64, _clkid, (unsigned 
long)_ts);
+}
+
 static __always_inline
 int clock_gettime32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
 {
-- 
2.25.0



Re: [PATCH v4 01/21] ibmvfc: add vhost fields and defaults for MQ enablement

2021-01-12 Thread John Garry

On 11/01/2021 23:12, Tyrel Datwyler wrote:

Introduce several new vhost fields for managing MQ state of the adapter
as well as initial defaults for MQ enablement.

Signed-off-by: Tyrel Datwyler 
---
  drivers/scsi/ibmvscsi/ibmvfc.c | 8 
  drivers/scsi/ibmvscsi/ibmvfc.h | 9 +
  2 files changed, 17 insertions(+)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index ba95438a8912..9200fe49c57e 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -3302,6 +3302,7 @@ static struct scsi_host_template driver_template = {
.max_sectors = IBMVFC_MAX_SECTORS,
.shost_attrs = ibmvfc_attrs,
.track_queue_depth = 1,
+   .host_tagset = 1,


Good to see another user :)

I didn't check the whole series very thoroughly, but I guess that you 
only need to set this when shost->nr_hw_queues > 1. Having said that, it 
should be fine when shost->nr_hw_queues = 0 or 1.


Thanks,
John


  };
  
  /**

@@ -5290,6 +5291,7 @@ static int ibmvfc_probe(struct vio_dev *vdev, const 
struct vio_device_id *id)
shost->max_sectors = IBMVFC_MAX_SECTORS;
shost->max_cmd_len = IBMVFC_MAX_CDB_LEN;
shost->unique_id = shost->host_no;
+   shost->nr_hw_queues = IBMVFC_MQ ? IBMVFC_SCSI_HW_QUEUES : 1;
  
  	vhost = shost_priv(shost);

	INIT_LIST_HEAD(&vhost->targets);
@@ -5300,6 +5302,12 @@ static int ibmvfc_probe(struct vio_dev *vdev, const 
struct vio_device_id *id)
vhost->partition_number = -1;
vhost->log_level = log_level;
vhost->task_set = 1;
+
+   vhost->mq_enabled = IBMVFC_MQ;
+   vhost->client_scsi_channels = IBMVFC_SCSI_CHANNELS;
+   vhost->using_channels = 0;
+   vhost->do_enquiry = 1;
+
strcpy(vhost->partition_name, "UNKNOWN");
	init_waitqueue_head(&vhost->work_wait_q);
	init_waitqueue_head(&vhost->init_wait_q);
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index 632e977449c5..dd6d89292867 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -41,6 +41,11 @@
  #define IBMVFC_DEFAULT_LOG_LEVEL  2
  #define IBMVFC_MAX_CDB_LEN16
  #define IBMVFC_CLS3_ERROR 0
+#define IBMVFC_MQ  0
+#define IBMVFC_SCSI_CHANNELS   0
+#define IBMVFC_SCSI_HW_QUEUES  1
+#define IBMVFC_MIG_NO_SUB_TO_CRQ   0
+#define IBMVFC_MIG_NO_N_TO_M   0
  
  /*

   * Ensure we have resources for ERP and initialization:
@@ -840,6 +845,10 @@ struct ibmvfc_host {
int delay_init;
int scan_complete;
int logged_in;
+   int mq_enabled;
+   int using_channels;
+   int do_enquiry;
+   int client_scsi_channels;
int aborting_passthru;
int events_to_log;
  #define IBMVFC_AE_LINKUP  0x0001





Re: [PATCH] tools/perf: Fix powerpc gap between kernel end and module start

2021-01-12 Thread Jiri Olsa
On Mon, Dec 28, 2020 at 09:14:14PM -0500, Athira Rajeev wrote:

SNIP

> c2799370 b backtrace_flag
> c2799378 B radix_tree_node_cachep
> c2799380 B __bss_stop
> c27a B _end
> c0080389 t icmp_checkentry  [ip_tables]
> c00803890038 t ipt_alloc_initial_table  [ip_tables]
> c00803890468 T ipt_do_table [ip_tables]
> c00803890de8 T ipt_unregister_table_pre_exit[ip_tables]
> ...
> 
> Perf calls function symbols__fixup_end() which sets the end of symbol
> to 0xc0080389, which is the next address and this is the start
> address of first module (icmp_checkentry in above) which will make the
> huge symbol size of 0x8010f.
> 
> After symbols__fixup_end:
> symbols__fixup_end: sym->name: _end, sym->start: 0xc27a,
> sym->end: 0xc0080389
> 
> On powerpc, kernel text segment is located at 0xc000
> whereas the modules are located at very high memory addresses,
> 0xc0080xxx. Since the gap between end of kernel text segment
> and beginning of first module's address is high, histogram allocation
> using calloc fails.
> 
> Fix this by detecting the kernel's last symbol and limiting
> the range of last kernel symbol to pagesize.
> 
> Signed-off-by: Athira Rajeev

I can't test, but since the same approach works for arm and s390,
this also looks ok

Acked-by: Jiri Olsa 

thanks,
jirka

> ---
>  tools/perf/arch/powerpc/util/Build |  1 +
>  tools/perf/arch/powerpc/util/machine.c | 24 
>  2 files changed, 25 insertions(+)
>  create mode 100644 tools/perf/arch/powerpc/util/machine.c
> 
> diff --git a/tools/perf/arch/powerpc/util/Build 
> b/tools/perf/arch/powerpc/util/Build
> index e86e210bf514..b7945e5a543b 100644
> --- a/tools/perf/arch/powerpc/util/Build
> +++ b/tools/perf/arch/powerpc/util/Build
> @@ -1,4 +1,5 @@
>  perf-y += header.o
> +perf-y += machine.o
>  perf-y += kvm-stat.o
>  perf-y += perf_regs.o
>  perf-y += mem-events.o
> diff --git a/tools/perf/arch/powerpc/util/machine.c 
> b/tools/perf/arch/powerpc/util/machine.c
> new file mode 100644
> index ..c30e5cc88c16
> --- /dev/null
> +++ b/tools/perf/arch/powerpc/util/machine.c
> @@ -0,0 +1,24 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include 
> +#include 
> +#include  // page_size
> +#include "debug.h"
> +#include "symbol.h"
> +
> +/* On powerpc kernel text segment start at memory addresses, 
> 0xc000
> + * whereas the modules are located at very high memory addresses,
> + * for example 0xc0080xxx. The gap between end of kernel text segment
> + * and beginning of first module's text segment is very high.
> + * Therefore do not fill this gap and do not assign it to the kernel dso map.
> + */
> +
> +void arch__symbols__fixup_end(struct symbol *p, struct symbol *c)
> +{
> + if (strchr(p->name, '[') == NULL && strchr(c->name, '['))
> + /* Limit the range of last kernel symbol */
> + p->end += page_size;
> + else
> + p->end = c->start;
> + pr_debug4("%s sym:%s end:%#lx\n", __func__, p->name, p->end);
> +}
> -- 
> 1.8.3.1
>