[PATCH v5 17/17] powerpc64/bpf: Add support for bpf trampolines

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

Add support for bpf_arch_text_poke() and arch_prepare_bpf_trampoline()
for 64-bit powerpc. While the code is generic, BPF trampolines are only
enabled on 64-bit powerpc. 32-bit powerpc will need testing and some
updates.

BPF trampolines adhere to the existing ftrace ABI that uses a
two-instruction profiling sequence, as well as the newer ABI that uses a
three-instruction profiling sequence enabling return with a 'blr'. The
trampoline code itself closely follows the x86 implementation.

The BPF prog JIT is extended to mimic the 64-bit powerpc approach for
ftrace: a single nop at function entry, followed by the function
profiling sequence out of line and a separate long branch stub for calls
to trampolines that are out of range. A dummy_tramp is provided to
simplify synchronization, similar to arm64.

When attaching a bpf trampoline to a bpf prog, we can patch up to three
things:
- the nop at bpf prog entry to go to the out-of-line stub
- the instruction in the out-of-line stub to either call the bpf trampoline
directly, or to branch to the long_branch stub.
- the trampoline address before the long_branch stub.

We do not need any synchronization here since we always have a valid
branch target regardless of the order in which the above stores are
seen. dummy_tramp ensures that the long_branch stub goes to a valid
destination on other cpus, even when the branch to the long_branch stub
is seen before the updated trampoline address.

However, when detaching a bpf trampoline from a bpf prog, or if changing
the bpf trampoline address, we need synchronization to ensure that other
cpus can no longer branch into the older trampoline so that it can be
safely freed. bpf_tramp_image_put() uses rcu_tasks to ensure all cpus
make forward progress, but we still need to ensure that other cpus
execute isync (or some other context synchronizing instruction) so that
they don't go back into the trampoline again.
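
For illustration, a rough sketch of the attach path described above. The
structure and helper names here are illustrative only, not the actual
implementation; patch_ulong(), patch_branch() and
is_offset_in_branch_range() are existing powerpc code-patching primitives:

	/* hypothetical per-prog layout, for illustration only */
	struct bpf_prog_patch_sites {
		u32 *prog_entry;		/* nop at bpf prog entry */
		u32 *ool_stub;			/* out-of-line profiling stub */
		u32 *long_branch;		/* long branch stub */
		unsigned long *tramp_addr;	/* trampoline address slot before long_branch */
	};

	static int attach_bpf_trampoline(struct bpf_prog_patch_sites *p, unsigned long tramp)
	{
		int err;

		/* publish the trampoline address used by the long_branch stub */
		err = patch_ulong(p->tramp_addr, tramp);

		/* ool stub: call the trampoline directly, or branch to the long_branch stub */
		if (!err) {
			if (is_offset_in_branch_range(tramp - (unsigned long)p->ool_stub))
				err = patch_branch(p->ool_stub, tramp, BRANCH_SET_LINK);
			else
				err = patch_branch(p->ool_stub, (unsigned long)p->long_branch, 0);
		}

		/* bpf prog entry: nop -> branch to the ool stub */
		if (!err)
			err = patch_branch(p->prog_entry, (unsigned long)p->ool_stub, 0);

		return err;
	}

As described above, the ordering of these stores does not matter on
attach since every intermediate state has a valid branch target (with
dummy_tramp backing the long_branch stub); only detach and re-attach need
the additional synchronization.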

Signed-off-by: Naveen N Rao 
---
 arch/powerpc/include/asm/ppc-opcode.h |  14 +
 arch/powerpc/net/bpf_jit.h|  12 +
 arch/powerpc/net/bpf_jit_comp.c   | 847 +-
 arch/powerpc/net/bpf_jit_comp32.c |   7 +-
 arch/powerpc/net/bpf_jit_comp64.c |   7 +-
 5 files changed, 884 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index b98a9e982c03..4312bcb913a4 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -587,12 +587,26 @@
 #define PPC_RAW_MTSPR(spr, d)  (0x7c0003a6 | ___PPC_RS(d) | 
__PPC_SPR(spr))
 #define PPC_RAW_EIEIO()(0x7c0006ac)
 
+/* bcl 20,31,$+4 */
+#define PPC_RAW_BCL4() (0x429f0005)
 #define PPC_RAW_BRANCH(offset) (0x48000000 | PPC_LI(offset))
 #define PPC_RAW_BL(offset) (0x48000001 | PPC_LI(offset))
 #define PPC_RAW_TW(t0, a, b)   (0x7c000008 | ___PPC_RS(t0) | ___PPC_RA(a) | ___PPC_RB(b))
 #define PPC_RAW_TRAP() PPC_RAW_TW(31, 0, 0)
 #define PPC_RAW_SETB(t, bfa)   (0x7c000100 | ___PPC_RT(t) | 
___PPC_RA((bfa) << 2))
 
+#ifdef CONFIG_PPC32
+#define PPC_RAW_STL    PPC_RAW_STW
+#define PPC_RAW_STLU   PPC_RAW_STWU
+#define PPC_RAW_LL PPC_RAW_LWZ
+#define PPC_RAW_CMPLI  PPC_RAW_CMPWI
+#else
+#define PPC_RAW_STL    PPC_RAW_STD
+#define PPC_RAW_STLU   PPC_RAW_STDU
+#define PPC_RAW_LL PPC_RAW_LD
+#define PPC_RAW_CMPLI  PPC_RAW_CMPDI
+#endif
+
 /* Deal with instructions that older assemblers aren't aware of */
 #definePPC_BCCTR_FLUSH stringify_in_c(.long 
PPC_INST_BCCTR_FLUSH)
 #definePPC_CP_ABORTstringify_in_c(.long PPC_RAW_CP_ABORT)
diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index cdea5dccaefe..2d04ce5a23da 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -12,6 +12,7 @@
 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_PPC64_ELF_ABI_V1
 #define FUNCTION_DESCR_SIZE24
@@ -21,6 +22,9 @@
 
 #define CTX_NIA(ctx) ((unsigned long)ctx->idx * 4)
 
+#define SZL    sizeof(unsigned long)
+#define BPF_INSN_SAFETY    64
+
 #define PLANT_INSTR(d, idx, instr)   \
do { if (d) { (d)[idx] = instr; } idx++; } while (0)
 #define EMIT(instr)PLANT_INSTR(image, ctx->idx, instr)
@@ -81,6 +85,13 @@
EMIT(PPC_RAW_ORI(d, d, (uintptr_t)(i) &   \
                                                0xffff));             \
} } while (0)
+#define PPC_LI_ADDR    PPC_LI64
+#define PPC64_LOAD_PACA()\
+   EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, kernel_toc)))
+#else
+#define PPC_LI64(d, i) BUILD_BUG()
+#define PPC_LI_ADDR    PPC_LI32
+#define PPC64_LOAD_PACA() BUILD_BUG()
 #endif
 
 /*
@@ -165,6 +176,7 @@ int bpf_jit_build_body(struct bp

[PATCH v5 16/17] samples/ftrace: Add support for ftrace direct samples on powerpc

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

Add powerpc 32-bit and 64-bit samples for ftrace direct. This serves to
show the sample instruction sequence to be used by ftrace direct calls
to adhere to the ftrace ABI.

On 64-bit powerpc, TOC setup requires some additional work.

Signed-off-by: Naveen N Rao 
---
 arch/powerpc/Kconfig|   2 +
 samples/ftrace/ftrace-direct-modify.c   |  85 +++-
 samples/ftrace/ftrace-direct-multi-modify.c | 101 +++-
 samples/ftrace/ftrace-direct-multi.c|  79 ++-
 samples/ftrace/ftrace-direct-too.c  |  83 +++-
 samples/ftrace/ftrace-direct.c  |  69 -
 6 files changed, 414 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index ef845ea4dd27..1e093ed287fe 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -275,6 +275,8 @@ config PPC
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE
select HAVE_RSEQ
+   select HAVE_SAMPLE_FTRACE_DIRECTif 
HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+   select HAVE_SAMPLE_FTRACE_DIRECT_MULTI  if 
HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
select HAVE_SETUP_PER_CPU_AREA  if PPC64
select HAVE_SOFTIRQ_ON_OWN_STACK
select HAVE_STACKPROTECTOR  if PPC32 && 
$(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2)
diff --git a/samples/ftrace/ftrace-direct-modify.c 
b/samples/ftrace/ftrace-direct-modify.c
index 81220390851a..cfea7a38befb 100644
--- a/samples/ftrace/ftrace-direct-modify.c
+++ b/samples/ftrace/ftrace-direct-modify.c
@@ -2,7 +2,7 @@
 #include 
 #include 
 #include 
-#ifndef CONFIG_ARM64
+#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32)
 #include 
 #endif
 
@@ -199,6 +199,89 @@ asm (
 
 #endif /* CONFIG_LOONGARCH */
 
+#ifdef CONFIG_PPC
+#include 
+
+#ifdef CONFIG_PPC64
+#define STACK_FRAME_SIZE 48
+#else
+#define STACK_FRAME_SIZE 24
+#endif
+
+#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL)
+#define PPC64_TOC_SAVE_AND_UPDATE  \
+"  std 2, 24(1)\n" \
+"  bcl 20, 31, 1f\n"   \
+"   1: mflr12\n"   \
+"  ld  2, (99f - 1b)(12)\n"
+#define PPC64_TOC_RESTORE  \
+"  ld  2, 24(1)\n"
+#define PPC64_TOC  \
+"   99:.quad   .TOC.@tocbase\n"
+#else
+#define PPC64_TOC_SAVE_AND_UPDATE ""
+#define PPC64_TOC_RESTORE ""
+#define PPC64_TOC ""
+#endif
+
+#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
+#define PPC_FTRACE_RESTORE_LR  \
+   PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"   \
+"  mtlr0\n"
+#define PPC_FTRACE_RET \
+"  blr\n"
+#else
+#define PPC_FTRACE_RESTORE_LR  \
+   PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"   \
+"  mtctr   0\n"
+#define PPC_FTRACE_RET \
+"  mtlr0\n"\
+"  bctr\n"
+#endif
+
+asm (
+"  .pushsection.text, \"ax\", @progbits\n"
+"  .type   my_tramp1, @function\n"
+"  .globl  my_tramp1\n"
+"   my_tramp1:\n"
+   PPC_STL"0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+   PPC_STLU"   1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+"  mflr0\n"
+   PPC_STL"0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+   PPC_STLU"   1, -"__stringify(STACK_FRAME_SIZE)"(1)\n"
+   PPC64_TOC_SAVE_AND_UPDATE
+"  bl  my_direct_func1\n"
+   PPC64_TOC_RESTORE
+"  addi1, 1, "__stringify(STACK_FRAME_SIZE)"\n"
+   PPC_FTRACE_RESTORE_LR
+"  addi1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n"
+   PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+   PPC_FTRACE_RET
+"  .size   my_tramp1, .-my_tramp1\n"
+
+"  .type   my_tramp2, @function\n"
+"  .globl  my_tramp2\n"
+"   my_tramp2:\n"
+   PPC_STL"0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+   PPC_STLU"   1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n"
+"  mflr0\n"
+   PPC_STL"0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+   PPC_STLU"   1, -"__stringify(STACK_FRAME_SIZE)"(1)\n"
+   PPC64_TOC_SAVE_AND_UPDATE
+"  bl  my_direct_func2\n"
+   PPC64_TOC_RESTORE
+"  addi1, 1, "__stringify(STACK_FRAME_SIZE)"\n"
+   PPC_FTRACE_RESTORE_LR
+"  addi1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n"
+   PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n"
+   PPC_FTRACE_RET
+   PPC64_TOC
+"  .size   my_tramp2, .-my_tramp2\n"
+"  .popsection\n"
+);
+
+#endif /* CONFIG_PPC */
+
 static struct ftrace_ops direct;
 
 static unsigned l

[PATCH v5 15/17] powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

Add support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS similar to the arm64
implementation.

ftrace direct calls allow custom trampolines to be called into directly
from function ftrace call sites, bypassing the ftrace trampoline
completely. This functionality is currently utilized by BPF trampolines
to hook into kernel function entries.

Since we have limited relative branch range, we support ftrace direct
calls through support for DYNAMIC_FTRACE_WITH_CALL_OPS. In this
approach, the ftrace trampoline is not entirely bypassed. Rather, it is
re-purposed into a stub that reads the direct_call field from the
associated ftrace_ops structure and branches into it, if it is not NULL.
For this, it is sufficient to ensure that the ftrace trampoline is
reachable from all traceable functions.

When multiple ftrace_ops are associated with a call site, we utilize a
callback to set pt_regs->orig_gpr3, which can then be tested on the
return path from the ftrace trampoline to branch into the direct caller.
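
The ftrace_entry.S changes are truncated below, so here is a conceptual C
rendering of what the re-purposed trampoline does. This is an
illustrative sketch only, not the actual implementation (the real logic
is assembly in ftrace_caller), and branch_to() is a hypothetical helper:

	/* conceptual only: the real code is assembly in ftrace_entry.S */
	static void ftrace_caller_conceptual(unsigned long ip, unsigned long parent_ip,
					     struct ftrace_ops *ops, struct ftrace_regs *fregs)
	{
		struct pt_regs *regs = &fregs->regs;

		if (ops->direct_call) {
			/* single direct-call ops: go straight to the custom trampoline */
			branch_to(ops->direct_call);		/* hypothetical helper */
			return;
		}

		regs->orig_gpr3 = 0;
		ops->func(ip, parent_ip, ops, fregs);		/* may be the list func */

		/*
		 * With multiple ftrace_ops at this site, a direct-call ops sets
		 * regs->orig_gpr3 via arch_ftrace_set_direct_caller(); branch to
		 * it on the return path instead of returning to the caller.
		 */
		if (regs->orig_gpr3)
			branch_to(regs->orig_gpr3);
	}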

Signed-off-by: Naveen N Rao 
---
 arch/powerpc/Kconfig |   1 +
 arch/powerpc/include/asm/ftrace.h|  16 
 arch/powerpc/kernel/asm-offsets.c|   3 +
 arch/powerpc/kernel/trace/ftrace.c   |  11 +++
 arch/powerpc/kernel/trace/ftrace_entry.S | 114 +--
 5 files changed, 116 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f1a0adedeb8e..ef845ea4dd27 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -235,6 +235,7 @@ config PPC
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_ARGSif 
ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32
select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if PPC_FTRACE_OUT_OF_LINE || 
(PPC32 && ARCH_USING_PATCHABLE_FUNCTION_ENTRY)
+   select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if 
HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
select HAVE_DYNAMIC_FTRACE_WITH_REGSif 
ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
diff --git a/arch/powerpc/include/asm/ftrace.h 
b/arch/powerpc/include/asm/ftrace.h
index 1ad1328cf4e3..5eb7631355a1 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -148,6 +148,22 @@ extern unsigned int ftrace_ool_stub_text_end_count, 
ftrace_ool_stub_text_count,
 #endif
 void ftrace_free_init_tramp(void);
 unsigned long ftrace_call_adjust(unsigned long addr);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/*
+ * When an ftrace registered caller is tracing a function that is also set by a
+ * register_ftrace_direct() call, it needs to be differentiated in the
+ * ftrace_caller trampoline so that the direct call can be invoked after the
+ * other ftrace ops. To do this, place the direct caller in the orig_gpr3 field
+ * of pt_regs. This tells ftrace_caller that there's a direct caller.
+ */
+static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, 
unsigned long addr)
+{
+   struct pt_regs *regs = &fregs->regs;
+
+   regs->orig_gpr3 = addr;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 #else
 static inline void ftrace_free_init_tramp(void) { }
 static inline unsigned long ftrace_call_adjust(unsigned long addr) { return 
addr; }
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 60d1e388c2ba..dbd56264a8bc 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -680,6 +680,9 @@ int main(void)
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
OFFSET(FTRACE_OPS_FUNC, ftrace_ops, func);
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+   OFFSET(FTRACE_OPS_DIRECT_CALL, ftrace_ops, direct_call);
+#endif
 #endif
 
return 0;
diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index 9090d1a21600..051f3db14606 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -150,6 +150,17 @@ static int ftrace_get_call_inst(struct dyn_ftrace *rec, 
unsigned long addr, ppc_
else
ip = rec->ip;
 
+   if (!is_offset_in_branch_range(addr - ip) && addr != FTRACE_ADDR &&
+   addr != FTRACE_REGS_ADDR) {
+   /* This can only happen with ftrace direct */
+   if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS)) {
+   pr_err("0x%lx (0x%lx): Unexpected target address 
0x%lx\n",
+  ip, rec->ip, addr);
+   return -EINVAL;
+   }
+   addr = FTRACE_ADDR;
+   }
+
if (is_offset_in_branch_range(addr - ip))
/* Within range */
stub = addr;
diff --git a/arch/powerpc/kernel/trace/ftrace_entry.S 
b/arch/powerpc/kernel/trace/ftrace_entry.S
index ff376c990308..2c1b24100eca 100644
--- a/arch/powerpc/kernel/trace/ftrace_entry.S
+++ b/

[PATCH v5 14/17] powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

Implement support for DYNAMIC_FTRACE_WITH_CALL_OPS similar to the
arm64 implementation.

This works by patching in a pointer to the associated ftrace_ops
structure before each traceable function. If multiple ftrace_ops are
associated with a call site, then a special ftrace_list_ops is used to
enable iterating over all the registered ftrace_ops. If no ftrace_ops
are associated with a call site, then a special ftrace_nop_ops structure
is used to render the ftrace call as a no-op. The ftrace trampoline can
then read the ftrace_ops associated with a call site by loading from an
offset from the LR, and branch directly to the associated function.

The primary advantage with this approach is that we don't have to
iterate over all the registered ftrace_ops for call sites that have a
single ftrace_ops registered. This is the equivalent of implementing
support for dynamic ftrace trampolines, which set up a special ftrace
trampoline for each registered ftrace_ops and have individual call sites
branch into those directly.

A secondary advantage is that this gives us a way to add support for
direct ftrace callers without having to resort to using stubs. The
address of the direct call trampoline can be loaded from the ftrace_ops
structure.

To support this, we reserve a nop before each function on 32-bit
powerpc. For 64-bit powerpc, two nops are reserved before each
out-of-line stub. During ftrace activation, we update this location with
the associated ftrace_ops pointer. Then, on ftrace entry, we load from
this location and call into ftrace_ops->func().

For 64-bit powerpc, we ensure that the out-of-line stub area is
doubleword aligned so that the ftrace_ops address can be updated atomically.
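
Conceptually, the trampoline recovers the ftrace_ops pointer for a call
site from the memory just before the patch site, using the return address
in LR. A simplified sketch follows; the helper name is illustrative and
the exact offset differs between the 32-bit in-function case and the
64-bit out-of-line stub case (the real load is done in ftrace_entry.S
assembly):

	/* illustrative only */
	static struct ftrace_ops *ftrace_ops_for_call_site(unsigned long lr)
	{
		/* LR points just past the 'bl ftrace_caller' at the patch site */
		unsigned long patch_site = lr - MCOUNT_INSN_SIZE;

		/* the ftrace_ops pointer is patched immediately before the patch site */
		return *(struct ftrace_ops **)(patch_site - sizeof(struct ftrace_ops *));
	}

Since the pointer slot is naturally aligned (doubleword aligned on 64-bit,
as noted above), it can be updated with a single store when ftrace is
(de)activated.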

Signed-off-by: Naveen N Rao 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/Makefile  |  4 ++
 arch/powerpc/include/asm/ftrace.h  |  5 +-
 arch/powerpc/kernel/asm-offsets.c  |  4 ++
 arch/powerpc/kernel/trace/ftrace.c | 59 +-
 arch/powerpc/kernel/trace/ftrace_entry.S   | 36 ++---
 arch/powerpc/tools/ftrace-gen-ool-stubs.sh |  5 +-
 7 files changed, 102 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a0ce00368bab..f1a0adedeb8e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -234,6 +234,7 @@ config PPC
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_ARGSif 
ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32
+   select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if PPC_FTRACE_OUT_OF_LINE || 
(PPC32 && ARCH_USING_PATCHABLE_FUNCTION_ENTRY)
select HAVE_DYNAMIC_FTRACE_WITH_REGSif 
ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32
select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index c973e6cd1ae8..7dede0ec0163 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -158,8 +158,12 @@ KBUILD_CPPFLAGS+= -DCC_USING_PATCHABLE_FUNCTION_ENTRY
 ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
 CC_FLAGS_FTRACE := -fpatchable-function-entry=1
 else
+ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS # PPC32 only
+CC_FLAGS_FTRACE := -fpatchable-function-entry=3,1
+else
 CC_FLAGS_FTRACE := -fpatchable-function-entry=2
 endif
+endif
 else
 CC_FLAGS_FTRACE := -pg
 ifdef CONFIG_MPROFILE_KERNEL
diff --git a/arch/powerpc/include/asm/ftrace.h 
b/arch/powerpc/include/asm/ftrace.h
index 28f3590ca780..1ad1328cf4e3 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -136,8 +136,11 @@ static inline u8 this_cpu_get_ftrace_enabled(void) { 
return 1; }
 extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[];
 #ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
 struct ftrace_ool_stub {
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+   struct ftrace_ops *ftrace_op;
+#endif
u32 insn[4];
-};
+} __aligned(sizeof(unsigned long));
 extern struct ftrace_ool_stub ftrace_ool_stub_text_end[], 
ftrace_ool_stub_text[],
  ftrace_ool_stub_inittext[];
 extern unsigned int ftrace_ool_stub_text_end_count, ftrace_ool_stub_text_count,
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 6854547d3164..60d1e388c2ba 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -678,5 +678,9 @@ int main(void)
DEFINE(FTRACE_OOL_STUB_SIZE, sizeof(struct ftrace_ool_stub));
 #endif
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+   OFFSET(FTRACE_OPS_FUNC, ftrace_ops, func);
+#endif
+
return 0;
 }
diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index bee2c54a8c04..9090d1a21600 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -38,8 +38,11 @@ unsigned long ftrace_call_adjust(unsigned long addr)
return

[PATCH v5 12/17] powerpc64/ftrace: Move ftrace sequence out of line

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

The function profiling sequence on powerpc includes two instructions at
the beginning of each function:
	mflr	r0
	bl	ftrace_caller

The call to ftrace_caller() gets nop'ed out during kernel boot and is
patched in when ftrace is enabled.

Given the sequence, we cannot return from ftrace_caller with 'blr' as we
need to keep LR and r0 intact. This results in link stack (return
address predictor) imbalance when ftrace is enabled. To address that, we
would like to use a three instruction sequence:
	mflr	r0
	bl	ftrace_caller
	mtlr	r0

Furthermore, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to
reserve two instruction slots before the function. This results in a
total of five instruction slots to be reserved for ftrace use on each
function that is traced.

Move the function profile sequence out-of-line to minimize its impact.
To do this, we reserve a single nop at function entry using
-fpatchable-function-entry=1 and add a pass on vmlinux.o to determine
the total number of functions that can be traced. This is then used to
generate a .S file reserving the appropriate amount of space for use as
ftrace stubs, which is built and linked into vmlinux.

On bootup, the stub space is split into separate stubs per function and
populated with the proper instruction sequence. A pointer to the
associated stub is maintained in dyn_arch_ftrace.
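
A rough sketch of the boot-time stub setup just described; field and
helper names below are illustrative (the actual code added by this patch
lives in arch/powerpc/kernel/trace/ftrace.c), and error handling is
elided:

	/* illustrative sketch: populate one out-of-line stub for a function */
	static int ftrace_init_ool_stub_sketch(struct dyn_ftrace *rec,
					       struct ftrace_ool_stub *stub)
	{
		/* mflr r0 ; bl ftrace_caller ; mtlr r0 ; b <func>+<next insn> */
		patch_instruction(&stub->insn[0], ppc_inst(PPC_RAW_MFLR(_R0)));
		patch_branch(&stub->insn[1], (unsigned long)ftrace_caller, BRANCH_SET_LINK);
		patch_instruction(&stub->insn[2], ppc_inst(PPC_RAW_MTLR(_R0)));
		patch_branch(&stub->insn[3], rec->ip + MCOUNT_INSN_SIZE, 0);

		/* remember which stub belongs to this function */
		rec->arch.ool_stub = stub;	/* illustrative field name */
		return 0;
	}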

For modules, space for ftrace stubs is reserved from the generic module
stub space.

This is restricted to, and enabled by default on, 64-bit powerpc only,
though there are some changes to accommodate 32-bit powerpc. This is
done so that 32-bit powerpc can choose to opt in later based on
further tests and benchmarks.

As an example, after this patch, kernel functions will have a single nop
at function entry:
<kernel_clone>:
	addis	r2,r12,467
	addi	r2,r2,-16028
	nop
	mfocrf	r11,8
	...

When ftrace is enabled, the nop is converted to an unconditional branch
to the stub associated with that function:
<kernel_clone>:
	addis	r2,r12,467
	addi	r2,r2,-16028
	b	ftrace_ool_stub_text_end+0x11b28
	mfocrf	r11,8
	...

The associated stub:
<ftrace_ool_stub_text_end+0x11b28>:
	mflr	r0
	bl	ftrace_caller
	mtlr	r0
	b	kernel_clone+0xc
	...

This change showed an improvement of ~10% in null_syscall benchmark on a
Power 10 system with ftrace enabled.

Signed-off-by: Naveen N Rao 
Signed-off-by: Hari Bathini 
---

Changes in v5:
* Fixed ftrace stack tracer failure due to inadvertent use of
  'add r7, r3, MCOUNT_INSN_SIZE' instruction instead of
  'addi r7, r3, MCOUNT_INSN_SIZE'
* Fixed build error for !CONFIG_MODULES case.
* .vmlinux.arch.* files compiled under arch/powerpc/tools
* Made sure .vmlinux.arch.* files are cleaned with `make clean`


 arch/powerpc/Kbuild|   2 +-
 arch/powerpc/Kconfig   |   5 +
 arch/powerpc/Makefile  |   4 +
 arch/powerpc/include/asm/ftrace.h  |  11 ++
 arch/powerpc/include/asm/module.h  |   5 +
 arch/powerpc/kernel/asm-offsets.c  |   4 +
 arch/powerpc/kernel/module_64.c|  58 +++-
 arch/powerpc/kernel/trace/ftrace.c | 162 +++--
 arch/powerpc/kernel/trace/ftrace_entry.S   | 116 +++
 arch/powerpc/tools/Makefile|  12 ++
 arch/powerpc/tools/ftrace-gen-ool-stubs.sh |  43 ++
 11 files changed, 384 insertions(+), 38 deletions(-)
 create mode 100644 arch/powerpc/tools/Makefile
 create mode 100755 arch/powerpc/tools/ftrace-gen-ool-stubs.sh

diff --git a/arch/powerpc/Kbuild b/arch/powerpc/Kbuild
index 571f260b0842..b010ccb071b6 100644
--- a/arch/powerpc/Kbuild
+++ b/arch/powerpc/Kbuild
@@ -19,4 +19,4 @@ obj-$(CONFIG_KEXEC_CORE)  += kexec/
 obj-$(CONFIG_KEXEC_FILE)  += purgatory/
 
 # for cleaning
-subdir- += boot
+subdir- += boot tools
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index de18f3baff66..bae96b65f295 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -568,6 +568,11 @@ config ARCH_USING_PATCHABLE_FUNCTION_ENTRY
def_bool 
$(success,$(srctree)/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh 
$(CC) -mlittle-endian) if PPC64 && CPU_LITTLE_ENDIAN
def_bool 
$(success,$(srctree)/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh 
$(CC) -mbig-endian) if PPC64 && CPU_BIG_ENDIAN
 
+config PPC_FTRACE_OUT_OF_LINE
+   def_bool PPC64 && ARCH_USING_PATCHABLE_FUNCTION_ENTRY
+   depends on PPC64
+   select ARCH_WANTS_PRE_LINK_VMLINUX
+
 config HOTPLUG_CPU
bool "Support for enabling/disabling CPUs"
depends on SMP && (PPC_PSERIES || \
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index bbfe4a1f06ef..c973e6cd1ae8 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -155,7 +155,11 @@ CC_FLAGS_NO_FPU  

[PATCH v5 13/17] powerpc64/ftrace: Support .text larger than 32MB with out-of-line stubs

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

We are restricted to a .text size of ~32MB when using the out-of-line
function profiling sequence. Allow this to be extended up to the previous
limit of ~64MB by reserving space in the middle of .text.

A new config option CONFIG_PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE is
introduced to specify the number of function stubs that are reserved in
.text. On boot, ftrace utilizes stubs from this area first before using
the stub area at the end of .text.

A ppc64le defconfig has ~44k functions that can be traced. A more
conservative value of 32k functions is chosen as the default value of
PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE so that we do not allot more space
than necessary by default. If building a kernel that only has 32k
trace-able functions, we won't allot any more space at the end of .text
during the pass on vmlinux.o. Otherwise, only the remaining functions
get space for stubs at the end of .text. This default value should help
cover a .text size of ~48MB in total (including space reserved at the
end of .text which can cover up to 32MB), which should be sufficient for
most common builds. For a very small kernel build, this can be set to 0.
Or, this can be bumped up to a larger value to support vmlinux .text
size up to ~64MB.
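
The selection between the two stub areas (the diff below is truncated)
boils down to something like the following sketch, using the symbols
added by this patch; the index handling is simplified here for
illustration:

	/* prefer stubs reserved within .text, then fall back to end of .text */
	if (ool_stub_text_index < ftrace_ool_stub_text_count)
		ool_stub = &ftrace_ool_stub_text[ool_stub_text_index++];
	else
		ool_stub = &ftrace_ool_stub_text_end[ool_stub_text_end_index++];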

Signed-off-by: Naveen N Rao 
Signed-off-by: Hari Bathini 
---

Changes in v5:
* num_ool_stubs_text_end (used for setting up ftrace_ool_stub_text_end) is
  now set to zero instead of being computed to some random negative value
  when not required.

 arch/powerpc/Kconfig   | 12 
 arch/powerpc/include/asm/ftrace.h  |  6 --
 arch/powerpc/kernel/trace/ftrace.c | 21 +
 arch/powerpc/kernel/trace/ftrace_entry.S   |  8 
 arch/powerpc/tools/Makefile|  2 +-
 arch/powerpc/tools/ftrace-gen-ool-stubs.sh | 16 
 6 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index bae96b65f295..a0ce00368bab 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -573,6 +573,18 @@ config PPC_FTRACE_OUT_OF_LINE
depends on PPC64
select ARCH_WANTS_PRE_LINK_VMLINUX
 
+config PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE
+   int "Number of ftrace out-of-line stubs to reserve within .text"
+   default 32768 if PPC_FTRACE_OUT_OF_LINE
+   default 0
+   help
+ Number of stubs to reserve for use by ftrace. This space is
+ reserved within .text, and is distinct from any additional space
+ added at the end of .text before the final vmlinux link. Set to
+ zero to have stubs only be generated at the end of vmlinux (only
+ if the size of vmlinux is less than 32MB). Set to a higher value
+ if building vmlinux larger than 48MB.
+
 config HOTPLUG_CPU
bool "Support for enabling/disabling CPUs"
depends on SMP && (PPC_PSERIES || \
diff --git a/arch/powerpc/include/asm/ftrace.h 
b/arch/powerpc/include/asm/ftrace.h
index bdbafc668b20..28f3590ca780 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -138,8 +138,10 @@ extern unsigned int ftrace_tramp_text[], 
ftrace_tramp_init[];
 struct ftrace_ool_stub {
u32 insn[4];
 };
-extern struct ftrace_ool_stub ftrace_ool_stub_text_end[], 
ftrace_ool_stub_inittext[];
-extern unsigned int ftrace_ool_stub_text_end_count, 
ftrace_ool_stub_inittext_count;
+extern struct ftrace_ool_stub ftrace_ool_stub_text_end[], 
ftrace_ool_stub_text[],
+ ftrace_ool_stub_inittext[];
+extern unsigned int ftrace_ool_stub_text_end_count, ftrace_ool_stub_text_count,
+   ftrace_ool_stub_inittext_count;
 #endif
 void ftrace_free_init_tramp(void);
 unsigned long ftrace_call_adjust(unsigned long addr);
diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index 1fee074388cc..bee2c54a8c04 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -168,7 +168,7 @@ static int ftrace_get_call_inst(struct dyn_ftrace *rec, 
unsigned long addr, ppc_
 static int ftrace_init_ool_stub(struct module *mod, struct dyn_ftrace *rec)
 {
 #ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE
-   static int ool_stub_text_end_index, ool_stub_inittext_index;
+   static int ool_stub_text_index, ool_stub_text_end_index, 
ool_stub_inittext_index;
int ret = 0, ool_stub_count, *ool_stub_index;
ppc_inst_t inst;
/*
@@ -191,9 +191,22 @@ static int ftrace_init_ool_stub(struct module *mod, struct 
dyn_ftrace *rec)
ool_stub_index = &ool_stub_inittext_index;
ool_stub_count = ftrace_ool_stub_inittext_count;
} else if (is_kernel_text(rec->ip)) {
-   ool_stub = ftrace_ool_stub_text_end;
-   ool_stub_index = &ool_stub_text_end_index;
-   ool_stub_count = ftrace_ool_stub_text_end_count;
+   /*
+

[PATCH v5 11/17] kbuild: Add generic hook for architectures to use before the final vmlinux link

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

On powerpc, we would like to be able to make a pass on vmlinux.o and
generate a new object file to be linked into vmlinux. Add a generic pass
in Makefile.vmlinux that architectures can use for this purpose.

Architectures need to select CONFIG_ARCH_WANTS_PRE_LINK_VMLINUX and must
provide arch/<arch>/tools/Makefile with a .arch.vmlinux.o target, which
will be invoked prior to the final vmlinux link step.

Signed-off-by: Naveen N Rao 
Signed-off-by: Hari Bathini 
---

Changes in v5:
* Intermediate files named .vmlinux.arch.* instead of .arch.vmlinux.*


 arch/Kconfig | 6 ++
 scripts/Makefile.vmlinux | 7 +++
 scripts/link-vmlinux.sh  | 7 ++-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 975dd22a2dbd..ef868ff8156a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1643,4 +1643,10 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT
 config ARCH_NEED_CMPXCHG_1_EMU
bool
 
+config ARCH_WANTS_PRE_LINK_VMLINUX
+   def_bool n
+   help
+	  An architecture can select this if it provides arch/<arch>/tools/Makefile
+	  with .arch.vmlinux.o target to be linked into vmlinux.
+
 endmenu
diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux
index 49946cb96844..edf6fae8d960 100644
--- a/scripts/Makefile.vmlinux
+++ b/scripts/Makefile.vmlinux
@@ -22,6 +22,13 @@ targets += .vmlinux.export.o
 vmlinux: .vmlinux.export.o
 endif
 
+ifdef CONFIG_ARCH_WANTS_PRE_LINK_VMLINUX
+vmlinux: arch/$(SRCARCH)/tools/.vmlinux.arch.o
+
+arch/$(SRCARCH)/tools/.vmlinux.arch.o: vmlinux.o
+   $(Q)$(MAKE) $(build)=arch/$(SRCARCH)/tools $@
+endif
+
 ARCH_POSTLINK := $(wildcard $(srctree)/arch/$(SRCARCH)/Makefile.postlink)
 
 # Final link of vmlinux with optional arch pass after final link
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index f7b2503cdba9..b3a940c0e6c2 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -100,7 +100,7 @@ vmlinux_link()
${ld} ${ldflags} -o ${output}   \
${wl}--whole-archive ${objs} ${wl}--no-whole-archive\
${wl}--start-group ${libs} ${wl}--end-group \
-   ${kallsymso} ${btf_vmlinux_bin_o} ${ldlibs}
+   ${kallsymso} ${btf_vmlinux_bin_o} ${arch_vmlinux_o} ${ldlibs}
 }
 
 # generate .BTF typeinfo from DWARF debuginfo
@@ -214,6 +214,11 @@ fi
 
 ${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init 
init/version-timestamp.o
 
+arch_vmlinux_o=""
+if is_enabled CONFIG_ARCH_WANTS_PRE_LINK_VMLINUX; then
+   arch_vmlinux_o=arch/${SRCARCH}/tools/.vmlinux.arch.o
+fi
+
 btf_vmlinux_bin_o=
 kallsymso=
 strip_debug=
-- 
2.46.0




[PATCH v5 10/17] powerpc/ftrace: Add a postlink script to validate function tracer

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

Function tracer on powerpc can only work with vmlinux having a .text
size of up to ~64MB due to powerpc branch instruction having a limited
relative branch range of 32MB. Today, this is only detected at kernel
boot when ftrace is initialized. Add a post-link script to check the size of
.text so that we can detect this at build time, and break the build if
necessary.

We add a dependency on !COMPILE_TEST for CONFIG_HAVE_FUNCTION_TRACER so
that allyesconfig and other test builds can continue to work without
enabling ftrace.

Signed-off-by: Naveen N Rao 
---
 arch/powerpc/Kconfig   |  2 +-
 arch/powerpc/Makefile.postlink |  8 +
 arch/powerpc/tools/ftrace_check.sh | 50 ++
 3 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100755 arch/powerpc/tools/ftrace_check.sh

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1f9d23b276b5..de18f3baff66 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -243,7 +243,7 @@ config PPC
select HAVE_FUNCTION_DESCRIPTORSif PPC64_ELF_ABI_V1
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_TRACER
-   select HAVE_FUNCTION_TRACER if PPC64 || (PPC32 && CC_IS_GCC)
+   select HAVE_FUNCTION_TRACER if !COMPILE_TEST && (PPC64 || 
(PPC32 && CC_IS_GCC))
select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200   # 
plugin support on gcc <= 5.1 is buggy on PPC
select HAVE_GENERIC_VDSO
select HAVE_HARDLOCKUP_DETECTOR_ARCHif PPC_BOOK3S_64 && SMP
diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink
index ae5a4256b03d..bb601be36173 100644
--- a/arch/powerpc/Makefile.postlink
+++ b/arch/powerpc/Makefile.postlink
@@ -24,6 +24,9 @@ else
$(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh 
"$(OBJDUMP)" "$(NM)" "$@"
 endif
 
+quiet_cmd_ftrace_check = CHKFTRC $@
+  cmd_ftrace_check = $(CONFIG_SHELL) 
$(srctree)/arch/powerpc/tools/ftrace_check.sh "$(NM)" "$@"
+
 # `@true` prevents complaint when there is nothing to be done
 
 vmlinux: FORCE
@@ -34,6 +37,11 @@ endif
 ifdef CONFIG_RELOCATABLE
$(call if_changed,relocs_check)
 endif
+ifdef CONFIG_FUNCTION_TRACER
+ifndef CONFIG_PPC64_ELF_ABI_V1
+   $(call cmd,ftrace_check)
+endif
+endif
 
 clean:
rm -f .tmp_symbols.txt
diff --git a/arch/powerpc/tools/ftrace_check.sh 
b/arch/powerpc/tools/ftrace_check.sh
new file mode 100755
index ..f4310e736f1b
--- /dev/null
+++ b/arch/powerpc/tools/ftrace_check.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# This script checks vmlinux to ensure that all functions can call 
ftrace_caller() either directly,
+# or through the stub, ftrace_tramp_text, at the end of kernel text.
+
+# Error out if any command fails
+set -e
+
+# Allow for verbose output
+if [ "$V" = "1" ]; then
+   set -x
+fi
+
+if [ $# -lt 2 ]; then
+   echo "$0 [path to nm] [path to vmlinux]" 1>&2
+   exit 1
+fi
+
+# Have Kbuild supply the path to nm so we handle cross compilation.
+nm="$1"
+vmlinux="$2"
+
+stext_addr=$($nm "$vmlinux" | grep -e " [TA] _stext$" | \
+   cut -d' ' -f1 | tr '[[:lower:]]' '[[:upper:]]')
+ftrace_caller_addr=$($nm "$vmlinux" | grep -e " T ftrace_caller$" | \
+   cut -d' ' -f1 | tr '[[:lower:]]' '[[:upper:]]')
+ftrace_tramp_addr=$($nm "$vmlinux" | grep -e " T ftrace_tramp_text$" | \
+   cut -d' ' -f1 | tr '[[:lower:]]' '[[:upper:]]')
+
+ftrace_caller_offset=$(echo "ibase=16;$ftrace_caller_addr - $stext_addr" | bc)
+ftrace_tramp_offset=$(echo "ibase=16;$ftrace_tramp_addr - $ftrace_caller_addr" 
| bc)
+sz_32m=$(printf "%d" 0x2000000)
+sz_64m=$(printf "%d" 0x4000000)
+
+# ftrace_caller - _stext < 32M
+if [ $ftrace_caller_offset -ge $sz_32m ]; then
+   echo "ERROR: ftrace_caller (0x$ftrace_caller_addr) is beyond 32MiB of 
_stext" 1>&2
+   echo "ERROR: consider disabling CONFIG_FUNCTION_TRACER, or reducing the 
size \
+   of kernel text" 1>&2
+   exit 1
+fi
+
+# ftrace_tramp_text - ftrace_caller < 64M
+if [ $ftrace_tramp_offset -ge $sz_64m ]; then
+   echo "ERROR: kernel text extends beyond 64MiB from ftrace_caller" 1>&2
+   echo "ERROR: consider disabling CONFIG_FUNCTION_TRACER, or reducing the 
size \
+   of kernel text" 1>&2
+   exit 1
+fi
-- 
2.46.0




[PATCH v5 08/17] powerpc/ftrace: Move ftrace stub used for init text before _einittext

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

Move the ftrace stub used to cover inittext before _einittext so that it
is within kernel text, as seen through core_kernel_text(). This is
required for a subsequent change to ftrace.

Signed-off-by: Naveen N Rao 
---
 arch/powerpc/kernel/vmlinux.lds.S | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/vmlinux.lds.S 
b/arch/powerpc/kernel/vmlinux.lds.S
index 7ab4e2fb28b1..b4c9decc7a75 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -265,14 +265,13 @@ SECTIONS
.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
_sinittext = .;
INIT_TEXT
-
+   *(.tramp.ftrace.init);
/*
 *.init.text might be RO so we must ensure this section ends on
 * a page boundary.
 */
. = ALIGN(PAGE_SIZE);
_einittext = .;
-   *(.tramp.ftrace.init);
} :text
 
/* .exit.text is discarded at runtime, not link time,
-- 
2.46.0




[PATCH v5 09/17] powerpc64/bpf: Fold bpf_jit_emit_func_call_hlp() into bpf_jit_emit_func_call_rel()

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

Commit 61688a82e047 ("powerpc/bpf: enable kfunc call") enhanced
bpf_jit_emit_func_call_hlp() to handle calls out to the module region, where
bpf progs are generated. The only difference now between
bpf_jit_emit_func_call_hlp() and bpf_jit_emit_func_call_rel() is in
handling of the initial pass where target function address is not known.
Fold that logic into bpf_jit_emit_func_call_hlp() and rename it to
bpf_jit_emit_func_call_rel() to simplify bpf function call JIT code.

We don't actually need to load/restore TOC across a call out to a
different kernel helper or to a different bpf program since they all
work with the kernel TOC. We only need to do it if we have to call out
to a module function. So, guard TOC load/restore with appropriate
conditions.

Signed-off-by: Naveen N Rao 
---
 arch/powerpc/net/bpf_jit_comp64.c | 61 +--
 1 file changed, 17 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 2cbcdf93cc19..f3be024fc685 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -202,14 +202,22 @@ void bpf_jit_build_epilogue(u32 *image, struct 
codegen_context *ctx)
EMIT(PPC_RAW_BLR());
 }
 
-static int
-bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func)
+int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func)
 {
unsigned long func_addr = func ? ppc_function_entry((void *)func) : 0;
long reladdr;
 
-   if (WARN_ON_ONCE(!kernel_text_address(func_addr)))
-   return -EINVAL;
+   /* bpf to bpf call, func is not known in the initial pass. Emit 5 nops 
as a placeholder */
+   if (!func) {
+   for (int i = 0; i < 5; i++)
+   EMIT(PPC_RAW_NOP());
+   /* elfv1 needs an additional instruction to load addr from 
descriptor */
+   if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1))
+   EMIT(PPC_RAW_NOP());
+   EMIT(PPC_RAW_MTCTR(_R12));
+   EMIT(PPC_RAW_BCTRL());
+   return 0;
+   }
 
 #ifdef CONFIG_PPC_KERNEL_PCREL
reladdr = func_addr - local_paca->kernelbase;
@@ -266,7 +274,8 @@ bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct 
codegen_context *ctx,
 * We can clobber r2 since we get called through a
 * function pointer (so caller will save/restore r2).
 */
-   EMIT(PPC_RAW_LD(_R2, bpf_to_ppc(TMP_REG_2), 8));
+   if (is_module_text_address(func_addr))
+   EMIT(PPC_RAW_LD(_R2, bpf_to_ppc(TMP_REG_2), 8));
} else {
PPC_LI64(_R12, func);
EMIT(PPC_RAW_MTCTR(_R12));
@@ -276,46 +285,14 @@ bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, 
struct codegen_context *ctx,
 * Load r2 with kernel TOC as kernel TOC is used if function 
address falls
 * within core kernel text.
 */
-   EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, 
kernel_toc)));
+   if (is_module_text_address(func_addr))
+   EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, 
kernel_toc)));
}
 #endif
 
return 0;
 }
 
-int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func)
-{
-   unsigned int i, ctx_idx = ctx->idx;
-
-   if (WARN_ON_ONCE(func && is_module_text_address(func)))
-   return -EINVAL;
-
-   /* skip past descriptor if elf v1 */
-   func += FUNCTION_DESCR_SIZE;
-
-   /* Load function address into r12 */
-   PPC_LI64(_R12, func);
-
-   /* For bpf-to-bpf function calls, the callee's address is unknown
-* until the last extra pass. As seen above, we use PPC_LI64() to
-* load the callee's address, but this may optimize the number of
-* instructions required based on the nature of the address.
-*
-* Since we don't want the number of instructions emitted to increase,
-* we pad the optimized PPC_LI64() call with NOPs to guarantee that
-* we always have a five-instruction sequence, which is the maximum
-* that PPC_LI64() can emit.
-*/
-   if (!image)
-   for (i = ctx->idx - ctx_idx; i < 5; i++)
-   EMIT(PPC_RAW_NOP());
-
-   EMIT(PPC_RAW_MTCTR(_R12));
-   EMIT(PPC_RAW_BCTRL());
-
-   return 0;
-}
-
 static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 
out)
 {
/*
@@ -1102,11 +1079,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
u32 *fimage, struct code
if (ret < 0)
return ret;
 
-   if (func_addr_fixed)
-   

[PATCH v5 07/17] powerpc/ftrace: Skip instruction patching if the instructions are the same

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

To simplify upcoming changes to ftrace, add a check to skip actual
instruction patching if the old and new instructions are the same. We
still validate that the instruction is what we expect, but don't
actually patch the same instruction again.

Signed-off-by: Naveen N Rao 
---
 arch/powerpc/kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index fe0546fbac8e..719517265d39 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -82,7 +82,7 @@ static inline int ftrace_modify_code(unsigned long ip, 
ppc_inst_t old, ppc_inst_
 {
int ret = ftrace_validate_inst(ip, old);
 
-   if (!ret)
+   if (!ret && !ppc_inst_equal(old, new))
ret = patch_instruction((u32 *)ip, new);
 
return ret;
-- 
2.46.0




[PATCH v5 06/17] powerpc/ftrace: Remove pointer to struct module from dyn_arch_ftrace

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

Pointer to struct module is only relevant for ftrace records belonging
to kernel modules. Having this field in dyn_arch_ftrace wastes memory
for all ftrace records belonging to the kernel. Remove it in favour of
looking up the module from the ftrace record address, similar to other
architectures.

Reviewed-by: Nicholas Piggin 
Signed-off-by: Naveen N Rao 
---
 arch/powerpc/include/asm/ftrace.h|  1 -
 arch/powerpc/kernel/trace/ftrace.c   | 49 +
 arch/powerpc/kernel/trace/ftrace_64_pg.c | 69 ++--
 3 files changed, 56 insertions(+), 63 deletions(-)

diff --git a/arch/powerpc/include/asm/ftrace.h 
b/arch/powerpc/include/asm/ftrace.h
index 559560286e6d..278d4548e8f1 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -24,7 +24,6 @@ unsigned long prepare_ftrace_return(unsigned long parent, 
unsigned long ip,
 struct module;
 struct dyn_ftrace;
 struct dyn_arch_ftrace {
-   struct module *mod;
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index 8c3e523e4f96..fe0546fbac8e 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -106,28 +106,43 @@ static unsigned long find_ftrace_tramp(unsigned long ip)
return 0;
 }
 
+#ifdef CONFIG_MODULES
+static unsigned long ftrace_lookup_module_stub(unsigned long ip, unsigned long 
addr)
+{
+   struct module *mod = NULL;
+
+   preempt_disable();
+   mod = __module_text_address(ip);
+   preempt_enable();
+
+   if (!mod)
+   pr_err("No module loaded at addr=%lx\n", ip);
+
+   return (addr == (unsigned long)ftrace_caller ? mod->arch.tramp : 
mod->arch.tramp_regs);
+}
+#else
+static unsigned long ftrace_lookup_module_stub(unsigned long ip, unsigned long 
addr)
+{
+   return 0;
+}
+#endif
+
 static int ftrace_get_call_inst(struct dyn_ftrace *rec, unsigned long addr, 
ppc_inst_t *call_inst)
 {
unsigned long ip = rec->ip;
unsigned long stub;
 
-   if (is_offset_in_branch_range(addr - ip)) {
+   if (is_offset_in_branch_range(addr - ip))
/* Within range */
stub = addr;
-#ifdef CONFIG_MODULES
-   } else if (rec->arch.mod) {
-   /* Module code would be going to one of the module stubs */
-   stub = (addr == (unsigned long)ftrace_caller ? 
rec->arch.mod->arch.tramp :
-  
rec->arch.mod->arch.tramp_regs);
-#endif
-   } else if (core_kernel_text(ip)) {
+   else if (core_kernel_text(ip))
/* We would be branching to one of our ftrace stubs */
stub = find_ftrace_tramp(ip);
-   if (!stub) {
-   pr_err("0x%lx: No ftrace stubs reachable\n", ip);
-   return -EINVAL;
-   }
-   } else {
+   else
+   stub = ftrace_lookup_module_stub(ip, addr);
+
+   if (!stub) {
+   pr_err("0x%lx: No ftrace stubs reachable\n", ip);
return -EINVAL;
}
 
@@ -262,14 +277,6 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace 
*rec)
if (ret)
return ret;
 
-   if (!core_kernel_text(ip)) {
-   if (!mod) {
-   pr_err("0x%lx: No module provided for non-kernel 
address\n", ip);
-   return -EFAULT;
-   }
-   rec->arch.mod = mod;
-   }
-
/* Nop-out the ftrace location */
new = ppc_inst(PPC_RAW_NOP());
addr = MCOUNT_ADDR;
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c 
b/arch/powerpc/kernel/trace/ftrace_64_pg.c
index 12fab1803bcf..8a551dfca3d0 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.c
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c
@@ -116,6 +116,20 @@ static unsigned long find_bl_target(unsigned long ip, 
ppc_inst_t op)
 }
 
 #ifdef CONFIG_MODULES
+static struct module *ftrace_lookup_module(struct dyn_ftrace *rec)
+{
+   struct module *mod;
+
+   preempt_disable();
+   mod = __module_text_address(rec->ip);
+   preempt_enable();
+
+   if (!mod)
+   pr_err("No module loaded at addr=%lx\n", rec->ip);
+
+   return mod;
+}
+
 static int
 __ftrace_make_nop(struct module *mod,
  struct dyn_ftrace *rec, unsigned long addr)
@@ -124,6 +138,12 @@ __ftrace_make_nop(struct module *mod,
unsigned long ip = rec->ip;
ppc_inst_t op, pop;
 
+   if (!mod) {
+   mod = ftrace_lookup_module(rec);
+   if (!mod)
+   return -EINVAL;
+   }
+
/* read where this goes */
if (copy_inst_from_kernel_nofault(&op, (void *)ip)) {
pr_err("Fetching opcode failed.\n");
@@ -366,27 +386,6 @@ int ftrace_make_nop(struct module *mod,
return -EINVAL;
}
 
-   /

[PATCH v5 05/17] powerpc/module_64: Convert #ifdef to IS_ENABLED()

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

Minor refactor for converting #ifdef to IS_ENABLED().

Reviewed-by: Nicholas Piggin 
Signed-off-by: Naveen N Rao 
---
 arch/powerpc/kernel/module_64.c | 10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index e9bab599d0c2..1db88409bd95 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -241,14 +241,8 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
}
}
 
-#ifdef CONFIG_DYNAMIC_FTRACE
-   /* make the trampoline to the ftrace_caller */
-   relocs++;
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
-   /* an additional one for ftrace_regs_caller */
-   relocs++;
-#endif
-#endif
+   /* stubs for ftrace_caller and ftrace_regs_caller */
+   relocs += IS_ENABLED(CONFIG_DYNAMIC_FTRACE) + 
IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS);
 
pr_debug("Looks like a total of %lu stubs, max\n", relocs);
return relocs * sizeof(struct ppc64_stub_entry);
-- 
2.46.0




[PATCH v5 04/17] powerpc32/ftrace: Unify 32-bit and 64-bit ftrace entry code

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

On 32-bit powerpc, gcc generates a three instruction sequence for
function profiling:
	mflr	r0
	stw	r0, 4(r1)
	bl	_mcount

On kernel boot, the call to _mcount() is nop-ed out, to be patched back
in when ftrace is actually enabled. The 'stw' instruction therefore is
not necessary unless ftrace is enabled. Nop it out during ftrace init.

When ftrace is enabled, we want the 'stw' so that stack unwinding works
properly. Perform the same within the ftrace handler, similar to 64-bit
powerpc.

Reviewed-by: Nicholas Piggin 
Signed-off-by: Naveen N Rao 
---
 arch/powerpc/kernel/trace/ftrace.c   | 6 --
 arch/powerpc/kernel/trace/ftrace_entry.S | 4 ++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index 2ef504700e8d..8c3e523e4f96 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -240,8 +240,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace 
*rec)
} else if (IS_ENABLED(CONFIG_PPC32)) {
/* Expected sequence: 'mflr r0', 'stw r0,4(r1)', 'bl _mcount' */
ret = ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0)));
-   if (!ret)
-   ret = ftrace_validate_inst(ip - 4, 
ppc_inst(PPC_RAW_STW(_R0, _R1, 4)));
+   if (ret)
+   return ret;
+   ret = ftrace_modify_code(ip - 4, ppc_inst(PPC_RAW_STW(_R0, _R1, 
4)),
+ppc_inst(PPC_RAW_NOP()));
} else if (IS_ENABLED(CONFIG_MPROFILE_KERNEL)) {
/* Expected sequence: 'mflr r0', ['std r0,16(r1)'], 'bl 
_mcount' */
ret = ftrace_read_inst(ip - 4, &old);
diff --git a/arch/powerpc/kernel/trace/ftrace_entry.S 
b/arch/powerpc/kernel/trace/ftrace_entry.S
index 76dbe9fd2c0f..244a1c7bb1e8 100644
--- a/arch/powerpc/kernel/trace/ftrace_entry.S
+++ b/arch/powerpc/kernel/trace/ftrace_entry.S
@@ -33,6 +33,8 @@
  * and then arrange for the ftrace function to be called.
  */
 .macro ftrace_regs_entry allregs
+   /* Save the original return address in A's stack frame */
+   PPC_STL r0, LRSAVE(r1)
/* Create a minimal stack frame for representing B */
PPC_STLUr1, -STACK_FRAME_MIN_SIZE(r1)
 
@@ -44,8 +46,6 @@
SAVE_GPRS(3, 10, r1)
 
 #ifdef CONFIG_PPC64
-   /* Save the original return address in A's stack frame */
-   std r0, LRSAVE+SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE(r1)
/* Ok to continue? */
lbz r3, PACA_FTRACE_ENABLED(r13)
cmpdi   r3, 0
-- 
2.46.0




[PATCH v5 02/17] powerpc/kprobes: Use ftrace to determine if a probe is at function entry

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

Rather than hard-coding the offset into a function that is used to
determine if a kprobe is at function entry, use ftrace_location() to
determine the ftrace location within the function and categorize all
instructions up to that offset as function entry.

For functions that cannot be traced, we fall back to using a fixed
offset of 8 (two instructions) to categorize a probe as being at
function entry for 64-bit elfv2, unless we are using pcrel.

Acked-by: Masami Hiramatsu (Google) 
Signed-off-by: Naveen N Rao 
---
 arch/powerpc/kernel/kprobes.c | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index f8aa91bc3b17..bf382c459e1f 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -105,24 +105,22 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, 
unsigned int offset)
return addr;
 }
 
-static bool arch_kprobe_on_func_entry(unsigned long offset)
+static bool arch_kprobe_on_func_entry(unsigned long addr, unsigned long offset)
 {
-#ifdef CONFIG_PPC64_ELF_ABI_V2
-#ifdef CONFIG_KPROBES_ON_FTRACE
-   return offset <= 16;
-#else
-   return offset <= 8;
-#endif
-#else
+   unsigned long ip = ftrace_location(addr);
+
+   if (ip)
+   return offset <= (ip - addr);
+   if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && 
!IS_ENABLED(CONFIG_PPC_KERNEL_PCREL))
+   return offset <= 8;
return !offset;
-#endif
 }
 
 /* XXX try and fold the magic of kprobe_lookup_name() in this */
 kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long 
offset,
 bool *on_func_entry)
 {
-   *on_func_entry = arch_kprobe_on_func_entry(offset);
+   *on_func_entry = arch_kprobe_on_func_entry(addr, offset);
return (kprobe_opcode_t *)(addr + offset);
 }
 
-- 
2.46.0




[PATCH v5 03/17] powerpc64/ftrace: Nop out additional 'std' instruction emitted by gcc v5.x

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

Gcc v5.x emits a 3-instruction sequence for -mprofile-kernel:
	mflr	r0
	std	r0, 16(r1)
	bl	_mcount

Gcc v6.x moved to a simpler 2-instruction sequence by removing the 'std'
instruction. The store saved the return address in the LR save area in
the caller stack frame for stack unwinding. However, with dynamic
ftrace, we no longer have a call to _mcount on kernel boot when ftrace
is not enabled. When ftrace is enabled, that store is performed within
ftrace_caller(). As such, the additional 'std' instruction is redundant.
Nop it out on kernel boot.

With this change, we now use the same 2-instruction profiling sequence
with both -mprofile-kernel, as well as -fpatchable-function-entry on
64-bit powerpc.

Signed-off-by: Naveen N Rao 
---
 arch/powerpc/kernel/trace/ftrace.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index d8d6b4fd9a14..2ef504700e8d 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -246,8 +246,12 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace 
*rec)
/* Expected sequence: 'mflr r0', ['std r0,16(r1)'], 'bl 
_mcount' */
ret = ftrace_read_inst(ip - 4, &old);
	if (!ret && !ppc_inst_equal(old, ppc_inst(PPC_RAW_MFLR(_R0)))) {
+   /* Gcc v5.x emit the additional 'std' instruction, gcc 
v6.x don't */
ret = ftrace_validate_inst(ip - 8, 
ppc_inst(PPC_RAW_MFLR(_R0)));
-   ret |= ftrace_validate_inst(ip - 4, 
ppc_inst(PPC_RAW_STD(_R0, _R1, 16)));
+   if (ret)
+   return ret;
+   ret = ftrace_modify_code(ip - 4, 
ppc_inst(PPC_RAW_STD(_R0, _R1, 16)),
+ppc_inst(PPC_RAW_NOP()));
}
} else {
return -EINVAL;
-- 
2.46.0




[PATCH v5 00/17] powerpc: Core ftrace rework, support for ftrace direct and bpf trampolines

2024-09-15 Thread Hari Bathini
This is v5 of the series posted here:
https://lore.kernel.org/all/cover.1720942106.git.nav...@kernel.org/

This series reworks core ftrace support on powerpc to have the function
profiling sequence moved out of line. This enables us to have a single
nop at kernel function entry, virtually eliminating the effect of the
function tracer when it is not enabled. The function profiling sequence is
moved out of line and is allocated at two separate places depending on a
new config option.

For 64-bit powerpc, the function profiling sequence is also updated to
include an additional instruction 'mtlr r0' after the usual
two-instruction sequence to fix link stack imbalance (return address
predictor) when ftrace is enabled. This showed an improvement of ~10%
in null_syscall benchmark (NR_LOOPS=1000) on a Power 10 system
with ftrace enabled.

Finally, support for ftrace direct calls is added based on support for
DYNAMIC_FTRACE_WITH_CALL_OPS. BPF Trampoline support is added atop this.

Support for ftrace direct calls is added for 32-bit powerpc. There is
some code to enable bpf trampolines for 32-bit powerpc, but it is not
complete and will need to be pursued separately.

Patches 1 to 10 are independent of the rest of this series and can go in
separately. The rest of the patches depend on the series from Benjamin Gray
adding support for patch_uint() and patch_ulong():
https://lore.kernel.org/all/172474280311.31690.1489687786264785049.b4...@ellerman.id.au/

Changelog v5:
* Intermediate files named .vmlinux.arch.* instead of .arch.vmlinux.*
* Fixed ftrace stack tracer failure due to inadvertent use of
  'add r7, r3, MCOUNT_INSN_SIZE' instruction instead of
  'addi r7, r3, MCOUNT_INSN_SIZE'
* Fixed build error for !CONFIG_MODULES case.
* .vmlinux.arch.* files compiled under arch/powerpc/tools
* Made sure .vmlinux.arch.* files are cleaned with `make clean`
* num_ool_stubs_text_end used for setting up ftrace_ool_stub_text_end
  set to zero instead of computing to some random negative value when
  not required.
* Resolved checkpatch.pl warnings.
* Dropped RFC tag.

Changelog v4:
- Patches 1, 10 and 13 are new.
- Address review comments from Nick. Numerous changes throughout the
  patch series.
- Extend support for ftrace ool to vmlinux text up to 64MB (patch 13).
- Address remaining TODOs in support for BPF Trampolines.
- Update synchronization when patching instructions during trampoline
  attach/detach.


Naveen N Rao (17):
  powerpc/trace: Account for -fpatchable-function-entry support by
toolchain
  powerpc/kprobes: Use ftrace to determine if a probe is at function
entry
  powerpc64/ftrace: Nop out additional 'std' instruction emitted by gcc
v5.x
  powerpc32/ftrace: Unify 32-bit and 64-bit ftrace entry code
  powerpc/module_64: Convert #ifdef to IS_ENABLED()
  powerpc/ftrace: Remove pointer to struct module from dyn_arch_ftrace
  powerpc/ftrace: Skip instruction patching if the instructions are the
same
  powerpc/ftrace: Move ftrace stub used for init text before _einittext
  powerpc64/bpf: Fold bpf_jit_emit_func_call_hlp() into
bpf_jit_emit_func_call_rel()
  powerpc/ftrace: Add a postlink script to validate function tracer
  kbuild: Add generic hook for architectures to use before the final
vmlinux link
  powerpc64/ftrace: Move ftrace sequence out of line
  powerpc64/ftrace: Support .text larger than 32MB with out-of-line
stubs
  powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS
  powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS
  samples/ftrace: Add support for ftrace direct samples on powerpc
  powerpc64/bpf: Add support for bpf trampolines

 arch/Kconfig|   6 +
 arch/powerpc/Kbuild |   2 +-
 arch/powerpc/Kconfig|  23 +-
 arch/powerpc/Makefile   |   8 +
 arch/powerpc/Makefile.postlink  |   8 +
 arch/powerpc/include/asm/ftrace.h   |  33 +-
 arch/powerpc/include/asm/module.h   |   5 +
 arch/powerpc/include/asm/ppc-opcode.h   |  14 +
 arch/powerpc/kernel/asm-offsets.c   |  11 +
 arch/powerpc/kernel/kprobes.c   |  18 +-
 arch/powerpc/kernel/module_64.c |  66 +-
 arch/powerpc/kernel/trace/Makefile  |  11 +-
 arch/powerpc/kernel/trace/ftrace.c  | 298 ++-
 arch/powerpc/kernel/trace/ftrace_64_pg.c|  69 +-
 arch/powerpc/kernel/trace/ftrace_entry.S| 244 --
 arch/powerpc/kernel/vmlinux.lds.S   |   3 +-
 arch/powerpc/net/bpf_jit.h  |  12 +
 arch/powerpc/net/bpf_jit_comp.c | 847 +++-
 arch/powerpc/net/bpf_jit_comp32.c   |   7 +-
 arch/powerpc/net/bpf_jit_comp64.c   |  68 +-
 arch/powerpc/tools/Makefile |  12 +
 arch/powerpc/tools/ftrace-gen-ool-stubs.sh  |  52 ++
 arch/powerpc/tools/ftrace_check.sh  |  50 ++
 samples/ftrace/ftrace-direct-modify.c   |  85 +-
 samples/ftrace/ftrace-direct-multi-modify.c 

[PATCH v5 01/17] powerpc/trace: Account for -fpatchable-function-entry support by toolchain

2024-09-15 Thread Hari Bathini
From: Naveen N Rao 

So far, we have relied on the fact that gcc supports both
-mprofile-kernel and -fpatchable-function-entry, while clang
supports neither. Our Makefile only checks for CONFIG_MPROFILE_KERNEL to
decide which files to build. Clang has a feature request out [*] to
implement -fpatchable-function-entry, and is unlikely to support
-mprofile-kernel.

Update our Makefile checks so that we pick up the correct files to build
once clang picks up support for -fpatchable-function-entry.

[*] https://github.com/llvm/llvm-project/issues/57031

Signed-off-by: Naveen N Rao 
---
 arch/powerpc/kernel/trace/Makefile | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/trace/Makefile 
b/arch/powerpc/kernel/trace/Makefile
index 125f4ca588b9..d6c3885453bd 100644
--- a/arch/powerpc/kernel/trace/Makefile
+++ b/arch/powerpc/kernel/trace/Makefile
@@ -9,12 +9,15 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_ftrace_64_pg.o = $(CC_FLAGS_FTRACE)
 endif
 
-obj32-$(CONFIG_FUNCTION_TRACER)+= ftrace.o ftrace_entry.o
-ifdef CONFIG_MPROFILE_KERNEL
-obj64-$(CONFIG_FUNCTION_TRACER)+= ftrace.o ftrace_entry.o
+ifdef CONFIG_FUNCTION_TRACER
+obj32-y+= ftrace.o ftrace_entry.o
+ifeq ($(CONFIG_MPROFILE_KERNEL)$(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY),)
+obj64-y+= ftrace_64_pg.o 
ftrace_64_pg_entry.o
 else
-obj64-$(CONFIG_FUNCTION_TRACER)+= ftrace_64_pg.o 
ftrace_64_pg_entry.o
+obj64-y+= ftrace.o ftrace_entry.o
+endif
 endif
+
 obj-$(CONFIG_TRACING)  += trace_clock.o
 
 obj-$(CONFIG_PPC64)+= $(obj64-y)
-- 
2.46.0




Re: [PATCH v4 2/5] powerpc/code-patching: Add data patch alignment check

2024-08-20 Thread Hari Bathini




On 15/05/24 8:14 am, Benjamin Gray wrote:

The new data patching still needs to be aligned within a
cacheline too for the flushes to work correctly. To simplify
this requirement, we just say data patches must be aligned.

Detect when data patching is not aligned, returning an invalid
argument error.

Signed-off-by: Benjamin Gray 


Reviewed-by: Hari Bathini 



---

v3: * New in v3
---
  arch/powerpc/include/asm/code-patching.h | 6 ++
  arch/powerpc/lib/code-patching.c | 6 ++
  2 files changed, 12 insertions(+)

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index 21a36e2c4e26..e7f14720f630 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -95,11 +95,17 @@ int patch_ulong(void *addr, unsigned long val);
  
  static inline int patch_uint(void *addr, unsigned int val)

  {
+   if (!IS_ALIGNED((unsigned long)addr, sizeof(unsigned int)))
+   return -EINVAL;
+
return patch_instruction(addr, ppc_inst(val));
  }
  
  static inline int patch_ulong(void *addr, unsigned long val)

  {
+   if (!IS_ALIGNED((unsigned long)addr, sizeof(unsigned long)))
+   return -EINVAL;
+
return patch_instruction(addr, ppc_inst(val));
  }
  
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c

index 18f762616db9..384b275d1bc5 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -386,12 +386,18 @@ NOKPROBE_SYMBOL(patch_instruction);
  
  int patch_uint(void *addr, unsigned int val)

  {
+   if (!IS_ALIGNED((unsigned long)addr, sizeof(unsigned int)))
+   return -EINVAL;
+
return patch_mem(addr, val, false);
  }
  NOKPROBE_SYMBOL(patch_uint);
  
  int patch_ulong(void *addr, unsigned long val)

  {
+   if (!IS_ALIGNED((unsigned long)addr, sizeof(unsigned long)))
+   return -EINVAL;
+
return patch_mem(addr, val, true);
  }
  NOKPROBE_SYMBOL(patch_ulong);




Re: [PATCH v4 1/5] powerpc/code-patching: Add generic memory patching

2024-08-20 Thread Hari Bathini




On 15/05/24 8:14 am, Benjamin Gray wrote:

patch_instruction() is designed for patching instructions in otherwise
readonly memory. Other consumers also sometimes need to patch readonly
memory, so have abused patch_instruction() for arbitrary data patches.

This is a problem on ppc64 as patch_instruction() decides on the patch
width using the 'instruction' opcode to see if it's a prefixed
instruction. Data that triggers this can lead to larger writes, possibly
crossing a page boundary and failing the write altogether.

Introduce patch_uint(), and patch_ulong(), with aliases patch_u32(), and
patch_u64() (on ppc64) designed for aligned data patches. The patch
size is now determined by the called function, and is passed as an
additional parameter to generic internals.

While the instruction flushing is not required for data patches, it
remains unconditional in this patch. A followup series is possible if
benchmarking shows fewer flushes gives an improvement in some
data-patching workload.

ppc32 does not support prefixed instructions, so is unaffected by the
original issue. Care is taken in not exposing the size parameter in the
public (non-static) interface, so the compiler can const-propagate it
away.
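
As an illustration, a hypothetical kernel-side caller of the new interface
might look like the sketch below. The stub being patched and its layout are
made up; only patch_u32()/patch_ulong() and their alignment requirement come
from this series (a later patch in the series rejects unaligned addresses
with -EINVAL).

  #include <linux/types.h>
  #include <asm/code-patching.h>

  /* Hypothetical example: update two fields of a stub in read-only memory. */
  static int example_patch_stub(u32 *magic, unsigned long *target,
                                unsigned long new_target)
  {
          int err;

          /* Both addresses must be naturally aligned for their access size. */
          err = patch_u32(magic, 0x73747562);     /* "stub" */
          if (err)
                  return err;

          return patch_ulong(target, new_target);
  }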

Signed-off-by: Benjamin Gray 


Reviewed-by: Hari Bathini 



---

v3: * Rename from *_memory to *_mem
 * Change type of ppc32 patch_uint() address to void*
 * Explain introduction of val32 for big endian
 * Some formatting

v2: * Deduplicate patch_32() definition
 * Use u32 for val32
 * Remove noinline
---
  arch/powerpc/include/asm/code-patching.h | 31 
  arch/powerpc/lib/code-patching.c | 64 ++--
  2 files changed, 80 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index 0e29ccf903d0..21a36e2c4e26 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -76,6 +76,37 @@ int patch_instruction(u32 *addr, ppc_inst_t instr);
  int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
  int patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr);
  
+/*

+ * The data patching functions patch_uint() and patch_ulong(), etc., must be
+ * called on aligned addresses.
+ *
+ * The instruction patching functions patch_instruction() and similar must be
+ * called on addresses satisfying instruction alignment requirements.
+ */
+
+#ifdef CONFIG_PPC64
+
+int patch_uint(void *addr, unsigned int val);
+int patch_ulong(void *addr, unsigned long val);
+
+#define patch_u64 patch_ulong
+
+#else
+
+static inline int patch_uint(void *addr, unsigned int val)
+{
+   return patch_instruction(addr, ppc_inst(val));
+}
+
+static inline int patch_ulong(void *addr, unsigned long val)
+{
+   return patch_instruction(addr, ppc_inst(val));
+}
+
+#endif
+
+#define patch_u32 patch_uint
+
  static inline unsigned long patch_site_addr(s32 *site)
  {
return (unsigned long)site + *site;
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index df64343b9214..18f762616db9 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -20,15 +20,14 @@
  #include 
  #include 
  
-static int __patch_instruction(u32 *exec_addr, ppc_inst_t instr, u32 *patch_addr)

+static int __patch_mem(void *exec_addr, unsigned long val, void *patch_addr, 
bool is_dword)
  {
-   if (!ppc_inst_prefixed(instr)) {
-   u32 val = ppc_inst_val(instr);
+   if (!IS_ENABLED(CONFIG_PPC64) || likely(!is_dword)) {
+   /* For big endian correctness: plain address would use the 
wrong half */
+   u32 val32 = val;
  
-		__put_kernel_nofault(patch_addr, &val, u32, failed);

+   __put_kernel_nofault(patch_addr, &val32, u32, failed);
} else {
-   u64 val = ppc_inst_as_ulong(instr);
-
__put_kernel_nofault(patch_addr, &val, u64, failed);
}
  
@@ -44,7 +43,10 @@ static int __patch_instruction(u32 *exec_addr, ppc_inst_t instr, u32 *patch_addr
  
  int raw_patch_instruction(u32 *addr, ppc_inst_t instr)

  {
-   return __patch_instruction(addr, instr, addr);
+   if (ppc_inst_prefixed(instr))
+   return __patch_mem(addr, ppc_inst_as_ulong(instr), addr, true);
+   else
+   return __patch_mem(addr, ppc_inst_val(instr), addr, false);
  }
  
  struct patch_context {

@@ -276,7 +278,7 @@ static void unmap_patch_area(unsigned long addr)
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
  }
  
-static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr)

+static int __do_patch_mem_mm(void *addr, unsigned long val, bool is_dword)
  {
int err;
u32 *patch_addr;
@@ -305,7 +307,7 @@ static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t 
instr)
  
  	orig_mm = start_using_temp_mm(patching_mm);
  
-	err = __patch_instruction(addr, instr

Re: [PATCH v4 5/5] powerpc/code-patching: Add boot selftest for data patching

2024-08-20 Thread Hari Bathini




On 15/05/24 8:14 am, Benjamin Gray wrote:

Extend the code patching selftests with some basic coverage of the new
data patching variants too.

Signed-off-by: Benjamin Gray 


Reviewed-by: Hari Bathini 



---

v4: * Change store to a check
 * Account for doubleword alignment
v3: * New in v3
---
  arch/powerpc/lib/test-code-patching.c | 41 +++
  1 file changed, 41 insertions(+)

diff --git a/arch/powerpc/lib/test-code-patching.c 
b/arch/powerpc/lib/test-code-patching.c
index f76030087f98..8cd3b32f805b 100644
--- a/arch/powerpc/lib/test-code-patching.c
+++ b/arch/powerpc/lib/test-code-patching.c
@@ -438,6 +438,46 @@ static void __init test_multi_instruction_patching(void)
vfree(buf);
  }
  
+static void __init test_data_patching(void)

+{
+   void *buf;
+   u32 *addr32;
+
+   buf = vzalloc(PAGE_SIZE);
+   check(buf);
+   if (!buf)
+   return;
+
+   addr32 = buf + 128;
+
+   addr32[1] = 0xA0A1A2A3;
+   addr32[2] = 0xB0B1B2B3;
+
+   check(!patch_uint(&addr32[1], 0xC0C1C2C3));
+
+   check(addr32[0] == 0);
+   check(addr32[1] == 0xC0C1C2C3);
+   check(addr32[2] == 0xB0B1B2B3);
+   check(addr32[3] == 0);
+
+   /* Unaligned patch_ulong() should fail */
+   if (IS_ENABLED(CONFIG_PPC64))
+   check(patch_ulong(&addr32[1], 0xD0D1D2D3) == -EINVAL);
+
+   check(!patch_ulong(&addr32[2], 0xD0D1D2D3));
+
+   check(addr32[0] == 0);
+   check(addr32[1] == 0xC0C1C2C3);
+   check(*(unsigned long *)(&addr32[2]) == 0xD0D1D2D3);
+
+   if (!IS_ENABLED(CONFIG_PPC64))
+   check(addr32[3] == 0);
+
+   check(addr32[4] == 0);
+
+   vfree(buf);
+}
+
  static int __init test_code_patching(void)
  {
pr_info("Running code patching self-tests ...\n");
@@ -448,6 +488,7 @@ static int __init test_code_patching(void)
test_translate_branch();
test_prefixed_patching();
test_multi_instruction_patching();
+   test_data_patching();
  
  	return 0;

  }




Re: [PATCH v4 4/5] powerpc/32: Convert patch_instruction() to patch_uint()

2024-08-20 Thread Hari Bathini




On 15/05/24 8:14 am, Benjamin Gray wrote:

These changes are for patch_instruction() uses on data. Unlike ppc64
these should not be incorrect as-is, but using the patch_uint() alias
better reflects what kind of data is being patched and allows for
benchmarking the effect of different patch_* implementations (e.g.,
skipping instruction flushing when patching data).

Signed-off-by: Benjamin Gray 


Tested-by: Hari Bathini 


---
  arch/powerpc/kernel/static_call.c | 2 +-
  arch/powerpc/platforms/powermac/smp.c | 2 +-
  2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/static_call.c 
b/arch/powerpc/kernel/static_call.c
index 863a7aa24650..1502b7e439ca 100644
--- a/arch/powerpc/kernel/static_call.c
+++ b/arch/powerpc/kernel/static_call.c
@@ -17,7 +17,7 @@ void arch_static_call_transform(void *site, void *tramp, void 
*func, bool tail)
mutex_lock(&text_mutex);
  
  	if (func && !is_short) {

-   err = patch_instruction(tramp + PPC_SCT_DATA, ppc_inst(target));
+   err = patch_ulong(tramp + PPC_SCT_DATA, target);
if (err)
goto out;
}
diff --git a/arch/powerpc/platforms/powermac/smp.c 
b/arch/powerpc/platforms/powermac/smp.c
index 15644be31990..d21b681f52fb 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -827,7 +827,7 @@ static int smp_core99_kick_cpu(int nr)
mdelay(1);
  
  	/* Restore our exception vector */

-   patch_instruction(vector, ppc_inst(save_vector));
+   patch_uint(vector, save_vector);
  
  	local_irq_restore(flags);

if (ppc_md.progress) ppc_md.progress("smp_core99_kick_cpu done", 0x347);




Re: [PATCH v4 3/5] powerpc/64: Convert patch_instruction() to patch_u32()

2024-08-19 Thread Hari Bathini




On 15/05/24 8:14 am, Benjamin Gray wrote:

This use of patch_instruction() is working on 32 bit data, and can fail
if the data looks like a prefixed instruction and the extra write
crosses a page boundary. Use patch_u32() to fix the write size.

Fixes: 8734b41b3efe ("powerpc/module_64: Fix livepatching for RO modules")
Link: https://lore.kernel.org/all/20230203004649.1f59dbd4@yea/
Signed-off-by: Benjamin Gray 


Tested-by: Hari Bathini 



---

v2: * Added the fixes tag, it seems appropriate even if the subject does
   mention a more robust solution being required.

patch_u64() should be more efficient, but judging from the bug report
it doesn't seem like the data is doubleword aligned.
---
  arch/powerpc/kernel/module_64.c | 5 ++---
  1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 7112adc597a8..e9bab599d0c2 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -651,12 +651,11 @@ static inline int create_stub(const Elf64_Shdr *sechdrs,
// func_desc_t is 8 bytes if ABIv2, else 16 bytes
desc = func_desc(addr);
for (i = 0; i < sizeof(func_desc_t) / sizeof(u32); i++) {
-   if (patch_instruction(((u32 *)&entry->funcdata) + i,
- ppc_inst(((u32 *)(&desc))[i])))
+   if (patch_u32(((u32 *)&entry->funcdata) + i, ((u32 *)&desc)[i]))
return 0;
}
  
-	if (patch_instruction(&entry->magic, ppc_inst(STUB_MAGIC)))

+   if (patch_u32(&entry->magic, STUB_MAGIC))
return 0;
  
  	return 1;




Re: [PATCH 2/2] MAINTAINERS: Update powerpc BPF JIT maintainers

2024-07-16 Thread Hari Bathini




On 14/07/24 2:04 pm, Naveen N Rao wrote:

Hari Bathini has been updating and maintaining the powerpc BPF JIT for
a while now. Christophe Leroy has been doing the same for 32-bit
powerpc. Add them as maintainers for the powerpc BPF JIT.

I am no longer actively looking into the powerpc BPF JIT. Change my role
to that of a reviewer so that I can help with the odd query.

Signed-off-by: Naveen N Rao 


Acked-by: Hari Bathini 


---
  MAINTAINERS | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 05f14b67cd74..c7a931ee7a2e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3878,8 +3878,10 @@ S:   Odd Fixes
  F:drivers/net/ethernet/netronome/nfp/bpf/
  
  BPF JIT for POWERPC (32-BIT AND 64-BIT)

-M: Naveen N Rao 
  M:Michael Ellerman 
+M: Hari Bathini 
+M: Christophe Leroy 
+R: Naveen N Rao 
  L:b...@vger.kernel.org
  S:Supported
  F:arch/powerpc/net/


[PATCH v3] radix/kfence: map __kfence_pool at page granularity

2024-07-01 Thread Hari Bathini
When KFENCE is enabled, total system memory is mapped at page level
granularity. But in radix MMU mode, ~3GB additional memory is needed
to map 100GB of system memory at page level granularity when compared
to using 2MB direct mapping. This is not desired considering KFENCE is
designed to be enabled in production kernels [1]. Also, mapping only
the memory allocated for KFENCE pool at page granularity is sufficient
to enable KFENCE support. So, allocate __kfence_pool during bootup and
map it at page granularity instead of mapping all system memory at
page granularity.

Without patch:
# cat /proc/meminfo
MemTotal:   101201920 kB

With patch:
# cat /proc/meminfo
MemTotal:   104483904 kB

Note that enabling KFENCE at runtime is disabled for radix MMU for now,
as it depends on the ability to split page table mappings and such APIs
are not currently implemented for radix MMU.

All kfence_test.c testcases passed with this patch.

[1] https://lore.kernel.org/all/20201103175841.3495947-2-el...@google.com/

Signed-off-by: Hari Bathini 
---

Changes in v3:
* Updated the changelog and contained changes relevant for radix MMU
  within radix-pgtable.c as suggested by mpe.

Changes in v2:
* Dropped the patch that adds support to enable KFENCE after startup.
* Added changes to avoid KFENCE enablement after system startup.
* Also, added a TODO explaining why KFENCE enablement after startup
  is not supported for now.
* Functions to alloc/map __kfence_pool as suggested by Ritesh.
* Moved changes that apply to ppc32 as well to common file as suggested
  by Christophe.

 
 arch/powerpc/include/asm/kfence.h| 11 +++-
 arch/powerpc/mm/book3s64/radix_pgtable.c | 84 ++--
 arch/powerpc/mm/init-common.c|  3 +
 3 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kfence.h 
b/arch/powerpc/include/asm/kfence.h
index 424ceef82ae6..fab124ada1c7 100644
--- a/arch/powerpc/include/asm/kfence.h
+++ b/arch/powerpc/include/asm/kfence.h
@@ -15,10 +15,19 @@
 #define ARCH_FUNC_PREFIX "."
 #endif
 
+#ifdef CONFIG_KFENCE
+extern bool kfence_disabled;
+
+static inline void disable_kfence(void)
+{
+   kfence_disabled = true;
+}
+
 static inline bool arch_kfence_init_pool(void)
 {
-   return true;
+   return !kfence_disabled;
 }
+#endif
 
 #ifdef CONFIG_PPC64
 static inline bool kfence_protect_page(unsigned long addr, bool protect)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 15e88f1439ec..b0d927009af8 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -31,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -293,7 +295,8 @@ static unsigned long next_boundary(unsigned long addr, 
unsigned long end)
 
 static int __meminit create_physical_mapping(unsigned long start,
 unsigned long end,
-int nid, pgprot_t _prot)
+int nid, pgprot_t _prot,
+unsigned long mapping_sz_limit)
 {
unsigned long vaddr, addr, mapping_size = 0;
bool prev_exec, exec = false;
@@ -301,7 +304,10 @@ static int __meminit create_physical_mapping(unsigned long 
start,
int psize;
unsigned long max_mapping_size = memory_block_size;
 
-   if (debug_pagealloc_enabled_or_kfence())
+   if (mapping_sz_limit < max_mapping_size)
+   max_mapping_size = mapping_sz_limit;
+
+   if (debug_pagealloc_enabled())
max_mapping_size = PAGE_SIZE;
 
start = ALIGN(start, PAGE_SIZE);
@@ -356,8 +362,74 @@ static int __meminit create_physical_mapping(unsigned long 
start,
return 0;
 }
 
+#ifdef CONFIG_KFENCE
+static bool __ro_after_init kfence_early_init = 
!!CONFIG_KFENCE_SAMPLE_INTERVAL;
+
+static int __init parse_kfence_early_init(char *arg)
+{
+   int val;
+
+   if (get_option(&arg, &val))
+   kfence_early_init = !!val;
+   return 0;
+}
+early_param("kfence.sample_interval", parse_kfence_early_init);
+
+static inline phys_addr_t alloc_kfence_pool(void)
+{
+   phys_addr_t kfence_pool;
+
+   /*
+* TODO: Support to enable KFENCE after bootup depends on the ability to
+*   split page table mappings. As such support is not currently
+*   implemented for radix pagetables, support enabling KFENCE
+*   only at system startup for now.
+*
+*   After support for splitting mappings is available on radix,
+*   alloc_kfence_pool() & map_kfence_pool() can be dropped and
+*   mapping for __kfence_pool memory can be
+*   split during arch_kfence_init_pool().
+  

[PATCH v2] radix/kfence: map __kfence_pool at page granularity

2024-06-19 Thread Hari Bathini
When KFENCE is enabled, total system memory is mapped at page level
granularity. But in radix MMU mode, ~3GB additional memory is needed
to map 100GB of system memory at page level granularity when compared
to using 2MB direct mapping. This is not desired considering KFENCE is
designed to be enabled in production kernels [1]. Also, mapping memory
allocated for KFENCE pool at page granularity seems sufficient
to enable KFENCE support. So, allocate __kfence_pool during bootup and
map it at page granularity instead of mapping all system memory at
page granularity.

Without patch:
# cat /proc/meminfo
MemTotal:   101201920 kB

With patch:
# cat /proc/meminfo
MemTotal:   104483904 kB

Note that enabling KFENCE at runtime is disabled for radix MMU for now,
as it depends on the ability to split page table mappings and such APIs
are not currently implemented for radix MMU.

All kfence_test.c testcases passed with this patch.

[1] https://lore.kernel.org/all/20201103175841.3495947-2-el...@google.com/

Signed-off-by: Hari Bathini 
---

Changes in v2:
* Dropped the patch that adds support to enable KFENCE after startup.
* Added changes to avoid KFENCE enablement after system startup.
* Also, added a TODO explaining why KFENCE enablement after startup
  is not supported for now.
* Functions to alloc/map __kfence_pool as suggested by Ritesh.
* Moved changes that apply to ppc32 as well to common file as suggested
  by Christophe.


 arch/powerpc/include/asm/kfence.h| 12 +++-
 arch/powerpc/mm/book3s64/radix_pgtable.c | 74 ++--
 arch/powerpc/mm/init-common.c| 14 +
 3 files changed, 95 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kfence.h 
b/arch/powerpc/include/asm/kfence.h
index 424ceef82ae6..78590288ee80 100644
--- a/arch/powerpc/include/asm/kfence.h
+++ b/arch/powerpc/include/asm/kfence.h
@@ -15,10 +15,20 @@
 #define ARCH_FUNC_PREFIX "."
 #endif
 
+#ifdef CONFIG_KFENCE
+extern bool kfence_early_init;
+extern bool kfence_disabled;
+
+static inline void disable_kfence(void)
+{
+   kfence_disabled = true;
+}
+
 static inline bool arch_kfence_init_pool(void)
 {
-   return true;
+   return !kfence_disabled;
 }
+#endif
 
 #ifdef CONFIG_PPC64
 static inline bool kfence_protect_page(unsigned long addr, bool protect)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 15e88f1439ec..a74912e0fd99 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -31,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -293,7 +295,8 @@ static unsigned long next_boundary(unsigned long addr, 
unsigned long end)
 
 static int __meminit create_physical_mapping(unsigned long start,
 unsigned long end,
-int nid, pgprot_t _prot)
+int nid, pgprot_t _prot,
+unsigned long mapping_sz_limit)
 {
unsigned long vaddr, addr, mapping_size = 0;
bool prev_exec, exec = false;
@@ -301,7 +304,10 @@ static int __meminit create_physical_mapping(unsigned long 
start,
int psize;
unsigned long max_mapping_size = memory_block_size;
 
-   if (debug_pagealloc_enabled_or_kfence())
+   if (mapping_sz_limit < max_mapping_size)
+   max_mapping_size = mapping_sz_limit;
+
+   if (debug_pagealloc_enabled())
max_mapping_size = PAGE_SIZE;
 
start = ALIGN(start, PAGE_SIZE);
@@ -356,8 +362,64 @@ static int __meminit create_physical_mapping(unsigned long 
start,
return 0;
 }
 
+#ifdef CONFIG_KFENCE
+static inline phys_addr_t radix_alloc_kfence_pool_early(void)
+{
+   phys_addr_t kfence_pool;
+
+   /*
+* TODO: Support to enable KFENCE after bootup depends on the ability to
+*   split page table mappings. As such support is not currently
+*   implemented for radix pagetables, support enabling KFENCE
+*   only at system startup for now.
+*
+*   After support for splitting mappings is available on radix,
+*   radix_alloc_kfence_pool_early() & radix_map_kfence_pool_early()
+*   can be dropped and mapping for __kfence_pool memory can be
+*   split during arch_kfence_init_pool().
+*/
+   if (!kfence_early_init)
+   goto no_kfence;
+
+   kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+   if (!kfence_pool)
+   goto no_kfence;
+
+   memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
+   return kfence_pool;
+
+no_kfence:
+   disable_kfence();
+   return 0;
+}
+
+static inline void radix_map_kfence_pool_early(phys_a

Re: [PATCH 2/5] powerpc64/bpf: jit support for unconditional byte swap

2024-05-22 Thread Hari Bathini




On 17/05/24 1:26 pm, Artem Savkov wrote:

Add jit support for unconditional byte swap. Tested using BSWAP tests
from test_bpf module.

Signed-off-by: Artem Savkov 
---
  arch/powerpc/net/bpf_jit_comp64.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 3071205782b15..97191cf091bbf 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -699,11 +699,12 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
u32 *fimage, struct code
 */
case BPF_ALU | BPF_END | BPF_FROM_LE:
case BPF_ALU | BPF_END | BPF_FROM_BE:



+   case BPF_ALU64 | BPF_END | BPF_FROM_LE:


A comment here indicating this case does unconditional swap
could improve readability.
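
For reference only, the semantics of the unconditional swap can be sketched
in plain C with compiler builtins (this is not the JIT code, just what the
new case computes for imm = 16/32/64):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint64_t v = 0x0011223344556677ULL;

          /* The swap is applied regardless of host endianness. */
          printf("16: 0x%x\n",   __builtin_bswap16((uint16_t)v));
          printf("32: 0x%x\n",   __builtin_bswap32((uint32_t)v));
          printf("64: 0x%llx\n", (unsigned long long)__builtin_bswap64(v));
          return 0;
  }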

Other than this minor nit, the patchset looks good to me.
Also, tested the changes with test_bpf module and selftests.
For the series..

Reviewed-by: Hari Bathini 


  #ifdef __BIG_ENDIAN__
if (BPF_SRC(code) == BPF_FROM_BE)
goto emit_clear;
  #else /* !__BIG_ENDIAN__ */
-   if (BPF_SRC(code) == BPF_FROM_LE)
+   if (BPF_CLASS(code) == BPF_ALU && BPF_SRC(code) == 
BPF_FROM_LE)
goto emit_clear;
  #endif
switch (imm) {


[PATCH] powerpc/fadump: update documentation about bootargs_append

2024-05-10 Thread Hari Bathini
Update ABI documentation about the introduction of the new sysfs
entry bootargs_append. This sysfs entry will be used to set up the
additional parameters to be passed to the dump capture kernel.

Signed-off-by: Hari Bathini 
---

* This patch is a follow-up to the below patch series, updating the
  corresponding ABI documentation:

https://lore.kernel.org/all/20240509115755.519982-1-hbath...@linux.ibm.com/

 Documentation/ABI/testing/sysfs-kernel-fadump | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-kernel-fadump 
b/Documentation/ABI/testing/sysfs-kernel-fadump
index c586054657d6..2f9daa7ca55b 100644
--- a/Documentation/ABI/testing/sysfs-kernel-fadump
+++ b/Documentation/ABI/testing/sysfs-kernel-fadump
@@ -49,3 +49,10 @@ Description: read only
memory add/remove events because elfcorehdr is now prepared in
the second/fadump kernel.
 User:  kexec-tools
+
+What:  /sys/kernel/fadump/bootargs_append
+Date:  May 2024
+Contact:   linuxppc-dev@lists.ozlabs.org
+Description:   read/write
+   This is a special sysfs file available to setup additional
+   parameters to be passed to capture kernel.
-- 
2.45.0



[PATCH] powerpc/85xx: fix compile error without CONFIG_CRASH_DUMP

2024-05-10 Thread Hari Bathini
Since commit 5c4233cc0920 ("powerpc/kdump: Split KEXEC_CORE and
CRASH_DUMP dependency"), crashing_cpu is not available without
CONFIG_CRASH_DUMP. Fix the compile error on 64-bit 85xx owing to this
change.

Cc: sta...@vger.kernel.org
Fixes: 5c4233cc0920 ("powerpc/kdump: Split KEXEC_CORE and CRASH_DUMP 
dependency")
Reported-by: Christian Zigotzky 
Closes: 
https://lore.kernel.org/all/fa247ae4-5825-4dbe-a737-d93b7ab4d...@xenosoft.de/
Suggested-by: Michael Ellerman 
Signed-off-by: Hari Bathini 
---
 arch/powerpc/platforms/85xx/smp.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/85xx/smp.c 
b/arch/powerpc/platforms/85xx/smp.c
index 40aa58206888..e52b848b64b7 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -398,6 +398,7 @@ static void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, 
int secondary)
hard_irq_disable();
mpic_teardown_this_cpu(secondary);
 
+#ifdef CONFIG_CRASH_DUMP
if (cpu == crashing_cpu && cpu_thread_in_core(cpu) != 0) {
/*
 * We enter the crash kernel on whatever cpu crashed,
@@ -406,9 +407,11 @@ static void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, 
int secondary)
 */
disable_threadbit = 1;
disable_cpu = cpu_first_thread_sibling(cpu);
-   } else if (sibling != crashing_cpu &&
-  cpu_thread_in_core(cpu) == 0 &&
-  cpu_thread_in_core(sibling) != 0) {
+   } else if (sibling == crashing_cpu) {
+   return;
+   }
+#endif
+   if (cpu_thread_in_core(cpu) == 0 && cpu_thread_in_core(sibling) != 0) {
disable_threadbit = 2;
disable_cpu = sibling;
}
-- 
2.45.0



[PATCH v2 3/3] powerpc/fadump: pass additional parameters when fadump is active

2024-05-09 Thread Hari Bathini
Append the additional parameters passed/set in the dedicated parameter
area (RTAS_FADUMP_PARAM_AREA) to bootargs in the fadump capture kernel.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump.h |  2 ++
 arch/powerpc/kernel/fadump.c  | 35 +++
 arch/powerpc/kernel/prom.c|  3 +++
 3 files changed, 40 insertions(+)

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 526a6a647312..ef40c9b6972a 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -19,12 +19,14 @@ extern int is_fadump_active(void);
 extern int should_fadump_crash(void);
 extern void crash_fadump(struct pt_regs *, const char *);
 extern void fadump_cleanup(void);
+extern void fadump_append_bootargs(void);
 
 #else  /* CONFIG_FA_DUMP */
 static inline int is_fadump_active(void) { return 0; }
 static inline int should_fadump_crash(void) { return 0; }
 static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
 static inline void fadump_cleanup(void) { }
+static inline void fadump_append_bootargs(void) { }
 #endif /* !CONFIG_FA_DUMP */
 
 #if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 6d35b09d6f3a..2276bacc4170 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -131,6 +131,41 @@ static int __init fadump_cma_init(void)
 static int __init fadump_cma_init(void) { return 1; }
 #endif /* CONFIG_CMA */
 
+/*
+ * Additional parameters meant for capture kernel are placed in a dedicated 
area.
+ * If this is capture kernel boot, append these parameters to bootargs.
+ */
+void __init fadump_append_bootargs(void)
+{
+   char *append_args;
+   size_t len;
+
+   if (!fw_dump.dump_active || !fw_dump.param_area_supported || 
!fw_dump.param_area)
+   return;
+
+   if (fw_dump.param_area >= fw_dump.boot_mem_top) {
+   if (memblock_reserve(fw_dump.param_area, COMMAND_LINE_SIZE)) {
+   pr_warn("WARNING: Can't use additional parameters 
area!\n");
+   fw_dump.param_area = 0;
+   return;
+   }
+   }
+
+   append_args = (char *)fw_dump.param_area;
+   len = strlen(boot_command_line);
+
+   /*
+* Too late to fail even if cmdline size exceeds. Truncate additional 
parameters
+* to cmdline size and proceed anyway.
+*/
+   if (len + strlen(append_args) >= COMMAND_LINE_SIZE - 1)
+   pr_warn("WARNING: Appending parameters exceeds cmdline size. 
Truncating!\n");
+
+   pr_debug("Cmdline: %s\n", boot_command_line);
+   snprintf(boot_command_line + len, COMMAND_LINE_SIZE - len, " %s", 
append_args);
+   pr_info("Updated cmdline: %s\n", boot_command_line);
+}
+
 /* Scan the Firmware Assisted dump configuration details. */
 int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
  int depth, void *data)
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index eb140ea6b6ff..60819751e55e 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -813,6 +813,9 @@ void __init early_init_devtree(void *params)
 */
of_scan_flat_dt(early_init_dt_scan_chosen_ppc, boot_command_line);
 
+   /* Append additional parameters passed for fadump capture kernel */
+   fadump_append_bootargs();
+
/* Scan memory nodes and rebuild MEMBLOCKs */
early_init_dt_scan_root();
early_init_dt_scan_memory_ppc();
-- 
2.45.0



[PATCH v2 2/3] powerpc/fadump: setup additional parameters for dump capture kernel

2024-05-09 Thread Hari Bathini
For the fadump case, passing additional parameters to the dump capture
kernel helps minimize its memory footprint and also provides the
flexibility to disable components/modules, like hugepages, that hinder
the boot process of the special dump capture environment.

Set up a dedicated parameter area to be passed to the capture kernel.
This area type is defined as RTAS_FADUMP_PARAM_AREA. The sysfs attribute
'/sys/kernel/fadump/bootargs_append' is exported to userspace to
specify the additional parameters to be passed to the capture kernel.
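
As a usage sketch, a userspace helper could set the additional parameters as
below; the parameter string is only an example, and the sysfs path is the one
introduced by this patch:

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          /* Example parameters to trim the capture kernel's footprint. */
          const char *params = "nr_cpus=1 numa=off";
          int fd = open("/sys/kernel/fadump/bootargs_append", O_WRONLY);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }
          if (write(fd, params, strlen(params)) < 0)
                  perror("write");
          close(fd);
          return 0;
  }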

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump-internal.h   |  3 +
 arch/powerpc/kernel/fadump.c | 87 
 arch/powerpc/platforms/powernv/opal-fadump.c |  6 +-
 arch/powerpc/platforms/pseries/rtas-fadump.c | 35 +++-
 arch/powerpc/platforms/pseries/rtas-fadump.h | 11 ++-
 5 files changed, 133 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h
index 35787fa1ac60..e83869a4eb6a 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -124,6 +124,8 @@ struct fw_dump {
unsigned long   cpu_notes_buf_vaddr;
unsigned long   cpu_notes_buf_size;
 
+   unsigned long   param_area;
+
/*
 * Maximum size supported by firmware to copy from source to
 * destination address per entry.
@@ -138,6 +140,7 @@ struct fw_dump {
unsigned long   dump_active:1;
unsigned long   dump_registered:1;
unsigned long   nocma:1;
+   unsigned long   param_area_supported:1;
 
struct fadump_ops   *ops;
 };
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index fe6be00451b9..6d35b09d6f3a 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1431,6 +1431,43 @@ static ssize_t registered_show(struct kobject *kobj,
return sprintf(buf, "%d\n", fw_dump.dump_registered);
 }
 
+static ssize_t bootargs_append_show(struct kobject *kobj,
+  struct kobj_attribute *attr,
+  char *buf)
+{
+   return sprintf(buf, "%s\n", (char *)__va(fw_dump.param_area));
+}
+
+static ssize_t bootargs_append_store(struct kobject *kobj,
+  struct kobj_attribute *attr,
+  const char *buf, size_t count)
+{
+   char *params;
+
+   if (!fw_dump.fadump_enabled || fw_dump.dump_active)
+   return -EPERM;
+
+   if (count >= COMMAND_LINE_SIZE)
+   return -EINVAL;
+
+   /*
+* Fail here instead of handling this scenario with
+* some silly workaround in capture kernel.
+*/
+   if (saved_command_line_len + count >= COMMAND_LINE_SIZE) {
+   pr_err("Appending parameters exceeds cmdline size!\n");
+   return -ENOSPC;
+   }
+
+   params = __va(fw_dump.param_area);
+   strscpy_pad(params, buf, COMMAND_LINE_SIZE);
+   /* Remove newline character at the end. */
+   if (params[count-1] == '\n')
+   params[count-1] = '\0';
+
+   return count;
+}
+
 static ssize_t registered_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
@@ -1490,6 +1527,7 @@ static struct kobj_attribute enable_attr = 
__ATTR_RO(enabled);
 static struct kobj_attribute register_attr = __ATTR_RW(registered);
 static struct kobj_attribute mem_reserved_attr = __ATTR_RO(mem_reserved);
 static struct kobj_attribute hotplug_ready_attr = __ATTR_RO(hotplug_ready);
+static struct kobj_attribute bootargs_append_attr = __ATTR_RW(bootargs_append);
 
 static struct attribute *fadump_attrs[] = {
&enable_attr.attr,
@@ -1663,6 +1701,54 @@ static void __init fadump_process(void)
fadump_invalidate_release_mem();
 }
 
+/*
+ * Reserve memory to store additional parameters to be passed
+ * for fadump/capture kernel.
+ */
+static void fadump_setup_param_area(void)
+{
+   phys_addr_t range_start, range_end;
+
+   if (!fw_dump.param_area_supported || fw_dump.dump_active)
+   return;
+
+   /* This memory can't be used by PFW or bootloader as it is shared 
across kernels */
+   if (radix_enabled()) {
+   /*
+* Anywhere in the upper half should be good enough as all 
memory
+* is accessible in real mode.
+*/
+   range_start = memblock_end_of_DRAM() / 2;
+   range_end = memblock_end_of_DRAM();
+   } else {
+   /*
+* Passing additional parameters is supported for hash MMU only
+* if the first memory block size is 768MB or higher.
+*/
+   

[PATCH v2 1/3] powerpc/pseries/fadump: add support for multiple boot memory regions

2024-05-09 Thread Hari Bathini
Currently, fadump on pseries assumes a single boot memory region even
though f/w supports more than one boot memory region. Add support for
more boot memory regions to make the implementation flexible for any
enhancements that introduce other region types. For this, the rtas memory
structure for fadump is updated to have multiple boot memory regions
instead of just one. Additionally, methods responsible for creating
the fadump memory structure during both the first and second kernel
boot have been modified to take these multiple boot memory regions
into account. Also, a new callback has been added to the fadump_ops
structure to get the maximum boot memory regions supported by the
platform.

Signed-off-by: Sourabh Jain 
Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump-internal.h   |   2 +-
 arch/powerpc/kernel/fadump.c |  27 +-
 arch/powerpc/platforms/powernv/opal-fadump.c |   7 +
 arch/powerpc/platforms/pseries/rtas-fadump.c | 255 +--
 arch/powerpc/platforms/pseries/rtas-fadump.h |  26 +-
 5 files changed, 197 insertions(+), 120 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h
index 5d706a7acc8a..35787fa1ac60 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -156,6 +156,7 @@ struct fadump_ops {
  struct seq_file *m);
void(*fadump_trigger)(struct fadump_crash_info_header *fdh,
  const char *msg);
+   int (*fadump_max_boot_mem_rgns)(void);
 };
 
 /* Helper functions */
@@ -163,7 +164,6 @@ s32 __init fadump_setup_cpu_notes_buf(u32 num_cpus);
 void fadump_free_cpu_notes_buf(void);
 u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs);
 void __init fadump_update_elfcore_header(char *bufp);
-bool is_fadump_boot_mem_contiguous(void);
 bool is_fadump_reserved_mem_contiguous(void);
 
 #else /* !CONFIG_PRESERVE_FA_DUMP */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 0b849563393e..fe6be00451b9 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -220,28 +220,6 @@ static bool is_fadump_mem_area_contiguous(u64 d_start, u64 
d_end)
return ret;
 }
 
-/*
- * Returns true, if there are no holes in boot memory area,
- * false otherwise.
- */
-bool is_fadump_boot_mem_contiguous(void)
-{
-   unsigned long d_start, d_end;
-   bool ret = false;
-   int i;
-
-   for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
-   d_start = fw_dump.boot_mem_addr[i];
-   d_end   = d_start + fw_dump.boot_mem_sz[i];
-
-   ret = is_fadump_mem_area_contiguous(d_start, d_end);
-   if (!ret)
-   break;
-   }
-
-   return ret;
-}
-
 /*
  * Returns true, if there are no holes in reserved memory area,
  * false otherwise.
@@ -381,10 +359,11 @@ static unsigned long __init get_fadump_area_size(void)
 static int __init add_boot_mem_region(unsigned long rstart,
  unsigned long rsize)
 {
+   int max_boot_mem_rgns = fw_dump.ops->fadump_max_boot_mem_rgns();
int i = fw_dump.boot_mem_regs_cnt++;
 
-   if (fw_dump.boot_mem_regs_cnt > FADUMP_MAX_MEM_REGS) {
-   fw_dump.boot_mem_regs_cnt = FADUMP_MAX_MEM_REGS;
+   if (fw_dump.boot_mem_regs_cnt > max_boot_mem_rgns) {
+   fw_dump.boot_mem_regs_cnt = max_boot_mem_rgns;
return 0;
}
 
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c 
b/arch/powerpc/platforms/powernv/opal-fadump.c
index 767a6b19e42a..5a88d7efb48a 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -599,6 +599,12 @@ static void opal_fadump_trigger(struct 
fadump_crash_info_header *fdh,
pr_emerg("No backend support for MPIPL!\n");
 }
 
+/* FADUMP_MAX_MEM_REGS or lower */
+static int opal_fadump_max_boot_mem_rgns(void)
+{
+   return FADUMP_MAX_MEM_REGS;
+}
+
 static struct fadump_ops opal_fadump_ops = {
.fadump_init_mem_struct = opal_fadump_init_mem_struct,
.fadump_get_metadata_size   = opal_fadump_get_metadata_size,
@@ -611,6 +617,7 @@ static struct fadump_ops opal_fadump_ops = {
.fadump_process = opal_fadump_process,
.fadump_region_show = opal_fadump_region_show,
.fadump_trigger = opal_fadump_trigger,
+   .fadump_max_boot_mem_rgns   = opal_fadump_max_boot_mem_rgns,
 };
 
 void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c 
b/arch/powerpc/platforms/pseries/rtas-fadump.c
index 214f37788b2d..4db78b2bb2a8 100644
--- a/arch/powerpc/platforms/pseries/rtas-fadump.c
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -29,9 +29,6 

[PATCH v2 0/3] powerpc/fadump: pass additional args to dump capture kernel

2024-05-09 Thread Hari Bathini
While fadump is a more reliable alternative to the kdump dump capturing
method, it doesn't support passing additional parameters. Having
such support is desirable for two major reasons:

  1. It helps minimize the memory consumption of the fadump dump capture
     kernel by disabling features that consume a considerable amount of
     memory but have little significance for the dump capture environment
     (e.g. numa, cma, cgroup, etc.)
  2. It helps disable features/components in the dump capture kernel
     that are unstable and/or are being debugged.

This patch series is a follow-up to [1]. It adds support for passing
additional parameters to the fadump capture kernel to make fadump more
desirable. For this, a dedicated area is passed between the production
kernel and the capture kernel to carry these additional parameters. This
support is enabled only on pseries as of now. The dedicated area is
referred to as RTAS_FADUMP_PARAM_AREA.

In radix MMU mode, this dedicated area can be anywhere, but in the case
of hash MMU, it can only be in the first memory block so that it is
accessible during early boot. The feature is enabled in both radix and
hash MMU modes, but for hash MMU only when the RMA size is 768MB or
more, to avoid complex memory real estate with FW components.

The first patch adds support for multiple boot memory regions to make
the addition of any new region types simpler. The second patch sets up
the parameter (dedicated) area to be passed to the capture kernel.
/sys/kernel/fadump/bootargs_append is exported to userspace to
specify the additional parameters to be passed to the capture kernel.
The last patch appends the parameters to bootargs during capture
kernel boot.

Changes in v2:
* RFC tag removed.
* Moved variable declaration out of switch case.
* Zero'ed the parameter area while setting up.
* Reserving the parameter area only when needed.

[1] 
https://lore.kernel.org/linuxppc-dev/20231205201835.388030-1-hbath...@linux.ibm.com/

Hari Bathini (3):
  powerpc/pseries/fadump: add support for multiple boot memory regions
  powerpc/fadump: setup additional parameters for dump capture kernel
  powerpc/fadump: pass additional parameters when fadump is active

 arch/powerpc/include/asm/fadump-internal.h   |   5 +-
 arch/powerpc/include/asm/fadump.h|   2 +
 arch/powerpc/kernel/fadump.c | 149 --
 arch/powerpc/kernel/prom.c   |   3 +
 arch/powerpc/platforms/powernv/opal-fadump.c |  13 +-
 arch/powerpc/platforms/pseries/rtas-fadump.c | 290 +--
 arch/powerpc/platforms/pseries/rtas-fadump.h |  29 +-
 7 files changed, 366 insertions(+), 125 deletions(-)

-- 
2.45.0



[PATCH v4 1/2] powerpc64/bpf: fix tail calls for PCREL addressing

2024-05-02 Thread Hari Bathini
With PCREL addressing, there is no kernel TOC. So, it is not set up in
the prologue when PCREL addressing is used. But the number of instructions
to skip on a tail call was not adjusted accordingly. That resulted in
not-so-obvious failures while using tailcalls. The 'tailcalls' selftest
crashed the system with the below call trace:

  bpf_test_run+0xe8/0x3cc (unreliable)
  bpf_prog_test_run_skb+0x348/0x778
  __sys_bpf+0xb04/0x2b00
  sys_bpf+0x28/0x38
  system_call_exception+0x168/0x340
  system_call_vectored_common+0x15c/0x2ec

Also, as bpf programs are always at module addresses and a bpf helper in
general is a core kernel text address, using PC relative addressing
often fails with an "out of range of pcrel address" error. Switch to
using kernel base for relative addressing to handle this better.
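
To illustrate why kernel base is a better reference point, here is a
standalone sketch of the 34-bit (+/- 8GB) reachability check; the addresses
are made-up stand-ins for a BPF image in the module/vmalloc area and a
helper in core kernel text:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define SZ_8G   (8ULL << 30)

  static bool paddi_reachable(uint64_t base, uint64_t target)
  {
          int64_t reladdr = (int64_t)(target - base);

          return reladdr < (int64_t)SZ_8G && reladdr >= -(int64_t)SZ_8G;
  }

  int main(void)
  {
          uint64_t kernelbase = 0xc000000000000000ULL;    /* illustrative */
          uint64_t helper     = kernelbase + 0x1200000;   /* core kernel text */
          uint64_t bpf_image  = 0xc008000000200000ULL;    /* module/vmalloc area */

          /* Relative to the BPF program's address, a helper is often out of range... */
          printf("from BPF image:  %d\n", paddi_reachable(bpf_image, helper));
          /* ...but relative to kernel base it is well within +/- 8GB. */
          printf("from kernelbase: %d\n", paddi_reachable(kernelbase, helper));
          return 0;
  }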

Fixes: 7e3a68be42e1 ("powerpc/64: vmlinux support building with PCREL 
addresing")
Cc: sta...@vger.kernel.org
Signed-off-by: Hari Bathini 
---

* Changes in v4:
  - Fix out of range errors by switching to kernelbase instead of PC
for relative addressing.

* Changes in v3:
  - New patch to fix tailcall issues with PCREL addressing.


 arch/powerpc/net/bpf_jit_comp64.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 79f23974a320..4de08e35e284 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -202,7 +202,8 @@ void bpf_jit_build_epilogue(u32 *image, struct 
codegen_context *ctx)
EMIT(PPC_RAW_BLR());
 }
 
-static int bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, 
u64 func)
+static int
+bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func)
 {
unsigned long func_addr = func ? ppc_function_entry((void *)func) : 0;
long reladdr;
@@ -211,19 +212,20 @@ static int bpf_jit_emit_func_call_hlp(u32 *image, struct 
codegen_context *ctx, u
return -EINVAL;
 
if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) {
-   reladdr = func_addr - CTX_NIA(ctx);
+   reladdr = func_addr - local_paca->kernelbase;
 
if (reladdr >= (long)SZ_8G || reladdr < -(long)SZ_8G) {
-   pr_err("eBPF: address of %ps out of range of pcrel 
address.\n",
-   (void *)func);
+   pr_err("eBPF: address of %ps out of range of 34-bit 
relative address.\n",
+  (void *)func);
return -ERANGE;
}
-   /* pla r12,addr */
-   EMIT(PPC_PREFIX_MLS | __PPC_PRFX_R(1) | IMM_H18(reladdr));
-   EMIT(PPC_INST_PADDI | ___PPC_RT(_R12) | IMM_L(reladdr));
-   EMIT(PPC_RAW_MTCTR(_R12));
-   EMIT(PPC_RAW_BCTR());
-
+   EMIT(PPC_RAW_LD(_R12, _R13, offsetof(struct paca_struct, 
kernelbase)));
+   /* Align for subsequent prefix instruction */
+   if (!IS_ALIGNED((unsigned long)fimage + CTX_NIA(ctx), 8))
+   EMIT(PPC_RAW_NOP());
+   /* paddi r12,r12,addr */
+   EMIT(PPC_PREFIX_MLS | __PPC_PRFX_R(0) | IMM_H18(reladdr));
+   EMIT(PPC_INST_PADDI | ___PPC_RT(_R12) | ___PPC_RA(_R12) | 
IMM_L(reladdr));
} else {
reladdr = func_addr - kernel_toc_addr();
if (reladdr > 0x7FFF || reladdr < -(0x8000L)) {
@@ -233,9 +235,9 @@ static int bpf_jit_emit_func_call_hlp(u32 *image, struct 
codegen_context *ctx, u
 
EMIT(PPC_RAW_ADDIS(_R12, _R2, PPC_HA(reladdr)));
EMIT(PPC_RAW_ADDI(_R12, _R12, PPC_LO(reladdr)));
-   EMIT(PPC_RAW_MTCTR(_R12));
-   EMIT(PPC_RAW_BCTRL());
}
+   EMIT(PPC_RAW_MTCTR(_R12));
+   EMIT(PPC_RAW_BCTRL());
 
return 0;
 }
@@ -285,7 +287,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct 
codegen_context *ctx, u32 o
int b2p_index = bpf_to_ppc(BPF_REG_3);
int bpf_tailcall_prologue_size = 8;
 
-   if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2))
+   if (!IS_ENABLED(CONFIG_PPC_KERNEL_PCREL) && 
IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2))
bpf_tailcall_prologue_size += 4; /* skip past the toc load */
 
/*
@@ -993,7 +995,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 
*fimage, struct code
return ret;
 
if (func_addr_fixed)
-   ret = bpf_jit_emit_func_call_hlp(image, ctx, 
func_addr);
+   ret = bpf_jit_emit_func_call_hlp(image, fimage, 
ctx, func_addr);
else
ret = bpf_jit_emit_func_call_rel(image, fimage, 
ctx, func_addr);
 
-- 
2.44.0



[PATCH v4 2/2] powerpc/bpf: enable kfunc call

2024-05-02 Thread Hari Bathini
Currently, bpf jit code on powerpc assumes all the bpf functions and
helpers to be part of core kernel text. This is false for the kfunc case,
as function addresses may not be part of the core kernel text area. So,
add support for addresses that are not within the core kernel text area
too, to enable kfunc support. Emit instructions based on whether the
function address is within the core kernel text area or not, to retain
the optimized instruction sequence where possible.

In the case of PCREL, as a bpf function that is not within the core
kernel text area is likely to go out of range with relative addressing
on kernel base, use PC relative addressing. If that goes out of range,
load the full address with PPC_LI64().

With addresses that are not within the core kernel text area supported,
override bpf_jit_supports_kfunc_call() to enable kfunc support. Also,
override bpf_jit_supports_far_kfunc_call() to enable 64-bit pointers,
as an address offset can be more than 32 bits long on PPC64.
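
For context, a kfunc call from the BPF program side looks roughly like the
sketch below (libbpf conventions, compiled with clang for the bpf target).
The kfunc name and signature are hypothetical, purely to show the __ksym
declaration that ends up as a call the JIT must emit:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* Hypothetical kfunc: any kfunc exported by the running kernel would do. */
  extern int bpf_example_kfunc(int arg) __ksym;

  SEC("tc")
  int call_example_kfunc(struct __sk_buff *skb)
  {
          /* The kfunc's address may be in a module, outside core kernel text. */
          return bpf_example_kfunc(skb->len);
  }

  char _license[] SEC("license") = "GPL";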

Signed-off-by: Hari Bathini 
---

* Changes in v4:
  - Use either kernelbase or PC for relative addressing. Also, fallback
to PPC_LI64(), if both are out of range.
  - Update r2 with kernel TOC for elfv1 too as elfv1 also uses the
optimization sequence, that expects r2 to be kernel TOC, when
function address is within core kernel text.

* Changes in v3:
  - Retained optimized instruction sequence when function address is
a core kernel address as suggested by Naveen.
  - Used unoptimized instruction sequence for PCREL addressing to
avoid out of range errors for core kernel function addresses.
  - Folded patch that adds support for kfunc calls with patch that
enables/advertises this support as suggested by Naveen.


 arch/powerpc/net/bpf_jit_comp.c   | 10 +
 arch/powerpc/net/bpf_jit_comp64.c | 61 ++-
 2 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 0f9a21783329..984655419da5 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -359,3 +359,13 @@ void bpf_jit_free(struct bpf_prog *fp)
 
bpf_prog_unlock_free(fp);
 }
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+   return true;
+}
+
+bool bpf_jit_supports_far_kfunc_call(void)
+{
+   return IS_ENABLED(CONFIG_PPC64);
+}
diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 4de08e35e284..8afc14a4a125 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -208,17 +208,13 @@ bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, 
struct codegen_context *ctx,
unsigned long func_addr = func ? ppc_function_entry((void *)func) : 0;
long reladdr;
 
-   if (WARN_ON_ONCE(!core_kernel_text(func_addr)))
+   if (WARN_ON_ONCE(!kernel_text_address(func_addr)))
return -EINVAL;
 
-   if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) {
-   reladdr = func_addr - local_paca->kernelbase;
+#ifdef CONFIG_PPC_KERNEL_PCREL
+   reladdr = func_addr - local_paca->kernelbase;
 
-   if (reladdr >= (long)SZ_8G || reladdr < -(long)SZ_8G) {
-   pr_err("eBPF: address of %ps out of range of 34-bit 
relative address.\n",
-  (void *)func);
-   return -ERANGE;
-   }
+   if (reladdr < (long)SZ_8G && reladdr >= -(long)SZ_8G) {
EMIT(PPC_RAW_LD(_R12, _R13, offsetof(struct paca_struct, 
kernelbase)));
/* Align for subsequent prefix instruction */
if (!IS_ALIGNED((unsigned long)fimage + CTX_NIA(ctx), 8))
@@ -227,6 +223,26 @@ bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct 
codegen_context *ctx,
EMIT(PPC_PREFIX_MLS | __PPC_PRFX_R(0) | IMM_H18(reladdr));
EMIT(PPC_INST_PADDI | ___PPC_RT(_R12) | ___PPC_RA(_R12) | 
IMM_L(reladdr));
} else {
+   unsigned long pc = (unsigned long)fimage + CTX_NIA(ctx);
+   bool alignment_needed = !IS_ALIGNED(pc, 8);
+
+   reladdr = func_addr - (alignment_needed ? pc + 4 :  pc);
+
+   if (reladdr < (long)SZ_8G && reladdr >= -(long)SZ_8G) {
+   if (alignment_needed)
+   EMIT(PPC_RAW_NOP());
+   /* pla r12,addr */
+   EMIT(PPC_PREFIX_MLS | __PPC_PRFX_R(1) | 
IMM_H18(reladdr));
+   EMIT(PPC_INST_PADDI | ___PPC_RT(_R12) | IMM_L(reladdr));
+   } else {
+   /* We can clobber r12 */
+   PPC_LI64(_R12, func);
+   }
+   }
+   EMIT(PPC_RAW_MTCTR(_R12));
+   EMIT(PPC_RAW_BCTRL());
+#else
+   if (core_kernel_text(func_addr)) {
reladdr = func_addr - kernel_toc_addr();
if (reladdr > 0x7FFF || reladdr < -(0x8000L)) {

[PATCH 1/2] radix/kfence: map __kfence_pool at page granularity

2024-04-24 Thread Hari Bathini
When KFENCE is enabled, total system memory is mapped at page level
granularity. But in radix MMU mode, ~3GB additional memory is needed
to map 100GB of system memory at page level granularity when compared
to using 2MB direct mapping. This is not desired considering KFENCE is
designed to be enabled in production kernels [1]. Also, mapping memory
allocated for KFENCE pool at page granularity seems sufficient
to enable KFENCE support. So, allocate __kfence_pool during bootup and
map it at page granularity instead of mapping all system memory at
page granularity.

Without patch:
# cat /proc/meminfo
MemTotal:   101201920 kB

With patch:
# cat /proc/meminfo
MemTotal:   104483904 kB

All kfence_test.c testcases passed with this patch.

[1] https://lore.kernel.org/all/20201103175841.3495947-2-el...@google.com/

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/kfence.h|  5 
 arch/powerpc/mm/book3s64/radix_pgtable.c | 34 ++--
 arch/powerpc/mm/init_64.c| 14 ++
 3 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/kfence.h 
b/arch/powerpc/include/asm/kfence.h
index 424ceef82ae6..18ec2b06ba1e 100644
--- a/arch/powerpc/include/asm/kfence.h
+++ b/arch/powerpc/include/asm/kfence.h
@@ -8,6 +8,7 @@
 #ifndef __ASM_POWERPC_KFENCE_H
 #define __ASM_POWERPC_KFENCE_H
 
+#include 
 #include 
 #include 
 
@@ -15,6 +16,10 @@
 #define ARCH_FUNC_PREFIX "."
 #endif
 
+#ifdef CONFIG_KFENCE
+extern bool kfence_early_init;
+#endif
+
 static inline bool arch_kfence_init_pool(void)
 {
return true;
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 15e88f1439ec..fccbf92f279b 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -291,9 +292,8 @@ static unsigned long next_boundary(unsigned long addr, 
unsigned long end)
return end;
 }
 
-static int __meminit create_physical_mapping(unsigned long start,
-unsigned long end,
-int nid, pgprot_t _prot)
+static int __meminit create_physical_mapping(unsigned long start, unsigned 
long end, int nid,
+pgprot_t _prot, unsigned long 
mapping_sz_limit)
 {
unsigned long vaddr, addr, mapping_size = 0;
bool prev_exec, exec = false;
@@ -301,7 +301,10 @@ static int __meminit create_physical_mapping(unsigned long 
start,
int psize;
unsigned long max_mapping_size = memory_block_size;
 
-   if (debug_pagealloc_enabled_or_kfence())
+   if (mapping_sz_limit < max_mapping_size)
+   max_mapping_size = mapping_sz_limit;
+
+   if (debug_pagealloc_enabled())
max_mapping_size = PAGE_SIZE;
 
start = ALIGN(start, PAGE_SIZE);
@@ -358,6 +361,7 @@ static int __meminit create_physical_mapping(unsigned long 
start,
 
 static void __init radix_init_pgtable(void)
 {
+   phys_addr_t kfence_pool __maybe_unused;
unsigned long rts_field;
phys_addr_t start, end;
u64 i;
@@ -365,6 +369,13 @@ static void __init radix_init_pgtable(void)
/* We don't support slb for radix */
slb_set_size(0);
 
+#ifdef CONFIG_KFENCE
+   if (kfence_early_init) {
+   kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+   memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
+   }
+#endif
+
/*
 * Create the linear mapping
 */
@@ -380,10 +391,18 @@ static void __init radix_init_pgtable(void)
continue;
}
 
-   WARN_ON(create_physical_mapping(start, end,
-   -1, PAGE_KERNEL));
+   WARN_ON(create_physical_mapping(start, end, -1, PAGE_KERNEL, 
~0UL));
}
 
+#ifdef CONFIG_KFENCE
+   if (kfence_early_init) {
+   create_physical_mapping(kfence_pool, kfence_pool + 
KFENCE_POOL_SIZE, -1,
+   PAGE_KERNEL, PAGE_SIZE);
+   memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
+   __kfence_pool = __va(kfence_pool);
+   }
+#endif
+
if (!cpu_has_feature(CPU_FTR_HVMODE) &&
cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
/*
@@ -874,8 +893,7 @@ int __meminit radix__create_section_mapping(unsigned long 
start,
return -1;
}
 
-   return create_physical_mapping(__pa(start), __pa(end),
-  nid, prot);
+   return create_physical_mapping(__pa(start), __pa(end), nid, prot, ~0UL);
 }
 
 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long 
end)
diff --git a/arch/powerpc/mm/init_64.c b

[PATCH 2/2] radix/kfence: support late __kfence_pool allocation

2024-04-24 Thread Hari Bathini
With commit b33f778bba5ef ("kfence: alloc kfence_pool after system
startup"), the KFENCE pool can be allocated after system startup via the
page allocator. This can lead to problems as all memory is no longer
mapped at page granularity with CONFIG_KFENCE. Address this by direct
mapping all memory at the PMD level and splitting the mapping of any PMD
pages that overlap with __kfence_pool down to page granularity if and
when __kfence_pool is allocated after system startup.
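
The key idea, as a rough sketch (illustrative pseudo-code only;
split_pmd_to_ptes() here is a placeholder for radix_split_pmd_page() in the
patch, and the real logic lives in create_physical_mapping() and
radix_kfence_init_pool() below):

	/* Cap the linear map at PMD size when the pool may come late */
	max_mapping_size = kfence_alloc_pool_late() ? PMD_SIZE : memory_block_size;

	/* Later, once the page allocator hands out __kfence_pool */
	for (addr = pool_start; addr < pool_end; addr += PAGE_SIZE)
		if (pmd_leaf(*pmd_off_k(addr)))
			split_pmd_to_ptes(addr & PMD_MASK);	/* placeholder */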

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/book3s/64/radix.h |  2 +
 arch/powerpc/include/asm/kfence.h  | 14 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 50 +-
 3 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 8f55ff74bb68..0423ddbcf73c 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -340,6 +340,8 @@ extern void radix__vmemmap_remove_mapping(unsigned long 
start,
 extern int radix__map_kernel_page(unsigned long ea, unsigned long pa,
 pgprot_t flags, unsigned int psz);
 
+extern bool radix_kfence_init_pool(void);
+
 static inline unsigned long radix__get_tree_size(void)
 {
unsigned long rts_field;
diff --git a/arch/powerpc/include/asm/kfence.h 
b/arch/powerpc/include/asm/kfence.h
index 18ec2b06ba1e..c5d2fb2f9ecb 100644
--- a/arch/powerpc/include/asm/kfence.h
+++ b/arch/powerpc/include/asm/kfence.h
@@ -18,12 +18,24 @@
 
 #ifdef CONFIG_KFENCE
 extern bool kfence_early_init;
-#endif
+
+static inline bool kfence_alloc_pool_late(void)
+{
+   return !kfence_early_init;
+}
 
 static inline bool arch_kfence_init_pool(void)
 {
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (radix_enabled())
+   return radix_kfence_init_pool();
+#endif
+
return true;
 }
+#else
+static inline bool kfence_alloc_pool_late(void) { return false; }
+#endif
 
 #ifdef CONFIG_PPC64
 static inline bool kfence_protect_page(unsigned long addr, bool protect)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index fccbf92f279b..f4374e3e31e1 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -253,6 +253,53 @@ void radix__mark_initmem_nx(void)
 }
 #endif /* CONFIG_STRICT_KERNEL_RWX */
 
+#ifdef CONFIG_KFENCE
+static inline int radix_split_pmd_page(pmd_t *pmd, unsigned long addr)
+{
+   pte_t *pte = pte_alloc_one_kernel(&init_mm);
+   unsigned long pfn = PFN_DOWN(__pa(addr));
+   int i;
+
+   if (!pte)
+   return -ENOMEM;
+
+   for (i = 0; i < PTRS_PER_PTE; i++) {
+   __set_pte_at(&init_mm, addr, pte + i, pfn_pte(pfn + i, 
PAGE_KERNEL), 0);
+   asm volatile("ptesync": : :"memory");
+   }
+   pmd_populate_kernel(&init_mm, pmd, pte);
+
+   flush_tlb_kernel_range(addr, addr + PMD_SIZE);
+   return 0;
+}
+
+bool radix_kfence_init_pool(void)
+{
+   unsigned int page_psize, pmd_psize;
+   unsigned long addr;
+   pmd_t *pmd;
+
+   if (!kfence_alloc_pool_late())
+   return true;
+
+   page_psize = shift_to_mmu_psize(PAGE_SHIFT);
+   pmd_psize = shift_to_mmu_psize(PMD_SHIFT);
+   for (addr = (unsigned long)__kfence_pool; is_kfence_address((void 
*)addr);
+addr += PAGE_SIZE) {
+   pmd = pmd_off_k(addr);
+
+   if (pmd_leaf(*pmd)) {
+   if (radix_split_pmd_page(pmd, addr & PMD_MASK))
+   return false;
+   update_page_count(pmd_psize, -1);
+   update_page_count(page_psize, PTRS_PER_PTE);
+   }
+   }
+
+   return true;
+}
+#endif
+
 static inline void __meminit
 print_mapping(unsigned long start, unsigned long end, unsigned long size, bool 
exec)
 {
@@ -391,7 +438,8 @@ static void __init radix_init_pgtable(void)
continue;
}
 
-   WARN_ON(create_physical_mapping(start, end, -1, PAGE_KERNEL, 
~0UL));
+   WARN_ON(create_physical_mapping(start, end, -1, PAGE_KERNEL,
+   kfence_alloc_pool_late() ? 
PMD_SIZE : ~0UL));
}
 
 #ifdef CONFIG_KFENCE
-- 
2.44.0



[PATCH v3 1/2] powerpc64/bpf: fix tail calls for PCREL addressing

2024-04-02 Thread Hari Bathini
With PCREL addressing, there is no kernel TOC, so it is not set up in
the prologue when PCREL addressing is used. But the number of
instructions to skip on a tail call was not adjusted accordingly,
resulting in non-obvious failures when using tail calls. The 'tailcalls'
selftest crashed the system with the below call trace:

  bpf_test_run+0xe8/0x3cc (unreliable)
  bpf_prog_test_run_skb+0x348/0x778
  __sys_bpf+0xb04/0x2b00
  sys_bpf+0x28/0x38
  system_call_exception+0x168/0x340
  system_call_vectored_common+0x15c/0x2ec
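
For reference, the intended skip computation amounts to something like the
sketch below (illustrative only; the actual fix in bpf_jit_emit_tail_call()
uses an #ifdef, as the diff shows):

	/* Illustrative sketch, not the exact kernel code */
	static int bpf_tailcall_prologue_bytes(void)
	{
		int size = 8;		/* fixed part of the prologue to skip */

		if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) &&
		    !IS_ENABLED(CONFIG_PPC_KERNEL_PCREL))
			size += 4;	/* TOC load exists only without PCREL */

		return size;
	}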

Fixes: 7e3a68be42e1 ("powerpc/64: vmlinux support building with PCREL 
addresing")
Cc: sta...@vger.kernel.org
Signed-off-by: Hari Bathini 
---

* Changes in v3:
  - New patch to fix tailcall issues with PCREL addressing.


 arch/powerpc/net/bpf_jit_comp64.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 79f23974a320..7f62ac4b4e65 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -285,8 +285,10 @@ static int bpf_jit_emit_tail_call(u32 *image, struct 
codegen_context *ctx, u32 o
int b2p_index = bpf_to_ppc(BPF_REG_3);
int bpf_tailcall_prologue_size = 8;
 
+#ifndef CONFIG_PPC_KERNEL_PCREL
if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2))
bpf_tailcall_prologue_size += 4; /* skip past the toc load */
+#endif
 
/*
 * if (index >= array->map.max_entries)
-- 
2.44.0



[PATCH v3 2/2] powerpc/bpf: enable kfunc call

2024-04-02 Thread Hari Bathini
Currently, the BPF JIT code on powerpc assumes that all BPF functions
and helpers are in kernel text. This is not true for the kfunc case, as
function addresses can be module addresses as well. So, ensure module
addresses are supported to enable kfunc support.

Emit instructions based on whether the function address is a kernel text
address or a module address, to retain the optimized instruction
sequence for the kernel text case.

Also, as BPF programs always reside at module addresses while a BPF
helper can be within the kernel address range as well, using relative
addressing often fails with an "out of range of pcrel address" error.
To work around this, use the unoptimized instruction sequence for both
kernel and module addresses when PCREL addressing is used.

With module addresses supported, override bpf_jit_supports_kfunc_call()
to enable kfunc support. Since module address offsets can be more than
32 bits long on PPC64, override bpf_jit_supports_far_kfunc_call() to
enable 64-bit pointers.
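
The resulting call-emission decision can be summarized roughly as below
(simplified sketch; emit_toc_relative_call() and emit_absolute_call() are
placeholder names, and the actual sequences are emitted in
bpf_jit_emit_func_call_hlp() in the diff that follows):

	if (core_kernel_text(func_addr) && !IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) {
		/* Optimized: target reachable as a 32-bit offset from kernel TOC */
		emit_toc_relative_call(func_addr);
	} else {
		/*
		 * Module address, or PCREL kernel: materialize the full 64-bit
		 * address (and, on ELFv1, the TOC from the function descriptor)
		 * before branching via CTR.
		 */
		emit_absolute_call(func_addr);
	}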

Signed-off-by: Hari Bathini 
---

* Changes in v3:
  - Retained optimized instruction sequence when function address is
a core kernel address as suggested by Naveen.
  - Used unoptimized instruction sequence for PCREL addressing to
avoid out of range errors for core kernel function addresses.
  - Folded patch that adds support for kfunc calls with patch that
enables/advertises this support as suggested by Naveen.


 arch/powerpc/net/bpf_jit_comp.c   | 10 +++
 arch/powerpc/net/bpf_jit_comp64.c | 48 ---
 2 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 0f9a21783329..dc7ffafd7441 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -359,3 +359,13 @@ void bpf_jit_free(struct bpf_prog *fp)
 
bpf_prog_unlock_free(fp);
 }
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+   return true;
+}
+
+bool bpf_jit_supports_far_kfunc_call(void)
+{
+   return IS_ENABLED(CONFIG_PPC64) ? true : false;
+}
diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 7f62ac4b4e65..ec3adf715c55 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -207,24 +207,14 @@ static int bpf_jit_emit_func_call_hlp(u32 *image, struct 
codegen_context *ctx, u
unsigned long func_addr = func ? ppc_function_entry((void *)func) : 0;
long reladdr;
 
-   if (WARN_ON_ONCE(!core_kernel_text(func_addr)))
+   /*
+* With the introduction of kfunc feature, BPF helpers can be part of 
kernel as
+* well as module text address.
+*/
+   if (WARN_ON_ONCE(!kernel_text_address(func_addr)))
return -EINVAL;
 
-   if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) {
-   reladdr = func_addr - CTX_NIA(ctx);
-
-   if (reladdr >= (long)SZ_8G || reladdr < -(long)SZ_8G) {
-   pr_err("eBPF: address of %ps out of range of pcrel 
address.\n",
-   (void *)func);
-   return -ERANGE;
-   }
-   /* pla r12,addr */
-   EMIT(PPC_PREFIX_MLS | __PPC_PRFX_R(1) | IMM_H18(reladdr));
-   EMIT(PPC_INST_PADDI | ___PPC_RT(_R12) | IMM_L(reladdr));
-   EMIT(PPC_RAW_MTCTR(_R12));
-   EMIT(PPC_RAW_BCTR());
-
-   } else {
+   if (core_kernel_text(func_addr) && 
!IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) {
reladdr = func_addr - kernel_toc_addr();
if (reladdr > 0x7FFF || reladdr < -(0x8000L)) {
pr_err("eBPF: address of %ps out of range of 
kernel_toc.\n", (void *)func);
@@ -235,6 +225,32 @@ static int bpf_jit_emit_func_call_hlp(u32 *image, struct 
codegen_context *ctx, u
EMIT(PPC_RAW_ADDI(_R12, _R12, PPC_LO(reladdr)));
EMIT(PPC_RAW_MTCTR(_R12));
EMIT(PPC_RAW_BCTRL());
+   } else {
+   if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1)) {
+   /* func points to the function descriptor */
+   PPC_LI64(bpf_to_ppc(TMP_REG_2), func);
+   /* Load actual entry point from function descriptor */
+   EMIT(PPC_RAW_LD(bpf_to_ppc(TMP_REG_1), 
bpf_to_ppc(TMP_REG_2), 0));
+   /* ... and move it to CTR */
+   EMIT(PPC_RAW_MTCTR(bpf_to_ppc(TMP_REG_1)));
+   /*
+* Load TOC from function descriptor at offset 8.
+* We can clobber r2 since we get called through a
+* function pointer (so caller will save/restore r2)
+* and since we don't use a TOC ourself.
+*/
+   EMIT(PPC_RAW_LD(2, bpf_to_ppc(TMP_REG_2), 8));
+   EMIT(PPC_RAW_BCTRL());
+  

Re: [PATCH v8 1/3] powerpc: make fadump resilient with memory add/remove events

2024-03-11 Thread Hari Bathini




On 17/02/24 12:50 pm, Sourabh Jain wrote:

Due to changes in memory resources caused by either memory hotplug or
online/offline events, the elfcorehdr, which describes the CPUs and
memory of the crashed kernel to the kernel that collects the dump (known
as second/fadump kernel), becomes outdated. Consequently, attempting
dump collection with an outdated elfcorehdr can lead to failed or
inaccurate dump collection.

Memory hotplug or online/offline events are referred to as memory
add/remove events in the rest of the commit message.

The current solution to address the aforementioned issue is as follows:
Monitor memory add/remove events in userspace using udev rules, and
re-register fadump whenever there are changes in memory resources. This
leads to the creation of a new elfcorehdr with updated system memory
information.

There are several notable issues associated with re-registering fadump
for every memory add/remove event.

1. Bulk memory add/remove events with udev-based fadump re-registration
can lead to race conditions and, more importantly, it creates a wide
window during which fadump is inactive until all memory add/remove
events are settled.
2. Re-registering fadump for every memory add/remove event is
inefficient.
3. The memory for elfcorehdr is allocated based on the memblock regions
available during early boot and remains fixed thereafter. However, if
elfcorehdr is later recreated with additional memblock regions, its
size will increase, potentially leading to memory corruption.

Address the aforementioned challenges by shifting the creation of
elfcorehdr from the first kernel (also referred to as the crashed kernel),
where it was created and frequently recreated for every memory
add/remove event, to the fadump kernel. As a result, the elfcorehdr only
needs to be created once, thus eliminating the necessity to re-register
fadump during memory add/remove events.

At present, the first kernel is responsible for preparing the fadump
header and storing it in the fadump reserved area. The fadump header
includes the start address of the elfcorehdr, crashing CPU details, and
other relevant information. In the event of a crash in the first kernel,
the second/fadump kernel boots and accesses the fadump header prepared by the
first kernel. It then performs the following steps in a
platform-specific function [rtas|opal]_fadump_process:

1. Sanity check for fadump header
2. Update CPU notes in elfcorehdr

Along with the above, update the setup_fadump()/fadump.c to create
elfcorehdr and set its address to the global variable elfcorehdr_addr
for the vmcore module to process it in the second/fadump kernel.

The section below outlines the information required to create the
elfcorehdr and the changes made to make it available to the fadump
kernel if it is not already.

To create elfcorehdr, the following crashed kernel information is
required: CPU notes, vmcoreinfo, and memory ranges.

At present, the CPU notes are already prepared in the fadump kernel, so
no changes are needed in that regard. The fadump kernel has access to
all crashed kernel memory regions, including boot memory regions that
are relocated by firmware to fadump reserved areas, so no changes for
that either. However, it is necessary to add new members to the fadump
header, i.e., the 'fadump_crash_info_header' structure, in order to pass
the crashed kernel's vmcoreinfo address and its size to the fadump kernel.

In addition to the vmcoreinfo address and size, there are a few other
attributes also added to the fadump_crash_info_header structure.

1. version:
It stores the fadump header version, which is currently set to 1.
This provides flexibility to update the fadump crash info header in
the future without changing the magic number. For each change in the
fadump header, the version will be increased. This will help the
updated kernel determine how to handle kernel dumps from older
kernels. The magic number remains relevant for checking fadump header
corruption.

2. pt_regs_sz/cpu_mask_sz:
Store the sizes of the first kernel's pt_regs and cpu_mask structures.
These attributes are used to prevent dump processing if the sizes of
the pt_regs or cpu_mask structures differ between the first and fadump
kernels.

Note: if either the first/crashed kernel or the second/fadump kernel does
not have the changes introduced here, the kernel fails to collect the
dump and prints a relevant error message on the console.
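
To illustrate, the extended header could look roughly like the sketch below
(field names and layout here are illustrative only; the actual definition is
in arch/powerpc/include/asm/fadump-internal.h in this patch):

	/* Illustrative shape only -- see fadump-internal.h for the real struct */
	struct fadump_crash_info_header {
		u64		magic_number;		/* unchanged, detects corruption */
		u32		version;		/* new: starts at 1 */
		u32		crashing_cpu;
		u64		vmcoreinfo_raddr;	/* new: crashed kernel's vmcoreinfo */
		u64		vmcoreinfo_size;	/* new */
		u32		pt_regs_sz;		/* new: reject dump on size mismatch */
		u32		cpu_mask_sz;		/* new */
		struct pt_regs	regs;
		struct cpumask	cpu_mask;
	};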

Signed-off-by: Sourabh Jain 
Cc: Aditya Gupta 
Cc: Aneesh Kumar K.V 
Cc: Hari Bathini 
Cc: Mahesh Salgaonkar 
Cc: Michael Ellerman 
Cc: Naveen N Rao 
---
  arch/powerpc/include/asm/fadump-internal.h   |  31 +-
  arch/powerpc/kernel/fadump.c | 339 +++
  arch/powerpc/platforms/powernv/opal-fadump.c |  22 +-
  arch/powerpc/platforms/pseries/rtas-fadump.c |  30 +-
  4 files changed, 232 insertions(+), 190 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-intern

Re: [PATCH v17 6/6] powerpc/crash: add crash memory hotplug support

2024-03-02 Thread Hari Bathini




On 26/02/24 2:11 pm, Sourabh Jain wrote:

Extend the arch crash hotplug handler, as introduced by the patch title
("powerpc: add crash CPU hotplug support"), to also support memory
add/remove events.

Elfcorehdr describes the memory of the crashed kernel to the kernel that
captures the dump; hence, it needs to be updated if memory resources
change due to memory add/remove events. Therefore,
arch_crash_handle_hotplug_event() is updated to recreate the elfcorehdr
and replace the previous one with it on memory add/remove events.

The memblock list is used to prepare the elfcorehdr. In the case of
memory hot remove, the memblock list is updated after the arch crash
hotplug handler is triggered, as depicted in Figure 1. Thus, the
hot-removed memory is explicitly removed from the crash memory ranges
to ensure that the memory ranges added to elfcorehdr do not include the
hot-removed memory.

 Memory remove
   |
   v
 Offline pages
   |
   v
  Initiate memory notify call <--> crash hotplug handler
  chain for MEM_OFFLINE event
   |
   v
  Update memblock list

Figure 1

There are two system calls, `kexec_file_load` and `kexec_load`, used to
load the kdump image. A few changes have been made to ensure that the
kernel can safely update the elfcorehdr component of the kdump image for
both system calls.

For the kexec_file_load syscall, the kdump image is prepared in the
kernel. To support an increasing number of memory regions, the
elfcorehdr is built with extra buffer space to ensure that it can
accommodate additional memory ranges in the future.

For the kexec_load syscall, the elfcorehdr is updated only if the
KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag is passed to the kernel by the
kexec tool. Passing this flag to the kernel indicates that the
elfcorehdr is built to accommodate additional memory ranges and the
elfcorehdr segment is not considered for SHA calculation, making it safe
to update.

The changes related to this feature are kept under the CRASH_HOTPLUG
config, and it is enabled by default.



Overall, the patchset looks good. I tried out the changes too.

Acked-by: Hari Bathini 


Signed-off-by: Sourabh Jain 
Cc: Akhil Raj 
Cc: Andrew Morton 
Cc: Aneesh Kumar K.V 
Cc: Baoquan He 
Cc: Borislav Petkov (AMD) 
Cc: Boris Ostrovsky 
Cc: Christophe Leroy 
Cc: Dave Hansen 
Cc: Dave Young 
Cc: David Hildenbrand 
Cc: Greg Kroah-Hartman 
Cc: Hari Bathini 
Cc: Laurent Dufour 
Cc: Mahesh Salgaonkar 
Cc: Michael Ellerman 
Cc: Mimi Zohar 
Cc: Naveen N Rao 
Cc: Oscar Salvador 
Cc: Thomas Gleixner 
Cc: Valentin Schneider 
Cc: Vivek Goyal 
Cc: ke...@lists.infradead.org
Cc: x...@kernel.org
---
  arch/powerpc/include/asm/kexec.h|  3 +
  arch/powerpc/include/asm/kexec_ranges.h |  1 +
  arch/powerpc/kexec/crash.c  | 95 -
  arch/powerpc/kexec/file_load_64.c   | 20 +-
  arch/powerpc/kexec/ranges.c | 85 ++
  5 files changed, 202 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index e75970351bcd..95a98b390d62 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -141,6 +141,9 @@ void arch_crash_handle_hotplug_event(struct kimage *image, 
void *arg);
  
  int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags);

  #define arch_crash_hotplug_support arch_crash_hotplug_support
+
+unsigned int arch_crash_get_elfcorehdr_size(void);
+#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size
  #endif /* CONFIG_CRASH_HOTPLUG */
  
  extern int crashing_cpu;

diff --git a/arch/powerpc/include/asm/kexec_ranges.h 
b/arch/powerpc/include/asm/kexec_ranges.h
index 8489e844b447..14055896cbcb 100644
--- a/arch/powerpc/include/asm/kexec_ranges.h
+++ b/arch/powerpc/include/asm/kexec_ranges.h
@@ -7,6 +7,7 @@
  void sort_memory_ranges(struct crash_mem *mrngs, bool merge);
  struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges);
  int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size);
+int remove_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size);
  int get_exclude_memory_ranges(struct crash_mem **mem_ranges);
  int get_reserved_memory_ranges(struct crash_mem **mem_ranges);
  int get_crash_memory_ranges(struct crash_mem **mem_ranges);
diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
index 8938a19af12f..21b193e938a3 100644
--- a/arch/powerpc/kexec/crash.c
+++ b/arch/powerpc/kexec/crash.c
@@ -17,6 +17,7 @@
  #include 
  #include 
  #include 
+#include 
  
  #include 

  #include 
@@ -25,6 +26,7 @@
  #include 
  #include 
  #include 
+#include 
  
  /*

   * The primary CPU waits a while for all secondary CPUs to enter. This is to
@@ -398,6 +400,94 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
  #undef pr_fmt
  #define pr_fmt(fmt) "crash hp: " fmt
  
+/*

+ * Advertise preferred elfcorehdr size to use

Re: [PATCH v17 5/6] powerpc/crash: add crash CPU hotplug support

2024-03-02 Thread Hari Bathini




On 26/02/24 2:11 pm, Sourabh Jain wrote:

Due to CPU/Memory hotplug or online/offline events, the elfcorehdr
(which describes the CPUs and memory of the crashed kernel) and FDT
(Flattened Device Tree) of the kdump image become outdated. Consequently,
attempting dump collection with an outdated elfcorehdr or FDT can lead
to failed or inaccurate dump collection.

Going forward, CPU hotplug or online/offline events are referred to as
CPU/Memory add/remove events.

The current solution to address the above issue involves monitoring the
CPU/Memory add/remove events in userspace using udev rules and whenever
there are changes in CPU and memory resources, the entire kdump image
is loaded again. The kdump image includes kernel, initrd, elfcorehdr,
FDT, purgatory. Given that only elfcorehdr and FDT get outdated due to
CPU/Memory add/remove events, reloading the entire kdump image is
inefficient. More importantly, kdump remains inactive for a substantial
amount of time until the kdump reload completes.

To address the aforementioned issue, commit 247262756121 ("crash: add
generic infrastructure for crash hotplug support") added a generic
infrastructure that allows architectures to selectively update the kdump
image component during CPU or memory add/remove events within the kernel
itself.

In the event of a CPU or memory add/remove events, the generic crash
hotplug event handler, `crash_handle_hotplug_event()`, is triggered. It
then acquires the necessary locks to update the kdump image and invokes
the architecture-specific crash hotplug handler,
`arch_crash_handle_hotplug_event()`, to update the required kdump image
components.

This patch adds a crash hotplug handler for PowerPC and enables support
for updating the kdump image on CPU add/remove events. Support for
memory add/remove events is added in a subsequent patch with the title
"powerpc: add crash memory hotplug support".

As mentioned earlier, only the elfcorehdr and FDT kdump image components
need to be updated in the event of CPU or memory add/remove events.
However, on the PowerPC architecture, the crash hotplug handler only
updates the FDT to enable crash hotplug support for CPU add/remove events. Here's
why.

The elfcorehdr on PowerPC is built with possible CPUs, and thus, it does
not need an update on CPU add/remove events. On the other hand, the FDT
needs to be updated on CPU add events to include the newly added CPU. If
the FDT is not updated and the kernel crashes on a newly added CPU, the
kdump kernel will fail to boot due to the unavailability of the crashing
CPU in the FDT. During the early boot, it is expected that the boot CPU
must be a part of the FDT; otherwise, the kernel will raise a BUG and
fail to boot. For more information, refer to commit 36ae37e3436b0
("powerpc: Make boot_cpuid common between 32 and 64-bit"). Since it is
okay to have an offline CPU in the kdump FDT, no action is taken in case
of CPU removal.

There are two system calls, `kexec_file_load` and `kexec_load`, used to
load the kdump image. A few changes have been made to ensure the kernel
can safely update the FDT of a kdump image loaded using either system call.

For the kexec_file_load syscall, the kdump image is prepared in the kernel. So, to
support an increasing number of CPUs, the FDT is constructed with extra
buffer space to ensure it can accommodate a possible number of CPU
nodes. Additionally, a call to fdt_pack (which trims the unused space
once the FDT is prepared) is avoided if this feature is enabled.

For the kexec_load syscall, the FDT is updated only if the
KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag is passed to the kernel by
userspace (kexec tools). When userspace passes this flag to the kernel,
it indicates that the FDT is built to accommodate possible CPUs, and the
FDT segment is excluded from SHA calculation, making it safe to update.

The changes related to this feature are kept under the CRASH_HOTPLUG
config, and it is enabled by default.



Looks good.

Acked-by: Hari Bathini 


Signed-off-by: Sourabh Jain 
Cc: Akhil Raj 
Cc: Andrew Morton 
Cc: Aneesh Kumar K.V 
Cc: Baoquan He 
Cc: Borislav Petkov (AMD) 
Cc: Boris Ostrovsky 
Cc: Christophe Leroy 
Cc: Dave Hansen 
Cc: Dave Young 
Cc: David Hildenbrand 
Cc: Greg Kroah-Hartman 
Cc: Hari Bathini 
Cc: Laurent Dufour 
Cc: Mahesh Salgaonkar 
Cc: Michael Ellerman 
Cc: Mimi Zohar 
Cc: Naveen N Rao 
Cc: Oscar Salvador 
Cc: Thomas Gleixner 
Cc: Valentin Schneider 
Cc: Vivek Goyal 
Cc: ke...@lists.infradead.org
Cc: x...@kernel.org
---
  arch/powerpc/Kconfig  |   4 ++
  arch/powerpc/include/asm/kexec.h  |   8 +++
  arch/powerpc/kexec/crash.c| 103 ++
  arch/powerpc/kexec/elf_64.c   |   3 +-
  arch/powerpc/kexec/file_load_64.c |  17 +
  5 files changed, 134 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e377deefa2dc..16d2b20574c4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -686,6 +686,1

Re: [PATCH v17 4/6] PowerPC/kexec: make the update_cpus_node() function public

2024-03-02 Thread Hari Bathini




On 26/02/24 2:11 pm, Sourabh Jain wrote:

Move the update_cpus_node() from kexec/{file_load_64.c => core_64.c}
to allow other kexec components to use it.

Later in the series, this function is used for in-kernel updates
to the kdump image during CPU/memory hotplug or online/offline events for
both kexec_load and kexec_file_load syscalls.

No functional changes are intended.



Looks good to me.

Acked-by: Hari Bathini 


Signed-off-by: Sourabh Jain 
Cc: Akhil Raj 
Cc: Andrew Morton 
Cc: Aneesh Kumar K.V 
Cc: Baoquan He 
Cc: Borislav Petkov (AMD) 
Cc: Boris Ostrovsky 
Cc: Christophe Leroy 
Cc: Dave Hansen 
Cc: Dave Young 
Cc: David Hildenbrand 
Cc: Greg Kroah-Hartman 
Cc: Hari Bathini 
Cc: Laurent Dufour 
Cc: Mahesh Salgaonkar 
Cc: Michael Ellerman 
Cc: Mimi Zohar 
Cc: Naveen N Rao 
Cc: Oscar Salvador 
Cc: Thomas Gleixner 
Cc: Valentin Schneider 
Cc: Vivek Goyal 
Cc: ke...@lists.infradead.org
Cc: x...@kernel.org
---
  arch/powerpc/include/asm/kexec.h  |  4 ++
  arch/powerpc/kexec/core_64.c  | 91 +++
  arch/powerpc/kexec/file_load_64.c | 87 -
  3 files changed, 95 insertions(+), 87 deletions(-)

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index fdb90e24dc74..d9ff4d0e392d 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -185,6 +185,10 @@ static inline void crash_send_ipi(void 
(*crash_ipi_callback)(struct pt_regs *))
  
  #endif /* CONFIG_CRASH_DUMP */
  
+#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_DUMP)

+int update_cpus_node(void *fdt);
+#endif
+
  #ifdef CONFIG_PPC_BOOK3S_64
  #include 
  #endif
diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
index 762e4d09aacf..85050be08a23 100644
--- a/arch/powerpc/kexec/core_64.c
+++ b/arch/powerpc/kexec/core_64.c
@@ -17,6 +17,7 @@
  #include 
  #include 
  #include 
+#include 
  
  #include 

  #include 
@@ -30,6 +31,7 @@
  #include 
  #include 
  #include 
+#include 
  
  int machine_kexec_prepare(struct kimage *image)

  {
@@ -419,3 +421,92 @@ static int __init export_htab_values(void)
  }
  late_initcall(export_htab_values);
  #endif /* CONFIG_PPC_64S_HASH_MMU */
+
+#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_DUMP)
+/**
+ * add_node_props - Reads node properties from device node structure and add
+ *  them to fdt.
+ * @fdt:Flattened device tree of the kernel
+ * @node_offset:offset of the node to add a property at
+ * @dn: device node pointer
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int add_node_props(void *fdt, int node_offset, const struct device_node 
*dn)
+{
+   int ret = 0;
+   struct property *pp;
+
+   if (!dn)
+   return -EINVAL;
+
+   for_each_property_of_node(dn, pp) {
+   ret = fdt_setprop(fdt, node_offset, pp->name, pp->value, 
pp->length);
+   if (ret < 0) {
+   pr_err("Unable to add %s property: %s\n", pp->name, 
fdt_strerror(ret));
+   return ret;
+   }
+   }
+   return ret;
+}
+
+/**
+ * update_cpus_node - Update cpus node of flattened device tree using of_root
+ *device node.
+ * @fdt:  Flattened device tree of the kernel.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int update_cpus_node(void *fdt)
+{
+   struct device_node *cpus_node, *dn;
+   int cpus_offset, cpus_subnode_offset, ret = 0;
+
+   cpus_offset = fdt_path_offset(fdt, "/cpus");
+   if (cpus_offset < 0 && cpus_offset != -FDT_ERR_NOTFOUND) {
+   pr_err("Malformed device tree: error reading /cpus node: %s\n",
+  fdt_strerror(cpus_offset));
+   return cpus_offset;
+   }
+
+   if (cpus_offset > 0) {
+   ret = fdt_del_node(fdt, cpus_offset);
+   if (ret < 0) {
+   pr_err("Error deleting /cpus node: %s\n", 
fdt_strerror(ret));
+   return -EINVAL;
+   }
+   }
+
+   /* Add cpus node to fdt */
+   cpus_offset = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"), "cpus");
+   if (cpus_offset < 0) {
+   pr_err("Error creating /cpus node: %s\n", 
fdt_strerror(cpus_offset));
+   return -EINVAL;
+   }
+
+   /* Add cpus node properties */
+   cpus_node = of_find_node_by_path("/cpus");
+   ret = add_node_props(fdt, cpus_offset, cpus_node);
+   of_node_put(cpus_node);
+   if (ret < 0)
+   return ret;
+
+   /* Loop through all subnodes of cpus and add them to fdt */
+   for_each_node_by_type(dn, "cpu") {
+   cpus_subnode_offset = fdt_add_subnode(fdt, cpus_offset, 
dn->full_name);
+   if (cpus_subnode_off

Re: [PATCH v17 3/6] powerpc/kexec: move *_memory_ranges functions to ranges.c

2024-03-02 Thread Hari Bathini




On 26/02/24 2:11 pm, Sourabh Jain wrote:

Move the following functions from kexec/{file_load_64.c => ranges.c} and
make them public so that components other than KEXEC_FILE can also use
these functions.
1. get_exclude_memory_ranges
2. get_reserved_memory_ranges
3. get_crash_memory_ranges
4. get_usable_memory_ranges

Later in the series, the get_crash_memory_ranges() function is used for
in-kernel updates to the kdump image during CPU/Memory hotplug or
online/offline events for both kexec_load and kexec_file_load syscalls.

Since the above functions are moved to ranges.c, some of the helper
functions in ranges.c no longer need to be public. Mark them as static
and remove them from the kexec_ranges.h header file.

Finally, remove the CONFIG_KEXEC_FILE build dependency for ranges.c
because it is required for other configs, such as CONFIG_CRASH_DUMP.

No functional changes are intended.



Acked-by: Hari Bathini 


Signed-off-by: Sourabh Jain 
Cc: Akhil Raj 
Cc: Andrew Morton 
Cc: Aneesh Kumar K.V 
Cc: Baoquan He 
Cc: Borislav Petkov (AMD) 
Cc: Boris Ostrovsky 
Cc: Christophe Leroy 
Cc: Dave Hansen 
Cc: Dave Young 
Cc: David Hildenbrand 
Cc: Greg Kroah-Hartman 
Cc: Hari Bathini 
Cc: Laurent Dufour 
Cc: Mahesh Salgaonkar 
Cc: Michael Ellerman 
Cc: Mimi Zohar 
Cc: Naveen N Rao 
Cc: Oscar Salvador 
Cc: Thomas Gleixner 
Cc: Valentin Schneider 
Cc: Vivek Goyal 
Cc: ke...@lists.infradead.org
Cc: x...@kernel.org
---
  arch/powerpc/include/asm/kexec_ranges.h |  19 +-
  arch/powerpc/kexec/Makefile |   4 +-
  arch/powerpc/kexec/file_load_64.c   | 190 
  arch/powerpc/kexec/ranges.c | 227 +++-
  4 files changed, 224 insertions(+), 216 deletions(-)

diff --git a/arch/powerpc/include/asm/kexec_ranges.h 
b/arch/powerpc/include/asm/kexec_ranges.h
index f83866a19e87..8489e844b447 100644
--- a/arch/powerpc/include/asm/kexec_ranges.h
+++ b/arch/powerpc/include/asm/kexec_ranges.h
@@ -7,19 +7,8 @@
  void sort_memory_ranges(struct crash_mem *mrngs, bool merge);
  struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges);
  int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size);
-int add_tce_mem_ranges(struct crash_mem **mem_ranges);
-int add_initrd_mem_range(struct crash_mem **mem_ranges);
-#ifdef CONFIG_PPC_64S_HASH_MMU
-int add_htab_mem_range(struct crash_mem **mem_ranges);
-#else
-static inline int add_htab_mem_range(struct crash_mem **mem_ranges)
-{
-   return 0;
-}
-#endif
-int add_kernel_mem_range(struct crash_mem **mem_ranges);
-int add_rtas_mem_range(struct crash_mem **mem_ranges);
-int add_opal_mem_range(struct crash_mem **mem_ranges);
-int add_reserved_mem_ranges(struct crash_mem **mem_ranges);
-
+int get_exclude_memory_ranges(struct crash_mem **mem_ranges);
+int get_reserved_memory_ranges(struct crash_mem **mem_ranges);
+int get_crash_memory_ranges(struct crash_mem **mem_ranges);
+int get_usable_memory_ranges(struct crash_mem **mem_ranges);
  #endif /* _ASM_POWERPC_KEXEC_RANGES_H */
diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile
index 8e469c4da3f8..470eb0453e17 100644
--- a/arch/powerpc/kexec/Makefile
+++ b/arch/powerpc/kexec/Makefile
@@ -3,11 +3,11 @@
  # Makefile for the linux kernel.
  #
  
-obj-y+= core.o core_$(BITS).o

+obj-y  += core.o core_$(BITS).o ranges.o
  
  obj-$(CONFIG_PPC32)		+= relocate_32.o
  
-obj-$(CONFIG_KEXEC_FILE)	+= file_load.o ranges.o file_load_$(BITS).o elf_$(BITS).o

+obj-$(CONFIG_KEXEC_FILE)   += file_load.o file_load_$(BITS).o elf_$(BITS).o
  obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o
  obj-$(CONFIG_CRASH_DUMP)  += crash.o
  
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c

index 1bc65de6174f..6a01f62b8fcf 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -47,83 +47,6 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
NULL
  };
  
-/**

- * get_exclude_memory_ranges - Get exclude memory ranges. This list includes
- * regions like opal/rtas, tce-table, initrd,
- * kernel, htab which should be avoided while
- * setting up kexec load segments.
- * @mem_ranges:Range list to add the memory ranges to.
- *
- * Returns 0 on success, negative errno on error.
- */
-static int get_exclude_memory_ranges(struct crash_mem **mem_ranges)
-{
-   int ret;
-
-   ret = add_tce_mem_ranges(mem_ranges);
-   if (ret)
-   goto out;
-
-   ret = add_initrd_mem_range(mem_ranges);
-   if (ret)
-   goto out;
-
-   ret = add_htab_mem_range(mem_ranges);
-   if (ret)
-   goto out;
-
-   ret = add_kernel_mem_range(mem_ranges);
-   if (ret)
-   goto out;
-
-   ret = add_rtas_mem_range(mem_ranges);
-   if (ret)
-   goto out;
-
-   ret = add_opal_mem_range(mem_ran

Re: [PATCH v17 1/6] crash: forward memory_notify arg to arch crash hotplug handler

2024-03-02 Thread Hari Bathini




On 26/02/24 2:11 pm, Sourabh Jain wrote:

On memory hotplug or online/offline events, the crash memory hotplug
notifier `crash_memhp_notifier()` receives a `memory_notify` object but
doesn't forward that object to the generic and architecture-specific
crash hotplug handlers.

The `memory_notify` object contains the starting PFN (Page Frame Number)
and the number of pages in the hot-removed memory. This information is
necessary for architectures like PowerPC to update/recreate the kdump
image, specifically `elfcorehdr`.

So update the function signature of `crash_handle_hotplug_event()` and
`arch_crash_handle_hotplug_event()` to accept the `memory_notify` object
as an argument from crash memory hotplug notifier.

Since no such object is available in the case of a CPU hotplug event, the
crash CPU hotplug notifier `crash_cpuhp_online()` passes NULL to the
crash hotplug handler.
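
For illustration, an architecture handler can recover the hot-removed range
from that argument roughly as below (sketch only; powerpc's actual handler is
added later in this series):

	void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
	{
		struct memory_notify *mhp = arg;	/* NULL for CPU events */

		if (mhp) {
			unsigned long base = PFN_PHYS(mhp->start_pfn);
			unsigned long size = mhp->nr_pages << PAGE_SHIFT;

			/* Drop [base, base + size) from the crash memory
			 * ranges before recreating the elfcorehdr. */
		}
		/* ... recreate/replace the elfcorehdr (and FDT if needed) ... */
	}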



Acked-by: Hari Bathini 


Signed-off-by: Sourabh Jain 
Acked-by: Baoquan He 
Cc: Akhil Raj 
Cc: Andrew Morton 
Cc: Aneesh Kumar K.V 
Cc: Borislav Petkov (AMD) 
Cc: Boris Ostrovsky 
Cc: Christophe Leroy 
Cc: Dave Hansen 
Cc: Dave Young 
Cc: David Hildenbrand 
Cc: Greg Kroah-Hartman 
Cc: Hari Bathini 
Cc: Laurent Dufour 
Cc: Mahesh Salgaonkar 
Cc: Michael Ellerman 
Cc: Mimi Zohar 
Cc: Naveen N Rao 
Cc: Oscar Salvador 
Cc: Thomas Gleixner 
Cc: Valentin Schneider 
Cc: Vivek Goyal 
Cc: ke...@lists.infradead.org
Cc: x...@kernel.org
---
  arch/x86/include/asm/kexec.h |  2 +-
  arch/x86/kernel/crash.c  |  4 +++-
  include/linux/crash_core.h   |  2 +-
  kernel/crash_core.c  | 14 +++---
  4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 91ca9a9ee3a2..cb1320ebbc23 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -207,7 +207,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage 
*image);
  extern void kdump_nmi_shootdown_cpus(void);
  
  #ifdef CONFIG_CRASH_HOTPLUG

-void arch_crash_handle_hotplug_event(struct kimage *image);
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg);
  #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
  
  #ifdef CONFIG_HOTPLUG_CPU

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index e74d0c4286c1..2a682fe86352 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -432,10 +432,12 @@ unsigned int arch_crash_get_elfcorehdr_size(void)
  /**
   * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
   * @image: a pointer to kexec_crash_image
+ * @arg: struct memory_notify handler for memory hotplug case and
+ *   NULL for CPU hotplug case.
   *
   * Prepare the new elfcorehdr and replace the existing elfcorehdr.
   */
-void arch_crash_handle_hotplug_event(struct kimage *image)
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
  {
void *elfbuf = NULL, *old_elfcorehdr;
unsigned long nr_mem_ranges;
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index d33352c2e386..647e928efee8 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -37,7 +37,7 @@ static inline void arch_kexec_unprotect_crashkres(void) { }
  
  
  #ifndef arch_crash_handle_hotplug_event

-static inline void arch_crash_handle_hotplug_event(struct kimage *image) { }
+static inline void arch_crash_handle_hotplug_event(struct kimage *image, void 
*arg) { }
  #endif
  
  int crash_check_update_elfcorehdr(void);

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 78b5dc7cee3a..70fa8111a9d6 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -534,7 +534,7 @@ int crash_check_update_elfcorehdr(void)
   * list of segments it checks (since the elfcorehdr changes and thus
   * would require an update to purgatory itself to update the digest).
   */
-static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int 
cpu)
+static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int 
cpu, void *arg)
  {
struct kimage *image;
  
@@ -596,7 +596,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)

image->hp_action = hp_action;
  
  	/* Now invoke arch-specific update handler */

-   arch_crash_handle_hotplug_event(image);
+   arch_crash_handle_hotplug_event(image, arg);
  
  	/* No longer handling a hotplug event */

image->hp_action = KEXEC_CRASH_HP_NONE;
@@ -612,17 +612,17 @@ static void crash_handle_hotplug_event(unsigned int 
hp_action, unsigned int cpu)
crash_hotplug_unlock();
  }
  
-static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)

+static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, 
void *arg)
  {
switch (val) {
case MEM_ONLINE:
crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY,
-   KEXEC_CRASH_HP_

Re: [PATCH v17 2/6] crash: add a new kexec flag for hotplug support

2024-03-02 Thread Hari Bathini




On 26/02/24 2:11 pm, Sourabh Jain wrote:

Commit a72bbec70da2 ("crash: hotplug support for kexec_load()")
introduced a new kexec flag, `KEXEC_UPDATE_ELFCOREHDR`. Kexec tool uses
this flag to indicate to the kernel that it is safe to modify the
elfcorehdr of the kdump image loaded using the kexec_load system call.

However, it is possible that architectures may need to update kexec
segments other than elfcorehdr. For example, the FDT (Flattened Device
Tree) on PowerPC. Introducing a new kexec flag for every new kexec segment
may not be a good solution. Hence, a generic kexec flag bit,
`KEXEC_CRASH_HOTPLUG_SUPPORT`, is introduced to share the CPU/Memory
hotplug support intent between the kexec tool and the kernel for the
kexec_load system call.

Now, if the kexec tool sends the KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag
to the kernel, it indicates to the kernel that all the required kexec
segments are skipped from the SHA calculation and that it is safe to
update the kdump image loaded using the kexec_load syscall.

While loading the kdump image using the kexec_load syscall, the
@update_elfcorehdr member of struct kimage is set if the kexec tool
sends the KEXEC_UPDATE_ELFCOREHDR kexec flag. This member is later used
to determine whether it is safe to update elfcorehdr on hotplug events.
However, with the introduction of the KEXEC_CRASH_HOTPLUG_SUPPORT kexec
flag, the kexec tool could mark all the required kexec segments on an
architecture as safe to update. So rename the @update_elfcorehdr to
@hotplug_support. If @hotplug_support is set, the kernel can safely
update all the required kexec segments of the kdump image during
CPU/Memory hotplug events.

Introduce an architecture-specific function to process kexec flags for
determining hotplug support. Set the @hotplug_support member of struct
kimage for both kexec_load and kexec_file_load system calls. This
simplifies kernel checks to identify hotplug support for the currently
loaded kdump image by just examining the value of @hotplug_support.
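
From the kexec tool side, the intent would be conveyed by OR-ing the new flag
into the kexec_load() flags, roughly as below (illustrative snippet, not the
actual kexec-tools change; entry/nr_segments/segments stand for an already
prepared crash kernel image):

	/* Illustrative: load the crash kernel and declare hotplug support */
	unsigned long flags = KEXEC_ON_CRASH | KEXEC_CRASH_HOTPLUG_SUPPORT;

	if (syscall(__NR_kexec_load, entry, nr_segments, segments, flags))
		perror("kexec_load");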



Couple of minor nits. See comments below.
Otherwise, looks good to me.

Acked-by: Hari Bathini 


Signed-off-by: Sourabh Jain 
Cc: Akhil Raj 
Cc: Andrew Morton 
Cc: Aneesh Kumar K.V 
Cc: Baoquan He 
Cc: Borislav Petkov (AMD) 
Cc: Boris Ostrovsky 
Cc: Christophe Leroy 
Cc: Dave Hansen 
Cc: Dave Young 
Cc: David Hildenbrand 
Cc: Eric DeVolder 
Cc: Greg Kroah-Hartman 
Cc: Hari Bathini 
Cc: Laurent Dufour 
Cc: Mahesh Salgaonkar 
Cc: Michael Ellerman 
Cc: Mimi Zohar 
Cc: Naveen N Rao 
Cc: Oscar Salvador 
Cc: Thomas Gleixner 
Cc: Valentin Schneider 
Cc: Vivek Goyal 
Cc: ke...@lists.infradead.org
Cc: x...@kernel.org
---
  arch/x86/include/asm/kexec.h | 11 ++-
  arch/x86/kernel/crash.c  | 28 +---
  drivers/base/cpu.c   |  2 +-
  drivers/base/memory.c|  2 +-
  include/linux/crash_core.h   | 13 ++---
  include/linux/kexec.h| 11 +++
  include/uapi/linux/kexec.h   |  1 +
  kernel/crash_core.c  | 11 ---
  kernel/kexec.c   |  4 ++--
  kernel/kexec_file.c  |  5 +
  10 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index cb1320ebbc23..ae5482a2f0ca 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -210,15 +210,8 @@ extern void kdump_nmi_shootdown_cpus(void);
  void arch_crash_handle_hotplug_event(struct kimage *image, void *arg);
  #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
  
-#ifdef CONFIG_HOTPLUG_CPU

-int arch_crash_hotplug_cpu_support(void);
-#define crash_hotplug_cpu_support arch_crash_hotplug_cpu_support
-#endif
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_crash_hotplug_memory_support(void);
-#define crash_hotplug_memory_support arch_crash_hotplug_memory_support
-#endif
+int arch_crash_hotplug_support(struct kimage *image, unsigned long 
kexec_flags);
+#define arch_crash_hotplug_support arch_crash_hotplug_support
  
  unsigned int arch_crash_get_elfcorehdr_size(void);

  #define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 2a682fe86352..f06501445cd9 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -402,20 +402,26 @@ int crash_load_segments(struct kimage *image)
  #undef pr_fmt
  #define pr_fmt(fmt) "crash hp: " fmt





-/* These functions provide the value for the sysfs crash_hotplug nodes */
-#ifdef CONFIG_HOTPLUG_CPU
-int arch_crash_hotplug_cpu_support(void)
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags)
  {
-   return crash_check_update_elfcorehdr();
-}
-#endif
  
-#ifdef CONFIG_MEMORY_HOTPLUG

-int arch_crash_hotplug_memory_support(void)
-{
-   return crash_check_update_elfcorehdr();
-}
+#ifdef CONFIG_KEXEC_FILE
+   if (image->file_mode)
+   return 1;
  #endif
+   /*
+* Initially, crash hotplug support f

[PATCH linux-next v2 2/3] powerpc/kexec: split CONFIG_KEXEC_FILE and CONFIG_CRASH_DUMP

2024-02-26 Thread Hari Bathini
CONFIG_KEXEC_FILE does not have to select CONFIG_CRASH_DUMP. Move
some code under CONFIG_CRASH_DUMP to support the CONFIG_KEXEC_FILE &&
!CONFIG_CRASH_DUMP case.

Signed-off-by: Hari Bathini 
---

* No changes in v2.

 arch/powerpc/kexec/elf_64.c   |   4 +-
 arch/powerpc/kexec/file_load_64.c | 269 --
 2 files changed, 142 insertions(+), 131 deletions(-)

diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index 904016cf89ea..6d8951e8e966 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -47,7 +47,7 @@ static void *elf64_load(struct kimage *image, char 
*kernel_buf,
if (ret)
return ERR_PTR(ret);
 
-   if (image->type == KEXEC_TYPE_CRASH) {
+   if (IS_ENABLED(CONFIG_CRASH_DUMP) && image->type == KEXEC_TYPE_CRASH) {
/* min & max buffer values for kdump case */
kbuf.buf_min = pbuf.buf_min = crashk_res.start;
kbuf.buf_max = pbuf.buf_max =
@@ -70,7 +70,7 @@ static void *elf64_load(struct kimage *image, char 
*kernel_buf,
kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem);
 
/* Load additional segments needed for panic kernel */
-   if (image->type == KEXEC_TYPE_CRASH) {
+   if (IS_ENABLED(CONFIG_CRASH_DUMP) && image->type == KEXEC_TYPE_CRASH) {
ret = load_crashdump_segments_ppc64(image, &kbuf);
if (ret) {
pr_err("Failed to load kdump kernel segments\n");
diff --git a/arch/powerpc/kexec/file_load_64.c 
b/arch/powerpc/kexec/file_load_64.c
index 5b4c5cb23354..1bc65de6174f 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -96,119 +96,6 @@ static int get_exclude_memory_ranges(struct crash_mem 
**mem_ranges)
return ret;
 }
 
-/**
- * get_usable_memory_ranges - Get usable memory ranges. This list includes
- *regions like crashkernel, opal/rtas & tce-table,
- *that kdump kernel could use.
- * @mem_ranges:   Range list to add the memory ranges to.
- *
- * Returns 0 on success, negative errno on error.
- */
-static int get_usable_memory_ranges(struct crash_mem **mem_ranges)
-{
-   int ret;
-
-   /*
-* Early boot failure observed on guests when low memory (first memory
-* block?) is not added to usable memory. So, add [0, crashk_res.end]
-* instead of [crashk_res.start, crashk_res.end] to workaround it.
-* Also, crashed kernel's memory must be added to reserve map to
-* avoid kdump kernel from using it.
-*/
-   ret = add_mem_range(mem_ranges, 0, crashk_res.end + 1);
-   if (ret)
-   goto out;
-
-   ret = add_rtas_mem_range(mem_ranges);
-   if (ret)
-   goto out;
-
-   ret = add_opal_mem_range(mem_ranges);
-   if (ret)
-   goto out;
-
-   ret = add_tce_mem_ranges(mem_ranges);
-out:
-   if (ret)
-   pr_err("Failed to setup usable memory ranges\n");
-   return ret;
-}
-
-/**
- * get_crash_memory_ranges - Get crash memory ranges. This list includes
- *   first/crashing kernel's memory regions that
- *   would be exported via an elfcore.
- * @mem_ranges:  Range list to add the memory ranges to.
- *
- * Returns 0 on success, negative errno on error.
- */
-static int get_crash_memory_ranges(struct crash_mem **mem_ranges)
-{
-   phys_addr_t base, end;
-   struct crash_mem *tmem;
-   u64 i;
-   int ret;
-
-   for_each_mem_range(i, &base, &end) {
-   u64 size = end - base;
-
-   /* Skip backup memory region, which needs a separate entry */
-   if (base == BACKUP_SRC_START) {
-   if (size > BACKUP_SRC_SIZE) {
-   base = BACKUP_SRC_END + 1;
-   size -= BACKUP_SRC_SIZE;
-   } else
-   continue;
-   }
-
-   ret = add_mem_range(mem_ranges, base, size);
-   if (ret)
-   goto out;
-
-   /* Try merging adjacent ranges before reallocation attempt */
-   if ((*mem_ranges)->nr_ranges == (*mem_ranges)->max_nr_ranges)
-   sort_memory_ranges(*mem_ranges, true);
-   }
-
-   /* Reallocate memory ranges if there is no space to split ranges */
-   tmem = *mem_ranges;
-   if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) {
-   tmem = realloc_mem_ranges(mem_ranges);
-   if (!tmem)
-   goto out;
-   }
-
-   /* Exclude crashkernel region */
-   ret = crash_exclude_mem_range(tmem, crashk_res.start, crashk_res.end);
-   if (ret)
-  

[PATCH linux-next v2 3/3] powerpc/kdump: Split KEXEC_CORE and CRASH_DUMP dependency

2024-02-26 Thread Hari Bathini
Remove the CONFIG_CRASH_DUMP dependency on CONFIG_KEXEC. CONFIG_KEXEC_CORE
was used in places where CONFIG_CRASH_DUMP or CONFIG_CRASH_RESERVE was
more appropriate. Replace it with appropriate #ifdefs to support the
CONFIG_KEXEC && !CONFIG_CRASH_DUMP configuration. Also, make CONFIG_FA_DUMP
dependent on CONFIG_CRASH_DUMP to avoid unmet dependencies for FA_DUMP
with the !CONFIG_KEXEC_CORE configuration.

Signed-off-by: Hari Bathini 
---

Changes in v2:
* Fixed a compile error for POWERNV build reported by Sourabh.

 arch/powerpc/Kconfig |  9 +--
 arch/powerpc/include/asm/kexec.h | 98 ++--
 arch/powerpc/kernel/prom.c   |  2 +-
 arch/powerpc/kernel/setup-common.c   |  2 +-
 arch/powerpc/kernel/smp.c|  4 +-
 arch/powerpc/kexec/Makefile  |  3 +-
 arch/powerpc/kexec/core.c|  4 ++
 arch/powerpc/platforms/powernv/smp.c |  2 +-
 8 files changed, 61 insertions(+), 63 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5cf8ad8d7e8e..e377deefa2dc 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -607,11 +607,6 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
 config ARCH_SUPPORTS_KEXEC
def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP)
 
-config ARCH_SELECTS_KEXEC
-   def_bool y
-   depends on KEXEC
-   select CRASH_DUMP
-
 config ARCH_SUPPORTS_KEXEC_FILE
def_bool PPC64
 
@@ -622,7 +617,6 @@ config ARCH_SELECTS_KEXEC_FILE
def_bool y
depends on KEXEC_FILE
select KEXEC_ELF
-   select CRASH_DUMP
select HAVE_IMA_KEXEC if IMA
 
 config PPC64_BIG_ENDIAN_ELF_ABI_V2
@@ -694,8 +688,7 @@ config ARCH_SELECTS_CRASH_DUMP
 
 config FA_DUMP
bool "Firmware-assisted dump"
-   depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
-   select CRASH_DUMP
+   depends on CRASH_DUMP && PPC64 && (PPC_RTAS || PPC_POWERNV)
help
  A robust mechanism to get reliable kernel crash dump with
  assistance from firmware. This approach does not use kexec,
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index e1b43aa12175..fdb90e24dc74 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -55,59 +55,18 @@
 typedef void (*crash_shutdown_t)(void);
 
 #ifdef CONFIG_KEXEC_CORE
-
-/*
- * This function is responsible for capturing register states if coming
- * via panic or invoking dump using sysrq-trigger.
- */
-static inline void crash_setup_regs(struct pt_regs *newregs,
-   struct pt_regs *oldregs)
-{
-   if (oldregs)
-   memcpy(newregs, oldregs, sizeof(*newregs));
-   else
-   ppc_save_regs(newregs);
-}
+struct kimage;
+struct pt_regs;
 
 extern void kexec_smp_wait(void);  /* get and clear naca physid, wait for
  master to copy new code to 0 */
-extern int crashing_cpu;
-extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *));
-extern void crash_ipi_callback(struct pt_regs *);
-extern int crash_wake_offline;
-
-struct kimage;
-struct pt_regs;
 extern void default_machine_kexec(struct kimage *image);
-extern void default_machine_crash_shutdown(struct pt_regs *regs);
-extern int crash_shutdown_register(crash_shutdown_t handler);
-extern int crash_shutdown_unregister(crash_shutdown_t handler);
-
-extern void crash_kexec_prepare(void);
-extern void crash_kexec_secondary(struct pt_regs *regs);
-int __init overlaps_crashkernel(unsigned long start, unsigned long size);
-extern void reserve_crashkernel(void);
 extern void machine_kexec_mask_interrupts(void);
 
-static inline bool kdump_in_progress(void)
-{
-   return crashing_cpu >= 0;
-}
-
 void relocate_new_kernel(unsigned long indirection_page, unsigned long 
reboot_code_buffer,
 unsigned long start_address) __noreturn;
-
 void kexec_copy_flush(struct kimage *image);
 
-#if defined(CONFIG_CRASH_DUMP)
-bool is_kdump_kernel(void);
-#define is_kdump_kernelis_kdump_kernel
-#if defined(CONFIG_PPC_RTAS)
-void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
-#define crash_free_reserved_phys_range crash_free_reserved_phys_range
-#endif /* CONFIG_PPC_RTAS */
-#endif /* CONFIG_CRASH_DUMP */
-
 #ifdef CONFIG_KEXEC_FILE
 extern const struct kexec_file_ops kexec_elf64_ops;
 
@@ -152,15 +111,56 @@ int setup_new_fdt_ppc64(const struct kimage *image, void 
*fdt,
 
 #endif /* CONFIG_KEXEC_FILE */
 
-#else /* !CONFIG_KEXEC_CORE */
-static inline void crash_kexec_secondary(struct pt_regs *regs) { }
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_CRASH_RESERVE
+int __init overlaps_crashkernel(unsigned long start, unsigned long size);
+extern void reserve_crashkernel(void);
+#else
+static inline void reserve_crashkernel(void) {}
+static inline int overlaps_crashkernel(unsigned long start, unsigned long 
siz

[PATCH linux-next v2 0/3] powerpc/kexec: split CONFIG_CRASH_DUMP out from CONFIG_KEXEC_CORE

2024-02-26 Thread Hari Bathini
This patch series is a follow-up to [1] based on discussions at [2]
about additional work needed to get it working on powerpc.

The first patch in the series makes struct crash_mem available with or
without CONFIG_CRASH_DUMP enabled. The next patch moves kdump specific
code for kexec_file_load syscall under CONFIG_CRASH_DUMP and the last
patch splits other kdump specific code under CONFIG_CRASH_DUMP and
removes dependency with CONFIG_CRASH_DUMP for CONFIG_KEXEC_CORE.

[1] https://lore.kernel.org/all/20240124051254.67105-1-...@redhat.com/
[2] 
https://lore.kernel.org/all/9101bb07-70f1-476c-bec9-ec67e9899...@linux.ibm.com/

Changes in v2:
* Fixed a compile error for POWERNV build reported by Sourabh.

Hari Bathini (3):
  kexec/kdump: make struct crash_mem available without CONFIG_CRASH_DUMP
  powerpc/kexec: split CONFIG_KEXEC_FILE and CONFIG_CRASH_DUMP
  powerpc/kdump: Split KEXEC_CORE and CRASH_DUMP dependency

 arch/powerpc/Kconfig |   9 +-
 arch/powerpc/include/asm/kexec.h |  98 +-
 arch/powerpc/kernel/prom.c   |   2 +-
 arch/powerpc/kernel/setup-common.c   |   2 +-
 arch/powerpc/kernel/smp.c|   4 +-
 arch/powerpc/kexec/Makefile  |   3 +-
 arch/powerpc/kexec/core.c|   4 +
 arch/powerpc/kexec/elf_64.c  |   4 +-
 arch/powerpc/kexec/file_load_64.c| 269 ++-
 arch/powerpc/platforms/powernv/smp.c |   2 +-
 include/linux/crash_core.h   |  12 +-
 11 files changed, 209 insertions(+), 200 deletions(-)

-- 
2.43.2



[PATCH linux-next v2 1/3] kexec/kdump: make struct crash_mem available without CONFIG_CRASH_DUMP

2024-02-26 Thread Hari Bathini
struct crash_mem, defined under include/linux/crash_core.h, represents
a list of memory ranges. While it is used to represent memory ranges
for the kdump kernel, it can also be used for other kinds of memory
ranges. In fact, the KEXEC_FILE_LOAD syscall on powerpc uses this
structure to represent reserved memory ranges and exclude memory ranges
needed to find the right memory regions to load the kexec kernel. So,
make the definition of the crash_mem structure available for the
!CONFIG_CRASH_DUMP case too.

Signed-off-by: Hari Bathini 
---

* No changes in v2.

 include/linux/crash_core.h | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 23270b16e1db..d33352c2e386 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -8,6 +8,12 @@
 
 struct kimage;
 
+struct crash_mem {
+   unsigned int max_nr_ranges;
+   unsigned int nr_ranges;
+   struct range ranges[] __counted_by(max_nr_ranges);
+};
+
 #ifdef CONFIG_CRASH_DUMP
 
 int crash_shrink_memory(unsigned long new_size);
@@ -51,12 +57,6 @@ static inline unsigned int crash_get_elfcorehdr_size(void) { 
return 0; }
 /* Alignment required for elf header segment */
 #define ELF_CORE_HEADER_ALIGN   4096
 
-struct crash_mem {
-   unsigned int max_nr_ranges;
-   unsigned int nr_ranges;
-   struct range ranges[] __counted_by(max_nr_ranges);
-};
-
 extern int crash_exclude_mem_range(struct crash_mem *mem,
   unsigned long long mstart,
   unsigned long long mend);
-- 
2.43.2



Re: [PATCH linux-next 3/3] powerpc/kdump: Split KEXEC_CORE and CRASH_DUMP dependency

2024-02-25 Thread Hari Bathini




On 23/02/24 1:05 pm, Sourabh Jain wrote:

Hello Hari,


Hi Sourabh,



Build failure detected.


Thanks for trying out the patches.



On 13/02/24 17:01, Hari Bathini wrote:

Remove CONFIG_CRASH_DUMP dependency on CONFIG_KEXEC. CONFIG_KEXEC_CORE
was used at places where CONFIG_CRASH_DUMP or CONFIG_CRASH_RESERVE was
appropriate. Replace with appropriate #ifdefs to support CONFIG_KEXEC
and !CONFIG_CRASH_DUMP configuration option. Also, make CONFIG_FA_DUMP
dependent on CONFIG_CRASH_DUMP to avoid unmet dependencies for FA_DUMP
with !CONFIG_KEXEC_CORE configuration option.

Signed-off-by: Hari Bathini 
---
  arch/powerpc/Kconfig   |  9 +--
  arch/powerpc/include/asm/kexec.h   | 98 +++---
  arch/powerpc/kernel/prom.c |  2 +-
  arch/powerpc/kernel/setup-common.c |  2 +-
  arch/powerpc/kernel/smp.c  |  4 +-
  arch/powerpc/kexec/Makefile    |  3 +-
  arch/powerpc/kexec/core.c  |  4 ++
  7 files changed, 60 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5cf8ad8d7e8e..e377deefa2dc 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -607,11 +607,6 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
  config ARCH_SUPPORTS_KEXEC
  def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP)
-config ARCH_SELECTS_KEXEC
-    def_bool y
-    depends on KEXEC
-    select CRASH_DUMP
-
  config ARCH_SUPPORTS_KEXEC_FILE
  def_bool PPC64
@@ -622,7 +617,6 @@ config ARCH_SELECTS_KEXEC_FILE
  def_bool y
  depends on KEXEC_FILE
  select KEXEC_ELF
-    select CRASH_DUMP
  select HAVE_IMA_KEXEC if IMA
  config PPC64_BIG_ENDIAN_ELF_ABI_V2
@@ -694,8 +688,7 @@ config ARCH_SELECTS_CRASH_DUMP
  config FA_DUMP
  bool "Firmware-assisted dump"
-    depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
-    select CRASH_DUMP
+    depends on CRASH_DUMP && PPC64 && (PPC_RTAS || PPC_POWERNV)
  help
    A robust mechanism to get reliable kernel crash dump with
    assistance from firmware. This approach does not use kexec,
diff --git a/arch/powerpc/include/asm/kexec.h 
b/arch/powerpc/include/asm/kexec.h

index e1b43aa12175..fdb90e24dc74 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -55,59 +55,18 @@
  typedef void (*crash_shutdown_t)(void);
  #ifdef CONFIG_KEXEC_CORE
-
-/*
- * This function is responsible for capturing register states if coming
- * via panic or invoking dump using sysrq-trigger.
- */
-static inline void crash_setup_regs(struct pt_regs *newregs,
-    struct pt_regs *oldregs)
-{
-    if (oldregs)
-    memcpy(newregs, oldregs, sizeof(*newregs));
-    else
-    ppc_save_regs(newregs);
-}
+struct kimage;
+struct pt_regs;
  extern void kexec_smp_wait(void);    /* get and clear naca physid, 
wait for

    master to copy new code to 0 */
-extern int crashing_cpu;
-extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs 
*));

-extern void crash_ipi_callback(struct pt_regs *);
-extern int crash_wake_offline;
-
-struct kimage;
-struct pt_regs;
  extern void default_machine_kexec(struct kimage *image);
-extern void default_machine_crash_shutdown(struct pt_regs *regs);
-extern int crash_shutdown_register(crash_shutdown_t handler);
-extern int crash_shutdown_unregister(crash_shutdown_t handler);
-
-extern void crash_kexec_prepare(void);
-extern void crash_kexec_secondary(struct pt_regs *regs);
-int __init overlaps_crashkernel(unsigned long start, unsigned long 
size);

-extern void reserve_crashkernel(void);
  extern void machine_kexec_mask_interrupts(void);
-static inline bool kdump_in_progress(void)
-{
-    return crashing_cpu >= 0;
-}
-
  void relocate_new_kernel(unsigned long indirection_page, unsigned 
long reboot_code_buffer,

   unsigned long start_address) __noreturn;
-
  void kexec_copy_flush(struct kimage *image);
-#if defined(CONFIG_CRASH_DUMP)
-bool is_kdump_kernel(void);
-#define is_kdump_kernel    is_kdump_kernel
-#if defined(CONFIG_PPC_RTAS)
-void crash_free_reserved_phys_range(unsigned long begin, unsigned 
long end);

-#define crash_free_reserved_phys_range crash_free_reserved_phys_range
-#endif /* CONFIG_PPC_RTAS */
-#endif /* CONFIG_CRASH_DUMP */
-
  #ifdef CONFIG_KEXEC_FILE
  extern const struct kexec_file_ops kexec_elf64_ops;
@@ -152,15 +111,56 @@ int setup_new_fdt_ppc64(const struct kimage 
*image, void *fdt,

  #endif /* CONFIG_KEXEC_FILE */
-#else /* !CONFIG_KEXEC_CORE */
-static inline void crash_kexec_secondary(struct pt_regs *regs) { }
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_CRASH_RESERVE
+int __init overlaps_crashkernel(unsigned long start, unsigned long 
size);

+extern void reserve_crashkernel(void);
+#else
+static inline void reserve_crashkernel(void) {}
+static inline int overlaps_crashkernel(unsigned long start, unsigned 
long size) { return 0; }

+#endif
-static inline int overlaps_crashkernel(unsigned l

Re: [PATCH v2 00/14] Split crash out from kexec and clean up related config items

2024-02-22 Thread Hari Bathini




On 23/02/24 2:59 am, Andrew Morton wrote:

On Thu, 22 Feb 2024 10:47:29 +0530 Hari Bathini  wrote:




On 22/02/24 2:27 am, Andrew Morton wrote:

On Wed, 21 Feb 2024 11:15:00 +0530 Hari Bathini  wrote:


On 04/02/24 8:56 am, Baoquan He wrote:

Hope Hari and Pingfan can help have a look, see if
it's doable. Now, I make it either have both kexec and crash enabled, or
disable both of them altogether.


Sure. I will take a closer look...

Thanks a lot. Please feel free to post patches to make that, or I can do
it with your support or suggestion.


Tested your changes and on top of these changes, came up with the below
changes to get it working for powerpc:

   
https://lore.kernel.org/all/20240213113150.1148276-1-hbath...@linux.ibm.com/


So can we take it that you're OK with Baoquan's series as-is?


Hi Andrew,

If you mean

v3 (https://lore.kernel.org/all/20240124051254.67105-1-...@redhat.com/)
+
follow-up from Baoquan
(https://lore.kernel.org/all/Zb8D1ASrgX0qVm9z@MiWiFi-R3L-srv/)

Yes.



Can I add your Acked-by: and/or Tested-by: to the patches in this series?


Sure, Andrew.

Acked-by: Hari Bathini 

for..

Patches 1-5 & 8 in:

  https://lore.kernel.org/all/20240124051254.67105-1-...@redhat.com/

and this follow-up patch:

  https://lore.kernel.org/all/Zb8D1ASrgX0qVm9z@MiWiFi-R3L-srv/

Thanks
Hari


Re: [powerpc] Dump capture failure with recent linux-next

2024-02-21 Thread Hari Bathini

Hi Sachin,

On 22/02/24 10:55 am, Sachin Sant wrote:

Kdump fails to save the vmcore with recent linux-next builds on an IBM Power server
with the following messages:

  Starting Kdump Vmcore Save Service...
[ 17.349599] kdump[367]: Kdump is using the default log level(3).
[ 17.407407] kdump[391]: saving to 
/sysroot//var/crash//127.0.0.1-2024-02-21-15:03:55/
[ 17.441270] EXT4-fs (sda2): re-mounted 630dfb4e-74bd-45c4-a8de-232992bc8724 
r/w. Quota mode: none.
[ 17.04] kdump[395]: saving vmcore-dmesg.txt to 
/sysroot//var/crash//127.0.0.1-2024-02-21-15:03:55/
[ 17.464859] kdump[401]: saving vmcore-dmesg.txt complete
[ 17.466636] kdump[403]: saving vmcore
[ 17.551589] kdump.sh[404]:
Checking for memory holes : [ 0.0 %] /
Checking for memory holes : [100.0 %] | readpage_elf: Attempt to read 
non-existent page at 0xc.
[ 17.551718] kdump.sh[404]: readmem: type_addr: 0, addr:c00c, 
size:16384
[ 17.551793] kdump.sh[404]: __exclude_unnecessary_pages: Can't read the buffer 
of struct page.
[ 17.551864] kdump.sh[404]: create_2nd_bitmap: Can't exclude unnecessary pages.
[ 17.562632] kdump.sh[404]: The kernel version is not supported.
[ 17.562708] kdump.sh[404]: The makedumpfile operation may be incomplete.
[ 17.562773] kdump.sh[404]: makedumpfile Failed.
[ 17.564335] kdump[406]: saving vmcore failed, _exitcode:1
[ 17.566013] kdump[408]: saving the /run/initramfs/kexec-dmesg.log to 
/sysroot//var/crash//127.0.0.1-2024-02-21-15:03:55/
[ 17.583658] kdump[414]: saving vmcore failed

Git bisect points to following patch

commit 378eb24a0658dd922b29524e0ce35c6c43f56cba
 mm/vmalloc: remove vmap_area_list

Reverting this patch allows kdump to save vmcore file correctly.

Does this change require any corresponding changes to makedumpfile?


Right. The change intends for the tools to use VMALLOC_START, exported via
vmcoreinfo, instead of vmap_area_list. I don't see the corresponding
makedumpfile change submitted upstream yet, though.

Aditya, can you help with this..

- Hari


Re: [PATCH v2 00/14] Split crash out from kexec and clean up related config items

2024-02-21 Thread Hari Bathini




On 22/02/24 2:27 am, Andrew Morton wrote:

On Wed, 21 Feb 2024 11:15:00 +0530 Hari Bathini  wrote:


On 04/02/24 8:56 am, Baoquan He wrote:

Hope Hari and Pingfan can help have a look, see if
it's doable. Now, I make it either have both kexec and crash enabled, or
disable both of them altogether.


Sure. I will take a closer look...

Thanks a lot. Please feel free to post patches to make that, or I can do
it with your support or suggestion.


Tested your changes and on top of these changes, came up with the below
changes to get it working for powerpc:

  
https://lore.kernel.org/all/20240213113150.1148276-1-hbath...@linux.ibm.com/


So can we take it that you're OK with Baoquan's series as-is?


Hi Andrew,

If you mean

v3 (https://lore.kernel.org/all/20240124051254.67105-1-...@redhat.com/)
+
follow-up from Baoquan 
(https://lore.kernel.org/all/Zb8D1ASrgX0qVm9z@MiWiFi-R3L-srv/)


Yes.

My changes are based on top of the above patches..

Thanks
Hari


Re: [PATCH v2 00/14] Split crash out from kexec and clean up related config items

2024-02-20 Thread Hari Bathini

Hi Baoquan,

On 04/02/24 8:56 am, Baoquan He wrote:

Hope Hari and Pingfan can help have a look, see if
it's doable. Now, I make it either have both kexec and crash enabled, or
disable both of them altogether.


Sure. I will take a closer look...

Thanks a lot. Please feel free to post patches to make that, or I can do
it with your support or suggestion.


Tested your changes and on top of these changes, came up with the below
changes to get it working for powerpc:


https://lore.kernel.org/all/20240213113150.1148276-1-hbath...@linux.ibm.com/

Please take a look.

Thanks
Hari


Re: [PATCH v2 1/2] powerpc/bpf: ensure module addresses are supported

2024-02-15 Thread Hari Bathini




On 13/02/24 1:23 pm, Christophe Leroy wrote:



Le 01/02/2024 à 18:12, Hari Bathini a écrit :

Currently, bpf jit code on powerpc assumes all the bpf functions and
helpers to be kernel text. This is false for kfunc case, as function
addresses are mostly module addresses in that case. Ensure module
addresses are supported to enable kfunc support.

Assume kernel text address for programs with no kfunc call to optimize
instruction sequence in that case. Add a check to error out if this
assumption ever changes in the future.

Signed-off-by: Hari Bathini 
---

Changes in v2:
* Using bpf_prog_has_kfunc_call() to decide whether to use optimized
instruction sequence or not as suggested by Naveen.


   arch/powerpc/net/bpf_jit.h|   5 +-
   arch/powerpc/net/bpf_jit_comp.c   |   4 +-
   arch/powerpc/net/bpf_jit_comp32.c |   8 ++-
   arch/powerpc/net/bpf_jit_comp64.c | 109 --
   4 files changed, 97 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index cdea5dccaefe..fc56ee0ee9c5 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -160,10 +160,11 @@ static inline void bpf_clear_seen_register(struct 
codegen_context *ctx, int i)
   }
   
   void bpf_jit_init_reg_mapping(struct codegen_context *ctx);

-int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func);
+int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func,
+  bool has_kfunc_call);
   int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct 
codegen_context *ctx,
   u32 *addrs, int pass, bool extra_pass);
-void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
+void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx, bool 
has_kfunc_call);
   void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
   void bpf_jit_realloc_regs(struct codegen_context *ctx);
   int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int 
tmp_reg, long exit_addr);
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 0f9a21783329..7b4103b4c929 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -163,7 +163,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 * update ctgtx.idx as it pretends to output instructions, then we can
 * calculate total size from idx.
 */
-   bpf_jit_build_prologue(NULL, &cgctx);
+   bpf_jit_build_prologue(NULL, &cgctx, bpf_prog_has_kfunc_call(fp));
addrs[fp->len] = cgctx.idx * 4;
bpf_jit_build_epilogue(NULL, &cgctx);
   
@@ -192,7 +192,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)

/* Now build the prologue, body code & epilogue for real. */
cgctx.idx = 0;
cgctx.alt_exit_addr = 0;
-   bpf_jit_build_prologue(code_base, &cgctx);
+   bpf_jit_build_prologue(code_base, &cgctx, 
bpf_prog_has_kfunc_call(fp));
if (bpf_jit_build_body(fp, code_base, fcode_base, &cgctx, 
addrs, pass,
   extra_pass)) {
bpf_arch_text_copy(&fhdr->size, &hdr->size, 
sizeof(hdr->size));
diff --git a/arch/powerpc/net/bpf_jit_comp32.c 
b/arch/powerpc/net/bpf_jit_comp32.c
index 2f39c50ca729..447747e51a58 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -123,7 +123,7 @@ void bpf_jit_realloc_regs(struct codegen_context *ctx)
}
   }
   
-void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)

+void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx, bool 
has_kfunc_call)
   {
int i;
   
@@ -201,7 +201,8 @@ void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)

   }
   
   /* Relative offset needs to be calculated based on final image location */

-int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func)
+int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func,
+  bool has_kfunc_call)
   {
s32 rel = (s32)func - (s32)(fimage + ctx->idx);
   
@@ -1054,7 +1055,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code

EMIT(PPC_RAW_STW(bpf_to_ppc(BPF_REG_5), _R1, 
12));
}
   
-			ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, func_addr);

+   ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, 
func_addr,
+
bpf_prog_has_kfunc_call(fp));
if (ret)
return ret;
   
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net

Re: [PATCH v2 2/2] powerpc/bpf: enable kfunc call

2024-02-15 Thread Hari Bathini




On 13/02/24 1:24 pm, Christophe Leroy wrote:



Le 01/02/2024 à 18:12, Hari Bathini a écrit :

With module addresses supported, override bpf_jit_supports_kfunc_call()
to enable kfunc support. Module address offsets can be more than 32-bit
long, so override bpf_jit_supports_far_kfunc_call() to enable 64-bit
pointers.


What's the impact on PPC32 ? There are no 64-bit pointers on PPC32.


Yeah. It is not required to return true for the PPC32 case, and it is
probably not a good idea to claim support for far kfunc calls on PPC32.
Changing to:

+bool bpf_jit_supports_far_kfunc_call(void)
+{
+   return IS_ENABLED(CONFIG_PPC64);
+}



Signed-off-by: Hari Bathini 
---

* No changes since v1.


   arch/powerpc/net/bpf_jit_comp.c | 10 ++
   1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 7b4103b4c929..f896a4213696 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -359,3 +359,13 @@ void bpf_jit_free(struct bpf_prog *fp)
   
   	bpf_prog_unlock_free(fp);

   }
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+   return true;
+}
+
+bool bpf_jit_supports_far_kfunc_call(void)
+{
+   return true;
+}


[PATCH linux-next 3/3] powerpc/kdump: Split KEXEC_CORE and CRASH_DUMP dependency

2024-02-13 Thread Hari Bathini
Remove the CONFIG_CRASH_DUMP dependency on CONFIG_KEXEC. CONFIG_KEXEC_CORE
was used in places where CONFIG_CRASH_DUMP or CONFIG_CRASH_RESERVE was
appropriate. Replace those with the appropriate #ifdefs to support the
CONFIG_KEXEC && !CONFIG_CRASH_DUMP configuration. Also, make CONFIG_FA_DUMP
depend on CONFIG_CRASH_DUMP to avoid unmet dependencies for FA_DUMP
with !CONFIG_KEXEC_CORE.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/Kconfig   |  9 +--
 arch/powerpc/include/asm/kexec.h   | 98 +++---
 arch/powerpc/kernel/prom.c |  2 +-
 arch/powerpc/kernel/setup-common.c |  2 +-
 arch/powerpc/kernel/smp.c  |  4 +-
 arch/powerpc/kexec/Makefile|  3 +-
 arch/powerpc/kexec/core.c  |  4 ++
 7 files changed, 60 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5cf8ad8d7e8e..e377deefa2dc 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -607,11 +607,6 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
 config ARCH_SUPPORTS_KEXEC
def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP)
 
-config ARCH_SELECTS_KEXEC
-   def_bool y
-   depends on KEXEC
-   select CRASH_DUMP
-
 config ARCH_SUPPORTS_KEXEC_FILE
def_bool PPC64
 
@@ -622,7 +617,6 @@ config ARCH_SELECTS_KEXEC_FILE
def_bool y
depends on KEXEC_FILE
select KEXEC_ELF
-   select CRASH_DUMP
select HAVE_IMA_KEXEC if IMA
 
 config PPC64_BIG_ENDIAN_ELF_ABI_V2
@@ -694,8 +688,7 @@ config ARCH_SELECTS_CRASH_DUMP
 
 config FA_DUMP
bool "Firmware-assisted dump"
-   depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
-   select CRASH_DUMP
+   depends on CRASH_DUMP && PPC64 && (PPC_RTAS || PPC_POWERNV)
help
  A robust mechanism to get reliable kernel crash dump with
  assistance from firmware. This approach does not use kexec,
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index e1b43aa12175..fdb90e24dc74 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -55,59 +55,18 @@
 typedef void (*crash_shutdown_t)(void);
 
 #ifdef CONFIG_KEXEC_CORE
-
-/*
- * This function is responsible for capturing register states if coming
- * via panic or invoking dump using sysrq-trigger.
- */
-static inline void crash_setup_regs(struct pt_regs *newregs,
-   struct pt_regs *oldregs)
-{
-   if (oldregs)
-   memcpy(newregs, oldregs, sizeof(*newregs));
-   else
-   ppc_save_regs(newregs);
-}
+struct kimage;
+struct pt_regs;
 
 extern void kexec_smp_wait(void);  /* get and clear naca physid, wait for
  master to copy new code to 0 */
-extern int crashing_cpu;
-extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *));
-extern void crash_ipi_callback(struct pt_regs *);
-extern int crash_wake_offline;
-
-struct kimage;
-struct pt_regs;
 extern void default_machine_kexec(struct kimage *image);
-extern void default_machine_crash_shutdown(struct pt_regs *regs);
-extern int crash_shutdown_register(crash_shutdown_t handler);
-extern int crash_shutdown_unregister(crash_shutdown_t handler);
-
-extern void crash_kexec_prepare(void);
-extern void crash_kexec_secondary(struct pt_regs *regs);
-int __init overlaps_crashkernel(unsigned long start, unsigned long size);
-extern void reserve_crashkernel(void);
 extern void machine_kexec_mask_interrupts(void);
 
-static inline bool kdump_in_progress(void)
-{
-   return crashing_cpu >= 0;
-}
-
 void relocate_new_kernel(unsigned long indirection_page, unsigned long 
reboot_code_buffer,
 unsigned long start_address) __noreturn;
-
 void kexec_copy_flush(struct kimage *image);
 
-#if defined(CONFIG_CRASH_DUMP)
-bool is_kdump_kernel(void);
-#define is_kdump_kernelis_kdump_kernel
-#if defined(CONFIG_PPC_RTAS)
-void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
-#define crash_free_reserved_phys_range crash_free_reserved_phys_range
-#endif /* CONFIG_PPC_RTAS */
-#endif /* CONFIG_CRASH_DUMP */
-
 #ifdef CONFIG_KEXEC_FILE
 extern const struct kexec_file_ops kexec_elf64_ops;
 
@@ -152,15 +111,56 @@ int setup_new_fdt_ppc64(const struct kimage *image, void 
*fdt,
 
 #endif /* CONFIG_KEXEC_FILE */
 
-#else /* !CONFIG_KEXEC_CORE */
-static inline void crash_kexec_secondary(struct pt_regs *regs) { }
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_CRASH_RESERVE
+int __init overlaps_crashkernel(unsigned long start, unsigned long size);
+extern void reserve_crashkernel(void);
+#else
+static inline void reserve_crashkernel(void) {}
+static inline int overlaps_crashkernel(unsigned long start, unsigned long 
size) { return 0; }
+#endif
 
-static inline int overlaps_crashkernel(unsigned long start, unsigned long size)
+#if defined(CONFIG_CRASH_DUMP)
+/*
+

[PATCH linux-next 2/3] powerpc/kexec: split CONFIG_KEXEC_FILE and CONFIG_CRASH_DUMP

2024-02-13 Thread Hari Bathini
CONFIG_KEXEC_FILE does not have to select CONFIG_CRASH_DUMP. Move
some code under CONFIG_CRASH_DUMP to support the CONFIG_KEXEC_FILE &&
!CONFIG_CRASH_DUMP case.
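
To illustrate the guard pattern this relies on (a sketch only, not code
taken from the patch; example_load() is a made-up wrapper), the kdump-only
branches stay compiled but are discarded by the compiler when
CONFIG_CRASH_DUMP is off:

static int example_load(struct kimage *image, struct kexec_buf *kbuf)
{
	/*
	 * Sketch only: the branch is always compiled (so it cannot
	 * bit-rot), but it is dead-code eliminated with !CONFIG_CRASH_DUMP.
	 */
	if (IS_ENABLED(CONFIG_CRASH_DUMP) && image->type == KEXEC_TYPE_CRASH)
		return load_crashdump_segments_ppc64(image, kbuf);

	return 0;
}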

Signed-off-by: Hari Bathini 
---
 arch/powerpc/kexec/elf_64.c   |   4 +-
 arch/powerpc/kexec/file_load_64.c | 269 --
 2 files changed, 142 insertions(+), 131 deletions(-)

diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index 904016cf89ea..6d8951e8e966 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -47,7 +47,7 @@ static void *elf64_load(struct kimage *image, char 
*kernel_buf,
if (ret)
return ERR_PTR(ret);
 
-   if (image->type == KEXEC_TYPE_CRASH) {
+   if (IS_ENABLED(CONFIG_CRASH_DUMP) && image->type == KEXEC_TYPE_CRASH) {
/* min & max buffer values for kdump case */
kbuf.buf_min = pbuf.buf_min = crashk_res.start;
kbuf.buf_max = pbuf.buf_max =
@@ -70,7 +70,7 @@ static void *elf64_load(struct kimage *image, char 
*kernel_buf,
kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem);
 
/* Load additional segments needed for panic kernel */
-   if (image->type == KEXEC_TYPE_CRASH) {
+   if (IS_ENABLED(CONFIG_CRASH_DUMP) && image->type == KEXEC_TYPE_CRASH) {
ret = load_crashdump_segments_ppc64(image, &kbuf);
if (ret) {
pr_err("Failed to load kdump kernel segments\n");
diff --git a/arch/powerpc/kexec/file_load_64.c 
b/arch/powerpc/kexec/file_load_64.c
index 5b4c5cb23354..1bc65de6174f 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -96,119 +96,6 @@ static int get_exclude_memory_ranges(struct crash_mem 
**mem_ranges)
return ret;
 }
 
-/**
- * get_usable_memory_ranges - Get usable memory ranges. This list includes
- *regions like crashkernel, opal/rtas & tce-table,
- *that kdump kernel could use.
- * @mem_ranges:   Range list to add the memory ranges to.
- *
- * Returns 0 on success, negative errno on error.
- */
-static int get_usable_memory_ranges(struct crash_mem **mem_ranges)
-{
-   int ret;
-
-   /*
-* Early boot failure observed on guests when low memory (first memory
-* block?) is not added to usable memory. So, add [0, crashk_res.end]
-* instead of [crashk_res.start, crashk_res.end] to workaround it.
-* Also, crashed kernel's memory must be added to reserve map to
-* avoid kdump kernel from using it.
-*/
-   ret = add_mem_range(mem_ranges, 0, crashk_res.end + 1);
-   if (ret)
-   goto out;
-
-   ret = add_rtas_mem_range(mem_ranges);
-   if (ret)
-   goto out;
-
-   ret = add_opal_mem_range(mem_ranges);
-   if (ret)
-   goto out;
-
-   ret = add_tce_mem_ranges(mem_ranges);
-out:
-   if (ret)
-   pr_err("Failed to setup usable memory ranges\n");
-   return ret;
-}
-
-/**
- * get_crash_memory_ranges - Get crash memory ranges. This list includes
- *   first/crashing kernel's memory regions that
- *   would be exported via an elfcore.
- * @mem_ranges:  Range list to add the memory ranges to.
- *
- * Returns 0 on success, negative errno on error.
- */
-static int get_crash_memory_ranges(struct crash_mem **mem_ranges)
-{
-   phys_addr_t base, end;
-   struct crash_mem *tmem;
-   u64 i;
-   int ret;
-
-   for_each_mem_range(i, &base, &end) {
-   u64 size = end - base;
-
-   /* Skip backup memory region, which needs a separate entry */
-   if (base == BACKUP_SRC_START) {
-   if (size > BACKUP_SRC_SIZE) {
-   base = BACKUP_SRC_END + 1;
-   size -= BACKUP_SRC_SIZE;
-   } else
-   continue;
-   }
-
-   ret = add_mem_range(mem_ranges, base, size);
-   if (ret)
-   goto out;
-
-   /* Try merging adjacent ranges before reallocation attempt */
-   if ((*mem_ranges)->nr_ranges == (*mem_ranges)->max_nr_ranges)
-   sort_memory_ranges(*mem_ranges, true);
-   }
-
-   /* Reallocate memory ranges if there is no space to split ranges */
-   tmem = *mem_ranges;
-   if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) {
-   tmem = realloc_mem_ranges(mem_ranges);
-   if (!tmem)
-   goto out;
-   }
-
-   /* Exclude crashkernel region */
-   ret = crash_exclude_mem_range(tmem, crashk_res.start, crashk_res.end);
-   if (ret)
-   goto out;
-
-   /*
-  

[PATCH linux-next 1/3] kexec/kdump: make struct crash_mem available without CONFIG_CRASH_DUMP

2024-02-13 Thread Hari Bathini
struct crash_mem defined under include/linux/crash_core.h represents
a list of memory ranges. While it is used to represent memory ranges
for the kdump kernel, it can also be used for other kinds of memory
ranges. In fact, the KEXEC_FILE_LOAD syscall on powerpc uses this
structure to represent reserved memory ranges and exclude memory
ranges, which are needed to find the right memory regions to load the
kexec kernel. So, make the definition of the crash_mem structure
available for the !CONFIG_CRASH_DUMP case too.

Signed-off-by: Hari Bathini 
---
 include/linux/crash_core.h | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 23270b16e1db..d33352c2e386 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -8,6 +8,12 @@
 
 struct kimage;
 
+struct crash_mem {
+   unsigned int max_nr_ranges;
+   unsigned int nr_ranges;
+   struct range ranges[] __counted_by(max_nr_ranges);
+};
+
 #ifdef CONFIG_CRASH_DUMP
 
 int crash_shrink_memory(unsigned long new_size);
@@ -51,12 +57,6 @@ static inline unsigned int crash_get_elfcorehdr_size(void) { 
return 0; }
 /* Alignment required for elf header segment */
 #define ELF_CORE_HEADER_ALIGN   4096
 
-struct crash_mem {
-   unsigned int max_nr_ranges;
-   unsigned int nr_ranges;
-   struct range ranges[] __counted_by(max_nr_ranges);
-};
-
 extern int crash_exclude_mem_range(struct crash_mem *mem,
   unsigned long long mstart,
   unsigned long long mend);
-- 
2.43.0



[PATCH linux-next 0/3] powerpc/kexec: split CONFIG_CRASH_DUMP out from CONFIG_KEXEC_CORE

2024-02-13 Thread Hari Bathini
This patch series is a follow-up to [1] based on discussions at [2]
about additional work needed to get it working on powerpc.

The first patch in the series makes struct crash_mem available with or
without CONFIG_CRASH_DUMP enabled. The next patch moves kdump-specific
code for the kexec_file_load syscall under CONFIG_CRASH_DUMP, and the last
patch splits the remaining kdump-specific code under CONFIG_CRASH_DUMP and
removes CONFIG_KEXEC_CORE's dependency on CONFIG_CRASH_DUMP.

[1] https://lore.kernel.org/all/20240124051254.67105-1-...@redhat.com/
[2] 
https://lore.kernel.org/all/9101bb07-70f1-476c-bec9-ec67e9899...@linux.ibm.com/


Hari Bathini (3):
  kexec/kdump: make struct crash_mem available without CONFIG_CRASH_DUMP
  powerpc/kexec: split CONFIG_KEXEC_FILE and CONFIG_CRASH_DUMP
  powerpc/kdump: Split KEXEC_CORE and CRASH_DUMP dependency

 arch/powerpc/Kconfig   |   9 +-
 arch/powerpc/include/asm/kexec.h   |  98 +--
 arch/powerpc/kernel/prom.c |   2 +-
 arch/powerpc/kernel/setup-common.c |   2 +-
 arch/powerpc/kernel/smp.c  |   4 +-
 arch/powerpc/kexec/Makefile|   3 +-
 arch/powerpc/kexec/core.c  |   4 +
 arch/powerpc/kexec/elf_64.c|   4 +-
 arch/powerpc/kexec/file_load_64.c  | 269 +++--
 include/linux/crash_core.h |  12 +-
 10 files changed, 208 insertions(+), 199 deletions(-)

-- 
2.43.0



Re: [PATCH v2 00/14] Split crash out from kexec and clean up related config items

2024-02-01 Thread Hari Bathini

Hi Baoquan,

On 19/01/24 8:22 pm, Baoquan He wrote:

Motivation:
=
Previously, LKP reported a build error. When investigating, it turned out
that it can't be resolved reasonably with the present messy kdump config items.

  https://lore.kernel.org/oe-kbuild-all/202312182200.ka7mzifq-...@intel.com/

The kdump (crash dumping) related config items can cause confusion:

Firstly,
---
CRASH_CORE enables codes including
  - crashkernel reservation;
  - elfcorehdr updating;
  - vmcoreinfo exporting;
  - crash hotplug handling;

Now fadump on powerpc, kcore dynamic debugging and kdump all select
CRASH_CORE, while
  - fadump needs crashkernel parsing, vmcoreinfo exporting, and access to
    the global variable 'elfcorehdr_addr';
  - kcore only needs vmcoreinfo exporting;
  - kdump needs all of the current kernel/crash_core.c.

So enabling only PROC_KCORE or FA_DUMP will enable CRASH_CORE, which
misleads people into thinking crash dumping is enabled, when actually it
is not.

Secondly,
---
It's not reasonable to allow KEXEC_CORE to select CRASH_CORE.

Because KEXEC_CORE enables code which allocates control pages, copies
kexec/kdump segments, and prepares for switching. This code is shared
by both kexec reboot and kdump. We could want kexec reboot, but disable
kdump. In that case, CRASH_CORE should not be selected.

  
  CONFIG_CRASH_CORE=y
  CONFIG_KEXEC_CORE=y
  CONFIG_KEXEC=y
  CONFIG_KEXEC_FILE=y
 -

Thirdly,
---
It's not reasonable to allow CRASH_DUMP to select KEXEC_CORE.

That could make KEXEC_CORE and CRASH_DUMP be enabled independently of
KEXEC or KEXEC_FILE. However, w/o KEXEC or KEXEC_FILE, the built-in
KEXEC_CORE code doesn't make any sense because no kernel loading or
switching will happen to utilize it.
  -
  CONFIG_CRASH_CORE=y
  CONFIG_KEXEC_CORE=y
  CONFIG_CRASH_DUMP=y
  -

What is worse, in this case, on arch sh and arm, KEXEC relies on MMU,
while CRASH_DUMP can still be enabled with !MMU; a compile error is then
seen, as the lkp test robot reported in the above link.

  --arch/sh/Kconfig--
  config ARCH_SUPPORTS_KEXEC
  def_bool MMU

  config ARCH_SUPPORTS_CRASH_DUMP
  def_bool BROKEN_ON_SMP
  ---

Changes:
===
1, split out crash_reserve.c from crash_core.c;
2, split out vmcore_info.c from crash_core.c;
3, move crash related codes in kexec_core.c into crash_core.c;
4, remove dependency of FA_DUMP on CRASH_DUMP;
5, clean up kdump related config items;
6, wrap up crash code in crash-related ifdefs on all 9 arches
which support crash dumping;

Achievement:
===
With above changes, I can rearrange the config item logic as below (the right
item depends on or is selected by the left item):

 PROC_KCORE -------------> VMCORE_INFO

                      |--> VMCORE_INFO
 FA_DUMP ------------>|
                      |--> CRASH_RESERVE


FA_DUMP also needs PROC_VMCORE (CRASH_DUMP by dependency, I guess).
So, the FA_DUMP related changes here will need a relook..



                                                   |--> VMCORE_INFO
                                                   |--> CRASH_RESERVE
 KEXEC      --|                                    |
              |--> KEXEC_CORE --> CRASH_DUMP ----->|--> PROC_VMCORE
 KEXEC_FILE --|                                    |
                                                   |--> CRASH_HOTPLUG


 KEXEC      --|
              |--> KEXEC_CORE (for kexec reboot only)
 KEXEC_FILE --|

Test

On all 8 architectures, including x86_64, arm64, s390x, sh, arm, mips,
riscv and loongarch, I tried the below three cases of config item settings
and all builds passed. Let me take the configs on x86_64 as an example here:

(1) Both CONFIG_KEXEC and KEXEC_FILE are unset; then all kexec/kdump
items are unset automatically:
# Kexec and crash features
# CONFIG_KEXEC is not set
# CONFIG_KEXEC_FILE is not set
# end of Kexec and crash features

(2) set CONFIG_KEXEC_FILE and 'make olddefconfig':
---
# Kexec and crash features
CONFIG_CRASH_RESERVE=y
CONFIG_VMCORE_INFO=y
CONFIG_KEXEC_CORE=y
CONFIG_KEXEC_FILE=y
CONFIG_CRASH_DUMP=y
CONFIG_CRASH_HOTPLUG=y
CONFIG_CRASH_MAX_MEMORY_RANGES=8192
# end of Kexec and crash features
---

(3) unset CONFIG_CRASH_DUMP in case 2 and execute 'make olddefconfig':

# Kexec and crash features
CONFIG_KEXEC_CORE=y
CONFIG_KEXEC_FILE=y
# end of Kexec and crash features


Note:
For ppc, it needs investigation to make clear how to split out crash
code in arch folder.


On powerpc, both kdump and fadump need PROC_VMCORE & CRASH_DUMP.
Hope that clears things. So, patch 3/14 breaks things for FA_DUMP..


Hope Hari and Pingfan can help have a look, see if
it's doable. Now, I make it either have both kexec and crash enabled, or
disable both of them altogether.



Sure. I will take a closer l

[PATCH v2 2/2] powerpc/bpf: enable kfunc call

2024-02-01 Thread Hari Bathini
With module addresses supported, override bpf_jit_supports_kfunc_call()
to enable kfunc support. Module address offsets can be more than 32-bit
long, so override bpf_jit_supports_far_kfunc_call() to enable 64-bit
pointers.

Signed-off-by: Hari Bathini 
---

* No changes since v1.


 arch/powerpc/net/bpf_jit_comp.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 7b4103b4c929..f896a4213696 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -359,3 +359,13 @@ void bpf_jit_free(struct bpf_prog *fp)
 
bpf_prog_unlock_free(fp);
 }
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+   return true;
+}
+
+bool bpf_jit_supports_far_kfunc_call(void)
+{
+   return true;
+}
-- 
2.43.0



[PATCH v2 1/2] powerpc/bpf: ensure module addresses are supported

2024-02-01 Thread Hari Bathini
Currently, bpf jit code on powerpc assumes all the bpf functions and
helpers to be kernel text. This is false for kfunc case, as function
addresses are mostly module addresses in that case. Ensure module
addresses are supported to enable kfunc support.

Assume kernel text address for programs with no kfunc call to optimize
instruction sequence in that case. Add a check to error out if this
assumption ever changes in the future.
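
The intended call-site logic is roughly the sketch below (illustrative
only; emit_kernel_text_call() and emit_module_addr_call() are hypothetical
names standing in for the short and the full 64-bit instruction
sequences):

/* Sketch only: choose the call sequence based on kfunc usage. */
static int emit_call(u32 *image, struct codegen_context *ctx, u64 func,
		     bool has_kfunc_call)
{
	if (!has_kfunc_call) {
		/* No kfunc calls: every callee must be kernel text. */
		if (WARN_ON_ONCE(!core_kernel_text((unsigned long)func)))
			return -EINVAL;
		return emit_kernel_text_call(image, ctx, func);
	}

	/* kfuncs may live in modules: emit the full 64-bit address load. */
	return emit_module_addr_call(image, ctx, func);
}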

Signed-off-by: Hari Bathini 
---

Changes in v2:
* Using bpf_prog_has_kfunc_call() to decide whether to use optimized
  instruction sequence or not as suggested by Naveen.


 arch/powerpc/net/bpf_jit.h|   5 +-
 arch/powerpc/net/bpf_jit_comp.c   |   4 +-
 arch/powerpc/net/bpf_jit_comp32.c |   8 ++-
 arch/powerpc/net/bpf_jit_comp64.c | 109 --
 4 files changed, 97 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index cdea5dccaefe..fc56ee0ee9c5 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -160,10 +160,11 @@ static inline void bpf_clear_seen_register(struct 
codegen_context *ctx, int i)
 }
 
 void bpf_jit_init_reg_mapping(struct codegen_context *ctx);
-int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func);
+int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func,
+  bool has_kfunc_call);
 int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct 
codegen_context *ctx,
   u32 *addrs, int pass, bool extra_pass);
-void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
+void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx, bool 
has_kfunc_call);
 void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
 void bpf_jit_realloc_regs(struct codegen_context *ctx);
 int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int 
tmp_reg, long exit_addr);
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 0f9a21783329..7b4103b4c929 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -163,7 +163,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 * update ctgtx.idx as it pretends to output instructions, then we can
 * calculate total size from idx.
 */
-   bpf_jit_build_prologue(NULL, &cgctx);
+   bpf_jit_build_prologue(NULL, &cgctx, bpf_prog_has_kfunc_call(fp));
addrs[fp->len] = cgctx.idx * 4;
bpf_jit_build_epilogue(NULL, &cgctx);
 
@@ -192,7 +192,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
/* Now build the prologue, body code & epilogue for real. */
cgctx.idx = 0;
cgctx.alt_exit_addr = 0;
-   bpf_jit_build_prologue(code_base, &cgctx);
+   bpf_jit_build_prologue(code_base, &cgctx, 
bpf_prog_has_kfunc_call(fp));
if (bpf_jit_build_body(fp, code_base, fcode_base, &cgctx, 
addrs, pass,
   extra_pass)) {
bpf_arch_text_copy(&fhdr->size, &hdr->size, 
sizeof(hdr->size));
diff --git a/arch/powerpc/net/bpf_jit_comp32.c 
b/arch/powerpc/net/bpf_jit_comp32.c
index 2f39c50ca729..447747e51a58 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -123,7 +123,7 @@ void bpf_jit_realloc_regs(struct codegen_context *ctx)
}
 }
 
-void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
+void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx, bool 
has_kfunc_call)
 {
int i;
 
@@ -201,7 +201,8 @@ void bpf_jit_build_epilogue(u32 *image, struct 
codegen_context *ctx)
 }
 
 /* Relative offset needs to be calculated based on final image location */
-int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func)
+int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func,
+  bool has_kfunc_call)
 {
s32 rel = (s32)func - (s32)(fimage + ctx->idx);
 
@@ -1054,7 +1055,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
u32 *fimage, struct code
EMIT(PPC_RAW_STW(bpf_to_ppc(BPF_REG_5), _R1, 
12));
}
 
-   ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, 
func_addr);
+   ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, 
func_addr,
+
bpf_prog_has_kfunc_call(fp));
if (ret)
return ret;
 
diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 79f23974a320..385a5df1670c 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powe

Re: [PATCH v15 5/5] powerpc: add crash memory hotplug support

2024-01-23 Thread Hari Bathini




On 11/01/24 4:21 pm, Sourabh Jain wrote:

Extend the arch crash hotplug handler, as introduced by the patch title
("powerpc: add crash CPU hotplug support"), to also support memory
add/remove events.

The elfcorehdr describes the memory of the crashed kernel for the dump
capture kernel; hence, it needs to be updated if memory resources change
due to memory add/remove events. Therefore, arch_crash_handle_hotplug_event()
is updated to recreate the elfcorehdr and replace the previous one with
it on memory add/remove events.

The memblock list is used to prepare the elfcorehdr. In the case of
memory hot removal, the memblock list is updated after the arch crash
hotplug handler is triggered, as depicted in Figure 1. Thus, the
hot-removed memory is explicitly removed from the crash memory ranges
to ensure that the memory ranges added to elfcorehdr do not include the
hot-removed memory.

 Memory remove
   |
   v
 Offline pages
   |
   v
  Initiate memory notify call <> crash hotplug handler
  chain for MEM_OFFLINE event
   |
   v
  Update memblock list

Figure 1

There are two system calls, `kexec_file_load` and `kexec_load`, used to
load the kdump image. A few changes have been made to ensure that the
kernel can safely update the elfcorehdr component of the kdump image for
both system calls.

For the kexec_file_load syscall, the kdump image is prepared in the kernel.
To support an increasing number of memory regions, the elfcorehdr is
built with extra buffer space to ensure that it can accommodate
additional memory ranges in future.

For the kexec_load syscall, the elfcorehdr is updated only if the
KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag is passed to the kernel by the
kexec tool. Passing this flag to the kernel indicates that the
elfcorehdr is built to accommodate additional memory ranges and the
elfcorehdr segment is not considered for SHA calculation, making it safe
to update.

The changes related to this feature are kept under the CRASH_HOTPLUG
config, and it is enabled by default.
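
The handler follows the general shape below (a rough sketch, not the
patch itself; prepare_elfcorehdr() is a hypothetical helper that rebuilds
the header from the current memory ranges into a temporary buffer):

/* Sketch only: replace the elfcorehdr of the loaded kdump image. */
void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
{
	unsigned long sz;
	void *elfbuf;

	/* Recreate the elf core header from the current memory ranges. */
	if (prepare_elfcorehdr(&elfbuf, &sz))
		return;

	/* Swap the new header into the already-loaded kdump image. */
	memcpy(__va(image->elf_load_addr), elfbuf, sz);
	vfree(elfbuf);
}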

Signed-off-by: Sourabh Jain 
Cc: Akhil Raj 
Cc: Andrew Morton 
Cc: Aneesh Kumar K.V 
Cc: Baoquan He 
Cc: Borislav Petkov (AMD) 
Cc: Boris Ostrovsky 
Cc: Christophe Leroy 
Cc: Dave Hansen 
Cc: Dave Young 
Cc: David Hildenbrand 
Cc: Greg Kroah-Hartman 
Cc: Hari Bathini 
Cc: Laurent Dufour 
Cc: Mahesh Salgaonkar 
Cc: Michael Ellerman 
Cc: Mimi Zohar 
Cc: Naveen N Rao 
Cc: Oscar Salvador 
Cc: Thomas Gleixner 
Cc: Valentin Schneider 
Cc: Vivek Goyal 
Cc: ke...@lists.infradead.org
Cc: x...@kernel.org
---
  arch/powerpc/include/asm/kexec.h|   5 +-
  arch/powerpc/include/asm/kexec_ranges.h |   1 +
  arch/powerpc/kexec/core_64.c| 107 +++-
  arch/powerpc/kexec/file_load_64.c   |  34 +++-
  arch/powerpc/kexec/ranges.c |  85 +++
  5 files changed, 225 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 943e58eb9bff..25ff5b7f1a28 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -116,8 +116,11 @@ int get_crash_memory_ranges(struct crash_mem **mem_ranges);
  #ifdef CONFIG_CRASH_HOTPLUG
  void arch_crash_handle_hotplug_event(struct kimage *image, void *arg);
  #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
-#endif /*CONFIG_CRASH_HOTPLUG */
  
+unsigned int arch_crash_get_elfcorehdr_size(void);

+#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size
+
+#endif /*CONFIG_CRASH_HOTPLUG */
  #endif /* CONFIG_PPC64 */
  
  #ifdef CONFIG_KEXEC_FILE

diff --git a/arch/powerpc/include/asm/kexec_ranges.h 
b/arch/powerpc/include/asm/kexec_ranges.h
index f83866a19e87..802abf580cf0 100644
--- a/arch/powerpc/include/asm/kexec_ranges.h
+++ b/arch/powerpc/include/asm/kexec_ranges.h
@@ -7,6 +7,7 @@
  void sort_memory_ranges(struct crash_mem *mrngs, bool merge);
  struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges);
  int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size);
+int remove_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size);
  int add_tce_mem_ranges(struct crash_mem **mem_ranges);
  int add_initrd_mem_range(struct crash_mem **mem_ranges);
  #ifdef CONFIG_PPC_64S_HASH_MMU
diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c
index 43fcd78c2102..4673f150f973 100644
--- a/arch/powerpc/kexec/core_64.c
+++ b/arch/powerpc/kexec/core_64.c
@@ -19,8 +19,11 @@
  #include 
  #include 
  #include 
+#include 
  
  #include 

+#include 
+#include 
  #include 
  #include 
  #include 
@@ -546,6 +549,101 @@ int update_cpus_node(void *fdt)
  #undef pr_fmt
  #define pr_fmt(fmt) "crash hp: " fmt
  
+/*

+ * Advertise preferred elfcorehdr size to userspace via
+ * /sys/kernel/crash_elfcorehdr_size sysfs interface.
+ */
+unsigned int arch_crash_get_elfcorehdr_size(void)
+{
+   unsigned int sz;
+   

Re: [PATCH v15 4/5] powerpc: add crash CPU hotplug support

2024-01-23 Thread Hari Bathini




On 11/01/24 4:21 pm, Sourabh Jain wrote:

Due to CPU/Memory hotplug or online/offline events, the elfcorehdr
(which describes the CPUs and memory of the crashed kernel) and FDT
(Flattened Device Tree) of kdump image becomes outdated. Consequently,
attempting dump collection with an outdated elfcorehdr or FDT can lead
to failed or inaccurate dump collection.

Going forward, CPU hotplug or online/offline events are referred as
CPU/Memory add/remove events.

The current solution to address the above issue involves monitoring the
CPU/Memory add/remove events in userspace using udev rules and whenever
there are changes in CPU and memory resources, the entire kdump image
is loaded again. The kdump image includes kernel, initrd, elfcorehdr,
FDT, purgatory. Given that only elfcorehdr and FDT get outdated due to
CPU/Memory add/remove events, reloading the entire kdump image is
inefficient. More importantly, kdump remains inactive for a substantial
amount of time until the kdump reload completes.

To address the aforementioned issue, commit 247262756121 ("crash: add
generic infrastructure for crash hotplug support") added a generic
infrastructure that allows architectures to selectively update the kdump
image component during CPU or memory add/remove events within the kernel
itself.

In the event of a CPU or memory add/remove events, the generic crash
hotplug event handler, `crash_handle_hotplug_event()`, is triggered. It
then acquires the necessary locks to update the kdump image and invokes
the architecture-specific crash hotplug handler,
`arch_crash_handle_hotplug_event()`, to update the required kdump image
components.

This patch adds crash hotplug handler for PowerPC and enable support to
update the kdump image on CPU add/remove events. Support for memory
add/remove events is added in a subsequent patch with the title
"powerpc: add crash memory hotplug support"

As mentioned earlier, only the elfcorehdr and FDT kdump image components
need to be updated in the event of CPU or memory add/remove events.
However, on PowerPC architecture crash hotplug handler only updates the
FDT to enable crash hotplug support for CPU add/remove events. Here's
why.

The elfcorehdr on PowerPC is built with possible CPUs, and thus, it does
not need an update on CPU add/remove events. On the other hand, the FDT
needs to be updated on CPU add events to include the newly added CPU. If
the FDT is not updated and the kernel crashes on a newly added CPU, the
kdump kernel will fail to boot due to the unavailability of the crashing
CPU in the FDT. During the early boot, it is expected that the boot CPU
must be a part of the FDT; otherwise, the kernel will raise a BUG and
fail to boot. For more information, refer to commit 36ae37e3436b0
("powerpc: Make boot_cpuid common between 32 and 64-bit"). Since it is
okay to have an offline CPU in the kdump FDT, no action is taken in case
of CPU removal.

There are two system calls, `kexec_file_load` and `kexec_load`, used to
load the kdump image. A few changes have been made to ensure the kernel can
safely update the FDT of the kdump image loaded using either system call.

For the kexec_file_load syscall, the kdump image is prepared in the kernel. So to
support an increasing number of CPUs, the FDT is constructed with extra
buffer space to ensure it can accommodate a possible number of CPU
nodes. Additionally, a call to fdt_pack (which trims the unused space
once the FDT is prepared) is avoided if this feature is enabled.

For the kexec_load syscall, the FDT is updated only if the
KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag is passed to the kernel by
userspace (kexec tools). When userspace passes this flag to the kernel,
it indicates that the FDT is built to accommodate possible CPUs, and the
FDT segment is excluded from SHA calculation, making it safe to update.

The changes related to this feature are kept under the CRASH_HOTPLUG
config, and it is enabled by default.

Signed-off-by: Sourabh Jain 
Cc: Akhil Raj 
Cc: Andrew Morton 
Cc: Aneesh Kumar K.V 
Cc: Baoquan He 
Cc: Borislav Petkov (AMD) 
Cc: Boris Ostrovsky 
Cc: Christophe Leroy 
Cc: Dave Hansen 
Cc: Dave Young 
Cc: David Hildenbrand 
Cc: Greg Kroah-Hartman 
Cc: Hari Bathini 
Cc: Laurent Dufour 
Cc: Mahesh Salgaonkar 
Cc: Michael Ellerman 
Cc: Mimi Zohar 
Cc: Naveen N Rao 
Cc: Oscar Salvador 
Cc: Thomas Gleixner 
Cc: Valentin Schneider 
Cc: Vivek Goyal 
Cc: ke...@lists.infradead.org
Cc: x...@kernel.org
---
  arch/powerpc/Kconfig  |  4 ++
  arch/powerpc/include/asm/kexec.h  |  6 +++
  arch/powerpc/kexec/core_64.c  | 69 +++
  arch/powerpc/kexec/elf_64.c   | 12 +-
  arch/powerpc/kexec/file_load_64.c | 15 +++
  5 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 414b978b8010..91d7bb0b81ee 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -682,6 +682,10 @@ config RELOCATABLE_TEST
  config ARCH_SUPPORT

Re: [PATCH v7 1/3] powerpc: make fadump resilient with memory add/remove events

2024-01-23 Thread Hari Bathini
_mask are not the same across the crashed and fadump kernel.

Note: if either the first/crashed kernel or the second/fadump kernel does not
have the changes introduced here, then the kernel fails to collect the dump
and prints a relevant error message on the console.

Signed-off-by: Sourabh Jain 
Cc: Aditya Gupta 
Cc: Aneesh Kumar K.V 
Cc: Hari Bathini 
Cc: Mahesh Salgaonkar 
Cc: Michael Ellerman 
Cc: Naveen N Rao 
---
  arch/powerpc/include/asm/fadump-internal.h   |  31 +-
  arch/powerpc/kernel/fadump.c | 355 +++
  arch/powerpc/platforms/powernv/opal-fadump.c |  18 +-
  arch/powerpc/platforms/pseries/rtas-fadump.c |  23 +-
  4 files changed, 242 insertions(+), 185 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h
index 27f9e11eda28..a632e9708610 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -42,13 +42,40 @@ static inline u64 fadump_str_to_u64(const char *str)
  
  #define FADUMP_CPU_UNKNOWN		(~((u32)0))
  
-#define FADUMP_CRASH_INFO_MAGIC		fadump_str_to_u64("FADMPINF")

+/*
+ * The introduction of new fields in the fadump crash info header has
+ * led to a change in the magic key from `FADMPINF` to `FADMPSIG` for
+ * identifying a kernel crash from an old kernel.
+ *
+ * To prevent the need for further changes to the magic number in the
+ * event of future modifications to the fadump crash info header, a
+ * version field has been introduced to track the fadump crash info
+ * header version.
+ *
+ * Consider a few points before adding new members to the fadump crash info
+ * header structure:
+ *
+ *  - Append new members; avoid adding them in between.
+ *  - Non-primitive members should have a size member as well.
+ *  - For every change in the fadump header, increment the
+ *fadump header version. This helps the updated kernel decide how to
+ *handle kernel dumps from older kernels.
+ */
+#define FADUMP_CRASH_INFO_MAGIC_OLDfadump_str_to_u64("FADMPINF")
+#define FADUMP_CRASH_INFO_MAGICfadump_str_to_u64("FADMPSIG")
+#define FADUMP_HEADER_VERSION  1
  
  /* fadump crash info structure */

  struct fadump_crash_info_header {
u64 magic_number;
-   u64 elfcorehdr_addr;
+   u32 version;
u32 crashing_cpu;



+   u64 elfcorehdr_addr;
+   u64 elfcorehdr_size;


The fadump_crash_info_header structure is meant to share info across reboots.
Now that the elfcorehdr is prepared in the second kernel, and dump capture
of an older kernel is not supported, get rid of elfcorehdr_addr &
elfcorehdr_size from the fadump_crash_info_header structure and put them
in the fw_dump structure instead..


+   u64 vmcoreinfo_raddr;
+   u64 vmcoreinfo_size;
+   u32 pt_regs_sz;
+   u32 cpu_mask_sz;
struct pt_regs  regs;
struct cpumask  cpu_mask;
  };
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index d14eda1e8589..eb9132538268 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -53,8 +53,6 @@ static struct kobject *fadump_kobj;
  static atomic_t cpus_in_fadump;
  static DEFINE_MUTEX(fadump_mutex);
  
-static struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false };

-
  #define RESERVED_RNGS_SZ  16384 /* 16K - 128 entries */
  #define RESERVED_RNGS_CNT (RESERVED_RNGS_SZ / \
 sizeof(struct fadump_memory_range))
@@ -373,12 +371,6 @@ static unsigned long __init get_fadump_area_size(void)
size = PAGE_ALIGN(size);
size += fw_dump.boot_memory_size;
size += sizeof(struct fadump_crash_info_header);
-   size += sizeof(struct elfhdr); /* ELF core header.*/
-   size += sizeof(struct elf_phdr); /* place holder for cpu notes */
-   /* Program headers for crash memory regions. */
-   size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
-
-   size = PAGE_ALIGN(size);
  
  	/* This is to hold kernel metadata on platforms that support it */

size += (fw_dump.ops->fadump_get_metadata_size ?
@@ -931,36 +923,6 @@ static inline int fadump_add_mem_range(struct 
fadump_mrange_info *mrange_info,
return 0;
  }
  
-static int fadump_exclude_reserved_area(u64 start, u64 end)

-{
-   u64 ra_start, ra_end;
-   int ret = 0;
-
-   ra_start = fw_dump.reserve_dump_area_start;
-   ra_end = ra_start + fw_dump.reserve_dump_area_size;
-
-   if ((ra_start < end) && (ra_end > start)) {
-   if ((start < ra_start) && (end > ra_end)) {
-   ret = fadump_add_mem_range(&crash_mrange_info,
-  start, ra_start);
-   

Re: [PATCHv9 2/2] powerpc/setup: Loosen the mapping between cpu logical id and its seq in dt

2024-01-08 Thread Hari Bathini




On 09/01/24 9:57 am, Hari Bathini wrote:

Hi Michael,



Sorry, Michael.
I am just about getting back to work and I spoke too soon.
You already seem to have posted a set with the approach you had in mind:

  https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=388350

Thanks
Hari


I am fine with either approach. I was trying to address your concerns
in my way. Looking for your inputs here on how to go about this now..

On 29/11/23 7:00 am, Pingfan Liu wrote:

Hi Hari,


On Mon, Nov 27, 2023 at 12:30 PM Hari Bathini  
wrote:


Hi Pingfan, Michael,

On 17/10/23 4:03 pm, Hari Bathini wrote:



On 17/10/23 7:58 am, Pingfan Liu wrote:

*** Idea ***
For kexec -p, the boot cpu can be not the cpu0, this causes the 
problem

of allocating memory for paca_ptrs[]. However, in theory, there is no
requirement to assign cpu's logical id as its present sequence in the
device tree. But there is something like cpu_first_thread_sibling(),
which makes assumption on the mapping inside a core. Hence partially
loosening the mapping, i.e. unbind the mapping of core while keep the
mapping inside a core.

*** Implement ***
At this early stage, there are plenty of memory to utilize. Hence, 
this

patch allocates interim memory to link the cpu info on a list, then
reorder cpus by changing the list head. As a result, there is a rotate
shift between the sequence number in dt and the cpu logical number.

*** Result ***
After this patch, a boot-cpu's logical id will always be mapped 
into the

range [0,threads_per_core).

Besides this, at this phase, all threads in the boot core are 
forced to

be onlined. This restriction will be lifted in a later patch with
extra effort.

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: Sourabh Jain 
Cc: Hari Bathini 
Cc: ke...@lists.infradead.org
To: linuxppc-dev@lists.ozlabs.org


Thanks for working on this, Pingfan.
Looks good to me.

Acked-by: Hari Bathini 



On second thoughts, probably better off with no impact for the
bootcpu < nr_cpu_ids case and changing only two cores' logical
numbering otherwise. Something like the below (please share
your thoughts):



I am afraid that it may not be as ideal as it looks, considering the
following factors:
-1. For the case of 'bootcpu < nr_cpu_ids', crash can happen evenly
across any cpu in the system, which seriously undermines the
protection intended here (Under the most optimistic scenario, there is
a 50% chance of success)

-2. For the re-ordering of logical numbering, IMHO, if there is
concern that re-ordering will break something, the partial re-ordering
can not avoid that.  We ought to spot probable hazards so as to ease
worries.


Thanks,

Pingfan


diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index ec82f5bda908..78a8312aa8c4 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -76,7 +76,9 @@ u64 ppc64_rma_size;
   unsigned int boot_cpu_node_count __ro_after_init;
   #endif
   static phys_addr_t first_memblock_size;
+#ifdef CONFIG_SMP
   static int __initdata boot_cpu_count;
+#endif

   static int __init early_parse_mem(char *p)
   {
@@ -357,6 +359,25 @@ static int __init early_init_dt_scan_cpus(unsigned
long node,
 fdt_boot_cpuid_phys(initial_boot_params)) {
 found = boot_cpu_count;
 found_thread = i;
+   /*
+    * Map boot-cpu logical id into the range
+    * of [0, thread_per_core) if it can't be
+    * accommodated within nr_cpu_ids.
+    */
+   if (i != boot_cpu_count && boot_cpu_count >= 
nr_cpu_ids) {

+   boot_cpuid = i;
+   DBG("Logical CPU number for boot CPU 
changed from %d to %d\n",

+   boot_cpu_count, i);
+   } else {
+   boot_cpuid = boot_cpu_count;
+   }
+
+   /* Ensure boot thread is accounted for in 
nr_cpu_ids */

+   if (boot_cpuid >= nr_cpu_ids) {
+   set_nr_cpu_ids(boot_cpuid + 1);
+   DBG("Adjusted nr_cpu_ids to %u, to 
include boot CPU.\n",

+   nr_cpu_ids);
+   }
 }
   #ifdef CONFIG_SMP
 /* logical cpu id is always 0 on UP kernels */
@@ -368,9 +389,8 @@ static int __init early_init_dt_scan_cpus(unsigned
long node,
 if (found < 0)
 return 0;

-   DBG("boot cpu: logical %d physical %d\n", found,
+   DBG("boot cpu: logical %d physical %d\n", boot_cpuid,
 be32_to_cpu(intserv[found_thread]));
-   boot_cp

Re: [PATCHv9 2/2] powerpc/setup: Loosen the mapping between cpu logical id and its seq in dt

2024-01-08 Thread Hari Bathini

Hi Michael,

I am fine with either approach. I was trying to address your concerns
in my way. Looking for your inputs here on how to go about this now..

On 29/11/23 7:00 am, Pingfan Liu wrote:

Hi Hari,


On Mon, Nov 27, 2023 at 12:30 PM Hari Bathini  wrote:


Hi Pingfan, Michael,

On 17/10/23 4:03 pm, Hari Bathini wrote:



On 17/10/23 7:58 am, Pingfan Liu wrote:

*** Idea ***
For kexec -p, the boot cpu may not be cpu0; this causes a problem
when allocating memory for paca_ptrs[]. However, in theory, there is no
requirement to assign a cpu's logical id as its present sequence in the
device tree. But there is something like cpu_first_thread_sibling(),
which makes assumptions about the mapping inside a core. Hence, partially
loosen the mapping, i.e. unbind the mapping of cores while keeping the
mapping inside a core.

*** Implement ***
At this early stage, there are plenty of memory to utilize. Hence, this
patch allocates interim memory to link the cpu info on a list, then
reorder cpus by changing the list head. As a result, there is a rotate
shift between the sequence number in dt and the cpu logical number.

*** Result ***
After this patch, a boot-cpu's logical id will always be mapped into the
range [0,threads_per_core).

Besides this, at this phase, all threads in the boot core are forced to
be onlined. This restriction will be lifted in a later patch with
extra effort.
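
To make the rotate-shift idea concrete, here is a minimal sketch
(illustrative only; 'dt_seq', 'boot_core_first' and 'nr_dt_cpus' are
names assumed for illustration, not the patch's actual variables):

/*
 * Cores are rotated so the boot core becomes logical core 0 while the
 * thread order inside each core is preserved; the boot cpu therefore
 * ends up with a logical id in [0, threads_per_core).
 */
static int dt_seq_to_logical(int dt_seq, int boot_core_first,
			     int threads_per_core, int nr_dt_cpus)
{
	int rotated = dt_seq - boot_core_first;

	if (rotated < 0)
		rotated += nr_dt_cpus;	/* wrap cores preceding the boot core */

	return rotated;
}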

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: Sourabh Jain 
Cc: Hari Bathini 
Cc: ke...@lists.infradead.org
To: linuxppc-dev@lists.ozlabs.org


Thanks for working on this, Pingfan.
Looks good to me.

Acked-by: Hari Bathini 



On second thoughts, probably better off with no impact for
bootcpu < nr_cpu_ids case and changing only two cores logical
numbering otherwise. Something like the below (Please share
your thoughts):



I am afraid that it may not be as ideal as it looks, considering the
following factors:
1. For the case of 'bootcpu < nr_cpu_ids', a crash can happen on any
cpu in the system with equal likelihood, which seriously undermines the
protection intended here (under the most optimistic scenario, there is
only a 50% chance of success).

2. For the re-ordering of logical numbering, IMHO, if there is concern
that re-ordering will break something, partial re-ordering cannot avoid
that either. We ought to identify the probable hazards so as to ease
those worries.


Thanks,

Pingfan


diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index ec82f5bda908..78a8312aa8c4 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -76,7 +76,9 @@ u64 ppc64_rma_size;
   unsigned int boot_cpu_node_count __ro_after_init;
   #endif
   static phys_addr_t first_memblock_size;
+#ifdef CONFIG_SMP
   static int __initdata boot_cpu_count;
+#endif

   static int __init early_parse_mem(char *p)
   {
@@ -357,6 +359,25 @@ static int __init early_init_dt_scan_cpus(unsigned
long node,
 fdt_boot_cpuid_phys(initial_boot_params)) {
 found = boot_cpu_count;
 found_thread = i;
+   /*
+* Map boot-cpu logical id into the range
+* of [0, thread_per_core) if it can't be
+* accommodated within nr_cpu_ids.
+*/
+   if (i != boot_cpu_count && boot_cpu_count >= 
nr_cpu_ids) {
+   boot_cpuid = i;
+   DBG("Logical CPU number for boot CPU changed from %d 
to %d\n",
+   boot_cpu_count, i);
+   } else {
+   boot_cpuid = boot_cpu_count;
+   }
+
+   /* Ensure boot thread is accounted for in nr_cpu_ids */
+   if (boot_cpuid >= nr_cpu_ids) {
+   set_nr_cpu_ids(boot_cpuid + 1);
+   DBG("Adjusted nr_cpu_ids to %u, to include boot 
CPU.\n",
+   nr_cpu_ids);
+   }
 }
   #ifdef CONFIG_SMP
 /* logical cpu id is always 0 on UP kernels */
@@ -368,9 +389,8 @@ static int __init early_init_dt_scan_cpus(unsigned
long node,
 if (found < 0)
 return 0;

-   DBG("boot cpu: logical %d physical %d\n", found,
+   DBG("boot cpu: logical %d physical %d\n", boot_cpuid,
 be32_to_cpu(intserv[found_thread]));
-   boot_cpuid = found;

 boot_cpu_hwid = be32_to_cpu(intserv[found_thread]);

diff --git a/arch/powerpc/kernel/setup-common.c
b/arch/powerpc/kernel/setup-common.c
index b7b733474b60..f7179525c774 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -409,6 +409,12

[PATCH 2/2] powerpc/bpf: enable kfunc call

2023-12-20 Thread Hari Bathini
With module addresses supported, override bpf_jit_supports_kfunc_call()
to enable kfunc support. Module address offsets can be more than 32-bit
long, so override bpf_jit_supports_far_kfunc_call() to enable 64-bit
pointers.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/net/bpf_jit_comp.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 0f9a21783329..a6151a5ef9a5 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -359,3 +359,13 @@ void bpf_jit_free(struct bpf_prog *fp)
 
bpf_prog_unlock_free(fp);
 }
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+   return true;
+}
+
+bool bpf_jit_supports_far_kfunc_call(void)
+{
+   return true;
+}
-- 
2.43.0



[PATCH 1/2] powerpc/bpf: ensure module addresses are supported

2023-12-20 Thread Hari Bathini
Currently, bpf jit code on powerpc assumes all the bpf functions and
helpers to be kernel text. This is false for kfunc case, as function
addresses are mostly module addresses in that case. Ensure module
addresses are supported to enable kfunc support.

This effectively reverts commit feb6307289d8 ("powerpc64/bpf: Optimize
instruction sequence used for function calls") and commit 43d636f8b4fd
("powerpc64/bpf elfv1: Do not load TOC before calling functions") that
assumed only kernel text for bpf functions/helpers.

Also, commit b10cb163c4b3 ("powerpc64/bpf elfv2: Setup kernel TOC in
r2 on entry") that paved the way for the commits mentioned above is
reverted.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/net/bpf_jit.h|  2 +-
 arch/powerpc/net/bpf_jit_comp32.c |  8 +--
 arch/powerpc/net/bpf_jit_comp64.c | 90 +--
 3 files changed, 52 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index cdea5dccaefe..48503caa5b58 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -160,7 +160,7 @@ static inline void bpf_clear_seen_register(struct 
codegen_context *ctx, int i)
 }
 
 void bpf_jit_init_reg_mapping(struct codegen_context *ctx);
-int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func);
+void bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct 
codegen_context *ctx, u64 func);
 int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct 
codegen_context *ctx,
   u32 *addrs, int pass, bool extra_pass);
 void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
diff --git a/arch/powerpc/net/bpf_jit_comp32.c 
b/arch/powerpc/net/bpf_jit_comp32.c
index 2f39c50ca729..1236a75c04ea 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -201,7 +201,7 @@ void bpf_jit_build_epilogue(u32 *image, struct 
codegen_context *ctx)
 }
 
 /* Relative offset needs to be calculated based on final image location */
-int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func)
+void bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct 
codegen_context *ctx, u64 func)
 {
s32 rel = (s32)func - (s32)(fimage + ctx->idx);
 
@@ -214,8 +214,6 @@ int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, 
struct codegen_context *
EMIT(PPC_RAW_MTCTR(_R0));
EMIT(PPC_RAW_BCTRL());
}
-
-   return 0;
 }
 
 static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 
out)
@@ -1054,9 +1052,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
u32 *fimage, struct code
EMIT(PPC_RAW_STW(bpf_to_ppc(BPF_REG_5), _R1, 
12));
}
 
-   ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, 
func_addr);
-   if (ret)
-   return ret;
+   bpf_jit_emit_func_call_rel(image, fimage, ctx, 
func_addr);
 
EMIT(PPC_RAW_MR(bpf_to_ppc(BPF_REG_0) - 1, _R3));
EMIT(PPC_RAW_MR(bpf_to_ppc(BPF_REG_0), _R4));
diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 79f23974a320..e7199c202a00 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -126,11 +126,6 @@ void bpf_jit_build_prologue(u32 *image, struct 
codegen_context *ctx)
 {
int i;
 
-#ifndef CONFIG_PPC_KERNEL_PCREL
-   if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2))
-   EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, 
kernel_toc)));
-#endif
-
/*
 * Initialize tail_call_cnt if we do tail calls.
 * Otherwise, put in NOPs so that it can be skipped when we are
@@ -145,6 +140,8 @@ void bpf_jit_build_prologue(u32 *image, struct 
codegen_context *ctx)
EMIT(PPC_RAW_NOP());
}
 
+#define BPF_TAILCALL_PROLOGUE_SIZE 8
+
if (bpf_has_stack_frame(ctx)) {
/*
 * We need a stack frame, but we don't necessarily need to
@@ -204,14 +201,9 @@ void bpf_jit_build_epilogue(u32 *image, struct 
codegen_context *ctx)
 
 static int bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, 
u64 func)
 {
-   unsigned long func_addr = func ? ppc_function_entry((void *)func) : 0;
-   long reladdr;
-
-   if (WARN_ON_ONCE(!core_kernel_text(func_addr)))
-   return -EINVAL;
-
if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) {
-   reladdr = func_addr - CTX_NIA(ctx);
+   unsigned long func_addr = func ? ppc_function_entry((void 
*)func) : 0;
+   long reladdr = func_addr - CTX_NIA(ctx);
 
if (reladdr >= (long)SZ_8G || reladdr < -(long)SZ_8G) {
pr_err("eBPF: address of %ps out o

Re: [PATCH v14 5/6] powerpc: add crash CPU hotplug support

2023-12-19 Thread Hari Bathini

Hi Sourabh

On 11/12/23 2:00 pm, Sourabh Jain wrote:

Due to CPU/Memory hotplug or online/offline events the elfcorehdr
(which describes the CPUs and memory of the crashed kernel) and FDT
(Flattened Device Tree) of kdump image becomes outdated. Consequently,
attempting dump collection with an outdated elfcorehdr or FDT can lead
to failed or inaccurate dump collection.

Going forward CPU hotplug or online/offlice events are referred as


s/offlice/offline/


CPU/Memory add/remvoe events.


s/remvoe/remove/


The current solution to address the above issue involves monitoring the
CPU/memory add/remove events in userspace using udev rules and whenever
there are changes in CPU and memory resources, the entire kdump image
is loaded again. The kdump image includes kernel, initrd, elfcorehdr,
FDT, purgatory. Given that only elfcorehdr and FDT get outdated due to
CPU/Memory add/remove events, reloading the entire kdump image is
inefficient. More importantly, kdump remains inactive for a substantial
amount of time until the kdump reload completes.

To address the aforementioned issue, commit 247262756121 ("crash: add
generic infrastructure for crash hotplug support") added a generic
infrastructure that allows architectures to selectively update the kdump
image component during CPU or memory add/remove events within the kernel
itself.

In the event of a CPU or memory add/remove event, the generic crash
hotplug event handler, `crash_handle_hotplug_event()`, is triggered. It
then acquires the necessary locks to update the kdump image and invokes
the architecture-specific crash hotplug handler,
`arch_crash_handle_hotplug_event()`, to update the required kdump image
components.

This patch adds a crash hotplug handler for PowerPC and enables support to
update the kdump image on CPU add/remove events. Support for memory
add/remove events is added in a subsequent patch with the title
"powerpc: add crash memory hotplug support."

As mentioned earlier, only the elfcorehdr and FDT kdump image components
need to be updated in the event of CPU or memory add/remove events.
However, the PowerPC architecture crash hotplug handler only updates the
FDT to enable crash hotplug support for CPU add/remove events. Here's
why.

The Elfcorehdr on PowerPC is built with possible CPUs, and thus, it does
not need an update on CPU add/remove events. On the other hand, the FDT
needs to be updated on CPU add events to include the newly added CPU. If
the FDT is not updated and the kernel crashes on a newly added CPU, the
kdump kernel will fail to boot due to the unavailability of the crashing
CPU in the FDT. During the early boot, it is expected that the boot CPU
must be a part of the FDT; otherwise, the kernel will raise a BUG and
fail to boot. For more information, refer to commit 36ae37e3436b0
("powerpc: Make boot_cpuid common between 32 and 64-bit"). Since it is
okay to have an offline CPU in the kdump FDT, no action is taken in case
of CPU removal.
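
For illustration, the handler added here boils down to something like
the sketch below (hedged; the fdt pointer field and update_cpus_node()
helper are assumed names rather than the exact implementation):

void arch_crash_handle_hotplug_event(struct kimage *image)
{
	/* Only the FDT needs refreshing, and only on CPU add events */
	if (image->hp_action != KEXEC_CRASH_HP_ADD_CPU)
		return;

	/* 'arch.fdt' naming is illustrative */
	if (update_cpus_node(image->arch.fdt))
		pr_err("Failed to update kdump FDT with the added CPU\n");
}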

There are two system calls, `kexec_file_load` and `kexec_load`, used to
load the kdump image. A few changes have been made to ensure the kernel can
safely update the kdump FDT for both system calls.

For the kexec_file_load syscall, the kdump image is prepared in the kernel. So to
support an increasing number of CPUs, the FDT is constructed with extra
buffer space to ensure it can accommodate a possible number of CPU
nodes. Additionally, a call to fdt_pack (which trims the unused space
once the FDT is prepared) is avoided for kdump image loading if this
feature is enabled.
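
As a rough illustration of that sizing (a hedged sketch; the helper name
and the per-core byte estimate are assumptions, not the actual code):

static unsigned int crash_fdt_extra_size(void)
{
	unsigned int possible_cores = num_possible_cpus() / threads_per_core;
	unsigned int present_cores = num_present_cpus() / threads_per_core;

	if (possible_cores <= present_cores)
		return 0;

	/* headroom for CPU nodes that may be hot-added later */
	return (possible_cores - present_cores) * SZ_1K;
}

With that headroom reserved, skipping fdt_pack() keeps the spare space
available for later CPU node additions.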

For the kexec_load syscall, the FDT is updated only if both the
KEXEC_UPDATE_FDT and KEXEC_UPDATE_ELFCOREHDR kexec flags are passed to
the kernel by the kexec tool. Passing these flags to the kernel
indicates that the FDT is built to accommodate possible CPUs, and the
FDT segment is not considered for SHA calculation, making it safe to
update the FDT.

Commit 88a6f8994421 ("crash: memory and CPU hotplug sysfs attributes")
added a sysfs interface to indicate userspace (kdump udev rule) that
kernel will update the kdump image on CPU hotplug events, so kdump
reload can be avoided. Implement arch specific function
`arch_crash_hotplug_cpu_support()` to correctly advertise kernel
capability to update kdump image.

This feature is advertised to userspace when the following conditions
are met:

1. Kdump image is loaded using kexec_file_load system call.
2. Kdump image is loaded using kexec_load system and both
KEXEC_UPDATE_ELFCOREHDR and KEXEC_UPDATE_FDT kexec flags are
passed to kernel.

The changes related to this feature are kept under the CRASH_HOTPLUG
config, and it is enabled by default.
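
A hedged sketch of how those conditions could translate into the
advertised capability (the image field names file_mode, update_elfcorehdr
and update_fdt are assumptions for illustration; the actual signature may
differ):

int arch_crash_hotplug_cpu_support(struct kimage *image)
{
	/* kexec_file_load: the kernel built the FDT with spare room */
	if (image->file_mode)
		return 1;

	/* kexec_load: only if userspace opted in via both kexec flags */
	return image->update_elfcorehdr && image->update_fdt;
}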

Signed-off-by: Sourabh Jain 
Cc: Akhil Raj 
Cc: Andrew Morton 
Cc: Aneesh Kumar K.V 
Cc: Baoquan He 
Cc: Borislav Petkov (AMD) 
Cc: Boris Ostrovsky 
Cc: Christophe Leroy 
Cc: Dave Hansen 
Cc: Dave Young 
Cc: David Hildenbrand 
Cc: Eric DeVolder 
Cc: Greg Kroah-Hartman 
Cc: Hari Bathini 
Cc: Laurent Dufour

Re: [RFC PATCH 2/3] powerpc/fadump: pass additional parameters to dump capture kernel

2023-12-17 Thread Hari Bathini

Hi Sourabh,

On 15/12/23 2:12 pm, Sourabh Jain wrote:

Hello Hari,

On 06/12/23 01:48, Hari Bathini wrote:

For the fadump case, passing additional parameters to the dump capture kernel
helps in minimizing its memory footprint and also provides the flexibility
to disable components/modules, like hugepages, that hinder the boot process
of the special dump capture environment.

Set up a dedicated parameter area to be passed to the capture kernel.
This area type is defined as RTAS_FADUMP_PARAM_AREA. Sysfs attribute
'/sys/kernel/fadump/bootargs_append' is exported to the userspace to
specify the additional parameters to be passed to the capture kernel.

Signed-off-by: Hari Bathini 
---
  arch/powerpc/include/asm/fadump-internal.h   |  3 +
  arch/powerpc/kernel/fadump.c | 80 
  arch/powerpc/platforms/powernv/opal-fadump.c |  6 +-
  arch/powerpc/platforms/pseries/rtas-fadump.c | 35 -
  arch/powerpc/platforms/pseries/rtas-fadump.h | 11 ++-
  5 files changed, 126 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h

index b3956c400519..81629226b15f 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -97,6 +97,8 @@ struct fw_dump {
  unsigned long    cpu_notes_buf_vaddr;
  unsigned long    cpu_notes_buf_size;
+    unsigned long    param_area;
+
  /*
   * Maximum size supported by firmware to copy from source to
   * destination address per entry.
@@ -111,6 +113,7 @@ struct fw_dump {
  unsigned long    dump_active:1;
  unsigned long    dump_registered:1;
  unsigned long    nocma:1;
+    unsigned long    param_area_supported:1;
  struct fadump_ops    *ops;
  };
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 757681658dda..98f089747ac9 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1470,6 +1470,7 @@ static ssize_t mem_reserved_show(struct kobject 
*kobj,

  return sprintf(buf, "%ld\n", fw_dump.reserve_dump_area_size);
  }
+
  static ssize_t registered_show(struct kobject *kobj,
 struct kobj_attribute *attr,
 char *buf)
@@ -1477,6 +1478,43 @@ static ssize_t registered_show(struct kobject 
*kobj,

  return sprintf(buf, "%d\n", fw_dump.dump_registered);
  }
+static ssize_t bootargs_append_show(struct kobject *kobj,
+   struct kobj_attribute *attr,
+   char *buf)
+{
+    return sprintf(buf, "%s\n", (char *)__va(fw_dump.param_area));
+}
+
+static ssize_t bootargs_append_store(struct kobject *kobj,
+   struct kobj_attribute *attr,
+   const char *buf, size_t count)
+{
+    char *params;
+
+    if (!fw_dump.fadump_enabled || fw_dump.dump_active)
+    return -EPERM;
+
+    if (count >= COMMAND_LINE_SIZE)
+    return -EINVAL;
+
+    /*
+ * Fail here instead of handling this scenario with
+ * some silly workaround in capture kernel.
+ */
+    if (saved_command_line_len + count >= COMMAND_LINE_SIZE) {
+    pr_err("Appending parameters exceeds cmdline size!\n");
+    return -ENOSPC;
+    }
+
+    params = __va(fw_dump.param_area);
+    strscpy_pad(params, buf, COMMAND_LINE_SIZE);
+    /* Remove newline character at the end. */
+    if (params[count-1] == '\n')
+    params[count-1] = '\0';
+
+    return count;
+}
+
  static ssize_t registered_store(struct kobject *kobj,
  struct kobj_attribute *attr,
  const char *buf, size_t count)
@@ -1535,6 +1573,7 @@ static struct kobj_attribute release_attr = 
__ATTR_WO(release_mem);

  static struct kobj_attribute enable_attr = __ATTR_RO(enabled);
  static struct kobj_attribute register_attr = __ATTR_RW(registered);
  static struct kobj_attribute mem_reserved_attr = 
__ATTR_RO(mem_reserved);
+static struct kobj_attribute bootargs_append_attr = 
__ATTR_RW(bootargs_append);

  static struct attribute *fadump_attrs[] = {
  &enable_attr.attr,
@@ -1611,6 +1650,46 @@ static void __init fadump_init_files(void)
  return;
  }
+/*
+ * Reserve memory to store additional parameters to be passed
+ * for fadump/capture kernel.
+ */
+static void fadump_setup_param_area(void)
+{
+    phys_addr_t range_start, range_end;
+
+    if (!fw_dump.param_area_supported || fw_dump.dump_active)
+    return;
+
+    /* This memory can't be used by PFW or bootloader as it is shared 
across kernels */

+    if (radix_enabled()) {
+    /*
+ * Anywhere in the upper half should be good enough as all 
memory

+ * is accessible in real mode.
+ */
+    range_start = memblock_end_of_DRAM() / 2;
+    range_end = memblock_end_of_DRAM();
+    } else {
+    /*
+ * Passing additional parameters is supported for hash MMU only
+ 

[RFC PATCH 3/3] powerpc/fadump: pass additional parameters when fadump is active

2023-12-05 Thread Hari Bathini
Append the additional parameters passed/set in the dedicated parameter
area (RTAS_FADUMP_PARAM_AREA) to bootargs in fadump capture kernel.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump.h |  2 ++
 arch/powerpc/kernel/fadump.c  | 34 +++
 arch/powerpc/kernel/prom.c|  3 +++
 3 files changed, 39 insertions(+)

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 526a6a647312..ef40c9b6972a 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -19,12 +19,14 @@ extern int is_fadump_active(void);
 extern int should_fadump_crash(void);
 extern void crash_fadump(struct pt_regs *, const char *);
 extern void fadump_cleanup(void);
+extern void fadump_append_bootargs(void);
 
 #else  /* CONFIG_FA_DUMP */
 static inline int is_fadump_active(void) { return 0; }
 static inline int should_fadump_crash(void) { return 0; }
 static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
 static inline void fadump_cleanup(void) { }
+static inline void fadump_append_bootargs(void) { }
 #endif /* !CONFIG_FA_DUMP */
 
 #if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 98f089747ac9..9b1601f99f72 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -133,6 +133,40 @@ static int __init fadump_cma_init(void)
 static int __init fadump_cma_init(void) { return 1; }
 #endif /* CONFIG_CMA */
 
+/*
+ * Additional parameters meant for capture kernel are placed in a dedicated 
area.
+ * If this is capture kernel boot, append these parameters to bootargs.
+ */
+void __init fadump_append_bootargs(void)
+{
+   char *append_args;
+   size_t len;
+
+   if (!fw_dump.dump_active || !fw_dump.param_area_supported || 
!fw_dump.param_area)
+   return;
+
+   /* TODO: What to do if this overlaps with reserve area (radix case?) */
+   if (memblock_reserve(fw_dump.param_area, COMMAND_LINE_SIZE)) {
+   pr_warn("WARNING: Can't use additional parameters area!\n");
+   fw_dump.param_area = 0;
+   return;
+   }
+
+   append_args = (char *)fw_dump.param_area;
+   len = strlen(boot_command_line);
+
+   /*
+* Too late to fail even if cmdline size exceeds. Truncate additional 
parameters
+* to cmdline size and proceed anyway.
+*/
+   if (len + strlen(append_args) >= COMMAND_LINE_SIZE - 1)
+   pr_warn("WARNING: Appending parameters exceeds cmdline size. 
Truncating!\n");
+
+   pr_debug("Cmdline: %s\n", boot_command_line);
+   snprintf(boot_command_line + len, COMMAND_LINE_SIZE - len, " %s", 
append_args);
+   pr_info("Updated cmdline: %s\n", boot_command_line);
+}
+
 /* Scan the Firmware Assisted dump configuration details. */
 int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
  int depth, void *data)
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 0b5878c3125b..00a03d476cb9 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -791,6 +791,9 @@ void __init early_init_devtree(void *params)
 */
of_scan_flat_dt(early_init_dt_scan_chosen_ppc, boot_command_line);
 
+   /* Append additional parameters passed for fadump capture kernel */
+   fadump_append_bootargs();
+
/* Scan memory nodes and rebuild MEMBLOCKs */
early_init_dt_scan_root();
early_init_dt_scan_memory_ppc();
-- 
2.43.0



[RFC PATCH 2/3] powerpc/fadump: pass additional parameters to dump capture kernel

2023-12-05 Thread Hari Bathini
For the fadump case, passing additional parameters to the dump capture kernel
helps in minimizing its memory footprint and also provides the flexibility
to disable components/modules, like hugepages, that hinder the boot process
of the special dump capture environment.

Set up a dedicated parameter area to be passed to the capture kernel.
This area type is defined as RTAS_FADUMP_PARAM_AREA. Sysfs attribute
'/sys/kernel/fadump/bootargs_append' is exported to the userspace to
specify the additional parameters to be passed to the capture kernel.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump-internal.h   |  3 +
 arch/powerpc/kernel/fadump.c | 80 
 arch/powerpc/platforms/powernv/opal-fadump.c |  6 +-
 arch/powerpc/platforms/pseries/rtas-fadump.c | 35 -
 arch/powerpc/platforms/pseries/rtas-fadump.h | 11 ++-
 5 files changed, 126 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h
index b3956c400519..81629226b15f 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -97,6 +97,8 @@ struct fw_dump {
unsigned long   cpu_notes_buf_vaddr;
unsigned long   cpu_notes_buf_size;
 
+   unsigned long   param_area;
+
/*
 * Maximum size supported by firmware to copy from source to
 * destination address per entry.
@@ -111,6 +113,7 @@ struct fw_dump {
unsigned long   dump_active:1;
unsigned long   dump_registered:1;
unsigned long   nocma:1;
+   unsigned long   param_area_supported:1;
 
struct fadump_ops   *ops;
 };
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 757681658dda..98f089747ac9 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1470,6 +1470,7 @@ static ssize_t mem_reserved_show(struct kobject *kobj,
return sprintf(buf, "%ld\n", fw_dump.reserve_dump_area_size);
 }
 
+
 static ssize_t registered_show(struct kobject *kobj,
   struct kobj_attribute *attr,
   char *buf)
@@ -1477,6 +1478,43 @@ static ssize_t registered_show(struct kobject *kobj,
return sprintf(buf, "%d\n", fw_dump.dump_registered);
 }
 
+static ssize_t bootargs_append_show(struct kobject *kobj,
+  struct kobj_attribute *attr,
+  char *buf)
+{
+   return sprintf(buf, "%s\n", (char *)__va(fw_dump.param_area));
+}
+
+static ssize_t bootargs_append_store(struct kobject *kobj,
+  struct kobj_attribute *attr,
+  const char *buf, size_t count)
+{
+   char *params;
+
+   if (!fw_dump.fadump_enabled || fw_dump.dump_active)
+   return -EPERM;
+
+   if (count >= COMMAND_LINE_SIZE)
+   return -EINVAL;
+
+   /*
+* Fail here instead of handling this scenario with
+* some silly workaround in capture kernel.
+*/
+   if (saved_command_line_len + count >= COMMAND_LINE_SIZE) {
+   pr_err("Appending parameters exceeds cmdline size!\n");
+   return -ENOSPC;
+   }
+
+   params = __va(fw_dump.param_area);
+   strscpy_pad(params, buf, COMMAND_LINE_SIZE);
+   /* Remove newline character at the end. */
+   if (params[count-1] == '\n')
+   params[count-1] = '\0';
+
+   return count;
+}
+
 static ssize_t registered_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
@@ -1535,6 +1573,7 @@ static struct kobj_attribute release_attr = 
__ATTR_WO(release_mem);
 static struct kobj_attribute enable_attr = __ATTR_RO(enabled);
 static struct kobj_attribute register_attr = __ATTR_RW(registered);
 static struct kobj_attribute mem_reserved_attr = __ATTR_RO(mem_reserved);
+static struct kobj_attribute bootargs_append_attr = __ATTR_RW(bootargs_append);
 
 static struct attribute *fadump_attrs[] = {
&enable_attr.attr,
@@ -1611,6 +1650,46 @@ static void __init fadump_init_files(void)
return;
 }
 
+/*
+ * Reserve memory to store additional parameters to be passed
+ * for fadump/capture kernel.
+ */
+static void fadump_setup_param_area(void)
+{
+   phys_addr_t range_start, range_end;
+
+   if (!fw_dump.param_area_supported || fw_dump.dump_active)
+   return;
+
+   /* This memory can't be used by PFW or bootloader as it is shared 
across kernels */
+   if (radix_enabled()) {
+   /*
+* Anywhere in the upper half should be good enough as all 
memory
+* is accessible in real mode.
+*/
+   range_start = memblock_

[RFC PATCH 1/3] powerpc/pseries/fadump: add support for multiple boot memory regions

2023-12-05 Thread Hari Bathini
From: Sourabh Jain 

Currently, fadump on pseries assumes a single boot memory region even
though f/w supports more than one boot memory region. Add support for
more boot memory regions to make the implementation flexible for any
enhancements that introduce other region types. For this, rtas memory
structure for fadump is updated to have multiple boot memory regions
instead of just one. Additionally, methods responsible for creating
the fadump memory structure during both the first and second kernel
boot have been modified to take these multiple boot memory regions
into account. Also, a new callback has been added to the fadump_ops
structure to get the maximum boot memory regions supported by the
platform.

Signed-off-by: Sourabh Jain 
Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump-internal.h   |   2 +-
 arch/powerpc/kernel/fadump.c |  27 +-
 arch/powerpc/platforms/powernv/opal-fadump.c |   8 +
 arch/powerpc/platforms/pseries/rtas-fadump.c | 258 ---
 arch/powerpc/platforms/pseries/rtas-fadump.h |  26 +-
 5 files changed, 199 insertions(+), 122 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h
index 27f9e11eda28..b3956c400519 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -129,6 +129,7 @@ struct fadump_ops {
  struct seq_file *m);
void(*fadump_trigger)(struct fadump_crash_info_header *fdh,
  const char *msg);
+   int (*fadump_max_boot_mem_rgns)(void);
 };
 
 /* Helper functions */
@@ -136,7 +137,6 @@ s32 __init fadump_setup_cpu_notes_buf(u32 num_cpus);
 void fadump_free_cpu_notes_buf(void);
 u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs);
 void __init fadump_update_elfcore_header(char *bufp);
-bool is_fadump_boot_mem_contiguous(void);
 bool is_fadump_reserved_mem_contiguous(void);
 
 #else /* !CONFIG_PRESERVE_FA_DUMP */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index d14eda1e8589..757681658dda 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -222,28 +222,6 @@ static bool is_fadump_mem_area_contiguous(u64 d_start, u64 
d_end)
return ret;
 }
 
-/*
- * Returns true, if there are no holes in boot memory area,
- * false otherwise.
- */
-bool is_fadump_boot_mem_contiguous(void)
-{
-   unsigned long d_start, d_end;
-   bool ret = false;
-   int i;
-
-   for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) {
-   d_start = fw_dump.boot_mem_addr[i];
-   d_end   = d_start + fw_dump.boot_mem_sz[i];
-
-   ret = is_fadump_mem_area_contiguous(d_start, d_end);
-   if (!ret)
-   break;
-   }
-
-   return ret;
-}
-
 /*
  * Returns true, if there are no holes in reserved memory area,
  * false otherwise.
@@ -389,10 +367,11 @@ static unsigned long __init get_fadump_area_size(void)
 static int __init add_boot_mem_region(unsigned long rstart,
  unsigned long rsize)
 {
+   int max_boot_mem_rgns = fw_dump.ops->fadump_max_boot_mem_rgns();
int i = fw_dump.boot_mem_regs_cnt++;
 
-   if (fw_dump.boot_mem_regs_cnt > FADUMP_MAX_MEM_REGS) {
-   fw_dump.boot_mem_regs_cnt = FADUMP_MAX_MEM_REGS;
+   if (fw_dump.boot_mem_regs_cnt > max_boot_mem_rgns) {
+   fw_dump.boot_mem_regs_cnt = max_boot_mem_rgns;
return 0;
}
 
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c 
b/arch/powerpc/platforms/powernv/opal-fadump.c
index 964f464b1b0e..fa26c21a08d9 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -615,6 +615,13 @@ static void opal_fadump_trigger(struct 
fadump_crash_info_header *fdh,
pr_emerg("No backend support for MPIPL!\n");
 }
 
+/* FADUMP_MAX_MEM_REGS or lower */
+static int opal_fadump_max_boot_mem_rgns(void)
+{
+   return FADUMP_MAX_MEM_REGS;
+
+}
+
 static struct fadump_ops opal_fadump_ops = {
.fadump_init_mem_struct = opal_fadump_init_mem_struct,
.fadump_get_metadata_size   = opal_fadump_get_metadata_size,
@@ -627,6 +634,7 @@ static struct fadump_ops opal_fadump_ops = {
.fadump_process = opal_fadump_process,
.fadump_region_show = opal_fadump_region_show,
.fadump_trigger = opal_fadump_trigger,
+   .fadump_max_boot_mem_rgns   = opal_fadump_max_boot_mem_rgns,
 };
 
 void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c 
b/arch/powerpc/platforms/pseries/rtas-fadump.c
index b5853e9fcc3c..1b05b4cefdfd 100644
--- a/arch/powerpc/platforms/pseries/rtas-fadump.c
+++ b/arch/powerpc/platforms/pseries/rtas-fadump.c
@@ -29,9

[RFC PATCH 0/3] powerpc/fadump: pass additional args to dump capture kernel

2023-12-05 Thread Hari Bathini
While fadump is a more reliable alternative to kdump dump capturing
method, it doesn't support passing additional parameters. Having
such support is desirable for two major reasons:

  1. It helps minimize the memory consumption of fadump dump capture
     kernel by disabling features that consume considerable amount of
     memory but have little significance for dump capture environment
     (eg. numa, cma, cgroup, etc.)
  2. It helps disable such features/components in dump capture kernel
     that are unstable and/or are being debugged.

This patch series adds support to pass additional parameters to the fadump
capture kernel to make it more desirable. For this, a dedicated area is
shared between the production kernel and the capture kernel to carry these
additional parameters. This support is enabled only on pseries as of
now. The dedicated area is referred to as RTAS_FADUMP_PARAM_AREA.

In radix MMU mode, this dedicated area can be anywhere but in case of
hash MMU, it can only be in the first memory block to be accessible
during early boot. Enabling this feature support in both radix and
hash MMU modes but in hash MMU only when RMA size is 768MB or more
to avoid complex memory real estate with FW components.
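
A hedged sketch of that placement rule (the radix branch mirrors patch 2;
the hash branch is an assumption since the quoted hunk there is truncated):

	if (radix_enabled()) {
		/* all memory is real-mode accessible: upper half of DRAM */
		range_start = memblock_end_of_DRAM() / 2;
		range_end   = memblock_end_of_DRAM();
	} else {
		/* hash MMU: area must sit in the RMA, and only if RMA >= 768MB */
		if (ppc64_rma_size < 0x30000000UL)	/* 768MB */
			return;
		range_start = 0;
		range_end   = ppc64_rma_size;
	}
	/* memblock then picks a free block within [range_start, range_end) */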

The first patch adds support for multiple boot memory regions to make
addition of any new region types simpler. The second patch sets up the
parameter (dedicated) area to be passed to the capture kernel.
/sys/kernel/fadump/bootargs_append is exported to the userspace to
specify the additional parameters to be passed to the capture kernel.
The last patch appends the parameters to bootargs during capture
kernel boot.

Hari Bathini (2):
  powerpc/fadump: pass additional parameters to dump capture kernel
  powerpc/fadump: pass additional parameters when fadump is active

Sourabh Jain (1):
  powerpc/pseries/fadump: add support for multiple boot memory regions

 arch/powerpc/include/asm/fadump-internal.h   |   5 +-
 arch/powerpc/include/asm/fadump.h|   2 +
 arch/powerpc/kernel/fadump.c | 141 +++--
 arch/powerpc/kernel/prom.c   |   3 +
 arch/powerpc/platforms/powernv/opal-fadump.c |  14 +-
 arch/powerpc/platforms/pseries/rtas-fadump.c | 293 +--
 arch/powerpc/platforms/pseries/rtas-fadump.h |  29 +-
 7 files changed, 360 insertions(+), 127 deletions(-)

-- 
2.43.0



Re: [PATCHv9 2/2] powerpc/setup: Loosen the mapping between cpu logical id and its seq in dt

2023-11-26 Thread Hari Bathini

Hi Pingfan, Michael,

On 17/10/23 4:03 pm, Hari Bathini wrote:



On 17/10/23 7:58 am, Pingfan Liu wrote:

*** Idea ***
For kexec -p, the boot cpu can be not the cpu0, this causes the problem
of allocating memory for paca_ptrs[]. However, in theory, there is no
requirement to assign cpu's logical id as its present sequence in the
device tree. But there is something like cpu_first_thread_sibling(),
which makes an assumption on the mapping inside a core. Hence partially
loosen the mapping, i.e. unbind the mapping of cores while keeping the
mapping inside a core.

*** Implement ***
At this early stage, there is plenty of memory to utilize. Hence, this
patch allocates interim memory to link the cpu info on a list, then
reorder cpus by changing the list head. As a result, there is a rotate
shift between the sequence number in dt and the cpu logical number.

*** Result ***
After this patch, a boot-cpu's logical id will always be mapped into the
range [0,threads_per_core).

Besides this, at this phase, all threads in the boot core are forced to
be onlined. This restriction will be lifted in a later patch with
extra effort.

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: Sourabh Jain 
Cc: Hari Bathini 
Cc: ke...@lists.infradead.org
To: linuxppc-dev@lists.ozlabs.org


Thanks for working on this, Pingfan.
Looks good to me.

Acked-by: Hari Bathini 



On second thoughts, probably better off with no impact for
bootcpu < nr_cpu_ids case and changing only two cores logical
numbering otherwise. Something like the below (Please share
your thoughts):

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index ec82f5bda908..78a8312aa8c4 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -76,7 +76,9 @@ u64 ppc64_rma_size;
 unsigned int boot_cpu_node_count __ro_after_init;
 #endif
 static phys_addr_t first_memblock_size;
+#ifdef CONFIG_SMP
 static int __initdata boot_cpu_count;
+#endif

 static int __init early_parse_mem(char *p)
 {
@@ -357,6 +359,25 @@ static int __init early_init_dt_scan_cpus(unsigned 
long node,

fdt_boot_cpuid_phys(initial_boot_params)) {
found = boot_cpu_count;
found_thread = i;
+   /*
+* Map boot-cpu logical id into the range
+* of [0, thread_per_core) if it can't be
+* accommodated within nr_cpu_ids.
+*/
+   if (i != boot_cpu_count && boot_cpu_count >= 
nr_cpu_ids) {
+   boot_cpuid = i;
+   DBG("Logical CPU number for boot CPU changed from %d 
to %d\n",
+   boot_cpu_count, i);
+   } else {
+   boot_cpuid = boot_cpu_count;
+   }
+
+   /* Ensure boot thread is accounted for in nr_cpu_ids */
+   if (boot_cpuid >= nr_cpu_ids) {
+   set_nr_cpu_ids(boot_cpuid + 1);
+   DBG("Adjusted nr_cpu_ids to %u, to include boot 
CPU.\n",
+   nr_cpu_ids);
+   }
}
 #ifdef CONFIG_SMP
/* logical cpu id is always 0 on UP kernels */
@@ -368,9 +389,8 @@ static int __init early_init_dt_scan_cpus(unsigned 
long node,

if (found < 0)
return 0;

-   DBG("boot cpu: logical %d physical %d\n", found,
+   DBG("boot cpu: logical %d physical %d\n", boot_cpuid,
be32_to_cpu(intserv[found_thread]));
-   boot_cpuid = found;

boot_cpu_hwid = be32_to_cpu(intserv[found_thread]);

diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c

index b7b733474b60..f7179525c774 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -409,6 +409,12 @@ static void __init cpu_init_thread_core_maps(int tpc)

 u32 *cpu_to_phys_id = NULL;

+struct interrupt_server_node {
+   boolavail;
+   int len;
+   __be32 intserv[];
+};
+
 /**
  * setup_cpu_maps - initialize the following cpu maps:
  *  cpu_possible_mask
@@ -429,9 +435,13 @@ u32 *cpu_to_phys_id = NULL;
  */
 void __init smp_setup_cpu_maps(void)
 {
+   struct interrupt_server_node *core0_node = NULL, *bt_node = NULL;
+   int orig_boot_cpu = -1, orig_boot_thread = -1;
+   bool found_boot_cpu = false;
struct device_node *dn;
-   int cpu = 0;
int nthreads = 1;
+   int cpu = 0;
+   int j, len;

DBG("smp_setup_cpu_maps()\n");

@@ -442,9 +452,9 @@ void __init smp_setup_cpu_maps(void)
  __func__, nr_cpu_ids * sizeof(u32)

Re: [PATCH v5 1/3] powerpc: make fadump resilient with memory add/remove events

2023-11-16 Thread Hari Bathini




On 17/11/23 11:01 am, Aneesh Kumar K V wrote:

On 11/17/23 10:03 AM, Sourabh Jain wrote:

Hi Aneesh,

Thanks for reviewing the patch.

On 15/11/23 10:14, Aneesh Kumar K.V wrote:

Sourabh Jain  writes:




diff --git a/arch/powerpc/include/asm/fadump-internal.h 
b/arch/powerpc/include/asm/fadump-internal.h
index 27f9e11eda28..7be3d8894520 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -42,7 +42,25 @@ static inline u64 fadump_str_to_u64(const char *str)
     #define FADUMP_CPU_UNKNOWN    (~((u32)0))
   -#define FADUMP_CRASH_INFO_MAGIC    fadump_str_to_u64("FADMPINF")
+/*
+ * The introduction of new fields in the fadump crash info header has
+ * led to a change in the magic key, from `FADMPINF` to `FADMPSIG`.
+ * This alteration ensures backward compatibility, enabling the kernel
+ * with the updated fadump crash info to handle kernel dumps from older
+ * kernels.
+ *
+ * To prevent the need for further changes to the magic number in the
+ * event of future modifications to the fadump header, a version field
+ * has been introduced to track the fadump crash info header version.
+ *
+ * Historically, there was no connection between the magic number and
+ * the fadump crash info header version. However, moving forward, the
+ * `FADMPINF` magic number in header will be treated as version 0, while
+ * the `FADMPSIG` magic number in header will include a version field to
+ * determine its version.
+ */
+#define FADUMP_CRASH_INFO_MAGIC    fadump_str_to_u64("FADMPSIG")
+#define FADUMP_VERSION    1


Can we keep the old magic details as

#define FADUMP_CRASH_INFO_MAGIC_OLD    fadump_str_to_u64("FADMPINF")
#define FADUMP_CRASH_INFO_MAGIC    fadump_str_to_u64("FADMPSIG")


Sure.


Also considering the struct need not be backward compatible, can we just
do

struct fadump_crash_info_header {
 u64    magic_number;
 u32    crashing_cpu;
 u64    elfcorehdr_addr;
 u64    elfcorehdr_size;
 u64    vmcoreinfo_raddr;
 u64    vmcoreinfo_size;
 struct pt_regs    regs;
 struct cpumask    cpu_mask;
};
static inline bool fadump_compatible(struct fadump_crash_info_header
*fdh)
{
 return (fdh->magic_number == FADUMP_CRASH_INFO_MAGIC)
}

and fail fadump if we find it not compatible?


Agree that it is unsafe to collect a dump with an incompatible fadump crash 
info header.

Given that I am updating the fadump crash info header, we can make a few
arrangements, like adding a size field for the dynamically sized attributes
(pt_regs and cpumask), to ensure better compatibility in the future.

Additionally, let's introduce a version field to the fadump crash info header 
to avoid changing
the magic number in the future.



I am not sure whether we need to add all the complexity to enable supporting
different fadump kernel versions. Is that even a possible use case with
fadump? Can't we always assume that with fadump the crash kernel and fadump
kernel will be the same version? If yes, we can simply fail with a magic
number mismatch because that indicates a user config error?


If we decide not to support different kernel versions for production
kernel and capture kernel, We can make that implicit by adding kernel
version info of production kernel in the header and bailing out if
there is kernel version mismatch as magic could still match for two
different kernel versions.

I would personally prefer something like the below though:

struct fadump_crash_info_header {
u64 magic_number;
u32 version
u32 crashing_cpu;
u64 elfcorehdr_addr;
u64 elfcorehdr_size;
u64 vmcoreinfo_raddr;
u64 vmcoreinfo_size;
u8  kernel_version[];
u32 pt_regs_sz;
struct pt_regs  regs;
u32 cpu_mask_sz;
struct cpumask  cpu_mask;
};

if (magic_number != new_magic)
goto err;   /* Error out */

if (kernel_version != capture_kernel_version)
{
		if (pt_regs_sz == sizeof(struct pt_regs) && cpu_mask_sz == sizeof(struct cpumask))

/*
 * Warn about the kernel version mismatch and how data 
can be different
 * across kernel versions and proceed anyway!
 */
else
goto err;   /* Error out */
}

This ensures we warn and proceed in cases where it is less likely to
have issues capturing the kernel dump. This helps in a dev environment
where we are trying to debug an early boot crash - in which case the
capture kernel can't be the same kernel, as it would likely hit the same
problem while booting.

Thanks
Hari


Re: [PATCH v7 1/5] powerpc/code-patching: introduce patch_instructions()

2023-10-30 Thread Hari Bathini

Hi Aneesh,

On 30/10/23 6:32 pm, Aneesh Kumar K.V wrote:

Hari Bathini  writes:


patch_instruction() entails setting up pte, patching the instruction,
clearing the pte and flushing the tlb. If multiple instructions need
to be patched, every instruction would have to go through the above
drill unnecessarily. Instead, introduce patch_instructions() function
that sets up the pte, clears the pte and flushes the tlb only once
per page range of instructions to be patched. Duplicate most of the
patch_instruction() code instead of merging with it, to avoid the
performance degradation observed on ppc32, for patch_instruction(),
with the code path merged. Also, setup poking_init() always as BPF
expects poking_init() to be setup even when STRICT_KERNEL_RWX is off.

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 



A lot of this is duplicate of patch_instruction(). Can we consolidate
thing between them?


True. The code was consolidated till v5 but had to duplicate most of it
to avoid performance degradation reported on ppc32:


https://lore.kernel.org/all/6cceb564-8b52-4d98-9118-92a914f48...@csgroup.eu/




---

Changes in v7:
* Fixed crash observed with !STRICT_RWX.


  arch/powerpc/include/asm/code-patching.h |   1 +
  arch/powerpc/lib/code-patching.c | 141 ++-
  2 files changed, 139 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index 3f881548fb61..0e29ccf903d0 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
  int patch_branch(u32 *addr, unsigned long target, int flags);
  int patch_instruction(u32 *addr, ppc_inst_t instr);
  int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
+int patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr);
  
  static inline unsigned long patch_site_addr(s32 *site)

  {
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index b00112d7ad46..e1c1fd9246d8 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -204,9 +204,6 @@ void __init poking_init(void)
  {
int ret;
  
-	if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))

-   return;
-
if (mm_patch_enabled())
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"powerpc/text_poke_mm:online",
@@ -378,6 +375,144 @@ int patch_instruction(u32 *addr, ppc_inst_t instr)
  }
  NOKPROBE_SYMBOL(patch_instruction);
  
+static int __patch_instructions(u32 *patch_addr, u32 *code, size_t len, bool repeat_instr)

+{
+   unsigned long start = (unsigned long)patch_addr;
+
+   /* Repeat instruction */
+   if (repeat_instr) {
+   ppc_inst_t instr = ppc_inst_read(code);
+
+   if (ppc_inst_prefixed(instr)) {
+   u64 val = ppc_inst_as_ulong(instr);
+
+   memset64((u64 *)patch_addr, val, len / 8);
+   } else {
+   u32 val = ppc_inst_val(instr);
+
+   memset32(patch_addr, val, len / 4);
+   }
+   } else {
+   memcpy(patch_addr, code, len);
+   }
+
+   smp_wmb();  /* smp write barrier */
+   flush_icache_range(start, start + len);
+   return 0;
+}
+
+/*
+ * A page is mapped and instructions that fit the page are patched.
+ * Assumes 'len' to be (PAGE_SIZE - offset_in_page(addr)) or below.
+ */
+static int __do_patch_instructions_mm(u32 *addr, u32 *code, size_t len, bool 
repeat_instr)
+{
+   struct mm_struct *patching_mm, *orig_mm;
+   unsigned long pfn = get_patch_pfn(addr);
+   unsigned long text_poke_addr;
+   spinlock_t *ptl;
+   u32 *patch_addr;
+   pte_t *pte;
+   int err;
+
+   patching_mm = __this_cpu_read(cpu_patching_context.mm);
+   text_poke_addr = __this_cpu_read(cpu_patching_context.addr);
+   patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
+
+   pte = get_locked_pte(patching_mm, text_poke_addr, &ptl);
+   if (!pte)
+   return -ENOMEM;
+
+   __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, 
PAGE_KERNEL), 0);
+
+   /* order PTE update before use, also serves as the hwsync */
+   asm volatile("ptesync" ::: "memory");
+
+   /* order context switch after arbitrary prior code */
+   isync();
+
+   orig_mm = start_using_temp_mm(patching_mm);
+
+   err = __patch_instructions(patch_addr, code, len, repeat_instr);
+
+   /* context synchronisation performed by __patch_instructions */
+   stop_using_temp_mm(patching_mm, orig_mm);
+
+   pte_clear(patching_mm, text_poke_addr, pte);
+   /*
+* ptesync to order PTE update before TLB invalidation done
+* by radix__local_flush_tlb_page_psize (in _tlbiel_va)
+ 

Re: [PATCH v6 5/5] powerpc/bpf: use bpf_jit_binary_pack_[alloc|finalize|free]

2023-10-20 Thread Hari Bathini




On 19/10/23 11:41 am, Michael Ellerman wrote:

Hari Bathini  writes:

Use bpf_jit_binary_pack_alloc in powerpc jit. The jit engine first
writes the program to the rw buffer. When the jit is done, the program
is copied to the final location with bpf_jit_binary_pack_finalize.
With multiple jit_subprogs, bpf_jit_free is called on some subprograms
that haven't got bpf_jit_binary_pack_finalize() yet. Implement custom
bpf_jit_free() like in commit 1d5f82d9dd47 ("bpf, x86: fix freeing of
not-finalized bpf_prog_pack") to call bpf_jit_binary_pack_finalize(),
if necessary. As bpf_flush_icache() is not needed anymore, remove it.

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 
---
  arch/powerpc/net/bpf_jit.h|  18 ++---
  arch/powerpc/net/bpf_jit_comp.c   | 106 ++
  arch/powerpc/net/bpf_jit_comp32.c |  13 ++--
  arch/powerpc/net/bpf_jit_comp64.c |  10 +--
  4 files changed, 96 insertions(+), 51 deletions(-)


This causes a crash at boot on my Power7 box:


Thanks, Michael.
Posted v7.

- Hari


[PATCH v7 1/5] powerpc/code-patching: introduce patch_instructions()

2023-10-20 Thread Hari Bathini
patch_instruction() entails setting up pte, patching the instruction,
clearing the pte and flushing the tlb. If multiple instructions need
to be patched, every instruction would have to go through the above
drill unnecessarily. Instead, introduce patch_instructions() function
that sets up the pte, clears the pte and flushes the tlb only once
per page range of instructions to be patched. Duplicate most of the
patch_instruction() code instead of merging with it, to avoid the
performance degradation observed on ppc32, for patch_instruction(),
with the code path merged. Also, setup poking_init() always as BPF
expects poking_init() to be setup even when STRICT_KERNEL_RWX is off.
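
For context, a hedged usage sketch for the new helper (signature as in
the hunk below; callers serialize via text_mutex):

	u32 trap = PPC_RAW_TRAP();
	int err;

	/* copy a block of code to its final (possibly read-only) location */
	err = patch_instructions(dst, src, len, false);

	/* or fill a region with one repeated instruction, e.g. trap padding */
	err = patch_instructions(dst, &trap, len, true);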

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 
---

Changes in v7:
* Fixed crash observed with !STRICT_RWX.


 arch/powerpc/include/asm/code-patching.h |   1 +
 arch/powerpc/lib/code-patching.c | 141 ++-
 2 files changed, 139 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index 3f881548fb61..0e29ccf903d0 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
 int patch_branch(u32 *addr, unsigned long target, int flags);
 int patch_instruction(u32 *addr, ppc_inst_t instr);
 int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
+int patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr);
 
 static inline unsigned long patch_site_addr(s32 *site)
 {
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index b00112d7ad46..e1c1fd9246d8 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -204,9 +204,6 @@ void __init poking_init(void)
 {
int ret;
 
-   if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
-   return;
-
if (mm_patch_enabled())
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"powerpc/text_poke_mm:online",
@@ -378,6 +375,144 @@ int patch_instruction(u32 *addr, ppc_inst_t instr)
 }
 NOKPROBE_SYMBOL(patch_instruction);
 
+static int __patch_instructions(u32 *patch_addr, u32 *code, size_t len, bool 
repeat_instr)
+{
+   unsigned long start = (unsigned long)patch_addr;
+
+   /* Repeat instruction */
+   if (repeat_instr) {
+   ppc_inst_t instr = ppc_inst_read(code);
+
+   if (ppc_inst_prefixed(instr)) {
+   u64 val = ppc_inst_as_ulong(instr);
+
+   memset64((u64 *)patch_addr, val, len / 8);
+   } else {
+   u32 val = ppc_inst_val(instr);
+
+   memset32(patch_addr, val, len / 4);
+   }
+   } else {
+   memcpy(patch_addr, code, len);
+   }
+
+   smp_wmb();  /* smp write barrier */
+   flush_icache_range(start, start + len);
+   return 0;
+}
+
+/*
+ * A page is mapped and instructions that fit the page are patched.
+ * Assumes 'len' to be (PAGE_SIZE - offset_in_page(addr)) or below.
+ */
+static int __do_patch_instructions_mm(u32 *addr, u32 *code, size_t len, bool 
repeat_instr)
+{
+   struct mm_struct *patching_mm, *orig_mm;
+   unsigned long pfn = get_patch_pfn(addr);
+   unsigned long text_poke_addr;
+   spinlock_t *ptl;
+   u32 *patch_addr;
+   pte_t *pte;
+   int err;
+
+   patching_mm = __this_cpu_read(cpu_patching_context.mm);
+   text_poke_addr = __this_cpu_read(cpu_patching_context.addr);
+   patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
+
+   pte = get_locked_pte(patching_mm, text_poke_addr, &ptl);
+   if (!pte)
+   return -ENOMEM;
+
+   __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, 
PAGE_KERNEL), 0);
+
+   /* order PTE update before use, also serves as the hwsync */
+   asm volatile("ptesync" ::: "memory");
+
+   /* order context switch after arbitrary prior code */
+   isync();
+
+   orig_mm = start_using_temp_mm(patching_mm);
+
+   err = __patch_instructions(patch_addr, code, len, repeat_instr);
+
+   /* context synchronisation performed by __patch_instructions */
+   stop_using_temp_mm(patching_mm, orig_mm);
+
+   pte_clear(patching_mm, text_poke_addr, pte);
+   /*
+* ptesync to order PTE update before TLB invalidation done
+* by radix__local_flush_tlb_page_psize (in _tlbiel_va)
+*/
+   local_flush_tlb_page_psize(patching_mm, text_poke_addr, 
mmu_virtual_psize);
+
+   pte_unmap_unlock(pte, ptl);
+
+   return err;
+}
+
+/*
+ * A page is mapped and instructions that fit the page are patched.
+ * Assumes 'len' to be (PAGE_SIZE - offset_in_page(addr)) or below.
+ */
+static int __do_patch_instructions(u32 *addr, u32 *code, size_t len, bool 
repeat_in

[PATCH v7 5/5] powerpc/bpf: use bpf_jit_binary_pack_[alloc|finalize|free]

2023-10-20 Thread Hari Bathini
Use bpf_jit_binary_pack_alloc in powerpc jit. The jit engine first
writes the program to the rw buffer. When the jit is done, the program
is copied to the final location with bpf_jit_binary_pack_finalize.
With multiple jit_subprogs, bpf_jit_free is called on some subprograms
that haven't got bpf_jit_binary_pack_finalize() yet. Implement custom
bpf_jit_free() like in commit 1d5f82d9dd47 ("bpf, x86: fix freeing of
not-finalized bpf_prog_pack") to call bpf_jit_binary_pack_finalize(),
if necessary. As bpf_flush_icache() is not needed anymore, remove it.
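
At a high level the flow is roughly as below (a hedged sketch; error
handling trimmed and argument order as of this series):

	/* returns the RO header; fimage/hdr/image follow the patch's naming */
	fhdr = bpf_jit_binary_pack_alloc(proglen, &fimage, 4, &hdr, &image,
					 bpf_jit_fill_ill_insns);
	if (!fhdr)
		return org_fp;

	/* ... JIT passes write instructions into the RW buffer 'image' ... */

	/* copy the RW buffer into the shared read-only pack, free the RW copy */
	if (bpf_jit_binary_pack_finalize(fp, fhdr, hdr))
		goto out;	/* bail out */

	fp->bpf_func = (void *)fimage;
	fp->jited = 1;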

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 
---

* No changes in v7.


 arch/powerpc/net/bpf_jit.h|  18 ++---
 arch/powerpc/net/bpf_jit_comp.c   | 106 ++
 arch/powerpc/net/bpf_jit_comp32.c |  13 ++--
 arch/powerpc/net/bpf_jit_comp64.c |  10 +--
 4 files changed, 96 insertions(+), 51 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 72b7bb34fade..cdea5dccaefe 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -36,9 +36,6 @@
EMIT(PPC_RAW_BRANCH(offset)); \
} while (0)
 
-/* bl (unconditional 'branch' with link) */
-#define PPC_BL(dest)   EMIT(PPC_RAW_BL((dest) - (unsigned long)(image + 
ctx->idx)))
-
 /* "cond" here covers BO:BI fields. */
 #define PPC_BCC_SHORT(cond, dest)\
do {  \
@@ -147,12 +144,6 @@ struct codegen_context {
 #define BPF_FIXUP_LEN  2 /* Two instructions => 8 bytes */
 #endif
 
-static inline void bpf_flush_icache(void *start, void *end)
-{
-   smp_wmb();  /* smp write barrier */
-   flush_icache_range((unsigned long)start, (unsigned long)end);
-}
-
 static inline bool bpf_is_seen_register(struct codegen_context *ctx, int i)
 {
return ctx->seen & (1 << (31 - i));
@@ -169,16 +160,17 @@ static inline void bpf_clear_seen_register(struct 
codegen_context *ctx, int i)
 }
 
 void bpf_jit_init_reg_mapping(struct codegen_context *ctx);
-int bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 
func);
-int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context 
*ctx,
+int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func);
+int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct 
codegen_context *ctx,
   u32 *addrs, int pass, bool extra_pass);
 void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
 void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
 void bpf_jit_realloc_regs(struct codegen_context *ctx);
 int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int 
tmp_reg, long exit_addr);
 
-int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, int pass, struct 
codegen_context *ctx,
- int insn_idx, int jmp_off, int dst_reg);
+int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int 
pass,
+ struct codegen_context *ctx, int insn_idx,
+ int jmp_off, int dst_reg);
 
 #endif
 
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index e7ca270a39d5..a79d7c478074 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -44,9 +44,12 @@ int bpf_jit_emit_exit_insn(u32 *image, struct 
codegen_context *ctx, int tmp_reg,
 }
 
 struct powerpc_jit_data {
-   struct bpf_binary_header *header;
+   /* address of rw header */
+   struct bpf_binary_header *hdr;
+   /* address of ro final header */
+   struct bpf_binary_header *fhdr;
u32 *addrs;
-   u8 *image;
+   u8 *fimage;
u32 proglen;
struct codegen_context ctx;
 };
@@ -67,11 +70,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
struct codegen_context cgctx;
int pass;
int flen;
-   struct bpf_binary_header *bpf_hdr;
+   struct bpf_binary_header *fhdr = NULL;
+   struct bpf_binary_header *hdr = NULL;
struct bpf_prog *org_fp = fp;
struct bpf_prog *tmp_fp;
bool bpf_blinded = false;
bool extra_pass = false;
+   u8 *fimage = NULL;
+   u32 *fcode_base;
u32 extable_len;
u32 fixup_len;
 
@@ -101,9 +107,16 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
addrs = jit_data->addrs;
if (addrs) {
cgctx = jit_data->ctx;
-   image = jit_data->image;
-   bpf_hdr = jit_data->header;
+   /*
+* JIT compiled to a writable location (image/code_base) first.
+* It is then moved to the readonly final location 
(fimage/fcode_base)
+* using instruction patching.
+*/
+   

[PATCH v7 3/5] powerpc/bpf: implement bpf_arch_text_invalidate for bpf_prog_pack

2023-10-20 Thread Hari Bathini
Implement bpf_arch_text_invalidate and use it to fill unused part of
the bpf_prog_pack with trap instructions when a BPF program is freed.

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 
---

* No changes in v7.


 arch/powerpc/net/bpf_jit_comp.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index c740eac8d584..ecd7cffbbe28 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -292,3 +292,18 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len)
 
return err ? ERR_PTR(err) : dst;
 }
+
+int bpf_arch_text_invalidate(void *dst, size_t len)
+{
+   u32 insn = BREAKPOINT_INSTRUCTION;
+   int ret;
+
+   if (WARN_ON_ONCE(core_kernel_text((unsigned long)dst)))
+   return -EINVAL;
+
+   mutex_lock(&text_mutex);
+   ret = patch_instructions(dst, &insn, len, true);
+   mutex_unlock(&text_mutex);
+
+   return ret;
+}
-- 
2.41.0



[PATCH v7 4/5] powerpc/bpf: rename powerpc64_jit_data to powerpc_jit_data

2023-10-20 Thread Hari Bathini
powerpc64_jit_data is a misnomer as it is meant for both ppc32 and
ppc64. Rename it to powerpc_jit_data.

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 
---

* No changes in v7.


 arch/powerpc/net/bpf_jit_comp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index ecd7cffbbe28..e7ca270a39d5 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -43,7 +43,7 @@ int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context 
*ctx, int tmp_reg,
return 0;
 }
 
-struct powerpc64_jit_data {
+struct powerpc_jit_data {
struct bpf_binary_header *header;
u32 *addrs;
u8 *image;
@@ -63,7 +63,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
u8 *image = NULL;
u32 *code_base;
u32 *addrs;
-   struct powerpc64_jit_data *jit_data;
+   struct powerpc_jit_data *jit_data;
struct codegen_context cgctx;
int pass;
int flen;
-- 
2.41.0



[PATCH v7 2/5] powerpc/bpf: implement bpf_arch_text_copy

2023-10-20 Thread Hari Bathini
bpf_arch_text_copy() is used to dump the JITed binary to an RX page,
allowing multiple BPF programs to share the same page. Use the newly
introduced patch_instructions() to implement it.
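
A minimal caller-side sketch (illustration only; the names below are
hypothetical): the JIT emits into a writable staging buffer and this hook
copies the finished image into the read-only (RX) destination.

  /*
   * Sketch: publish a finished JIT image from its RW staging buffer to
   * the read-only (RX) destination inside a prog-pack page.
   */
  static void *publish_jited_image(void *ro_image, void *rw_image, size_t size)
  {
          /* Returns the RO destination on success, or an ERR_PTR() on failure. */
          return bpf_arch_text_copy(ro_image, rw_image, size);
  }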

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 
---

* No changes in v7.


 arch/powerpc/net/bpf_jit_comp.c | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 37043dfc1add..c740eac8d584 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -13,9 +13,13 @@
 #include 
 #include 
 #include 
-#include 
+#include 
+#include 
 #include 
 
+#include 
+#include 
+
 #include "bpf_jit.h"
 
 static void bpf_jit_fill_ill_insns(void *area, unsigned int size)
@@ -274,3 +278,17 @@ int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, 
int pass, struct code
ctx->exentry_idx++;
return 0;
 }
+
+void *bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+   int err;
+
+   if (WARN_ON_ONCE(core_kernel_text((unsigned long)dst)))
+   return ERR_PTR(-EINVAL);
+
+   mutex_lock(&text_mutex);
+   err = patch_instructions(dst, src, len, false);
+   mutex_unlock(&text_mutex);
+
+   return err ? ERR_PTR(err) : dst;
+}
-- 
2.41.0



[PATCH v7 0/5] powerpc/bpf: use BPF prog pack allocator

2023-10-20 Thread Hari Bathini
Most BPF programs are small, but they consume a page each. For systems
with busy traffic and many BPF programs, this may also add significant
pressure on the instruction TLB. High iTLB pressure usually slows down
the whole system, causing visible performance degradation for production
workloads.

bpf_prog_pack, a customized allocator that packs multiple bpf programs
into preallocated memory chunks, was proposed [1] to address it. This
series extends that support to powerpc.

Both bpf_arch_text_copy() and bpf_arch_text_invalidate(), needed for
this support, depend on instruction patching in the text area.
Currently, patch_instruction() supports patching only one instruction
at a time. The first patch introduces a patch_instructions() function
to enable patching more than one instruction at a time. This helps in
avoiding performance degradation while JITing bpf programs.

Patches 2 & 3 implement the above mentioned arch-specific functions
using patch_instructions(). Patch 4 fixes a misnomer in the bpf JITing
code. The last patch enables the use of the BPF prog pack allocator on
powerpc and also ensures cleanup is handled gracefully.

[1] https://lore.kernel.org/bpf/20220204185742.271030-1-s...@kernel.org/
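
To make the performance point concrete, a hedged illustration (helper
names are hypothetical; non-prefixed instructions assumed): patching N
instructions one at a time pays the map/unmap and TLB-flush cost per
instruction, while a single patch_instructions() call pays it once per
page range.

  /* Patching N instructions one at a time: one setup/teardown each. */
  static int patch_one_by_one(u32 *dst, u32 *src, int nr)
  {
          int i, err;

          for (i = 0; i < nr; i++) {
                  err = patch_instruction(dst + i, ppc_inst(src[i]));
                  if (err)
                          return err;
          }
          return 0;
  }

  /* Patching the same N instructions in bulk: one setup per page range. */
  static int patch_in_bulk(u32 *dst, u32 *src, int nr)
  {
          return patch_instructions(dst, src, nr * sizeof(u32), false);
  }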

Changes in v7:
* Fixed crash observed with !STRICT_RWX.

Changes in v6:
* No changes in patches 2-5/5 except addition of Acked-by tags from Song.
* Skipped merging code path of patch_instruction() & patch_instructions()
  to avoid performance overhead observed on ppc32 with that.

Changes in v5:
* Moved introduction of patch_instructions() as 1st patch in series.
* Improved patch_instructions() to use memset & memcpy.
* Fixed the misnomer in JITing code as a separate patch.
* Removed unused bpf_flush_icache() function.

Changes in v4:
* Updated bpf_patch_instructions() definition in patch 1/5 so that
  it doesn't have to be updated again in patch 2/5.
* Addressed Christophe's comment on bpf_arch_text_invalidate() return
  value in patch 2/5.

Changes in v3:
* Fixed segfault issue observed on ppc32 due to inaccurate offset
  calculation for branching.
* Tried to minimize the performance impact for patch_instruction()
  with the introduction of patch_instructions().
* Corrected uses of u32 * vs ppc_inst_t.
* Moved the change that introduces patch_instructions() to after
  enabling bpf_prog_pack support.
* Added a few comments to improve code readability.

Changes in v2:
* Introduced patch_instructions() to help with patching bpf programs.


Hari Bathini (5):
  powerpc/code-patching: introduce patch_instructions()
  powerpc/bpf: implement bpf_arch_text_copy
  powerpc/bpf: implement bpf_arch_text_invalidate for bpf_prog_pack
  powerpc/bpf: rename powerpc64_jit_data to powerpc_jit_data
  powerpc/bpf: use bpf_jit_binary_pack_[alloc|finalize|free]

 arch/powerpc/include/asm/code-patching.h |   1 +
 arch/powerpc/lib/code-patching.c | 141 +-
 arch/powerpc/net/bpf_jit.h   |  18 +--
 arch/powerpc/net/bpf_jit_comp.c  | 145 ++-
 arch/powerpc/net/bpf_jit_comp32.c|  13 +-
 arch/powerpc/net/bpf_jit_comp64.c|  10 +-
 6 files changed, 271 insertions(+), 57 deletions(-)

-- 
2.41.0



Re: [PATCHv9 2/2] powerpc/setup: Loosen the mapping between cpu logical id and its seq in dt

2023-10-20 Thread Hari Bathini




On 18/10/23 1:51 pm, Pingfan Liu wrote:

On Tue, Oct 17, 2023 at 6:39 PM Hari Bathini  wrote:




On 17/10/23 7:58 am, Pingfan Liu wrote:

*** Idea ***
For kexec -p, the boot cpu need not be cpu0, which causes a problem when
allocating memory for paca_ptrs[]. In theory, though, there is no
requirement that a cpu's logical id match its sequence in the device
tree. But there is code such as cpu_first_thread_sibling(), which makes
assumptions about the mapping inside a core. Hence, loosen the mapping
only partially, i.e. unbind the mapping of cores while keeping the
mapping of threads inside a core.

*** Implement ***
At this early stage, there is plenty of memory to utilize. Hence, this
patch allocates interim memory to link the cpu info on a list, then
reorders cpus by changing the list head. As a result, there is a rotate
shift between the sequence number in the device tree and the cpu
logical number.

*** Result ***
After this patch, the boot cpu's logical id will always be mapped into
the range [0, threads_per_core).

Besides this, at this phase, all threads in the boot core are forced to
be onlined. This restriction will be lifted in a later patch with
extra effort.

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: Sourabh Jain 
Cc: Hari Bathini 
Cc: ke...@lists.infradead.org
To: linuxppc-dev@lists.ozlabs.org


Thanks for working on this, Pingfan.
Looks good to me.

Acked-by: Hari Bathini 



Thank you for kindly reviewing. I hope that after all these years, we
have accomplished the objective.



I hope so too.
Thanks!


Re: [PATCHv9 2/2] powerpc/setup: Loosen the mapping between cpu logical id and its seq in dt

2023-10-17 Thread Hari Bathini




On 17/10/23 7:58 am, Pingfan Liu wrote:

*** Idea ***
For kexec -p, the boot cpu need not be cpu0, which causes a problem when
allocating memory for paca_ptrs[]. In theory, though, there is no
requirement that a cpu's logical id match its sequence in the device
tree. But there is code such as cpu_first_thread_sibling(), which makes
assumptions about the mapping inside a core. Hence, loosen the mapping
only partially, i.e. unbind the mapping of cores while keeping the
mapping of threads inside a core.

*** Implement ***
At this early stage, there is plenty of memory to utilize. Hence, this
patch allocates interim memory to link the cpu info on a list, then
reorders cpus by changing the list head. As a result, there is a rotate
shift between the sequence number in the device tree and the cpu
logical number.

*** Result ***
After this patch, the boot cpu's logical id will always be mapped into
the range [0, threads_per_core).

Besides this, at this phase, all threads in the boot core are forced to
be onlined. This restriction will be lifted in a later patch with
extra effort.
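
For illustration of the rotate shift (hypothetical numbers and helper,
not from the patch): with threads_per_core = 8, 4 cores and the boot CPU
being thread 3 of the second core in device-tree order, the boot core's
threads become logical CPUs 0-7 and the boot CPU ends up as logical 3.

  /* Hypothetical sketch of the rotate-shift mapping described above. */
  static int dt_seq_to_logical(int dt_seq, int boot_core_dt, int nr_cores,
                               int threads_per_core)
  {
          int dt_core = dt_seq / threads_per_core;
          int thread  = dt_seq % threads_per_core;
          /* rotate so that the boot core becomes logical core 0 */
          int logical_core = (dt_core - boot_core_dt + nr_cores) % nr_cores;

          return logical_core * threads_per_core + thread;
  }

  /*
   * dt_seq_to_logical(11, 1, 4, 8) == 3, i.e. the boot CPU (dt sequence 11)
   * lands within [0, threads_per_core) as stated above.
   */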

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: Sourabh Jain 
Cc: Hari Bathini 
Cc: ke...@lists.infradead.org
To: linuxppc-dev@lists.ozlabs.org


Thanks for working on this, Pingfan.
Looks good to me.

Acked-by: Hari Bathini 


---
  arch/powerpc/kernel/prom.c | 25 +
  arch/powerpc/kernel/setup-common.c | 84 +++---
  2 files changed, 82 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index ec82f5bda908..7ed9034912ca 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -76,7 +76,9 @@ u64 ppc64_rma_size;
  unsigned int boot_cpu_node_count __ro_after_init;
  #endif
  static phys_addr_t first_memblock_size;
+#ifdef CONFIG_SMP
  static int __initdata boot_cpu_count;
+#endif
  
  static int __init early_parse_mem(char *p)

  {
@@ -331,8 +333,7 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
const __be32 *intserv;
int i, nthreads;
int len;
-   int found = -1;
-   int found_thread = 0;
+   bool found = false;
  
  	/* We are scanning "cpu" nodes only */

if (type == NULL || strcmp(type, "cpu") != 0)
@@ -355,8 +356,15 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
for (i = 0; i < nthreads; i++) {
if (be32_to_cpu(intserv[i]) ==
fdt_boot_cpuid_phys(initial_boot_params)) {
-   found = boot_cpu_count;
-   found_thread = i;
+   /*
+* always map the boot-cpu logical id into the
+* range of [0, thread_per_core)
+*/
+   boot_cpuid = i;
+   found = true;
+   /* This forces all threads in a core to be online */
+   if (nr_cpu_ids % nthreads != 0)
+   set_nr_cpu_ids(ALIGN(nr_cpu_ids, nthreads));
}
  #ifdef CONFIG_SMP
/* logical cpu id is always 0 on UP kernels */
@@ -365,14 +373,13 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
}
  
  	/* Not the boot CPU */

-   if (found < 0)
+   if (!found)
return 0;
  
-	DBG("boot cpu: logical %d physical %d\n", found,

-   be32_to_cpu(intserv[found_thread]));
-   boot_cpuid = found;
+   DBG("boot cpu: logical %d physical %d\n", boot_cpuid,
+   be32_to_cpu(intserv[boot_cpuid]));
  
-	boot_cpu_hwid = be32_to_cpu(intserv[found_thread]);

+   boot_cpu_hwid = be32_to_cpu(intserv[boot_cpuid]);
  
  	/*

 * PAPR defines "logical" PVR values for cpus that
diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 707f0490639d..9802c7e5ee2f 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -36,6 +36,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -425,6 +426,13 @@ static void __init cpu_init_thread_core_maps(int tpc)
  
  u32 *cpu_to_phys_id = NULL;
  
+struct interrupt_server_node {

+   struct list_head node;
+   boolavail;
+   int len;
+   __be32 intserv[];
+};
+
  /**
   * setup_cpu_maps - initialize the following cpu maps:
   *  cpu_possible_mask
@@ -446,11 +454,16 @@ u32 *cpu_to_phys_id = NULL;
  void __init smp_setup_cpu_maps(void)
  {
struct device_node *dn;
-   int cpu = 0;
-   int nthreads = 1;
+   int shift = 0, cpu = 0;
+   int j, nthreads = 1;
+   int len;
+   struct interrupt_server_node *intserv_node, *n;
+   struct list_head *bt_node, head;
+   boo

Re: [PATCHv9 1/2] powerpc/setup : Enable boot_cpu_hwid for PPC32

2023-10-17 Thread Hari Bathini




On 17/10/23 7:58 am, Pingfan Liu wrote:

In order to identify the boot cpu, its intserv[] should be recorded and
checked in smp_setup_cpu_maps().

smp_setup_cpu_maps() is shared between PPC64 and PPC32. Since PPC64
already uses boot_cpu_hwid to carry that information, enable this
variable on PPC32 as well, so that a later patch can use it to carry
that information for PPC32 too.

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: Sourabh Jain 
Cc: Hari Bathini 
Cc: ke...@lists.infradead.org
To: linuxppc-dev@lists.ozlabs.org


LGTM.

Acked-by: Hari Bathini 


---
  arch/powerpc/include/asm/smp.h | 2 +-
  arch/powerpc/kernel/prom.c | 3 +--
  arch/powerpc/kernel/setup-common.c | 2 --
  3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 576d0e15..5db9178cc800 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -26,7 +26,7 @@
  #include 
  
  extern int boot_cpuid;

-extern int boot_cpu_hwid; /* PPC64 only */
+extern int boot_cpu_hwid;
  extern int spinning_secondaries;
  extern u32 *cpu_to_phys_id;
  extern bool coregroup_enabled;
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 0b5878c3125b..ec82f5bda908 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -372,8 +372,7 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
be32_to_cpu(intserv[found_thread]));
boot_cpuid = found;
  
-	if (IS_ENABLED(CONFIG_PPC64))

-   boot_cpu_hwid = be32_to_cpu(intserv[found_thread]);
+   boot_cpu_hwid = be32_to_cpu(intserv[found_thread]);
  
  	/*

 * PAPR defines "logical" PVR values for cpus that
diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 2f1026fba00d..707f0490639d 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -87,9 +87,7 @@ EXPORT_SYMBOL(machine_id);
  int boot_cpuid = -1;
  EXPORT_SYMBOL_GPL(boot_cpuid);
  
-#ifdef CONFIG_PPC64

  int boot_cpu_hwid = -1;
-#endif
  
  /*

   * These are used in binfmt_elf.c to put aux entries on the stack


Re: [PATCH v6 0/5] powerpc/bpf: use BPF prog pack allocator

2023-10-16 Thread Hari Bathini




On 16/10/23 5:37 pm, Daniel Borkmann wrote:

On 10/12/23 10:03 PM, Hari Bathini wrote:

Most BPF programs are small, but they consume a page each. For systems
with busy traffic and many BPF programs, this may also add significant
pressure on instruction TLB. High iTLB pressure usually slows down the
whole system causing visible performance degradation for production
workloads.

bpf_prog_pack, a customized allocator that packs multiple bpf programs
into preallocated memory chunks, was proposed [1] to address it. This
series extends this support on powerpc.

Both bpf_arch_text_copy() & bpf_arch_text_invalidate() functions,
needed for this support depend on instruction patching in text area.
Currently, patch_instruction() supports patching only one instruction
at a time. The first patch introduces patch_instructions() function
to enable patching more than one instruction at a time. This helps in
avoiding performance degradation while JITing bpf programs.

Patches 2 & 3 implement the above mentioned arch specific functions
using patch_instructions(). Patch 4 fixes a misnomer in bpf JITing
code. The last patch enables the use of BPF prog pack allocator on
powerpc and also, ensures cleanup is handled gracefully.

[1] https://lore.kernel.org/bpf/20220204185742.271030-1-s...@kernel.org/

Changes in v6:
* No changes in patches 2-5/5 except addition of Acked-by tags from Song.
* Skipped merging code path of patch_instruction() & patch_instructions()
   to avoid performance overhead observed on ppc32 with that.


I presume this will be routed via Michael?


Yes, Daniel. This can go via linuxppc tree.

Thanks
Hari


Re: [PATCH v5 1/5] powerpc/code-patching: introduce patch_instructions()

2023-10-12 Thread Hari Bathini

Thanks for the review, Christophe.

On 10/10/23 11:16 pm, Christophe Leroy wrote:



Le 28/09/2023 à 21:48, Hari Bathini a écrit :

patch_instruction() entails setting up pte, patching the instruction,
clearing the pte and flushing the tlb. If multiple instructions need
to be patched, every instruction would have to go through the above
drill unnecessarily. Instead, introduce function patch_instructions()
that sets up the pte, clears the pte and flushes the tlb only once per
page range of instructions to be patched. This adds a slight overhead
to patch_instruction() call while improving the patching time for
scenarios where more than one instruction needs to be patched.


Not a "slight" but a "significant" overhead on PPC32.

Thinking about it once more, I don't think it is a good idea to try and
merge that into the existing code-patching logic, which is really
single-instruction performance oriented.

Anyway, comments below.



Signed-off-by: Hari Bathini 
---
   arch/powerpc/include/asm/code-patching.h |  1 +
   arch/powerpc/lib/code-patching.c | 93 +---
   2 files changed, 85 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index 3f881548fb61..43a4aedfa703 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
   int patch_branch(u32 *addr, unsigned long target, int flags);
   int patch_instruction(u32 *addr, ppc_inst_t instr);
   int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
+int patch_instructions(void *addr, void *code, size_t len, bool repeat_instr);


I don't like void *; you can do too many nasty things with that.
I think you want u32 *.

   
   static inline unsigned long patch_site_addr(s32 *site)

   {
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index b00112d7ad46..4ff002bc41f6 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -278,7 +278,36 @@ static void unmap_patch_area(unsigned long addr)
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
   }
   
-static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr)

+static int __patch_instructions(u32 *patch_addr, void *code, size_t len, bool 
repeat_instr)
+{
+   unsigned long start = (unsigned long)patch_addr;
+
+   /* Repeat instruction */
+   if (repeat_instr) {
+   ppc_inst_t instr = ppc_inst_read(code);
+
+   if (ppc_inst_prefixed(instr)) {
+   u64 val = ppc_inst_as_ulong(instr);
+
+   memset64((uint64_t *)patch_addr, val, len / 8);


Use u64 instead of uint64_t.


+   } else {
+   u32 val = ppc_inst_val(instr);
+
+   memset32(patch_addr, val, len / 4);
+   }
+   } else
+   memcpy(patch_addr, code, len);


Missing braces, see
https://docs.kernel.org/process/coding-style.html#placing-braces-and-spaces


+
+   smp_wmb();  /* smp write barrier */
+   flush_icache_range(start, start + len);
+   return 0;
+}
+
+/*
+ * A page is mapped and instructions that fit the page are patched.
+ * Assumes 'len' to be (PAGE_SIZE - offset_in_page(addr)) or below.
+ */
+static int __do_patch_instructions_mm(u32 *addr, void *code, size_t len, bool 
repeat_instr)
   {
int err;
u32 *patch_addr;
@@ -307,11 +336,15 @@ static int __do_patch_instruction_mm(u32 *addr, 
ppc_inst_t instr)
   
   	orig_mm = start_using_temp_mm(patching_mm);
   
-	err = __patch_instruction(addr, instr, patch_addr);

+   /* Single instruction case. */
+   if (len == 0) {
+   err = __patch_instruction(addr, *(ppc_inst_t *)code, 
patch_addr);


Take care: you can't convert u32 * to ppc_inst_t that way. You have to
use ppc_inst_read(), otherwise you'll get odd results with prefixed
instructions depending on endianness.
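
For illustration (not part of the review), the single-instruction case
would then read the instruction via ppc_inst_read() before handing it to
__patch_instruction(), along the lines of:

  /* single-instruction case, reading a (possibly prefixed) instruction */
  if (len == 0)
          err = __patch_instruction(addr, ppc_inst_read(code), patch_addr);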

   
-	/* hwsync performed by __patch_instruction (sync) if successful */

-   if (err)
-   mb();  /* sync */
+   /* hwsync performed by __patch_instruction (sync) if successful 
*/
+   if (err)
+   mb();  /* sync */


Get this away, see my patch at
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/e88b154eaf2efd9ff177d472d3411dcdec8ff4f5.1696675567.git.christophe.le...@csgroup.eu/


+   } else
+   err = __patch_instructions(patch_addr, code, len, repeat_instr);
   
   	/* context synchronisation performed by __patch_instruction (isync or exception) */

stop_using_temp_mm(patching_mm, orig_mm);
@@ -328,7 +361,11 @@ static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t 
instr)
return err;
   }
   
-static int __do_patch_instruction(u32 *addr, ppc_inst_t instr)

+/*
+ * A page is mapped and instructions that

[PATCH v6 5/5] powerpc/bpf: use bpf_jit_binary_pack_[alloc|finalize|free]

2023-10-12 Thread Hari Bathini
Use bpf_jit_binary_pack_alloc() in the powerpc JIT. The JIT engine first
writes the program to a rw buffer. When the JIT is done, the program is
copied to the final location with bpf_jit_binary_pack_finalize(). With
multiple jit_subprogs, bpf_jit_free() is called on some subprograms that
have not been through bpf_jit_binary_pack_finalize() yet. Implement a
custom bpf_jit_free(), as in commit 1d5f82d9dd47 ("bpf, x86: fix freeing
of not-finalized bpf_prog_pack"), to call bpf_jit_binary_pack_finalize()
if necessary. As bpf_flush_icache() is not needed anymore, remove it.
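
A hedged sketch of the shape of that custom bpf_jit_free(), modeled on
the x86 commit referenced above and on the hdr/fhdr fields introduced in
the diff below; it is not necessarily the exact code in this patch.

  void bpf_jit_free(struct bpf_prog *fp)
  {
          if (fp->jited) {
                  struct powerpc_jit_data *jit_data = fp->aux->jit_data;
                  struct bpf_binary_header *hdr;

                  /*
                   * If the program was never finalized (e.g. a subprog freed
                   * by jit_subprogs before the extra pass), finalize it now so
                   * the RO image is consistent before freeing.
                   */
                  if (jit_data) {
                          bpf_jit_binary_pack_finalize(fp, jit_data->fhdr, jit_data->hdr);
                          kvfree(jit_data->addrs);
                          kfree(jit_data);
                  }
                  hdr = bpf_jit_binary_pack_hdr(fp);
                  bpf_jit_binary_pack_free(hdr, NULL);
          }

          bpf_prog_unlock_free(fp);
  }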

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 
---
 arch/powerpc/net/bpf_jit.h|  18 ++---
 arch/powerpc/net/bpf_jit_comp.c   | 106 ++
 arch/powerpc/net/bpf_jit_comp32.c |  13 ++--
 arch/powerpc/net/bpf_jit_comp64.c |  10 +--
 4 files changed, 96 insertions(+), 51 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 72b7bb34fade..cdea5dccaefe 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -36,9 +36,6 @@
EMIT(PPC_RAW_BRANCH(offset)); \
} while (0)
 
-/* bl (unconditional 'branch' with link) */
-#define PPC_BL(dest)   EMIT(PPC_RAW_BL((dest) - (unsigned long)(image + 
ctx->idx)))
-
 /* "cond" here covers BO:BI fields. */
 #define PPC_BCC_SHORT(cond, dest)\
do {  \
@@ -147,12 +144,6 @@ struct codegen_context {
 #define BPF_FIXUP_LEN  2 /* Two instructions => 8 bytes */
 #endif
 
-static inline void bpf_flush_icache(void *start, void *end)
-{
-   smp_wmb();  /* smp write barrier */
-   flush_icache_range((unsigned long)start, (unsigned long)end);
-}
-
 static inline bool bpf_is_seen_register(struct codegen_context *ctx, int i)
 {
return ctx->seen & (1 << (31 - i));
@@ -169,16 +160,17 @@ static inline void bpf_clear_seen_register(struct 
codegen_context *ctx, int i)
 }
 
 void bpf_jit_init_reg_mapping(struct codegen_context *ctx);
-int bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 
func);
-int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context 
*ctx,
+int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context 
*ctx, u64 func);
+int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct 
codegen_context *ctx,
   u32 *addrs, int pass, bool extra_pass);
 void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
 void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
 void bpf_jit_realloc_regs(struct codegen_context *ctx);
 int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int 
tmp_reg, long exit_addr);
 
-int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, int pass, struct 
codegen_context *ctx,
- int insn_idx, int jmp_off, int dst_reg);
+int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int 
pass,
+ struct codegen_context *ctx, int insn_idx,
+ int jmp_off, int dst_reg);
 
 #endif
 
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index e7ca270a39d5..a79d7c478074 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -44,9 +44,12 @@ int bpf_jit_emit_exit_insn(u32 *image, struct 
codegen_context *ctx, int tmp_reg,
 }
 
 struct powerpc_jit_data {
-   struct bpf_binary_header *header;
+   /* address of rw header */
+   struct bpf_binary_header *hdr;
+   /* address of ro final header */
+   struct bpf_binary_header *fhdr;
u32 *addrs;
-   u8 *image;
+   u8 *fimage;
u32 proglen;
struct codegen_context ctx;
 };
@@ -67,11 +70,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
struct codegen_context cgctx;
int pass;
int flen;
-   struct bpf_binary_header *bpf_hdr;
+   struct bpf_binary_header *fhdr = NULL;
+   struct bpf_binary_header *hdr = NULL;
struct bpf_prog *org_fp = fp;
struct bpf_prog *tmp_fp;
bool bpf_blinded = false;
bool extra_pass = false;
+   u8 *fimage = NULL;
+   u32 *fcode_base;
u32 extable_len;
u32 fixup_len;
 
@@ -101,9 +107,16 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
addrs = jit_data->addrs;
if (addrs) {
cgctx = jit_data->ctx;
-   image = jit_data->image;
-   bpf_hdr = jit_data->header;
+   /*
+* JIT compiled to a writable location (image/code_base) first.
+* It is then moved to the readonly final location 
(fimage/fcode_base)
+* using instruction patching.
+*/
+ 

[PATCH v6 4/5] powerpc/bpf: rename powerpc64_jit_data to powerpc_jit_data

2023-10-12 Thread Hari Bathini
powerpc64_jit_data is a misnomer as it is meant for both ppc32 and
ppc64. Rename it to powerpc_jit_data.

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 
---
 arch/powerpc/net/bpf_jit_comp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index ecd7cffbbe28..e7ca270a39d5 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -43,7 +43,7 @@ int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context 
*ctx, int tmp_reg,
return 0;
 }
 
-struct powerpc64_jit_data {
+struct powerpc_jit_data {
struct bpf_binary_header *header;
u32 *addrs;
u8 *image;
@@ -63,7 +63,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
u8 *image = NULL;
u32 *code_base;
u32 *addrs;
-   struct powerpc64_jit_data *jit_data;
+   struct powerpc_jit_data *jit_data;
struct codegen_context cgctx;
int pass;
int flen;
-- 
2.41.0



[PATCH v6 3/5] powerpc/bpf: implement bpf_arch_text_invalidate for bpf_prog_pack

2023-10-12 Thread Hari Bathini
Implement bpf_arch_text_invalidate() and use it to fill the unused part of
the bpf_prog_pack with trap instructions when a BPF program is freed.

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 
---
 arch/powerpc/net/bpf_jit_comp.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index c740eac8d584..ecd7cffbbe28 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -292,3 +292,18 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len)
 
return err ? ERR_PTR(err) : dst;
 }
+
+int bpf_arch_text_invalidate(void *dst, size_t len)
+{
+   u32 insn = BREAKPOINT_INSTRUCTION;
+   int ret;
+
+   if (WARN_ON_ONCE(core_kernel_text((unsigned long)dst)))
+   return -EINVAL;
+
+   mutex_lock(&text_mutex);
+   ret = patch_instructions(dst, &insn, len, true);
+   mutex_unlock(&text_mutex);
+
+   return ret;
+}
-- 
2.41.0



[PATCH v6 2/5] powerpc/bpf: implement bpf_arch_text_copy

2023-10-12 Thread Hari Bathini
bpf_arch_text_copy() is used to dump the JITed binary to an RX page,
allowing multiple BPF programs to share the same page. Use the newly
introduced patch_instructions() to implement it.

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 
---
 arch/powerpc/net/bpf_jit_comp.c | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 37043dfc1add..c740eac8d584 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -13,9 +13,13 @@
 #include 
 #include 
 #include 
-#include 
+#include 
+#include 
 #include 
 
+#include 
+#include 
+
 #include "bpf_jit.h"
 
 static void bpf_jit_fill_ill_insns(void *area, unsigned int size)
@@ -274,3 +278,17 @@ int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, 
int pass, struct code
ctx->exentry_idx++;
return 0;
 }
+
+void *bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+   int err;
+
+   if (WARN_ON_ONCE(core_kernel_text((unsigned long)dst)))
+   return ERR_PTR(-EINVAL);
+
+   mutex_lock(&text_mutex);
+   err = patch_instructions(dst, src, len, false);
+   mutex_unlock(&text_mutex);
+
+   return err ? ERR_PTR(err) : dst;
+}
-- 
2.41.0



[PATCH v6 1/5] powerpc/code-patching: introduce patch_instructions()

2023-10-12 Thread Hari Bathini
patch_instruction() entails setting up the pte, patching the
instruction, clearing the pte and flushing the tlb. If multiple
instructions need to be patched, every instruction would have to go
through the above drill unnecessarily. Instead, introduce a
patch_instructions() function that sets up the pte, clears the pte and
flushes the tlb only once per page range of instructions to be patched.
This duplicates most of the code-patching logic instead of merging with
it, to avoid the performance degradation observed for single-instruction
patching on ppc32 when the code paths were merged.
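
For illustration (hypothetical call sites, matching the signature added
below): the repeat_instr flag selects between copying a buffer of code
and replicating a single instruction across a range.

  /* Copy a buffer of previously generated instructions into place. */
  static int copy_block(u32 *dst, u32 *jit_buf, size_t jit_len)
  {
          return patch_instructions(dst, jit_buf, jit_len, false);
  }

  /* Fill a range with one repeated instruction, e.g. a trap. */
  static int fill_with_traps(u32 *dst, size_t range_len)
  {
          u32 trap = BREAKPOINT_INSTRUCTION;

          return patch_instructions(dst, &trap, range_len, true);
  }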

Signed-off-by: Hari Bathini 
Acked-by: Song Liu 
---

Changes in v6:
* Skipped merging code path of patch_instruction() & patch_instructions()
  to avoid performance overhead observed on ppc32 with that.


 arch/powerpc/include/asm/code-patching.h |   1 +
 arch/powerpc/lib/code-patching.c | 138 +++
 2 files changed, 139 insertions(+)

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index 3f881548fb61..0e29ccf903d0 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
 int patch_branch(u32 *addr, unsigned long target, int flags);
 int patch_instruction(u32 *addr, ppc_inst_t instr);
 int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
+int patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr);
 
 static inline unsigned long patch_site_addr(s32 *site)
 {
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index b00112d7ad46..a115496f934b 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -378,6 +378,144 @@ int patch_instruction(u32 *addr, ppc_inst_t instr)
 }
 NOKPROBE_SYMBOL(patch_instruction);
 
+static int __patch_instructions(u32 *patch_addr, u32 *code, size_t len, bool 
repeat_instr)
+{
+   unsigned long start = (unsigned long)patch_addr;
+
+   /* Repeat instruction */
+   if (repeat_instr) {
+   ppc_inst_t instr = ppc_inst_read(code);
+
+   if (ppc_inst_prefixed(instr)) {
+   u64 val = ppc_inst_as_ulong(instr);
+
+   memset64((u64 *)patch_addr, val, len / 8);
+   } else {
+   u32 val = ppc_inst_val(instr);
+
+   memset32(patch_addr, val, len / 4);
+   }
+   } else {
+   memcpy(patch_addr, code, len);
+   }
+
+   smp_wmb();  /* smp write barrier */
+   flush_icache_range(start, start + len);
+   return 0;
+}
+
+/*
+ * A page is mapped and instructions that fit the page are patched.
+ * Assumes 'len' to be (PAGE_SIZE - offset_in_page(addr)) or below.
+ */
+static int __do_patch_instructions_mm(u32 *addr, u32 *code, size_t len, bool 
repeat_instr)
+{
+   struct mm_struct *patching_mm, *orig_mm;
+   unsigned long pfn = get_patch_pfn(addr);
+   unsigned long text_poke_addr;
+   spinlock_t *ptl;
+   u32 *patch_addr;
+   pte_t *pte;
+   int err;
+
+   patching_mm = __this_cpu_read(cpu_patching_context.mm);
+   text_poke_addr = __this_cpu_read(cpu_patching_context.addr);
+   patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
+
+   pte = get_locked_pte(patching_mm, text_poke_addr, &ptl);
+   if (!pte)
+   return -ENOMEM;
+
+   __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, 
PAGE_KERNEL), 0);
+
+   /* order PTE update before use, also serves as the hwsync */
+   asm volatile("ptesync" ::: "memory");
+
+   /* order context switch after arbitrary prior code */
+   isync();
+
+   orig_mm = start_using_temp_mm(patching_mm);
+
+   err = __patch_instructions(patch_addr, code, len, repeat_instr);
+
+   /* context synchronisation performed by __patch_instructions */
+   stop_using_temp_mm(patching_mm, orig_mm);
+
+   pte_clear(patching_mm, text_poke_addr, pte);
+   /*
+* ptesync to order PTE update before TLB invalidation done
+* by radix__local_flush_tlb_page_psize (in _tlbiel_va)
+*/
+   local_flush_tlb_page_psize(patching_mm, text_poke_addr, 
mmu_virtual_psize);
+
+   pte_unmap_unlock(pte, ptl);
+
+   return err;
+}
+
+/*
+ * A page is mapped and instructions that fit the page are patched.
+ * Assumes 'len' to be (PAGE_SIZE - offset_in_page(addr)) or below.
+ */
+static int __do_patch_instructions(u32 *addr, u32 *code, size_t len, bool 
repeat_instr)
+{
+   unsigned long pfn = get_patch_pfn(addr);
+   unsigned long text_poke_addr;
+   u32 *patch_addr;
+   pte_t *pte;
+   int err;
+
+   text_poke_addr = (unsigned 
long)__this_cpu_read(cpu_patching_context.addr) & PAGE_MASK;
+   patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
+
+ 

[PATCH v6 0/5] powerpc/bpf: use BPF prog pack allocator

2023-10-12 Thread Hari Bathini
Most BPF programs are small, but they consume a page each. For systems
with busy traffic and many BPF programs, this may also add significant
pressure on the instruction TLB. High iTLB pressure usually slows down
the whole system, causing visible performance degradation for production
workloads.

bpf_prog_pack, a customized allocator that packs multiple bpf programs
into preallocated memory chunks, was proposed [1] to address it. This
series extends that support to powerpc.

Both bpf_arch_text_copy() and bpf_arch_text_invalidate(), needed for
this support, depend on instruction patching in the text area.
Currently, patch_instruction() supports patching only one instruction
at a time. The first patch introduces a patch_instructions() function
to enable patching more than one instruction at a time. This helps in
avoiding performance degradation while JITing bpf programs.

Patches 2 & 3 implement the above mentioned arch-specific functions
using patch_instructions(). Patch 4 fixes a misnomer in the bpf JITing
code. The last patch enables the use of the BPF prog pack allocator on
powerpc and also ensures cleanup is handled gracefully.

[1] https://lore.kernel.org/bpf/20220204185742.271030-1-s...@kernel.org/

Changes in v6:
* No changes in patches 2-5/5 except addition of Acked-by tags from Song.
* Skipped merging code path of patch_instruction() & patch_instructions()
  to avoid performance overhead observed on ppc32 with that.

Changes in v5:
* Moved introduction of patch_instructions() as 1st patch in series.
* Improved patch_instructions() to use memset & memcpy.
* Fixed the misnomer in JITing code as a separate patch.
* Removed unused bpf_flush_icache() function.

Changes in v4:
* Updated bpf_patch_instructions() definition in patch 1/5 so that
  it doesn't have to be updated again in patch 2/5.
* Addressed Christophe's comment on bpf_arch_text_invalidate() return
  value in patch 2/5.

Changes in v3:
* Fixed segfault issue observed on ppc32 due to inaccurate offset
  calculation for branching.
* Tried to minimize the performance impact for patch_instruction()
  with the introduction of patch_instructions().
* Corrected uses of u32 * vs ppc_inst_t.
* Moved the change that introduces patch_instructions() to after
  enabling bpf_prog_pack support.
* Added a few comments to improve code readability.

Changes in v2:
* Introduced patch_instructions() to help with patching bpf programs.


Hari Bathini (5):
  powerpc/code-patching: introduce patch_instructions()
  powerpc/bpf: implement bpf_arch_text_copy
  powerpc/bpf: implement bpf_arch_text_invalidate for bpf_prog_pack
  powerpc/bpf: rename powerpc64_jit_data to powerpc_jit_data
  powerpc/bpf: use bpf_jit_binary_pack_[alloc|finalize|free]

 arch/powerpc/include/asm/code-patching.h |   1 +
 arch/powerpc/lib/code-patching.c | 138 +
 arch/powerpc/net/bpf_jit.h   |  18 +--
 arch/powerpc/net/bpf_jit_comp.c  | 145 ++-
 arch/powerpc/net/bpf_jit_comp32.c|  13 +-
 arch/powerpc/net/bpf_jit_comp64.c|  10 +-
 6 files changed, 271 insertions(+), 54 deletions(-)

-- 
2.41.0



Re: [PATCHv8 3/5] powerpc/setup: Handle the case when boot_cpuid greater than nr_cpus

2023-10-11 Thread Hari Bathini




On 11/10/23 8:35 am, Pingfan Liu wrote:

On Tue, Oct 10, 2023 at 01:56:13PM +0530, Hari Bathini wrote:



On 09/10/23 5:00 pm, Pingfan Liu wrote:

If the boot_cpuid is smaller than nr_cpus, it requires extra effort to
ensure the boot_cpu is in cpu_present_mask. This can be achieved by
reserving the last quota for the boot cpu.

Note: the restriction on nr_cpus will be lifted with more effort in the
successive patches

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: ke...@lists.infradead.org
To: linuxppc-dev@lists.ozlabs.org
---
   arch/powerpc/kernel/setup-common.c | 25 ++---
   1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 81291e13dec0..f9ef0a2666b0 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -454,8 +454,8 @@ struct interrupt_server_node {
   void __init smp_setup_cpu_maps(void)
   {
struct device_node *dn;
-   int shift = 0, cpu = 0;
-   int j, nthreads = 1;
+   int terminate, shift = 0, cpu = 0;
+   int j, bt_thread = 0, nthreads = 1;
int len;
struct interrupt_server_node *intserv_node, *n;
struct list_head *bt_node, head;
@@ -518,6 +518,7 @@ void __init smp_setup_cpu_maps(void)
for (j = 0 ; j < nthreads; j++) {
if (be32_to_cpu(intserv[j]) == boot_cpu_hwid) {
bt_node = &intserv_node->node;
+   bt_thread = j;
found_boot_cpu = true;
/*
 * Record the round-shift between dt
@@ -537,11 +538,21 @@ void __init smp_setup_cpu_maps(void)
/* Select the primary thread, the boot cpu's slibing, as the logic 0 */
list_add_tail(&head, bt_node);
pr_info("the round shift between dt seq and the cpu logic number: 
%d\n", shift);
+   terminate = nr_cpu_ids;
list_for_each_entry(intserv_node, &head, node) {
+   j = 0;



+   /* Choose a start point to cover the boot cpu */
+   if (nr_cpu_ids - 1 < bt_thread) {
+   /*
+* The processor core puts assumption on the thread id,
+* not to breach the assumption.
+*/
+   terminate = nr_cpu_ids - 1;


nthreads is anyway assumed to be the same for all cores. So, enforcing
nr_cpu_ids to a minimum of nthreads (and a multiple of nthreads) should
make the code much simpler, without the need for the above check and the
other complexities addressed in the subsequent patches...
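
Concretely, something along the lines of what later appeared in v9 of the
series (see the early_init_dt_scan_cpus() hunk quoted earlier in this
archive):

  /* force whole-core onlining by rounding nr_cpu_ids up to a
   * multiple of the threads per core */
  if (nr_cpu_ids % nthreads != 0)
          set_nr_cpu_ids(ALIGN(nr_cpu_ids, nthreads));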



Indeed, this series can be split into two parts, [1-2/5] and [3-5/5].
In [1-2/5], if it is smaller, nr_cpu_ids is enforced to be equal to
nthreads. I will make it align upward on nthreads in the next version.
So [1-2/5] can be totally independent from the rest of the patches in
this series.


Yup. Would prefer it that way.


From an engineer's perspective, [3-5/5] are added to maintain the
nr_cpus semantics. (Eventually, nr_cpus=1 can be achieved, but that
requires effort in other subsystems.)


I understand it would be nice to maintain the semantics, but it is not
worth the complexity it brings, IMHO. So, my suggestion would be to drop
[3-5/5].

Thanks
Hari


Re: [PATCHv8 2/5] powerpc/setup: Loosen the mapping between cpu logical id and its seq in dt

2023-10-10 Thread Hari Bathini




On 09/10/23 5:00 pm, Pingfan Liu wrote:

*** Idea ***
For kexec -p, the boot cpu need not be cpu0, which causes a problem when
allocating memory for paca_ptrs[]. In theory, though, there is no
requirement that a cpu's logical id match its sequence in the device
tree. But there is code such as cpu_first_thread_sibling(), which makes
assumptions about the mapping inside a core. Hence, loosen the mapping
only partially, i.e. unbind the mapping of cores while keeping the
mapping of threads inside a core.

*** Implement ***
At this early stage, there is plenty of memory to utilize. Hence, this
patch allocates interim memory to link the cpu info on a list, then
reorders cpus by changing the list head. As a result, there is a rotate
shift between the sequence number in the device tree and the cpu
logical number.

*** Result ***
After this patch, the boot cpu's logical id will always be mapped into
the range [0, threads_per_core).

Besides this, at this phase, all threads in the boot core are forced to
be onlined. This restriction will be lifted in a later patch with
extra effort.

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Mahesh Salgaonkar 
Cc: Wen Xiong 
Cc: Baoquan He 
Cc: Ming Lei 
Cc: ke...@lists.infradead.org
To: linuxppc-dev@lists.ozlabs.org
---
  arch/powerpc/kernel/prom.c | 25 +
  arch/powerpc/kernel/setup-common.c | 87 +++---
  2 files changed, 85 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index ec82f5bda908..87272a2d8c10 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -76,7 +76,9 @@ u64 ppc64_rma_size;
  unsigned int boot_cpu_node_count __ro_after_init;
  #endif
  static phys_addr_t first_memblock_size;
+#ifdef CONFIG_SMP
  static int __initdata boot_cpu_count;
+#endif
  
  static int __init early_parse_mem(char *p)

  {
@@ -331,8 +333,7 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
const __be32 *intserv;
int i, nthreads;
int len;
-   int found = -1;
-   int found_thread = 0;
+   bool found = false;
  
  	/* We are scanning "cpu" nodes only */

if (type == NULL || strcmp(type, "cpu") != 0)
@@ -355,8 +356,15 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
for (i = 0; i < nthreads; i++) {
if (be32_to_cpu(intserv[i]) ==
fdt_boot_cpuid_phys(initial_boot_params)) {
-   found = boot_cpu_count;
-   found_thread = i;
+   /*
+* always map the boot-cpu logical id into the
+* range of [0, thread_per_core)
+*/
+   boot_cpuid = i;
+   found = true;
+   /* This works around the hole in paca_ptrs[]. */
+   if (nr_cpu_ids < nthreads)
+   set_nr_cpu_ids(nthreads);
}
  #ifdef CONFIG_SMP
/* logical cpu id is always 0 on UP kernels */
@@ -365,14 +373,13 @@ static int __init early_init_dt_scan_cpus(unsigned long 
node,
}
  
  	/* Not the boot CPU */

-   if (found < 0)
+   if (!found)
return 0;
  
-	DBG("boot cpu: logical %d physical %d\n", found,

-   be32_to_cpu(intserv[found_thread]));
-   boot_cpuid = found;
+   DBG("boot cpu: logical %d physical %d\n", boot_cpuid,
+   be32_to_cpu(intserv[boot_cpuid]));
  
-	boot_cpu_hwid = be32_to_cpu(intserv[found_thread]);

+   boot_cpu_hwid = be32_to_cpu(intserv[boot_cpuid]);
  
  	/*

 * PAPR defines "logical" PVR values for cpus that
diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 1b19a9815672..81291e13dec0 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -36,6 +36,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -425,6 +426,13 @@ static void __init cpu_init_thread_core_maps(int tpc)
  
  u32 *cpu_to_phys_id = NULL;
  
+struct interrupt_server_node {

+   struct list_head node;
+   boolavail;
+   int len;
+   __be32 *intserv;
+};
+
  /**
   * setup_cpu_maps - initialize the following cpu maps:
   *  cpu_possible_mask
@@ -446,11 +454,16 @@ u32 *cpu_to_phys_id = NULL;
  void __init smp_setup_cpu_maps(void)
  {
struct device_node *dn;
-   int cpu = 0;
-   int nthreads = 1;
+   int shift = 0, cpu = 0;
+   int j, nthreads = 1;
+   int len;
+   struct interrupt_server_node *intserv_node, *n;
+   struct list_head *bt_node, head;
+   bool avail, found_boot_cpu = false;
  
  	DBG("smp_setup_cpu_maps()\n");
  
+	INIT_LIST_HEAD(&head);

cpu_to_phys_id = memblock_alloc(nr_cpu_ids * sizeof(u32),
__alignof__(u32));
i

  1   2   3   4   5   6   7   8   9   >