[RFC PATCH v4 01/17] powerpc/trace: Account for -fpatchable-function-entry support by toolchain
So far, we have relied on the fact that gcc supports both -mprofile-kernel, as well as -fpatchable-function-entry, and clang supports neither. Our Makefile only checks for CONFIG_MPROFILE_KERNEL to decide which files to build. Clang has a feature request out [*] to implement -fpatchable-function-entry, and is unlikely to support -mprofile-kernel. Update our Makefile checks so that we pick up the correct files to build once clang picks up support for -fpatchable-function-entry. [*] https://github.com/llvm/llvm-project/issues/57031 Signed-off-by: Naveen N Rao --- arch/powerpc/kernel/trace/Makefile | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile index 125f4ca588b9..d6c3885453bd 100644 --- a/arch/powerpc/kernel/trace/Makefile +++ b/arch/powerpc/kernel/trace/Makefile @@ -9,12 +9,15 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_ftrace_64_pg.o = $(CC_FLAGS_FTRACE) endif -obj32-$(CONFIG_FUNCTION_TRACER)+= ftrace.o ftrace_entry.o -ifdef CONFIG_MPROFILE_KERNEL -obj64-$(CONFIG_FUNCTION_TRACER)+= ftrace.o ftrace_entry.o +ifdef CONFIG_FUNCTION_TRACER +obj32-y+= ftrace.o ftrace_entry.o +ifeq ($(CONFIG_MPROFILE_KERNEL)$(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY),) +obj64-y+= ftrace_64_pg.o ftrace_64_pg_entry.o else -obj64-$(CONFIG_FUNCTION_TRACER)+= ftrace_64_pg.o ftrace_64_pg_entry.o +obj64-y+= ftrace.o ftrace_entry.o endif +endif + obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_PPC64)+= $(obj64-y) -- 2.45.2
[RFC PATCH v4 10/17] powerpc/ftrace: Add a postlink script to validate function tracer
Function tracer on powerpc can only work with vmlinux having a .text size of up to ~64MB due to powerpc branch instruction having a limited relative branch range of 32MB. Today, this is only detected on kernel boot when ftrace is init'ed. Add a post-link script to check the size of .text so that we can detect this at build time, and break the build if necessary. We add a dependency on !COMPILE_TEST for CONFIG_HAVE_FUNCTION_TRACER so that allyesconfig and other test builds can continue to work without enabling ftrace. Signed-off-by: Naveen N Rao --- arch/powerpc/Kconfig | 2 +- arch/powerpc/Makefile.postlink | 8 ++ arch/powerpc/tools/ftrace_check.sh | 45 ++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100755 arch/powerpc/tools/ftrace_check.sh diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f8891fbe7c16..68f0e7a5576f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -244,7 +244,7 @@ config PPC select HAVE_FUNCTION_DESCRIPTORSif PPC64_ELF_ABI_V1 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_TRACER if PPC64 || (PPC32 && CC_IS_GCC) + select HAVE_FUNCTION_TRACER if !COMPILE_TEST && (PPC64 || (PPC32 && CC_IS_GCC)) select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC select HAVE_GENERIC_VDSO select HAVE_HARDLOCKUP_DETECTOR_ARCHif PPC_BOOK3S_64 && SMP diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink index ae5a4256b03d..bb601be36173 100644 --- a/arch/powerpc/Makefile.postlink +++ b/arch/powerpc/Makefile.postlink @@ -24,6 +24,9 @@ else $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$(NM)" "$@" endif +quiet_cmd_ftrace_check = CHKFTRC $@ + cmd_ftrace_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/ftrace_check.sh "$(NM)" "$@" + # `@true` prevents complaint when there is nothing to be done vmlinux: FORCE @@ -34,6 +37,11 @@ endif ifdef CONFIG_RELOCATABLE $(call if_changed,relocs_check) 
endif +ifdef CONFIG_FUNCTION_TRACER +ifndef CONFIG_PPC64_ELF_ABI_V1 + $(call cmd,ftrace_check) +endif +endif clean: rm -f .tmp_symbols.txt diff --git a/arch/powerpc/tools/ftrace_check.sh b/arch/powerpc/tools/ftrace_check.sh new file mode 100755 index ..33f2fd45e54d --- /dev/null +++ b/arch/powerpc/tools/ftrace_check.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later +# +# This script checks vmlinux to ensure that all functions can call ftrace_caller() either directly, +# or through the stub, ftrace_tramp_text, at the end of kernel text. + +# Error out if any command fails +set -e + +# Allow for verbose output +if [ "$V" = "1" ]; then + set -x +fi + +if [ $# -lt 2 ]; then + echo "$0 [path to nm] [path to vmlinux]" 1>&2 + exit 1 +fi + +# Have Kbuild supply the path to nm so we handle cross compilation. +nm="$1" +vmlinux="$2" + +stext_addr=$($nm "$vmlinux" | grep -e " [TA] _stext$" | cut -d' ' -f1 | tr '[[:lower:]]' '[[:upper:]]') +ftrace_caller_addr=$($nm "$vmlinux" | grep -e " T ftrace_caller$" | cut -d' ' -f1 | tr '[[:lower:]]' '[[:upper:]]') +ftrace_tramp_addr=$($nm "$vmlinux" | grep -e " T ftrace_tramp_text$" | cut -d' ' -f1 | tr '[[:lower:]]' '[[:upper:]]') + +ftrace_caller_offset=$(echo "ibase=16;$ftrace_caller_addr - $stext_addr" | bc) +ftrace_tramp_offset=$(echo "ibase=16;$ftrace_tramp_addr - $ftrace_caller_addr" | bc) +sz_32m=$(printf "%d" 0x2000000) +sz_64m=$(printf "%d" 0x4000000) + +# ftrace_caller - _stext < 32M +if [ $ftrace_caller_offset -ge $sz_32m ]; then + echo "ERROR: ftrace_caller (0x$ftrace_caller_addr) is beyond 32MiB of _stext" 1>&2 + echo "ERROR: consider disabling CONFIG_FUNCTION_TRACER, or reducing the size of kernel text" 1>&2 + exit 1 +fi + +# ftrace_tramp_text - ftrace_caller < 64M
if [ $ftrace_tramp_offset -ge $sz_64m ]; then + echo "ERROR: kernel text extends beyond 64MiB from ftrace_caller" 1>&2 + echo "ERROR: consider disabling CONFIG_FUNCTION_TRACER, or reducing the size of kernel text" 1>&2 + exit 1 +fi -- 2.45.2
[RFC PATCH v4 09/17] powerpc64/bpf: Fold bpf_jit_emit_func_call_hlp() into bpf_jit_emit_func_call_rel()
Commit 61688a82e047 ("powerpc/bpf: enable kfunc call") enhanced bpf_jit_emit_func_call_hlp() to handle calls out to module region, where bpf progs are generated. The only difference now between bpf_jit_emit_func_call_hlp() and bpf_jit_emit_func_call_rel() is in handling of the initial pass where target function address is not known. Fold that logic into bpf_jit_emit_func_call_hlp() and rename it to bpf_jit_emit_func_call_rel() to simplify bpf function call JIT code. We don't actually need to load/restore TOC across a call out to a different kernel helper or to a different bpf program since they all work with the kernel TOC. We only need to do it if we have to call out to a module function. So, guard TOC load/restore with appropriate conditions. Signed-off-by: Naveen N Rao --- arch/powerpc/net/bpf_jit_comp64.c | 61 +-- 1 file changed, 17 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 2cbcdf93cc19..f3be024fc685 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -202,14 +202,22 @@ void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) EMIT(PPC_RAW_BLR()); } -static int -bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context *ctx, u64 func) +int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *ctx, u64 func) { unsigned long func_addr = func ? ppc_function_entry((void *)func) : 0; long reladdr; - if (WARN_ON_ONCE(!kernel_text_address(func_addr))) - return -EINVAL; + /* bpf to bpf call, func is not known in the initial pass. 
Emit 5 nops as a placeholder */ + if (!func) { + for (int i = 0; i < 5; i++) + EMIT(PPC_RAW_NOP()); + /* elfv1 needs an additional instruction to load addr from descriptor */ + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1)) + EMIT(PPC_RAW_NOP()); + EMIT(PPC_RAW_MTCTR(_R12)); + EMIT(PPC_RAW_BCTRL()); + return 0; + } #ifdef CONFIG_PPC_KERNEL_PCREL reladdr = func_addr - local_paca->kernelbase; @@ -266,7 +274,8 @@ bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context *ctx, * We can clobber r2 since we get called through a * function pointer (so caller will save/restore r2). */ - EMIT(PPC_RAW_LD(_R2, bpf_to_ppc(TMP_REG_2), 8)); + if (is_module_text_address(func_addr)) + EMIT(PPC_RAW_LD(_R2, bpf_to_ppc(TMP_REG_2), 8)); } else { PPC_LI64(_R12, func); EMIT(PPC_RAW_MTCTR(_R12)); @@ -276,46 +285,14 @@ bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context *ctx, * Load r2 with kernel TOC as kernel TOC is used if function address falls * within core kernel text. */ - EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, kernel_toc))); + if (is_module_text_address(func_addr)) + EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, kernel_toc))); } #endif return 0; } -int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *ctx, u64 func) -{ - unsigned int i, ctx_idx = ctx->idx; - - if (WARN_ON_ONCE(func && is_module_text_address(func))) - return -EINVAL; - - /* skip past descriptor if elf v1 */ - func += FUNCTION_DESCR_SIZE; - - /* Load function address into r12 */ - PPC_LI64(_R12, func); - - /* For bpf-to-bpf function calls, the callee's address is unknown -* until the last extra pass. As seen above, we use PPC_LI64() to -* load the callee's address, but this may optimize the number of -* instructions required based on the nature of the address. 
-* -* Since we don't want the number of instructions emitted to increase, -* we pad the optimized PPC_LI64() call with NOPs to guarantee that -* we always have a five-instruction sequence, which is the maximum -* that PPC_LI64() can emit. -*/ - if (!image) - for (i = ctx->idx - ctx_idx; i < 5; i++) - EMIT(PPC_RAW_NOP()); - - EMIT(PPC_RAW_MTCTR(_R12)); - EMIT(PPC_RAW_BCTRL()); - - return 0; -} - static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out) { /* @@ -1102,11 +1079,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code if (ret < 0) return ret; -
[RFC PATCH v4 08/17] powerpc/ftrace: Move ftrace stub used for init text before _einittext
Move the ftrace stub used to cover inittext before _einittext so that it is within kernel text, as seen through core_kernel_text(). This is required for a subsequent change to ftrace. Signed-off-by: Naveen N Rao --- arch/powerpc/kernel/vmlinux.lds.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index f420df7888a7..0aef9959f2cd 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -267,14 +267,13 @@ SECTIONS .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; INIT_TEXT - + *(.tramp.ftrace.init); /* *.init.text might be RO so we must ensure this section ends on * a page boundary. */ . = ALIGN(PAGE_SIZE); _einittext = .; - *(.tramp.ftrace.init); } :text /* .exit.text is discarded at runtime, not link time, -- 2.45.2
[RFC PATCH v4 07/17] powerpc/ftrace: Skip instruction patching if the instructions are the same
To simplify upcoming changes to ftrace, add a check to skip actual instruction patching if the old and new instructions are the same. We still validate that the instruction is what we expect, but don't actually patch the same instruction again. Signed-off-by: Naveen N Rao --- arch/powerpc/kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index fe0546fbac8e..719517265d39 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -82,7 +82,7 @@ static inline int ftrace_modify_code(unsigned long ip, ppc_inst_t old, ppc_inst_ { int ret = ftrace_validate_inst(ip, old); - if (!ret) + if (!ret && !ppc_inst_equal(old, new)) ret = patch_instruction((u32 *)ip, new); return ret; -- 2.45.2
[RFC PATCH v4 06/17] powerpc/ftrace: Remove pointer to struct module from dyn_arch_ftrace
Pointer to struct module is only relevant for ftrace records belonging to kernel modules. Having this field in dyn_arch_ftrace wastes memory for all ftrace records belonging to the kernel. Remove the same in favour of looking up the module from the ftrace record address, similar to other architectures. Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N Rao --- arch/powerpc/include/asm/ftrace.h| 1 - arch/powerpc/kernel/trace/ftrace.c | 49 + arch/powerpc/kernel/trace/ftrace_64_pg.c | 69 ++-- 3 files changed, 56 insertions(+), 63 deletions(-) diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 107fc5a48456..201f9d15430a 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -26,7 +26,6 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, struct module; struct dyn_ftrace; struct dyn_arch_ftrace { - struct module *mod; }; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 8c3e523e4f96..fe0546fbac8e 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -106,28 +106,43 @@ static unsigned long find_ftrace_tramp(unsigned long ip) return 0; } +#ifdef CONFIG_MODULES +static unsigned long ftrace_lookup_module_stub(unsigned long ip, unsigned long addr) +{ + struct module *mod = NULL; + + preempt_disable(); + mod = __module_text_address(ip); + preempt_enable(); + + if (!mod) + pr_err("No module loaded at addr=%lx\n", ip); + + return (addr == (unsigned long)ftrace_caller ? 
mod->arch.tramp : mod->arch.tramp_regs); +} +#else +static unsigned long ftrace_lookup_module_stub(unsigned long ip, unsigned long addr) +{ + return 0; +} +#endif + static int ftrace_get_call_inst(struct dyn_ftrace *rec, unsigned long addr, ppc_inst_t *call_inst) { unsigned long ip = rec->ip; unsigned long stub; - if (is_offset_in_branch_range(addr - ip)) { + if (is_offset_in_branch_range(addr - ip)) /* Within range */ stub = addr; -#ifdef CONFIG_MODULES - } else if (rec->arch.mod) { - /* Module code would be going to one of the module stubs */ - stub = (addr == (unsigned long)ftrace_caller ? rec->arch.mod->arch.tramp : - rec->arch.mod->arch.tramp_regs); -#endif - } else if (core_kernel_text(ip)) { + else if (core_kernel_text(ip)) /* We would be branching to one of our ftrace stubs */ stub = find_ftrace_tramp(ip); - if (!stub) { - pr_err("0x%lx: No ftrace stubs reachable\n", ip); - return -EINVAL; - } - } else { + else + stub = ftrace_lookup_module_stub(ip, addr); + + if (!stub) { + pr_err("0x%lx: No ftrace stubs reachable\n", ip); return -EINVAL; } @@ -262,14 +277,6 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) if (ret) return ret; - if (!core_kernel_text(ip)) { - if (!mod) { - pr_err("0x%lx: No module provided for non-kernel address\n", ip); - return -EFAULT; - } - rec->arch.mod = mod; - } - /* Nop-out the ftrace location */ new = ppc_inst(PPC_RAW_NOP()); addr = MCOUNT_ADDR; diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c index 12fab1803bcf..8a551dfca3d0 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_pg.c +++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c @@ -116,6 +116,20 @@ static unsigned long find_bl_target(unsigned long ip, ppc_inst_t op) } #ifdef CONFIG_MODULES +static struct module *ftrace_lookup_module(struct dyn_ftrace *rec) +{ + struct module *mod; + + preempt_disable(); + mod = __module_text_address(rec->ip); + preempt_enable(); + + if (!mod) + pr_err("No module loaded at addr=%lx\n", 
rec->ip); + + return mod; +} + static int __ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) @@ -124,6 +138,12 @@ __ftrace_make_nop(struct module *mod, unsigned long ip = rec->ip; ppc_inst_t op, pop; + if (!mod) { + mod = ftrace_lookup_module(rec); + if (!mod) + return -EINVAL; + } + /* read where this goes */ if (copy_inst_from_kernel_nofault(&op, (void *)ip)) { pr_err("Fetching opcode failed.\n"); @@ -366,27 +386,6 @@ int ftrace_m
[RFC PATCH v4 05/17] powerpc/module_64: Convert #ifdef to IS_ENABLED()
Minor refactor for converting #ifdef to IS_ENABLED(). Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N Rao --- arch/powerpc/kernel/module_64.c | 10 ++ 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index e9bab599d0c2..1db88409bd95 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -241,14 +241,8 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, } } -#ifdef CONFIG_DYNAMIC_FTRACE - /* make the trampoline to the ftrace_caller */ - relocs++; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - /* an additional one for ftrace_regs_caller */ - relocs++; -#endif -#endif + /* stubs for ftrace_caller and ftrace_regs_caller */ + relocs += IS_ENABLED(CONFIG_DYNAMIC_FTRACE) + IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_REGS); pr_debug("Looks like a total of %lu stubs, max\n", relocs); return relocs * sizeof(struct ppc64_stub_entry); -- 2.45.2
[RFC PATCH v4 04/17] powerpc32/ftrace: Unify 32-bit and 64-bit ftrace entry code
On 32-bit powerpc, gcc generates a three instruction sequence for function profiling: mflrr0 stw r0, 4(r1) bl _mcount On kernel boot, the call to _mcount() is nop-ed out, to be patched back in when ftrace is actually enabled. The 'stw' instruction therefore is not necessary unless ftrace is enabled. Nop it out during ftrace init. When ftrace is enabled, we want the 'stw' so that stack unwinding works properly. Perform the same within the ftrace handler, similar to 64-bit powerpc. Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N Rao --- arch/powerpc/kernel/trace/ftrace.c | 6 -- arch/powerpc/kernel/trace/ftrace_entry.S | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 2ef504700e8d..8c3e523e4f96 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -240,8 +240,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) } else if (IS_ENABLED(CONFIG_PPC32)) { /* Expected sequence: 'mflr r0', 'stw r0,4(r1)', 'bl _mcount' */ ret = ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); - if (!ret) - ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_STW(_R0, _R1, 4))); + if (ret) + return ret; + ret = ftrace_modify_code(ip - 4, ppc_inst(PPC_RAW_STW(_R0, _R1, 4)), +ppc_inst(PPC_RAW_NOP())); } else if (IS_ENABLED(CONFIG_MPROFILE_KERNEL)) { /* Expected sequence: 'mflr r0', ['std r0,16(r1)'], 'bl _mcount' */ ret = ftrace_read_inst(ip - 4, &old); diff --git a/arch/powerpc/kernel/trace/ftrace_entry.S b/arch/powerpc/kernel/trace/ftrace_entry.S index 76dbe9fd2c0f..244a1c7bb1e8 100644 --- a/arch/powerpc/kernel/trace/ftrace_entry.S +++ b/arch/powerpc/kernel/trace/ftrace_entry.S @@ -33,6 +33,8 @@ * and then arrange for the ftrace function to be called. 
*/ .macro ftrace_regs_entry allregs + /* Save the original return address in A's stack frame */ + PPC_STL r0, LRSAVE(r1) /* Create a minimal stack frame for representing B */ PPC_STLUr1, -STACK_FRAME_MIN_SIZE(r1) @@ -44,8 +46,6 @@ SAVE_GPRS(3, 10, r1) #ifdef CONFIG_PPC64 - /* Save the original return address in A's stack frame */ - std r0, LRSAVE+SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE(r1) /* Ok to continue? */ lbz r3, PACA_FTRACE_ENABLED(r13) cmpdi r3, 0 -- 2.45.2
[RFC PATCH v4 03/17] powerpc64/ftrace: Nop out additional 'std' instruction emitted by gcc v5.x
Gcc v5.x emits a 3-instruction sequence for -mprofile-kernel: mflrr0 std r0, 16(r1) bl _mcount Gcc v6.x moved to a simpler 2-instruction sequence by removing the 'std' instruction. The store saved the return address in the LR save area in the caller stack frame for stack unwinding. However, with dynamic ftrace, we no longer have a call to _mcount on kernel boot when ftrace is not enabled. When ftrace is enabled, that store is performed within ftrace_caller(). As such, the additional 'std' instruction is redundant. Nop it out on kernel boot. With this change, we now use the same 2-instruction profiling sequence with both -mprofile-kernel, as well as -fpatchable-function-entry on 64-bit powerpc. Signed-off-by: Naveen N Rao --- arch/powerpc/kernel/trace/ftrace.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index d8d6b4fd9a14..2ef504700e8d 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -246,8 +246,12 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) /* Expected sequence: 'mflr r0', ['std r0,16(r1)'], 'bl _mcount' */ ret = ftrace_read_inst(ip - 4, &old); if (!ret && !ppc_inst_equal(old, ppc_inst(PPC_RAW_MFLR(_R0 { + /* Gcc v5.x emit the additional 'std' instruction, gcc v6.x don't */ ret = ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); - ret |= ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_STD(_R0, _R1, 16))); + if (ret) + return ret; + ret = ftrace_modify_code(ip - 4, ppc_inst(PPC_RAW_STD(_R0, _R1, 16)), +ppc_inst(PPC_RAW_NOP())); } } else { return -EINVAL; -- 2.45.2
[RFC PATCH v4 02/17] powerpc/kprobes: Use ftrace to determine if a probe is at function entry
Rather than hard-coding the offset into a function to be used to determine if a kprobe is at function entry, use ftrace_location() to determine the ftrace location within the function and categorize all instructions till that offset to be function entry. For functions that cannot be traced, we fall back to using a fixed offset of 8 (two instructions) to categorize a probe as being at function entry for 64-bit elfv2, unless we are using pcrel. Acked-by: Masami Hiramatsu (Google) Signed-off-by: Naveen N Rao --- arch/powerpc/kernel/kprobes.c | 18 -- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 14c5ddec3056..ca204f4f21c1 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -105,24 +105,22 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) return addr; } -static bool arch_kprobe_on_func_entry(unsigned long offset) +static bool arch_kprobe_on_func_entry(unsigned long addr, unsigned long offset) { -#ifdef CONFIG_PPC64_ELF_ABI_V2 -#ifdef CONFIG_KPROBES_ON_FTRACE - return offset <= 16; -#else - return offset <= 8; -#endif -#else + unsigned long ip = ftrace_location(addr); + + if (ip) + return offset <= (ip - addr); + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && !IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) + return offset <= 8; return !offset; -#endif } /* XXX try and fold the magic of kprobe_lookup_name() in this */ kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, bool *on_func_entry) { - *on_func_entry = arch_kprobe_on_func_entry(offset); + *on_func_entry = arch_kprobe_on_func_entry(addr, offset); return (kprobe_opcode_t *)(addr + offset); } -- 2.45.2
[RFC PATCH v4 17/17] powerpc64/bpf: Add support for bpf trampolines
Add support for bpf_arch_text_poke() and arch_prepare_bpf_trampoline() for 64-bit powerpc. While the code is generic, BPF trampolines are only enabled on 64-bit powerpc. 32-bit powerpc will need testing and some updates. BPF Trampolines adhere to the existing ftrace ABI utilizing a two-instruction profiling sequence, as well as the newer ABI utilizing a three-instruction profiling sequence enabling return with a 'blr'. The trampoline code itself closely follows x86 implementation. BPF prog JIT is extended to mimic 64-bit powerpc approach for ftrace having a single nop at function entry, followed by the function profiling sequence out-of-line and a separate long branch stub for calls to trampolines that are out of range. A dummy_tramp is provided to simplify synchronization similar to arm64. When attaching a bpf trampoline to a bpf prog, we can patch up to three things: - the nop at bpf prog entry to go to the out-of-line stub - the instruction in the out-of-line stub to either call the bpf trampoline directly, or to branch to the long_branch stub. - the trampoline address before the long_branch stub. We do not need any synchronization here since we always have a valid branch target regardless of the order in which the above stores are seen. dummy_tramp ensures that the long_branch stub goes to a valid destination on other cpus, even when the branch to the long_branch stub is seen before the updated trampoline address. However, when detaching a bpf trampoline from a bpf prog, or if changing the bpf trampoline address, we need synchronization to ensure that other cpus can no longer branch into the older trampoline so that it can be safely freed. bpf_tramp_image_put() uses rcu_tasks to ensure all cpus make forward progress, but we still need to ensure that other cpus execute isync (or some CSI) so that they don't go back into the trampoline again. 
Signed-off-by: Naveen N Rao --- arch/powerpc/include/asm/ppc-opcode.h | 14 + arch/powerpc/net/bpf_jit.h| 12 + arch/powerpc/net/bpf_jit_comp.c | 842 +- arch/powerpc/net/bpf_jit_comp32.c | 7 +- arch/powerpc/net/bpf_jit_comp64.c | 7 +- 5 files changed, 879 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index b98a9e982c03..4312bcb913a4 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -587,12 +587,26 @@ #define PPC_RAW_MTSPR(spr, d) (0x7c0003a6 | ___PPC_RS(d) | __PPC_SPR(spr)) #define PPC_RAW_EIEIO()(0x7c0006ac) +/* bcl 20,31,$+4 */ +#define PPC_RAW_BCL4() (0x429f0005) #define PPC_RAW_BRANCH(offset) (0x4800 | PPC_LI(offset)) #define PPC_RAW_BL(offset) (0x4801 | PPC_LI(offset)) #define PPC_RAW_TW(t0, a, b) (0x7c08 | ___PPC_RS(t0) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_TRAP() PPC_RAW_TW(31, 0, 0) #define PPC_RAW_SETB(t, bfa) (0x7c000100 | ___PPC_RT(t) | ___PPC_RA((bfa) << 2)) +#ifdef CONFIG_PPC32 +#define PPC_RAW_STLPPC_RAW_STW +#define PPC_RAW_STLU PPC_RAW_STWU +#define PPC_RAW_LL PPC_RAW_LWZ +#define PPC_RAW_CMPLI PPC_RAW_CMPWI +#else +#define PPC_RAW_STLPPC_RAW_STD +#define PPC_RAW_STLU PPC_RAW_STDU +#define PPC_RAW_LL PPC_RAW_LD +#define PPC_RAW_CMPLI PPC_RAW_CMPDI +#endif + /* Deal with instructions that older assemblers aren't aware of */ #definePPC_BCCTR_FLUSH stringify_in_c(.long PPC_INST_BCCTR_FLUSH) #definePPC_CP_ABORTstringify_in_c(.long PPC_RAW_CP_ABORT) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index cdea5dccaefe..2d04ce5a23da 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -12,6 +12,7 @@ #include #include +#include #ifdef CONFIG_PPC64_ELF_ABI_V1 #define FUNCTION_DESCR_SIZE24 @@ -21,6 +22,9 @@ #define CTX_NIA(ctx) ((unsigned long)ctx->idx * 4) +#define SZLsizeof(unsigned long) +#define BPF_INSN_SAFETY64 + #define PLANT_INSTR(d, idx, instr) \ do { if (d) { (d)[idx] = instr; } 
idx++; } while (0) #define EMIT(instr)PLANT_INSTR(image, ctx->idx, instr) @@ -81,6 +85,13 @@ EMIT(PPC_RAW_ORI(d, d, (uintptr_t)(i) & \ 0x)); \ } } while (0) +#define PPC_LI_ADDRPPC_LI64 +#define PPC64_LOAD_PACA()\ + EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, kernel_toc))) +#else +#define PPC_LI64(d, i) BUILD_BUG() +#define PPC_LI_ADDRPPC_LI32 +#define PPC64_LOAD_PACA() BUILD_BUG() #endif /* @@ -165,6 +176,7 @@ int bpf_jit
[RFC PATCH v4 16/17] samples/ftrace: Add support for ftrace direct samples on powerpc
Add powerpc 32-bit and 64-bit samples for ftrace direct. This serves to show the sample instruction sequence to be used by ftrace direct calls to adhere to the ftrace ABI. On 64-bit powerpc, TOC setup requires some additional work. Signed-off-by: Naveen N Rao --- arch/powerpc/Kconfig| 2 + samples/ftrace/ftrace-direct-modify.c | 85 +++- samples/ftrace/ftrace-direct-multi-modify.c | 101 +++- samples/ftrace/ftrace-direct-multi.c| 79 ++- samples/ftrace/ftrace-direct-too.c | 83 +++- samples/ftrace/ftrace-direct.c | 69 - 6 files changed, 414 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 29aab3770415..f6ff44acf112 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -275,6 +275,8 @@ config PPC select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ + select HAVE_SAMPLE_FTRACE_DIRECTif HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_SETUP_PER_CPU_AREA if PPC64 select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index 81220390851a..cfea7a38befb 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -2,7 +2,7 @@ #include #include #include -#ifndef CONFIG_ARM64 +#if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32) #include #endif @@ -199,6 +199,89 @@ asm ( #endif /* CONFIG_LOONGARCH */ +#ifdef CONFIG_PPC +#include + +#ifdef CONFIG_PPC64 +#define STACK_FRAME_SIZE 48 +#else +#define STACK_FRAME_SIZE 24 +#endif + +#if defined(CONFIG_PPC64_ELF_ABI_V2) && !defined(CONFIG_PPC_KERNEL_PCREL) +#define PPC64_TOC_SAVE_AND_UPDATE \ +" std 2, 24(1)\n" \ +" bcl 20, 31, 1f\n" \ +" 1: mflr12\n" \ +" ld 2, (99f - 1b)(12)\n" +#define PPC64_TOC_RESTORE \ +" ld 2, 24(1)\n" +#define PPC64_TOC \ +" 
99:.quad .TOC.@tocbase\n" +#else +#define PPC64_TOC_SAVE_AND_UPDATE "" +#define PPC64_TOC_RESTORE "" +#define PPC64_TOC "" +#endif + +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtlr0\n" +#define PPC_FTRACE_RET \ +" blr\n" +#else +#define PPC_FTRACE_RESTORE_LR \ + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" \ +" mtctr 0\n" +#define PPC_FTRACE_RET \ +" mtlr0\n"\ +" bctr\n" +#endif + +asm ( +" .pushsection.text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .globl my_tramp1\n" +" my_tramp1:\n" + PPC_STL"0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mflr0\n" + PPC_STL"0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n" + PPC64_TOC_SAVE_AND_UPDATE +" bl my_direct_func1\n" + PPC64_TOC_RESTORE +" addi1, 1, "__stringify(STACK_FRAME_SIZE)"\n" + PPC_FTRACE_RESTORE_LR +" addi1, 1, "__stringify(STACK_FRAME_MIN_SIZE)"\n" + PPC_LL" 0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_FTRACE_RET +" .size my_tramp1, .-my_tramp1\n" + +" .type my_tramp2, @function\n" +" .globl my_tramp2\n" +" my_tramp2:\n" + PPC_STL"0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_MIN_SIZE)"(1)\n" +" mflr0\n" + PPC_STL"0, "__stringify(PPC_LR_STKOFF)"(1)\n" + PPC_STLU" 1, -"__stringify(STACK_FRAME_SIZE)"(1)\n"
[RFC PATCH v4 15/17] powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS
Add support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS similar to the arm64 implementation. ftrace direct calls allow custom trampolines to be called into directly from function ftrace call sites, bypassing the ftrace trampoline completely. This functionality is currently utilized by BPF trampolines to hook into kernel function entries. Since we have limited relative branch range, we support ftrace direct calls through support for DYNAMIC_FTRACE_WITH_CALL_OPS. In this approach, ftrace trampoline is not entirely bypassed. Rather, it is re-purposed into a stub that reads direct_call field from the associated ftrace_ops structure and branches into that, if it is not NULL. For this, it is sufficient if we can ensure that the ftrace trampoline is reachable from all traceable functions. When multiple ftrace_ops are associated with a call site, we utilize a call back to set pt_regs->orig_gpr3 that can then be tested on the return path from the ftrace trampoline to branch into the direct caller. Signed-off-by: Naveen N Rao --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/ftrace.h| 15 +++ arch/powerpc/kernel/asm-offsets.c| 3 + arch/powerpc/kernel/trace/ftrace.c | 9 ++ arch/powerpc/kernel/trace/ftrace_entry.S | 114 +-- 5 files changed, 113 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index cb6031d86dc9..29aab3770415 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -236,6 +236,7 @@ config PPC select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_ARGSif ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32 select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if PPC_FTRACE_OUT_OF_LINE || (PPC32 && ARCH_USING_PATCHABLE_FUNCTION_ENTRY) + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS select HAVE_DYNAMIC_FTRACE_WITH_REGSif ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git 
a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 267bd52fef21..2f1a6d25838d 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -150,6 +150,21 @@ extern unsigned int ftrace_ool_stub_text_end_count, ftrace_ool_stub_text_count, #endif void ftrace_free_init_tramp(void); unsigned long ftrace_call_adjust(unsigned long addr); + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When an ftrace registered caller is tracing a function that is also set by a + * register_ftrace_direct() call, it needs to be differentiated in the + * ftrace_caller trampoline so that the direct call can be invoked after the + * other ftrace ops. To do this, place the direct caller in the orig_gpr3 field + * of pt_regs. This tells ftrace_caller that there's a direct caller. + */ +static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) +{ + struct pt_regs *regs = &fregs->regs; + regs->orig_gpr3 = addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #else static inline void ftrace_free_init_tramp(void) { } static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; } diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 60d1e388c2ba..dbd56264a8bc 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -680,6 +680,9 @@ int main(void) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS OFFSET(FTRACE_OPS_FUNC, ftrace_ops, func); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + OFFSET(FTRACE_OPS_DIRECT_CALL, ftrace_ops, direct_call); +#endif #endif return 0; diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 291c6c3d3a78..4316a7cfbdb8 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -150,6 +150,15 @@ static int ftrace_get_call_inst(struct dyn_ftrace *rec, unsigned long addr, ppc_ else ip = rec->ip; + if 
(!is_offset_in_branch_range(addr - ip) && addr != FTRACE_ADDR && addr != FTRACE_REGS_ADDR) { + /* This can only happen with ftrace direct */ + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS)) { + pr_err("0x%lx (0x%lx): Unexpected target address 0x%lx\n", ip, rec->ip, addr); + return -EINVAL; + } + addr = FTRACE_ADDR; + } + if (is_offset_in_branch_range(addr - ip)) /* Within range */ stub = addr; diff --git a/arch/powerpc/kernel/trace/ftrace_entry.S b/arch/powerpc/kernel/trace/ftrace_entry.S index c019380bdd6a..eea4cb3737a8 100644 --- a/arch/powerpc/kernel/trace/ftrace_entry.S +++ b
[RFC PATCH v4 14/17] powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS
Implement support for DYNAMIC_FTRACE_WITH_CALL_OPS similar to the arm64 implementation. This works by patching-in a pointer to an associated ftrace_ops structure before each traceable function. If multiple ftrace_ops are associated with a call site, then a special ftrace_list_ops is used to enable iterating over all the registered ftrace_ops. If no ftrace_ops are associated with a call site, then a special ftrace_nop_ops structure is used to render the ftrace call as a no-op. ftrace trampoline can then read the associated ftrace_ops for a call site by loading from an offset from the LR, and branch directly to the associated function. The primary advantage with this approach is that we don't have to iterate over all the registered ftrace_ops for call sites that have a single ftrace_ops registered. This is the equivalent of implementing support for dynamic ftrace trampolines, which set up a special ftrace trampoline for each registered ftrace_ops and have individual call sites branch into those directly. A secondary advantage is that this gives us a way to add support for direct ftrace callers without having to resort to using stubs. The address of the direct call trampoline can be loaded from the ftrace_ops structure. To support this, we reserve a nop before each function on 32-bit powerpc. For 64-bit powerpc, two nops are reserved before each out-of-line stub. During ftrace activation, we update this location with the associated ftrace_ops pointer. Then, on ftrace entry, we load from this location and call into ftrace_ops->func(). For 64-bit powerpc, we ensure that the out-of-line stub area is doubleword aligned so that ftrace_ops address can be updated atomically. 
Signed-off-by: Naveen N Rao --- arch/powerpc/Kconfig | 1 + arch/powerpc/Makefile | 4 ++ arch/powerpc/include/asm/ftrace.h | 5 +- arch/powerpc/kernel/asm-offsets.c | 4 ++ arch/powerpc/kernel/trace/ftrace.c | 59 +- arch/powerpc/kernel/trace/ftrace_entry.S | 36 ++--- arch/powerpc/tools/ftrace-gen-ool-stubs.sh | 5 +- 7 files changed, 102 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a4dff8624510..cb6031d86dc9 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -235,6 +235,7 @@ config PPC select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_ARGSif ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32 + select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if PPC_FTRACE_OUT_OF_LINE || (PPC32 && ARCH_USING_PATCHABLE_FUNCTION_ENTRY) select HAVE_DYNAMIC_FTRACE_WITH_REGSif ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index c973e6cd1ae8..7dede0ec0163 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -158,8 +158,12 @@ KBUILD_CPPFLAGS+= -DCC_USING_PATCHABLE_FUNCTION_ENTRY ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE CC_FLAGS_FTRACE := -fpatchable-function-entry=1 else +ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS # PPC32 only +CC_FLAGS_FTRACE := -fpatchable-function-entry=3,1 +else CC_FLAGS_FTRACE := -fpatchable-function-entry=2 endif +endif else CC_FLAGS_FTRACE := -pg ifdef CONFIG_MPROFILE_KERNEL diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index dc870824359c..267bd52fef21 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -138,8 +138,11 @@ static inline u8 this_cpu_get_ftrace_enabled(void) { return 1; } extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[]; #ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE struct ftrace_ool_stub { +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS 
+ struct ftrace_ops *ftrace_op; +#endif u32 insn[4]; -}; +} __aligned(sizeof(unsigned long)); extern struct ftrace_ool_stub ftrace_ool_stub_text_end[], ftrace_ool_stub_text[], ftrace_ool_stub_inittext[]; extern unsigned int ftrace_ool_stub_text_end_count, ftrace_ool_stub_text_count, diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6854547d3164..60d1e388c2ba 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -678,5 +678,9 @@ int main(void) DEFINE(FTRACE_OOL_STUB_SIZE, sizeof(struct ftrace_ool_stub)); #endif +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + OFFSET(FTRACE_OPS_FUNC, ftrace_ops, func); +#endif + return 0; } diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index b4de8b8cbe3a..291c6c3d3a78 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -38,8 +38,11 @@ unsigned long ftrace_call_adjust(unsigned long addr) return
[RFC PATCH v4 13/17] powerpc64/ftrace: Support .text larger than 32MB with out-of-line stubs
We are restricted to a .text size of ~32MB when using out-of-line function profile sequence. Allow this to be extended up to the previous limit of ~64MB by reserving space in the middle of .text. A new config option CONFIG_PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE is introduced to specify the number of function stubs that are reserved in .text. On boot, ftrace utilizes stubs from this area first before using the stub area at the end of .text. A ppc64le defconfig has ~44k functions that can be traced. A more conservative value of 32k functions is chosen as the default value of PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE so that we do not allot more space than necessary by default. If building a kernel that only has 32k trace-able functions, we won't allot any more space at the end of .text during the pass on vmlinux.o. Otherwise, only the remaining functions get space for stubs at the end of .text. This default value should help cover a .text size of ~48MB in total (including space reserved at the end of .text which can cover up to 32MB), which should be sufficient for most common builds. For a very small kernel build, this can be set to 0. Or, this can be bumped up to a larger value to support vmlinux .text size up to ~64MB. 
Signed-off-by: Naveen N Rao --- arch/powerpc/Kconfig | 12 arch/powerpc/include/asm/ftrace.h | 6 -- arch/powerpc/kernel/trace/ftrace.c | 21 + arch/powerpc/kernel/trace/ftrace_entry.S | 8 arch/powerpc/tools/Makefile| 2 +- arch/powerpc/tools/ftrace-gen-ool-stubs.sh | 11 +++ 6 files changed, 49 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f50cfd15bb73..a4dff8624510 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -573,6 +573,18 @@ config PPC_FTRACE_OUT_OF_LINE depends on PPC64 select ARCH_WANTS_PRE_LINK_VMLINUX +config PPC_FTRACE_OUT_OF_LINE_NUM_RESERVE + int "Number of ftrace out-of-line stubs to reserve within .text" + default 32768 if PPC_FTRACE_OUT_OF_LINE + default 0 + help + Number of stubs to reserve for use by ftrace. This space is + reserved within .text, and is distinct from any additional space + added at the end of .text before the final vmlinux link. Set to + zero to have stubs only be generated at the end of vmlinux (only + if the size of vmlinux is less than 32MB). Set to a higher value + if building vmlinux larger than 48MB. 
+ config HOTPLUG_CPU bool "Support for enabling/disabling CPUs" depends on SMP && (PPC_PSERIES || \ diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 0589bb252de7..dc870824359c 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -140,8 +140,10 @@ extern unsigned int ftrace_tramp_text[], ftrace_tramp_init[]; struct ftrace_ool_stub { u32 insn[4]; }; -extern struct ftrace_ool_stub ftrace_ool_stub_text_end[], ftrace_ool_stub_inittext[]; -extern unsigned int ftrace_ool_stub_text_end_count, ftrace_ool_stub_inittext_count; +extern struct ftrace_ool_stub ftrace_ool_stub_text_end[], ftrace_ool_stub_text[], + ftrace_ool_stub_inittext[]; +extern unsigned int ftrace_ool_stub_text_end_count, ftrace_ool_stub_text_count, + ftrace_ool_stub_inittext_count; #endif void ftrace_free_init_tramp(void); unsigned long ftrace_call_adjust(unsigned long addr); diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index c03336301bad..b4de8b8cbe3a 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -168,7 +168,7 @@ static int ftrace_get_call_inst(struct dyn_ftrace *rec, unsigned long addr, ppc_ static int ftrace_init_ool_stub(struct module *mod, struct dyn_ftrace *rec) { #ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE - static int ool_stub_text_end_index, ool_stub_inittext_index; + static int ool_stub_text_index, ool_stub_text_end_index, ool_stub_inittext_index; int ret = 0, ool_stub_count, *ool_stub_index; ppc_inst_t inst; /* @@ -191,9 +191,22 @@ static int ftrace_init_ool_stub(struct module *mod, struct dyn_ftrace *rec) ool_stub_index = &ool_stub_inittext_index; ool_stub_count = ftrace_ool_stub_inittext_count; } else if (is_kernel_text(rec->ip)) { - ool_stub = ftrace_ool_stub_text_end; - ool_stub_index = &ool_stub_text_end_index; - ool_stub_count = ftrace_ool_stub_text_end_count; + /* +* ftrace records are sorted, so we first use up the stub area within 
.text +* (ftrace_ool_stub_text) before using the area at the end of .text +* (ftrace_ool_stub_text_end), un
[RFC PATCH v4 12/17] powerpc64/ftrace: Move ftrace sequence out of line
Function profile sequence on powerpc includes two instructions at the beginning of each function: mflr r0 bl ftrace_caller The call to ftrace_caller() gets nop'ed out during kernel boot and is patched in when ftrace is enabled. Given the sequence, we cannot return from ftrace_caller with 'blr' as we need to keep LR and r0 intact. This results in link stack (return address predictor) imbalance when ftrace is enabled. To address that, we would like to use a three instruction sequence: mflr r0 bl ftrace_caller mtlr r0 Furthermore, to support DYNAMIC_FTRACE_WITH_CALL_OPS, we need to reserve two instruction slots before the function. This results in a total of five instruction slots to be reserved for ftrace use on each function that is traced. Move the function profile sequence out-of-line to minimize its impact. To do this, we reserve a single nop at function entry using -fpatchable-function-entry=1 and add a pass on vmlinux.o to determine the total number of functions that can be traced. This is then used to generate a .S file reserving the appropriate amount of space for use as ftrace stubs, which is built and linked into vmlinux. On bootup, the stub space is split into separate stubs per function and populated with the proper instruction sequence. A pointer to the associated stub is maintained in dyn_arch_ftrace. For modules, space for ftrace stubs is reserved from the generic module stub space. This is restricted to and enabled by default only on 64-bit powerpc, though there are some changes to accommodate 32-bit powerpc. This is done so that 32-bit powerpc could choose to opt into this based on further tests and benchmarks. As an example, after this patch, kernel functions will have a single nop at function entry: <kernel_clone>: addis r2,r12,467 addi r2,r2,-16028 nop mfocrf r11,8 ...
When ftrace is enabled, the nop is converted to an unconditional branch to the stub associated with that function: <kernel_clone>: addis r2,r12,467 addi r2,r2,-16028 b ftrace_ool_stub_text_end+0x11b28 mfocrf r11,8 ... The associated stub: <ftrace_ool_stub_text_end+0x11b28>: mflr r0 bl ftrace_caller mtlr r0 b kernel_clone+0xc ... Signed-off-by: Naveen N Rao --- arch/powerpc/Kconfig | 5 + arch/powerpc/Makefile | 4 + arch/powerpc/include/asm/ftrace.h | 11 ++ arch/powerpc/include/asm/module.h | 5 + arch/powerpc/kernel/asm-offsets.c | 4 + arch/powerpc/kernel/module_64.c| 58 +++- arch/powerpc/kernel/trace/ftrace.c | 157 +++-- arch/powerpc/kernel/trace/ftrace_entry.S | 116 +++ arch/powerpc/tools/Makefile| 10 ++ arch/powerpc/tools/ftrace-gen-ool-stubs.sh | 48 +++ 10 files changed, 381 insertions(+), 37 deletions(-) create mode 100644 arch/powerpc/tools/Makefile create mode 100755 arch/powerpc/tools/ftrace-gen-ool-stubs.sh diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 68f0e7a5576f..f50cfd15bb73 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -568,6 +568,11 @@ config ARCH_USING_PATCHABLE_FUNCTION_ENTRY def_bool $(success,$(srctree)/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh $(CC) -mlittle-endian) if PPC64 && CPU_LITTLE_ENDIAN def_bool $(success,$(srctree)/arch/powerpc/tools/gcc-check-fpatchable-function-entry.sh $(CC) -mbig-endian) if PPC64 && CPU_BIG_ENDIAN +config PPC_FTRACE_OUT_OF_LINE + def_bool PPC64 && ARCH_USING_PATCHABLE_FUNCTION_ENTRY + depends on PPC64 + select ARCH_WANTS_PRE_LINK_VMLINUX + config HOTPLUG_CPU bool "Support for enabling/disabling CPUs" depends on SMP && (PPC_PSERIES || \ diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index bbfe4a1f06ef..c973e6cd1ae8 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -155,7 +155,11 @@ CC_FLAGS_NO_FPU:= $(call cc-option,-msoft-float) ifdef CONFIG_FUNCTION_TRACER ifdef CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY KBUILD_CPPFLAGS+= -DCC_USING_PATCHABLE_FUNCTION_ENTRY +ifdef 
CONFIG_PPC_FTRACE_OUT_OF_LINE +CC_FLAGS_FTRACE := -fpatchable-function-entry=1 +else CC_FLAGS_FTRACE := -fpatchable-function-entry=2 +endif else CC_FLAGS_FTRACE := -pg ifdef CONFIG_MPROFILE_KERNEL diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 201f9d15430a..0589bb252de7 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -26,6 +26,10 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, struct module; struct dyn_ftrace; struct dyn_arch_ftrace { +#ifdef CONFIG_PPC_FTRACE_OUT_OF_LINE + /* pointer to the associated out-of-line stub
[RFC PATCH v4 11/17] kbuild: Add generic hook for architectures to use before the final vmlinux link
On powerpc, we would like to be able to make a pass on vmlinux.o and generate a new object file to be linked into vmlinux. Add a generic pass in Makefile.vmlinux that architectures can use for this purpose. Architectures need to select CONFIG_ARCH_WANTS_PRE_LINK_VMLINUX and must provide arch//tools/Makefile with .arch.vmlinux.o target, which will be invoked prior to the final vmlinux link step. Signed-off-by: Naveen N Rao --- arch/Kconfig | 6 ++ scripts/Makefile.vmlinux | 8 scripts/link-vmlinux.sh | 11 --- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 975dd22a2dbd..ef868ff8156a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1643,4 +1643,10 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT config ARCH_NEED_CMPXCHG_1_EMU bool +config ARCH_WANTS_PRE_LINK_VMLINUX + def_bool n + help + An architecture can select this if it provides arch//tools/Makefile + with .arch.vmlinux.o target to be linked into vmlinux. + endmenu diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index 49946cb96844..6410e0be7f52 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -22,6 +22,14 @@ targets += .vmlinux.export.o vmlinux: .vmlinux.export.o endif +ifdef CONFIG_ARCH_WANTS_PRE_LINK_VMLINUX +targets += .arch.vmlinux.o +.arch.vmlinux.o: vmlinux.o FORCE + $(Q)$(MAKE) $(build)=arch/$(SRCARCH)/tools .arch.vmlinux.o + +vmlinux: .arch.vmlinux.o +endif + ARCH_POSTLINK := $(wildcard $(srctree)/arch/$(SRCARCH)/Makefile.postlink) # Final link of vmlinux with optional arch pass after final link diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 518c70b8db50..aafaed1412ea 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -122,7 +122,7 @@ gen_btf() return 1 fi - vmlinux_link ${1} + vmlinux_link ${1} ${arch_vmlinux_o} info "BTF" ${2} LLVM_OBJCOPY="${OBJCOPY}" ${PAHOLE} -J ${PAHOLE_FLAGS} ${1} @@ -178,7 +178,7 @@ kallsyms_step() kallsymso=${kallsyms_vmlinux}.o kallsyms_S=${kallsyms_vmlinux}.S 
- vmlinux_link ${kallsyms_vmlinux} "${kallsymso_prev}" ${btf_vmlinux_bin_o} + vmlinux_link ${kallsyms_vmlinux} "${kallsymso_prev}" ${btf_vmlinux_bin_o} ${arch_vmlinux_o} mksysmap ${kallsyms_vmlinux} ${kallsyms_vmlinux}.syms kallsyms ${kallsyms_vmlinux}.syms ${kallsyms_S} @@ -223,6 +223,11 @@ fi ${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init init/version-timestamp.o +arch_vmlinux_o="" +if is_enabled CONFIG_ARCH_WANTS_PRE_LINK_VMLINUX; then + arch_vmlinux_o=.arch.vmlinux.o +fi + btf_vmlinux_bin_o="" if is_enabled CONFIG_DEBUG_INFO_BTF; then btf_vmlinux_bin_o=.btf.vmlinux.bin.o @@ -273,7 +278,7 @@ if is_enabled CONFIG_KALLSYMS; then fi fi -vmlinux_link vmlinux "${kallsymso}" ${btf_vmlinux_bin_o} +vmlinux_link vmlinux "${kallsymso}" ${btf_vmlinux_bin_o} ${arch_vmlinux_o} # fill in BTF IDs if is_enabled CONFIG_DEBUG_INFO_BTF && is_enabled CONFIG_BPF; then -- 2.45.2
[RFC PATCH v4 00/17] powerpc: Core ftrace rework, support for ftrace direct and bpf trampolines
This is v4 of the series posted here: http://lkml.kernel.org/r/cover.1718908016.git.nav...@kernel.org This series reworks core ftrace support on powerpc to have the function profiling sequence moved out of line. This enables us to have a single nop at kernel function entry virtually eliminating effect of the function tracer when it is not enabled. The function profile sequence is moved out of line and is allocated at two separate places depending on a new config option. For 64-bit powerpc, the function profiling sequence is also updated to include an additional instruction 'mtlr r0' after the usual two-instruction sequence to fix link stack imbalance (return address predictor) when ftrace is enabled. This showed an improvement of ~22% in null_syscall benchmark on a Power 10 system with ftrace enabled. Finally, support for ftrace direct calls is added based on support for DYNAMIC_FTRACE_WITH_CALL_OPS. BPF Trampoline support is added atop this. Support for ftrace direct calls is added for 32-bit powerpc. There is some code to enable bpf trampolines for 32-bit powerpc, but it is not complete and will need to be pursued separately. This is marked RFC so that this can get more testing. Patches 1 to 10 are independent of this series and can go in separately though. Rest of the patches depend on the series from Benjamin Gray adding support for patch_uint() and patch_ulong(): http://lkml.kernel.org/r/20240515024445.236364-1-bg...@linux.ibm.com Changelog v4: - Patches 1, 10 and 13 are new. - Address review comments from Nick. Numerous changes throughout the patch series. - Extend support for ftrace ool to vmlinux text up to 64MB (patch 13). - Address remaining TODOs in support for BPF Trampolines. - Update synchronization when patching instructions during trampoline attach/detach. 
- Naveen Naveen N Rao (17): powerpc/trace: Account for -fpatchable-function-entry support by toolchain powerpc/kprobes: Use ftrace to determine if a probe is at function entry powerpc64/ftrace: Nop out additional 'std' instruction emitted by gcc v5.x powerpc32/ftrace: Unify 32-bit and 64-bit ftrace entry code powerpc/module_64: Convert #ifdef to IS_ENABLED() powerpc/ftrace: Remove pointer to struct module from dyn_arch_ftrace powerpc/ftrace: Skip instruction patching if the instructions are the same powerpc/ftrace: Move ftrace stub used for init text before _einittext powerpc64/bpf: Fold bpf_jit_emit_func_call_hlp() into bpf_jit_emit_func_call_rel() powerpc/ftrace: Add a postlink script to validate function tracer kbuild: Add generic hook for architectures to use before the final vmlinux link powerpc64/ftrace: Move ftrace sequence out of line powerpc64/ftrace: Support .text larger than 32MB with out-of-line stubs powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_CALL_OPS powerpc/ftrace: Add support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS samples/ftrace: Add support for ftrace direct samples on powerpc powerpc64/bpf: Add support for bpf trampolines arch/Kconfig| 6 + arch/powerpc/Kconfig| 23 +- arch/powerpc/Makefile | 8 + arch/powerpc/Makefile.postlink | 8 + arch/powerpc/include/asm/ftrace.h | 32 +- arch/powerpc/include/asm/module.h | 5 + arch/powerpc/include/asm/ppc-opcode.h | 14 + arch/powerpc/kernel/asm-offsets.c | 11 + arch/powerpc/kernel/kprobes.c | 18 +- arch/powerpc/kernel/module_64.c | 66 +- arch/powerpc/kernel/trace/Makefile | 11 +- arch/powerpc/kernel/trace/ftrace.c | 295 ++- arch/powerpc/kernel/trace/ftrace_64_pg.c| 69 +- arch/powerpc/kernel/trace/ftrace_entry.S| 246 -- arch/powerpc/kernel/vmlinux.lds.S | 3 +- arch/powerpc/net/bpf_jit.h | 12 + arch/powerpc/net/bpf_jit_comp.c | 842 +++- arch/powerpc/net/bpf_jit_comp32.c | 7 +- arch/powerpc/net/bpf_jit_comp64.c | 68 +- arch/powerpc/tools/Makefile | 10 + arch/powerpc/tools/ftrace-gen-ool-stubs.sh | 52 ++ 
arch/powerpc/tools/ftrace_check.sh | 45 ++ samples/ftrace/ftrace-direct-modify.c | 85 +- samples/ftrace/ftrace-direct-multi-modify.c | 101 ++- samples/ftrace/ftrace-direct-multi.c| 79 +- samples/ftrace/ftrace-direct-too.c | 83 +- samples/ftrace/ftrace-direct.c | 69 +- scripts/Makefile.vmlinux| 8 + scripts/link-vmlinux.sh | 11 +- 29 files changed, 2083 insertions(+), 204 deletions(-) create mode 100644 arch/powerpc/tools/Makefile create mode 100755 arch/powerpc/tools/ftrace-gen-ool-stubs.sh create mode 100755 arch/powerpc/tools/ftrace_check.sh base-commit: 582b0e554593e530b1386eacafee6c412c5673cc prerequisite-patch-id: a1d50e589288239d5a8b
[PATCH v4] trace/kprobe: Display the actual notrace function when rejecting a probe
Trying to probe update_sd_lb_stats() using perf results in the below message in the kernel log: trace_kprobe: Could not probe notrace function _text This is because 'perf probe' specifies the kprobe location as an offset from '_text': $ sudo perf probe -D update_sd_lb_stats p:probe/update_sd_lb_stats _text+1830728 However, the error message is misleading and doesn't help convey the actual notrace function that is being probed. Fix this by looking up the actual function name that is being probed. With this fix, we now get the below message in the kernel log: trace_kprobe: Could not probe notrace function update_sd_lb_stats.constprop.0 Signed-off-by: Naveen N Rao --- v4: Use printk format specifier %ps with probe address to lookup the symbol, as suggested by Masami. kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3d7a180a8427..0017404d6e8d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -487,8 +487,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) return -EINVAL; if (within_notrace_func(tk)) { - pr_warn("Could not probe notrace function %s\n", - trace_kprobe_symbol(tk)); + pr_warn("Could not probe notrace function %ps\n", + (void *)trace_kprobe_address(tk)); return -EINVAL; } base-commit: 4758560fa268cecfa1144f015aa9f2525d164b7e -- 2.43.0
Re: [PATCH v3] trace/kprobe: Display the actual notrace function when rejecting a probe
On Thu, Dec 14, 2023 at 08:02:10AM +0900, Masami Hiramatsu wrote: > On Wed, 13 Dec 2023 20:09:14 +0530 > Naveen N Rao wrote: > > > Trying to probe update_sd_lb_stats() using perf results in the below > > message in the kernel log: > > trace_kprobe: Could not probe notrace function _text > > > > This is because 'perf probe' specifies the kprobe location as an offset > > from '_text': > > $ sudo perf probe -D update_sd_lb_stats > > p:probe/update_sd_lb_stats _text+1830728 > > > > However, the error message is misleading and doesn't help convey the > > actual notrace function that is being probed. Fix this by looking up the > > actual function name that is being probed. With this fix, we now get the > > below message in the kernel log: > > trace_kprobe: Could not probe notrace function > > update_sd_lb_stats.constprop.0 > > > > Signed-off-by: Naveen N Rao > > --- > > v3: Remove tk parameter from within_notrace_func() as suggested by > > Masami > > > > kernel/trace/trace_kprobe.c | 11 ++- > > 1 file changed, 6 insertions(+), 5 deletions(-) > > > > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c > > index 3d7a180a8427..dc36c6ed6131 100644 > > --- a/kernel/trace/trace_kprobe.c > > +++ b/kernel/trace/trace_kprobe.c > > @@ -449,9 +449,8 @@ static bool __within_notrace_func(unsigned long addr) > > return !ftrace_location_range(addr, addr + size - 1); > > } > > > > -static bool within_notrace_func(struct trace_kprobe *tk) > > +static bool within_notrace_func(unsigned long addr) > > { > > - unsigned long addr = trace_kprobe_address(tk); > > char symname[KSYM_NAME_LEN], *p; > > > > if (!__within_notrace_func(addr)) > > @@ -471,12 +470,14 @@ static bool within_notrace_func(struct trace_kprobe > > *tk) > > return true; > > } > > #else > > -#define within_notrace_func(tk)(false) > > +#define within_notrace_func(addr) (false) > > #endif > > > > /* Internal register function - just handle k*probes and flags */ > > static int __register_trace_kprobe(struct 
trace_kprobe *tk) > > { > > + unsigned long addr = trace_kprobe_address(tk); > > + char symname[KSYM_NAME_LEN]; > > int i, ret; > > > > ret = security_locked_down(LOCKDOWN_KPROBES); > > @@ -486,9 +487,9 @@ static int __register_trace_kprobe(struct trace_kprobe > > *tk) > > if (trace_kprobe_is_registered(tk)) > > return -EINVAL; > > > > - if (within_notrace_func(tk)) { > > + if (within_notrace_func(addr)) { > > pr_warn("Could not probe notrace function %s\n", > > - trace_kprobe_symbol(tk)); > > + lookup_symbol_name(addr, symname) ? > > trace_kprobe_symbol(tk) : symname); > > Can we just use %ps and (void *)trace_kprobe_address(tk) here? > > That will be simpler. Indeed - that is much simpler. v4 on its way... Thanks! - Naveen
[PATCH v3] trace/kprobe: Display the actual notrace function when rejecting a probe
Trying to probe update_sd_lb_stats() using perf results in the below message in the kernel log: trace_kprobe: Could not probe notrace function _text This is because 'perf probe' specifies the kprobe location as an offset from '_text': $ sudo perf probe -D update_sd_lb_stats p:probe/update_sd_lb_stats _text+1830728 However, the error message is misleading and doesn't help convey the actual notrace function that is being probed. Fix this by looking up the actual function name that is being probed. With this fix, we now get the below message in the kernel log: trace_kprobe: Could not probe notrace function update_sd_lb_stats.constprop.0 Signed-off-by: Naveen N Rao --- v3: Remove tk parameter from within_notrace_func() as suggested by Masami kernel/trace/trace_kprobe.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3d7a180a8427..dc36c6ed6131 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -449,9 +449,8 @@ static bool __within_notrace_func(unsigned long addr) return !ftrace_location_range(addr, addr + size - 1); } -static bool within_notrace_func(struct trace_kprobe *tk) +static bool within_notrace_func(unsigned long addr) { - unsigned long addr = trace_kprobe_address(tk); char symname[KSYM_NAME_LEN], *p; if (!__within_notrace_func(addr)) @@ -471,12 +470,14 @@ static bool within_notrace_func(struct trace_kprobe *tk) return true; } #else -#define within_notrace_func(tk)(false) +#define within_notrace_func(addr) (false) #endif /* Internal register function - just handle k*probes and flags */ static int __register_trace_kprobe(struct trace_kprobe *tk) { + unsigned long addr = trace_kprobe_address(tk); + char symname[KSYM_NAME_LEN]; int i, ret; ret = security_locked_down(LOCKDOWN_KPROBES); @@ -486,9 +487,9 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (trace_kprobe_is_registered(tk)) return -EINVAL; - if (within_notrace_func(tk)) { + if 
(within_notrace_func(addr)) { pr_warn("Could not probe notrace function %s\n", - trace_kprobe_symbol(tk)); + lookup_symbol_name(addr, symname) ? trace_kprobe_symbol(tk) : symname); return -EINVAL; } base-commit: 4758560fa268cecfa1144f015aa9f2525d164b7e -- 2.43.0
Re: [PATCH v1 2/2] powerpc: Enable OPTPROBES on PPC32
Christophe Leroy wrote: For that, create a 32 bits version of patch_imm64_load_insns() and create a patch_imm_load_insns() which calls patch_imm32_load_insns() on PPC32 and patch_imm64_load_insns() on PPC64. Adapt optprobes_head.S for PPC32. Use PPC_LL/PPC_STL macros instead of raw ld/std, opt out things linked to paca and use stmw/lmw to save/restore registers. Signed-off-by: Christophe Leroy --- arch/powerpc/Kconfig | 2 +- arch/powerpc/kernel/optprobes.c | 24 +-- arch/powerpc/kernel/optprobes_head.S | 46 +++- 3 files changed, 53 insertions(+), 19 deletions(-) Thanks for adding support for ppc32. It is good to see that it works without too many changes. diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c1344c05226c..49b538e54efb 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -227,7 +227,7 @@ config PPC select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) select HAVE_HARDLOCKUP_DETECTOR_ARCHif PPC64 && PPC_BOOK3S && SMP - select HAVE_OPTPROBES if PPC64 + select HAVE_OPTPROBES select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_HARDLOCKUP_DETECTOR_PERFif PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 58fdb9f66e0f..cdf87086fa33 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -141,11 +141,21 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) } } +static void patch_imm32_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr) +{ + patch_instruction((struct ppc_inst *)addr, + ppc_inst(PPC_RAW_LIS(reg, IMM_H(val; + addr++; + + patch_instruction((struct ppc_inst *)addr, + ppc_inst(PPC_RAW_ORI(reg, reg, IMM_L(val; +} + /* * Generate instructions to load provided immediate 64-bit value * to register 'reg' and patch these instructions at 'addr'. 
*/ -static void patch_imm64_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr) +static void patch_imm64_load_insns(unsigned long long val, int reg, kprobe_opcode_t *addr) Do you really need this? { /* lis reg,(op)@highest */ patch_instruction((struct ppc_inst *)addr, @@ -177,6 +187,14 @@ static void patch_imm64_load_insns(unsigned long val, int reg, kprobe_opcode_t * ___PPC_RS(reg) | (val & 0xffff))); } +static void patch_imm_load_insns(unsigned long val, int reg, kprobe_opcode_t *addr) +{ + if (IS_ENABLED(CONFIG_PPC64)) + patch_imm64_load_insns(val, reg, addr); + else + patch_imm32_load_insns(val, reg, addr); +} + int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) { struct ppc_inst branch_op_callback, branch_emulate_step, temp; @@ -230,7 +248,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) * Fixup the template with instructions to: * 1. load the address of the actual probepoint */ - patch_imm64_load_insns((unsigned long)op, 3, buff + TMPL_OP_IDX); + patch_imm_load_insns((unsigned long)op, 3, buff + TMPL_OP_IDX); /* * 2. branch to optimized_callback() and emulate_step() @@ -264,7 +282,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) * 3. load instruction to be emulated into relevant register, and */ temp = ppc_inst_read((struct ppc_inst *)p->ainsn.insn); - patch_imm64_load_insns(ppc_inst_as_ulong(temp), 4, buff + TMPL_INSN_IDX); + patch_imm_load_insns(ppc_inst_as_ulong(temp), 4, buff + TMPL_INSN_IDX); /* * 4. 
branch back from trampoline diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S index ff8ba4d3824d..49f31e554573 100644 --- a/arch/powerpc/kernel/optprobes_head.S +++ b/arch/powerpc/kernel/optprobes_head.S @@ -30,39 +30,47 @@ optinsn_slot: .global optprobe_template_entry optprobe_template_entry: /* Create an in-memory pt_regs */ - stdur1,-INT_FRAME_SIZE(r1) + PPC_STLUr1,-INT_FRAME_SIZE(r1) SAVE_GPR(0,r1) /* Save the previous SP into stack */ addir0,r1,INT_FRAME_SIZE - std r0,GPR1(r1) + PPC_STL r0,GPR1(r1) +#ifdef CONFIG_PPC64 SAVE_10GPRS(2,r1) SAVE_10GPRS(12,r1) SAVE_10GPRS(22,r1) +#else + stmwr2, GPR2(r1) +#endif /* Save SPRS */ mfmsr r5 - std r5,_MSR(r1) + PPC_STL r5,_MSR(r1) li r5,0x700 - std r5,_TRAP(r1) + PPC_STL r5,_TRAP(r1) li r5,0 - std r5,ORIG_GPR3(r1)
Re: [PATCH v4] powerpc/uprobes: Validation for prefixed instruction
On 2021/03/09 08:54PM, Michael Ellerman wrote: > Ravi Bangoria writes: > > As per ISA 3.1, prefixed instruction should not cross 64-byte > > boundary. So don't allow Uprobe on such prefixed instruction. > > > > There are two ways probed instruction is changed in mapped pages. > > First, when Uprobe is activated, it searches for all the relevant > > pages and replace instruction in them. In this case, if that probe > > is on the 64-byte unaligned prefixed instruction, error out > > directly. Second, when Uprobe is already active and user maps a > > relevant page via mmap(), instruction is replaced via mmap() code > > path. But because Uprobe is invalid, entire mmap() operation can > > not be stopped. In this case just print an error and continue. > > > > Signed-off-by: Ravi Bangoria > > Acked-by: Naveen N. Rao > > Do we have a Fixes: tag for this? Since this is an additional check we are adding, I don't think we should add a Fixes: tag. Nothing is broken per-se -- we're just adding more checks to catch simple mistakes. Also, like Oleg pointed out, there are still many other ways for users to shoot themselves in the foot with uprobes and prefixed instructions, if they so desire. However, if you still think we should add a Fixes: tag, we can perhaps use the below commit since I didn't see any specific commit adding support for prefixed instructions for uprobes: Fixes: 650b55b707fdfa ("powerpc: Add prefixed instructions to instruction data type") > > > --- > > v3: > > https://lore.kernel.org/r/20210304050529.59391-1-ravi.bango...@linux.ibm.com > > v3->v4: > > - CONFIG_PPC64 check was not required, remove it. > > - Use SZ_ macros instead of hardcoded numbers. 
> > > > arch/powerpc/kernel/uprobes.c | 7 +++ > > 1 file changed, 7 insertions(+) > > > > diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c > > index e8a63713e655..4cbfff6e94a3 100644 > > --- a/arch/powerpc/kernel/uprobes.c > > +++ b/arch/powerpc/kernel/uprobes.c > > @@ -41,6 +41,13 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, > > if (addr & 0x03) > > return -EINVAL; > > > > + if (cpu_has_feature(CPU_FTR_ARCH_31) && > > + ppc_inst_prefixed(auprobe->insn) && > > + (addr & (SZ_64 - 4)) == SZ_64 - 4) { > > + pr_info_ratelimited("Cannot register a uprobe on 64 byte > > unaligned prefixed instruction\n"); > > + return -EINVAL; > > I realise we already did the 0x03 check above, but I still think this > would be clearer simply as: > > (addr & 0x3f == 60) Indeed, I like the use of `60' there -- hex is overrated ;) - Naveen
Re: [PATCH v3] powerpc/uprobes: Validation for prefixed instruction
On 2021/03/04 10:35AM, Ravi Bangoria wrote: > As per ISA 3.1, prefixed instruction should not cross 64-byte > boundary. So don't allow Uprobe on such prefixed instruction. > > There are two ways probed instruction is changed in mapped pages. > First, when Uprobe is activated, it searches for all the relevant > pages and replace instruction in them. In this case, if that probe > is on the 64-byte unaligned prefixed instruction, error out > directly. Second, when Uprobe is already active and user maps a > relevant page via mmap(), instruction is replaced via mmap() code > path. But because Uprobe is invalid, entire mmap() operation can > not be stopped. In this case just print an error and continue. > > Signed-off-by: Ravi Bangoria > --- > v2: > https://lore.kernel.org/r/20210204104703.273429-1-ravi.bango...@linux.ibm.com > v2->v3: > - Drop restriction for Uprobe on suffix of prefixed instruction. > It needs lot of code change including generic code but what > we get in return is not worth it. > > arch/powerpc/kernel/uprobes.c | 8 > 1 file changed, 8 insertions(+) > > diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c > index e8a63713e655..c400971ebe70 100644 > --- a/arch/powerpc/kernel/uprobes.c > +++ b/arch/powerpc/kernel/uprobes.c > @@ -41,6 +41,14 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, > if (addr & 0x03) > return -EINVAL; > > + if (!IS_ENABLED(CONFIG_PPC64) || !cpu_has_feature(CPU_FTR_ARCH_31)) > + return 0; Sorry, I missed this last time, but I think we can drop the above checks. ppc_inst_prefixed() already factors in the dependency on CONFIG_PPC64, and I don't think we need to confirm if we're running on a ISA V3.1 for the below check. With that: Acked-by: Naveen N. Rao > + > + if (ppc_inst_prefixed(auprobe->insn) && (addr & 0x3F) == 0x3C) { > + pr_info_ratelimited("Cannot register a uprobe on 64 byte > unaligned prefixed instruction\n"); > + return -EINVAL; > + } > + - Naveen
Re: [PATCH v2] powerpc/uprobes: Validation for prefixed instruction
On 2021/02/04 06:38PM, Naveen N. Rao wrote: > On 2021/02/04 04:17PM, Ravi Bangoria wrote: > > Don't allow Uprobe on 2nd word of a prefixed instruction. As per > > ISA 3.1, prefixed instruction should not cross 64-byte boundary. > > So don't allow Uprobe on such prefixed instruction as well. > > > > There are two ways probed instruction is changed in mapped pages. > > First, when Uprobe is activated, it searches for all the relevant > > pages and replace instruction in them. In this case, if we notice > > that probe is on the 2nd word of prefixed instruction, error out > > directly. Second, when Uprobe is already active and user maps a > > relevant page via mmap(), instruction is replaced via mmap() code > > path. But because Uprobe is invalid, entire mmap() operation can > > not be stopped. In this case just print an error and continue. > > > > Signed-off-by: Ravi Bangoria > > --- > > v1: > > http://lore.kernel.org/r/20210119091234.76317-1-ravi.bango...@linux.ibm.com > > v1->v2: > > - Instead of introducing new arch hook from verify_opcode(), use > > existing hook arch_uprobe_analyze_insn(). > > - Add explicit check for prefixed instruction crossing 64-byte > > boundary. If probe is on such instruction, throw an error. 
> > > > arch/powerpc/kernel/uprobes.c | 66 ++- > > 1 file changed, 65 insertions(+), 1 deletion(-) > > > > diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c > > index e8a63713e655..485d19a2a31f 100644 > > --- a/arch/powerpc/kernel/uprobes.c > > +++ b/arch/powerpc/kernel/uprobes.c > > @@ -7,6 +7,7 @@ > > * Adapted from the x86 port by Ananth N Mavinakayanahalli > > > > */ > > #include > > +#include > > #include > > #include > > #include > > @@ -28,6 +29,69 @@ bool is_trap_insn(uprobe_opcode_t *insn) > > return (is_trap(*insn)); > > } > > > > +#ifdef CONFIG_PPC64 > > +static int get_instr(struct mm_struct *mm, unsigned long addr, u32 *instr) > > +{ > > + struct page *page; > > + struct vm_area_struct *vma; > > + void *kaddr; > > + unsigned int gup_flags = FOLL_FORCE | FOLL_SPLIT_PMD; > > + > > + if (get_user_pages_remote(mm, addr, 1, gup_flags, &page, &vma, NULL) <= > > 0) > > + return -EINVAL; > > + > > + kaddr = kmap_atomic(page); > > + *instr = *((u32 *)(kaddr + (addr & ~PAGE_MASK))); > > + kunmap_atomic(kaddr); > > + put_page(page); > > + return 0; > > +} > > + > > +static int validate_prefixed_instr(struct mm_struct *mm, unsigned long > > addr) > > +{ > > + struct ppc_inst inst; > > + u32 prefix, suffix; > > + > > + /* > > +* No need to check if addr is pointing to beginning of the > > +* page. Even if probe is on a suffix of page-unaligned > > +* prefixed instruction, hw will raise exception and kernel > > +* will send SIGBUS. > > +*/ > > + if (!(addr & ~PAGE_MASK)) > > + return 0; > > + > > + if (get_instr(mm, addr, &prefix) < 0) > > + return -EINVAL; > > + if (get_instr(mm, addr + 4, &suffix) < 0) > > + return -EINVAL; > > + > > + inst = ppc_inst_prefix(prefix, suffix); > > + if (ppc_inst_prefixed(inst) && (addr & 0x3F) == 0x3C) { > > + printk_ratelimited("Cannot register a uprobe on 64 byte " > ^^ pr_info_ratelimited() > > It should be sufficient to check the primary opcode to determine if it > is a prefixed instruction. 
You don't have to read the suffix. I see that > we don't have a helper to do this currently, so you could do: > > if (ppc_inst_primary_opcode(ppc_inst(prefix)) == 1) Seeing the kprobes code, I realized that we have to check for another scenario (Thanks, Jordan!). If this is the suffix of a prefix instruction for which a uprobe has already been installed, then the previous word will be a 'trap' instruction. You need to check if there is a uprobe at the previous word, and if the original instruction there was a prefix instruction. - Naveen
Re: [PATCH v2] powerpc/uprobes: Validation for prefixed instruction
On 2021/02/04 04:19PM, Ravi Bangoria wrote: > > > On 2/4/21 4:17 PM, Ravi Bangoria wrote: > > Don't allow Uprobe on 2nd word of a prefixed instruction. As per > > ISA 3.1, prefixed instruction should not cross 64-byte boundary. > > So don't allow Uprobe on such prefixed instruction as well. > > > > There are two ways probed instruction is changed in mapped pages. > > First, when Uprobe is activated, it searches for all the relevant > > pages and replace instruction in them. In this case, if we notice > > that probe is on the 2nd word of prefixed instruction, error out > > directly. Second, when Uprobe is already active and user maps a > > relevant page via mmap(), instruction is replaced via mmap() code > > path. But because Uprobe is invalid, entire mmap() operation can > > not be stopped. In this case just print an error and continue. > > @mpe, > > arch_uprobe_analyze_insn() can return early if > cpu_has_feature(CPU_FTR_ARCH_31) is not set. But that will > miss out a rare scenario of user running binary with prefixed > instruction on p10 predecessors. Please let me know if I > should add cpu_has_feature(CPU_FTR_ARCH_31) or not. The check you are adding is very specific to prefixed instructions, so it makes sense to add a cpu feature check for v3.1. On older processors, those are invalid instructions like any other. The instruction emulation infrastructure will refuse to emulate it and the instruction will be single stepped. - Naveen
Re: [PATCH v2] powerpc/uprobes: Validation for prefixed instruction
On 2021/02/04 04:17PM, Ravi Bangoria wrote: > Don't allow Uprobe on 2nd word of a prefixed instruction. As per > ISA 3.1, prefixed instruction should not cross 64-byte boundary. > So don't allow Uprobe on such prefixed instruction as well. > > There are two ways probed instruction is changed in mapped pages. > First, when Uprobe is activated, it searches for all the relevant > pages and replace instruction in them. In this case, if we notice > that probe is on the 2nd word of prefixed instruction, error out > directly. Second, when Uprobe is already active and user maps a > relevant page via mmap(), instruction is replaced via mmap() code > path. But because Uprobe is invalid, entire mmap() operation can > not be stopped. In this case just print an error and continue. > > Signed-off-by: Ravi Bangoria > --- > v1: > http://lore.kernel.org/r/20210119091234.76317-1-ravi.bango...@linux.ibm.com > v1->v2: > - Instead of introducing new arch hook from verify_opcode(), use > existing hook arch_uprobe_analyze_insn(). > - Add explicit check for prefixed instruction crossing 64-byte > boundary. If probe is on such instruction, throw an error. 
> > arch/powerpc/kernel/uprobes.c | 66 ++- > 1 file changed, 65 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c > index e8a63713e655..485d19a2a31f 100644 > --- a/arch/powerpc/kernel/uprobes.c > +++ b/arch/powerpc/kernel/uprobes.c > @@ -7,6 +7,7 @@ > * Adapted from the x86 port by Ananth N Mavinakayanahalli > > */ > #include > +#include > #include > #include > #include > @@ -28,6 +29,69 @@ bool is_trap_insn(uprobe_opcode_t *insn) > return (is_trap(*insn)); > } > > +#ifdef CONFIG_PPC64 > +static int get_instr(struct mm_struct *mm, unsigned long addr, u32 *instr) > +{ > + struct page *page; > + struct vm_area_struct *vma; > + void *kaddr; > + unsigned int gup_flags = FOLL_FORCE | FOLL_SPLIT_PMD; > + > + if (get_user_pages_remote(mm, addr, 1, gup_flags, &page, &vma, NULL) <= > 0) > + return -EINVAL; > + > + kaddr = kmap_atomic(page); > + *instr = *((u32 *)(kaddr + (addr & ~PAGE_MASK))); > + kunmap_atomic(kaddr); > + put_page(page); > + return 0; > +} > + > +static int validate_prefixed_instr(struct mm_struct *mm, unsigned long addr) > +{ > + struct ppc_inst inst; > + u32 prefix, suffix; > + > + /* > + * No need to check if addr is pointing to beginning of the > + * page. Even if probe is on a suffix of page-unaligned > + * prefixed instruction, hw will raise exception and kernel > + * will send SIGBUS. > + */ > + if (!(addr & ~PAGE_MASK)) > + return 0; > + > + if (get_instr(mm, addr, &prefix) < 0) > + return -EINVAL; > + if (get_instr(mm, addr + 4, &suffix) < 0) > + return -EINVAL; > + > + inst = ppc_inst_prefix(prefix, suffix); > + if (ppc_inst_prefixed(inst) && (addr & 0x3F) == 0x3C) { > + printk_ratelimited("Cannot register a uprobe on 64 byte " ^^ pr_info_ratelimited() It should be sufficient to check the primary opcode to determine if it is a prefixed instruction. You don't have to read the suffix. 
I see that we don't have a helper to do this currently, so you could do: if (ppc_inst_primary_opcode(ppc_inst(prefix)) == 1) In the future, it might be worthwhile to add IS_PREFIX() as a macro similar to IS_MTMSRD() if there are more such uses. Along with this, if you also add the below to the start of this function, you can get rid of the #ifdef: if (!IS_ENABLED(CONFIG_PPC64)) return 0; - Naveen
Re: [PATCH] kprobes: Warn if the kprobe is reregistered
On 2021/02/03 11:59PM, Masami Hiramatsu wrote: > Warn if the kprobe is reregistered, since there must be > a software bug (actively used resource must not be re-registered) > and caller must be fixed. > > Signed-off-by: Masami Hiramatsu > --- > kernel/kprobes.c | 13 - > 1 file changed, 8 insertions(+), 5 deletions(-) Suggested-by: Naveen N. Rao Acked-by: Naveen N. Rao Thanks, Naveen
Re: [PATCH] selftests/ftrace: Disable multiple_kprobes test on powerpc
Masami Hiramatsu wrote: On Tue, 5 Jan 2021 19:01:56 +0900 Masami Hiramatsu wrote: On Tue, 5 Jan 2021 12:27:30 +0530 "Naveen N. Rao" wrote: > Not all symbols are blacklisted on powerpc. Disable multiple_kprobes > test until that is sorted, so that rest of ftrace and kprobe selftests > can be run. This looks good to me, but could you try to find the functions which should be blocked from kprobes? (Usually, the function which are involved in the sw-breakpoint handling, including locks etc.) Ah, OK. I wonder why CONFIG_KPROBE_EVENTS_ON_NOTRACE=n doesn't help, it was ignored if the arch doesn't support CONFIG_KPROBES_ON_FTRACE. Good point, though we do support CONFIG_KPROBES_ON_FTRACE on powerpc so the below patch is unlikely to help. However, since entry code is unlikely to be the source of the issue due to CONFIG_KPROBE_EVENTS_ON_NOTRACE, I will take another look to see where the problem lies. Naveen, could you try to run this test case with following patch on powerpc? diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b911e9f6d9f5..241a55313476 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -433,7 +433,7 @@ static int disable_trace_kprobe(struct trace_event_call *call, return 0; } -#if defined(CONFIG_KPROBES_ON_FTRACE) && \ +#if defined(CONFIG_FUNCTION_TRACER) && \ !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) static bool __within_notrace_func(unsigned long addr) { This looks like a good change regardless, so if you intend to post this separately: Acked-by: Naveen N. Rao Thanks, Naveen
Re: [PATCH] selftests/ftrace: Disable multiple_kprobes test on powerpc
Masami Hiramatsu wrote: On Tue, 5 Jan 2021 12:27:30 +0530 "Naveen N. Rao" wrote: Not all symbols are blacklisted on powerpc. Disable multiple_kprobes test until that is sorted, so that rest of ftrace and kprobe selftests can be run. This looks good to me, but could you try to find the functions which should be blocked from kprobes? (Usually, the function which are involved in the sw-breakpoint handling, including locks etc.) Yes, we did add several blacklists some time back, but there has been quite a bit of churn in our entry code. I've been meaning to audit it for a while now, but this has been blocking tests. It would be nice to skip this test for now until I am able to spend some time on this. Thanks, Naveen
[PATCH] selftests/ftrace: Disable multiple_kprobes test on powerpc
Not all symbols are blacklisted on powerpc. Disable multiple_kprobes test until that is sorted, so that rest of ftrace and kprobe selftests can be run. Signed-off-by: Naveen N. Rao --- .../testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc index 312d237800969e..41503c32f53eed 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc @@ -7,7 +7,7 @@ # Currently HAVE_KPROBES_ON_FTRACE defined on x86 and powerpc64le case `uname -m` in x86_64|i[3456]86) OFFS=5;; - ppc64le) OFFS=8;; + ppc*) exit_unsupported;; *) OFFS=0;; esac base-commit: 36bbbd0e234d817938bdc52121a0f5473b3e58f5 -- 2.25.4
Re: [PATCH 3/4] perf tools: Update powerpc's syscall.tbl
Arnaldo Carvalho de Melo wrote: Em Fri, Dec 18, 2020 at 08:08:56PM +0530, Naveen N. Rao escreveu: Hi Arnaldo, Arnaldo Carvalho de Melo wrote: > Em Fri, Dec 18, 2020 at 08:26:59AM -0300, Arnaldo Carvalho de Melo escreveu: > > Em Fri, Dec 18, 2020 at 03:59:23PM +0800, Tiezhu Yang escreveu: > > > This silences the following tools/perf/ build warning: > > > Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' > > > > Hi Ravi, Naveen, > > > > Can I get your Reviewed-by or Acked-by for this change and the > > other that adds s390's syscall.tbl to check_headers.sh so that we get > > oops s/s390/powerpc/g :-) > > > notified when the copy drifts, so that we can see if it still continues > > working and we can get new syscalls to be supported in things like 'perf > > trace'? Yes, this looks good to me: Reviewed-by: Naveen N. Rao FWIW, I had posted a similar patch back in April, but glad to have this go in ;) http://lkml.kernel.org/r/20200220063740.785913-1-naveen.n@linux.vnet.ibm.com My bad :-\ No worries, thanks for checking on this one. - Naveen
Re: [PATCH 3/4] perf tools: Update powerpc's syscall.tbl
Hi Arnaldo, Arnaldo Carvalho de Melo wrote: Em Fri, Dec 18, 2020 at 08:26:59AM -0300, Arnaldo Carvalho de Melo escreveu: Em Fri, Dec 18, 2020 at 03:59:23PM +0800, Tiezhu Yang escreveu: > This silences the following tools/perf/ build warning: > Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' Hi Ravi, Naveen, Can I get your Reviewed-by or Acked-by for this change and the other that adds s390's syscall.tbl to check_headers.sh so that we get oops s/s390/powerpc/g :-) notified when the copy drifts, so that we can see if it still continues working and we can get new syscalls to be supported in things like 'perf trace'? Yes, this looks good to me: Reviewed-by: Naveen N. Rao FWIW, I had posted a similar patch back in April, but glad to have this go in ;) http://lkml.kernel.org/r/20200220063740.785913-1-naveen.n@linux.vnet.ibm.com Thanks, Naveen
Re: [RFC PATCH 01/14] ftrace: Fix updating FTRACE_FL_TRAMP
Steven Rostedt wrote: On Thu, 26 Nov 2020 23:38:38 +0530 "Naveen N. Rao" wrote: On powerpc, kprobe-direct.tc triggered FTRACE_WARN_ON() in ftrace_get_addr_new() followed by the below message: Bad trampoline accounting at: 4222522f (wake_up_process+0xc/0x20) (f001) The set of steps leading to this involved: - modprobe ftrace-direct-too - enable_probe - modprobe ftrace-direct - rmmod ftrace-direct <-- trigger The problem turned out to be that we were not updating flags in the ftrace record properly. From the above message about the trampoline accounting being bad, it can be seen that the ftrace record still has FTRACE_FL_TRAMP set though ftrace-direct module is going away. This happens because we are checking if any ftrace_ops has the FTRACE_FL_TRAMP flag set _before_ updating the filter hash. The fix for this is to look for any _other_ ftrace_ops that also needs FTRACE_FL_TRAMP. I'm applying this now and sending this for -rc and stable. The code worked on x86 because x86 has a way to make all users use trampolines, so this was never an issue (everything has a trampoline). I modified the kernel so that x86 would not create its own trampoline (see the weak function arch_ftrace_update_trampoline(), and I was able to reproduce the bug. Good to know that you were able to reproduce this. I'm adding: Cc: sta...@vger.kernel.org Fixes: a124692b698b0 ("ftrace: Enable trampoline when rec count returns back to one") That looks good to me. Thanks for picking the two patches and for your review on the others! - Naveen
[RFC PATCH 09/14] powerpc/ftrace: Use a hash table for tracking ftrace stubs
In preparation for having to deal with large number of ftrace stubs in support of ftrace direct calls, convert existing stubs to use a hash table. The hash table is key'ed off the target address for the stubs since there could be multiple stubs for the same target to cover the full kernel text. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 75 +- 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 14b39f7797d455..7ddb6e4b527c39 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -13,6 +13,7 @@ #define pr_fmt(fmt) "ftrace-powerpc: " fmt +#include #include #include #include @@ -32,14 +33,12 @@ #ifdef CONFIG_DYNAMIC_FTRACE -/* - * We generally only have a single long_branch tramp and at most 2 or 3 plt - * tramps generated. But, we don't use the plt tramps currently. We also allot - * 2 tramps after .text and .init.text. So, we only end up with around 3 usable - * tramps in total. Set aside 8 just to be sure. 
- */ -#defineNUM_FTRACE_TRAMPS 8 -static unsigned long ftrace_tramps[NUM_FTRACE_TRAMPS]; +static DEFINE_HASHTABLE(ppc_ftrace_stubs, 8); +struct ppc_ftrace_stub_data { + unsigned long addr; + unsigned long target; + struct hlist_node hentry; +}; static struct ppc_inst ftrace_call_replace(unsigned long ip, unsigned long addr, int link) @@ -288,36 +287,31 @@ __ftrace_make_nop(struct module *mod, #endif /* PPC64 */ #endif /* CONFIG_MODULES */ -static unsigned long find_ftrace_tramp(unsigned long ip) +static unsigned long find_ftrace_tramp(unsigned long ip, unsigned long target) { - int i; + struct ppc_ftrace_stub_data *stub; struct ppc_inst instr; - /* -* We have the compiler generated long_branch tramps at the end -* and we prefer those -*/ - for (i = NUM_FTRACE_TRAMPS - 1; i >= 0; i--) - if (!ftrace_tramps[i]) - continue; - else if (create_branch(&instr, (void *)ip, - ftrace_tramps[i], 0) == 0) - return ftrace_tramps[i]; + hash_for_each_possible(ppc_ftrace_stubs, stub, hentry, target) + if (stub->target == target && !create_branch(&instr, (void *)ip, stub->addr, 0)) + return stub->addr; return 0; } -static int add_ftrace_tramp(unsigned long tramp) +static int add_ftrace_tramp(unsigned long tramp, unsigned long target) { - int i; + struct ppc_ftrace_stub_data *stub; - for (i = 0; i < NUM_FTRACE_TRAMPS; i++) - if (!ftrace_tramps[i]) { - ftrace_tramps[i] = tramp; - return 0; - } + stub = kmalloc(sizeof(*stub), GFP_KERNEL); + if (!stub) + return -1; - return -1; + stub->addr = tramp; + stub->target = target; + hash_add(ppc_ftrace_stubs, &stub->hentry, target); + + return 0; } /* @@ -328,16 +322,14 @@ static int add_ftrace_tramp(unsigned long tramp) */ static int setup_mcount_compiler_tramp(unsigned long tramp) { - int i; struct ppc_inst op; - unsigned long ptr; struct ppc_inst instr; + struct ppc_ftrace_stub_data *stub; + unsigned long ptr, ftrace_target = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); /* Is this a known long jump tramp? 
*/ - for (i = 0; i < NUM_FTRACE_TRAMPS; i++) - if (!ftrace_tramps[i]) - break; - else if (ftrace_tramps[i] == tramp) + hash_for_each_possible(ppc_ftrace_stubs, stub, hentry, ftrace_target) + if (stub->target == ftrace_target && stub->addr == tramp) return 0; /* New trampoline -- read where this goes */ @@ -361,19 +353,18 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) } /* Let's re-write the tramp to go to ftrace_[regs_]caller */ - ptr = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); - if (create_branch(&instr, (void *)tramp, ptr, 0)) { + if (create_branch(&instr, (void *)tramp, ftrace_target, 0)) { pr_debug("%ps is not reachable from existing mcount tramp\n", - (void *)ptr); + (void *)ftrace_target); return -1; } - if (patch_branch((struct ppc_inst *)tramp, ptr, 0)) { + if (patch_branch((struct ppc_inst *)tramp, ftrace_target, 0)) { pr_debug("REL24 out of range!\n"); return -1; } - if (add_ftrace_tramp(tramp)) { + if (add_ftrace_tramp(tramp, ftrace_target)) {
[RFC PATCH 11/14] powerpc/ftrace: Use GPR save/restore macros in ftrace_graph_caller()
Use SAVE_8GPRS(), REST_8GPRS() and _NIP(), along with using the standard SWITCH_FRAME_SIZE for the stack frame in ftrace_graph_caller() to simplify code. This increases the stack frame size, but it is unlikely to be an issue since ftrace_[regs_]caller() have just used a similar stack frame size, and it isn't evident that the graph caller has too deep a call stack to cause issues. Signed-off-by: Naveen N. Rao --- .../powerpc/kernel/trace/ftrace_64_mprofile.S | 28 +-- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index f9fd5f743eba34..bbe871b47ade58 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -279,24 +279,17 @@ livepatch_handler: #ifdef CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(ftrace_graph_caller) - stdur1, -112(r1) + stdur1,-SWITCH_FRAME_SIZE(r1) /* with -mprofile-kernel, parameter regs are still alive at _mcount */ - std r10, 104(r1) - std r9, 96(r1) - std r8, 88(r1) - std r7, 80(r1) - std r6, 72(r1) - std r5, 64(r1) - std r4, 56(r1) - std r3, 48(r1) + SAVE_8GPRS(3, r1) /* Save callee's TOC in the ABI compliant location */ std r2, 24(r1) ld r2, PACATOC(r13)/* get kernel TOC in r2 */ - addir5, r1, 112 + addir5, r1, SWITCH_FRAME_SIZE mfctr r4 /* ftrace_caller has moved local addr here */ - std r4, 40(r1) + std r4, _NIP(r1) mflrr3 /* ftrace_caller has restored LR from stack */ subir4, r4, MCOUNT_INSN_SIZE @@ -309,21 +302,14 @@ _GLOBAL(ftrace_graph_caller) */ mtlrr3 - ld r0, 40(r1) + ld r0, _NIP(r1) mtctr r0 - ld r10, 104(r1) - ld r9, 96(r1) - ld r8, 88(r1) - ld r7, 80(r1) - ld r6, 72(r1) - ld r5, 64(r1) - ld r4, 56(r1) - ld r3, 48(r1) + REST_8GPRS(3, r1) /* Restore callee's TOC */ ld r2, 24(r1) - addir1, r1, 112 + addir1, r1, SWITCH_FRAME_SIZE mflrr0 std r0, LRSAVE(r1) bctr -- 2.25.4
[RFC PATCH 14/14] samples/ftrace: Add powerpc support for ftrace direct samples
Add a simple powerpc trampoline to demonstrate use of ftrace direct on powerpc. Signed-off-by: Naveen N. Rao --- samples/Kconfig | 2 +- samples/ftrace/ftrace-direct-modify.c | 58 +++ samples/ftrace/ftrace-direct-too.c| 48 -- samples/ftrace/ftrace-direct.c| 45 +++-- 4 files changed, 144 insertions(+), 9 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 0ed6e4d71d87b1..fdc9e44dba3b95 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -26,7 +26,7 @@ config SAMPLE_TRACE_PRINTK config SAMPLE_FTRACE_DIRECT tristate "Build register_ftrace_direct() example" depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m - depends on X86_64 # has x86_64 inlined asm + depends on X86_64 || PPC64 # has inlined asm help This builds an ftrace direct function example that hooks to wake_up_process and prints the parameters. diff --git a/samples/ftrace/ftrace-direct-modify.c b/samples/ftrace/ftrace-direct-modify.c index c13a5bc5095bea..89d66a12d300e1 100644 --- a/samples/ftrace/ftrace-direct-modify.c +++ b/samples/ftrace/ftrace-direct-modify.c @@ -2,6 +2,7 @@ #include #include #include +#include void my_direct_func1(void) { @@ -18,6 +19,7 @@ extern void my_tramp2(void *); static unsigned long my_ip = (unsigned long)schedule; +#ifdef CONFIG_X86 asm ( " .pushsection.text, \"ax\", @progbits\n" " .type my_tramp1, @function\n" @@ -38,6 +40,58 @@ asm ( " .size my_tramp2, .-my_tramp2\n" " .popsection\n" ); +#elif CONFIG_PPC64 +asm ( +" .pushsection.text, \"ax\", @progbits\n" +" .type my_tramp1, @function\n" +" .global my_tramp1\n" +" my_tramp1:\n" +" std 0, 16(1)\n" +" stdu1, -480(1)\n" +" std 2, 24(1)\n" +" mflr7\n" +" std 7, 368(1)\n" +" bcl 20, 31, 1f\n" +"1:mflr12\n" +" ld 2, (3f - 1b)(12)\n" +" bl my_direct_func1\n" +" nop\n" +" ld 2, 24(1)\n" +" ld 7, 368(1)\n" +" mtctr 7\n" +" addi1, 1, 480\n" +" ld 0, 16(1)\n" +" mtlr0\n" +" bctr\n" +" .size my_tramp1, .-my_tramp1\n" +" nop\n" +" .type my_tramp2, @function\n" +" .global my_tramp2\n" +" my_tramp2:\n" +" std 0, 16(1)\n" +" 
stdu1, -480(1)\n" +" std 2, 24(1)\n" +" mflr7\n" +" std 7, 368(1)\n" +" bcl 20, 31, 2f\n" +"2:mflr12\n" +" ld 2, (3f - 2b)(12)\n" +" bl my_direct_func2\n" +" nop\n" +" ld 2, 24(1)\n" +" ld 7, 368(1)\n" +" mtctr 7\n" +" addi1, 1, 480\n" +" ld 0, 16(1)\n" +" mtlr0\n" +" bctr\n" +" .size my_tramp2, .-my_tramp2\n" +"3:\n" +" .quad .TOC.@tocbase\n" +" .popsection\n" +); +#endif + static unsigned long my_tramp = (unsigned long)my_tramp1; static unsigned long tramps[2] = { @@ -72,6 +126,10 @@ static int __init ftrace_direct_init(void) { int ret; +#ifdef CONFIG_PPC64 + my_ip = ppc_function_entry((void *)my_ip) + 4; +#endif + ret = register_ftrace_direct(my_ip, my_tramp); if (!ret) simple_tsk = kthread_run(simple_thread, NULL, "event-sample-fn"); diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c index d5c5022be66429..9a82abecbe0dcc 100644 --- a/samples/ftrace/ftrace-direct-too.c +++ b/samples/ftrace/ftrace-direct-too.c @@ -3,6 +3,7 @@ #include /* for handle_mm_fault() */ #include +#include void my_direct_func(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -13,6 +14,9 @@ void my_direct_func(struct vm_area_struct *vma, extern void my_tramp(void *); +static unsigned long my_ip = (unsigned long)handle_mm_fault; + +#ifdef CONFIG_X86 asm ( " .pushsection.text, \"ax\", @progbits\n" " .type my_tramp, @function\n" @@ -31,18 +35,54 @@ asm ( " .size my_tramp, .-my_tramp\n" " .popsection\n" ); +#elif CONFIG_PPC64 +asm ( +" .pushsection.text, \"ax\", @progbits\n" +" .type my_tramp, @function\n
[RFC PATCH 10/14] powerpc/ftrace: Drop assumptions about ftrace trampoline target
We currently assume that ftrace locations are patched to go to either ftrace_caller or ftrace_regs_caller. Drop this assumption in preparation for supporting ftrace direct calls. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 107 +++-- 1 file changed, 86 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 7ddb6e4b527c39..fcb21a9756e456 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -322,14 +322,15 @@ static int add_ftrace_tramp(unsigned long tramp, unsigned long target) */ static int setup_mcount_compiler_tramp(unsigned long tramp) { + int i; struct ppc_inst op; struct ppc_inst instr; struct ppc_ftrace_stub_data *stub; unsigned long ptr, ftrace_target = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); - /* Is this a known long jump tramp? */ - hash_for_each_possible(ppc_ftrace_stubs, stub, hentry, ftrace_target) - if (stub->target == ftrace_target && stub->addr == tramp) + /* Is this a known tramp? 
*/ + hash_for_each(ppc_ftrace_stubs, i, stub, hentry) + if (stub->addr == tramp) return 0; /* New trampoline -- read where this goes */ @@ -608,23 +609,16 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) { struct ppc_inst op; void *ip = (void *)rec->ip; - unsigned long tramp, entry, ptr; + unsigned long tramp, ptr; - /* Make sure we're being asked to patch branch to a known ftrace addr */ - entry = ppc_global_function_entry((void *)ftrace_caller); ptr = ppc_global_function_entry((void *)addr); - if (ptr != entry) { #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - entry = ppc_global_function_entry((void *)ftrace_regs_caller); - if (ptr != entry) { + /* Make sure we branch to ftrace_regs_caller since we only setup stubs for that */ + tramp = ppc_global_function_entry((void *)ftrace_caller); + if (ptr == tramp) + ptr = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); #endif - pr_err("Unknown ftrace addr to patch: %ps\n", (void *)ptr); - return -EINVAL; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - } -#endif - } /* Make sure we have a nop */ if (probe_kernel_read_inst(&op, ip)) { @@ -637,7 +631,7 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr) return -EINVAL; } - tramp = find_ftrace_tramp((unsigned long)ip, FTRACE_REGS_ADDR); + tramp = find_ftrace_tramp((unsigned long)ip, ptr); if (!tramp) { pr_err("No ftrace trampolines reachable from %ps\n", ip); return -EINVAL; @@ -783,6 +777,81 @@ __ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, } #endif +static int +__ftrace_modify_call_kernel(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) +{ + struct ppc_inst op; + unsigned long ip = rec->ip; + unsigned long entry, ptr, tramp; + + /* read where this goes */ + if (probe_kernel_read_inst(&op, (void *)ip)) { + pr_err("Fetching opcode failed.\n"); + return -EFAULT; + } + + /* Make sure that this is still a 24bit jump */ + if (!is_bl_op(op)) { + pr_err("Not expected bl: opcode is 
%s\n", ppc_inst_as_str(op)); + return -EINVAL; + } + + /* lets find where the pointer goes */ + tramp = find_bl_target(ip, op); + entry = ppc_global_function_entry((void *)old_addr); + + pr_devel("ip:%lx jumps to %lx", ip, tramp); + + if (tramp != entry) { + /* old_addr is not within range, so we must have used a trampoline */ + struct ppc_ftrace_stub_data *stub; + + hash_for_each_possible(ppc_ftrace_stubs, stub, hentry, entry) + if (stub->target == entry && stub->addr == tramp) + break; + + if (stub->target != entry || stub->addr != tramp) { + pr_err("we don't know about the tramp at %lx!\n", tramp); + return -EFAULT; + } + } + + /* The new target may be within range */ + if (test_24bit_addr(ip, addr)) { + /* within range */ + if (patch_branch((struct ppc_inst *)ip, addr, BRANCH_SET_LINK)) { + pr_err("REL24 out of range!\n"); + return -EINVAL; + } + + return 0; + } + + ptr = ppc_global_function_entry((void *)a
[RFC PATCH 13/14] powerpc/ftrace: Add support for register_ftrace_direct() for MPROFILE_KERNEL
Add support for register_ftrace_direct() for MPROFILE_KERNEL, as it depends on DYNAMIC_FTRACE_WITH_REGS. Since powerpc only provides a branch range of 32MB, we set aside a 64k area within kernel text for creating stubs that can be used to branch to the provided trampoline, which can be located in the module area. This is limited to kernel text, and as such, ftrace direct calls are not supported for functions in kernel modules at this time. We use orig_gpr3 to stash the address of the direct call trampoline in arch_ftrace_set_direct_caller(). ftrace_regs_caller() is updated to check for this to determine if we need to redirect to a direct call trampoline. As the direct call trampoline has to work as an alternative for the ftrace trampoline, we setup LR and r0 appropriately, and update ctr to the trampoline address. Finally, ftrace_graph_caller() is updated to save/restore r0. Signed-off-by: Naveen N. Rao --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/ftrace.h | 14 ++ arch/powerpc/kernel/trace/ftrace.c| 140 +- .../powerpc/kernel/trace/ftrace_64_mprofile.S | 40 - 4 files changed, 182 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index cfc6dd787f532c..a87ac2e403196e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -197,6 +197,7 @@ config PPC select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLSif MPROFILE_KERNEL select HAVE_DYNAMIC_FTRACE_WITH_REGSif MPROFILE_KERNEL select HAVE_EBPF_JITif PPC64 select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index bc76970b6ee532..2f1c46e9f5d416 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -10,6 +10,8 @@ #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +#define FTRACE_STUBS_SIZE 65536 + #ifdef __ASSEMBLY__ /* Based off of objdump optput from glibc 
*/ @@ -59,6 +61,18 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) struct dyn_arch_ftrace { struct module *mod; }; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When there is a direct caller registered, we use regs->orig_gpr3 (similar to + * how x86 uses orig_ax) to let ftrace_{regs_}_caller know that we should go + * there instead of returning to the function + */ +static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +{ + regs->orig_gpr3 = addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #endif /* __ASSEMBLY__ */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index fcb21a9756e456..815b14ae45a71f 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -37,6 +37,7 @@ static DEFINE_HASHTABLE(ppc_ftrace_stubs, 8); struct ppc_ftrace_stub_data { unsigned long addr; unsigned long target; + refcount_t refs; struct hlist_node hentry; }; @@ -299,7 +300,7 @@ static unsigned long find_ftrace_tramp(unsigned long ip, unsigned long target) return 0; } -static int add_ftrace_tramp(unsigned long tramp, unsigned long target) +static int add_ftrace_tramp(unsigned long tramp, unsigned long target, int lock) { struct ppc_ftrace_stub_data *stub; @@ -309,11 +310,123 @@ static int add_ftrace_tramp(unsigned long tramp, unsigned long target) stub->addr = tramp; stub->target = target; + refcount_set(&stub->refs, 1); + if (lock) + refcount_inc(&stub->refs); hash_add(ppc_ftrace_stubs, &stub->hentry, target); return 0; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +static u32 ftrace_direct_stub_insns[] = { + PPC_RAW_LIS(12, 0), + PPC_RAW_ORI(12, 12, 0), + PPC_RAW_SLDI(12, 12, 32), + PPC_RAW_ORIS(12, 12, 0), + PPC_RAW_ORI(12, 12, 0), + PPC_RAW_MTCTR(12), + PPC_RAW_BCTR(), +}; +#define FTRACE_NUM_STUBS (FTRACE_STUBS_SIZE / sizeof(ftrace_direct_stub_insns)) +static DECLARE_BITMAP(stubs_bitmap, 
FTRACE_NUM_STUBS); +extern unsigned int ftrace_stubs[]; + +static unsigned long get_ftrace_tramp(unsigned long ip, unsigned long target) +{ + struct ppc_ftrace_stub_data *stub_data; + struct ppc_inst instr; + unsigned int *stub; + int index; + + hash_for_each_possible(ppc_ftrace_stubs, stub_data, hentry, target) { + if (stub_data->target == target && + !create_branch(&instr, (void *)ip, stub_data->addr, 0)) { + refcount_inc(&s
[RFC PATCH 12/14] powerpc/ftrace: Drop saving LR to stack save area for -mprofile-kernel
In -mprofile-kernel variant of ftrace_graph_caller(), we save the optionally-updated LR address into the stack save area at the end. This is likely an offshoot of the initial -mprofile-kernel implementation in gcc emitting the same as part of the -mprofile-kernel instruction sequence. However, this is not required. Drop it. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index bbe871b47ade58..c5602e9b07faa3 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -310,7 +310,5 @@ _GLOBAL(ftrace_graph_caller) ld r2, 24(r1) addir1, r1, SWITCH_FRAME_SIZE - mflrr0 - std r0, LRSAVE(r1) bctr #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- 2.25.4
[RFC PATCH 08/14] powerpc/ftrace: Use FTRACE_REGS_ADDR to identify the correct ftrace trampoline
Use FTRACE_REGS_ADDR instead of keying off CONFIG_DYNAMIC_FTRACE_WITH_REGS to identify the proper ftrace trampoline address to use. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 12 ++-- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 4fe5f373172fd2..14b39f7797d455 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -361,11 +361,7 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) } /* Let's re-write the tramp to go to ftrace_[regs_]caller */ -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - ptr = ppc_global_function_entry((void *)ftrace_regs_caller); -#else - ptr = ppc_global_function_entry((void *)ftrace_caller); -#endif + ptr = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); if (create_branch(&instr, (void *)tramp, ptr, 0)) { pr_debug("%ps is not reachable from existing mcount tramp\n", (void *)ptr); @@ -885,11 +881,7 @@ int __init ftrace_dyn_arch_init(void) 0x7d8903a6, /* mtctr r12 */ 0x4e800420, /* bctr */ }; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS - unsigned long addr = ppc_global_function_entry((void *)ftrace_regs_caller); -#else - unsigned long addr = ppc_global_function_entry((void *)ftrace_caller); -#endif + unsigned long addr = ppc_global_function_entry((void *)FTRACE_REGS_ADDR); long reladdr = addr - kernel_toc_addr(); if (reladdr > 0x7FFF || reladdr < -(0x8000L)) { -- 2.25.4
[RFC PATCH 07/14] powerpc/ftrace: Remove dead code
ftrace_plt_tramps[] was intended to speed up skipping plt branches, but the code wasn't completed. It is also not significantly better than reading and decoding the instruction. Remove the same. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 8 1 file changed, 8 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 42761ebec9f755..4fe5f373172fd2 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -332,7 +332,6 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) struct ppc_inst op; unsigned long ptr; struct ppc_inst instr; - static unsigned long ftrace_plt_tramps[NUM_FTRACE_TRAMPS]; /* Is this a known long jump tramp? */ for (i = 0; i < NUM_FTRACE_TRAMPS; i++) @@ -341,13 +340,6 @@ static int setup_mcount_compiler_tramp(unsigned long tramp) else if (ftrace_tramps[i] == tramp) return 0; - /* Is this a known plt tramp? */ - for (i = 0; i < NUM_FTRACE_TRAMPS; i++) - if (!ftrace_plt_tramps[i]) - break; - else if (ftrace_plt_tramps[i] == tramp) - return -1; - /* New trampoline -- read where this goes */ if (probe_kernel_read_inst(&op, (void *)tramp)) { pr_debug("Fetching opcode failed.\n"); -- 2.25.4
[RFC PATCH 06/14] powerpc: Add support for CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
Add register_get_kernel_argument() for a rudimentary way to access kernel function arguments. Signed-off-by: Naveen N. Rao --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/ptrace.h | 31 +++ 2 files changed, 32 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e9f13fe084929b..cfc6dd787f532c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -202,6 +202,7 @@ config PPC select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index e2c778c176a3a6..956828c07abd70 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -62,6 +62,8 @@ struct pt_regs }; #endif +#define NR_REG_ARGUMENTS 8 + #ifdef __powerpc64__ /* @@ -85,8 +87,10 @@ struct pt_regs #ifdef PPC64_ELF_ABI_v2 #define STACK_FRAME_MIN_SIZE 32 +#define STACK_FRAME_PARM_SAVE 32 #else #define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD +#define STACK_FRAME_PARM_SAVE 48 #endif /* Size of dummy stack frame allocated when calling signal handler. */ @@ -103,6 +107,7 @@ struct pt_regs #define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) #define STACK_FRAME_MARKER 2 #define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD +#define STACK_FRAME_PARM_SAVE 8 /* Size of stack frame allocated when calling signal handler. */ #define __SIGNAL_FRAMESIZE 64 @@ -309,6 +314,32 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, return 0; } +/** + * regs_get_kernel_argument() - get Nth function argument in kernel + * @regs: pt_regs of that context + * @n: function argument number (start from 0) + * + * regs_get_argument() returns @n th argument of the function call. 
+ * Note that this chooses most probable assignment, and is incorrect + * in scenarios where double or fp/vector parameters are involved. + * This also doesn't take into account stack alignment requirements. + * + * This is expected to be called from kprobes or ftrace with regs + * at function entry, so the current function has not setup its stack. + */ +static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, +unsigned int n) +{ + if (n >= NR_REG_ARGUMENTS) { +#ifndef __powerpc64__ + n -= NR_REG_ARGUMENTS; +#endif + n += STACK_FRAME_PARM_SAVE / sizeof(unsigned long); + return regs_get_kernel_stack_nth(regs, n); + } else { + return regs_get_register(regs, offsetof(struct pt_regs, gpr[n + 3])); + } +} #endif /* __ASSEMBLY__ */ #ifndef __powerpc64__ -- 2.25.4
[RFC PATCH 03/14] ftrace: Fix cleanup in error path of register_ftrace_direct()
We need to remove hash entry if register_ftrace_function() fails. Consolidate the cleanup to be done after register_ftrace_function() at the end. Signed-off-by: Naveen N. Rao --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9c1bba8cc51b03..3844a4a1346a9c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5136,8 +5136,6 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) __add_hash_entry(direct_functions, entry); ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); - if (ret) - remove_hash_entry(direct_functions, entry); if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { ret = register_ftrace_function(&direct_ops); @@ -5146,6 +5144,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } if (ret) { + remove_hash_entry(direct_functions, entry); kfree(entry); if (!direct->count) { list_del_rcu(&direct->next); -- 2.25.4
[RFC PATCH 02/14] ftrace: Fix DYNAMIC_FTRACE_WITH_DIRECT_CALLS dependency
DYNAMIC_FTRACE_WITH_DIRECT_CALLS should depend on DYNAMIC_FTRACE_WITH_REGS since we need ftrace_regs_caller(). Fixes: 763e34e74bb7d5c ("ftrace: Add register_ftrace_direct()") Signed-off-by: Naveen N. Rao --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508c9..e1bf5228fb692a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -202,7 +202,7 @@ config DYNAMIC_FTRACE_WITH_REGS config DYNAMIC_FTRACE_WITH_DIRECT_CALLS def_bool y - depends on DYNAMIC_FTRACE + depends on DYNAMIC_FTRACE_WITH_REGS depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS config FUNCTION_PROFILER -- 2.25.4
[RFC PATCH 05/14] ftrace: Add architectural helpers for [un]register_ftrace_direct()
Architectures may want to do some validation (such as to ensure that the trampoline code is reachable from the provided ftrace location) before accepting ftrace direct registration. Add helpers for the same. Signed-off-by: Naveen N. Rao --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 27 +-- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 46b4b7ee28c41f..3fdcb4c513bc2d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -290,6 +290,8 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long old_addr, unsigned long new_addr); unsigned long ftrace_find_rec_direct(unsigned long ip); +int arch_register_ftrace_direct(unsigned long ip, unsigned long addr); +void arch_unregister_ftrace_direct(unsigned long ip, unsigned long addr); #else # define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7476f2458b6d95..0e259b90527722 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5005,6 +5005,13 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +int __weak arch_register_ftrace_direct(unsigned long ip, unsigned long addr) +{ + return 0; +} + +void __weak arch_unregister_ftrace_direct(unsigned long ip, unsigned long addr) { } + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -5028,6 +5035,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; int ret = -EBUSY; + int arch_ret; mutex_lock(&direct_mutex); @@ -5082,18 +5090,24 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) entry->direct = addr; __add_hash_entry(direct_functions, entry); - ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 
0); + arch_ret = arch_register_ftrace_direct(ip, addr); - if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { - ret = register_ftrace_function(&direct_ops); - if (ret) - ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + if (!arch_ret) { + ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); + + if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { + ret = register_ftrace_function(&direct_ops); + if (ret) + ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + } } - if (ret) { + if (arch_ret || ret) { remove_hash_entry(direct_functions, entry); ftrace_direct_func_count--; kfree(entry); + if (!arch_ret) + arch_unregister_ftrace_direct(ip, addr); } out_unlock: mutex_unlock(&direct_mutex); @@ -5155,6 +5169,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) remove_hash_entry(direct_functions, entry); ftrace_direct_func_count--; kfree(entry); + arch_unregister_ftrace_direct(ip, addr); out_unlock: mutex_unlock(&direct_mutex); -- 2.25.4
[RFC PATCH 04/14] ftrace: Remove ftrace_find_direct_func()
This is a revert of commit 013bf0da047481 ("ftrace: Add ftrace_find_direct_func()") ftrace_find_direct_func() was meant for use in the function graph tracer by architecture specific code. However, commit ff205766dbbee0 ("ftrace: Fix function_graph tracer interaction with BPF trampoline") disabled function graph tracer for direct calls leaving this without any users. In addition, modify_ftrace_direct() allowed redirecting the direct call to a different trampoline that was never registered through register_ftrace_direct(). This meant that ftrace_direct_funcs didn't capture all trampolines. Signed-off-by: Naveen N. Rao --- include/linux/ftrace.h | 5 --- kernel/trace/ftrace.c | 84 ++ 2 files changed, 4 insertions(+), 85 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1bd3a0356ae478..46b4b7ee28c41f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -285,7 +285,6 @@ extern int ftrace_direct_func_count; int register_ftrace_direct(unsigned long ip, unsigned long addr); int unregister_ftrace_direct(unsigned long ip, unsigned long addr); int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr); -struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr); int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, struct dyn_ftrace *rec, unsigned long old_addr, @@ -306,10 +305,6 @@ static inline int modify_ftrace_direct(unsigned long ip, { return -ENOTSUPP; } -static inline struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) -{ - return NULL; -} static inline int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, struct dyn_ftrace *rec, unsigned long old_addr, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3844a4a1346a9c..7476f2458b6d95 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5005,46 +5005,6 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, } #ifdef 
CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS - -struct ftrace_direct_func { - struct list_headnext; - unsigned long addr; - int count; -}; - -static LIST_HEAD(ftrace_direct_funcs); - -/** - * ftrace_find_direct_func - test an address if it is a registered direct caller - * @addr: The address of a registered direct caller - * - * This searches to see if a ftrace direct caller has been registered - * at a specific address, and if so, it returns a descriptor for it. - * - * This can be used by architecture code to see if an address is - * a direct caller (trampoline) attached to a fentry/mcount location. - * This is useful for the function_graph tracer, as it may need to - * do adjustments if it traced a location that also has a direct - * trampoline attached to it. - */ -struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) -{ - struct ftrace_direct_func *entry; - bool found = false; - - /* May be called by fgraph trampoline (protected by rcu tasks) */ - list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) { - if (entry->addr == addr) { - found = true; - break; - } - } - if (found) - return entry; - - return NULL; -} - /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -5064,7 +5024,6 @@ struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) */ int register_ftrace_direct(unsigned long ip, unsigned long addr) { - struct ftrace_direct_func *direct; struct ftrace_func_entry *entry; struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; @@ -5118,19 +5077,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (!entry) goto out_unlock; - direct = ftrace_find_direct_func(addr); - if (!direct) { - direct = kmalloc(sizeof(*direct), GFP_KERNEL); - if (!direct) { - kfree(entry); - goto out_unlock; - } - direct->addr = addr; - direct->count = 0; - list_add_rcu(&direct->next, &ftrace_direct_funcs); - ftrace_direct_func_count++; - } - + 
ftrace_direct_func_count++; entry->ip = ip; entry->direct = addr; __add_hash_entry(direct_functions, entry); @@ -5145,18 +5092,8 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (ret) { remove_hash_entry(direct_functions, entry); +
[RFC PATCH 01/14] ftrace: Fix updating FTRACE_FL_TRAMP
On powerpc, kprobe-direct.tc triggered FTRACE_WARN_ON() in ftrace_get_addr_new() followed by the below message: Bad trampoline accounting at: 4222522f (wake_up_process+0xc/0x20) (f001) The set of steps leading to this involved: - modprobe ftrace-direct-too - enable_probe - modprobe ftrace-direct - rmmod ftrace-direct <-- trigger The problem turned out to be that we were not updating flags in the ftrace record properly. From the above message about the trampoline accounting being bad, it can be seen that the ftrace record still has FTRACE_FL_TRAMP set though ftrace-direct module is going away. This happens because we are checking if any ftrace_ops has the FTRACE_FL_TRAMP flag set _before_ updating the filter hash. The fix for this is to look for any _other_ ftrace_ops that also needs FTRACE_FL_TRAMP. Signed-off-by: Naveen N. Rao --- kernel/trace/ftrace.c | 22 +- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8185f7240095f4..9c1bba8cc51b03 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1629,6 +1629,8 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude); +static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, @@ -1778,7 +1780,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, * to it. 
*/ if (ftrace_rec_count(rec) == 1 && - ftrace_find_tramp_ops_any(rec)) + ftrace_find_tramp_ops_any_other(rec, ops)) rec->flags |= FTRACE_FL_TRAMP; else rec->flags &= ~FTRACE_FL_TRAMP; @@ -2244,6 +2246,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) return NULL; } +static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (op == op_exclude || !op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *op) -- 2.25.4
[RFC PATCH 00/14] powerpc64: Add support for ftrace direct calls
This series adds support for DYNAMIC_FTRACE_WITH_DIRECT_CALLS for powerpc64le. This is mostly working fine for me, except for a soft lockup I see with the ftrace direct selftest. It happens when irqsoff tracer is being tested with the ftrace direct modules. This appears to be an existing upstream issue since I am able to reproduce the lockup without these patches. I will be looking into that to see if I can figure out the cause of those lockups. In the meantime, I would appreciate a review of these patches. - Naveen Naveen N. Rao (14): ftrace: Fix updating FTRACE_FL_TRAMP ftrace: Fix DYNAMIC_FTRACE_WITH_DIRECT_CALLS dependency ftrace: Fix cleanup in error path of register_ftrace_direct() ftrace: Remove ftrace_find_direct_func() ftrace: Add architectural helpers for [un]register_ftrace_direct() powerpc: Add support for CONFIG_HAVE_FUNCTION_ARG_ACCESS_API powerpc/ftrace: Remove dead code powerpc/ftrace: Use FTRACE_REGS_ADDR to identify the correct ftrace trampoline powerpc/ftrace: Use a hash table for tracking ftrace stubs powerpc/ftrace: Drop assumptions about ftrace trampoline target powerpc/ftrace: Use GPR save/restore macros in ftrace_graph_caller() powerpc/ftrace: Drop saving LR to stack save area for -mprofile-kernel powerpc/ftrace: Add support for register_ftrace_direct() for MPROFILE_KERNEL samples/ftrace: Add powerpc support for ftrace direct samples arch/powerpc/Kconfig | 2 + arch/powerpc/include/asm/ftrace.h | 14 + arch/powerpc/include/asm/ptrace.h | 31 ++ arch/powerpc/kernel/trace/ftrace.c| 314 +- .../powerpc/kernel/trace/ftrace_64_mprofile.S | 70 ++-- include/linux/ftrace.h| 7 +- kernel/trace/Kconfig | 2 +- kernel/trace/ftrace.c | 130 +++- samples/Kconfig | 2 +- samples/ftrace/ftrace-direct-modify.c | 58 samples/ftrace/ftrace-direct-too.c| 48 ++- samples/ftrace/ftrace-direct.c| 45 ++- 12 files changed, 519 insertions(+), 204 deletions(-) base-commit: 4c202167192a77481310a3cacae9f12618b92216 -- 2.25.4
Re: [PATCH 1/3 v5] ftrace: Have the callbacks receive a struct ftrace_regs instead of pt_regs
Hi Steven, Steven Rostedt wrote: From: "Steven Rostedt (VMware)" In preparation to have arguments of a function passed to callbacks attached to functions as default, change the default callback prototype to receive a struct ftrace_regs as the forth parameter instead of a pt_regs. For callbacks that set the FL_SAVE_REGS flag in their ftrace_ops flags, they will now need to get the pt_regs via a ftrace_get_regs() helper call. If this is called by a callback that their ftrace_ops did not have a FL_SAVE_REGS flag set, it that helper function will return NULL. This will allow the ftrace_regs to hold enough just to get the parameters and stack pointer, but without the worry that callbacks may have a pt_regs that is not completely filled. Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/kprobes/ftrace.c | 3 ++- fs/pstore/ftrace.c| 2 +- include/linux/ftrace.h| 16 ++-- include/linux/kprobes.h | 2 +- kernel/livepatch/patch.c | 3 ++- kernel/trace/ftrace.c | 27 +++ kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_functions.c| 9 - kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 20 +++- kernel/trace/trace_stack.c| 2 +- 13 files changed, 55 insertions(+), 37 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 629abaf25681..be73350955e4 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -345,7 +345,7 @@ static inline void wait_for_kprobe_optimizer(void) { } #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs); + struct ftrace_ops *ops, struct ftrace_regs *fregs); This breaks the build on non-x86 architectures that support KPROBES_ON_FTRACE. - Naveen
Re: [PATCH v3 8/9] perf mem: Return NULL for event 'ldst' on PowerPC
[+ Maddy] Leo Yan wrote: If user specifies event type "ldst", PowerPC's perf_mem_events__name() will wrongly return the store event name "cpu/mem-stores/". This patch changes to return NULL for the event "ldst" on PowerPC. Signed-off-by: Leo Yan --- tools/perf/arch/powerpc/util/mem-events.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/arch/powerpc/util/mem-events.c b/tools/perf/arch/powerpc/util/mem-events.c index 07fb5e049488..90c5a0760685 100644 --- a/tools/perf/arch/powerpc/util/mem-events.c +++ b/tools/perf/arch/powerpc/util/mem-events.c @@ -7,6 +7,8 @@ char *perf_mem_events__name(int i) { if (i == PERF_MEM_EVENTS__LOAD) return (char *) "cpu/mem-loads/"; - - return (char *) "cpu/mem-stores/"; + else if (i == PERF_MEM_EVENTS__STORE) + return (char *) "cpu/mem-stores/"; + else + return NULL; } -- 2.17.1
Re: [PATCH v1] powerpc/process: Remove unnecessary #ifdef CONFIG_FUNCTION_GRAPH_TRACER
Christophe Leroy wrote: ftrace_graph_ret_addr() is always defined and returns 'ip' when CONFIG_FUNCTION_GRAPH_TRACER is not set. So the #ifdef is not needed, remove it. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/process.c | 4 1 file changed, 4 deletions(-) LGTM. Acked-by: Naveen N. Rao - Naveen
Re: [PATCH] MAINTAINERS: adjust kprobes.rst entry to new location
Lukas Bulwahn wrote: Commit 2165b82fde82 ("docs: Move kprobes.rst from staging/ to trace/") moved kprobes.rst, but missed to adjust the MAINTAINERS entry. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains: warning: no file matchesF:Documentation/staging/kprobes.rst Adjust the entry to the new file location. Signed-off-by: Lukas Bulwahn --- Naveen, Masami-san, please ack. Jonathan, please pick this minor non-urgent patch into docs-next. applies cleanly on next-20200724 Ah, sorry. Hadn't noticed this change from Mauro. Acked-by: Naveen N. Rao Thanks! - Naveen
[PATCH 2/3] docs: staging/kprobes.rst: Move references to a separate appendix
Kprobes references are currently listed right after kretprobes example, and appears to be part of the same section. Move this out to a separate appendix for clarity. Signed-off-by: Naveen N. Rao --- Documentation/staging/kprobes.rst | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/staging/kprobes.rst b/Documentation/staging/kprobes.rst index 620699f5df22..b757b6dfd3d4 100644 --- a/Documentation/staging/kprobes.rst +++ b/Documentation/staging/kprobes.rst @@ -20,6 +20,7 @@ Kernel Probes (Kprobes) 10. Deprecated Features Appendix A: The kprobes debugfs interface Appendix B: The kprobes sysctl interface + Appendix C: References Concepts: Kprobes and Return Probes = @@ -710,11 +711,6 @@ Kretprobes Example See samples/kprobes/kretprobe_example.c -For additional information on Kprobes, refer to the following URLs: - -- https://www.ibm.com/developerworks/library/l-kprobes/index.html -- https://www.kernel.org/doc/ols/2006/ols2006v2-pages-109-124.pdf - Deprecated Features === @@ -797,3 +793,11 @@ Note that this knob *changes* the optimized state. This means that optimized probes (marked [OPTIMIZED]) will be unoptimized ([OPTIMIZED] tag will be removed). If the knob is turned on, they will be optimized again. +References +== + +For additional information on Kprobes, refer to the following URLs: + +- https://www.ibm.com/developerworks/library/l-kprobes/index.html +- https://www.kernel.org/doc/ols/2006/ols2006v2-pages-109-124.pdf + -- 2.25.4
[PATCH 3/3] docs: Move kprobes.rst from staging/ to trace/
Kprobes constitutes a dynamic tracing technology and as such can be moved alongside documentation of other tracing technologies. Signed-off-by: Naveen N. Rao --- Documentation/staging/index.rst | 1 - Documentation/trace/index.rst| 1 + Documentation/{staging => trace}/kprobes.rst | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/{staging => trace}/kprobes.rst (100%) diff --git a/Documentation/staging/index.rst b/Documentation/staging/index.rst index 184e6aece0a7..abd0d18254d2 100644 --- a/Documentation/staging/index.rst +++ b/Documentation/staging/index.rst @@ -7,7 +7,6 @@ Unsorted Documentation :maxdepth: 2 crc32 - kprobes lzo remoteproc rpmsg diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst index 7d83156c9ac1..879ebb9f647d 100644 --- a/Documentation/trace/index.rst +++ b/Documentation/trace/index.rst @@ -9,6 +9,7 @@ Linux Tracing Technologies tracepoint-analysis ftrace ftrace-uses + kprobes kprobetrace uprobetracer tracepoints diff --git a/Documentation/staging/kprobes.rst b/Documentation/trace/kprobes.rst similarity index 100% rename from Documentation/staging/kprobes.rst rename to Documentation/trace/kprobes.rst -- 2.25.4
[PATCH 0/3] docs: kprobes: Update URLs and move under trace/
This series updates some of the URLs in the kprobes document and moves the same under trace/ directory. - Naveen Naveen N. Rao (3): docs: staging/kprobes.rst: Update some of the references docs: staging/kprobes.rst: Move references to a separate appendix docs: Move kprobes.rst from staging/ to trace/ Documentation/staging/index.rst | 1 - Documentation/trace/index.rst| 1 + Documentation/{staging => trace}/kprobes.rst | 16 +--- 3 files changed, 10 insertions(+), 8 deletions(-) rename Documentation/{staging => trace}/kprobes.rst (99%) base-commit: f33d4075e512274c0c6edcd7602da2cf1d5a0341 -- 2.25.4
[PATCH 1/3] docs: staging/kprobes.rst: Update some of the references
Some of the kprobes references are not valid anymore. Update the URLs to point to their changed locations, where appropriate. Drop two URLs which do not exist anymore. Reported-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao --- Documentation/staging/kprobes.rst | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/staging/kprobes.rst b/Documentation/staging/kprobes.rst index 8baab8832c5b..620699f5df22 100644 --- a/Documentation/staging/kprobes.rst +++ b/Documentation/staging/kprobes.rst @@ -712,10 +712,8 @@ See samples/kprobes/kretprobe_example.c For additional information on Kprobes, refer to the following URLs: -- http://www-106.ibm.com/developerworks/library/l-kprobes.html?ca=dgr-lnxw42Kprobe -- http://www.redhat.com/magazine/005mar05/features/kprobes/ -- http://www-users.cs.umn.edu/~boutcher/kprobes/ -- http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115) +- https://www.ibm.com/developerworks/library/l-kprobes/index.html +- https://www.kernel.org/doc/ols/2006/ols2006v2-pages-109-124.pdf Deprecated Features === -- 2.25.4
Re: [PATCH] Replace HTTP links with HTTPS ones: kprobes
Masami Hiramatsu wrote: On Tue, 14 Jul 2020 00:02:49 +0200 "Alexander A. Klimov" wrote: Am 13.07.20 um 16:20 schrieb Masami Hiramatsu: > Hi Naveen and Alexander, > > On Fri, 10 Jul 2020 19:14:47 +0530 > "Naveen N. Rao" wrote: > >> Masami Hiramatsu wrote: >>> On Tue, 7 Jul 2020 21:49:59 +0200 >>> "Alexander A. Klimov" wrote: >>> >>>> Rationale: >>>> Reduces attack surface on kernel devs opening the links for MITM >>>> as HTTPS traffic is much harder to manipulate. >>>> >>>> Deterministic algorithm: >>>> For each file: >>>>If not .svg: >>>> For each line: >>>>If doesn't contain `\bxmlns\b`: >>>> For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: >>>>If both the HTTP and HTTPS versions >>>>return 200 OK and serve the same content: >>>> Replace HTTP with HTTPS. >>> >>> OK, but it seems that some of them are disappeared :( >>> >>> http://www-106.ibm.com/developerworks/library/l-kprobes.html?ca=dgr-lnxw42Kprobe >>> >>> -> https://www.ibm.com/developerworks/library/l-kprobes/index.html >> >> That looks right. >> >>> >>> http://www.redhat.com/magazine/005mar05/features/kprobes/ >>> >>> -> I can not find that. >> >> Ditto, we should drop that. >> >>> >>>> - http://www-users.cs.umn.edu/~boutcher/kprobes/ >>>> - http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115) >>> >>> Both are not found. >> >> It looks like the first link is gone, but there seems to be a copy in >> the web archive: >> https://web.archive.org/web/20061106154519/http://www-users.cs.umn.edu/~boutcher/kprobes/ >> >> I suppose we can drop that link. >> >>> >>> (OT, it seems http://www.linuxsymposium.org/ has been left from historical >>> Linux Symposium, we must remove it asap) >> >> Indeed, I think that link pointed to the Kprobes paper: >> https://www.kernel.org/doc/ols/2006/ols2006v2-pages-109-124.pdf > > Ah, there is. > Thank you for the confirmation. > Alexander, can you update other urls instead of just replacing the http with https? 
Sry, but I don't steal others' work (on principle). If I didn't do the work (e.g. searched the replacement URL), I don't deserve to author the respective commit. Alexander, Next time, please ask if you aren't sure about including others' suggestions -- no need to term it as "stealing". Masami asked if you can include this, and I shared what I thought are the correct URLs so that they can be included. We don't mind someone else doing this change. Besides, there are ways to acknowledge others, through a Suggested-by tag, as an example. Also my HTTPSifying task is not done yet. Hmm, Naveen, then, can you make the update? Sure, I will send a patch. - Naveen
Re: [PATCH] Replace HTTP links with HTTPS ones: kprobes
Masami Hiramatsu wrote: On Tue, 7 Jul 2020 21:49:59 +0200 "Alexander A. Klimov" wrote: Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. OK, but it seems that some of them are disappeared :( http://www-106.ibm.com/developerworks/library/l-kprobes.html?ca=dgr-lnxw42Kprobe -> https://www.ibm.com/developerworks/library/l-kprobes/index.html That looks right. http://www.redhat.com/magazine/005mar05/features/kprobes/ -> I can not find that. Ditto, we should drop that. - http://www-users.cs.umn.edu/~boutcher/kprobes/ - http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115) Both are not found. It looks like the first link is gone, but there seems to be a copy in the web archive: https://web.archive.org/web/20061106154519/http://www-users.cs.umn.edu/~boutcher/kprobes/ I suppose we can drop that link. (OT, it seems http://www.linuxsymposium.org/ has been left from historical Linux Symposium, we must remove it asap) Indeed, I think that link pointed to the Kprobes paper: https://www.kernel.org/doc/ols/2006/ols2006v2-pages-109-124.pdf - Naveen
Re: [PATCH] perf powerpc: Don't ignore sym-handling.c file
Arnaldo Carvalho de Melo wrote: Em Mon, May 11, 2020 at 11:45:09PM +0530, Sandipan Das escreveu: On 09/05/20 4:51 pm, Ravi Bangoria wrote: > Commit 7eec00a74720 ("perf symbols: Consolidate symbol fixup issue") > removed powerpc specific sym-handling.c file from Build. This wasn't > caught by build CI because all functions in this file are declared > as __weak in common code. Fix it. > > Fixes: 7eec00a74720 ("perf symbols: Consolidate symbol fixup issue") > Reported-by: Sandipan Das > Signed-off-by: Ravi Bangoria > --- > tools/perf/arch/powerpc/util/Build | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build > index e5c9504f8586..e86e210bf514 100644 > --- a/tools/perf/arch/powerpc/util/Build > +++ b/tools/perf/arch/powerpc/util/Build > @@ -2,6 +2,7 @@ perf-y += header.o > perf-y += kvm-stat.o > perf-y += perf_regs.o > perf-y += mem-events.o > +perf-y += sym-handling.o > > perf-$(CONFIG_DWARF) += dwarf-regs.o > perf-$(CONFIG_DWARF) += skip-callchain-idx.o > Thanks for fixing this! Acked-by: Sandipan Das Leo, Naveen, can you comment on this? Shoot -- this is a bad miss, I should have caught it. FWIW: Reviewed-by: Naveen N. Rao Thanks, Naveen
Re: [PATCH 12/14] docs: move remaining stuff under Documentation/*.txt to Documentation/staging
Masami Hiramatsu wrote: On Fri, 1 May 2020 17:37:56 +0200 Mauro Carvalho Chehab wrote: There are several files that I was unable to find a proper place for them, and 3 ones that are still in plain old text format. Let's place those stuff behind the carpet, as we'd like to keep the root directory clean. We can later discuss and move those into better places. Hi Mauro, Thanks for cleaning it up! Tentatively moving kprobes.txt under staging/ is good to me. Acked-by: Masami Hiramatsu BTW, I think kprobes.txt is under trace/ or we may be better making a new core-api/events/ directory and prepare other event systems (PMU, uprobes, and hw_breakpoint.) I think it would be good to move kprobes.txt under trace/ -- all other tracing bits are already present there, including uprobes. - Naveen
Re: [PATCH 0/2] pseries/hotplug: Change the default behaviour of cede_offline
Michael Ellerman wrote: "Naveen N. Rao" writes: Michael Ellerman wrote: "Gautham R. Shenoy" writes: From: "Gautham R. Shenoy" Currently on Pseries Linux Guests, the offlined CPU can be put to one of the following two states: - Long term processor cede (also called extended cede) - Returned to the Hypervisor via RTAS "stop-self" call. This is controlled by the kernel boot parameter "cede_offline=on/off". By default the offlined CPUs enter extended cede. Since commit 3aa565f53c39 ("powerpc/pseries: Add hooks to put the CPU into an appropriate offline state") (Nov 2009) Which you wrote :) Why was that wrong? The PHYP hypervisor considers CPUs in extended cede to be "active" since the CPUs are still under the control fo the Linux Guests. Hence, when we change the SMT modes by offlining the secondary CPUs, the PURR and the RWMR SPRs will continue to count the values for offlined CPUs in extended cede as if they are online. One of the expectations with PURR is that the for an interval of time, the sum of the PURR increments across the online CPUs of a core should equal the number of timebase ticks for that interval. This is currently not the case. But why does that matter? It's just some accounting stuff, does it actually break something meaningful? Yes, this broke lparstat at the very least (though its quite unfortunate we took so long to notice). By "so long" you mean 10 years? Also I've never heard of lparstat, but I assume it's one of these tools that's meant to behave like the AIX equivalent? Yes, and yes. lparstat is part of powerpc-utils. If it's been "broken" for 10 years and no one noticed, I'd argue the current behaviour is now "correct" and fixing it would actually be a breakage :) :) More on this below... 
With SMT disabled, and under load: $ sudo lparstat 1 10 System Configuration type=Shared mode=Uncapped smt=Off lcpu=2 mem=7759616 kB cpus=6 ent=1.00 %user %sys %wait%idlephysc %entc lbusy vcsw phint - - --- - - - - 100.00 0.00 0.00 0.00 1.10 110.00 100.00 128784460 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128784860 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128785260 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128785662 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786062 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786462 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786862 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128787262 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128787664 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128788064 0 What about that is wrong? The 'physc' column represents cpu usage in units of physical cores. With 2 virtual cores ('lcpu=2') in uncapped, shared processor mode, we expect this to be closer to 2 when fully loaded (and spare capacity elsewhere in the system). With cede_offline=off: $ sudo lparstat 1 10 System Configuration type=Shared mode=Uncapped smt=Off lcpu=2 mem=7759616 kB cpus=6 ent=1.00 %user %sys %wait%idlephysc %entc lbusy vcsw phint - - --- - - - - 100.00 0.00 0.00 0.00 1.94 194.00 100.00 128961588 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128961988 0 100.00 0.00 0.00 0.00 inf inf 100.00 128962392 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128962792 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963192 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963592 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963992 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128964392 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128964792 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128965194 0 [The 'inf' values there show a different bug] Also, since we expose [S]PURR through sysfs, any tools that make use of that directly are also affected due to this. 
But again if we've had the current behaviour for 10 years then arguably that's now the correct behaviour. That's a fair point, and probably again points to this area getting less tested. One of the main reasons for this being caught now though, is that there are workloads being tested under lower SMT levels now. So, I suspect no one has been relying on this behavior and we can consider this to be a bug. Thanks, Naveen
Re: [PATCH 0/2] pseries/hotplug: Change the default behaviour of cede_offline
Michael Ellerman wrote: "Gautham R. Shenoy" writes: From: "Gautham R. Shenoy" Currently on Pseries Linux Guests, the offlined CPU can be put to one of the following two states: - Long term processor cede (also called extended cede) - Returned to the Hypervisor via RTAS "stop-self" call. This is controlled by the kernel boot parameter "cede_offline=on/off". By default the offlined CPUs enter extended cede. Since commit 3aa565f53c39 ("powerpc/pseries: Add hooks to put the CPU into an appropriate offline state") (Nov 2009) Which you wrote :) Why was that wrong? The PHYP hypervisor considers CPUs in extended cede to be "active" since the CPUs are still under the control fo the Linux Guests. Hence, when we change the SMT modes by offlining the secondary CPUs, the PURR and the RWMR SPRs will continue to count the values for offlined CPUs in extended cede as if they are online. One of the expectations with PURR is that the for an interval of time, the sum of the PURR increments across the online CPUs of a core should equal the number of timebase ticks for that interval. This is currently not the case. But why does that matter? It's just some accounting stuff, does it actually break something meaningful? Yes, this broke lparstat at the very least (though its quite unfortunate we took so long to notice). 
With SMT disabled, and under load: $ sudo lparstat 1 10 System Configuration type=Shared mode=Uncapped smt=Off lcpu=2 mem=7759616 kB cpus=6 ent=1.00 %user %sys %wait%idlephysc %entc lbusy vcsw phint - - --- - - - - 100.00 0.00 0.00 0.00 1.10 110.00 100.00 128784460 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128784860 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128785260 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128785662 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786062 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786462 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128786862 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128787262 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128787664 0 100.00 0.00 0.00 0.00 1.07 107.00 100.00 128788064 0 With cede_offline=off: $ sudo lparstat 1 10 System Configuration type=Shared mode=Uncapped smt=Off lcpu=2 mem=7759616 kB cpus=6 ent=1.00 %user %sys %wait%idlephysc %entc lbusy vcsw phint - - --- - - - - 100.00 0.00 0.00 0.00 1.94 194.00 100.00 128961588 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128961988 0 100.00 0.00 0.00 0.00 inf inf 100.00 128962392 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128962792 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963192 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963592 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128963992 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128964392 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128964792 0 100.00 0.00 0.00 0.00 1.91 191.00 100.00 128965194 0 [The 'inf' values there show a different bug] Also, since we expose [S]PURR through sysfs, any tools that make use of that directly are also affected due to this. - Naveen
Re: [PATCH 2/2] powerpc/watchpoint: Disable watchpoint hit by larx/stcx instructions
Ravi Bangoria wrote: If watchpoint exception is generated by larx/stcx instructions, the reservation created by larx gets lost while handling exception, and thus stcx instruction always fails. Generally these instructions are used in a while(1) loop, for example spinlocks. And because stcx never succeeds, it loops forever and ultimately hangs the system. Note that ptrace anyway works in one-shot mode and thus for ptrace we don't change the behaviour. It's up to ptrace user to take care of this. Signed-off-by: Ravi Bangoria --- arch/powerpc/kernel/hw_breakpoint.c | 49 +++-- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 28ad3171bb82..9fa496a598ce 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -195,14 +195,32 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) tsk->thread.last_hit_ubp = NULL; } +static bool is_larx_stcx_instr(struct pt_regs *regs, unsigned int instr) +{ + int ret, type; + struct instruction_op op; + + ret = analyse_instr(&op, regs, instr); + type = GETTYPE(op.type); + return (!ret && (type == LARX || type == STCX)); +} + /* * Handle debug exception notifications. */ static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp, unsigned long addr) { - int stepped; - unsigned int instr; + unsigned int instr = 0; + + if (__get_user_inatomic(instr, (unsigned int *)regs->nip)) + goto fail; + + if (is_larx_stcx_instr(regs, instr)) { + printk_ratelimited("Watchpoint: Can't emulate/single-step larx/" + "stcx instructions. Disabling watchpoint.\n"); The below WARN() uses the term 'breakpoint'. Better to use consistent terminology. I would rewrite the above as: printk_ratelimited("Breakpoint hit on instruction that can't be emulated. " "Breakpoint at 0x%lx will be disabled.\n", addr); Otherwise: Acked-by: Naveen N. 
Rao - Naveen + goto disable; + } /* Do not emulate user-space instructions, instead single-step them */ if (user_mode(regs)) { @@ -211,23 +229,22 @@ static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp, return false; } - stepped = 0; - instr = 0; - if (!__get_user_inatomic(instr, (unsigned int *)regs->nip)) - stepped = emulate_step(regs, instr); + if (!emulate_step(regs, instr)) + goto fail; + return true; + +fail: /* -* emulate_step() could not execute it. We've failed in reliably -* handling the hw-breakpoint. Unregister it and throw a warning -* message to let the user know about it. +* We've failed in reliably handling the hw-breakpoint. Unregister +* it and throw a warning message to let the user know about it. */ - if (!stepped) { - WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " - "0x%lx will be disabled.", addr); - perf_event_disable_inatomic(bp); - return false; - } - return true; + WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " + "0x%lx will be disabled.", addr); + +disable: + perf_event_disable_inatomic(bp); + return false; } int hw_breakpoint_handler(struct die_args *args) -- 2.21.0
[PATCH 3/3] powerpc: Use ftrace_graph_ret_addr() when unwinding
With support for HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, ftrace_graph_ret_addr() provides more robust unwinding when function graph is in use. Update show_stack() to use the same. With dump_stack() added to sysrq_sysctl_handler(), before this patch: root@(none):/sys/kernel/debug/tracing# cat /proc/sys/kernel/sysrq CPU: 0 PID: 218 Comm: cat Not tainted 5.3.0-rc7-00868-g8453ad4a078c-dirty #20 Call Trace: [c000d1e13c30] [c006ab98] return_to_handler+0x0/0x40 (dump_stack+0xe8/0x164) (unreliable) [c000d1e13c80] [c0145680] sysrq_sysctl_handler+0x48/0xb8 [c000d1e13cd0] [c006ab98] return_to_handler+0x0/0x40 (proc_sys_call_handler+0x274/0x2a0) [c000d1e13d60] [c006ab98] return_to_handler+0x0/0x40 (return_to_handler+0x0/0x40) [c000d1e13d80] [c006ab98] return_to_handler+0x0/0x40 (__vfs_read+0x3c/0x70) [c000d1e13dd0] [c006ab98] return_to_handler+0x0/0x40 (vfs_read+0xb8/0x1b0) [c000d1e13e20] [c006ab98] return_to_handler+0x0/0x40 (ksys_read+0x7c/0x140) After this patch: Call Trace: [c000d1e33c30] [c006ab58] return_to_handler+0x0/0x40 (dump_stack+0xe8/0x164) (unreliable) [c000d1e33c80] [c0145680] sysrq_sysctl_handler+0x48/0xb8 [c000d1e33cd0] [c006ab58] return_to_handler+0x0/0x40 (proc_sys_call_handler+0x274/0x2a0) [c000d1e33d60] [c006ab58] return_to_handler+0x0/0x40 (__vfs_read+0x3c/0x70) [c000d1e33d80] [c006ab58] return_to_handler+0x0/0x40 (vfs_read+0xb8/0x1b0) [c000d1e33dd0] [c006ab58] return_to_handler+0x0/0x40 (ksys_read+0x7c/0x140) [c000d1e33e20] [c006ab58] return_to_handler+0x0/0x40 (system_call+0x5c/0x68) Signed-off-by: Naveen N. 
Rao --- arch/powerpc/kernel/process.c | 19 ++- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 24621e7e5033..f289bdd2b562 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2047,10 +2047,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) int count = 0; int firstframe = 1; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - struct ftrace_ret_stack *ret_stack; - extern void return_to_handler(void); - unsigned long rth = (unsigned long)return_to_handler; - int curr_frame = 0; + unsigned long ret_addr; + int ftrace_idx = 0; #endif if (tsk == NULL) @@ -2079,15 +2077,10 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) if (!firstframe || ip != lr) { printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if ((ip == rth) && curr_frame >= 0) { - ret_stack = ftrace_graph_get_ret_stack(current, - curr_frame++); - if (ret_stack) - pr_cont(" (%pS)", - (void *)ret_stack->ret); - else - curr_frame = -1; - } + ret_addr = ftrace_graph_ret_addr(current, + &ftrace_idx, ip, stack); + if (ret_addr != ip) + pr_cont(" (%pS)", (void *)ret_addr); #endif if (firstframe) pr_cont(" (unreliable)"); -- 2.23.0
[PATCH 2/3] powerpc/ftrace: Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
This associates entries in the ftrace_ret_stack with corresponding stack frames, enabling more robust stack unwinding. Also update the only user of ftrace_graph_ret_addr() to pass the stack pointer. Signed-off-by: Naveen N. Rao --- arch/powerpc/include/asm/asm-prototypes.h | 3 ++- arch/powerpc/include/asm/ftrace.h | 2 ++ arch/powerpc/kernel/stacktrace.c | 2 +- arch/powerpc/kernel/trace/ftrace.c | 5 +++-- arch/powerpc/kernel/trace/ftrace_32.S | 1 + arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 1 + arch/powerpc/kernel/trace/ftrace_64_pg.S | 1 + 7 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 49196d35e3bb..8561498e653c 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -134,7 +134,8 @@ extern int __ucmpdi2(u64, u64); /* tracing */ void _mcount(void); -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip); +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, + unsigned long sp); void pnv_power9_force_smt4_catch(void); void pnv_power9_force_smt4_release(void); diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 3dfb80b86561..f54a08a2cd70 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -8,6 +8,8 @@ #define MCOUNT_ADDR((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + #ifdef __ASSEMBLY__ /* Based off of objdump optput from glibc */ diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 1e2276963f6d..e2a46cfed5fd 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -182,7 +182,7 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, * FIXME: IMHO these tests do not belong in * arch-dependent code, they are generic. 
*/ - ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, NULL); + ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, stack); #ifdef CONFIG_KPROBES /* * Mark stacktraces with kretprobed functions on them diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index be1ca98fce5c..7ea0ca044b65 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -944,7 +944,8 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. Return the address we want to divert to. */ -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, + unsigned long sp) { unsigned long return_hooker; @@ -956,7 +957,7 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) return_hooker = ppc_function_entry(return_to_handler); - if (!function_graph_enter(parent, ip, 0, NULL)) + if (!function_graph_enter(parent, ip, 0, (unsigned long *)sp)) parent = return_hooker; out: return parent; diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S index 183f608efb81..e023ae59c429 100644 --- a/arch/powerpc/kernel/trace/ftrace_32.S +++ b/arch/powerpc/kernel/trace/ftrace_32.S @@ -50,6 +50,7 @@ _GLOBAL(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(ftrace_graph_caller) + addir5, r1, 48 /* load r4 with local address */ lwz r4, 44(r1) subir4, r4, MCOUNT_INSN_SIZE diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index 74acbf16a666..f9fd5f743eba 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -294,6 +294,7 @@ _GLOBAL(ftrace_graph_caller) std r2, 24(r1) ld r2, PACATOC(r13)/* get kernel TOC in r2 */ + addir5, r1, 112 mfctr r4 /* ftrace_caller has moved local addr here */ std r4, 40(r1) mflrr3 /* 
ftrace_caller has restored LR from stack */ diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S index e41a7d13c99c..6708e24db0ab 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_pg.S +++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S @@ -41,6 +41,7 @@ _GLOBAL(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(ftrace_graph_caller) + addir5, r1, 112 /* load r4 with local address */ ld r4
[PATCH 0/3] powerpc/ftrace: Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR for more robust stack unwinding when function graph tracer is in use. Convert powerpc show_stack() to use ftrace_graph_ret_addr() for better stack unwinding. - Naveen Naveen N. Rao (3): ftrace: Look up the address of return_to_handler() using helpers powerpc/ftrace: Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR powerpc: Use ftrace_graph_ret_addr() when unwinding arch/powerpc/include/asm/asm-prototypes.h | 3 ++- arch/powerpc/include/asm/ftrace.h | 2 ++ arch/powerpc/kernel/process.c | 19 ++- arch/powerpc/kernel/stacktrace.c | 2 +- arch/powerpc/kernel/trace/ftrace.c| 5 +++-- arch/powerpc/kernel/trace/ftrace_32.S | 1 + .../powerpc/kernel/trace/ftrace_64_mprofile.S | 1 + arch/powerpc/kernel/trace/ftrace_64_pg.S | 1 + kernel/trace/fgraph.c | 4 ++-- 9 files changed, 19 insertions(+), 19 deletions(-) -- 2.23.0
[PATCH 1/3] ftrace: Look up the address of return_to_handler() using helpers
This ensures that we use the right address on architectures that use function descriptors. Signed-off-by: Naveen N. Rao --- kernel/trace/fgraph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8dfd5021b933..7950a0356042 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -276,7 +276,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, int index = task->curr_ret_stack; int i; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; if (index < 0) @@ -294,7 +294,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, { int task_idx; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; task_idx = task->curr_ret_stack; -- 2.23.0
Re: [PATCH v3 2/3] Powerpc64/Watchpoint: Don't ignore extraneous exceptions
Ravi Bangoria wrote: On Powerpc64, watchpoint match range is double-word granular. On a watchpoint hit, DAR is set to the first byte of overlap between actual access and watched range. And thus it's quite possible that DAR does not point inside user specified range. Ex, say user creates a watchpoint with address range 0x1004 to 0x1007. So hw would be configured to watch from 0x1000 to 0x1007. If there is a 4 byte access from 0x1002 to 0x1005, DAR will point to 0x1002 and thus interrupt handler considers it as extraneous, but it's actually not, because part of the access belongs to what user has asked. So, let kernel pass it on to user and let user decide what to do with it instead of silently ignoring it. The drawback is, it can generate false positive events. I think you should do the additional validation here, instead of generating false positives. You should be able to read the instruction, run it through analyse_instr(), and then use OP_IS_LOAD_STORE() and GETSIZE() to understand the access range. This can be used to then perform a better match against what the user asked for. - Naveen
Re: [PATCH -tip v2] kprobes: Prohibit probing on BUG() and WARN() address
Masami Hiramatsu wrote: Since BUG() and WARN() may use a trap (e.g. UD2 on x86) to get the address where the BUG() has occurred, kprobes can not do single-step out-of-line that instruction. So prohibit probing on such address. Without this fix, if someone put a kprobe on WARN(), the kernel will crash with invalid opcode error instead of outputting a warning message, because kernel can not find correct bug address. Signed-off-by: Masami Hiramatsu --- Changes in v2: - Add find_bug() stub function for !CONFIG_GENERIC_BUG - Cast the p->addr to unsigned long. --- include/linux/bug.h |5 + kernel/kprobes.c|3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) Acked-by: Naveen N. Rao - Naveen
Re: [PATCH 1/2] ftrace: Fix NULL pointer dereference in t_probe_next()
Steven Rostedt wrote: On Thu, 4 Jul 2019 20:04:41 +0530 "Naveen N. Rao" wrote: kernel/trace/ftrace.c | 4 1 file changed, 4 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b037295a1f1..0791eafb693d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3093,6 +3093,10 @@ t_probe_next(struct seq_file *m, loff_t *pos) hnd = &iter->probe_entry->hlist; hash = iter->probe->ops.func_hash->filter_hash; + + if (!hash) + return NULL; + size = 1 << hash->size_bits; retry: OK, I added this, but I'm also adding this on top: Thanks, the additional comments do make this much clearer. Regards, Naveen
[tip: perf/core] perf arch powerpc: Sync powerpc syscall.tbl
The following commit has been merged into the perf/core branch of tip: Commit-ID: 0a56e0603fa13af08816d673f6f71b68cda2fb2e Gitweb: https://git.kernel.org/tip/0a56e0603fa13af08816d673f6f71b68cda2fb2e Author:Naveen N. Rao AuthorDate:Tue, 27 Aug 2019 12:44:58 +05:30 Committer: Arnaldo Carvalho de Melo CommitterDate: Wed, 28 Aug 2019 10:25:38 -03:00 perf arch powerpc: Sync powerpc syscall.tbl Copy over powerpc syscall.tbl to grab changes from the below commits: commit cee3536d24a1 ("powerpc: Wire up clone3 syscall") commit 1a271a68e030 ("arch: mark syscall number 435 reserved for clone3") commit 7615d9e1780e ("arch: wire-up pidfd_open()") commit d8076bdb56af ("uapi: Wire up the mount API syscalls on non-x86 arches [ver #2]") commit 39036cd27273 ("arch: add pidfd and io_uring syscalls everywhere") commit 48166e6ea47d ("y2038: add 64-bit time_t syscalls to all 32-bit architectures") commit d33c577cccd0 ("y2038: rename old time and utime syscalls") commit 00bf25d693e7 ("y2038: use time32 syscall names on 32-bit") commit 8dabe7245bbc ("y2038: syscalls: rename y2038 compat syscalls") commit 0d6040d46817 ("arch: add split IPC system calls where needed") Reported-by: Nicholas Piggin Signed-off-by: Naveen N. 
Rao Cc: Ravi Bangoria Cc: linuxppc-...@lists.ozlabs.org Link: http://lkml.kernel.org/r/20190827071458.19897-1-naveen.n@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 146 +--- 1 file changed, 119 insertions(+), 27 deletions(-) diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index db3bbb8..43f736e 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -20,7 +20,9 @@ 10 common unlink sys_unlink 11 nospu execve sys_execve compat_sys_execve 12 common chdir sys_chdir -13 common timesys_time compat_sys_time +13 32 timesys_time32 +13 64 timesys_time +13 spu timesys_time 14 common mknod sys_mknod 15 common chmod sys_chmod 16 common lchown sys_lchown @@ -36,14 +38,17 @@ 22 spu umount sys_ni_syscall 23 common setuid sys_setuid 24 common getuid sys_getuid -25 common stime sys_stime compat_sys_stime +25 32 stime sys_stime32 +25 64 stime sys_stime +25 spu stime sys_stime 26 nospu ptrace sys_ptrace compat_sys_ptrace 27 common alarm sys_alarm 28 32 oldfstatsys_fstat sys_ni_syscall 28 64 oldfstatsys_ni_syscall 28 spu oldfstatsys_ni_syscall 29 nospu pause sys_pause -30 nospu utime sys_utime compat_sys_utime +30 32 utime sys_utime32 +30 64 utime sys_utime 31 common sttysys_ni_syscall 32 common gttysys_ni_syscall 33 common access sys_access @@ -157,7 +162,9 @@ 121common setdomainname sys_setdomainname 122common uname sys_newuname 123common modify_ldt sys_ni_syscall -124common adjtimexsys_adjtimex compat_sys_adjtimex +12432 adjtimexsys_adjtimex_time32 +12464 adjtimexsys_adjtimex +124spu adjtimexsys_adjtimex 125common mprotectsys_mprotect 12632 sigprocmask sys_sigprocmask compat_sys_sigprocmask 12664 sigprocmask sys_ni_syscall @@ -198,8 +205,12 @@ 158common sched_yield sys_sched_yield 159common sched_get_priority_max sys_sched_get_priority_max 160common sched_get_priority_min 
sys_sched_get_priority_min -161common sched_rr_get_interval sys_sched_rr_get_interval
[PATCH] perf arch powerpc: Sync powerpc syscall.tbl
Copy over powerpc syscall.tbl to grab changes from the below commits: commit cee3536d24a1 ("powerpc: Wire up clone3 syscall") commit 1a271a68e030 ("arch: mark syscall number 435 reserved for clone3") commit 7615d9e1780e ("arch: wire-up pidfd_open()") commit d8076bdb56af ("uapi: Wire up the mount API syscalls on non-x86 arches [ver #2]") commit 39036cd27273 ("arch: add pidfd and io_uring syscalls everywhere") commit 48166e6ea47d ("y2038: add 64-bit time_t syscalls to all 32-bit architectures") commit d33c577cccd0 ("y2038: rename old time and utime syscalls") commit 00bf25d693e7 ("y2038: use time32 syscall names on 32-bit") commit 8dabe7245bbc ("y2038: syscalls: rename y2038 compat syscalls") commit 0d6040d46817 ("arch: add split IPC system calls where needed") Reported-by: Nicholas Piggin Signed-off-by: Naveen N. Rao --- .../arch/powerpc/entry/syscalls/syscall.tbl | 146 ++ 1 file changed, 119 insertions(+), 27 deletions(-) diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index db3bbb8744af..43f736ed47f2 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -20,7 +20,9 @@ 10 common unlink sys_unlink 11 nospu execve sys_execve compat_sys_execve 12 common chdir sys_chdir -13 common timesys_time compat_sys_time +13 32 timesys_time32 +13 64 timesys_time +13 spu timesys_time 14 common mknod sys_mknod 15 common chmod sys_chmod 16 common lchown sys_lchown @@ -36,14 +38,17 @@ 22 spu umount sys_ni_syscall 23 common setuid sys_setuid 24 common getuid sys_getuid -25 common stime sys_stime compat_sys_stime +25 32 stime sys_stime32 +25 64 stime sys_stime +25 spu stime sys_stime 26 nospu ptrace sys_ptrace compat_sys_ptrace 27 common alarm sys_alarm 28 32 oldfstatsys_fstat sys_ni_syscall 28 64 oldfstatsys_ni_syscall 28 spu oldfstatsys_ni_syscall 29 nospu pause sys_pause -30 nospu utime sys_utime compat_sys_utime +30 32 utime sys_utime32 +30 64 
utime sys_utime 31 common sttysys_ni_syscall 32 common gttysys_ni_syscall 33 common access sys_access @@ -157,7 +162,9 @@ 121common setdomainname sys_setdomainname 122common uname sys_newuname 123common modify_ldt sys_ni_syscall -124common adjtimexsys_adjtimex compat_sys_adjtimex +12432 adjtimexsys_adjtimex_time32 +12464 adjtimexsys_adjtimex +124spu adjtimexsys_adjtimex 125common mprotectsys_mprotect 12632 sigprocmask sys_sigprocmask compat_sys_sigprocmask 12664 sigprocmask sys_ni_syscall @@ -198,8 +205,12 @@ 158common sched_yield sys_sched_yield 159common sched_get_priority_max sys_sched_get_priority_max 160common sched_get_priority_min sys_sched_get_priority_min -161common sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval -162common nanosleep sys_nanosleep compat_sys_nanosleep +16132 sched_rr_get_interval sys_sched_rr_get_interval_time32 +16164 sched_rr_get_interval sys_sched_rr_get_interval +161spu sched_rr_get_interval sys_sched_rr_get_interval +16232 nanosleep sys_nanosleep_time32 +16264 nanosleep sys_nanosleep +162spu nanosleep sys_nanosleep 163common mremap
Re: [PATCH] bpf: handle 32-bit zext during constant blinding
Jiong Wang wrote: Naveen N. Rao writes: Since BPF constant blinding is performed after the verifier pass, the ALU32 instructions inserted for doubleword immediate loads don't have a corresponding zext instruction. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao Thanks for the fix. Reviewed-by: Jiong Wang Just two other comments during review in case I am wrong somewhere. - Use verifier_zext instead of bpf_jit_needs_zext() seems better, even though the latter could avoid extending function argument. Because JIT back-ends look at verifier_zext, true means zext inserted by verifier so JITs won't do the code-gen. Use verifier_zext is sort of keeping JIT blinding the same behaviour as the verifier even though blinding doesn't belong to verifier, but for such insn patching, it could be seen as an extension of verifier, therefore use verifier_zext seems better than bpf_jit_needs_zext() to me. - JIT blinding is also escaping the HI32 randomization which happens inside verifier, otherwise x86-64 regression should have caught this issue. Jiong, Thanks for the review. Alexei, Daniel, Can you please pick this up for v5.3. This is a regression and is causing a crash on powerpc. - Naveen
[PATCH] bpf: handle 32-bit zext during constant blinding
Since BPF constant blinding is performed after the verifier pass, the ALU32 instructions inserted for doubleword immediate loads don't have a corresponding zext instruction. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao --- Changes since RFC: - Removed changes to ALU32 and JMP32 ops since those don't alter program execution, and the verifier would have already accounted for them. kernel/bpf/core.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8191a7db2777..66088a9e9b9e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -890,7 +890,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, - struct bpf_insn *to_buff) + struct bpf_insn *to_buff, + bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_int(); @@ -1005,6 +1006,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + if (emit_zext) + *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; @@ -1088,7 +1091,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); - rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, + clone->aux->verifier_zext); if (!rewritten) continue; -- 2.22.0
Re: Regression fix for bpf in v5.3 (was Re: [RFC PATCH] bpf: handle 32-bit zext during constant blinding)
Jiong Wang wrote: Michael Ellerman writes: "Naveen N. Rao" writes: Since BPF constant blinding is performed after the verifier pass, there are certain ALU32 instructions inserted which don't have a corresponding zext instruction inserted after. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao --- This approach (the location where zext is being introduced below, in particular) works for powerpc, but I am not entirely sure if this is sufficient for other architectures as well. This is broken on v5.3-rc4. Any comment on this? Have commented on https://marc.info/?l=linux-netdev&m=156637836024743&w=2 The fix looks correct to me on "BPF_LD | BPF_IMM | BPF_DW", but looks unnecessary on two other places. It would be great if you or Naveen could confirm it. Jiong, Thanks for the review. I can now see why the other two changes are not necessary. I will post a follow-on patch. Thanks! - Naveen
Re: [RFC PATCH] bpf: handle 32-bit zext during constant blinding
Naveen N. Rao wrote: Since BPF constant blinding is performed after the verifier pass, there are certain ALU32 instructions inserted which don't have a corresponding zext instruction inserted after. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao --- This approach (the location where zext is being introduced below, in particular) works for powerpc, but I am not entirely sure if this is sufficient for other architectures as well. This is broken on v5.3-rc4. Alexei, Daniel, Jiong, Any feedback on this? - Naveen
Re: [PATCH 13/16] include/asm-generic: prefer __section from compiler_attributes.h
Nick Desaulniers wrote: Reported-by: Sedat Dilek Suggested-by: Josh Poimboeuf Signed-off-by: Nick Desaulniers --- Acked-by: Naveen N. Rao - Naveen
Re: [PATCH 4/4] arm64: implement KPROBES_ON_FTRACE
Jisheng Zhang wrote: This patch implements KPROBES_ON_FTRACE for arm64. ~ # mount -t debugfs debugfs /sys/kernel/debug/ ~ # cd /sys/kernel/debug/ /sys/kernel/debug # echo 'p _do_fork' > tracing/kprobe_events before the patch: /sys/kernel/debug # cat kprobes/list ff801009ff7c k _do_fork+0x4 [DISABLED] This looks wrong -- we should not be allowing kprobe to be registered on ftrace address without KPROBES_ON_FTRACE. Is _do_fork+0x4 the location of ftrace entry on arm64? - Naveen
Re: [PATCH 1/4] kprobes: adjust kprobe addr for KPROBES_ON_FTRACE
Jisheng Zhang wrote: For KPROBES_ON_FTRACE case, we need to adjust the kprobe's addr correspondingly. Signed-off-by: Jisheng Zhang --- kernel/kprobes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9873fc627d61..f8400753a8a9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1560,6 +1560,9 @@ int register_kprobe(struct kprobe *p) addr = kprobe_addr(p); if (IS_ERR(addr)) return PTR_ERR(addr); +#ifdef CONFIG_KPROBES_ON_FTRACE + addr = (kprobe_opcode_t *)ftrace_call_adjust((unsigned long)addr); +#endif p->addr = addr; I'm not sure what this is achieving, but looks wrong to me. If you intend to have kprobes default to using ftrace entry for probing functions, consider over-riding kprobe_lookup_name() -- see powerpc variant for example. - Naveen
[RFC PATCH] bpf: handle 32-bit zext during constant blinding
Since BPF constant blinding is performed after the verifier pass, there are certain ALU32 instructions inserted which don't have a corresponding zext instruction inserted after. This is causing a kernel oops on powerpc and can be reproduced by running 'test_cgroup_storage' with bpf_jit_harden=2. Fix this by emitting BPF_ZEXT during constant blinding if prog->aux->verifier_zext is set. Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result") Reported-by: Michael Ellerman Signed-off-by: Naveen N. Rao --- This approach (the location where zext is being introduced below, in particular) works for powerpc, but I am not entirely sure if this is sufficient for other architectures as well. This is broken on v5.3-rc4. - Naveen kernel/bpf/core.c | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8191a7db2777..d84146e6fd9e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -890,7 +890,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, - struct bpf_insn *to_buff) + struct bpf_insn *to_buff, + bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_int(); @@ -939,6 +940,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX); + if (emit_zext) + *to++ = BPF_ZEXT_REG(from->dst_reg); break; case BPF_ALU64 | BPF_ADD | BPF_K: @@ -992,6 +995,10 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, off -= 2; *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + if (emit_zext) { + *to++ = BPF_ZEXT_REG(BPF_REG_AX); + off--; + } *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; @@ -1005,6 
+1012,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + if (emit_zext) + *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; @@ -1088,7 +1097,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); - rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, + clone->aux->verifier_zext); if (!rewritten) continue; -- 2.22.0
Re: [PATCH 0/2] ftrace: two fixes with func_probes handling
Naveen N. Rao wrote: Two patches addressing bugs in ftrace function probe handling. The first patch addresses a NULL pointer dereference reported by LTP tests, while the second one is a trivial patch to address a missing check for return value, found by code inspection. Steven, Can you please take a look at these patches? Thanks, Naveen
[PATCH 2/2] ftrace: Check for successful allocation of hash
In register_ftrace_function_probe(), we are not checking the return value of alloc_and_copy_ftrace_hash(). The subsequent call to ftrace_match_records() may end up dereferencing the same. Add a check to ensure this doesn't happen. Signed-off-by: Naveen N. Rao --- kernel/trace/ftrace.c | 5 + 1 file changed, 5 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0791eafb693d..0d5f7d4a4936 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4328,6 +4328,11 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, old_hash = *orig_hash; hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + if (!hash) { + ret = -ENOMEM; + goto out; + } + ret = ftrace_match_records(hash, glob, strlen(glob)); /* Nothing found? */ -- 2.22.0
[PATCH 1/2] ftrace: Fix NULL pointer dereference in t_probe_next()
LTP testsuite on powerpc results in the below crash: Unable to handle kernel paging request for data at address 0x Faulting instruction address: 0xc029d800 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA PowerNV ... CPU: 68 PID: 96584 Comm: cat Kdump: loaded Tainted: GW NIP: c029d800 LR: c029dac4 CTR: c01e6ad0 REGS: c0002017fae8ba10 TRAP: 0300 Tainted: GW MSR: 90009033 CR: 28022422 XER: 2004 CFAR: c029d90c DAR: DSISR: 4000 IRQMASK: 0 ... NIP [c029d800] t_probe_next+0x60/0x180 LR [c029dac4] t_mod_start+0x1a4/0x1f0 Call Trace: [c0002017fae8bc90] [c0cdbc40] _cond_resched+0x10/0xb0 (unreliable) [c0002017fae8bce0] [c02a15b0] t_start+0xf0/0x1c0 [c0002017fae8bd30] [c04ec2b4] seq_read+0x184/0x640 [c0002017fae8bdd0] [c04a57bc] sys_read+0x10c/0x300 [c0002017fae8be30] [c000b388] system_call+0x5c/0x70 The test (ftrace_set_ftrace_filter.sh) is part of ftrace stress tests and the crash happens when the test does 'cat $TRACING_PATH/set_ftrace_filter'. The address points to the second line below, in t_probe_next(), where filter_hash is dereferenced: hash = iter->probe->ops.func_hash->filter_hash; size = 1 << hash->size_bits; This happens due to a race with register_ftrace_function_probe(). A new ftrace_func_probe is created and added into the func_probes list in trace_array under ftrace_lock. However, before initializing the filter, we drop ftrace_lock, and re-acquire it after acquiring regex_lock. If another process is trying to read set_ftrace_filter, it will be able to acquire ftrace_lock during this window and it will end up seeing a NULL filter_hash. Fix this by just checking for a NULL filter_hash in t_probe_next(). If the filter_hash is NULL, then this probe is just being added and we can simply return from here. Signed-off-by: Naveen N. 
Rao --- kernel/trace/ftrace.c | 4 1 file changed, 4 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b037295a1f1..0791eafb693d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3093,6 +3093,10 @@ t_probe_next(struct seq_file *m, loff_t *pos) hnd = &iter->probe_entry->hlist; hash = iter->probe->ops.func_hash->filter_hash; + + if (!hash) + return NULL; + size = 1 << hash->size_bits; retry: -- 2.22.0
[PATCH 0/2] ftrace: two fixes with func_probes handling
Two patches addressing bugs in ftrace function probe handling. The first patch addresses a NULL pointer dereference reported by LTP tests, while the second one is a trivial patch to address a missing check for return value, found by code inspection. - Naveen Naveen N. Rao (2): ftrace: Fix NULL pointer dereference in t_probe_next() ftrace: Check for successful allocation of hash kernel/trace/ftrace.c | 9 + 1 file changed, 9 insertions(+) -- 2.22.0
Re: [PATCH v2 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
Steven Rostedt wrote: On Thu, 27 Jun 2019 20:58:20 +0530 "Naveen N. Rao" wrote: > But interesting, I don't see a synchronize_rcu_tasks() call > there. We felt we don't need it in this case. We patch the branch to ftrace with a nop first. Other cpus should see that first. But, now that I think about it, should we add a memory barrier to ensure the writes get ordered properly? Do you send an ipi to the other CPUs. I would just to be safe. We are handling this through ftrace_replace_code() and __ftrace_make_call_prep() below. For FTRACE_UPDATE_MAKE_CALL, we patch in the mflr, followed by smp_call_function(isync) and synchronize_rcu_tasks() before we proceed to patch the branch to ftrace. I don't see any other scenario where we end up in __ftrace_make_nop_kernel() without going through ftrace_replace_code(). For kernel modules, this can happen during module load/init and so, I patch out both instructions in __ftrace_make_call() above without any synchronization. Am I missing anything? No, I think I got confused ;-), it's the patch out that I was worried about, but when I was going through the scenario, I somehow turned it into the patching in (which I already audited :-p). I was going to reply with just the top part of this email, but then the confusion started :-/ OK, yes, patching out should be fine, and you already covered the patching in. Sorry for the noise. Just to confirm and totally remove the confusion, the patch does: : mflrr0 <-- preempt here bl _mcount : mflrr0 nop And this is fine regardless. OK, Reviewed-by: Steven Rostedt (VMware) Thanks for confirming! We do need an IPI to be sure, as you pointed out above. I will have the patching out take the same path to simplify things. - Naveen
Re: [PATCH v2 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
Hi Steven, Thanks for the review! Steven Rostedt wrote: On Thu, 27 Jun 2019 16:53:52 +0530 "Naveen N. Rao" wrote: With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to enable function tracing and profiling. So far, with dynamic ftrace, we used to only patch out the branch to _mcount(). However, mflr is executed by the branch unit that can only execute one per cycle on POWER9 and shared with branches, so it would be nice to avoid it where possible. We cannot simply nop out the mflr either. When enabling function tracing, there can be a race if tracing is enabled when some thread was interrupted after executing a nop'ed out mflr. In this case, the thread would execute the now-patched-in branch to _mcount() without having executed the preceding mflr. To solve this, we now enable function tracing in 2 steps: patch in the mflr instruction, use 'smp_call_function(isync); synchronize_rcu_tasks()' to ensure all existing threads make progress, and then patch in the branch to _mcount(). We override ftrace_replace_code() with a powerpc64 variant for this purpose. You may want to explain that you do the reverse when patching it out. That is, patch out the "bl _mcount" into a nop and then patch out the "mflr r0". Sure. I think we can add: "When disabling function tracing, we can nop out the two instructions without need for any synchronization in between, as long as we nop out the branch to ftrace first. The lone 'mflr r0' is harmless. Finally, with FTRACE_UPDATE_MODIFY_CALL, no changes are needed since we are simply changing where the branch to ftrace goes." But interesting, I don't see a synchronize_rcu_tasks() call there. We felt we don't need it in this case. We patch the branch to ftrace with a nop first. Other cpus should see that first. But, now that I think about it, should we add a memory barrier to ensure the writes get ordered properly? Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. 
Rao --- arch/powerpc/kernel/trace/ftrace.c | 258 ++--- 1 file changed, 236 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 517662a56bdc..86c2b50dcaa9 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -125,7 +125,7 @@ __ftrace_make_nop(struct module *mod, { unsigned long entry, ptr, tramp; unsigned long ip = rec->ip; - unsigned int op, pop; + unsigned int op; /* read where this goes */ if (probe_kernel_read(&op, (void *)ip, sizeof(int))) { @@ -160,8 +160,6 @@ __ftrace_make_nop(struct module *mod, #ifdef CONFIG_MPROFILE_KERNEL /* When using -mkernel_profile there is no load to jump over */ - pop = PPC_INST_NOP; - if (probe_kernel_read(&op, (void *)(ip - 4), 4)) { pr_err("Fetching instruction at %lx failed.\n", ip - 4); return -EFAULT; @@ -169,26 +167,23 @@ __ftrace_make_nop(struct module *mod, /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) { - pr_err("Unexpected instruction %08x around bl _mcount\n", op); + pr_err("Unexpected instruction %08x before bl _mcount\n", op); return -EINVAL; } -#else - /* -* Our original call site looks like: -* -* bl -* ld r2,XX(r1) -* -* Milton Miller pointed out that we can not simply nop the branch. -* If a task was preempted when calling a trace function, the nops -* will remove the way to restore the TOC in r2 and the r2 TOC will -* get corrupted. -* -* Use a b +8 to jump over the load. -*/ - pop = PPC_INST_BRANCH | 8; /* b +8 */ + /* We should patch out the bl to _mcount first */ + if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } + /* then, nop out the preceding 'mflr r0' as an optimization */ + if (op == PPC_INST_MFLR && + patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } +#else /* * Check what is in the next instruction. 
We can see ld r2,40(r1), but * on first pass after boot we will see mflr r0. @@ -202,12 +197,25 @@ __ftrace_make_nop(struct module *mod, pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op); return -EINVAL; } -#endif /* CONFIG_MPROFILE_KERNEL */ - if (patch_instruction((unsigned int *)ip, pop)) { + /* +* Our
Re: [PATCH v2 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
Naveen N. Rao wrote: With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to enable function tracing and profiling. So far, with dynamic ftrace, we used to only patch out the branch to _mcount(). However, mflr is executed by the branch unit that can only execute one per cycle on POWER9 and shared with branches, so it would be nice to avoid it where possible. We cannot simply nop out the mflr either. When enabling function tracing, there can be a race if tracing is enabled when some thread was interrupted after executing a nop'ed out mflr. In this case, the thread would execute the now-patched-in branch to _mcount() without having executed the preceding mflr. To solve this, we now enable function tracing in 2 steps: patch in the mflr instruction, use 'smp_call_function(isync); synchronize_rcu_tasks()' to ensure all existing threads make progress, and then patch in the branch to _mcount(). We override ftrace_replace_code() with a powerpc64 variant for this purpose. Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/trace/ftrace.c | 258 ++--- 1 file changed, 236 insertions(+), 22 deletions(-) I missed adding a comment here to explain the changes. As discussed in the previous series, I think we are ok with this patch from a CMODX perspective. For smp_call_function(), I decided to have it included in this patch since we know that we need it here for sure. I am not entirely sure we want to do that in patch_instruction() since ftrace doesn't seem to need it elsewhere. As Nick Piggin pointed out, we may want to have users of patch_instruction() (kprobes) add the necessary synchronization. Thanks, Naveen
Re: [PATCH v2 2/7] x86/ftrace: Fix use of flags in ftrace_replace_code()
Steven Rostedt wrote: On Thu, 27 Jun 2019 16:53:50 +0530 "Naveen N. Rao" wrote: In commit a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable), the generic ftrace_replace_code() function was modified to accept a flags argument in place of a single 'enable' flag. However, the x86 version of this function was not updated. Fix the same. Fixes: a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable") I don't mind this change, but it's not a bug, and I'm not sure it should have the fixes tag. The reason being, the FTRACE_MODIFY_ENABLE_FL is only set when ftrace is called by with the command flag FTRACE_MAY_SLEEP, which is never done on x86. I guess you meant to say that *FTRACE_MODIFY_MAY_SLEEP_FL* is only set with FTRACE_MAY_SLEEP. That said, I'm fine with the change as it makes it more robust, but by adding the fixes tag, you're going to get this into all the stable code, and I'm not sure that's really necessary. Agreed. Thanks for pointing this out. We can drop this patch from this series and I will re-post this as a simpler cleanup later on. Thanks, Naveen
Re: [PATCH v2 2/7] x86/ftrace: Fix use of flags in ftrace_replace_code()
Naveen N. Rao wrote: In commit a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable), the generic ftrace_replace_code() function was modified to accept a flags argument in place of a single 'enable' flag. However, the x86 version of this function was not updated. Fix the same. Fixes: a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable") Signed-off-by: Naveen N. Rao --- arch/x86/kernel/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) If the rest of this series is ok, and if Ingo and Steven are ok to have this series go through the powerpc tree, then I can re-post this particular patch for x86 after -rc1. Thanks, Naveen
[PATCH v2 6/7] kprobes/ftrace: Use ftrace_location() when [dis]arming probes
Ftrace location could include more than a single instruction in case of some architectures (powerpc64, for now). In this case, kprobe is permitted on any of those instructions, and uses ftrace infrastructure for functioning. However, [dis]arm_kprobe_ftrace() uses the kprobe address when setting up ftrace filter IP. This won't work if the address points to any instruction apart from the one that has a branch to _mcount(). To resolve this, have [dis]arm_kprobe_ftrace() use ftrace_location() to identify the filter IP. Acked-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao --- kernel/kprobes.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 445337c107e0..282ee704e2d8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -978,10 +978,10 @@ static int prepare_kprobe(struct kprobe *p) /* Caller must lock kprobe_mutex */ static int arm_kprobe_ftrace(struct kprobe *p) { + unsigned long ftrace_ip = ftrace_location((unsigned long)p->addr); int ret = 0; - ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, - (unsigned long)p->addr, 0, 0); + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, ftrace_ip, 0, 0); if (ret) { pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n", p->addr, ret); @@ -1005,13 +1005,14 @@ static int arm_kprobe_ftrace(struct kprobe *p) * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental * empty filter_hash which would undesirably trace all functions. 
*/ - ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); + ftrace_set_filter_ip(&kprobe_ftrace_ops, ftrace_ip, 1, 0); return ret; } /* Caller must lock kprobe_mutex */ static int disarm_kprobe_ftrace(struct kprobe *p) { + unsigned long ftrace_ip = ftrace_location((unsigned long)p->addr); int ret = 0; if (kprobe_ftrace_enabled == 1) { @@ -1022,8 +1023,7 @@ static int disarm_kprobe_ftrace(struct kprobe *p) kprobe_ftrace_enabled--; - ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, - (unsigned long)p->addr, 1, 0); + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, ftrace_ip, 1, 0); WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n", p->addr, ret); return ret; -- 2.22.0
[PATCH v2 5/7] ftrace: Update ftrace_location() for powerpc -mprofile-kernel
Now that we are patching the preceding 'mflr r0' instruction with -mprofile-kernel, we need to update ftrace_location() to recognise that as being part of ftrace. To do this, we introduce FTRACE_IP_EXTENSION to denote the length (in bytes) of the mcount caller. By default, this is set to 0. For powerpc with CONFIG_MPROFILE_KERNEL, we set this to MCOUNT_INSN_SIZE so that this works if ftrace_location() is called with the address of the actual ftrace entry call, or with the address of the preceding 'mflr r0' instruction. Note that we do not check if the preceding instruction is indeed the 'mflr r0'. Earlier -mprofile-kernel ABI included a 'std r0,stack' instruction between the 'mflr r0' and the 'bl _mcount'. This is harmless as the 'std r0,stack' instruction is inconsequential and is not relied upon. Suggested-by: Steven Rostedt (VMware) Signed-off-by: Naveen N. Rao --- arch/powerpc/include/asm/ftrace.h | 8 include/linux/ftrace.h| 9 + kernel/trace/ftrace.c | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 3dfb80b86561..510a8ac8ac8d 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -61,6 +61,14 @@ struct dyn_arch_ftrace { #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS #define ARCH_SUPPORTS_FTRACE_OPS 1 + +#ifdef CONFIG_MPROFILE_KERNEL +/* + * We consider two instructions -- 'mflr r0', 'bl _mcount' -- to be part + * of ftrace with -mprofile-kernel + */ +#define FTRACE_IP_EXTENSIONMCOUNT_INSN_SIZE +#endif #endif #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fa653a561da5..310b514438cb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -29,6 +29,15 @@ #define ARCH_SUPPORTS_FTRACE_OPS 0 #endif +/* + * This denotes the number of instructions (in bytes) that is used by the + * arch mcount caller. All instructions in this range will be owned by + * ftrace. 
+ */ +#ifndef FTRACE_IP_EXTENSION +#define FTRACE_IP_EXTENSION 0 +#endif + /* * If the arch's mcount caller does not support all of ftrace's * features, then it must call an indirect function that diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 21d8e201ee80..308555925b81 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1575,7 +1575,7 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) */ unsigned long ftrace_location(unsigned long ip) { - return ftrace_location_range(ip, ip); + return ftrace_location_range(ip, ip + FTRACE_IP_EXTENSION); } /** -- 2.22.0
[PATCH v2 2/7] x86/ftrace: Fix use of flags in ftrace_replace_code()
In commit a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable"), the generic ftrace_replace_code() function was modified to accept a flags argument in place of a single 'enable' flag. However, the x86 version of this function was not updated. Fix the same. Fixes: a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable") Signed-off-by: Naveen N. Rao --- arch/x86/kernel/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 0927bb158ffc..f34005a17051 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -573,8 +573,9 @@ static void run_sync(void) local_irq_disable(); } -void ftrace_replace_code(int enable) +void ftrace_replace_code(int mod_flags) { + int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; struct ftrace_rec_iter *iter; struct dyn_ftrace *rec; const char *report = "adding breakpoints"; -- 2.22.0
[PATCH v2 0/7] powerpc/ftrace: Patch out -mprofile-kernel instructions
Changes since v1 (https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=114556): - Patches 1,2,3 and 6: No changes - Patch 4: Add smp_call_function() to flush icache on all cpus after patching in the 'mflr r0' instruction. - Patch 5: Changes as per Steven Rostedt's suggestions. - Patch 7: Changes as per Masami and Joe Perches. -- On powerpc64, -mprofile-kernel results in two instructions being emitted: 'mflr r0' and 'bl _mcount'. So far, we were only nop'ing out the branch to _mcount(). This series implements an approach to also nop out the preceding mflr. - Naveen Naveen N. Rao (7): ftrace: Expose flags used for ftrace_replace_code() x86/ftrace: Fix use of flags in ftrace_replace_code() ftrace: Expose __ftrace_replace_code() powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel ftrace: Update ftrace_location() for powerpc -mprofile-kernel kprobes/ftrace: Use ftrace_location() when [dis]arming probes powerpc/kprobes: Allow probing on any ftrace address arch/powerpc/include/asm/ftrace.h| 8 + arch/powerpc/kernel/kprobes-ftrace.c | 32 +++- arch/powerpc/kernel/trace/ftrace.c | 258 --- arch/x86/kernel/ftrace.c | 3 +- include/linux/ftrace.h | 15 ++ kernel/kprobes.c | 10 +- kernel/trace/ftrace.c| 15 +- 7 files changed, 302 insertions(+), 39 deletions(-) -- 2.22.0
[PATCH v2 1/7] ftrace: Expose flags used for ftrace_replace_code()
Since ftrace_replace_code() is a __weak function and can be overridden, we need to expose the flags that can be set. So, move the flags enum to the header file. Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Naveen N. Rao --- include/linux/ftrace.h | 5 + kernel/trace/ftrace.c | 5 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 25e2995d4a4c..e97789c95c4e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -162,6 +162,11 @@ enum { FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, }; +enum { + FTRACE_MODIFY_ENABLE_FL = (1 << 0), + FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), +}; + #ifdef CONFIG_DYNAMIC_FTRACE /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 38277af44f5c..5710a6b3edc1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -75,11 +75,6 @@ #define INIT_OPS_HASH(opsname) #endif -enum { - FTRACE_MODIFY_ENABLE_FL = (1 << 0), - FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), -}; - struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, -- 2.22.0
[PATCH v2 7/7] powerpc/kprobes: Allow probing on any ftrace address
With KPROBES_ON_FTRACE, kprobe is allowed to be inserted on instructions that branch to _mcount (referred to as ftrace location). With -mprofile-kernel, we now include the preceding 'mflr r0' as being part of the ftrace location. However, by default, probing on an instruction that is not actually the branch to _mcount() is prohibited, as that is considered to not be at an instruction boundary. This is not the case on powerpc, so allow the same by overriding arch_check_ftrace_location(). In addition, we update kprobe_ftrace_handler() to detect these scenarios and to pass the proper nip to the pre and post probe handlers. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/kprobes-ftrace.c | 32 +++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 972cb28174b2..23c840748183 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -12,15 +12,35 @@ #include #include +/* + * With -mprofile-kernel, we patch two instructions -- the branch to _mcount + * as well as the preceding 'mflr r0'. Both these instructions are claimed + * by ftrace and we should allow probing on either instruction. 
+ */ +int arch_check_ftrace_location(struct kprobe *p) +{ + if (ftrace_location((unsigned long)p->addr)) + p->flags |= KPROBE_FLAG_FTRACE; + return 0; +} + /* Ftrace callback handler for kprobes */ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct ftrace_ops *ops, struct pt_regs *regs) { struct kprobe *p; + int mflr_kprobe = 0; struct kprobe_ctlblk *kcb; p = get_kprobe((kprobe_opcode_t *)nip); - if (unlikely(!p) || kprobe_disabled(p)) + if (!p) { + p = get_kprobe((kprobe_opcode_t *)(nip - MCOUNT_INSN_SIZE)); + if (unlikely(!p)) + return; + mflr_kprobe = 1; + } + + if (kprobe_disabled(p)) return; kcb = get_kprobe_ctlblk(); @@ -33,6 +53,9 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, */ regs->nip -= MCOUNT_INSN_SIZE; + if (mflr_kprobe) + regs->nip -= MCOUNT_INSN_SIZE; + __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) { @@ -45,6 +68,8 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } + if (mflr_kprobe) + regs->nip += MCOUNT_INSN_SIZE; } /* * If pre_handler returns !0, it changes regs->nip. We have to @@ -57,6 +82,11 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler); int arch_prepare_kprobe_ftrace(struct kprobe *p) { + if ((unsigned long)p->addr & 0x03) { + pr_err("Attempt to register kprobe at an unaligned address\n"); + return -EILSEQ; + } + p->ainsn.insn = NULL; p->ainsn.boostable = -1; return 0; -- 2.22.0
[PATCH v2 4/7] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to enable function tracing and profiling. So far, with dynamic ftrace, we used to only patch out the branch to _mcount(). However, mflr is executed by the branch unit that can only execute one per cycle on POWER9 and shared with branches, so it would be nice to avoid it where possible. We cannot simply nop out the mflr either. When enabling function tracing, there can be a race if tracing is enabled when some thread was interrupted after executing a nop'ed out mflr. In this case, the thread would execute the now-patched-in branch to _mcount() without having executed the preceding mflr. To solve this, we now enable function tracing in 2 steps: patch in the mflr instruction, use 'smp_call_function(isync); synchronize_rcu_tasks()' to ensure all existing threads make progress, and then patch in the branch to _mcount(). We override ftrace_replace_code() with a powerpc64 variant for this purpose. Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. 
Rao --- arch/powerpc/kernel/trace/ftrace.c | 258 ++--- 1 file changed, 236 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 517662a56bdc..86c2b50dcaa9 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -125,7 +125,7 @@ __ftrace_make_nop(struct module *mod, { unsigned long entry, ptr, tramp; unsigned long ip = rec->ip; - unsigned int op, pop; + unsigned int op; /* read where this goes */ if (probe_kernel_read(&op, (void *)ip, sizeof(int))) { @@ -160,8 +160,6 @@ __ftrace_make_nop(struct module *mod, #ifdef CONFIG_MPROFILE_KERNEL /* When using -mkernel_profile there is no load to jump over */ - pop = PPC_INST_NOP; - if (probe_kernel_read(&op, (void *)(ip - 4), 4)) { pr_err("Fetching instruction at %lx failed.\n", ip - 4); return -EFAULT; @@ -169,26 +167,23 @@ __ftrace_make_nop(struct module *mod, /* We expect either a mflr r0, or a std r0, LRSAVE(r1) */ if (op != PPC_INST_MFLR && op != PPC_INST_STD_LR) { - pr_err("Unexpected instruction %08x around bl _mcount\n", op); + pr_err("Unexpected instruction %08x before bl _mcount\n", op); return -EINVAL; } -#else - /* -* Our original call site looks like: -* -* bl -* ld r2,XX(r1) -* -* Milton Miller pointed out that we can not simply nop the branch. -* If a task was preempted when calling a trace function, the nops -* will remove the way to restore the TOC in r2 and the r2 TOC will -* get corrupted. -* -* Use a b +8 to jump over the load. -*/ - pop = PPC_INST_BRANCH | 8; /* b +8 */ + /* We should patch out the bl to _mcount first */ + if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } + /* then, nop out the preceding 'mflr r0' as an optimization */ + if (op == PPC_INST_MFLR && + patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP)) { + pr_err("Patching NOP failed.\n"); + return -EPERM; + } +#else /* * Check what is in the next instruction. 
We can see ld r2,40(r1), but * on first pass after boot we will see mflr r0. @@ -202,12 +197,25 @@ __ftrace_make_nop(struct module *mod, pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op); return -EINVAL; } -#endif /* CONFIG_MPROFILE_KERNEL */ - if (patch_instruction((unsigned int *)ip, pop)) { + /* +* Our original call site looks like: +* +* bl +* ld r2,XX(r1) +* +* Milton Miller pointed out that we can not simply nop the branch. +* If a task was preempted when calling a trace function, the nops +* will remove the way to restore the TOC in r2 and the r2 TOC will +* get corrupted. +* +* Use a b +8 to jump over the load. +*/ + if (patch_instruction((unsigned int *)ip, PPC_INST_BRANCH | 8)) { pr_err("Patching NOP failed.\n"); return -EPERM; } +#endif /* CONFIG_MPROFILE_KERNEL */ return 0; } @@ -421,6 +429,26 @@ static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr) return -EPERM; } +#ifdef CONFIG_MPROFILE_KERNEL + /* Nop out the preceding 'mflr r0' as an optimization */ + if (probe_kernel_read(&op, (void *)(ip - 4), 4)) { +
[PATCH v2 3/7] ftrace: Expose __ftrace_replace_code()
While over-riding ftrace_replace_code(), we still want to reuse the existing __ftrace_replace_code() function. Rename the function and make it available for other kernel code. Signed-off-by: Naveen N. Rao --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 8 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e97789c95c4e..fa653a561da5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -456,6 +456,7 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void); +extern int ftrace_replace_code_rec(struct dyn_ftrace *rec, int enable); extern void ftrace_replace_code(int enable); extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5710a6b3edc1..21d8e201ee80 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2351,8 +2351,8 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) return (unsigned long)FTRACE_ADDR; } -static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +int +ftrace_replace_code_rec(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_old_addr; unsigned long ftrace_addr; @@ -2403,7 +2403,7 @@ void __weak ftrace_replace_code(int mod_flags) if (rec->flags & FTRACE_FL_DISABLED) continue; - failed = __ftrace_replace_code(rec, enable); + failed = ftrace_replace_code_rec(rec, enable); if (failed) { ftrace_bug(failed, rec); /* Stop processing */ @@ -5827,7 +5827,7 @@ void ftrace_module_enable(struct module *mod) rec->flags = cnt; if (ftrace_start_up && cnt) { - int failed = __ftrace_replace_code(rec, 1); + int failed = ftrace_replace_code_rec(rec, 1); if (failed) { ftrace_bug(failed, rec); goto out_loop; -- 2.22.0
[PATCH] recordmcount: Fix spurious mcount entries on powerpc
The recent change enabling HAVE_C_RECORDMCOUNT on powerpc started showing the following issue: # modprobe kprobe_example ftrace-powerpc: Not expected bl: opcode is 3c4c0001 WARNING: CPU: 0 PID: 227 at kernel/trace/ftrace.c:2001 ftrace_bug+0x90/0x318 Modules linked in: CPU: 0 PID: 227 Comm: modprobe Not tainted 5.2.0-rc6-00678-g1c329100b942 #2 NIP: c0264318 LR: c025d694 CTR: c0f5cd30 REGS: c1f2b7b0 TRAP: 0700 Not tainted (5.2.0-rc6-00678-g1c329100b942) MSR: 90010282b033 CR: 28228222 XER: CFAR: c02642fc IRQMASK: 0 NIP [c0264318] ftrace_bug+0x90/0x318 LR [c025d694] ftrace_process_locs+0x4f4/0x5e0 Call Trace: [c1f2ba40] [0004] 0x4 (unreliable) [c1f2bad0] [c025d694] ftrace_process_locs+0x4f4/0x5e0 [c1f2bb90] [c020ff10] load_module+0x25b0/0x30c0 [c1f2bd00] [c0210cb0] sys_finit_module+0xc0/0x130 [c1f2be20] [c000bda4] system_call+0x5c/0x70 Instruction dump: 419e0018 2f83 419e00bc 2f83ffea 409e00cc 481c 0fe0 3c62ff96 3901 3940 386386d0 48c4 <0fe0> 3ce20003 3901 3c62ff96 ---[ end trace 4c438d5cebf78381 ]--- ftrace failed to modify [] 0xc008012a0008 actual: 01:00:4c:3c Initializing ftrace call sites ftrace record flags: 200 (0) expected tramp: c006af4c Looking at the relocation records in __mcount_loc showed a few spurious entries: RELOCATION RECORDS FOR [__mcount_loc]: OFFSET TYPE VALUE R_PPC64_ADDR64.text.unlikely+0x0008 0008 R_PPC64_ADDR64.text.unlikely+0x0014 0010 R_PPC64_ADDR64.text.unlikely+0x0060 0018 R_PPC64_ADDR64.text.unlikely+0x00b4 0020 R_PPC64_ADDR64.init.text+0x0008 0028 R_PPC64_ADDR64.init.text+0x0014 The first entry in each section is incorrect. Looking at the relocation records, the spurious entries correspond to the R_PPC64_ENTRY records: RELOCATION RECORDS FOR [.text.unlikely]: OFFSET TYPE VALUE R_PPC64_REL64 .TOC.-0x0008 0008 R_PPC64_ENTRY *ABS* 0014 R_PPC64_REL24 _mcount The problem is that we are not validating the return value from get_mcountsym() in sift_rel_mcount(). With this entry, mcountsym is 0, but Elf_r_sym(relp) also ends up being 0. 
Fix this by ensuring mcountsym is valid before processing the entry. Fixes: c7d64b560ce80 ("powerpc/ftrace: Enable C Version of recordmcount") Signed-off-by: Naveen N. Rao --- scripts/recordmcount.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index 13c5e6c8829c..47fca2c69a73 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -325,7 +325,8 @@ static uint_t *sift_rel_mcount(uint_t *mlocp, if (!mcountsym) mcountsym = get_mcountsym(sym0, relp, str0); - if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) { + if (mcountsym && mcountsym == Elf_r_sym(relp) && + !is_fake_mcount(relp)) { uint_t const addend = _w(_w(relp->r_offset) - recval + mcount_adjust); mrelp->r_offset = _w(offbase -- 2.22.0
Re: [PATCH 7/7] powerpc/kprobes: Allow probing on any ftrace address
Masami Hiramatsu wrote: On Tue, 18 Jun 2019 20:17:06 +0530 "Naveen N. Rao" wrote: With KPROBES_ON_FTRACE, kprobe is allowed to be inserted on instructions that branch to _mcount (referred to as ftrace location). With -mprofile-kernel, we now include the preceding 'mflr r0' as being part of the ftrace location. However, by default, probing on an instruction that is not actually the branch to _mcount() is prohibited, as that is considered to not be at an instruction boundary. This is not the case on powerpc, so allow the same by overriding arch_check_ftrace_location() In addition, we update kprobe_ftrace_handler() to detect this scenarios and to pass the proper nip to the pre and post probe handlers. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/kprobes-ftrace.c | 30 1 file changed, 30 insertions(+) diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 972cb28174b2..6a0bd3c16cb6 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -12,14 +12,34 @@ #include #include +/* + * With -mprofile-kernel, we patch two instructions -- the branch to _mcount + * as well as the preceding 'mflr r0'. Both these instructions are claimed + * by ftrace and we should allow probing on either instruction. + */ +int arch_check_ftrace_location(struct kprobe *p) +{ + if (ftrace_location((unsigned long)p->addr)) + p->flags |= KPROBE_FLAG_FTRACE; + return 0; +} + /* Ftrace callback handler for kprobes */ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct ftrace_ops *ops, struct pt_regs *regs) { struct kprobe *p; + int mflr_kprobe = 0; struct kprobe_ctlblk *kcb; p = get_kprobe((kprobe_opcode_t *)nip); + if (unlikely(!p)) { Hmm, is this really unlikely? If we put a kprobe on the second instruction address, we will see p == NULL always. 
+ p = get_kprobe((kprobe_opcode_t *)(nip - MCOUNT_INSN_SIZE)); + if (!p) Here will be unlikely, because we can not find kprobe at both of nip and nip - MCOUNT_INSN_SIZE. + return; + mflr_kprobe = 1; + } + if (unlikely(!p) || kprobe_disabled(p)) "unlikely(!p)" is not needed here. ... Joe Perches wrote: On Fri, 2019-06-21 at 23:50 +0900, Masami Hiramatsu wrote: On Tue, 18 Jun 2019 20:17:06 +0530 "Naveen N. Rao" wrote: trivia: > diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c [] > @@ -57,6 +82,11 @@ NOKPROBE_SYMBOL(kprobe_ftrace_handler); > > int arch_prepare_kprobe_ftrace(struct kprobe *p) > { > + if ((unsigned long)p->addr & 0x03) { > + printk("Attempt to register kprobe at an unaligned address\n"); Please use the appropriate KERN_ or pr_ All good points. Thanks for the review. - Naveen