[PATCH v2] x86, perf: Fix arch_perf_out_copy_user and copy_from_user_nmi return values
All architectures except x86 use __copy_from_user_inatomic to provide arch_perf_out_copy_user; like the other copy_from routines, it returns the number of bytes not copied. perf was expecting the number of bytes that had been copied. This change corrects that, and thereby allows PERF_SAMPLE_STACK_USER to be enabled on non-x86 architectures. x86 uses copy_from_user_nmi, which deviates from the other copy_from routines by returning the number of bytes copied. This change therefore also reverses copy_from_user_nmi's return value, so that the perf change doesn't break user stack copies on x86, and to help prevent bugs caused by this surprising difference (and simplify callers, which mostly want to know if the number of uncopied bytes is nonzero). Signed-off-by: Jed Davis --- arch/x86/kernel/cpu/perf_event.c | 8 ++-- arch/x86/kernel/cpu/perf_event_intel_ds.c | 6 ++ arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4 +--- arch/x86/lib/usercopy.c| 2 +- arch/x86/oprofile/backtrace.c | 8 ++-- kernel/events/internal.h | 9 + 6 files changed, 13 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index a7c7305..038c18c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1983,12 +1983,10 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = compat_ptr(ss_base + regs->bp); while (entry->nr < PERF_MAX_STACK_DEPTH) { - unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; - bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); - if (bytes != sizeof(frame)) + if (copy_from_user_nmi(&frame, fp, sizeof(frame))) break; if (!valid_user_frame(fp, sizeof(frame))) @@ -2035,12 +2033,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; while (entry->nr < PERF_MAX_STACK_DEPTH) { - unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; - bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); - if 
(bytes != sizeof(frame)) + if (copy_from_user_nmi(&frame, fp, sizeof(frame))) break; if (!valid_user_frame(fp, sizeof(frame))) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 3065c57..5208fe1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -729,10 +729,8 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) old_to = to; if (!kernel_ip(ip)) { - int bytes, size = MAX_INSN_SIZE; - - bytes = copy_from_user_nmi(buf, (void __user *)to, size); - if (bytes != size) + if (copy_from_user_nmi(buf, (void __user *)to, + MAX_INSN_SIZE)) return 0; kaddr = buf; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d5be06a..2833514 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -449,7 +449,6 @@ static int branch_type(unsigned long from, unsigned long to, int abort) { struct insn insn; void *addr; - int bytes, size = MAX_INSN_SIZE; int ret = X86_BR_NONE; int ext, to_plm, from_plm; u8 buf[MAX_INSN_SIZE]; @@ -477,8 +476,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) return X86_BR_NONE; /* may fail if text not present */ - bytes = copy_from_user_nmi(buf, (void __user *)from, size); - if (bytes != size) + if (copy_from_user_nmi(buf, (void __user *)from, MAX_INSN_SIZE)) return X86_BR_NONE; addr = buf; diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index 4f74d94..7a13c98 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -44,6 +44,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) } while (len < n); - return len; + return n - len; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index d6aa6e8..e778d41 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -44,10 +44,8 @@ 
dump_user_backtrace_32(struct stack_frame_ia32 *head) /* Also check accessibility of one struct frame_head beyond */
[PATCH 2/2] x86: Fix copy_from_user_nmi return to match copy_from_user.
copy_from_user returns the number of bytes not copied. This change makes copy_from_user_nmi behave the same way, instead of returning the number of bytes that were copied, to help prevent bugs caused by this surprising difference (and simplify callers, which mostly want to know if the number of uncopied bytes is nonzero). Signed-off-by: Jed Davis --- arch/x86/include/asm/perf_event.h | 9 + arch/x86/kernel/cpu/perf_event.c | 8 ++-- arch/x86/kernel/cpu/perf_event_intel_ds.c | 6 ++ arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4 +--- arch/x86/lib/usercopy.c| 2 +- arch/x86/oprofile/backtrace.c | 8 ++-- 6 files changed, 9 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index ddae5bd..8249df4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -274,13 +274,6 @@ static inline void perf_check_microcode(void) { } static inline void amd_pmu_disable_virt(void) { } #endif -static inline unsigned long copy_from_user_nmi_for_perf(void *to, - const void __user *from, - unsigned long n) -{ - return n - copy_from_user_nmi(to, from, n); -} - -#define arch_perf_out_copy_user copy_from_user_nmi_for_perf +#define arch_perf_out_copy_user copy_from_user_nmi #endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index a7c7305..038c18c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1983,12 +1983,10 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = compat_ptr(ss_base + regs->bp); while (entry->nr < PERF_MAX_STACK_DEPTH) { - unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; - bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); - if (bytes != sizeof(frame)) + if (copy_from_user_nmi(&frame, fp, sizeof(frame))) break; if (!valid_user_frame(fp, sizeof(frame))) @@ -2035,12 +2033,10 @@ perf_callchain_user(struct perf_callchain_entry 
*entry, struct pt_regs *regs) return; while (entry->nr < PERF_MAX_STACK_DEPTH) { - unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; - bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); - if (bytes != sizeof(frame)) + if (copy_from_user_nmi(&frame, fp, sizeof(frame))) break; if (!valid_user_frame(fp, sizeof(frame))) diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 3065c57..5208fe1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -729,10 +729,8 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) old_to = to; if (!kernel_ip(ip)) { - int bytes, size = MAX_INSN_SIZE; - - bytes = copy_from_user_nmi(buf, (void __user *)to, size); - if (bytes != size) + if (copy_from_user_nmi(buf, (void __user *)to, + MAX_INSN_SIZE)) return 0; kaddr = buf; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d5be06a..2833514 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -449,7 +449,6 @@ static int branch_type(unsigned long from, unsigned long to, int abort) { struct insn insn; void *addr; - int bytes, size = MAX_INSN_SIZE; int ret = X86_BR_NONE; int ext, to_plm, from_plm; u8 buf[MAX_INSN_SIZE]; @@ -477,8 +476,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) return X86_BR_NONE; /* may fail if text not present */ - bytes = copy_from_user_nmi(buf, (void __user *)from, size); - if (bytes != size) + if (copy_from_user_nmi(buf, (void __user *)from, MAX_INSN_SIZE)) return X86_BR_NONE; addr = buf; diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index 4f74d94..7a13c98 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -44,6 +44,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) } while (len < n); - return len; + return n - len; } 
EXPORT_SYMBOL_GPL(copy_from_user_nmi);
[PATCH 1/2] perf: Fix handling of arch_perf_out_copy_user return value.
All architectures except x86 use __copy_from_user_inatomic to provide arch_perf_out_copy_user; like the other copy_from routines, it returns the number of bytes not copied. perf was expecting the number of bytes that had been copied. This change corrects that, and thereby allows PERF_SAMPLE_STACK_USER to be enabled on non-x86 architectures. x86 uses copy_from_user_nmi, which deviates from the other copy_from routines by returning the number of bytes copied. (This cancels out the effect of perf being backwards; apparently this code has only ever been tested on x86.) This change therefore adds a second wrapper to re-reverse it for perf; the next patch in this series will clean it up. Signed-off-by: Jed Davis --- arch/x86/include/asm/perf_event.h | 9 - kernel/events/internal.h | 11 ++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8249df4..ddae5bd 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -274,6 +274,13 @@ static inline void perf_check_microcode(void) { } static inline void amd_pmu_disable_virt(void) { } #endif -#define arch_perf_out_copy_user copy_from_user_nmi +static inline unsigned long copy_from_user_nmi_for_perf(void *to, + const void __user *from, + unsigned long n) +{ + return n - copy_from_user_nmi(to, from, n); +} + +#define arch_perf_out_copy_user copy_from_user_nmi_for_perf #endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ca65997..e61b22c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -81,6 +81,7 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } +/* The memcpy_func must return the number of bytes successfully copied. 
*/ #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ static inline unsigned int \ func_name(struct perf_output_handle *handle, \ @@ -122,11 +123,19 @@ DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) +/* arch_perf_out_copy_user must return the number of bytes not copied. */ #ifndef arch_perf_out_copy_user #define arch_perf_out_copy_user __copy_from_user_inatomic #endif -DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) +static inline unsigned long perf_memcpy_from_user(void *to, + const void __user *from, + unsigned long n) +{ + return n - arch_perf_out_copy_user(to, from, n); +} + +DEFINE_OUTPUT_COPY(__output_copy_user, perf_memcpy_from_user) /* Callchain handling */ extern struct perf_callchain_entry * -- 1.8.3.2 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ARM: Fix r7/r11 confusion when CONFIG_THUMB2_KERNEL=y
On Mon, Jul 22, 2013 at 07:52:39PM +0100, Dave Martin wrote: > On Sun, Jul 21, 2013 at 10:37:53PM +0100, Will Deacon wrote: > > Ok, I think I'm with you now. I also think that a better solution would be > > to try and limit the r7/fp confusion to one place, perhaps behind something > > like: > > > > void arm_get_current_stackframe(struct pt_regs *regs, struct stackframe > > *frame); > > > > then that function can act as the bridge between pt_regs (where we leave > > everything as it is) and stackframe (where we assign either r7 or fp into > > the fp member). Then we just fix up the call sites to pass the regs they're > > interested in to our new function. > > > > What do you think? I can see that being useful if we wanted to opacify struct stackframe, but... all uses of stackframe that I see involve passing it to unwind_frame, which expands it back out into an array of registers. (Except with CONFIG_FRAME_POINTER, but "APCS variants that require a frame pointer register are obsolete.") So... why not make pt_regs and stackframe the same, and avoid the translations entirely? > Do the ARM unwind tables guarantee not to need knowledge of any > virtual registers for the purpose of processing the opcodes of a single > function's unwind table entry, except those virtual regs that we happen > to initialise? Or do we just rely on luck? Yes, for some value of "luck". The spec documents 0x90|N, for N a core register number other than 13 or 15, as setting the vSP to the value of virtual register N. We can get away with some omissions for kernel code (e.g., unwind.c doesn't handle saved floating point registers, nor adding constants larger than 1024 to vSP), but is this one of them? [...] > The purist answer seems to be: we might need all the regs in struct > stackframe. > > The pragmatic answer might be that we definitely need r7 for Thumb code, > but given the nimbleness of GCC to evolve we might get away with not > including extra registers for a long time yet. 
The other question to ask here might be: what does the "pragmatic answer" gain us over the "purist answer"? > A review of existing custom unwind annotations might be a good idea > anyway, to check whether any of them requires another register right now. `egrep -r '\.(setf|movs)p' arch/arm` finds nothing, for what it's worth. --Jed -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ARM: Fix r7/r11 confusion when CONFIG_THUMB2_KERNEL=y
On Mon, Jul 15, 2013 at 02:54:20PM +0100, Will Deacon wrote: > On Sat, Jul 13, 2013 at 04:18:20AM +0100, Jed Davis wrote: [...] > > Effects of this are probably limited to failure of EHABI unwinding when > > starting from a function that uses r7 to restore its stack pointer, but > > the possibility for further breakage (which would be invisible on > > non-Thumb kernels) is worrying. [...] > I'm struggling to understand exactly the problem that this patch is trying > to address. If it's just a code consistency issue, I don't think it's worth > it (I actually find it less confusing the way we currently have things) but > if there is a real bug, perhaps you could provide a testcase? There is a real bug here, but my commit message wasn't very clear. This was breaking PERF_COUNT_SW_CONTEXT_SWITCHES with CONFIG_THUMB2_KERNEL=y (with my other recently posted patch applied), because kernel/sched.c is built with -fno-omit-frame-pointer (which is wrong, but that's a problem for another patch) and so __schedule's EHABI entry uses 0x97 (mov sp, r7), and somewhere along the line the unwinder gets the r11 value instead. This would also apply to any function with a variable-length array, but __schedule happens to have the perf hook I was trying to use. I should add that this bug doesn't affect me directly at the moment, because we're not currently using CONFIG_THUMB2_KERNEL on Firefox OS, because our kernels are assorted older versions with hardware vendors' changes and there are some issues with it. But I felt it was worth tracking this down before trying to send changes upstream. The "right" thing to do here might be to just include all the registers, or at least {r4-pc}, in struct stackframe. The parts that aren't {fp, sp, lr, pc} could be ifdef'ed if we're concerned enough about the overhead in kernels using APCS frame pointer unwinding. 
I agree that a test case would be good -- I'm more than a little worried of regressions without one -- but I could use some advice on how best to do that. --Jed -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ARM: perf: Implement perf_arch_fetch_caller_regs
On Mon, Jul 15, 2013 at 02:53:42PM +0100, Will Deacon wrote: > On Sat, Jul 13, 2013 at 04:17:14AM +0100, Jed Davis wrote: [...] > > +#ifdef CONFIG_THUMB2_KERNEL > > +#define perf_arch_fetch_caller_regs(regs, ip) > > \ > > + do {\ > > + __u32 _cpsr, _pc; \ > > + __asm__ __volatile__("str r7, [%[_regs], #(7 * 4)]\n\t" \ > > +"str r13, [%[_regs], #(13 * 4)]\n\t" \ > > +"str r14, [%[_regs], #(14 * 4)]\n\t" \ > > Is this safe? How can we be sure that the registers haven't been clobbered > by the callee before this macro is expanded? For example, you can end up > with the following code: They might be clobbered, but if they are, then... > 0058 : > 58: e92d 43f0 stmdb sp!, {r4, r5, r6, r7, r8, r9, lr} > 5c: b09bsub sp, #108; 0x6c > 5e: ac08add r4, sp, #32 > 60: 4681mov r9, r0 > 62: 4688mov r8, r1 > 64: 4620mov r0, r4 > 66: 2148movsr1, #72 ; 0x48 > 68: f7ff fffe bl 0 <__memzero> > 6c: 61e7str r7, [r4, #28] > 6e: f8c4 d034 str.w sp, [r4, #52] ; 0x34 > 72: f8c4 e038 str.w lr, [r4, #56] ; 0x38 > > but the call to __memzero will have corrupted the lr. ...the function's unwinding entry will restore them from the stack, and can't assume anything about their values before then. It's the same problem as if the code was interrupted by a profiler timer at that point and we tried unwinding from the trap state. Hopefully. It's hard to be as rigorous as I'd like about this, because the Exception Handling ABI was meant for exception handling, so as specified it only needs to work at points where C++ exceptions can be thrown, as I understand it. Fortunately GCC isn't limited to that, but there are more issues, because -fasynchronous-unwind-tables doesn't actually make things work from *any* instruction as documented (which is not entirely bad, because working correctly would significantly increase the size of .extab/.exidx). All that said, the unwinding program for a function should work at any point after the prologue and before the epilogue. 
> > +"mov %[_pc], r15\n\t" \ > > +"mrs %[_cpsr], cpsr\n\t" \ > > +: [_cpsr] "=r" (_cpsr),\ > > + [_pc] "=r" (_pc) \ > > +: [_regs] "r" (&(regs)->uregs) \ > > It would be cleaner to pass a separate argument for each register, using the > ARM_rN macros rather than calculating the offset by hand. I'll do that. If there were more arguments there might be a problem at -O0, because the naive translation can run out of registers in some cases, but that shouldn't be a problem here. (Nor if someone later extends this to all the core registers, because {r0-r13} can and presumably should use a store-multiple.) Thanks for the quick review, and sorry for the delay in responding. --Jed -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] ARM: Fix r7/r11 confusion when CONFIG_THUMB2_KERNEL=y
There is currently some inconsistency about the "frame pointer" on ARM. r11 is the register with assemblers recognize and disassemblers often print as "fp", and which is sufficient for stack unwinding when using the APCS frame pointer option; but when unwinding with the Exception Handling ABI, the register GCC uses when a constant offset won't suffice (or when -fno-omit-frame-pointer is used; see kernel/sched/Makefile in particular) is r11 on ARM and r7 on Thumb. Correspondingly, arch/arm/include/uapi/arm/ptrace.h defines ARM_fp to refer to r11, but arch/arm/kernel/unwind.c uses "FP" to mean either r11 or r7 depending on Thumbness, and it is unclear what other cases such as the "fp" in struct stackframe should be doing. Effects of this are probably limited to failure of EHABI unwinding when starting from a function that uses r7 to restore its stack pointer, but the possibility for further breakage (which would be invisible on non-Thumb kernels) is worrying. With this change, it is hoped, r7 is consistently referred to as "r7", and "fp" always means r11; this costs a few extra ifdefs, but it should help prevent future issues. 
Signed-off-by: Jed Davis --- arch/arm/include/asm/stacktrace.h |4 arch/arm/include/asm/thread_info.h |2 ++ arch/arm/kernel/perf_event.c |4 arch/arm/kernel/process.c |4 arch/arm/kernel/time.c |4 arch/arm/kernel/unwind.c | 27 ++- arch/arm/oprofile/common.c |4 7 files changed, 48 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/stacktrace.h b/arch/arm/include/asm/stacktrace.h index 4d0a164..5e546bf 100644 --- a/arch/arm/include/asm/stacktrace.h +++ b/arch/arm/include/asm/stacktrace.h @@ -2,7 +2,11 @@ #define __ASM_STACKTRACE_H struct stackframe { +#ifdef CONFIG_THUMB2_KERNEL + unsigned long r7; +#else unsigned long fp; +#endif unsigned long sp; unsigned long lr; unsigned long pc; diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 214d415..ae3cd81 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -105,6 +105,8 @@ static inline struct thread_info *current_thread_info(void) ((unsigned long)(task_thread_info(tsk)->cpu_context.sp)) #define thread_saved_fp(tsk) \ ((unsigned long)(task_thread_info(tsk)->cpu_context.fp)) +#define thread_saved_r7(tsk) \ + ((unsigned long)(task_thread_info(tsk)->cpu_context.r7)) extern void crunch_task_disable(struct thread_info *); extern void crunch_task_copy(struct thread_info *, void *); diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index d9f5cd4..55776d6 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -601,7 +601,11 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) return; } +#ifdef CONFIG_THUMB2_KERNEL + fr.r7 = regs->ARM_r7; +#else fr.fp = regs->ARM_fp; +#endif fr.sp = regs->ARM_sp; fr.lr = regs->ARM_lr; fr.pc = regs->ARM_pc; diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index d3ca4f6..aeb4c28 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -405,7 +405,11 @@ unsigned long get_wchan(struct task_struct *p) if 
(!p || p == current || p->state == TASK_RUNNING) return 0; +#ifdef CONFIG_THUMB2_KERNEL + frame.r7 = thread_saved_r7(p); +#else frame.fp = thread_saved_fp(p); +#endif frame.sp = thread_saved_sp(p); frame.lr = 0; /* recovered from the stack */ frame.pc = thread_saved_pc(p); diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 98aee32..80410d3 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -49,7 +49,11 @@ unsigned long profile_pc(struct pt_regs *regs) if (!in_lock_functions(regs->ARM_pc)) return regs->ARM_pc; +#ifdef CONFIG_THUMB2_KERNEL + frame.r7 = regs->ARM_r7; +#else frame.fp = regs->ARM_fp; +#endif frame.sp = regs->ARM_sp; frame.lr = regs->ARM_lr; frame.pc = regs->ARM_pc; diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c index 00df012..dec03ae 100644 --- a/arch/arm/kernel/unwind.c +++ b/arch/arm/kernel/unwind.c @@ -74,7 +74,7 @@ struct unwind_ctrl_block { enum regs { #ifdef CONFIG_THUMB2_KERNEL - FP = 7, + R7 = 7, #else FP = 11, #endif @@ -317,8 +317,13 @@ static int unwind_exec_insn(struct unwind_ctrl_block *ctrl) return -URC_FAILURE; } +#ifdef CONFIG_THUMB2_KERNEL + pr_debug("%s: r7 = %08lx sp = %08lx lr = %08lx pc = %08lx\n", __func__, +ctrl->vrs[R7], ctrl-
[PATCH] ARM: perf: Implement perf_arch_fetch_caller_regs
We need a perf_arch_fetch_caller_regs for at least some software events to be able to get a callchain; even user stacks won't work without at least the CPSR bits for non-user-mode (see perf_callchain). In particular, profiling context switches needs this. This records the state of the point at which perf_arch_fetch_caller_regs is expanded, instead of that function activation's call site, because we need SP and PC to be consistent for EHABI unwinding; hopefully nothing will be inconvenienced by the extra stack frame. Signed-off-by: Jed Davis --- arch/arm/include/asm/perf_event.h | 43 + 1 file changed, 43 insertions(+) diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h index 7558775..2cc7255 100644 --- a/arch/arm/include/asm/perf_event.h +++ b/arch/arm/include/asm/perf_event.h @@ -12,6 +12,8 @@ #ifndef __ARM_PERF_EVENT_H__ #define __ARM_PERF_EVENT_H__ +#include + /* * The ARMv7 CPU PMU supports up to 32 event counters. */ @@ -28,4 +30,45 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); #define perf_misc_flags(regs) perf_misc_flags(regs) #endif +/* + * We can't actually get the caller's registers here; the saved PC and + * SP values have to be consistent or else EHABI unwinding won't work, + * and the only way to find the matching SP for the return address is + * to unwind the current function. So we save the current state + * instead. + * + * Note that the ARM Exception Handling ABI allows unwinding to depend + * on the contents of any core register, but our unwinder is limited + * to the ones in struct stackframe (which are the only ones we expect + * GCC to need for kernel code), so we just record those. 
+ */ +#ifdef CONFIG_THUMB2_KERNEL +#define perf_arch_fetch_caller_regs(regs, ip) \ + do {\ + __u32 _cpsr, _pc; \ + __asm__ __volatile__("str r7, [%[_regs], #(7 * 4)]\n\t" \ +"str r13, [%[_regs], #(13 * 4)]\n\t" \ +"str r14, [%[_regs], #(14 * 4)]\n\t" \ +"mov %[_pc], r15\n\t" \ +"mrs %[_cpsr], cpsr\n\t" \ +: [_cpsr] "=r" (_cpsr),\ + [_pc] "=r" (_pc) \ +: [_regs] "r" (&(regs)->uregs) \ +: "memory"); \ + (regs)->ARM_pc = _pc; \ + (regs)->ARM_cpsr = _cpsr; \ + } while (0) +#else +#define perf_arch_fetch_caller_regs(regs, ip) \ + do {\ + __u32 _cpsr;\ + __asm__ __volatile__("stmia %[_regs11], {r11 - r15}\n\t" \ +"mrs %[_cpsr], cpsr\n\t" \ +: [_cpsr] "=r" (_cpsr) \ +: [_regs11] "r" (&(regs)->uregs[11]) \ +: "memory"); \ + (regs)->ARM_cpsr = _cpsr; \ + } while (0) +#endif + #endif /* __ARM_PERF_EVENT_H__ */ -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] perf: ARM: Record the user-mode PC in the call chain.
On Tue, Jun 18, 2013 at 02:13:19PM +0100, Will Deacon wrote: > On Fri, Jun 14, 2013 at 12:21:11AM +0100, Jed Davis wrote: > > With this change, we no longer lose the innermost entry in the user-mode > > part of the call chain. See also the x86 port, which includes the ip. > > > > It's possible to partially work around this problem by post-processing > > the data to use the PERF_SAMPLE_IP value, but this works only if the CPU > > wasn't in the kernel when the sample was taken. > > Thanks. I guess we need something similar for arm64 too. Could you cook a > similar patch please? Done (and tested, on the ARM V8 Foundation Model). It looked as if the powerpc and sparc ports might have similar issues, but I haven't checked on them yet. --Jed -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] perf: arm64: Record the user-mode PC in the call chain.
With this change, we no longer lose the innermost entry in the user-mode part of the call chain. See also the x86 port, which includes the ip, and the corresponding change in arch/arm. Signed-off-by: Jed Davis --- arch/arm64/kernel/perf_event.c |1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 1e49e5eb..9ba33c4 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -1336,6 +1336,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, return; } + perf_callchain_store(entry, regs->pc); tail = (struct frame_tail __user *)regs->regs[29]; while (entry->nr < PERF_MAX_STACK_DEPTH && -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] perf: ARM: Record the user-mode PC in the call chain.
With this change, we no longer lose the innermost entry in the user-mode part of the call chain. See also the x86 port, which includes the ip. It's possible to partially work around this problem by post-processing the data to use the PERF_SAMPLE_IP value, but this works only if the CPU wasn't in the kernel when the sample was taken. Signed-off-by: Jed Davis --- arch/arm/kernel/perf_event.c |1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 8c3094d..d9f5cd4 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -569,6 +569,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; } + perf_callchain_store(entry, regs->ARM_pc); tail = (struct frame_tail __user *)regs->ARM_fp - 1; while ((entry->nr < PERF_MAX_STACK_DEPTH) && -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/