[PATCH v2] x86, perf: Fix arch_perf_out_copy_user and copy_from_user_nmi return values

2013-08-16 Thread Jed Davis
All architectures except x86 use __copy_from_user_inatomic to provide
arch_perf_out_copy_user; like the other copy_from routines, it returns
the number of bytes not copied.  perf was expecting the number of bytes
that had been copied.  This change corrects that, and thereby allows
PERF_SAMPLE_STACK_USER to be enabled on non-x86 architectures.

x86 uses copy_from_user_nmi, which deviates from the other copy_from
routines by returning the number of bytes copied.  This change therefore
also reverses copy_from_user_nmi's return value, so that the perf change
doesn't break user stack copies on x86, and to help prevent bugs caused
by this surprising difference (and simplify callers, which mostly want
to know if the number of uncopied bytes is nonzero).
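
For illustration, the caller idiom before and after, distilled from the
diffs below (a minimal sketch of the convention change, not a literal
excerpt):

        /* Before: copy_from_user_nmi() returned the number of bytes
         * copied, so callers compared against the requested length. */
        if (copy_from_user_nmi(&frame, fp, sizeof(frame)) != sizeof(frame))
                break;

        /* After: like copy_from_user(), it returns the number of bytes
         * NOT copied, so any nonzero result means failure. */
        if (copy_from_user_nmi(&frame, fp, sizeof(frame)))
                break;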

Signed-off-by: Jed Davis 
---
 arch/x86/kernel/cpu/perf_event.c   | 8 ++--
 arch/x86/kernel/cpu/perf_event_intel_ds.c  | 6 ++
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4 +---
 arch/x86/lib/usercopy.c| 2 +-
 arch/x86/oprofile/backtrace.c  | 8 ++--
 kernel/events/internal.h   | 9 +
 6 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index a7c7305..038c18c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1983,12 +1983,10 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
fp = compat_ptr(ss_base + regs->bp);
while (entry->nr < PERF_MAX_STACK_DEPTH) {
-   unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
 
-   bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
-   if (bytes != sizeof(frame))
+   if (copy_from_user_nmi(&frame, fp, sizeof(frame)))
break;
 
if (!valid_user_frame(fp, sizeof(frame)))
@@ -2035,12 +2033,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;
 
while (entry->nr < PERF_MAX_STACK_DEPTH) {
-   unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
 
-   bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
-   if (bytes != sizeof(frame))
+   if (copy_from_user_nmi(&frame, fp, sizeof(frame)))
break;
 
if (!valid_user_frame(fp, sizeof(frame)))
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 3065c57..5208fe1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -729,10 +729,8 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 
old_to = to;
if (!kernel_ip(ip)) {
-   int bytes, size = MAX_INSN_SIZE;
-
-   bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-   if (bytes != size)
+   if (copy_from_user_nmi(buf, (void __user *)to,
+  MAX_INSN_SIZE))
return 0;
 
kaddr = buf;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d5be06a..2833514 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -449,7 +449,6 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
 {
struct insn insn;
void *addr;
-   int bytes, size = MAX_INSN_SIZE;
int ret = X86_BR_NONE;
int ext, to_plm, from_plm;
u8 buf[MAX_INSN_SIZE];
@@ -477,8 +476,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
return X86_BR_NONE;
 
/* may fail if text not present */
-   bytes = copy_from_user_nmi(buf, (void __user *)from, size);
-   if (bytes != size)
+   if (copy_from_user_nmi(buf, (void __user *)from, MAX_INSN_SIZE))
return X86_BR_NONE;
 
addr = buf;
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index 4f74d94..7a13c98 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -44,6 +44,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 
} while (len < n);
 
-   return len;
+   return n - len;
 }
 EXPORT_SYMBOL_GPL(copy_from_user_nmi);
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index d6aa6e8..e778d41 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -44,10 +44,8 @@ dump_user_backtrace_32(struct stack_frame_ia32 *head)
/* Also check accessibility of one

[PATCH 2/2] x86: Fix copy_from_user_nmi return to match copy_from_user.

2013-07-29 Thread Jed Davis
copy_from_user returns the number of bytes not copied.  This change
makes copy_from_user_nmi behave the same way, instead of returning the
number of bytes that were copied, to help prevent bugs caused by this
surprising difference (and simplify callers, which mostly want to know
if the number of uncopied bytes is nonzero).

Signed-off-by: Jed Davis 
---
 arch/x86/include/asm/perf_event.h  | 9 +
 arch/x86/kernel/cpu/perf_event.c   | 8 ++--
 arch/x86/kernel/cpu/perf_event_intel_ds.c  | 6 ++
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4 +---
 arch/x86/lib/usercopy.c| 2 +-
 arch/x86/oprofile/backtrace.c  | 8 ++--
 6 files changed, 9 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index ddae5bd..8249df4 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -274,13 +274,6 @@ static inline void perf_check_microcode(void) { }
  static inline void amd_pmu_disable_virt(void) { }
 #endif
 
-static inline unsigned long copy_from_user_nmi_for_perf(void *to,
-   const void __user *from,
-   unsigned long n)
-{
-   return n - copy_from_user_nmi(to, from, n);
-}
-
-#define arch_perf_out_copy_user copy_from_user_nmi_for_perf
+#define arch_perf_out_copy_user copy_from_user_nmi
 
 #endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index a7c7305..038c18c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1983,12 +1983,10 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
fp = compat_ptr(ss_base + regs->bp);
while (entry->nr < PERF_MAX_STACK_DEPTH) {
-   unsigned long bytes;
frame.next_frame = 0;
frame.return_address = 0;
 
-   bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
-   if (bytes != sizeof(frame))
+   if (copy_from_user_nmi(&frame, fp, sizeof(frame)))
break;
 
if (!valid_user_frame(fp, sizeof(frame)))
@@ -2035,12 +2033,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;
 
while (entry->nr < PERF_MAX_STACK_DEPTH) {
-   unsigned long bytes;
frame.next_frame = NULL;
frame.return_address = 0;
 
-   bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
-   if (bytes != sizeof(frame))
+   if (copy_from_user_nmi(&frame, fp, sizeof(frame)))
break;
 
if (!valid_user_frame(fp, sizeof(frame)))
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 3065c57..5208fe1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -729,10 +729,8 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 
old_to = to;
if (!kernel_ip(ip)) {
-   int bytes, size = MAX_INSN_SIZE;
-
-   bytes = copy_from_user_nmi(buf, (void __user *)to, size);
-   if (bytes != size)
+   if (copy_from_user_nmi(buf, (void __user *)to,
+  MAX_INSN_SIZE))
return 0;
 
kaddr = buf;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d5be06a..2833514 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -449,7 +449,6 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
 {
struct insn insn;
void *addr;
-   int bytes, size = MAX_INSN_SIZE;
int ret = X86_BR_NONE;
int ext, to_plm, from_plm;
u8 buf[MAX_INSN_SIZE];
@@ -477,8 +476,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
return X86_BR_NONE;
 
/* may fail if text not present */
-   bytes = copy_from_user_nmi(buf, (void __user *)from, size);
-   if (bytes != size)
+   if (copy_from_user_nmi(buf, (void __user *)from, MAX_INSN_SIZE))
return X86_BR_NONE;
 
addr = buf;
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index 4f74d94..7a13c98 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -44,6 +44,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 
} while (len < n);
 
-   return len;
+   return n - len;
 }
EXPORT_SYMBOL_GPL(copy_from_user_nmi);

[PATCH 1/2] perf: Fix handling of arch_perf_out_copy_user return value.

2013-07-29 Thread Jed Davis
All architectures except x86 use __copy_from_user_inatomic to provide
arch_perf_out_copy_user; like the other copy_from routines, it returns
the number of bytes not copied.  perf was expecting the number of bytes
that had been copied.  This change corrects that, and thereby allows
PERF_SAMPLE_STACK_USER to be enabled on non-x86 architectures.

x86 uses copy_from_user_nmi, which deviates from the other copy_from
routines by returning the number of bytes copied.  (This cancels out
the effect of perf being backwards; apparently this code has only ever
been tested on x86.)  This change therefore adds a second wrapper to
re-reverse it for perf; the next patch in this series will clean it up.

Signed-off-by: Jed Davis 
---
 arch/x86/include/asm/perf_event.h |  9 -
 kernel/events/internal.h  | 11 ++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8249df4..ddae5bd 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -274,6 +274,13 @@ static inline void perf_check_microcode(void) { }
  static inline void amd_pmu_disable_virt(void) { }
 #endif
 
-#define arch_perf_out_copy_user copy_from_user_nmi
+static inline unsigned long copy_from_user_nmi_for_perf(void *to,
+   const void __user *from,
+   unsigned long n)
+{
+   return n - copy_from_user_nmi(to, from, n);
+}
+
+#define arch_perf_out_copy_user copy_from_user_nmi_for_perf
 
 #endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca65997..e61b22c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -81,6 +81,7 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
 }
 
+/* The memcpy_func must return the number of bytes successfully copied. */
 #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
 static inline unsigned int \
 func_name(struct perf_output_handle *handle,   \
@@ -122,11 +123,19 @@ DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
 
 DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
 
+/* arch_perf_out_copy_user must return the number of bytes not copied. */
 #ifndef arch_perf_out_copy_user
 #define arch_perf_out_copy_user __copy_from_user_inatomic
 #endif
 
-DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
+static inline unsigned long perf_memcpy_from_user(void *to,
+ const void __user *from,
+ unsigned long n)
+{
+   return n - arch_perf_out_copy_user(to, from, n);
+}
+
+DEFINE_OUTPUT_COPY(__output_copy_user, perf_memcpy_from_user)
 
 /* Callchain handling */
 extern struct perf_callchain_entry *
-- 
1.8.3.2



Re: [PATCH] ARM: Fix r7/r11 confusion when CONFIG_THUMB2_KERNEL=y

2013-07-29 Thread Jed Davis
On Mon, Jul 22, 2013 at 07:52:39PM +0100, Dave Martin wrote:
> On Sun, Jul 21, 2013 at 10:37:53PM +0100, Will Deacon wrote:
> > Ok, I think I'm with you now. I also think that a better solution would be
> > to try and limit the r7/fp confusion to one place, perhaps behind something
> > like:
> > 
> > void arm_get_current_stackframe(struct pt_regs *regs, struct stackframe *frame);
> > 
> > then that function can act as the bridge between pt_regs (where we leave
> > everything as it is) and stackframe (where we assign either r7 or fp into
> > the fp member). Then we just fix up the call sites to pass the regs they're
> > interested in to our new function.
> > 
> > What do you think?

I can see that being useful if we wanted to opacify struct stackframe,
but... all uses of stackframe that I see involve passing it to
unwind_frame, which expands it back out into an array of registers.
(Except with CONFIG_FRAME_POINTER, but "APCS variants that require a
frame pointer register are obsolete.")

So... why not make pt_regs and stackframe the same, and avoid the
translations entirely?

> Do the ARM unwind tables guarantee not to need knowledge of any
> virtual registers for the purpose of processing the opcodes of a single
> function's unwind table entry, except those virtual regs that we happen
> to initialise?  Or do we just rely on luck?

Yes, for some value of "luck".  The spec documents 0x90|N, for N a core
register number other than 13 or 15, as setting the vSP to the value of
virtual register N.  We can get away with some omissions for kernel code
(e.g., unwind.c doesn't handle saved floating point registers, nor adding
constants larger than 1024 to vSP), but is this one of them?
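
For reference, handling that opcode in the unwinder is small; a sketch
modeled on unwind_exec_insn() in arch/arm/kernel/unwind.c (simplified,
and assuming the usual ctrl->vrs[] virtual-register array):

        /* 0x90|N: "vsp = r[N]".  N == 13 and N == 15 are reserved
         * encodings; the (insn & 0x0d) != 0x0d test rejects both. */
        if ((insn & 0xf0) == 0x90 && (insn & 0x0d) != 0x0d)
                ctrl->vrs[SP] = ctrl->vrs[insn & 0x0f];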

[...]
> The purist answer seems to be: we might need all the regs in struct
> stackframe.
> 
> The pragmatic answer might be that we definitely need r7 for Thumb code,
> but given the nimbleness of GCC to evolve we might get away with not
> including extra registers for a long time yet.

The other question to ask here might be: what does the "pragmatic
answer" gain us over the "purist answer"?

> A review of existing custom unwind annotations might be a good idea
> anyway, to check whether any of them requires another register right now.

`egrep -r '\.(setf|movs)p' arch/arm` finds nothing, for what it's worth.

--Jed



Re: [PATCH] ARM: Fix r7/r11 confusion when CONFIG_THUMB2_KERNEL=y

2013-07-19 Thread Jed Davis
On Mon, Jul 15, 2013 at 02:54:20PM +0100, Will Deacon wrote:
> On Sat, Jul 13, 2013 at 04:18:20AM +0100, Jed Davis wrote:
[...]
> > Effects of this are probably limited to failure of EHABI unwinding when
> > starting from a function that uses r7 to restore its stack pointer, but
> > the possibility for further breakage (which would be invisible on
> > non-Thumb kernels) is worrying.
[...]
> I'm struggling to understand exactly the problem that this patch is trying
> to address. If it's just a code consistency issue, I don't think it's worth
> it (I actually find it less confusing the way we currently have things) but
> if there is a real bug, perhaps you could provide a testcase?

There is a real bug here, but my commit message wasn't very clear.  This
was breaking PERF_COUNT_SW_CONTEXT_SWITCHES with CONFIG_THUMB2_KERNEL=y
(with my other recently posted patch applied), because kernel/sched.c is
built with -fno-omit-frame-pointer (which is wrong, but that's a problem
for another patch) and so __schedule's EHABI entry uses 0x97 (mov sp, r7),
and somewhere along the line the unwinder gets the r11 value instead.
This would also apply to any function with a variable-length array, but
__schedule happens to have the perf hook I was trying to use.
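
To make that concrete, a hypothetical function of the affected kind (not
from the kernel): the variable-length array makes the stack adjustment
dynamic, so on Thumb-2 GCC keeps the frame base in r7 and the unwind
entry must restore sp from it with 0x97 (mov sp, r7).

        int sum_squares(int n)
        {
                int buf[n];     /* VLA: sp moves by a runtime amount */
                int i, total = 0;

                for (i = 0; i < n; i++)
                        buf[i] = i * i;
                for (i = 0; i < n; i++)
                        total += buf[i];
                return total;   /* epilogue can't just "add sp, #imm" */
        }

If the unwinder then substitutes r11 where the opcode asked for r7, the
recovered sp is garbage and the backtrace stops or wanders.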

I should add that this bug doesn't affect me directly at the moment,
because we're not currently using CONFIG_THUMB2_KERNEL on Firefox OS,
because our kernels are assorted older versions with hardware vendors'
changes and there are some issues with it.  But I felt it was worth
tracking this down before trying to send changes upstream.

The "right" thing to do here might be to just include all the registers,
or at least {r4-pc}, in struct stackframe.  The parts that aren't {fp,
sp, lr, pc} could be ifdef'ed if we're concerned enough about the
overhead in kernels using APCS frame pointer unwinding.

I agree that a test case would be good -- I'm more than a little worried
about regressions without one -- but I could use some advice on how best to
do that.

--Jed



Re: [PATCH] ARM: perf: Implement perf_arch_fetch_caller_regs

2013-07-19 Thread Jed Davis
On Mon, Jul 15, 2013 at 02:53:42PM +0100, Will Deacon wrote:
> On Sat, Jul 13, 2013 at 04:17:14AM +0100, Jed Davis wrote:
[...]
> > +#ifdef CONFIG_THUMB2_KERNEL
> > +#define perf_arch_fetch_caller_regs(regs, ip)  \
> > +   do {\
> > +   __u32 _cpsr, _pc;   \
> > +   __asm__ __volatile__("str r7, [%[_regs], #(7 * 4)]\n\t" \
> > +"str r13, [%[_regs], #(13 * 4)]\n\t" \
> > +"str r14, [%[_regs], #(14 * 4)]\n\t" \
> 
> Is this safe? How can we be sure that the registers haven't been clobbered
> by the callee before this macro is expanded? For example, you can end up
> with the following code:

They might be clobbered, but if they are, then...

> 0058 :
>   58:   e92d 43f0   stmdb   sp!, {r4, r5, r6, r7, r8, r9, lr}
>   5c:   b09bsub sp, #108; 0x6c
>   5e:   ac08add r4, sp, #32
>   60:   4681mov r9, r0
>   62:   4688mov r8, r1
>   64:   4620mov r0, r4
>   66:   2148movsr1, #72 ; 0x48
>   68:   f7ff fffe   bl  0 <__memzero>
>   6c:   61e7str r7, [r4, #28]
>   6e:   f8c4 d034   str.w   sp, [r4, #52]   ; 0x34
>   72:   f8c4 e038   str.w   lr, [r4, #56]   ; 0x38
> 
> but the call to __memzero will have corrupted the lr.

...the function's unwinding entry will restore them from the stack, and
can't assume anything about their values before then.  It's the same
problem as if the code was interrupted by a profiler timer at that point
and we tried unwinding from the trap state.

Hopefully.  It's hard to be as rigorous as I'd like about this, because
the Exception Handling ABI was meant for exception handling, so as
specified it only needs to work at points where C++ exceptions can be
thrown, as I understand it.  Fortunately GCC isn't limited to that, but
there are more issues, because -fasynchronous-unwind-tables doesn't
actually make things work from *any* instruction as documented (which is
not entirely bad, because working correctly would significantly increase
the size of .extab/.exidx).

All that said, the unwinding program for a function should work at any
point after the prologue and before the epilogue.

> > +"mov %[_pc],  r15\n\t" \
> > +"mrs %[_cpsr], cpsr\n\t"   \
> > +: [_cpsr] "=r" (_cpsr),\
> > +  [_pc] "=r" (_pc) \
> > +: [_regs] "r" (&(regs)->uregs) \
> 
> It would be cleaner to pass a separate argument for each register, using the
> ARM_rN macros rather than calculating the offset by hand.

I'll do that.  If there were more arguments there might be a problem
at -O0, because the naive translation can run out of registers in
some cases, but that shouldn't be a problem here.  (Nor if someone
later extends this to all the core registers, because {r0-r13} can and
presumably should use a store-multiple.)
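
For what it's worth, a sketch of that cleanup -- one memory operand per
saved register, so the ARM_rN field names fix the offsets instead of
hand-computed "#(7 * 4)" constants (illustrative and untested):

        #define perf_arch_fetch_caller_regs(regs, ip)                  \
                do {                                                    \
                        __u32 _cpsr, _pc;                               \
                        __asm__ __volatile__(                           \
                                "str r7, %[_r7]\n\t"                    \
                                "str sp, %[_sp]\n\t"                    \
                                "str lr, %[_lr]\n\t"                    \
                                "mov %[_pc], pc\n\t"                    \
                                "mrs %[_cpsr], cpsr\n\t"                \
                                : [_r7] "=m" ((regs)->ARM_r7),          \
                                  [_sp] "=m" ((regs)->ARM_sp),          \
                                  [_lr] "=m" ((regs)->ARM_lr),          \
                                  [_pc] "=r" (_pc),                     \
                                  [_cpsr] "=r" (_cpsr));                \
                        (regs)->ARM_pc = _pc;                           \
                        (regs)->ARM_cpsr = _cpsr;                       \
                } while (0)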

Thanks for the quick review, and sorry for the delay in responding.

--Jed



[PATCH] ARM: Fix r7/r11 confusion when CONFIG_THUMB2_KERNEL=y

2013-07-12 Thread Jed Davis
There is currently some inconsistency about the "frame pointer" on ARM.
r11 is the register which assemblers recognize and disassemblers often
print as "fp", and which is sufficient for stack unwinding when using
the APCS frame pointer option; but when unwinding with the Exception
Handling ABI, the register GCC uses when a constant offset won't suffice
(or when -fno-omit-frame-pointer is used; see kernel/sched/Makefile in
particular) is r11 on ARM and r7 on Thumb.

Correspondingly, arch/arm/include/uapi/arm/ptrace.h defines ARM_fp to
refer to r11, but arch/arm/kernel/unwind.c uses "FP" to mean either r11
or r7 depending on Thumbness, and it is unclear what other cases such as
the "fp" in struct stackframe should be doing.

Effects of this are probably limited to failure of EHABI unwinding when
starting from a function that uses r7 to restore its stack pointer, but
the possibility for further breakage (which would be invisible on
non-Thumb kernels) is worrying.

With this change, it is hoped, r7 is consistently referred to as "r7",
and "fp" always means r11; this costs a few extra ifdefs, but it should
help prevent future issues.

Signed-off-by: Jed Davis 
---
 arch/arm/include/asm/stacktrace.h  |4 
 arch/arm/include/asm/thread_info.h |2 ++
 arch/arm/kernel/perf_event.c   |4 
 arch/arm/kernel/process.c  |4 
 arch/arm/kernel/time.c |4 
 arch/arm/kernel/unwind.c   |   27 ++-
 arch/arm/oprofile/common.c |4 
 7 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/stacktrace.h b/arch/arm/include/asm/stacktrace.h
index 4d0a164..5e546bf 100644
--- a/arch/arm/include/asm/stacktrace.h
+++ b/arch/arm/include/asm/stacktrace.h
@@ -2,7 +2,11 @@
 #define __ASM_STACKTRACE_H
 
 struct stackframe {
+#ifdef CONFIG_THUMB2_KERNEL
+   unsigned long r7;
+#else
unsigned long fp;
+#endif
unsigned long sp;
unsigned long lr;
unsigned long pc;
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 214d415..ae3cd81 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -105,6 +105,8 @@ static inline struct thread_info *current_thread_info(void)
((unsigned long)(task_thread_info(tsk)->cpu_context.sp))
 #define thread_saved_fp(tsk)   \
((unsigned long)(task_thread_info(tsk)->cpu_context.fp))
+#define thread_saved_r7(tsk)   \
+   ((unsigned long)(task_thread_info(tsk)->cpu_context.r7))
 
 extern void crunch_task_disable(struct thread_info *);
 extern void crunch_task_copy(struct thread_info *, void *);
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index d9f5cd4..55776d6 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -601,7 +601,11 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;
}
 
+#ifdef CONFIG_THUMB2_KERNEL
+   fr.r7 = regs->ARM_r7;
+#else
fr.fp = regs->ARM_fp;
+#endif
fr.sp = regs->ARM_sp;
fr.lr = regs->ARM_lr;
fr.pc = regs->ARM_pc;
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index d3ca4f6..aeb4c28 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -405,7 +405,11 @@ unsigned long get_wchan(struct task_struct *p)
if (!p || p == current || p->state == TASK_RUNNING)
return 0;
 
+#ifdef CONFIG_THUMB2_KERNEL
+   frame.r7 = thread_saved_r7(p);
+#else
frame.fp = thread_saved_fp(p);
+#endif
frame.sp = thread_saved_sp(p);
frame.lr = 0;   /* recovered from the stack */
frame.pc = thread_saved_pc(p);
diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index 98aee32..80410d3 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -49,7 +49,11 @@ unsigned long profile_pc(struct pt_regs *regs)
if (!in_lock_functions(regs->ARM_pc))
return regs->ARM_pc;
 
+#ifdef CONFIG_THUMB2_KERNEL
+   frame.r7 = regs->ARM_r7;
+#else
frame.fp = regs->ARM_fp;
+#endif
frame.sp = regs->ARM_sp;
frame.lr = regs->ARM_lr;
frame.pc = regs->ARM_pc;
diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
index 00df012..dec03ae 100644
--- a/arch/arm/kernel/unwind.c
+++ b/arch/arm/kernel/unwind.c
@@ -74,7 +74,7 @@ struct unwind_ctrl_block {
 
 enum regs {
 #ifdef CONFIG_THUMB2_KERNEL
-   FP = 7,
+   R7 = 7,
 #else
FP = 11,
 #endif
@@ -317,8 +317,13 @@ static int unwind_exec_insn(struct unwind_ctrl_block *ctrl)
return -URC_FAILURE;
}
 
+#ifdef CONFIG_THUMB2_KERNEL
+   pr_debug("%s: r7 = %08lx sp = %08lx lr = %08lx pc = %08lx\n", __func__,
+		 ctrl->vrs[R7], ctrl->vrs[SP], ctrl->vrs[LR], ctrl->vrs[PC]);

[PATCH] ARM: perf: Implement perf_arch_fetch_caller_regs

2013-07-12 Thread Jed Davis
We need a perf_arch_fetch_caller_regs for at least some software events
to be able to get a callchain; even user stacks won't work without
at least the CPSR bits for non-user-mode (see perf_callchain).  In
particular, profiling context switches needs this.

This records the state of the point at which perf_arch_fetch_caller_regs
is expanded, instead of that function activation's call site, because we
need SP and PC to be consistent for EHABI unwinding; hopefully nothing
will be inconvenienced by the extra stack frame.

Signed-off-by: Jed Davis 
---
 arch/arm/include/asm/perf_event.h |   43 +
 1 file changed, 43 insertions(+)

diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h
index 7558775..2cc7255 100644
--- a/arch/arm/include/asm/perf_event.h
+++ b/arch/arm/include/asm/perf_event.h
@@ -12,6 +12,8 @@
 #ifndef __ARM_PERF_EVENT_H__
 #define __ARM_PERF_EVENT_H__
 
+#include <asm/ptrace.h>
+
 /*
  * The ARMv7 CPU PMU supports up to 32 event counters.
  */
@@ -28,4 +30,45 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
 #define perf_misc_flags(regs)  perf_misc_flags(regs)
 #endif
 
+/*
+ * We can't actually get the caller's registers here; the saved PC and
+ * SP values have to be consistent or else EHABI unwinding won't work,
+ * and the only way to find the matching SP for the return address is
+ * to unwind the current function.  So we save the current state
+ * instead.
+ *
+ * Note that the ARM Exception Handling ABI allows unwinding to depend
+ * on the contents of any core register, but our unwinder is limited
+ * to the ones in struct stackframe (which are the only ones we expect
+ * GCC to need for kernel code), so we just record those.
+ */
+#ifdef CONFIG_THUMB2_KERNEL
+#define perf_arch_fetch_caller_regs(regs, ip)  \
+   do {\
+   __u32 _cpsr, _pc;   \
+   __asm__ __volatile__("str r7, [%[_regs], #(7 * 4)]\n\t" \
+"str r13, [%[_regs], #(13 * 4)]\n\t" \
+"str r14, [%[_regs], #(14 * 4)]\n\t" \
+"mov %[_pc],  r15\n\t" \
+"mrs %[_cpsr], cpsr\n\t"   \
+: [_cpsr] "=r" (_cpsr),\
+  [_pc] "=r" (_pc) \
+: [_regs] "r" (&(regs)->uregs) \
+: "memory");   \
+   (regs)->ARM_pc = _pc;   \
+   (regs)->ARM_cpsr = _cpsr;   \
+   } while (0)
+#else
+#define perf_arch_fetch_caller_regs(regs, ip)  \
+   do {\
+   __u32 _cpsr;\
+   __asm__ __volatile__("stmia %[_regs11], {r11 - r15}\n\t" \
+"mrs %[_cpsr], cpsr\n\t"   \
+: [_cpsr] "=r" (_cpsr) \
+: [_regs11] "r" (&(regs)->uregs[11]) \
+: "memory");   \
+   (regs)->ARM_cpsr = _cpsr;   \
+   } while (0)
+#endif
+
 #endif /* __ARM_PERF_EVENT_H__ */
-- 
1.7.10.4



Re: [PATCH] perf: ARM: Record the user-mode PC in the call chain.

2013-06-19 Thread Jed Davis
On Tue, Jun 18, 2013 at 02:13:19PM +0100, Will Deacon wrote:
> On Fri, Jun 14, 2013 at 12:21:11AM +0100, Jed Davis wrote:
> > With this change, we no longer lose the innermost entry in the user-mode
> > part of the call chain.  See also the x86 port, which includes the ip.
> > 
> > It's possible to partially work around this problem by post-processing
> > the data to use the PERF_SAMPLE_IP value, but this works only if the CPU
> > wasn't in the kernel when the sample was taken.
> 
> Thanks. I guess we need something similar for arm64 too. Could you cook a
> similar patch please?

Done (and tested, on the ARM V8 Foundation Model).

It looked as if the powerpc and sparc ports might have similar issues,
but I haven't checked on them yet.

--Jed



[PATCH] perf: arm64: Record the user-mode PC in the call chain.

2013-06-19 Thread Jed Davis
With this change, we no longer lose the innermost entry in the user-mode
part of the call chain.  See also the x86 port, which includes the ip,
and the corresponding change in arch/arm.

Signed-off-by: Jed Davis 
---
 arch/arm64/kernel/perf_event.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 1e49e5eb..9ba33c4 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1336,6 +1336,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
return;
}
 
+   perf_callchain_store(entry, regs->pc);
tail = (struct frame_tail __user *)regs->regs[29];
 
while (entry->nr < PERF_MAX_STACK_DEPTH &&
-- 
1.7.10.4



[PATCH] perf: ARM: Record the user-mode PC in the call chain.

2013-06-13 Thread Jed Davis
With this change, we no longer lose the innermost entry in the user-mode
part of the call chain.  See also the x86 port, which includes the ip.

It's possible to partially work around this problem by post-processing
the data to use the PERF_SAMPLE_IP value, but this works only if the CPU
wasn't in the kernel when the sample was taken.
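
For anyone reproducing that workaround, a hedged sketch of such a
post-processor over already-parsed samples (insert_after() and the
field names are hypothetical, not a real perf-tool API):

        /* A callchain that begins with the user-space context marker
         * was sampled in user mode, so PERF_SAMPLE_IP is the missing
         * innermost user frame; splice it in after the marker. */
        if (sample->callchain[0] == PERF_CONTEXT_USER)
                insert_after(sample->callchain, 0, sample->ip);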

Signed-off-by: Jed Davis 
---
 arch/arm/kernel/perf_event.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 8c3094d..d9f5cd4 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -569,6 +569,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
return;
}
 
+   perf_callchain_store(entry, regs->ARM_pc);
tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
-- 
1.7.10.4
