On 23/03/20(Mon) 15:18, Martin Pieuchot wrote:
> On 19/03/20(Thu) 15:37, Martin Pieuchot wrote:
> > People are starting to capture kernel stack traces and produce
> > Flamegraphs. Those traces currently include the frames used by
> > dt(4) itself:
> >
> > dt_pcb_ring_get+0x11d
> > dt_prov_profile_enter+0x6e
> > hardclock+0x1a9
> > lapic_clockintr+0x3f
> > Xresume_lapic_ltimer+0x26
> > acpicpu_idle+0x261 <--- CPU was Idle
> > sched_idle+0x225
> > proc_trampoline+0x1c
> > 1
> >
> > Saving the last 5 frames in the example above consumes CPU time and
> > require some post-processing to cleanup gathered data. So the diff
> > below extends the existing stacktrace_save() into and *_at() version
> > that takes an arbitrary frame as argument.
> >
> > The fun start when we ask "How many frames do we skip?". There are
> > currently two different contexts in which dt(4) can gather data:
> > - in hardlock(9), which mean a specialized interrupt context
> > - in any thread or interrupt context
> >
> > Those two contexts require a different number of frames to reach the
> > recording function: dt_pcb_ring_get(). Now what's fun is that every
> > architecture has its own way to reach hardclock(9) (possibly multiple
> > ones), has its own stack unwinder (with specific bugs) and might use
> > different compilers with different optimizations (yeah!).
> >
> > So the diff below introduce two per-arch defines that specify at which
> > frame the unwinding should start. This has been tested on sparc64 and
> > amd64 and here's the result:
> >
> > # btrace -e 'tracepoint:sched:sleep { printf("%s1\n", kstack); }'
> > sleep_setup+0x16c
> > msleep+0x160
> > taskq_next_work+0x5c
> > taskq_thread+0x38
> > 0x1013074
> > 1
> >
> > Note that the number of frames for hardclock(9) on amd64 assumes lapic
> > is used. That's why on vmm(4) you'll still see an extra frame as below:
> >
> > # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'
> > Xresume_legacy0+0x1d3
> > cpu_idle_cycle+0x1b
> > proc_trampoline+0x1c
> > 1
> >
> > Whereas on real hardware:
> >
> > # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'
> > acpicpu_idle+0x1d2
> > sched_idle+0x225
> > proc_trampoline+0x1c
> > 1
> >
> > The diff below will break compiling dt(4) on the other archs because
> > they don't yet provide stacktrace_save_at(), but it shouldn't be hard
> > to fix ;)
>
> Updated diff including stacktrace_save_at() for i386, any comment on the
> approach or Ok?
New diff, new approach suggested by visa@. Passes a number of frames to
skip to the unwinding function. This is compatible with the mips64
unwinder and makes sure that the __builtin_frame_address() isn't called
with an index other than 0.
Indexes have been adjusted because __builtin_frame_address(0) is now in
a deeper frame.
Ok?
Index: arch/amd64/amd64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/db_trace.c,v
retrieving revision 1.49
diff -u -p -r1.49 db_trace.c
--- arch/amd64/amd64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.49
+++ arch/amd64/amd64/db_trace.c 24 Mar 2020 13:03:35 -0000
@@ -256,14 +256,17 @@ db_stack_trace_print(db_expr_t addr, int
}
void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, unsigned int skip)
{
struct callframe *frame, *lastframe;
frame = __builtin_frame_address(0);
st->st_count = 0;
while (st->st_count < STACKTRACE_MAX) {
- st->st_pc[st->st_count++] = frame->f_retaddr;
+ if (skip == 0)
+ st->st_pc[st->st_count++] = frame->f_retaddr;
+ else
+ skip--;
lastframe = frame;
frame = frame->f_frame;
@@ -275,6 +278,12 @@ stacktrace_save(struct stacktrace *st)
if (!INKERNEL(frame->f_retaddr))
break;
}
+}
+
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, 0);
}
vaddr_t
Index: arch/i386/i386/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/i386/i386/db_trace.c,v
retrieving revision 1.38
diff -u -p -r1.38 db_trace.c
--- arch/i386/i386/db_trace.c 20 Jan 2020 15:58:23 -0000 1.38
+++ arch/i386/i386/db_trace.c 24 Mar 2020 12:59:34 -0000
@@ -261,14 +261,17 @@ db_stack_trace_print(db_expr_t addr, int
}
void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, unsigned int skip)
{
struct callframe *frame, *lastframe;
frame = __builtin_frame_address(0);
st->st_count = 0;
while (st->st_count < STACKTRACE_MAX) {
- st->st_pc[st->st_count++] = frame->f_retaddr;
+ if (skip == 0)
+ st->st_pc[st->st_count++] = frame->f_retaddr;
+ else
+ skip--;
lastframe = frame;
frame = frame->f_frame;
@@ -280,6 +283,12 @@ stacktrace_save(struct stacktrace *st)
if (!INKERNEL(frame->f_retaddr))
break;
}
+}
+
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, 0);
}
vaddr_t
Index: arch/sparc64/sparc64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/sparc64/db_trace.c,v
retrieving revision 1.20
diff -u -p -r1.20 db_trace.c
--- arch/sparc64/sparc64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.20
+++ arch/sparc64/sparc64/db_trace.c 24 Mar 2020 13:08:55 -0000
@@ -155,7 +155,7 @@ db_stack_trace_print(db_expr_t addr, int
}
void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, unsigned int skip)
{
struct frame64 *f64;
vaddr_t pc;
@@ -181,8 +181,17 @@ stacktrace_save(struct stacktrace *st)
if ((frame & 1) == 0)
break;
- st->st_pc[st->st_count++] = pc;
+ if (skip == 0)
+ st->st_pc[st->st_count++] = pc;
+ else
+ skip--;
}
+}
+
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, 0);
}
void
Index: dev/dt/dt_dev.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.4
diff -u -p -r1.4 dt_dev.c
--- dev/dt/dt_dev.c 4 Feb 2020 10:56:15 -0000 1.4
+++ dev/dt/dt_dev.c 24 Mar 2020 13:22:04 -0000
@@ -26,29 +26,42 @@
#include <dev/dt/dtvar.h>
/*
- * How many frames are used by the profiling code? For example
- * on amd64:
+ * Number of frames to skip in stack traces.
*
- * From syscall provider:
+ * The number of frames required to execute dt(4) profiling code
+ * depends on the probe, context, architecture and possibly the
+ * compiler.
+ *
+ * Static probes (tracepoints) are executed in the context of the
+ * current thread and only need to skip frames up to the recording
+ * function. For example the syscall provider:
*
* dt_prov_syscall_entry+0x141
* syscall+0x205 <--- start here
* Xsyscall+0x128
*
- * From profile provider:
+ * Probes executed in their own context, like the profile provider,
+ * need to skip the frames of that context which are different for
+ * every architecture. For example the profile provider executed
+ * from hardclock(9) on amd64:
*
* dt_prov_profile_enter+0x6e
- * hardclock+0x12c
- * clockintr+0x59
- * intr_handler+0x6e
- * Xresume_legacy0+0x1d3
- * cpu_idle_cycle+0x1b <---- start here.
+ * hardclock+0x1a9
+ * lapic_clockintr+0x3f
+ * Xresume_lapic_ltimer+0x26
+ * acpicpu_idle+0x1d2 <---- start here.
+ * sched_idle+0x225
* proc_trampoline+0x1c
*/
-#if notyet
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(4)
+#if defined(__amd64__)
+#define DT_FA_PROFILE 5
+#define DT_FA_STATIC 2
+#elif defined(__sparc64__)
+#define DT_FA_PROFILE 5
+#define DT_FA_STATIC 1
#else
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(0)
+#define DT_FA_STATIC 0
+#define DT_FA_PROFILE 0
#endif
#define DT_EVTRING_SIZE 16 /* # of slots in per PCB event ring */
@@ -583,7 +596,7 @@ dt_pcb_filter(struct dt_pcb *dp)
* Get a reference to the next free event state from the ring.
*/
struct dt_evt *
-dt_pcb_ring_get(struct dt_pcb *dp)
+dt_pcb_ring_get(struct dt_pcb *dp, int profiling)
{
struct proc *p = curproc;
struct dt_evt *dtev;
@@ -617,11 +630,10 @@ dt_pcb_ring_get(struct dt_pcb *dp)
memcpy(dtev->dtev_comm, p->p_p->ps_comm, DTMAXCOMLEN - 1);
if (ISSET(dp->dp_evtflags, DTEVT_KSTACK|DTEVT_USTACK)) {
-#if notyet
- stacktrace_save_at(&dtev->dtev_kstack, DT_HOOK_FRAME_ADDRESS);
-#else
- stacktrace_save(&dtev->dtev_kstack);
-#endif
+ if (profiling)
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);
+ else
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC);
}
return dtev;
Index: dev/dt/dt_prov_profile.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_profile.c
--- dev/dt/dt_prov_profile.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_profile.c 19 Mar 2020 14:06:01 -0000
@@ -107,7 +107,7 @@ dt_prov_profile_fire(struct dt_pcb *dp)
if (++dp->dp_nticks < dp->dp_maxtick)
return;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 1);
if (dtev == NULL)
return;
dt_pcb_ring_consume(dp, dtev);
Index: dev/dt/dt_prov_static.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_static.c
--- dev/dt/dt_prov_static.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_static.c 19 Mar 2020 14:06:01 -0000
@@ -120,7 +120,7 @@ dt_prov_static_hook(struct dt_provider *
SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
struct dt_evt *dtev;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
if (dtev == NULL)
continue;
Index: dev/dt/dt_prov_syscall.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_syscall.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_syscall.c
--- dev/dt/dt_prov_syscall.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_syscall.c 19 Mar 2020 14:06:01 -0000
@@ -148,7 +148,7 @@ dt_prov_syscall_entry(struct dt_provider
SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
struct dt_evt *dtev;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
if (dtev == NULL)
continue;
@@ -190,7 +190,7 @@ dt_prov_syscall_return(struct dt_provide
SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
struct dt_evt *dtev;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
if (dtev == NULL)
continue;
Index: dev/dt/dtvar.h
===================================================================
RCS file: /cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.1
diff -u -p -r1.1 dtvar.h
--- dev/dt/dtvar.h 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dtvar.h 19 Mar 2020 14:06:01 -0000
@@ -194,7 +194,7 @@ void dt_pcb_free(struct dt_pcb *);
void dt_pcb_purge(struct dt_pcb_list *);
int dt_pcb_filter(struct dt_pcb *);
-struct dt_evt *dt_pcb_ring_get(struct dt_pcb *);
+struct dt_evt *dt_pcb_ring_get(struct dt_pcb *, int);
void dt_pcb_ring_consume(struct dt_pcb *, struct dt_evt *);
/*
Index: sys/stacktrace.h
===================================================================
RCS file: /cvs/src/sys/sys/stacktrace.h,v
retrieving revision 1.1
diff -u -p -r1.1 stacktrace.h
--- sys/stacktrace.h 20 Jan 2020 15:58:23 -0000 1.1
+++ sys/stacktrace.h 24 Mar 2020 13:00:54 -0000
@@ -28,6 +28,7 @@ struct stacktrace {
#ifdef _KERNEL
void stacktrace_print(struct stacktrace *, int (*)(const char *, ...));
+void stacktrace_save_at(struct stacktrace *, unsigned int);
void stacktrace_save(struct stacktrace *);
#endif