On 19/03/20(Thu) 15:37, Martin Pieuchot wrote:
> People are starting to capture kernel stack traces and produce
> Flamegraphs. Those traces currently include the frames used by
> dt(4) itself:
>
> dt_pcb_ring_get+0x11d
> dt_prov_profile_enter+0x6e
> hardclock+0x1a9
> lapic_clockintr+0x3f
> Xresume_lapic_ltimer+0x26
> acpicpu_idle+0x261 <--- CPU was Idle
> sched_idle+0x225
> proc_trampoline+0x1c
> 1
>
> Saving the last 5 frames in the example above consumes CPU time and
> require some post-processing to cleanup gathered data. So the diff
> below extends the existing stacktrace_save() into and *_at() version
> that takes an arbitrary frame as argument.
>
> The fun start when we ask "How many frames do we skip?". There are
> currently two different contexts in which dt(4) can gather data:
> - in hardlock(9), which mean a specialized interrupt context
> - in any thread or interrupt context
>
> Those two contexts require a different number of frames to reach the
> recording function: dt_pcb_ring_get(). Now what's fun is that every
> architecture has its own way to reach hardclock(9) (possibly multiple
> ones), has its own stack unwinder (with specific bugs) and might use
> different compilers with different optimizations (yeah!).
>
> So the diff below introduce two per-arch defines that specify at which
> frame the unwinding should start. This has been tested on sparc64 and
> amd64 and here's the result:
>
> # btrace -e 'tracepoint:sched:sleep { printf("%s1\n", kstack); }'
> sleep_setup+0x16c
> msleep+0x160
> taskq_next_work+0x5c
> taskq_thread+0x38
> 0x1013074
> 1
>
> Note that the number of frames for hardclock(9) on amd64 assumes lapic
> is used. That's why on vmm(4) you'll still see an extra frame as below:
>
> # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'
> Xresume_legacy0+0x1d3
> cpu_idle_cycle+0x1b
> proc_trampoline+0x1c
> 1
>
> Whereas on real hardware:
>
> # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'
> acpicpu_idle+0x1d2
> sched_idle+0x225
> proc_trampoline+0x1c
> 1
>
> The diff below will break compiling dt(4) on the other archs because
> they don't yet provide stacktrace_save_at(), but it shouldn't be hard
> to fix ;)
Updated diff including stacktrace_save_at() for i386, any comment on the
approach or Ok?
Index: arch/amd64/amd64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/db_trace.c,v
retrieving revision 1.49
diff -u -p -r1.49 db_trace.c
--- arch/amd64/amd64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.49
+++ arch/amd64/amd64/db_trace.c 19 Mar 2020 14:06:01 -0000
@@ -256,11 +256,13 @@ db_stack_trace_print(db_expr_t addr, int
}
void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, void *xframe)
{
- struct callframe *frame, *lastframe;
+ struct callframe *lastframe, *frame = xframe;
+
+ if (frame == NULL)
+ frame = __builtin_frame_address(0);
- frame = __builtin_frame_address(0);
st->st_count = 0;
while (st->st_count < STACKTRACE_MAX) {
st->st_pc[st->st_count++] = frame->f_retaddr;
@@ -275,6 +277,12 @@ stacktrace_save(struct stacktrace *st)
if (!INKERNEL(frame->f_retaddr))
break;
}
+}
+
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, NULL);
}
vaddr_t
Index: arch/i386/i386/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/i386/i386/db_trace.c,v
retrieving revision 1.38
diff -u -p -r1.38 db_trace.c
--- arch/i386/i386/db_trace.c 20 Jan 2020 15:58:23 -0000 1.38
+++ arch/i386/i386/db_trace.c 20 Mar 2020 10:50:28 -0000
@@ -263,9 +263,17 @@ db_stack_trace_print(db_expr_t addr, int
void
stacktrace_save(struct stacktrace *st)
{
- struct callframe *frame, *lastframe;
+ return stacktrace_save_at(st, NULL);
+}
+
+void
+stacktrace_save_at(struct stacktrace *st, void *xframe)
+{
+ struct callframe *lastframe, *frame = xframe;
+
+ if (frame == NULL)
+ frame = __builtin_frame_address(0);
- frame = __builtin_frame_address(0);
st->st_count = 0;
while (st->st_count < STACKTRACE_MAX) {
st->st_pc[st->st_count++] = frame->f_retaddr;
Index: arch/sparc64/sparc64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/sparc64/db_trace.c,v
retrieving revision 1.20
diff -u -p -r1.20 db_trace.c
--- arch/sparc64/sparc64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.20
+++ arch/sparc64/sparc64/db_trace.c 19 Mar 2020 14:06:01 -0000
@@ -155,7 +155,7 @@ db_stack_trace_print(db_expr_t addr, int
}
void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, void *xframe)
{
struct frame64 *f64;
vaddr_t pc;
@@ -163,7 +163,10 @@ stacktrace_save(struct stacktrace *st)
write_all_windows();
- frame = (vaddr_t)__builtin_frame_address(0) - BIAS;
+ if (xframe == NULL)
+ xframe = __builtin_frame_address(0);
+
+ frame = (vaddr_t)xframe - BIAS;
if ((frame & 1) == 0)
return;
@@ -185,6 +188,11 @@ stacktrace_save(struct stacktrace *st)
}
}
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, NULL);
+}
void
db_dump_window(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
{
Index: dev/dt/dt_dev.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.4
diff -u -p -r1.4 dt_dev.c
--- dev/dt/dt_dev.c 4 Feb 2020 10:56:15 -0000 1.4
+++ dev/dt/dt_dev.c 19 Mar 2020 14:08:53 -0000
@@ -26,29 +26,40 @@
#include <dev/dt/dtvar.h>
/*
- * How many frames are used by the profiling code? For example
- * on amd64:
+ * The number of frames required to execute dt(4) profiling code
+ * depends on the probe, context, architecture and possibly the
+ * compiler.
*
- * From syscall provider:
+ * Static probes (tracepoints) are executed in the context of the
+ * current thread and only need to skip frames up to the recording
+ *function. For example the syscall provider:
*
* dt_prov_syscall_entry+0x141
* syscall+0x205 <--- start here
* Xsyscall+0x128
*
- * From profile provider:
+ * Probes executed in their own context, like the profile provider,
+ * need to skip the frames of that context which are different for
+ * every architecture. For example the profile provider executed
+ * from hardclock(9) on amd64:
*
* dt_prov_profile_enter+0x6e
- * hardclock+0x12c
- * clockintr+0x59
- * intr_handler+0x6e
- * Xresume_legacy0+0x1d3
- * cpu_idle_cycle+0x1b <---- start here.
+ * hardclock+0x1a9
+ * lapic_clockintr+0x3f
+ * Xresume_lapic_ltimer+0x26
+ * acpicpu_idle+0x1d2 <---- start here.
+ * sched_idle+0x225
* proc_trampoline+0x1c
*/
-#if notyet
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(4)
+#if defined(__amd64__)
+#define DT_FA_PROFILE __builtin_frame_address(4)
+#define DT_FA_STATIC __builtin_frame_address(1)
+#elif defined(__sparc64__)
+#define DT_FA_PROFILE __builtin_frame_address(4)
+#define DT_FA_STATIC __builtin_frame_address(0)
#else
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(0)
+#define DT_FA_STATIC __builtin_frame_address(0)
+#define DT_FA_PROFILE DT_FA_STATIC
#endif
#define DT_EVTRING_SIZE 16 /* # of slots in per PCB event ring */
@@ -583,7 +594,7 @@ dt_pcb_filter(struct dt_pcb *dp)
* Get a reference to the next free event state from the ring.
*/
struct dt_evt *
-dt_pcb_ring_get(struct dt_pcb *dp)
+dt_pcb_ring_get(struct dt_pcb *dp, int profiling)
{
struct proc *p = curproc;
struct dt_evt *dtev;
@@ -617,11 +628,10 @@ dt_pcb_ring_get(struct dt_pcb *dp)
memcpy(dtev->dtev_comm, p->p_p->ps_comm, DTMAXCOMLEN - 1);
if (ISSET(dp->dp_evtflags, DTEVT_KSTACK|DTEVT_USTACK)) {
-#if notyet
- stacktrace_save_at(&dtev->dtev_kstack, DT_HOOK_FRAME_ADDRESS);
-#else
- stacktrace_save(&dtev->dtev_kstack);
-#endif
+ if (profiling)
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);
+ else
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC);
}
return dtev;
Index: dev/dt/dt_prov_profile.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_profile.c
--- dev/dt/dt_prov_profile.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_profile.c 19 Mar 2020 14:06:01 -0000
@@ -107,7 +107,7 @@ dt_prov_profile_fire(struct dt_pcb *dp)
if (++dp->dp_nticks < dp->dp_maxtick)
return;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 1);
if (dtev == NULL)
return;
dt_pcb_ring_consume(dp, dtev);
Index: dev/dt/dt_prov_static.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_static.c
--- dev/dt/dt_prov_static.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_static.c 19 Mar 2020 14:06:01 -0000
@@ -120,7 +120,7 @@ dt_prov_static_hook(struct dt_provider *
SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
struct dt_evt *dtev;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
if (dtev == NULL)
continue;
Index: dev/dt/dt_prov_syscall.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_syscall.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_syscall.c
--- dev/dt/dt_prov_syscall.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_syscall.c 19 Mar 2020 14:06:01 -0000
@@ -148,7 +148,7 @@ dt_prov_syscall_entry(struct dt_provider
SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
struct dt_evt *dtev;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
if (dtev == NULL)
continue;
@@ -190,7 +190,7 @@ dt_prov_syscall_return(struct dt_provide
SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
struct dt_evt *dtev;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
if (dtev == NULL)
continue;
Index: dev/dt/dtvar.h
===================================================================
RCS file: /cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.1
diff -u -p -r1.1 dtvar.h
--- dev/dt/dtvar.h 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dtvar.h 19 Mar 2020 14:06:01 -0000
@@ -194,7 +194,7 @@ void dt_pcb_free(struct dt_pcb *);
void dt_pcb_purge(struct dt_pcb_list *);
int dt_pcb_filter(struct dt_pcb *);
-struct dt_evt *dt_pcb_ring_get(struct dt_pcb *);
+struct dt_evt *dt_pcb_ring_get(struct dt_pcb *, int);
void dt_pcb_ring_consume(struct dt_pcb *, struct dt_evt *);
/*
Index: sys/stacktrace.h
===================================================================
RCS file: /cvs/src/sys/sys/stacktrace.h,v
retrieving revision 1.1
diff -u -p -r1.1 stacktrace.h
--- sys/stacktrace.h 20 Jan 2020 15:58:23 -0000 1.1
+++ sys/stacktrace.h 19 Mar 2020 14:06:01 -0000
@@ -28,6 +28,7 @@ struct stacktrace {
#ifdef _KERNEL
void stacktrace_print(struct stacktrace *, int (*)(const char *, ...));
+void stacktrace_save_at(struct stacktrace *, void *);
void stacktrace_save(struct stacktrace *);
#endif