People are starting to capture kernel stack traces and produce
Flamegraphs. Those traces currently include the frames used by
dt(4) itself:
dt_pcb_ring_get+0x11d
dt_prov_profile_enter+0x6e
hardclock+0x1a9
lapic_clockintr+0x3f
Xresume_lapic_ltimer+0x26
acpicpu_idle+0x261 <--- CPU was Idle
sched_idle+0x225
proc_trampoline+0x1c
1
Saving the last 5 frames in the example above consumes CPU time and
require some post-processing to cleanup gathered data. So the diff
below extends the existing stacktrace_save() into and *_at() version
that takes an arbitrary frame as argument.
The fun start when we ask "How many frames do we skip?". There are
currently two different contexts in which dt(4) can gather data:
- in hardlock(9), which mean a specialized interrupt context
- in any thread or interrupt context
Those two contexts require a different number of frames to reach the
recording function: dt_pcb_ring_get(). Now what's fun is that every
architecture has its own way to reach hardclock(9) (possibly multiple
ones), has its own stack unwinder (with specific bugs) and might use
different compilers with different optimizations (yeah!).
So the diff below introduce two per-arch defines that specify at which
frame the unwinding should start. This has been tested on sparc64 and
amd64 and here's the result:
# btrace -e 'tracepoint:sched:sleep { printf("%s1\n", kstack); }'
sleep_setup+0x16c
msleep+0x160
taskq_next_work+0x5c
taskq_thread+0x38
0x1013074
1
Note that the number of frames for hardclock(9) on amd64 assumes lapic
is used. That's why on vmm(4) you'll still see an extra frame as below:
# btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'
Xresume_legacy0+0x1d3
cpu_idle_cycle+0x1b
proc_trampoline+0x1c
1
Whereas on real hardware:
# btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'
acpicpu_idle+0x1d2
sched_idle+0x225
proc_trampoline+0x1c
1
The diff below will break compiling dt(4) on the other archs because
they don't yet provide stacktrace_save_at(), but it shouldn't be hard
to fix ;)
Ok?
Index: dev/dt/dt_dev.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.4
diff -u -p -r1.4 dt_dev.c
--- dev/dt/dt_dev.c 4 Feb 2020 10:56:15 -0000 1.4
+++ dev/dt/dt_dev.c 19 Mar 2020 14:08:53 -0000
@@ -26,29 +26,40 @@
#include <dev/dt/dtvar.h>
/*
- * How many frames are used by the profiling code? For example
- * on amd64:
+ * The number of frames required to execute dt(4) profiling code
+ * depends on the probe, context, architecture and possibly the
+ * compiler.
*
- * From syscall provider:
+ * Static probes (tracepoints) are executed in the context of the
+ * current thread and only need to skip frames up to the recording
+ *function. For example the syscall provider:
*
* dt_prov_syscall_entry+0x141
* syscall+0x205 <--- start here
* Xsyscall+0x128
*
- * From profile provider:
+ * Probes executed in their own context, like the profile provider,
+ * need to skip the frames of that context which are different for
+ * every architecture. For example the profile provider executed
+ * from hardclock(9) on amd64:
*
* dt_prov_profile_enter+0x6e
- * hardclock+0x12c
- * clockintr+0x59
- * intr_handler+0x6e
- * Xresume_legacy0+0x1d3
- * cpu_idle_cycle+0x1b <---- start here.
+ * hardclock+0x1a9
+ * lapic_clockintr+0x3f
+ * Xresume_lapic_ltimer+0x26
+ * acpicpu_idle+0x1d2 <---- start here.
+ * sched_idle+0x225
* proc_trampoline+0x1c
*/
-#if notyet
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(4)
+#if defined(__amd64__)
+#define DT_FA_PROFILE __builtin_frame_address(4)
+#define DT_FA_STATIC __builtin_frame_address(1)
+#elif defined(__sparc64__)
+#define DT_FA_PROFILE __builtin_frame_address(4)
+#define DT_FA_STATIC __builtin_frame_address(0)
#else
-#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(0)
+#define DT_FA_STATIC __builtin_frame_address(0)
+#define DT_FA_PROFILE DT_FA_STATIC
#endif
#define DT_EVTRING_SIZE 16 /* # of slots in per PCB event ring */
@@ -583,7 +594,7 @@ dt_pcb_filter(struct dt_pcb *dp)
* Get a reference to the next free event state from the ring.
*/
struct dt_evt *
-dt_pcb_ring_get(struct dt_pcb *dp)
+dt_pcb_ring_get(struct dt_pcb *dp, int profiling)
{
struct proc *p = curproc;
struct dt_evt *dtev;
@@ -617,11 +628,10 @@ dt_pcb_ring_get(struct dt_pcb *dp)
memcpy(dtev->dtev_comm, p->p_p->ps_comm, DTMAXCOMLEN - 1);
if (ISSET(dp->dp_evtflags, DTEVT_KSTACK|DTEVT_USTACK)) {
-#if notyet
- stacktrace_save_at(&dtev->dtev_kstack, DT_HOOK_FRAME_ADDRESS);
-#else
- stacktrace_save(&dtev->dtev_kstack);
-#endif
+ if (profiling)
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);
+ else
+ stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC);
}
return dtev;
Index: dev/dt/dt_prov_profile.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_profile.c
--- dev/dt/dt_prov_profile.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_profile.c 19 Mar 2020 14:06:01 -0000
@@ -107,7 +107,7 @@ dt_prov_profile_fire(struct dt_pcb *dp)
if (++dp->dp_nticks < dp->dp_maxtick)
return;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 1);
if (dtev == NULL)
return;
dt_pcb_ring_consume(dp, dtev);
Index: dev/dt/dt_prov_static.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_static.c
--- dev/dt/dt_prov_static.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_static.c 19 Mar 2020 14:06:01 -0000
@@ -120,7 +120,7 @@ dt_prov_static_hook(struct dt_provider *
SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
struct dt_evt *dtev;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
if (dtev == NULL)
continue;
Index: dev/dt/dt_prov_syscall.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_syscall.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_syscall.c
--- dev/dt/dt_prov_syscall.c 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dt_prov_syscall.c 19 Mar 2020 14:06:01 -0000
@@ -148,7 +148,7 @@ dt_prov_syscall_entry(struct dt_provider
SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
struct dt_evt *dtev;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
if (dtev == NULL)
continue;
@@ -190,7 +190,7 @@ dt_prov_syscall_return(struct dt_provide
SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
struct dt_evt *dtev;
- dtev = dt_pcb_ring_get(dp);
+ dtev = dt_pcb_ring_get(dp, 0);
if (dtev == NULL)
continue;
Index: dev/dt/dtvar.h
===================================================================
RCS file: /cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.1
diff -u -p -r1.1 dtvar.h
--- dev/dt/dtvar.h 21 Jan 2020 16:16:23 -0000 1.1
+++ dev/dt/dtvar.h 19 Mar 2020 14:06:01 -0000
@@ -194,7 +194,7 @@ void dt_pcb_free(struct dt_pcb *);
void dt_pcb_purge(struct dt_pcb_list *);
int dt_pcb_filter(struct dt_pcb *);
-struct dt_evt *dt_pcb_ring_get(struct dt_pcb *);
+struct dt_evt *dt_pcb_ring_get(struct dt_pcb *, int);
void dt_pcb_ring_consume(struct dt_pcb *, struct dt_evt *);
/*
Index: sys/stacktrace.h
===================================================================
RCS file: /cvs/src/sys/sys/stacktrace.h,v
retrieving revision 1.1
diff -u -p -r1.1 stacktrace.h
--- sys/stacktrace.h 20 Jan 2020 15:58:23 -0000 1.1
+++ sys/stacktrace.h 19 Mar 2020 14:06:01 -0000
@@ -28,6 +28,7 @@ struct stacktrace {
#ifdef _KERNEL
void stacktrace_print(struct stacktrace *, int (*)(const char *, ...));
+void stacktrace_save_at(struct stacktrace *, void *);
void stacktrace_save(struct stacktrace *);
#endif
Index: arch/amd64/amd64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/db_trace.c,v
retrieving revision 1.49
diff -u -p -r1.49 db_trace.c
--- arch/amd64/amd64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.49
+++ arch/amd64/amd64/db_trace.c 19 Mar 2020 14:06:01 -0000
@@ -256,11 +256,13 @@ db_stack_trace_print(db_expr_t addr, int
}
void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, void *xframe)
{
- struct callframe *frame, *lastframe;
+ struct callframe *lastframe, *frame = xframe;
+
+ if (frame == NULL)
+ frame = __builtin_frame_address(0);
- frame = __builtin_frame_address(0);
st->st_count = 0;
while (st->st_count < STACKTRACE_MAX) {
st->st_pc[st->st_count++] = frame->f_retaddr;
@@ -275,6 +277,12 @@ stacktrace_save(struct stacktrace *st)
if (!INKERNEL(frame->f_retaddr))
break;
}
+}
+
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, NULL);
}
vaddr_t
Index: arch/sparc64/sparc64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/sparc64/db_trace.c,v
retrieving revision 1.20
diff -u -p -r1.20 db_trace.c
--- arch/sparc64/sparc64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.20
+++ arch/sparc64/sparc64/db_trace.c 19 Mar 2020 14:06:01 -0000
@@ -155,7 +155,7 @@ db_stack_trace_print(db_expr_t addr, int
}
void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, void *xframe)
{
struct frame64 *f64;
vaddr_t pc;
@@ -163,7 +163,10 @@ stacktrace_save(struct stacktrace *st)
write_all_windows();
- frame = (vaddr_t)__builtin_frame_address(0) - BIAS;
+ if (xframe == NULL)
+ xframe = __builtin_frame_address(0);
+
+ frame = (vaddr_t)xframe - BIAS;
if ((frame & 1) == 0)
return;
@@ -185,6 +188,11 @@ stacktrace_save(struct stacktrace *st)
}
}
+void
+stacktrace_save(struct stacktrace *st)
+{
+ return stacktrace_save_at(st, NULL);
+}
void
db_dump_window(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
{