On 19/03/20(Thu) 15:37, Martin Pieuchot wrote: > People are starting to capture kernel stack traces and produce > Flamegraphs. Those traces currently include the frames used by > dt(4) itself: > > dt_pcb_ring_get+0x11d > dt_prov_profile_enter+0x6e > hardclock+0x1a9 > lapic_clockintr+0x3f > Xresume_lapic_ltimer+0x26 > acpicpu_idle+0x261 <--- CPU was Idle > sched_idle+0x225 > proc_trampoline+0x1c > 1 > > Saving the last 5 frames in the example above consumes CPU time and > require some post-processing to cleanup gathered data. So the diff > below extends the existing stacktrace_save() into and *_at() version > that takes an arbitrary frame as argument. > > The fun start when we ask "How many frames do we skip?". There are > currently two different contexts in which dt(4) can gather data: > - in hardlock(9), which mean a specialized interrupt context > - in any thread or interrupt context > > Those two contexts require a different number of frames to reach the > recording function: dt_pcb_ring_get(). Now what's fun is that every > architecture has its own way to reach hardclock(9) (possibly multiple > ones), has its own stack unwinder (with specific bugs) and might use > different compilers with different optimizations (yeah!). > > So the diff below introduce two per-arch defines that specify at which > frame the unwinding should start. This has been tested on sparc64 and > amd64 and here's the result: > > # btrace -e 'tracepoint:sched:sleep { printf("%s1\n", kstack); }' > sleep_setup+0x16c > msleep+0x160 > taskq_next_work+0x5c > taskq_thread+0x38 > 0x1013074 > 1 > > Note that the number of frames for hardclock(9) on amd64 assumes lapic > is used. That's why on vmm(4) you'll still see an extra frame as below: > > # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }' > Xresume_legacy0+0x1d3 > cpu_idle_cycle+0x1b > proc_trampoline+0x1c > 1 > > Whereas on real hardware: > > # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }' > acpicpu_idle+0x1d2 > sched_idle+0x225 > proc_trampoline+0x1c > 1 > > The diff below will break compiling dt(4) on the other archs because > they don't yet provide stacktrace_save_at(), but it shouldn't be hard > to fix ;)
Updated diff including stacktrace_save_at() for i386, any comment on the approach or Ok? Index: arch/amd64/amd64/db_trace.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/db_trace.c,v retrieving revision 1.49 diff -u -p -r1.49 db_trace.c --- arch/amd64/amd64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.49 +++ arch/amd64/amd64/db_trace.c 19 Mar 2020 14:06:01 -0000 @@ -256,11 +256,13 @@ db_stack_trace_print(db_expr_t addr, int } void -stacktrace_save(struct stacktrace *st) +stacktrace_save_at(struct stacktrace *st, void *xframe) { - struct callframe *frame, *lastframe; + struct callframe *lastframe, *frame = xframe; + + if (frame == NULL) + frame = __builtin_frame_address(0); - frame = __builtin_frame_address(0); st->st_count = 0; while (st->st_count < STACKTRACE_MAX) { st->st_pc[st->st_count++] = frame->f_retaddr; @@ -275,6 +277,12 @@ stacktrace_save(struct stacktrace *st) if (!INKERNEL(frame->f_retaddr)) break; } +} + +void +stacktrace_save(struct stacktrace *st) +{ + return stacktrace_save_at(st, NULL); } vaddr_t Index: arch/i386/i386/db_trace.c =================================================================== RCS file: /cvs/src/sys/arch/i386/i386/db_trace.c,v retrieving revision 1.38 diff -u -p -r1.38 db_trace.c --- arch/i386/i386/db_trace.c 20 Jan 2020 15:58:23 -0000 1.38 +++ arch/i386/i386/db_trace.c 20 Mar 2020 10:50:28 -0000 @@ -263,9 +263,17 @@ db_stack_trace_print(db_expr_t addr, int void stacktrace_save(struct stacktrace *st) { - struct callframe *frame, *lastframe; + return stacktrace_save_at(st, NULL); +} + +void +stacktrace_save_at(struct stacktrace *st, void *xframe) +{ + struct callframe *lastframe, *frame = xframe; + + if (frame == NULL) + frame = __builtin_frame_address(0); - frame = __builtin_frame_address(0); st->st_count = 0; while (st->st_count < STACKTRACE_MAX) { st->st_pc[st->st_count++] = frame->f_retaddr; Index: arch/sparc64/sparc64/db_trace.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/sparc64/db_trace.c,v retrieving revision 1.20 diff -u -p -r1.20 db_trace.c --- arch/sparc64/sparc64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.20 +++ arch/sparc64/sparc64/db_trace.c 19 Mar 2020 14:06:01 -0000 @@ -155,7 +155,7 @@ db_stack_trace_print(db_expr_t addr, int } void -stacktrace_save(struct stacktrace *st) +stacktrace_save_at(struct stacktrace *st, void *xframe) { struct frame64 *f64; vaddr_t pc; @@ -163,7 +163,10 @@ stacktrace_save(struct stacktrace *st) write_all_windows(); - frame = (vaddr_t)__builtin_frame_address(0) - BIAS; + if (xframe == NULL) + xframe = __builtin_frame_address(0); + + frame = (vaddr_t)xframe - BIAS; if ((frame & 1) == 0) return; @@ -185,6 +188,11 @@ stacktrace_save(struct stacktrace *st) } } +void +stacktrace_save(struct stacktrace *st) +{ + return stacktrace_save_at(st, NULL); +} void db_dump_window(db_expr_t addr, int have_addr, db_expr_t count, char *modif) { Index: dev/dt/dt_dev.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v retrieving revision 1.4 diff -u -p -r1.4 dt_dev.c --- dev/dt/dt_dev.c 4 Feb 2020 10:56:15 -0000 1.4 +++ dev/dt/dt_dev.c 19 Mar 2020 14:08:53 -0000 @@ -26,29 +26,40 @@ #include <dev/dt/dtvar.h> /* - * How many frames are used by the profiling code? For example - * on amd64: + * The number of frames required to execute dt(4) profiling code + * depends on the probe, context, architecture and possibly the + * compiler. * - * From syscall provider: + * Static probes (tracepoints) are executed in the context of the + * current thread and only need to skip frames up to the recording + *function. For example the syscall provider: * * dt_prov_syscall_entry+0x141 * syscall+0x205 <--- start here * Xsyscall+0x128 * - * From profile provider: + * Probes executed in their own context, like the profile provider, + * need to skip the frames of that context which are different for + * every architecture. For example the profile provider executed + * from hardclock(9) on amd64: * * dt_prov_profile_enter+0x6e - * hardclock+0x12c - * clockintr+0x59 - * intr_handler+0x6e - * Xresume_legacy0+0x1d3 - * cpu_idle_cycle+0x1b <---- start here. + * hardclock+0x1a9 + * lapic_clockintr+0x3f + * Xresume_lapic_ltimer+0x26 + * acpicpu_idle+0x1d2 <---- start here. + * sched_idle+0x225 * proc_trampoline+0x1c */ -#if notyet -#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(4) +#if defined(__amd64__) +#define DT_FA_PROFILE __builtin_frame_address(4) +#define DT_FA_STATIC __builtin_frame_address(1) +#elif defined(__sparc64__) +#define DT_FA_PROFILE __builtin_frame_address(4) +#define DT_FA_STATIC __builtin_frame_address(0) #else -#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(0) +#define DT_FA_STATIC __builtin_frame_address(0) +#define DT_FA_PROFILE DT_FA_STATIC #endif #define DT_EVTRING_SIZE 16 /* # of slots in per PCB event ring */ @@ -583,7 +594,7 @@ dt_pcb_filter(struct dt_pcb *dp) * Get a reference to the next free event state from the ring. */ struct dt_evt * -dt_pcb_ring_get(struct dt_pcb *dp) +dt_pcb_ring_get(struct dt_pcb *dp, int profiling) { struct proc *p = curproc; struct dt_evt *dtev; @@ -617,11 +628,10 @@ dt_pcb_ring_get(struct dt_pcb *dp) memcpy(dtev->dtev_comm, p->p_p->ps_comm, DTMAXCOMLEN - 1); if (ISSET(dp->dp_evtflags, DTEVT_KSTACK|DTEVT_USTACK)) { -#if notyet - stacktrace_save_at(&dtev->dtev_kstack, DT_HOOK_FRAME_ADDRESS); -#else - stacktrace_save(&dtev->dtev_kstack); -#endif + if (profiling) + stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE); + else + stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC); } return dtev; Index: dev/dt/dt_prov_profile.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v retrieving revision 1.1 diff -u -p -r1.1 dt_prov_profile.c --- dev/dt/dt_prov_profile.c 21 Jan 2020 16:16:23 -0000 1.1 +++ dev/dt/dt_prov_profile.c 19 Mar 2020 14:06:01 -0000 @@ -107,7 +107,7 @@ dt_prov_profile_fire(struct dt_pcb *dp) if (++dp->dp_nticks < dp->dp_maxtick) return; - dtev = dt_pcb_ring_get(dp); + dtev = dt_pcb_ring_get(dp, 1); if (dtev == NULL) return; dt_pcb_ring_consume(dp, dtev); Index: dev/dt/dt_prov_static.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_prov_static.c,v retrieving revision 1.1 diff -u -p -r1.1 dt_prov_static.c --- dev/dt/dt_prov_static.c 21 Jan 2020 16:16:23 -0000 1.1 +++ dev/dt/dt_prov_static.c 19 Mar 2020 14:06:01 -0000 @@ -120,7 +120,7 @@ dt_prov_static_hook(struct dt_provider * SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { struct dt_evt *dtev; - dtev = dt_pcb_ring_get(dp); + dtev = dt_pcb_ring_get(dp, 0); if (dtev == NULL) continue; Index: dev/dt/dt_prov_syscall.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_prov_syscall.c,v retrieving revision 1.1 diff -u -p -r1.1 dt_prov_syscall.c --- dev/dt/dt_prov_syscall.c 21 Jan 2020 16:16:23 -0000 1.1 +++ dev/dt/dt_prov_syscall.c 19 Mar 2020 14:06:01 -0000 @@ -148,7 +148,7 @@ dt_prov_syscall_entry(struct dt_provider SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { struct dt_evt *dtev; - dtev = dt_pcb_ring_get(dp); + dtev = dt_pcb_ring_get(dp, 0); if (dtev == NULL) continue; @@ -190,7 +190,7 @@ dt_prov_syscall_return(struct dt_provide SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { struct dt_evt *dtev; - dtev = dt_pcb_ring_get(dp); + dtev = dt_pcb_ring_get(dp, 0); if (dtev == NULL) continue; Index: dev/dt/dtvar.h =================================================================== RCS file: /cvs/src/sys/dev/dt/dtvar.h,v retrieving revision 1.1 diff -u -p -r1.1 dtvar.h --- dev/dt/dtvar.h 21 Jan 2020 16:16:23 -0000 1.1 +++ dev/dt/dtvar.h 19 Mar 2020 14:06:01 -0000 @@ -194,7 +194,7 @@ void dt_pcb_free(struct dt_pcb *); void dt_pcb_purge(struct dt_pcb_list *); int dt_pcb_filter(struct dt_pcb *); -struct dt_evt *dt_pcb_ring_get(struct dt_pcb *); +struct dt_evt *dt_pcb_ring_get(struct dt_pcb *, int); void dt_pcb_ring_consume(struct dt_pcb *, struct dt_evt *); /* Index: sys/stacktrace.h =================================================================== RCS file: /cvs/src/sys/sys/stacktrace.h,v retrieving revision 1.1 diff -u -p -r1.1 stacktrace.h --- sys/stacktrace.h 20 Jan 2020 15:58:23 -0000 1.1 +++ sys/stacktrace.h 19 Mar 2020 14:06:01 -0000 @@ -28,6 +28,7 @@ struct stacktrace { #ifdef _KERNEL void stacktrace_print(struct stacktrace *, int (*)(const char *, ...)); +void stacktrace_save_at(struct stacktrace *, void *); void stacktrace_save(struct stacktrace *); #endif