On 19/03/20(Thu) 15:37, Martin Pieuchot wrote:
> People are starting to capture kernel stack traces and produce
> Flamegraphs.  Those traces currently include the frames used by
> dt(4) itself:
> 
>     dt_pcb_ring_get+0x11d
>     dt_prov_profile_enter+0x6e
>     hardclock+0x1a9
>     lapic_clockintr+0x3f
>     Xresume_lapic_ltimer+0x26
>     acpicpu_idle+0x261                <--- CPU was Idle
>     sched_idle+0x225
>     proc_trampoline+0x1c
>     1
> 
> Saving the last 5 frames in the example above consumes CPU time and
> require some post-processing to cleanup gathered data.  So the diff
> below extends the existing stacktrace_save() into and *_at() version
> that takes an arbitrary frame as argument.
> 
> The fun start when we ask "How many frames do we skip?".  There are
> currently two different contexts in which dt(4) can gather data:
>  - in hardlock(9), which mean a specialized interrupt context
>  - in any thread or interrupt context
> 
> Those two contexts require a different number of frames to reach the
> recording function: dt_pcb_ring_get().  Now what's fun is that every
> architecture has its own way to reach hardclock(9) (possibly multiple
> ones), has its own stack unwinder (with specific bugs) and might use
> different compilers with different optimizations (yeah!).
> 
> So the diff below introduce two per-arch defines that specify at which
> frame the unwinding should start.  This has been tested on sparc64 and
> amd64 and here's the result:
> 
>     # btrace -e 'tracepoint:sched:sleep { printf("%s1\n", kstack); }'
>     sleep_setup+0x16c
>     msleep+0x160
>     taskq_next_work+0x5c
>     taskq_thread+0x38
>     0x1013074
>     1
> 
> Note that the number of frames for hardclock(9) on amd64 assumes lapic
> is used.  That's why on vmm(4) you'll still see an extra frame as below:
> 
>   #  btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }'          
>   Xresume_legacy0+0x1d3
>   cpu_idle_cycle+0x1b
>   proc_trampoline+0x1c
>   1
> 
> Whereas on real hardware:
> 
>   # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }' 
>   acpicpu_idle+0x1d2
>   sched_idle+0x225
>   proc_trampoline+0x1c
>   1
> 
> The diff below will break compiling dt(4) on the other archs because
> they don't yet provide stacktrace_save_at(), but it shouldn't be hard
> to fix ;)

Updated diff including stacktrace_save_at() for i386, any comment on the
approach or Ok?

Index: arch/amd64/amd64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/db_trace.c,v
retrieving revision 1.49
diff -u -p -r1.49 db_trace.c
--- arch/amd64/amd64/db_trace.c 20 Jan 2020 15:58:23 -0000      1.49
+++ arch/amd64/amd64/db_trace.c 19 Mar 2020 14:06:01 -0000
@@ -256,11 +256,13 @@ db_stack_trace_print(db_expr_t addr, int
 }
 
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, void *xframe)
 {
-       struct callframe *frame, *lastframe;
+       struct callframe *lastframe, *frame = xframe;
+
+       if (frame == NULL)
+               frame = __builtin_frame_address(0);
 
-       frame = __builtin_frame_address(0);
        st->st_count = 0;
        while (st->st_count < STACKTRACE_MAX) {
                st->st_pc[st->st_count++] = frame->f_retaddr;
@@ -275,6 +277,12 @@ stacktrace_save(struct stacktrace *st)
                if (!INKERNEL(frame->f_retaddr))
                        break;
        }
+}
+
+void
+stacktrace_save(struct stacktrace *st)
+{
+       return stacktrace_save_at(st, NULL);
 }
 
 vaddr_t
Index: arch/i386/i386/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/i386/i386/db_trace.c,v
retrieving revision 1.38
diff -u -p -r1.38 db_trace.c
--- arch/i386/i386/db_trace.c   20 Jan 2020 15:58:23 -0000      1.38
+++ arch/i386/i386/db_trace.c   20 Mar 2020 10:50:28 -0000
@@ -263,9 +263,17 @@ db_stack_trace_print(db_expr_t addr, int
 void
 stacktrace_save(struct stacktrace *st)
 {
-       struct callframe *frame, *lastframe;
+       return stacktrace_save_at(st, NULL);
+}
+
+void
+stacktrace_save_at(struct stacktrace *st, void *xframe)
+{
+       struct callframe *lastframe, *frame = xframe;
+
+       if (frame == NULL)
+               frame = __builtin_frame_address(0);
 
-       frame = __builtin_frame_address(0);
        st->st_count = 0;
        while (st->st_count < STACKTRACE_MAX) {
                st->st_pc[st->st_count++] = frame->f_retaddr;
Index: arch/sparc64/sparc64/db_trace.c
===================================================================
RCS file: /cvs/src/sys/arch/sparc64/sparc64/db_trace.c,v
retrieving revision 1.20
diff -u -p -r1.20 db_trace.c
--- arch/sparc64/sparc64/db_trace.c     20 Jan 2020 15:58:23 -0000      1.20
+++ arch/sparc64/sparc64/db_trace.c     19 Mar 2020 14:06:01 -0000
@@ -155,7 +155,7 @@ db_stack_trace_print(db_expr_t addr, int
 }
 
 void
-stacktrace_save(struct stacktrace *st)
+stacktrace_save_at(struct stacktrace *st, void *xframe)
 {
        struct frame64  *f64;
        vaddr_t         pc;
@@ -163,7 +163,10 @@ stacktrace_save(struct stacktrace *st)
 
        write_all_windows();
 
-       frame = (vaddr_t)__builtin_frame_address(0) - BIAS;
+       if (xframe == NULL)
+               xframe = __builtin_frame_address(0);
+
+       frame = (vaddr_t)xframe - BIAS;
        if ((frame & 1) == 0)
                return;
 
@@ -185,6 +188,11 @@ stacktrace_save(struct stacktrace *st)
        }
 }
 
+void
+stacktrace_save(struct stacktrace *st)
+{
+       return stacktrace_save_at(st, NULL);
+}
 void
 db_dump_window(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
 {
Index: dev/dt/dt_dev.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
retrieving revision 1.4
diff -u -p -r1.4 dt_dev.c
--- dev/dt/dt_dev.c     4 Feb 2020 10:56:15 -0000       1.4
+++ dev/dt/dt_dev.c     19 Mar 2020 14:08:53 -0000
@@ -26,29 +26,40 @@
 #include <dev/dt/dtvar.h>
 
 /*
- * How many frames are used by the profiling code?  For example
- * on amd64:
+ * The number of frames required to execute dt(4) profiling code
+ * depends on the probe, context, architecture and possibly the
+ * compiler.
  *
- * From syscall provider:
+ * Static probes (tracepoints) are executed in the context of the
+ * current thread and only need to skip frames up to the recording
+ *function.  For example the syscall provider:
  *
  *     dt_prov_syscall_entry+0x141
  *     syscall+0x205           <--- start here
  *     Xsyscall+0x128
  *
- * From profile provider:
+ * Probes executed in their own context, like the profile provider,
+ * need to skip the frames of that context which are different for
+ * every architecture.  For example the profile provider executed
+ * from hardclock(9) on amd64:
  *
  *     dt_prov_profile_enter+0x6e
- *     hardclock+0x12c
- *     clockintr+0x59
- *     intr_handler+0x6e
- *     Xresume_legacy0+0x1d3
- *     cpu_idle_cycle+0x1b     <---- start here.
+ *     hardclock+0x1a9
+ *     lapic_clockintr+0x3f
+ *     Xresume_lapic_ltimer+0x26
+ *     acpicpu_idle+0x1d2      <---- start here.
+ *     sched_idle+0x225
  *     proc_trampoline+0x1c
  */
-#if notyet
-#define DT_HOOK_FRAME_ADDRESS  __builtin_frame_address(4)
+#if defined(__amd64__)
+#define DT_FA_PROFILE  __builtin_frame_address(4)
+#define DT_FA_STATIC   __builtin_frame_address(1)
+#elif defined(__sparc64__)
+#define DT_FA_PROFILE  __builtin_frame_address(4)
+#define DT_FA_STATIC   __builtin_frame_address(0)
 #else
-#define DT_HOOK_FRAME_ADDRESS  __builtin_frame_address(0)
+#define DT_FA_STATIC   __builtin_frame_address(0)
+#define DT_FA_PROFILE  DT_FA_STATIC
 #endif
 
 #define DT_EVTRING_SIZE        16      /* # of slots in per PCB event ring */
@@ -583,7 +594,7 @@ dt_pcb_filter(struct dt_pcb *dp)
  * Get a reference to the next free event state from the ring.
  */
 struct dt_evt *
-dt_pcb_ring_get(struct dt_pcb *dp)
+dt_pcb_ring_get(struct dt_pcb *dp, int profiling)
 {
        struct proc *p = curproc;
        struct dt_evt *dtev;
@@ -617,11 +628,10 @@ dt_pcb_ring_get(struct dt_pcb *dp)
                memcpy(dtev->dtev_comm, p->p_p->ps_comm, DTMAXCOMLEN - 1);
 
        if (ISSET(dp->dp_evtflags, DTEVT_KSTACK|DTEVT_USTACK)) {
-#if notyet
-               stacktrace_save_at(&dtev->dtev_kstack, DT_HOOK_FRAME_ADDRESS);
-#else
-               stacktrace_save(&dtev->dtev_kstack);
-#endif
+               if (profiling)
+                       stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);
+               else
+                       stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC);
        }
 
        return dtev;
Index: dev/dt/dt_prov_profile.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_profile.c
--- dev/dt/dt_prov_profile.c    21 Jan 2020 16:16:23 -0000      1.1
+++ dev/dt/dt_prov_profile.c    19 Mar 2020 14:06:01 -0000
@@ -107,7 +107,7 @@ dt_prov_profile_fire(struct dt_pcb *dp)
        if (++dp->dp_nticks < dp->dp_maxtick)
                return;
 
-       dtev = dt_pcb_ring_get(dp);
+       dtev = dt_pcb_ring_get(dp, 1);
        if (dtev == NULL)
                return;
        dt_pcb_ring_consume(dp, dtev);
Index: dev/dt/dt_prov_static.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_static.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_static.c
--- dev/dt/dt_prov_static.c     21 Jan 2020 16:16:23 -0000      1.1
+++ dev/dt/dt_prov_static.c     19 Mar 2020 14:06:01 -0000
@@ -120,7 +120,7 @@ dt_prov_static_hook(struct dt_provider *
        SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
                struct dt_evt *dtev;
 
-               dtev = dt_pcb_ring_get(dp);
+               dtev = dt_pcb_ring_get(dp, 0);
                if (dtev == NULL)
                        continue;
 
Index: dev/dt/dt_prov_syscall.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_syscall.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_prov_syscall.c
--- dev/dt/dt_prov_syscall.c    21 Jan 2020 16:16:23 -0000      1.1
+++ dev/dt/dt_prov_syscall.c    19 Mar 2020 14:06:01 -0000
@@ -148,7 +148,7 @@ dt_prov_syscall_entry(struct dt_provider
        SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
                struct dt_evt *dtev;
 
-               dtev = dt_pcb_ring_get(dp);
+               dtev = dt_pcb_ring_get(dp, 0);
                if (dtev == NULL)
                        continue;
 
@@ -190,7 +190,7 @@ dt_prov_syscall_return(struct dt_provide
        SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) {
                struct dt_evt *dtev;
 
-               dtev = dt_pcb_ring_get(dp);
+               dtev = dt_pcb_ring_get(dp, 0);
                if (dtev == NULL)
                        continue;
 
Index: dev/dt/dtvar.h
===================================================================
RCS file: /cvs/src/sys/dev/dt/dtvar.h,v
retrieving revision 1.1
diff -u -p -r1.1 dtvar.h
--- dev/dt/dtvar.h      21 Jan 2020 16:16:23 -0000      1.1
+++ dev/dt/dtvar.h      19 Mar 2020 14:06:01 -0000
@@ -194,7 +194,7 @@ void                 dt_pcb_free(struct dt_pcb *);
 void            dt_pcb_purge(struct dt_pcb_list *);
 int             dt_pcb_filter(struct dt_pcb *);
 
-struct dt_evt  *dt_pcb_ring_get(struct dt_pcb *);
+struct dt_evt  *dt_pcb_ring_get(struct dt_pcb *, int);
 void            dt_pcb_ring_consume(struct dt_pcb *, struct dt_evt *);
 
 /*
Index: sys/stacktrace.h
===================================================================
RCS file: /cvs/src/sys/sys/stacktrace.h,v
retrieving revision 1.1
diff -u -p -r1.1 stacktrace.h
--- sys/stacktrace.h    20 Jan 2020 15:58:23 -0000      1.1
+++ sys/stacktrace.h    19 Mar 2020 14:06:01 -0000
@@ -28,6 +28,7 @@ struct stacktrace {
 
 #ifdef _KERNEL
 void   stacktrace_print(struct stacktrace *, int (*)(const char *, ...));
+void   stacktrace_save_at(struct stacktrace *, void *);
 void   stacktrace_save(struct stacktrace *);
 #endif
 

Reply via email to