On Tue, Oct 07, 2025 at 05:40:08PM -0400, Steven Rostedt wrote:

>  include/linux/perf_event.h            |   9 +-
>  include/linux/unwind_deferred.h       |  15 ++
>  include/uapi/linux/perf_event.h       |  25 ++-
>  kernel/bpf/stackmap.c                 |   4 +-
>  kernel/events/callchain.c             |  14 +-
>  kernel/events/core.c                  | 362 +++++++++++++++++++++++++++++++++-
>  kernel/unwind/deferred.c              | 283 ++++++++++++++++++++++----
>  tools/include/uapi/linux/perf_event.h |  25 ++-
>  8 files changed, 686 insertions(+), 51 deletions(-)

After staring at this for a while, I mostly threw it all out and wrote
the patch below.

I also have some hackery on the userspace patches to go along with this,
and it all sits in my unwind/cleanup branch.

Trouble is, pretty much every unwind is 510 entries long -- this cannot
be right. I'm sure there's a silly mistake in unwind/user.c but I'm too
tired to find it just now. I'll try again tomorrow.
  
---
 include/linux/perf_event.h            |    2 
 include/linux/unwind_deferred.h       |   12 -----
 include/linux/unwind_deferred_types.h |   13 +++++
 include/uapi/linux/perf_event.h       |   21 ++++++++-
 kernel/bpf/stackmap.c                 |    4 -
 kernel/events/callchain.c             |   14 +++++-
 kernel/events/core.c                  |   79 +++++++++++++++++++++++++++++++++-
 tools/include/uapi/linux/perf_event.h |   21 ++++++++-
 8 files changed, 146 insertions(+), 20 deletions(-)

--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1720,7 +1720,7 @@ extern void perf_callchain_user(struct p
 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-                  u32 max_stack, bool crosstask, bool add_mark);
+                  u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -6,18 +6,6 @@
 #include <linux/unwind_user.h>
 #include <linux/unwind_deferred_types.h>
 
-struct unwind_work;
-
-typedef void (*unwind_callback_t)(struct unwind_work *work,
-                                 struct unwind_stacktrace *trace,
-                                 u64 cookie);
-
-struct unwind_work {
-       struct list_head                list;
-       unwind_callback_t               func;
-       int                             bit;
-};
-
 #ifdef CONFIG_UNWIND_USER
 
 enum {
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -39,4 +39,17 @@ struct unwind_task_info {
        union unwind_task_id    id;
 };
 
+struct unwind_work;
+struct unwind_stacktrace;
+
+typedef void (*unwind_callback_t)(struct unwind_work *work,
+                                 struct unwind_stacktrace *trace,
+                                 u64 cookie);
+
+struct unwind_work {
+       struct list_head                list;
+       unwind_callback_t               func;
+       int                             bit;
+};
+
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -463,7 +463,9 @@ struct perf_event_attr {
                                inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
                                remove_on_exec :  1, /* event is removed from task on exec */
                                sigtrap        :  1, /* send synchronous SIGTRAP on event */
-                               __reserved_1   : 26;
+                               defer_callchain:  1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
+                               defer_output   :  1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
+                               __reserved_1   : 24;
 
        union {
                __u32           wakeup_events;    /* wake up every n events */
@@ -1239,6 +1241,22 @@ enum perf_event_type {
         */
        PERF_RECORD_AUX_OUTPUT_HW_ID            = 21,
 
+       /*
+        * This user callchain capture was deferred until shortly before
+        * returning to user space.  Previous samples would have kernel
+        * callchains only and they need to be stitched with this to make full
+        * callchains.
+        *
+        * struct {
+        *      struct perf_event_header        header;
+        *      u64                             cookie;
+        *      u64                             nr;
+        *      u64                             ips[nr];
+        *      struct sample_id                sample_id;
+        * };
+        */
+       PERF_RECORD_CALLCHAIN_DEFERRED          = 22,
+
        PERF_RECORD_MAX,                        /* non-ABI */
 };
 
@@ -1269,6 +1287,7 @@ enum perf_callchain_context {
        PERF_CONTEXT_HV                         = (__u64)-32,
        PERF_CONTEXT_KERNEL                     = (__u64)-128,
        PERF_CONTEXT_USER                       = (__u64)-512,
+       PERF_CONTEXT_USER_DEFERRED              = (__u64)-640,
 
        PERF_CONTEXT_GUEST                      = (__u64)-2048,
        PERF_CONTEXT_GUEST_KERNEL               = (__u64)-2176,
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_re
                max_depth = sysctl_perf_event_max_stack;
 
        trace = get_perf_callchain(regs, kernel, user, max_depth,
-                                  false, false);
+                                  false, false, 0);
 
        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_re
                trace = get_callchain_entry_for_task(task, max_depth);
        else
                trace = get_perf_callchain(regs, kernel, user, max_depth,
-                                          crosstask, false);
+                                          crosstask, false, 0);
 
        if (unlikely(!trace) || trace->nr < skip) {
                if (may_fault)
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_e
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-                  u32 max_stack, bool crosstask, bool add_mark)
+                  u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
 {
        struct perf_callchain_entry *entry;
        struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs,
                        regs = task_pt_regs(current);
                }
 
+               if (defer_cookie) {
+                       /*
+                        * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+                        * which can be stitched to this one, and add
+                        * the cookie after it (it will be cut off when the
+                        * user stack is copied to the callchain).
+                        */
+                       perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+                       perf_callchain_store_context(&ctx, defer_cookie);
+                       goto exit_put;
+               }
+
                if (add_mark)
                        perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
 
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -56,6 +56,7 @@
 #include <linux/buildid.h>
 #include <linux/task_work.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/unwind_deferred.h>
 
 #include "internal.h"
 
@@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned l
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
+static struct unwind_work perf_unwind_work;
+
 struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
@@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event,
                !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
        /* Disallow cross-task user callchains. */
        bool crosstask = event->ctx->task && event->ctx->task != current;
+       bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+                         event->attr.defer_callchain;
        const u32 max_stack = event->attr.sample_max_stack;
        struct perf_callchain_entry *callchain;
+       u64 defer_cookie;
 
        if (!current->mm)
                user = false;
@@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event,
        if (!kernel && !user)
                return &__empty_callchain;
 
-       callchain = get_perf_callchain(regs, kernel, user,
-                                      max_stack, crosstask, true);
+       if (!(user && defer_user && !crosstask &&
+             unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
+               defer_cookie = 0;
+
+       callchain = get_perf_callchain(regs, kernel, user, max_stack,
+                                      crosstask, true, defer_cookie);
+
        return callchain ?: &__empty_callchain;
 }
 
@@ -10003,6 +10014,67 @@ void perf_event_bpf_event(struct bpf_pro
        perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
 }
 
+struct perf_callchain_deferred_event {
+       struct unwind_stacktrace *trace;
+       struct {
+               struct perf_event_header        header;
+               u64                             cookie;
+               u64                             nr;
+               u64                             ips[];
+       } event;
+};
+
+static void perf_callchain_deferred_output(struct perf_event *event, void *data)
+{
+       struct perf_callchain_deferred_event *deferred_event = data;
+       struct perf_output_handle handle;
+       struct perf_sample_data sample;
+       int ret, size = deferred_event->event.header.size;
+
+       if (!event->attr.defer_output)
+               return;
+
+       /* XXX do we really need sample_id_all for this ??? */
+       perf_event_header__init_id(&deferred_event->event.header, &sample, event);
+
+       ret = perf_output_begin(&handle, &sample, event,
+                               deferred_event->event.header.size);
+       if (ret)
+               goto out;
+
+       perf_output_put(&handle, deferred_event->event);
+       for (int i = 0; i < deferred_event->trace->nr; i++) {
+               u64 entry = deferred_event->trace->entries[i];
+               perf_output_put(&handle, entry);
+       }
+       perf_event__output_id_sample(event, &handle, &sample);
+
+       perf_output_end(&handle);
+out:
+       deferred_event->event.header.size = size;
+}
+
+/* Deferred unwinding callback for task specific events */
+static void perf_unwind_deferred_callback(struct unwind_work *work,
+                                        struct unwind_stacktrace *trace, u64 cookie)
+{
+       struct perf_callchain_deferred_event deferred_event = {
+               .trace = trace,
+               .event = {
+                       .header = {
+                               .type = PERF_RECORD_CALLCHAIN_DEFERRED,
+                               .misc = PERF_RECORD_MISC_USER,
+                               .size = sizeof(deferred_event.event) +
+                                       (trace->nr * sizeof(u64)),
+                       },
+                       .cookie = cookie,
+                       .nr = trace->nr,
+               },
+       };
+
+       perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
+}
+
 struct perf_text_poke_event {
        const void              *old_bytes;
        const void              *new_bytes;
@@ -14799,6 +14871,9 @@ void __init perf_event_init(void)
 
        idr_init(&pmu_idr);
 
+       unwind_deferred_init(&perf_unwind_work,
+                            perf_unwind_deferred_callback);
+
        perf_event_init_all_cpus();
        init_srcu_struct(&pmus_srcu);
        perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -463,7 +463,9 @@ struct perf_event_attr {
                                inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
                                remove_on_exec :  1, /* event is removed from task on exec */
                                sigtrap        :  1, /* send synchronous SIGTRAP on event */
-                               __reserved_1   : 26;
+                               defer_callchain:  1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */
+                               defer_output   :  1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */
+                               __reserved_1   : 24;
 
        union {
                __u32           wakeup_events;    /* wake up every n events */
@@ -1239,6 +1241,22 @@ enum perf_event_type {
         */
        PERF_RECORD_AUX_OUTPUT_HW_ID            = 21,
 
+       /*
+        * This user callchain capture was deferred until shortly before
+        * returning to user space.  Previous samples would have kernel
+        * callchains only and they need to be stitched with this to make full
+        * callchains.
+        *
+        * struct {
+        *      struct perf_event_header        header;
+        *      u64                             cookie;
+        *      u64                             nr;
+        *      u64                             ips[nr];
+        *      struct sample_id                sample_id;
+        * };
+        */
+       PERF_RECORD_CALLCHAIN_DEFERRED          = 22,
+
        PERF_RECORD_MAX,                        /* non-ABI */
 };
 
@@ -1269,6 +1287,7 @@ enum perf_callchain_context {
        PERF_CONTEXT_HV                         = (__u64)-32,
        PERF_CONTEXT_KERNEL                     = (__u64)-128,
        PERF_CONTEXT_USER                       = (__u64)-512,
+       PERF_CONTEXT_USER_DEFERRED              = (__u64)-640,
 
        PERF_CONTEXT_GUEST                      = (__u64)-2048,
        PERF_CONTEXT_GUEST_KERNEL               = (__u64)-2176,

Reply via email to