From: Josh Poimboeuf <jpoim...@kernel.org>

Use the new unwind_deferred_trace() interface (if available) to defer
unwinds to task context. This will allow the use of .sframe (when it
becomes available) and also prevent duplicate userspace unwinds.
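For illustration only (none of this is part of the patch): a minimal
user-space sketch of how a profiler might opt in to deferred callchains,
assuming headers that already carry the defer_callchain bit added below.
Everything else is ordinary perf_event_open() usage and the helper name
is made up.

  #include <linux/perf_event.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Open a self-profiling CPU-clock event that requests deferred user
   * callchains.  Samples then carry only the kernel callchain, ending
   * in PERF_CONTEXT_USER_DEFERRED; the user part follows later as a
   * PERF_RECORD_CALLCHAIN_DEFERRED record for the same tid. */
  static int open_deferred_callchain_event(void)
  {
  	struct perf_event_attr attr;

  	memset(&attr, 0, sizeof(attr));
  	attr.size = sizeof(attr);
  	attr.type = PERF_TYPE_SOFTWARE;
  	attr.config = PERF_COUNT_SW_CPU_CLOCK;
  	attr.sample_period = 100000;
  	attr.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_CALLCHAIN;
  	attr.defer_callchain = 1;	/* new attr bit added by this patch */

  	return syscall(__NR_perf_event_open, &attr, 0 /* self */,
  		       -1 /* any cpu */, -1 /* no group */, 0);
  }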
As struct unwind_stacktrace has its entries as "unsigned long", and it
is copied directly into struct perf_callchain_entry, whose "ip" field
is defined as u64, only allow deferred callchains on 64-bit
architectures.

Suggested-by: Peter Zijlstra <pet...@infradead.org>
Co-developed-by: Steven Rostedt (Google) <rost...@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoim...@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rost...@goodmis.org>
---
Changes since v13: https://lore.kernel.org/20250708020050.928524...@kernel.org

- Rename unwind_deferred_trace() to unwind_user_faultable().
  The function's name was changed, but the stale call wasn't caught
  because the code was moved around in the next patch which also did
  the rename. Compiling this patch as a standalone caught the issue.

- Need to copy the trace.entries[] one at a time as the perf entries in
  the ring buffer are 64 bit, but trace.entries[] are of type long.

 include/linux/perf_event.h            |   7 +-
 include/uapi/linux/perf_event.h       |  19 +++-
 kernel/bpf/stackmap.c                 |   4 +-
 kernel/events/callchain.c             |  11 +-
 kernel/events/core.c                  | 154 +++++++++++++++++++++++++-
 tools/include/uapi/linux/perf_event.h |  19 +++-
 6 files changed, 206 insertions(+), 8 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 54e0d31afcad..c7d474391e51 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -53,6 +53,7 @@
 #include <linux/security.h>
 #include <linux/static_call.h>
 #include <linux/lockdep.h>
+#include <linux/unwind_deferred.h>
 
 #include <asm/local.h>
 
@@ -880,6 +881,10 @@ struct perf_event {
 	struct callback_head		pending_task;
 	unsigned int			pending_work;
 
+	unsigned int			pending_unwind_callback;
+	struct callback_head		pending_unwind_work;
+	struct rcuwait			pending_unwind_wait;
+
 	atomic_t			event_limit;
 
 	/* address range filters */
@@ -1720,7 +1725,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-		   u32 max_stack, bool crosstask, bool add_mark);
+		   u32 max_stack, bool crosstask, bool add_mark, bool defer_user);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 78a362b80027..184740d1e79d 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -463,7 +463,8 @@ struct perf_event_attr {
 				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
 				remove_on_exec :  1, /* event is removed from task on exec */
 				sigtrap        :  1, /* send synchronous SIGTRAP on event */
-				__reserved_1   : 26;
+				defer_callchain:  1, /* generate PERF_RECORD_CALLCHAIN_DEFERRED records */
+				__reserved_1   : 25;
 
 	union {
 		__u32		wakeup_events;	  /* wake up every n events */
@@ -1239,6 +1240,21 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_AUX_OUTPUT_HW_ID		= 21,
 
+	/*
+	 * This user callchain capture was deferred until shortly before
+	 * returning to user space. Previous samples would have kernel
+	 * callchains only and they need to be stitched with this to make full
+	 * callchains.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				nr;
+	 *	u64				ips[nr];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_CALLCHAIN_DEFERRED		= 22,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
@@ -1269,6 +1285,7 @@ enum perf_callchain_context {
 	PERF_CONTEXT_HV				= (__u64)-32,
 	PERF_CONTEXT_KERNEL			= (__u64)-128,
 	PERF_CONTEXT_USER			= (__u64)-512,
+	PERF_CONTEXT_USER_DEFERRED		= (__u64)-640,
 
 	PERF_CONTEXT_GUEST			= (__u64)-2048,
 	PERF_CONTEXT_GUEST_KERNEL		= (__u64)-2176,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index ec3a57a5fba1..339f7cbbcf36 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 		max_depth = sysctl_perf_event_max_stack;
 
 	trace = get_perf_callchain(regs, kernel, user, max_depth,
-				   false, false);
+				   false, false, false);
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 		trace = get_callchain_entry_for_task(task, max_depth);
 	else
 		trace = get_perf_callchain(regs, kernel, user, max_depth,
-					   crosstask, false);
+					   crosstask, false, false);
 
 	if (unlikely(!trace) || trace->nr < skip) {
 		if (may_fault)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 808c0d7a31fa..d0e0da66a164 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-		   u32 max_stack, bool crosstask, bool add_mark)
+		   u32 max_stack, bool crosstask, bool add_mark, bool defer_user)
 {
 	struct perf_callchain_entry *entry;
 	struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,15 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
 			regs = task_pt_regs(current);
 		}
 
+		if (defer_user) {
+			/*
+			 * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+			 * which can be stitched to this one.
+			 */
+			perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+			goto exit_put;
+		}
+
 		if (add_mark)
 			perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bd0a33f389d2..cab5fa238684 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5582,6 +5582,93 @@ static bool exclusive_event_installable(struct perf_event *event,
 	return true;
 }
 
+static void perf_pending_unwind_sync(struct perf_event *event)
+{
+	might_sleep();
+
+	if (!event->pending_unwind_callback)
+		return;
+
+	/*
+	 * If the work is queued to the current task's task_work queue, we
+	 * obviously can't wait for it to complete. Simply cancel it.
+	 */
+	if (task_work_cancel(current, &event->pending_unwind_work)) {
+		event->pending_unwind_callback = 0;
+		local_dec(&event->ctx->nr_no_switch_fast);
+		return;
+	}
+
+	/*
+	 * All accesses related to the event are within the same RCU section in
+	 * perf_event_callchain_deferred(). The RCU grace period before the
+	 * event is freed will make sure all those accesses are complete by then.
+	 */
+	rcuwait_wait_event(&event->pending_unwind_wait, !event->pending_unwind_callback, TASK_UNINTERRUPTIBLE);
+}
+
+struct perf_callchain_deferred_event {
+	struct perf_event_header	header;
+	u64				nr;
+	u64				ips[];
+};
+
+static void perf_event_callchain_deferred(struct callback_head *work)
+{
+	struct perf_event *event = container_of(work, struct perf_event, pending_unwind_work);
+	struct perf_callchain_deferred_event deferred_event;
+	u64 callchain_context = PERF_CONTEXT_USER;
+	struct unwind_stacktrace trace;
+	struct perf_output_handle handle;
+	struct perf_sample_data data;
+	u64 nr;
+
+	if (!event->pending_unwind_callback)
+		return;
+
+	if (unwind_user_faultable(&trace) < 0)
+		goto out;
+
+	/*
+	 * All accesses to the event must belong to the same implicit RCU
+	 * read-side critical section as the ->pending_unwind_callback reset.
+	 * See comment in perf_pending_unwind_sync().
+	 */
+	guard(rcu)();
+
+	if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
+		goto out;
+
+	nr = trace.nr + 1; /* '+1' == callchain_context */
+
+	deferred_event.header.type = PERF_RECORD_CALLCHAIN_DEFERRED;
+	deferred_event.header.misc = PERF_RECORD_MISC_USER;
+	deferred_event.header.size = sizeof(deferred_event) + (nr * sizeof(u64));
+
+	deferred_event.nr = nr;
+
+	perf_event_header__init_id(&deferred_event.header, &data, event);
+
+	if (perf_output_begin(&handle, &data, event, deferred_event.header.size))
+		goto out;
+
+	perf_output_put(&handle, deferred_event);
+	perf_output_put(&handle, callchain_context);
+	/* trace.entries[] are not guaranteed to be 64bit */
+	for (int i = 0; i < trace.nr; i++) {
+		u64 entry = trace.entries[i];
+		perf_output_put(&handle, entry);
+	}
+	perf_event__output_id_sample(event, &handle, &data);
+
+	perf_output_end(&handle);
+
+out:
+	event->pending_unwind_callback = 0;
+	local_dec(&event->ctx->nr_no_switch_fast);
+	rcuwait_wake_up(&event->pending_unwind_wait);
+}
+
 static void perf_free_addr_filters(struct perf_event *event);
 
 /* vs perf_event_alloc() error */
@@ -5649,6 +5736,7 @@ static void _free_event(struct perf_event *event)
 {
 	irq_work_sync(&event->pending_irq);
 	irq_work_sync(&event->pending_disable_irq);
+	perf_pending_unwind_sync(event);
 
 	unaccount_event(event);
 
@@ -8162,6 +8250,46 @@ static u64 perf_get_page_size(unsigned long addr)
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
+/*
+ * Returns:
+ *   > 0 : if already queued.
+ *     0 : if it performed the queuing
+ *   < 0 : if it did not get queued.
+ */
+static int deferred_request(struct perf_event *event)
+{
+	struct callback_head *work = &event->pending_unwind_work;
+	int pending;
+	int ret;
+
+	/* Only defer for task events */
+	if (!event->ctx->task)
+		return -EINVAL;
+
+	if ((current->flags & (PF_KTHREAD | PF_USER_WORKER)) ||
+	    !user_mode(task_pt_regs(current)))
+		return -EINVAL;
+
+	guard(irqsave)();
+
+	/* callback already pending? */
+	pending = READ_ONCE(event->pending_unwind_callback);
+	if (pending)
+		return 1;
+
+	/* Claim the work unless an NMI just now swooped in to do so. */
+	if (!try_cmpxchg(&event->pending_unwind_callback, &pending, 1))
+		return 1;
+
+	/* The work has been claimed, now schedule it. */
+	ret = task_work_add(current, work, TWA_RESUME);
+	if (WARN_ON_ONCE(ret)) {
+		WRITE_ONCE(event->pending_unwind_callback, 0);
+		return ret;
+	}
+	return 0;
+}
+
 struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
@@ -8172,6 +8300,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	bool crosstask = event->ctx->task && event->ctx->task != current;
 	const u32 max_stack = event->attr.sample_max_stack;
 	struct perf_callchain_entry *callchain;
+	/* perf currently only supports deferred in 64bit */
+	bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+			  event->attr.defer_callchain;
 
 	if (!current->mm)
 		user = false;
@@ -8179,8 +8310,21 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	if (!kernel && !user)
 		return &__empty_callchain;
 
-	callchain = get_perf_callchain(regs, kernel, user,
-				       max_stack, crosstask, true);
+	/* Disallow cross-task callchains. */
+	if (event->ctx->task && event->ctx->task != current)
+		return &__empty_callchain;
+
+	if (defer_user) {
+		int ret = deferred_request(event);
+		if (!ret)
+			local_inc(&event->ctx->nr_no_switch_fast);
+		else if (ret < 0)
+			defer_user = false;
+	}
+
+	callchain = get_perf_callchain(regs, kernel, user, max_stack,
+				       crosstask, true, defer_user);
+
 	return callchain ?: &__empty_callchain;
 }
 
@@ -12850,6 +12994,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
 	init_task_work(&event->pending_task, perf_pending_task);
 
+	rcuwait_init(&event->pending_unwind_wait);
+
 	mutex_init(&event->mmap_mutex);
 	raw_spin_lock_init(&event->addr_filters.lock);
 
@@ -13018,6 +13164,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (err)
 		return ERR_PTR(err);
 
+	if (event->attr.defer_callchain)
+		init_task_work(&event->pending_unwind_work,
+			       perf_event_callchain_deferred);
+
 	/* symmetric to unaccount_event() in _free_event() */
 	account_event(event);
 
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 78a362b80027..184740d1e79d 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -463,7 +463,8 @@ struct perf_event_attr {
 				inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
 				remove_on_exec :  1, /* event is removed from task on exec */
 				sigtrap        :  1, /* send synchronous SIGTRAP on event */
-				__reserved_1   : 26;
+				defer_callchain:  1, /* generate PERF_RECORD_CALLCHAIN_DEFERRED records */
+				__reserved_1   : 25;
 
 	union {
 		__u32		wakeup_events;	  /* wake up every n events */
@@ -1239,6 +1240,21 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_AUX_OUTPUT_HW_ID		= 21,
 
+	/*
+	 * This user callchain capture was deferred until shortly before
+	 * returning to user space. Previous samples would have kernel
+	 * callchains only and they need to be stitched with this to make full
+	 * callchains.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				nr;
+	 *	u64				ips[nr];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_CALLCHAIN_DEFERRED		= 22,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
@@ -1269,6 +1285,7 @@ enum perf_callchain_context {
 	PERF_CONTEXT_HV				= (__u64)-32,
 	PERF_CONTEXT_KERNEL			= (__u64)-128,
 	PERF_CONTEXT_USER			= (__u64)-512,
+	PERF_CONTEXT_USER_DEFERRED		= (__u64)-640,
 
 	PERF_CONTEXT_GUEST			= (__u64)-2048,
 	PERF_CONTEXT_GUEST_KERNEL		= (__u64)-2176,
-- 
2.47.2
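
Not part of the patch, for reviewers only: a rough user-space sketch of
the stitching described in the PERF_RECORD_CALLCHAIN_DEFERRED comment
above. The record layout mirrors the uapi comment; the helper name, the
buffering and the per-tid matching are hypothetical and left to the tool.

  #include <stdint.h>
  #include <string.h>

  #define PERF_CONTEXT_USER_DEFERRED	((uint64_t)-640)

  /* Body of a PERF_RECORD_CALLCHAIN_DEFERRED record (sample_id trailer
   * omitted), matching the layout documented in the uapi comment. */
  struct deferred_callchain {
  	uint64_t nr;
  	uint64_t ips[];
  };

  /*
   * Append the deferred user ips[] (which begin with PERF_CONTEXT_USER)
   * to the kernel-only callchain saved from the last PERF_RECORD_SAMPLE
   * of the same tid.  'chain' is assumed to have room for nr + ev->nr
   * entries; returns the stitched length.
   */
  static uint64_t stitch_deferred(uint64_t *chain, uint64_t nr,
  				const struct deferred_callchain *ev)
  {
  	/* Drop the PERF_CONTEXT_USER_DEFERRED placeholder, if present. */
  	if (nr && chain[nr - 1] == PERF_CONTEXT_USER_DEFERRED)
  		nr--;

  	memcpy(&chain[nr], ev->ips, ev->nr * sizeof(ev->ips[0]));
  	return nr + ev->nr;
  }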