Re: [RESEND][PATCH v15 2/4] perf: Support deferred user callchains

Peter Zijlstra Fri, 17 Oct 2025 09:41:28 -0700

On Mon, Sep 08, 2025 at 01:14:14PM -0400, Steven Rostedt wrote:
> +struct perf_callchain_deferred_event {
> +     struct perf_event_header        header;
> +     u64                             cookie;
> +     u64                             nr;
> +     u64                             ips[];
> +};
> +
> +static void perf_event_callchain_deferred(struct callback_head *work)
> +{
> +     struct perf_event *event = container_of(work, struct perf_event, 
> pending_unwind_work);
> +     struct perf_callchain_deferred_event deferred_event;
> +     u64 callchain_context = PERF_CONTEXT_USER;
> +     struct unwind_stacktrace trace;
> +     struct perf_output_handle handle;
> +     struct perf_sample_data data;
> +     u64 nr;
> +
> +     if (!event->pending_unwind_callback)
> +             return;
> +
> +     if (unwind_user_faultable(&trace) < 0)
> +             goto out;
> +
> +     /*
> +      * All accesses to the event must belong to the same implicit RCU
> +      * read-side critical section as the ->pending_unwind_callback reset.
> +      * See comment in perf_pending_unwind_sync().
> +      */
> +     guard(rcu)();
> +
> +     if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
> +             goto out;
> +
> +     nr = trace.nr + 1 ; /* '+1' == callchain_context */
> +
> +     deferred_event.header.type = PERF_RECORD_CALLCHAIN_DEFERRED;
> +     deferred_event.header.misc = PERF_RECORD_MISC_USER;
> +     deferred_event.header.size = sizeof(deferred_event) + (nr * 
> sizeof(u64));
> +
> +     deferred_event.nr = nr;
> +     deferred_event.cookie = unwind_user_get_cookie();
> +
> +     perf_event_header__init_id(&deferred_event.header, &data, event);
> +
> +     if (perf_output_begin(&handle, &data, event, 
> deferred_event.header.size))
> +             goto out;
> +
> +     perf_output_put(&handle, deferred_event);
> +     perf_output_put(&handle, callchain_context);
> +     /* trace.entries[] are not guaranteed to be 64bit */
> +     for (int i = 0; i < trace.nr; i++) {
> +             u64 entry = trace.entries[i];
> +             perf_output_put(&handle, entry);
> +     }
> +     perf_event__output_id_sample(event, &handle, &data);
> +
> +     perf_output_end(&handle);
> +
> +out:
> +     event->pending_unwind_callback = 0;
> +     local_dec(&event->ctx->nr_no_switch_fast);
> +     rcuwait_wake_up(&event->pending_unwind_wait);
> +}
> +


> +/*
> + * Returns:
> +*     > 0 : if already queued.
> + *      0 : if it performed the queuing
> + *    < 0 : if it did not get queued.
> + */
> +static int deferred_request(struct perf_event *event)
> +{
> +     struct callback_head *work = &event->pending_unwind_work;
> +     int pending;
> +     int ret;
> +
> +     /* Only defer for task events */
> +     if (!event->ctx->task)
> +             return -EINVAL;
> +
> +     if ((current->flags & (PF_KTHREAD | PF_USER_WORKER)) ||
> +         !user_mode(task_pt_regs(current)))
> +             return -EINVAL;
> +
> +     guard(irqsave)();
> +
> +     /* callback already pending? */
> +     pending = READ_ONCE(event->pending_unwind_callback);
> +     if (pending)
> +             return 1;
> +
> +     /* Claim the work unless an NMI just now swooped in to do so. */
> +     if (!try_cmpxchg(&event->pending_unwind_callback, &pending, 1))
> +             return 1;
> +
> +     /* The work has been claimed, now schedule it. */
> +     ret = task_work_add(current, work, TWA_RESUME);
> +     if (WARN_ON_ONCE(ret)) {
> +             WRITE_ONCE(event->pending_unwind_callback, 0);
> +             return ret;
> +     }
> +     return 0;
> +}

So the thing that stands out is that you're not actually using the
unwind infrastructure you've previously created. Things like: struct
unwind_work, unwind_deferred_{init,request,cancel}() all go unused, and
instead you seem to have built a parallel set, with similar bugs to the
ones I just had to fix in the unwind_deferred code :/

I'm also not much of a fan of nr_no_switch_fast, and the fact that this
patch is limited to per-task events, and you're then adding another 300+
lines of code to support per-cpu events later on.

Fundamentally we only have one stack-trace per task at any one point. We
can have many events per task and many more per-cpu. Let us stick a
struct unwind_work in task_struct and have the perf callback function
use perf_iterate_sb() to find all events that want delivery (or we
can add another per-perf_event_context list for this purpose).

But duplicating all this seems 'unfortunate'.

Re: [RESEND][PATCH v15 2/4] perf: Support deferred user callchains

Reply via email to