From: Josh Poimboeuf <[email protected]>

If the user fault unwind is available (the one that will be used for
sframes), allow perf to make use of it. Currently all user stack traces
are captured at the request site, which is mostly in interrupt or NMI
context where user space is only accessible if it is currently present
in memory. It is possible that the user stack was swapped out and is not
present, and more importantly, the use of sframes will require faulting
in user pages, which is not possible from interrupt context. Instead,
add a framework that delays the reading of the user space stack until
the task returns to user space, where faulting in pages is possible.
This is also advantageous because the user space stack does not change
while the task is in the kernel, so deferring the unwind also removes
the duplicate user stack entries that would otherwise be recorded for a
long running system call being profiled.

A new perf context called PERF_CONTEXT_USER_DEFERRED is created. It is
added to the kernel callchain, usually when an interrupt or NMI is
triggered (but it can be added to any callchain). When a deferred unwind
is required, perf uses the new deferred unwind infrastructure.
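
For illustration only (the addresses and number of frames are made up;
the context markers come from this patch and the existing uapi header),
the callchain of one deferred sample could look like this:

  #include <linux/perf_event.h>   /* uapi header with PERF_CONTEXT_* */

  /*
   * Hypothetical contents of a sample's callchain when
   * attr.defer_callchain is set: the user frames are replaced by a
   * single PERF_CONTEXT_USER_DEFERRED marker and are delivered later
   * in a PERF_RECORD_CALLCHAIN_DEFERRED record.
   */
  static const __u64 example_deferred_callchain[] = {
          PERF_CONTEXT_KERNEL,
          0xffffffff81234560ULL,          /* kernel return addresses */
          0xffffffff8110a2f0ULL,
          PERF_CONTEXT_USER_DEFERRED,     /* user frames come later */
  };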

When tracing a single task and a user stack trace is required, perf
calls unwind_deferred_request(). This triggers a task_work that, when
the task exits the kernel back to user space, calls the perf function
perf_event_deferred_task() with the user stack trace and a cookie (an
identifier for that stack trace).
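
A condensed sketch of that lifecycle (not literal kernel code; the real
functions are in the diff below, and the example_* wrappers are only
illustrative):

  /* Runs from task_work when the task exits back to user space. */
  static void perf_event_deferred_task(struct unwind_work *work,
                                       struct unwind_stacktrace *trace,
                                       u64 cookie)
  {
          /* emit PERF_RECORD_CALLCHAIN_DEFERRED with trace and cookie */
  }

  /* perf_event_alloc(), task events only: register the callback. */
  static int example_init(struct perf_event *event)
  {
          return unwind_deferred_task_init(&event->unwind_work,
                                           perf_event_deferred_task);
  }

  /* perf_callchain(), usually NMI context: queue the deferred unwind. */
  static int example_request(struct perf_event *event)
  {
          u64 cookie;

          return unwind_deferred_request(&event->unwind_work, &cookie);
  }

  /* _free_event(): make sure no deferred request is still pending. */
  static void example_teardown(struct perf_event *event)
  {
          unwind_deferred_cancel(&event->unwind_work);
  }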

The user stack trace is emitted in a new record type called
PERF_RECORD_CALLCHAIN_DEFERRED. The perf user space tool will need to
attach this stack trace to each of the preceding kernel callchains for
that task that contain the PERF_CONTEXT_USER_DEFERRED context.
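
A hypothetical user space helper (not part of this patch; the name and
the matching policy are up to the tool, and it assumes the updated uapi
header) could splice the two pieces together like this:

  #include <stdlib.h>
  #include <string.h>
  #include <linux/perf_event.h>

  /*
   * Replace the PERF_CONTEXT_USER_DEFERRED marker at the tail of a
   * previously saved kernel callchain with the ips[] of a later
   * PERF_RECORD_CALLCHAIN_DEFERRED record from the same task.
   * deferred_ips[0] is PERF_CONTEXT_USER, followed by the user frames.
   * Returns a malloc()ed array of *out_nr entries, or NULL.
   */
  static __u64 *stitch_deferred(const __u64 *kchain, __u64 knr,
                                const __u64 *deferred_ips, __u64 dnr,
                                __u64 *out_nr)
  {
          __u64 *out;

          if (!knr || kchain[knr - 1] != PERF_CONTEXT_USER_DEFERRED)
                  return NULL;

          out = malloc((knr - 1 + dnr) * sizeof(*out));
          if (!out)
                  return NULL;

          memcpy(out, kchain, (knr - 1) * sizeof(*out));
          memcpy(out + knr - 1, deferred_ips, dnr * sizeof(*out));

          *out_nr = knr - 1 + dnr;
          return out;
  }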

Suggested-by: Peter Zijlstra <[email protected]>
Co-developed-by: Steven Rostedt (Google) <[email protected]>
Signed-off-by: Josh Poimboeuf <[email protected]>
Signed-off-by: Steven Rostedt (Google) <[email protected]>
---
Changes since v15: https://lore.kernel.org/[email protected]

- Peter Zijlstra pointed out that the code mostly duplicated the unwind
  infrastructure and shared the same bugs. The unwind infrastructure was
  updated to allow a tracer to use it for a single task, and the perf
  code now uses that, which greatly simplifies this version compared to
  the previous one.

 include/linux/perf_event.h            |   5 +-
 include/uapi/linux/perf_event.h       |  20 ++++-
 kernel/bpf/stackmap.c                 |   4 +-
 kernel/events/callchain.c             |  11 ++-
 kernel/events/core.c                  | 110 +++++++++++++++++++++++++-
 tools/include/uapi/linux/perf_event.h |  20 ++++-
 6 files changed, 162 insertions(+), 8 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fd1d91017b99..152e3dacff98 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -53,6 +53,7 @@
 #include <linux/security.h>
 #include <linux/static_call.h>
 #include <linux/lockdep.h>
+#include <linux/unwind_deferred.h>
 
 #include <asm/local.h>
 
@@ -880,6 +881,8 @@ struct perf_event {
        struct callback_head            pending_task;
        unsigned int                    pending_work;
 
+       struct unwind_work              unwind_work;
+
        atomic_t                        event_limit;
 
        /* address range filters */
@@ -1720,7 +1723,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p
 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-                  u32 max_stack, bool crosstask, bool add_mark);
+                  u32 max_stack, bool crosstask, bool add_mark, bool defer_user);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 78a362b80027..20b8f890113b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -463,7 +463,8 @@ struct perf_event_attr {
                                inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
                                remove_on_exec :  1, /* event is removed from task on exec */
                                sigtrap        :  1, /* send synchronous SIGTRAP on event */
-                               __reserved_1   : 26;
+                               defer_callchain:  1, /* generate PERF_RECORD_CALLCHAIN_DEFERRED records */
+                               __reserved_1   : 25;
 
        union {
                __u32           wakeup_events;    /* wake up every n events */
@@ -1239,6 +1240,22 @@ enum perf_event_type {
         */
        PERF_RECORD_AUX_OUTPUT_HW_ID            = 21,
 
+       /*
+        * This user callchain capture was deferred until shortly before
+        * returning to user space.  Previous samples would have kernel
+        * callchains only and they need to be stitched with this to make full
+        * callchains.
+        *
+        * struct {
+        *      struct perf_event_header        header;
+        *      u64                             cookie;
+        *      u64                             nr;
+        *      u64                             ips[nr];
+        *      struct sample_id                sample_id;
+        * };
+        */
+       PERF_RECORD_CALLCHAIN_DEFERRED          = 22,
+
        PERF_RECORD_MAX,                        /* non-ABI */
 };
 
@@ -1269,6 +1286,7 @@ enum perf_callchain_context {
        PERF_CONTEXT_HV                         = (__u64)-32,
        PERF_CONTEXT_KERNEL                     = (__u64)-128,
        PERF_CONTEXT_USER                       = (__u64)-512,
+       PERF_CONTEXT_USER_DEFERRED              = (__u64)-640,
 
        PERF_CONTEXT_GUEST                      = (__u64)-2048,
        PERF_CONTEXT_GUEST_KERNEL               = (__u64)-2176,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index ec3a57a5fba1..339f7cbbcf36 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
                max_depth = sysctl_perf_event_max_stack;
 
        trace = get_perf_callchain(regs, kernel, user, max_depth,
-                                  false, false);
+                                  false, false, false);
 
        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
                trace = get_callchain_entry_for_task(task, max_depth);
        else
                trace = get_perf_callchain(regs, kernel, user, max_depth,
-                                          crosstask, false);
+                                          crosstask, false, false);
 
        if (unlikely(!trace) || trace->nr < skip) {
                if (may_fault)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 808c0d7a31fa..d0e0da66a164 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-                  u32 max_stack, bool crosstask, bool add_mark)
+                  u32 max_stack, bool crosstask, bool add_mark, bool defer_user)
 {
        struct perf_callchain_entry *entry;
        struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,15 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
                        regs = task_pt_regs(current);
                }
 
+               if (defer_user) {
+                       /*
+                        * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+                        * which can be stitched to this one.
+                        */
+                       perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+                       goto exit_put;
+               }
+
                if (add_mark)
                        perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 28de3baff792..be94b437e7e0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5582,6 +5582,67 @@ static bool exclusive_event_installable(struct perf_event *event,
        return true;
 }
 
+static void perf_pending_unwind_sync(struct perf_event *event)
+{
+       struct unwind_work *work = &event->unwind_work;
+
+       unwind_deferred_cancel(work);
+}
+
+struct perf_callchain_deferred_event {
+       struct perf_event_header        header;
+       u64                             cookie;
+       u64                             nr;
+       u64                             ips[];
+};
+
+static void perf_event_callchain_deferred(struct perf_event *event,
+                                         struct unwind_stacktrace *trace,
+                                         u64 cookie)
+{
+       struct perf_callchain_deferred_event deferred_event;
+       u64 callchain_context = PERF_CONTEXT_USER;
+       struct perf_output_handle handle;
+       struct perf_sample_data data;
+       u64 nr;
+
+       nr = trace->nr + 1; /* '+1' == callchain_context */
+
+       deferred_event.header.type = PERF_RECORD_CALLCHAIN_DEFERRED;
+       deferred_event.header.misc = PERF_RECORD_MISC_USER;
+       deferred_event.header.size = sizeof(deferred_event) + (nr * sizeof(u64));
+
+       deferred_event.nr = nr;
+       deferred_event.cookie = cookie;
+
+       perf_event_header__init_id(&deferred_event.header, &data, event);
+
+       if (perf_output_begin(&handle, &data, event, deferred_event.header.size))
+               return;
+
+       perf_output_put(&handle, deferred_event);
+       perf_output_put(&handle, callchain_context);
+       /* trace->entries[] are not guaranteed to be 64bit */
+       for (int i = 0; i < trace->nr; i++) {
+               u64 entry = trace->entries[i];
+               perf_output_put(&handle, entry);
+       }
+       perf_event__output_id_sample(event, &handle, &data);
+
+       perf_output_end(&handle);
+}
+
+/* Deferred unwinding callback for task specific events */
+static void perf_event_deferred_task(struct unwind_work *work,
+                                    struct unwind_stacktrace *trace, u64 cookie)
+{
+       struct perf_event *event = container_of(work, struct perf_event, unwind_work);
+
+       perf_event_callchain_deferred(event, trace, cookie);
+
+       local_dec(&event->ctx->nr_no_switch_fast);
+}
+
 static void perf_free_addr_filters(struct perf_event *event);
 
 /* vs perf_event_alloc() error */
@@ -5649,6 +5710,7 @@ static void _free_event(struct perf_event *event)
 {
        irq_work_sync(&event->pending_irq);
        irq_work_sync(&event->pending_disable_irq);
+       perf_pending_unwind_sync(event);
 
        unaccount_event(event);
 
@@ -8194,6 +8256,28 @@ static u64 perf_get_page_size(unsigned long addr)
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
+/*
+ * Returns:
+ *    > 0 : if already queued.
+ *      0 : if it performed the queuing
+ *    < 0 : if it did not get queued.
+ */
+static int deferred_request(struct perf_event *event)
+{
+       struct unwind_work *work = &event->unwind_work;
+       u64 cookie;
+
+       /* Only defer for task events */
+       if (!event->ctx->task)
+               return -EINVAL;
+
+       if ((current->flags & (PF_KTHREAD | PF_USER_WORKER)) ||
+           !user_mode(task_pt_regs(current)))
+               return -EINVAL;
+
+       return unwind_deferred_request(work, &cookie);
+}
+
 struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
@@ -8204,6 +8288,8 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
        bool crosstask = event->ctx->task && event->ctx->task != current;
        const u32 max_stack = event->attr.sample_max_stack;
        struct perf_callchain_entry *callchain;
+       bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+                         event->attr.defer_callchain;
 
        if (!current->mm)
                user = false;
@@ -8211,8 +8297,21 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
        if (!kernel && !user)
                return &__empty_callchain;
 
-       callchain = get_perf_callchain(regs, kernel, user,
-                                      max_stack, crosstask, true);
+       /* Disallow cross-task callchains. */
+       if (event->ctx->task && event->ctx->task != current)
+               return &__empty_callchain;
+
+       if (defer_user) {
+               int ret = deferred_request(event);
+               if (!ret)
+                       local_inc(&event->ctx->nr_no_switch_fast);
+               else if (ret < 0)
+                       defer_user = false;
+       }
+
+       callchain = get_perf_callchain(regs, kernel, user, max_stack,
+                                      crosstask, true, defer_user);
+
        return callchain ?: &__empty_callchain;
 }
 
@@ -13046,6 +13145,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                }
        }
 
+       if (event->attr.defer_callchain) {
+               if (task) {
+                       err = unwind_deferred_task_init(&event->unwind_work,
+                                                       perf_event_deferred_task);
+               }
+       }
+
        err = security_perf_event_alloc(event);
        if (err)
                return ERR_PTR(err);
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 78a362b80027..20b8f890113b 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -463,7 +463,8 @@ struct perf_event_attr {
                                inherit_thread :  1, /* children only inherit if cloned with CLONE_THREAD */
                                remove_on_exec :  1, /* event is removed from task on exec */
                                sigtrap        :  1, /* send synchronous SIGTRAP on event */
-                               __reserved_1   : 26;
+                               defer_callchain:  1, /* generate PERF_RECORD_CALLCHAIN_DEFERRED records */
+                               __reserved_1   : 25;
 
        union {
                __u32           wakeup_events;    /* wake up every n events */
@@ -1239,6 +1240,22 @@ enum perf_event_type {
         */
        PERF_RECORD_AUX_OUTPUT_HW_ID            = 21,
 
+       /*
+        * This user callchain capture was deferred until shortly before
+        * returning to user space.  Previous samples would have kernel
+        * callchains only and they need to be stitched with this to make full
+        * callchains.
+        *
+        * struct {
+        *      struct perf_event_header        header;
+        *      u64                             cookie;
+        *      u64                             nr;
+        *      u64                             ips[nr];
+        *      struct sample_id                sample_id;
+        * };
+        */
+       PERF_RECORD_CALLCHAIN_DEFERRED          = 22,
+
        PERF_RECORD_MAX,                        /* non-ABI */
 };
 
@@ -1269,6 +1286,7 @@ enum perf_callchain_context {
        PERF_CONTEXT_HV                         = (__u64)-32,
        PERF_CONTEXT_KERNEL                     = (__u64)-128,
        PERF_CONTEXT_USER                       = (__u64)-512,
+       PERF_CONTEXT_USER_DEFERRED              = (__u64)-640,
 
        PERF_CONTEXT_GUEST                      = (__u64)-2048,
        PERF_CONTEXT_GUEST_KERNEL               = (__u64)-2176,
-- 
2.50.1
