Introduce helper bpf_get_task_stack(), which dumps stack trace of given
task. This is different to bpf_get_stack(), which gets stack track of
current task. One potential use case of bpf_get_task_stack() is to call
it from bpf_iter__task and dump all /proc/<pid>/stack to a seq_file.

bpf_get_task_stack() uses stack_trace_save_tsk() instead of
get_perf_callchain() for kernel stack. The benefit of this choice is that
stack_trace_save_tsk() doesn't require changes in arch/. The downside of
using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the
stack trace to unsigned long array. For 32-bit systems, we need to
translate it to u64 array.

Acked-by: Andrii Nakryiko <andr...@fb.com>
Signed-off-by: Song Liu <songliubrav...@fb.com>
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       | 36 +++++++++++++++-
 kernel/bpf/stackmap.c          | 77 ++++++++++++++++++++++++++++++++--
 kernel/bpf/verifier.c          |  4 +-
 kernel/trace/bpf_trace.c       |  2 +
 scripts/bpf_helpers_doc.py     |  2 +
 tools/include/uapi/linux/bpf.h | 36 +++++++++++++++-
 7 files changed, 151 insertions(+), 7 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3d2ade703a357..0cd7f6884c5cd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1627,6 +1627,7 @@ extern const struct bpf_func_proto 
bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
+extern const struct bpf_func_proto bpf_get_task_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0cb8ec9488168..cefb78a77d928 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3285,6 +3285,39 @@ union bpf_attr {
  *             Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
  *     Return
  *             *sk* if casting is valid, or NULL otherwise.
+ *
+ * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 
flags)
+ *     Description
+ *             Return a user or a kernel stack in bpf program provided buffer.
+ *             To achieve this, the helper needs *task*, which is a valid
+ *             pointer to struct task_struct. To store the stacktrace, the
+ *             bpf program provides *buf* with a nonnegative *size*.
+ *
+ *             The last argument, *flags*, holds the number of stack frames to
+ *             skip (from 0 to 255), masked with
+ *             **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *             the following flags:
+ *
+ *             **BPF_F_USER_STACK**
+ *                     Collect a user space stack instead of a kernel stack.
+ *             **BPF_F_USER_BUILD_ID**
+ *                     Collect buildid+offset instead of ips for user stack,
+ *                     only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *             **bpf_get_task_stack**\ () can collect up to
+ *             **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *             to sufficient large buffer size. Note that
+ *             this limit can be controlled with the **sysctl** program, and
+ *             that it should be manually increased in order to profile long
+ *             user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *             ::
+ *
+ *                     # sysctl kernel.perf_event_max_stack=<new value>
+ *     Return
+ *             A non-negative value equal to or less than *size* on success,
+ *             or a negative error in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -3427,7 +3460,8 @@ union bpf_attr {
        FN(skc_to_tcp_sock),            \
        FN(skc_to_tcp_timewait_sock),   \
        FN(skc_to_tcp_request_sock),    \
-       FN(skc_to_udp6_sock),
+       FN(skc_to_udp6_sock),           \
+       FN(get_task_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 27dc9b1b08a52..0ba66b29ef227 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -348,6 +348,40 @@ static void stack_map_get_build_id_offset(struct 
bpf_stack_build_id *id_offs,
        }
 }
 
+static struct perf_callchain_entry *
+get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+{
+       struct perf_callchain_entry *entry;
+       int rctx;
+
+       entry = get_callchain_entry(&rctx);
+
+       if (!entry)
+               return NULL;
+
+       entry->nr = init_nr +
+               stack_trace_save_tsk(task, (unsigned long *)(entry->ip + 
init_nr),
+                                    sysctl_perf_event_max_stack - init_nr, 0);
+
+       /* stack_trace_save_tsk() works on unsigned long array, while
+        * perf_callchain_entry uses u64 array. For 32-bit systems, it is
+        * necessary to fix this mismatch.
+        */
+       if (__BITS_PER_LONG != 64) {
+               unsigned long *from = (unsigned long *) entry->ip;
+               u64 *to = entry->ip;
+               int i;
+
+               /* copy data from the end to avoid using extra buffer */
+               for (i = entry->nr - 1; i >= (int)init_nr; i--)
+                       to[i] = (u64)(from[i]);
+       }
+
+       put_callchain_entry(rctx);
+
+       return entry;
+}
+
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
           u64, flags)
 {
@@ -448,8 +482,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
        .arg3_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
-          u64, flags)
+static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
+                           void *buf, u32 size, u64 flags)
 {
        u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
        bool user_build_id = flags & BPF_F_USER_BUILD_ID;
@@ -471,13 +505,22 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, 
buf, u32, size,
        if (unlikely(size % elem_size))
                goto clear;
 
+       /* cannot get valid user stack for task without user_mode regs */
+       if (task && user && !user_mode(regs))
+               goto err_fault;
+
        num_elem = size / elem_size;
        if (sysctl_perf_event_max_stack < num_elem)
                init_nr = 0;
        else
                init_nr = sysctl_perf_event_max_stack - num_elem;
-       trace = get_perf_callchain(regs, init_nr, kernel, user,
-                                  sysctl_perf_event_max_stack, false, false);
+
+       if (kernel && task)
+               trace = get_callchain_entry_for_task(task, init_nr);
+       else
+               trace = get_perf_callchain(regs, init_nr, kernel, user,
+                                          sysctl_perf_event_max_stack,
+                                          false, false);
        if (unlikely(!trace))
                goto err_fault;
 
@@ -505,6 +548,12 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, 
buf, u32, size,
        return err;
 }
 
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+          u64, flags)
+{
+       return __bpf_get_stack(regs, NULL, buf, size, flags);
+}
+
 const struct bpf_func_proto bpf_get_stack_proto = {
        .func           = bpf_get_stack,
        .gpl_only       = true,
@@ -515,6 +564,26 @@ const struct bpf_func_proto bpf_get_stack_proto = {
        .arg4_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+          u32, size, u64, flags)
+{
+       struct pt_regs *regs = task_pt_regs(task);
+
+       return __bpf_get_stack(regs, task, buf, size, flags);
+}
+
+static int bpf_get_task_stack_btf_ids[5];
+const struct bpf_func_proto bpf_get_task_stack_proto = {
+       .func           = bpf_get_task_stack,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID,
+       .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
+       .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+       .arg4_type      = ARG_ANYTHING,
+       .btf_id         = bpf_get_task_stack_btf_ids,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7de98906ddf4a..b608185e1ffd5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4864,7 +4864,9 @@ static int check_helper_call(struct bpf_verifier_env 
*env, int func_id, int insn
        if (err)
                return err;
 
-       if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+       if ((func_id == BPF_FUNC_get_stack ||
+            func_id == BPF_FUNC_get_task_stack) &&
+           !env->prog->has_callchain_buf) {
                const char *err_str;
 
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 5d59dda5f6615..977ba3b6f6c64 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1137,6 +1137,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const 
struct bpf_prog *prog)
                return &bpf_ringbuf_query_proto;
        case BPF_FUNC_jiffies64:
                return &bpf_jiffies64_proto;
+       case BPF_FUNC_get_task_stack:
+               return &bpf_get_task_stack_proto;
        default:
                return NULL;
        }
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 6bab40ff442e8..6843376733df8 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -426,6 +426,7 @@ class PrinterHelpers(Printer):
             'struct tcp_timewait_sock',
             'struct tcp_request_sock',
             'struct udp6_sock',
+            'struct task_struct',
 
             'struct __sk_buff',
             'struct sk_msg_md',
@@ -468,6 +469,7 @@ class PrinterHelpers(Printer):
             'struct tcp_timewait_sock',
             'struct tcp_request_sock',
             'struct udp6_sock',
+            'struct task_struct',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0cb8ec9488168..cefb78a77d928 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3285,6 +3285,39 @@ union bpf_attr {
  *             Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
  *     Return
  *             *sk* if casting is valid, or NULL otherwise.
+ *
+ * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 
flags)
+ *     Description
+ *             Return a user or a kernel stack in bpf program provided buffer.
+ *             To achieve this, the helper needs *task*, which is a valid
+ *             pointer to struct task_struct. To store the stacktrace, the
+ *             bpf program provides *buf* with a nonnegative *size*.
+ *
+ *             The last argument, *flags*, holds the number of stack frames to
+ *             skip (from 0 to 255), masked with
+ *             **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *             the following flags:
+ *
+ *             **BPF_F_USER_STACK**
+ *                     Collect a user space stack instead of a kernel stack.
+ *             **BPF_F_USER_BUILD_ID**
+ *                     Collect buildid+offset instead of ips for user stack,
+ *                     only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ *             **bpf_get_task_stack**\ () can collect up to
+ *             **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ *             to sufficient large buffer size. Note that
+ *             this limit can be controlled with the **sysctl** program, and
+ *             that it should be manually increased in order to profile long
+ *             user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *             ::
+ *
+ *                     # sysctl kernel.perf_event_max_stack=<new value>
+ *     Return
+ *             A non-negative value equal to or less than *size* on success,
+ *             or a negative error in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -3427,7 +3460,8 @@ union bpf_attr {
        FN(skc_to_tcp_sock),            \
        FN(skc_to_tcp_timewait_sock),   \
        FN(skc_to_tcp_request_sock),    \
-       FN(skc_to_udp6_sock),
+       FN(skc_to_udp6_sock),           \
+       FN(get_task_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
2.24.1

Reply via email to