[PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls

Alexei Starovoitov Mon, 09 Feb 2015 19:50:26 -0800

User interface:
struct perf_event_attr attr = {.type = PERF_TYPE_TRACEPOINT, .config = 
event_id, ...};
event_fd = perf_event_open(&attr,...);
ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);


prog_fd is a file descriptor associated with eBPF program previously loaded.
event_id is an ID of static tracepoint event or syscall.
(kprobe support is in next patch)

close(event_fd) - automatically detaches eBPF program from it

eBPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- fetch_ptr/u64/u32/u16/u8 values from unsafe address via probe_kernel_read(),
  so that eBPF program can walk any kernel data structures
- probe_memcmp - combination of probe_kernel_read() and memcmp()

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
 include/linux/bpf.h             |    6 +-
 include/linux/ftrace_event.h    |   11 +++
 include/trace/bpf_trace.h       |   25 +++++++
 include/trace/ftrace.h          |   31 +++++++++
 include/uapi/linux/bpf.h        |    7 ++
 include/uapi/linux/perf_event.h |    1 +
 kernel/events/core.c            |   55 +++++++++++++++
 kernel/trace/Makefile           |    1 +
 kernel/trace/bpf_trace.c        |  145 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_syscalls.c   |   35 ++++++++++
 10 files changed, 316 insertions(+), 1 deletion(-)
 create mode 100644 include/trace/bpf_trace.h
 create mode 100644 kernel/trace/bpf_trace.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbfceb756452..a0f6f636ced0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -130,10 +130,14 @@ struct bpf_prog_aux {
 
 #ifdef CONFIG_BPF_SYSCALL
 void bpf_prog_put(struct bpf_prog *prog);
+struct bpf_prog *bpf_prog_get(u32 ufd);
 #else
 static inline void bpf_prog_put(struct bpf_prog *prog) {}
+static inline struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+       return ERR_PTR(-ENOENT);
+}
 #endif
-struct bpf_prog *bpf_prog_get(u32 ufd);
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0bebb5c348b8..479d0a4a42b3 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -13,6 +13,7 @@ struct trace_array;
 struct trace_buffer;
 struct tracer;
 struct dentry;
+struct bpf_prog;
 
 struct trace_print_flags {
        unsigned long           mask;
@@ -299,6 +300,7 @@ struct ftrace_event_call {
 #ifdef CONFIG_PERF_EVENTS
        int                             perf_refcount;
        struct hlist_head __percpu      *perf_events;
+       struct bpf_prog                 *prog;
 
        int     (*perf_perm)(struct ftrace_event_call *,
                             struct perf_event *);
@@ -544,6 +546,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file 
*file,
                event_triggers_post_call(file, tt);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+       return 1;
+}
+#endif
+
 enum {
        FILTER_OTHER = 0,
        FILTER_STATIC_STRING,
diff --git a/include/trace/bpf_trace.h b/include/trace/bpf_trace.h
new file mode 100644
index 000000000000..4e64f61f484d
--- /dev/null
+++ b/include/trace/bpf_trace.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _LINUX_KERNEL_BPF_TRACE_H
+#define _LINUX_KERNEL_BPF_TRACE_H
+
+/* For tracepoint filters argN fields match one to one to arguments
+ * passed to tracepoint events
+ *
+ * For syscall entry filters argN fields match syscall arguments
+ * For syscall exit filters arg1 is a return value
+ */
+struct bpf_context {
+       u64 arg1;
+       u64 arg2;
+       u64 arg3;
+       u64 arg4;
+       u64 arg5;
+       u64 arg6;
+};
+
+#endif /* _LINUX_KERNEL_BPF_TRACE_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 139b5067345b..4c275ce2dcf0 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -17,6 +17,7 @@
  */
 
 #include <linux/ftrace_event.h>
+#include <trace/bpf_trace.h>
 
 /*
  * DECLARE_EVENT_CLASS can be used to add a generic function
@@ -755,12 +756,32 @@ __attribute__((section("_ftrace_events"))) 
*__event_##call = &event_##call
 #undef __perf_task
 #define __perf_task(t) (__task = (t))
 
+/* zero extend integer, pointer or aggregate type to u64 without warnings */
+#define __CAST_TO_U64(EXPR) ({ \
+       u64 ret = 0; \
+       typeof(EXPR) expr = EXPR; \
+       switch (sizeof(expr)) { \
+       case 8: ret = *(u64 *) &expr; break; \
+       case 4: ret = *(u32 *) &expr; break; \
+       case 2: ret = *(u16 *) &expr; break; \
+       case 1: ret = *(u8 *) &expr; break; \
+       } \
+       ret; })
+
+#define __BPF_CAST1(a,...) __CAST_TO_U64(a)
+#define __BPF_CAST2(a,...) __CAST_TO_U64(a), __BPF_CAST1(__VA_ARGS__)
+#define __BPF_CAST3(a,...) __CAST_TO_U64(a), __BPF_CAST2(__VA_ARGS__)
+#define __BPF_CAST4(a,...) __CAST_TO_U64(a), __BPF_CAST3(__VA_ARGS__)
+#define __BPF_CAST5(a,...) __CAST_TO_U64(a), __BPF_CAST4(__VA_ARGS__)
+#define __BPF_CAST6(a,...) __CAST_TO_U64(a), __BPF_CAST5(__VA_ARGS__)
+
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
 static notrace void                                                    \
 perf_trace_##call(void *__data, proto)                                 \
 {                                                                      \
        struct ftrace_event_call *event_call = __data;                  \
+       struct bpf_prog *prog = event_call->prog;                       \
        struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
        struct ftrace_raw_##call *entry;                                \
        struct pt_regs __regs;                                          \
@@ -771,6 +792,16 @@ perf_trace_##call(void *__data, proto)                     
                \
        int __data_size;                                                \
        int rctx;                                                       \
                                                                        \
+       if (prog) {                                                     \
+               __maybe_unused const u64 z = 0;                         \
+               struct bpf_context __ctx = ((struct bpf_context) {      \
+                               __BPF_CAST6(args, z, z, z, z, z)        \
+                       });                                             \
+                                                                       \
+               if (!trace_call_bpf(prog, &__ctx))                      \
+                       return;                                         \
+       }                                                               \
+                                                                       \
        __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
                                                                        \
        head = this_cpu_ptr(event_call->perf_events);                   \
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..d73d7d0abe6e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 enum bpf_prog_type {
        BPF_PROG_TYPE_UNSPEC,
        BPF_PROG_TYPE_SOCKET_FILTER,
+       BPF_PROG_TYPE_TRACEPOINT,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -162,6 +163,12 @@ enum bpf_func_id {
        BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
        BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, 
flags) */
        BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+       BPF_FUNC_fetch_ptr,       /* void *bpf_fetch_ptr(void *unsafe_ptr) */
+       BPF_FUNC_fetch_u64,       /* u64 bpf_fetch_u64(void *unsafe_ptr) */
+       BPF_FUNC_fetch_u32,       /* u32 bpf_fetch_u32(void *unsafe_ptr) */
+       BPF_FUNC_fetch_u16,       /* u16 bpf_fetch_u16(void *unsafe_ptr) */
+       BPF_FUNC_fetch_u8,        /* u8 bpf_fetch_u8(void *unsafe_ptr) */
+       BPF_FUNC_probe_memcmp,    /* int bpf_probe_memcmp(unsafe_ptr, safe_ptr, 
size) */
        __BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9b79abbd1ab8..d7ba67234761 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -360,6 +360,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_OUTPUT      _IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER      _IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID              _IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF         _IOW('$', 8, __u32)
 
 enum perf_event_ioc_flags {
        PERF_IOC_FLAG_GROUP             = 1U << 0,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 882f835a0d85..674a8ca17190 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -42,6 +42,8 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
 
 #include "internal.h"
 
@@ -3283,6 +3285,7 @@ errout:
 }
 
 static void perf_event_free_filter(struct perf_event *event);
+static void perf_event_free_bpf_prog(struct perf_event *event);
 
 static void free_event_rcu(struct rcu_head *head)
 {
@@ -3292,6 +3295,7 @@ static void free_event_rcu(struct rcu_head *head)
        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
+       perf_event_free_bpf_prog(event);
        kfree(event);
 }
 
@@ -3795,6 +3799,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -3849,6 +3854,9 @@ static long perf_ioctl(struct file *file, unsigned int 
cmd, unsigned long arg)
        case PERF_EVENT_IOC_SET_FILTER:
                return perf_event_set_filter(event, (void __user *)arg);
 
+       case PERF_EVENT_IOC_SET_BPF:
+               return perf_event_set_bpf_prog(event, arg);
+
        default:
                return -ENOTTY;
        }
@@ -6266,6 +6274,45 @@ static void perf_event_free_filter(struct perf_event 
*event)
        ftrace_profile_free_filter(event);
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+       struct bpf_prog *prog;
+
+       if (event->attr.type != PERF_TYPE_TRACEPOINT)
+               return -EINVAL;
+
+       if (event->tp_event->prog)
+               return -EEXIST;
+
+       prog = bpf_prog_get(prog_fd);
+       if (IS_ERR(prog))
+               return PTR_ERR(prog);
+
+       if (prog->aux->prog_type != BPF_PROG_TYPE_TRACEPOINT) {
+               /* valid fd, but invalid bpf program type */
+               bpf_prog_put(prog);
+               return -EINVAL;
+       }
+
+       event->tp_event->prog = prog;
+
+       return 0;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+       struct bpf_prog *prog;
+
+       if (!event->tp_event)
+               return;
+
+       prog = event->tp_event->prog;
+       if (prog) {
+               event->tp_event->prog = NULL;
+               bpf_prog_put(prog);
+       }
+}
+
 #else
 
 static inline void perf_tp_register(void)
@@ -6281,6 +6328,14 @@ static void perf_event_free_filter(struct perf_event 
*event)
 {
 }
 
+static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
+{
+       return -ENOENT;
+}
+
+static void perf_event_free_bpf_prog(struct perf_event *event)
+{
+}
 #endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..54ae225e5fc6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..ec065e0a364e
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,145 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include <trace/bpf_trace.h>
+#include "trace.h"
+
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+       unsigned int ret;
+
+       if (in_nmi()) /* not supported yet */
+               return 1;
+
+       rcu_read_lock();
+       ret = BPF_PROG_RUN(prog, ctx);
+       rcu_read_unlock();
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(trace_call_bpf);
+
+static u64 bpf_fetch_ptr(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+       void *unsafe_ptr = (void *) (long) r1;
+       void *ptr = NULL;
+
+       probe_kernel_read(&ptr, unsafe_ptr, sizeof(ptr));
+       return (u64) (unsigned long) ptr;
+}
+
+#define FETCH(SIZE) \
+static u64 bpf_fetch_##SIZE(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)    \
+{                                                                      \
+       void *unsafe_ptr = (void *) (long) r1;                          \
+       SIZE val = 0;                                                   \
+                                                                       \
+       probe_kernel_read(&val, unsafe_ptr, sizeof(val));               \
+       return (u64) (SIZE) val;                                        \
+}
+FETCH(u64)
+FETCH(u32)
+FETCH(u16)
+FETCH(u8)
+#undef FETCH
+
+static u64 bpf_probe_memcmp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+       void *unsafe_ptr = (void *) (long) r1;
+       void *safe_ptr = (void *) (long) r2;
+       u32 size = (u32) r3;
+       char buf[64];
+       int err;
+
+       if (size < 64) {
+               err = probe_kernel_read(buf, unsafe_ptr, size);
+               if (err)
+                       return err;
+               return memcmp(buf, safe_ptr, size);
+       }
+       return -1;
+}
+
+static struct bpf_func_proto tp_prog_funcs[] = {
+#define FETCH(SIZE)                            \
+       [BPF_FUNC_fetch_##SIZE] = {             \
+               .func = bpf_fetch_##SIZE,       \
+               .gpl_only = true,               \
+               .ret_type = RET_INTEGER,        \
+       },
+       FETCH(ptr)
+       FETCH(u64)
+       FETCH(u32)
+       FETCH(u16)
+       FETCH(u8)
+#undef FETCH
+       [BPF_FUNC_probe_memcmp] = {
+               .func = bpf_probe_memcmp,
+               .gpl_only = false,
+               .ret_type = RET_INTEGER,
+               .arg1_type = ARG_ANYTHING,
+               .arg2_type = ARG_PTR_TO_STACK,
+               .arg3_type = ARG_CONST_STACK_SIZE,
+       },
+};
+
+static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id 
func_id)
+{
+       switch (func_id) {
+       case BPF_FUNC_map_lookup_elem:
+               return &bpf_map_lookup_elem_proto;
+       case BPF_FUNC_map_update_elem:
+               return &bpf_map_update_elem_proto;
+       case BPF_FUNC_map_delete_elem:
+               return &bpf_map_delete_elem_proto;
+       default:
+               if (func_id < 0 || func_id >= ARRAY_SIZE(tp_prog_funcs))
+                       return NULL;
+               return &tp_prog_funcs[func_id];
+       }
+}
+
+/* check access to argN fields of 'struct bpf_context' from program */
+static bool tp_prog_is_valid_access(int off, int size,
+                                   enum bpf_access_type type)
+{
+       /* check bounds */
+       if (off < 0 || off >= sizeof(struct bpf_context))
+               return false;
+
+       /* only read is allowed */
+       if (type != BPF_READ)
+               return false;
+
+       /* disallow misaligned access */
+       if (off % size != 0)
+               return false;
+
+       return true;
+}
+
+static struct bpf_verifier_ops tp_prog_ops = {
+       .get_func_proto = tp_prog_func_proto,
+       .is_valid_access = tp_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl = {
+       .ops = &tp_prog_ops,
+       .type = BPF_PROG_TYPE_TRACEPOINT,
+};
+
+static int __init register_tp_prog_ops(void)
+{
+       bpf_register_prog_type(&tl);
+       return 0;
+}
+late_initcall(register_tp_prog_ops);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..3487c41f4c0e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,6 +7,7 @@
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
 #include <asm/syscall.h>
+#include <trace/bpf_trace.h>
 
 #include "trace_output.h"
 #include "trace.h"
@@ -545,11 +546,26 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, 
NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
+static void populate_bpf_ctx(struct bpf_context *ctx, struct pt_regs *regs)
+{
+       struct task_struct *task = current;
+       unsigned long args[6];
+
+       syscall_get_arguments(task, regs, 0, 6, args);
+       ctx->arg1 = args[0];
+       ctx->arg2 = args[1];
+       ctx->arg3 = args[2];
+       ctx->arg4 = args[3];
+       ctx->arg5 = args[4];
+       ctx->arg6 = args[5];
+}
+
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
        struct syscall_metadata *sys_data;
        struct syscall_trace_enter *rec;
        struct hlist_head *head;
+       struct bpf_prog *prog;
        int syscall_nr;
        int rctx;
        int size;
@@ -564,6 +580,15 @@ static void perf_syscall_enter(void *ignore, struct 
pt_regs *regs, long id)
        if (!sys_data)
                return;
 
+       prog = sys_data->enter_event->prog;
+       if (prog) {
+               struct bpf_context ctx;
+
+               populate_bpf_ctx(&ctx, regs);
+               if (!trace_call_bpf(prog, &ctx))
+                       return;
+       }
+
        head = this_cpu_ptr(sys_data->enter_event->perf_events);
        if (hlist_empty(head))
                return;
@@ -624,6 +649,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs 
*regs, long ret)
        struct syscall_metadata *sys_data;
        struct syscall_trace_exit *rec;
        struct hlist_head *head;
+       struct bpf_prog *prog;
        int syscall_nr;
        int rctx;
        int size;
@@ -638,6 +664,15 @@ static void perf_syscall_exit(void *ignore, struct pt_regs 
*regs, long ret)
        if (!sys_data)
                return;
 
+       prog = sys_data->exit_event->prog;
+       if (prog) {
+               struct bpf_context ctx = {};
+
+               ctx.arg1 = syscall_get_return_value(current, regs);
+               if (!trace_call_bpf(prog, &ctx))
+                       return;
+       }
+
        head = this_cpu_ptr(sys_data->exit_event->perf_events);
        if (hlist_empty(head))
                return;
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v3 linux-trace 1/8] tracing: attach eBPF programs to tracepoints and syscalls

Reply via email to