On 09/02, Oleg Nesterov wrote:
>
> And... I think that BPF has even more problems with filtering. Not sure,
> I'll try to write another test-case tomorrow.

See below. This test-case needs a one-liner patch at the end, but this is only
because I have no idea how to add BPF_EMIT_CALL(BPF_FUNC_trace_printk) into
"struct bpf_insn insns[]" correctly. Is there a simple-to-use user-space tool
which can translate 'bpf_trace_printk("Hello world\n", 13)' into bpf_insn[] ???

So. The CONFIG_BPF_EVENTS code in __uprobe_perf_func() assumes that it "owns"
tu->consumer and uprobe_perf_filter(), but this is not true in general.

test.c:
        #include <unistd.h>

        int func(int i)
        {
                return i;
        }

        int main(void)
        {
                int i;
                for (i = 0;; ++i) {
                        sleep(1);
                        func(i);
                }
                return 0;
        }

run_prog.c
        // cc -I./tools/include -I./tools/include/uapi -Wall
        #include "./include/generated/uapi/linux/version.h"
        #include <linux/perf_event.h>
        #include <linux/filter.h>
        #define _GNU_SOURCE
        #include <sys/syscall.h>
        #include <sys/ioctl.h>
        #include <assert.h>
        #include <unistd.h>
        #include <stdlib.h>

        int prog_load(void)
        {
                struct bpf_insn insns[] = {
                        BPF_MOV64_IMM(BPF_REG_0, 0),
                        BPF_EXIT_INSN(),
                };

                union bpf_attr attr = {
                        .prog_type      = BPF_PROG_TYPE_KPROBE,
                        .insns          = (unsigned long)insns,
                        .insn_cnt       = sizeof(insns) / sizeof(insns[0]),
                        .license        = (unsigned long)"GPL",
                        .kern_version   = LINUX_VERSION_CODE, // unneeded
                };

                return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
        }

        void run_probe(int eid, int pid)
        {
                struct perf_event_attr attr = {
                        .type   = PERF_TYPE_TRACEPOINT,
                        .config = eid,
                        .size   = sizeof(struct perf_event_attr),
                };

                int fd = syscall(__NR_perf_event_open, &attr, pid, 0, -1, 0);
                assert(fd >= 0);

                int pfd = prog_load();
                assert(pfd >= 0);

                assert(ioctl(fd, PERF_EVENT_IOC_SET_BPF, pfd) == 0);
                assert(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == 0);

                for (;;)
                        pause();
        }

        int main(int argc, const char *argv[])
        {
                int eid = atoi(argv[1]);
                int pid = atoi(argv[2]);
                run_probe(eid, pid);
                return 0;
        }

Now,

        $ ./test &
        $ PID1=$!
        $ ./test &
        $ PID2=$!
        $ perf probe -x ./test -a func
        $ ./run_prog `cat /sys/kernel/debug/tracing/events/probe_test/func/id` 
$PID1 &

dmesg -c:
        trace_uprobe: BPF_FUNC: pid=50 ret=0
        trace_uprobe: BPF_FUNC: pid=50 ret=0
        trace_uprobe: BPF_FUNC: pid=50 ret=0
        trace_uprobe: BPF_FUNC: pid=50 ret=0
        ...

So far so good. Now,

        $ perf record -e probe_test:func -p $PID2 -- sleep 10 &

This creates another PERF_TYPE_TRACEPOINT perf_event which shares
trace_uprobe/consumer/filter with the perf_event created by run_prog.

dmesg -c:
        trace_uprobe: BPF_FUNC: pid=51 ret=0
        trace_uprobe: BPF_FUNC: pid=50 ret=0
        trace_uprobe: BPF_FUNC: pid=51 ret=0
        trace_uprobe: BPF_FUNC: pid=50 ret=0
        ...

until perf-record exits. and after that

        $ perf script

reports nothing.

So, in this case:

        - run_prog's bpf program is called when current->pid == $PID2, this 
patch
          (or any other change in trace_uprobe.c) can't help.

        - run_prog's bpf program "steals" __uprobe_perf_func() from 
/usr/bin/perf

and to me this is yet another indication that we need some changes in the
bpf_prog_run_array_uprobe() paths, or even in the user-space bpftrace/whatever's
code.

And. Why the "if (bpf_prog_array_valid(call))" block in __uprobe_perf_func() 
returns?
Why this depends on "ret == 0" ??? I fail to understand this logic.

Oleg.

--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1381,6 +1381,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
                u32 ret;
 
                ret = bpf_prog_run_array_uprobe(call->prog_array, regs, 
bpf_prog_run);
+               pr_crit("BPF_FUNC: pid=%d ret=%d\n", current->pid, ret);
                if (!ret)
                        return;
        }


Reply via email to