This patch adds a sample program, called trace_map_events,
that shows how to capture map events and, optionally, show only
the events that match a given set of map IDs.

The program accepts a list of map IDs, via the -i command line
option, and filters all the map events related to those IDs (i.e.,
map_create/update/lookup/next_key).
If no IDs are specified, all map events are listed and no filtering
is performed.

Sample usage:

 # trace_map_events -i <map_id1> -i <map_id2> -i <map_id3> ...

Signed-off-by: Sebastiano Miano <sebastiano.mi...@polito.it>
---
 samples/bpf/Makefile                |    4 
 samples/bpf/trace_map_events_kern.c |  225 +++++++++++++++++++++++++
 samples/bpf/trace_map_events_user.c |  314 +++++++++++++++++++++++++++++++++++
 3 files changed, 543 insertions(+)
 create mode 100644 samples/bpf/trace_map_events_kern.c
 create mode 100644 samples/bpf/trace_map_events_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4d6a6ed..a7d52b6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -15,6 +15,7 @@ hostprogs-y += tracex6
 hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
+hostprogs-y += trace_map_events
 hostprogs-y += lathist
 hostprogs-y += offwaketime
 hostprogs-y += spintest
@@ -65,6 +66,7 @@ tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
+trace_map_events-objs := bpf_load.o $(LIBBPF) trace_map_events_user.o
 lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
 offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o
 spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o
@@ -111,6 +113,7 @@ always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
+always += trace_map_events_kern.o
 always += tcbpf1_kern.o
 always += tcbpf2_kern.o
 always += tc_l2_redirect_kern.o
@@ -171,6 +174,7 @@ HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
 HOSTLOADLIBES_trace_output += -lelf -lrt
+HOSTLOADLIBES_trace_map_events += -lelf -lrt
 HOSTLOADLIBES_lathist += -lelf
 HOSTLOADLIBES_offwaketime += -lelf
 HOSTLOADLIBES_spintest += -lelf
diff --git a/samples/bpf/trace_map_events_kern.c b/samples/bpf/trace_map_events_kern.c
new file mode 100644
index 0000000..f887b5b
--- /dev/null
+++ b/samples/bpf/trace_map_events_kern.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2018 Politecnico di Torino, Italy
+ *
+ * Author: Sebastiano Miano <sebastiano.mi...@polito.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+enum map_event_type {
+       MAP_CREATE = 0,
+       MAP_UPDATE = 1,
+       MAP_LOOKUP = 2,
+       MAP_NEXT_KEY = 3
+};
+
+struct map_event_data {
+       u32 map_id;
+       enum map_event_type evnt_type;
+       u32 map_type;
+};
+
+struct bpf_map_def SEC("maps") map_event_trace = {
+       .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+       .key_size = sizeof(int),
+       .value_size = sizeof(u32),
+       .max_entries = 64,
+};
+
+struct bpf_map_def SEC("maps") filtered_ids = {
+       .type = BPF_MAP_TYPE_HASH,
+       .key_size = sizeof(u32),
+       .value_size = sizeof(u32),
+       .max_entries = 64,
+};
+
+struct bpf_map_def SEC("maps") filter_events = {
+       .type = BPF_MAP_TYPE_ARRAY,
+       .key_size = sizeof(u32),
+       .value_size = sizeof(bool),
+       .max_entries = 1,
+};
+
+/*
+ * Tracepoint format: /sys/kernel/debug/tracing/events/bpf/bpf_map_create/format
+ * Code in:                kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_create_ctx {
+       u64 pad;                // First 8 bytes are not accessible by bpf code
+       u32 type;               // offset:8;    size:4; signed:0;
+       u32 size_key;           // offset:12;   size:4; signed:0;
+       u32 size_value;         // offset:16;   size:4; signed:0;
+       u32 max_entries;        // offset:20;   size:4; signed:0;
+       u32 flags;              // offset:24;   size:4; signed:0;
+       int ufd;                // offset:28;   size:4; signed:1;
+       u32 id;                 // offset:32;   size:4; signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_create")
+int trace_bpf_map_create(struct bpf_map_create_ctx *ctx)
+{
+       struct map_event_data data;
+       int cpu = bpf_get_smp_processor_id();
+       bool *filter;
+       u32 key = 0, map_id = ctx->id;
+
+       filter = bpf_map_lookup_elem(&filter_events, &key);
+       if (!filter)
+               return 1;
+
+       if (!*filter)
+               goto send_event;
+
+       /*
+        * If the map_id is not in the list of filtered
+        * ids we immediately return
+        */
+       if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+               return 0;
+
+send_event:
+       data.map_id = map_id;
+       data.evnt_type = MAP_CREATE;
+       data.map_type = ctx->type;
+
+       bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+       return 0;
+}
+
+/*
+ * Tracepoint: /sys/kernel/debug/tracing/events/bpf/bpf_map_lookup_elem/format
+ * Tracepoint: /sys/kernel/debug/tracing/events/bpf/bpf_map_update_elem/format
+ * Code in:          kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_keyval_ctx {
+       u64 pad;                // First 8 bytes are not accessible by bpf code
+       u32 type;               // offset:8;    size:4; signed:0;
+       u32 key_len;            // offset:12;   size:4; signed:0;
+       u32 key;                // offset:16;   size:4; signed:0;
+       bool key_trunc;         // offset:20;   size:1; signed:0;
+       u32 val_len;            // offset:24;   size:4; signed:0;
+       u32 val;                // offset:28;   size:4; signed:0;
+       bool val_trunc;         // offset:32;   size:1; signed:0;
+       int ufd;                // offset:36;   size:4; signed:1;
+       u32 id;                 // offset:40;   size:4; signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_lookup_elem")
+int trace_bpf_map_lookup_elem(struct bpf_map_keyval_ctx *ctx)
+{
+       struct map_event_data data;
+       int cpu = bpf_get_smp_processor_id();
+       bool *filter;
+       u32 key = 0, map_id = ctx->id;
+
+       filter = bpf_map_lookup_elem(&filter_events, &key);
+       if (!filter)
+               return 1;
+
+       if (!*filter)
+               goto send_event;
+
+       /*
+        * If the map_id is not in the list of filtered
+        * ids we immediately return
+        */
+       if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+               return 0;
+
+send_event:
+       data.map_id = map_id;
+       data.evnt_type = MAP_LOOKUP;
+       data.map_type = ctx->type;
+
+       bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+       return 0;
+}
+
+SEC("tracepoint/bpf/bpf_map_update_elem")
+int trace_bpf_map_update_elem(struct bpf_map_keyval_ctx *ctx)
+{
+       struct map_event_data data;
+       int cpu = bpf_get_smp_processor_id();
+       bool *filter;
+       u32 key = 0, map_id = ctx->id;
+
+       filter = bpf_map_lookup_elem(&filter_events, &key);
+       if (!filter)
+               return 1;
+
+       if (!*filter)
+               goto send_event;
+
+       /*
+        * If the map_id is not in the list of filtered
+        * ids we immediately return
+        */
+       if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+               return 0;
+
+send_event:
+       data.map_id = map_id;
+       data.evnt_type = MAP_UPDATE;
+       data.map_type = ctx->type;
+
+       bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+       return 0;
+}
+
+/*
+ * Tracepoint: /sys/kernel/debug/tracing/events/bpf/bpf_map_next_key/format
+ * Code in:          kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_next_key_ctx {
+       u64 pad;                // First 8 bytes are not accessible by bpf code
+       u32 type;               // offset:8;    size:4; signed:0;
+       u32 key_len;            // offset:12;   size:4; signed:0;
+       u32 key;                // offset:16;   size:4; signed:0;
+       u32 nxt;                // offset:20;   size:4; signed:0;
+       bool key_trunc;         // offset:24;   size:1; signed:0;
+       bool key_null;          // offset:25;   size:1; signed:0;
+       int ufd;                // offset:28;   size:4; signed:1;
+       u32 id;                 // offset:32;   size:4; signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_next_key")
+int trace_bpf_map_next_key(struct bpf_map_next_key_ctx *ctx)
+{
+       struct map_event_data data;
+       int cpu = bpf_get_smp_processor_id();
+       bool *filter;
+       u32 key = 0, map_id = ctx->id;
+
+       filter = bpf_map_lookup_elem(&filter_events, &key);
+       if (!filter)
+               return 1;
+
+       if (!*filter)
+               goto send_event;
+
+       /*
+        * If the map_id is not in the list of filtered
+        * ids we immediately return
+        */
+       if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+               return 0;
+
+send_event:
+       data.map_id = map_id;
+       data.evnt_type = MAP_NEXT_KEY;
+       data.map_type = ctx->type;
+
+       bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/trace_map_events_user.c b/samples/bpf/trace_map_events_user.c
new file mode 100644
index 0000000..bc7447e
--- /dev/null
+++ b/samples/bpf/trace_map_events_user.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2018 Politecnico di Torino, Italy
+ *
+ * Author: Sebastiano Miano <sebastiano.mi...@polito.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+static const char *__desc__ =
+"Sample program to trace map related events\n"
+"The -i option allows to set the id(s) of the map you are interested in.\n"
+"If no ID is specified, all map events are listed.\n";
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/resource.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <signal.h>
+#include <getopt.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+
+#define MAX_FILTERED_IDS 64
+
+static int *perf_fd;
+
+int epoll_fd;
+int page_size;
+int page_cnt = 8;
+volatile struct perf_event_mmap_page **readers;
+
+typedef void (*event_cb)(void *data, int size);
+
+enum map_event_type {
+       MAP_CREATE = 0,
+       MAP_UPDATE = 1,
+       MAP_LOOKUP = 2,
+       MAP_NEXT_KEY = 3
+};
+
+static void usage(char *argv[])
+{
+       printf("\nDESCRIPTION:\n%s", __desc__);
+       printf("\n");
+       printf(" Usage: %s [-i map_id1] [-i map_id2] ...\n", argv[0]);
+       printf("\n");
+}
+
+static int perf_event_mmap(int fd, int cpu)
+{
+       void *base;
+       int mmap_size;
+
+       page_size = getpagesize();
+       mmap_size = page_size * (page_cnt + 1);
+
+       base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       if (base == MAP_FAILED) {
+               printf("mmap err\n");
+               return -1;
+       }
+
+       readers[cpu] = base;
+       return 0;
+}
+
+static void init_bpf_perf_event_on_cpu(int cpu)
+{
+       struct perf_event_attr attr = {
+               .sample_type = PERF_SAMPLE_RAW,
+               .type = PERF_TYPE_SOFTWARE,
+               .config = PERF_COUNT_SW_BPF_OUTPUT,
+               .sample_period = 1,
+               .wakeup_events = 1,
+       };
+       int key = cpu;
+
+       perf_fd[cpu] = sys_perf_event_open(&attr, -1, cpu, -1, 0);
+
+       assert(perf_fd[cpu] >= 0);
+       assert(perf_event_mmap(perf_fd[cpu], cpu) >= 0);
+       assert(ioctl(perf_fd[cpu], PERF_EVENT_IOC_ENABLE, 0) >= 0);
+       assert(bpf_map_update_elem(map_fd[0], &key, &perf_fd[cpu], 0) == 0);
+
+       struct epoll_event e = { .events = EPOLLIN, .data.u32 = cpu };
+
+       assert(epoll_ctl(epoll_fd, EPOLL_CTL_ADD, perf_fd[cpu], &e) == 0);
+}
+
+static int perf_event_poll(int fd, int num_cpus, struct epoll_event *events)
+{
+       return epoll_wait(fd, events, num_cpus, -1);
+}
+
+struct perf_event_sample {
+       struct perf_event_header header;
+       __u32 size;
+       char data[];
+};
+
+static void perf_event_read(event_cb fn, __u32 index)
+{
+       __u64 data_tail = readers[index]->data_tail;
+       __u64 data_head = readers[index]->data_head;
+       __u64 buffer_size = page_cnt * page_size;
+       void *base, *begin, *end;
+       char buf[256];
+
+       asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+       if (data_head == data_tail)
+               return;
+
+       base = ((char *)readers[index]) + page_size;
+
+       begin = base + data_tail % buffer_size;
+       end = base + data_head % buffer_size;
+
+       while (begin != end) {
+               struct perf_event_sample *e;
+
+               e = begin;
+               if (begin + e->header.size > base + buffer_size) {
+                       long len = base + buffer_size - begin;
+
+                       assert(len < e->header.size);
+                       memcpy(buf, begin, len);
+                       memcpy(buf + len, base, e->header.size - len);
+                       e = (void *) buf;
+                       begin = base + e->header.size - len;
+               } else if (begin + e->header.size == base + buffer_size) {
+                       begin = base;
+               } else {
+                       begin += e->header.size;
+               }
+
+               if (e->header.type == PERF_RECORD_SAMPLE) {
+                       fn(e->data, e->size);
+               } else if (e->header.type == PERF_RECORD_LOST) {
+                       struct {
+                               struct perf_event_header header;
+                               __u64 id;
+                               __u64 lost;
+                       } *lost = (void *) e;
+                       printf("lost %lld events\n", lost->lost);
+               } else {
+                       printf("unknown event type=%d size=%d\n",
+                              e->header.type, e->header.size);
+               }
+       }
+
+       __sync_synchronize(); /* smp_mb() */
+       readers[index]->data_tail = data_head;
+}
+
+static const char *get_event_type(enum map_event_type event)
+{
+       switch (event) {
+       case MAP_CREATE:
+               return "CREATE";
+       case MAP_LOOKUP:
+               return "LOOKUP";
+       case MAP_UPDATE:
+               return "UPDATE";
+       case MAP_NEXT_KEY:
+               return "NEXT_KEY";
+       }
+
+       return "UNKNOWN";
+}
+
+
+static void map_event_callback(void *data, int size)
+{
+       struct {
+               __u32 map_id;
+               enum map_event_type event_type;
+               __u32 map_type;
+       } *e = data;
+
+       printf("%s event for map id: %d and type: %d\n",
+              get_event_type(e->event_type), e->map_id, e->map_type);
+}
+
+static bool init_filtered_ids_map(int num_ids, int *filtered_ids)
+{
+       int i, key, value;
+       bool filtering = false;
+       /*
+        * I am going to put the IDs in the map. Only event related to those IDs
+        * will be shown. The key indicates the ID of the map while the value
+        * is not used and then is set to 0.
+        */
+       for (i = 0; i < num_ids; i++) {
+               key = filtered_ids[i];
+               value = 0;
+               if (bpf_map_update_elem(map_fd[1], &key, &value, 0) != 0) {
+                       fprintf(stderr,
+                       "ERR: bpf_map_update_elem failed key:0x%X\n", key);
+               return false;
+               }
+       }
+
+       if (num_ids > 0)
+               filtering = true;
+
+       key = 0;
+       assert(bpf_map_update_elem(map_fd[2], &key, &filtering, BPF_ANY) == 0);
+       return true;
+}
+
+static bool init_perf_buffer_data_structures(int nr_cpus)
+{
+       int i;
+
+       perf_fd = malloc(sizeof(int) * nr_cpus);
+       assert(perf_fd);
+       readers = malloc(sizeof(*readers) * nr_cpus);
+       assert(readers);
+
+       epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+
+       for (i = 0; i < nr_cpus; i++) {
+               printf("Init bpf_perf_event for cpu:%d\n", i);
+               init_bpf_perf_event_on_cpu(i);
+       }
+
+       return true;
+}
+
+int main(int argc, char **argv)
+{
+       struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+       int i, cnt, opt, ret = EXIT_SUCCESS;
+       char bpf_obj_file[256];
+       int num_ids = 0, nr_cpus = bpf_num_possible_cpus();
+       int filtered_ids[MAX_FILTERED_IDS];
+
+       snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]);
+
+       /* Parse commands line args */
+       while ((opt = getopt(argc, argv, "hi:")) != -1) {
+               switch (opt) {
+               case 'i':
+                       if (num_ids == MAX_FILTERED_IDS) {
+                               printf("Reached maximum number of IDs");
+                               return EXIT_FAILURE;
+                       }
+                       i = atoi(optarg);
+                       if (!i)
+                               printf("ERROR - Invalid id %s", optarg);
+                       else
+                               filtered_ids[num_ids++] = i;
+                       break;
+               case 'h':
+               default:
+                       usage(argv);
+                       return EXIT_FAILURE;
+               }
+       }
+
+       if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+               perror("setrlimit(RLIMIT_MEMLOCK)");
+               return EXIT_FAILURE;
+       }
+
+       if (load_bpf_file(bpf_obj_file)) {
+               printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
+               return EXIT_FAILURE;
+       }
+
+       if (!prog_fd[0]) {
+               printf("ERROR - load_bpf_file: %s\n", strerror(errno));
+               return EXIT_FAILURE;
+       }
+
+       init_filtered_ids_map(num_ids, filtered_ids);
+       init_perf_buffer_data_structures(nr_cpus);
+
+       struct epoll_event *events = calloc(nr_cpus, sizeof(*events));
+
+       while (true) {
+               printf("Waiting for map events...\n");
+               cnt = perf_event_poll(epoll_fd, nr_cpus, events);
+               for (i = 0; i < cnt; i++)
+                       perf_event_read(map_event_callback, events[i].data.u32);
+       }
+
+       free(perf_fd);
+       free(readers);
+       free(events);
+
+       return ret;
+}

Reply via email to