Re: [PATCH bpf-next v5 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address

2018-03-14 Thread Song Liu


> On Mar 14, 2018, at 9:07 AM, Daniel Borkmann  wrote:
> 
> Just a minor question below, the rest seems fine to me as far as I
> can tell.
> 
> On 03/13/2018 10:47 PM, Song Liu wrote:
> [...]
>> +enum bpf_stack_build_id_status {
>> +/* user space need an empty entry to identify end of a trace */
>> +BPF_STACK_BUILD_ID_EMPTY = 0,
>> +/* with valid build_id and offset */
>> +BPF_STACK_BUILD_ID_VALID = 1,
>> +/* couldn't get build_id, fallback to ip */
>> +BPF_STACK_BUILD_ID_IP = 2,
>> +};
>> +
>> +#define BPF_BUILD_ID_SIZE 20
>> +struct bpf_stack_build_id {
>> +__s32   status;
>> +unsigned char   build_id[BPF_BUILD_ID_SIZE];
>> +union {
>> +__u64   offset;
>> +__u64   ip;
>> +};
>> +};
> [...]>  BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, 
> map,
>> u64, flags)
>> {
>>  struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, 
>> map);
>>  struct perf_callchain_entry *trace;
>>  struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
>> -u32 max_depth = map->value_size / 8;
>> +u32 max_depth = map->value_size / stack_map_data_size(map);
>>  /* stack_map_alloc() checks that max_depth <= 
>> sysctl_perf_event_max_stack */
>>  u32 init_nr = sysctl_perf_event_max_stack - max_depth;
>>  u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
>> @@ -128,11 +318,16 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, 
>> struct bpf_map *, map,
>>  bool user = flags & BPF_F_USER_STACK;
>>  bool kernel = !user;
>>  u64 *ips;
>> +bool hash_matches;
>> 
>>  if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
>> BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
>>  return -EINVAL;
>> 
>> +/* build_id+offset stack map only supports user stack */
>> +if (stack_map_use_build_id(map) && !user)
>> +return -EINVAL;
> 
> Instead of bailing out here, wouldn't it make sense to just reuse the
> BPF_STACK_BUILD_ID_IP status and use this 'fallback' for kernel similar
> to what we do anyway in stack_map_get_build_id_offset() when we cannot
> get the build id so that map can be used for both cases?

This a great idea! Let me implement it. 

Thanks,
Song

Re: [PATCH bpf-next v5 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address

2018-03-14 Thread Daniel Borkmann
Just a minor question below, the rest seems fine to me as far as I
can tell.

On 03/13/2018 10:47 PM, Song Liu wrote:
[...]
> +enum bpf_stack_build_id_status {
> + /* user space need an empty entry to identify end of a trace */
> + BPF_STACK_BUILD_ID_EMPTY = 0,
> + /* with valid build_id and offset */
> + BPF_STACK_BUILD_ID_VALID = 1,
> + /* couldn't get build_id, fallback to ip */
> + BPF_STACK_BUILD_ID_IP = 2,
> +};
> +
> +#define BPF_BUILD_ID_SIZE 20
> +struct bpf_stack_build_id {
> + __s32   status;
> + unsigned char   build_id[BPF_BUILD_ID_SIZE];
> + union {
> + __u64   offset;
> + __u64   ip;
> + };
> +};
[...]>  BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, 
map,
>  u64, flags)
>  {
>   struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, 
> map);
>   struct perf_callchain_entry *trace;
>   struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
> - u32 max_depth = map->value_size / 8;
> + u32 max_depth = map->value_size / stack_map_data_size(map);
>   /* stack_map_alloc() checks that max_depth <= 
> sysctl_perf_event_max_stack */
>   u32 init_nr = sysctl_perf_event_max_stack - max_depth;
>   u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
> @@ -128,11 +318,16 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, 
> struct bpf_map *, map,
>   bool user = flags & BPF_F_USER_STACK;
>   bool kernel = !user;
>   u64 *ips;
> + bool hash_matches;
>  
>   if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
>  BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
>   return -EINVAL;
>  
> + /* build_id+offset stack map only supports user stack */
> + if (stack_map_use_build_id(map) && !user)
> + return -EINVAL;

Instead of bailing out here, wouldn't it make sense to just reuse the
BPF_STACK_BUILD_ID_IP status and use this 'fallback' for kernel similar
to what we do anyway in stack_map_get_build_id_offset() when we cannot
get the build id so that map can be used for both cases?

>   trace = get_perf_callchain(regs, init_nr, kernel, user,
>  sysctl_perf_event_max_stack, false, false);
>  
> @@ -156,24 +351,42 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, 
> struct bpf_map *, map,
>   id = hash & (smap->n_buckets - 1);
>   bucket = READ_ONCE(smap->buckets[id]);
>  
> - if (bucket && bucket->hash == hash) {
> - if (flags & BPF_F_FAST_STACK_CMP)
> + hash_matches = bucket && bucket->hash == hash;
> + /* fast cmp */
> + if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
> + return id;
> +
> + if (stack_map_use_build_id(map)) {
> + /* for build_id+offset, pop a bucket before slow cmp */
> + new_bucket = (struct stack_map_bucket *)
> + pcpu_freelist_pop(>freelist);
> + if (unlikely(!new_bucket))
> + return -ENOMEM;
> + stack_map_get_build_id_offset(map, new_bucket, ips, trace_nr);
> + trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
> + if (hash_matches && bucket->nr == trace_nr &&
> + memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
> + pcpu_freelist_push(>freelist, _bucket->fnode);
>   return id;
> - if (bucket->nr == trace_nr &&
> - memcmp(bucket->ip, ips, trace_len) == 0)
> + }
> + if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
> + pcpu_freelist_push(>freelist, _bucket->fnode);
> + return -EEXIST;
> + }
> + } else {
> + if (hash_matches && bucket->nr == trace_nr &&
> + memcmp(bucket->data, ips, trace_len) == 0)
>   return id;
> + if (bucket && !(flags & BPF_F_REUSE_STACKID))
> + return -EEXIST;
> +
> + new_bucket = (struct stack_map_bucket *)
> + pcpu_freelist_pop(>freelist);
> + if (unlikely(!new_bucket))
> + return -ENOMEM;
> + memcpy(new_bucket->data, ips, trace_len);
>   }
>  
> - /* this call stack is not in the map, try to add it */
> - if (bucket && !(flags & BPF_F_REUSE_STACKID))
> - return -EEXIST;
> -
> - new_bucket = (struct stack_map_bucket *)
> - pcpu_freelist_pop(>freelist);
> - if (unlikely(!new_bucket))
> - return -ENOMEM;
> -
> - memcpy(new_bucket->ip, ips, trace_len);
>   new_bucket->hash = hash;
>   new_bucket->nr = trace_nr;
>  
> @@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, 
> void *value)
>   if (!bucket)
>   return -ENOENT;
>  
> - trace_len = bucket->nr * sizeof(u64);
> - memcpy(value, bucket->ip, 

[PATCH bpf-next v5 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address

2018-03-13 Thread Song Liu
Currently, bpf stackmap store address for each entry in the call trace.
To map these addresses to user space files, it is necessary to maintain
the mapping from these virtual address to symbols in the binary. Usually,
the user space profiler (such as perf) has to scan /proc/pid/maps at the
beginning of profiling, and monitor mmap2() calls afterwards. Given the
cost of maintaining the address map, this solution is not practical for
system wide profiling that is always on.

This patch tries to solve this problem with a variation of stackmap. This
variation is enabled by flag BPF_F_STACK_BUILD_ID, and only works for
user stack. Instead of storing addresses, the variation stores ELF file
build_id + offset.

Build ID is a 20-byte unique identifier for ELF files. The following
command shows the Build ID of /bin/bash:

  [user@]$ readelf -n /bin/bash
  ...
Build ID: XX
  ...

With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse Build ID
for each entry in the call trace, and translate it into the following
struct:

  struct bpf_stack_build_id_offset {
  __s32   status;
  unsigned char   build_id[BPF_BUILD_ID_SIZE];
  union {
  __u64   offset;
  __u64   ip;
  };
  };

The search of build_id is limited to the first page of the file, and this
page should be in page cache. Otherwise, we fallback to store ip for this
entry (ip field in struct bpf_stack_build_id_offset). This requires the
build_id to be stored in the first page. A quick survey of binary and
dynamic library files in a few different systems shows that almost all
binary and dynamic library files have build_id in the first page.

User space can access struct bpf_stack_build_id_offset with bpf
syscall BPF_MAP_LOOKUP_ELEM. It is necessary for user space to
maintain mapping from build id to binary files. This mostly static
mapping is much easier to maintain than per process address maps.

Note: Stackmap with build_id only works in non-nmi context at this time.
This is because we need to take mm->mmap_sem for find_vma(). If this
changes, we would like to allow build_id lookup in nmi context.

Signed-off-by: Song Liu 
---
 include/uapi/linux/bpf.h |  22 
 kernel/bpf/stackmap.c| 257 +++
 2 files changed, 257 insertions(+), 22 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2a66769..1e15d17 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -231,6 +231,28 @@ enum bpf_attach_type {
 #define BPF_F_RDONLY   (1U << 3)
 #define BPF_F_WRONLY   (1U << 4)
 
+/* Flag for stack_map, store build_id+offset instead of pointer */
+#define BPF_F_STACK_BUILD_ID   (1U << 5)
+
+enum bpf_stack_build_id_status {
+   /* user space need an empty entry to identify end of a trace */
+   BPF_STACK_BUILD_ID_EMPTY = 0,
+   /* with valid build_id and offset */
+   BPF_STACK_BUILD_ID_VALID = 1,
+   /* couldn't get build_id, fallback to ip */
+   BPF_STACK_BUILD_ID_IP = 2,
+};
+
+#define BPF_BUILD_ID_SIZE 20
+struct bpf_stack_build_id {
+   __s32   status;
+   unsigned char   build_id[BPF_BUILD_ID_SIZE];
+   union {
+   __u64   offset;
+   __u64   ip;
+   };
+};
+
 union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32   map_type;   /* one of enum bpf_map_type */
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b0ecf43..7b134e9 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,16 +9,19 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "percpu_freelist.h"
 
-#define STACK_CREATE_FLAG_MASK \
-   (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define STACK_CREATE_FLAG_MASK \
+   (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |\
+BPF_F_STACK_BUILD_ID)
 
 struct stack_map_bucket {
struct pcpu_freelist_node fnode;
u32 hash;
u32 nr;
-   u64 ip[];
+   u64 data[];
 };
 
 struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
 };
 
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+   return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+   return stack_map_use_build_id(map) ?
+   sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 {
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
-   value_size < 8 || value_size % 8 ||
-   value_size / 8 >