Re: [PATCH bpf-next v5 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address

2018-03-14 Thread Song Liu


> On Mar 14, 2018, at 9:07 AM, Daniel Borkmann wrote:
> 
> Just a minor question below, the rest seems fine to me as far as I
> can tell.
> 
> On 03/13/2018 10:47 PM, Song Liu wrote:
> [...]
>> +enum bpf_stack_build_id_status {
>> +/* user space need an empty entry to identify end of a trace */
>> +BPF_STACK_BUILD_ID_EMPTY = 0,
>> +/* with valid build_id and offset */
>> +BPF_STACK_BUILD_ID_VALID = 1,
>> +/* couldn't get build_id, fallback to ip */
>> +BPF_STACK_BUILD_ID_IP = 2,
>> +};
>> +
>> +#define BPF_BUILD_ID_SIZE 20
>> +struct bpf_stack_build_id {
>> +__s32   status;
>> +unsigned char   build_id[BPF_BUILD_ID_SIZE];
>> +union {
>> +__u64   offset;
>> +__u64   ip;
>> +};
>> +};
> [...]
>>  BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
>> u64, flags)
>> {
>>  struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, 
>> map);
>>  struct perf_callchain_entry *trace;
>>  struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
>> -u32 max_depth = map->value_size / 8;
>> +u32 max_depth = map->value_size / stack_map_data_size(map);
>>  /* stack_map_alloc() checks that max_depth <= 
>> sysctl_perf_event_max_stack */
>>  u32 init_nr = sysctl_perf_event_max_stack - max_depth;
>>  u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
>> @@ -128,11 +318,16 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, 
>> struct bpf_map *, map,
>>  bool user = flags & BPF_F_USER_STACK;
>>  bool kernel = !user;
>>  u64 *ips;
>> +bool hash_matches;
>> 
>>  if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
>> BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
>>  return -EINVAL;
>> 
>> +/* build_id+offset stack map only supports user stack */
>> +if (stack_map_use_build_id(map) && !user)
>> +return -EINVAL;
> 
> Instead of bailing out here, wouldn't it make sense to just reuse the
> BPF_STACK_BUILD_ID_IP status and use this 'fallback' for kernel similar
> to what we do anyway in stack_map_get_build_id_offset() when we cannot
> get the build id so that map can be used for both cases?

This is a great idea! Let me implement it.

Thanks,
Song

Re: [PATCH bpf-next v5 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address

2018-03-14 Thread Daniel Borkmann
Just a minor question below, the rest seems fine to me as far as I
can tell.

On 03/13/2018 10:47 PM, Song Liu wrote:
[...]
> +enum bpf_stack_build_id_status {
> + /* user space need an empty entry to identify end of a trace */
> + BPF_STACK_BUILD_ID_EMPTY = 0,
> + /* with valid build_id and offset */
> + BPF_STACK_BUILD_ID_VALID = 1,
> + /* couldn't get build_id, fallback to ip */
> + BPF_STACK_BUILD_ID_IP = 2,
> +};
> +
> +#define BPF_BUILD_ID_SIZE 20
> +struct bpf_stack_build_id {
> + __s32   status;
> + unsigned char   build_id[BPF_BUILD_ID_SIZE];
> + union {
> + __u64   offset;
> + __u64   ip;
> + };
> +};
[...]
>  BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
>  u64, flags)
>  {
>   struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, 
> map);
>   struct perf_callchain_entry *trace;
>   struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
> - u32 max_depth = map->value_size / 8;
> + u32 max_depth = map->value_size / stack_map_data_size(map);
>   /* stack_map_alloc() checks that max_depth <= 
> sysctl_perf_event_max_stack */
>   u32 init_nr = sysctl_perf_event_max_stack - max_depth;
>   u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
> @@ -128,11 +318,16 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, 
> struct bpf_map *, map,
>   bool user = flags & BPF_F_USER_STACK;
>   bool kernel = !user;
>   u64 *ips;
> + bool hash_matches;
>  
>   if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
>  BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
>   return -EINVAL;
>  
> + /* build_id+offset stack map only supports user stack */
> + if (stack_map_use_build_id(map) && !user)
> + return -EINVAL;

Instead of bailing out here, wouldn't it make sense to just reuse the
BPF_STACK_BUILD_ID_IP status and use this 'fallback' for kernel similar
to what we do anyway in stack_map_get_build_id_offset() when we cannot
get the build id so that map can be used for both cases?

>   trace = get_perf_callchain(regs, init_nr, kernel, user,
>  sysctl_perf_event_max_stack, false, false);
>  
> @@ -156,24 +351,42 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, 
> struct bpf_map *, map,
>   id = hash & (smap->n_buckets - 1);
>   bucket = READ_ONCE(smap->buckets[id]);
>  
> - if (bucket && bucket->hash == hash) {
> - if (flags & BPF_F_FAST_STACK_CMP)
> + hash_matches = bucket && bucket->hash == hash;
> + /* fast cmp */
> + if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
> + return id;
> +
> + if (stack_map_use_build_id(map)) {
> + /* for build_id+offset, pop a bucket before slow cmp */
> + new_bucket = (struct stack_map_bucket *)
> + pcpu_freelist_pop(&smap->freelist);
> + if (unlikely(!new_bucket))
> + return -ENOMEM;
> + stack_map_get_build_id_offset(map, new_bucket, ips, trace_nr);
> + trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
> + if (hash_matches && bucket->nr == trace_nr &&
> + memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
> + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
>   return id;
> - if (bucket->nr == trace_nr &&
> - memcmp(bucket->ip, ips, trace_len) == 0)
> + }
> + if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
> + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
> + return -EEXIST;
> + }
> + } else {
> + if (hash_matches && bucket->nr == trace_nr &&
> + memcmp(bucket->data, ips, trace_len) == 0)
>   return id;
> + if (bucket && !(flags & BPF_F_REUSE_STACKID))
> + return -EEXIST;
> +
> + new_bucket = (struct stack_map_bucket *)
> + pcpu_freelist_pop(&smap->freelist);
> + if (unlikely(!new_bucket))
> + return -ENOMEM;
> + memcpy(new_bucket->data, ips, trace_len);
>   }
>  
> - /* this call stack is not in the map, try to add it */
> - if (bucket && !(flags & BPF_F_REUSE_STACKID))
> - return -EEXIST;
> -
> - new_bucket = (struct stack_map_bucket *)
> - pcpu_freelist_pop(&smap->freelist);
> - if (unlikely(!new_bucket))
> - return -ENOMEM;
> -
> - memcpy(new_bucket->ip, ips, trace_len);
>   new_bucket->hash = hash;
>   new_bucket->nr = trace_nr;
>  
> @@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, 
> void *value)
>   if (!bucket)
>   return -ENOENT;
>  
> - trace_len = bucket->nr * sizeof(u64);
> - memcpy(value, bucket->ip,