Re: [PATCH v3 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address
On 03/10/2018 12:34 AM, Song Liu wrote: [...] > Thanks Daniel! I will fix these. > > I also chatted with Teng about this, that we may miss some cases. I will > also include that fix in the next version. Okay, thanks Song!
Re: [PATCH v3 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address
> On Mar 9, 2018, at 3:25 PM, Daniel Borkmann wrote: > > Hi Song, > > On 03/06/2018 08:42 PM, Song Liu wrote: > [...]> +/* >> + * Parse build id from the note segment. This logic can be shared between >> + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are >> + * identical. >> + */ >> +static inline int stack_map_parse_build_id(void *vm_start, >> + unsigned char *build_id, >> + void *note_start, >> + Elf32_Word note_size) >> +{ >> +Elf32_Word note_offs = 0, new_offs; >> + >> +/* check for overflow */ >> +if (note_start < vm_start || note_start + note_size < note_start) >> +return -EINVAL; >> + >> +/* only supports note that fits in the first page */ >> +if (note_start + note_size > vm_start + PAGE_SIZE) >> +return -EINVAL; >> + >> +while (note_offs + sizeof(Elf32_Nhdr) < note_size) { >> +Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); >> + >> +if (nhdr->n_type == BPF_BUILD_ID && >> +nhdr->n_namesz == sizeof("GNU") && >> +nhdr->n_descsz == BPF_BUILD_ID_SIZE) { >> +memcpy(build_id, >> + note_start + note_offs + >> + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), >> + BPF_BUILD_ID_SIZE); >> +return 0; >> +} >> +new_offs = note_offs + sizeof(Elf32_Nhdr) + >> +ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); >> +if (new_offs <= note_offs) /* overflow */ >> +break; >> +note_offs = new_offs; >> +}; >> +return -EINVAL; >> +} >> + >> +/* Parse build ID from 32-bit ELF */ >> +static int stack_map_get_build_id_32(void *vm_start, >> + unsigned char *build_id) >> +{ >> +Elf32_Ehdr *ehdr = (Elf32_Ehdr *)vm_start; >> +Elf32_Phdr *phdr; >> +int i; >> + >> +/* only supports phdr that fits in one page */ >> +if (ehdr->e_phnum > >> +(PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) >> +return -EINVAL; >> + >> +phdr = (Elf32_Phdr *)(vm_start + sizeof(Elf32_Ehdr)); >> + >> +for (i = 0; i < ehdr->e_phnum; ++i) >> +if (phdr[i].p_type == PT_NOTE) >> +return stack_map_parse_build_id(vm_start, build_id, >> +vm_start + phdr[i].p_offset, >> +phdr[i].p_filesz); >> +return -EINVAL; >> +} >> + >> +/* Parse build ID from 64-bit ELF */ >> +static int stack_map_get_build_id_64(void *vm_start, >> + unsigned char *build_id) >> +{ >> +Elf64_Ehdr *ehdr = (Elf64_Ehdr *)vm_start; >> +Elf64_Phdr *phdr; >> +int i; >> + >> +/* only supports phdr that fits in one page */ >> +if (ehdr->e_phnum > >> +(PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) >> +return -EINVAL; >> + >> +phdr = (Elf64_Phdr *)(vm_start + sizeof(Elf64_Ehdr)); >> + >> +for (i = 0; i < ehdr->e_phnum; ++i) >> +if (phdr[i].p_type == PT_NOTE) >> +return stack_map_parse_build_id(vm_start, build_id, >> +vm_start + phdr[i].p_offset, >> +phdr[i].p_filesz); >> +return -EINVAL; >> +} >> + >> +/* Parse build ID of ELF file mapped to vma */ >> +static int stack_map_get_build_id(struct vm_area_struct *vma, >> + unsigned char *build_id) >> +{ >> +Elf32_Ehdr *ehdr; >> +struct page *page; >> +int ret; >> + >> +/* >> + * vm_start is user memory, so we need to be careful with it. >> + * We don't want too many copy_from_user to reduce overhead. >> + * Most ELF file is at least one page long, and the build_id >> + * is stored in the first page. Therefore, we limit the search of >> + * build_id to the first page only. This can be made safe with >> + * get_user_pages_fast(). If the file is smaller than PAGE_SIZE, >> + * or the build_id is not in the first page, the look up fails. >> + */ >> +if (vma->vm_end - vma->vm_start < PAGE_SIZE || >> +vma->vm_start & (PAGE_SIZE - 1))/* page aligned */ >> +return -EINVAL; >> + >> +if (get_user_pages_fast(vma->vm_start, 1, 0, &page) != 1) > > Shouldn't this throw a splat? Implementations of get_user_pages_fast() > call down_read() which has might_sleep() and we're under RCU read side > here. > >> +return -EFAULT; >> + >> +ret = -EINVAL; >> +ehdr = (Elf32_Ehdr *)page_address(page); >> + >> +/* compare magic x7f "ELF" */ >> +if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) >> +goto out; >> + >> +/* only support executable file and shared object file */ >> +if (ehdr->e_type
Re: [PATCH v3 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address
Hi Song, On 03/06/2018 08:42 PM, Song Liu wrote: [...]> +/* > + * Parse build id from the note segment. This logic can be shared between > + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are > + * identical. > + */ > +static inline int stack_map_parse_build_id(void *vm_start, > +unsigned char *build_id, > +void *note_start, > +Elf32_Word note_size) > +{ > + Elf32_Word note_offs = 0, new_offs; > + > + /* check for overflow */ > + if (note_start < vm_start || note_start + note_size < note_start) > + return -EINVAL; > + > + /* only supports note that fits in the first page */ > + if (note_start + note_size > vm_start + PAGE_SIZE) > + return -EINVAL; > + > + while (note_offs + sizeof(Elf32_Nhdr) < note_size) { > + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); > + > + if (nhdr->n_type == BPF_BUILD_ID && > + nhdr->n_namesz == sizeof("GNU") && > + nhdr->n_descsz == BPF_BUILD_ID_SIZE) { > + memcpy(build_id, > +note_start + note_offs + > +ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), > +BPF_BUILD_ID_SIZE); > + return 0; > + } > + new_offs = note_offs + sizeof(Elf32_Nhdr) + > + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); > + if (new_offs <= note_offs) /* overflow */ > + break; > + note_offs = new_offs; > + }; > + return -EINVAL; > +} > + > +/* Parse build ID from 32-bit ELF */ > +static int stack_map_get_build_id_32(void *vm_start, > + unsigned char *build_id) > +{ > + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)vm_start; > + Elf32_Phdr *phdr; > + int i; > + > + /* only supports phdr that fits in one page */ > + if (ehdr->e_phnum > > + (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) > + return -EINVAL; > + > + phdr = (Elf32_Phdr *)(vm_start + sizeof(Elf32_Ehdr)); > + > + for (i = 0; i < ehdr->e_phnum; ++i) > + if (phdr[i].p_type == PT_NOTE) > + return stack_map_parse_build_id(vm_start, build_id, > + vm_start + phdr[i].p_offset, > + phdr[i].p_filesz); > + return -EINVAL; > +} > + > +/* Parse build ID from 64-bit ELF */ > +static int stack_map_get_build_id_64(void *vm_start, > + unsigned char *build_id) > +{ > + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)vm_start; > + Elf64_Phdr *phdr; > + int i; > + > + /* only supports phdr that fits in one page */ > + if (ehdr->e_phnum > > + (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) > + return -EINVAL; > + > + phdr = (Elf64_Phdr *)(vm_start + sizeof(Elf64_Ehdr)); > + > + for (i = 0; i < ehdr->e_phnum; ++i) > + if (phdr[i].p_type == PT_NOTE) > + return stack_map_parse_build_id(vm_start, build_id, > + vm_start + phdr[i].p_offset, > + phdr[i].p_filesz); > + return -EINVAL; > +} > + > +/* Parse build ID of ELF file mapped to vma */ > +static int stack_map_get_build_id(struct vm_area_struct *vma, > + unsigned char *build_id) > +{ > + Elf32_Ehdr *ehdr; > + struct page *page; > + int ret; > + > + /* > + * vm_start is user memory, so we need to be careful with it. > + * We don't want too many copy_from_user to reduce overhead. > + * Most ELF file is at least one page long, and the build_id > + * is stored in the first page. Therefore, we limit the search of > + * build_id to the first page only. This can be made safe with > + * get_user_pages_fast(). If the file is smaller than PAGE_SIZE, > + * or the build_id is not in the first page, the look up fails. > + */ > + if (vma->vm_end - vma->vm_start < PAGE_SIZE || > + vma->vm_start & (PAGE_SIZE - 1))/* page aligned */ > + return -EINVAL; > + > + if (get_user_pages_fast(vma->vm_start, 1, 0, &page) != 1) Shouldn't this throw a splat? Implementations of get_user_pages_fast() call down_read() which has might_sleep() and we're under RCU read side here. > + return -EFAULT; > + > + ret = -EINVAL; > + ehdr = (Elf32_Ehdr *)page_address(page); > + > + /* compare magic x7f "ELF" */ > + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) > + goto out; > + > + /* only support executable file and shared object file */ > + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) > + goto out; > + > + if (ehdr->e_ident[EI_CLASS] == 1) Mi
[PATCH v3 1/2] bpf: extend stackmap to save binary_build_id+offset instead of address
Currently, bpf stackmap store address for each entry in the call trace. To map these addresses to user space files, it is necessary to maintain the mapping from these virtual address to symbols in the binary. Usually, the user space profiler (such as perf) has to scan /proc/pid/maps at the beginning of profiling, and monitor mmap2() calls afterwards. Given the cost of maintaining the address map, this solution is not practical for system wide profiling that is always on. This patch tries to solve this problem with a variation of stackmap. This variation is enabled by flag BPF_F_STACK_BUILD_ID, and only works for user stack. Instead of storing addresses, the variation stores ELF file build_id + offset. Build ID is a 20-byte unique identifier for ELF files. The following command shows the Build ID of /bin/bash: [user@]$ readelf -n /bin/bash ... Build ID: XX ... With BPF_F_STACK_BUILD_ID, bpf_get_stackid() tries to parse Build ID for each entry in the call trace, and translate it into the following struct: struct bpf_stack_build_id_offset { __s32 status; unsigned char build_id[BPF_BUILD_ID_SIZE]; __aligned_u64 offset; }; Since the binary file is mapped to a vma (vm_area_struct), parsing Build ID (after successful find_vma) requires access to user memory (vma->vm_start to vma->vma_end). To make the access to user memory safe and cheap, get_user_pages_fast() is used to pin one page from vma->vm_start. This requires the ELF file is at least one page long, and the build_id is stored in the first page. A quick survey of binary and dynamic library files in a few different systems shows that almost all binary and dynamic library files have build_id in the first page. User space can access struct bpf_stack_build_id_offset with bpf syscall BPF_MAP_LOOKUP_ELEM. It is necessary for user space to maintain mapping from build id to binary files. This mostly static mapping is much easier to maintain than per process address maps. Signed-off-by: Song Liu --- include/uapi/linux/bpf.h | 15 +++ kernel/bpf/stackmap.c| 266 +++ 2 files changed, 259 insertions(+), 22 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a66769..4a0c6bf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -231,6 +231,21 @@ enum bpf_attach_type { #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) +/* Flag for stack_map, store build_id+offset instead of pointer */ +#define BPF_F_STACK_BUILD_ID (1U << 5) + +enum bpf_stack_build_id_status { + BPF_STACK_BUILD_ID_EMPTY = 0, + BPF_STACK_BUILD_ID_VALID = 1, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + __u64 offset; +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b0ecf43..eeef74c 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,16 +9,18 @@ #include #include #include +#include #include "percpu_freelist.h" -#define STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |\ +BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; - u64 ip[]; + u64 data[]; }; struct bpf_stack_map { @@ -29,6 +31,17 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; +static inline bool stack_map_use_build_id(struct bpf_map *map) +{ + return (map->map_flags & BPF_F_STACK_BUILD_ID); +} + +static inline int stack_map_data_size(struct bpf_map *map) +{ + return stack_map_use_build_id(map) ? + sizeof(struct bpf_stack_build_id) : sizeof(u64); +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; @@ -68,8 +81,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - value_size < 8 || value_size % 8 || - value_size / 8 > sysctl_perf_event_max_stack) + value_size < 8 || value_size % 8) + return ERR_PTR(-EINVAL); + + BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); + if (attr->map_flags & BPF_F_STACK_BUILD_ID) { + if (value_size % sizeof(struct bpf_stack_build_id) || + value_size / sizeof(struct bpf_stack_build_id) + > sysctl_perf_event_max_stack) +