This commit bridges the gap between bpf-prog and the kernel's decompression routines. At present, a single global memory allocator is used for decompression. Later, if needed, the decompress_fn prototype can be changed to pass in a per-task allocator.

This memory allocator hands out an 8MB chunk (CHUNK_SIZE) at a time at a transient virtual address, up to a 256MB (MAX_UNCOMPRESSED_BUF_SIZE) limit. After decompression finishes, it presents all of the decompressed data in a single unified virtual address space.
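For illustration, a minimal sketch of how a BPF program might call the kfunc.
The SEC name, the image_args layout, and the exact prototype of the release
kfunc bpf_mem_range_result_put() are assumptions for this example, not part
of the patch:

  extern struct mem_range_result *
  bpf_decompress(char *image_gz_payload, int image_gz_sz) __ksym;
  extern void bpf_mem_range_result_put(struct mem_range_result *range) __ksym;

  SEC("syscall")
  int decompress_image(struct image_args *args)
  {
          struct mem_range_result *range;

          /* args->payload/args->size are assumed to describe the gzip'ed image */
          range = bpf_decompress(args->payload, args->size);
          if (!range)
                  return -1;

          /* ... hand range->buf / range->data_sz to the next stage ... */

          /* bpf_decompress() is KF_ACQUIRE; the reference must be released */
          bpf_mem_range_result_put(range);
          return 0;
  }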
Signed-off-by: Pingfan Liu <pi...@redhat.com>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Cc: John Fastabend <john.fastab...@gmail.com>
Cc: Andrii Nakryiko <and...@kernel.org>
Cc: Martin KaFai Lau <martin....@linux.dev>
Cc: Eduard Zingerman <eddy...@gmail.com>
Cc: Song Liu <s...@kernel.org>
Cc: Yonghong Song <yonghong.s...@linux.dev>
Cc: KP Singh <kpsi...@kernel.org>
Cc: Stanislav Fomichev <s...@fomichev.me>
Cc: Hao Luo <hao...@google.com>
Cc: Jiri Olsa <jo...@kernel.org>
To: b...@vger.kernel.org
---
 kernel/bpf/helpers.c | 226 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 226 insertions(+)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bd83ec9a2b2a6..895fe8fdaa78d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -25,6 +25,7 @@
 #include <linux/kasan.h>
 #include <linux/bpf_verifier.h>
 #include <linux/uaccess.h>
+#include <linux/decompress/generic.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -3703,6 +3704,230 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
 	return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
 }
 
+#ifdef CONFIG_KEXEC_PE_IMAGE
+
+#define MAX_UNCOMPRESSED_BUF_SIZE	(1 << 28)
+/* a chunk must be large enough to hold a single flush of decompressed data */
+#define CHUNK_SIZE	(1 << 23)
+
+/*
+ * At present, one global allocator for decompression. Later, if needed, the
+ * prototype of decompress_fn can be changed to pass in a per-task allocator.
+ */
+static DEFINE_MUTEX(output_buf_mutex);
+
+struct decompress_mem_allocator {
+	struct page **pages;
+	unsigned int pg_idx;
+	void *chunk_start;
+	unsigned int chunk_size;
+	void *chunk_cur;
+};
+
+static struct decompress_mem_allocator dcmpr_allocator;
+
+/*
+ * Set up an active chunk to hold partially decompressed data.
+ */
+static void *vmap_decompressed_chunk(void)
+{
+	struct decompress_mem_allocator *a = &dcmpr_allocator;
+	unsigned int i, pg_cnt = a->chunk_size >> PAGE_SHIFT;
+	struct page **pg_start = &a->pages[a->pg_idx];
+
+	for (i = 0; i < pg_cnt; i++) {
+		a->pages[a->pg_idx] = alloc_page(GFP_KERNEL | __GFP_ACCOUNT);
+		/* pages allocated so far remain counted and are freed in fini */
+		if (!a->pages[a->pg_idx])
+			return NULL;
+		a->pg_idx++;
+	}
+
+	return vmap(pg_start, pg_cnt, VM_MAP, PAGE_KERNEL);
+}
+
+/*
+ * Present the scattered pages containing decompressed data at a unified
+ * virtual address.
+ */
+static int decompress_mem_allocator_handover(struct decompress_mem_allocator *a,
+					     struct mem_range_result *range)
+{
+	unsigned long pg_array_sz = a->pg_idx * sizeof(struct page *);
+
+	range->pages = vmalloc(pg_array_sz);
+	if (!range->pages)
+		return -ENOMEM;
+
+	range->pg_cnt = a->pg_idx;
+	memcpy(range->pages, a->pages, pg_array_sz);
+	range->buf = vmap(range->pages, range->pg_cnt, VM_MAP, PAGE_KERNEL);
+	if (!range->buf) {
+		vfree(range->pages);
+		return -ENOMEM;
+	}
+	/*
+	 * Free the tracking array; the pages themselves are freed when the
+	 * mem_range_result is released.
+	 */
+	vfree(a->pages);
+	a->pages = NULL;
+
+	/* vmap-ed */
+	range->alloc_type = TYPE_VMAP;
+	range->buf_sz = a->pg_idx << PAGE_SHIFT;
+	/* used bytes: all retired chunks plus the used part of the active chunk */
+	range->data_sz = range->buf_sz - a->chunk_size;
+	range->data_sz += a->chunk_cur - a->chunk_start;
+
+	return 0;
+}
+
+static int decompress_mem_allocator_init(
+		struct decompress_mem_allocator *allocator,
+		unsigned int chunk_size)
+{
+	unsigned long sz = (MAX_UNCOMPRESSED_BUF_SIZE >> PAGE_SHIFT) * sizeof(struct page *);
+
+	allocator->pages = __vmalloc(sz, GFP_KERNEL | __GFP_ACCOUNT);
+	if (!allocator->pages)
+		return -ENOMEM;
+
+	allocator->pg_idx = 0;
+	allocator->chunk_start = NULL;
+	allocator->chunk_size = chunk_size;
+	allocator->chunk_cur = NULL;
+	return 0;
+}
+
+static void decompress_mem_allocator_fini(struct decompress_mem_allocator *allocator)
+{
+	unsigned int i;
+
+	/* unmap the active chunk */
+	if (allocator->chunk_start) {
+		vunmap(allocator->chunk_start);
+		allocator->chunk_start = NULL;
+	}
+	/* on the success path, handover has already taken the pages */
+	if (allocator->pages) {
+		for (i = 0; i < allocator->pg_idx; i++)
+			__free_pages(allocator->pages[i], 0);
+		vfree(allocator->pages);
+		allocator->pages = NULL;
+	}
+}
+
+/*
+ * This is a callback for decompress_fn.
+ *
+ * It copies the partially decompressed content in [buf, buf + len) into the
+ * active chunk. If the active chunk is not large enough, retire it and
+ * activate a new chunk to hold the remaining data.
+ */
+static long flush(void *buf, unsigned long len)
+{
+	struct decompress_mem_allocator *a = &dcmpr_allocator;
+	long free, copied = 0;
+
+	/* the first-time allocation */
+	if (unlikely(!a->chunk_start)) {
+		a->chunk_start = a->chunk_cur = vmap_decompressed_chunk();
+		if (unlikely(!a->chunk_start))
+			return -1;
+	}
+
+	free = a->chunk_start + a->chunk_size - a->chunk_cur;
+	BUG_ON(free < 0);
+	if (free < len) {
+		/*
+		 * If the total size exceeds MAX_UNCOMPRESSED_BUF_SIZE,
+		 * return -1 to signal the decompress method that something
+		 * is wrong.
+		 */
+		if (unlikely(a->pg_idx >= MAX_UNCOMPRESSED_BUF_SIZE >> PAGE_SHIFT))
+			return -1;
+		memcpy(a->chunk_cur, buf, free);
+		copied += free;
+		buf += free;
+		len -= free;
+		/*
+		 * When retiring the active chunk, release its virtual address
+		 * but do not release the contents of the pages.
+		 */
+		vunmap(a->chunk_start);
+		a->chunk_start = a->chunk_cur = vmap_decompressed_chunk();
+		if (unlikely(!a->chunk_start))
+			return -1;
+	}
+	memcpy(a->chunk_cur, buf, len);
+	copied += len;
+	a->chunk_cur += len;
+	return copied;
+}
+
+/*
+ * decompress_fn implementations call error() on failure, so a non-NULL
+ * callback must be supplied.
+ */
+static void decompress_error(char *x)
+{
+	pr_err("decompress: %s\n", x);
+}
+
+__bpf_kfunc struct mem_range_result *bpf_decompress(char *image_gz_payload, int image_gz_sz)
+{
+	struct decompress_mem_allocator *a = &dcmpr_allocator;
+	decompress_fn decompressor;
+	struct mem_cgroup *memcg, *old_memcg;
+	struct mem_range_result *range;
+	const char *name;
+	char *input_buf;
+	int ret;
+
+	memcg = get_mem_cgroup_from_current();
+	old_memcg = set_active_memcg(memcg);
+	range = mem_range_result_alloc();
+	if (!range) {
+		pr_err("fail to allocate mem_range_result\n");
+		goto error;
+	}
+
+	input_buf = __vmalloc(image_gz_sz, GFP_KERNEL | __GFP_ACCOUNT);
+	if (!input_buf) {
+		kfree(range);
+		range = NULL;
+		pr_err("fail to allocate input buffer\n");
+		goto error;
+	}
+
+	ret = copy_from_kernel_nofault(input_buf, image_gz_payload, image_gz_sz);
+	if (ret < 0) {
+		kfree(range);
+		range = NULL;
+		vfree(input_buf);
+		pr_err("Error when copying from 0x%p, size:0x%x\n",
+		       image_gz_payload, image_gz_sz);
+		goto error;
+	}
+
+	mutex_lock(&output_buf_mutex);
+	ret = decompress_mem_allocator_init(a, CHUNK_SIZE);
+	if (ret) {
+		vfree(input_buf);
+		goto fail;
+	}
+	decompressor = decompress_method(input_buf, image_gz_sz, &name);
+	if (!decompressor) {
+		vfree(input_buf);
+		pr_err("Can not find decompress method\n");
+		ret = -EINVAL;
+		goto fail;
+	}
+	ret = decompressor(input_buf, image_gz_sz, NULL, flush,
+			   NULL, NULL, decompress_error);
+
+	vfree(input_buf);
+	if (ret == 0) {
+		ret = decompress_mem_allocator_handover(a, range);
+		if (ret)
+			goto fail;
+		range->status = 0;
+		mem_cgroup_tryget(memcg);
+		range->memcg = memcg;
+	}
+fail:
+	decompress_mem_allocator_fini(a);
+	mutex_unlock(&output_buf_mutex);
+	if (ret) {
+		kfree(range);
+		range = NULL;
+		pr_err("Decompress error\n");
+	}
+
+error:
+	set_active_memcg(old_memcg);
+	mem_cgroup_put(memcg);
+	return range;
+}
+#endif
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -3710,6 +3935,7 @@ BTF_KFUNCS_START(generic_btf_ids)
 BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
 #endif
 #ifdef CONFIG_KEXEC_PE_IMAGE
+BTF_ID_FLAGS(func, bpf_decompress, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_mem_range_result_put, KF_RELEASE | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_copy_to_kernel, KF_TRUSTED_ARGS | KF_SLEEPABLE)
 #endif
-- 
2.49.0