This commit bridges the gap between bpf-prog and the kernel
decompression routines. At present, a single global memory allocator
serves all decompression. Later, if needed, the decompress_fn prototype
can be changed to pass in a per-task allocator.

The allocator hands out one 8MB chunk (CHUNK_SIZE) at a time through a
transient virtual mapping, up to a 256MB limit
(MAX_UNCOMPRESSED_BUF_SIZE). After decompression finishes, it presents
all of the decompressed data in a single, unified virtual address
space.
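
A rough sketch of the intended call pattern from a bpf-prog follows;
the variable names and surrounding program are illustrative, while
mem_range_result and bpf_mem_range_result_put come from earlier patches
in this series:

	extern struct mem_range_result *
	bpf_decompress(char *image_gz_payload, int image_gz_sz) __ksym;
	extern void bpf_mem_range_result_put(struct mem_range_result *r) __ksym;

	/* inside a sleepable program; payload/payload_sz are illustrative */
	struct mem_range_result *r = bpf_decompress(payload, payload_sz);

	if (!r)
		return -1;
	/* r->buf holds r->data_sz bytes of decompressed data */
	bpf_mem_range_result_put(r);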

Signed-off-by: Pingfan Liu <pi...@redhat.com>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Cc: John Fastabend <john.fastab...@gmail.com>
Cc: Andrii Nakryiko <and...@kernel.org>
Cc: Martin KaFai Lau <martin....@linux.dev>
Cc: Eduard Zingerman <eddy...@gmail.com>
Cc: Song Liu <s...@kernel.org>
Cc: Yonghong Song <yonghong.s...@linux.dev>
Cc: KP Singh <kpsi...@kernel.org>
Cc: Stanislav Fomichev <s...@fomichev.me>
Cc: Hao Luo <hao...@google.com>
Cc: Jiri Olsa <jo...@kernel.org>
To: b...@vger.kernel.org
---
 kernel/bpf/helpers.c | 226 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 226 insertions(+)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bd83ec9a2b2a6..895fe8fdaa78d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -25,6 +25,7 @@
 #include <linux/kasan.h>
 #include <linux/bpf_verifier.h>
 #include <linux/uaccess.h>
+#include <linux/decompress/generic.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -3703,6 +3704,230 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
        return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
 }
 
+#ifdef CONFIG_KEXEC_PE_IMAGE
+
+/* Upper bound on the total decompressed size: 256MB */
+#define MAX_UNCOMPRESSED_BUF_SIZE      (1 << 28)
+/* A chunk must be large enough to hold any single flush from the decompressor */
+#define CHUNK_SIZE     (1 << 23)
+
+/*
+ * At present, a single global allocator serves all decompression.  Later, if
+ * needed, the decompress_fn prototype can be changed to pass in a per-task
+ * allocator.
+ */
+static DEFINE_MUTEX(output_buf_mutex);
+
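+/*
+ * Bookkeeping for one decompression: @pages tracks every allocated backing
+ * page and @pg_idx counts how many are in use.  [@chunk_start, @chunk_start +
+ * @chunk_size) is the currently vmap-ed chunk, with @chunk_cur as the write
+ * cursor inside it.
+ */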
+struct decompress_mem_allocator {
+       struct page **pages;
+       unsigned int pg_idx;
+       void *chunk_start;
+       unsigned int chunk_size;
+       void *chunk_cur;
+};
+
+static struct decompress_mem_allocator dcmpr_allocator;
+
+/*
+ * Set up an active chunk to hold partially decompressed data.
+ */
+static void *vmap_decompressed_chunk(void)
+{
+       struct decompress_mem_allocator *a = &dcmpr_allocator;
+       unsigned int i, pg_cnt = a->chunk_size >> PAGE_SHIFT;
+       struct page **pg_start = &a->pages[a->pg_idx];
+
+       for (i = 0; i < pg_cnt; i++) {
+               struct page *pg = alloc_page(GFP_KERNEL | __GFP_ACCOUNT);
+
+               /* pages already stored are freed by decompress_mem_allocator_fini() */
+               if (unlikely(!pg))
+                       return NULL;
+               a->pages[a->pg_idx++] = pg;
+       }
+
+       return vmap(pg_start, pg_cnt, VM_MAP, PAGE_KERNEL);
+}
+
+/*
+ * Present the scattered pages containing decompressed data at a unified
+ * virtual address.
+ */
+static int decompress_mem_allocator_handover(struct decompress_mem_allocator *a,
+               struct mem_range_result *range)
+{
+       unsigned long pg_array_sz = a->pg_idx * sizeof(struct page *);
+
+       range->pages = vmalloc(pg_array_sz);
+       if (!range->pages)
+               return -ENOMEM;
+
+       range->pg_cnt = a->pg_idx;
+       memcpy(range->pages, a->pages, pg_array_sz);
+       range->buf = vmap(range->pages, range->pg_cnt, VM_MAP, PAGE_KERNEL);
+       if (!range->buf) {
+               vfree(range->pages);
+               return -ENOMEM;
+       }
+       /*
+        * Free the tracking array only; the pages themselves are freed when
+        * the mem_range_result is released.
+        */
+       vfree(a->pages);
+       a->pages = NULL;
+
+       /* vmap-ed */
+       range->alloc_type = TYPE_VMAP;
+       range->buf_sz = a->pg_idx << PAGE_SHIFT;
+       range->data_sz = range->buf_sz - a->chunk_size;
+       range->data_sz += a->chunk_cur - a->chunk_start;
+
+       return 0;
+}
+
+static int decompress_mem_allocator_init(
+       struct decompress_mem_allocator *allocator,
+       unsigned int chunk_size)
+{
+       unsigned long sz = (MAX_UNCOMPRESSED_BUF_SIZE >> PAGE_SHIFT) * sizeof(struct page *);
+
+       allocator->pages = __vmalloc(sz, GFP_KERNEL | __GFP_ACCOUNT);
+       if (!allocator->pages)
+               return -ENOMEM;
+
+       allocator->pg_idx = 0;
+       allocator->chunk_start = NULL;
+       allocator->chunk_size = chunk_size;
+       allocator->chunk_cur = NULL;
+       return 0;
+}
+
+static void decompress_mem_allocator_fini(struct decompress_mem_allocator *allocator)
+{
+       unsigned int i;
+
+       /* unmap the active chunk */
+       if (allocator->chunk_start)
+               vunmap(allocator->chunk_start);
+       if (allocator->pages) {
+               for (i = 0; i < allocator->pg_idx; i++)
+                       __free_pages(allocator->pages[i], 0);
+               vfree(allocator->pages);
+       }
+}
+
+/*
+ * This is a callback for decompress_fn.
+ *
+ * It copies the partially decompressed content in [buf, buf + len) into the
+ * active chunk.  If the active chunk is not large enough, retire it and
+ * activate a new chunk to hold the remaining data.
+ */
+static long flush(void *buf, unsigned long len)
+{
+       struct decompress_mem_allocator *a = &dcmpr_allocator;
+       long free, copied = 0;
+
+       /* first-time allocation of the active chunk */
+       if (unlikely(!a->chunk_start)) {
+               a->chunk_start = a->chunk_cur = vmap_decompressed_chunk();
+               if (unlikely(!a->chunk_start))
+                       return -1;
+       }
+
+       free = a->chunk_start + a->chunk_size - a->chunk_cur;
+       BUG_ON(free < 0);
+       if (free < len) {
+               /*
+                * If the total size exceeds MAX_UNCOMPRESSED_BUF_SIZE,
+                * return -1 to signal to the decompress method that something
+                * is wrong.
+                */
+               if (unlikely(a->pg_idx >= MAX_UNCOMPRESSED_BUF_SIZE >> PAGE_SHIFT))
+                       return -1;
+               memcpy(a->chunk_cur, buf, free);
+               copied += free;
+               a->chunk_cur += free;
+               len -= free;
+               /*
+                * When retiring the active chunk, release its virtual address
+                * but do not release the contents in the pages.
+                */
+               vunmap(a->chunk_start);
+               a->chunk_start = a->chunk_cur = vmap_decompressed_chunk();
+               if (unlikely(!a->chunk_start))
+                       return -1;
+       }
+       memcpy(a->chunk_cur, buf, len);
+       copied += len;
+       a->chunk_cur += len;
+       return copied;
+}
+
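+/*
+ * bpf_decompress - decompress a kernel buffer into a vmap-ed memory range
+ * @image_gz_payload: start of the compressed payload
+ * @image_gz_sz: size of the payload in bytes
+ *
+ * The compression format is auto-detected by decompress_method().  On
+ * success, return a mem_range_result whose buf/data_sz describe the
+ * decompressed data; the caller must drop the reference with
+ * bpf_mem_range_result_put().  Return NULL on failure.
+ */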
+__bpf_kfunc struct mem_range_result *bpf_decompress(char *image_gz_payload, int image_gz_sz)
+{
+       struct decompress_mem_allocator *a = &dcmpr_allocator;
+       decompress_fn decompressor;
+       struct mem_cgroup *memcg, *old_memcg;
+       struct mem_range_result *range;
+       const char *name;
+       char *input_buf;
+       int ret;
+
+       memcg = get_mem_cgroup_from_current();
+       old_memcg = set_active_memcg(memcg);
+       range = mem_range_result_alloc();
+       if (!range) {
+               pr_err("failed to allocate mem_range_result\n");
+               goto error;
+       }
+
+       input_buf = __vmalloc(image_gz_sz, GFP_KERNEL | __GFP_ACCOUNT);
+       if (!input_buf) {
+               kfree(range);
+               range = NULL;
+               pr_err("failed to allocate input buffer\n");
+               goto error;
+       }
+
+       ret = copy_from_kernel_nofault(input_buf, image_gz_payload, image_gz_sz);
+       if (ret < 0) {
+               kfree(range);
+               range = NULL;
+               vfree(input_buf);
+               pr_err("failed to copy from 0x%p, size: 0x%x\n",
+                               image_gz_payload, image_gz_sz);
+               goto error;
+       }
+
+       mutex_lock(&output_buf_mutex);
+       ret = decompress_mem_allocator_init(a, CHUNK_SIZE);
+       if (ret) {
+               mutex_unlock(&output_buf_mutex);
+               kfree(range);
+               range = NULL;
+               vfree(input_buf);
+               pr_err("failed to initialize the decompress allocator\n");
+               goto error;
+       }
+       decompressor = decompress_method(input_buf, image_gz_sz, &name);
+       if (!decompressor) {
+               ret = -EINVAL;
+               vfree(input_buf);
+               pr_err("cannot find a decompress method\n");
+               /* release the allocator and the mutex via the common path */
+               goto fail;
+       }
+       ret = decompressor(input_buf, image_gz_sz, NULL, flush,
+                               NULL, NULL, NULL);
+
+       vfree(input_buf);
+       if (ret == 0) {
+               ret = decompress_mem_allocator_handover(a, range);
+               if (ret)
+                       goto fail;
+               range->status = 0;
+               mem_cgroup_tryget(memcg);
+               range->memcg = memcg;
+       }
+fail:
+       decompress_mem_allocator_fini(a);
+       mutex_unlock(&output_buf_mutex);
+       if (ret) {
+               kfree(range);
+               range = NULL;
+               pr_err("decompression failed\n");
+       }
+
+error:
+       set_active_memcg(old_memcg);
+       mem_cgroup_put(memcg);
+       return range;
+}
+#endif
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -3710,6 +3935,7 @@ BTF_KFUNCS_START(generic_btf_ids)
 BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
 #endif
 #ifdef CONFIG_KEXEC_PE_IMAGE
+BTF_ID_FLAGS(func, bpf_decompress, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_mem_range_result_put, KF_RELEASE | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_copy_to_kernel, KF_TRUSTED_ARGS | KF_SLEEPABLE)
 #endif
-- 
2.49.0
