This patch introduces three kfuncs dedicated to kexec_file_load. In the case of kexec, kexec_trylock() ensures there is no concurrent caller, which simplifies the design of the kexec bpf kfuncs. (Maybe later, a dedicated BPF_PROG_TYPE_KEXEC can be introduced to restrict their use and improve safety.)
bpf_kexec_decompress(): It creates a bridge to the kernel decompressor, avoiding the need to reimplement lib/decompress_* in bpf programs. bpf_kexec_result_release(): It releases the resource when the bpf-prog is done with it. bpf_kexec_carrier(): The common data flow in the bpf scheme is from the kernel to the bpf-prog. In the case of kexec_file_load, the kexec component needs to buffer the result parsed by the bpf-prog (the opposite of the usual direction) for the next parsing stage. bpf_kexec_carrier() makes this reverse data flow possible. A bpf-prog can publish the parsed payload address to the kernel, and the latter can copy the payload for future use. Signed-off-by: Pingfan Liu <pi...@redhat.com> Cc: Alexei Starovoitov <a...@kernel.org> Cc: Daniel Borkmann <dan...@iogearbox.net> Cc: Andrii Nakryiko <and...@kernel.org> Cc: Martin KaFai Lau <martin....@linux.dev> Cc: Eduard Zingerman <eddy...@gmail.com> Cc: Song Liu <s...@kernel.org> Cc: Yonghong Song <yonghong.s...@linux.dev> Cc: John Fastabend <john.fastab...@gmail.com> Cc: KP Singh <kpsi...@kernel.org> Cc: Stanislav Fomichev <s...@fomichev.me> Cc: Hao Luo <hao...@google.com> Cc: Jiri Olsa <jo...@kernel.org> Cc: Baoquan He <b...@redhat.com> Cc: Dave Young <dyo...@redhat.com> Cc: Eric Biederman <ebied...@xmission.com> To: b...@vger.kernel.org To: kexec@lists.infradead.org --- kernel/kexec_pe_image.c | 194 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) diff --git a/kernel/kexec_pe_image.c b/kernel/kexec_pe_image.c index accf6b0f02e39..610bb134f5e34 100644 --- a/kernel/kexec_pe_image.c +++ b/kernel/kexec_pe_image.c @@ -15,6 +15,9 @@ #include <linux/kexec.h> #include <linux/pe.h> #include <linux/string.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/decompress/generic.h> #include <asm/byteorder.h> #include <asm/cpufeature.h> #include <asm/image.h> @@ -52,6 +55,186 @@ static struct parsed_phase *alloc_new_phase(void) return phase; } +struct mem_range_result { + refcount_t usage; + /* + * 
Pointer to a kernel space, which is written by kfunc and read by
+	 * bpf-prog. Hence kfunc guarantees its validity.
+	 */
+	char *buf;
+	uint32_t size; // Size of decompressed data
+	int status; // Status code (0 for success)
+};
+
+#define MAX_KEXEC_RES_SIZE (1 << 29)
+
+BTF_KFUNCS_START(bpf_kexec_ids)
+BTF_ID_FLAGS(func, bpf_kexec_carrier, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_kexec_decompress, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_kexec_result_release, KF_RELEASE)
+BTF_KFUNCS_END(bpf_kexec_ids)
+
+static const struct btf_kfunc_id_set kexec_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set = &bpf_kexec_ids,
+};
+
+/*
+ * Copy the partial decompressed content in [buf, buf + len) to dst.
+ * If the dst size is beyond the capacity, return 0 to tell the
+ * decompress method that something is wrong.
+ */
+//to do: still a stub, it copies nothing and always returns 0
+static long flush_buffer(void *buf, unsigned long len)
+{
+
+	//return len to indicate everything goes smoothly
+	return 0;
+}
+
+
+__bpf_kfunc_start_defs();
+
+/*
+ * Copy r->buf (r->size bytes) into a new kexec_res queued on cur_phase. @name is one of: kernel, initrd, cmdline
+ */
+__bpf_kfunc int bpf_kexec_carrier(const char *name, struct mem_range_result *r)
+{
+	struct kexec_res *res;
+	int ret = 0;
+
+	if (!r) {
+		pr_err("%s, receive invalid range\n", __func__);
+		return -EINVAL;
+	}
+
+	if (!name)
+		return -EINVAL;
+	if (r->size == 0 || r->size > MAX_KEXEC_RES_SIZE) {
+		pr_err("Invalid resource size: 0x%x\n", r->size);
+		return -EINVAL;
+	}
+
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	for (int i = 0; i < ARRAY_SIZE(kexec_res_names); i++) {
+		if (!strcmp(kexec_res_names[i], name))
+			res->name = kexec_res_names[i];
+	}
+
+	if (res->name == NULL) {
+		pr_err("Invalid resource name: %s, should be 'kernel', 'initrd', 'cmdline'\n", name);
+		kfree(res);
+		return -EINVAL;
+	}
+
+	res->buf = vmalloc(r->size);
+	if (!res->buf) {
+		kfree(res);
+		return -ENOMEM;
+	}
+	ret = copy_from_kernel_nofault(res->buf, r->buf, r->size);
+	if (unlikely(ret < 0)) {
+		vfree(res->buf);
+		kfree(res);
+		return -EINVAL;
+	}
+	res->size = r->size;
+
+	INIT_LIST_HEAD(&res->node);
+	list_add_tail(&res->node, &cur_phase->res_head);
+	return 0;
+}
+
+__bpf_kfunc struct mem_range_result *bpf_kexec_decompress(char *image_gz_payload, int image_gz_sz,
+			unsigned int expected_decompressed_sz)
+{
+	decompress_fn decompressor;
+	//todo, use flush to cap the memory size used by decompression
+	long (*flush)(void*, unsigned long) = NULL;
+	struct mem_range_result *range;
+	const char *name;
+	void *output_buf;
+	char *input_buf;
+	int ret;
+
+	range = kmalloc(sizeof(struct mem_range_result), GFP_KERNEL);
+	if (!range) {
+		pr_err("fail to allocate mem_range_result\n");
+		return NULL;
+	}
+	refcount_set(&range->usage, 1);
+
+	input_buf = vmalloc(image_gz_sz);
+	if (!input_buf) {
+		pr_err("fail to allocate input buffer\n");
+		kfree(range);
+		return NULL;
+	}
+
+	ret = copy_from_kernel_nofault(input_buf, image_gz_payload, image_gz_sz);
+	if (ret < 0) {
+		pr_err("Error when copying from 0x%px, size:0x%x\n",
+				image_gz_payload, image_gz_sz);
+		kfree(range);
+		vfree(input_buf);
+		return NULL;
+	}
+
+	output_buf = vmalloc(expected_decompressed_sz);
+	if (!output_buf) {
+		pr_err("fail to allocate output buffer\n");
+		kfree(range);
+		vfree(input_buf);
+		return NULL;
+	}
+
+	decompressor = decompress_method(input_buf, image_gz_sz, &name);
+	if (!decompressor) {
+		pr_err("Can not find decompress method\n");
+		kfree(range);
+		vfree(input_buf);
+		vfree(output_buf);
+		return NULL;
+	}
+	//to do, use flush. Decompress the checked private copy, not the caller's pointer
+	ret = decompressor(input_buf, image_gz_sz, NULL, NULL, output_buf, NULL, NULL);
+	vfree(input_buf); /* compressed copy is not needed on either path below */
+
+	/* On success, hand output_buf over to the refcounted range */
+	if (ret == 0) {
+		range->buf = output_buf;
+		range->size = expected_decompressed_sz;
+		range->status = 0;
+	} else {
+		pr_err("Decompress error\n");
+		vfree(output_buf);
+		kfree(range);
+		return NULL;
+	}
+	pr_info("%s, return range 0x%px\n", __func__, range);
+	return range;
+}
+
+__bpf_kfunc int bpf_kexec_result_release(struct
mem_range_result *result)
+{
+	if (!result) {
+		pr_err("%s, receive invalid range\n", __func__);
+		return -EINVAL;
+	}
+
+	if (refcount_dec_and_test(&result->usage)) {
+		vfree(result->buf);
+		kfree(result);
+	}
+
+	return 0;
+}
+
+__bpf_kfunc_end_defs();
+
 static bool is_valid_pe(const char *kernel_buf, unsigned long kernel_len)
 {
 	struct mz_hdr *mz;
@@ -336,3 +519,14 @@ const struct kexec_file_ops kexec_pe_image_ops = {
 	.verify_sig = kexec_kernel_verify_pe_sig,
 #endif
 };
+
+static int __init bpf_kfunc_init(void)
+{
+	int ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &kexec_kfunc_set);
+	if (ret)
+		pr_err("Fail to register btf for kexec_kfunc_set\n");
+	return ret;
+}
+late_initcall(bpf_kfunc_init);
-- 
2.49.0