This BPF program follows the convention defined in the kernel file
kexec_pe_parser_bpf.lskel.h, which establishes the interface between the
BPF program and the kernel. That interface is composed of:

four maps:
  struct bpf_map_desc ringbuf_1;
  struct bpf_map_desc ringbuf_2;
  struct bpf_map_desc ringbuf_3;
  struct bpf_map_desc ringbuf_4;
four sections:
  struct bpf_map_desc rodata;
  struct bpf_map_desc data;
  struct bpf_map_desc bss;
  struct bpf_map_desc rodata_str1_1;
two progs:
  SEC("fentry.s/bpf_handle_pefile")
  SEC("fentry.s/bpf_post_handle_pefile")

This BPF program only uses ringbuf_1, so it minimizes the size of the other
three ringbufs to one byte. The size of ringbuf_1 is deduced from the size
of the uncompressed file 'vmlinux.bin', which is usually less than 64MB.

With the help of a group of bpf kfuncs: bpf_decompress(),
bpf_copy_to_kernel(), bpf_mem_range_result_put(), this bpf-prog stores the
uncompressed kernel image inside the kernel space.

Signed-off-by: Pingfan Liu <pi...@redhat.com>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Baoquan He <b...@redhat.com>
Cc: Dave Young <dyo...@redhat.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Philipp Rudo <pr...@redhat.com>
Cc: b...@vger.kernel.org
To: ke...@lists.infradead.org
---
 tools/kexec/Makefile           |  82 +++++++++++++++++
 tools/kexec/zboot_parser_bpf.c | 158 +++++++++++++++++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 tools/kexec/Makefile
 create mode 100644 tools/kexec/zboot_parser_bpf.c

diff --git a/tools/kexec/Makefile b/tools/kexec/Makefile
new file mode 100644
index 0000000000000..c9e7ce9ff4c19
--- /dev/null
+++ b/tools/kexec/Makefile
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Ensure Kbuild variables are available
+include ../scripts/Makefile.include
+
+srctree := $(patsubst %/tools/kexec,%,$(CURDIR))
+VMLINUX = $(srctree)/vmlinux
+TOOLSDIR := $(srctree)/tools
+LIBDIR := $(TOOLSDIR)/lib
+BPFDIR := $(LIBDIR)/bpf
+ARCH ?= $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ -e s/aarch64.*/arm64/ -e s/riscv64/riscv/ -e s/loongarch.*/loongarch/)
+# At present, zboot image format is used by arm64, riscv, loongarch
+# And arch/$(ARCH)/boot/vmlinux.bin is the uncompressed file instead of arch/$(ARCH)/boot/Image
+ifeq ($(ARCH),$(filter $(ARCH),arm64 riscv loongarch))
+  EFI_IMAGE := $(srctree)/arch/$(ARCH)/boot/vmlinuz.efi
+  KERNEL_IMAGE := $(srctree)/arch/$(ARCH)/boot/vmlinux.bin
+else
+  $(error Unsupported architecture: $(ARCH))
+endif
+
+CC = clang
+CFLAGS = -O2
+BPF_PROG_CFLAGS = -g -O2 -target bpf -Wall -I $(BPFDIR) -I .
+BPFTOOL = bpftool
+
+# List of generated target files
+HEADERS = vmlinux.h bpf_helper_defs.h image_size.h
+ZBOOT_TARGETS = bytecode.c zboot_parser_bpf.o bytecode.o
+
+# Targets
+zboot: $(HEADERS) $(ZBOOT_TARGETS)
+
+# Rule to generate vmlinux.h from vmlinux
+vmlinux.h: $(VMLINUX)
+	@command -v $(BPFTOOL) >/dev/null 2>&1 || { echo >&2 "$(BPFTOOL) is required but not found. Please install it."; exit 1; }
+	@$(BPFTOOL) btf dump file $(VMLINUX) format c > vmlinux.h
+
+bpf_helper_defs.h: $(srctree)/tools/include/uapi/linux/bpf.h
+	@$(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \
+		--file $(srctree)/tools/include/uapi/linux/bpf.h > bpf_helper_defs.h
+
+# RINGBUF1_SIZE must be a power of two that can hold one record of
+# MAX_RECORD_SIZE (IMAGE_SIZE + 4096, see zboot_parser_bpf.c) plus the
+# 8-byte ringbuf record header, otherwise bpf_ringbuf_reserve() always fails.
+image_size.h: $(KERNEL_IMAGE)
+	@{ \
+		if [ ! -f "$(KERNEL_IMAGE)" ]; then \
+			echo "Error: File '$(KERNEL_IMAGE)' does not exist"; \
+			exit 1; \
+		fi; \
+		FILE_SIZE=$$(stat -c '%s' "$(KERNEL_IMAGE)" 2>/dev/null); \
+		NEEDED=$$((FILE_SIZE + 4096 + 8)); \
+		POWER=4096; \
+		while [ $$POWER -lt $$NEEDED ]; do \
+			POWER=$$((POWER * 2)); \
+		done; \
+		RINGBUF_SIZE=$$POWER; \
+		echo "#define RINGBUF1_SIZE $$RINGBUF_SIZE" > $@; \
+		echo "#define IMAGE_SIZE $$FILE_SIZE" >> $@; \
+	}
+
+# Rule to generate zboot_parser_bpf.o, depends on every generated header
+zboot_parser_bpf.o: zboot_parser_bpf.c $(HEADERS)
+	@$(CC) $(BPF_PROG_CFLAGS) -c zboot_parser_bpf.c -o zboot_parser_bpf.o
+
+# Generate zboot_parser_bpf.lskel.h using bpftool
+# Then, extract the opts_data[] and opts_insn[] arrays and remove 'static'
+# keywords to avoid being optimized away.
+bytecode.c: zboot_parser_bpf.o
+	@$(BPFTOOL) gen skeleton -L zboot_parser_bpf.o > zboot_parser_bpf.lskel.h
+	@sed -n '/static const char opts_data\[\]/,/;/p' zboot_parser_bpf.lskel.h | sed 's/static const/const/' > $@
+	@sed -n '/static const char opts_insn\[\]/,/;/p' zboot_parser_bpf.lskel.h | sed 's/static const/const/' >> $@
+	@rm -f zboot_parser_bpf.lskel.h
+
+bytecode.o: bytecode.c
+	@$(CC) -c $< -o $@
+
+# Clean up generated files, including the skeleton left behind by a failed build
+clean:
+	@rm -f $(HEADERS) $(ZBOOT_TARGETS) zboot_parser_bpf.lskel.h
+
+.PHONY: zboot clean
diff --git a/tools/kexec/zboot_parser_bpf.c b/tools/kexec/zboot_parser_bpf.c
new file mode 100644
index 0000000000000..e60621780a1a9
--- /dev/null
+++ b/tools/kexec/zboot_parser_bpf.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+#include "vmlinux.h"
+#include <bpf_helpers.h>
+#include <bpf_tracing.h>
+#include "image_size.h"
+
+/* uncompressed vmlinux.bin plus 4KB; also the size of the one ringbuf record */
+#define MAX_RECORD_SIZE (IMAGE_SIZE + 4096)
+/* ringbuf 2,3,4 are not used by this prog, so keep them as small as possible */
+#define MIN_BUF_SIZE 1
+
+#define KEXEC_RES_KERNEL_NAME "kexec:kernel"
+#define KEXEC_RES_INITRD_NAME "kexec:initrd"
+#define KEXEC_RES_CMDLINE_NAME "kexec:cmdline"
+
+/* ringbuf is safe since the user space has no write access to them */
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, RINGBUF1_SIZE);
+} ringbuf_1 SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, MIN_BUF_SIZE);
+} ringbuf_2 SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, MIN_BUF_SIZE);
+} ringbuf_3 SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, MIN_BUF_SIZE);
+} ringbuf_4 SEC(".maps");
+
+char LICENSE[] SEC("license") = "GPL";
+
+/*
+ * This function ensures that the sections .rodata, .data, .bss and .rodata.str1.1
+ * are created for a bpf prog.
+ */
+__attribute__((used)) static int dummy(void)
+{
+	static const char res_kernel[16] __attribute__((used, section(".rodata"))) = KEXEC_RES_KERNEL_NAME;
+	static char local_name[16] __attribute__((used, section(".data"))) = KEXEC_RES_CMDLINE_NAME;
+	static char res_cmdline[16] __attribute__((used, section(".bss")));
+
+	__builtin_memcpy(local_name, KEXEC_RES_INITRD_NAME, 16);
+	return __builtin_memcmp(local_name, res_kernel, 4);
+}
+
+/* kfuncs used to decompress the payload and store the result in the kernel */
+extern int bpf_copy_to_kernel(const char *name, char *buf, int size) __weak __ksym;
+extern struct mem_range_result *bpf_decompress(char *image_gz_payload, int image_gz_sz) __weak __ksym;
+extern int bpf_mem_range_result_put(struct mem_range_result *result) __weak __ksym;
+
+/* see drivers/firmware/efi/libstub/zboot-header.S */
+struct linux_pe_zboot_header {
+	unsigned int mz_magic;
+	char image_type[4];
+	unsigned int payload_offset;
+	unsigned int payload_size;
+	unsigned int reserved[2];
+	char comp_type[4];
+	unsigned int linux_pe_magic;
+	unsigned int pe_header_offset;
+} __attribute__((packed));
+
+/*
+ * Copy the zboot header out of context->image, validate the payload range,
+ * decompress the payload with bpf_decompress() and pass the result to the
+ * kernel under the name KEXEC_RES_KERNEL_NAME. Every path returns 0.
+ */
+SEC("fentry.s/bpf_handle_pefile")
+int BPF_PROG(parse_pe, struct kexec_context *context)
+{
+	unsigned int payload_offset, payload_size, image_sz;
+	struct linux_pe_zboot_header *zboot_header;
+	struct mem_range_result *r;
+	char local_name[32];
+	char *buf;
+
+	bpf_printk("begin parse PE\n");
+	/* BPF verifier should know each variable initial state */
+	if (!context->image || (context->image_sz > MAX_RECORD_SIZE)) {
+		bpf_printk("Err: image size is greater than 0x%lx\n", MAX_RECORD_SIZE);
+		return 0;
+	}
+
+	/* In order to access bytes not aligned on 2 order, copy into ringbuf.
+	 * And allocate the memory all at once, later overwriting.
+	 *
+	 * R2 is ARG_CONST_ALLOC_SIZE_OR_ZERO, should be decided at compiling time
+	 */
+	buf = (char *)bpf_ringbuf_reserve(&ringbuf_1, MAX_RECORD_SIZE, 0);
+	if (!buf) {
+		bpf_printk("Err: fail to reserve ringbuf to parse zboot header\n");
+		return 0;
+	}
+	image_sz = context->image_sz;
+	bpf_probe_read((void *)buf, sizeof(struct linux_pe_zboot_header), context->image);
+	zboot_header = (struct linux_pe_zboot_header *)buf;
+	if (!!__builtin_memcmp(&zboot_header->image_type, "zimg",
+			       sizeof(zboot_header->image_type))) {
+		bpf_printk("Err: image is not zboot image\n");
+		goto out_discard;
+	}
+
+	payload_offset = zboot_header->payload_offset;
+	payload_size = zboot_header->payload_size;
+	bpf_printk("zboot image payload offset=0x%x, size=0x%x\n", payload_offset, payload_size);
+	/* Payload must lie inside the image; size >= 4 keeps "payload_size - 4" from wrapping */
+	if (payload_size < 4 || payload_offset > image_sz ||
+	    payload_size > image_sz - payload_offset) {
+		bpf_printk("Invalid zboot image payload offset and size\n");
+		goto out_discard;
+	}
+	if (payload_size >= MAX_RECORD_SIZE) {
+		bpf_printk("Err: payload_size > MAX_RECORD_SIZE\n");
+		goto out_discard;
+	}
+	/* Overwrite buf */
+	bpf_probe_read((void *)buf, payload_size, context->image + payload_offset);
+	bpf_printk("Calling bpf_kexec_decompress()\n");
+	r = bpf_decompress(buf, payload_size - 4);
+	if (!r) {
+		bpf_printk("Err: fail to decompress\n");
+		goto out_discard;
+	}
+
+	image_sz = r->data_sz;
+	if (image_sz > MAX_RECORD_SIZE) {
+		bpf_printk("Err: decompressed size too big\n");
+		goto out_put;
+	}
+
+	/* Since the decompressed size is bigger than original, no need to clean */
+	bpf_probe_read((void *)buf, image_sz, r->buf);
+	bpf_printk("Calling bpf_copy_to_kernel(), image_sz=0x%x\n", image_sz);
+	/* Verifier is unhappy to expose .rodata.str1.1 'map' to kernel */
+	/* Copy only sizeof() bytes: the literal is shorter than local_name[] */
+	__builtin_memcpy(local_name, KEXEC_RES_KERNEL_NAME, sizeof(KEXEC_RES_KERNEL_NAME));
+	bpf_copy_to_kernel(local_name, buf, image_sz);
+out_put:
+	bpf_mem_range_result_put(r);
+out_discard:
+	bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP);
+	return 0;
+}
+
+/* No-op: present because the lskel convention is composed of two progs */
+SEC("fentry.s/bpf_post_handle_pefile")
+int BPF_PROG(post_parse_pe, struct kexec_context *context)
+{
+	return 0;
+}
-- 
2.49.0