This BPF program follows the convention defined in the kernel file kexec_pe_parser_bpf.lskel.h, which establishes the interface between the BPF program and the kernel. That interface is composed of: four maps: struct bpf_map_desc ringbuf_1; struct bpf_map_desc ringbuf_2; struct bpf_map_desc ringbuf_3; struct bpf_map_desc ringbuf_4; four sections: struct bpf_map_desc rodata; struct bpf_map_desc data; struct bpf_map_desc bss; struct bpf_map_desc rodata_str1_1;
two progs: SEC("fentry.s/bpf_handle_pefile") SEC("fentry.s/bpf_post_handle_pefile") This BPF program only uses ringbuf_1, so it minimizes the size of the other three ringbufs to one byte. The size of ringbuf_1 is deduced from the size of the uncompressed file 'vmlinux.bin', which is usually less than 64MB. With the help of a group of bpf kfuncs: bpf_decompress(), bpf_copy_to_kernel(), bpf_mem_range_result_put(), this bpf-prog stores the uncompressed kernel image inside the kernel space. Signed-off-by: Pingfan Liu <pi...@redhat.com> Cc: Alexei Starovoitov <a...@kernel.org> Cc: Baoquan He <b...@redhat.com> Cc: Dave Young <dyo...@redhat.com> Cc: Andrew Morton <a...@linux-foundation.org> Cc: Philipp Rudo <pr...@redhat.com> Cc: b...@vger.kernel.org To: kexec@lists.infradead.org --- tools/kexec/Makefile | 81 +++++++++++++++ tools/kexec/pe.h | 177 +++++++++++++++++++++++++++++++++ tools/kexec/zboot_parser_bpf.c | 157 +++++++++++++++++++++++++++++ 3 files changed, 415 insertions(+) create mode 100644 tools/kexec/Makefile create mode 100644 tools/kexec/pe.h create mode 100644 tools/kexec/zboot_parser_bpf.c diff --git a/tools/kexec/Makefile b/tools/kexec/Makefile new file mode 100644 index 0000000000000..49de2ab309a43 --- /dev/null +++ b/tools/kexec/Makefile @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: GPL-2.0 + +# Ensure Kbuild variables are available +include ../scripts/Makefile.include + +srctree := $(patsubst %/tools/kexec,%,$(CURDIR)) +VMLINUX = $(srctree)/vmlinux +TOOLSDIR := $(srctree)/tools +LIBDIR := $(TOOLSDIR)/lib +BPFDIR := $(LIBDIR)/bpf +ARCH ?= $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ -e s/aarch64.*/arm64/ -e s/riscv64/riscv/ -e s/loongarch.*/loongarch/) +# At present, zboot image format is used by arm64, riscv, loongarch +# And arch/$(ARCH)/boot/vmlinux.bin is the uncompressed file instead of arch/$(ARCH)/boot/Image +ifeq ($(ARCH),$(filter $(ARCH),arm64 riscv loongarch)) + EFI_IMAGE := $(srctree)/arch/$(ARCH)/boot/vmlinuz.efi + KERNEL_IMAGE := 
$(srctree)/arch/$(ARCH)/boot/vmlinux.bin +else + @echo "Unsupported architecture: $(ARCH)" + @exit 1 +endif + + +CC = clang +CFLAGS = -O2 +BPF_PROG_CFLAGS = -g -O2 -target bpf -Wall -I $(BPFDIR) -I . +BPFTOOL = bpftool + +# List of generated target files +HEADERS = vmlinux.h bpf_helper_defs.h image_size.h +ZBOOT_TARGETS = bytecode.c zboot_parser_bpf.o bytecode.o + + +# Targets +zboot: $(HEADERS) $(ZBOOT_TARGETS) + +# Rule to generate vmlinux.h from vmlinux +vmlinux.h: $(VMLINUX) + @$(BPFTOOL) btf dump file $(VMLINUX) format c > vmlinux.h + +bpf_helper_defs.h: $(srctree)/tools/include/uapi/linux/bpf.h + @$(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \ + --file $(srctree)/tools/include/uapi/linux/bpf.h > bpf_helper_defs.h + +image_size.h: $(KERNEL_IMAGE) + @{ \ + if [ ! -f "$(KERNEL_IMAGE)" ]; then \ + echo "Error: File '$(KERNEL_IMAGE)' does not exist"; \ + exit 1; \ + fi; \ + FILE_SIZE=$$(stat -c '%s' "$(KERNEL_IMAGE)" 2>/dev/null); \ + POWER=4096; \ + while [ $$POWER -le $$FILE_SIZE ]; do \ + POWER=$$((POWER * 2)); \ + done; \ + RINGBUF_SIZE=$$POWER; \ + echo "#define RINGBUF1_SIZE $$RINGBUF_SIZE" > $@; \ + echo "#define IMAGE_SIZE $$FILE_SIZE" >> $@; \ + } + + +# Rule to generate zboot_parser_bpf.o, depends on vmlinux.h +zboot_parser_bpf.o: zboot_parser_bpf.c vmlinux.h bpf_helper_defs.h + @$(CC) $(BPF_PROG_CFLAGS) -c zboot_parser_bpf.c -o zboot_parser_bpf.o + +# Generate zboot_parser_bpf.lskel.h using bpftool +# Then, extract the opts_data[] and opts_insn[] arrays and remove 'static' +# keywords to avoid being optimized away. 
+bytecode.c: zboot_parser_bpf.o + @$(BPFTOOL) gen skeleton -L zboot_parser_bpf.o > zboot_parser_bpf.lskel.h + @sed -n '/static const char opts_data\[\]/,/;/p' zboot_parser_bpf.lskel.h | sed 's/static const/const/' > $@ + @sed -n '/static const char opts_insn\[\]/,/;/p' zboot_parser_bpf.lskel.h | sed 's/static const/const/' >> $@ + @rm -f zboot_parser_bpf.lskel.h + +bytecode.o: bytecode.c + @$(CC) -c $< -o $@ + +# Clean up generated files +clean: + @rm -f $(HEADERS) $(ZBOOT_TARGETS) + +.PHONY: all clean diff --git a/tools/kexec/pe.h b/tools/kexec/pe.h new file mode 100644 index 0000000000000..9f1d086d6cf1a --- /dev/null +++ b/tools/kexec/pe.h @@ -0,0 +1,177 @@ +/* + * Extract from linux kernel include/linux/pe.h + */ + +#ifndef __PE_H__ +#define __PE_H__ + +#define MZ_MAGIC 0x5a4d /* "MZ" */ +#define PE_MAGIC 0x00004550 /* "PE\0\0" */ + +struct mz_hdr { + uint16_t magic; /* MZ_MAGIC */ + uint16_t lbsize; /* size of last used block */ + uint16_t blocks; /* pages in file, 0x3 */ + uint16_t relocs; /* relocations */ + uint16_t hdrsize; /* header size in "paragraphs" */ + uint16_t min_extra_pps; /* .bss */ + uint16_t max_extra_pps; /* runtime limit for the arena size */ + uint16_t ss; /* relative stack segment */ + uint16_t sp; /* initial %sp register */ + uint16_t checksum; /* word checksum */ + uint16_t ip; /* initial %ip register */ + uint16_t cs; /* initial %cs relative to load segment */ + uint16_t reloc_table_offset; /* offset of the first relocation */ + uint16_t overlay_num; /* overlay number. set to 0. 
*/ + uint16_t reserved0[4]; /* reserved */ + uint16_t oem_id; /* oem identifier */ + uint16_t oem_info; /* oem specific */ + uint16_t reserved1[10]; /* reserved */ + uint32_t peaddr; /* address of pe header */ + char message[]; /* message to print */ +}; + +struct pe_hdr { + uint32_t magic; /* PE magic */ + uint16_t machine; /* machine type */ + uint16_t sections; /* number of sections */ + uint32_t timestamp; /* time_t */ + uint32_t symbol_table; /* symbol table offset */ + uint32_t symbols; /* number of symbols */ + uint16_t opt_hdr_size; /* size of optional header */ + uint16_t flags; /* flags */ +}; + +/* the fact that pe32 isn't padded where pe32+ is 64-bit means union won't + * work right. vomit. */ +struct pe32_opt_hdr { + /* "standard" header */ + uint16_t magic; /* file type */ + uint8_t ld_major; /* linker major version */ + uint8_t ld_minor; /* linker minor version */ + uint32_t text_size; /* size of text section(s) */ + uint32_t data_size; /* size of data section(s) */ + uint32_t bss_size; /* size of bss section(s) */ + uint32_t entry_point; /* file offset of entry point */ + uint32_t code_base; /* relative code addr in ram */ + uint32_t data_base; /* relative data addr in ram */ + /* "windows" header */ + uint32_t image_base; /* preferred load address */ + uint32_t section_align; /* alignment in bytes */ + uint32_t file_align; /* file alignment in bytes */ + uint16_t os_major; /* major OS version */ + uint16_t os_minor; /* minor OS version */ + uint16_t image_major; /* major image version */ + uint16_t image_minor; /* minor image version */ + uint16_t subsys_major; /* major subsystem version */ + uint16_t subsys_minor; /* minor subsystem version */ + uint32_t win32_version; /* reserved, must be 0 */ + uint32_t image_size; /* image size */ + uint32_t header_size; /* header size rounded up to + file_align */ + uint32_t csum; /* checksum */ + uint16_t subsys; /* subsystem */ + uint16_t dll_flags; /* more flags! 
*/ + uint32_t stack_size_req;/* amt of stack requested */ + uint32_t stack_size; /* amt of stack required */ + uint32_t heap_size_req; /* amt of heap requested */ + uint32_t heap_size; /* amt of heap required */ + uint32_t loader_flags; /* reserved, must be 0 */ + uint32_t data_dirs; /* number of data dir entries */ +}; + +struct pe32plus_opt_hdr { + uint16_t magic; /* file type */ + uint8_t ld_major; /* linker major version */ + uint8_t ld_minor; /* linker minor version */ + uint32_t text_size; /* size of text section(s) */ + uint32_t data_size; /* size of data section(s) */ + uint32_t bss_size; /* size of bss section(s) */ + uint32_t entry_point; /* file offset of entry point */ + uint32_t code_base; /* relative code addr in ram */ + /* "windows" header */ + uint64_t image_base; /* preferred load address */ + uint32_t section_align; /* alignment in bytes */ + uint32_t file_align; /* file alignment in bytes */ + uint16_t os_major; /* major OS version */ + uint16_t os_minor; /* minor OS version */ + uint16_t image_major; /* major image version */ + uint16_t image_minor; /* minor image version */ + uint16_t subsys_major; /* major subsystem version */ + uint16_t subsys_minor; /* minor subsystem version */ + uint32_t win32_version; /* reserved, must be 0 */ + uint32_t image_size; /* image size */ + uint32_t header_size; /* header size rounded up to + file_align */ + uint32_t csum; /* checksum */ + uint16_t subsys; /* subsystem */ + uint16_t dll_flags; /* more flags! 
*/ + uint64_t stack_size_req;/* amt of stack requested */ + uint64_t stack_size; /* amt of stack required */ + uint64_t heap_size_req; /* amt of heap requested */ + uint64_t heap_size; /* amt of heap required */ + uint32_t loader_flags; /* reserved, must be 0 */ + uint32_t data_dirs; /* number of data dir entries */ +}; + +struct data_dirent { + uint32_t virtual_address; /* relative to load address */ + uint32_t size; +}; + +struct data_directory { + struct data_dirent exports; /* .edata */ + struct data_dirent imports; /* .idata */ + struct data_dirent resources; /* .rsrc */ + struct data_dirent exceptions; /* .pdata */ + struct data_dirent certs; /* certs */ + struct data_dirent base_relocations; /* .reloc */ + struct data_dirent debug; /* .debug */ + struct data_dirent arch; /* reservered */ + struct data_dirent global_ptr; /* global pointer reg. Size=0 */ + struct data_dirent tls; /* .tls */ + struct data_dirent load_config; /* load configuration structure */ + struct data_dirent bound_imports; /* no idea */ + struct data_dirent import_addrs; /* import address table */ + struct data_dirent delay_imports; /* delay-load import table */ + struct data_dirent clr_runtime_hdr; /* .cor (object only) */ + struct data_dirent reserved; +}; + +struct section_header { + char name[8]; /* name or "/12\0" string tbl offset */ + uint32_t virtual_size; /* size of loaded section in ram */ + uint32_t virtual_address; /* relative virtual address */ + uint32_t raw_data_size; /* size of the section */ + uint32_t data_addr; /* file pointer to first page of sec */ + uint32_t relocs; /* file pointer to relocation entries */ + uint32_t line_numbers; /* line numbers! */ + uint16_t num_relocs; /* number of relocations */ + uint16_t num_lin_numbers; /* srsly. 
*/ + uint32_t flags; +}; + +struct win_certificate { + uint32_t length; + uint16_t revision; + uint16_t cert_type; +}; + +/* + * Return -1 if not PE, else offset of the PE header + */ +static int get_pehdr_offset(const char *buf) +{ + int pe_hdr_offset; + + pe_hdr_offset = *((int *)(buf + 0x3c)); + buf += pe_hdr_offset; + if (!!memcmp(buf, "PE\0\0", 4)) { + printf("Not a PE file\n"); + return -1; + } + + return pe_hdr_offset; +} + +#endif diff --git a/tools/kexec/zboot_parser_bpf.c b/tools/kexec/zboot_parser_bpf.c new file mode 100644 index 0000000000000..3f038b34c641a --- /dev/null +++ b/tools/kexec/zboot_parser_bpf.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +// +#include "vmlinux.h" +#include <bpf_helpers.h> +#include <bpf_tracing.h> +#include "image_size.h" + +/* 128 MB is big enough to hold either kernel or initramfs */ +#define MAX_RECORD_SIZE (IMAGE_SIZE + 4096) +#define MIN_BUF_SIZE 1 + +#define KEXEC_RES_KERNEL_NAME "kernel" +#define KEXEC_RES_INITRD_NAME "initrd" +#define KEXEC_RES_CMDLINE_NAME "cmdline" + +/* ringbuf is safe since the user space has no write access to them */ +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, RINGBUF1_SIZE); +} ringbuf_1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, MIN_BUF_SIZE); +} ringbuf_2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, MIN_BUF_SIZE); +} ringbuf_3 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, MIN_BUF_SIZE); +} ringbuf_4 SEC(".maps"); + +char LICENSE[] SEC("license") = "GPL"; + +/* + * This function ensures that the sections .rodata, .data .bss and .rodata.str1.1 + * are created for a bpf prog. 
+ */ +__attribute__((used)) static int dummy(void) +{ + static const char res_kernel[16] __attribute__((used, section(".rodata"))) = KEXEC_RES_KERNEL_NAME; + static char local_name[16] __attribute__((used, section(".data"))) = KEXEC_RES_CMDLINE_NAME; + static char res_cmdline[16] __attribute__((used, section(".bss"))); + + __builtin_memcpy(local_name, KEXEC_RES_INITRD_NAME, 16); + return __builtin_memcmp(local_name, res_kernel, 4); +} + +extern int bpf_copy_to_kernel(const char *name, char *buf, int size) __weak __ksym; +extern struct mem_range_result *bpf_decompress(char *image_gz_payload, int image_gz_sz) __weak __ksym; +extern int bpf_mem_range_result_put(struct mem_range_result *result) __weak __ksym; + + + + +/* see drivers/firmware/efi/libstub/zboot-header.S */ +struct linux_pe_zboot_header { + unsigned int mz_magic; + char image_type[4]; + unsigned int payload_offset; + unsigned int payload_size; + unsigned int reserved[2]; + char comp_type[4]; + unsigned int linux_pe_magic; + unsigned int pe_header_offset; +} __attribute__((packed)); + + +SEC("fentry.s/bpf_handle_pefile") +int BPF_PROG(parse_pe, struct kexec_context *context) +{ + struct linux_pe_zboot_header *zboot_header; + unsigned int image_sz; + char *buf; + char local_name[32]; + + bpf_printk("begin parse PE\n"); + /* BPF verifier should know each variable initial state */ + if (!context->image || (context->image_sz > MAX_RECORD_SIZE)) { + bpf_printk("Err: image size is greater than 0x%lx\n", MAX_RECORD_SIZE); + return 0; + } + + /* In order to access bytes not aligned on 2 order, copy into ringbuf. + * And allocate the memory all at once, later overwriting. 
+ * + * R2 is ARG_CONST_ALLOC_SIZE_OR_ZERO, should be decided at compling time + */ + buf = (char *)bpf_ringbuf_reserve(&ringbuf_1, MAX_RECORD_SIZE, 0); + if (!buf) { + bpf_printk("Err: fail to reserve ringbuf to parse zboot header\n"); + return 0; + } + image_sz = context->image_sz; + bpf_probe_read((void *)buf, sizeof(struct linux_pe_zboot_header), context->image); + zboot_header = (struct linux_pe_zboot_header *)buf; + if (!!__builtin_memcmp(&zboot_header->image_type, "zimg", + sizeof(zboot_header->image_type))) { + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_printk("Err: image is not zboot image\n"); + return 0; + } + + unsigned int payload_offset = zboot_header->payload_offset; + unsigned int payload_size = zboot_header->payload_size; + bpf_printk("zboot image payload offset=0x%x, size=0x%x\n", payload_offset, payload_size); + /* sane check */ + if (payload_size > image_sz) { + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_printk("Invalid zboot image payload offset and size\n"); + return 0; + } + if (payload_size >= MAX_RECORD_SIZE ) { + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_printk("Err: payload_size > MAX_RECORD_SIZE\n"); + return 0; + } + /* Overwrite buf */ + bpf_probe_read((void *)buf, payload_size, context->image + payload_offset); + bpf_printk("Calling bpf_kexec_decompress()\n"); + struct mem_range_result *r = bpf_decompress(buf, payload_size - 4); + if (!r) { + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_printk("Err: fail to decompress\n"); + return 0; + } + + image_sz = r->data_sz; + if (image_sz > MAX_RECORD_SIZE) { + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_mem_range_result_put(r); + bpf_printk("Err: decompressed size too big\n"); + return 0; + } + + /* Since the decompressed size is bigger than original, no need to clean */ + bpf_probe_read((void *)buf, image_sz, r->buf); + bpf_printk("Calling bpf_copy_to_kernel(), image_sz=0x%x\n", image_sz); + /* Verifier is unhappy to expose .rodata.str1.1 'map' to kernel */ + 
__builtin_memcpy(local_name, KEXEC_RES_KERNEL_NAME, 32); + const char *res_name = local_name; + bpf_copy_to_kernel(res_name, buf, image_sz); + bpf_ringbuf_discard(buf, BPF_RB_NO_WAKEUP); + bpf_mem_range_result_put(r); + + return 0; +} + +SEC("fentry.s/bpf_post_handle_pefile") +int BPF_PROG(post_parse_pe, struct kexec_context *context) +{ + return 0; +} -- 2.49.0