[PATCH v2] x86/boot: Support uncompressed kernel
Compressed kernel has its own drawback: decompressing takes time. Even though the time is short enough to ignore for most cases but for cases when time is critical decompressing time still matters. The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built binary can have no decompressing at all. The experiment shows: kernel kernel sizetime in decompress_kernel compressed (gzip)3.3M 53ms compressed (lz4) 4.5M 16ms uncompressed 14M2ms Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> --- v2: * add HAVE_KERNEL_RAW * decode ELF kernel in place instead of getting another copy * minor comment fix --- arch/x86/Kconfig | 1 + arch/x86/boot/compressed/Makefile | 3 +++ arch/x86/boot/compressed/misc.c | 18 +- init/Kconfig | 13 - scripts/Makefile.lib | 8 5 files changed, 37 insertions(+), 6 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc98d5a..207695c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -142,6 +142,7 @@ config X86 select HAVE_KERNEL_LZ4 select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO + select HAVE_KERNEL_RAW select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 44163e8..ed366e1 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -120,6 +120,8 @@ $(obj)/vmlinux.relocs: vmlinux FORCE vmlinux.bin.all-y := $(obj)/vmlinux.bin vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs +$(obj)/vmlinux.bin.raw: $(vmlinux.bin.all-y) FORCE + $(call if_changed,raw) $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE @@ -133,6 +135,7 @@ $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE $(call if_changed,lz4) +suffix-$(CONFIG_KERNEL_RAW):= raw suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma diff --git 
a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b3c5a5f0..9791ca9 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -51,6 +51,10 @@ static int vidport; static int lines, cols; +#ifdef CONFIG_KERNEL_RAW +#include +#endif + #ifdef CONFIG_KERNEL_GZIP #include "../../../../lib/decompress_inflate.c" #endif @@ -265,7 +269,7 @@ static inline void handle_relocations(void *output, unsigned long output_len, { } #endif -static void parse_elf(void *output) +static void parse_elf(void* buf, void *output) { #ifdef CONFIG_X86_64 Elf64_Ehdr ehdr; @@ -277,7 +281,7 @@ static void parse_elf(void *output) void *dest; int i; - memcpy(, output, sizeof(ehdr)); + memcpy(, buf, sizeof(ehdr)); if (ehdr.e_ident[EI_MAG0] != ELFMAG0 || ehdr.e_ident[EI_MAG1] != ELFMAG1 || ehdr.e_ident[EI_MAG2] != ELFMAG2 || @@ -292,7 +296,7 @@ static void parse_elf(void *output) if (!phdrs) error("Failed to allocate space for phdrs"); - memcpy(phdrs, output + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum); + memcpy(phdrs, buf + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum); for (i = 0; i < ehdr.e_phnum; i++) { phdr = [i]; @@ -305,7 +309,7 @@ static void parse_elf(void *output) #else dest = (void *)(phdr->p_paddr); #endif - memmove(dest, output + phdr->p_offset, phdr->p_filesz); + memmove(dest, buf + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } @@ -401,10 +405,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, error("Destination virtual address changed when not relocatable"); #endif +#ifdef CONFIG_KERNEL_RAW + parse_elf(input_data, output); +#else debug_putstr("\nDecompressing Linux... 
"); __decompress(input_data, input_len, NULL, NULL, output, output_len, NULL, error); - parse_elf(output); + parse_elf(output, output); +#endif handle_relocations(output, output_len, virt_addr); debug_putstr("done.\nBooting the kernel.\n"); return output; diff --git a/init/Kconfig b/init/Kconfig index a92f27d..b8926bb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -109,6 +109,9 @@ config LOCALVERSION_AUTO which is done within the script "scripts/setlocalversion".) +config HAVE_KERNEL_RAW + bool + config HAVE_KERNEL_GZIP bool @
[PATCH v2] x86/boot: Support uncompressed kernel
Compressed kernel has its own drawback: decompressing takes time. Even though the time is short enough to ignore for most cases but for cases when time is critical decompressing time still matters. The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built binary can have no decompressing at all. The experiment shows: kernel kernel sizetime in decompress_kernel compressed (gzip)3.3M 53ms compressed (lz4) 4.5M 16ms uncompressed 14M2ms Signed-off-by: Chao Peng --- v2: * add HAVE_KERNEL_RAW * decode ELF kernel in place instead of getting another copy * minor comment fix --- arch/x86/Kconfig | 1 + arch/x86/boot/compressed/Makefile | 3 +++ arch/x86/boot/compressed/misc.c | 18 +- init/Kconfig | 13 - scripts/Makefile.lib | 8 5 files changed, 37 insertions(+), 6 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc98d5a..207695c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -142,6 +142,7 @@ config X86 select HAVE_KERNEL_LZ4 select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO + select HAVE_KERNEL_RAW select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 44163e8..ed366e1 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -120,6 +120,8 @@ $(obj)/vmlinux.relocs: vmlinux FORCE vmlinux.bin.all-y := $(obj)/vmlinux.bin vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs +$(obj)/vmlinux.bin.raw: $(vmlinux.bin.all-y) FORCE + $(call if_changed,raw) $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE @@ -133,6 +135,7 @@ $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE $(call if_changed,lz4) +suffix-$(CONFIG_KERNEL_RAW):= raw suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma diff --git 
a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index b3c5a5f0..9791ca9 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -51,6 +51,10 @@ static int vidport; static int lines, cols; +#ifdef CONFIG_KERNEL_RAW +#include +#endif + #ifdef CONFIG_KERNEL_GZIP #include "../../../../lib/decompress_inflate.c" #endif @@ -265,7 +269,7 @@ static inline void handle_relocations(void *output, unsigned long output_len, { } #endif -static void parse_elf(void *output) +static void parse_elf(void* buf, void *output) { #ifdef CONFIG_X86_64 Elf64_Ehdr ehdr; @@ -277,7 +281,7 @@ static void parse_elf(void *output) void *dest; int i; - memcpy(, output, sizeof(ehdr)); + memcpy(, buf, sizeof(ehdr)); if (ehdr.e_ident[EI_MAG0] != ELFMAG0 || ehdr.e_ident[EI_MAG1] != ELFMAG1 || ehdr.e_ident[EI_MAG2] != ELFMAG2 || @@ -292,7 +296,7 @@ static void parse_elf(void *output) if (!phdrs) error("Failed to allocate space for phdrs"); - memcpy(phdrs, output + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum); + memcpy(phdrs, buf + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum); for (i = 0; i < ehdr.e_phnum; i++) { phdr = [i]; @@ -305,7 +309,7 @@ static void parse_elf(void *output) #else dest = (void *)(phdr->p_paddr); #endif - memmove(dest, output + phdr->p_offset, phdr->p_filesz); + memmove(dest, buf + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } @@ -401,10 +405,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, error("Destination virtual address changed when not relocatable"); #endif +#ifdef CONFIG_KERNEL_RAW + parse_elf(input_data, output); +#else debug_putstr("\nDecompressing Linux... 
"); __decompress(input_data, input_len, NULL, NULL, output, output_len, NULL, error); - parse_elf(output); + parse_elf(output, output); +#endif handle_relocations(output, output_len, virt_addr); debug_putstr("done.\nBooting the kernel.\n"); return output; diff --git a/init/Kconfig b/init/Kconfig index a92f27d..b8926bb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -109,6 +109,9 @@ config LOCALVERSION_AUTO which is done within the script "scripts/setlocalversion".) +config HAVE_KERNEL_RAW + bool + config HAVE_KERNEL_GZIP bool @@ -130,7 +133,7 @@ config HA
Re: [PATCH] x86/boot: Support uncompressed kernel
On Mon, 2017-03-27 at 15:25 +0200, Arnd Bergmann wrote: > On Mon, Mar 27, 2017 at 1:47 PM, Michal Marek <mma...@suse.com> wrote: > > > > Dne 27.3.2017 v 09:58 Sebastian Andrzej Siewior napsal(a): > > > > > > On 2017-03-24 13:35:40 [+0800], Chao Peng wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > kernel kernel sizetime in > > > > > > > decompress_kernel > > > > > > > compressed (gzip)3.3M 53ms > > > > > > > uncompressed 14M3ms > > > > > > > > > > Exactly, LZ4 is the fastest. It takes 16ms to complete the > > > > decompression. Still sounds a little longer when compared to > > > > uncompressed kernel. > > > > > > Are we seriously talking here about one-time improvement of 13ms > > > boot time? > > > > If the use case is launching new VM instances continuously, then > > compressing the kernel image is about as useful as compressing > > /bin/bash. > > I guess the next step would be to use CONFIG_XIP_KERNEL on x86, > which requires an uncompressed kernel but has the additional advantage > of sharing the read-only sections of the kernel image across virtual > machines, resulting in better RAM and cache usage. That is something we wanna look into :) Chao
Re: [PATCH] x86/boot: Support uncompressed kernel
On Mon, 2017-03-27 at 15:25 +0200, Arnd Bergmann wrote: > On Mon, Mar 27, 2017 at 1:47 PM, Michal Marek wrote: > > > > Dne 27.3.2017 v 09:58 Sebastian Andrzej Siewior napsal(a): > > > > > > On 2017-03-24 13:35:40 [+0800], Chao Peng wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > kernel kernel sizetime in > > > > > > > decompress_kernel > > > > > > > compressed (gzip)3.3M 53ms > > > > > > > uncompressed 14M3ms > > > > > > > > > > Exactly, LZ4 is the fastest. It takes 16ms to complete the > > > > decompression. Still sounds a little longer when compared to > > > > uncompressed kernel. > > > > > > Are we seriously talking here about one-time improvement of 13ms > > > boot time? > > > > If the use case is launching new VM instances continuously, then > > compressing the kernel image is about as useful as compressing > > /bin/bash. > > I guess the next step would be to use CONFIG_XIP_KERNEL on x86, > which requires an uncompressed kernel but has the additional advantage > of sharing the read-only sections of the kernel image across virtual > machines, resulting in better RAM and cache usage. That is something we wanna look into :) Chao
Re: [PATCH] x86/boot: Support uncompressed kernel
On Mon, 2017-03-27 at 09:58 +0200, Sebastian Andrzej Siewior wrote: > On 2017-03-24 13:35:40 [+0800], Chao Peng wrote: > > > > > > > > > > > > > > > > > > > > > kernel kernel size time in > > > > > decompress_kernel > > > > > compressed (gzip) 3.3M 53ms > > > > > uncompressed 14M 3ms > > > > > > Exactly, LZ4 is the fastest. It takes 16ms to complete the > > decompression. Still sounds a little longer when compared to > > uncompressed kernel. > > Are we seriously talking here about one-time improvement of 13ms > boot time? The usage model for us is to launch the kernel in a virtual machine, and there will be thousands of instances launched and shut down/re-launched frequently, so every single millisecond helps. And 13ms means a 20% improvement over our existing optimization (the other part besides decompression is optimized to ~40ms). Chao > > > > > Chao > > Sebastian
Re: [PATCH] x86/boot: Support uncompressed kernel
On Mon, 2017-03-27 at 09:58 +0200, Sebastian Andrzej Siewior wrote: > On 2017-03-24 13:35:40 [+0800], Chao Peng wrote: > > > > > > > > > > > > > > > > > > > > > kernel kernel size time in > > > > > decompress_kernel > > > > > compressed (gzip) 3.3M 53ms > > > > > uncompressed 14M 3ms > > > > > > Exactly, LZ4 is the fastest. It takes 16ms to complete the > > decompression. Still sounds a little longer when compared to > > uncompressed kernel. > > Are we seriously talking here about one-time improvement of 13ms > boot time? The usage model for us is to launch the kernel in a virtual machine, and there will be thousands of instances launched and shut down/re-launched frequently, so every single millisecond helps. And 13ms means a 20% improvement over our existing optimization (the other part besides decompression is optimized to ~40ms). Chao > > > > > Chao > > Sebastian
Re: [PATCH] x86/boot: Support uncompressed kernel
> > > The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built > > > binary > > > can have no uncompressing at all. The experiment shows: > > > > > > kernel kernel size time in decompress_kernel > > > compressed (gzip) 3.3M 53ms > > > uncompressed 14M 3ms > > > > How about the time difference for bootloader to read kernel from > > flash/disk/network to ram? The loading time for the bootloader can be longer as the size increases, but that depends on which medium it uses. For our use case, it's not a big problem: we run the kernel in a virtual machine and launch thousands of instances on the same physical machine, so only the first instance needs to read from the file, and later instances just copy the memory. The thing that really matters for us is how fast we can boot the majority of the instances. > > there are also faster de-compressors than gzip out there. LZ4, for > instance. > LZ4, as far as I remember, can be quite fast, like ~10 times faster > than gzip. > have you tested it? Exactly, LZ4 is the fastest. It takes 16ms to complete the decompression. Still sounds a little longer when compared to uncompressed kernel. Chao
Re: [PATCH] x86/boot: Support uncompressed kernel
> > > The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built > > > binary > > > can have no uncompressing at all. The experiment shows: > > > > > > kernel kernel size time in decompress_kernel > > > compressed (gzip) 3.3M 53ms > > > uncompressed 14M 3ms > > > > How about the time difference for bootloader to read kernel from > > flash/disk/network to ram? The loading time for the bootloader can be longer as the size increases, but that depends on which medium it uses. For our use case, it's not a big problem: we run the kernel in a virtual machine and launch thousands of instances on the same physical machine, so only the first instance needs to read from the file, and later instances just copy the memory. The thing that really matters for us is how fast we can boot the majority of the instances. > > there are also faster de-compressors than gzip out there. LZ4, for > instance. > LZ4, as far as I remember, can be quite fast, like ~10 times faster > than gzip. > have you tested it? Exactly, LZ4 is the fastest. It takes 16ms to complete the decompression. Still sounds a little longer when compared to uncompressed kernel. Chao
[PATCH] x86/boot: Support uncompressed kernel
Compressed kernel has its own drawback: uncompressing takes time. Even though the time is short enough to ignore for most cases but for cases that time is critical this is still a big number. In our on-going optimization for kernel boot time, the measured overall kernel boot time is ~90ms while the uncompressing takes ~50ms with gzip. The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built binary can have no uncompressing at all. The experiment shows: kernel kernel sizetime in decompress_kernel compressed (gzip)3.3M 53ms uncompressed 14M3ms Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> --- arch/x86/boot/compressed/Makefile | 3 +++ arch/x86/boot/compressed/misc.c | 14 ++ init/Kconfig | 7 +++ scripts/Makefile.lib | 8 4 files changed, 32 insertions(+) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f9ce75d..fc0e1c0 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -73,6 +73,8 @@ $(obj)/vmlinux.relocs: vmlinux FORCE vmlinux.bin.all-y := $(obj)/vmlinux.bin vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs +$(obj)/vmlinux.bin.raw: $(vmlinux.bin.all-y) FORCE + $(call if_changed,raw) $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE @@ -86,6 +88,7 @@ $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE $(call if_changed,lz4) +suffix-$(CONFIG_KERNEL_RAW):= raw suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 79dac17..fb3cd43 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -123,6 +123,20 @@ static char *vidmem; static int vidport; static int lines, cols; +#ifdef CONFIG_KERNEL_RAW +#include +static int __decompress(unsigned char *buf, long len, + long 
(*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), + unsigned char *outbuf, long olen, + long *pos, + void (*error)(char *x)) +{ + memcpy(outbuf, buf, olen); + return 0; +} +#endif + #ifdef CONFIG_KERNEL_GZIP #include "../../../../lib/decompress_inflate.c" #endif diff --git a/init/Kconfig b/init/Kconfig index 2232080..1db2ea2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -137,6 +137,13 @@ choice If in doubt, select 'gzip' +config KERNEL_RAW + bool "RAW" + help + No compression. It creates much bigger kernel and uses much more + space (disk/memory) than other choices. It can be useful when + decompression speed is the most concern while space is not a problem. + config KERNEL_GZIP bool "Gzip" depends on HAVE_KERNEL_GZIP diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 2edbcad..384128d 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -344,6 +344,14 @@ cmd_lz4 = (cat $(filter-out FORCE,$^) | \ lz4c -l -c1 stdin stdout && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) +# RAW +# --- +quiet_cmd_raw = RAW $@ +cmd_raw = (cat $(filter-out FORCE,$^) && \ + $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) + + # U-Boot mkimage # --- -- 1.8.3.1
[PATCH] x86/boot: Support uncompressed kernel
Compressed kernel has its own drawback: uncompressing takes time. Even though the time is short enough to ignore for most cases but for cases that time is critical this is still a big number. In our on-going optimization for kernel boot time, the measured overall kernel boot time is ~90ms while the uncompressing takes ~50ms with gzip. The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built binary can have no uncompressing at all. The experiment shows: kernel kernel sizetime in decompress_kernel compressed (gzip)3.3M 53ms uncompressed 14M3ms Signed-off-by: Chao Peng --- arch/x86/boot/compressed/Makefile | 3 +++ arch/x86/boot/compressed/misc.c | 14 ++ init/Kconfig | 7 +++ scripts/Makefile.lib | 8 4 files changed, 32 insertions(+) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f9ce75d..fc0e1c0 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -73,6 +73,8 @@ $(obj)/vmlinux.relocs: vmlinux FORCE vmlinux.bin.all-y := $(obj)/vmlinux.bin vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs +$(obj)/vmlinux.bin.raw: $(vmlinux.bin.all-y) FORCE + $(call if_changed,raw) $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE @@ -86,6 +88,7 @@ $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE $(call if_changed,lz4) +suffix-$(CONFIG_KERNEL_RAW):= raw suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 79dac17..fb3cd43 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -123,6 +123,20 @@ static char *vidmem; static int vidport; static int lines, cols; +#ifdef CONFIG_KERNEL_RAW +#include +static int __decompress(unsigned char *buf, long len, + long (*fill)(void*, unsigned long), + long 
(*flush)(void*, unsigned long), + unsigned char *outbuf, long olen, + long *pos, + void (*error)(char *x)) +{ + memcpy(outbuf, buf, olen); + return 0; +} +#endif + #ifdef CONFIG_KERNEL_GZIP #include "../../../../lib/decompress_inflate.c" #endif diff --git a/init/Kconfig b/init/Kconfig index 2232080..1db2ea2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -137,6 +137,13 @@ choice If in doubt, select 'gzip' +config KERNEL_RAW + bool "RAW" + help + No compression. It creates much bigger kernel and uses much more + space (disk/memory) than other choices. It can be useful when + decompression speed is the most concern while space is not a problem. + config KERNEL_GZIP bool "Gzip" depends on HAVE_KERNEL_GZIP diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 2edbcad..384128d 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -344,6 +344,14 @@ cmd_lz4 = (cat $(filter-out FORCE,$^) | \ lz4c -l -c1 stdin stdout && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ (rm -f $@ ; false) +# RAW +# --- +quiet_cmd_raw = RAW $@ +cmd_raw = (cat $(filter-out FORCE,$^) && \ + $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) + + # U-Boot mkimage # --- -- 1.8.3.1
[PATCH] KVM: VMX: use correct vmcs_read/write for guest segment selector/base
Guest segment selector is 16 bit field and guest segment base is natural width field. Fix two incorrect invocations accordingly. Without this patch, build fails when aggressive inlining is used with ICC. Cc: sta...@vger.kernel.org Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a236dec..2c22aef 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3962,7 +3962,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) } vmcs_write16(sf->selector, var.selector); - vmcs_write32(sf->base, var.base); + vmcs_writel(sf->base, var.base); vmcs_write32(sf->limit, var.limit); vmcs_write32(sf->ar_bytes, vmx_segment_access_rights()); } @@ -8350,7 +8350,7 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) static void vmx_dump_sel(char *name, uint32_t sel) { pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(sel), + name, vmcs_read16(sel), vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); -- 1.8.3.1
[PATCH] KVM: VMX: use correct vmcs_read/write for guest segment selector/base
Guest segment selector is 16 bit field and guest segment base is natural width field. Fix two incorrect invocations accordingly. Without this patch, build fails when aggressive inlining is used with ICC. Cc: sta...@vger.kernel.org Signed-off-by: Chao Peng --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a236dec..2c22aef 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3962,7 +3962,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save) } vmcs_write16(sf->selector, var.selector); - vmcs_write32(sf->base, var.base); + vmcs_writel(sf->base, var.base); vmcs_write32(sf->limit, var.limit); vmcs_write32(sf->ar_bytes, vmx_segment_access_rights()); } @@ -8350,7 +8350,7 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) static void vmx_dump_sel(char *name, uint32_t sel) { pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", - name, vmcs_read32(sel), + name, vmcs_read16(sel), vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); -- 1.8.3.1
Re: [RFC PATCH] x86/boot: make ELF kernel multiboot-able
On Wed, 2017-02-15 at 17:42 +0100, Paolo Bonzini wrote: > > On 15/02/2017 15:41, Chao Peng wrote: > > > > Multiboot specification > > (http://git.savannah.gnu.org/cgit/grub.git/tree/doc/multiboot.texi?h=mul > > tiboot2) > > is an open standard that provides kernels with a uniform way to be booted > > by multiboot-compliant bootloaders (like grub). > > > > This patch is trying to make Linux ELF kernel image to be a > > multiboot-compliant OS so that it can be loaded by a multiboot-compliant > > bootloader. The benefit is eliminating the maintenance for realmode and > > decompression code and especially when the kernel is loaded in a virtual > > machine, the reducing for these code can greatly cut down the boot time. > > > > However, the current version of multiboot spec doesn't support 64 bit > > well so for 64 bit kernel we need stub code to jump from 32 bit code to > > 64 bit code. Besides, there are still some other issues: > > 1). '-z max-page-size=0x1000' is used so the text segment start is in > > multiboot header search scope because GNU LD has default page size of > > 0x0020 for ELF64, which will fail multiboot test. > > > > 2). The bootloader like grub has support for ELF kernel (even for ELF64) > > which makes the patch easier. However, the current grub implementation > > thinks the entry address should be a VA. E.g. for 64 bit kernel, the entry > > address (0x100) is actually physical address, grub refuses to load it > > by saying: 'entry point isn't in a segment'. > > For kvm-unit-tests, we do "objcopy -O elf32-i386 dest.32bit dest.64bit" > and pass the resulting 32bit ELF file to grub. > > Out of curiosity, what happens if you pass the resulting multiboot file > to QEMU's -kernel option? > The resulting kernel is a multiboot2 kernel. QEMU however supports loading multiboot v1 only. Chao > Thanks, > > Paolo > > > > > This patch is sent out as RFC in case you have some ideas. > > > > Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
Re: [RFC PATCH] x86/boot: make ELF kernel multiboot-able
On Wed, 2017-02-15 at 17:42 +0100, Paolo Bonzini wrote: > > On 15/02/2017 15:41, Chao Peng wrote: > > > > Multiboot specification > > (http://git.savannah.gnu.org/cgit/grub.git/tree/doc/multiboot.texi?h=mul > > tiboot2) > > is an open standard that provides kernels with a uniform way to be booted > > by multiboot-compliant bootloaders (like grub). > > > > This patch is trying to make Linux ELF kernel image to be a > > multiboot-compliant OS so that it can be loaded by a multiboot-compliant > > bootloader. The benefit is eliminating the maintenance for realmode and > > decompression code and especially when the kernel is loaded in a virtual > > machine, the reducing for these code can greatly cut down the boot time. > > > > However, the current version of multiboot spec doesn't support 64 bit > > well so for 64 bit kernel we need stub code to jump from 32 bit code to > > 64 bit code. Besides, there are still some other issues: > > 1). '-z max-page-size=0x1000' is used so the text segment start is in > > multiboot header search scope because GNU LD has default page size of > > 0x0020 for ELF64, which will fail multiboot test. > > > > 2). The bootloader like grub has support for ELF kernel (even for ELF64) > > which makes the patch easier. However, the current grub implementation > > thinks the entry address should be a VA. E.g. for 64 bit kernel, the entry > > address (0x100) is actually physical address, grub refuses to load it > > by saying: 'entry point isn't in a segment'. > > For kvm-unit-tests, we do "objcopy -O elf32-i386 dest.32bit dest.64bit" > and pass the resulting 32bit ELF file to grub. > > Out of curiosity, what happens if you pass the resulting multiboot file > to QEMU's -kernel option? > The resulting kernel is a multiboot2 kernel. QEMU however supports loading multiboot v1 only. Chao > Thanks, > > Paolo > > > > > This patch is sent out as RFC in case you have some ideas. > > > > Signed-off-by: Chao Peng
Re: [RFC PATCH] x86/boot: make ELF kernel multiboot-able
> > Just something to consider, provided the issues with multiboot get > > resolved: > > > > If you want to boot Xen you actually use the multiboot protocol, the > > last PVH > > boot patches had borrowed ideas from Multiboot to add an entry to > > Linux, only > > it was Xen'ified. What would be Multiboot 2 seemed flexible enough to > > allow all > > sorts of custom semantics and information stacked into a boot image. > > The last > > thought I had over this topic (before giving up) was-- if we're going > > to add > > yet-another-entry (TM) why not extend the Multiboot 2 protocol with > > the > > semantics we need to boot any virtual environment and then add > > Multiboot 2 > > support entry on Linux? We could redirect any custom boot mechanism > > then to > > just use that given its flexibility. > > > > Luis > > Multiboot has a fundamentally broken assumption, which is to do certain work > for the kernel in the > bootloader. This is fundamentally a bad idea, because you always want to do > things in the latest > step possible during the boot process, being the most upgradeable, and have > the interface as > narrow as possible. > > Therefore, using Multiboot is actively a negative step. It is declared an > "Open Standard" but > anything can be such declared; it really is a claim that "everything should > work like Grub." Thanks Peter and Luis for comments. Chao
Re: [RFC PATCH] x86/boot: make ELF kernel multiboot-able
> > Just something to consider, provided the issues with multiboot get > > resolved: > > > > If you want to boot Xen you actually use the multiboot protocol, the > > last PVH > > boot patches had borrowed ideas from Multiboot to add an entry to > > Linux, only > > it was Xen'ified. What would be Multiboot 2 seemed flexible enough to > > allow all > > sorts of custom semantics and information stacked into a boot image. > > The last > > thought I had over this topic (before giving up) was-- if we're going > > to add > > yet-another-entry (TM) why not extend the Multiboot 2 protocol with > > the > > semantics we need to boot any virtual environment and then add > > Multiboot 2 > > support entry on Linux? We could redirect any custom boot mechanism > > then to > > just use that given its flexibility. > > > > Luis > > Multiboot has a fundamentally broken assumption, which is to do certain work > for the kernel in the > bootloader. This is fundamentally a bad idea, because you always want to do > things in the latest > step possible during the boot process, being the most upgradeable, and have > the interface as > narrow as possible. > > Therefore, using Multiboot is actively a negative step. It is declared an > "Open Standard" but > anything can be such declared; it really is a claim that "everything should > work like Grub." Thanks Peter and Luis for comments. Chao
[RFC PATCH] x86/boot: make ELF kernel multiboot-able
Multiboot specification (http://git.savannah.gnu.org/cgit/grub.git/tree/doc/multiboot.texi?h=multiboot2) is an open standard that provides kernels with a uniform way to be booted by multiboot-compliant bootloaders (like grub). This patch is trying to make Linux ELF kernel image to be a multiboot-compliant OS so that it can be loaded by a multiboot-comliant bootloader. The benefit is eliminating the maintainance for realmode and decompression code and especially when the kernel is loaded in a virtual machine, the reducing for these code can greatly cuts down the boot time. However, the current version of multiboot spec doesn't support 64 bit well so for 64 bit kernel we need stub code to jump from 32 bit code to 64 bit code. Besides, there are still some other issues: 1). '-z max-page-size=0x1000' is used so the text segment start is in multiboot header search scope because GNU LD has default page size of 0x0020 for ELF64, which will fail multiboot test. 2). The bootloader like grub has support for ELF kernel (even for ELF64) which makes the patch easier. However, the current grub implementaion thinks the entry address should be a VA. E.g. for 64 bit kernel, the entry address (0x100) is actually phiscial address, grub refuses to load it by saying: 'entry point isn't in a segment'. This patch is sent out as RFC in case you have some ideas. Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> --- arch/x86/Kconfig | 6 + arch/x86/Makefile| 4 + arch/x86/kernel/head64.c | 64 ++- arch/x86/kernel/head_64.S| 175 ++ arch/x86/kernel/multiboot2.h | 417 +++ 5 files changed, 665 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/multiboot2.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bada636..75a9ef2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -585,6 +585,12 @@ config X86_AMD_PLATFORM_DEVICE I2C and UART depend on COMMON_CLK to set clock. GPIO driver is implemented under PINCTRL subsystem. 
+config X86_MULTIBOOT_STUB + bool "Multiboot stub support for ELF kernel image" + default n + ---help--- + Set whether multiboot stub is on or off. + config IOSF_MBI tristate "Intel SoC IOSF Sideband support for SoC platforms" depends on PCI diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2d44933..d945c34 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -39,6 +39,10 @@ ifdef CONFIG_X86_NEED_RELOCS LDFLAGS_vmlinux := --emit-relocs endif +ifdef CONFIG_X86_MULTIBOOT_STUB + LDFLAGS_vmlinux += -z max-page-size=0x1000 +endif + # # Prevent GCC from generating any FP code by mistake. # diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 54a2372..c0f375a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -29,6 +29,8 @@ #include #include +#include "multiboot2.h" + /* * Manage page tables very early on. */ @@ -36,6 +38,7 @@ extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt = 2; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); +char *multiboot_info = NULL; /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) @@ -130,6 +133,60 @@ static void __init copy_bootdata(char *real_mode_data) } } +static void __init copy_multiboot_cmdline(struct multiboot_tag_string *tag) +{ + unsigned int size = tag->size - 8; + + if (size > COMMAND_LINE_SIZE) + size = COMMAND_LINE_SIZE; + boot_params.hdr.cmdline_size = size; + memcpy(boot_command_line, tag->string, size); +} + +static void __init copy_multiboot_mmap(struct multiboot_tag_mmap *tag) +{ + multiboot_memory_map_t *mmap; + int nr = 0; + + for (mmap = tag->entries; + (u8 *)mmap < (u8 *)tag + tag->size && nr < E820MAX; + mmap = (multiboot_memory_map_t *)((unsigned long)mmap + + tag->entry_size)) { + boot_params.e820_map[nr].addr = mmap->addr; + boot_params.e820_map[nr].size = 
mmap->len; + boot_params.e820_map[nr].type = mmap->type; + nr++; + } + boot_params.e820_entries = nr; +} + +static void __init copy_multiboot_info(void) +{ + struct multiboot_tag *tag; + char *ptr = __va(multiboot_info); + + boot_params.hdr.boot_flag = 0xAA55; + boot_params.hdr.header = 0x53726448; + boot_params.hdr.version = 0x202; + + for (tag = (struct multiboot_tag *)(ptr + 8); + tag->type != MULTIBOOT_TAG_TYPE_END; + tag = (struct multiboot_tag *
[RFC PATCH] x86/boot: make ELF kernel multiboot-able
Multiboot specification (http://git.savannah.gnu.org/cgit/grub.git/tree/doc/multiboot.texi?h=multiboot2) is an open standard that provides kernels with a uniform way to be booted by multiboot-compliant bootloaders (like grub). This patch is trying to make Linux ELF kernel image to be a multiboot-compliant OS so that it can be loaded by a multiboot-compliant bootloader. The benefit is eliminating the maintenance of realmode and decompression code and, especially when the kernel is loaded in a virtual machine, removing this code can greatly cut down the boot time. However, the current version of multiboot spec doesn't support 64 bit well so for 64 bit kernel we need stub code to jump from 32 bit code to 64 bit code. Besides, there are still some other issues: 1). '-z max-page-size=0x1000' is used so the text segment start is in multiboot header search scope because GNU LD has default page size of 0x0020 for ELF64, which will fail multiboot test. 2). The bootloader like grub has support for ELF kernel (even for ELF64) which makes the patch easier. However, the current grub implementation thinks the entry address should be a VA. E.g. for 64 bit kernel, the entry address (0x100) is actually a physical address, so grub refuses to load it by saying: 'entry point isn't in a segment'. This patch is sent out as RFC in case you have some ideas. Signed-off-by: Chao Peng --- arch/x86/Kconfig | 6 + arch/x86/Makefile| 4 + arch/x86/kernel/head64.c | 64 ++- arch/x86/kernel/head_64.S| 175 ++ arch/x86/kernel/multiboot2.h | 417 +++ 5 files changed, 665 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/multiboot2.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bada636..75a9ef2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -585,6 +585,12 @@ config X86_AMD_PLATFORM_DEVICE I2C and UART depend on COMMON_CLK to set clock. GPIO driver is implemented under PINCTRL subsystem. 
+config X86_MULTIBOOT_STUB + bool "Multiboot stub support for ELF kernel image" + default n + ---help--- + Set whether multiboot stub is on or off. + config IOSF_MBI tristate "Intel SoC IOSF Sideband support for SoC platforms" depends on PCI diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2d44933..d945c34 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -39,6 +39,10 @@ ifdef CONFIG_X86_NEED_RELOCS LDFLAGS_vmlinux := --emit-relocs endif +ifdef CONFIG_X86_MULTIBOOT_STUB + LDFLAGS_vmlinux += -z max-page-size=0x1000 +endif + # # Prevent GCC from generating any FP code by mistake. # diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 54a2372..c0f375a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -29,6 +29,8 @@ #include #include +#include "multiboot2.h" + /* * Manage page tables very early on. */ @@ -36,6 +38,7 @@ extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt = 2; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); +char *multiboot_info = NULL; /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) @@ -130,6 +133,60 @@ static void __init copy_bootdata(char *real_mode_data) } } +static void __init copy_multiboot_cmdline(struct multiboot_tag_string *tag) +{ + unsigned int size = tag->size - 8; + + if (size > COMMAND_LINE_SIZE) + size = COMMAND_LINE_SIZE; + boot_params.hdr.cmdline_size = size; + memcpy(boot_command_line, tag->string, size); +} + +static void __init copy_multiboot_mmap(struct multiboot_tag_mmap *tag) +{ + multiboot_memory_map_t *mmap; + int nr = 0; + + for (mmap = tag->entries; + (u8 *)mmap < (u8 *)tag + tag->size && nr < E820MAX; + mmap = (multiboot_memory_map_t *)((unsigned long)mmap + + tag->entry_size)) { + boot_params.e820_map[nr].addr = mmap->addr; + boot_params.e820_map[nr].size = 
mmap->len; + boot_params.e820_map[nr].type = mmap->type; + nr++; + } + boot_params.e820_entries = nr; +} + +static void __init copy_multiboot_info(void) +{ + struct multiboot_tag *tag; + char *ptr = __va(multiboot_info); + + boot_params.hdr.boot_flag = 0xAA55; + boot_params.hdr.header = 0x53726448; + boot_params.hdr.version = 0x202; + + for (tag = (struct multiboot_tag *)(ptr + 8); + tag->type != MULTIBOOT_TAG_TYPE_END; + tag = (struct multiboot_tag *)((u8 *) tag + +
Re: [PATCH] cgroups: move cpuset specific checks from generic code to cpuset_can_attach
On Wed, Nov 25, 2015 at 08:01:17PM -0200, Marcelo Tosatti wrote: > > Move PF_NO_SETAFFINITY check to cpuset cgroups, where it belongs. > This makes it possible to attach PF_NO_SETAFFINITY to Intel CAT cgroups. Looks like that's the right place. I tried the intel_rdt subsystem and at least it doesn't have this restriction anymore (all the tasks can be moved out of the default group), hence: Reviewed-by: Chao Peng Chao -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] cgroups: move cpuset specific checks from generic code to cpuset_can_attach
On Wed, Nov 25, 2015 at 08:01:17PM -0200, Marcelo Tosatti wrote: > > Move PF_NO_SETAFFINITY check to cpuset cgroups, where it belongs. > This makes it possible to attach PF_NO_SETAFFINITY to Intel CAT cgroups. Looks like that's the right place. I tried the intel_rdt subsystem and at least it doesn't have this restriction anymore (all the tasks can be moved out of the default group), hence: Reviewed-by: Chao Peng <chao.p.p...@linux.intel.com> Chao -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFD] CAT user space interface revisited
On Wed, Nov 18, 2015 at 10:01:54PM -0200, Marcelo Tosatti wrote: > > tglx > > Again: you don't need to look into the MSR table and relate it > to tasks if you store the data as: > > task group 1 = { > reservation-1 = {size = 80Kb, type = data, socketmask = > 0x}, > reservation-2 = {size = 100Kb, type = code, socketmask > = 0x} > } > > task group 2 = { > reservation-1 = {size = 80Kb, type = data, socketmask = > 0x}, > reservation-3 = {size = 200Kb, type = code, socketmask > = 0x} > } > > Task group 1 and task group 2 share reservation-1. Because there is only size but not CBM position info, I guess different reservations will not overlap each other, right? Personally I like this way of exposing minimal information to userspace. I think it would work well except for one concern of losing flexibility: For instance, there is a box for which the full CBM is 0xf. After cache reservation creating/freeing for a while we then have reservations: reservation1: 0xf reservation2: 0x00ff0 Now people want to request a reservation whose size is 0xff, so what will the kernel do at this point? It could just return an error or do some moving/merging (e.g. for reservation2: 0x00ff0 => 0x0ff00) and then satisfy the request. But I don't know if the moving/merging will cause delay for tasks that are using it. Thanks, Chao -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFD] CAT user space interface revisited
On Wed, Nov 18, 2015 at 10:01:54PM -0200, Marcelo Tosatti wrote: > > tglx > > Again: you don't need to look into the MSR table and relate it > to tasks if you store the data as: > > task group 1 = { > reservation-1 = {size = 80Kb, type = data, socketmask = > 0x}, > reservation-2 = {size = 100Kb, type = code, socketmask > = 0x} > } > > task group 2 = { > reservation-1 = {size = 80Kb, type = data, socketmask = > 0x}, > reservation-3 = {size = 200Kb, type = code, socketmask > = 0x} > } > > Task group 1 and task group 2 share reservation-1. Because there is only size but not CBM position info, I guess different reservations will not overlap each other, right? Personally I like this way of exposing minimal information to userspace. I think it would work well except for one concern of losing flexibility: For instance, there is a box for which the full CBM is 0xf. After cache reservation creating/freeing for a while we then have reservations: reservation1: 0xf reservation2: 0x00ff0 Now people want to request a reservation whose size is 0xff, so what will the kernel do at this point? It could just return an error or do some moving/merging (e.g. for reservation2: 0x00ff0 => 0x0ff00) and then satisfy the request. But I don't know if the moving/merging will cause delay for tasks that are using it. Thanks, Chao -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFD] CAT user space interface revisited
On Wed, Nov 18, 2015 at 07:25:03PM +0100, Thomas Gleixner wrote: > > Let's look at partitioning itself. We have two options: > >1) Per task partitioning > >2) Per CPU partitioning > > So far we only talked about #1, but I think that #2 has a value as > well. Let me give you a simple example. I would second this. In practice per CPU partitioning is useful for realtime as well. And I can see three possible solutions: 1) What you suggested below, to address both problems in one framework. But I wonder if it would end up too complex. 2) Achieve per CPU partitioning with per task partitioning. For example, if the current CAT patch can solve the kernel threads problem, together with CPU pinning, we can then set the same CBM for all the tasks/kernel threads that run on an isolated CPU. 3) I wonder if it is feasible to separate the two requirements? For example, divide the work into three components: rdt-base, per task interface (current cgroup interface/IOCTL or something) and per CPU interface. The two interfaces are exclusive and selected at build time. One reason to reject this option would be that even with per CPU partitioning, we still need per task partitioning; in that case we will go to option 1) again. Thanks, Chao -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFD] CAT user space interface revisited
On Wed, Nov 18, 2015 at 07:25:03PM +0100, Thomas Gleixner wrote: > > Let's look at partitioning itself. We have two options: > >1) Per task partitioning > >2) Per CPU partitioning > > So far we only talked about #1, but I think that #2 has a value as > well. Let me give you a simple example. I would second this. In practice per CPU partitioning is useful for realtime as well. And I can see three possible solutions: 1) What you suggested below, to address both problems in one framework. But I wonder if it would end up too complex. 2) Achieve per CPU partitioning with per task partitioning. For example, if the current CAT patch can solve the kernel threads problem, together with CPU pinning, we can then set the same CBM for all the tasks/kernel threads that run on an isolated CPU. 3) I wonder if it is feasible to separate the two requirements? For example, divide the work into three components: rdt-base, per task interface (current cgroup interface/IOCTL or something) and per CPU interface. The two interfaces are exclusive and selected at build time. One reason to reject this option would be that even with per CPU partitioning, we still need per task partitioning; in that case we will go to option 1) again. Thanks, Chao -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] KVM: x86: Enable Intel AVX-512 for guest
On Wed, Oct 22, 2014 at 12:17:33PM +0200, Paolo Bonzini wrote: > On 10/22/2014 11:35 AM, Chao Peng wrote: > > Expose Intel AVX-512 feature bits to guest. Also add checks for > > xcr0 AVX512 related bits according to spec: > > http://download-software.intel.com/sites/default/files/managed/71/2e/319433-017.pdf > > > > Signed-off-by: Chao Peng > > The patch looks good, but you also have to patch QEMU in order to > save/restore the values of the registers. IIRC the manual already > details where the registers are in the XSAVE area, so it should be easy > to get them in and out. You can look at the MPX patches for an example. > > In the meanwhile, kernel bits are > > Reviewed-by: Paolo Bonzini > > Paolo Thanks Paolo. QEMU side patch is already sent out to QEMU list in the other thread. Also accessible from url: http://lists.nongnu.org/archive/html/qemu-devel/2014-10/msg02681.html Chao > > > --- > > arch/x86/include/asm/xsave.h |1 + > > arch/x86/kvm/cpuid.c |3 ++- > > arch/x86/kvm/x86.c |6 ++ > > arch/x86/kvm/x86.h |3 ++- > > 4 files changed, 11 insertions(+), 2 deletions(-) > > > > diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h > > index 7e7a79a..5fa9770 100644 > > --- a/arch/x86/include/asm/xsave.h > > +++ b/arch/x86/include/asm/xsave.h > > @@ -16,6 +16,7 @@ > > #define XSTATE_Hi16_ZMM0x80 > > > > #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) > > +#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | > > XSTATE_Hi16_ZMM) > > /* Bit 63 of XCR0 is reserved for future expansion */ > > #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) > > > > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c > > index 976e3a5..20d8321 100644 > > --- a/arch/x86/kvm/cpuid.c > > +++ b/arch/x86/kvm/cpuid.c > > @@ -317,7 +317,8 @@ static inline int __do_cpuid_ent(struct > > kvm_cpuid_entry2 *entry, u32 function, > > const u32 kvm_supported_word9_x86_features = > > F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | > > F(BMI2) | F(ERMS) | 
f_invpcid | F(RTM) | f_mpx | F(RDSEED) | > > - F(ADX) | F(SMAP); > > + F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | > > + F(AVX512CD); > > > > /* all calls to cpuid_count() should be made on the same cpu */ > > get_cpu(); > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index 5430e4b..3d77b88 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -651,6 +651,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, > > u64 xcr) > > if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) > > return 1; > > > > + if (xcr0 & XSTATE_AVX512) { > > + if (!(xcr0 & XSTATE_YMM)) > > + return 1; > > + if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512) > > + return 1; > > + } > > kvm_put_guest_xcr0(vcpu); > > vcpu->arch.xcr0 = xcr0; > > > > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > > index 7cb9c45..cc1d61a 100644 > > --- a/arch/x86/kvm/x86.h > > +++ b/arch/x86/kvm/x86.h > > @@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt > > *ctxt, > > bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); > > > > #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ > > - | XSTATE_BNDREGS | XSTATE_BNDCSR) > > + | XSTATE_BNDREGS | XSTATE_BNDCSR \ > > + | XSTATE_AVX512) > > extern u64 host_xcr0; > > > > extern u64 kvm_supported_xcr0(void); > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] KVM: x86: Enable Intel AVX-512 for guest
Expose Intel AVX-512 feature bits to guest. Also add checks for xcr0 AVX512 related bits according to spec: http://download-software.intel.com/sites/default/files/managed/71/2e/319433-017.pdf Signed-off-by: Chao Peng --- arch/x86/include/asm/xsave.h |1 + arch/x86/kvm/cpuid.c |3 ++- arch/x86/kvm/x86.c |6 ++ arch/x86/kvm/x86.h |3 ++- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 7e7a79a..5fa9770 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -16,6 +16,7 @@ #define XSTATE_Hi16_ZMM0x80 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) /* Bit 63 of XCR0 is reserved for future expansion */ #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 976e3a5..20d8321 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -317,7 +317,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP); + F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | + F(AVX512CD); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5430e4b..3d77b88 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -651,6 +651,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) return 1; + if (xcr0 & XSTATE_AVX512) { + if (!(xcr0 & XSTATE_YMM)) + return 1; + if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512) + return 1; + } kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7cb9c45..cc1d61a 100644 --- a/arch/x86/kvm/x86.h +++ 
b/arch/x86/kvm/x86.h @@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_BNDREGS | XSTATE_BNDCSR) + | XSTATE_BNDREGS | XSTATE_BNDCSR \ + | XSTATE_AVX512) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] KVM: x86: Enable Intel AVX-512 for guest
On Wed, Oct 22, 2014 at 12:17:33PM +0200, Paolo Bonzini wrote: On 10/22/2014 11:35 AM, Chao Peng wrote: Expose Intel AVX-512 feature bits to guest. Also add checks for xcr0 AVX512 related bits according to spec: http://download-software.intel.com/sites/default/files/managed/71/2e/319433-017.pdf Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> The patch looks good, but you also have to patch QEMU in order to save/restore the values of the registers. IIRC the manual already details where the registers are in the XSAVE area, so it should be easy to get them in and out. You can look at the MPX patches for an example. In the meanwhile, kernel bits are Reviewed-by: Paolo Bonzini <pbonz...@redhat.com> Paolo Thanks Paolo. QEMU side patch is already sent out to QEMU list in the other thread. Also accessible from url: http://lists.nongnu.org/archive/html/qemu-devel/2014-10/msg02681.html Chao --- arch/x86/include/asm/xsave.h |1 + arch/x86/kvm/cpuid.c |3 ++- arch/x86/kvm/x86.c |6 ++ arch/x86/kvm/x86.h |3 ++- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 7e7a79a..5fa9770 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -16,6 +16,7 @@ #define XSTATE_Hi16_ZMM0x80 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) /* Bit 63 of XCR0 is reserved for future expansion */ #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 976e3a5..20d8321 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -317,7 +317,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP); + F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | + 
F(AVX512CD); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5430e4b..3d77b88 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -651,6 +651,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) return 1; + if (xcr0 & XSTATE_AVX512) { + if (!(xcr0 & XSTATE_YMM)) + return 1; + if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512) + return 1; + } kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7cb9c45..cc1d61a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_BNDREGS | XSTATE_BNDCSR) + | XSTATE_BNDREGS | XSTATE_BNDCSR \ + | XSTATE_AVX512) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] KVM: x86: Enable Intel AVX-512 for guest
Expose Intel AVX-512 feature bits to guest. Also add checks for xcr0 AVX512 related bits according to spec: http://download-software.intel.com/sites/default/files/managed/71/2e/319433-017.pdf Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com> --- arch/x86/include/asm/xsave.h |1 + arch/x86/kvm/cpuid.c |3 ++- arch/x86/kvm/x86.c |6 ++ arch/x86/kvm/x86.h |3 ++- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 7e7a79a..5fa9770 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -16,6 +16,7 @@ #define XSTATE_Hi16_ZMM0x80 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) +#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) /* Bit 63 of XCR0 is reserved for future expansion */ #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 976e3a5..20d8321 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -317,7 +317,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP); + F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | + F(AVX512CD); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5430e4b..3d77b88 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -651,6 +651,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) return 1; + if (xcr0 & XSTATE_AVX512) { + if (!(xcr0 & XSTATE_YMM)) + return 1; + if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512) + return 1; + } kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7cb9c45..cc1d61a 100644 --- 
a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_BNDREGS | XSTATE_BNDCSR) + | XSTATE_BNDREGS | XSTATE_BNDCSR \ + | XSTATE_AVX512) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/