[PATCH v2] x86/boot: Support uncompressed kernel

2017-04-04 Thread Chao Peng
A compressed kernel has its own drawback: decompressing takes time. Even
though the time is short enough to ignore in most cases, for cases where
time is critical the decompression time still matters.

The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built binary
can have no decompressing at all. The experiment shows:

kernel               kernel size   time in decompress_kernel
compressed (gzip)    3.3M          53ms
compressed (lz4)     4.5M          16ms
uncompressed         14M           2ms

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
---
v2:
  * add HAVE_KERNEL_RAW
  * decode ELF kernel in place instead of getting another copy
  * minor comment fix
---
 arch/x86/Kconfig  |  1 +
 arch/x86/boot/compressed/Makefile |  3 +++
 arch/x86/boot/compressed/misc.c   | 18 +-
 init/Kconfig  | 13 -
 scripts/Makefile.lib  |  8 
 5 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc98d5a..207695c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -142,6 +142,7 @@ config X86
select HAVE_KERNEL_LZ4
select HAVE_KERNEL_LZMA
select HAVE_KERNEL_LZO
+   select HAVE_KERNEL_RAW
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
diff --git a/arch/x86/boot/compressed/Makefile 
b/arch/x86/boot/compressed/Makefile
index 44163e8..ed366e1 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -120,6 +120,8 @@ $(obj)/vmlinux.relocs: vmlinux FORCE
 vmlinux.bin.all-y := $(obj)/vmlinux.bin
 vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
 
+$(obj)/vmlinux.bin.raw: $(vmlinux.bin.all-y) FORCE
+   $(call if_changed,raw)
 $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,gzip)
 $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
@@ -133,6 +135,7 @@ $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
 $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lz4)
 
+suffix-$(CONFIG_KERNEL_RAW):= raw
 suffix-$(CONFIG_KERNEL_GZIP)   := gz
 suffix-$(CONFIG_KERNEL_BZIP2)  := bz2
 suffix-$(CONFIG_KERNEL_LZMA)   := lzma
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b3c5a5f0..9791ca9 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -51,6 +51,10 @@
 static int vidport;
 static int lines, cols;
 
+#ifdef CONFIG_KERNEL_RAW
+#include 
+#endif
+
 #ifdef CONFIG_KERNEL_GZIP
 #include "../../../../lib/decompress_inflate.c"
 #endif
@@ -265,7 +269,7 @@ static inline void handle_relocations(void *output, 
unsigned long output_len,
 { }
 #endif
 
-static void parse_elf(void *output)
+static void parse_elf(void* buf, void *output)
 {
 #ifdef CONFIG_X86_64
Elf64_Ehdr ehdr;
@@ -277,7 +281,7 @@ static void parse_elf(void *output)
void *dest;
int i;
 
-   memcpy(&ehdr, output, sizeof(ehdr));
+   memcpy(&ehdr, buf, sizeof(ehdr));
if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
   ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
   ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
@@ -292,7 +296,7 @@ static void parse_elf(void *output)
if (!phdrs)
error("Failed to allocate space for phdrs");
 
-   memcpy(phdrs, output + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);
+   memcpy(phdrs, buf + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);
 
for (i = 0; i < ehdr.e_phnum; i++) {
phdr = [i];
@@ -305,7 +309,7 @@ static void parse_elf(void *output)
 #else
dest = (void *)(phdr->p_paddr);
 #endif
-   memmove(dest, output + phdr->p_offset, phdr->p_filesz);
+   memmove(dest, buf + phdr->p_offset, phdr->p_filesz);
break;
default: /* Ignore other PT_* */ break;
}
@@ -401,10 +405,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, 
memptr heap,
error("Destination virtual address changed when not 
relocatable");
 #endif
 
+#ifdef CONFIG_KERNEL_RAW
+   parse_elf(input_data, output);
+#else
debug_putstr("\nDecompressing Linux... ");
__decompress(input_data, input_len, NULL, NULL, output, output_len,
NULL, error);
-   parse_elf(output);
+   parse_elf(output, output);
+#endif
handle_relocations(output, output_len, virt_addr);
debug_putstr("done.\nBooting the kernel.\n");
return output;
diff --git a/init/Kconfig b/init/Kconfig
index a92f27d..b8926bb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -109,6 +109,9 @@ config LOCALVERSION_AUTO
 
  which is done within the script "scripts/setlocalversion".)
 
+config HAVE_KERNEL_RAW
+   bool
+
 config HAVE_KERNEL_GZIP
bool
 
@

[PATCH v2] x86/boot: Support uncompressed kernel

2017-04-04 Thread Chao Peng
A compressed kernel has its own drawback: decompressing takes time. Even
though the time is short enough to ignore in most cases, for cases where
time is critical the decompression time still matters.

The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built binary
can have no decompressing at all. The experiment shows:

kernel               kernel size   time in decompress_kernel
compressed (gzip)    3.3M          53ms
compressed (lz4)     4.5M          16ms
uncompressed         14M           2ms

Signed-off-by: Chao Peng 
---
v2:
  * add HAVE_KERNEL_RAW
  * decode ELF kernel in place instead of getting another copy
  * minor comment fix
---
 arch/x86/Kconfig  |  1 +
 arch/x86/boot/compressed/Makefile |  3 +++
 arch/x86/boot/compressed/misc.c   | 18 +-
 init/Kconfig  | 13 -
 scripts/Makefile.lib  |  8 
 5 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc98d5a..207695c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -142,6 +142,7 @@ config X86
select HAVE_KERNEL_LZ4
select HAVE_KERNEL_LZMA
select HAVE_KERNEL_LZO
+   select HAVE_KERNEL_RAW
select HAVE_KERNEL_XZ
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
diff --git a/arch/x86/boot/compressed/Makefile 
b/arch/x86/boot/compressed/Makefile
index 44163e8..ed366e1 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -120,6 +120,8 @@ $(obj)/vmlinux.relocs: vmlinux FORCE
 vmlinux.bin.all-y := $(obj)/vmlinux.bin
 vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
 
+$(obj)/vmlinux.bin.raw: $(vmlinux.bin.all-y) FORCE
+   $(call if_changed,raw)
 $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,gzip)
 $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
@@ -133,6 +135,7 @@ $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
 $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lz4)
 
+suffix-$(CONFIG_KERNEL_RAW):= raw
 suffix-$(CONFIG_KERNEL_GZIP)   := gz
 suffix-$(CONFIG_KERNEL_BZIP2)  := bz2
 suffix-$(CONFIG_KERNEL_LZMA)   := lzma
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b3c5a5f0..9791ca9 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -51,6 +51,10 @@
 static int vidport;
 static int lines, cols;
 
+#ifdef CONFIG_KERNEL_RAW
+#include 
+#endif
+
 #ifdef CONFIG_KERNEL_GZIP
 #include "../../../../lib/decompress_inflate.c"
 #endif
@@ -265,7 +269,7 @@ static inline void handle_relocations(void *output, 
unsigned long output_len,
 { }
 #endif
 
-static void parse_elf(void *output)
+static void parse_elf(void* buf, void *output)
 {
 #ifdef CONFIG_X86_64
Elf64_Ehdr ehdr;
@@ -277,7 +281,7 @@ static void parse_elf(void *output)
void *dest;
int i;
 
-   memcpy(&ehdr, output, sizeof(ehdr));
+   memcpy(&ehdr, buf, sizeof(ehdr));
if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
   ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
   ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
@@ -292,7 +296,7 @@ static void parse_elf(void *output)
if (!phdrs)
error("Failed to allocate space for phdrs");
 
-   memcpy(phdrs, output + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);
+   memcpy(phdrs, buf + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);
 
for (i = 0; i < ehdr.e_phnum; i++) {
phdr = [i];
@@ -305,7 +309,7 @@ static void parse_elf(void *output)
 #else
dest = (void *)(phdr->p_paddr);
 #endif
-   memmove(dest, output + phdr->p_offset, phdr->p_filesz);
+   memmove(dest, buf + phdr->p_offset, phdr->p_filesz);
break;
default: /* Ignore other PT_* */ break;
}
@@ -401,10 +405,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, 
memptr heap,
error("Destination virtual address changed when not 
relocatable");
 #endif
 
+#ifdef CONFIG_KERNEL_RAW
+   parse_elf(input_data, output);
+#else
debug_putstr("\nDecompressing Linux... ");
__decompress(input_data, input_len, NULL, NULL, output, output_len,
NULL, error);
-   parse_elf(output);
+   parse_elf(output, output);
+#endif
handle_relocations(output, output_len, virt_addr);
debug_putstr("done.\nBooting the kernel.\n");
return output;
diff --git a/init/Kconfig b/init/Kconfig
index a92f27d..b8926bb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -109,6 +109,9 @@ config LOCALVERSION_AUTO
 
  which is done within the script "scripts/setlocalversion".)
 
+config HAVE_KERNEL_RAW
+   bool
+
 config HAVE_KERNEL_GZIP
bool
 
@@ -130,7 +133,7 @@ config HA

Re: [PATCH] x86/boot: Support uncompressed kernel

2017-03-28 Thread Chao Peng
On Mon, 2017-03-27 at 15:25 +0200, Arnd Bergmann wrote:
> On Mon, Mar 27, 2017 at 1:47 PM, Michal Marek <mma...@suse.com> wrote:
> > 
> > Dne 27.3.2017 v 09:58 Sebastian Andrzej Siewior napsal(a):
> > > 
> > > On 2017-03-24 13:35:40 [+0800], Chao Peng wrote:
> > > > 
> > > > 
> > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > kernel   kernel sizetime in
> > > > > > > decompress_kernel
> > > > > > > compressed (gzip)3.3M   53ms
> > > > > > > uncompressed 14M3ms
> > > > > > 
> > > > Exactly, LZ4 is the fastest. It takes 16ms to complete the
> > > > decompression. Still sounds a little longer when compared to
> > > > uncompressed kernel.
> > > 
> > > Are we seriously talking here about one-time improvement of 13ms
> > > boot time?
> > 
> > If the use case is launching new VM instances continuously, then
> > compressing the kernel image is about as useful as compressing
> > /bin/bash.
> 
> I guess the next step would be to use CONFIG_XIP_KERNEL on x86,
> which requires an uncompressed kernel but has the additional advantage
> of sharing the read-only sections of the kernel image across virtual
> machines, resulting in better RAM and cache usage.

That is something we wanna look into :)

Chao


Re: [PATCH] x86/boot: Support uncompressed kernel

2017-03-28 Thread Chao Peng
On Mon, 2017-03-27 at 15:25 +0200, Arnd Bergmann wrote:
> On Mon, Mar 27, 2017 at 1:47 PM, Michal Marek  wrote:
> > 
> > Dne 27.3.2017 v 09:58 Sebastian Andrzej Siewior napsal(a):
> > > 
> > > On 2017-03-24 13:35:40 [+0800], Chao Peng wrote:
> > > > 
> > > > 
> > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > kernel   kernel sizetime in
> > > > > > > decompress_kernel
> > > > > > > compressed (gzip)3.3M   53ms
> > > > > > > uncompressed 14M3ms
> > > > > > 
> > > > Exactly, LZ4 is the fastest. It takes 16ms to complete the
> > > > decompression. Still sounds a little longer when compared to
> > > > uncompressed kernel.
> > > 
> > > Are we seriously talking here about one-time improvement of 13ms
> > > boot time?
> > 
> > If the use case is launching new VM instances continuously, then
> > compressing the kernel image is about as useful as compressing
> > /bin/bash.
> 
> I guess the next step would be to use CONFIG_XIP_KERNEL on x86,
> which requires an uncompressed kernel but has the additional advantage
> of sharing the read-only sections of the kernel image across virtual
> machines, resulting in better RAM and cache usage.

That is something we wanna look into :)

Chao


Re: [PATCH] x86/boot: Support uncompressed kernel

2017-03-27 Thread Chao Peng
On Mon, 2017-03-27 at 09:58 +0200, Sebastian Andrzej Siewior wrote:
> On 2017-03-24 13:35:40 [+0800], Chao Peng wrote:
> > 
> > 
> > > 
> > > > 
> > > > > 
> > > > > kernel   kernel sizetime in
> > > > > decompress_kernel
> > > > > compressed (gzip)3.3M   53ms
> > > > > uncompressed 14M3ms
> > > > 
> > Exactly, LZ4 is the fastest. It takes 16ms to complete the
> > decompression. Still sounds a little longer when compared to
> > uncompressed kernel.
> 
> Are we seriously talking here about one-time improvement of 13ms
> boot time?

The usage model for us is to launch the kernel in a virtual machine, and there
will be thousands of instances launched and shut down/relaunched
frequently, so every single millisecond helps. And 13ms means a 20%
improvement to our existing optimization (the other part besides
decompression is optimized to ~40ms).

Chao
> 
> > 
> > Chao
> 
> Sebastian


Re: [PATCH] x86/boot: Support uncompressed kernel

2017-03-27 Thread Chao Peng
On Mon, 2017-03-27 at 09:58 +0200, Sebastian Andrzej Siewior wrote:
> On 2017-03-24 13:35:40 [+0800], Chao Peng wrote:
> > 
> > 
> > > 
> > > > 
> > > > > 
> > > > > kernel   kernel sizetime in
> > > > > decompress_kernel
> > > > > compressed (gzip)3.3M   53ms
> > > > > uncompressed 14M3ms
> > > > 
> > Exactly, LZ4 is the fastest. It takes 16ms to complete the
> > decompression. Still sounds a little longer when compared to
> > uncompressed kernel.
> 
> Are we seriously talking here about one-time improvement of 13ms
> boot time?

The usage model for us is to launch the kernel in a virtual machine, and there
will be thousands of instances launched and shut down/relaunched
frequently, so every single millisecond helps. And 13ms means a 20%
improvement to our existing optimization (the other part besides
decompression is optimized to ~40ms).

Chao
> 
> > 
> > Chao
> 
> Sebastian


Re: [PATCH] x86/boot: Support uncompressed kernel

2017-03-23 Thread Chao Peng

> > > The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built
> > > binary
> > > can have no uncompressing at all. The experiment shows:
> > > 
> > > kernel   kernel sizetime in decompress_kernel
> > > compressed (gzip)3.3M   53ms
> > > uncompressed 14M3ms
> > 
> > How about the time difference for bootloader to read kernel from
> > flash/disk/network to ram?

The loading time for the bootloader can be longer as the size increases, but
that depends on which media it uses. For our use case, it's not a big
problem. As we run the kernel in a virtual machine and launch thousands of
instances on the same physical machine, only the first instance needs
to read from the file and later we just copy the memory. The thing that
really matters for us is how fast we can boot for the majority of the
instances.

> 
> there are also faster de-compressors than gzip out there. LZ4, for
> instance.
> LZ4, as far as I remember, can be quite fast, like ~10 times faster
> than gzip.
> have you tested it?

Exactly, LZ4 is the fastest. It takes 16ms to complete the
decompression. Still sounds a little longer when compared to
uncompressed kernel.

Chao


Re: [PATCH] x86/boot: Support uncompressed kernel

2017-03-23 Thread Chao Peng

> > > The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built
> > > binary
> > > can have no uncompressing at all. The experiment shows:
> > > 
> > > kernel   kernel sizetime in decompress_kernel
> > > compressed (gzip)3.3M   53ms
> > > uncompressed 14M3ms
> > 
> > How about the time difference for bootloader to read kernel from
> > flash/disk/network to ram?

The loading time for the bootloader can be longer as the size increases, but
that depends on which media it uses. For our use case, it's not a big
problem. As we run the kernel in a virtual machine and launch thousands of
instances on the same physical machine, only the first instance needs
to read from the file and later we just copy the memory. The thing that
really matters for us is how fast we can boot for the majority of the
instances.

> 
> there are also faster de-compressors than gzip out there. LZ4, for
> instance.
> LZ4, as far as I remember, can be quite fast, like ~10 times faster
> than gzip.
> have you tested it?

Exactly, LZ4 is the fastest. It takes 16ms to complete the
decompression. Still sounds a little longer when compared to
uncompressed kernel.

Chao


[PATCH] x86/boot: Support uncompressed kernel

2017-03-23 Thread Chao Peng
A compressed kernel has its own drawback: uncompressing takes time. Even
though the time is short enough to ignore in most cases, for cases where
time is critical this is still a big number. In our ongoing optimization
of kernel boot time, the measured overall kernel boot time is ~90ms while
the uncompressing takes ~50ms with gzip.

The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built binary
can have no uncompressing at all. The experiment shows:

kernel               kernel size   time in decompress_kernel
compressed (gzip)    3.3M          53ms
uncompressed         14M           3ms

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
---
 arch/x86/boot/compressed/Makefile |  3 +++
 arch/x86/boot/compressed/misc.c   | 14 ++
 init/Kconfig  |  7 +++
 scripts/Makefile.lib  |  8 
 4 files changed, 32 insertions(+)

diff --git a/arch/x86/boot/compressed/Makefile 
b/arch/x86/boot/compressed/Makefile
index f9ce75d..fc0e1c0 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -73,6 +73,8 @@ $(obj)/vmlinux.relocs: vmlinux FORCE
 vmlinux.bin.all-y := $(obj)/vmlinux.bin
 vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
 
+$(obj)/vmlinux.bin.raw: $(vmlinux.bin.all-y) FORCE
+   $(call if_changed,raw)
 $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,gzip)
 $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
@@ -86,6 +88,7 @@ $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
 $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lz4)
 
+suffix-$(CONFIG_KERNEL_RAW):= raw
 suffix-$(CONFIG_KERNEL_GZIP)   := gz
 suffix-$(CONFIG_KERNEL_BZIP2)  := bz2
 suffix-$(CONFIG_KERNEL_LZMA)   := lzma
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 79dac17..fb3cd43 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -123,6 +123,20 @@ static char *vidmem;
 static int vidport;
 static int lines, cols;
 
+#ifdef CONFIG_KERNEL_RAW
+#include 
+static int __decompress(unsigned char *buf, long len,
+   long (*fill)(void*, unsigned long),
+   long (*flush)(void*, unsigned long),
+   unsigned char *outbuf, long olen,
+   long *pos,
+   void (*error)(char *x))
+{
+   memcpy(outbuf, buf, olen);
+   return 0;
+}
+#endif
+
 #ifdef CONFIG_KERNEL_GZIP
 #include "../../../../lib/decompress_inflate.c"
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 2232080..1db2ea2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -137,6 +137,13 @@ choice
 
  If in doubt, select 'gzip'
 
+config KERNEL_RAW
+   bool "RAW"
+   help
+ No compression. It creates much bigger kernel and uses much more
+ space (disk/memory) than other choices. It can be useful when
+ decompression speed is the most concern while space is not a problem.
+
 config KERNEL_GZIP
bool "Gzip"
depends on HAVE_KERNEL_GZIP
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 2edbcad..384128d 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -344,6 +344,14 @@ cmd_lz4 = (cat $(filter-out FORCE,$^) | \
lz4c -l -c1 stdin stdout && $(call size_append, $(filter-out 
FORCE,$^))) > $@ || \
(rm -f $@ ; false)
 
+# RAW
+# ---
+quiet_cmd_raw = RAW $@
+cmd_raw = (cat $(filter-out FORCE,$^) && \
+   $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+   (rm -f $@ ; false)
+
+
 # U-Boot mkimage
 # ---
 
-- 
1.8.3.1



[PATCH] x86/boot: Support uncompressed kernel

2017-03-23 Thread Chao Peng
A compressed kernel has its own drawback: uncompressing takes time. Even
though the time is short enough to ignore in most cases, for cases where
time is critical this is still a big number. In our ongoing optimization
of kernel boot time, the measured overall kernel boot time is ~90ms while
the uncompressing takes ~50ms with gzip.

The patch adds a 'CONFIG_KERNEL_RAW' configure choice so the built binary
can have no uncompressing at all. The experiment shows:

kernel               kernel size   time in decompress_kernel
compressed (gzip)    3.3M          53ms
uncompressed         14M           3ms

Signed-off-by: Chao Peng 
---
 arch/x86/boot/compressed/Makefile |  3 +++
 arch/x86/boot/compressed/misc.c   | 14 ++
 init/Kconfig  |  7 +++
 scripts/Makefile.lib  |  8 
 4 files changed, 32 insertions(+)

diff --git a/arch/x86/boot/compressed/Makefile 
b/arch/x86/boot/compressed/Makefile
index f9ce75d..fc0e1c0 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -73,6 +73,8 @@ $(obj)/vmlinux.relocs: vmlinux FORCE
 vmlinux.bin.all-y := $(obj)/vmlinux.bin
 vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
 
+$(obj)/vmlinux.bin.raw: $(vmlinux.bin.all-y) FORCE
+   $(call if_changed,raw)
 $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,gzip)
 $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
@@ -86,6 +88,7 @@ $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
 $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lz4)
 
+suffix-$(CONFIG_KERNEL_RAW):= raw
 suffix-$(CONFIG_KERNEL_GZIP)   := gz
 suffix-$(CONFIG_KERNEL_BZIP2)  := bz2
 suffix-$(CONFIG_KERNEL_LZMA)   := lzma
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 79dac17..fb3cd43 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -123,6 +123,20 @@ static char *vidmem;
 static int vidport;
 static int lines, cols;
 
+#ifdef CONFIG_KERNEL_RAW
+#include 
+static int __decompress(unsigned char *buf, long len,
+   long (*fill)(void*, unsigned long),
+   long (*flush)(void*, unsigned long),
+   unsigned char *outbuf, long olen,
+   long *pos,
+   void (*error)(char *x))
+{
+   memcpy(outbuf, buf, olen);
+   return 0;
+}
+#endif
+
 #ifdef CONFIG_KERNEL_GZIP
 #include "../../../../lib/decompress_inflate.c"
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 2232080..1db2ea2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -137,6 +137,13 @@ choice
 
  If in doubt, select 'gzip'
 
+config KERNEL_RAW
+   bool "RAW"
+   help
+ No compression. It creates much bigger kernel and uses much more
+ space (disk/memory) than other choices. It can be useful when
+ decompression speed is the most concern while space is not a problem.
+
 config KERNEL_GZIP
bool "Gzip"
depends on HAVE_KERNEL_GZIP
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 2edbcad..384128d 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -344,6 +344,14 @@ cmd_lz4 = (cat $(filter-out FORCE,$^) | \
lz4c -l -c1 stdin stdout && $(call size_append, $(filter-out 
FORCE,$^))) > $@ || \
(rm -f $@ ; false)
 
+# RAW
+# ---
+quiet_cmd_raw = RAW $@
+cmd_raw = (cat $(filter-out FORCE,$^) && \
+   $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+   (rm -f $@ ; false)
+
+
 # U-Boot mkimage
 # ---
 
-- 
1.8.3.1



[PATCH] KVM: VMX: use correct vmcs_read/write for guest segment selector/base

2017-02-21 Thread Chao Peng
Guest segment selector is 16 bit field and guest segment base is natural
width field. Fix two incorrect invocations accordingly.

Without this patch, build fails when aggressive inlining is used with ICC.

Cc: sta...@vger.kernel.org
Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
---
 arch/x86/kvm/vmx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a236dec..2c22aef 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3962,7 +3962,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment 
*save)
}
 
vmcs_write16(sf->selector, var.selector);
-   vmcs_write32(sf->base, var.base);
+   vmcs_writel(sf->base, var.base);
vmcs_write32(sf->limit, var.limit);
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights());
 }
@@ -8350,7 +8350,7 @@ static void kvm_flush_pml_buffers(struct kvm *kvm)
 static void vmx_dump_sel(char *name, uint32_t sel)
 {
pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
-  name, vmcs_read32(sel),
+  name, vmcs_read16(sel),
   vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
   vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
   vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
-- 
1.8.3.1



[PATCH] KVM: VMX: use correct vmcs_read/write for guest segment selector/base

2017-02-21 Thread Chao Peng
Guest segment selector is 16 bit field and guest segment base is natural
width field. Fix two incorrect invocations accordingly.

Without this patch, build fails when aggressive inlining is used with ICC.

Cc: sta...@vger.kernel.org
Signed-off-by: Chao Peng 
---
 arch/x86/kvm/vmx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a236dec..2c22aef 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3962,7 +3962,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment 
*save)
}
 
vmcs_write16(sf->selector, var.selector);
-   vmcs_write32(sf->base, var.base);
+   vmcs_writel(sf->base, var.base);
vmcs_write32(sf->limit, var.limit);
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights());
 }
@@ -8350,7 +8350,7 @@ static void kvm_flush_pml_buffers(struct kvm *kvm)
 static void vmx_dump_sel(char *name, uint32_t sel)
 {
pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
-  name, vmcs_read32(sel),
+  name, vmcs_read16(sel),
   vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
   vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
   vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
-- 
1.8.3.1



Re: [RFC PATCH] x86/boot: make ELF kernel multiboot-able

2017-02-15 Thread Chao Peng
On Wed, 2017-02-15 at 17:42 +0100, Paolo Bonzini wrote:
> 
> On 15/02/2017 15:41, Chao Peng wrote:
> > 
> > Multiboot specification 
> > (http://git.savannah.gnu.org/cgit/grub.git/tree/doc/multiboot.texi?h=mul
> > tiboot2)
> > is an open standard that provides kernels with a uniform way to be booted
> > by multiboot-compliant bootloaders (like grub).
> > 
> > This patch is trying to make Linux ELF kernel image to be a
> > multiboot-compliant OS so that it can be loaded by a multiboot-compliant
> > bootloader. The benefit is eliminating the maintenance for realmode and
> > decompression code and especially when the kernel is loaded in a virtual
> > machine, the reducing for these code can greatly cut down the boot time.
> > 
> > However, the current version of multiboot spec doesn't support 64 bit
> > well so for 64 bit kernel we need stub code to jump from 32 bit code to
> > 64 bit code. Besides, there are still some other issues:
> >   1). '-z max-page-size=0x1000' is used so the text segment start is in
> >   multiboot header search scope because GNU LD has default page size of
> >   0x0020 for ELF64, which will fail multiboot test.
> > 
> >   2). The bootloader like grub has support for ELF kernel (even for ELF64)
> >   which makes the patch easier. However, the current grub implementation
> >   thinks the entry address should be a VA. E.g. for 64 bit kernel, the entry
> >   address (0x100) is actually physical address, grub refuses to load it
> >   by saying: 'entry point isn't in a segment'.
> 
> For kvm-unit-tests, we do "objcopy -O elf32-i386 dest.32bit dest.64bit"
> and pass the resulting 32bit ELF file to grub.
> 
> Out of curiosity, what happens if you pass the resulting multiboot file
> to QEMU's -kernel option?
> 

The resulting kernel is a multiboot2 kernel. QEMU however supports loading
multiboot v1 only.

Chao
> Thanks,
> 
> Paolo
> 
> > 
> > This patch is sent out as RFC in case you have some ideas.
> > 
> > Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>


Re: [RFC PATCH] x86/boot: make ELF kernel multiboot-able

2017-02-15 Thread Chao Peng
On Wed, 2017-02-15 at 17:42 +0100, Paolo Bonzini wrote:
> 
> On 15/02/2017 15:41, Chao Peng wrote:
> > 
> > Multiboot specification 
> > (http://git.savannah.gnu.org/cgit/grub.git/tree/doc/multiboot.texi?h=mul
> > tiboot2)
> > is an open standard that provides kernels with a uniform way to be booted
> > by multiboot-compliant bootloaders (like grub).
> > 
> > This patch is trying to make Linux ELF kernel image to be a
> > multiboot-compliant OS so that it can be loaded by a multiboot-compliant
> > bootloader. The benefit is eliminating the maintenance for realmode and
> > decompression code and especially when the kernel is loaded in a virtual
> > machine, the reducing for these code can greatly cut down the boot time.
> > 
> > However, the current version of multiboot spec doesn't support 64 bit
> > well so for 64 bit kernel we need stub code to jump from 32 bit code to
> > 64 bit code. Besides, there are still some other issues:
> >   1). '-z max-page-size=0x1000' is used so the text segment start is in
> >   multiboot header search scope because GNU LD has default page size of
> >   0x0020 for ELF64, which will fail multiboot test.
> > 
> >   2). The bootloader like grub has support for ELF kernel (even for ELF64)
> >   which makes the patch easier. However, the current grub implementation
> >   thinks the entry address should be a VA. E.g. for 64 bit kernel, the entry
> >   address (0x100) is actually physical address, grub refuses to load it
> >   by saying: 'entry point isn't in a segment'.
> 
> For kvm-unit-tests, we do "objcopy -O elf32-i386 dest.32bit dest.64bit"
> and pass the resulting 32bit ELF file to grub.
> 
> Out of curiosity, what happens if you pass the resulting multiboot file
> to QEMU's -kernel option?
> 

The resulting kernel is a multiboot2 kernel. QEMU however supports loading
multiboot v1 only.

Chao
> Thanks,
> 
> Paolo
> 
> > 
> > This patch is sent out as RFC in case you have some ideas.
> > 
> > Signed-off-by: Chao Peng 


Re: [RFC PATCH] x86/boot: make ELF kernel multiboot-able

2017-02-15 Thread Chao Peng

> > Just something to consider, provided the issues with multiboot get
> > resolved:
> > 
> > If you want to boot Xen you actually use the multiboot protocol, the
> > last PVH
> > boot patches had borrowed ideas from Multiboot to add an entry to
> > Linux, only
> > it was Xen'ified. What would be Multiboot 2 seemed flexible enough to
> > allow all
> > sorts of custom semantics and information stacked into a boot image.
> > The last
> > thought I had over this topic (before giving up)  was-- if we're going
> > to add
> > yet-another-entry (TM) why not extend the Multiboot 2 protocol with
> > the
> > semantics we need to boot any virtual environment and then add
> > Multiboot 2
> > support entry on Linux? We could redirect any custom boot mechanism
> > then to
> > just use that given its flexibility.
> > 
> >  Luis
> 
> Multiboot has a fundamentally broken assumption, which is to do certain work 
> for the kernel in the
> bootloader.  This is fundamentally a bad idea, because you always want to do 
> things in the latest
> step possible during the boot process, being the most upgradeable, and have 
> the interface as
> narrow as possible.
> 
> Therefore, using Multiboot is actively a negative step.  It is declared an 
> "Open Standard" but
> anything can be such declared; it really is a claim that "everything should 
> work like Grub."

Thanks Peter and Luis for comments.

Chao


Re: [RFC PATCH] x86/boot: make ELF kernel multiboot-able

2017-02-15 Thread Chao Peng

> > Just something to consider, provided the issues with multiboot get
> > resolved:
> > 
> > If you want to boot Xen you actually use the multiboot protocol, the
> > last PVH
> > boot patches had borrowed ideas from Multiboot to add an entry to
> > Linux, only
> > it was Xen'ified. What would be Multiboot 2 seemed flexible enough to
> > allow all
> > sorts of custom semantics and information stacked into a boot image.
> > The last
> > thought I had over this topic (before giving up)  was-- if we're going
> > to add
> > yet-another-entry (TM) why not extend the Multiboot 2 protocol with
> > the
> > semantics we need to boot any virtual environment and then add
> > Multiboot 2
> > support entry on Linux? We could redirect any custom boot mechanism
> > then to
> > just use that given its flexibility.
> > 
> >  Luis
> 
> Multiboot has a fundamentally broken assumption, which is to do certain work 
> for the kernel in the
> bootloader.  This is fundamentally a bad idea, because you always want to do 
> things in the latest
> step possible during the boot process, being the most upgradeable, and have 
> the interface as
> narrow as possible.
> 
> Therefore, using Multiboot is actively a negative step.  It is declared an 
> "Open Standard" but
> anything can be such declared; it really is a claim that "everything should 
> work like Grub."

Thanks Peter and Luis for comments.

Chao


[RFC PATCH] x86/boot: make ELF kernel multiboot-able

2017-02-15 Thread Chao Peng
Multiboot specification 
(http://git.savannah.gnu.org/cgit/grub.git/tree/doc/multiboot.texi?h=multiboot2)
is an open standard that provides kernels with a uniform way to be booted
by multiboot-compliant bootloaders (like grub).

This patch tries to make the Linux ELF kernel image a
multiboot-compliant OS so that it can be loaded by a multiboot-compliant
bootloader. The benefit is eliminating the maintenance of realmode and
decompression code and, especially when the kernel is loaded in a virtual
machine, the removal of this code can greatly cut down the boot time.

However, the current version of multiboot spec doesn't support 64 bit
well so for 64 bit kernel we need stub code to jump from 32 bit code to
64 bit code. Besides, there are still some other issues:
  1). '-z max-page-size=0x1000' is used so the text segment start is in
  multiboot header search scope because GNU LD has default page size of
  0x0020 for ELF64, which will fail multiboot test.

  2). Bootloaders like grub have support for ELF kernels (even for ELF64),
  which makes the patch easier. However, the current grub implementation
  thinks the entry address should be a VA. E.g. for a 64 bit kernel, the entry
  address (0x100) is actually a physical address, so grub refuses to load it
  by saying: 'entry point isn't in a segment'.

This patch is sent out as RFC in case you have some ideas.

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
---
 arch/x86/Kconfig |   6 +
 arch/x86/Makefile|   4 +
 arch/x86/kernel/head64.c |  64 ++-
 arch/x86/kernel/head_64.S| 175 ++
 arch/x86/kernel/multiboot2.h | 417 +++
 5 files changed, 665 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/multiboot2.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bada636..75a9ef2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -585,6 +585,12 @@ config X86_AMD_PLATFORM_DEVICE
  I2C and UART depend on COMMON_CLK to set clock. GPIO driver is
  implemented under PINCTRL subsystem.
 
+config X86_MULTIBOOT_STUB
+   bool "Multiboot stub support for ELF kernel image"
+   default n
+   ---help---
+ Set whether multiboot stub is on or off.
+
 config IOSF_MBI
tristate "Intel SoC IOSF Sideband support for SoC platforms"
depends on PCI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2d44933..d945c34 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -39,6 +39,10 @@ ifdef CONFIG_X86_NEED_RELOCS
 LDFLAGS_vmlinux := --emit-relocs
 endif
 
+ifdef CONFIG_X86_MULTIBOOT_STUB
+   LDFLAGS_vmlinux += -z max-page-size=0x1000
+endif
+
 #
 # Prevent GCC from generating any FP code by mistake.
 #
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 54a2372..c0f375a 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -29,6 +29,8 @@
 #include 
 #include 
 
+#include "multiboot2.h"
+
 /*
  * Manage page tables very early on.
  */
@@ -36,6 +38,7 @@ extern pgd_t early_level4_pgt[PTRS_PER_PGD];
 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
 static unsigned int __initdata next_early_pgt = 2;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+char *multiboot_info = NULL;
 
 /* Wipe all early page tables except for the kernel symbol map */
 static void __init reset_early_page_tables(void)
@@ -130,6 +133,60 @@ static void __init copy_bootdata(char *real_mode_data)
}
 }
 
+static void __init copy_multiboot_cmdline(struct multiboot_tag_string *tag)
+{
+   unsigned int size = tag->size - 8;
+
+   if (size > COMMAND_LINE_SIZE)
+   size = COMMAND_LINE_SIZE;
+   boot_params.hdr.cmdline_size = size;
+   memcpy(boot_command_line, tag->string, size);
+}
+
+static void __init copy_multiboot_mmap(struct multiboot_tag_mmap *tag)
+{
+   multiboot_memory_map_t *mmap;
+   int nr = 0;
+
+   for (mmap = tag->entries;
+   (u8 *)mmap < (u8 *)tag + tag->size && nr < E820MAX;
+   mmap = (multiboot_memory_map_t *)((unsigned long)mmap +
+   tag->entry_size)) {
+   boot_params.e820_map[nr].addr = mmap->addr;
+   boot_params.e820_map[nr].size = mmap->len;
+   boot_params.e820_map[nr].type = mmap->type;
+   nr++;
+   }
+   boot_params.e820_entries = nr;
+}
+
+static void __init copy_multiboot_info(void)
+{
+   struct multiboot_tag *tag;
+   char *ptr = __va(multiboot_info);
+
+   boot_params.hdr.boot_flag = 0xAA55;
+   boot_params.hdr.header = 0x53726448;
+   boot_params.hdr.version = 0x202;
+
+   for (tag = (struct multiboot_tag *)(ptr + 8);
+   tag->type != MULTIBOOT_TAG_TYPE_END;
+   tag = (struct multiboot_tag *

[RFC PATCH] x86/boot: make ELF kernel multiboot-able

2017-02-15 Thread Chao Peng
Multiboot specification 
(http://git.savannah.gnu.org/cgit/grub.git/tree/doc/multiboot.texi?h=multiboot2)
is an open standard that provides kernels with a uniform way to be booted
by multiboot-compliant bootloaders (like grub).

This patch tries to make the Linux ELF kernel image a
multiboot-compliant OS so that it can be loaded by a multiboot-compliant
bootloader. The benefit is eliminating the maintenance of the realmode and
decompression code; especially when the kernel is loaded in a virtual
machine, removing this code can greatly cut down the boot time.

However, the current version of multiboot spec doesn't support 64 bit
well so for 64 bit kernel we need stub code to jump from 32 bit code to
64 bit code. Besides, there are still some other issues:
  1). '-z max-page-size=0x1000' is used so that the text segment start is
  within the multiboot header search scope, because GNU LD has a default
  page size of 0x00200000 for ELF64, which would fail the multiboot test.

  2). A bootloader like grub has support for ELF kernels (even for ELF64),
  which makes the patch easier. However, the current grub implementation
  thinks the entry address should be a VA. E.g. for a 64 bit kernel, the entry
  address (0x100) is actually a physical address, so grub refuses to load it
  by saying: 'entry point isn't in a segment'.

This patch is sent out as RFC in case you have some ideas.

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
---
 arch/x86/Kconfig |   6 +
 arch/x86/Makefile|   4 +
 arch/x86/kernel/head64.c |  64 ++-
 arch/x86/kernel/head_64.S| 175 ++
 arch/x86/kernel/multiboot2.h | 417 +++
 5 files changed, 665 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/multiboot2.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bada636..75a9ef2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -585,6 +585,12 @@ config X86_AMD_PLATFORM_DEVICE
  I2C and UART depend on COMMON_CLK to set clock. GPIO driver is
  implemented under PINCTRL subsystem.
 
+config X86_MULTIBOOT_STUB
+   bool "Multiboot stub support for ELF kernel image"
+   default n
+   ---help---
+ Set whether multiboot stub is on or off.
+
 config IOSF_MBI
tristate "Intel SoC IOSF Sideband support for SoC platforms"
depends on PCI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2d44933..d945c34 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -39,6 +39,10 @@ ifdef CONFIG_X86_NEED_RELOCS
 LDFLAGS_vmlinux := --emit-relocs
 endif
 
+ifdef CONFIG_X86_MULTIBOOT_STUB
+   LDFLAGS_vmlinux += -z max-page-size=0x1000
+endif
+
 #
 # Prevent GCC from generating any FP code by mistake.
 #
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 54a2372..c0f375a 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -29,6 +29,8 @@
 #include 
 #include 
 
+#include "multiboot2.h"
+
 /*
  * Manage page tables very early on.
  */
@@ -36,6 +38,7 @@ extern pgd_t early_level4_pgt[PTRS_PER_PGD];
 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
 static unsigned int __initdata next_early_pgt = 2;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+char *multiboot_info = NULL;
 
 /* Wipe all early page tables except for the kernel symbol map */
 static void __init reset_early_page_tables(void)
@@ -130,6 +133,60 @@ static void __init copy_bootdata(char *real_mode_data)
}
 }
 
+static void __init copy_multiboot_cmdline(struct multiboot_tag_string *tag)
+{
+   unsigned int size = tag->size - 8;
+
+   if (size > COMMAND_LINE_SIZE)
+   size = COMMAND_LINE_SIZE;
+   boot_params.hdr.cmdline_size = size;
+   memcpy(boot_command_line, tag->string, size);
+}
+
+static void __init copy_multiboot_mmap(struct multiboot_tag_mmap *tag)
+{
+   multiboot_memory_map_t *mmap;
+   int nr = 0;
+
+   for (mmap = tag->entries;
+   (u8 *)mmap < (u8 *)tag + tag->size && nr < E820MAX;
+   mmap = (multiboot_memory_map_t *)((unsigned long)mmap +
+   tag->entry_size)) {
+   boot_params.e820_map[nr].addr = mmap->addr;
+   boot_params.e820_map[nr].size = mmap->len;
+   boot_params.e820_map[nr].type = mmap->type;
+   nr++;
+   }
+   boot_params.e820_entries = nr;
+}
+
+static void __init copy_multiboot_info(void)
+{
+   struct multiboot_tag *tag;
+   char *ptr = __va(multiboot_info);
+
+   boot_params.hdr.boot_flag = 0xAA55;
+   boot_params.hdr.header = 0x53726448;
+   boot_params.hdr.version = 0x202;
+
+   for (tag = (struct multiboot_tag *)(ptr + 8);
+   tag->type != MULTIBOOT_TAG_TYPE_END;
+   tag = (struct multiboot_tag *)((u8 *) tag +
+  

Re: [PATCH] cgroups: move cpuset specific checks from generic code to cpuset_can_attach

2015-11-26 Thread Chao Peng
On Wed, Nov 25, 2015 at 08:01:17PM -0200, Marcelo Tosatti wrote:
> 
> Move PF_NO_SETAFFINITY check to cpuset cgroups, where it belongs.
> This makes it possible to attach PF_NO_SETAFFINITY to Intel CAT cgroups.

Looks like that's the right place. I tried it with the intel_rdt subsystem;
at least it doesn't have this restriction anymore (all the tasks can be
moved out of the default group), hence:

Reviewed-by: Chao Peng <chao.p.p...@linux.intel.com>

Chao
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cgroups: move cpuset specific checks from generic code to cpuset_can_attach

2015-11-26 Thread Chao Peng
On Wed, Nov 25, 2015 at 08:01:17PM -0200, Marcelo Tosatti wrote:
> 
> Move PF_NO_SETAFFINITY check to cpuset cgroups, where it belongs.
> This makes it possible to attach PF_NO_SETAFFINITY to Intel CAT cgroups.

Looks like that's the right place. I tried it with the intel_rdt subsystem;
at least it doesn't have this restriction anymore (all the tasks can be
moved out of the default group), hence:

Reviewed-by: Chao Peng <chao.p.p...@linux.intel.com>

Chao
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFD] CAT user space interface revisited

2015-11-24 Thread Chao Peng
On Wed, Nov 18, 2015 at 10:01:54PM -0200, Marcelo Tosatti wrote:
> > tglx
> 
> Again: you don't need to look into the MSR table and relate it 
> to tasks if you store the data as:
> 
>   task group 1 = {
>   reservation-1 = {size = 80Kb, type = data, socketmask = 
> 0x},
>   reservation-2 = {size = 100Kb, type = code, socketmask 
> = 0x}
>   }
>   
>   task group 2 = {
>   reservation-1 = {size = 80Kb, type = data, socketmask = 
> 0x},
>   reservation-3 = {size = 200Kb, type = code, socketmask 
> = 0x}
>   }
> 
> Task group 1 and task group 2 share reservation-1.

Because there is only size but not CBM position info, I guess for
different reservations they will not overlap each other, right?

Personally I like this way of exposing minimal information to userspace.
I can think it working well except for one concern of losing flexibility:

For instance, there is a box for which the full CBM is 0xf. After
cache reservation creating/freeing for a while we then have reservations:

reservation1: 0xf
reservation2: 0x00ff0

Now people want to request a reservation whose size is 0xff, so what
will the kernel do at this point? It could just return an error, or do some
moving/merging (e.g. for reservation2: 0x00ff0 => 0x0ff00) and then
satisfy the request. But I don't know if the moving/merging will cause
a delay for tasks that are using it.

Thanks,
Chao
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFD] CAT user space interface revisited

2015-11-24 Thread Chao Peng
On Wed, Nov 18, 2015 at 10:01:54PM -0200, Marcelo Tosatti wrote:
> > tglx
> 
> Again: you don't need to look into the MSR table and relate it 
> to tasks if you store the data as:
> 
>   task group 1 = {
>   reservation-1 = {size = 80Kb, type = data, socketmask = 
> 0x},
>   reservation-2 = {size = 100Kb, type = code, socketmask 
> = 0x}
>   }
>   
>   task group 2 = {
>   reservation-1 = {size = 80Kb, type = data, socketmask = 
> 0x},
>   reservation-3 = {size = 200Kb, type = code, socketmask 
> = 0x}
>   }
> 
> Task group 1 and task group 2 share reservation-1.

Because there is only size but not CBM position info, I guess for
different reservations they will not overlap each other, right?

Personally I like this way of exposing minimal information to userspace.
I can think it working well except for one concern of losing flexibility:

For instance, there is a box for which the full CBM is 0xf. After
cache reservation creating/freeing for a while we then have reservations:

reservation1: 0xf
reservation2: 0x00ff0

Now people want to request a reservation whose size is 0xff, so what
will the kernel do at this point? It could just return an error, or do some
moving/merging (e.g. for reservation2: 0x00ff0 => 0x0ff00) and then
satisfy the request. But I don't know if the moving/merging will cause
a delay for tasks that are using it.

Thanks,
Chao
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFD] CAT user space interface revisited

2015-11-23 Thread Chao Peng
On Wed, Nov 18, 2015 at 07:25:03PM +0100, Thomas Gleixner wrote:
> 
> Let's look at partitioning itself. We have two options:
> 
>1) Per task partitioning
> 
>2) Per CPU partitioning
> 
> So far we only talked about #1, but I think that #2 has a value as
> well. Let me give you a simple example.

I would second this. In practice per CPU partitioning is useful for
realtime as well. And I can see three possible solutions:

 1) What you suggested below, to address both problems in one
framework. But I wonder if it would end up too complex.

 2) Achieve per CPU partitioning with per task partitioning. For
example, if current CAT patch can solve the kernel threads
problem, together with CPU pinning, we then can set a same CBM
for all the tasks/kernel threads run on an isolated CPU. 

 3) I wonder if it is feasible to separate the two requirements? For
example, divide the work into three components: rdt-base,
per task interface (current cgroup interface/IOCTL or something)
and per CPU interface. The two interfaces are exclusive and
selected at build time. One reason to reject this option would be
that even with per CPU partitioning, we still need per task partitioning;
in that case we will go to option 1) again.

Thanks,
Chao
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFD] CAT user space interface revisited

2015-11-23 Thread Chao Peng
On Wed, Nov 18, 2015 at 07:25:03PM +0100, Thomas Gleixner wrote:
> 
> Let's look at partitioning itself. We have two options:
> 
>1) Per task partitioning
> 
>2) Per CPU partitioning
> 
> So far we only talked about #1, but I think that #2 has a value as
> well. Let me give you a simple example.

I would second this. In practice per CPU partitioning is useful for
realtime as well. And I can see three possible solutions:

 1) What you suggested below, to address both problems in one
framework. But I wonder if it would end up too complex.

 2) Achieve per CPU partitioning with per task partitioning. For
example, if current CAT patch can solve the kernel threads
problem, together with CPU pinning, we then can set a same CBM
for all the tasks/kernel threads run on an isolated CPU. 

 3) I wonder if it is feasible to separate the two requirements? For
example, divide the work into three components: rdt-base,
per task interface (current cgroup interface/IOCTL or something)
and per CPU interface. The two interfaces are exclusive and
selected at build time. One reason to reject this option would be
that even with per CPU partitioning, we still need per task partitioning;
in that case we will go to option 1) again.

Thanks,
Chao
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] KVM: x86: Enable Intel AVX-512 for guest

2014-10-22 Thread Chao Peng
On Wed, Oct 22, 2014 at 12:17:33PM +0200, Paolo Bonzini wrote:
> On 10/22/2014 11:35 AM, Chao Peng wrote:
> > Expose Intel AVX-512 feature bits to guest. Also add checks for
> > xcr0 AVX512 related bits according to spec:
> > http://download-software.intel.com/sites/default/files/managed/71/2e/319433-017.pdf
> > 
> > Signed-off-by: Chao Peng 
> 
> The patch looks good, but you also have to patch QEMU in order to
> save/restore the values of the registers.  IIRC the manual already
> details where the registers are in the XSAVE area, so it should be easy
> to get them in and out.  You can look at the MPX patches for an example.
> 
> In the meanwhile, kernel bits are
> 
> Reviewed-by: Paolo Bonzini 
> 
> Paolo

Thanks Paolo.

QEMU side patch is already sent out to QEMU list in the other thread.
Also accessible from url:
http://lists.nongnu.org/archive/html/qemu-devel/2014-10/msg02681.html

Chao
> 
> > ---
> >  arch/x86/include/asm/xsave.h |1 +
> >  arch/x86/kvm/cpuid.c |3 ++-
> >  arch/x86/kvm/x86.c   |6 ++
> >  arch/x86/kvm/x86.h   |3 ++-
> >  4 files changed, 11 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
> > index 7e7a79a..5fa9770 100644
> > --- a/arch/x86/include/asm/xsave.h
> > +++ b/arch/x86/include/asm/xsave.h
> > @@ -16,6 +16,7 @@
> >  #define XSTATE_Hi16_ZMM0x80
> >  
> >  #define XSTATE_FPSSE   (XSTATE_FP | XSTATE_SSE)
> > +#define XSTATE_AVX512  (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | 
> > XSTATE_Hi16_ZMM)
> >  /* Bit 63 of XCR0 is reserved for future expansion */
> >  #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63)))
> >  
> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > index 976e3a5..20d8321 100644
> > --- a/arch/x86/kvm/cpuid.c
> > +++ b/arch/x86/kvm/cpuid.c
> > @@ -317,7 +317,8 @@ static inline int __do_cpuid_ent(struct 
> > kvm_cpuid_entry2 *entry, u32 function,
> > const u32 kvm_supported_word9_x86_features =
> > F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
> > F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
> > -   F(ADX) | F(SMAP);
> > +   F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
> > +   F(AVX512CD);
> >  
> > /* all calls to cpuid_count() should be made on the same cpu */
> > get_cpu();
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 5430e4b..3d77b88 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -651,6 +651,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, 
> > u64 xcr)
> > if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
> > return 1;
> >  
> > +   if (xcr0 & XSTATE_AVX512) {
> > +   if (!(xcr0 & XSTATE_YMM))
> > +   return 1;
> > +   if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512)
> > +   return 1;
> > +   }
> > kvm_put_guest_xcr0(vcpu);
> > vcpu->arch.xcr0 = xcr0;
> >  
> > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> > index 7cb9c45..cc1d61a 100644
> > --- a/arch/x86/kvm/x86.h
> > +++ b/arch/x86/kvm/x86.h
> > @@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt 
> > *ctxt,
> >  bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
> >  
> >  #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
> > -   | XSTATE_BNDREGS | XSTATE_BNDCSR)
> > +   | XSTATE_BNDREGS | XSTATE_BNDCSR \
> > +   | XSTATE_AVX512)
> >  extern u64 host_xcr0;
> >  
> >  extern u64 kvm_supported_xcr0(void);
> > 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] KVM: x86: Enable Intel AVX-512 for guest

2014-10-22 Thread Chao Peng
Expose Intel AVX-512 feature bits to guest. Also add checks for
xcr0 AVX512 related bits according to spec:
http://download-software.intel.com/sites/default/files/managed/71/2e/319433-017.pdf

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
---
 arch/x86/include/asm/xsave.h |1 +
 arch/x86/kvm/cpuid.c |3 ++-
 arch/x86/kvm/x86.c   |6 ++
 arch/x86/kvm/x86.h   |3 ++-
 4 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 7e7a79a..5fa9770 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -16,6 +16,7 @@
 #define XSTATE_Hi16_ZMM0x80
 
 #define XSTATE_FPSSE   (XSTATE_FP | XSTATE_SSE)
+#define XSTATE_AVX512  (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
 /* Bit 63 of XCR0 is reserved for future expansion */
 #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63)))
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 976e3a5..20d8321 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -317,7 +317,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
const u32 kvm_supported_word9_x86_features =
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
-   F(ADX) | F(SMAP);
+   F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
+   F(AVX512CD);
 
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5430e4b..3d77b88 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -651,6 +651,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 
xcr)
if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
return 1;
 
+   if (xcr0 & XSTATE_AVX512) {
+   if (!(xcr0 & XSTATE_YMM))
+   return 1;
+   if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512)
+   return 1;
+   }
kvm_put_guest_xcr0(vcpu);
vcpu->arch.xcr0 = xcr0;
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 7cb9c45..cc1d61a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt 
*ctxt,
 bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 
 #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
-   | XSTATE_BNDREGS | XSTATE_BNDCSR)
+   | XSTATE_BNDREGS | XSTATE_BNDCSR \
+   | XSTATE_AVX512)
 extern u64 host_xcr0;
 
 extern u64 kvm_supported_xcr0(void);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] KVM: x86: Enable Intel AVX-512 for guest

2014-10-22 Thread Chao Peng
On Wed, Oct 22, 2014 at 12:17:33PM +0200, Paolo Bonzini wrote:
 On 10/22/2014 11:35 AM, Chao Peng wrote:
  Expose Intel AVX-512 feature bits to guest. Also add checks for
  xcr0 AVX512 related bits according to spec:
  http://download-software.intel.com/sites/default/files/managed/71/2e/319433-017.pdf
  
  Signed-off-by: Chao Peng chao.p.p...@linux.intel.com
 
 The patch looks good, but you also have to patch QEMU in order to
 save/restore the values of the registers.  IIRC the manual already
 details where the registers are in the XSAVE area, so it should be easy
 to get them in and out.  You can look at the MPX patches for an example.
 
 In the meanwhile, kernel bits are
 
 Reviewed-by: Paolo Bonzini pbonz...@redhat.com
 
 Paolo

Thanks Paolo.

QEMU side patch is already sent out to QEMU list in the other thread.
Also accessible from url:
http://lists.nongnu.org/archive/html/qemu-devel/2014-10/msg02681.html

Chao
 
  ---
   arch/x86/include/asm/xsave.h |1 +
   arch/x86/kvm/cpuid.c |3 ++-
   arch/x86/kvm/x86.c   |6 ++
   arch/x86/kvm/x86.h   |3 ++-
   4 files changed, 11 insertions(+), 2 deletions(-)
  
  diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
  index 7e7a79a..5fa9770 100644
  --- a/arch/x86/include/asm/xsave.h
  +++ b/arch/x86/include/asm/xsave.h
  @@ -16,6 +16,7 @@
   #define XSTATE_Hi16_ZMM0x80
   
   #define XSTATE_FPSSE   (XSTATE_FP | XSTATE_SSE)
  +#define XSTATE_AVX512  (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | 
  XSTATE_Hi16_ZMM)
   /* Bit 63 of XCR0 is reserved for future expansion */
   #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL  63)))
   
  diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
  index 976e3a5..20d8321 100644
  --- a/arch/x86/kvm/cpuid.c
  +++ b/arch/x86/kvm/cpuid.c
  @@ -317,7 +317,8 @@ static inline int __do_cpuid_ent(struct 
  kvm_cpuid_entry2 *entry, u32 function,
  const u32 kvm_supported_word9_x86_features =
  F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
  F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
  -   F(ADX) | F(SMAP);
  +   F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
  +   F(AVX512CD);
   
  /* all calls to cpuid_count() should be made on the same cpu */
  get_cpu();
  diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
  index 5430e4b..3d77b88 100644
  --- a/arch/x86/kvm/x86.c
  +++ b/arch/x86/kvm/x86.c
  @@ -651,6 +651,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, 
  u64 xcr)
  if ((!(xcr0  XSTATE_BNDREGS)) != (!(xcr0  XSTATE_BNDCSR)))
  return 1;
   
  +   if (xcr0  XSTATE_AVX512) {
  +   if (!(xcr0  XSTATE_YMM))
  +   return 1;
  +   if ((xcr0  XSTATE_AVX512) != XSTATE_AVX512)
  +   return 1;
  +   }
  kvm_put_guest_xcr0(vcpu);
  vcpu-arch.xcr0 = xcr0;
   
  diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
  index 7cb9c45..cc1d61a 100644
  --- a/arch/x86/kvm/x86.h
  +++ b/arch/x86/kvm/x86.h
  @@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt 
  *ctxt,
   bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
   
   #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
  -   | XSTATE_BNDREGS | XSTATE_BNDCSR)
  +   | XSTATE_BNDREGS | XSTATE_BNDCSR \
  +   | XSTATE_AVX512)
   extern u64 host_xcr0;
   
   extern u64 kvm_supported_xcr0(void);
  
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] KVM: x86: Enable Intel AVX-512 for guest

2014-10-22 Thread Chao Peng
Expose Intel AVX-512 feature bits to guest. Also add checks for
xcr0 AVX512 related bits according to spec:
http://download-software.intel.com/sites/default/files/managed/71/2e/319433-017.pdf

Signed-off-by: Chao Peng <chao.p.p...@linux.intel.com>
---
 arch/x86/include/asm/xsave.h |1 +
 arch/x86/kvm/cpuid.c |3 ++-
 arch/x86/kvm/x86.c   |6 ++
 arch/x86/kvm/x86.h   |3 ++-
 4 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 7e7a79a..5fa9770 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -16,6 +16,7 @@
 #define XSTATE_Hi16_ZMM0x80
 
 #define XSTATE_FPSSE   (XSTATE_FP | XSTATE_SSE)
+#define XSTATE_AVX512  (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
 /* Bit 63 of XCR0 is reserved for future expansion */
 #define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL  63)))
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 976e3a5..20d8321 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -317,7 +317,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
const u32 kvm_supported_word9_x86_features =
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
-   F(ADX) | F(SMAP);
+   F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
+   F(AVX512CD);
 
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5430e4b..3d77b88 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -651,6 +651,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 
xcr)
if ((!(xcr0  XSTATE_BNDREGS)) != (!(xcr0  XSTATE_BNDCSR)))
return 1;
 
+   if (xcr0  XSTATE_AVX512) {
+   if (!(xcr0  XSTATE_YMM))
+   return 1;
+   if ((xcr0  XSTATE_AVX512) != XSTATE_AVX512)
+   return 1;
+   }
kvm_put_guest_xcr0(vcpu);
vcpu-arch.xcr0 = xcr0;
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 7cb9c45..cc1d61a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt 
*ctxt,
 bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 
 #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
-   | XSTATE_BNDREGS | XSTATE_BNDCSR)
+   | XSTATE_BNDREGS | XSTATE_BNDCSR \
+   | XSTATE_AVX512)
 extern u64 host_xcr0;
 
 extern u64 kvm_supported_xcr0(void);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/