[PATCH 4/4] x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G

2018-03-12 Thread Kirill A. Shutemov
This patch addresses a shortcoming in the current boot process on machines
that support 5-level paging.

If a bootloader enables 64-bit mode with 4-level paging, we might need to
switch over to 5-level paging. The switching requires disabling
paging. It works fine if the kernel itself is loaded below 4G.

But if the bootloader puts the kernel above 4G (not sure if anybody does
this), we would lose control as soon as paging is disabled, because the
code becomes unreachable to the CPU.

This patch implements a trampoline in lower memory to handle this
situation.

We only need the memory for a very short time, until the main kernel
image sets up its own page tables.

We go through the trampoline even if we don't have to: if we're already
in 5-level paging mode or if we don't need to switch to it. This way the
trampoline gets tested on every boot.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/boot/compressed/head_64.S | 69 +-
 1 file changed, 53 insertions(+), 16 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S 
b/arch/x86/boot/compressed/head_64.S
index 836ed319e995..33d7e72f3943 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -307,11 +307,27 @@ ENTRY(startup_64)
 
/*
 * At this point we are in long mode with 4-level paging enabled,
-* but we want to enable 5-level paging.
+* but we might want to enable 5-level paging or vice versa.
 *
-* The problem is that we cannot do it directly. Setting LA57 in
-* long mode would trigger #GP. So we need to switch off long mode
-* first.
+* The problem is that we cannot do it directly. Setting or clearing
+* CR4.LA57 in long mode would trigger #GP. So we need to switch off
+* long mode and paging first.
+*
+* We also need a trampoline in lower memory to switch over from
+* 4- to 5-level paging for cases when the bootloader puts the kernel
+* above 4G, but didn't enable 5-level paging for us.
+*
+* The same trampoline can be used to switch from 5- to 4-level paging
+* mode, like when starting 4-level paging kernel via kexec() when
+* original kernel worked in 5-level paging mode.
+*
+* For the trampoline, we need the top page table to reside in lower
+* memory as we don't have a way to load 64-bit values into CR3 in
+* 32-bit mode.
+*
+* We go though the trampoline even if we don't have to: if we're
+* already in a desired paging mode. This way the trampoline code gets
+* tested on every boot.
 */
 
/* Make sure we have GDT with 32-bit code segment */
@@ -336,13 +352,18 @@ ENTRY(startup_64)
/* Save the trampoline address in RCX */
movq%rax, %rcx
 
+   /*
+* Load the address of trampoline_return() into RDI.
+* It will be used by the trampoline to return to the main code.
+*/
+   leaqtrampoline_return(%rip), %rdi
 
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq   $__KERNEL32_CS
-   leaqcompatible_mode(%rip), %rax
+   leaqTRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax
pushq   %rax
lretq
-lvl5:
+trampoline_return:
/* Restore the stack, the 32-bit trampoline uses its own stack */
leaqboot_stack_end(%rbx), %rsp
 
@@ -492,8 +513,14 @@ relocated:
jmp *%rax
 
.code32
+/*
+ * This is the 32-bit trampoline that will be copied over to low memory.
+ *
+ * RDI contains the return address (might be above 4G).
+ * ECX contains the base address of the trampoline memory.
+ * Non zero RDX on return means we need to enable 5-level paging.
+ */
 ENTRY(trampoline_32bit_src)
-compatible_mode:
/* Set up data and stack segments */
movl$__KERNEL_DS, %eax
movl%eax, %ds
@@ -534,24 +561,34 @@ compatible_mode:
 1:
movl%eax, %cr4
 
-   /* Calculate address we are running at */
-   call1f
-1: popl%edi
-   subl$1b, %edi
+   /* Calculate address of paging_enabled() once we are executing in the 
trampoline */
+   lealpaging_enabled - trampoline_32bit_src + 
TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
 
-   /* Prepare stack for far return to Long Mode */
+   /* Prepare the stack for far return to Long Mode */
pushl   $__KERNEL_CS
-   leallvl5(%edi), %eax
-   push%eax
+   pushl   %eax
 
-   /* Enable paging back */
+   /* Enable paging again */
movl$(X86_CR0_PG | X86_CR0_PE), %eax
movl%eax, %cr0
 
lret
 
+   .code64
+paging_enabled:
+   /* Return from the trampoline */
+   jmp *%rdi
+
+   /*
+ * The trampoline code has a size limit.
+ * Make sure we fail to compile if the trampoline code grows
+ * beyond TRAMPOLINE_32BIT_CODE_SIZE by

[PATCH 4/4] x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G

2017-11-01 Thread Kirill A. Shutemov
This patch addresses a shortcoming in the current boot process on machines
that support 5-level paging.

If the bootloader enables 64-bit mode with 4-level paging, we need to
switch over to 5-level paging. The switching requires disabling paging.
It works fine if the kernel itself is loaded below 4G.

If the bootloader puts the kernel above 4G (not sure if anybody does this),
we would lose control as soon as paging is disabled, as the code becomes
unreachable.

This patch implements a trampoline in lower memory to handle this
situation.

We only need the memory for a very short time, until the main kernel image
sets up its own page tables.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/boot/compressed/head_64.S | 72 --
 1 file changed, 45 insertions(+), 27 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S 
b/arch/x86/boot/compressed/head_64.S
index 4d1555b39de0..e8331f5a77f4 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include "pagetable.h"
 
 /*
  * Locally defined symbols should be marked hidden:
@@ -288,6 +289,19 @@ ENTRY(startup_64)
leaqboot_stack_end(%rbx), %rsp
 
 #ifdef CONFIG_X86_5LEVEL
+/*
+ * We need trampoline in lower memory switch from 4- to 5-level paging for
+ * cases when bootloader put kernel above 4G, but didn't enable 5-level paging
+ * for us.
+ *
+ * We also have to have top page table in lower memory as we don't have a way
+ * to load 64-bit value into CR3 from 32-bit mode. We only need 8-bytes there
+ * as we only use the very first entry of the page table, but we allocate whole
+ * page anyway. We cannot have the code in the same because, there's hazard
+ * that a CPU would read page table speculatively and get confused seeing
+ * garbage.
+ */
+
/*
 * Check if we need to enable 5-level paging.
 * RSI holds real mode data and need to be preserved across
@@ -309,8 +323,8 @@ ENTRY(startup_64)
 * long mode would trigger #GP. So we need to switch off long mode
 * first.
 *
-* NOTE: This is not going to work if bootloader put us above 4G
-* limit.
+* We use trampoline in lower memory to handle situation when
+* bootloader put the kernel image above 4G.
 *
 * The first step is go into compatibility mode.
 */
@@ -327,26 +341,20 @@ ENTRY(startup_64)
popq%rsi
movq%rax, %rcx
 
-   /* Clear additional page table */
-   leaqlvl5_pgtable(%rbx), %rdi
-   xorq%rax, %rax
-   movq$(PAGE_SIZE/8), %rcx
-   rep stosq
-
/*
-* Setup current CR3 as the first and only entry in a new top level
-* page table.
+* Load address of lvl5 into RDI.
+* It will be used to return address from trampoline.
 */
-   movq%cr3, %rdi
-   leaq0x7 (%rdi), %rax
-   movq%rax, lvl5_pgtable(%rbx)
+   leaqlvl5(%rip), %rdi
 
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq   $__KERNEL32_CS
-   leaqcompatible_mode(%rip), %rax
+   leaqLVL5_TRAMPOLINE_CODE_OFF(%rcx), %rax
pushq   %rax
lretq
 lvl5:
+   /* Restore stack, 32-bit trampoline uses own stack */
+   leaqboot_stack_end(%rbx), %rsp
 #endif
 
/* Zero EFLAGS */
@@ -484,22 +492,30 @@ relocated:
  */
jmp *%rax
 
-   .code32
 #ifdef CONFIG_X86_5LEVEL
+   .code32
+/*
+ * This is 32-bit trampoline that will be copied over to low memory.
+ *
+ * RDI contains return address (might be above 4G).
+ * ECX contains the base address of trampoline memory.
+ */
 ENTRY(lvl5_trampoline_src)
-compatible_mode:
/* Setup data and stack segments */
movl$__KERNEL_DS, %eax
movl%eax, %ds
movl%eax, %ss
 
+   /* Setup new stack at the end of trampoline memory */
+   lealLVL5_TRAMPOLINE_STACK_END (%ecx), %esp
+
/* Disable paging */
movl%cr0, %eax
btrl$X86_CR0_PG_BIT, %eax
movl%eax, %cr0
 
/* Point CR3 to 5-level paging */
-   leallvl5_pgtable(%ebx), %eax
+   lealLVL5_TRAMPOLINE_PGTABLE_OFF (%ecx), %eax
movl%eax, %cr3
 
/* Enable PAE and LA57 mode */
@@ -507,23 +523,29 @@ compatible_mode:
orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
movl%eax, %cr4
 
-   /* Calculate address we are running at */
-   call1f
-1: popl%edi
-   subl$1b, %edi
+   /* Calculate address of lvl5_enabled once we are in trampoline */
+   leallvl5_enabled - lvl5_trampoline_src + LVL5_TRAMPOLINE_CODE_OFF 
(%ecx), %eax
 
/* Prepare stack for far return to Long Mode */
pushl   $__KERNEL_CS
-   leallvl5(%edi), %eax
-   push%eax
+   pushl   %eax
 
/* Enable paging back */
movl$(X86_CR0_PG | X86_CR0_PE), %eax
  

[PATCH 4/4] x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G

2017-10-20 Thread Kirill A. Shutemov
This patch addresses a shortcoming in the current boot process on machines
that support 5-level paging.

If the bootloader enables 64-bit mode with 4-level paging, we need to
switch over to 5-level paging. The switching requires disabling paging.
It works fine if the kernel itself is loaded below 4G.

If the bootloader puts the kernel above 4G (not sure if anybody does this),
we would lose control as soon as paging is disabled, as the code becomes
unreachable.

This patch implements a trampoline in lower memory to handle this
situation.

We only need the memory for a very short time, until the main kernel image
sets up its own page tables.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/boot/compressed/head_64.S | 72 --
 1 file changed, 45 insertions(+), 27 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S 
b/arch/x86/boot/compressed/head_64.S
index 4d1555b39de0..e8331f5a77f4 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include "pagetable.h"
 
 /*
  * Locally defined symbols should be marked hidden:
@@ -288,6 +289,19 @@ ENTRY(startup_64)
leaqboot_stack_end(%rbx), %rsp
 
 #ifdef CONFIG_X86_5LEVEL
+/*
+ * We need trampoline in lower memory switch from 4- to 5-level paging for
+ * cases when bootloader put kernel above 4G, but didn't enable 5-level paging
+ * for us.
+ *
+ * We also have to have top page table in lower memory as we don't have a way
+ * to load 64-bit value into CR3 from 32-bit mode. We only need 8-bytes there
+ * as we only use the very first entry of the page table, but we allocate whole
+ * page anyway. We cannot have the code in the same because, there's hazard
+ * that a CPU would read page table speculatively and get confused seeing
+ * garbage.
+ */
+
/*
 * Check if we need to enable 5-level paging.
 * RSI holds real mode data and need to be preserved across
@@ -309,8 +323,8 @@ ENTRY(startup_64)
 * long mode would trigger #GP. So we need to switch off long mode
 * first.
 *
-* NOTE: This is not going to work if bootloader put us above 4G
-* limit.
+* We use trampoline in lower memory to handle situation when
+* bootloader put the kernel image above 4G.
 *
 * The first step is go into compatibility mode.
 */
@@ -327,26 +341,20 @@ ENTRY(startup_64)
popq%rsi
movq%rax, %rcx
 
-   /* Clear additional page table */
-   leaqlvl5_pgtable(%rbx), %rdi
-   xorq%rax, %rax
-   movq$(PAGE_SIZE/8), %rcx
-   rep stosq
-
/*
-* Setup current CR3 as the first and only entry in a new top level
-* page table.
+* Load address of lvl5 into RDI.
+* It will be used to return address from trampoline.
 */
-   movq%cr3, %rdi
-   leaq0x7 (%rdi), %rax
-   movq%rax, lvl5_pgtable(%rbx)
+   leaqlvl5(%rip), %rdi
 
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq   $__KERNEL32_CS
-   leaqcompatible_mode(%rip), %rax
+   leaqLVL5_TRAMPOLINE_CODE_OFF(%rcx), %rax
pushq   %rax
lretq
 lvl5:
+   /* Restore stack, 32-bit trampoline uses own stack */
+   leaqboot_stack_end(%rbx), %rsp
 #endif
 
/* Zero EFLAGS */
@@ -484,22 +492,30 @@ relocated:
  */
jmp *%rax
 
-   .code32
 #ifdef CONFIG_X86_5LEVEL
+   .code32
+/*
+ * This is 32-bit trampoline that will be copied over to low memory.
+ *
+ * RDI contains return address (might be above 4G).
+ * ECX contains the base address of trampoline memory.
+ */
 ENTRY(lvl5_trampoline_src)
-compatible_mode:
/* Setup data and stack segments */
movl$__KERNEL_DS, %eax
movl%eax, %ds
movl%eax, %ss
 
+   /* Setup new stack at the end of trampoline memory */
+   lealLVL5_TRAMPOLINE_STACK_END (%ecx), %esp
+
/* Disable paging */
movl%cr0, %eax
btrl$X86_CR0_PG_BIT, %eax
movl%eax, %cr0
 
/* Point CR3 to 5-level paging */
-   leallvl5_pgtable(%ebx), %eax
+   lealLVL5_TRAMPOLINE_PGTABLE_OFF (%ecx), %eax
movl%eax, %cr3
 
/* Enable PAE and LA57 mode */
@@ -507,23 +523,29 @@ compatible_mode:
orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
movl%eax, %cr4
 
-   /* Calculate address we are running at */
-   call1f
-1: popl%edi
-   subl$1b, %edi
+   /* Calculate address of lvl5_enabled once we are in trampoline */
+   leallvl5_enabled - lvl5_trampoline_src + LVL5_TRAMPOLINE_CODE_OFF 
(%ecx), %eax
 
/* Prepare stack for far return to Long Mode */
pushl   $__KERNEL_CS
-   leallvl5(%edi), %eax
-   push%eax
+   pushl   %eax
 
/* Enable paging back */
movl$(X86_CR0_PG | X86_CR0_PE), %eax