Hi Kirill,

Something is wrong in this patch. We regularly run CRIU tests on
linux-next, and yesterday I found that a kernel didn't boot. We run these
tests in Travis-CI, and we don't have access to kernel logs. I tried to
reproduce the problem locally, but I failed.

In Travis-CI, we build a kernel, then dump a Travis daemon, boot the new
kernel with the help of kexec, and restore the Travis daemon afterwards.

Here is logs without this patch:
https://travis-ci.org/avagin/linux/jobs/340820418

Here is logs with this patch:
https://travis-ci.org/avagin/linux/jobs/340820584

Thanks,
Andrei

On Sun, Feb 11, 2018 at 04:20:04AM -0800, tip-bot for Jacob Shin wrote:
> Commit-ID:  b4b56015ed1c98cbc9469e35ebbc4373a2844030
> Gitweb:     
> https://git.kernel.org/tip/b4b56015ed1c98cbc9469e35ebbc4373a2844030
> Author:     Kirill A. Shutemov <kirill.shute...@linux.intel.com>
> AuthorDate: Fri, 9 Feb 2018 17:22:28 +0300
> Committer:  Ingo Molnar <mi...@kernel.org>
> CommitDate: Sun, 11 Feb 2018 12:36:19 +0100
> 
> x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G
> 
> This patch addresses a shortcoming in current boot process on machines
> that supports 5-level paging.
> 
> If a bootloader enables 64-bit mode with 4-level paging, we might need to
> switch over to 5-level paging. The switching requires the disabling
> paging. It works fine if kernel itself is loaded below 4G.
> 
> But if the bootloader put the kernel above 4G (not sure if anybody does
> this), we would lose control as soon as paging is disabled, because the
> code becomes unreachable to the CPU.
> 
> This patch implements a trampoline in lower memory to handle this
> situation.
> 
> We only need the memory for a very short time, until the main kernel
> image sets up own page tables.
> 
> We go through the trampoline even if we don't have to: if we're already
> in 5-level paging mode or if we don't need to switch to it. This way the
> trampoline gets tested on every boot.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
> Cc: Andy Lutomirski <l...@amacapital.net>
> Cc: Borislav Petkov <b...@suse.de>
> Cc: Cyrill Gorcunov <gorcu...@openvz.org>
> Cc: Linus Torvalds <torva...@linux-foundation.org>
> Cc: Matthew Wilcox <wi...@infradead.org>
> Cc: Peter Zijlstra <pet...@infradead.org>
> Cc: Thomas Gleixner <t...@linutronix.de>
> Cc: linux...@kvack.org
> Link: 
> http://lkml.kernel.org/r/20180209142228.21231-5-kirill.shute...@linux.intel.com
> Signed-off-by: Ingo Molnar <mi...@kernel.org>
> ---
>  arch/x86/boot/compressed/head_64.S | 127 
> ++++++++++++++++++++++++++-----------
>  1 file changed, 89 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/x86/boot/compressed/head_64.S 
> b/arch/x86/boot/compressed/head_64.S
> index af9ffbd..70b30f2 100644
> --- a/arch/x86/boot/compressed/head_64.S
> +++ b/arch/x86/boot/compressed/head_64.S
> @@ -307,13 +307,34 @@ ENTRY(startup_64)
>  
>       /*
>        * At this point we are in long mode with 4-level paging enabled,
> -      * but we want to enable 5-level paging.
> +      * but we might want to enable 5-level paging or vice versa.
>        *
> -      * The problem is that we cannot do it directly. Setting LA57 in
> -      * long mode would trigger #GP. So we need to switch off long mode
> -      * first.
> +      * The problem is that we cannot do it directly. Setting or clearing
> +      * CR4.LA57 in long mode would trigger #GP. So we need to switch off
> +      * long mode and paging first.
> +      *
> +      * We also need a trampoline in lower memory to switch over from
> +      * 4- to 5-level paging for cases when the bootloader puts the kernel
> +      * above 4G, but didn't enable 5-level paging for us.
> +      *
> +      * The same trampoline can be used to switch from 5- to 4-level paging
> +      * mode, like when starting 4-level paging kernel via kexec() when
> +      * original kernel worked in 5-level paging mode.
> +      *
> +      * For the trampoline, we need the top page table to reside in lower
> +      * memory as we don't have a way to load 64-bit values into CR3 in
> +      * 32-bit mode.
> +      *
> +      * We go though the trampoline even if we don't have to: if we're
> +      * already in a desired paging mode. This way the trampoline code gets
> +      * tested on every boot.
>        */
>  
> +     /* Make sure we have GDT with 32-bit code segment */
> +     leaq    gdt(%rip), %rax
> +     movl    %eax, gdt64+2(%rip)
> +     lgdt    gdt64(%rip)
> +
>       /*
>        * paging_prepare() sets up the trampoline and checks if we need to
>        * enable 5-level paging.
> @@ -331,30 +352,20 @@ ENTRY(startup_64)
>       /* Save the trampoline address in RCX */
>       movq    %rax, %rcx
>  
> -     /* Check if we need to enable 5-level paging */
> -     cmpq    $0, %rdx
> -     jz      lvl5
> -
> -     /* Clear additional page table */
> -     leaq    lvl5_pgtable(%rbx), %rdi
> -     xorq    %rax, %rax
> -     movq    $(PAGE_SIZE/8), %rcx
> -     rep     stosq
> -
>       /*
> -      * Setup current CR3 as the first and only entry in a new top level
> -      * page table.
> +      * Load the address of trampoline_return() into RDI.
> +      * It will be used by the trampoline to return to the main code.
>        */
> -     movq    %cr3, %rdi
> -     leaq    0x7 (%rdi), %rax
> -     movq    %rax, lvl5_pgtable(%rbx)
> +     leaq    trampoline_return(%rip), %rdi
>  
>       /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
>       pushq   $__KERNEL32_CS
> -     leaq    compatible_mode(%rip), %rax
> +     leaq    TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax
>       pushq   %rax
>       lretq
> -lvl5:
> +trampoline_return:
> +     /* Restore the stack, the 32-bit trampoline uses its own stack */
> +     leaq    boot_stack_end(%rbx), %rsp
>  
>       /*
>        * cleanup_trampoline() would restore trampoline memory.
> @@ -503,45 +514,82 @@ relocated:
>       jmp     *%rax
>  
>       .code32
> +/*
> + * This is the 32-bit trampoline that will be copied over to low memory.
> + *
> + * RDI contains the return address (might be above 4G).
> + * ECX contains the base address of the trampoline memory.
> + * Non zero RDX on return means we need to enable 5-level paging.
> + */
>  ENTRY(trampoline_32bit_src)
> -compatible_mode:
>       /* Set up data and stack segments */
>       movl    $__KERNEL_DS, %eax
>       movl    %eax, %ds
>       movl    %eax, %ss
>  
> +     /* Setup new stack */
> +     leal    TRAMPOLINE_32BIT_STACK_END(%ecx), %esp
> +
>       /* Disable paging */
>       movl    %cr0, %eax
>       btrl    $X86_CR0_PG_BIT, %eax
>       movl    %eax, %cr0
>  
> -     /* Point CR3 to 5-level paging */
> -     leal    lvl5_pgtable(%ebx), %eax
> -     movl    %eax, %cr3
> +     /* Check what paging mode we want to be in after the trampoline */
> +     cmpl    $0, %edx
> +     jz      1f
>  
> -     /* Enable PAE and LA57 mode */
> +     /* We want 5-level paging: don't touch CR3 if it already points to 
> 5-level page tables */
>       movl    %cr4, %eax
> -     orl     $(X86_CR4_PAE | X86_CR4_LA57), %eax
> +     testl   $X86_CR4_LA57, %eax
> +     jnz     3f
> +     jmp     2f
> +1:
> +     /* We want 4-level paging: don't touch CR3 if it already points to 
> 4-level page tables */
> +     movl    %cr4, %eax
> +     testl   $X86_CR4_LA57, %eax
> +     jz      3f
> +2:
> +     /* Point CR3 to the trampoline's new top level page table */
> +     leal    TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
> +     movl    %eax, %cr3
> +3:
> +     /* Enable PAE and LA57 (if required) paging modes */
> +     movl    $X86_CR4_PAE, %eax
> +     cmpl    $0, %edx
> +     jz      1f
> +     orl     $X86_CR4_LA57, %eax
> +1:
>       movl    %eax, %cr4
>  
> -     /* Calculate address we are running at */
> -     call    1f
> -1:   popl    %edi
> -     subl    $1b, %edi
> +     /* Calculate address of paging_enabled() once we are executing in the 
> trampoline */
> +     leal    paging_enabled - trampoline_32bit_src + 
> TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
>  
> -     /* Prepare stack for far return to Long Mode */
> +     /* Prepare the stack for far return to Long Mode */
>       pushl   $__KERNEL_CS
> -     leal    lvl5(%edi), %eax
> -     push    %eax
> +     pushl   %eax
>  
> -     /* Enable paging back */
> +     /* Enable paging again */
>       movl    $(X86_CR0_PG | X86_CR0_PE), %eax
>       movl    %eax, %cr0
>  
>       lret
>  
> +     .code64
> +paging_enabled:
> +     /* Return from the trampoline */
> +     jmp     *%rdi
> +
> +     /*
> +         * The trampoline code has a size limit.
> +         * Make sure we fail to compile if the trampoline code grows
> +         * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
> +      */
> +     .org    trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
> +
> +     .code32
>  no_longmode:
> -     /* This isn't an x86-64 CPU so hang */
> +     /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue 
> */
>  1:
>       hlt
>       jmp     1b
> @@ -549,6 +597,11 @@ no_longmode:
>  #include "../../kernel/verify_cpu.S"
>  
>       .data
> +gdt64:
> +     .word   gdt_end - gdt
> +     .long   0
> +     .word   0
> +     .quad   0
>  gdt:
>       .word   gdt_end - gdt
>       .long   gdt
> @@ -602,8 +655,6 @@ trampoline_save:
>       .balign 4096
>  pgtable:
>       .fill BOOT_PGT_SIZE, 1, 0
> -lvl5_pgtable:
> -     .fill PAGE_SIZE, 1, 0
>  
>       .global pgtable_trampoline
>  pgtable_trampoline:

Reply via email to