This patch adds boot-time detection of 5-level paging, adjusts the
virtual memory layout accordingly, and folds the p4d page table level
if needed.

We have to make X86_5LEVEL dependent on SPARSEMEM_VMEMMAP: a
!SPARSEMEM_VMEMMAP configuration doesn't work well with a variable
MAX_PHYSMEM_BITS.
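
For reference, the boot-time detection boils down to a CPUID check for
the LA57 bit (leaf 7, subleaf 0, ECX bit 16), which is what the new
check_la57_support() below does. A minimal userspace sketch of the same
check (illustration only, not kernel code; it uses GCC's <cpuid.h>
helpers instead of the kernel's native_cpuid_*()):

  #include <cpuid.h>
  #include <stdio.h>

  /* LA57 is enumerated in CPUID.(EAX=7, ECX=0):ECX bit 16. */
  static int cpu_has_la57(void)
  {
          unsigned int eax, ebx, ecx, edx;

          if (__get_cpuid_max(0, NULL) < 7)
                  return 0;

          __cpuid_count(7, 0, eax, ebx, ecx, edx);
          return !!(ecx & (1U << 16));
  }

  int main(void)
  {
          printf("LA57 (5-level paging): %s\n",
                 cpu_has_la57() ? "supported" : "not supported");
          return 0;
  }

Note that this only reports whether the CPU supports LA57; the
decompressor instead looks at CR4.LA57 to see whether 5-level paging
has actually been enabled.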

Signed-off-by: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
---
 arch/x86/Kconfig                        |  1 +
 arch/x86/boot/compressed/kaslr.c        | 13 +++++--
 arch/x86/entry/entry_64.S               | 12 +++++++
 arch/x86/include/asm/page_64_types.h    | 13 +++----
 arch/x86/include/asm/pgtable_64_types.h | 35 +++++++++++--------
 arch/x86/include/asm/processor.h        |  2 +-
 arch/x86/include/asm/sparsemem.h        |  9 ++---
 arch/x86/kernel/head64.c                | 60 ++++++++++++++++++++++++++-------
 arch/x86/kernel/head_64.S               | 18 ++++++----
 arch/x86/kernel/setup.c                 |  5 ++-
 arch/x86/mm/dump_pagetables.c           |  8 +++--
 arch/x86/mm/kaslr.c                     | 13 ++++---
 12 files changed, 129 insertions(+), 60 deletions(-)
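
Reviewer note (not part of the patch): the entry_64.S hunk replaces the
compile-time shl/sar pair with a runtime choice between 48-bit and
57-bit sign extension for the canonical-address check. A minimal C
illustration of the equivalent logic (the helper names and the test
address below are made up for this sketch):

  #include <stdint.h>
  #include <stdio.h>

  /*
   * Mirror of the shl/sar pair: sign-extend from the top implemented
   * virtual-address bit.  A canonical address is left unchanged, a
   * non-canonical one is not.  va_bits is 48 with 4-level paging and
   * 57 with 5-level paging.
   */
  static uint64_t sign_extend_va(uint64_t addr, int va_bits)
  {
          int shift = 64 - va_bits;

          return (uint64_t)(((int64_t)(addr << shift)) >> shift);
  }

  static int is_canonical(uint64_t addr, int va_bits)
  {
          return sign_extend_va(addr, va_bits) == addr;
  }

  int main(void)
  {
          /* First non-canonical address under 48-bit virtual addressing. */
          uint64_t addr = 0x0000800000000000ULL;

          printf("48-bit: %s, 57-bit: %s\n",
                 is_canonical(addr, 48) ? "canonical" : "non-canonical",
                 is_canonical(addr, 57) ? "canonical" : "non-canonical");
          return 0;
  }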

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2c9c4899d9ff..ac3358bb7bd2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1401,6 +1401,7 @@ config X86_PAE
 config X86_5LEVEL
        bool "Enable 5-level page tables support"
        depends on X86_64
+       depends on SPARSEMEM_VMEMMAP
        ---help---
          5-level paging enables access to larger address space:
          upto 128 PiB of virtual address space and 4 PiB of
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index f0f53817a8d2..e37bcfb9f432 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -44,8 +44,9 @@
 #include <linux/decompress/mm.h>
 
 #ifdef CONFIG_X86_5LEVEL
-unsigned int pgdir_shift __read_mostly = 48;
-unsigned int ptrs_per_p4d __read_mostly = 512;
+unsigned int p4d_folded __read_mostly = 1;
+unsigned int pgdir_shift __read_mostly = 39;
+unsigned int ptrs_per_p4d __read_mostly = 1;
 #endif
 
 extern unsigned long get_cmd_line_ptr(void);
@@ -642,6 +643,14 @@ void choose_random_location(unsigned long input,
                return;
        }
 
+#ifdef CONFIG_X86_5LEVEL
+       if (__read_cr4() & X86_CR4_LA57) {
+               p4d_folded = 0;
+               pgdir_shift = 48;
+               ptrs_per_p4d = 512;
+       }
+#endif
+
        boot_params->hdr.loadflags |= KASLR_FLAG;
 
        /* Prepare to add new identity pagetables on demand. */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index daf8936d0628..077e8b45784c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -273,8 +273,20 @@ return_from_SYSCALL_64:
         * Change top bits to match most significant bit (47th or 56th bit
         * depending on paging mode) in the address.
         */
+#ifdef CONFIG_X86_5LEVEL
+       testl   $1, p4d_folded(%rip)
+       jnz     1f
+       shl     $(64 - 57), %rcx
+       sar     $(64 - 57), %rcx
+       jmp     2f
+1:
+       shl     $(64 - 48), %rcx
+       sar     $(64 - 48), %rcx
+2:
+#else
        shl     $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
        sar     $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+#endif
 
        /* If this changed %rcx, it was not canonical */
        cmpq    %rcx, %r11
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 0126d6bc2eb1..26056ef366b8 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -36,24 +36,21 @@
  * hypervisor to fit.  Choosing 16 slots here is arbitrary, but it's
  * what Xen requires.
  */
-#ifdef CONFIG_X86_5LEVEL
-#define __PAGE_OFFSET_BASE      _AC(0xff10000000000000, UL)
-#else
-#define __PAGE_OFFSET_BASE      _AC(0xffff880000000000, UL)
-#endif
+#define __PAGE_OFFSET_BASE57   _AC(0xff10000000000000, UL)
+#define __PAGE_OFFSET_BASE48   _AC(0xffff880000000000, UL)
 
 #if defined(CONFIG_RANDOMIZE_MEMORY) || defined(CONFIG_X86_5LEVEL)
 #define __PAGE_OFFSET           page_offset_base
 #else
-#define __PAGE_OFFSET           __PAGE_OFFSET_BASE
+#define __PAGE_OFFSET           __PAGE_OFFSET_BASE48
 #endif /* CONFIG_RANDOMIZE_MEMORY */
 
 #define __START_KERNEL_map     _AC(0xffffffff80000000, UL)
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
 #ifdef CONFIG_X86_5LEVEL
-#define __PHYSICAL_MASK_SHIFT  52
-#define __VIRTUAL_MASK_SHIFT   56
+#define __PHYSICAL_MASK_SHIFT  (p4d_folded ? 52 : 46)
+#define __VIRTUAL_MASK_SHIFT   (p4d_folded ? 47 : 56)
 #else
 #define __PHYSICAL_MASK_SHIFT  46
 #define __VIRTUAL_MASK_SHIFT   47
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index a5338b0936ad..1958a50d79cf 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,7 +20,7 @@ typedef unsigned long pgprotval_t;
 typedef struct { pteval_t pte; } pte_t;
 
 #ifdef CONFIG_X86_5LEVEL
-#define p4d_folded 0
+extern unsigned int p4d_folded;
 #else
 #define p4d_folded 1
 #endif
@@ -86,24 +86,31 @@ extern unsigned int ptrs_per_p4d;
 #define PGDIR_MASK     (~(PGDIR_SIZE - 1))
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define MAXMEM         _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#ifdef CONFIG_X86_5LEVEL
-#define VMALLOC_SIZE_TB _AC(16384, UL)
-#define __VMALLOC_BASE _AC(0xff92000000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
-#else
-#define VMALLOC_SIZE_TB        _AC(32, UL)
-#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
-#endif
+#define MAXMEM         (1UL << MAX_PHYSMEM_BITS)
+
+#ifndef __ASSEMBLY__
+#define __VMALLOC_BASE48       0xffffc90000000000
+#define __VMALLOC_BASE57       0xff92000000000000
+
+#define VMALLOC_SIZE_TB48      32UL
+#define VMALLOC_SIZE_TB57      16384UL
+
+#define __VMEMMAP_BASE48       0xffffea0000000000
+#define __VMEMMAP_BASE57       0xffd4000000000000
+
 #if defined(CONFIG_RANDOMIZE_MEMORY) || defined(CONFIG_X86_5LEVEL)
 #define VMALLOC_START  vmalloc_base
+#define VMALLOC_SIZE_TB        (!p4d_folded ? VMALLOC_SIZE_TB57 : VMALLOC_SIZE_TB48)
 #define VMEMMAP_START  vmemmap_base
 #else
-#define VMALLOC_START  __VMALLOC_BASE
-#define VMEMMAP_START  __VMEMMAP_BASE
+#define VMALLOC_START  __VMALLOC_BASE48
+#define VMALLOC_SIZE_TB        VMALLOC_SIZE_TB48
+#define VMEMMAP_START  __VMEMMAP_BASE48
 #endif /* CONFIG_RANDOMIZE_MEMORY */
-#define VMALLOC_END    (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+
+#define VMALLOC_END    (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
+#endif
+
 #define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 /* The module sections ends with the start of the fixmap */
 #define MODULES_END   __fix_to_virt(__end_of_fixed_addresses + 1)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c1352771b2f6..cb2958b88de5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -871,7 +871,7 @@ static inline void spin_lock_prefetch(const void *x)
                                        IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 
 #define STACK_TOP              TASK_SIZE_LOW
-#define STACK_TOP_MAX          TASK_SIZE_MAX
+#define STACK_TOP_MAX          (!p4d_folded ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW)
 
 #define INIT_THREAD  {                                         \
        .sp0                    = TOP_OF_INIT_STACK,            \
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 1f5bee2c202f..ba67afd870b7 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -26,13 +26,8 @@
 # endif
 #else /* CONFIG_X86_32 */
 # define SECTION_SIZE_BITS     27 /* matt - 128 is convenient right now */
-# ifdef CONFIG_X86_5LEVEL
-#  define MAX_PHYSADDR_BITS    52
-#  define MAX_PHYSMEM_BITS     52
-# else
-#  define MAX_PHYSADDR_BITS    44
-#  define MAX_PHYSMEM_BITS     46
-# endif
+# define MAX_PHYSADDR_BITS     (p4d_folded ? 44 : 52)
+# define MAX_PHYSMEM_BITS      (p4d_folded ? 46 : 52)
 #endif
 
 #endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index f05ea8d6b0fa..5a2029114fb7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,18 +39,18 @@ static unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
 
 #ifdef CONFIG_X86_5LEVEL
-unsigned int pgdir_shift __read_mostly = 48;
+unsigned int pgdir_shift __read_mostly = 39;
 EXPORT_SYMBOL(pgdir_shift);
-unsigned int ptrs_per_p4d __read_mostly = 512;
+unsigned int ptrs_per_p4d __read_mostly = 1;
 EXPORT_SYMBOL(ptrs_per_p4d);
 #endif
 
 #if defined(CONFIG_RANDOMIZE_MEMORY) || defined(CONFIG_X86_5LEVEL)
-unsigned long page_offset_base __read_mostly = __PAGE_OFFSET_BASE;
+unsigned long page_offset_base __read_mostly = __PAGE_OFFSET_BASE48;
 EXPORT_SYMBOL(page_offset_base);
-unsigned long vmalloc_base __read_mostly = __VMALLOC_BASE;
+unsigned long vmalloc_base __read_mostly = __VMALLOC_BASE48;
 EXPORT_SYMBOL(vmalloc_base);
-unsigned long vmemmap_base __read_mostly = __VMEMMAP_BASE;
+unsigned long vmemmap_base __read_mostly = __VMEMMAP_BASE48;
 EXPORT_SYMBOL(vmemmap_base);
 #endif
 
@@ -61,6 +61,36 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
        return ptr - (void *)_text + (void *)physaddr;
 }
 
+static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
+{
+       return fixup_pointer(ptr, physaddr);
+}
+
+#ifdef CONFIG_X86_5LEVEL
+static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
+{
+       return fixup_pointer(ptr, physaddr);
+}
+
+static void __head check_la57_support(unsigned long physaddr)
+{
+       if (native_cpuid_eax(0) < 7)
+               return;
+
+       if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+               return;
+
+       *fixup_int(&p4d_folded, physaddr) = 0;
+       *fixup_int(&pgdir_shift, physaddr) = 48;
+       *fixup_int(&ptrs_per_p4d, physaddr) = 512;
+       *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE57;
+       *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE57;
+       *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE57;
+}
+#else
+static void __head check_la57_support(unsigned long physaddr) {}
+#endif
+
 unsigned long __head __startup_64(unsigned long physaddr,
                                  struct boot_params *bp)
 {
@@ -72,6 +102,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
        pmdval_t *pmd, pmd_entry;
        int i;
 
+       check_la57_support(physaddr);
+
        /* Is the address too large? */
        if (physaddr >> MAX_PHYSMEM_BITS)
                for (;;);
@@ -95,9 +127,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
        /* Fixup the physical addresses in the page table */
 
        pgd = fixup_pointer(&early_top_pgt, physaddr);
-       pgd[pgd_index(__START_KERNEL_map)] += load_delta;
-
-       if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+       p = pgd + pgd_index(__START_KERNEL_map);
+       if (p4d_folded)
+               *p = (unsigned long)level3_kernel_pgt;
+       else
+               *p = (unsigned long)level4_kernel_pgt;
+       *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
+
+       if (!p4d_folded) {
                p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
                p4d[511] += load_delta;
        }
@@ -120,7 +157,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
        pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
        pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
 
-       if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+       if (!p4d_folded) {
+               p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
 
                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
@@ -166,8 +203,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
         * Fixup phys_base - remove the memory encryption mask to obtain
         * the true physical address.
         */
-       p = fixup_pointer(&phys_base, physaddr);
-       *p += load_delta - sme_get_me_mask();
+       *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
 
        /* Encrypt the kernel (if SME is active) */
        sme_encrypt_kernel();
@@ -218,7 +254,7 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
         * critical -- __PAGE_OFFSET would point us back into the dynamic
         * range and we might end up looping forever...
         */
-       if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+       if (p4d_folded)
                p4d_p = pgd_p;
        else if (pgd)
                p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 2be7d1e7fcf1..ebdcb08a91cb 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -40,7 +40,7 @@
 #define pud_index(x)   (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
 
 #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
+PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE48)
 PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
 #endif
 L3_START_KERNEL = pud_index(__START_KERNEL_map)
@@ -121,7 +121,10 @@ ENTRY(secondary_startup_64)
        /* Enable PAE mode, PGE and LA57 */
        movl    $(X86_CR4_PAE | X86_CR4_PGE), %ecx
 #ifdef CONFIG_X86_5LEVEL
+       testl   $1, p4d_folded(%rip)
+       jnz     1f
        orl     $X86_CR4_LA57, %ecx
+1:
 #endif
        movq    %rcx, %cr4
 
@@ -350,12 +353,7 @@ GLOBAL(name)
 
        __INITDATA
 NEXT_PAGE(early_top_pgt)
-       .fill   511,8,0
-#ifdef CONFIG_X86_5LEVEL
-       .quad   level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
-#else
-       .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
-#endif
+       .fill   512,8,0
 
 NEXT_PAGE(early_dynamic_pgts)
        .fill   512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -434,6 +432,12 @@ ENTRY(phys_base)
        .quad   0x0000000000000000
 EXPORT_SYMBOL(phys_base)
 
+#ifdef CONFIG_X86_5LEVEL
+ENTRY(p4d_folded)
+       .word   1
+EXPORT_SYMBOL(p4d_folded)
+#endif
+
 #include "../../x86/xen/xen-head.S"
        
        __PAGE_ALIGNED_BSS
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 022ebddb3734..10e6dd1cb948 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -202,9 +202,7 @@ struct ist_info ist_info;
 #endif
 
 #else
-struct cpuinfo_x86 boot_cpu_data __read_mostly = {
-       .x86_phys_bits = MAX_PHYSMEM_BITS,
-};
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
 EXPORT_SYMBOL(boot_cpu_data);
 #endif
 
@@ -892,6 +890,7 @@ void __init setup_arch(char **cmdline_p)
        __flush_tlb_all();
 #else
        printk(KERN_INFO "Command line: %s\n", boot_command_line);
+       boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
 #endif
 
        /*
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index bad178deffba..acf264ab02f3 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -82,8 +82,8 @@ static struct addr_marker address_markers[] = {
        { 0/* VMALLOC_START */, "vmalloc() Area" },
        { 0/* VMEMMAP_START */, "Vmemmap" },
 #ifdef CONFIG_KASAN
-       { KASAN_SHADOW_START,   "KASAN shadow" },
-       { KASAN_SHADOW_END,     "KASAN shadow end" },
+       { 0/* KASAN_SHADOW_START */,    "KASAN shadow" },
+       { 0/* KASAN_SHADOW_END */,      "KASAN shadow end" },
 #endif
 # ifdef CONFIG_X86_ESPFIX64
        { ESPFIX_BASE_ADDR,     "ESPfix Area", 16 },
@@ -515,6 +515,10 @@ static int __init pt_dump_init(void)
        address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
        address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
        address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
+#ifdef CONFIG_KASAN
+       address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
+       address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
+#endif
 #endif
 #ifdef CONFIG_X86_32
        address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index e6420b18f6e0..2f6ba5c72905 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -43,7 +43,6 @@
  * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
  * ensure that this order is correct and won't be changed.
  */
-static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
 
 #if defined(CONFIG_X86_ESPFIX64)
 static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
@@ -62,8 +61,8 @@ static __initdata struct kaslr_memory_region {
        unsigned long *base;
        unsigned long size_tb;
 } kaslr_regions[] = {
-       { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
-       { &vmalloc_base, VMALLOC_SIZE_TB },
+       { &page_offset_base, 0 },
+       { &vmalloc_base, 0 },
        { &vmemmap_base, 1 },
 };
 
@@ -86,11 +85,14 @@ static inline bool kaslr_memory_enabled(void)
 void __init kernel_randomize_memory(void)
 {
        size_t i;
-       unsigned long vaddr = vaddr_start;
+       unsigned long vaddr_start, vaddr;
        unsigned long rand, memory_tb;
        struct rnd_state rand_state;
        unsigned long remain_entropy;
 
+       vaddr_start = p4d_folded ? __PAGE_OFFSET_BASE48 : __PAGE_OFFSET_BASE57;
+       vaddr = vaddr_start;
+
        /*
         * All these BUILD_BUG_ON checks ensures the memory layout is
         * consistent with the vaddr_start/vaddr_end variables.
@@ -106,6 +108,9 @@ void __init kernel_randomize_memory(void)
        if (!kaslr_memory_enabled())
                return;
 
+       kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT);
+       kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
+
        /*
         * Update Physical memory mapping to available and
         * add padding if needed (especially for memory hotplug support).
-- 
2.13.2
