On Tue, 19 Dec 2017, Thomas Gleixner wrote:
> On Tue, 19 Dec 2017, Ingo Molnar wrote:
> We don't run out of space, but the 0-day robot triggered a nasty issue.
> 
> The fixmap bottom address, which contains the early_ioremap fixmap area, is:
> 
>     vaddr_bt = FIXADDR_TOP - FIX_BTMAP_BEGIN * PAGE_SIZE
> 
> If that address is lower than:
> 
>     vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
> 
> then cleanup_highmap() will happily 0 out the PMD entry for the PTE page of
> FIX_BTMAP. That entry was set up earlier in early_ioremap_init().
> 
> As a consequence the first call to __early_set_fixmap() which tries to
> install a PTE for early_ioremap() will crash and burn.
> 
> Below is a nasty hack which fixes the problem. Ideally we get all of this
> cpu_entry_stuff out of the fixmap. I'll look into that later, but for now
> the patch 'fixes' the issue.
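
To make the failing condition concrete, it boils down to roughly the check
below (illustrative sketch only, not part of the patch; the pr_warn() just
marks the spot where things go wrong):

    /*
     * cleanup_highmap() clears every PMD in the kernel highmap range
     * [__START_KERNEL_map, __START_KERNEL_map + KERNEL_IMAGE_SIZE)
     * that lies outside the kernel image proper. If the early_ioremap
     * fixmap slots fall below vaddr_end, the PTE page installed by
     * early_ioremap_init() is torn down along with the rest.
     */
    unsigned long vaddr_bt  = __fix_to_virt(FIX_BTMAP_BEGIN);
    unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;

    if (vaddr_bt < vaddr_end)
            pr_warn("early_ioremap fixmap overlaps the range cleaned by cleanup_highmap()\n");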

I had a stab at moving the cpu_entry_area to some other place.

The patch below works, but:

 - it breaks the i386 build because I have not yet found a way to place
   CPU_ENTRY_AREA_BASE without creating include recursion hell

 - it probably does not work on XEN_PV, but I'm too tired now to figure
   that out.

Thanks,

        tglx

8<-------------------

 Documentation/x86/x86_64/mm.txt         |    4 
 arch/x86/events/intel/ds.c              |   53 ++++++------
 arch/x86/include/asm/desc.h             |    1 
 arch/x86/include/asm/fixmap.h           |   89 --------------------
 arch/x86/include/asm/pgtable_32_types.h |    6 -
 arch/x86/include/asm/pgtable_64_types.h |   49 ++++++-----
 arch/x86/kernel/cpu/common.c            |  125 ----------------------------
 arch/x86/kernel/dumpstack.c             |    1 
 arch/x86/kernel/traps.c                 |    5 -
 arch/x86/mm/Makefile                    |    2 
 arch/x86/mm/dump_pagetables.c           |    2 
 arch/x86/mm/kasan_init_64.c             |    6 -
 arch/x86/mm/pti.c                       |   39 +++------
 arch/x86/xen/mmu_pv.c                   |    2 
 b/arch/x86/include/asm/cpu_entry_area.h |   79 ++++++++++++++++++
 b/arch/x86/mm/cpu_entry_area.c          |  138 ++++++++++++++++++++++++++++++++
 16 files changed, 309 insertions(+), 292 deletions(-)

--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,7 +12,8 @@ ffffea0000000000 - ffffeaffffffffff (=40
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
-fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
@@ -36,6 +37,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -76,36 +76,39 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_MASK     (~(PGDIR_SIZE - 1))
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define MAXMEM         _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define MAXMEM                 _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 #ifdef CONFIG_X86_5LEVEL
-#define VMALLOC_SIZE_TB _AC(12800, UL)
-#define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
-#define LDT_PGD_ENTRY _AC(-112, UL)
-#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+#define VMALLOC_SIZE_TB                _AC(12800, UL)
+#define __VMALLOC_BASE         _AC(0xffa0000000000000, UL)
+#define __VMEMMAP_BASE         _AC(0xffd4000000000000, UL)
+#define LDT_PGD_ENTRY          _AC(-112, UL)
+#define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
 #else
-#define VMALLOC_SIZE_TB        _AC(32, UL)
-#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
-#define LDT_PGD_ENTRY _AC(-3, UL)
-#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+#define VMALLOC_SIZE_TB                _AC(32, UL)
+#define __VMALLOC_BASE         _AC(0xffffc90000000000, UL)
+#define __VMEMMAP_BASE         _AC(0xffffea0000000000, UL)
+#define LDT_PGD_ENTRY          _AC(-4, UL)
+#define LDT_BASE_ADDR          (LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 #ifdef CONFIG_RANDOMIZE_MEMORY
-#define VMALLOC_START  vmalloc_base
-#define VMEMMAP_START  vmemmap_base
+#define VMALLOC_START          vmalloc_base
+#define VMEMMAP_START          vmemmap_base
 #else
-#define VMALLOC_START  __VMALLOC_BASE
-#define VMEMMAP_START  __VMEMMAP_BASE
+#define VMALLOC_START          __VMALLOC_BASE
+#define VMEMMAP_START          __VMEMMAP_BASE
 #endif /* CONFIG_RANDOMIZE_MEMORY */
-#define VMALLOC_END    (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
-#define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+
+#define VMALLOC_END            (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+#define MODULES_VADDR          (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 /* The module sections ends with the start of the fixmap */
-#define MODULES_END   __fix_to_virt(__end_of_fixed_addresses + 1)
-#define MODULES_LEN   (MODULES_END - MODULES_VADDR)
-#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
-#define EFI_VA_START    ( -4 * (_AC(1, UL) << 30))
-#define EFI_VA_END      (-68 * (_AC(1, UL) << 30))
+#define MODULES_END            __fix_to_virt(__end_of_fixed_addresses + 1)
+#define MODULES_LEN            (MODULES_END - MODULES_VADDR)
+#define CPU_ENTRY_AREA_PGD     _AC(-3, UL)
+#define CPU_ENTRY_AREA_BASE    (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
+#define ESPFIX_PGD_ENTRY       _AC(-2, UL)
+#define ESPFIX_BASE_ADDR       (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+#define EFI_VA_START           ( -4 * (_AC(1, UL) << 30))
+#define EFI_VA_END             (-68 * (_AC(1, UL) << 30))
 
 #define EARLY_DYNAMIC_PAGE_TABLES      64
 
--- /dev/null
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -0,0 +1,79 @@
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <asm/processor.h>
+#include <asm/intel_ds.h>
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code.  Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+       char gdt[PAGE_SIZE];
+
+       /*
+        * The GDT is just below entry_stack and thus serves (on x86_64) as
+        * a read-only guard page.
+        */
+       struct entry_stack_page entry_stack_page;
+
+       /*
+        * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
+        * we need task switches to work, and task switches write to the TSS.
+        */
+       struct tss_struct tss;
+
+       char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+       /*
+        * Exception stacks used for IST entries.
+        *
+        * In the future, this should have a separate slot for each stack
+        * with guard pages between them.
+        */
+       char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+#ifdef CONFIG_CPU_SUP_INTEL
+       /*
+        * Per CPU debug store for Intel performance monitoring. Wastes a
+        * full page at the moment.
+        */
+       struct debug_store cpu_debug_store;
+       /*
+        * The actual PEBS/BTS buffers must be mapped to user space
+        * Reserve enough fixmap PTEs.
+        */
+       struct debug_store_buffers cpu_debug_buffers;
+#endif
+};
+
+#define CPU_ENTRY_AREA_SIZE    (sizeof(struct cpu_entry_area))
+#define CPU_ENTRY_AREA_TOT_SIZE        (CPU_ENTRY_AREA_SIZE * NR_CPUS)
+
+extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+#define        CPU_ENTRY_AREA_RO_IDT           CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU         (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR    ((void *)CPU_ENTRY_AREA_RO_IDT)
+
+static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+       unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+       BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+       return (struct cpu_entry_area *) va;
+}
+
+static inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+       return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
+
+#endif
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,7 +19,6 @@
 #include <asm/acpi.h>
 #include <asm/apicdef.h>
 #include <asm/page.h>
-#include <asm/intel_ds.h>
 #ifdef CONFIG_X86_32
 #include <linux/threads.h>
 #include <asm/kmap_types.h>
@@ -45,57 +44,6 @@ extern unsigned long __FIXADDR_TOP;
                         PAGE_SIZE)
 #endif
 
-/*
- * cpu_entry_area is a percpu region in the fixmap that contains things
- * needed by the CPU and early entry/exit code.  Real types aren't used
- * for all fields here to avoid circular header dependencies.
- *
- * Every field is a virtual alias of some other allocated backing store.
- * There is no direct allocation of a struct cpu_entry_area.
- */
-struct cpu_entry_area {
-       char gdt[PAGE_SIZE];
-
-       /*
-        * The GDT is just below entry_stack and thus serves (on x86_64) as
-        * a a read-only guard page.
-        */
-       struct entry_stack_page entry_stack_page;
-
-       /*
-        * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
-        * we need task switches to work, and task switches write to the TSS.
-        */
-       struct tss_struct tss;
-
-       char entry_trampoline[PAGE_SIZE];
-
-#ifdef CONFIG_X86_64
-       /*
-        * Exception stacks used for IST entries.
-        *
-        * In the future, this should have a separate slot for each stack
-        * with guard pages between them.
-        */
-       char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
-#endif
-#ifdef CONFIG_CPU_SUP_INTEL
-       /*
-        * Per CPU debug store for Intel performance monitoring. Wastes a
-        * full page at the moment.
-        */
-       struct debug_store cpu_debug_store;
-       /*
-        * The actual PEBS/BTS buffers must be mapped to user space
-        * Reserve enough fixmap PTEs.
-        */
-       struct debug_store_buffers cpu_debug_buffers;
-#endif
-};
-
-#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
-
-extern void setup_cpu_entry_areas(void);
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -158,17 +106,7 @@ enum fixed_addresses {
        FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
        FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
 
-       /*
-        * Fixmap entries to remap the IDT, and the per CPU entry areas.
-        * Aligned to a PMD boundary.
-        */
-       FIX_USR_SHARED_TOP = round_up(FIX_TEXT_POKE0 + 1, PTRS_PER_PMD),
-       FIX_RO_IDT,
-       FIX_CPU_ENTRY_AREA_TOP,
-       FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
-       FIX_USR_SHARED_BOTTOM  = round_up(FIX_CPU_ENTRY_AREA_BOTTOM + 2, PTRS_PER_PMD) - 1,
-
-       __end_of_permanent_fixed_addresses = FIX_USR_SHARED_BOTTOM,
+       __end_of_permanent_fixed_addresses = FIX_TEXT_POKE0,
 
        /*
         * 512 temporary boot-time mappings, used by early_ioremap(),
@@ -249,30 +187,5 @@ void __init *early_memremap_decrypted_wp
 void __early_set_fixmap(enum fixed_addresses idx,
                        phys_addr_t phys, pgprot_t flags);
 
-static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
-{
-       BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
-
-       return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
-}
-
-#define __get_cpu_entry_area_offset_index(cpu, offset) ({              \
-       BUILD_BUG_ON(offset % PAGE_SIZE != 0);                          \
-       __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE);       \
-       })
-
-#define get_cpu_entry_area_index(cpu, field)                           \
-       __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
-
-static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
-{
-       return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
-}
-
-static inline struct entry_stack *cpu_entry_stack(int cpu)
-{
-       return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
-}
-
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -483,133 +483,8 @@ static const unsigned int exception_stac
          [0 ... N_EXCEPTION_STACKS - 1]        = EXCEPTION_STKSZ,
          [DEBUG_STACK - 1]                     = DEBUG_STKSZ
 };
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
-       [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 #endif
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page,
-                                  entry_stack_storage);
-
-/*
- * Force the population of PMDs for not yet allocated per cpu
- * memory like debug store buffers.
- */
-static void __init allocate_percpu_fixmap_ptes(int idx, int pages)
-{
-       for (; pages; pages--, idx--)
-               __set_fixmap(idx, 0, PAGE_NONE);
-}
-
-static void __init
-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
-{
-       for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
-               __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
-}
-
-static void percpu_setup_debug_store(int cpu)
-{
-#ifdef CONFIG_CPU_SUP_INTEL
-       int npages, idx;
-
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
-               return;
-
-       idx = get_cpu_entry_area_index(cpu, cpu_debug_store);
-       npages = sizeof(struct debug_store) / PAGE_SIZE;
-       BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
-       set_percpu_fixmap_pages(idx, &per_cpu(cpu_debug_store, cpu), npages,
-                               PAGE_KERNEL);
-
-       idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers);
-       npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
-       allocate_percpu_fixmap_ptes(idx, npages);
-#endif
-}
-
-/* Setup the fixmap mappings only once per-processor */
-static void __init setup_cpu_entry_area(int cpu)
-{
-#ifdef CONFIG_X86_64
-       extern char _entry_trampoline[];
-
-       /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
-       pgprot_t gdt_prot = PAGE_KERNEL_RO;
-       pgprot_t tss_prot = PAGE_KERNEL_RO;
-#else
-       /*
-        * On native 32-bit systems, the GDT cannot be read-only because
-        * our double fault handler uses a task gate, and entering through
-        * a task gate needs to change an available TSS to busy.  If the
-        * GDT is read-only, that will triple fault.  The TSS cannot be
-        * read-only because the CPU writes to it on task switches.
-        *
-        * On Xen PV, the GDT must be read-only because the hypervisor
-        * requires it.
-        */
-       pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
-               PAGE_KERNEL_RO : PAGE_KERNEL;
-       pgprot_t tss_prot = PAGE_KERNEL;
-#endif
-
-       __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
-       set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page),
-                               per_cpu_ptr(&entry_stack_storage, cpu), 1,
-                               PAGE_KERNEL);
-
-       /*
-        * The Intel SDM says (Volume 3, 7.2.1):
-        *
-        *  Avoid placing a page boundary in the part of the TSS that the
-        *  processor reads during a task switch (the first 104 bytes). The
-        *  processor may not correctly perform address translations if a
-        *  boundary occurs in this area. During a task switch, the processor
-        *  reads and writes into the first 104 bytes of each TSS (using
-        *  contiguous physical addresses beginning with the physical address
-        *  of the first byte of the TSS). So, after TSS access begins, if
-        *  part of the 104 bytes is not physically contiguous, the processor
-        *  will access incorrect information without generating a page-fault
-        *  exception.
-        *
-        * There are also a lot of errata involving the TSS spanning a page
-        * boundary.  Assert that we're not doing that.
-        */
-       BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
-                     offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
-       BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
-       set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
-                               &per_cpu(cpu_tss_rw, cpu),
-                               sizeof(struct tss_struct) / PAGE_SIZE,
-                               tss_prot);
-
-#ifdef CONFIG_X86_32
-       per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
-#endif
-
-#ifdef CONFIG_X86_64
-       BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
-       BUILD_BUG_ON(sizeof(exception_stacks) !=
-                    sizeof(((struct cpu_entry_area *)0)->exception_stacks));
-       set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
-                               &per_cpu(exception_stacks, cpu),
-                               sizeof(exception_stacks) / PAGE_SIZE,
-                               PAGE_KERNEL);
-
-       __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
-                    __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
-#endif
-       percpu_setup_debug_store(cpu);
-}
-
-void __init setup_cpu_entry_areas(void)
-{
-       unsigned int cpu;
-
-       for_each_possible_cpu(cpu)
-               setup_cpu_entry_area(cpu);
-}
-
 /* Load the original GDT from the per-cpu structure */
 void load_direct_gdt(int cpu)
 {
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -60,6 +60,7 @@
 #include <asm/trace/mpx.h>
 #include <asm/mpx.h>
 #include <asm/vm86.h>
+#include <asm/cpu_entry_area.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -950,8 +951,8 @@ void __init trap_init(void)
         * "sidt" instruction will not leak the location of the kernel, and
         * to defend the IDT against arbitrary memory write vulnerabilities.
         * It will be reloaded in cpu_init() */
-       __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
-       idt_descr.address = fix_to_virt(FIX_RO_IDT);
+       cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), PAGE_KERNEL_RO);
+       idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
 
        /*
         * Should be a barrier for any external CPU state:
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o   = -pg
 endif
 
 obj-y  :=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-           pat.o pgtable.o physaddr.o setup_nx.o tlb.o
+           pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
 
 # Make sure __phys_addr has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
--- /dev/null
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -0,0 +1,138 @@
+#include <linux/percpu.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+       [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+
+
+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+{
+       unsigned long va = (unsigned long) cea_vaddr;
+
+       set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
+}
+
+/*
+ * Force the population of PMDs for not yet allocated per cpu
+ * memory like debug store buffers.
+ */
+static void __init cea_allocate_ptes(void *cea_vaddr, int pages)
+{
+       for (; pages; pages--, cea_vaddr += PAGE_SIZE)
+               cea_set_pte(cea_vaddr, 0, PAGE_NONE);
+}
+
+static void __init
+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+{
+       for ( ; pages; pages--, cea_vaddr += PAGE_SIZE, ptr += PAGE_SIZE)
+               cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+       int npages;
+       void *cea;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+               return;
+
+       cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+       npages = sizeof(struct debug_store) / PAGE_SIZE;
+       BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+       cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+                            PAGE_KERNEL);
+
+       cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+       npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+       cea_allocate_ptes(cea, npages);
+#endif
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
+{
+#ifdef CONFIG_X86_64
+       extern char _entry_trampoline[];
+
+       /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+       pgprot_t gdt_prot = PAGE_KERNEL_RO;
+       pgprot_t tss_prot = PAGE_KERNEL_RO;
+#else
+       /*
+        * On native 32-bit systems, the GDT cannot be read-only because
+        * our double fault handler uses a task gate, and entering through
+        * a task gate needs to change an available TSS to busy.  If the
+        * GDT is read-only, that will triple fault.  The TSS cannot be
+        * read-only because the CPU writes to it on task switches.
+        *
+        * On Xen PV, the GDT must be read-only because the hypervisor
+        * requires it.
+        */
+       pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+               PAGE_KERNEL_RO : PAGE_KERNEL;
+       pgprot_t tss_prot = PAGE_KERNEL;
+#endif
+
+       cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
+                   gdt_prot);
+
+       cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
+                            per_cpu_ptr(&entry_stack_storage, cpu), 1,
+                            PAGE_KERNEL);
+
+       /*
+        * The Intel SDM says (Volume 3, 7.2.1):
+        *
+        *  Avoid placing a page boundary in the part of the TSS that the
+        *  processor reads during a task switch (the first 104 bytes). The
+        *  processor may not correctly perform address translations if a
+        *  boundary occurs in this area. During a task switch, the processor
+        *  reads and writes into the first 104 bytes of each TSS (using
+        *  contiguous physical addresses beginning with the physical address
+        *  of the first byte of the TSS). So, after TSS access begins, if
+        *  part of the 104 bytes is not physically contiguous, the processor
+        *  will access incorrect information without generating a page-fault
+        *  exception.
+        *
+        * There are also a lot of errata involving the TSS spanning a page
+        * boundary.  Assert that we're not doing that.
+        */
+       BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+                     offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+       BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+       cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
+                            &per_cpu(cpu_tss_rw, cpu),
+                            sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
+
+#ifdef CONFIG_X86_32
+       per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+       BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+       BUILD_BUG_ON(sizeof(exception_stacks) !=
+                    sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+       cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
+                            &per_cpu(exception_stacks, cpu),
+                            sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
+
+       cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
+                    __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+       percpu_setup_debug_store(cpu);
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+       unsigned int cpu;
+
+       for_each_possible_cpu(cpu)
+               setup_cpu_entry_area(cpu);
+}
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,6 +3,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 
@@ -280,16 +281,22 @@ void fini_debug_store_on_cpu(int cpu)
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
-static u64 ds_update_fixmap(int idx, void *addr, size_t size, pgprot_t prot)
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
 {
-       phys_addr_t pa, va;
+       phys_addr_t pa;
        size_t msz = 0;
 
-       va = __fix_to_virt(idx);
        pa = virt_to_phys(addr);
-       for (; msz < size; idx--, msz += PAGE_SIZE, pa += PAGE_SIZE)
-               __set_fixmap(idx, pa, prot);
-       return va;
+       for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+               cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+       size_t msz = 0;
+
+       for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+               cea_set_pte(cea, 0, PAGE_NONE);
 }
 
 static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
@@ -313,8 +320,8 @@ static int alloc_pebs_buffer(int cpu)
        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
        struct debug_store *ds = hwev->ds;
        size_t bsiz = x86_pmu.pebs_buffer_size;
-       int idx, max, node = cpu_to_node(cpu);
-       void *buffer, *ibuffer;
+       int max, node = cpu_to_node(cpu);
+       void *buffer, *ibuffer, *cea;
 
        if (!x86_pmu.pebs)
                return 0;
@@ -336,10 +343,10 @@ static int alloc_pebs_buffer(int cpu)
                per_cpu(insn_buffer, cpu) = ibuffer;
        }
        hwev->ds_pebs_vaddr = buffer;
-       /* Update the fixmap */
-       idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.pebs_buffer);
-       ds->pebs_buffer_base = ds_update_fixmap(idx, buffer, bsiz,
-                                               PAGE_KERNEL);
+       /* Update the cpu entry area mapping */
+       cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+       ds->pebs_buffer_base = (u64) cea;
+       ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
        ds->pebs_index = ds->pebs_buffer_base;
        max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
        ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
@@ -350,7 +357,7 @@ static void release_pebs_buffer(int cpu)
 {
        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
        struct debug_store *ds = hwev->ds;
-       int idx;
+       void *cea;
 
        if (!ds || !x86_pmu.pebs)
                return;
@@ -359,8 +366,8 @@ static void release_pebs_buffer(int cpu)
        per_cpu(insn_buffer, cpu) = NULL;
 
        /* Clear the fixmap */
-       idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.pebs_buffer);
-       ds_update_fixmap(idx, 0, x86_pmu.pebs_buffer_size, PAGE_NONE);
+       cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+       ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
        ds->pebs_buffer_base = 0;
        dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
        hwev->ds_pebs_vaddr = NULL;
@@ -370,8 +377,8 @@ static int alloc_bts_buffer(int cpu)
 {
        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
        struct debug_store *ds = hwev->ds;
-       int idx, max;
-       void *buffer;
+       void *buffer, *cea;
+       int max;
 
        if (!x86_pmu.bts)
                return 0;
@@ -383,9 +390,9 @@ static int alloc_bts_buffer(int cpu)
        }
        hwev->ds_bts_vaddr = buffer;
        /* Update the fixmap */
-       idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.bts_buffer);
-       ds->bts_buffer_base = ds_update_fixmap(idx, buffer, BTS_BUFFER_SIZE,
-                                              PAGE_KERNEL);
+       cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+       ds->bts_buffer_base = (u64) cea;
+       ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
        ds->bts_index = ds->bts_buffer_base;
        max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
        ds->bts_absolute_maximum = ds->bts_buffer_base + max;
@@ -397,14 +404,14 @@ static void release_bts_buffer(int cpu)
 {
        struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
        struct debug_store *ds = hwev->ds;
-       int idx;
+       void *cea;
 
        if (!ds || !x86_pmu.bts)
                return;
 
        /* Clear the fixmap */
-       idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.bts_buffer);
-       ds_update_fixmap(idx, 0, BTS_BUFFER_SIZE, PAGE_NONE);
+       cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+       ds_clear_cea(cea, BTS_BUFFER_SIZE);
        ds->bts_buffer_base = 0;
        dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
        hwev->ds_bts_vaddr = NULL;
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -7,6 +7,7 @@
 #include <asm/mmu.h>
 #include <asm/fixmap.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpu_entry_area.h>
 
 #include <linux/smp.h>
 #include <linux/percpu.h>
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -38,13 +38,13 @@ extern bool __vmalloc_start_set; /* set
 #define LAST_PKMAP 1024
 #endif
 
-#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1))     \
-                   & PMD_MASK)
+#define PKMAP_BASE             \
+       ((FIXMAP_START - PAGE_SIZE * (LAST_PKMAP + 1)) & PMD_MASK)
 
 #ifdef CONFIG_HIGHMEM
 # define VMALLOC_END   (PKMAP_BASE - 2 * PAGE_SIZE)
 #else
-# define VMALLOC_END   (FIXADDR_START - 2 * PAGE_SIZE)
+# define VMALLOC_END   (FIXMAP_START - 2 * PAGE_SIZE)
 #endif
 
 #define MODULES_VADDR  VMALLOC_START
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,6 +18,7 @@
 #include <linux/nmi.h>
 #include <linux/sysfs.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
 
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -62,6 +62,7 @@ enum address_markers_idx {
 #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
        LDT_NR,
 #endif
+       CPU_ENTRY_AREA_START_NR,
 # ifdef CONFIG_X86_ESPFIX64
        ESPFIX_START_NR,
 # endif
@@ -97,6 +98,7 @@ static struct addr_marker address_marker
 #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
        { LDT_BASE_ADDR,        "LDT remap" },
 #endif
+       { CPU_ENTRY_AREA_BASE,  "CPU entry Area" },
 # ifdef CONFIG_X86_ESPFIX64
        { ESPFIX_BASE_ADDR,     "ESPfix Area", 16 },
 # endif
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -15,6 +15,7 @@
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>
 
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
@@ -330,12 +331,13 @@ void __init kasan_init(void)
                              (unsigned long)kasan_mem_to_shadow(_end),
                              early_pfn_to_nid(__pa(_stext)));
 
-       shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
+       shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
        shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
        shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
                                                PAGE_SIZE);
 
-       shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
+       shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+                                       CPU_ENTRY_AREA_TOT_SIZE);
        shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
        shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
                                        PAGE_SIZE);
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -275,21 +275,25 @@ pti_clone_pmds(unsigned long start, unsi
        }
 }
 
-static void __init pti_setup_espfix64(void)
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems).
+ */
+static void __init pti_clone_p4d(unsigned long addr)
 {
-#ifdef CONFIG_X86_ESPFIX64
-       /*
-        * ESPFIX64 uses a single p4d (i.e. a top-level entry on 4-level
-        * systems and a next-level entry on 5-level systems.  Share that
-        * entry between the user and kernel pagetables.
-        */
-       pgd_t *kernel_pgd;
        p4d_t *kernel_p4d, *user_p4d;
+       pgd_t *kernel_pgd;
 
-       user_p4d = pti_user_pagetable_walk_p4d(ESPFIX_BASE_ADDR);
-       kernel_pgd = pgd_offset_k(ESPFIX_BASE_ADDR);
-       kernel_p4d = p4d_offset(kernel_pgd, ESPFIX_BASE_ADDR);
+       user_p4d = pti_user_pagetable_walk_p4d(addr);
+       kernel_pgd = pgd_offset_k(addr);
+       kernel_p4d = p4d_offset(kernel_pgd, addr);
        *user_p4d = *kernel_p4d;
+}
+
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+       pti_clone_p4d(ESPFIX_BASE_ADDR);
 #endif
 }
 
@@ -313,20 +317,11 @@ static void __init pti_setup_vsyscall(vo
 }
 
 /*
- * Clone the populated PMDs of the user shared fixmaps into the user space
- * visible page table.
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
  */
 static void __init pti_clone_user_shared(void)
 {
-       unsigned long bot, top;
-
-       bot = __fix_to_virt(FIX_USR_SHARED_BOTTOM);
-       top = __fix_to_virt(FIX_USR_SHARED_TOP) + PAGE_SIZE;
-
-       /* Top of the user shared block must be PMD-aligned. */
-       WARN_ON(top & ~PMD_MASK);
-
-       pti_clone_pmds(bot, top, 0);
+       pti_clone_p4d(CPU_ENTRY_AREA_BASE);
 }
 
 /*
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2261,7 +2261,6 @@ static void xen_set_fixmap(unsigned idx,
 
        switch (idx) {
        case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-       case FIX_RO_IDT:
 #ifdef CONFIG_X86_32
        case FIX_WP_TEST:
 # ifdef CONFIG_HIGHMEM
@@ -2272,7 +2271,6 @@ static void xen_set_fixmap(unsigned idx,
 #endif
        case FIX_TEXT_POKE0:
        case FIX_TEXT_POKE1:
-       case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
                /* All local page mappings */
                pte = pfn_pte(phys, prot);
                break;
