On Tue, 19 Dec 2017, Thomas Gleixner wrote:
> On Tue, 19 Dec 2017, Ingo Molnar wrote:
>
> We don't run out of space, but the 0-day robot triggered a nasty issue.
>
> The fixmap bottom address, which contains the early_ioremap fixmap area, is:
>
>     vaddr_bt = FIXADDR_TOP - FIX_BTMAP_BEGIN * PAGE_SIZE
>
> If that address is lower than:
>
>     vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
>
> then cleanup_highmap() will happily zero out the PMD entry for the PTE
> page of FIX_BTMAP. That entry was set up earlier in early_ioremap_init().
>
> As a consequence the first call to __early_set_fixmap(), which tries to
> install a PTE for early_ioremap(), will crash and burn.
>
> Below is a nasty hack which fixes the problem. Ideally we get all of this
> cpu_entry_area stuff out of the fixmap. I'll look into that later, but for
> now the patch 'fixes' the issue.
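
To make the quoted failure mode concrete: the overlap condition can be
written as a boot-time sanity check. This is illustrative only and not
part of the patch below; the helper name check_btmap_overlap() is made
up for this mail.

        static void __init check_btmap_overlap(void)
        {
                unsigned long vaddr_bt  = FIXADDR_TOP -
                                          FIX_BTMAP_BEGIN * PAGE_SIZE;
                unsigned long vaddr_end = __START_KERNEL_map +
                                          KERNEL_IMAGE_SIZE;

                /*
                 * If the early_ioremap slots sit below the end of the
                 * kernel image mapping, cleanup_highmap() wipes the PMD
                 * which covers the PTE page installed for FIX_BTMAP by
                 * early_ioremap_init().
                 */
                if (vaddr_bt < vaddr_end)
                        pr_err("FIX_BTMAP %#lx below kernel image end %#lx\n",
                               vaddr_bt, vaddr_end);
        }
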
I had a stab at moving the cpu_entry_area to some other place. The patch
below works, but:

  - it breaks the i386 build because I have not yet found a way to place
    CPU_ENTRY_AREA_BASE without creating include recursion hell

  - it probably does not work on XEN_PV, but I'm too tired now to figure
    that out.

Thanks,

        tglx

8<-------------------

 Documentation/x86/x86_64/mm.txt         |    4 
 arch/x86/events/intel/ds.c              |   53 ++++++------
 arch/x86/include/asm/desc.h             |    1 
 arch/x86/include/asm/fixmap.h           |   89 --------------------
 arch/x86/include/asm/pgtable_32_types.h |    6 -
 arch/x86/include/asm/pgtable_64_types.h |   49 ++++++-----
 arch/x86/kernel/cpu/common.c            |  125 ----------------------------
 arch/x86/kernel/dumpstack.c             |    1 
 arch/x86/kernel/traps.c                 |    5 -
 arch/x86/mm/Makefile                    |    2 
 arch/x86/mm/dump_pagetables.c           |    2 
 arch/x86/mm/kasan_init_64.c             |    6 -
 arch/x86/mm/pti.c                       |   39 +++------
 arch/x86/xen/mmu_pv.c                   |    2 
 b/arch/x86/include/asm/cpu_entry_area.h |   79 ++++++++++++++++++
 b/arch/x86/mm/cpu_entry_area.c          |  138 ++++++++++++++++++++++++++++++++
 16 files changed, 309 insertions(+), 292 deletions(-)

--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,7 +12,8 @@ ffffea0000000000 - ffffeaffffffffff (=40
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
-fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
@@ -36,6 +37,7 @@ ffd4000000000000 - ffd5ffffffffffff (=49
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -76,36 +76,39 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_MASK      (~(PGDIR_SIZE - 1))
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define MAXMEM          _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define MAXMEM                  _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 
 #ifdef CONFIG_X86_5LEVEL
-#define VMALLOC_SIZE_TB _AC(12800, UL)
-#define __VMALLOC_BASE  _AC(0xffa0000000000000, UL)
-#define __VMEMMAP_BASE  _AC(0xffd4000000000000, UL)
-#define LDT_PGD_ENTRY   _AC(-112, UL)
-#define LDT_BASE_ADDR   (LDT_PGD_ENTRY << PGDIR_SHIFT)
+#define VMALLOC_SIZE_TB         _AC(12800, UL)
+#define __VMALLOC_BASE          _AC(0xffa0000000000000, UL)
+#define __VMEMMAP_BASE          _AC(0xffd4000000000000, UL)
+#define LDT_PGD_ENTRY           _AC(-112, UL)
+#define LDT_BASE_ADDR           (LDT_PGD_ENTRY << PGDIR_SHIFT)
 #else
-#define VMALLOC_SIZE_TB _AC(32, UL)
-#define __VMALLOC_BASE  _AC(0xffffc90000000000, UL)
-#define __VMEMMAP_BASE  _AC(0xffffea0000000000, UL)
-#define LDT_PGD_ENTRY   _AC(-3, UL)
-#define LDT_BASE_ADDR   (LDT_PGD_ENTRY << PGDIR_SHIFT)
+#define VMALLOC_SIZE_TB         _AC(32, UL)
+#define __VMALLOC_BASE          _AC(0xffffc90000000000, UL)
+#define __VMEMMAP_BASE          _AC(0xffffea0000000000, UL)
+#define LDT_PGD_ENTRY           _AC(-4, UL)
+#define LDT_BASE_ADDR           (LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 
 #ifdef CONFIG_RANDOMIZE_MEMORY
-#define VMALLOC_START   vmalloc_base
-#define VMEMMAP_START   vmemmap_base
+#define VMALLOC_START           vmalloc_base
+#define VMEMMAP_START           vmemmap_base
 #else
-#define VMALLOC_START   __VMALLOC_BASE
-#define VMEMMAP_START   __VMEMMAP_BASE
+#define VMALLOC_START           __VMALLOC_BASE
+#define VMEMMAP_START           __VMEMMAP_BASE
 #endif /* CONFIG_RANDOMIZE_MEMORY */
-#define VMALLOC_END     (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
-#define MODULES_VADDR   (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+
+#define VMALLOC_END             (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+
+#define MODULES_VADDR           (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 /* The module sections ends with the start of the fixmap */
-#define MODULES_END     __fix_to_virt(__end_of_fixed_addresses + 1)
-#define MODULES_LEN     (MODULES_END - MODULES_VADDR)
-#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
-#define EFI_VA_START    ( -4 * (_AC(1, UL) << 30))
-#define EFI_VA_END      (-68 * (_AC(1, UL) << 30))
+#define MODULES_END             __fix_to_virt(__end_of_fixed_addresses + 1)
+#define MODULES_LEN             (MODULES_END - MODULES_VADDR)
+#define CPU_ENTRY_AREA_PGD      _AC(-3, UL)
+#define CPU_ENTRY_AREA_BASE     (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
+#define ESPFIX_PGD_ENTRY        _AC(-2, UL)
+#define ESPFIX_BASE_ADDR        (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+#define EFI_VA_START            ( -4 * (_AC(1, UL) << 30))
+#define EFI_VA_END              (-68 * (_AC(1, UL) << 30))
 
 #define EARLY_DYNAMIC_PAGE_TABLES       64
--- /dev/null
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -0,0 +1,79 @@
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <asm/processor.h>
+#include <asm/intel_ds.h>
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code. Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+        char gdt[PAGE_SIZE];
+
+        /*
+         * The GDT is just below entry_stack and thus serves (on x86_64) as
+         * a read-only guard page.
+         */
+        struct entry_stack_page entry_stack_page;
+
+        /*
+         * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
+         * we need task switches to work, and task switches write to the TSS.
+         */
+        struct tss_struct tss;
+
+        char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+        /*
+         * Exception stacks used for IST entries.
+         *
+         * In the future, this should have a separate slot for each stack
+         * with guard pages between them.
+         */
+        char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+#ifdef CONFIG_CPU_SUP_INTEL
+        /*
+         * Per CPU debug store for Intel performance monitoring. Wastes a
+         * full page at the moment.
+         */
+        struct debug_store cpu_debug_store;
+        /*
+         * The actual PEBS/BTS buffers must be mapped to user space.
+         * Reserve enough PTEs.
+         */
+        struct debug_store_buffers cpu_debug_buffers;
+#endif
+};
+
+#define CPU_ENTRY_AREA_SIZE     (sizeof(struct cpu_entry_area))
+#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
+
+extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+#define CPU_ENTRY_AREA_RO_IDT           CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU          (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR     ((void *)CPU_ENTRY_AREA_RO_IDT)
+
+static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+        unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+        BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+        return (struct cpu_entry_area *) va;
+}
+
+static inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+        return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
+
+#endif
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,7 +19,6 @@
 #include <asm/acpi.h>
 #include <asm/apicdef.h>
 #include <asm/page.h>
-#include <asm/intel_ds.h>
 #ifdef CONFIG_X86_32
 #include <linux/threads.h>
 #include <asm/kmap_types.h>
@@ -45,57 +44,6 @@ extern unsigned long __FIXADDR_TOP;
                          PAGE_SIZE)
 #endif
 
-/*
- * cpu_entry_area is a percpu region in the fixmap that contains things
- * needed by the CPU and early entry/exit code. Real types aren't used
- * for all fields here to avoid circular header dependencies.
- *
- * Every field is a virtual alias of some other allocated backing store.
- * There is no direct allocation of a struct cpu_entry_area.
- */
-struct cpu_entry_area {
-        char gdt[PAGE_SIZE];
-
-        /*
-         * The GDT is just below entry_stack and thus serves (on x86_64) as
-         * a a read-only guard page.
-         */
-        struct entry_stack_page entry_stack_page;
-
-        /*
-         * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
-         * we need task switches to work, and task switches write to the TSS.
-         */
-        struct tss_struct tss;
-
-        char entry_trampoline[PAGE_SIZE];
-
-#ifdef CONFIG_X86_64
-        /*
-         * Exception stacks used for IST entries.
-         *
-         * In the future, this should have a separate slot for each stack
-         * with guard pages between them.
-         */
-        char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
-#endif
-#ifdef CONFIG_CPU_SUP_INTEL
-        /*
-         * Per CPU debug store for Intel performance monitoring. Wastes a
-         * full page at the moment.
-         */
-        struct debug_store cpu_debug_store;
-        /*
-         * The actual PEBS/BTS buffers must be mapped to user space
-         * Reserve enough fixmap PTEs.
-         */
-        struct debug_store_buffers cpu_debug_buffers;
-#endif
-};
-
-#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
-
-extern void setup_cpu_entry_areas(void);
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -158,17 +106,7 @@ enum fixed_addresses {
         FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
         FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
 
-        /*
-         * Fixmap entries to remap the IDT, and the per CPU entry areas.
-         * Aligned to a PMD boundary.
-         */
-        FIX_USR_SHARED_TOP = round_up(FIX_TEXT_POKE0 + 1, PTRS_PER_PMD),
-        FIX_RO_IDT,
-        FIX_CPU_ENTRY_AREA_TOP,
-        FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
-        FIX_USR_SHARED_BOTTOM = round_up(FIX_CPU_ENTRY_AREA_BOTTOM + 2, PTRS_PER_PMD) - 1,
-
-        __end_of_permanent_fixed_addresses = FIX_USR_SHARED_BOTTOM,
+        __end_of_permanent_fixed_addresses = FIX_TEXT_POKE0,
 
         /*
          * 512 temporary boot-time mappings, used by early_ioremap(),
@@ -249,30 +187,5 @@ void __init *early_memremap_decrypted_wp
 void __early_set_fixmap(enum fixed_addresses idx,
                         phys_addr_t phys, pgprot_t flags);
 
-static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
-{
-        BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
-
-        return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
-}
-
-#define __get_cpu_entry_area_offset_index(cpu, offset) ({              \
-        BUILD_BUG_ON(offset % PAGE_SIZE != 0);                          \
-        __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE);      \
-        })
-
-#define get_cpu_entry_area_index(cpu, field)                            \
-        __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
-
-static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
-{
-        return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
-}
-
-static inline struct entry_stack *cpu_entry_stack(int cpu)
-{
-        return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
-}
-
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -483,133 +483,8 @@ static const unsigned int exception_stac
           [0 ... N_EXCEPTION_STACKS - 1]        = EXCEPTION_STKSZ,
           [DEBUG_STACK - 1]                     = DEBUG_STKSZ
 };
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
-        [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 #endif
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page,
-                                   entry_stack_storage);
-
-/*
- * Force the population of PMDs for not yet allocated per cpu
- * memory like debug store buffers.
- */
-static void __init allocate_percpu_fixmap_ptes(int idx, int pages)
-{
-        for (; pages; pages--, idx--)
-                __set_fixmap(idx, 0, PAGE_NONE);
-}
-
-static void __init
-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
-{
-        for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
-                __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
-}
-
-static void percpu_setup_debug_store(int cpu)
-{
-#ifdef CONFIG_CPU_SUP_INTEL
-        int npages, idx;
-
-        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
-                return;
-
-        idx = get_cpu_entry_area_index(cpu, cpu_debug_store);
-        npages = sizeof(struct debug_store) / PAGE_SIZE;
-        BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
-        set_percpu_fixmap_pages(idx, &per_cpu(cpu_debug_store, cpu), npages,
-                                PAGE_KERNEL);
-
-        idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers);
-        npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
-        allocate_percpu_fixmap_ptes(idx, npages);
-#endif
-}
-
-/* Setup the fixmap mappings only once per-processor */
-static void __init setup_cpu_entry_area(int cpu)
-{
-#ifdef CONFIG_X86_64
-        extern char _entry_trampoline[];
-
-        /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
-        pgprot_t gdt_prot = PAGE_KERNEL_RO;
-        pgprot_t tss_prot = PAGE_KERNEL_RO;
-#else
-        /*
-         * On native 32-bit systems, the GDT cannot be read-only because
-         * our double fault handler uses a task gate, and entering through
-         * a task gate needs to change an available TSS to busy. If the
-         * GDT is read-only, that will triple fault. The TSS cannot be
-         * read-only because the CPU writes to it on task switches.
-         *
-         * On Xen PV, the GDT must be read-only because the hypervisor
-         * requires it.
-         */
-        pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
-                PAGE_KERNEL_RO : PAGE_KERNEL;
-        pgprot_t tss_prot = PAGE_KERNEL;
-#endif
-
-        __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
-        set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, entry_stack_page),
-                                per_cpu_ptr(&entry_stack_storage, cpu), 1,
-                                PAGE_KERNEL);
-
-        /*
-         * The Intel SDM says (Volume 3, 7.2.1):
-         *
-         *  Avoid placing a page boundary in the part of the TSS that the
-         *  processor reads during a task switch (the first 104 bytes). The
-         *  processor may not correctly perform address translations if a
-         *  boundary occurs in this area. During a task switch, the processor
-         *  reads and writes into the first 104 bytes of each TSS (using
-         *  contiguous physical addresses beginning with the physical address
-         *  of the first byte of the TSS). So, after TSS access begins, if
-         *  part of the 104 bytes is not physically contiguous, the processor
-         *  will access incorrect information without generating a page-fault
-         *  exception.
-         *
-         * There are also a lot of errata involving the TSS spanning a page
-         * boundary. Assert that we're not doing that.
-         */
-        BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
-                      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
-        BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
-        set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
-                                &per_cpu(cpu_tss_rw, cpu),
-                                sizeof(struct tss_struct) / PAGE_SIZE,
-                                tss_prot);
-
-#ifdef CONFIG_X86_32
-        per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
-#endif
-
-#ifdef CONFIG_X86_64
-        BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
-        BUILD_BUG_ON(sizeof(exception_stacks) !=
-                     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
-        set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
-                                &per_cpu(exception_stacks, cpu),
-                                sizeof(exception_stacks) / PAGE_SIZE,
-                                PAGE_KERNEL);
-
-        __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
-                     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
-#endif
-        percpu_setup_debug_store(cpu);
-}
-
-void __init setup_cpu_entry_areas(void)
-{
-        unsigned int cpu;
-
-        for_each_possible_cpu(cpu)
-                setup_cpu_entry_area(cpu);
-}
-
 /* Load the original GDT from the per-cpu structure */
 void load_direct_gdt(int cpu)
 {
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -60,6 +60,7 @@
 #include <asm/trace/mpx.h>
 #include <asm/mpx.h>
 #include <asm/vm86.h>
+#include <asm/cpu_entry_area.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -950,8 +951,8 @@ void __init trap_init(void)
          * "sidt" instruction will not leak the location of the kernel, and
          * to defend the IDT against arbitrary memory write vulnerabilities.
          * It will be reloaded in cpu_init() */
-        __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
-        idt_descr.address = fix_to_virt(FIX_RO_IDT);
+        cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), PAGE_KERNEL_RO);
+        idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
 
         /*
          * Should be a barrier for any external CPU state:
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o     = -pg
 endif
 
 obj-y   :=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-            pat.o pgtable.o physaddr.o setup_nx.o tlb.o
+            pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
 
 # Make sure __phys_addr has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
--- /dev/null
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -0,0 +1,138 @@
+#include <linux/percpu.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+        [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+
+
+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+{
+        unsigned long va = (unsigned long) cea_vaddr;
+
+        set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
+}
+
+/*
+ * Force the population of PMDs for not yet allocated per cpu
+ * memory like debug store buffers.
+ */
+static void __init cea_allocate_ptes(void *cea_vaddr, int pages)
+{
+        for (; pages; pages--, cea_vaddr += PAGE_SIZE)
+                cea_set_pte(cea_vaddr, 0, PAGE_NONE);
+}
+
+static void __init
+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+{
+        for ( ; pages; pages--, cea_vaddr += PAGE_SIZE, ptr += PAGE_SIZE)
+                cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+        int npages;
+        void *cea;
+
+        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+                return;
+
+        cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+        npages = sizeof(struct debug_store) / PAGE_SIZE;
+        BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+        cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+                             PAGE_KERNEL);
+
+        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+        npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+        cea_allocate_ptes(cea, npages);
+#endif
+}
+
+/* Setup the cpu_entry_area mappings only once per CPU */
+static void __init setup_cpu_entry_area(int cpu)
+{
+#ifdef CONFIG_X86_64
+        extern char _entry_trampoline[];
+
+        /* On 64-bit systems, we use a read-only GDT and TSS. */
+        pgprot_t gdt_prot = PAGE_KERNEL_RO;
+        pgprot_t tss_prot = PAGE_KERNEL_RO;
+#else
+        /*
+         * On native 32-bit systems, the GDT cannot be read-only because
+         * our double fault handler uses a task gate, and entering through
+         * a task gate needs to change an available TSS to busy. If the
+         * GDT is read-only, that will triple fault. The TSS cannot be
+         * read-only because the CPU writes to it on task switches.
+         *
+         * On Xen PV, the GDT must be read-only because the hypervisor
+         * requires it.
+         */
+        pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+                PAGE_KERNEL_RO : PAGE_KERNEL;
+        pgprot_t tss_prot = PAGE_KERNEL;
+#endif
+
+        cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
+                    gdt_prot);
+
+        cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
+                             per_cpu_ptr(&entry_stack_storage, cpu), 1,
+                             PAGE_KERNEL);
+
+        /*
+         * The Intel SDM says (Volume 3, 7.2.1):
+         *
+         *  Avoid placing a page boundary in the part of the TSS that the
+         *  processor reads during a task switch (the first 104 bytes). The
+         *  processor may not correctly perform address translations if a
+         *  boundary occurs in this area. During a task switch, the processor
+         *  reads and writes into the first 104 bytes of each TSS (using
+         *  contiguous physical addresses beginning with the physical address
+         *  of the first byte of the TSS). So, after TSS access begins, if
+         *  part of the 104 bytes is not physically contiguous, the processor
+         *  will access incorrect information without generating a page-fault
+         *  exception.
+         *
+         * There are also a lot of errata involving the TSS spanning a page
+         * boundary. Assert that we're not doing that.
+         */
+        BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+                      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+        BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+        cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
+                             &per_cpu(cpu_tss_rw, cpu),
+                             sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
+
+#ifdef CONFIG_X86_32
+        per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+        BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+        BUILD_BUG_ON(sizeof(exception_stacks) !=
+                     sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+        cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
+                             &per_cpu(exception_stacks, cpu),
+                             sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
+
+        cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
+                    __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+        percpu_setup_debug_store(cpu);
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+        unsigned int cpu;
+
+        for_each_possible_cpu(cpu)
+                setup_cpu_entry_area(cpu);
+}
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -3,6 +3,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 
@@ -280,16 +281,22 @@ void fini_debug_store_on_cpu(int cpu)
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
-static u64 ds_update_fixmap(int idx, void *addr, size_t size, pgprot_t prot)
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
 {
-        phys_addr_t pa, va;
+        phys_addr_t pa;
         size_t msz = 0;
 
-        va = __fix_to_virt(idx);
         pa = virt_to_phys(addr);
-        for (; msz < size; idx--, msz += PAGE_SIZE, pa += PAGE_SIZE)
-                __set_fixmap(idx, pa, prot);
-        return va;
+        for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+                cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+        size_t msz = 0;
+
+        for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+                cea_set_pte(cea, 0, PAGE_NONE);
 }
 
 static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
@@ -313,8 +320,8 @@ static int alloc_pebs_buffer(int cpu)
         struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
         struct debug_store *ds = hwev->ds;
         size_t bsiz = x86_pmu.pebs_buffer_size;
-        int idx, max, node = cpu_to_node(cpu);
-        void *buffer, *ibuffer;
+        int max, node = cpu_to_node(cpu);
+        void *buffer, *ibuffer, *cea;
 
         if (!x86_pmu.pebs)
                 return 0;
@@ -336,10 +343,10 @@ static int alloc_pebs_buffer(int cpu)
                 per_cpu(insn_buffer, cpu) = ibuffer;
         }
         hwev->ds_pebs_vaddr = buffer;
-        /* Update the fixmap */
-        idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.pebs_buffer);
-        ds->pebs_buffer_base = ds_update_fixmap(idx, buffer, bsiz,
-                                                PAGE_KERNEL);
+        /* Update the cpu entry area mapping */
+        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+        ds->pebs_buffer_base = (u64) cea;
+        ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
         ds->pebs_index = ds->pebs_buffer_base;
         max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
         ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
@@ -350,7 +357,7 @@ static void release_pebs_buffer(int cpu)
 {
         struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
         struct debug_store *ds = hwev->ds;
-        int idx;
+        void *cea;
 
         if (!ds || !x86_pmu.pebs)
                 return;
@@ -359,8 +366,8 @@ static void release_pebs_buffer(int cpu)
         per_cpu(insn_buffer, cpu) = NULL;
 
         /* Clear the fixmap */
-        idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.pebs_buffer);
-        ds_update_fixmap(idx, 0, x86_pmu.pebs_buffer_size, PAGE_NONE);
+        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+        ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
         ds->pebs_buffer_base = 0;
         dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
         hwev->ds_pebs_vaddr = NULL;
@@ -370,8 +377,8 @@ static int alloc_bts_buffer(int cpu)
 {
         struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
         struct debug_store *ds = hwev->ds;
-        int idx, max;
-        void *buffer;
+        void *buffer, *cea;
+        int max;
 
         if (!x86_pmu.bts)
                 return 0;
@@ -383,9 +390,9 @@ static int alloc_bts_buffer(int cpu)
         }
         hwev->ds_bts_vaddr = buffer;
         /* Update the fixmap */
-        idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.bts_buffer);
-        ds->bts_buffer_base = ds_update_fixmap(idx, buffer, BTS_BUFFER_SIZE,
-                                               PAGE_KERNEL);
+        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+        ds->bts_buffer_base = (u64) cea;
+        ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
         ds->bts_index = ds->bts_buffer_base;
         max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
         ds->bts_absolute_maximum = ds->bts_buffer_base + max;
@@ -397,14 +404,14 @@ static void release_bts_buffer(int cpu)
 {
         struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
         struct debug_store *ds = hwev->ds;
-        int idx;
+        void *cea;
 
         if (!ds || !x86_pmu.bts)
                 return;
 
         /* Clear the fixmap */
-        idx = get_cpu_entry_area_index(cpu, cpu_debug_buffers.bts_buffer);
-        ds_update_fixmap(idx, 0, BTS_BUFFER_SIZE, PAGE_NONE);
+        cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+        ds_clear_cea(cea, BTS_BUFFER_SIZE);
         ds->bts_buffer_base = 0;
         dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
         hwev->ds_bts_vaddr = NULL;
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -7,6 +7,7 @@
 #include <asm/mmu.h>
 #include <asm/fixmap.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpu_entry_area.h>
 
 #include <linux/smp.h>
 #include <linux/percpu.h>
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -38,13 +38,13 @@ extern bool __vmalloc_start_set; /* set
 #define LAST_PKMAP 1024
 #endif
 
-#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1))     \
-                    & PMD_MASK)
+#define PKMAP_BASE             \
+        ((FIXMAP_START - PAGE_SIZE * (LAST_PKMAP + 1)) & PMD_MASK)
 
 #ifdef CONFIG_HIGHMEM
 # define VMALLOC_END    (PKMAP_BASE - 2 * PAGE_SIZE)
 #else
-# define VMALLOC_END    (FIXADDR_START - 2 * PAGE_SIZE)
+# define VMALLOC_END    (FIXMAP_START - 2 * PAGE_SIZE)
 #endif
 
 #define MODULES_VADDR   VMALLOC_START
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,6 +18,7 @@
 #include <linux/nmi.h>
 #include <linux/sysfs.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
 
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -62,6 +62,7 @@ enum address_markers_idx {
 #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
         LDT_NR,
 #endif
+        CPU_ENTRY_AREA_START_NR,
 # ifdef CONFIG_X86_ESPFIX64
         ESPFIX_START_NR,
 # endif
@@ -97,6 +98,7 @@ static struct addr_marker address_marker
 #if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
         { LDT_BASE_ADDR,        "LDT remap" },
 #endif
+        { CPU_ENTRY_AREA_BASE,  "CPU entry Area" },
 # ifdef CONFIG_X86_ESPFIX64
         { ESPFIX_BASE_ADDR,     "ESPfix Area", 16 },
 # endif
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -15,6 +15,7 @@
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>
 
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
@@ -330,12 +331,13 @@ void __init kasan_init(void)
                               (unsigned long)kasan_mem_to_shadow(_end),
                               early_pfn_to_nid(__pa(_stext)));
 
-        shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
+        shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
         shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
         shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, PAGE_SIZE);
 
-        shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
+        shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+                                        CPU_ENTRY_AREA_TOT_SIZE);
         shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
         shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, PAGE_SIZE);
 
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -275,21 +275,25 @@ pti_clone_pmds(unsigned long start, unsi
         }
 }
 
-static void __init pti_setup_espfix64(void)
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems).
+ */
+static void __init pti_clone_p4d(unsigned long addr)
 {
-#ifdef CONFIG_X86_ESPFIX64
-        /*
-         * ESPFIX64 uses a single p4d (i.e. a top-level entry on 4-level
-         * systems and a next-level entry on 5-level systems. Share that
-         * entry between the user and kernel pagetables.
-         */
-        pgd_t *kernel_pgd;
         p4d_t *kernel_p4d, *user_p4d;
+        pgd_t *kernel_pgd;
 
-        user_p4d = pti_user_pagetable_walk_p4d(ESPFIX_BASE_ADDR);
-        kernel_pgd = pgd_offset_k(ESPFIX_BASE_ADDR);
-        kernel_p4d = p4d_offset(kernel_pgd, ESPFIX_BASE_ADDR);
+        user_p4d = pti_user_pagetable_walk_p4d(addr);
+        kernel_pgd = pgd_offset_k(addr);
+        kernel_p4d = p4d_offset(kernel_pgd, addr);
         *user_p4d = *kernel_p4d;
+}
+
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+        pti_clone_p4d(ESPFIX_BASE_ADDR);
 #endif
 }
 
@@ -313,20 +317,11 @@ static void __init pti_setup_vsyscall(vo
 }
 
 /*
- * Clone the populated PMDs of the user shared fixmaps into the user space
- * visible page table.
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
  */
 static void __init pti_clone_user_shared(void)
 {
-        unsigned long bot, top;
-
-        bot = __fix_to_virt(FIX_USR_SHARED_BOTTOM);
-        top = __fix_to_virt(FIX_USR_SHARED_TOP) + PAGE_SIZE;
-
-        /* Top of the user shared block must be PMD-aligned. */
-        WARN_ON(top & ~PMD_MASK);
-
-        pti_clone_pmds(bot, top, 0);
+        pti_clone_p4d(CPU_ENTRY_AREA_BASE);
 }
 
 /*
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2261,7 +2261,6 @@ static void xen_set_fixmap(unsigned idx,
 
         switch (idx) {
         case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-        case FIX_RO_IDT:
 #ifdef CONFIG_X86_32
         case FIX_WP_TEST:
 # ifdef CONFIG_HIGHMEM
@@ -2272,7 +2271,6 @@ static void xen_set_fixmap(unsigned idx,
 #endif
         case FIX_TEXT_POKE0:
         case FIX_TEXT_POKE1:
-        case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
                 /* All local page mappings */
                 pte = pfn_pte(phys, prot);
                 break;
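
For completeness, the address math behind the new region can be checked
in isolation. A standalone user-space sketch, not part of the patch,
assuming 4-level paging where P4D_SHIFT == PGDIR_SHIFT == 39:

        #include <stdio.h>

        int main(void)
        {
                /* CPU_ENTRY_AREA_PGD is -3, the third PGD slot from the top */
                unsigned long base    = -3UL << 39;     /* CPU_ENTRY_AREA_BASE */
                /* page 0 of the region holds the RO IDT alias */
                unsigned long per_cpu = base + 4096;    /* CPU_ENTRY_AREA_PER_CPU */

                printf("CPU_ENTRY_AREA_BASE:    0x%lx\n", base);
                printf("CPU_ENTRY_AREA_PER_CPU: 0x%lx\n", per_cpu);
                return 0;
        }

The printed base is 0xfffffe8000000000, which matches the cpu_entry_area
line added to mm.txt above; moving LDT_PGD_ENTRY from -3 to -4 is what
frees up that slot.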