On Tue, Jul 02, 2019 at 10:18:27PM +0200, Peter Zijlstra wrote:

> I think; lemme re-read that thread.

*completely* untested; it hasn't even been near a compiler yet.

But it now includes 32-bit support and should be more or less complete.

I removed the most horrendous (rbx control flow) hacks from idtentry,
although none of this is winning any prizes. Cleaning up idtentry is
for later; otherwise we'll get side-tracked into that again and leave
this bug lingering for a long while yet.

XXX write proper changelog, after testing.
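
The gist, pending that changelog: do_page_fault() used to fetch the
faulting address itself via read_cr2(), which only runs after the entry
code has already done TRACE_IRQS_OFF and enter_from_user_mode(); if
anything in there faults (tracepoints etc.), CR2 is silently
overwritten. So the entry code now reads CR2 first, before anything
else can fault, and hands the address to the handler as a third
argument. A standalone userspace model of the failure mode and the fix
(hypothetical names, not kernel code):

	#include <stdio.h>

	static unsigned long fake_cr2;		/* stands in for %cr2 */

	static void hooks(void)			/* tracing, context tracking, ... */
	{
		fake_cr2 = 0xdead;		/* a nested fault rewrites CR2 */
	}

	/* old: the handler reads CR2 itself, after the hooks already ran */
	static void handle_fault_old(void)
	{
		hooks();
		printf("old: fault at %#lx\n", fake_cr2);	/* 0xdead */
	}

	/* new: the entry code read CR2 first and passed it down */
	static void handle_fault_new(unsigned long address)
	{
		hooks();
		printf("new: fault at %#lx\n", address);	/* 0x1000 */
	}

	int main(void)
	{
		fake_cr2 = 0x1000;		/* the real faulting address */
		handle_fault_old();

		fake_cr2 = 0x1000;
		handle_fault_new(fake_cr2);	/* "entry asm" reads CR2 up front */
		return 0;
	}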

---
 arch/x86/entry/entry_32.S             |   60 ++++++++++++++++++++++++++++++++++--------------------------
 arch/x86/entry/entry_64.S             |   28 ++++++++++++++--------------
 arch/x86/include/asm/kvm_para.h       |    2 +-
 arch/x86/include/asm/paravirt.h       |   22 +++++++++++++---------
 arch/x86/include/asm/paravirt_types.h |    2 +-
 arch/x86/include/asm/traps.h          |    2 +-
 arch/x86/kernel/asm-offsets.c         |    1 +
 arch/x86/kernel/head_64.S             |    4 +---
 arch/x86/kernel/kvm.c                 |    8 ++++----
 arch/x86/kernel/paravirt.c            |    2 +-
 arch/x86/mm/fault.c                   |    3 +--
 arch/x86/xen/enlighten_pv.c           |    3 ++-
 arch/x86/xen/mmu_pv.c                 |   12 +-----------
 arch/x86/xen/xen-asm.S                |   25 +++++++++++++++++++++++++
 arch/x86/xen/xen-ops.h                |    3 +++
 15 files changed, 103 insertions(+), 74 deletions(-)
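
One note on the shape of the paravirt bits below: mmu.read_cr2 turns
into a struct paravirt_callee_save slot, so the thunk behind it must
preserve everything except %[re]ax; that is what makes GET_CR2_INTO_AX
usable from entry asm with all other registers live (hence the "might
clobber %eax" comments), and it is why every assignment grows a
__PV_IS_CALLEE_SAVE() wrapper. Schematically, as a simplified model
rather than the real paravirt headers:

	/* simplified model of the callee-save slot, not the real headers */
	struct paravirt_callee_save {
		void *func;
	};

	#define __PV_IS_CALLEE_SAVE(f) \
		((struct paravirt_callee_save) { (void *)(f) })

	struct mmu_ops_model {
		/* was: unsigned long (*read_cr2)(void); */
		struct paravirt_callee_save read_cr2;
	};

	static unsigned long model_read_cr2(void)
	{
		return 0;			/* think "mov %cr2, %rax" */
	}

	static struct mmu_ops_model mmu_ops_model = {
		.read_cr2 = __PV_IS_CALLEE_SAVE(model_read_cr2),
	};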

--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -294,9 +294,11 @@
 .Lfinished_frame_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0
        cld
+.if \skip_gs == 0
        PUSH_GS
+.endif
        FIXUP_FRAME
        pushl   %fs
        pushl   %es
@@ -313,13 +315,13 @@
        movl    %edx, %es
        movl    $(__KERNEL_PERCPU), %edx
        movl    %edx, %fs
+.if \skip_gs == 0
        SET_KERNEL_GS %edx
-
+.endif
        /* Switch to kernel stack if necessary */
 .if \switch_stacks > 0
        SWITCH_TO_KERNEL_STACK
 .endif
-
 .endm
 
 .macro SAVE_ALL_NMI cr3_reg:req
@@ -1441,39 +1443,45 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vec
 
 ENTRY(page_fault)
        ASM_CLAC
-       pushl   $do_page_fault
-       ALIGN
-       jmp common_exception
+       pushl   $0; /* %gs's slot on the stack */
+
+       SAVE_ALL switch_stacks=1 skip_gs=1
+
+       ENCODE_FRAME_POINTER
+       UNWIND_ESPFIX_STACK
+
+       /* fixup %gs */
+       GS_TO_REG %ecx
+       REG_TO_PTGS %ecx
+       SET_KERNEL_GS %ecx
+
+       GET_CR2_INTO(%ecx)                      # might clobber %eax
+
+       movl    PT_ORIG_EAX(%esp), %edx         # get the error code
+       movl    $-1, PT_ORIG_EAX(%esp)          # no syscall to restart
+
+       TRACE_IRQS_OFF
+       movl    %esp, %eax                      # pt_regs pointer
+       call    do_page_fault
+       jmp     ret_from_exception
 END(page_fault)
 
 common_exception:
        /* the function address is in %gs's slot on the stack */
-       FIXUP_FRAME
-       pushl   %fs
-       pushl   %es
-       pushl   %ds
-       pushl   %eax
-       movl    $(__USER_DS), %eax
-       movl    %eax, %ds
-       movl    %eax, %es
-       movl    $(__KERNEL_PERCPU), %eax
-       movl    %eax, %fs
-       pushl   %ebp
-       pushl   %edi
-       pushl   %esi
-       pushl   %edx
-       pushl   %ecx
-       pushl   %ebx
-       SWITCH_TO_KERNEL_STACK
+       SAVE_ALL switch_stacks=1 skip_gs=1
        ENCODE_FRAME_POINTER
-       cld
        UNWIND_ESPFIX_STACK
+
+       /* fixup %gs */
        GS_TO_REG %ecx
        movl    PT_GS(%esp), %edi               # get the function address
-       movl    PT_ORIG_EAX(%esp), %edx         # get the error code
-       movl    $-1, PT_ORIG_EAX(%esp)          # no syscall to restart
        REG_TO_PTGS %ecx
        SET_KERNEL_GS %ecx
+
+       /* fixup orig %eax */
+       movl    PT_ORIG_EAX(%esp), %edx         # get the error code
+       movl    $-1, PT_ORIG_EAX(%esp)          # no syscall to restart
+
        TRACE_IRQS_OFF
        movl    %esp, %eax                      # pt_regs pointer
        CALL_NOSPEC %edi
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -901,7 +901,7 @@ apicinterrupt IRQ_WORK_VECTOR                       irq_work
  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
  */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
 ENTRY(\sym)
        UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -937,18 +937,27 @@ ENTRY(\sym)
 
        .if \paranoid
        call    paranoid_entry
+       /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
        .else
        call    error_entry
        .endif
        UNWIND_HINT_REGS
-       /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 
-       .if \paranoid
+       .if \read_cr2
+       GET_CR2_INTO(%rdx);                     /* can clobber %rax */
+       .endif
+
        .if \shift_ist != -1
        TRACE_IRQS_OFF_DEBUG                    /* reload IDT in case of recursion */
        .else
        TRACE_IRQS_OFF
        .endif
+
+       .if \paranoid == 0
+       testb   $3, CS(%rsp)
+       jz      .Lfrom_kernel_no_context_tracking_\@
+       CALL_enter_from_user_mode
+.Lfrom_kernel_no_context_tracking_\@:
        .endif
 
        movq    %rsp, %rdi                      /* pt_regs pointer */
@@ -1180,10 +1189,10 @@ idtentry xenint3                do_int3                 has_error_co
 #endif
 
 idtentry general_protection    do_general_protection   has_error_code=1
-idtentry page_fault            do_page_fault           has_error_code=1
+idtentry page_fault            do_page_fault           has_error_code=1        read_cr2=1
 
 #ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault      do_async_page_fault     has_error_code=1
+idtentry async_page_fault      do_async_page_fault     has_error_code=1        read_cr2=1
 #endif
 
 #ifdef CONFIG_X86_MCE
@@ -1338,18 +1347,9 @@ ENTRY(error_entry)
        movq    %rax, %rsp                      /* switch stack */
        ENCODE_FRAME_POINTER
        pushq   %r12
-
-       /*
-        * We need to tell lockdep that IRQs are off.  We can't do this until
-        * we fix gsbase, and we should do it before enter_from_user_mode
-        * (which can take locks).
-        */
-       TRACE_IRQS_OFF
-       CALL_enter_from_user_mode
        ret
 
 .Lerror_entry_done:
-       TRACE_IRQS_OFF
        ret
 
        /*
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -92,7 +92,7 @@ void kvm_async_pf_task_wait(u32 token, i
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code);
+void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -116,7 +116,7 @@ static inline void write_cr0(unsigned lo
 
 static inline unsigned long read_cr2(void)
 {
-       return PVOP_CALL0(unsigned long, mmu.read_cr2);
+       return PVOP_CALLEE0(unsigned long, mmu.read_cr2);
 }
 
 static inline void write_cr2(unsigned long x)
@@ -909,13 +909,7 @@ extern void default_banner(void);
                  ANNOTATE_RETPOLINE_SAFE;                              \
                  call PARA_INDIRECT(pv_ops+PV_CPU_swapgs);             \
                 )
-#endif
-
-#define GET_CR2_INTO_RAX                               \
-       ANNOTATE_RETPOLINE_SAFE;                                \
-       call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);
 
-#ifdef CONFIG_PARAVIRT_XXL
 #define USERGS_SYSRET64                                                        \
        PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64),                   \
                  ANNOTATE_RETPOLINE_SAFE;                              \
@@ -929,9 +923,19 @@ extern void default_banner(void);
                  call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);        \
                  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 #endif
-#endif
+#endif /* CONFIG_PARAVIRT_XXL */
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_PARAVIRT_XXL
+
+#define GET_CR2_INTO_AX                                                        \
+       PARA_SITE(PARA_PATCH(PV_MMU_read_cr2),                          \
+                 ANNOTATE_RETPOLINE_SAFE;                              \
+                 call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);           \
+                )
+
+#endif /* CONFIG_PARAVIRT_XXL */
 
-#endif /* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
 #else  /* CONFIG_PARAVIRT */
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -220,7 +220,7 @@ struct pv_mmu_ops {
        void (*exit_mmap)(struct mm_struct *mm);
 
 #ifdef CONFIG_PARAVIRT_XXL
-       unsigned long (*read_cr2)(void);
+       struct paravirt_callee_save read_cr2;
        void (*write_cr2)(unsigned long);
 
        unsigned long (*read_cr3)(void);
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -81,7 +81,7 @@ struct bad_iret_stack *fixup_bad_iret(st
 void __init trap_init(void);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code);
-dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code);
+dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code);
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code);
 dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code);
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -77,6 +77,7 @@ static void __used common(void)
        BLANK();
        OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
        OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+       OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2);
 #endif
 
        BLANK();
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -29,9 +29,7 @@
 #ifdef CONFIG_PARAVIRT_XXL
 #include <asm/asm-offsets.h>
 #include <asm/paravirt.h>
-#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
 #else
-#define GET_CR2_INTO(reg) movq %cr2, reg
 #define INTERRUPT_RETURN iretq
 #endif
 
@@ -323,7 +321,7 @@ END(early_idt_handler_array)
 
        cmpq $14,%rsi           /* Page fault? */
        jnz 10f
-       GET_CR2_INTO(%rdi)      /* Can clobber any volatile register if pv */
+       GET_CR2_INTO(%rdi)      /* can clobber %rax if pv */
        call early_make_pgtable
        andl %eax,%eax
        jz 20f                  /* All good */
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -242,23 +242,23 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_
 NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
 dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
        enum ctx_state prev_state;
 
        switch (kvm_read_and_reset_pf_reason()) {
        default:
-               do_page_fault(regs, error_code);
+               do_page_fault(regs, error_code, address);
                break;
        case KVM_PV_REASON_PAGE_NOT_PRESENT:
                /* page is swapped out by the host. */
                prev_state = exception_enter();
-               kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
+               kvm_async_pf_task_wait((u32)address, !user_mode(regs));
                exception_exit(prev_state);
                break;
        case KVM_PV_REASON_PAGE_READY:
                rcu_irq_enter();
-               kvm_async_pf_task_wake((u32)read_cr2());
+               kvm_async_pf_task_wake((u32)address);
                rcu_irq_exit();
                break;
        }
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -370,7 +370,7 @@ struct paravirt_patch_template pv_ops =
        .mmu.exit_mmap          = paravirt_nop,
 
 #ifdef CONFIG_PARAVIRT_XXL
-       .mmu.read_cr2           = native_read_cr2,
+       .mmu.read_cr2           = __PV_IS_CALLEE_SAVE(native_read_cr2),
        .mmu.write_cr2          = native_write_cr2,
        .mmu.read_cr3           = __native_read_cr3,
        .mmu.write_cr3          = native_write_cr3,
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1548,9 +1548,8 @@ trace_page_fault_entries(unsigned long a
  * exception_{enter,exit}() contains all sorts of tracepoints.
  */
 dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
-       unsigned long address = read_cr2(); /* Get the faulting address */
        enum ctx_state prev_state;
 
        prev_state = exception_enter();
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -998,7 +998,8 @@ void __init xen_setup_vcpu_info_placemen
                        __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
                pv_ops.irq.irq_enable =
                        __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
-               pv_ops.mmu.read_cr2 = xen_read_cr2_direct;
+               pv_ops.mmu.read_cr2 =
+                       __PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
        }
 }
 
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1307,16 +1307,6 @@ static void xen_write_cr2(unsigned long
        this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
 }
 
-static unsigned long xen_read_cr2(void)
-{
-       return this_cpu_read(xen_vcpu)->arch.cr2;
-}
-
-unsigned long xen_read_cr2_direct(void)
-{
-       return this_cpu_read(xen_vcpu_info.arch.cr2);
-}
-
 static noinline void xen_flush_tlb(void)
 {
        struct mmuext_op *op;
@@ -2397,7 +2387,7 @@ static void xen_leave_lazy_mmu(void)
 }
 
 static const struct pv_mmu_ops xen_mmu_ops __initconst = {
-       .read_cr2 = xen_read_cr2,
+       .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
        .write_cr2 = xen_write_cr2,
 
        .read_cr3 = xen_read_cr3,
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -135,3 +135,28 @@ ENTRY(check_events)
        FRAME_END
        ret
 ENDPROC(check_events)
+
+ENTRY(xen_read_cr2)
+       FRAME_BEGIN
+#ifdef CONFIG_X86_64
+       movq    PER_CPU_VAR(xen_vcpu), %rax
+       movq    XEN_vcpu_info_arch_cr2(%rax), %rax
+#else
+       movl    PER_CPU_VAR(xen_vcpu), %eax
+       movl    XEN_vcpu_info_arch_cr2(%eax), %eax
+#endif
+       FRAME_END
+       ret
+ENDPROC(xen_read_cr2)
+
+ENTRY(xen_read_cr2_direct)
+       FRAME_BEGIN
+#ifdef CONFIG_X86_64
+       movq    PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %rax
+#else
+       movl    PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %eax
+#endif
+       FRAME_END
+       ret
+ENDPROC(xen_read_cr2_direct)
+
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -134,6 +134,9 @@ __visible void xen_irq_disable_direct(vo
 __visible unsigned long xen_save_fl_direct(void);
 __visible void xen_restore_fl_direct(unsigned long);
 
+__visible unsigned long xen_read_cr2(void);
+__visible unsigned long xen_read_cr2_direct(void);
+
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
 __visible void xen_sysret32(void);
