Author: jkoshy
Date: Tue Feb  3 09:01:45 2009
New Revision: 188065
URL: http://svn.freebsd.org/changeset/base/188065

Log:
  Improve robustness of NMI handling, for NMIs recognized in kernel
  mode.
  
  - Make the NMI handler run on its own stack (TSS_IST2).
  - Store the GSBASE value for each CPU just before the start of
    each NMI stack, permitting efficient retrieval using %rsp-relative
    addressing.
  - For NMIs taken from kernel mode, program MSR_GSBASE explicitly
    since one or both of MSR_GSBASE and MSR_KGSBASE can be potentially
    invalid.  The current contents of MSR_GSBASE are saved and restored
    at exit.
  - For NMIs handled from user mode, continue to use 'swapgs' to
    load the per-CPU GSBASE.
  
  Reviewed by:  jeff
  Debugging help:       jeff
  Tested by:    gnn, Artem Belevich <artemb at gmail dot com>

Modified:
  head/sys/amd64/amd64/exception.S
  head/sys/amd64/amd64/machdep.c
  head/sys/amd64/amd64/mp_machdep.c
  head/sys/amd64/include/intr_machdep.h

Modified: head/sys/amd64/amd64/exception.S
==============================================================================
--- head/sys/amd64/amd64/exception.S    Tue Feb  3 08:25:24 2009        
(r188064)
+++ head/sys/amd64/amd64/exception.S    Tue Feb  3 09:01:45 2009        
(r188065)
@@ -383,22 +383,24 @@ IDTVEC(fast_syscall32)
  * NMI handling is special.
  *
  * First, NMIs do not respect the state of the processor's RFLAGS.IF
- * bit and the NMI handler may be invoked at any time, including when
- * the processor is in a critical section with RFLAGS.IF == 0.  In
- * particular, this means that the processor's GS.base values could be
- * inconsistent on entry to the handler, and so we need to read
- * MSR_GSBASE to determine if a 'swapgs' is needed.  We use '%ebx', a
- * C-preserved register, to remember whether to swap GS back on the
- * exit path.
+ * bit.  The NMI handler may be entered at any time, including when
+ * the processor is in a critical section with RFLAGS.IF == 0.
+ * The processor's GS.base value could be invalid on entry to the
+ * handler.
  *
  * Second, the processor treats NMIs specially, blocking further NMIs
- * until an 'iretq' instruction is executed.  We therefore need to
- * execute the NMI handler with interrupts disabled to prevent a
- * nested interrupt from executing an 'iretq' instruction and
- * inadvertently taking the processor out of NMI mode.
+ * until an 'iretq' instruction is executed.  We thus need to execute
+ * the NMI handler with interrupts disabled, to prevent a nested interrupt
+ * from executing an 'iretq' instruction and inadvertently taking the
+ * processor out of NMI mode.
  *
- * Third, the NMI handler runs on its own stack (tss_ist1), shared
- * with the double fault handler.
+ * Third, the NMI handler runs on its own stack (tss_ist2). The canonical
+ * GS.base value for the processor is stored just above the bottom of its
+ * NMI stack.  For NMIs taken from kernel mode, the current value in
+ * the processor's GS.base is saved at entry to C-preserved register %r12,
+ * the canonical value for GS.base is then loaded into the processor, and
+ * the saved value is restored at exit time.  For NMIs taken from user mode,
+ * the cheaper 'SWAPGS' instructions are used for swapping GS.base.
  */
 
 IDTVEC(nmi)
@@ -423,12 +425,22 @@ IDTVEC(nmi)
        movq    %r15,TF_R15(%rsp)
        xorl    %ebx,%ebx
        testb   $SEL_RPL_MASK,TF_CS(%rsp)
-       jnz     nmi_needswapgs          /* we came from userland */
+       jnz     nmi_fromuserspace
+       /*
+        * We've interrupted the kernel.  Preserve GS.base in %r12.
+        */
        movl    $MSR_GSBASE,%ecx
        rdmsr
-       cmpl    $VM_MAXUSER_ADDRESS >> 32,%edx
-       jae     nmi_calltrap            /* GS.base holds a kernel VA */
-nmi_needswapgs:
+       movq    %rax,%r12
+       shlq    $32,%rdx
+       orq     %rdx,%r12
+       /* Retrieve and load the canonical value for GS.base. */
+       movq    TF_SIZE(%rsp),%rdx
+       movl    %edx,%eax
+       shrq    $32,%rdx
+       wrmsr
+       jmp     nmi_calltrap
+nmi_fromuserspace:
        incl    %ebx
        swapgs
 /* Note: this label is also used by ddb and gdb: */
@@ -439,14 +451,19 @@ nmi_calltrap:
        MEXITCOUNT
 #ifdef HWPMC_HOOKS
        /*
-        * Check if the current trap was from user mode and if so
-        * whether the current thread needs a user call chain to be
-        * captured. We are still in NMI mode at this point.
+        * Capture a userspace callchain if needed.
+        * 
+        * - Check if the current trap was from user mode.
+        * - Check if the current thread is valid.
+        * - Check if the thread requires a user call chain to be
+        *   captured.
+        *
+        * We are still in NMI mode at this point.
         */
-       testb   $SEL_RPL_MASK,TF_CS(%rsp)
-       jz      nocallchain
-       movq    PCPU(CURTHREAD),%rax    /* curthread present? */
-       orq     %rax,%rax
+       testl   %ebx,%ebx
+       jz      nocallchain     /* not from userspace */
+       movq    PCPU(CURTHREAD),%rax
+       orq     %rax,%rax       /* curthread present? */
        jz      nocallchain
        testl   $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
        jz      nocallchain
@@ -498,8 +515,18 @@ outofnmi:
 nocallchain:
 #endif
        testl   %ebx,%ebx
-       jz      nmi_restoreregs
+       jz      nmi_kernelexit
        swapgs
+       jmp     nmi_restoreregs
+nmi_kernelexit:        
+       /*
+        * Put back the preserved MSR_GSBASE value.
+        */
+       movl    $MSR_GSBASE,%ecx
+       movq    %r12,%rdx
+       movl    %edx,%eax
+       shrq    $32,%rdx
+       wrmsr
 nmi_restoreregs:
        movq    TF_RDI(%rsp),%rdi
        movq    TF_RSI(%rsp),%rsi

Modified: head/sys/amd64/amd64/machdep.c
==============================================================================
--- head/sys/amd64/amd64/machdep.c      Tue Feb  3 08:25:24 2009        
(r188064)
+++ head/sys/amd64/amd64/machdep.c      Tue Feb  3 09:01:45 2009        
(r188065)
@@ -809,6 +809,9 @@ struct gate_descriptor *idt = &idt0[0];     
 
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
 
+static char nmi0_stack[PAGE_SIZE] __aligned(16);
+CTASSERT(sizeof(struct nmi_pcpu) == 16);
+
 struct amd64tss common_tss[MAXCPU];
 
 /* software prototypes -- in more palatable form */
@@ -1291,6 +1294,7 @@ hammer_time(u_int64_t modulep, u_int64_t
        caddr_t kmdp;
        int gsel_tss, x;
        struct pcpu *pc;
+       struct nmi_pcpu *np;
        u_int64_t msr;
        char *env;
 
@@ -1365,7 +1369,7 @@ hammer_time(u_int64_t modulep, u_int64_t
                setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
-       setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 1);
+       setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
        setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
        setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
@@ -1438,6 +1442,14 @@ hammer_time(u_int64_t modulep, u_int64_t
        /* doublefault stack space, runs on ist1 */
        common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
 
+       /*
+        * NMI stack, runs on ist2.  The pcpu pointer is stored just
+        * above the start of the ist2 stack.
+        */
+       np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
+       np->np_pcpu = (register_t) pc;
+       common_tss[0].tss_ist2 = (long) np;
+
        /* Set the IO permission bitmap (empty due to tss seg limit) */
        common_tss[0].tss_iobase = sizeof(struct amd64tss);
 

Modified: head/sys/amd64/amd64/mp_machdep.c
==============================================================================
--- head/sys/amd64/amd64/mp_machdep.c   Tue Feb  3 08:25:24 2009        
(r188064)
+++ head/sys/amd64/amd64/mp_machdep.c   Tue Feb  3 09:01:45 2009        
(r188065)
@@ -92,6 +92,7 @@ void *bootstacks[MAXCPU];
 
 /* Temporary holder for double fault stack */
 char *doublefault_stack;
+char *nmi_stack;
 
 /* Hotwire a 0->4MB V==P mapping */
 extern pt_entry_t *KPTphys;
@@ -437,6 +438,7 @@ void
 init_secondary(void)
 {
        struct pcpu *pc;
+       struct nmi_pcpu *np;
        u_int64_t msr, cr0;
        int cpu, gsel_tss, x;
        struct region_descriptor ap_gdt;
@@ -450,6 +452,10 @@ init_secondary(void)
        common_tss[cpu].tss_iobase = sizeof(struct amd64tss);
        common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];
 
+       /* The NMI stack runs on IST2. */
+       np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
+       common_tss[cpu].tss_ist2 = (long) np;
+
        /* Prepare private GDT */
        gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
        ssdtosyssd(&gdt_segs[GPROC0_SEL],
@@ -474,6 +480,9 @@ init_secondary(void)
        pc->pc_rsp0 = 0;
        pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL];
 
+       /* Save the per-cpu pointer for use by the NMI handler. */
+       np->np_pcpu = (register_t) pc;
+
        wrmsr(MSR_FSBASE, 0);           /* User value */
        wrmsr(MSR_GSBASE, (u_int64_t)pc);
        wrmsr(MSR_KGSBASE, (u_int64_t)pc);      /* XXX User value while we're 
in the kernel */
@@ -725,6 +734,7 @@ start_all_aps(void)
                /* allocate and set up an idle stack data page */
                bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * 
PAGE_SIZE);
                doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
+               nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE);
 
                bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 
8;
                bootAP = cpu;

Modified: head/sys/amd64/include/intr_machdep.h
==============================================================================
--- head/sys/amd64/include/intr_machdep.h       Tue Feb  3 08:25:24 2009        
(r188064)
+++ head/sys/amd64/include/intr_machdep.h       Tue Feb  3 09:01:45 2009        
(r188065)
@@ -120,6 +120,15 @@ struct intsrc {
 
 struct trapframe;
 
+/*
+ * The following data structure holds per-cpu data, and is placed just
+ * above the top of the space used for the NMI stack.
+ */
+struct nmi_pcpu {
+       register_t      np_pcpu;
+       register_t      __padding;      /* pad to 16 bytes */
+};
+
 extern struct mtx icu_lock;
 extern int elcr_found;
 
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to