On 2016-07-28 at 18:29 Barret Rhoden <[email protected]> wrote:
> [email protected]:brho/akaros.git NMI
Fixed a bug that could cause us to ignore an NMI and not rerun the handler.

The following changes since commit 1c7ae8e546970102308614b13bc84e4a90ae255a:

  Change vmrunkernel to use getopt() (2016-07-26 18:35:26 -0400)

are available in the git repository at:

  [email protected]:brho/akaros.git NMI

for you to fetch changes up to 31f3a4156ea1b9e831463c1fceab5e5c7366892d:

  x86: Don't backtrace from trampoline asm (2016-07-29 17:43:08 -0400)

----------------------------------------------------------------
View this online at:
https://github.com/brho/akaros/compare/1c7ae8e54697...31f3a4156ea1
----------------------------------------------------------------

Here's the diff, if you're curious:

diff --git a/kern/arch/x86/trap.c b/kern/arch/x86/trap.c
index bddb336bfbc4..94fe699003e8 100644
--- a/kern/arch/x86/trap.c
+++ b/kern/arch/x86/trap.c
@@ -400,22 +400,36 @@ __nmi_bottom_half(struct hw_trapframe *hw_tf)
 	struct per_cpu_info *pcpui = &per_cpu_info[core_id()];
 
 	while (1) {
+		/* Signal that we're doing work.  A concurrent NMI will set this to
+		 * NMI_HANDLE_ANOTHER if we should continue, which we'll catch later. */
+		pcpui->nmi_status = NMI_IN_PROGRESS;
 		do_nmi_work(hw_tf);
-		if (ACCESS_ONCE(pcpui->nmi_status) == NMI_HANDLE_ANOTHER) {
-			pcpui->nmi_status = NMI_IN_PROGRESS;
-			continue;
-		}
-		/* We need to atomically set the status to NMI_NORMAL_OPN, leave this
-		 * stack, and return to the original context.  As soon as NORMAL_OPN is
-		 * set, we could get interrupted by an NMI that trashes this stack.
+		/* We need to check nmi_status to see if it is NMI_HANDLE_ANOTHER (if
+		 * so, run again), write NMI_NORMAL_OPN, leave this stack, and return to
+		 * the original context.  We need to do that in such a manner that an
+		 * NMI can come in at any time.  There are two concerns.
 		 *
-		 * The NMI handler will work together with the following function such
-		 * that if that race occurs while we're in the function, it'll fail and
-		 * return.  Then we'll just do_nmi_work and try again. */
-		extern void nmi_try_to_pop(struct hw_trapframe *, int *, int);
-
-		nmi_try_to_pop(hw_tf, &pcpui->nmi_status, NMI_NORMAL_OPN);
-		assert(ACCESS_ONCE(pcpui->nmi_status) != NMI_NORMAL_OPN);
+		 * First, we need to not "miss the signal" telling us to re-run the NMI
+		 * handler.  To do that, we'll do the actual checking in asm.  Being in
+		 * the asm code block is a signal to the real NMI handler that we need
+		 * to abort and do_nmi_work() again.
+		 *
+		 * Second, we need to atomically leave the stack and return.  By being
+		 * in asm, the NMI handler knows to just hack our PC to make us return,
+		 * instead of starting up a fresh __nmi_bottom_half().
+		 *
+		 * The NMI handler works together with the following function such that
+		 * if that race occurs while we're in the function, it'll fail and
+		 * return.  Then we'll just do_nmi_work() and try again. */
+		extern void nmi_try_to_pop(struct hw_trapframe *tf, int *status,
+		                           int old_val, int new_val);
+
+		nmi_try_to_pop(hw_tf, &pcpui->nmi_status, NMI_IN_PROGRESS,
+		               NMI_NORMAL_OPN);
+		/* Either we returned on our own, since we lost a race with nmi_status
+		 * and didn't write (status = ANOTHER), or we won the race, but an NMI
+		 * handler set the status to ANOTHER and restarted us. */
+		assert(pcpui->nmi_status != NMI_NORMAL_OPN);
 	}
 }
 
@@ -466,12 +480,16 @@ void handle_nmi(struct hw_trapframe *hw_tf)
 	if (pcpui->nmi_status == NMI_IN_PROGRESS) {
 		/* Force the handler to run again.  We don't need to worry about
 		 * concurrent access here.  We're running, they are not.  We cannot
-		 * 'PAUSE' since NMIs are fully blocked. */
+		 * 'PAUSE' since NMIs are fully blocked.
+		 *
+		 * The asm routine, for its part, does a compare-and-swap, so if we
+		 * happened to interrupt it before it wrote NMI_NORMAL_OPN, it'll
+		 * notice, abort, and not write the status. */
 		pcpui->nmi_status = NMI_HANDLE_ANOTHER;
 		return;
 	}
 	assert(pcpui->nmi_status == NMI_NORMAL_OPN);
-	pcpui->nmi_status = NMI_IN_PROGRESS;
+	pcpui->nmi_status = NMI_HANDLE_ANOTHER;
 	/* We could be interrupting an NMI that is trying to pop back to a normal
 	 * context.  We can tell by looking at its PC.  If it is within the popping
 	 * routine, then we interrupted it at this bad time.  We'll hack the TF such
diff --git a/kern/arch/x86/trapentry64.S b/kern/arch/x86/trapentry64.S
index dc5231f3996b..7c4175d9a92e 100644
--- a/kern/arch/x86/trapentry64.S
+++ b/kern/arch/x86/trapentry64.S
@@ -599,9 +599,14 @@ nmi_popal:
 .globl __nmi_pop_fail_start;
 .globl __nmi_pop_fail_end;
 
+# extern void nmi_try_to_pop(struct hw_trapframe *tf, int *status,
+#                            int old_val, int new_val);
+#
 # __nmi_bottom_half calls this to atomically pop a hw_tf (%rdi) and set
-# &pcpui->nmi_status (%rsi) to NMI_NORMAL_OPN (%edx).  (Careful, nmi_status is an
-# int, not a long.)
+# &pcpui->nmi_status (%rsi) with compare and swap to NMI_NORMAL_OPN (%ecx) given
+# that it was NMI_IN_PROGRESS (%edx)
+#
+# (Careful, nmi_status is an int, not a long.)
 #
 # If the real NMI handler interrupts us, it'll move us to the fail section of
 # the code.  That code is identical to 'ok', up until ok's final statement.
@@ -613,6 +618,12 @@ nmi_popal:
 .type nmi_try_to_pop, @function;
 nmi_try_to_pop:
 __nmi_pop_ok_start:
+	# careful only to use caller-saved or argument registers before saving
+	movl %edx, %eax			# load old_val into eax for the CAS
+	cmpxchgl %ecx, (%rsi)		# no need for LOCK, since an NMI would serialize
+	jz nmi_ok_cas_worked		# ZF = 1 on successful CAS
+	ret
+nmi_ok_cas_worked:
 	# save callee-saved regs (the pops below clobber them, and we might return)
 	pushq %rbp
 	pushq %rbx
@@ -620,7 +631,6 @@ __nmi_pop_ok_start:
 	pushq %r13
 	pushq %r14
 	pushq %r15
-	movl %edx, (%rsi)		# interruptible after this write
 	# We need to save the current rsp into the scratch space at the top of the
 	# stack.  This assumes we're within the top page of our stack, which should
 	# always be true.  Careful not to use rdi, which still has an argument.
@@ -680,6 +690,12 @@ __nmi_pop_ok_end:
 # iretq, other than 'ok' replaced with 'fail'.  In place of iretq, we undo the
 # entire operation.
 __nmi_pop_fail_start:
+	# careful only to use caller-saved or argument registers before saving
+	movl %edx, %eax			# load old_val into eax for the CAS
+	cmpxchgl %ecx, (%rsi)		# no need for LOCK, since an NMI would serialize
+	jz nmi_fail_cas_worked		# ZF = 1 on successful CAS
+	ret
+nmi_fail_cas_worked:
 	# save callee-saved regs (the pops below clobber them, and we might return)
 	pushq %rbp
 	pushq %rbx
@@ -687,7 +703,6 @@ __nmi_pop_fail_start:
 	pushq %r13
 	pushq %r14
 	pushq %r15
-	movl %edx, (%rsi)		# interruptible after this write
 	# We need to save the current rsp into the scratch space at the top of the
 	# stack.  This assumes we're within the top page of our stack, which should
 	# always be true.  Careful not to use rdi, which still has an argument.
