This patch addresses item #2215532 in the kvm bug tracker, but the issue
was eventually also visible with other Linux guests that use the NMI watchdog:

There is a subtle race in kvm-intel between a pending IRQ and an NMI
arriving shortly afterwards (e.g. from the watchdog). If the IRQ was injected
but the guest exited again during injection due to some page fault, the flag
interrupt.pending remained true. If now some NMI just happened to be
pending as well, that one overruled the IRQ and was re-injected instead
(which is OK!). But during the next run of vmx_complete_interrupts the
originally pending IRQ fell on the floor and was forgotten. That means
we dequeued some IRQ from the [A]PIC, but never delivered it,
effectively causing a stall of IRQ deliveries. You may guess that it
took me a while to understand this...

The patch below addresses the issue by turning interrupt.pending into a
three-state variable: NONE, QUEUED (but not currently injected), and
INJECTED. If we overwrite some IRQ injection with an NMI, the state gets
properly updated. Moreover, we only transition from INJECTED to NONE to
avoid losing IRQs.

To simplify review and maintenance, the patch aligns the decision
pattern in vmx_intr_assist with do_interrupt_requests.

Signed-off-by: Jan Kiszka <[EMAIL PROTECTED]>
---
 arch/x86/include/asm/kvm_host.h |    6 +++
 arch/x86/kvm/vmx.c              |   61 +++++++++++++++++++++++++++-------------
 arch/x86/kvm/x86.h              |    4 +-
 3 files changed, 49 insertions(+), 22 deletions(-)

Index: b/arch/x86/include/asm/kvm_host.h
===================================================================
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -301,7 +301,11 @@ struct kvm_vcpu_arch {
        } exception;
 
        struct kvm_queued_interrupt {
-               bool pending;
+               enum {
+                       KVMIRQ_NONE,
+                       KVMIRQ_QUEUED,
+                       KVMIRQ_INJECTED
+               } pending;
                u8 nr;
        } interrupt;
 
Index: b/arch/x86/kvm/vmx.c
===================================================================
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1037,7 +1037,7 @@ static int set_guest_debug(struct kvm_vc
 
 static int vmx_get_irq(struct kvm_vcpu *vcpu)
 {
-       if (!vcpu->arch.interrupt.pending)
+       if (vcpu->arch.interrupt.pending == KVMIRQ_NONE)
                return -1;
        return vcpu->arch.interrupt.nr;
 }
@@ -2487,9 +2487,16 @@ static void do_interrupt_requests(struct
        }
        if (vcpu->arch.nmi_injected) {
                vmx_inject_nmi(vcpu);
+               if (vcpu->arch.interrupt.pending == KVMIRQ_INJECTED)
+                       /*
+                        * Degrade pending state, we will properly reinject
+                        * after the NMI.
+                        */
+                       vcpu->arch.interrupt.pending = KVMIRQ_QUEUED;
                if (vcpu->arch.nmi_pending || kvm_run->request_nmi_window)
                        enable_nmi_window(vcpu);
-               else if (vcpu->arch.irq_summary
+               else if (vcpu->arch.interrupt.pending != KVMIRQ_NONE
+                        || vcpu->arch.irq_summary
                         || kvm_run->request_interrupt_window)
                        enable_irq_window(vcpu);
                return;
@@ -2498,14 +2505,18 @@ static void do_interrupt_requests(struct
                enable_nmi_window(vcpu);
 
        if (vcpu->arch.interrupt_window_open) {
-               if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
+               if (vcpu->arch.irq_summary &&
+                   vcpu->arch.interrupt.pending == KVMIRQ_NONE)
                        kvm_do_inject_irq(vcpu);
 
-               if (vcpu->arch.interrupt.pending)
+               if (vcpu->arch.interrupt.pending != KVMIRQ_NONE) {
+                       vcpu->arch.interrupt.pending = KVMIRQ_INJECTED;
                        vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+               }
        }
        if (!vcpu->arch.interrupt_window_open &&
-           (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
+           (vcpu->arch.irq_summary || kvm_run->request_interrupt_window
+            || vcpu->arch.interrupt.pending != KVMIRQ_NONE))
                enable_irq_window(vcpu);
 }
 
@@ -2624,7 +2635,8 @@ static int handle_exception(struct kvm_v
                cr2 = vmcs_readl(EXIT_QUALIFICATION);
                KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
                            (u32)((u64)cr2 >> 32), handler);
-               if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
+               if (vcpu->arch.interrupt.pending != KVMIRQ_NONE
+                   || vcpu->arch.exception.pending)
                        kvm_mmu_unprotect_page_virt(vcpu, cr2);
                return kvm_mmu_page_fault(vcpu, cr2, error_code);
        }
@@ -3244,7 +3256,8 @@ static void vmx_complete_interrupts(stru
                                        GUEST_INTR_STATE_NMI);
                else
                        vmx->vcpu.arch.nmi_injected = false;
-       }
+       } else if (vmx->vcpu.arch.interrupt.pending == KVMIRQ_INJECTED)
+               kvm_clear_interrupt_queue(&vmx->vcpu);
        kvm_clear_exception_queue(&vmx->vcpu);
        if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
@@ -3253,9 +3266,7 @@ static void vmx_complete_interrupts(stru
                } else
                        kvm_queue_exception(&vmx->vcpu, vector);
                vmx->idt_vectoring_info = 0;
-       }
-       kvm_clear_interrupt_queue(&vmx->vcpu);
-       if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
+       } else if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
                kvm_queue_interrupt(&vmx->vcpu, vector);
                vmx->idt_vectoring_info = 0;
        }
@@ -3278,22 +3289,34 @@ static void vmx_intr_assist(struct kvm_v
        }
        if (vcpu->arch.nmi_injected) {
                vmx_inject_nmi(vcpu);
+               if (vcpu->arch.interrupt.pending == KVMIRQ_INJECTED)
+                       /*
+                        * Degrade pending state, we will properly reinject
+                        * after the NMI.
+                        */
+                       vcpu->arch.interrupt.pending = KVMIRQ_QUEUED;
                if (vcpu->arch.nmi_pending)
                        enable_nmi_window(vcpu);
-               else if (kvm_cpu_has_interrupt(vcpu))
+               else if (vcpu->arch.interrupt.pending != KVMIRQ_NONE
+                        || kvm_cpu_has_interrupt(vcpu))
                        enable_irq_window(vcpu);
                return;
        }
-       if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
-               if (vcpu->arch.interrupt_window_open)
+       if (vcpu->arch.interrupt_window_open) {
+               if (vcpu->arch.interrupt.pending == KVMIRQ_NONE
+                   && kvm_cpu_has_interrupt(vcpu))
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-               else
-                       enable_irq_window(vcpu);
-       }
-       if (vcpu->arch.interrupt.pending) {
-               vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-               kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+
+               if (vcpu->arch.interrupt.pending != KVMIRQ_NONE) {
+                       vcpu->arch.interrupt.pending = KVMIRQ_INJECTED;
+                       vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+                       kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+               }
        }
+       if (!vcpu->arch.interrupt_window_open
+           && (vcpu->arch.interrupt.pending != KVMIRQ_NONE
+               || kvm_cpu_has_interrupt(vcpu)))
+               enable_irq_window(vcpu);
 }
 
 /*
Index: b/arch/x86/kvm/x86.h
===================================================================
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -10,13 +10,13 @@ static inline void kvm_clear_exception_q
 
 static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
 {
-       vcpu->arch.interrupt.pending = true;
+       vcpu->arch.interrupt.pending = KVMIRQ_QUEUED;
        vcpu->arch.interrupt.nr = vector;
 }
 
 static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.interrupt.pending = false;
+       vcpu->arch.interrupt.pending = KVMIRQ_NONE;
 }
 
 #endif

-- 
Siemens AG, Corporate Technology, CT SE 2 ES-OS
Corporate Competence Center Embedded Linux
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to