Signed-off-by: Maxim Levitsky <mlevi...@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  23 +-
 arch/x86/include/uapi/asm/kvm.h |  14 +-
 arch/x86/kvm/svm/nested.c       |  62 +++---
 arch/x86/kvm/svm/svm.c          |   8 +-
 arch/x86/kvm/vmx/nested.c       | 114 +++++-----
 arch/x86/kvm/vmx/vmx.c          |  14 +-
 arch/x86/kvm/x86.c              | 370 +++++++++++++++++++-------------
 arch/x86/kvm/x86.h              |   6 +-
 include/uapi/linux/kvm.h        |   1 +
 9 files changed, 367 insertions(+), 245 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4aa48fb55361d..190e245aa6670 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -637,16 +637,22 @@ struct kvm_vcpu_arch {
 
        u8 event_exit_inst_len;
 
-       struct kvm_queued_exception {
-               bool pending;
-               bool injected;
+       struct kvm_pending_exception {
+               bool valid;
                bool has_error_code;
                u8 nr;
                u32 error_code;
                unsigned long payload;
                bool has_payload;
                u8 nested_apf;
-       } exception;
+       } pending_exception;
+
+       struct kvm_queued_exception {
+               bool valid;
+               bool has_error_code;
+               u8 nr;
+               u32 error_code;
+       } injected_exception;
 
        struct kvm_queued_interrupt {
                bool injected;
@@ -1018,6 +1024,7 @@ struct kvm_arch {
 
        bool guest_can_read_msr_platform_info;
        bool exception_payload_enabled;
+       bool exception_separate_injected_pending;
 
        /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
        u32 user_space_msr_mask;
@@ -1351,6 +1358,14 @@ struct kvm_x86_ops {
 
 struct kvm_x86_nested_ops {
        int (*check_events)(struct kvm_vcpu *vcpu);
+
+       /*
+        * Return value: 0 - VM exit delivered, 1 - exception not intercepted,
+        * negative - failure.
+        */
+
+       int (*deliver_exception)(struct kvm_vcpu *vcpu);
+
        bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
        int (*get_state)(struct kvm_vcpu *vcpu,
                         struct kvm_nested_state __user *user_kvm_nested_state,
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5a3022c8af82b..9556e420e8ecb 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -345,9 +345,17 @@ struct kvm_vcpu_events {
                __u8 smm_inside_nmi;
                __u8 latched_init;
        } smi;
-       __u8 reserved[27];
-       __u8 exception_has_payload;
-       __u64 exception_payload;
+
+       __u8 reserved[20];
+
+       struct {
+               __u32 error_code;
+               __u8 nr;
+               __u8 pad;
+               __u8 has_error_code;
+               __u8 has_payload;
+               __u64 payload;
+       } pending_exception;
 };
 
 /* for KVM_GET/SET_DEBUGREGS */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 4c82abce0ea0c..9df01b6e2e091 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -315,15 +315,16 @@ static void nested_save_pending_event_to_vmcb12(struct 
vcpu_svm *svm,
        u32 exit_int_info = 0;
        unsigned int nr;
 
-       if (vcpu->arch.exception.injected) {
-               nr = vcpu->arch.exception.nr;
+       if (vcpu->arch.injected_exception.valid) {
+               nr = vcpu->arch.injected_exception.nr;
                exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
 
-               if (vcpu->arch.exception.has_error_code) {
+               if (vcpu->arch.injected_exception.has_error_code) {
                        exit_int_info |= SVM_EVTINJ_VALID_ERR;
                        vmcb12->control.exit_int_info_err =
-                               vcpu->arch.exception.error_code;
+                               vcpu->arch.injected_exception.error_code;
                }
+               vcpu->arch.injected_exception.valid = false;
 
        } else if (vcpu->arch.nmi_injected) {
                exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
@@ -923,30 +924,30 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
 
 static bool nested_exit_on_exception(struct vcpu_svm *svm)
 {
-       unsigned int nr = svm->vcpu.arch.exception.nr;
+       unsigned int nr = svm->vcpu.arch.pending_exception.nr;
 
        return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr));
 }
 
 static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
 {
-       unsigned int nr = svm->vcpu.arch.exception.nr;
+       unsigned int nr = svm->vcpu.arch.pending_exception.nr;
 
        svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
        svm->vmcb->control.exit_code_hi = 0;
 
-       if (svm->vcpu.arch.exception.has_error_code)
-               svm->vmcb->control.exit_info_1 = 
svm->vcpu.arch.exception.error_code;
+       if (svm->vcpu.arch.pending_exception.has_error_code)
+               svm->vmcb->control.exit_info_1 = 
svm->vcpu.arch.pending_exception.error_code;
 
        /*
         * EXITINFO2 is undefined for all exception intercepts other
         * than #PF.
         */
        if (nr == PF_VECTOR) {
-               if (svm->vcpu.arch.exception.nested_apf)
+               if (svm->vcpu.arch.pending_exception.nested_apf)
                        svm->vmcb->control.exit_info_2 = 
svm->vcpu.arch.apf.nested_apf_token;
-               else if (svm->vcpu.arch.exception.has_payload)
-                       svm->vmcb->control.exit_info_2 = 
svm->vcpu.arch.exception.payload;
+               else if (svm->vcpu.arch.pending_exception.has_payload)
+                       svm->vmcb->control.exit_info_2 = 
svm->vcpu.arch.pending_exception.payload;
                else
                        svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
        } else if (nr == DB_VECTOR) {
@@ -957,7 +958,7 @@ static void nested_svm_inject_exception_vmexit(struct 
vcpu_svm *svm)
                        kvm_update_dr7(&svm->vcpu);
                }
        } else
-               WARN_ON(svm->vcpu.arch.exception.has_payload);
+               WARN_ON(svm->vcpu.arch.pending_exception.has_payload);
 
        nested_svm_vmexit(svm);
 }
@@ -1023,20 +1024,6 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       if (vcpu->arch.exception.pending) {
-               /*
-                * Only pending nested run can block an pending exception
-                * Otherwise an injected NMI/interrupt should either be
-                * lost or delivered to the nested hypervisor in EXITINTINFO
-                * */
-               if (svm->nested.nested_run_pending)
-                        return -EBUSY;
-               if (!nested_exit_on_exception(svm))
-                       return 0;
-               nested_svm_inject_exception_vmexit(svm);
-               return 0;
-       }
-
        if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
                if (block_nested_events)
                        return -EBUSY;
@@ -1063,7 +1050,29 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
                nested_svm_intr(svm);
                return 0;
        }
+       return 0;
+}
+
+/*
+ * Try to turn the pending exception into a nested VM exit.  Only a pending
+ * exception is checked: an injected one either already started delivery or
+ * came from the nested EVENTINJ, which doesn't check intercepts.
+ * Returns 0 - VM exit delivered, 1 - no VM exit, -EBUSY - nested run pending.
+ */
+static int svm_deliver_nested_exception(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (!vcpu->arch.pending_exception.valid)
+               return 1;
+
+       if (svm->nested.nested_run_pending)
+               return -EBUSY;
+
+       if (!nested_exit_on_exception(svm))
+               return 1;
 
+       nested_svm_inject_exception_vmexit(svm);
        return 0;
 }
 
@@ -1302,6 +1311,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu 
*vcpu)
 
 struct kvm_x86_nested_ops svm_nested_ops = {
        .check_events = svm_check_nested_events,
+       .deliver_exception = svm_deliver_nested_exception,
        .get_nested_state_pages = svm_get_nested_state_pages,
        .get_state = svm_get_nested_state,
        .set_state = svm_set_nested_state,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index cdbbda37b9419..0a1857f5fe55e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -363,11 +363,11 @@ static int skip_emulated_instruction(struct kvm_vcpu 
*vcpu)
 static void svm_queue_exception(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       unsigned nr = vcpu->arch.exception.nr;
-       bool has_error_code = vcpu->arch.exception.has_error_code;
-       u32 error_code = vcpu->arch.exception.error_code;
+       unsigned nr = vcpu->arch.injected_exception.nr;
+       bool has_error_code = vcpu->arch.injected_exception.has_error_code;
+       u32 error_code = vcpu->arch.injected_exception.error_code;
 
-       kvm_deliver_exception_payload(vcpu);
+       WARN_ON(vcpu->arch.pending_exception.valid);
 
        if (nr == BP_VECTOR && !nrips) {
                unsigned long rip, old_rip = kvm_rip_read(vcpu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 20ed1a351b2d9..be9c4e449aafd 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -388,17 +388,19 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 
*vmcs12,
 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long 
*exit_qual)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       unsigned int nr = vcpu->arch.exception.nr;
-       bool has_payload = vcpu->arch.exception.has_payload;
-       unsigned long payload = vcpu->arch.exception.payload;
+       unsigned int nr = vcpu->arch.pending_exception.nr;
+       bool has_payload = vcpu->arch.pending_exception.has_payload;
+       unsigned long payload = vcpu->arch.pending_exception.payload;
+
+       /* injected exception doesn't need checking here */
 
        if (nr == PF_VECTOR) {
-               if (vcpu->arch.exception.nested_apf) {
+               if (vcpu->arch.pending_exception.nested_apf) {
                        *exit_qual = vcpu->arch.apf.nested_apf_token;
                        return 1;
                }
                if (nested_vmx_is_page_fault_vmexit(vmcs12,
-                                                   
vcpu->arch.exception.error_code)) {
+                                                   
vcpu->arch.pending_exception.error_code)) {
                        *exit_qual = has_payload ? payload : vcpu->arch.cr2;
                        return 1;
                }
@@ -3621,8 +3623,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu 
*vcpu,
        u32 idt_vectoring;
        unsigned int nr;
 
-       if (vcpu->arch.exception.injected) {
-               nr = vcpu->arch.exception.nr;
+       if (vcpu->arch.injected_exception.valid) {
+               nr = vcpu->arch.injected_exception.nr;
                idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
 
                if (kvm_exception_is_soft(nr)) {
@@ -3632,10 +3634,10 @@ static void vmcs12_save_pending_event(struct kvm_vcpu 
*vcpu,
                } else
                        idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
 
-               if (vcpu->arch.exception.has_error_code) {
+               if (vcpu->arch.injected_exception.has_error_code) {
                        idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
                        vmcs12->idt_vectoring_error_code =
-                               vcpu->arch.exception.error_code;
+                               vcpu->arch.injected_exception.error_code;
                }
 
                vmcs12->idt_vectoring_info_field = idt_vectoring;
@@ -3716,11 +3718,11 @@ static void nested_vmx_inject_exception_vmexit(struct 
kvm_vcpu *vcpu,
                                               unsigned long exit_qual)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       unsigned int nr = vcpu->arch.exception.nr;
+       unsigned int nr = vcpu->arch.pending_exception.nr;
        u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
-       if (vcpu->arch.exception.has_error_code) {
-               vmcs12->vm_exit_intr_error_code = 
vcpu->arch.exception.error_code;
+       if (vcpu->arch.pending_exception.has_error_code) {
+               vmcs12->vm_exit_intr_error_code = 
vcpu->arch.pending_exception.error_code;
                intr_info |= INTR_INFO_DELIVER_CODE_MASK;
        }
 
@@ -3744,9 +3746,9 @@ static void nested_vmx_inject_exception_vmexit(struct 
kvm_vcpu *vcpu,
  */
 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
 {
-       return vcpu->arch.exception.pending &&
-                       vcpu->arch.exception.nr == DB_VECTOR &&
-                       vcpu->arch.exception.payload;
+       return vcpu->arch.pending_exception.valid &&
+                       vcpu->arch.pending_exception.nr == DB_VECTOR &&
+                       vcpu->arch.pending_exception.payload;
 }
 
 /*
@@ -3760,7 +3762,7 @@ static void nested_vmx_update_pending_dbg(struct kvm_vcpu 
*vcpu)
 {
        if (vmx_pending_dbg_trap(vcpu))
                vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
-                           vcpu->arch.exception.payload);
+                           vcpu->arch.pending_exception.payload);
 }
 
 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
@@ -3772,10 +3774,8 @@ static bool nested_vmx_preemption_timer_pending(struct 
kvm_vcpu *vcpu)
 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned long exit_qual;
        bool block_nested_events =
            vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
-       bool mtf_pending = vmx->nested.mtf_pending;
        struct kvm_lapic *apic = vcpu->arch.apic;
 
        /*
@@ -3808,39 +3808,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       /*
-        * Process any exceptions that are not debug traps before MTF.
-        *
-        * Note that only pending nested run can block an pending exception
-        * Otherwise an injected NMI/interrupt should either be
-        * lost or delivered to the nested hypervisor in EXITINTINFO
-        */
-
-       if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
-               if (vmx->nested.nested_run_pending)
-                       return -EBUSY;
-               if (!nested_vmx_check_exception(vcpu, &exit_qual))
-                       goto no_vmexit;
-               nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-               return 0;
-       }
-
-       if (mtf_pending) {
-               if (block_nested_events)
-                       return -EBUSY;
-               nested_vmx_update_pending_dbg(vcpu);
-               nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
-               return 0;
-       }
-
-       if (vcpu->arch.exception.pending) {
-               if (vmx->nested.nested_run_pending)
-                       return -EBUSY;
-               if (!nested_vmx_check_exception(vcpu, &exit_qual))
-                       goto no_vmexit;
-               nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-               return 0;
-       }
 
        if (nested_vmx_preemption_timer_pending(vcpu)) {
                if (block_nested_events)
@@ -3887,6 +3854,50 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
        return 0;
 }
 
+static int nested_vmx_deliver_nested_exception(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long exit_qual;
+
+       /* Returns 0 - VM exit delivered, 1 - no VM exit, -EBUSY - blocked */
+       if (!vcpu->arch.pending_exception.valid && !vmx->nested.mtf_pending)
+               return 1;
+
+       if (vmx->nested.nested_run_pending)
+               return -EBUSY;
+
+       /*
+        * Process any exceptions that are not debug traps before MTF.
+        *
+        * Note that only a pending nested run can block a pending exception.
+        * Otherwise an injected NMI/interrupt should either be
+        * lost or delivered to the nested hypervisor in EXITINTINFO.
+        */
+
+       if (vcpu->arch.pending_exception.valid && !vmx_pending_dbg_trap(vcpu)) {
+               if (!nested_vmx_check_exception(vcpu, &exit_qual))
+                       goto no_vmexit;
+               nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+               return 0;
+       }
+
+       if (vmx->nested.mtf_pending) {
+               /* TODO: no kvm_event_needs_reinjection() check here anymore */
+               nested_vmx_update_pending_dbg(vcpu);
+               nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
+               return 0;
+       }
+
+       if (vcpu->arch.pending_exception.valid) {
+               if (!nested_vmx_check_exception(vcpu, &exit_qual))
+                       goto no_vmexit;
+               nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+               return 0;
+       }
+no_vmexit:
+       return 1;
+}
+
 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
 {
        ktime_t remaining =
@@ -6598,6 +6609,7 @@ __init int nested_vmx_hardware_setup(int 
(*exit_handlers[])(struct kvm_vcpu *))
 
 struct kvm_x86_nested_ops vmx_nested_ops = {
        .check_events = vmx_check_nested_events,
+       .deliver_exception = nested_vmx_deliver_nested_exception,
        .hv_timer_pending = nested_vmx_preemption_timer_pending,
        .get_state = vmx_get_nested_state,
        .set_state = vmx_set_nested_state,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f2714d22228de..d480bd48d786f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1630,8 +1630,8 @@ static void vmx_update_emulated_instruction(struct 
kvm_vcpu *vcpu)
         * vmx_check_nested_events().
         */
        if (nested_cpu_has_mtf(vmcs12) &&
-           (!vcpu->arch.exception.pending ||
-            vcpu->arch.exception.nr == DB_VECTOR))
+           (!vcpu->arch.pending_exception.valid ||
+            vcpu->arch.pending_exception.nr == DB_VECTOR))
                vmx->nested.mtf_pending = true;
        else
                vmx->nested.mtf_pending = false;
@@ -1659,12 +1659,12 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       unsigned nr = vcpu->arch.exception.nr;
-       bool has_error_code = vcpu->arch.exception.has_error_code;
-       u32 error_code = vcpu->arch.exception.error_code;
+       unsigned nr = vcpu->arch.injected_exception.nr;
+       bool has_error_code = vcpu->arch.injected_exception.has_error_code;
+       u32 error_code = vcpu->arch.injected_exception.error_code;
        u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
-       kvm_deliver_exception_payload(vcpu);
+       WARN_ON(vcpu->arch.pending_exception.valid);
 
        if (has_error_code) {
                vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
@@ -5400,7 +5400,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu 
*vcpu)
                        return 0;
 
                if (vmx->emulation_required && !vmx->rmode.vm86_active &&
-                   vcpu->arch.exception.pending) {
+                   vcpu->arch.pending_exception.valid) {
                        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                        vcpu->run->internal.suberror =
                                                KVM_INTERNAL_ERROR_EMULATION;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a9d814a0b5e4f..eec62c0dafc36 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -480,9 +480,9 @@ static int exception_type(int vector)
 
 void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
 {
-       unsigned nr = vcpu->arch.exception.nr;
-       bool has_payload = vcpu->arch.exception.has_payload;
-       unsigned long payload = vcpu->arch.exception.payload;
+       unsigned nr = vcpu->arch.pending_exception.nr;
+       bool has_payload = vcpu->arch.pending_exception.has_payload;
+       unsigned long payload = vcpu->arch.pending_exception.payload;
 
        if (!has_payload)
                return;
@@ -528,83 +528,130 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
                break;
        }
 
-       vcpu->arch.exception.has_payload = false;
-       vcpu->arch.exception.payload = 0;
+       vcpu->arch.pending_exception.has_payload = false;
+       vcpu->arch.pending_exception.payload = 0;
 }
 EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
 
-static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
-               unsigned nr, bool has_error, u32 error_code,
-               bool has_payload, unsigned long payload, bool reinject)
+
+/*
+ * Deliver the pending exception, as a nested VM exit or (payload applied,
+ * merged with the injected exception if any) as the new injected exception.
+ * Returns 0, or the negative error from nested_ops->deliver_exception().
+ */
+int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
 {
-       u32 prev_nr;
-       int class1, class2;
+       while (vcpu->arch.pending_exception.valid) {
+               u32 prev_nr;
+               int class1, class2;
 
-       kvm_make_request(KVM_REQ_EVENT, vcpu);
+               /* try to deliver current pending exception as VM exit */
+               if (is_guest_mode(vcpu)) {
+                       int ret = kvm_x86_ops.nested_ops->deliver_exception(vcpu);
+                       if (ret <= 0)
+                               return ret;
+               }
 
-       if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
-       queue:
-               if (reinject) {
-                       /*
-                        * On vmentry, vcpu->arch.exception.pending is only
-                        * true if an event injection was blocked by
-                        * nested_run_pending.  In that case, however,
-                        * vcpu_enter_guest requests an immediate exit,
-                        * and the guest shouldn't proceed far enough to
-                        * need reinjection.
+               /* No injected exception, so just deliver the payload and inject it */
+               if (!vcpu->arch.injected_exception.valid) {
+                       trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
+                                               vcpu->arch.pending_exception.has_error_code,
+                                               vcpu->arch.pending_exception.error_code);
+
+                       /* Intel SDM 17.3.1.1 */
+                       if (exception_type(vcpu->arch.pending_exception.nr) == EXCPT_FAULT)
+                               __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+                                                    X86_EFLAGS_RF);
+
+                       kvm_deliver_exception_payload(vcpu);
+
+                       /* Intel SDM 17.2.4
+                        * The processor clears the GD flag upon entering to the
+                        * debug exception handler, to allow the handler access
+                        * to the debug registers.
                         */
-                       WARN_ON_ONCE(vcpu->arch.exception.pending);
-                       vcpu->arch.exception.injected = true;
-                       if (WARN_ON_ONCE(has_payload)) {
-                               /*
-                                * A reinjected event has already
-                                * delivered its payload.
-                                */
-                               has_payload = false;
-                               payload = 0;
+                       if (vcpu->arch.pending_exception.nr == DB_VECTOR) {
+                               if (vcpu->arch.dr7 & DR7_GD) {
+                                       vcpu->arch.dr7 &= ~DR7_GD;
+                                       kvm_update_dr7(vcpu);
+                               }
                        }
-               } else {
-                       vcpu->arch.exception.pending = true;
-                       vcpu->arch.exception.injected = false;
+
+                       /* No error code is delivered outside of protected mode */
+                       if (vcpu->arch.pending_exception.has_error_code && !is_protmode(vcpu))
+                               vcpu->arch.pending_exception.has_error_code = false;
+
+                       vcpu->arch.pending_exception.valid = false;
+                       vcpu->arch.injected_exception.valid = true;
+                       vcpu->arch.injected_exception.has_error_code = vcpu->arch.pending_exception.has_error_code;
+                       vcpu->arch.injected_exception.nr = vcpu->arch.pending_exception.nr;
+                       vcpu->arch.injected_exception.error_code = vcpu->arch.pending_exception.error_code;
+                       return 0;
+               }
+
+               /* Merge the pending and the injected exception (double/triple fault) */
+               prev_nr = vcpu->arch.injected_exception.nr;
+               if (prev_nr == DF_VECTOR) {
+                       /* triple fault -> shutdown */
+                       vcpu->arch.injected_exception.valid = false;
+                       vcpu->arch.pending_exception.valid = false;
+
+                       /* TODO - make KVM_REQ_TRIPLE_FAULT inject vmexit when guest intercepts it */
+                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+                       break;
+               }
+
+               class1 = exception_class(prev_nr);
+               class2 = exception_class(vcpu->arch.pending_exception.nr);
+
+               vcpu->arch.injected_exception.valid = false;
+
+               if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
+                       || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+                       /* Generate double fault per SDM Table 5-5. */
+                       vcpu->arch.pending_exception.has_error_code = true;
+                       vcpu->arch.pending_exception.nr = DF_VECTOR;
+                       vcpu->arch.pending_exception.error_code = 0;
+                       vcpu->arch.pending_exception.has_payload = false;
                }
-               vcpu->arch.exception.has_error_code = has_error;
-               vcpu->arch.exception.nr = nr;
-               vcpu->arch.exception.error_code = error_code;
-               vcpu->arch.exception.has_payload = has_payload;
-               vcpu->arch.exception.payload = payload;
-               if (!is_guest_mode(vcpu))
-                       kvm_deliver_exception_payload(vcpu);
-               return;
        }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_deliver_pending_exception);
 
-       /* to check exception */
-       prev_nr = vcpu->arch.exception.nr;
-       if (prev_nr == DF_VECTOR) {
-               /* triple fault -> shutdown */
-               kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-               return;
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+               unsigned nr, bool has_error, u32 error_code,
+               bool has_payload, unsigned long payload, bool reinject)
+{
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+       if (reinject) {
+               // Exceptions are re-injected right after a VM exit, before
+               // another pending exception can be generated, so no pending
+               // exception is expected here: warn and drop the request.
+               if (WARN_ON_ONCE(vcpu->arch.pending_exception.valid))
+                       return;
+               // it is not possible to inject more than one exception
+               if (WARN_ON_ONCE(vcpu->arch.injected_exception.valid))
+                       return;
+               vcpu->arch.injected_exception.valid = true;
+               vcpu->arch.injected_exception.nr = nr;
+               vcpu->arch.injected_exception.has_error_code = has_error;
+               vcpu->arch.injected_exception.error_code = error_code;
+
+               // a re-injected exception has already delivered its payload
+               WARN_ON_ONCE(has_payload);
+       } else {
+               // can't have more than one pending exception
+               if (WARN_ON_ONCE(vcpu->arch.pending_exception.valid))
+                       return;
+               vcpu->arch.pending_exception.valid = true;
+               vcpu->arch.pending_exception.nr = nr;
+               vcpu->arch.pending_exception.has_error_code = has_error;
+               vcpu->arch.pending_exception.error_code = error_code;
+               vcpu->arch.pending_exception.has_payload = has_payload;
+               vcpu->arch.pending_exception.payload = payload;
        }
-       class1 = exception_class(prev_nr);
-       class2 = exception_class(nr);
-       if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
-               || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
-               /*
-                * Generate double fault per SDM Table 5-5.  Set
-                * exception.pending = true so that the double fault
-                * can trigger a nested vmexit.
-                */
-               vcpu->arch.exception.pending = true;
-               vcpu->arch.exception.injected = false;
-               vcpu->arch.exception.has_error_code = true;
-               vcpu->arch.exception.nr = DF_VECTOR;
-               vcpu->arch.exception.error_code = 0;
-               vcpu->arch.exception.has_payload = false;
-               vcpu->arch.exception.payload = 0;
-       } else
-               /* replace previous exception with a new one in a hope
-                  that instruction re-execution will regenerate lost
-                  exception */
-               goto queue;
 }
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
@@ -647,9 +694,9 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
        ++vcpu->stat.pf_guest;
-       vcpu->arch.exception.nested_apf =
+       vcpu->arch.pending_exception.nested_apf =
                is_guest_mode(vcpu) && fault->async_page_fault;
-       if (vcpu->arch.exception.nested_apf) {
+       if (vcpu->arch.pending_exception.nested_apf) {
                vcpu->arch.apf.nested_apf_token = fault->address;
                kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
        } else {
@@ -4267,47 +4314,69 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct 
kvm_vcpu *vcpu,
        if (kvm_check_request(KVM_REQ_SMI, vcpu))
                process_smi(vcpu);
 
+       events->exception.injected = 0;
+       events->exception.pending = 0;
+       events->pending_exception.has_payload = 0;
+       events->pending_exception.payload = 0;
+
+       /* In the unlikely case that both a pending and an injected exception exist and userspace didn't enable
+        * KVM_CAP_EXCEPTION_INJECTED_PENDING, deliver the pending exception now.
+        */
+       if (!vcpu->kvm->arch.exception_separate_injected_pending) {
+               if (vcpu->arch.pending_exception.valid && 
vcpu->arch.injected_exception.valid)
+                       if (kvm_deliver_pending_exception(vcpu) < 0)
+                               /* in case the delivery fails, we
+                                * forget about the injected exception */
+                               vcpu->arch.injected_exception.valid = false;
+       }
+
        /*
-        * In guest mode, payload delivery should be deferred,
-        * so that the L1 hypervisor can intercept #PF before
-        * CR2 is modified (or intercept #DB before DR6 is
-        * modified under nVMX). Unless the per-VM capability,
+        * Unless the per-VM capability,
         * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
         * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
         * opportunistically defer the exception payload, deliver it if the
         * capability hasn't been requested before processing a
         * KVM_GET_VCPU_EVENTS.
         */
+
        if (!vcpu->kvm->arch.exception_payload_enabled &&
-           vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
+           vcpu->arch.pending_exception.valid && vcpu->arch.pending_exception.has_payload)
                kvm_deliver_exception_payload(vcpu);
 
+       if (vcpu->arch.pending_exception.valid &&
+           !kvm_exception_is_soft(vcpu->arch.pending_exception.nr)) {
+
+               events->exception.pending = true;
+               events->pending_exception.has_payload = 
vcpu->arch.pending_exception.has_payload;
+               events->pending_exception.payload = 
vcpu->arch.pending_exception.payload;
+
+               /* TODO: this code looks ugly */
+               if (vcpu->kvm->arch.exception_separate_injected_pending) {
+                       events->pending_exception.has_error_code = 
vcpu->arch.pending_exception.has_error_code;
+                       events->pending_exception.error_code = 
vcpu->arch.pending_exception.error_code;
+                       events->pending_exception.nr = 
vcpu->arch.pending_exception.nr;
+               } else {
+                       events->exception.has_error_code = 
vcpu->arch.pending_exception.has_error_code;
+                       events->exception.error_code = 
vcpu->arch.pending_exception.error_code;
+                       events->exception.nr = vcpu->arch.pending_exception.nr;
+               }
+       }
+
+       if (vcpu->arch.injected_exception.valid &&
+           !kvm_exception_is_soft(vcpu->arch.injected_exception.nr)) {
+               events->exception.injected = true;
+               events->exception.nr = vcpu->arch.injected_exception.nr;
+               events->exception.has_error_code = 
vcpu->arch.injected_exception.has_error_code;
+               events->exception.error_code = 
vcpu->arch.injected_exception.error_code;
+       }
+
        /*
-        * The API doesn't provide the instruction length for software
-        * exceptions, so don't report them. As long as the guest RIP
-        * isn't advanced, we should expect to encounter the exception
-        * again.
+        * For ABI compatibility, deliberately conflate
+        * pending and injected exceptions when
+        * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
         */
-       if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
-               events->exception.injected = 0;
-               events->exception.pending = 0;
-       } else {
-               events->exception.injected = vcpu->arch.exception.injected;
-               events->exception.pending = vcpu->arch.exception.pending;
-               /*
-                * For ABI compatibility, deliberately conflate
-                * pending and injected exceptions when
-                * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
-                */
-               if (!vcpu->kvm->arch.exception_payload_enabled)
-                       events->exception.injected |=
-                               vcpu->arch.exception.pending;
-       }
-       events->exception.nr = vcpu->arch.exception.nr;
-       events->exception.has_error_code = vcpu->arch.exception.has_error_code;
-       events->exception.error_code = vcpu->arch.exception.error_code;
-       events->exception_has_payload = vcpu->arch.exception.has_payload;
-       events->exception_payload = vcpu->arch.exception.payload;
+       if (!vcpu->kvm->arch.exception_payload_enabled)
+               events->exception.injected |= events->exception.pending;
 
        events->interrupt.injected =
                vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
@@ -4339,6 +4408,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct 
kvm_vcpu *vcpu,
 
 static void kvm_smm_changed(struct kvm_vcpu *vcpu);
 
+static bool is_valid_exception(int nr)
+{
+       return nr < 32 && nr != NMI_VECTOR;
+}
+
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                                              struct kvm_vcpu_events *events)
 {
@@ -4355,16 +4429,21 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct 
kvm_vcpu *vcpu,
                if (events->exception.pending)
                        events->exception.injected = 0;
                else
-                       events->exception_has_payload = 0;
+                       events->pending_exception.has_payload = 0;
        } else {
                events->exception.pending = 0;
-               events->exception_has_payload = 0;
+               events->pending_exception.has_payload = 0;
        }
 
        if ((events->exception.injected || events->exception.pending) &&
-           (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
+           (!is_valid_exception(events->exception.nr)))
                return -EINVAL;
 
+       if (vcpu->kvm->arch.exception_separate_injected_pending)
+               if (events->exception.pending &&
+                   !is_valid_exception(events->pending_exception.nr))
+                       return -EINVAL;
+
        /* INITs are latched while in SMM */
        if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
            (events->smi.smm || events->smi.pending) &&
@@ -4372,13 +4451,30 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct 
kvm_vcpu *vcpu,
                return -EINVAL;
 
        process_nmi(vcpu);
-       vcpu->arch.exception.injected = events->exception.injected;
-       vcpu->arch.exception.pending = events->exception.pending;
-       vcpu->arch.exception.nr = events->exception.nr;
-       vcpu->arch.exception.has_error_code = events->exception.has_error_code;
-       vcpu->arch.exception.error_code = events->exception.error_code;
-       vcpu->arch.exception.has_payload = events->exception_has_payload;
-       vcpu->arch.exception.payload = events->exception_payload;
+
+       vcpu->arch.injected_exception.valid = events->exception.injected;
+       if (events->exception.injected) {
+               vcpu->arch.injected_exception.nr = events->exception.nr;
+               vcpu->arch.injected_exception.has_error_code = 
events->exception.has_error_code;
+               vcpu->arch.injected_exception.error_code = 
events->exception.error_code;
+       }
+
+       vcpu->arch.pending_exception.valid = events->exception.pending;
+       if (events->exception.pending) {
+
+               if (vcpu->kvm->arch.exception_separate_injected_pending) {
+                       vcpu->arch.pending_exception.nr = 
events->pending_exception.nr;
+                       vcpu->arch.pending_exception.has_error_code = 
events->pending_exception.has_error_code;
+                       vcpu->arch.pending_exception.error_code = 
events->pending_exception.error_code;
+               } else {
+                       vcpu->arch.pending_exception.nr = events->exception.nr;
+                       vcpu->arch.pending_exception.has_error_code = 
events->exception.has_error_code;
+                       vcpu->arch.pending_exception.error_code = 
events->exception.error_code;
+               }
+
+               vcpu->arch.pending_exception.has_payload = 
events->pending_exception.has_payload;
+               vcpu->arch.pending_exception.payload = 
events->pending_exception.payload;
+       }
 
        vcpu->arch.interrupt.injected = events->interrupt.injected;
        vcpu->arch.interrupt.nr = events->interrupt.nr;
@@ -5347,6 +5443,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                kvm->arch.exception_payload_enabled = cap->args[0];
                r = 0;
                break;
+       case KVM_CAP_EXCEPTION_INJECTED_PENDING:
+               kvm->arch.exception_separate_injected_pending = cap->args[0];
+               r = 0;
+               break;
+
        case KVM_CAP_X86_USER_SPACE_MSR:
                kvm->arch.user_space_msr_mask = cap->args[0];
                r = 0;
@@ -8345,8 +8446,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 
 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
 {
-       if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
-               vcpu->arch.exception.error_code = false;
        static_call(kvm_x86_queue_exception)(vcpu);
 }
 
@@ -8355,9 +8454,14 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, 
bool *req_immediate_exit
        int r;
        bool can_inject = true;
 
-       /* try to reinject previous events if any */
+       r = kvm_deliver_pending_exception(vcpu);
+       if (r < 0)
+               goto busy;
+
+       WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
 
-       if (vcpu->arch.exception.injected) {
+       /* try to reinject previous events if any */
+       if (vcpu->arch.injected_exception.valid) {
                kvm_inject_exception(vcpu);
                can_inject = false;
        }
@@ -8375,7 +8479,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, 
bool *req_immediate_exit
         * serviced prior to recognizing any new events in order to
         * fully complete the previous instruction.
         */
-       else if (!vcpu->arch.exception.pending) {
+       else {
                if (vcpu->arch.nmi_injected) {
                        static_call(kvm_x86_set_nmi)(vcpu);
                        can_inject = false;
@@ -8385,9 +8489,6 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, 
bool *req_immediate_exit
                }
        }
 
-       WARN_ON_ONCE(vcpu->arch.exception.injected &&
-                    vcpu->arch.exception.pending);
-
        /*
         * Call check_nested_events() even if we reinjected a previous event
         * in order for caller to determine if it should require immediate-exit
@@ -8400,31 +8501,6 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, 
bool *req_immediate_exit
                        goto busy;
        }
 
-       /* try to inject new event if pending */
-       if (vcpu->arch.exception.pending) {
-               trace_kvm_inj_exception(vcpu->arch.exception.nr,
-                                       vcpu->arch.exception.has_error_code,
-                                       vcpu->arch.exception.error_code);
-
-               vcpu->arch.exception.pending = false;
-               vcpu->arch.exception.injected = true;
-
-               if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
-                       __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
-                                            X86_EFLAGS_RF);
-
-               if (vcpu->arch.exception.nr == DB_VECTOR) {
-                       kvm_deliver_exception_payload(vcpu);
-                       if (vcpu->arch.dr7 & DR7_GD) {
-                               vcpu->arch.dr7 &= ~DR7_GD;
-                               kvm_update_dr7(vcpu);
-                       }
-               }
-
-               kvm_inject_exception(vcpu);
-               can_inject = false;
-       }
-
        /*
         * Finally, inject interrupt events.  If an event cannot be injected
         * due to architectural conditions (e.g. IF=0) a window-open exit
@@ -8482,7 +8558,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, 
bool *req_immediate_exit
            kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
                *req_immediate_exit = true;
 
-       WARN_ON(vcpu->arch.exception.pending);
+       WARN_ON(vcpu->arch.pending_exception.valid);
        return;
 
 busy:
@@ -9584,7 +9660,7 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct 
kvm_regs *regs)
        kvm_rip_write(vcpu, regs->rip);
        kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
 
-       vcpu->arch.exception.pending = false;
+       vcpu->arch.pending_exception.valid = false;
 
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
@@ -9870,7 +9946,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu 
*vcpu,
 
        if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
                r = -EBUSY;
-               if (vcpu->arch.exception.pending)
+               if (vcpu->arch.pending_exception.valid)
                        goto out;
                if (dbg->control & KVM_GUESTDBG_INJECT_DB)
                        kvm_queue_exception(vcpu, DB_VECTOR);
@@ -10931,7 +11007,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu 
*vcpu)
        if (vcpu->arch.pv.pv_unhalted)
                return true;
 
-       if (vcpu->arch.exception.pending)
+       if (vcpu->arch.pending_exception.valid || 
vcpu->arch.injected_exception.valid)
                return true;
 
        if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
@@ -11171,7 +11247,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 {
        if (unlikely(!lapic_in_kernel(vcpu) ||
                     kvm_event_needs_reinjection(vcpu) ||
-                    vcpu->arch.exception.pending))
+                    vcpu->arch.pending_exception.valid))
                return false;
 
        if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index ee6e01067884d..e3848072c5bdb 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -58,8 +58,8 @@ static inline unsigned int __shrink_ple_window(unsigned int 
val,
 
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.exception.pending = false;
-       vcpu->arch.exception.injected = false;
+       vcpu->arch.pending_exception.valid = false;
+       vcpu->arch.injected_exception.valid = false;
 }
 
 static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
@@ -77,7 +77,7 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu 
*vcpu)
 
 static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
 {
-       return vcpu->arch.exception.injected || vcpu->arch.interrupt.injected ||
+       return vcpu->arch.injected_exception.valid || 
vcpu->arch.interrupt.injected ||
                vcpu->arch.nmi_injected;
 }
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 63f8f6e956487..d913a46d36b04 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1077,6 +1077,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SYS_HYPERV_CPUID 191
 #define KVM_CAP_DIRTY_LOG_RING 192
 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
+#define KVM_CAP_EXCEPTION_INJECTED_PENDING 194
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.26.2

Reply via email to