Re: [PATCH 3/4] KVM: x86: correctly merge pending and injected exception
On Thu, Apr 01, 2021, Paolo Bonzini wrote:
> On 01/04/21 16:38, Maxim Levitsky wrote:
> > +static int kvm_do_deliver_pending_exception(struct kvm_vcpu *vcpu)
> > +{
> > +	int class1, class2, ret;
> > +
> > +	/* try to deliver current pending exception as VM exit */
> > +	if (is_guest_mode(vcpu)) {
> > +		ret = kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
> > +		if (ret || !vcpu->arch.pending_exception.valid)
> > +			return ret;
> > +	}
> > +
> > +	/* No injected exception, so just deliver the payload and inject it */
> > +	if (!vcpu->arch.injected_exception.valid) {
> > +		trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
> > +					vcpu->arch.pending_exception.has_error_code,
> > +					vcpu->arch.pending_exception.error_code);
> > +queue:
>
> If you move the queue label to the top of the function, you can "goto queue"
> for #DF as well and you don't need to call kvm_do_deliver_pending_exception
> again.  In fact you can merge this function and kvm_deliver_pending_exception
> completely:
>
> static int kvm_deliver_pending_exception_as_vmexit(struct kvm_vcpu *vcpu)
> {
> 	WARN_ON(!vcpu->arch.pending_exception.valid);
> 	if (is_guest_mode(vcpu))
> 		return kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
> 	else
> 		return 0;
> }
>
> static int kvm_merge_injected_exception(struct kvm_vcpu *vcpu)
> {
> 	/*
> 	 * First check if the pending exception takes precedence
> 	 * over the injected one, which will be reported in the
> 	 * vmexit info.
> 	 */
> 	ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
> 	if (ret || !vcpu->arch.pending_exception.valid)
> 		return ret;
>
> 	if (vcpu->arch.injected_exception.nr == DF_VECTOR) {
> 		...
> 		return 0;
> 	}
> 	...
> 	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
> 	    || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
> 		...
> 	}
> 	vcpu->arch.injected_exception.valid = false;
> }
>
> static int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
> {
> 	if (!vcpu->arch.pending_exception.valid)
> 		return 0;
>
> 	if (vcpu->arch.injected_exception.valid)
> 		kvm_merge_injected_exception(vcpu);
>
> 	ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
> 	if (ret || !vcpu->arch.pending_exception.valid)

I really don't like querying arch.pending_exception.valid to see if the
exception was morphed to a VM-Exit.

I also find kvm_deliver_pending_exception_as_vmexit() to be misleading; to me,
that reads as being a command, i.e. "deliver this pending exception as a
VM-Exit".

It'd also be nice to make the helpers closer to pure functions, i.e. pass the
exception as a param instead of pulling it from vcpu->arch.

Now that we have static_call, the number of calls into vendor code isn't a
huge issue.  Moving nested_run_pending to arch code would help, too.

What about doing something like:

static bool kvm_l1_wants_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector)
{
	return is_guest_mode(vcpu) &&
	       kvm_x86_l1_wants_exception(vcpu, vector);
}

...

	if (!kvm_x86_exception_allowed(vcpu))
		return -EBUSY;

	if (kvm_l1_wants_exception_vmexit(vcpu, vcpu->arch...))
		return kvm_x86_deliver_exception_as_vmexit(...);

> 		return ret;
>
> 	trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
> 				vcpu->arch.pending_exception.has_error_code,
> 				vcpu->arch.pending_exception.error_code);
> 	...
> }
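To make the suggested shape concrete, a minimal sketch of that flow could look
like the code below.  kvm_x86_exception_allowed(), kvm_x86_l1_wants_exception()
and kvm_x86_deliver_exception_as_vmexit() are the hypothetical static_call-backed
vendor hooks named in the snippet above, not existing KVM symbols:

static bool kvm_l1_wants_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector)
{
	/* Pure query: the vector is passed in, nothing is pulled from vcpu->arch. */
	return is_guest_mode(vcpu) && kvm_x86_l1_wants_exception(vcpu, vector);
}

static int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
{
	u8 vector;

	if (!vcpu->arch.pending_exception.valid)
		return 0;

	vector = vcpu->arch.pending_exception.nr;

	/* e.g. nested_run_pending: try again on the next exit. */
	if (!kvm_x86_exception_allowed(vcpu))
		return -EBUSY;

	/*
	 * Morphing into a VM-Exit is expressed by taking this branch, not by
	 * re-reading pending_exception.valid after the fact.
	 */
	if (kvm_l1_wants_exception_vmexit(vcpu, vector))
		return kvm_x86_deliver_exception_as_vmexit(vcpu);

	/* ...otherwise merge with any injected exception and queue it... */
	return 0;
}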
Re: [PATCH 3/4] KVM: x86: correctly merge pending and injected exception
On 01/04/21 16:38, Maxim Levitsky wrote:
> +static int kvm_do_deliver_pending_exception(struct kvm_vcpu *vcpu)
> +{
> +	int class1, class2, ret;
> +
> +	/* try to deliver current pending exception as VM exit */
> +	if (is_guest_mode(vcpu)) {
> +		ret = kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
> +		if (ret || !vcpu->arch.pending_exception.valid)
> +			return ret;
> +	}
> +
> +	/* No injected exception, so just deliver the payload and inject it */
> +	if (!vcpu->arch.injected_exception.valid) {
> +		trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
> +					vcpu->arch.pending_exception.has_error_code,
> +					vcpu->arch.pending_exception.error_code);
> +queue:

If you move the queue label to the top of the function, you can "goto queue"
for #DF as well and you don't need to call kvm_do_deliver_pending_exception
again.  In fact you can merge this function and kvm_deliver_pending_exception
completely:

static int kvm_deliver_pending_exception_as_vmexit(struct kvm_vcpu *vcpu)
{
	WARN_ON(!vcpu->arch.pending_exception.valid);
	if (is_guest_mode(vcpu))
		return kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
	else
		return 0;
}

static int kvm_merge_injected_exception(struct kvm_vcpu *vcpu)
{
	/*
	 * First check if the pending exception takes precedence
	 * over the injected one, which will be reported in the
	 * vmexit info.
	 */
	ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
	if (ret || !vcpu->arch.pending_exception.valid)
		return ret;

	if (vcpu->arch.injected_exception.nr == DF_VECTOR) {
		...
		return 0;
	}
	...
	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
	    || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
		...
	}
	vcpu->arch.injected_exception.valid = false;
}

static int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
{
	if (!vcpu->arch.pending_exception.valid)
		return 0;

	if (vcpu->arch.injected_exception.valid)
		kvm_merge_injected_exception(vcpu);

	ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
	if (ret || !vcpu->arch.pending_exception.valid)
		return ret;

	trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
				vcpu->arch.pending_exception.has_error_code,
				vcpu->arch.pending_exception.error_code);
	...
}

Note that if the pending exception is a page fault, its payload must be
delivered to CR2 before converting it to a double fault.

Going forward to vmx.c:

> 	if (mtf_pending) {
> 		if (block_nested_events)
> 			return -EBUSY;
> +		nested_vmx_update_pending_dbg(vcpu);

Should this instead be "WARN_ON(vmx_pending_dbg_trap(vcpu));", since the
pending-#DB-plus-MTF combination is handled in vmx_deliver_exception_as_vmexit?...

> +
> +	if (vmx->nested.mtf_pending && vmx_pending_dbg_trap(vcpu)) {
> +		/*
> +		 * A pending monitor trap takes precedence over a pending
> +		 * debug exception, which is 'stashed' into
> +		 * 'GUEST_PENDING_DBG_EXCEPTIONS'.
> +		 */
> +
> +		nested_vmx_update_pending_dbg(vcpu);
> +		vmx->nested.mtf_pending = false;
> +		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
> +		return 0;
> +	}

... though this is quite ugly, even more so if you add the case of an INIT with
a pending #DB.  The problem is that INIT and MTF have higher priority than
debug exceptions.  The good thing is that an INIT or MTF plus #DB cannot happen
with nested_run_pending == 1, so it will always be injected right away.

There is precedent with KVM_GET_* modifying the CPU state; in particular,
KVM_GET_MPSTATE can modify CS and RIP and even cause a vmexit via
kvm_apic_accept_events.
And in fact, because kvm_apic_accept_events calls kvm_check_nested_events,
calling it from KVM_GET_VCPU_EVENTS would fix the problem: the injected
exception would go into the IDT-vectored exit field, while the pending
exception would go into GUEST_PENDING_DBG_EXCEPTIONS and disappear.

However, you cannot do kvm_apic_accept_events twice, because there would be a
race with INIT: a #DB exception could first be stored by KVM_GET_VCPU_EVENTS
and then disappear when kvm_apic_accept_events is called again by
KVM_GET_MPSTATE.  Fortunately, the correct order for the KVM_GET_* ioctls is
first KVM_GET_VCPU_EVENTS and then KVM_GET_MPSTATE.

So perhaps, instead of calling kvm_deliver_pending_exception on vmexit,
KVM_GET_VCPU_EVENTS could call kvm_apic_accept_events (and thus
kvm_check_nested_events) instead of KVM_GET_MPSTATE.  In
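For what it's worth, seen from the VMM side the ordering constraint above is
simply "events before mp_state".  A hypothetical save path (the ioctl is
spelled KVM_GET_MP_STATE in the uapi headers) would be:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical VMM save path, error handling trimmed. */
static int save_vcpu_events_then_mpstate(int vcpu_fd,
					 struct kvm_vcpu_events *events,
					 struct kvm_mp_state *mp_state)
{
	/*
	 * Grab the events first: if KVM_GET_VCPU_EVENTS is what ends up
	 * calling kvm_apic_accept_events()/kvm_check_nested_events(), the
	 * pending #DB is resolved exactly once, before KVM_GET_MP_STATE
	 * can race with a latched INIT.
	 */
	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, events))
		return -1;

	return ioctl(vcpu_fd, KVM_GET_MP_STATE, mp_state);
}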
[PATCH 3/4] KVM: x86: correctly merge pending and injected exception
Allow the pending and the injected exceptions to co-exist when they are both
raised.

Add a 'kvm_deliver_pending_exception' function which 'merges' the pending and
injected exceptions, or delivers a VM exit with both for the case in which L1
intercepts the pending exception.  The latter is done by vendor code, using
the new nested callback 'deliver_exception_as_vmexit'.

kvm_deliver_pending_exception is called after each VM exit and prior to VM
entry, which ensures that during userspace VM exits only an injected exception
can be in a raised state.

Signed-off-by: Maxim Levitsky
---
 arch/x86/include/asm/kvm_host.h |   9 ++
 arch/x86/kvm/svm/nested.c       |  27 ++--
 arch/x86/kvm/svm/svm.c          |   2 +-
 arch/x86/kvm/vmx/nested.c       |  58
 arch/x86/kvm/vmx/vmx.c          |   2 +-
 arch/x86/kvm/x86.c              | 233 ++--
 6 files changed, 181 insertions(+), 150 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3b2fd276e8d5..a9b9cd030d9a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1346,6 +1346,15 @@ struct kvm_x86_ops {
 
 struct kvm_x86_nested_ops {
 	int (*check_events)(struct kvm_vcpu *vcpu);
+
+	/*
+	 * Deliver a pending exception as a VM exit if the L1 intercepts it.
+	 * Returns -EBUSY if L1 does intercept the exception but
+	 * it is not possible to deliver it right now
+	 * (for example when nested run is pending).
+	 */
+	int (*deliver_exception_as_vmexit)(struct kvm_vcpu *vcpu);
+
 	bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
 	void (*triple_fault)(struct kvm_vcpu *vcpu);
 	int (*get_state)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 7adad9b6dcad..ff745d59ffcf 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1061,21 +1061,6 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	if (vcpu->arch.pending_exception.valid) {
-		/*
-		 * Only a pending nested run can block a pending exception.
-		 * Otherwise an injected NMI/interrupt should either be
-		 * lost or delivered to the nested hypervisor in the EXITINTINFO
-		 * vmcb field, while delivering the pending exception.
-		 */
-		if (svm->nested.nested_run_pending)
-			return -EBUSY;
-		if (!nested_exit_on_exception(svm))
-			return 0;
-		nested_svm_inject_exception_vmexit(svm);
-		return 0;
-	}
-
 	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
@@ -1107,6 +1092,17 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+int svm_deliver_nested_exception_as_vmexit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (svm->nested.nested_run_pending)
+		return -EBUSY;
+	if (nested_exit_on_exception(svm))
+		nested_svm_inject_exception_vmexit(svm);
+	return 0;
+}
+
 int nested_svm_exit_special(struct vcpu_svm *svm)
 {
 	u32 exit_code = svm->vmcb->control.exit_code;
@@ -1321,6 +1317,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 struct kvm_x86_nested_ops svm_nested_ops = {
 	.check_events = svm_check_nested_events,
 	.triple_fault = nested_svm_triple_fault,
+	.deliver_exception_as_vmexit = svm_deliver_nested_exception_as_vmexit,
 	.get_nested_state_pages = svm_get_nested_state_pages,
 	.get_state = svm_get_nested_state,
 	.set_state = svm_set_nested_state,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 90b541138c5a..b89e48574c39 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -363,7 +363,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
 	bool has_error_code = vcpu->arch.injected_exception.has_error_code;
 	u32 error_code = vcpu->arch.injected_exception.error_code;
 
-	kvm_deliver_exception_payload(vcpu);
+	WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
 
 	if (nr == BP_VECTOR && !nrips) {
 		unsigned long rip, old_rip = kvm_rip_read(vcpu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 5d54fecff9a7..1c09b132c55c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3768,7 +3768,6 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long exit_qual;
 	bool block_nested_events =
 	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
 	bool mtf_pending = vmx->nested.mtf_pending;
@@ -3804,41 +3803,15
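The remaining vmx/nested.c and x86.c hunks are cut off above.  For orientation
only, a VMX counterpart of the SVM callback would presumably look something
like the sketch below; the use of nested_vmx_check_exception() and
nested_vmx_inject_exception_vmexit() is an assumption based on the existing
vmx_check_nested_events() code, not taken from the truncated diff:

static int vmx_deliver_nested_exception_as_vmexit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qual;

	/* Mirror the SVM side: a pending nested run defers the VM exit. */
	if (vmx->nested.nested_run_pending)
		return -EBUSY;

	/* Only morph the exception into a VM exit if L1 actually intercepts it. */
	if (nested_vmx_check_exception(vcpu, &exit_qual))
		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);

	return 0;
}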