Re: [PATCH 3/4] KVM: x86: correctly merge pending and injected exception

2021-04-01 Thread Sean Christopherson
On Thu, Apr 01, 2021, Paolo Bonzini wrote:
> On 01/04/21 16:38, Maxim Levitsky wrote:
> > +static int kvm_do_deliver_pending_exception(struct kvm_vcpu *vcpu)
> > +{
> > +        int class1, class2, ret;
> > +
> > +        /* try to deliver current pending exception as VM exit */
> > +        if (is_guest_mode(vcpu)) {
> > +                ret = kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
> > +                if (ret || !vcpu->arch.pending_exception.valid)
> > +                        return ret;
> > +        }
> > +
> > +        /* No injected exception, so just deliver the payload and inject it */
> > +        if (!vcpu->arch.injected_exception.valid) {
> > +                trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
> > +                                        vcpu->arch.pending_exception.has_error_code,
> > +                                        vcpu->arch.pending_exception.error_code);
> > +queue:
> 
> If you move the queue label to the top of the function, you can "goto queue" 
> for #DF as well and you don't need to call kvm_do_deliver_pending_exception 
> again.  In fact you can merge this function and kvm_deliver_pending_exception 
> completely:
> 
> 
> static int kvm_deliver_pending_exception_as_vmexit(struct kvm_vcpu *vcpu)
> {
>         WARN_ON(!vcpu->arch.pending_exception.valid);
>         if (is_guest_mode(vcpu))
>                 return kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
>         else
>                 return 0;
> }
> 
> static int kvm_merge_injected_exception(struct kvm_vcpu *vcpu)
> {
>         int class1, class2, ret;
>
>         /*
>          * First check if the pending exception takes precedence
>          * over the injected one, which will be reported in the
>          * vmexit info.
>          */
>         ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
>         if (ret || !vcpu->arch.pending_exception.valid)
>                 return ret;
>
>         if (vcpu->arch.injected_exception.nr == DF_VECTOR) {
>                 ...
>                 return 0;
>         }
>         ...
>         if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
>             || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
>                 ...
>         }
>         vcpu->arch.injected_exception.valid = false;
>         return 0;
> }
> 
> static int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
> {
>         int ret;
>
>         if (!vcpu->arch.pending_exception.valid)
>                 return 0;
>
>         if (vcpu->arch.injected_exception.valid)
>                 kvm_merge_injected_exception(vcpu);
>
>         ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
>         if (ret || !vcpu->arch.pending_exception.valid)
I really don't like querying arch.pending_exception.valid to see if the
exception was morphed to a VM-Exit.  I also find
kvm_deliver_pending_exception_as_vmexit() to be misleading; to me, that reads
as being a command, i.e. "deliver this pending exception as a VM-Exit".

It'd also be nice to make the helpers closer to pure functions, i.e. pass the
exception as a param instead of pulling it from vcpu->arch.

Now that we have static_call, the number of calls into vendor code isn't a huge
issue.  Moving nested_run_pending to arch code would help, too.  What about
doing something like:

static bool kvm_l1_wants_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector)
{
        return is_guest_mode(vcpu) && kvm_x86_l1_wants_exception(vcpu, vector);
}

...

        if (!kvm_x86_exception_allowed(vcpu))
                return -EBUSY;

        if (kvm_l1_wants_exception_vmexit(vcpu, vcpu->arch...))
                return kvm_x86_deliver_exception_as_vmexit(...);
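
For concreteness, here is a rough end-to-end sketch of how those pieces could
fit together.  kvm_x86_exception_allowed(), kvm_x86_deliver_exception_as_vmexit(),
kvm_x86_queue_exception() and the kvm_queued_exception struct are hypothetical
names for the suggested wrappers, not existing KVM APIs:

static int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
{
        /* Pass the exception around explicitly to keep the helpers pure. */
        struct kvm_queued_exception *ex = &vcpu->arch.pending_exception;

        if (!ex->valid)
                return 0;

        if (kvm_l1_wants_exception_vmexit(vcpu, ex->nr)) {
                /*
                 * L1 intercepts the vector: morph the exception into a
                 * VM-Exit.  An injected exception, if any, is reported in
                 * the VM-Exit's IDT-vectoring info rather than merged.
                 */
                if (!kvm_x86_exception_allowed(vcpu))
                        return -EBUSY;
                return kvm_x86_deliver_exception_as_vmexit(vcpu, ex);
        }

        /* L0 handles the vector: merge with any injected exception and queue. */
        kvm_merge_injected_exception(vcpu);
        trace_kvm_inj_exception(ex->nr, ex->has_error_code, ex->error_code);
        kvm_x86_queue_exception(vcpu);
        return 0;
}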

>                 return ret;
>
>         trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
>                                 vcpu->arch.pending_exception.has_error_code,
>                                 vcpu->arch.pending_exception.error_code);
>         ...
> }
> 


Re: [PATCH 3/4] KVM: x86: correctly merge pending and injected exception

2021-04-01 Thread Paolo Bonzini

On 01/04/21 16:38, Maxim Levitsky wrote:

+static int kvm_do_deliver_pending_exception(struct kvm_vcpu *vcpu)
+{
+        int class1, class2, ret;
+
+        /* try to deliver current pending exception as VM exit */
+        if (is_guest_mode(vcpu)) {
+                ret = kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
+                if (ret || !vcpu->arch.pending_exception.valid)
+                        return ret;
+        }
+
+        /* No injected exception, so just deliver the payload and inject it */
+        if (!vcpu->arch.injected_exception.valid) {
+                trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
+                                        vcpu->arch.pending_exception.has_error_code,
+                                        vcpu->arch.pending_exception.error_code);
+queue:


If you move the queue label to the top of the function, you can "goto queue" 
for #DF as well and you don't need to call kvm_do_deliver_pending_exception again.  In 
fact you can merge this function and kvm_deliver_pending_exception completely:


static int kvm_deliver_pending_exception_as_vmexit(struct kvm_vcpu *vcpu)
{
        WARN_ON(!vcpu->arch.pending_exception.valid);
        if (is_guest_mode(vcpu))
                return kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
        else
                return 0;
}

static int kvm_merge_injected_exception(struct kvm_vcpu *vcpu)
{
        int class1, class2, ret;

        /*
         * First check if the pending exception takes precedence
         * over the injected one, which will be reported in the
         * vmexit info.
         */
        ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
        if (ret || !vcpu->arch.pending_exception.valid)
                return ret;

        if (vcpu->arch.injected_exception.nr == DF_VECTOR) {
                ...
                return 0;
        }
        ...
        if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
            || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
                ...
        }
        vcpu->arch.injected_exception.valid = false;
        return 0;
}

static int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
{
        int ret;

        if (!vcpu->arch.pending_exception.valid)
                return 0;

        if (vcpu->arch.injected_exception.valid)
                kvm_merge_injected_exception(vcpu);

        ret = kvm_deliver_pending_exception_as_vmexit(vcpu);
        if (ret || !vcpu->arch.pending_exception.valid)
                return ret;

        trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
                                vcpu->arch.pending_exception.has_error_code,
                                vcpu->arch.pending_exception.error_code);
        ...
}

Note that if the pending exception is a page fault, its payload
must be delivered to CR2 before converting it to a double fault.
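
To make that ordering concrete, here is a minimal sketch of that fragment of
the merge path.  It assumes kvm_deliver_exception_payload() (an existing
upstream helper that moves a pending #PF's payload into CR2) is still
available in this series, and uses upstream's exception_class() to classify
vectors:

        /*
         * Deliver the #PF payload (the faulting address) to CR2 *before*
         * the class check: even if the #PF is about to be promoted to a
         * #DF, the guest must still observe the faulting address.
         */
        if (vcpu->arch.pending_exception.nr == PF_VECTOR)
                kvm_deliver_exception_payload(vcpu);

        class1 = exception_class(vcpu->arch.injected_exception.nr);
        class2 = exception_class(vcpu->arch.pending_exception.nr);

        if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
            || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
                /* Merge the two exceptions into a #DF with error code 0. */
                vcpu->arch.pending_exception.nr = DF_VECTOR;
                vcpu->arch.pending_exception.has_error_code = true;
                vcpu->arch.pending_exception.error_code = 0;
        }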

Moving on to vmx.c:



         if (mtf_pending) {
                 if (block_nested_events)
                         return -EBUSY;
+
                 nested_vmx_update_pending_dbg(vcpu);


Should this instead be "WARN_ON(vmx_pending_dbg_trap(vcpu));", since
the pending-#DB-plus-MTF combination is handled in
vmx_deliver_exception_as_vmexit?...



+
+        if (vmx->nested.mtf_pending && vmx_pending_dbg_trap(vcpu)) {
+                /*
+                 * A pending monitor trap takes precedence over a pending
+                 * debug exception, which is 'stashed' in
+                 * 'GUEST_PENDING_DBG_EXCEPTIONS'.
+                 */
+                nested_vmx_update_pending_dbg(vcpu);
+                vmx->nested.mtf_pending = false;
+                nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
+                return 0;
+        }


... though this is quite ugly, even more so if you add the case of an
INIT with a pending #DB.  The problem is that INIT and MTF have higher
priority than debug exceptions.

The good thing is that an INIT or MTF plus #DB cannot happen with
nested_run_pending == 1, so it will always be injected right away.
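
For reference, the 'stash' itself is tiny.  Upstream's helper looks roughly
like the sketch below; the pending_exception field name follows this series
and is an assumption:

static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
{
        /*
         * Save the undelivered #DB payload into the guest's pending debug
         * exceptions field, so L1 finds it there on the next VM entry
         * instead of it being lost to the higher-priority MTF/INIT event.
         */
        if (vmx_pending_dbg_trap(vcpu))
                vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
                            vcpu->arch.pending_exception.payload);
}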

There is precedent with KVM_GET_* modifying the CPU state; in
particular, KVM_GET_MPSTATE can modify CS and RIP and even cause a
vmexit via kvm_apic_accept_events.  And in fact, because
kvm_apic_accept_events calls kvm_check_nested_events, calling it
from KVM_GET_VCPU_EVENTS would fix the problem: the injected exception
would go into the IDT-vectored exit field, while the pending exception
would go into GUEST_PENDING_DBG_EXCEPTION and disappear.

However, you cannot do kvm_apic_accept_events twice because there would
be a race with INIT: a #DB exception could be first stored by
KVM_GET_VCPU_EVENTS, then disappear when KVM_GET_MPSTATE calls
kvm_apic_accept_events again.

Fortunately, the correct order for KVM_GET_* events is first
KVM_GET_VCPU_EVENTS and then KVM_GET_MPSTATE.  So perhaps
instead of calling kvm_deliver_pending_exception on vmexit,
KVM_GET_VCPU_EVENTS could call kvm_apic_accept_events (and thus
kvm_check_nested_events) instead of KVM_GET_MPSTATE.

[PATCH 3/4] KVM: x86: correctly merge pending and injected exception

2021-04-01 Thread Maxim Levitsky
Allow the pending and the injected exceptions to co-exist
when they are raised.

Add a 'kvm_deliver_pending_exception' function which 'merges' the pending
and injected exceptions, or delivers a VM exit with both for the case when
L1 intercepts the pending exception.

The latter is done by vendor code, using the new nested callback
'deliver_exception_as_vmexit'.

kvm_deliver_pending_exception is called after each VM exit and prior
to VM entry, which ensures that during userspace VM exits only an
injected exception can be in a raised state.

Signed-off-by: Maxim Levitsky 
---
 arch/x86/include/asm/kvm_host.h |   9 ++
 arch/x86/kvm/svm/nested.c   |  27 ++--
 arch/x86/kvm/svm/svm.c  |   2 +-
 arch/x86/kvm/vmx/nested.c   |  58 
 arch/x86/kvm/vmx/vmx.c  |   2 +-
 arch/x86/kvm/x86.c  | 233 ++--
 6 files changed, 181 insertions(+), 150 deletions(-)
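
For orientation before the diff: the split state used throughout the patch
looks roughly like this.  The exact definition lives in the kvm_host.h hunk
(not fully shown here), so the field list below is inferred from the call
sites visible in this thread:

struct kvm_queued_exception {
        bool valid;
        u8 nr;
        bool has_error_code;
        u32 error_code;
        bool has_payload;
        unsigned long payload;
};

/*
 * In struct kvm_vcpu_arch: the pending exception has not yet been checked
 * against L1 intercepts; the injected one was already being delivered when
 * the VM exit occurred and must be re-injected as-is.
 */
struct kvm_queued_exception pending_exception;
struct kvm_queued_exception injected_exception;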

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3b2fd276e8d5..a9b9cd030d9a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1346,6 +1346,15 @@ struct kvm_x86_ops {
 
 struct kvm_x86_nested_ops {
int (*check_events)(struct kvm_vcpu *vcpu);
+
+        /*
+         * Deliver a pending exception as a VM exit if L1 intercepts it.
+         * Returns -EBUSY if L1 intercepts the exception but it is not
+         * possible to deliver it right now (for example, when a nested
+         * run is pending).
+         */
+        int (*deliver_exception_as_vmexit)(struct kvm_vcpu *vcpu);
+
bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 7adad9b6dcad..ff745d59ffcf 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1061,21 +1061,6 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return 0;
}
 
-        if (vcpu->arch.pending_exception.valid) {
-                /*
-                 * Only a pending nested run can block a pending exception.
-                 * Otherwise an injected NMI/interrupt should either be
-                 * lost or delivered to the nested hypervisor in the EXITINTINFO
-                 * vmcb field, while delivering the pending exception.
-                 */
-                if (svm->nested.nested_run_pending)
-                        return -EBUSY;
-                if (!nested_exit_on_exception(svm))
-                        return 0;
-                nested_svm_inject_exception_vmexit(svm);
-                return 0;
-        }
-
         if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
                 if (block_nested_events)
                         return -EBUSY;
@@ -1107,6 +1092,17 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return 0;
 }
 
+int svm_deliver_nested_exception_as_vmexit(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_svm *svm = to_svm(vcpu);
+
+        if (svm->nested.nested_run_pending)
+                return -EBUSY;
+        if (nested_exit_on_exception(svm))
+                nested_svm_inject_exception_vmexit(svm);
+        return 0;
+}
+
 int nested_svm_exit_special(struct vcpu_svm *svm)
 {
u32 exit_code = svm->vmcb->control.exit_code;
@@ -1321,6 +1317,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 struct kvm_x86_nested_ops svm_nested_ops = {
.check_events = svm_check_nested_events,
.triple_fault = nested_svm_triple_fault,
+   .deliver_exception_as_vmexit = svm_deliver_nested_exception_as_vmexit,
.get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
.set_state = svm_set_nested_state,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 90b541138c5a..b89e48574c39 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -363,7 +363,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
         bool has_error_code = vcpu->arch.injected_exception.has_error_code;
         u32 error_code = vcpu->arch.injected_exception.error_code;
 
-        kvm_deliver_exception_payload(vcpu);
+        WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
 
         if (nr == BP_VECTOR && !nrips) {
                 unsigned long rip, old_rip = kvm_rip_read(vcpu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 5d54fecff9a7..1c09b132c55c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3768,7 +3768,6 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
-        unsigned long exit_qual;
         bool block_nested_events =
                 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
         bool mtf_pending = vmx->nested.mtf_pending;
@@ -3804,41 +3803,15 @@