Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-17 Thread Paolo Bonzini

On 15/01/21 08:00, Wei Huang wrote:

If the whole body inside the if-statement is moved out, do you expect the
interface of x86_emulate_decoded_instruction to be something like:

int x86_emulate_decoded_instruction(struct kvm_vcpu *vcpu,
 gpa_t cr2_or_gpa,
 int emulation_type, void *insn,
 int insn_len,
 bool write_fault_to_spt)


An idea is to make the body of the new function just

init_emulate_ctxt(vcpu);

/*
 * We will reenter on the same instruction since
 * we do not set complete_userspace_io.  This does not
 * handle watchpoints yet, those would be handled in
 * the emulate_ops.
 */
if (!(emulation_type & EMULTYPE_SKIP) &&
    kvm_vcpu_check_breakpoint(vcpu, &r))
return r;

ctxt->interruptibility = 0;
ctxt->have_exception = false;
ctxt->exception.vector = -1;
ctxt->exception.error_code_valid = false;

ctxt->perm_ok = false;

ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;

r = x86_decode_insn(ctxt, insn, insn_len);

trace_kvm_emulate_insn_start(vcpu);
++vcpu->stat.insn_emulation;
return r;

because for the new caller, on EMULATION_FAILED you can just re-enter 
the guest.



And if so, what is the emulation type to use when calling this function
from svm.c? EMULTYPE_VMWARE_GP?


Just 0 I think.

Paolo
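
For illustration only, a minimal sketch of what the svm.c caller could then
look like under this proposal (the helper's argument order follows the
signature quoted above; is_svm_instruction() and svm_try_emulate_vm_instr()
are assumed placeholder names, not actual KVM functions):

static int gp_interception(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u32 error_code = svm->vmcb->control.exit_info_1;
	int r;

	if (error_code) {
		kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
		return 1;
	}

	/* Decode only; emulation type 0, no cr2, no pending write fault. */
	r = x86_emulate_decoded_instruction(vcpu, 0, 0, NULL, 0, false);
	if (r != EMULATION_OK)
		return 1;	/* on EMULATION_FAILED, just re-enter the guest */

	/* Assumed helper: dispatch decoded VMRUN/VMLOAD/VMSAVE bytes. */
	if (is_svm_instruction(vcpu))
		return svm_try_emulate_vm_instr(vcpu);

	return kvm_emulate_instruction(vcpu,
				       EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
}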



Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-14 Thread Wei Huang



On 1/12/21 8:01 AM, Paolo Bonzini wrote:
> On 12/01/21 07:37, Wei Huang wrote:
>>   static int gp_interception(struct vcpu_svm *svm)
>>   {
>>   struct kvm_vcpu *vcpu = &svm->vcpu;
>>   u32 error_code = svm->vmcb->control.exit_info_1;
>> -
>> -    WARN_ON_ONCE(!enable_vmware_backdoor);
>> +    int rc;
>>     /*
>> - * VMware backdoor emulation on #GP interception only handles IN{S},
>> - * OUT{S}, and RDPMC, none of which generate a non-zero error code.
>> + * Only VMware backdoor and SVM VME errata are handled. Neither of
>> + * them has non-zero error codes.
>>    */
>>   if (error_code) {
>>   kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>>   return 1;
>>   }
>> -    return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
>> +
>> +    rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
>> +    if (rc > 1)
>> +    rc = svm_emulate_vm_instr(vcpu, rc);
>> +    return rc;
>>   }
>>   
> 
> Passing back the third byte is quite hacky.  Instead of this change to
> kvm_emulate_instruction, I'd rather check the instruction bytes in
> gp_interception before calling kvm_emulate_instruction.  That would be
> something like:
> 
> - move "kvm_clear_exception_queue(vcpu);" inside the "if
> (!(emulation_type & EMULTYPE_NO_DECODE))".  It doesn't apply when you
> are coming back from userspace.
> 
> - extract the "if (!(emulation_type & EMULTYPE_NO_DECODE))" body to a
> new function x86_emulate_decoded_instruction.  Call it from
> gp_interception, we know this is not a pagefault and therefore
> vcpu->arch.write_fault_to_shadow_pgtable must be false.

If the whole body inside the if-statement is moved out, do you expect the
interface of x86_emulate_decoded_instruction to be something like:

int x86_emulate_decoded_instruction(struct kvm_vcpu *vcpu,
gpa_t cr2_or_gpa,
int emulation_type, void *insn,
int insn_len,
bool write_fault_to_spt)

And if so, what is the emulation type to use when calling this function
from svm.c? EMULTYPE_VMWARE_GP?

> 
> - check ctxt->insn_bytes for an SVM instruction
> 
> - if not an SVM instruction, call kvm_emulate_instruction(vcpu,
> EMULTYPE_VMWARE_GP|EMULTYPE_NO_DECODE).
> 
> Thanks,
> 
> Paolo
> 


Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-14 Thread Sean Christopherson
On Thu, Jan 14, 2021, Maxim Levitsky wrote:
> On Tue, 2021-01-12 at 15:00 -0500, Bandan Das wrote:
> > Sean Christopherson  writes:
> > ...
> > > > -   if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> > > > -   !is_vmware_backdoor_opcode(ctxt)) {
> > > > -   kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > > -   return 1;
> > > > +   if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> > > > +   vminstr = is_vm_instr_opcode(ctxt);
> > > > +   if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> > > > +   kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > > +   return 1;
> > > > +   }
> > > > +   if (vminstr)
> > > > +   return vminstr;
> > > 
> > > I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits 
> > > a bad
> > > L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't 
> > > checked until
> > > x86_emulate_insn(), and the vm*_interception() helpers expect nested 
> > > VM-Exits to
> > > be handled further up the stack.
> 
> Actually IMHO this is exactly what we want. We want L0 to always intercept
> these #GPs, and hide them from the guest.
> 
> What we do need to do (and I prepared and attached a patch for that) is,
> if we run a guest, to inject the corresponding vmexit (like
> SVM_EXIT_VMRUN) instead of emulating the instruction.

Yes, lack of forwarding to L1 as a nested exit is what I meant by "doesn't
correctly handle".


Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-14 Thread Maxim Levitsky
On Tue, 2021-01-12 at 00:37 -0600, Wei Huang wrote:
> From: Bandan Das 
> 
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept. If EAX falls into such
> memory areas, #GP is triggered before VMEXIT. This causes problems under
> nested virtualization. To solve this problem, KVM needs to trap #GP and
> check the instructions triggering #GP. For VM execution instructions,
> KVM emulates these instructions; otherwise it re-injects #GP back to
> guest VMs.
> 
> Signed-off-by: Bandan Das 
> Co-developed-by: Wei Huang 
> Signed-off-by: Wei Huang 
> ---
>  arch/x86/include/asm/kvm_host.h |   8 +-
>  arch/x86/kvm/mmu.h  |   1 +
>  arch/x86/kvm/mmu/mmu.c  |   7 ++
>  arch/x86/kvm/svm/svm.c  | 157 +++-
>  arch/x86/kvm/svm/svm.h  |   8 ++
>  arch/x86/kvm/vmx/vmx.c  |   2 +-
>  arch/x86/kvm/x86.c  |  37 +++-
>  7 files changed, 146 insertions(+), 74 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6616f6f6ef..0ddc309f5a14 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>   *due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>   *Used to test the full emulator from userspace.
>   *
> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
I would prefer to see this change in a separate patch.


>   *   backdoor emulation, which is opt in via module param.
>   *   VMware backdoor emulation handles select instructions
> - *   and reinjects the #GP for all other cases.
> + *   and reinjects #GP for all other cases. This also
> + *   handles other cases where #GP condition needs to be
> + *   handled and emulated appropriately
>   *
>   * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in 
> which
>   *case the CR2/GPA value pass on the stack is valid.
> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>  #define EMULTYPE_SKIP(1 << 2)
>  #define EMULTYPE_ALLOW_RETRY_PF  (1 << 3)
>  #define EMULTYPE_TRAP_UD_FORCED  (1 << 4)
> -#define EMULTYPE_VMWARE_GP   (1 << 5)
> +#define EMULTYPE_PARAVIRT_GP (1 << 5)
>  #define EMULTYPE_PF  (1 << 6)
>  
>  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 581925e476d6..1a2fff4e7140 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>  
>  int kvm_mmu_post_init_vm(struct kvm *kvm);
>  void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> +bool kvm_is_host_reserved_region(u64 gpa);
>  
>  #endif
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..c5c4aaf01a1a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -50,6 +50,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "trace.h"
>  
>  extern bool itlb_multihit_kvm_mitigation;
> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>  
> +bool kvm_is_host_reserved_region(u64 gpa)
> +{
> + return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> +}
> +EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
> +
>  void kvm_mmu_zap_all(struct kvm *kvm)
>  {
>   struct kvm_mmu_page *sp, *node;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 7ef171790d02..74620d32aa82 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>   if (!(efer & EFER_SVME)) {
>   svm_leave_nested(svm);
>   svm_set_gif(svm, true);
> + clr_exception_intercept(svm, GP_VECTOR);
Wouldn't that be wrong if we intercept #GP due to the vmware backdoor?

I would add a flag that will be true when the workaround for the errata is
enabled, and use it together with the flag that enables the vmware backdoor
for decisions such as the above.

The flag can even be a module param to allow users to disable it if they
really want to.

>  
>   /*
>* Free the nested guest state, unless we are in SMM.
> @@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  
>   svm->vmcb->save.efer = efer | EFER_SVME;
>   vmcb_mark_dirty(svm->vmcb, VMCB_CR);
> + /* Enable GP interception for SVM instructions if needed */
> + if (efer & EFER_SVME)
> 
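
A minimal sketch of the flag suggested above (the flag name, the module
parameter and the helper are illustrative assumptions, not the actual patch):

/*
 * Hypothetical knob gating the errata workaround separately from the
 * VMware backdoor, so disabling one doesn't break the other.
 */
static bool svm_gp_erratum_intercept = true;
module_param(svm_gp_erratum_intercept, bool, 0444);

static void svm_update_gp_intercept(struct vcpu_svm *svm, u64 efer)
{
	/* The errata workaround wants #GP intercepted while SVME is set. */
	if ((efer & EFER_SVME) && svm_gp_erratum_intercept)
		set_exception_intercept(svm, GP_VECTOR);
	/* Only clear the intercept if the backdoor doesn't need it either. */
	else if (!enable_vmware_backdoor)
		clr_exception_intercept(svm, GP_VECTOR);
}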

Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-14 Thread Maxim Levitsky
On Tue, 2021-01-12 at 15:00 -0500, Bandan Das wrote:
> Sean Christopherson  writes:
> ...
> > > - if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> > > - !is_vmware_backdoor_opcode(ctxt)) {
> > > - kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > - return 1;
> > > + if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> > > + vminstr = is_vm_instr_opcode(ctxt);
> > > + if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> > > + kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> > > + return 1;
> > > + }
> > > + if (vminstr)
> > > + return vminstr;
> > 
> > I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits a 
> > bad
> > L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't checked 
> > until
> > x86_emulate_insn(), and the vm*_interception() helpers expect nested 
> > VM-Exits to
> > be handled further up the stack.

Actually IMHO this is exactly what we want. We want L0 to always intercept
these #GPs, and hide them from the guest.

What we do need to do (and I prepared and attached a patch for that) is,
if we run a guest, to inject the corresponding vmexit (like SVM_EXIT_VMRUN)
instead of emulating the instruction.
The attached patch does this, and it made my kvm unit test pass,
even if the test was run in a VM (with an unpatched kernel).

This, together with setting the X86_FEATURE_SVME_ADDR_CHK bit for
the guest, will allow us to hide that errata completely from the guest,
which is a very good thing (for example, for guests that we can't modify).


Best regards,
Maxim Levitsky



> > 
> So, the condition is that L2 executes a vmload and #GPs on a reserved
> address, jumps to L0 - L0 doesn't check if L1 has asked for the instruction
> to be intercepted and goes on with emulating vmload and returning back to L2?



> 
> > >   }
> > >  
> > >   /*
> > > -- 
> > > 2.27.0
> > > 

commit 28ab89aaa11380306bafbf49265222f2a2da71da
Author: Maxim Levitsky 
Date:   Thu Jan 14 10:53:25 2021 +0200

kvm: x86: fix that errata for nested guests

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c31e005252d69..9cfa5946fac69 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2027,6 +2027,26 @@ static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_guest_mode(vcpu)) {
+		switch (modrm) {
+		case 0xd8: /* VMRUN */
+			svm->vmcb->control.exit_code = SVM_EXIT_VMRUN;
+			break;
+		case 0xda: /* VMLOAD */
+			svm->vmcb->control.exit_code = SVM_EXIT_VMLOAD;
+			break;
+		case 0xdb: /* VMSAVE */
+			svm->vmcb->control.exit_code = SVM_EXIT_VMSAVE;
+			break;
+		default:
+			goto inject_exception;
+		}
+
+		svm->vmcb->control.exit_info_1 = 0;
+		svm->vmcb->control.exit_info_2 = 0;
+		return nested_svm_vmexit(svm);
+	}
+
 	switch (modrm) {
 	case 0xd8: /* VMRUN */
 		return vmrun_interception(svm);
@@ -2035,6 +2055,7 @@ static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
 	case 0xdb: /* VMSAVE */
 		return vmsave_interception(svm);
 	default:
+inject_exception:
 		/* inject a #GP for all other cases */
 		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
 		return 1;


Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-14 Thread Maxim Levitsky
On Tue, 2021-01-12 at 23:15 -0600, Wei Huang wrote:
> 
> On 1/12/21 12:58 PM, Andy Lutomirski wrote:
> > Andrew Cooper points out that there may be a nicer workaround.  Make
> > sure that the SMRAM and HT region (FFFD - ) are
> > marked as reserved in the guest, too.
> 
> In theory this proposed solution can avoid intercepting #GP. But in 
> reality SMRAM regions can be different on different machines. So this 
> solution can break after VM migration.
> 
I should add to this that, on my 3970X, I just noticed that the
problematic SMRAM region moved on its own (likely because I moved
some PCIe cards around recently).

Best regards,
Maxim Levitsky



Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-13 Thread Paolo Bonzini

On 12/01/21 18:59, Sean Christopherson wrote:

It would be very helpful to list exactly which CPUs are/aren't affected, even if
that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
I assume it's all CPUs without the new CPUID flag?

Ah, despite calling this an 'errata', the bad behavior is explicitly documented
in the APM, i.e. it's an architecture bug, not a silicon bug.


I would still call it an errata for the case when virtualized 
VMSAVE/VMLOAD is enabled (and therefore VMLOAD intercepts are disabled). 
 In that case, the problem is that the GPA does not go through NPT 
before it is checked against *host* reserved memory regions.


In fact I hope that, on processors that have the fix, VMSAVE/VMLOAD 
from guest mode _does_ check the GPA after it's been translated!


Paolo



Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-13 Thread Paolo Bonzini

On 12/01/21 18:42, Sean Christopherson wrote:

On a related topic, it feels like nested should be disabled by default on SVM
until it's truly ready for primetime, with the patch tagged for stable.  That
way we don't have to worry about crafting non-trivial fixes (like this one) to
make them backport-friendly.


Well, that's historical; I wish it had been disabled by default back in 
the day.


However, after 10 years and after the shakedown last year, it's hard to 
justify breaking backwards compatibility.  Nested SVM is not any less 
ready than nested VMX---just a little less optimized for things such as 
TLB flushes and ASID/VPID---even without this fix.  The erratum has 
visible effects only on a minority of AMD systems (it depends on an 
unlucky placement of TSEG on L0), and it is easy to work around it by 
lowering the amount of <4G memory in L1.


Paolo



Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Wei Huang




On 1/12/21 12:58 PM, Andy Lutomirski wrote:

Andrew Cooper points out that there may be a nicer workaround.  Make
sure that the SMRAM and HT region (FFFD - ) are
marked as reserved in the guest, too.


In theory this proposed solution can avoid intercepting #GP. But in 
reality SMRAM regions can be different on different machines. So this 
solution can break after VM migration.


Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Wei Huang




On 1/12/21 11:59 AM, Sean Christopherson wrote:

On Tue, Jan 12, 2021, Sean Christopherson wrote:

On Tue, Jan 12, 2021, Wei Huang wrote:

From: Bandan Das 

While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
before checking VMCB's instruction intercept.


It would be very helpful to list exactly which CPUs are/aren't affected, even if
that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
I assume it's all CPUs without the new CPUID flag?


This behavior dates back to fairly old CPUs. It is fair to assume 
that _most_ CPUs without this CPUID bit can demonstrate such behavior.




Ah, despite calling this an 'errata', the bad behavior is explicitly documented
in the APM, i.e. it's an architecture bug, not a silicon bug.

Can you reword the changelog to make it clear that the premature #GP is the
correct architectural behavior for CPUs without the new CPUID flag?


Sure, will do in the next version.





Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Wei Huang




On 1/12/21 11:56 AM, Sean Christopherson wrote:

On Tue, Jan 12, 2021, Andy Lutomirski wrote:



On Jan 12, 2021, at 7:46 AM, Bandan Das  wrote:

Andy Lutomirski  writes:
...

#endif

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..c5c4aaf01a1a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -50,6 +50,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "trace.h"

 extern bool itlb_multihit_kvm_mitigation;
@@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);

+bool kvm_is_host_reserved_region(u64 gpa)
+{
+	return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
+}

While _e820__mapped_any()'s doc says '..  checks if any part of
the range  is mapped ..' it seems to me that the real
check is [start, end) so we should use 'gpa' instead of 'gpa-1',
no?

Why do you need to check GPA at all?


To reduce the scope of the workaround.

The errata only happens when you use one of SVM instructions in the
guest with EAX that happens to be inside one of the host reserved
memory regions (for example SMM).


This code reduces the scope of the workaround at the cost of
increasing the complexity of the workaround and adding a nonsensical
coupling between KVM and host details and adding an export that really
doesn’t deserve to be exported.

Is there an actual concrete benefit to this check?


Besides reducing the scope, my intention for the check was that we should
know if such exceptions occur for any other undiscovered reasons with other
memory types rather than hiding them under this workaround.


Ask AMD?


There are several checks before VMRUN launch. The function
e820__mapped_raw_any() was definitely one of the easiest ways to figure
out the problematic regions we had.




I would also believe that someone somewhere has a firmware that simply omits
the problematic region instead of listing it as reserved.


I agree with Andy, odds are very good that attempting to be precise will lead to
pain due to false negatives.

And, KVM's SVM instruction emulation needs to be rock solid regardless of
this behavior since KVM unconditionally intercepts the instruction, i.e. there's
basically zero risk to KVM.



Are you saying that the instruction decode before
kvm_is_host_reserved_region() already guarantees that the instructions
hitting #GP are SVM execution instructions (see below)? If so, I think
this argument is fair.


+   switch (ctxt->modrm) {
+   case 0xd8: /* VMRUN */
+   case 0xda: /* VMLOAD */
+   case 0xdb: /* VMSAVE */

Bandan: What are your thoughts about removing kvm_is_host_reserved_region()?
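
If the check is dropped as discussed, the opcode test quoted above would
presumably reduce to something like this sketch (keeping only the
decode-based filtering):

static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)
{
	/* VMRUN/VMLOAD/VMSAVE are all 0x0f 0x01 /modrm encodings. */
	if (ctxt->b != 0x1)
		return 0;

	switch (ctxt->modrm) {
	case 0xd8: /* VMRUN */
	case 0xda: /* VMLOAD */
	case 0xdb: /* VMSAVE */
		return ctxt->modrm;
	default:
		return 0;
	}
}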



Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Bandan Das
Sean Christopherson  writes:
...
>> -if ((emulation_type & EMULTYPE_VMWARE_GP) &&
>> -!is_vmware_backdoor_opcode(ctxt)) {
>> -kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> -return 1;
>> +if (emulation_type & EMULTYPE_PARAVIRT_GP) {
>> +vminstr = is_vm_instr_opcode(ctxt);
>> +if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
>> +kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
>> +return 1;
>> +}
>> +if (vminstr)
>> +return vminstr;
>
> I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits a bad
> L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't checked 
> until
> x86_emulate_insn(), and the vm*_interception() helpers expect nested VM-Exits 
> to
> be handled further up the stack.
>
So, the condition is that L2 executes a vmload and #GPs on a reserved
address, jumps to L0 - L0 doesn't check if L1 has asked for the instruction
to be intercepted and goes on with emulating vmload and returning back to L2?

>>  }
>>  
>>  /*
>> -- 
>> 2.27.0
>> 



Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Wei Huang




On 1/12/21 6:15 AM, Vitaly Kuznetsov wrote:

Wei Huang  writes:


From: Bandan Das 

While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
before checking VMCB's instruction intercept. If EAX falls into such
memory areas, #GP is triggered before VMEXIT. This causes problems under
nested virtualization. To solve this problem, KVM needs to trap #GP and
check the instructions triggering #GP. For VM execution instructions,
KVM emulates these instructions; otherwise it re-injects #GP back to
guest VMs.

Signed-off-by: Bandan Das 
Co-developed-by: Wei Huang 
Signed-off-by: Wei Huang 
---
  arch/x86/include/asm/kvm_host.h |   8 +-
  arch/x86/kvm/mmu.h  |   1 +
  arch/x86/kvm/mmu/mmu.c  |   7 ++
  arch/x86/kvm/svm/svm.c  | 157 +++-
  arch/x86/kvm/svm/svm.h  |   8 ++
  arch/x86/kvm/vmx/vmx.c  |   2 +-
  arch/x86/kvm/x86.c  |  37 +++-
  7 files changed, 146 insertions(+), 74 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3d6616f6f6ef..0ddc309f5a14 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
   * due to an intercepted #UD (see EMULTYPE_TRAP_UD).
   * Used to test the full emulator from userspace.
   *
- * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
+ * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
   *backdoor emulation, which is opt in via module param.
  *VMware backdoor emulation handles select instructions
- * and reinjects the #GP for all other cases.
+ * and reinjects #GP for all other cases. This also
+ * handles other cases where #GP condition needs to be
+ * handled and emulated appropriately
   *
   * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in 
which
   * case the CR2/GPA value pass on the stack is valid.
@@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
  #define EMULTYPE_SKIP (1 << 2)
  #define EMULTYPE_ALLOW_RETRY_PF   (1 << 3)
  #define EMULTYPE_TRAP_UD_FORCED   (1 << 4)
-#define EMULTYPE_VMWARE_GP (1 << 5)
+#define EMULTYPE_PARAVIRT_GP   (1 << 5)
  #define EMULTYPE_PF   (1 << 6)
  
  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 581925e476d6..1a2fff4e7140 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
  
  int kvm_mmu_post_init_vm(struct kvm *kvm);

  void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+bool kvm_is_host_reserved_region(u64 gpa);


Just a suggestion: "kvm_gpa_in_host_reserved()" maybe?


Will do in v2.



  
  #endif

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..c5c4aaf01a1a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -50,6 +50,7 @@
  #include 
  #include 
  #include 
+#include 
  #include "trace.h"
  
  extern bool itlb_multihit_kvm_mitigation;

@@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
  
+bool kvm_is_host_reserved_region(u64 gpa)

+{
+   return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
+}


While _e820__mapped_any()'s doc says '..  checks if any part of the
range  is mapped ..' it seems to me that the real check is
[start, end) so we should use 'gpa' instead of 'gpa-1', no?


I think you are right. The statement of "entry->addr >= end || 
entry->addr + entry->size <= start" shows the checking is against the 
area of [start, end).
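
So a corrected version following that reading would presumably be:

bool kvm_is_host_reserved_region(u64 gpa)
{
	/*
	 * e820__mapped_raw_any() checks [start, end), so [gpa, gpa+1)
	 * covers exactly the byte at gpa.
	 */
	return e820__mapped_raw_any(gpa, gpa + 1, E820_TYPE_RESERVED);
}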





+EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
+
  void kvm_mmu_zap_all(struct kvm *kvm)
  {
struct kvm_mmu_page *sp, *node;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7ef171790d02..74620d32aa82 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if (!(efer & EFER_SVME)) {
svm_leave_nested(svm);
svm_set_gif(svm, true);
+   clr_exception_intercept(svm, GP_VECTOR);
  
  			/*

 * Free the nested guest state, unless we are in SMM.
@@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
  
  	svm->vmcb->save.efer = efer | EFER_SVME;

vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+   /* Enable GP interception for SVM instructions if needed */
+   if (efer & EFER_SVME)
+   set_exception_intercept(svm, GP_VECTOR);
+
return 0;
  }
  
@@ -1957,22 

Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Wei Huang




On 1/12/21 5:09 AM, Maxim Levitsky wrote:

On Tue, 2021-01-12 at 00:37 -0600, Wei Huang wrote:

From: Bandan Das 

While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
before checking VMCB's instruction intercept. If EAX falls into such
memory areas, #GP is triggered before VMEXIT. This causes problems under
nested virtualization. To solve this problem, KVM needs to trap #GP and
check the instructions triggering #GP. For VM execution instructions,
KVM emulates these instructions; otherwise it re-injects #GP back to
guest VMs.

Signed-off-by: Bandan Das 
Co-developed-by: Wei Huang 
Signed-off-by: Wei Huang 


This is the ultimate fix for this bug that I had in mind,
but I didn't dare to develop it, thinking it wouldn't be accepted
due to the added complexity.

From a cursory look this looks all right, and I will review
and test this either today or tomorrow.


My tests mainly relied on the kvm-unit-test you developed (thanks BTW), 
on machines w/ and w/o CPUID_0x800A_EDX[28]=1. Both cases passed.






Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Sean Christopherson
On Tue, Jan 12, 2021, Wei Huang wrote:
> +/* Emulate SVM VM execution instructions */
> +static int svm_emulate_vm_instr(struct kvm_vcpu *vcpu, u8 modrm)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + switch (modrm) {
> + case 0xd8: /* VMRUN */
> + return vmrun_interception(svm);
> + case 0xda: /* VMLOAD */
> + return vmload_interception(svm);
> + case 0xdb: /* VMSAVE */
> + return vmsave_interception(svm);
> + default:
> + /* inject a #GP for all other cases */
> + kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> + return 1;
> + }
> +}
> +
>  static int gp_interception(struct vcpu_svm *svm)
>  {
>   struct kvm_vcpu *vcpu = &svm->vcpu;
>   u32 error_code = svm->vmcb->control.exit_info_1;
> -
> - WARN_ON_ONCE(!enable_vmware_backdoor);
> + int rc;
>  
>   /*
> -  * VMware backdoor emulation on #GP interception only handles IN{S},
> -  * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> +  * Only VMware backdoor and SVM VME errata are handled. Neither of
> +  * them has non-zero error codes.
>*/
>   if (error_code) {
>   kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
>   return 1;
>   }
> - return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> +
> + rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> + if (rc > 1)
> + rc = svm_emulate_vm_instr(vcpu, rc);
> + return rc;
>  }
 
...
  
> +static int is_vm_instr_opcode(struct x86_emulate_ctxt *ctxt)
> +{
> + unsigned long rax;
> +
> + if (ctxt->b != 0x1)
> + return 0;
> +
> + switch (ctxt->modrm) {
> + case 0xd8: /* VMRUN */
> + case 0xda: /* VMLOAD */
> + case 0xdb: /* VMSAVE */
> + rax = kvm_register_read(emul_to_vcpu(ctxt), VCPU_REGS_RAX);
> + if (!kvm_is_host_reserved_region(rax))
> + return 0;
> + break;
> + default:
> + return 0;
> + }
> +
> + return ctxt->modrm;
> +}
> +
>  static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
>  {
>   switch (ctxt->opcode_len) {
> @@ -7305,6 +7327,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, 
> gpa_t cr2_or_gpa,
>   struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>   bool writeback = true;
>   bool write_fault_to_spt;
> + int vminstr;
>  
>   if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, 
> insn_len)))
>   return 1;
> @@ -7367,10 +7390,14 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, 
> gpa_t cr2_or_gpa,
>   }
>   }
>  
> - if ((emulation_type & EMULTYPE_VMWARE_GP) &&
> - !is_vmware_backdoor_opcode(ctxt)) {
> - kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> - return 1;
> + if (emulation_type & EMULTYPE_PARAVIRT_GP) {
> + vminstr = is_vm_instr_opcode(ctxt);
> + if (!vminstr && !is_vmware_backdoor_opcode(ctxt)) {
> + kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
> + return 1;
> + }
> + if (vminstr)
> + return vminstr;

I'm pretty sure this doesn't correctly handle a VM-instr in L2 that hits a bad
L0 GPA and that L1 wants to intercept.  The intercept bitmap isn't checked until
x86_emulate_insn(), and the vm*_interception() helpers expect nested VM-Exits to
be handled further up the stack.

>   }
>  
>   /*
> -- 
> 2.27.0
> 


Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Andy Lutomirski
On Tue, Jan 12, 2021 at 9:59 AM Sean Christopherson  wrote:
>
> On Tue, Jan 12, 2021, Sean Christopherson wrote:
> > On Tue, Jan 12, 2021, Wei Huang wrote:
> > > From: Bandan Das 
> > >
> > > While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> > > CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> > > before checking VMCB's instruction intercept.
> >
> > It would be very helpful to list exactly which CPUs are/aren't affected, 
> > even if
> > that just means stating something like "all CPUs before XYZ".  Given patch 
> > 2/2,
> > I assume it's all CPUs without the new CPUID flag?
>
> Ah, despite calling this an 'errata', the bad behavior is explicitly 
> documented
> in the APM, i.e. it's an architecture bug, not a silicon bug.
>
> Can you reword the changelog to make it clear that the premature #GP is the
> correct architectural behavior for CPUs without the new CPUID flag?

Andrew Cooper points out that there may be a nicer workaround.  Make
sure that the SMRAM and HT region (FFFD - ) are
marked as reserved in the guest, too.

--Andy


Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Sean Christopherson
On Tue, Jan 12, 2021, Sean Christopherson wrote:
> On Tue, Jan 12, 2021, Wei Huang wrote:
> > From: Bandan Das 
> > 
> > While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> > CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> > before checking VMCB's instruction intercept.
> 
> It would be very helpful to list exactly which CPUs are/aren't affected, even 
> if
> that just means stating something like "all CPUs before XYZ".  Given patch 
> 2/2,
> I assume it's all CPUs without the new CPUID flag?

Ah, despite calling this an 'errata', the bad behavior is explicitly documented
in the APM, i.e. it's an architecture bug, not a silicon bug.

Can you reword the changelog to make it clear that the premature #GP is the
correct architectural behavior for CPUs without the new CPUID flag?


Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Sean Christopherson
On Tue, Jan 12, 2021, Andy Lutomirski wrote:
> 
> > On Jan 12, 2021, at 7:46 AM, Bandan Das  wrote:
> > 
> > Andy Lutomirski  writes:
> > ...
> >> #endif
> >> 
> >> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> >> index 6d16481aa29d..c5c4aaf01a1a 100644
> >> --- a/arch/x86/kvm/mmu/mmu.c
> >> +++ b/arch/x86/kvm/mmu/mmu.c
> >> @@ -50,6 +50,7 @@
> >>  #include 
> >>  #include 
> >>  #include 
> >> +#include 
> >>  #include "trace.h"
> >> 
> >>  extern bool itlb_multihit_kvm_mitigation;
> >> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
> >>  }
> >>  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
> >> 
> >> +bool kvm_is_host_reserved_region(u64 gpa)
> >> +{
> >> +	return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> >> +}
> > While _e820__mapped_any()'s doc says '..  checks if any part of
> > the range  is mapped ..' it seems to me that the real
> > check is [start, end) so we should use 'gpa' instead of 'gpa-1',
> > no?
>  Why do you need to check GPA at all?
>  
> >>> To reduce the scope of the workaround.
> >>> 
> >>> The errata only happens when you use one of SVM instructions in the
> >>> guest with EAX that happens to be inside one of the host reserved
> >>> memory regions (for example SMM).
> >> 
> >> This code reduces the scope of the workaround at the cost of
> >> increasing the complexity of the workaround and adding a nonsensical
> >> coupling between KVM and host details and adding an export that really
> >> doesn’t deserve to be exported.
> >> 
> >> Is there an actual concrete benefit to this check?
> > 
> > Besides reducing the scope, my intention for the check was that we should
> > know if such exceptions occur for any other undiscovered reasons with other
> > memory types rather than hiding them under this workaround.
> 
> Ask AMD?
> 
> I would also believe that someone somewhere has a firmware that simply omits
> the problematic region instead of listing it as reserved.

I agree with Andy, odds are very good that attempting to be precise will lead to
pain due to false negatives.

And, KVM's SVM instruction emulation needs to be rock solid regardless of
this behavior since KVM unconditionally intercepts the instruction, i.e. there's
basically zero risk to KVM.


Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Sean Christopherson
On Tue, Jan 12, 2021, Paolo Bonzini wrote:
> On 12/01/21 07:37, Wei Huang wrote:
> >   static int gp_interception(struct vcpu_svm *svm)
> >   {
> > struct kvm_vcpu *vcpu = &svm->vcpu;
> > u32 error_code = svm->vmcb->control.exit_info_1;
> > -
> > -   WARN_ON_ONCE(!enable_vmware_backdoor);
> > +   int rc;
> > /*
> > -* VMware backdoor emulation on #GP interception only handles IN{S},
> > -* OUT{S}, and RDPMC, none of which generate a non-zero error code.
> > +* Only VMware backdoor and SVM VME errata are handled. Neither of
> > +* them has non-zero error codes.
> >  */
> > if (error_code) {
> > kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
> > return 1;
> > }
> > -   return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> > +
> > +   rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
> > +   if (rc > 1)
> > +   rc = svm_emulate_vm_instr(vcpu, rc);
> > +   return rc;
> >   }
> 
> Passing back the third byte is quite hacky.  Instead of this change to
> kvm_emulate_instruction, I'd rather check the instruction bytes in
> gp_interception before calling kvm_emulate_instruction.

Agreed.  And I'd also prefer that any pure refactoring is done in separate
patch(es) so that the actual functional change is better isolated.

On a related topic, it feels like nested should be disabled by default on SVM
until it's truly ready for primetime, with the patch tagged for stable.  That
way we don't have to worry about crafting non-trivial fixes (like this one) to
make them backport-friendly.


Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Sean Christopherson
On Tue, Jan 12, 2021, Wei Huang wrote:
> From: Bandan Das 
> 
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept.

It would be very helpful to list exactly which CPUs are/aren't affected, even if
that just means stating something like "all CPUs before XYZ".  Given patch 2/2,
I assume it's all CPUs without the new CPUID flag?


Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Andy Lutomirski


> On Jan 12, 2021, at 7:46 AM, Bandan Das  wrote:
> 
> Andy Lutomirski  writes:
> ...
>> #endif
>> 
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 6d16481aa29d..c5c4aaf01a1a 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -50,6 +50,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include "trace.h"
>> 
>>  extern bool itlb_multihit_kvm_mitigation;
>> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>> 
>> +bool kvm_is_host_reserved_region(u64 gpa)
>> +{
>> +	return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
>> +}
> While _e820__mapped_any()'s doc says '..  checks if any part of
> the range  is mapped ..' it seems to me that the real
> check is [start, end) so we should use 'gpa' instead of 'gpa-1',
> no?
 Why do you need to check GPA at all?
 
>>> To reduce the scope of the workaround.
>>> 
>>> The errata only happens when you use one of SVM instructions in the
>>> guest with EAX that happens to be inside one of the host reserved
>>> memory regions (for example SMM).
>> 
>> This code reduces the scope of the workaround at the cost of
>> increasing the complexity of the workaround and adding a nonsensical
>> coupling between KVM and host details and adding an export that really
>> doesn’t deserve to be exported.
>> 
>> Is there an actual concrete benefit to this check?
> 
> Besides reducing the scope, my intention for the check was that we should
> know if such exceptions occur for any other undiscovered reasons with other
> memory types rather than hiding them under this workaround.

Ask AMD?

I would also believe that someone somewhere has a firmware that simply omits 
the problematic region instead of listing it as reserved.

> 
> Bandan
> 
> 
> 


Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Bandan Das
Andy Lutomirski  writes:
...
> #endif
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..c5c4aaf01a1a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -50,6 +50,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "trace.h"
> 
>  extern bool itlb_multihit_kvm_mitigation;
> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
> 
> +bool kvm_is_host_reserved_region(u64 gpa)
> +{
> +	return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> +}
  While _e820__mapped_any()'s doc says '..  checks if any part of
 the range  is mapped ..' it seems to me that the real
 check is [start, end) so we should use 'gpa' instead of 'gpa-1',
 no?
>>>  Why do you need to check GPA at all?
>>> 
>> To reduce the scope of the workaround.
>> 
>> The errata only happens when you use one of SVM instructions in the
>> guest with EAX that happens to be inside one of the host reserved
>> memory regions (for example SMM).
>
> This code reduces the scope of the workaround at the cost of
> increasing the complexity of the workaround and adding a nonsensical
> coupling between KVM and host details and adding an export that really
> doesn’t deserve to be exported.
>
> Is there an actual concrete benefit to this check?

Besides reducing the scope, my intention for the check was that we should
know if such exceptions occur for any other undiscovered reasons with other
memory types rather than hiding them under this workaround.

Bandan





Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Andy Lutomirski



> On Jan 12, 2021, at 7:17 AM, Maxim Levitsky  wrote:
> 
> On Tue, 2021-01-12 at 07:11 -0800, Andy Lutomirski wrote:
 On Jan 12, 2021, at 4:15 AM, Vitaly Kuznetsov  wrote:
>>> 
>>> Wei Huang  writes:
>>> 
 From: Bandan Das 
 
 While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
 CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
 before checking VMCB's instruction intercept. If EAX falls into such
 memory areas, #GP is triggered before VMEXIT. This causes problems under
 nested virtualization. To solve this problem, KVM needs to trap #GP and
 check the instructions triggering #GP. For VM execution instructions,
 KVM emulates these instructions; otherwise it re-injects #GP back to
 guest VMs.
 
 Signed-off-by: Bandan Das 
 Co-developed-by: Wei Huang 
 Signed-off-by: Wei Huang 
 ---
 arch/x86/include/asm/kvm_host.h |   8 +-
 arch/x86/kvm/mmu.h  |   1 +
 arch/x86/kvm/mmu/mmu.c  |   7 ++
 arch/x86/kvm/svm/svm.c  | 157 +++-
 arch/x86/kvm/svm/svm.h  |   8 ++
 arch/x86/kvm/vmx/vmx.c  |   2 +-
 arch/x86/kvm/x86.c  |  37 +++-
 7 files changed, 146 insertions(+), 74 deletions(-)
 
 diff --git a/arch/x86/include/asm/kvm_host.h 
 b/arch/x86/include/asm/kvm_host.h
 index 3d6616f6f6ef..0ddc309f5a14 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
 * due to an intercepted #UD (see EMULTYPE_TRAP_UD).
 * Used to test the full emulator from userspace.
 *
 - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
 + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
 *backdoor emulation, which is opt in via module param.
 *VMware backdoor emulation handles select instructions
 - *and reinjects the #GP for all other cases.
 + *and reinjects #GP for all other cases. This also
 + *handles other cases where #GP condition needs to be
 + *handled and emulated appropriately
 *
 * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in 
 which
 * case the CR2/GPA value pass on the stack is valid.
 @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
 #define EMULTYPE_SKIP(1 << 2)
 #define EMULTYPE_ALLOW_RETRY_PF(1 << 3)
 #define EMULTYPE_TRAP_UD_FORCED(1 << 4)
 -#define EMULTYPE_VMWARE_GP(1 << 5)
 +#define EMULTYPE_PARAVIRT_GP(1 << 5)
 #define EMULTYPE_PF(1 << 6)
 
 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
 diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
 index 581925e476d6..1a2fff4e7140 100644
 --- a/arch/x86/kvm/mmu.h
 +++ b/arch/x86/kvm/mmu.h
 @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_post_init_vm(struct kvm *kvm);
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
 +bool kvm_is_host_reserved_region(u64 gpa);
>>> 
>>> Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 
>>> 
 #endif
 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
 index 6d16481aa29d..c5c4aaf01a1a 100644
 --- a/arch/x86/kvm/mmu/mmu.c
 +++ b/arch/x86/kvm/mmu/mmu.c
 @@ -50,6 +50,7 @@
 #include 
 #include 
 #include 
 +#include 
 #include "trace.h"
 
 extern bool itlb_multihit_kvm_mitigation;
 @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
 +bool kvm_is_host_reserved_region(u64 gpa)
 +{
 +return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
 +}
>>> 
>>> While _e820__mapped_any()'s doc says '..  checks if any part of the
>>> range  is mapped ..' it seems to me that the real check is
>>> [start, end) so we should use 'gpa' instead of 'gpa-1', no?
>> 
>> Why do you need to check GPA at all?
>> 
> To reduce the scope of the workaround.
> 
> The errata only happens when you use one of SVM instructions
> in the guest with EAX that happens to be inside one
> of the host reserved memory regions (for example SMM).

This code reduces the scope of the workaround at the cost of increasing the 
complexity of the workaround and adding a nonsensical coupling between KVM and 
host details and adding an export that really doesn’t deserve to be exported.

Is there an actual concrete benefit to this check?



Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Maxim Levitsky
On Tue, 2021-01-12 at 07:11 -0800, Andy Lutomirski wrote:
> > On Jan 12, 2021, at 4:15 AM, Vitaly Kuznetsov  wrote:
> > 
> > Wei Huang  writes:
> > 
> > > From: Bandan Das 
> > > 
> > > While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> > > CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> > > before checking VMCB's instruction intercept. If EAX falls into such
> > > memory areas, #GP is triggered before VMEXIT. This causes problems under
> > > nested virtualization. To solve this problem, KVM needs to trap #GP and
> > > check the instructions triggering #GP. For VM execution instructions,
> > > KVM emulates these instructions; otherwise it re-injects #GP back to
> > > guest VMs.
> > > 
> > > Signed-off-by: Bandan Das 
> > > Co-developed-by: Wei Huang 
> > > Signed-off-by: Wei Huang 
> > > ---
> > > arch/x86/include/asm/kvm_host.h |   8 +-
> > > arch/x86/kvm/mmu.h  |   1 +
> > > arch/x86/kvm/mmu/mmu.c  |   7 ++
> > > arch/x86/kvm/svm/svm.c  | 157 +++-
> > > arch/x86/kvm/svm/svm.h  |   8 ++
> > > arch/x86/kvm/vmx/vmx.c  |   2 +-
> > > arch/x86/kvm/x86.c  |  37 +++-
> > > 7 files changed, 146 insertions(+), 74 deletions(-)
> > > 
> > > diff --git a/arch/x86/include/asm/kvm_host.h 
> > > b/arch/x86/include/asm/kvm_host.h
> > > index 3d6616f6f6ef..0ddc309f5a14 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
> > >  * due to an intercepted #UD (see EMULTYPE_TRAP_UD).
> > >  * Used to test the full emulator from userspace.
> > >  *
> > > - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
> > > + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for 
> > > VMware
> > >  *backdoor emulation, which is opt in via module param.
> > >  *VMware backdoor emulation handles select instructions
> > > - *and reinjects the #GP for all other cases.
> > > + *and reinjects #GP for all other cases. This also
> > > + *handles other cases where #GP condition needs to be
> > > + *handled and emulated appropriately
> > >  *
> > >  * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in 
> > > which
> > >  * case the CR2/GPA value pass on the stack is valid.
> > > @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
> > > #define EMULTYPE_SKIP(1 << 2)
> > > #define EMULTYPE_ALLOW_RETRY_PF(1 << 3)
> > > #define EMULTYPE_TRAP_UD_FORCED(1 << 4)
> > > -#define EMULTYPE_VMWARE_GP(1 << 5)
> > > +#define EMULTYPE_PARAVIRT_GP(1 << 5)
> > > #define EMULTYPE_PF(1 << 6)
> > > 
> > > int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
> > > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> > > index 581925e476d6..1a2fff4e7140 100644
> > > --- a/arch/x86/kvm/mmu.h
> > > +++ b/arch/x86/kvm/mmu.h
> > > @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
> > > 
> > > int kvm_mmu_post_init_vm(struct kvm *kvm);
> > > void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> > > +bool kvm_is_host_reserved_region(u64 gpa);
> > 
> > Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 
> > 
> > > #endif
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 6d16481aa29d..c5c4aaf01a1a 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -50,6 +50,7 @@
> > > #include 
> > > #include 
> > > #include 
> > > +#include 
> > > #include "trace.h"
> > > 
> > > extern bool itlb_multihit_kvm_mitigation;
> > > @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
> > > }
> > > EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
> > > 
> > > +bool kvm_is_host_reserved_region(u64 gpa)
> > > +{
> > > +return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> > > +}
> > 
> > While _e820__mapped_any()'s doc says '..  checks if any part of the
> > range  is mapped ..' it seems to me that the real check is
> > [start, end) so we should use 'gpa' instead of 'gpa-1', no?
> 
> Why do you need to check GPA at all?
> 
To reduce the scope of the workaround.

The errata only happens when you use one of SVM instructions
in the guest with EAX that happens to be inside one
of the host reserved memory regions (for example SMM).

So it is not expected for an SVM instruction with EAX that is a valid host
physical address to get a #GP due to this errata. 

Best regards,
Maxim Levitsky

> 




Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Andy Lutomirski


> On Jan 12, 2021, at 4:15 AM, Vitaly Kuznetsov  wrote:
> 
> Wei Huang  writes:
> 
>> From: Bandan Das 
>> 
>> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
>> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
>> before checking VMCB's instruction intercept. If EAX falls into such
>> memory areas, #GP is triggered before VMEXIT. This causes problems under
>> nested virtualization. To solve this problem, KVM needs to trap #GP and
>> check the instructions triggering #GP. For VM execution instructions,
>> KVM emulates these instructions; otherwise it re-injects #GP back to
>> guest VMs.
>> 
>> Signed-off-by: Bandan Das 
>> Co-developed-by: Wei Huang 
>> Signed-off-by: Wei Huang 
>> ---
>> arch/x86/include/asm/kvm_host.h |   8 +-
>> arch/x86/kvm/mmu.h  |   1 +
>> arch/x86/kvm/mmu/mmu.c  |   7 ++
>> arch/x86/kvm/svm/svm.c  | 157 +++-
>> arch/x86/kvm/svm/svm.h  |   8 ++
>> arch/x86/kvm/vmx/vmx.c  |   2 +-
>> arch/x86/kvm/x86.c  |  37 +++-
>> 7 files changed, 146 insertions(+), 74 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/kvm_host.h 
>> b/arch/x86/include/asm/kvm_host.h
>> index 3d6616f6f6ef..0ddc309f5a14 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>>  * due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>>  * Used to test the full emulator from userspace.
>>  *
>> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
>> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>>  *backdoor emulation, which is opt in via module param.
>>  *VMware backdoor emulation handles select instructions
>> - *and reinjects the #GP for all other cases.
>> + *and reinjects #GP for all other cases. This also
>> + *handles other cases where #GP condition needs to be
>> + *handled and emulated appropriately
>>  *
>>  * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in 
>> which
>>  * case the CR2/GPA value pass on the stack is valid.
>> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>> #define EMULTYPE_SKIP(1 << 2)
>> #define EMULTYPE_ALLOW_RETRY_PF(1 << 3)
>> #define EMULTYPE_TRAP_UD_FORCED(1 << 4)
>> -#define EMULTYPE_VMWARE_GP(1 << 5)
>> +#define EMULTYPE_PARAVIRT_GP(1 << 5)
>> #define EMULTYPE_PF(1 << 6)
>> 
>> int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index 581925e476d6..1a2fff4e7140 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>> 
>> int kvm_mmu_post_init_vm(struct kvm *kvm);
>> void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
>> +bool kvm_is_host_reserved_region(u64 gpa);
> 
> Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 
> 
>> 
>> #endif
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 6d16481aa29d..c5c4aaf01a1a 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -50,6 +50,7 @@
>> #include 
>> #include 
>> #include 
>> +#include 
>> #include "trace.h"
>> 
>> extern bool itlb_multihit_kvm_mitigation;
>> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>> }
>> EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>> 
>> +bool kvm_is_host_reserved_region(u64 gpa)
>> +{
>> +return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
>> +}
> 
> While _e820__mapped_any()'s doc says '..  checks if any part of the
> range  is mapped ..' it seems to me that the real check is
> [start, end) so we should use 'gpa' instead of 'gpa-1', no?

Why do you need to check GPA at all?




Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Paolo Bonzini

On 12/01/21 07:37, Wei Huang wrote:

  static int gp_interception(struct vcpu_svm *svm)
  {
	struct kvm_vcpu *vcpu = &svm->vcpu;
u32 error_code = svm->vmcb->control.exit_info_1;
-
-   WARN_ON_ONCE(!enable_vmware_backdoor);
+   int rc;
  
  	/*

-* VMware backdoor emulation on #GP interception only handles IN{S},
-* OUT{S}, and RDPMC, none of which generate a non-zero error code.
+* Only VMware backdoor and SVM VME errata are handled. Neither of
+* them has non-zero error codes.
 */
if (error_code) {
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
return 1;
}
-   return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
+
+   rc = kvm_emulate_instruction(vcpu, EMULTYPE_PARAVIRT_GP);
+   if (rc > 1)
+   rc = svm_emulate_vm_instr(vcpu, rc);
+   return rc;
  }
  


Passing back the third byte is quite hacky.  Instead of this change to 
kvm_emulate_instruction, I'd rather check the instruction bytes in 
gp_interception before calling kvm_emulate_instruction.  That would be 
something like:


- move "kvm_clear_exception_queue(vcpu);" inside the "if 
(!(emulation_type & EMULTYPE_NO_DECODE))".  It doesn't apply when you 
are coming back from userspace.


- extract the "if (!(emulation_type & EMULTYPE_NO_DECODE))" body to a 
new function x86_emulate_decoded_instruction.  Call it from 
gp_interception, we know this is not a pagefault and therefore 
vcpu->arch.write_fault_to_shadow_pgtable must be false.


- check ctxt->insn_bytes for an SVM instruction

- if not an SVM instruction, call kvm_emulate_instruction(vcpu, 
EMULTYPE_VMWARE_GP|EMULTYPE_NO_DECODE).


Thanks,

Paolo



Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Vitaly Kuznetsov
Wei Huang  writes:

> From: Bandan Das 
>
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept. If EAX falls into such
> memory areas, #GP is triggered before VMEXIT. This causes problems under
> nested virtualization. To solve this problem, KVM needs to trap #GP and
> check the instructions triggering #GP. For VM execution instructions,
> KVM emulates these instructions; otherwise it re-injects #GP back to
> guest VMs.
>
> Signed-off-by: Bandan Das 
> Co-developed-by: Wei Huang 
> Signed-off-by: Wei Huang 
> ---
>  arch/x86/include/asm/kvm_host.h |   8 +-
>  arch/x86/kvm/mmu.h  |   1 +
>  arch/x86/kvm/mmu/mmu.c  |   7 ++
>  arch/x86/kvm/svm/svm.c  | 157 +++-
>  arch/x86/kvm/svm/svm.h  |   8 ++
>  arch/x86/kvm/vmx/vmx.c  |   2 +-
>  arch/x86/kvm/x86.c  |  37 +++-
>  7 files changed, 146 insertions(+), 74 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6616f6f6ef..0ddc309f5a14 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>   *due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>   *Used to test the full emulator from userspace.
>   *
> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>   *   backdoor emulation, which is opt in via module param.
>   *   VMware backdoor emulation handles select instructions
> - *   and reinjects the #GP for all other cases.
> + *   and reinjects #GP for all other cases. This also
> + *   handles other cases where a #GP condition needs to be
> + *   handled and emulated appropriately.
>   *
>   * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in
>   *   which case the CR2/GPA value passed on the stack is valid.
> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>  #define EMULTYPE_SKIP		(1 << 2)
>  #define EMULTYPE_ALLOW_RETRY_PF  (1 << 3)
>  #define EMULTYPE_TRAP_UD_FORCED  (1 << 4)
> -#define EMULTYPE_VMWARE_GP   (1 << 5)
> +#define EMULTYPE_PARAVIRT_GP (1 << 5)
>  #define EMULTYPE_PF  (1 << 6)
>  
>  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 581925e476d6..1a2fff4e7140 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>  
>  int kvm_mmu_post_init_vm(struct kvm *kvm);
>  void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> +bool kvm_is_host_reserved_region(u64 gpa);

Just a suggestion: "kvm_gpa_in_host_reserved()" maybe? 

>  
>  #endif
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..c5c4aaf01a1a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -50,6 +50,7 @@
>  #include 
>  #include 
>  #include 
> +#include <asm/e820/api.h>
>  #include "trace.h"
>  
>  extern bool itlb_multihit_kvm_mitigation;
> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>  
> +bool kvm_is_host_reserved_region(u64 gpa)
> +{
> +	return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> +}

While e820__mapped_any()'s doc says '.. checks if any part of the
range <start, end> is mapped ..', it seems to me that the real check is
[start, end), so we should use 'gpa' instead of 'gpa-1', no?
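
For reference, a simplified model of the half-open range test that
e820__mapped_raw_any() performs (an illustration only, not the kernel's
implementation, which walks the entries of e820_table):

/* True if [start, end) overlaps the entry [addr, addr + size). */
static bool e820_range_overlaps(u64 start, u64 end, u64 addr, u64 size)
{
	return start < addr + size && addr < end;
}

Under that model, e820__mapped_raw_any(gpa, gpa + 1, E820_TYPE_RESERVED)
already covers exactly the one byte at 'gpa', so starting the range at
'gpa-1' only drags in the preceding byte as well.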

> +EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
> +
>  void kvm_mmu_zap_all(struct kvm *kvm)
>  {
>   struct kvm_mmu_page *sp, *node;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 7ef171790d02..74620d32aa82 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>   if (!(efer & EFER_SVME)) {
>   svm_leave_nested(svm);
>   svm_set_gif(svm, true);
> + clr_exception_intercept(svm, GP_VECTOR);
>  
>   /*
>* Free the nested guest state, unless we are in SMM.
> @@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  
>   svm->vmcb->save.efer = efer | EFER_SVME;
>   vmcb_mark_dirty(svm->vmcb, VMCB_CR);
> + /* Enable GP interception for SVM instructions if needed */
> + if (efer & EFER_SVME)
> + set_exception_intercept(svm, GP_VECTOR);
> +
>   return 0;
>  }
>  
> @@ -1957,22 +1962,104 @@ static int ac_interception(struct vcpu_svm *svm)
>   return 1;
>  

Re: [PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-12 Thread Maxim Levitsky
On Tue, 2021-01-12 at 00:37 -0600, Wei Huang wrote:
> From: Bandan Das 
> 
> While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept. If EAX falls into such
> memory areas, #GP is triggered before VMEXIT. This causes problems under
> nested virtualization. To solve this problem, KVM needs to trap #GP and
> check the instructions triggering #GP. For VM execution instructions,
> KVM emulates these instructions; otherwise it re-injects #GP back to
> guest VMs.
> 
> Signed-off-by: Bandan Das 
> Co-developed-by: Wei Huang 
> Signed-off-by: Wei Huang 

This is the ultimate fix for this bug, and the one I had in mind, 
but I didn't dare to develop it, thinking it wouldn't be accepted 
due to the added complexity.

From a cursory look this looks all right, and I will review 
and test this either today or tomorrow.

Thank you very much for doing the right fix for this bug.

Best regards,
Maxim Levitsky

> ---
>  arch/x86/include/asm/kvm_host.h |   8 +-
>  arch/x86/kvm/mmu.h  |   1 +
>  arch/x86/kvm/mmu/mmu.c  |   7 ++
>  arch/x86/kvm/svm/svm.c  | 157 +++-
>  arch/x86/kvm/svm/svm.h  |   8 ++
>  arch/x86/kvm/vmx/vmx.c  |   2 +-
>  arch/x86/kvm/x86.c  |  37 +++-
>  7 files changed, 146 insertions(+), 74 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3d6616f6f6ef..0ddc309f5a14 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
>   *due to an intercepted #UD (see EMULTYPE_TRAP_UD).
>   *Used to test the full emulator from userspace.
>   *
> - * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
> + * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
>   *   backdoor emulation, which is opt in via module param.
>   *   VMware backdoor emulation handles select instructions
> - *   and reinjects the #GP for all other cases.
> + *   and reinjects #GP for all other cases. This also
> + *   handles other cases where a #GP condition needs to be
> + *   handled and emulated appropriately.
>   *
>   * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in
>   *   which case the CR2/GPA value passed on the stack is valid.
> @@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
>  #define EMULTYPE_SKIP		(1 << 2)
>  #define EMULTYPE_ALLOW_RETRY_PF  (1 << 3)
>  #define EMULTYPE_TRAP_UD_FORCED  (1 << 4)
> -#define EMULTYPE_VMWARE_GP   (1 << 5)
> +#define EMULTYPE_PARAVIRT_GP (1 << 5)
>  #define EMULTYPE_PF  (1 << 6)
>  
>  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 581925e476d6..1a2fff4e7140 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
>  
>  int kvm_mmu_post_init_vm(struct kvm *kvm);
>  void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
> +bool kvm_is_host_reserved_region(u64 gpa);
>  
>  #endif
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6d16481aa29d..c5c4aaf01a1a 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -50,6 +50,7 @@
>  #include 
>  #include 
>  #include 
> +#include <asm/e820/api.h>
>  #include "trace.h"
>  
>  extern bool itlb_multihit_kvm_mitigation;
> @@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
>  
> +bool kvm_is_host_reserved_region(u64 gpa)
> +{
> + return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
> +}
> +EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
> +
>  void kvm_mmu_zap_all(struct kvm *kvm)
>  {
>   struct kvm_mmu_page *sp, *node;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 7ef171790d02..74620d32aa82 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>   if (!(efer & EFER_SVME)) {
>   svm_leave_nested(svm);
>   svm_set_gif(svm, true);
> + clr_exception_intercept(svm, GP_VECTOR);
>  
>   /*
>* Free the nested guest state, unless we are in SMM.
> @@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  
>   svm->vmcb->save.efer = efer | EFER_SVME;
>   vmcb_mark_dirty(svm->vmcb, VMCB_CR);
> + /* Enable GP interception for SVM instructions if needed */
> + if (efer & EFER_SVME)
> + set_exception_intercept(svm, 

[PATCH 1/2] KVM: x86: Add emulation support for #GP triggered by VM instructions

2021-01-11 Thread Wei Huang
From: Bandan Das 

While running VM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
before checking VMCB's instruction intercept. If EAX falls into such
memory areas, #GP is triggered before VMEXIT. This causes problems under
nested virtualization. To solve this problem, KVM needs to trap #GP and
check the instructions triggering #GP. For VM execution instructions,
KVM emulates these instructions; otherwise it re-injects #GP back to
guest VMs.

Signed-off-by: Bandan Das 
Co-developed-by: Wei Huang 
Signed-off-by: Wei Huang 
---
 arch/x86/include/asm/kvm_host.h |   8 +-
 arch/x86/kvm/mmu.h  |   1 +
 arch/x86/kvm/mmu/mmu.c  |   7 ++
 arch/x86/kvm/svm/svm.c  | 157 +++-
 arch/x86/kvm/svm/svm.h  |   8 ++
 arch/x86/kvm/vmx/vmx.c  |   2 +-
 arch/x86/kvm/x86.c  |  37 +++-
 7 files changed, 146 insertions(+), 74 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3d6616f6f6ef..0ddc309f5a14 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1450,10 +1450,12 @@ extern u64 kvm_mce_cap_supported;
  *  due to an intercepted #UD (see EMULTYPE_TRAP_UD).
  *  Used to test the full emulator from userspace.
  *
- * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
+ * EMULTYPE_PARAVIRT_GP - Set when emulating an intercepted #GP for VMware
  * backdoor emulation, which is opt in via module param.
 *   VMware backdoor emulation handles select instructions
- * and reinjects the #GP for all other cases.
+ * and reinjects #GP for all other cases. This also
+ *   handles other cases where a #GP condition needs to be
+ *   handled and emulated appropriately.
  *
  * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
 *  case the CR2/GPA value passed on the stack is valid.
@@ -1463,7 +1465,7 @@ extern u64 kvm_mce_cap_supported;
 #define EMULTYPE_SKIP		(1 << 2)
 #define EMULTYPE_ALLOW_RETRY_PF	(1 << 3)
 #define EMULTYPE_TRAP_UD_FORCED	(1 << 4)
-#define EMULTYPE_VMWARE_GP	(1 << 5)
+#define EMULTYPE_PARAVIRT_GP	(1 << 5)
 #define EMULTYPE_PF		(1 << 6)
 
 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 581925e476d6..1a2fff4e7140 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -219,5 +219,6 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_post_init_vm(struct kvm *kvm);
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+bool kvm_is_host_reserved_region(u64 gpa);
 
 #endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..c5c4aaf01a1a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -50,6 +50,7 @@
 #include 
 #include 
 #include 
+#include <asm/e820/api.h>
 #include "trace.h"
 
 extern bool itlb_multihit_kvm_mitigation;
@@ -5675,6 +5676,12 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
+bool kvm_is_host_reserved_region(u64 gpa)
+{
+   return e820__mapped_raw_any(gpa-1, gpa+1, E820_TYPE_RESERVED);
+}
+EXPORT_SYMBOL_GPL(kvm_is_host_reserved_region);
+
 void kvm_mmu_zap_all(struct kvm *kvm)
 {
struct kvm_mmu_page *sp, *node;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7ef171790d02..74620d32aa82 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -288,6 +288,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if (!(efer & EFER_SVME)) {
svm_leave_nested(svm);
svm_set_gif(svm, true);
+   clr_exception_intercept(svm, GP_VECTOR);
 
/*
 * Free the nested guest state, unless we are in SMM.
@@ -309,6 +310,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
svm->vmcb->save.efer = efer | EFER_SVME;
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+   /* Enable GP interception for SVM instructions if needed */
+   if (efer & EFER_SVME)
+   set_exception_intercept(svm, GP_VECTOR);
+
return 0;
 }
 
@@ -1957,22 +1962,104 @@ static int ac_interception(struct vcpu_svm *svm)
return 1;
 }
 
+static int vmload_interception(struct vcpu_svm *svm)
+{
+   struct vmcb *nested_vmcb;
+   struct kvm_host_map map;
+   int ret;
+
+   if (nested_svm_check_permissions(svm))
+   return 1;
+
+	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+   if (ret) {
+   if (ret == -EINVAL)
+			kvm_inject_gp(&svm->vcpu, 0);
+   return 1;
+   }
+
+