[PATCH 08/12] nEPT: Nested INVEPT

2013-04-25 Thread Nakajima, Jun
If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table for
L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in the course
of this modification already calls INVEPT. Therefore, when L1 calls INVEPT,
we don't really need to do anything. In particular we *don't* need to call
the real INVEPT again. All we do in our INVEPT emulation is verify the
validity of the call and its parameters, and then do nothing.

In KVM Forum 2010, Dong et al. presented "Nested Virtualization Friendly KVM"
and classified our current nested EPT implementation as shadow-like virtual
EPT. They recommended instead a different approach, which they called
VTLB-like virtual EPT. If we had taken that alternative approach, INVEPT would
have had a bigger role: L0 would only rebuild the shadow EPT table when L1
calls INVEPT.
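
For contrast, here is a minimal, hypothetical sketch of what INVEPT emulation
could look like under that VTLB-like scheme, where L0 would not shadow EPT12
writes eagerly. This is not part of this patch, and the helper
drop_shadow_ept02() is invented for illustration:

/*
 * Hypothetical VTLB-like INVEPT emulation -- NOT this patch's approach.
 * drop_shadow_ept02() is an invented helper, not a real KVM function.
 */
static int handle_invept_vtlb_like(struct kvm_vcpu *vcpu, u64 guest_eptp)
{
	/*
	 * Under a VTLB-like scheme, L0 does not trap L1's writes to
	 * EPT12, so EPT02 may be stale; INVEPT is the one point where
	 * L1 promises that old translations may be discarded.
	 */
	drop_shadow_ept02(vcpu, guest_eptp);

	/*
	 * EPT02 is then repopulated lazily, on subsequent EPT
	 * violations, by re-walking the (possibly modified) EPT12.
	 */
	nested_vmx_succeed(vcpu);
	skip_emulated_instruction(vcpu);
	return 1;
}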

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com

modified:   arch/x86/include/asm/vmx.h
modified:   arch/x86/kvm/vmx.c
---
 arch/x86/include/asm/vmx.h |  4 ++-
 arch/x86/kvm/vmx.c | 83 ++
 2 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b6fbf86..0ce54f3 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -376,7 +376,9 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
-#define VMX_EPT_AD_BIT				(1ull << 21)
+#define VMX_EPT_INVEPT_BIT			(1ull << 20)
+#define VMX_EPT_AD_BIT				(1ull << 21)
+#define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a5e14d1..10f2a69 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5878,6 +5878,87 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
  return 1;
 }

+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+	u32 vmx_instruction_info;
+	unsigned long type;
+	gva_t gva;
+	struct x86_exception e;
+	struct {
+		u64 eptp, gpa;
+	} operand;
+
+	if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+	    !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	/* According to the Intel VMX instruction reference, the memory
+	 * operand is read even if it isn't needed (e.g., for type==global)
+	 */
+	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmx_instruction_info, &gva))
+		return 1;
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+				sizeof(operand), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	/* Bits 31:28 of the instruction info field name the register
+	 * that holds the INVEPT type. */
+	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+	switch (type) {
+	case VMX_EPT_EXTENT_GLOBAL:
+		if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_GLOBAL_BIT))
+			nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		else {
+			/*
+			 * Do nothing: when L1 changes EPT12, we already
+			 * update EPT02 (the shadow EPT table) and call INVEPT.
+			 * So when L1 calls INVEPT, there's nothing left to do.
+			 */
+			nested_vmx_succeed(vcpu);
+		}
+		break;
+	case VMX_EPT_EXTENT_CONTEXT:
+		if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_CONTEXT_BIT))
+			nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		else {
+			/* Do nothing */
+			nested_vmx_succeed(vcpu);
+		}
+		break;
+	case VMX_EPT_EXTENT_INDIVIDUAL_ADDR:
+		if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_INDIVIDUAL_BIT))
+			nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		else {
+			/* Do nothing */
+			nested_vmx_succeed(vcpu);
+		}
+		break;
+	default:
+		nested_vmx_failValid(vcpu,
+			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+	}
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -5922,6 +6003,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
 	[EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
 	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+	[EXIT_REASON_INVEPT]		      = handle_invept,
 };

 static const int kvm_vmx_max_exit_handlers =
 	ARRAY_SIZE(kvm_vmx_exit_handlers);
@@ -6106,6 +6188,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
  case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
  case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
  case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+ case EXIT_REASON_INVEPT:
 	/*
 	 * VMX instructions trap unconditionally. This allows L1 to
 	 * emulate them for its L2 guest, i.e., allows 3-level nesting!
 	 */

[PATCH 08/10] nEPT: Nested INVEPT

2012-08-01 Thread Nadav Har'El
If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table for
L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in the course
of this modification already calls INVEPT. Therefore, when L1 calls INVEPT,
we don't really need to do anything. In particular we *don't* need to call
the real INVEPT again. All we do in our INVEPT emulation is verify the
validity of the call and its parameters, and then do nothing.

In KVM Forum 2010, Dong et al. presented "Nested Virtualization Friendly KVM"
and classified our current nested EPT implementation as shadow-like virtual
EPT. They recommended instead a different approach, which they called
VTLB-like virtual EPT. If we had taken that alternative approach, INVEPT would
have had a bigger role: L0 would only rebuild the shadow EPT table when L1
calls INVEPT.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
 arch/x86/include/asm/vmx.h |2 
 arch/x86/kvm/vmx.c |   87 +++
 2 files changed, 89 insertions(+)

--- .before/arch/x86/include/asm/vmx.h  2012-08-01 17:22:47.0 +0300
+++ .after/arch/x86/include/asm/vmx.h   2012-08-01 17:22:47.0 +0300
@@ -280,6 +280,7 @@ enum vmcs_field {
 #define EXIT_REASON_APIC_ACCESS 44
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_INVEPT 50
 #define EXIT_REASON_WBINVD 54
 #define EXIT_REASON_XSETBV 55
 #define EXIT_REASON_INVPCID58
@@ -406,6 +407,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT			(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
+#define VMX_EPT_INVEPT_BIT			(1ull << 20)
 #define VMX_EPT_AD_BIT				(1ull << 21)
 #define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
--- .before/arch/x86/kvm/vmx.c  2012-08-01 17:22:47.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2012-08-01 17:22:47.0 +0300
@@ -2026,6 +2026,10 @@ static __init void nested_vmx_setup_ctls_msrs(void)
/* nested EPT: emulate EPT also to L1 */
nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT;
+   nested_vmx_ept_caps |=
+   VMX_EPT_INVEPT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT |
+   VMX_EPT_EXTENT_CONTEXT_BIT |
+   VMX_EPT_EXTENT_INDIVIDUAL_BIT;
	nested_vmx_ept_caps &= vmx_capability.ept;
} else
nested_vmx_ept_caps = 0;
@@ -5702,6 +5706,87 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+	u32 vmx_instruction_info;
+	unsigned long type;
+	gva_t gva;
+	struct x86_exception e;
+	struct {
+		u64 eptp, gpa;
+	} operand;
+
+	if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+	    !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	/* According to the Intel VMX instruction reference, the memory
+	 * operand is read even if it isn't needed (e.g., for type==global)
+	 */
+	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmx_instruction_info, &gva))
+		return 1;
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+				sizeof(operand), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+	switch (type) {
+	case VMX_EPT_EXTENT_GLOBAL:
+		if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_GLOBAL_BIT))
+			nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		else {
+			/*
+			 * Do nothing: when L1 changes EPT12, we already
+			 * update EPT02 (the shadow EPT table) and call INVEPT.
+			 * So when L1 calls INVEPT, there's nothing left to do.
+			 */
+			nested_vmx_succeed(vcpu);
+		}
+		break;
+	case VMX_EPT_EXTENT_CONTEXT:
+		if (!(nested_vmx_ept_caps &
Re: [PATCH 08/10] nEPT: Nested INVEPT

2011-12-11 Thread Nadav Har'El
On Thu, Nov 10, 2011, Avi Kivity wrote about Re: [PATCH 08/10] nEPT: Nested 
INVEPT:
> On 11/10/2011 12:01 PM, Nadav Har'El wrote:
> > If we let L1 use EPT, we should probably also support the INVEPT
> > instruction.
>..
> > +	if (vmcs12 && nested_cpu_has_ept(vmcs12) &&
> > +	    (vmcs12->ept_pointer == operand.eptp) &&
> > +	    vmx->nested.last_eptp02)
> > +		ept_sync_context(vmx->nested.last_eptp02);
> > +	else
> > +		ept_sync_global();
>
> Are either of these needed?  Won't a write to a shadowed EPT table cause
> them anyway?

This is a very good point... You're right that, as it stands, any change
to the guest EPT table (EPT12) will cause a change to the shadow EPT
table (EPT02), and these changes already cause KVM to do an INVEPT, so there
is no point in doing it again when the guest asks.  So basically, I can have
INVEPT emulated by doing absolutely nothing (after performing all the
validity checks), right?

I wonder if I am missing any reason why a hypervisor might want to do
INVEPT without changing the EPT12 table first.
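
To spell out the mechanism being relied on here: the EPT12 pages backing the
shadow EPT02 are write-protected, so L1's write to EPT12 itself traps to L0,
which can update EPT02 and flush. A hedged sketch follows; the helpers
mirror_ept12_write_to_ept02() and current_eptp02() are invented for
illustration and are not real KVM functions:

/*
 * Hedged sketch, not actual KVM code: why L0 has already done the real
 * INVEPT by the time L1 executes its own.
 */
static void l0_handle_ept12_write(struct kvm_vcpu *vcpu,
				  gpa_t ept12_gpa, u64 new_entry)
{
	/* EPT12 is write-protected under EPT02, so the write exits to
	 * L0; emulate it and update the mirrored EPT02 entry
	 * (invented helper). */
	mirror_ept12_write_to_ept02(vcpu, ept12_gpa, new_entry);

	/* Flush the hardware TLB for EPT02 -- the *real* INVEPT,
	 * issued by L0 (current_eptp02() is an invented helper). */
	ept_sync_context(current_eptp02(vcpu));

	/*
	 * Hence, when L1 later executes INVEPT, every stale EPT02
	 * translation is already gone and a no-op emulation (after
	 * validity checks) is safe.
	 */
}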

-- 
Nadav Har'El|Sunday, Dec 11 2011, 
n...@math.technion.ac.il |-
Phone +972-523-790466, ICQ 13349191 |Why do programmers mix up Christmas and
http://nadav.harel.org.il   |Halloween? Because DEC 25 = OCT 31


Re: [PATCH 08/10] nEPT: Nested INVEPT

2011-12-11 Thread Avi Kivity
On 12/11/2011 04:24 PM, Nadav Har'El wrote:
> On Thu, Nov 10, 2011, Avi Kivity wrote about Re: [PATCH 08/10] nEPT: Nested
> INVEPT:
> > On 11/10/2011 12:01 PM, Nadav Har'El wrote:
> > > If we let L1 use EPT, we should probably also support the INVEPT
> > > instruction.
> >..
> > > +	if (vmcs12 && nested_cpu_has_ept(vmcs12) &&
> > > +	    (vmcs12->ept_pointer == operand.eptp) &&
> > > +	    vmx->nested.last_eptp02)
> > > +		ept_sync_context(vmx->nested.last_eptp02);
> > > +	else
> > > +		ept_sync_global();
> >
> > Are either of these needed?  Won't a write to a shadowed EPT table cause
> > them anyway?
>
> This is a very good point... You're right that, as it stands, any change
> to the guest EPT table (EPT12) will cause a change to the shadow EPT
> table (EPT02), and these changes already cause KVM to do an INVEPT, so there
> is no point in doing it again when the guest asks.  So basically, I can have
> INVEPT emulated by doing absolutely nothing (after performing all the
> validity checks), right?

Right.  This was the case for INVLPG before we added out-of-sync pages;
we didn't even intercept the instruction.

> I wonder if I am missing any reason why a hypervisor might want to do
> INVEPT without changing the EPT12 table first.

Shouldn't happen, but why do you care?  If EPT12 has not changed, any
access through EPT02 or its TLB entry is valid.

-- 
error compiling committee.c: too many arguments to function



[PATCH 08/10] nEPT: Nested INVEPT

2011-11-10 Thread Nadav Har'El
If we let L1 use EPT, we should probably also support the INVEPT instruction.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
 arch/x86/include/asm/vmx.h |2 
 arch/x86/kvm/vmx.c |  112 +++
 2 files changed, 114 insertions(+)

--- .before/arch/x86/include/asm/vmx.h  2011-11-10 11:33:59.0 +0200
+++ .after/arch/x86/include/asm/vmx.h   2011-11-10 11:33:59.0 +0200
@@ -279,6 +279,7 @@ enum vmcs_field {
 #define EXIT_REASON_APIC_ACCESS 44
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_INVEPT 50
 #define EXIT_REASON_WBINVD 54
 #define EXIT_REASON_XSETBV 55
 
@@ -404,6 +405,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT			(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
+#define VMX_EPT_INVEPT_BIT			(1ull << 20)
 #define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
--- .before/arch/x86/kvm/vmx.c  2011-11-10 11:33:59.0 +0200
+++ .after/arch/x86/kvm/vmx.c   2011-11-10 11:33:59.0 +0200
@@ -351,6 +351,8 @@ struct nested_vmx {
struct list_head vmcs02_pool;
int vmcs02_num;
u64 vmcs01_tsc_offset;
+   /* Remember last EPT02, for single-context INVEPT optimization */
+   u64 last_eptp02;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
/*
@@ -1987,6 +1989,10 @@ static __init void nested_vmx_setup_ctls_msrs(void)
/* ept capabilities */
if (nested_ept) {
nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT;
+   nested_vmx_ept_caps |=
+   VMX_EPT_INVEPT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT |
+   VMX_EPT_EXTENT_CONTEXT_BIT |
+   VMX_EPT_EXTENT_INDIVIDUAL_BIT;
	nested_vmx_ept_caps &= vmx_capability.ept;
} else
nested_vmx_ept_caps = 0;
@@ -5568,6 +5574,105 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+	u32 vmx_instruction_info;
+	unsigned long type;
+	gva_t gva;
+	struct x86_exception e;
+	struct {
+		u64 eptp, gpa;
+	} operand;
+
+
+	if (!nested_ept || !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	/* According to the Intel VMX instruction reference, the memory
+	 * operand is read even if it isn't needed (e.g., for type==global)
+	 */
+	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmx_instruction_info, &gva))
+		return 1;
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+				sizeof(operand), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+	switch (type) {
+	case VMX_EPT_EXTENT_GLOBAL:
+		if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_GLOBAL_BIT))
+			nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		else {
+			ept_sync_global();
+			nested_vmx_succeed(vcpu);
+		}
+		break;
+	case VMX_EPT_EXTENT_CONTEXT:
+		if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_CONTEXT_BIT))
+			nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		else {
+			/*
+			 * We efficiently handle the common case, of L1
+			 * invalidating the last eptp it used to run L2.
+			 * TODO: Instead of saving one last_eptp02, look up
+			 * operand.eptp in the shadow EPT table cache, to
+			 * find its shadow. Then last_eptp02 won't be needed.
+			 */
+			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+			struct vcpu_vmx *vmx = to_vmx(vcpu);
+			if (vmcs12 && nested_cpu_has_ept(vmcs12) &&
+			    (vmcs12->ept_pointer == operand.eptp) &&
+			    vmx->nested.last_eptp02)
+				

Re: [PATCH 08/10] nEPT: Nested INVEPT

2011-11-10 Thread Avi Kivity
On 11/10/2011 12:01 PM, Nadav Har'El wrote:
> If we let L1 use EPT, we should probably also support the INVEPT instruction.
>
> +	case VMX_EPT_EXTENT_CONTEXT:
> +		if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_CONTEXT_BIT))
> +			nested_vmx_failValid(vcpu,
> +				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
> +		else {
> +			/*
> +			 * We efficiently handle the common case, of L1
> +			 * invalidating the last eptp it used to run L2.
> +			 * TODO: Instead of saving one last_eptp02, look up
> +			 * operand.eptp in the shadow EPT table cache, to
> +			 * find its shadow. Then last_eptp02 won't be needed.
> +			 */
> +			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +			struct vcpu_vmx *vmx = to_vmx(vcpu);
> +			if (vmcs12 && nested_cpu_has_ept(vmcs12) &&
> +			    (vmcs12->ept_pointer == operand.eptp) &&
> +			    vmx->nested.last_eptp02)
> +				ept_sync_context(vmx->nested.last_eptp02);
> +			else
> +				ept_sync_global();

Are either of these needed?  Won't a write to a shadowed EPT table cause
them anyway?

> +			nested_vmx_succeed(vcpu);
> +		}
> +		break;
> +	case VMX_EPT_EXTENT_INDIVIDUAL_ADDR:
> +		if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_INDIVIDUAL_BIT))
> +			nested_vmx_failValid(vcpu,
> +				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
> +		else {
> +			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +			struct vcpu_vmx *vmx = to_vmx(vcpu);
> +			if (vmcs12 && nested_cpu_has_ept(vmcs12) &&
> +			    (vmcs12->ept_pointer == operand.eptp) &&
> +			    vmx->nested.last_eptp02)
> +				ept_sync_individual_addr(
> +					vmx->nested.last_eptp02, operand.gpa);

Same here.

> +			else
> +				ept_sync_global();
> +			nested_vmx_succeed(vcpu);
> +		}
> +		break;
> +	default:
> +		nested_vmx_failValid(vcpu,
> +			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
> +	}
> +
> +	skip_emulated_instruction(vcpu);
> +	return 1;
> +}
> +


-- 
error compiling committee.c: too many arguments to function
