[PATCH v3 13/13] nEPT: Inject EPT violation/misconfiguration

2013-05-18 Thread Jun Nakajima
Add code to detect EPT misconfiguration and inject it into the L1 VMM.
Also inject a more accurate exit qualification into L1 upon an EPT
violation. L1 now correctly reaches its ept_misconfig handler (instead
of wrongly going through fast_page_fault); that handler first tries to
handle an MMIO page fault and, if that fails, treats the event as a
real EPT misconfiguration.
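
For illustration only (not part of the patch), a minimal user-space sketch, in
the spirit of the walk_addr_generic() hunk below, of how the injected exit
qualification is composed; the function name and sample inputs are hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Bits [2:0] come from the real exit qualification (access type),
 * bits [5:3] are the RWX permissions found while walking the guest
 * EPT tables (EPT12); everything else is cleared. */
static uint64_t compose_exit_qualification(uint64_t real_qual, uint64_t ept12_access)
{
	return ((ept12_access & 0x7) << 3) | (real_qual & 0x7);
}

int main(void)
{
	/* Example: L2 write access (bit 1) to a page EPT12 maps read-only (bit 0). */
	printf("0x%llx\n",
	       (unsigned long long)compose_exit_qualification(0x2, 0x1));
	return 0;
}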

Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/asm/kvm_host.h |  4 +++
 arch/x86/kvm/mmu.c  |  5 ---
 arch/x86/kvm/mmu.h  |  5 +++
 arch/x86/kvm/paging_tmpl.h  | 26 ++
 arch/x86/kvm/vmx.c  | 79 +++--
 5 files changed, 111 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3741c65..1d03202 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -262,6 +262,8 @@ struct kvm_mmu {
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
   u64 *spte, const void *pte);
+   bool (*check_tdp_pte)(u64 pte, int level);
+
hpa_t root_hpa;
int root_level;
int shadow_root_level;
@@ -503,6 +505,8 @@ struct kvm_vcpu_arch {
 * instruction.
 */
bool write_fault_to_shadow_pgtable;
+
+   unsigned long exit_qualification; /* set at EPT violation at this point */
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 93d6abf..3a3b11f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -233,11 +233,6 @@ static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
return false;
 }
 
-static inline u64 rsvd_bits(int s, int e)
-{
-   return ((1ULL << (e - s + 1)) - 1) << s;
-}
-
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 8fc94dd..559e2e0 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -88,6 +88,11 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
 
+static inline u64 rsvd_bits(int s, int e)
+{
+   return ((1ULL << (e - s + 1)) - 1) << s;
+}
+
 /*
  * Will a fault with a given page-fault error code (pfec) cause a permission
  * fault with the given access (in ACC_* format)?
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2432d49..067b1f8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -126,10 +126,14 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 
 static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
 {
+#if PTTYPE == PTTYPE_EPT
+   return (mmu->check_tdp_pte(gpte, level));
+#else
int bit7;
 
bit7 = (gpte >> 7) & 1;
return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+#endif
 }
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
@@ -352,6 +356,28 @@ error:
walker->fault.vector = PF_VECTOR;
walker->fault.error_code_valid = true;
walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+   /*
+* Use PFERR_RSVD_MASK in error_code to tell if an EPT
+* misconfiguration needs to be injected. The detection is
+* done by is_rsvd_bits_set() above.
+*
+* We set up the value of exit_qualification to inject:
+* [2:0] -- Derive from [2:0] of the real exit_qualification at EPT violation
+* [5:3] -- Calculated by the page walk of the guest EPT page tables
+* [7:8] -- Clear to 0.
+*
+* The other bits are set to 0.
+*/
+   if (!(errcode & PFERR_RSVD_MASK)) {
+   unsigned long exit_qualification = vcpu->arch.exit_qualification;
+
+   pte_access = pt_access & pte;
+   vcpu->arch.exit_qualification = ((pte_access & 0x7) << 3) |
+   (exit_qualification & 0x7);
+   }
+#endif
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ec4e9b9..667be15 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5310,6 +5310,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
/* ept page table is present? */
error_code |= (exit_qualification >> 3) & 0x1;
 
+   vcpu->arch.exit_qualification = exit_qualification;
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
@@ -7432,7 +7434,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
 }
 
 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
-   struct x86_exception *fault)
+ 

[PATCH v3 11/13] nEPT: Miscellaneous cleanups

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

Some trivial code cleanups not really related to nested EPT.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
Reviewed-by: Paolo Bonzini 
---
 arch/x86/kvm/vmx.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d9d991d..ec4e9b9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -714,7 +714,6 @@ static void nested_release_page_clean(struct page *page)
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1039,8 +1038,7 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
(vmcs12->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-   struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 {
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
@@ -6737,7 +6735,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
!(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-   get_vmcs12(vcpu), vcpu)))) {
+   get_vmcs12(vcpu)))) {
if (vmx_interrupt_allowed(vcpu)) {
vmx->soft_vnmi_blocked = 0;
} else if (vmx->vnmi_blocked_time > 10LL &&
-- 
1.8.1.2



[PATCH v3 12/13] nEPT: Move is_rsvd_bits_set() to paging_tmpl.h

2013-05-18 Thread Jun Nakajima
Move is_rsvd_bits_set() to paging_tmpl.h so that it can be used to check
reserved bits in EPT page table entries as well.
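
As a side note, a tiny stand-alone example (not part of the patch) of how such
reserved-bit masks are typically built with the rsvd_bits() helper that patch 13
later exports in mmu.h; the 40-bit MAXPHYADDR used here is made up.

#include <stdint.h>
#include <stdio.h>

/* Same helper the series relies on (moved into mmu.h by patch 13). */
static inline uint64_t rsvd_bits(int s, int e)
{
	return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
	/* Hypothetical: with a 40-bit MAXPHYADDR, physical-address bits
	 * 51:40 are reserved in a page-table entry, so a PTE with any of
	 * them set trips is_rsvd_bits_set(). */
	uint64_t mask = rsvd_bits(40, 51);

	printf("reserved-bit mask: 0x%016llx\n", (unsigned long long)mask);
	return 0;
}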

Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c |  8 
 arch/x86/kvm/paging_tmpl.h | 12 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 37f8d7f..93d6abf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2468,14 +2468,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-   int bit7;
-
-   bit7 = (gpte >> 7) & 1;
-   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 bool no_dirty_log)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index dc495f9..2432d49 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -124,11 +124,19 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 }
 #endif
 
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+   int bit7;
+
+   bit7 = (gpte >> 7) & 1;
+   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
  struct kvm_mmu_page *sp, u64 *spte,
  u64 gpte)
 {
-   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+   if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
goto no_present;
 
if (!is_present_gpte(gpte))
@@ -279,7 +287,7 @@ retry_walk:
if (unlikely(!is_present_gpte(pte)))
goto error;
 
-   if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
+   if (unlikely(FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, pte,
  walker->level))) {
errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
-- 
1.8.1.2



[PATCH v3 10/13] nEPT: Nested INVEPT

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table for
L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in the course
of this modification already calls INVEPT. Therefore, when L1 calls INVEPT,
we don't really need to do anything. In particular we *don't* need to call
the real INVEPT again. All we do in our INVEPT is verify the validity of the
call, and its parameters, and then do nothing.

In KVM Forum 2010, Dong et al. presented "Nested Virtualization Friendly KVM"
and classified our current nested EPT implementation as "shadow-like virtual
EPT". He recommended instead a different approach, which he called "VTLB-like
virtual EPT". If we had taken that alternative approach, INVEPT would have had
a bigger role: L0 would only rebuild the shadow EPT table when L1 calls INVEPT.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/uapi/asm/vmx.h |  1 +
 arch/x86/kvm/vmx.c  | 83 +
 2 files changed, 84 insertions(+)

diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082..7a34e8f 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED 45
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_INVEPT  50
 #define EXIT_REASON_PREEMPTION_TIMER52
 #define EXIT_REASON_WBINVD  54
 #define EXIT_REASON_XSETBV  55
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1cf8a41..d9d991d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6251,6 +6251,87 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+   u32 vmx_instruction_info;
+   unsigned long type;
+   gva_t gva;
+   struct x86_exception e;
+   struct {
+   u64 eptp, gpa;
+   } operand;
+
+   if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+   !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   /* According to the Intel VMX instruction reference, the memory
+* operand is read even if it isn't needed (e.g., for type==global)
+*/
+   vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+   if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+   vmx_instruction_info, &gva))
+   return 1;
+   if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+   sizeof(operand), &e)) {
+   kvm_inject_page_fault(vcpu, &e);
+   return 1;
+   }
+
+   type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+   switch (type) {
+   case VMX_EPT_EXTENT_GLOBAL:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_GLOBAL_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /*
+* Do nothing: when L1 changes EPT12, we already
+* update EPT02 (the shadow EPT table) and call INVEPT.
+* So when L1 calls INVEPT, there's nothing left to do.
+*/
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   case VMX_EPT_EXTENT_CONTEXT:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_CONTEXT_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /* Do nothing */
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   case VMX_EPT_EXTENT_INDIVIDUAL_ADDR:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_INDIVIDUAL_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /* Do nothing */
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   default:
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   }

[PATCH v3 09/13] nEPT: Advertise EPT to L1

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

Advertise the support of EPT to the L1 guest, through the appropriate MSR.

This is the last patch of the basic Nested EPT feature, so as to allow
bisection through this patch series: The guest will not see EPT support until
this last patch, and will not attempt to use the half-applied feature.
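
For illustration only (not part of the patch), a user-space sketch of how an L1
guest could decode the capability bits it reads back; the raw MSR value is
hard-coded because RDMSR is privileged, and the bit positions are the ones
defined in the vmx.h hunk above.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Bit positions taken from the arch/x86/include/asm/vmx.h hunk above. */
#define VMX_EPT_INVEPT_BIT		(1ull << 20)
#define VMX_EPT_EXTENT_GLOBAL_BIT	(1ull << 26)

int main(void)
{
	/* Hypothetical value an L1 guest might read back from
	 * MSR_IA32_VMX_EPT_VPID_CAP after this patch. */
	uint64_t ept_vpid_cap = VMX_EPT_INVEPT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT;

	bool has_invept = ept_vpid_cap & VMX_EPT_INVEPT_BIT;
	bool has_global = ept_vpid_cap & VMX_EPT_EXTENT_GLOBAL_BIT;

	printf("INVEPT: %d, global invalidation: %d\n", has_invept, has_global);
	return 0;
}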

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/asm/vmx.h |  2 ++
 arch/x86/kvm/vmx.c | 17 +++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2..4aec45d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -394,7 +394,9 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT   (1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT   (1ull << 17)
+#define VMX_EPT_INVEPT_BIT (1ull << 20)
 #define VMX_EPT_AD_BIT (1ull << 21)
+#define VMX_EPT_EXTENT_INDIVIDUAL_BIT  (1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT  (1ull << 26)
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4661a22..1cf8a41 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2155,6 +2155,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
/*
@@ -2242,6 +2243,18 @@ static __init void nested_vmx_setup_ctls_msrs(void)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_WBINVD_EXITING;
 
+   if (enable_ept) {
+   /* nested EPT: emulate EPT also to L1 */
+   nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+   nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT;
+   nested_vmx_ept_caps |=
+   VMX_EPT_INVEPT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT |
+   VMX_EPT_EXTENT_CONTEXT_BIT |
+   VMX_EPT_EXTENT_INDIVIDUAL_BIT;
+   nested_vmx_ept_caps &= vmx_capability.ept;
+   } else
+   nested_vmx_ept_caps = 0;
+
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2347,8 +2360,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
nested_vmx_secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
-   /* Currently, no nested ept or nested vpid */
-   *pdata = 0;
+   /* Currently, no nested vpid support */
+   *pdata = nested_vmx_ept_caps;
break;
default:
return 0;
-- 
1.8.1.2



[PATCH v3 08/13] nEPT: Some additional comments

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

Some additional comments to preexisting code:
Explain who (L0 or L1) handles EPT violation and misconfiguration exits.
Don't mention "shadow on either EPT or shadow" as the only two options.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b79efd4..4661a22 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6540,7 +6540,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_EPT_VIOLATION:
+   /*
+* L0 always deals with the EPT violation. If nested EPT is
+* used, and the nested mmu code discovers that the address is
+* missing in the guest EPT table (EPT12), the EPT violation
+* will be injected with nested_ept_inject_page_fault()
+*/
+   return 0;
case EXIT_REASON_EPT_MISCONFIG:
+   /*
+* L2 never uses L1's EPT directly, but rather L0's own EPT
+* table (shadow on EPT) or a merged EPT table that L0 built
+* (EPT on EPT). So any problems with the structure of the
+* table are L0's fault.
+*/
return 0;
case EXIT_REASON_PREEMPTION_TIMER:
return vmcs12->pin_based_vm_exec_control &
-- 
1.8.1.2



[PATCH v3 06/13] nEPT: Fix cr3 handling in nested exit and entry

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

The existing code for handling cr3 and related VMCS fields during nested
exit and entry wasn't correct in all cases:

If L2 is allowed to control cr3 (and this is indeed the case in nested EPT),
during nested exit we must copy the modified cr3 from vmcs02 to vmcs12, and
we forgot to do so. This patch adds this copy.

If L0 isn't controlling cr3 when running L2 (i.e., L0 is using EPT), and
whoever does control cr3 (L1 or L2) is using PAE, the processor might have
saved PDPTEs and we should also save them in vmcs12 (and restore later).

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a88432f..b79efd4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7608,6 +7608,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
kvm_set_cr3(vcpu, vmcs12->guest_cr3);
kvm_mmu_reset_context(vcpu);
 
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so they may also have saved PDPTEs
+*/
+   if (enable_ept) {
+   vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+   vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+   vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+   vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+   }
+
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
@@ -7930,6 +7941,25 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+   /*
+* In some cases (usually, nested EPT), L2 is allowed to change its
+* own CR3 without exiting. If it has changed it, we must keep it.
+* Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+* by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+*/
+   if (enable_ept)
+   vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so save their PDPTEs
+*/
+   if (enable_ept) {
+   vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+   vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+   vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+   vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+   }
+
vmcs12->vm_entry_controls =
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
-- 
1.8.1.2



[PATCH v3 07/13] nEPT: Fix wrong test in kvm_set_cr3

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

kvm_set_cr3() attempts to check if the new cr3 is a valid guest physical
address. The problem is that with nested EPT, cr3 is an *L2* physical
address, not an L1 physical address as this test expects.

As the comment above this test explains, it isn't necessary, and doesn't
correspond to anything a real processor would do. So this patch removes it.

Note that this wrong test could have also theoretically caused problems
in nested NPT, not just in nested EPT. However, in practice, the problem
was avoided: nested_svm_vmexit()/vmrun() do not call kvm_set_cr3 in the
nested NPT case, and instead set the vmcb (and arch.cr3) directly, thus
circumventing the problem. Additional potential calls to the buggy function
are avoided in that we don't trap cr3 modifications when nested NPT is
enabled. However, because in nested VMX we did want to use kvm_set_cr3()
(as requested in Avi Kivity's review of the original nested VMX patches),
we can't avoid this problem and need to fix it.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/x86.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 094b5d9..7b36ec6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -683,17 +683,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 */
}
 
-   /*
-* Does the new cr3 value map to physical memory? (Note, we
-* catch an invalid cr3 even in real-mode, because it would
-* cause trouble later on when we turn on paging anyway.)
-*
-* A real CPU would silently accept an invalid cr3 and would
-* attempt to use it - with largely undefined (and often hard
-* to debug) behavior on the guest side.
-*/
-   if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-   return 1;
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
vcpu->arch.mmu.new_cr3(vcpu);
-- 
1.8.1.2



[PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") which correctly read and write EPT tables.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c |  5 +
 arch/x86/kvm/paging_tmpl.h | 43 +--
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 117233f..6c1670f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df34d4a..4c45654 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,22 @@
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+   #define pt_element_t u64
+   #define guest_walker guest_walkerEPT
+   #define FNAME(name) EPT_##name
+   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+   #define PT_LEVEL_BITS PT64_LEVEL_BITS
+   #ifdef CONFIG_X86_64
+   #define PT_MAX_FULL_LEVELS 4
+   #define CMPXCHG cmpxchg
+   #else
+   #define CMPXCHG cmpxchg64
+   #define PT_MAX_FULL_LEVELS 2
+   #endif
 #else
#error Invalid PTTYPE value
 #endif
@@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ * Compiled out for EPT because update_accessed_dirty_bits() is not used.
+ */
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
   pt_element_t __user *ptep_user, unsigned index,
   pt_element_t orig_pte, pt_element_t new_pte)
@@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 
return (ret != orig_pte);
 }
+#endif
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
  struct kvm_mmu_page *sp, u64 *spte,
@@ -126,13 +147,21 @@ no_present:
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
unsigned access;
-
+#if PTTYPE == PTTYPE_EPT
+   access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+ VMX_EPT_EXECUTABLE_MASK));
+#else
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
 
return access;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ * EPT A/D bit support is not implemented.
+ */
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 struct kvm_mmu *mmu,
 struct guest_walker *walker,
@@ -169,6 +198,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
}
return 0;
 }
+#endif
 
 /*
  * Fetch a guest pte for a guest virtual address
@@ -177,7 +207,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gva_t addr, u32 access)
 {
-   int ret;
pt_element_t pte;

[PATCH v3 04/13] nEPT: Define EPT-specific link_shadow_page()

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

Since link_shadow_page() is used by a routine in mmu.c, add an
EPT-specific link_shadow_page() in paging_tmpl.h, rather than moving
it.
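
For illustration only, a user-space sketch of the value the EPT variant below
writes for a non-leaf entry: the RWX masks are the ones from
arch/x86/include/asm/vmx.h, while the page-table address is made up.

#include <stdint.h>
#include <stdio.h>

/* Mask values from arch/x86/include/asm/vmx.h. */
#define VMX_EPT_READABLE_MASK	0x1ull
#define VMX_EPT_WRITABLE_MASK	0x2ull
#define VMX_EPT_EXECUTABLE_MASK	0x4ull

int main(void)
{
	uint64_t fake_spt_pa = 0x12345000ull;	/* stand-in for __pa(sp->spt) */
	uint64_t spte = fake_spt_pa | VMX_EPT_READABLE_MASK |
			VMX_EPT_WRITABLE_MASK | VMX_EPT_EXECUTABLE_MASK;

	/* A non-leaf EPT entry: RWX in bits 2:0, next-level table address above. */
	printf("spte = 0x%016llx\n", (unsigned long long)spte);
	return 0;
}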

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/paging_tmpl.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4c45654..dc495f9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -461,6 +461,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
}
 }
 
+#if PTTYPE == PTTYPE_EPT
+static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp)
+{
+   u64 spte;
+
+   spte = __pa(sp->spt) | VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+   VMX_EPT_EXECUTABLE_MASK;
+
+   mmu_spte_set(sptep, spte);
+}
+#endif
+
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  * If the guest tries to write a write-protected page, we need to
@@ -513,7 +525,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
goto out_gpte_changed;
 
if (sp)
+#if PTTYPE == PTTYPE_EPT
+   FNAME(link_shadow_page)(it.sptep, sp);
+#else
link_shadow_page(it.sptep, sp);
+#endif
}
 
for (;
@@ -533,7 +549,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
  true, direct_access, it.sptep);
+#if PTTYPE == PTTYPE_EPT
+   FNAME(link_shadow_page)(it.sptep, sp);
+#else
link_shadow_page(it.sptep, sp);
+#endif
}
 
clear_sp_write_flooding_count(it.sptep);
-- 
1.8.1.2



[PATCH v3 05/13] nEPT: MMU context for nested EPT

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

KVM's existing shadow MMU code already supports nested TDP. To use it, we
need to set up a new "MMU context" for nested EPT, and create a few callbacks
for it (nested_ept_*()). This context should also use the EPT versions of
the page table access functions (defined in the previous patch).
Then, we need to switch back and forth between this nested context and the
regular MMU context when switching between L1 and L2 (when L1 runs this L2
with EPT).
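
A rough, non-kernel sketch of the idea (names and values are made up): the
vCPU's MMU context is a bundle of callbacks, and entering or leaving
L2-with-EPT amounts to swapping which bundle is installed.

#include <stdio.h>

/* Toy model (not the patch's code): one callback bundle per context. */
struct toy_mmu_ctx {
	const char *name;
	unsigned long (*get_guest_pgtbl)(void);	/* nested_ept_get_cr3()-like hook */
};

static unsigned long l1_cr3(void)    { return 0x1000; }	/* made-up values */
static unsigned long ept12_ptr(void) { return 0x2000; }

static struct toy_mmu_ctx host_ctx   = { "regular MMU",    l1_cr3 };
static struct toy_mmu_ctx nested_ctx = { "nested EPT MMU", ept12_ptr };

int main(void)
{
	struct toy_mmu_ctx *cur = &host_ctx;		/* running L1 */

	cur = &nested_ctx;				/* vmentry to L2 with EPT */
	printf("L2: %s, root %#lx\n", cur->name, cur->get_guest_pgtbl());

	cur = &host_ctx;				/* vmexit back to L1 */
	printf("L1: %s, root %#lx\n", cur->name, cur->get_guest_pgtbl());
	return 0;
}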

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c | 38 ++
 arch/x86/kvm/mmu.h |  1 +
 arch/x86/kvm/vmx.c | 54 +-
 3 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6c1670f..37f8d7f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3653,6 +3653,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+   ASSERT(vcpu);
+   ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+   context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+   context->nx = is_nx(vcpu); /* TODO: ? */
+   context->new_cr3 = paging_new_cr3;
+   context->page_fault = EPT_page_fault;
+   context->gva_to_gpa = EPT_gva_to_gpa;
+   context->sync_page = EPT_sync_page;
+   context->invlpg = EPT_invlpg;
+   context->update_pte = EPT_update_pte;
+   context->free = paging_free;
+   context->root_level = context->shadow_root_level;
+   context->root_hpa = INVALID_PAGE;
+   context->direct_map = false;
+
+   /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
+  something different.
+*/
+   reset_rsvds_bits_mask(vcpu, context);
+
+
+   /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
+  they are done, or why they write to vcpu->arch.mmu and not context
+*/
+   vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+   vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+   vcpu->arch.mmu.base_role.smep_andnot_wp =
+   kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
+   !is_write_protection(vcpu);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2adcbc2..8fc94dd 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fb9cae5..a88432f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1045,6 +1045,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -7311,6 +7316,46 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
entry->ecx |= bit(X86_FEATURE_VMX);
 }
 
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+   /* return the page table to be shadowed - in our case, EPT12 */
+   return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+   struct x86_exception *fault)
+{
+   struct vmcs12 *vmcs12;
+   nested_vmx_vmexit(vcpu);
+   vmcs12 = get_vmcs12(vcpu);
+   /*
+* Note no need to set vmcs12->vm_exit_reason as it is already copied
+* from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
+*/
+   vmcs12->exit_qualification = fault->error_code;
+   vmcs12->guest_physical_address = fault->address;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+   int r = kvm_init_shadow_EPT_mmu(vcpu, &vcpu->arch.mmu);
+
+   vcpu->arch.mmu.set_cr3   = vmx_set_cr3;
+   vcpu->arch.mmu

[PATCH v3 02/13] nEPT: Move gpte_access() and prefetch_invalid_gpte() to paging_tmpl.h

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

In preparation, we just move gpte_access() and prefetch_invalid_gpte() from
mmu.c to paging_tmpl.h.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c | 30 --
 arch/x86/kvm/paging_tmpl.h | 40 +++-
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87..117233f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2488,26 +2488,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- u64 gpte)
-{
-   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-   goto no_present;
-
-   if (!is_present_gpte(gpte))
-   goto no_present;
-
-   if (!(gpte & PT_ACCESSED_MASK))
-   goto no_present;
-
-   return false;
-
-no_present:
-   drop_spte(vcpu->kvm, spte);
-   return true;
-}
-
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -3408,16 +3388,6 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
return false;
 }
 
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
-   unsigned access;
-
-   access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-   access &= ~(gpte >> PT64_NX_SHIFT);
-
-   return access;
-}
-
 static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
 {
unsigned index;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index da20860..df34d4a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -103,6 +103,36 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return (ret != orig_pte);
 }
 
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+   goto no_present;
+
+   if (!is_present_gpte(gpte))
+   goto no_present;
+
+   if (!(gpte & PT_ACCESSED_MASK))
+   goto no_present;
+
+   return false;
+
+no_present:
+   drop_spte(vcpu->kvm, spte);
+   return true;
+}
+
+static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+   unsigned access;
+
+   access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+   access &= ~(gpte >> PT64_NX_SHIFT);
+
+   return access;
+}
+
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 struct kvm_mmu *mmu,
 struct guest_walker *walker,
@@ -225,7 +255,7 @@ retry_walk:
}
 
accessed_dirty &= pte;
-   pte_access = pt_access & gpte_access(vcpu, pte);
+   pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
 
walker->ptes[walker->level - 1] = pte;
} while (!is_last_gpte(mmu, walker->level, pte));
@@ -309,13 +339,13 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gfn_t gfn;
pfn_t pfn;
 
-   if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+   if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
 
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 
gfn = gpte_to_gfn(gpte);
-   pte_access = sp->role.access & gpte_access(vcpu, gpte);
+   pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
protect_clean_gpte(&pte_access, gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
@@ -782,14 +812,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
  sizeof(pt_element_t)))
return -EINVAL;
 
-   if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
+   if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
continue;
}
 
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access;
-   pte_access &= gpte_access(vcpu, gpte);
+   

[PATCH v3 01/13] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-05-18 Thread Jun Nakajima
From: Nadav Har'El 

Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577
switches the EFER MSR when EPT is used and the host and guest have different
NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
and want to be able to run recent KVM as L1, we need to allow L1 to use this
EFER switching feature.

To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available,
and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
support for the former (the latter is still unsupported).

Nested entry and exit emulation (prepare_vmcs02 and load_vmcs12_host_state,
respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
that's left to do in this patch is to properly advertise this feature to L1.

Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
support this feature, regardless of whether the host supports it.
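
For illustration only, a user-space sketch of how a guest hypervisor checks
whether an entry control may be set to 1: the allowed-1 settings sit in the
high dword of the capability MSR. The raw MSR value here is hard-coded (RDMSR
is privileged) and the low-dword value is made up; the bit position of
VM_ENTRY_LOAD_IA32_EFER follows the SDM.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VM_ENTRY_LOAD_IA32_EFER	0x00008000	/* bit 15 of the entry controls */

int main(void)
{
	/* Hypothetical raw MSR_IA32_VMX_ENTRY_CTLS value: allowed-0 settings
	 * in the low dword, allowed-1 settings in the high dword. */
	uint64_t entry_ctls_msr =
		((uint64_t)VM_ENTRY_LOAD_IA32_EFER << 32) | 0x11ff;

	uint32_t allowed_1 = entry_ctls_msr >> 32;
	bool can_load_efer = allowed_1 & VM_ENTRY_LOAD_IA32_EFER;

	printf("VM_ENTRY_LOAD_IA32_EFER advertised: %d\n", can_load_efer);
	return 0;
}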

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 260a919..fb9cae5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2192,7 +2192,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
nested_vmx_exit_ctls_high = 0;
 #endif
-   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+   nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ VM_EXIT_LOAD_IA32_EFER);
 
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2201,8 +2202,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_entry_ctls_high &=
VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
-   nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
-
+   nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
+  VM_ENTRY_LOAD_IA32_EFER);
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
@@ -7492,10 +7493,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-   /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
-   vmcs_write32(VM_EXIT_CONTROLS,
-   vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
-   vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+   /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+* we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER
+* bits are further modified by vmx_set_efer() below.
+*/
+   vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+   /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+* emulated by vmx_set_efer(), below.
+*/
+   vmcs_write32(VM_ENTRY_CONTROLS,
+   (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+   ~VM_ENTRY_IA32E_MODE) |
(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
-- 
1.8.1.2



[PATCH v3 13/13] nEPT: Inject EPT violation/misconfiguration

2013-05-08 Thread Jun Nakajima
Add code to detect EPT misconfiguration and inject it into the L1 VMM.
Also inject a more accurate exit qualification into L1 upon an EPT
violation. L1 now correctly reaches its ept_misconfig handler (instead
of wrongly going through fast_page_fault); that handler first tries to
handle an MMIO page fault and, if that fails, treats the event as a
real EPT misconfiguration.

Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/asm/kvm_host.h |  4 +++
 arch/x86/kvm/mmu.c  |  5 ---
 arch/x86/kvm/mmu.h  |  5 +++
 arch/x86/kvm/paging_tmpl.h  | 26 ++
 arch/x86/kvm/vmx.c  | 79 +++--
 5 files changed, 111 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3741c65..1d03202 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -262,6 +262,8 @@ struct kvm_mmu {
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
   u64 *spte, const void *pte);
+   bool (*check_tdp_pte)(u64 pte, int level);
+
hpa_t root_hpa;
int root_level;
int shadow_root_level;
@@ -503,6 +505,8 @@ struct kvm_vcpu_arch {
 * instruction.
 */
bool write_fault_to_shadow_pgtable;
+
+   unsigned long exit_qualification; /* set at EPT violation at this point */
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 93d6abf..3a3b11f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -233,11 +233,6 @@ static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t 
pfn, unsigned access)
return false;
 }
 
-static inline u64 rsvd_bits(int s, int e)
-{
-   return ((1ULL << (e - s + 1)) - 1) << s;
-}
-
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 8fc94dd..559e2e0 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -88,6 +88,11 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
 
+static inline u64 rsvd_bits(int s, int e)
+{
+   return ((1ULL << (e - s + 1)) - 1) << s;
+}
+
 /*
  * Will a fault with a given page-fault error code (pfec) cause a permission
  * fault with the given access (in ACC_* format)?
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2432d49..067b1f8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -126,10 +126,14 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 
 static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
 {
+#if PTTYPE == PTTYPE_EPT
+   return (mmu->check_tdp_pte(gpte, level));
+#else
int bit7;
 
bit7 = (gpte >> 7) & 1;
return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+#endif
 }
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
@@ -352,6 +356,28 @@ error:
walker->fault.vector = PF_VECTOR;
walker->fault.error_code_valid = true;
walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+   /*
+* Use PFERR_RSVD_MASK in error_code to tell if an EPT
+* misconfiguration needs to be injected. The detection is
+* done by is_rsvd_bits_set() above.
+*
+* We set up the value of exit_qualification to inject:
+* [2:0] -- Derive from [2:0] of the real exit_qualification at EPT violation
+* [5:3] -- Calculated by the page walk of the guest EPT page tables
+* [7:8] -- Clear to 0.
+*
+* The other bits are set to 0.
+*/
+   if (!(errcode & PFERR_RSVD_MASK)) {
+   unsigned long exit_qualification = vcpu->arch.exit_qualification;
+
+   pte_access = pt_access & pte;
+   vcpu->arch.exit_qualification = ((pte_access & 0x7) << 3) |
+   (exit_qualification & 0x7);
+   }
+#endif
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ca49e19..a44e7fd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5310,6 +5310,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
/* ept page table is present? */
error_code |= (exit_qualification >> 3) & 0x1;
 
+   vcpu->arch.exit_qualification = exit_qualification;
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
@@ -7426,7 +7428,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu 
*vcpu)
 }
 
 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
-   struct x86_exception *fault)
+ 

[PATCH v3 11/13] nEPT: Miscellaneous cleanups

2013-05-08 Thread Jun Nakajima
Some trivial code cleanups not really related to nested EPT.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
Reviewed-by: Paolo Bonzini 
---
 arch/x86/kvm/vmx.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9ceab42..ca49e19 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -714,7 +714,6 @@ static void nested_release_page_clean(struct page *page)
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1039,8 +1038,7 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, 
u32 bit)
(vmcs12->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-   struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 {
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
@@ -6731,7 +6729,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
!(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-   get_vmcs12(vcpu), vcpu)))) {
+   get_vmcs12(vcpu)))) {
if (vmx_interrupt_allowed(vcpu)) {
vmx->soft_vnmi_blocked = 0;
} else if (vmx->vnmi_blocked_time > 10LL &&
-- 
1.8.1.2



[PATCH v3 12/13] nEPT: Move is_rsvd_bits_set() to paging_tmpl.h

2013-05-08 Thread Jun Nakajima
Move is_rsvd_bits_set() to paging_tmpl.h so that it can be used to check
reserved bits in EPT page table entries as well.

Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c |  8 
 arch/x86/kvm/paging_tmpl.h | 12 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 37f8d7f..93d6abf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2468,14 +2468,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-   int bit7;
-
-   bit7 = (gpte >> 7) & 1;
-   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 bool no_dirty_log)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index dc495f9..2432d49 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -124,11 +124,19 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 }
 #endif
 
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+   int bit7;
+
+   bit7 = (gpte >> 7) & 1;
+   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
  struct kvm_mmu_page *sp, u64 *spte,
  u64 gpte)
 {
-   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+   if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
goto no_present;
 
if (!is_present_gpte(gpte))
@@ -279,7 +287,7 @@ retry_walk:
if (unlikely(!is_present_gpte(pte)))
goto error;
 
-   if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
+   if (unlikely(FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, pte,
  walker->level))) {
errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
-- 
1.8.1.2



[PATCH v3 10/13] nEPT: Nested INVEPT

2013-05-08 Thread Jun Nakajima
If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table for
L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in the course
of this modification already calls INVEPT. Therefore, when L1 calls INVEPT,
we don't really need to do anything. In particular we *don't* need to call
the real INVEPT again. All we do in our INVEPT is verify the validity of the
call, and its parameters, and then do nothing.

In KVM Forum 2010, Dong et al. presented "Nested Virtualization Friendly KVM"
and classified our current nested EPT implementation as "shadow-like virtual
EPT". He recommended instead a different approach, which he called "VTLB-like
virtual EPT". If we had taken that alternative approach, INVEPT would have had
a bigger role: L0 would only rebuild the shadow EPT table when L1 calls INVEPT.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/uapi/asm/vmx.h |  1 +
 arch/x86/kvm/vmx.c  | 83 +
 2 files changed, 84 insertions(+)

diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082..7a34e8f 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED 45
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_INVEPT  50
 #define EXIT_REASON_PREEMPTION_TIMER52
 #define EXIT_REASON_WBINVD  54
 #define EXIT_REASON_XSETBV  55
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 136fc25..9ceab42 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6245,6 +6245,87 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+   u32 vmx_instruction_info;
+   unsigned long type;
+   gva_t gva;
+   struct x86_exception e;
+   struct {
+   u64 eptp, gpa;
+   } operand;
+
+   if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+   !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   /* According to the Intel VMX instruction reference, the memory
+* operand is read even if it isn't needed (e.g., for type==global)
+*/
+   vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+   if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+   vmx_instruction_info, &gva))
+   return 1;
+   if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+   sizeof(operand), &e)) {
+   kvm_inject_page_fault(vcpu, &e);
+   return 1;
+   }
+
+   type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+   switch (type) {
+   case VMX_EPT_EXTENT_GLOBAL:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_GLOBAL_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /*
+* Do nothing: when L1 changes EPT12, we already
+* update EPT02 (the shadow EPT table) and call INVEPT.
+* So when L1 calls INVEPT, there's nothing left to do.
+*/
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   case VMX_EPT_EXTENT_CONTEXT:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_CONTEXT_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /* Do nothing */
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   case VMX_EPT_EXTENT_INDIVIDUAL_ADDR:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_INDIVIDUAL_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /* Do nothing */
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   default:
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   }
+
+   skip_emulated_in

[PATCH v3 08/13] nEPT: Some additional comments

2013-05-08 Thread Jun Nakajima
Some additional comments to preexisting code:
Explain who (L0 or L1) handles EPT violation and misconfiguration exits.
Don't mention "shadow on either EPT or shadow" as the only two options.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index db8df4c..17d8b89 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6534,7 +6534,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu 
*vcpu)
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_EPT_VIOLATION:
+   /*
+* L0 always deals with the EPT violation. If nested EPT is
+* used, and the nested mmu code discovers that the address is
+* missing in the guest EPT table (EPT12), the EPT violation
+* will be injected with nested_ept_inject_page_fault()
+*/
+   return 0;
case EXIT_REASON_EPT_MISCONFIG:
+   /*
+* L2 never uses L1's EPT directly, but rather L0's own EPT
+* table (shadow on EPT) or a merged EPT table that L0 built
+* (EPT on EPT). So any problems with the structure of the
+* table are L0's fault.
+*/
return 0;
case EXIT_REASON_PREEMPTION_TIMER:
return vmcs12->pin_based_vm_exec_control &
-- 
1.8.1.2



[PATCH v3 09/13] nEPT: Advertise EPT to L1

2013-05-08 Thread Jun Nakajima
Advertise the support of EPT to the L1 guest, through the appropriate MSR.

This is the last patch of the basic Nested EPT feature, so as to allow
bisection through this patch series: The guest will not see EPT support until
this last patch, and will not attempt to use the half-applied feature.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/asm/vmx.h |  2 ++
 arch/x86/kvm/vmx.c | 17 +++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2..4aec45d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -394,7 +394,9 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT   (1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT   (1ull << 17)
+#define VMX_EPT_INVEPT_BIT (1ull << 20)
 #define VMX_EPT_AD_BIT (1ull << 21)
+#define VMX_EPT_EXTENT_INDIVIDUAL_BIT  (1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT  (1ull << 26)
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 17d8b89..136fc25 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2155,6 +2155,7 @@ static u32 nested_vmx_pinbased_ctls_low, 
nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
/*
@@ -2242,6 +2243,18 @@ static __init void nested_vmx_setup_ctls_msrs(void)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_WBINVD_EXITING;
 
+   if (enable_ept) {
+   /* nested EPT: emulate EPT also to L1 */
+   nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+   nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT;
+   nested_vmx_ept_caps |=
+   VMX_EPT_INVEPT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT |
+   VMX_EPT_EXTENT_CONTEXT_BIT |
+   VMX_EPT_EXTENT_INDIVIDUAL_BIT;
+   nested_vmx_ept_caps &= vmx_capability.ept;
+   } else
+   nested_vmx_ept_caps = 0;
+
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2347,8 +2360,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 *pdata)
nested_vmx_secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
-   /* Currently, no nested ept or nested vpid */
-   *pdata = 0;
+   /* Currently, no nested vpid support */
+   *pdata = nested_vmx_ept_caps;
break;
default:
return 0;
-- 
1.8.1.2



[PATCH v3 07/13] nEPT: Fix wrong test in kvm_set_cr3

2013-05-08 Thread Jun Nakajima
kvm_set_cr3() attempts to check if the new cr3 is a valid guest physical
address. The problem is that with nested EPT, cr3 is an *L2* physical
address, not an L1 physical address as this test expects.

As the comment above this test explains, it isn't necessary, and doesn't
correspond to anything a real processor would do. So this patch removes it.

Note that this wrong test could have also theoretically caused problems
in nested NPT, not just in nested EPT. However, in practice, the problem
was avoided: nested_svm_vmexit()/vmrun() do not call kvm_set_cr3 in the
nested NPT case, and instead set the vmcb (and arch.cr3) directly, thus
circumventing the problem. Additional potential calls to the buggy function
are avoided in that we don't trap cr3 modifications when nested NPT is
enabled. However, because in nested VMX we did want to use kvm_set_cr3()
(as requested in Avi Kivity's review of the original nested VMX patches),
we can't avoid this problem and need to fix it.
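
To make the failure mode concrete (a sketch; the local variable names are
illustrative, only gfn_to_memslot() and vmcs12->guest_cr3 are real):

	/* Under nested EPT, vmcs12->guest_cr3 is an L2 guest-physical address,
	 * while kvm's memslots describe L1 guest-physical memory, so this
	 * lookup can fail for a perfectly valid cr3:
	 */
	gfn_t gfn = vmcs12->guest_cr3 >> PAGE_SHIFT;
	struct kvm_memory_slot *slot = gfn_to_memslot(vcpu->kvm, gfn);
	/* slot may legitimately be NULL here, so failing kvm_set_cr3() on it
	 * would wrongly reject the nested entry. */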

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/x86.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 94f35d2..ab09003 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -664,17 +664,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 */
}
 
-   /*
-* Does the new cr3 value map to physical memory? (Note, we
-* catch an invalid cr3 even in real-mode, because it would
-* cause trouble later on when we turn on paging anyway.)
-*
-* A real CPU would silently accept an invalid cr3 and would
-* attempt to use it - with largely undefined (and often hard
-* to debug) behavior on the guest side.
-*/
-   if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-   return 1;
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
vcpu->arch.mmu.new_cr3(vcpu);
-- 
1.8.1.2



[PATCH v3 05/13] nEPT: MMU context for nested EPT

2013-05-08 Thread Jun Nakajima
KVM's existing shadow MMU code already supports nested TDP. To use it, we
need to set up a new "MMU context" for nested EPT, and create a few callbacks
for it (nested_ept_*()). This context should also use the EPT versions of
the page table access functions (defined in the previous patch).
Then, we need to switch back and forth between this nested context and the
regular MMU context when switching between L1 and L2 (when L1 runs this L2
with EPT).
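
A minimal sketch of the intended switching, assuming the helpers introduced in
this patch (nested_cpu_has_ept(), nested_ept_init_mmu_context()); where exactly
these calls are placed, and that kvm_mmu_reset_context() is the restore step,
are assumptions of the sketch rather than statements about this patch:

	/* entering L2 that L1 runs with EPT enabled: shadow L1's EPT12 */
	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);

	/* ... L2 runs ... */

	/* on the nested exit back to L1: rebuild L1's regular MMU context */
	kvm_mmu_reset_context(vcpu);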

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c | 38 ++
 arch/x86/kvm/mmu.h |  1 +
 arch/x86/kvm/vmx.c | 54 +-
 3 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6c1670f..37f8d7f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3653,6 +3653,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct 
kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+   ASSERT(vcpu);
+   ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+   context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+   context->nx = is_nx(vcpu); /* TODO: ? */
+   context->new_cr3 = paging_new_cr3;
+   context->page_fault = EPT_page_fault;
+   context->gva_to_gpa = EPT_gva_to_gpa;
+   context->sync_page = EPT_sync_page;
+   context->invlpg = EPT_invlpg;
+   context->update_pte = EPT_update_pte;
+   context->free = paging_free;
+   context->root_level = context->shadow_root_level;
+   context->root_hpa = INVALID_PAGE;
+   context->direct_map = false;
+
+   /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
+  something different.
+*/
+   reset_rsvds_bits_mask(vcpu, context);
+
+
+   /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
+  they are done, or why they write to vcpu->arch.mmu and not context
+*/
+   vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+   vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+   vcpu->arch.mmu.base_role.smep_andnot_wp =
+   kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
+   !is_write_protection(vcpu);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2adcbc2..8fc94dd 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 
addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool 
direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 51b8b4f0..80ab5b1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1045,6 +1045,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct 
vmcs12 *vmcs12,
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -7305,6 +7310,46 @@ static void vmx_set_supported_cpuid(u32 func, struct 
kvm_cpuid_entry2 *entry)
entry->ecx |= bit(X86_FEATURE_VMX);
 }
 
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+   /* return the page table to be shadowed - in our case, EPT12 */
+   return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+   struct x86_exception *fault)
+{
+   struct vmcs12 *vmcs12;
+   nested_vmx_vmexit(vcpu);
+   vmcs12 = get_vmcs12(vcpu);
+   /*
+* Note no need to set vmcs12->vm_exit_reason as it is already copied
+* from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
+*/
+   vmcs12->exit_qualification = fault->error_code;
+   vmcs12->guest_physical_address = fault->address;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+   int r = kvm_init_shadow_EPT_mmu(vcpu, &vcpu->arch.mmu);
+
+   vcpu->arch.mmu.set_cr3   = vmx_set_cr3;
+   vcpu->arch.mmu.get_cr3   = neste

[PATCH v3 06/13] nEPT: Fix cr3 handling in nested exit and entry

2013-05-08 Thread Jun Nakajima
The existing code for handling cr3 and related VMCS fields during nested
exit and entry wasn't correct in all cases:

If L2 is allowed to control cr3 (and this is indeed the case in nested EPT),
during nested exit we must copy the modified cr3 from vmcs02 to vmcs12, and
we forgot to do so. This patch adds this copy.

If L0 isn't controlling cr3 when running L2 (i.e., L0 is using EPT), and
whoever does control cr3 (L1 or L2) is using PAE, the processor might have
saved PDPTEs and we should also save them in vmcs12 (and restore later).

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 80ab5b1..db8df4c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7602,6 +7602,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct 
vmcs12 *vmcs12)
kvm_set_cr3(vcpu, vmcs12->guest_cr3);
kvm_mmu_reset_context(vcpu);
 
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so they may also have saved PDPTEs
+*/
+   if (enable_ept) {
+   vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+   vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+   vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+   vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+   }
+
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
@@ -7924,6 +7935,25 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct 
vmcs12 *vmcs12)
vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+   /*
+* In some cases (usually, nested EPT), L2 is allowed to change its
+* own CR3 without exiting. If it has changed it, we must keep it.
+* Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+* by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+*/
+   if (enable_ept)
+   vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so save their PDPTEs
+*/
+   if (enable_ept) {
+   vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+   vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+   vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+   vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+   }
+
vmcs12->vm_entry_controls =
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
-- 
1.8.1.2



[PATCH v3 04/13] nEPT: Define EPT-specific link_shadow_page()

2013-05-08 Thread Jun Nakajima
Since link_shadow_page() is used by a routine in mmu.c, add an
EPT-specific link_shadow_page() in paging_tmpl.h, rather than moving
it.
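
For comparison, the generic link_shadow_page() in mmu.c looks roughly like the
following (paraphrased, so treat the exact mask list as approximate); the EPT
variant added below differs only in composing the spte from the VMX_EPT_*
permission bits instead:

	static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
	{
		u64 spte;

		spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
		       shadow_user_mask | shadow_x_mask | shadow_accessed_mask;

		mmu_spte_set(sptep, spte);
	}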

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/paging_tmpl.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4c45654..dc495f9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -461,6 +461,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, 
struct guest_walker *gw,
}
 }
 
+#if PTTYPE == PTTYPE_EPT
+static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp)
+{
+   u64 spte;
+
+   spte = __pa(sp->spt) | VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+   VMX_EPT_EXECUTABLE_MASK;
+
+   mmu_spte_set(sptep, spte);
+}
+#endif
+
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  * If the guest tries to write a write-protected page, we need to
@@ -513,7 +525,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
goto out_gpte_changed;
 
if (sp)
+#if PTTYPE == PTTYPE_EPT
+   FNAME(link_shadow_page)(it.sptep, sp);
+#else
link_shadow_page(it.sptep, sp);
+#endif
}
 
for (;
@@ -533,7 +549,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
  true, direct_access, it.sptep);
+#if PTTYPE == PTTYPE_EPT
+   FNAME(link_shadow_page)(it.sptep, sp);
+#else
link_shadow_page(it.sptep, sp);
+#endif
}
 
clear_sp_write_flooding_count(it.sptep);
-- 
1.8.1.2



[PATCH v3 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-08 Thread Jun Nakajima
This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") that correctly read and write EPT tables.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c |  5 +
 arch/x86/kvm/paging_tmpl.h | 43 +--
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 117233f..6c1670f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3397,6 +3397,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
unsigned level, unsigned gp
return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df34d4a..4c45654 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,22 @@
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+   #define pt_element_t u64
+   #define guest_walker guest_walkerEPT
+   #define FNAME(name) EPT_##name
+   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+   #define PT_LEVEL_BITS PT64_LEVEL_BITS
+   #ifdef CONFIG_X86_64
+   #define PT_MAX_FULL_LEVELS 4
+   #define CMPXCHG cmpxchg
+   #else
+   #define CMPXCHG cmpxchg64
+   #define PT_MAX_FULL_LEVELS 2
+   #endif
 #else
#error Invalid PTTYPE value
 #endif
@@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ *  Omitted for EPT because update_accessed_dirty_bits() is not used.
+ */
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
   pt_element_t __user *ptep_user, unsigned index,
   pt_element_t orig_pte, pt_element_t new_pte)
@@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 
return (ret != orig_pte);
 }
+#endif
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
  struct kvm_mmu_page *sp, u64 *spte,
@@ -126,13 +147,21 @@ no_present:
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
unsigned access;
-
+#if PTTYPE == PTTYPE_EPT
+   access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+ VMX_EPT_EXECUTABLE_MASK));
+#else
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
 
return access;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ * EPT A/D bit support is not implemented.
+ */
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 struct kvm_mmu *mmu,
 struct guest_walker *walker,
@@ -169,6 +198,7 @@ static int FNAME(update_accessed_dirty_bits)(struct 
kvm_vcpu *vcpu,
}
return 0;
 }
+#endif
 
 /*
  * Fetch a guest pte for a guest virtual address
@@ -177,7 +207,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker 
*walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gva_t addr, u32 access)
 {
-   int ret;
pt_element_t pte;
pt_element_t __user *

[PATCH v3 02/13] nEPT: Move gpte_access() and prefetch_invalid_gpte() to paging_tmpl.h

2013-05-08 Thread Jun Nakajima
For preparation, we just move gpte_access() and prefetch_invalid_gpte() from 
mmu.c to paging_tmpl.h.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c | 30 --
 arch/x86/kvm/paging_tmpl.h | 40 +++-
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87..117233f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2488,26 +2488,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu 
*vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- u64 gpte)
-{
-   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-   goto no_present;
-
-   if (!is_present_gpte(gpte))
-   goto no_present;
-
-   if (!(gpte & PT_ACCESSED_MASK))
-   goto no_present;
-
-   return false;
-
-no_present:
-   drop_spte(vcpu->kvm, spte);
-   return true;
-}
-
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -3408,16 +3388,6 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, 
unsigned access,
return false;
 }
 
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
-   unsigned access;
-
-   access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-   access &= ~(gpte >> PT64_NX_SHIFT);
-
-   return access;
-}
-
 static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned 
gpte)
 {
unsigned index;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index da20860..df34d4a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -103,6 +103,36 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
return (ret != orig_pte);
 }
 
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+   goto no_present;
+
+   if (!is_present_gpte(gpte))
+   goto no_present;
+
+   if (!(gpte & PT_ACCESSED_MASK))
+   goto no_present;
+
+   return false;
+
+no_present:
+   drop_spte(vcpu->kvm, spte);
+   return true;
+}
+
+static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+   unsigned access;
+
+   access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+   access &= ~(gpte >> PT64_NX_SHIFT);
+
+   return access;
+}
+
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 struct kvm_mmu *mmu,
 struct guest_walker *walker,
@@ -225,7 +255,7 @@ retry_walk:
}
 
accessed_dirty &= pte;
-   pte_access = pt_access & gpte_access(vcpu, pte);
+   pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
 
walker->ptes[walker->level - 1] = pte;
} while (!is_last_gpte(mmu, walker->level, pte));
@@ -309,13 +339,13 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp,
gfn_t gfn;
pfn_t pfn;
 
-   if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+   if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
 
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 
gfn = gpte_to_gfn(gpte);
-   pte_access = sp->role.access & gpte_access(vcpu, gpte);
+   pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
protect_clean_gpte(&pte_access, gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
@@ -782,14 +812,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp)
  sizeof(pt_element_t)))
return -EINVAL;
 
-   if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
+   if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
continue;
}
 
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access;
-   pte_access &= gpte_access(vcpu, gpte);
+   pte_access &=

[PATCH v3 01/13] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-05-08 Thread Jun Nakajima
Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577
switches the EFER MSR when EPT is used and the host and guest have different
NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
and want to be able to run recent KVM as L1, we need to allow L1 to use this
EFER switching feature.

To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available,
and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
support for the former (the latter is still unsupported).

Nested entry and exit emulation (prepare_vmcs02 and load_vmcs12_host_state,
respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
that's left to do in this patch is to properly advertise this feature to L1.

Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
support this feature, regardless of whether the host supports it.
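
For context (a sketch, not code from this patch): when the LOAD_IA32_EFER
controls are available, EFER switching is programmed through dedicated VMCS
fields rather than the generic MSR-load lists, roughly as below; guest_efer
and host_efer are assumed locals:

	vmcs_write64(GUEST_IA32_EFER, guest_efer);
	vmcs_write64(HOST_IA32_EFER, host_efer);
	vmcs_write32(VM_ENTRY_CONTROLS,
		     vmcs_read32(VM_ENTRY_CONTROLS) | VM_ENTRY_LOAD_IA32_EFER);
	vmcs_write32(VM_EXIT_CONTROLS,
		     vmcs_read32(VM_EXIT_CONTROLS) | VM_EXIT_LOAD_IA32_EFER);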

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e53a5f7..51b8b4f0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2192,7 +2192,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
nested_vmx_exit_ctls_high = 0;
 #endif
-   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+   nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ VM_EXIT_LOAD_IA32_EFER);
 
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2201,8 +2202,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_entry_ctls_high &=
VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
-   nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
-
+   nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
+  VM_ENTRY_LOAD_IA32_EFER);
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
@@ -7486,10 +7487,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-   /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
-   vmcs_write32(VM_EXIT_CONTROLS,
-   vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
-   vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+   /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+* we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER
+* bits are further modified by vmx_set_efer() below.
+*/
+   vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+   /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+* emulated by vmx_set_efer(), below.
+*/
+   vmcs_write32(VM_ENTRY_CONTROLS,
+   (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+   ~VM_ENTRY_IA32E_MODE) |
(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
-- 
1.8.1.2



[PATCH v2 13/13] nEPT: Inject EPT violation/misconfigration

2013-05-06 Thread Jun Nakajima
Add code to detect EPT misconfiguration and inject it to L1 VMM. Also,
it injects more correct exit qualification upon EPT violation to L1
VMM.  Now L1 can correctly go to ept_misconfig handler (instead of
wrongly going to fast_page_fault), it will try to handle mmio page
fault, if failed, it is a real EPT misconfiguration.
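
A worked example of the exit-qualification encoding constructed below in
paging_tmpl.h (the values are illustrative only):

	/* bits [2:0] come from the real exit qualification (access type),
	 * bits [5:3] are the RWX permissions found while walking EPT12:  */
	unsigned long real_qual = 0x1;            /* L2 performed a data read */
	u64 pte_access = VMX_EPT_READABLE_MASK;   /* EPT12 allows read only   */
	unsigned long injected = ((pte_access & 0x7) << 3) | (real_qual & 0x7);
	/* injected == 0x9: a read of a page that is readable but neither
	 * writable nor executable in L1's EPT tables */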

Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/asm/kvm_host.h |  4 +++
 arch/x86/kvm/mmu.c  |  5 ---
 arch/x86/kvm/mmu.h  |  5 +++
 arch/x86/kvm/paging_tmpl.h  | 26 ++
 arch/x86/kvm/vmx.c  | 79 +++--
 5 files changed, 111 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4979778..a32bda6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -262,6 +262,8 @@ struct kvm_mmu {
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
   u64 *spte, const void *pte);
+   bool (*check_tdp_pte)(u64 pte, int level);
+
hpa_t root_hpa;
int root_level;
int shadow_root_level;
@@ -504,6 +506,8 @@ struct kvm_vcpu_arch {
 * instruction.
 */
bool write_fault_to_shadow_pgtable;
+
+   unsigned long exit_qualification; /* set at EPT violation at this point 
*/
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 054c68b..613fbd2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -230,11 +230,6 @@ static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t 
pfn, unsigned access)
return false;
 }
 
-static inline u64 rsvd_bits(int s, int e)
-{
-   return ((1ULL << (e - s + 1)) - 1) << s;
-}
-
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 19dd5ab..8aebd5a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -91,6 +91,11 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
 
+static inline u64 rsvd_bits(int s, int e)
+{
+   return ((1ULL << (e - s + 1)) - 1) << s;
+}
+
 /*
  * Will a fault with a given page-fault error code (pfec) cause a permission
  * fault with the given access (in ACC_* format)?
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 777d5d7..e4a0d72 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -126,10 +126,14 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 
 static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
 {
+#if PTTYPE == PTTYPE_EPT
+   return (mmu->check_tdp_pte(gpte, level));
+#else
int bit7;
 
bit7 = (gpte >> 7) & 1;
return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+#endif
 }
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
@@ -352,6 +356,28 @@ error:
walker->fault.vector = PF_VECTOR;
walker->fault.error_code_valid = true;
walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+   /*
+* Use PFERR_RSVD_MASK in erorr_code to to tell if EPT
+* misconfiguration requires to be injected. The detection is
+* done by is_rsvd_bits_set() above.
+* 
+* We set up the value of exit_qualification to inject:
+* [2:0] -- Derive from [2:0] of real exit_qualification at EPT 
violation
+* [5:3] -- Calculated by the page walk of the guest EPT page tables
+* [7:8] -- Clear to 0.
+* 
+* The other bits are set to 0.
+*/
+   if (!(errcode & PFERR_RSVD_MASK)) {
+   unsigned long exit_qualification = 
vcpu->arch.exit_qualification;
+
+   pte_access = pt_access & pte;
+   vcpu->arch.exit_qualification = ((pte_access & 0x7) << 3) |
+   (exit_qualification & 0x7);
+   }
+#endif
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 914cdda..4edf1fe 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5074,6 +5074,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
/* ept page table is present? */
error_code |= (exit_qualification >> 3) & 0x1;
 
+   vcpu->arch.exit_qualification = exit_qualification;
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
@@ -6994,7 +6996,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu 
*vcpu)
 }
 
 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
-   struct x86_exception *fault)
+ 

[PATCH v2 12/13] nEPT: Move is_rsvd_bits_set() to paging_tmpl.h

2013-05-06 Thread Jun Nakajima
Move is_rsvd_bits_set() to paging_tmpl.h so that it can be used to check
reserved bits in EPT page table entries as well.

Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c |  8 
 arch/x86/kvm/paging_tmpl.h | 12 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 99bfc5e..054c68b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2460,14 +2460,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-   int bit7;
-
-   bit7 = (gpte >> 7) & 1;
-   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 bool no_dirty_log)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51dca23..777d5d7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -124,11 +124,19 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 }
 #endif
 
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+   int bit7;
+
+   bit7 = (gpte >> 7) & 1;
+   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
  struct kvm_mmu_page *sp, u64 *spte,
  u64 gpte)
 {
-   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+   if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
goto no_present;
 
if (!is_present_gpte(gpte))
@@ -279,7 +287,7 @@ retry_walk:
if (unlikely(!is_present_gpte(pte)))
goto error;
 
-   if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
+   if (unlikely(FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, pte,
  walker->level))) {
errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
-- 
1.8.1.2



[PATCH v2 11/13] nEPT: Miscellaneous cleanups

2013-05-06 Thread Jun Nakajima
Some trivial code cleanups not really related to nested EPT.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
Reviewed-by: Paolo Bonzini 
---
 arch/x86/kvm/vmx.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 86e4022..914cdda 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -616,7 +616,6 @@ static void nested_release_page_clean(struct page *page)
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -912,8 +911,7 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, 
u32 bit)
(vmcs12->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-   struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 {
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
@@ -6321,7 +6319,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
!(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-   get_vmcs12(vcpu), vcpu {
+   get_vmcs12(vcpu) {
if (vmx_interrupt_allowed(vcpu)) {
vmx->soft_vnmi_blocked = 0;
} else if (vmx->vnmi_blocked_time > 10LL &&
-- 
1.8.1.2



[PATCH v2 10/13] nEPT: Nested INVEPT

2013-05-06 Thread Jun Nakajima
If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table for
L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in the course
of this modification already calls INVEPT. Therefore, when L1 calls INVEPT,
we don't really need to do anything. In particular we *don't* need to call
the real INVEPT again. All we do in our INVEPT is verify the validity of the
call, and its parameters, and then do nothing.

In KVM Forum 2010, Dong et al. presented "Nested Virtualization Friendly KVM"
and classified our current nested EPT implementation as "shadow-like virtual
EPT". He recommended instead a different approach, which he called "VTLB-like
virtual EPT". If we had taken that alternative approach, INVEPT would have had
a bigger role: L0 would only rebuild the shadow EPT table when L1 calls INVEPT.
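
For reference, an L1 hypervisor issues INVEPT roughly as below (a sketch
modeled on KVM's own __invept() helper; error handling is omitted and the
descriptor contents are illustrative):

	struct {
		u64 eptp, reserved;
	} operand = { eptp, 0 };

	asm volatile (ASM_VMX_INVEPT
		      : : "a" (&operand), "c" (VMX_EPT_EXTENT_CONTEXT)
		      : "cc", "memory");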

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/uapi/asm/vmx.h |  1 +
 arch/x86/kvm/vmx.c  | 83 +
 2 files changed, 84 insertions(+)

diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 2871fcc..ec51012 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED 45
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_INVEPT  50
 #define EXIT_REASON_WBINVD  54
 #define EXIT_REASON_XSETBV  55
 #define EXIT_REASON_APIC_WRITE  56
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index de6cfb4..86e4022 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5879,6 +5879,87 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+   u32 vmx_instruction_info;
+   unsigned long type;
+   gva_t gva;
+   struct x86_exception e;
+   struct {
+   u64 eptp, gpa;
+   } operand;
+
+   if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+   !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   /* According to the Intel VMX instruction reference, the memory
+* operand is read even if it isn't needed (e.g., for type==global)
+*/
+   vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+   if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+   vmx_instruction_info, &gva))
+   return 1;
+   if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+   sizeof(operand), &e)) {
+   kvm_inject_page_fault(vcpu, &e);
+   return 1;
+   }
+
+   type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+   switch (type) {
+   case VMX_EPT_EXTENT_GLOBAL:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_GLOBAL_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /*
+* Do nothing: when L1 changes EPT12, we already
+* update EPT02 (the shadow EPT table) and call INVEPT.
+* So when L1 calls INVEPT, there's nothing left to do.
+*/
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   case VMX_EPT_EXTENT_CONTEXT:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_CONTEXT_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /* Do nothing */
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   case VMX_EPT_EXTENT_INDIVIDUAL_ADDR:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_INDIVIDUAL_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /* Do nothing */
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   default:
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   }
+
+   skip_emulated_in

[PATCH v2 08/13] nEPT: Some additional comments

2013-05-06 Thread Jun Nakajima
Some additional comments to preexisting code:
Explain who (L0 or L1) handles EPT violation and misconfiguration exits.
Don't mention "shadow on either EPT or shadow" as the only two options.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d797d3e..419b9e3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6127,7 +6127,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu 
*vcpu)
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_EPT_VIOLATION:
+   /*
+* L0 always deals with the EPT violation. If nested EPT is
+* used, and the nested mmu code discovers that the address is
+* missing in the guest EPT table (EPT12), the EPT violation
+* will be injected with nested_ept_inject_page_fault()
+*/
+   return 0;
case EXIT_REASON_EPT_MISCONFIG:
+   /*
+* L2 never directly uses L1's EPT, but rather L0's own EPT
+* table (shadow on EPT) or a merged EPT table that L0 built
+* (EPT on EPT). So any problems with the structure of the
+* table are L0's fault.
+*/
return 0;
case EXIT_REASON_WBINVD:
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
-- 
1.8.1.2



[PATCH v2 09/13] nEPT: Advertise EPT to L1

2013-05-06 Thread Jun Nakajima
Advertise the support of EPT to the L1 guest, through the appropriate MSR.

This is the last patch of the basic Nested EPT feature, so as to allow
bisection through this patch series: The guest will not see EPT support until
this last patch, and will not attempt to use the half-applied feature.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/asm/vmx.h |  2 ++
 arch/x86/kvm/vmx.c | 17 +++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b6fbf86..79a5beb 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -376,7 +376,9 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT   (1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT   (1ull << 17)
+#define VMX_EPT_INVEPT_BIT (1ull << 20)
 #define VMX_EPT_AD_BIT (1ull << 21)
+#define VMX_EPT_EXTENT_INDIVIDUAL_BIT  (1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT  (1ull << 26)
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 419b9e3..de6cfb4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2027,6 +2027,7 @@ static u32 nested_vmx_secondary_ctls_low, 
nested_vmx_secondary_ctls_high;
 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
/*
@@ -2102,6 +2103,18 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_secondary_ctls_low = 0;
nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+   if (enable_ept) {
+   /* nested EPT: emulate EPT also to L1 */
+   nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+   nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT;
+   nested_vmx_ept_caps |=
+   VMX_EPT_INVEPT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT |
+   VMX_EPT_EXTENT_CONTEXT_BIT |
+   VMX_EPT_EXTENT_INDIVIDUAL_BIT;
+   nested_vmx_ept_caps &= vmx_capability.ept;
+   } else
+   nested_vmx_ept_caps = 0;
+
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2201,8 +2214,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 *pdata)
nested_vmx_secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
-   /* Currently, no nested ept or nested vpid */
-   *pdata = 0;
+   /* Currently, no nested vpid support */
+   *pdata = nested_vmx_ept_caps;
break;
default:
return 0;
-- 
1.8.1.2



[PATCH v2 07/13] nEPT: Fix wrong test in kvm_set_cr3

2013-05-06 Thread Jun Nakajima
kvm_set_cr3() attempts to check if the new cr3 is a valid guest physical
address. The problem is that with nested EPT, cr3 is an *L2* physical
address, not an L1 physical address as this test expects.

As the comment above this test explains, it isn't necessary, and doesn't
correspond to anything a real processor would do. So this patch removes it.

Note that this wrong test could have also theoretically caused problems
in nested NPT, not just in nested EPT. However, in practice, the problem
was avoided: nested_svm_vmexit()/vmrun() do not call kvm_set_cr3 in the
nested NPT case, and instead set the vmcb (and arch.cr3) directly, thus
circumventing the problem. Additional potential calls to the buggy function
are avoided in that we don't trap cr3 modifications when nested NPT is
enabled. However, because in nested VMX we did want to use kvm_set_cr3()
(as requested in Avi Kivity's review of the original nested VMX patches),
we can't avoid this problem and need to fix it.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/x86.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e172132..c34590d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -659,17 +659,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 */
}
 
-   /*
-* Does the new cr3 value map to physical memory? (Note, we
-* catch an invalid cr3 even in real-mode, because it would
-* cause trouble later on when we turn on paging anyway.)
-*
-* A real CPU would silently accept an invalid cr3 and would
-* attempt to use it - with largely undefined (and often hard
-* to debug) behavior on the guest side.
-*/
-   if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-   return 1;
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
vcpu->arch.mmu.new_cr3(vcpu);
-- 
1.8.1.2



[PATCH v2 05/13] nEPT: MMU context for nested EPT

2013-05-06 Thread Jun Nakajima
KVM's existing shadow MMU code already supports nested TDP. To use it, we
need to set up a new "MMU context" for nested EPT, and create a few callbacks
for it (nested_ept_*()). This context should also use the EPT versions of
the page table access functions (defined in the previous patch).
Then, we need to switch back and forth between this nested context and the
regular MMU context when switching between L1 and L2 (when L1 runs this L2
with EPT).

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c | 38 ++
 arch/x86/kvm/mmu.h |  1 +
 arch/x86/kvm/vmx.c | 53 -
 3 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb9c6fd..99bfc5e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3644,6 +3644,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct 
kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+   ASSERT(vcpu);
+   ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+   context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+   context->nx = is_nx(vcpu); /* TODO: ? */
+   context->new_cr3 = paging_new_cr3;
+   context->page_fault = EPT_page_fault;
+   context->gva_to_gpa = EPT_gva_to_gpa;
+   context->sync_page = EPT_sync_page;
+   context->invlpg = EPT_invlpg;
+   context->update_pte = EPT_update_pte;
+   context->free = paging_free;
+   context->root_level = context->shadow_root_level;
+   context->root_hpa = INVALID_PAGE;
+   context->direct_map = false;
+
+   /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
+  something different.
+*/
+   reset_rsvds_bits_mask(vcpu, context);
+
+
+   /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
+  they are done, or why they write to vcpu->arch.mmu and not context
+*/
+   vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+   vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+   vcpu->arch.mmu.base_role.smep_andnot_wp =
+   kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
+   !is_write_protection(vcpu);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 6987108..19dd5ab 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 
addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool 
direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 485ded6..8fdcacf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -918,6 +918,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct 
vmcs12 *vmcs12,
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -6873,6 +6878,46 @@ static void vmx_set_supported_cpuid(u32 func, struct 
kvm_cpuid_entry2 *entry)
entry->ecx |= bit(X86_FEATURE_VMX);
 }
 
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+   /* return the page table to be shadowed - in our case, EPT12 */
+   return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+   struct x86_exception *fault)
+{
+   struct vmcs12 *vmcs12;
+   nested_vmx_vmexit(vcpu);
+   vmcs12 = get_vmcs12(vcpu);
+   /*
+* Note no need to set vmcs12->vm_exit_reason as it is already copied
+* from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
+*/
+   vmcs12->exit_qualification = fault->error_code;
+   vmcs12->guest_physical_address = fault->address;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+   int r = kvm_init_shadow_EPT_mmu(vcpu, &vcpu->arch.mmu);
+
+   vcpu->arch.mmu.set_cr3   = vmx_set_cr3;
+   vcpu->arch.mmu.get_cr3   = neste

[PATCH v2 06/13] nEPT: Fix cr3 handling in nested exit and entry

2013-05-06 Thread Jun Nakajima
The existing code for handling cr3 and related VMCS fields during nested
exit and entry wasn't correct in all cases:

If L2 is allowed to control cr3 (and this is indeed the case in nested EPT),
during nested exit we must copy the modified cr3 from vmcs02 to vmcs12, and
we forgot to do so. This patch adds this copy.

If L0 isn't controlling cr3 when running L2 (i.e., L0 is using EPT), and
whoever does control cr3 (L1 or L2) is using PAE, the processor might have
saved PDPTEs and we should also save them in vmcs12 (and restore later).

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 37 -
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8fdcacf..d797d3e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7163,10 +7163,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
vmx_set_cr4(vcpu, vmcs12->guest_cr4);
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
 
-   /* shadow page tables on either EPT or shadow page tables */
+   /*
+* Note that kvm_set_cr3() and kvm_mmu_reset_context() will do the
+* right thing, and set GUEST_CR3 and/or EPT_POINTER in all supported
+* settings: 1. shadow page tables on shadow page tables, 2. shadow
+* page tables on EPT, 3. EPT on EPT.
+*/
kvm_set_cr3(vcpu, vmcs12->guest_cr3);
kvm_mmu_reset_context(vcpu);
 
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so they may also have saved PDPTEs
+*/
+   if (enable_ept) {
+   vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+   vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+   vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+   vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+   }
+
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
@@ -7398,6 +7414,25 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 
*vmcs12)
vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+   /*
+* In some cases (usually, nested EPT), L2 is allowed to change its
+* own CR3 without exiting. If it has changed it, we must keep it.
+* Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+* by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+*/
+   if (enable_ept)
+   vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so save their PDPTEs
+*/
+   if (enable_ept) {
+   vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+   vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+   vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+   vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+   }
+
/* TODO: These cannot have changed unless we have MSR bitmaps and
 * the relevant bit asks not to trap the change */
vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-- 
1.8.1.2



[PATCH v2 04/13] nEPT: Define EPT-specific link_shadow_page()

2013-05-06 Thread Jun Nakajima
Since link_shadow_page() is used by a routine in mmu.c, add an
EPT-specific link_shadow_page() in paging_tmpl.h, rather than moving
it.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/paging_tmpl.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5644f61..51dca23 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -461,6 +461,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, 
struct guest_walker *gw,
}
 }
 
+#if PTTYPE == PTTYPE_EPT
+static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp)
+{
+   u64 spte;
+
+   spte = __pa(sp->spt) | VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+   VMX_EPT_EXECUTABLE_MASK;
+
+   mmu_spte_set(sptep, spte);
+}
+#endif
+
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  * If the guest tries to write a write-protected page, we need to
@@ -513,7 +525,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
goto out_gpte_changed;
 
if (sp)
+#if PTTYPE == PTTYPE_EPT
+   FNAME(link_shadow_page)(it.sptep, sp);
+#else
link_shadow_page(it.sptep, sp);
+#endif
}
 
for (;
@@ -533,7 +549,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
  true, direct_access, it.sptep);
+#if PTTYPE == PTTYPE_EPT
+   FNAME(link_shadow_page)(it.sptep, sp);
+#else
link_shadow_page(it.sptep, sp);
+#endif
}
 
clear_sp_write_flooding_count(it.sptep);
-- 
1.8.1.2



[PATCH v2 01/13] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-05-06 Thread Jun Nakajima
Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577
switches the EFER MSR when EPT is used and the host and guest have different
NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
and want to be able to run recent KVM as L1, we need to allow L1 to use this
EFER switching feature.

To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available,
and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
support for the former (the latter is still unsupported).

Nested entry and exit emulation (prepare_vmcs02 and load_vmcs12_host_state,
respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
that's left to do in this patch is to properly advertise this feature to L1.

Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
support this feature, regardless of whether the host supports it.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 867b810..485ded6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2057,6 +2057,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
nested_vmx_exit_ctls_high = 0;
 #endif
+   nested_vmx_exit_ctls_high |= VM_EXIT_LOAD_IA32_EFER;
 
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2064,6 +2065,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_entry_ctls_low = 0;
nested_vmx_entry_ctls_high &=
VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+   nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_IA32_EFER;
 
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -7050,10 +7052,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-   /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
-   vmcs_write32(VM_EXIT_CONTROLS,
-   vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
-   vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+   /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+* we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER
+* bits are further modified by vmx_set_efer() below.
+*/
+   vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+   /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+* emulated by vmx_set_efer(), below.
+*/
+   vmcs_write32(VM_ENTRY_CONTROLS,
+   (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+   ~VM_ENTRY_IA32E_MODE) |
(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
-- 
1.8.1.2



[PATCH v2 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-06 Thread Jun Nakajima
This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") which correctly read and write EPT tables.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c |  5 +
 arch/x86/kvm/paging_tmpl.h | 43 +--
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a431495..cb9c6fd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3388,6 +3388,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
unsigned level, unsigned gp
return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 13ceca6..5644f61 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,22 @@
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+   #define pt_element_t u64
+   #define guest_walker guest_walkerEPT
+   #define FNAME(name) EPT_##name
+   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+   #define PT_LEVEL_BITS PT64_LEVEL_BITS
+   #ifdef CONFIG_X86_64
+   #define PT_MAX_FULL_LEVELS 4
+   #define CMPXCHG cmpxchg
+   #else
+   #define CMPXCHG cmpxchg64
+   #define PT_MAX_FULL_LEVELS 2
+   #endif
 #else
#error Invalid PTTYPE value
 #endif
@@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ *  cmpxchg_gpte() is only needed by update_accessed_dirty_bits(), which is not implemented for EPT.
+ */
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
   pt_element_t __user *ptep_user, unsigned index,
   pt_element_t orig_pte, pt_element_t new_pte)
@@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 
return (ret != orig_pte);
 }
+#endif
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
  struct kvm_mmu_page *sp, u64 *spte,
@@ -126,13 +147,21 @@ no_present:
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
unsigned access;
-
+#if PTTYPE == PTTYPE_EPT
+   access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+ VMX_EPT_EXECUTABLE_MASK));
+#else
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
 
return access;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ * EPT A/D bit support is not implemented.
+ */
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 struct kvm_mmu *mmu,
 struct guest_walker *walker,
@@ -169,6 +198,7 @@ static int FNAME(update_accessed_dirty_bits)(struct 
kvm_vcpu *vcpu,
}
return 0;
 }
+#endif
 
 /*
  * Fetch a guest pte for a guest virtual address
@@ -177,7 +207,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker 
*walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gva_t addr, u32 access)
 {
-   int ret;
pt_element_t pte;
pt_element_t __user *

[PATCH v2 02/13] nEPT: Move gpte_access() and prefetch_invalid_gpte() to paging_tmpl.h

2013-05-06 Thread Jun Nakajima
For preparation, we just move gpte_access() and prefetch_invalid_gpte() from 
mmu.c to paging_tmpl.h.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c | 30 --
 arch/x86/kvm/paging_tmpl.h | 40 +++-
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 956ca35..a431495 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2480,26 +2480,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu 
*vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- u64 gpte)
-{
-   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-   goto no_present;
-
-   if (!is_present_gpte(gpte))
-   goto no_present;
-
-   if (!(gpte & PT_ACCESSED_MASK))
-   goto no_present;
-
-   return false;
-
-no_present:
-   drop_spte(vcpu->kvm, spte);
-   return true;
-}
-
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -3399,16 +3379,6 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, 
unsigned access,
return false;
 }
 
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
-   unsigned access;
-
-   access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-   access &= ~(gpte >> PT64_NX_SHIFT);
-
-   return access;
-}
-
 static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned 
gpte)
 {
unsigned index;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 105dd5b..13ceca6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -103,6 +103,36 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
return (ret != orig_pte);
 }
 
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+   goto no_present;
+
+   if (!is_present_gpte(gpte))
+   goto no_present;
+
+   if (!(gpte & PT_ACCESSED_MASK))
+   goto no_present;
+
+   return false;
+
+no_present:
+   drop_spte(vcpu->kvm, spte);
+   return true;
+}
+
+static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+   unsigned access;
+
+   access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+   access &= ~(gpte >> PT64_NX_SHIFT);
+
+   return access;
+}
+
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 struct kvm_mmu *mmu,
 struct guest_walker *walker,
@@ -225,7 +255,7 @@ retry_walk:
}
 
accessed_dirty &= pte;
-   pte_access = pt_access & gpte_access(vcpu, pte);
+   pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
 
walker->ptes[walker->level - 1] = pte;
} while (!is_last_gpte(mmu, walker->level, pte));
@@ -309,13 +339,13 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp,
gfn_t gfn;
pfn_t pfn;
 
-   if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+   if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
 
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 
gfn = gpte_to_gfn(gpte);
-   pte_access = sp->role.access & gpte_access(vcpu, gpte);
+   pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
protect_clean_gpte(&pte_access, gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
@@ -782,14 +812,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp)
  sizeof(pt_element_t)))
return -EINVAL;
 
-   if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
+   if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
continue;
}
 
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access;
-   pte_access &= gpte_access(vcpu, gpte);
+   pte_access &=

[PATCH 11/11] nEPT: Provide the correct exit qualification upon EPT

2013-04-25 Thread Jun Nakajima
Save bits [2:0] of the exit qualification at a real EPT violation, and reuse that
information when injecting an EPT violation into L1.
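
For reference, the low bits of the EPT-violation exit qualification encode the
faulting access type (per the Intel SDM): bit 0 is set for a data read, bit 1
for a data write, bit 2 for an instruction fetch. A minimal sketch of the
reuse, with a hypothetical helper name:

/* Sketch only: the value is captured at the real (L0) EPT violation and
 * bits [2:0] are carried verbatim into the fault injected into L1.
 */
static inline unsigned long nept_access_bits(unsigned long exit_qualification)
{
	return exit_qualification & 0x7;
}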

Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 arch/x86/kvm/paging_tmpl.h  | 5 +
 arch/x86/kvm/vmx.c  | 3 +++
 3 files changed, 10 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4979778..e029bba 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -504,6 +504,8 @@ struct kvm_vcpu_arch {
 * instruction.
 */
bool write_fault_to_shadow_pgtable;
+
+   unsigned long exit_qualification; /* set at EPT violation at this point 
*/
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e13b6c5..bd370e7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -349,7 +349,12 @@ error:
 
walker->fault.vector = PF_VECTOR;
walker->fault.error_code_valid = true;
+#if PTTYPE != PTTYPE_EPT
walker->fault.error_code = errcode;
+#else
+   /* Reuse bits [2:0] of EPT violation */
+   walker->fault.error_code = vcpu->arch.exit_qualification & 0x7;
+#endif
walker->fault.address = addr;
walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 95304cc..61e2853 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -425,6 +425,7 @@ struct vcpu_vmx {
ktime_t entry_time;
s64 vnmi_blocked_time;
u32 exit_reason;
+   unsigned long exit_qualification;
 
bool rdtscp_enabled;
 
@@ -5074,6 +5075,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
/* ept page table is present? */
error_code |= (exit_qualification >> 3) & 0x1;
 
+   vcpu->arch.exit_qualification = exit_qualification;
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
-- 
1.8.2.1.610.g562af5b



[PATCH 10/11] nEPT: Miscellaneous cleanups

2013-04-25 Thread Jun Nakajima
Some trivial code cleanups not really related to nested EPT.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c67eb06..95304cc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -616,7 +616,6 @@ static void nested_release_page_clean(struct page *page)
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -912,8 +911,7 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, 
u32 bit)
(vmcs12->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-   struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 {
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
@@ -6321,7 +6319,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
!(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-   get_vmcs12(vcpu), vcpu {
+   get_vmcs12(vcpu) {
if (vmx_interrupt_allowed(vcpu)) {
vmx->soft_vnmi_blocked = 0;
} else if (vmx->vnmi_blocked_time > 10LL &&
-- 
1.8.2.1.610.g562af5b



[PATCH 09/11] nEPT: Documentation

2013-04-25 Thread Jun Nakajima
Update the documentation to no longer say that nested EPT is not supported.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 Documentation/virtual/kvm/nested-vmx.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/nested-vmx.txt 
b/Documentation/virtual/kvm/nested-vmx.txt
index 8ed937d..cdf7839 100644
--- a/Documentation/virtual/kvm/nested-vmx.txt
+++ b/Documentation/virtual/kvm/nested-vmx.txt
@@ -38,8 +38,8 @@ The current code supports running Linux guests under KVM 
guests.
 Only 64-bit guest hypervisors are supported.
 
 Additional patches for running Windows under guest KVM, and Linux under
-guest VMware server, and support for nested EPT, are currently running in
-the lab, and will be sent as follow-on patchsets.
+guest VMware server, are currently running in the lab, and will be sent as
+follow-on patchsets.
 
 
 Running nested VMX
-- 
1.8.2.1.610.g562af5b



[PATCH 08/11] nEPT: Nested INVEPT

2013-04-25 Thread Jun Nakajima
If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table for
L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in the course
of this modification already calls INVEPT. Therefore, when L1 calls INVEPT,
we don't really need to do anything. In particular we *don't* need to call
the real INVEPT again. All we do in our INVEPT is verify the validity of the
call, and its parameters, and then do nothing.

In KVM Forum 2010, Dong et al. presented "Nested Virtualization Friendly KVM"
and classified our current nested EPT implementation as "shadow-like virtual
EPT". He recommended instead a different approach, which he called "VTLB-like
virtual EPT". If we had taken that alternative approach, INVEPT would have had
a bigger role: L0 would only rebuild the shadow EPT table when L1 calls INVEPT.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/include/asm/vmx.h  |  4 +-
 arch/x86/include/uapi/asm/vmx.h |  1 +
 arch/x86/kvm/vmx.c  | 83 +
 3 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b6fbf86..0ce54f3 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -376,7 +376,9 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT   (1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT   (1ull << 17)
-#define VMX_EPT_AD_BIT (1ull << 21)
+#define VMX_EPT_INVEPT_BIT (1ull << 20)
+#define VMX_EPT_AD_BIT (1ull << 21)
+#define VMX_EPT_EXTENT_INDIVIDUAL_BIT  (1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT  (1ull << 26)
 
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 2871fcc..5662cef 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED 45
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_INVEPT 50
 #define EXIT_REASON_WBINVD  54
 #define EXIT_REASON_XSETBV  55
 #define EXIT_REASON_APIC_WRITE  56
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 76df3a8..c67eb06 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5879,6 +5879,87 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+   u32 vmx_instruction_info;
+   unsigned long type;
+   gva_t gva;
+   struct x86_exception e;
+   struct {
+   u64 eptp, gpa;
+   } operand;
+
+   if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+   !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   /* According to the Intel VMX instruction reference, the memory
+* operand is read even if it isn't needed (e.g., for type==global)
+*/
+   vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+   if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+   vmx_instruction_info, &gva))
+   return 1;
+   if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+   sizeof(operand), &e)) {
+   kvm_inject_page_fault(vcpu, &e);
+   return 1;
+   }
+
+   type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+   switch (type) {
+   case VMX_EPT_EXTENT_GLOBAL:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_GLOBAL_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /*
+* Do nothing: when L1 changes EPT12, we already
+* update EPT02 (the shadow EPT table) and call INVEPT.
+* So when L1 calls INVEPT, there's nothing left to do.
+*/
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   case VMX_EPT_EXTENT_CONTEXT:
+   if (!(nested_vmx_ept_caps 

[PATCH 07/11] nEPT: Advertise EPT to L1

2013-04-25 Thread Jun Nakajima
Advertise the support of EPT to the L1 guest, through the appropriate MSR.

This is the last patch of the basic Nested EPT feature, so as to allow
bisection through this patch series: The guest will not see EPT support until
this last patch, and will not attempt to use the half-applied feature.
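
As a hedged illustration of the effect on L1 (not code from this patch): once
the MSR is populated, an L1 hypervisor can probe the advertised EPT
capabilities in the usual way. The probing function below is hypothetical; the
MSR and capability bit names are the kernel's.

#include <asm/msr.h>
#include <asm/vmx.h>

/* Hypothetical L1-side probe: before this patch the MSR read returns 0,
 * afterwards it returns the subset of EPT features chosen by L0.
 */
static bool l1_sees_nested_ept(void)
{
	u64 caps;

	rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, caps);
	return (caps & VMX_EPT_PAGE_WALK_4_BIT) &&
	       (caps & VMX_EPT_EXTENT_GLOBAL_BIT);
}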

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 66ead51..76df3a8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2027,6 +2027,7 @@ static u32 nested_vmx_secondary_ctls_low, 
nested_vmx_secondary_ctls_high;
 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
/*
@@ -2102,6 +2103,18 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_secondary_ctls_low = 0;
nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+   if (enable_ept) {
+   /* nested EPT: emulate EPT for L1 as well */
+   nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+   nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT;
+   nested_vmx_ept_caps |=
+   VMX_EPT_INVEPT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT |
+   VMX_EPT_EXTENT_CONTEXT_BIT |
+   VMX_EPT_EXTENT_INDIVIDUAL_BIT;
+   nested_vmx_ept_caps &= vmx_capability.ept;
+   } else
+   nested_vmx_ept_caps = 0;
+
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2201,8 +2214,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 *pdata)
nested_vmx_secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
-   /* Currently, no nested ept or nested vpid */
-   *pdata = 0;
+   /* Currently, no nested vpid support */
+   *pdata = nested_vmx_ept_caps;
break;
default:
return 0;
-- 
1.8.2.1.610.g562af5b



[PATCH 06/11] nEPT: Some additional comments

2013-04-25 Thread Jun Nakajima
Some additional comments to preexisting code:
Explain who (L0 or L1) handles EPT violation and misconfiguration exits.
Don't mention "shadow on either EPT or shadow" as the only two options.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 26a1b6f..66ead51 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6127,7 +6127,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu 
*vcpu)
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_EPT_VIOLATION:
+   /*
+* L0 always deals with the EPT violation. If nested EPT is
+* used, and the nested mmu code discovers that the address is
+* missing in the guest EPT table (EPT12), the EPT violation
+* will be injected with nested_ept_inject_page_fault()
+*/
+   return 0;
case EXIT_REASON_EPT_MISCONFIG:
+   /*
+* L2 never directly uses L1's EPT, but rather L0's own EPT
+* table (shadow on EPT) or a merged EPT table that L0 built
+* (EPT on EPT). So any problems with the structure of the
+* table are L0's fault.
+*/
return 0;
case EXIT_REASON_WBINVD:
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
-- 
1.8.2.1.610.g562af5b



[PATCH 05/11] nEPT: Fix wrong test in kvm_set_cr3

2013-04-25 Thread Jun Nakajima
kvm_set_cr3() attempts to check if the new cr3 is a valid guest physical
address. The problem is that with nested EPT, cr3 is an *L2* physical
address, not an L1 physical address as this test expects.

As the comment above this test explains, it isn't necessary, and doesn't
correspond to anything a real processor would do. So this patch removes it.

Note that this wrong test could have also theoretically caused problems
in nested NPT, not just in nested EPT. However, in practice, the problem
was avoided: nested_svm_vmexit()/vmrun() do not call kvm_set_cr3 in the
nested NPT case, and instead set the vmcb (and arch.cr3) directly, thus
circumventing the problem. Additional potential calls to the buggy function
are avoided in that we don't trap cr3 modifications when nested NPT is
enabled. However, because in nested VMX we did want to use kvm_set_cr3()
(as requested in Avi Kivity's review of the original nested VMX patches),
we can't avoid this problem and need to fix it.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/x86.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e172132..c34590d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -659,17 +659,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 */
}
 
-   /*
-* Does the new cr3 value map to physical memory? (Note, we
-* catch an invalid cr3 even in real-mode, because it would
-* cause trouble later on when we turn on paging anyway.)
-*
-* A real CPU would silently accept an invalid cr3 and would
-* attempt to use it - with largely undefined (and often hard
-* to debug) behavior on the guest side.
-*/
-   if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-   return 1;
vcpu->arch.cr3 = cr3;
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
vcpu->arch.mmu.new_cr3(vcpu);
-- 
1.8.2.1.610.g562af5b



[PATCH 04/11] nEPT: Fix cr3 handling in nested exit and entry

2013-04-25 Thread Jun Nakajima
The existing code for handling cr3 and related VMCS fields during nested
exit and entry wasn't correct in all cases:

If L2 is allowed to control cr3 (and this is indeed the case in nested EPT),
during nested exit we must copy the modified cr3 from vmcs02 to vmcs12, and
we forgot to do so. This patch adds this copy.

If L0 isn't controlling cr3 when running L2 (i.e., L0 is using EPT), and
whoever does control cr3 (L1 or L2) is using PAE, the processor might have
saved PDPTEs and we should also save them in vmcs12 (and restore later).

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 37 -
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6ab53ca..26a1b6f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7163,10 +7163,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
vmx_set_cr4(vcpu, vmcs12->guest_cr4);
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
 
-   /* shadow page tables on either EPT or shadow page tables */
+   /*
+* Note that kvm_set_cr3() and kvm_mmu_reset_context() will do the
+* right thing, and set GUEST_CR3 and/or EPT_POINTER in all supported
+* settings: 1. shadow page tables on shadow page tables, 2. shadow
+* page tables on EPT, 3. EPT on EPT.
+*/
kvm_set_cr3(vcpu, vmcs12->guest_cr3);
kvm_mmu_reset_context(vcpu);
 
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so they may also have saved PDPTEs
+*/
+   if (enable_ept) {
+   vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+   vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+   vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+   vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+   }
+
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
@@ -7398,6 +7414,25 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 
*vmcs12)
vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+   /*
+* In some cases (usually, nested EPT), L2 is allowed to change its
+* own CR3 without exiting. If it has changed it, we must keep it.
+* Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+* by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+*/
+   if (enable_ept)
+   vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so save their PDPTEs
+*/
+   if (enable_ept) {
+   vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+   vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+   vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+   vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+   }
+
/* TODO: These cannot have changed unless we have MSR bitmaps and
 * the relevant bit asks not to trap the change */
vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-- 
1.8.2.1.610.g562af5b



[PATCH 03/11] nEPT: MMU context for nested EPT

2013-04-25 Thread Jun Nakajima
KVM's existing shadow MMU code already supports nested TDP. To use it, we
need to set up a new "MMU context" for nested EPT, and create a few callbacks
for it (nested_ept_*()). This context should also use the EPT versions of
the page table access functions (defined in the previous patch).
Then, we need to switch back and forth between this nested context and the
regular MMU context when switching between L1 and L2 (when L1 runs this L2
with EPT).
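
A rough sketch of that switching, under the assumption (for illustration only)
that the context change happens on the emulated VM entry and exit paths; the
nested_ept_*() and nested_cpu_has_ept() names are introduced by this patch, the
call sites shown are hypothetical.

/* Illustrative only: install the nested EPT MMU context when emulating
 * entry into an L2 that runs with EPT, and rebuild the regular L1 context
 * when emulating the exit back to L1.
 */
static void sketch_enter_l2_mmu(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);	/* walk_mmu now walks EPT12 */
}

static void sketch_return_to_l1_mmu(struct kvm_vcpu *vcpu)
{
	kvm_mmu_reset_context(vcpu);	/* back to the regular L1 MMU context */
}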

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c | 38 ++
 arch/x86/kvm/mmu.h |  1 +
 arch/x86/kvm/vmx.c | 53 -
 3 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb9c6fd..99bfc5e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3644,6 +3644,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct 
kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+   ASSERT(vcpu);
+   ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+   context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+   context->nx = is_nx(vcpu); /* TODO: ? */
+   context->new_cr3 = paging_new_cr3;
+   context->page_fault = EPT_page_fault;
+   context->gva_to_gpa = EPT_gva_to_gpa;
+   context->sync_page = EPT_sync_page;
+   context->invlpg = EPT_invlpg;
+   context->update_pte = EPT_update_pte;
+   context->free = paging_free;
+   context->root_level = context->shadow_root_level;
+   context->root_hpa = INVALID_PAGE;
+   context->direct_map = false;
+
+   /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
+  something different.
+*/
+   reset_rsvds_bits_mask(vcpu, context);
+
+
+   /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
+  they are done, or why they write to vcpu->arch.mmu and not context
+*/
+   vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+   vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+   vcpu->arch.mmu.base_role.smep_andnot_wp =
+   kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
+   !is_write_protection(vcpu);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 6987108..19dd5ab 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 
addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool 
direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9e0ec9d..6ab53ca 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -918,6 +918,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct 
vmcs12 *vmcs12,
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -6873,6 +6878,46 @@ static void vmx_set_supported_cpuid(u32 func, struct 
kvm_cpuid_entry2 *entry)
entry->ecx |= bit(X86_FEATURE_VMX);
 }
 
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+   /* return the page table to be shadowed - in our case, EPT12 */
+   return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+   struct x86_exception *fault)
+{
+   struct vmcs12 *vmcs12;
+   nested_vmx_vmexit(vcpu);
+   vmcs12 = get_vmcs12(vcpu);
+   /*
+* Note no need to set vmcs12->vm_exit_reason as it is already copied
+* from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
+*/
+   vmcs12->exit_qualification = fault->error_code;
+   vmcs12->guest_physical_address = fault->address;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+   int r = kvm_init_shadow_EPT_mmu(vcpu, &vcpu->arch.mmu);
+
+   vcpu->arch.mmu.set_cr3   = vmx_set_cr3;
+   vcpu->arch.mmu.get_cr3   = neste

[PATCH 02/11] nEPT: Add EPT tables support to paging_tmpl.h

2013-04-25 Thread Jun Nakajima
This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with "EPT") which correctly read and write EPT tables.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/mmu.c |  35 ++--
 arch/x86/kvm/paging_tmpl.h | 133 ++---
 2 files changed, 130 insertions(+), 38 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 956ca35..cb9c6fd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2480,26 +2480,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu 
*vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- u64 gpte)
-{
-   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-   goto no_present;
-
-   if (!is_present_gpte(gpte))
-   goto no_present;
-
-   if (!(gpte & PT_ACCESSED_MASK))
-   goto no_present;
-
-   return false;
-
-no_present:
-   drop_spte(vcpu->kvm, spte);
-   return true;
-}
-
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -3399,16 +3379,6 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, 
unsigned access,
return false;
 }
 
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
-   unsigned access;
-
-   access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-   access &= ~(gpte >> PT64_NX_SHIFT);
-
-   return access;
-}
-
 static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned 
gpte)
 {
unsigned index;
@@ -3418,6 +3388,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
unsigned level, unsigned gp
return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 105dd5b..e13b6c5 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,22 @@
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+   #define pt_element_t u64
+   #define guest_walker guest_walkerEPT
+   #define FNAME(name) EPT_##name
+   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+   #define PT_LEVEL_BITS PT64_LEVEL_BITS
+   #ifdef CONFIG_X86_64
+   #define PT_MAX_FULL_LEVELS 4
+   #define CMPXCHG cmpxchg
+   #else
+   #define CMPXCHG cmpxchg64
+   #define PT_MAX_FULL_LEVELS 2
+   #endif
 #else
#error Invalid PTTYPE value
 #endif
@@ -80,6 +96,7 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+#if PTTYPE != PTTYPE_EPT
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
   pt_element_t __user *ptep_user, unsigned index,
   pt_element_t orig_pte, pt_element_t new_pte)
@@ -102,7 +119,52 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 
return (ret != orig_pte);
 }
+#endif
+
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+   unsigned a

[PATCH 01/11] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-04-25 Thread Jun Nakajima
Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577
switches the EFER MSR when EPT is used and the host and guest have different
NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
and want to be able to run recent KVM as L1, we need to allow L1 to use this
EFER switching feature.

To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available,
and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
support for the former (the latter is still unsupported).

Nested entry and exit emulation (prepare_vmcs02 and load_vmcs12_host_state,
respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
that's left to do in this patch is to properly advertise this feature to L1.

Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
support this feature, regardless of whether the host supports it.

Signed-off-by: Nadav Har'El 
Signed-off-by: Jun Nakajima 
Signed-off-by: Xinhao Xu 
---
 arch/x86/kvm/vmx.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6667042..9e0ec9d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2057,6 +2057,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
nested_vmx_exit_ctls_high = 0;
 #endif
+   nested_vmx_exit_ctls_high |= VM_EXIT_LOAD_IA32_EFER;
 
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2064,6 +2065,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_entry_ctls_low = 0;
nested_vmx_entry_ctls_high &=
VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+   nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_IA32_EFER;
 
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -7050,10 +7052,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-   /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
-   vmcs_write32(VM_EXIT_CONTROLS,
-   vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
-   vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+   /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+* we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER
+* bits are further modified by vmx_set_efer() below.
+*/
+   vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+   /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+* emulated by vmx_set_efer(), below.
+*/
+   vmcs_write32(VM_ENTRY_CONTROLS,
+   (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+   ~VM_ENTRY_IA32E_MODE) |
(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
-- 
1.8.2.1.610.g562af5b
