From 5062793a4aae25b8f701decbcf6dead9a4a38348 Mon Sep 17 00:00:00 2001
From: Sheng Yang <[EMAIL PROTECTED]>
Date: Fri, 1 Feb 2008 06:51:01 +0800
Subject: [PATCH] KVM: VMX: Enable EPT feature

The EPT entry format is different from the ordinary page table format, so this
patch sets up the EPT tables itself. With EPT, the hardware CR3 always points
to the guest page table, and changing CR3 does not cause a vmexit. Real mode is
supported by using an identity-mapped page table.

Signed-off-by: Sheng Yang <[EMAIL PROTECTED]>
---
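Not part of the patch itself: the short userspace C sketch below illustrates
what the VMX_GET_EPTE_OFFSET macro computes for each level of the walk done by
insert_ept_entry(), and how handle_ept_violation() interprets the two
exit-qualification fields it checks. It assumes VMX_EPT_DEFAULT_GAW is 3 (a
four-level table), which is not shown in this patch; none of the names below
are kernel APIs, they exist only for illustration.

/*
 * Illustrative only: EPT index extraction and the exit-qualification
 * fields as interpreted by the patch below.
 */
#include <stdint.h>
#include <stdio.h>

#define EPT_PT_BITS      9                    /* 512 entries per table page */
#define EPT_ENT_PER_PAGE (1 << EPT_PT_BITS)
#define EPT_PAGE_SHIFT   12                   /* 4 KiB pages */

/* Index into the EPT table at 'level' (0 = leaf) for guest-physical 'gpa'. */
static unsigned int ept_index(uint64_t gpa, int level)
{
	return (gpa >> (EPT_PAGE_SHIFT + EPT_PT_BITS * level)) &
	       (EPT_ENT_PER_PAGE - 1);
}

/* The two exit-qualification fields handle_ept_violation() looks at. */
static void decode_ept_violation(uint64_t exit_qualification)
{
	int beyond_gaw   = (exit_qualification >> 6) & 0x1;	/* bit 6 */
	int gla_validity = (exit_qualification >> 7) & 0x3;	/* bits 7:8 */

	printf("GPA beyond GAW: %d, GLA validity: %d\n",
	       beyond_gaw, gla_validity);
}

int main(void)
{
	uint64_t gpa = 0x12345678000ULL;
	int level;

	/* Top-down, from the highest level (gaw) to the leaf, in the same
	 * order as the walk in insert_ept_entry(). */
	for (level = 3; level >= 0; level--)
		printf("level %d index: %u\n", level, ept_index(gpa, level));

	decode_ept_violation(0x181);	/* made-up example value */
	return 0;
}

Each level consumes 9 bits of the guest-physical address above the 12-bit page
offset, so the level-0 (leaf) entry maps one 4 KiB guest frame to a host frame.
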
 arch/x86/kvm/vmx.c |  357 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 352 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5f3767a..f5b59e7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -40,7 +40,7 @@ module_param(bypass_guest_pf, bool, 0);
 static int enable_vpid = 1;
 module_param(enable_vpid, bool, 0);
 
-static int enable_ept = 0;
+static int enable_ept = 1;
 module_param(enable_ept, bool, 0);
 
 struct vmcs {
@@ -136,6 +136,15 @@ struct vmx_capability {
 		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
 	}
 
+#define EPT_PT_BITS 9
+#define EPT_ENT_PER_PAGE (1 << EPT_PT_BITS)
+#define EPT_PAGE_SHIFT 12
+/* level 0 is the leaf of the EPT table */
+#define VMX_GET_EPTE_OFFSET(gpa, level)				\
+	(((gpa) & ((EPT_ENT_PER_PAGE - 1) << (EPT_PAGE_SHIFT +	\
+	EPT_PT_BITS * (level)))) >> (EPT_PAGE_SHIFT +		\
+	EPT_PT_BITS * (level)))
+
 static struct kvm_vmx_segment_field {
 	unsigned selector;
 	unsigned base;
@@ -294,6 +303,18 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
 		  : : "a"(&operand), "c"(ext) : "cc", "memory");
 }
 
+static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+{
+	struct {
+		u64 eptp, gpa;
+	} operand = {eptp, gpa};
+
+	asm volatile (ASM_VMX_INVEPT
+			/* CF==1 or ZF==1 --> rc = -1 */
+			"; ja 1f ; ud2 ; 1:\n"
+			: : "a" (&operand), "c" (ext) : "cc", "memory");
+}
+
 static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -345,6 +366,34 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
 	__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
 }
 
+static inline void ept_sync_global(void)
+{
+	if (cpu_has_vmx_invept_global())
+		__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+}
+
+static inline void ept_sync_context(struct kvm *kvm)
+{
+	if (vm_need_ept()) {
+		if (cpu_has_vmx_invept_context())
+			__invept(VMX_EPT_EXTENT_CONTEXT,
+					kvm->arch.eptp.entry, 0);
+		else
+			ept_sync_global();
+	}
+}
+
+static inline void ept_sync_individual_addr(struct kvm *kvm, gpa_t gpa)
+{
+	if (vm_need_ept()) {
+		if (cpu_has_vmx_invept_individual_addr())
+			__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
+					kvm->arch.eptp.entry, gpa);
+		else
+			ept_sync_context(kvm);
+	}
+}
+
 static unsigned long vmcs_readl(unsigned long field)
 {
 	unsigned long value;
@@ -432,6 +481,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 		eb |= 1u << 1;
 	if (vcpu->arch.rmode.active)
 		eb = ~0;
+	if (vm_need_ept())
+		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
@@ -1355,8 +1406,64 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
 }
 
+static void ept_new_cr3(struct kvm_vcpu *vcpu)
+{
+	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+		if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
+			printk(KERN_ERR "EPT: Failed to load pdptrs!\n");
+			return;
+		}
+		vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
+		vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
+		vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
+		vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
+	}
+}
+
+static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+
+static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
+					unsigned long cr0,
+					struct kvm_vcpu *vcpu)
+{
+	if (!(cr0 & X86_CR0_PG)) {
+		/* From paging/starting to nonpaging */
+		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+			     vmcs_config.cpu_based_exec_ctrl |
+			     (CPU_BASED_CR3_LOAD_EXITING |
+			      CPU_BASED_CR3_STORE_EXITING));
+		vcpu->arch.cr0 = cr0;
+		vmx_set_cr4(vcpu, vcpu->arch.cr4);
+		*hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
+		*hw_cr0 &= ~X86_CR0_WP;
+	} else if (!is_paging(vcpu)) {
+		/* From nonpaging to paging */
+		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+			     vmcs_config.cpu_based_exec_ctrl &
+			     ~(CPU_BASED_CR3_LOAD_EXITING |
+			       CPU_BASED_CR3_STORE_EXITING));
+		vcpu->arch.cr0 = cr0;
+		vmx_set_cr4(vcpu, vcpu->arch.cr4);
+		if (!(vcpu->arch.cr0 & X86_CR0_WP))
+			*hw_cr0 &= ~X86_CR0_WP;
+	}
+}
+
+static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
+					struct kvm_vcpu *vcpu)
+{
+	if (!is_paging(vcpu)) {
+		*hw_cr4 &= ~X86_CR4_PAE;
+		*hw_cr4 |= X86_CR4_PSE;
+	} else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
+		*hw_cr4 &= ~X86_CR4_PAE;
+}
+
 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
+	unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
+				KVM_VM_CR0_ALWAYS_ON;
+
 	vmx_fpu_deactivate(vcpu);
 
 	if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
@@ -1374,9 +1481,11 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	}
 #endif
 
+	if (vm_need_ept())
+		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+
 	vmcs_writel(CR0_READ_SHADOW, cr0);
-	vmcs_writel(GUEST_CR0,
-		    (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
+	vmcs_writel(GUEST_CR0, hw_cr0);
 	vcpu->arch.cr0 = cr0;
 
 	if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
@@ -1385,6 +1494,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+	if (vm_need_ept())
+		ept_new_cr3(vcpu);
+
 	vmx_flush_tlb(vcpu);
 	vmcs_writel(GUEST_CR3, cr3);
 	if (vcpu->arch.cr0 & X86_CR0_PE)
@@ -1393,9 +1505,14 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
+	unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
+		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+
+	if (vm_need_ept())
+		ept_update_paging_mode_cr4(&hw_cr4, vcpu);
+
 	vmcs_writel(CR4_READ_SHADOW, cr4);
-	vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
-		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
+	vmcs_writel(GUEST_CR4, hw_cr4);
 	vcpu->arch.cr4 = cr4;
 }
 
@@ -1857,6 +1974,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	vpid_sync_vcpu_all(vmx);
 
+	ept_sync_context(vmx->vcpu.kvm);
+
 	return 0;
 
 out:
@@ -2016,6 +2135,9 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	if (is_page_fault(intr_info)) {
+		/* Page-fault exits should not occur when EPT is enabled */
+		if (vm_need_ept())
+			BUG();
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 		return kvm_mmu_page_fault(vcpu, cr2, error_code);
 	}
@@ -2307,6 +2429,140 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int insert_ept_entry(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
+{
+	struct page *page = pfn_to_page(kvm->arch.eptp.fields.asr_mfn);
+	int level, rtn;
+	u64 offset;
+	u64 *table;
+	epte_t epte;
+	struct page *ept_page;
+
+	rtn = 0;
+	/* level 0 is leaf */
+	mutex_lock(&kvm->arch.ept_mutex);
+	for (level = kvm->arch.eptp.fields.gaw; level > 0; level--) {
+		table = kmap_atomic(page, KM_USER0);
+		offset = VMX_GET_EPTE_OFFSET(gpa, level);
+		epte.entry = 0;
+		if (table[offset] == 0) {
+			ept_page = alloc_page(GFP_KERNEL |
+					      __GFP_HIGHMEM | __GFP_ZERO);
+			if (!ept_page) {
+				kunmap_atomic(table, KM_USER0);
+				rtn = -ENOMEM;
+				break;
+			}
+			atomic_inc(&kvm->arch.ept_npages);
+			epte.fields.addr_mfn = page_to_pfn(ept_page);
+			epte.fields.r = epte.fields.w = epte.fields.x = 1;
+			table[offset] = epte.entry;
+		} else
+			epte.entry = (u64)table[offset];
+		kunmap_atomic(table, KM_USER0);
+		page = pfn_to_page(epte.fields.addr_mfn);
+	}
+	if (rtn < 0)
+		goto out;
+	/* dealing with leaf */
+	table = kmap_atomic(page, KM_USER0);
+	offset = VMX_GET_EPTE_OFFSET(gpa, 0);
+	epte.entry = (u64)table[offset];
+	if (table[offset] == 0) {
+		epte.fields.addr_mfn = hpa >> PAGE_SHIFT;
+		epte.fields.emt = kvm->arch.eptp.fields.etmt;
+		epte.fields.r = epte.fields.w = epte.fields.x = 1;
+		table[offset] = epte.entry;
+	} else {
+		printk(KERN_INFO "EPT: GPA has already been mapped. "
+			"GPA: 0x%lx, HPA: 0x%lx\n",
+			(long unsigned int)gpa, (long unsigned int)hpa);
+		rtn = 1;
+	}
+	kunmap_atomic(table, KM_USER0);
+out:
+	mutex_unlock(&kvm->arch.ept_mutex);
+	return rtn;
+}
+
+static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	u64 exit_qualification;
+	enum emulation_result er;
+	gpa_t gpa;
+	hpa_t hpa;
+	unsigned long hva;
+	struct page *pages[1];
+	int npages, gla_validity;
+	int r;
+
+	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+	/*
+	 * 1. GPA exceeds GAW.
+	 * 2. RWX violation.
+	 */
+	if (exit_qualification & (1 << 6)) {
+		printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
+		return -ENOTSUPP;
+	}
+	gla_validity = (exit_qualification >> 7) & 0x3;
+	if (gla_validity != 0x3 && gla_validity != 0x1) {
+		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
+		printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
+			(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
+			(long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+		printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
+			(long unsigned int)exit_qualification);
+		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+		kvm_run->hw.hardware_exit_reason = 0;
+		return -ENOTSUPP;
+	}
+
+	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (!kvm_is_error_hva(hva)) {
+		npages = get_user_pages(current, current->mm, hva, 1, 1, 0,
+				pages, NULL);
+		if (npages != 1) {
+			printk(KERN_ERR
+				"EPT: get_user_pages() failed for hva 0x%lx\n",
+				hva);
+			return -ENOTSUPP;
+		}
+		atomic_inc(&vcpu->kvm->arch.guest_npages);
+		hpa = page_to_phys(pages[0]);
+		r = insert_ept_entry(vcpu->kvm, gpa & PAGE_MASK, hpa);
+		if (r != 0) {
+			/* More than one vcpu may fault on the same GPA at
+			 * the same time, so release the duplicate reference */
+			kvm_release_page_clean(pages[0]);
+			atomic_dec(&vcpu->kvm->arch.guest_npages);
+		}
+		if (r < 0) {
+			printk(KERN_ERR "EPT: Not enough memory!\n");
+			return -ENOMEM;
+		}
+		return 1;
+	} else {
+		/* must be MMIO */
+		er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+		if (er == EMULATE_FAIL) {
+			printk(KERN_ERR
+			 "EPT: Failed to handle EPT violation vmexit! er is %d\n",
+			 er);
+			printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
+			 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
+			 (long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+			printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
+				(long unsigned int)exit_qualification);
+			return -ENOTSUPP;
+		} else if (er == EMULATE_DO_MMIO)
+			return 0;
+	}
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -2329,6 +2585,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
+	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -2597,6 +2854,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 #endif
 	      );
 
+	/* CR3 accesses don't cause a vmexit in paging mode, so we need
+	 * to sync with the guest's real CR3. */
+	if (vm_need_ept() && is_paging(vcpu)) {
+		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+		ept_new_cr3(vcpu);
+	}
+
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
@@ -2625,6 +2889,59 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 	}
 }
 
+static int ept_teardown(struct kvm *kvm)
+{
+	const int entries_per_page = PAGE_SIZE / sizeof(u64);
+	int level, i;
+	hpa_t path[VMX_EPT_MAX_GAW];
+	epte_t *table;
+	eptp_t eptp;
+	struct page *page, *ept_page;
+
+	mutex_lock(&kvm->arch.ept_mutex);
+	eptp = kvm->arch.eptp;
+	level = eptp.fields.gaw;
+	path[level] = eptp.fields.asr_mfn;
+	while (level <= eptp.fields.gaw) {
+		ept_page = pfn_to_page(path[level]);
+		table = kmap_atomic(ept_page, KM_USER0);
+		for (i = 0; i < entries_per_page; i++) {
+			if (table[i].entry == 0)
+				continue;
+			if (level == 0) {
+				page = pfn_to_page(table[i].fields.addr_mfn);
+				kvm_release_page_clean(page);
+				atomic_dec(&kvm->arch.guest_npages);
+				table[i].entry = 0;
+			} else {
+				level--;
+				path[level] = table[i].fields.addr_mfn;
+				table[i].entry = 0;
+				break;
+			}
+		}
+		kunmap_atomic(table, KM_USER0);
+		if (i != entries_per_page)
+			continue;
+		__free_page(ept_page);
+		atomic_dec(&kvm->arch.ept_npages);
+		level++;
+	}
+	kvm->arch.eptp.entry = 0;
+	mutex_unlock(&kvm->arch.ept_mutex);
+
+	if (atomic_read(&kvm->arch.ept_npages) ||
+	    atomic_read(&kvm->arch.guest_npages))
+		printk(KERN_ERR "EPT: Failed to tear down EPT table! "
+				"%d EPT pages and %d guest pages remain\n",
+				atomic_read(&kvm->arch.ept_npages),
+				atomic_read(&kvm->arch.guest_npages));
+	else
+		printk(KERN_INFO "EPT: Successfully tore down EPT table\n");
+
+	return 0;
+}
+
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2633,6 +2950,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	if (vmx->vpid != 0)
 		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
 	spin_unlock(&vmx_vpid_lock);
+	if (vcpu->vcpu_id == 0 && vm_need_ept())
+		ept_teardown(vcpu->kvm);
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->host_msrs);
 	kfree(vmx->guest_msrs);
@@ -2640,6 +2959,26 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
+static int ept_eptp_init(struct kvm *kvm)
+{
+	struct page *root_page;
+
+	root_page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+	if (!root_page)
+		return -ENOMEM;
+
+	mutex_init(&kvm->arch.ept_mutex);
+
+	/* TODO: use the values read from the MSR instead of these defaults */
+	kvm->arch.eptp.fields.etmt = VMX_EPT_DEFAULT_MT;
+	kvm->arch.eptp.fields.gaw = VMX_EPT_DEFAULT_GAW;
+	kvm->arch.eptp.fields.asr_mfn = page_to_pfn(root_page);
+	atomic_set(&kvm->arch.guest_npages, 0);
+	atomic_set(&kvm->arch.ept_npages, 1);
+
+	return 0;
+}
+
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	int err;
@@ -2650,6 +2989,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		return ERR_PTR(-ENOMEM);
 
 	allocate_vpid(vmx);
+	if (id == 0 && vm_need_ept())
+		if (ept_eptp_init(kvm) < 0)
+			return ERR_PTR(-ENOMEM);
 
 	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
 	if (err)
@@ -2802,9 +3144,14 @@ static int __init vmx_init(void)
 	if (r)
 		goto out1;
 
+	if (cpu_has_vmx_ept())
+		bypass_guest_pf = 0;
+
 	if (bypass_guest_pf)
 		kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
 
+	ept_sync_global();
+
 	return 0;
 
 out1:
-- 
debian.1.5.3.7.1-dirty
