From: Marcelo Tosatti <[EMAIL PROTECTED]>

Hypercall based pte updates are faster than faults, and also allow use
of the lazy MMU mode to batch operations.

Don't report the feature if two dimensional paging is enabled.

v1->v2:
- guest passes physical destination addr, which is cheaper than doing v->p
translation in the host.
- infer size of pte from guest mode

v2->v3:
- switch to one ioctl per paravirt feature
- move hypercall handling to mmu.c

v3->v4:
- one mmu_op hypercall instead of one per op
- allow 64-bit gpa on hypercall
- don't pass host errors (-ENOMEM) to guest

Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>
Signed-off-by: Avi Kivity <[EMAIL PROTECTED]>
---
 arch/x86/kvm/mmu.c         |  135 +++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c         |   18 ++++++-
 include/asm-x86/kvm_host.h |    4 +
 include/asm-x86/kvm_para.h |   29 +++++++++
 include/linux/kvm.h        |    1 +
 include/linux/kvm_para.h   |    5 +-
 6 files changed, 189 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4583329..14de7dc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -40,7 +40,7 @@
  * 2. while doing 1. it walks guest-physical to host-physical
  * If the hardware supports that we don't need to do shadow paging.
  */
-static bool tdp_enabled = false;
+bool tdp_enabled = false;
 
 #undef MMU_DEBUG
 
@@ -167,6 +167,13 @@ static int dbg = 1;
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
+struct kvm_pv_mmu_op_buffer {
+       void *ptr;
+       unsigned len;
+       unsigned processed;
+       char buf[512] __aligned(sizeof(long));
+};
+
 struct kvm_rmap_desc {
        u64 *shadow_ptes[RMAP_EXT];
        struct kvm_rmap_desc *more;
@@ -1995,6 +2002,132 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm 
*kvm)
        return nr_mmu_pages;
 }
 
+static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+                               unsigned len)
+{
+       if (len > buffer->len)
+               return NULL;
+       return buffer->ptr;
+}
+
+static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+                               unsigned len)
+{
+       void *ret;
+
+       ret = pv_mmu_peek_buffer(buffer, len);
+       if (!ret)
+               return ret;
+       buffer->ptr += len;
+       buffer->len -= len;
+       buffer->processed += len;
+       return ret;
+}
+
+static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
+                            gpa_t addr, gpa_t value)
+{
+       int bytes = 8;
+       int r;
+
+       if (!is_long_mode(vcpu) && !is_pae(vcpu))
+               bytes = 4;
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               return r;
+
+       if (!__emulator_write_phys(vcpu, addr, &value, bytes))
+               return -EFAULT;
+
+       return 1;
+}
+
+static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+       kvm_x86_ops->tlb_flush(vcpu);
+       return 1;
+}
+
+static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+       spin_lock(&vcpu->kvm->mmu_lock);
+       mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       return 1;
+}
+
+static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
+                            struct kvm_pv_mmu_op_buffer *buffer)
+{
+       struct kvm_mmu_op_header *header;
+
+       header = pv_mmu_peek_buffer(buffer, sizeof *header);
+       if (!header)
+               return 0;
+       switch (header->op) {
+       case KVM_MMU_OP_WRITE_PTE: {
+               struct kvm_mmu_op_write_pte *wpte;
+
+               wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
+               if (!wpte)
+                       return 0;
+               return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
+                                       wpte->pte_val);
+       }
+       case KVM_MMU_OP_FLUSH_TLB: {
+               struct kvm_mmu_op_flush_tlb *ftlb;
+
+               ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
+               if (!ftlb)
+                       return 0;
+               return kvm_pv_mmu_flush_tlb(vcpu);
+       }
+       case KVM_MMU_OP_RELEASE_PT: {
+               struct kvm_mmu_op_release_pt *rpt;
+
+               rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
+               if (!rpt)
+                       return 0;
+               return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
+       }
+       default: return 0;
+       }
+}
+
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+                 gpa_t addr, unsigned long *ret)
+{
+       int r;
+       struct kvm_pv_mmu_op_buffer buffer;
+
+       down_read(&current->mm->mmap_sem);
+       down_read(&vcpu->kvm->slots_lock);
+
+       buffer.ptr = buffer.buf;
+       buffer.len = min(bytes, sizeof buffer.buf);
+       buffer.processed = 0;
+
+       r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+       if (r)
+               goto out;
+
+       while (buffer.len) {
+               r = kvm_pv_mmu_op_one(vcpu, &buffer);
+               if (r < 0)
+                       goto out;
+               if (r == 0)
+                       break;
+       }
+
+       r = 1;
+out:
+       *ret = buffer.processed;
+       up_read(&vcpu->kvm->slots_lock);
+       up_read(&current->mm->mmap_sem);
+       return r;
+}
+
 #ifdef AUDIT
 
 static const char *audit_msg;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1a6f0a..29f4f5d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -817,6 +817,9 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_NR_MEMSLOTS:
                r = KVM_MEMORY_SLOTS;
                break;
+       case KVM_CAP_MMU_WRITE:
+               r = !tdp_enabled;
+               break;
        default:
                r = 0;
                break;
@@ -2383,9 +2386,19 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
+static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
+                          unsigned long a1)
+{
+       if (is_long_mode(vcpu))
+               return a0;
+       else
+               return a0 | ((gpa_t)a1 << 32);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
        unsigned long nr, a0, a1, a2, a3, ret;
+       int r = 1;
 
        kvm_x86_ops->cache_regs(vcpu);
 
@@ -2407,6 +2420,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        case KVM_HC_VAPIC_POLL_IRQ:
                ret = 0;
                break;
+       case KVM_HC_MMU_OP:
+               r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
+               break;
        default:
                ret = -KVM_ENOSYS;
                break;
@@ -2414,7 +2430,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        vcpu->arch.regs[VCPU_REGS_RAX] = ret;
        kvm_x86_ops->decache_regs(vcpu);
        ++vcpu->stat.hypercalls;
-       return 0;
+       return r;
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 0639010..d20cabc 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -432,6 +432,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
 
 int __emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
                          const void *val, int bytes);
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+                 gpa_t addr, unsigned long *ret);
+
+extern bool tdp_enabled;
 
 enum emulation_result {
        EMULATE_DONE,       /* no further processing */
diff --git a/include/asm-x86/kvm_para.h b/include/asm-x86/kvm_para.h
index ed5df3a..5098459 100644
--- a/include/asm-x86/kvm_para.h
+++ b/include/asm-x86/kvm_para.h
@@ -12,10 +12,39 @@
 #define KVM_CPUID_FEATURES     0x40000001
 #define KVM_FEATURE_CLOCKSOURCE                0
 #define KVM_FEATURE_NOP_IO_DELAY       1
+#define KVM_FEATURE_MMU_OP             2
 
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#define KVM_MAX_MMU_OP_BATCH           32
+
+/* Operations for KVM_HC_MMU_OP */
+#define KVM_MMU_OP_WRITE_PTE            1
+#define KVM_MMU_OP_FLUSH_TLB           2
+#define KVM_MMU_OP_RELEASE_PT          3
+
+/* Payload for KVM_HC_MMU_OP */
+struct kvm_mmu_op_header {
+       __u32 op;
+       __u32 pad;
+};
+
+struct kvm_mmu_op_write_pte {
+       struct kvm_mmu_op_header header;
+       __u64 pte_phys;
+       __u64 pte_val;
+};
+
+struct kvm_mmu_op_flush_tlb {
+       struct kvm_mmu_op_header header;
+};
+
+struct kvm_mmu_op_release_pt {
+       struct kvm_mmu_op_header header;
+       __u64 pt_phys;
+};
+
 #ifdef __KERNEL__
 #include <asm/processor.h>
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea7907d..c4b1c44 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -237,6 +237,7 @@ struct kvm_vapic_addr {
 #define KVM_CAP_NR_VCPUS 9       /* returns max vcpus per vm */
 #define KVM_CAP_NR_MEMSLOTS 10   /* returns max memory slots per vm */
 #define KVM_CAP_NOP_IO_DELAY 11
+#define KVM_CAP_MMU_WRITE 12
 
 /*
  * ioctls for VM fds
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 9c462c9..3ddce03 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -11,8 +11,11 @@
 
 /* Return values for hypercalls */
 #define KVM_ENOSYS             1000
+#define KVM_EFAULT             EFAULT
+#define KVM_E2BIG              E2BIG
 
-#define KVM_HC_VAPIC_POLL_IRQ            1
+#define KVM_HC_VAPIC_POLL_IRQ          1
+#define KVM_HC_MMU_OP                  2
 
 /*
  * hypercalls use architecture specific
-- 
1.5.4.2


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
kvm-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/kvm-devel

Reply via email to