- made kvm_vm_ioctl_get_dirty_log() generic moved to kvm_main.c, deleted 
arm,x86  versions
- optimized kvm_mmu_write_protect_pt_masked() to skip upper table lookups


Signed-off-by: Mario Smarduch <m.smard...@samsung.com>
---
 arch/arm/include/asm/kvm_host.h |    3 ++
 arch/arm/kvm/arm.c              |    5 --
 arch/arm/kvm/mmu.c              |   99 +++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |   78 ------------------------------
 virt/kvm/kvm_main.c             |   82 ++++++++++++++++++++++++++++++++
 5 files changed, 184 insertions(+), 83 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 9f827c8..c5c27d8 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -237,5 +237,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 
value);
 void kvm_tlb_flush_vmid(struct kvm *kvm);
 
 int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+               struct kvm_memory_slot *slot,
+               gfn_t gfn_offset, unsigned long mask);
 
 #endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index b916478..6ca3e84 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -784,11 +784,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        }
 }
 
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
-{
-       return -EINVAL;
-}
-
 static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
                                        struct kvm_arm_device_addr *dev_addr)
 {
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 4d029a6..52d4dd6 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -812,6 +812,105 @@ int kvm_mmu_slot_remove_write_access(struct kvm *kvm, int 
slot)
        return 0;
 }
 
+/**
+ * kvm_mmu_write_protect_pt_masked - after the migration thread write protects
+ *  the entire VM address space, iterative calls are made to get dirty pages
+ *  as the VM pages are being migrated. New dirty pages may be a subset
+ *  of the initially write-protected VM or new writes faulted in. Here write
+ *  protect new dirty pages again in preparation for the next dirty log read.
+ *  This function is called as a result of the KVM_GET_DIRTY_LOG ioctl, to
+ *  determine what pages need to be migrated.
+ *   'kvm->mmu_lock' must be held to protect against concurrent modification
+ *   of page tables (2nd stage fault, mmu modifiers, ...)
+ *
+ * @kvm:        The KVM pointer
+ * @slot:       The memory slot the dirty log is retrieved for
+ * @gfn_offset: The gfn offset in memory slot
+ * @mask:       The mask of dirty pages at offset 'gfn_offset' in this memory
+ *              slot to be write protected
+ */
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+               struct kvm_memory_slot *slot,
+               gfn_t gfn_offset, unsigned long mask)
+{
+       phys_addr_t ipa, next, offset_ipa;
+       pgd_t *pgdp = kvm->arch.pgd, *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       gfn_t gfnofst = slot->base_gfn + gfn_offset;
+       bool crosses_pmd;
+
+       ipa = (gfnofst + __ffs(mask)) << PAGE_SHIFT;
+       offset_ipa  = gfnofst << PAGE_SHIFT;
+       next = (gfnofst + (BITS_PER_LONG - 1)) << PAGE_SHIFT;
+
+       /* check if mask width crosses 2nd level page table range, and
+        * possibly 3rd, 4th. If not, skip upper table lookups. Unlikely
+        * to be true: machine memory regions tend to start on at least a
+        * PMD boundary and mask is a power of 2.
+        */
+       crosses_pmd = ((offset_ipa & PMD_MASK) ^ (next & PMD_MASK)) ? true :
+                                                                       false;
+       /* If the pgd, pud or pmd is not present and the mask crosses the pmd
+        * range, check the next index. Unlikely that the pgd and pud would be
+        * not present. Between dirty page marking and now the page tables may
+        * have been altered.
+        */
+       pgd = pgdp + pgd_index(ipa);
+       if (unlikely(crosses_pmd && !pgd_present(*pgd))) {
+               pgd = pgdp + pgd_index(next);
+               if (!pgd_present(*pgd))
+                       return;
+       }
+
+       pud = pud_offset(pgd, ipa);
+       if (unlikely(crosses_pmd && !pud_present(*pud))) {
+               pud = pud_offset(pgd, next);
+               if (!pud_present(*pud))
+                       return;
+       }
+
+       pmd = pmd_offset(pud, ipa);
+       if (unlikely(crosses_pmd && !pmd_present(*pmd))) {
+               pmd = pmd_offset(pud, next);
+               if (!pmd_present(*pmd))
+                       return;
+       }
+
+       for (;;) {
+               pte = pte_offset_kernel(pmd, ipa);
+               if (!pte_present(*pte))
+                       goto next_ipa;
+
+               if ((*pte & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY)
+                       goto next_ipa;
+
+               stage2_mark_pte_ro(pte);
+
+next_ipa:
+               mask &= mask - 1;
+               if (!mask)
+                       break;
+
+               /* find next page */
+               ipa = (gfnofst + __ffs(mask)) << PAGE_SHIFT;
+
+               /* skip upper page table lookups */
+               if (!crosses_pmd)
+                       continue;
+
+               pgd = pgdp + pgd_index(ipa);
+               if (unlikely(!pgd_present(*pgd)))
+                       goto next_ipa;
+               pud = pud_offset(pgd, ipa);
+               if (unlikely(!pud_present(*pud)))
+                       goto next_ipa;
+               pmd = pmd_offset(pud, ipa);
+               if (unlikely(!pmd_present(*pmd)))
+                       goto next_ipa;
+       }
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                          struct kvm_memory_slot *memslot,
                          unsigned long fault_status)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b85784..316b655 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3539,84 +3539,6 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
        return 0;
 }
 
-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * We need to keep it in mind that VCPU threads can write to the bitmap
- * concurrently.  So, to avoid losing data, we keep the following order for
- * each bit:
- *
- *   1. Take a snapshot of the bit and clear it if needed.
- *   2. Write protect the corresponding page.
- *   3. Flush TLB's if needed.
- *   4. Copy the snapshot to the userspace.
- *
- * Between 2 and 3, the guest may write to the page using the remaining TLB
- * entry.  This is not a problem because the page will be reported dirty at
- * step 4 using the snapshot taken before and step 3 ensures that successive
- * writes will be logged for the next call.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
-{
-       int r;
-       struct kvm_memory_slot *memslot;
-       unsigned long n, i;
-       unsigned long *dirty_bitmap;
-       unsigned long *dirty_bitmap_buffer;
-       bool is_dirty = false;
-
-       mutex_lock(&kvm->slots_lock);
-
-       r = -EINVAL;
-       if (log->slot >= KVM_USER_MEM_SLOTS)
-               goto out;
-
-       memslot = id_to_memslot(kvm->memslots, log->slot);
-
-       dirty_bitmap = memslot->dirty_bitmap;
-       r = -ENOENT;
-       if (!dirty_bitmap)
-               goto out;
-
-       n = kvm_dirty_bitmap_bytes(memslot);
-
-       dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
-       memset(dirty_bitmap_buffer, 0, n);
-
-       spin_lock(&kvm->mmu_lock);
-
-       for (i = 0; i < n / sizeof(long); i++) {
-               unsigned long mask;
-               gfn_t offset;
-
-               if (!dirty_bitmap[i])
-                       continue;
-
-               is_dirty = true;
-
-               mask = xchg(&dirty_bitmap[i], 0);
-               dirty_bitmap_buffer[i] = mask;
-
-               offset = i * BITS_PER_LONG;
-               kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
-       }
-       if (is_dirty)
-               kvm_flush_remote_tlbs(kvm);
-
-       spin_unlock(&kvm->mmu_lock);
-
-       r = -EFAULT;
-       if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
-               goto out;
-
-       r = 0;
-out:
-       mutex_unlock(&kvm->slots_lock);
-       return r;
-}
-
 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
                        bool line_status)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1d11912..879815c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -432,6 +432,88 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
        return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 }
 
+/**
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
+ *
+ * Shared x86 and ARM.
+ *
+ * We need to keep it in mind that VCPU threads can write to the bitmap
+ * concurrently.  So, to avoid losing data, we keep the following order for
+ * each bit:
+ *
+ *   1. Take a snapshot of the bit and clear it if needed.
+ *   2. Write protect the corresponding page.
+ *   3. Flush TLB's if needed.
+ *   4. Copy the snapshot to the userspace.
+ *
+ * Between 2 and 3, the guest may write to the page using the remaining TLB
+ * entry.  This is not a problem because the page will be reported dirty at
+ * step 4 using the snapshot taken before and step 3 ensures that successive
+ * writes will be logged for the next call.
+ *
+ */
+int __weak kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                                               struct kvm_dirty_log *log)
+{
+       int r;
+       struct kvm_memory_slot *memslot;
+       unsigned long n, i;
+       unsigned long *dirty_bitmap;
+       unsigned long *dirty_bitmap_buffer;
+       bool is_dirty = false;
+
+       mutex_lock(&kvm->slots_lock);
+
+       r = -EINVAL;
+       if (log->slot >= KVM_USER_MEM_SLOTS)
+               goto out;
+
+       memslot = id_to_memslot(kvm->memslots, log->slot);
+
+       dirty_bitmap = memslot->dirty_bitmap;
+       r = -ENOENT;
+       if (!dirty_bitmap)
+               goto out;
+
+       n = kvm_dirty_bitmap_bytes(memslot);
+
+       dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
+       memset(dirty_bitmap_buffer, 0, n);
+
+       spin_lock(&kvm->mmu_lock);
+
+       for (i = 0; i < n / sizeof(long); i++) {
+               unsigned long mask;
+               gfn_t offset;
+
+               if (!dirty_bitmap[i])
+                       continue;
+
+               is_dirty = true;
+
+               mask = xchg(&dirty_bitmap[i], 0);
+               dirty_bitmap_buffer[i] = mask;
+
+               offset = i * BITS_PER_LONG;
+               kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
+       }
+       if (is_dirty)
+               kvm_flush_remote_tlbs(kvm);
+
+       spin_unlock(&kvm->mmu_lock);
+
+       r = -EFAULT;
+       if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+               goto out;
+
+       r = 0;
+out:
+       mutex_unlock(&kvm->slots_lock);
+       return r;
+}
+
 #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to