Didn't realize s390 doesn't need those at all. Do you think mmu_notifier.h
should also go under asm/ (i.e. asm/mmu_notifier.h)? We can always move them
there later, after merging, with some compat code if needed.
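If it does end up under asm/, I'd expect the arch header to start out as a
trivial forwarder; something like this hypothetical
include/asm-x86/mmu_notifier.h (name and location are only an example):

#ifndef _ASM_X86_MMU_NOTIFIER_H
#define _ASM_X86_MMU_NOTIFIER_H

/*
 * Hypothetical per-arch wrapper: nothing arch-specific yet, it just
 * forwards to the generic header so callers could switch to
 * <asm/mmu_notifier.h> without code changes.
 */
#include <linux/mmu_notifier.h>

#endif /* _ASM_X86_MMU_NOTIFIER_H */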
Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4086080..c527d7d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -18,6 +18,7 @@ config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
 	depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL
 	select PREEMPT_NOTIFIERS
+	select MMU_NOTIFIER
 	select ANON_INODES
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 635e70c..80ebc19 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -524,6 +524,110 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		kvm_flush_remote_tlbs(kvm);
 }
 
+static void kvm_unmap_spte(struct kvm *kvm, u64 *spte)
+{
+	struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+	get_page(page);
+	rmap_remove(kvm, spte);
+	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+	kvm_flush_remote_tlbs(kvm);
+	__free_page(page);
+}
+
+static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte, *curr_spte;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		BUG_ON(!(*spte & PT_PRESENT_MASK));
+		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+		curr_spte = spte;
+		spte = rmap_next(kvm, rmapp, spte);
+		kvm_unmap_spte(kvm, curr_spte);
+	}
+}
+
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	int i;
+
+	/*
+	 * If mmap_sem isn't taken, we can look at the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+	u64 *spte;
+	int young = 0;
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		int _young;
+		u64 _spte = *spte;
+		BUG_ON(!(_spte & PT_PRESENT_MASK));
+		_young = _spte & PT_ACCESSED_MASK;
+		if (_young) {
+			young = !!_young;
+			set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+	return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	int i;
+	int young = 0;
+
+	/*
+	 * If mmap_sem isn't taken, we can look at the memslots with only
+	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i < kvm->nmemslots; i++) {
+		struct kvm_memory_slot *memslot = &kvm->memslots[i];
+		unsigned long start = memslot->userspace_addr;
+		unsigned long end;
+
+		/* mmu_lock protects userspace_addr */
+		if (!start)
+			continue;
+
+		end = start + (memslot->npages << PAGE_SHIFT);
+		if (hva >= start && hva < end) {
+			gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+			young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
+		}
+	}
+	spin_unlock(&kvm->mmu_lock);
+
+	if (young)
+		kvm_flush_remote_tlbs(kvm);
+
+	return young;
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8f94a0b..f556af6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3167,6 +3167,46 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vcpu->arch.pio_data);
 }
 
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+	struct kvm_arch *kvm_arch;
+	kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier);
+	return container_of(kvm_arch, struct kvm, arch);
+}
+
+void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+				      struct mm_struct *mm,
+				      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	kvm_unmap_hva(kvm, address);
+}
+
+void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long start, unsigned long end,
+				       int lock)
+{
+	for (; start < end; start += PAGE_SIZE)
+		kvm_mmu_notifier_invalidate_page(mn, mm, start);
+}
+
+int kvm_mmu_notifier_age_page(struct mmu_notifier *mn,
+			      struct mm_struct *mm,
+			      unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	BUG_ON(mm != kvm->mm);
+	return kvm_age_hva(kvm, address);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+	.invalidate_range = kvm_mmu_notifier_invalidate_range,
+	.invalidate_page = kvm_mmu_notifier_invalidate_page,
+	.age_page = kvm_mmu_notifier_age_page,
+};
+
 struct kvm *kvm_arch_create_vm(void)
 {
 	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
@@ -3176,6 +3216,9 @@ struct kvm *kvm_arch_create_vm(void)
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 
+	kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+	mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+
 	return kvm;
 }
 
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 67ae307..72ba267 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@
 
 #include <linux/types.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -287,6 +288,8 @@ struct kvm_arch{
 	int round_robin_prev_vcpu;
 	unsigned int tss_addr;
 	struct page *apic_access_page;
+
+	struct mmu_notifier mmu_notifier;
 };
 
 struct kvm_vm_stat {
@@ -404,6 +407,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
 void kvm_mmu_zap_all(struct kvm *kvm);

> > @@ -1265,7 +1266,11 @@ static int kvm_resume(struct sys_device *dev)
> >  }
> > 
> >  static struct sysdev_class kvm_sysdev_class = {
> > +#ifdef set_kset_name
> >  	set_kset_name("kvm"),
> > +#else
> > +	.name = "kvm",
> > +#endif
> >  	.suspend = kvm_suspend,
> >  	.resume = kvm_resume,
> > };
> > 
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 4295623..a67e38f 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -298,7 +299,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
> >  		memset(new.rmap, 0, npages * sizeof(*new.rmap));
> > 
> >  		new.user_alloc = user_alloc;
> > -		new.userspace_addr = mem->userspace_addr;
> > +		/*
> > +		 * hva_to_rmmap() serializes with the mmu_lock and to be
> > +		 * safe it has to ignore memslots with !user_alloc &&
> > +		 * !userspace_addr.
> > +		 */
> > +		if (user_alloc)
> > +			new.userspace_addr = mem->userspace_addr;
> > +		else
> > +			new.userspace_addr = 0;
> >  	}
> > 
> >  	/* Allocate page dirty bitmap if needed */
> > @@ -311,14 +320,18 @@ int __kvm_set_memory_region(struct kvm *kvm,
> >  		memset(new.dirty_bitmap, 0, dirty_bytes);
> >  	}
> > 
> > +	spin_lock(&kvm->mmu_lock);
> >  	if (mem->slot >= kvm->nmemslots)
> >  		kvm->nmemslots = mem->slot + 1;
> > 
> >  	*memslot = new;
> > +	spin_unlock(&kvm->mmu_lock);
> > 
> >  	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
> >  	if (r) {
> > +		spin_lock(&kvm->mmu_lock);
> >  		*memslot = old;
> > +		spin_unlock(&kvm->mmu_lock);
> >  		goto out_free;
> >  	}
> 
> This needs to go to arch too.

The mmu_lock isn't a s390 thing, so I doubt we should have different
locking rules for each arch. The memslots are common code, and the
locking primitives required to access them should be common too:
either the data structure is common and its locking is common, or the
data structure should be per-arch. I realize you don't currently need
to browse the memslots with only the mmu_lock held, but it doesn't
sound good to have different locking rules in each arch for a common
data structure. The performance impact of the additional mmu_lock
there should be zero.
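To make the convention concrete, here is a minimal sketch (the helper
names publish_memslot/find_memslot_by_hva are made up for illustration;
the fields are the same struct kvm / struct kvm_memory_slot ones the
patch above already uses): the writer publishes memslot updates under
mmu_lock, and a reader holding only mmu_lock skips any slot whose
userspace_addr is still zero.

/* Sketch only: hypothetical helper names, not part of the patch. */
static void publish_memslot(struct kvm *kvm, int slot,
			    struct kvm_memory_slot *new)
{
	/* Writer side: the slot array is only updated under mmu_lock. */
	spin_lock(&kvm->mmu_lock);
	if (slot >= kvm->nmemslots)
		kvm->nmemslots = slot + 1;
	kvm->memslots[slot] = *new;
	spin_unlock(&kvm->mmu_lock);
}

static struct kvm_memory_slot *find_memslot_by_hva(struct kvm *kvm,
						   unsigned long hva)
{
	int i;

	/* Reader side: caller holds kvm->mmu_lock; mmap_sem not needed. */
	for (i = 0; i < kvm->nmemslots; i++) {
		struct kvm_memory_slot *slot = &kvm->memslots[i];
		unsigned long start = slot->userspace_addr;

		if (!start)	/* slot not set up yet: ignore it */
			continue;
		if (hva >= start &&
		    hva < start + (slot->npages << PAGE_SHIFT))
			return slot;
	}
	return NULL;
}

The writer path isn't hot, so the extra lock/unlock shouldn't be
measurable anywhere, while the reader side is exactly what the mmu
notifier methods above rely on.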