I updated the mmu notifier patch. Shutting down the vm in
mmu_notifier_release, instead of waiting for the last filp->release,
turned out to be a problem: all vcpus would already be freed there, so
filp->private_data would point to freed memory by the time the vcpu fd
is closed.

It seems we may not really need to do anything in the mmu notifier
->release method: for us the shadow pagetables are meaningless once no
guest can run, and ->release is only invoked after all tasks with
current->mm == kvm->mm have quit. At that point the guest can't
possibly run anymore and the ioctls become useless too. So I changed
the code to only invalidate the root of the spte radix tree in every
vcpu, purely for debugging: if any guest attempts to run after the mmu
notifier ->release has run, we'll notice. No spte can be established
after ->release returns.
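
For context, here is a minimal sketch, outside the patch, of how such a
notifier with only a ->release method is wired up against current->mm;
the _example names are made up, but the registration call and the ops
layout are the same ones the patch uses below. The point is the
ordering: ->release fires once every task with that mm has quit, which
can be before the kvm fd's own filp->release runs.

#include <linux/mmu_notifier.h>
#include <linux/sched.h>

static void example_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
        /*
         * No guest can run anymore here: every task with
         * current->mm == mm has already quit, so tearing down the
         * shadow pagetables is optional; invalidating the roots is
         * enough to catch any later attempt to run the guest.
         */
}

static const struct mmu_notifier_ops example_ops = {
        .release = example_release,
};

static struct mmu_notifier example_mn = {
        .ops = &example_ops,
};

static int example_attach(void)
{
        /* same call the patch makes in kvm_arch_create_vm() */
        return mmu_notifier_register(&example_mn, current->mm);
}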

Probably ->release shouldn't be mandatory to implement, but from a
different point of view it may also pay off to make all methods
mandatory, as a micro-optimization that avoids the null pointer check
before invoking each notifier method (and, in the future, to fail
registration if the API is extended and a module isn't updated, which
decreases the risk of runtime failures).
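
To make that trade-off concrete, a small illustration in plain C
(nothing from the patch, all names invented): with optional methods
every call site needs a null check, while mandatory methods allow a
direct call and let registration reject an incomplete ops table up
front.

struct ops_example {
        void (*release)(void *priv);
};

/* optional method: every invocation pays for the null check */
void call_release_optional(const struct ops_example *ops, void *priv)
{
        if (ops->release)
                ops->release(priv);
}

/* mandatory method: direct call, and an incomplete ops table can be
 * rejected at registration time instead of failing at runtime */
int register_ops_example(const struct ops_example *ops)
{
        return ops->release ? 0 : -1;   /* -EINVAL in kernel code */
}

void call_release_mandatory(const struct ops_example *ops, void *priv)
{
        ops->release(priv);
}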

Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fab..ce3251c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
        depends on HAVE_KVM
        select PREEMPT_NOTIFIERS
+       select MMU_NOTIFIER
        select ANON_INODES
        ---help---
          Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3d769c3..978da9b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -651,6 +651,101 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
        account_shadowed(kvm, gfn);
 }
 
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+       u64 *spte, *curr_spte;
+       int need_tlb_flush = 0;
+
+       spte = rmap_next(kvm, rmapp, NULL);
+       while (spte) {
+               BUG_ON(!(*spte & PT_PRESENT_MASK));
+               rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+               curr_spte = spte;
+               spte = rmap_next(kvm, rmapp, spte);
+               rmap_remove(kvm, curr_spte);
+               set_shadow_pte(curr_spte, shadow_trap_nonpresent_pte);
+               need_tlb_flush = 1;
+       }
+       return need_tlb_flush;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+       int i;
+       int need_tlb_flush = 0;
+
+       /*
+        * If mmap_sem isn't taken, we can look at the memslots with only
+        * the mmu_lock by skipping over the slots with userspace_addr == 0.
+        */
+       for (i = 0; i < kvm->nmemslots; i++) {
+               struct kvm_memory_slot *memslot = &kvm->memslots[i];
+               unsigned long start = memslot->userspace_addr;
+               unsigned long end;
+
+               /* mmu_lock protects userspace_addr */
+               if (!start)
+                       continue;
+
+               end = start + (memslot->npages << PAGE_SHIFT);
+               if (hva >= start && hva < end) {
+                       gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+                       need_tlb_flush |= kvm_unmap_rmapp(kvm,
+                                                         &memslot->rmap[gfn_offset]);
+               }
+       }
+
+       return need_tlb_flush;
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+       u64 *spte;
+       int young = 0;
+
+       spte = rmap_next(kvm, rmapp, NULL);
+       while (spte) {
+               int _young;
+               u64 _spte = *spte;
+               BUG_ON(!(_spte & PT_PRESENT_MASK));
+               _young = _spte & PT_ACCESSED_MASK;
+               if (_young) {
+                       young = !!_young;
+                       set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
+               }
+               spte = rmap_next(kvm, rmapp, spte);
+       }
+       return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       int i;
+       int young = 0;
+
+       /*
+        * If mmap_sem isn't taken, we can look at the memslots with only
+        * the mmu_lock by skipping over the slots with userspace_addr == 0.
+        */
+       for (i = 0; i < kvm->nmemslots; i++) {
+               struct kvm_memory_slot *memslot = &kvm->memslots[i];
+               unsigned long start = memslot->userspace_addr;
+               unsigned long end;
+
+               /* mmu_lock protects userspace_addr */
+               if (!start)
+                       continue;
+
+               end = start + (memslot->npages << PAGE_SHIFT);
+               if (hva >= start && hva < end) {
+                       gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+                       young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
+               }
+       }
+
+       return young;
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
@@ -1189,6 +1284,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
        int r;
        int largepage = 0;
        pfn_t pfn;
+       int mmu_seq;
 
        down_read(&current->mm->mmap_sem);
        if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
@@ -1196,6 +1292,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
                largepage = 1;
        }
 
+       mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+       /* implicit mb(), we'll read before PT lock is unlocked */
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
        up_read(&current->mm->mmap_sem);
 
@@ -1206,6 +1304,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
        }
 
        spin_lock(&vcpu->kvm->mmu_lock);
+       if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+               goto out_unlock;
+       smp_rmb();
+       if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+               goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
                         PT32E_ROOT_LEVEL);
@@ -1213,6 +1316,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 
        return r;
+
+out_unlock:
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       kvm_release_pfn_clean(pfn);
+       return 0;
 }
 
 
@@ -1230,9 +1338,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
        int i;
        struct kvm_mmu_page *sp;
 
-       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-               return;
        spin_lock(&vcpu->kvm->mmu_lock);
+       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               goto out;
        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->arch.mmu.root_hpa;
 
@@ -1240,9 +1348,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
                --sp->root_count;
                if (!sp->root_count && sp->role.invalid)
                        kvm_mmu_zap_page(vcpu->kvm, sp);
-               vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-               spin_unlock(&vcpu->kvm->mmu_lock);
-               return;
+               goto out_invalid;
        }
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->arch.mmu.pae_root[i];
@@ -1256,8 +1362,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
                }
                vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
        }
-       spin_unlock(&vcpu->kvm->mmu_lock);
+out_invalid:
        vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+out:
+       spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
@@ -1340,6 +1448,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
        int r;
        int largepage = 0;
        gfn_t gfn = gpa >> PAGE_SHIFT;
+       int mmu_seq;
 
        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1353,6 +1462,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
                largepage = 1;
        }
+       mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+       /* implicit mb(), we'll read before PT lock is unlocked */
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
        up_read(&current->mm->mmap_sem);
        if (is_error_pfn(pfn)) {
@@ -1360,12 +1471,22 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
                return 1;
        }
        spin_lock(&vcpu->kvm->mmu_lock);
+       if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+               goto out_unlock;
+       smp_rmb();
+       if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+               goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
                         largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
        spin_unlock(&vcpu->kvm->mmu_lock);
 
        return r;
+
+out_unlock:
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       kvm_release_pfn_clean(pfn);
+       return 0;
 }
 
 static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1621,18 +1742,20 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
        return !!(spte && (*spte & shadow_accessed_mask));
 }
 
-static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                                         const u8 *new, int bytes)
+static int mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                        const u8 *new, int bytes,
+                                        gfn_t *_gfn, pfn_t *_pfn,
+                                        int *_mmu_seq, int *_largepage)
 {
        gfn_t gfn;
        int r;
        u64 gpte = 0;
        pfn_t pfn;
-
-       vcpu->arch.update_pte.largepage = 0;
+       int mmu_seq;
+       int largepage;
 
        if (bytes != 4 && bytes != 8)
-               return;
+               return 0;
 
        /*
         * Assume that the pte write on a page table of the same type
@@ -1645,7 +1768,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                if ((bytes == 4) && (gpa % 4 == 0)) {
                        r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
                        if (r)
-                               return;
+                               return 0;
                        memcpy((void *)&gpte + (gpa % 8), new, 4);
                } else if ((bytes == 8) && (gpa % 8 == 0)) {
                        memcpy((void *)&gpte, new, 8);
@@ -1655,23 +1778,30 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                        memcpy((void *)&gpte, new, 4);
        }
        if (!is_present_pte(gpte))
-               return;
+               return 0;
        gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
+       largepage = 0;
        down_read(&current->mm->mmap_sem);
        if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
-               vcpu->arch.update_pte.largepage = 1;
+               largepage = 1;
        }
+       mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+       /* implicit mb(), we'll read before PT lock is unlocked */
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
        up_read(&current->mm->mmap_sem);
 
-       if (is_error_pfn(pfn)) {
+       if (unlikely(is_error_pfn(pfn))) {
                kvm_release_pfn_clean(pfn);
-               return;
+               return 0;
        }
-       vcpu->arch.update_pte.gfn = gfn;
-       vcpu->arch.update_pte.pfn = pfn;
+
+       *_gfn = gfn;
+       *_pfn = pfn;
+       *_mmu_seq = mmu_seq;
+       *_largepage = largepage;
+       return 1;
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1694,9 +1824,24 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        int npte;
        int r;
 
+       int update_pte;
+       gfn_t gpte_gfn;
+       pfn_t pfn;
+       int mmu_seq;
+       int largepage;
+
        pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
-       mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+       update_pte = mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes,
+                                                  &gpte_gfn, &pfn,
+                                                  &mmu_seq, &largepage);
        spin_lock(&vcpu->kvm->mmu_lock);
+       if (update_pte) {
+               BUG_ON(!is_error_pfn(vcpu->arch.update_pte.pfn));
+               vcpu->arch.update_pte.gfn = gpte_gfn;
+               vcpu->arch.update_pte.pfn = pfn;
+               vcpu->arch.update_pte.mmu_seq = mmu_seq;
+               vcpu->arch.update_pte.largepage = largepage;
+       }
        kvm_mmu_free_some_pages(vcpu);
        ++vcpu->kvm->stat.mmu_pte_write;
        kvm_mmu_audit(vcpu, "pre pte write");
@@ -1775,11 +1920,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                }
        }
        kvm_mmu_audit(vcpu, "post pte write");
-       spin_unlock(&vcpu->kvm->mmu_lock);
        if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
                kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
                vcpu->arch.update_pte.pfn = bad_pfn;
        }
+       spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 156fe10..4ac73a6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,6 +263,12 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
        pfn = vcpu->arch.update_pte.pfn;
        if (is_error_pfn(pfn))
                return;
+       if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+               return;
+       smp_rmb();
+       if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) !=
+                    vcpu->arch.update_pte.mmu_seq))
+               return;
        kvm_get_pfn(pfn);
        mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
                     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -380,6 +386,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        int r;
        pfn_t pfn;
        int largepage = 0;
+       int mmu_seq;
 
        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
        kvm_mmu_audit(vcpu, "pre page fault");
@@ -413,6 +420,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                        largepage = 1;
                }
        }
+       mmu_seq = atomic_read(&vcpu->kvm->arch.mmu_notifier_seq);
+       /* implicit mb(), we'll read before PT lock is unlocked */
        pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
        up_read(&current->mm->mmap_sem);
 
@@ -424,6 +433,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        }
 
        spin_lock(&vcpu->kvm->mmu_lock);
+       if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_count)))
+               goto out_unlock;
+       smp_rmb();
+       if (unlikely(atomic_read(&vcpu->kvm->arch.mmu_notifier_seq) != mmu_seq))
+               goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
                                  largepage, &write_pt, pfn);
@@ -439,6 +453,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
        spin_unlock(&vcpu->kvm->mmu_lock);
 
        return write_pt;
+
+out_unlock:
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       kvm_release_pfn_clean(pfn);
+       return 0;
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 979f983..ceb8dee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/highmem.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
@@ -3888,16 +3889,127 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
        free_page((unsigned long)vcpu->arch.pio_data);
 }
 
-struct  kvm *kvm_arch_create_vm(void)
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 {
-       struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+       struct kvm_arch *kvm_arch;
+       kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier);
+       return container_of(kvm_arch, struct kvm, arch);
+}
 
-       if (!kvm)
-               return ERR_PTR(-ENOMEM);
+static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+                                            struct mm_struct *mm,
+                                            unsigned long address)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int need_tlb_flush;
 
-       INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+       /*
+        * When ->invalidate_page runs, the linux pte has been zapped
+        * already but the page is still allocated until
+        * ->invalidate_page returns. So if we increase the sequence
+        * here the kvm page fault will notice if the spte can't be
+        * established because the page is going to be freed. If
+        * instead the kvm page fault establishes the spte before
+        * ->invalidate_page runs, kvm_unmap_hva will release it
+        * before returning.
+        *
+        * No need of memory barriers as the sequence increase only
+        * needs to be seen at spin_unlock time, and not at spin_lock
+        * time.
+        *
+        * Increasing the sequence after the spin_unlock would be
+        * unsafe because the kvm page fault could then establish the
+        * pte after kvm_unmap_hva returned, without noticing the page
+        * is going to be freed.
+        */
+       atomic_inc(&kvm->arch.mmu_notifier_seq);
+       spin_lock(&kvm->mmu_lock);
+       need_tlb_flush = kvm_unmap_hva(kvm, address);
+       spin_unlock(&kvm->mmu_lock);
 
-       return kvm;
+       /* we have to flush the tlb before the pages can be freed */
+       if (need_tlb_flush)
+               kvm_flush_remote_tlbs(kvm);
+
+}
+
+static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+                                                   struct mm_struct *mm,
+                                                   unsigned long start,
+                                                   unsigned long end)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int need_tlb_flush = 0;
+
+       /*
+        * The count increase must become visible at unlock time as no
+        * spte can be established without taking the mmu_lock and
+        * count is also read inside the mmu_lock critical section.
+        */
+       atomic_inc(&kvm->arch.mmu_notifier_count);
+
+       spin_lock(&kvm->mmu_lock);
+       for (; start < end; start += PAGE_SIZE)
+               need_tlb_flush |= kvm_unmap_hva(kvm, start);
+       spin_unlock(&kvm->mmu_lock);
+
+       /* we have to flush the tlb before the pages can be freed */
+       if (need_tlb_flush)
+               kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+                                                 struct mm_struct *mm,
+                                                 unsigned long start,
+                                                 unsigned long end)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       /*
+        *
+        * This sequence increase will notify the kvm page fault that
+        * the page that is going to be mapped in the spte could have
+        * been freed.
+        *
+        * There's also an implicit mb() here, provided by the last
+        * PT lock taken to zap pagetables, which the read side has
+        * to take too in follow_page(). The
+        * sequence increase in the worst case will become visible to
+        * the kvm page fault after the spin_lock of the last PT lock
+        * of the last PT-lock-protected critical section preceding
+        * invalidate_range_end. So if the kvm page fault is about to
+        * establish the spte inside the mmu_lock, while we're freeing
+        * the pages, it will have to backoff and when it retries, it
+        * will have to take the PT lock before it can check the
+        * pagetables again. And after taking the PT lock it will
+        * re-establish the pte even if it will see the already
+        * increased sequence number before calling gfn_to_pfn.
+        */
+       atomic_inc(&kvm->arch.mmu_notifier_seq);
+       /*
+        * The sequence increase must be visible before count
+        * decrease. The page fault has to read count before sequence
+        * for this write order to be effective.
+        */
+       wmb();
+       atomic_dec(&kvm->arch.mmu_notifier_count);
+       BUG_ON(atomic_read(&kvm->arch.mmu_notifier_count) < 0);
+}
+
+static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+                                             struct mm_struct *mm,
+                                             unsigned long address)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int young;
+
+       spin_lock(&kvm->mmu_lock);
+       young = kvm_age_hva(kvm, address);
+       spin_unlock(&kvm->mmu_lock);
+
+       if (young)
+               kvm_flush_remote_tlbs(kvm);
+
+       return young;
 }
 
 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -3907,16 +4019,62 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
        vcpu_put(vcpu);
 }
 
-static void kvm_free_vcpus(struct kvm *kvm)
+static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
+                                    struct mm_struct *mm)
 {
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
        unsigned int i;
 
+       BUG_ON(mm != kvm->mm);
+
        /*
-        * Unpin any mmu pages first.
+        * All tasks with current->mm == mm have quit, and the guest
+        * and the ioctls can only run on tasks with current->mm == mm,
+        * so all shadow pagetables are already meaningless because no
+        * guest can run anymore at this point. We don't really need to, but
+        * we can set the roots invalid here just to be more strict.
         */
        for (i = 0; i < KVM_MAX_VCPUS; ++i)
                if (kvm->vcpus[i])
                        kvm_unload_vcpu_mmu(kvm->vcpus[i]);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+       .release                = kvm_mmu_notifier_release,
+       .invalidate_page        = kvm_mmu_notifier_invalidate_page,
+       .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
+       .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
+       .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
+};
+
+struct  kvm *kvm_arch_create_vm(void)
+{
+       struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+       int err;
+
+       if (!kvm)
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+
+       kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+       err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+       if (err) {
+               kfree(kvm);
+               return ERR_PTR(err);
+       }
+
+       return kvm;
+}
+
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+       unsigned int i;
+
+       for (i = 0; i < KVM_MAX_VCPUS; ++i)
+               if (kvm->vcpus[i])
+                       BUG_ON(kvm->vcpus[i]->arch.mmu.root_hpa !=
+                              INVALID_PAGE);
        for (i = 0; i < KVM_MAX_VCPUS; ++i) {
                if (kvm->vcpus[i]) {
                        kvm_arch_vcpu_free(kvm->vcpus[i]);
@@ -3931,6 +4089,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        kvm_free_pit(kvm);
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
+       /*
+        * kvm_mmu_notifier_release() will be called before
+        * mmu_notifier_unregister returns, if it didn't run
+        * already.
+        */
+       mmu_notifier_unregister(&kvm->arch.mmu_notifier, kvm->mm);
        kvm_free_vcpus(kvm);
        kvm_free_physmem(kvm);
        if (kvm->arch.apic_access_page)
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 1d8cd01..b9a1421 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -13,6 +13,7 @@
 
 #include <linux/types.h>
 #include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -247,6 +248,7 @@ struct kvm_vcpu_arch {
                gfn_t gfn;      /* presumed gfn during guest pte update */
                pfn_t pfn;      /* pfn corresponding to that gfn */
                int largepage;
+               int mmu_seq;
        } update_pte;
 
        struct i387_fxsave_struct host_fx_image;
@@ -317,6 +319,10 @@ struct kvm_arch{
 
        struct page *ept_identity_pagetable;
        bool ept_identity_pagetable_done;
+
+       struct mmu_notifier mmu_notifier;
+       atomic_t mmu_notifier_seq;
+       atomic_t mmu_notifier_count;
 };
 
 struct kvm_vm_stat {
@@ -441,6 +447,8 @@ void kvm_mmu_set_base_ptes(u64 base_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
                u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
 void kvm_mmu_zap_all(struct kvm *kvm);
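
As a footnote for reviewers, the mmu_notifier_seq/mmu_notifier_count
handshake that the page fault paths above check can be modelled in a
few lines of standalone C. All names here are illustrative, and the
real code additionally relies on the atomics, the implicit barriers
noted in the comments, and mmu_lock, which this single-threaded sketch
deliberately omits:

#include <stdio.h>

struct vm_model {
        int mmu_notifier_seq;   /* bumped by invalidate_page/_range_end */
        int mmu_notifier_count; /* nonzero between range_start and range_end */
};

/* what the fault path checks under mmu_lock after gfn_to_pfn() */
static int may_install_spte(const struct vm_model *vm, int sampled_seq)
{
        if (vm->mmu_notifier_count)               /* invalidate in progress */
                return 0;
        if (vm->mmu_notifier_seq != sampled_seq)  /* pfn may be stale */
                return 0;
        return 1;
}

int main(void)
{
        struct vm_model vm = { 0, 0 };
        int seq;

        seq = vm.mmu_notifier_seq;      /* fault path samples seq */
        vm.mmu_notifier_seq++;          /* ->invalidate_page runs meanwhile */
        printf("after invalidate: install=%d (expect 0, back off)\n",
               may_install_spte(&vm, seq));

        seq = vm.mmu_notifier_seq;      /* retry with a fresh sample */
        printf("clean retry:      install=%d (expect 1)\n",
               may_install_spte(&vm, seq));
        return 0;
}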
