This implements the low-level functions called by the MMU notifiers in
the generic KVM code, and defines KVM_ARCH_WANT_MMU_NOTIFIER when
CONFIG_KVM_BOOK3S_64_HV is set, so that the generic KVM MMU notifiers
get included.

That means we also have to take notice of when PTE invalidations are
in progress, as indicated by mmu_notifier_retry().  In kvmppc_h_enter,
if any invalidation is in progress we just install a non-present HPTE.
In kvmppc_book3s_hv_page_fault, if an invalidation is in progress we
just return without resolving the fault, so the guest encounters
another page fault immediately.  This is better than spinning inside
kvmppc_book3s_hv_page_fault because this way the guest can be
preempted by a hypervisor decrementer interrupt without us having to
do any special checks.
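
For reference, both paths use the standard mmu_notifier sequence-check
pattern; here is a condensed sketch of the logic added below (not a
complete function):

	/* sample the notifier sequence before translating the hva */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	npages = get_user_pages_fast(hva, 1, 1, pages);
	...
	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq)) {
		/* an invalidation raced with us; don't map the page */
		goto out_unlock;
	}
	/* safe to update the HPTE while holding kvm->mmu_lock */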

We currently maintain a referenced bit in the rmap array, and when we
clear it, we make all the HPTEs that map the corresponding page
non-present, as if the page had been invalidated.  In the future we
could use the hardware reference bit in the guest HPT instead.
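
As a purely hypothetical sketch of that future direction (not part of
this patch), harvesting the architected R bit from the second HPTE
doubleword would look something like:

	/* hypothetical: use the hardware reference bit instead */
	if (hptep[1] & HPTE_R_R) {
		hptep[1] &= ~HPTE_R_R;	/* would also need a TLB flush */
		referenced = 1;
	}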

The kvm_set_spte_hva function is implemented the same way as
kvm_unmap_hva.  The former appears to be unused anyway.

This all means that on processors that support virtual partition
memory (POWER7), we can claim support for the KVM_CAP_SYNC_MMU
capability, and we no longer have to pin all the guest memory.
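
For completeness, a minimal userspace sketch (illustrative only) for
detecting the new capability:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int sync_mmu_supported(void)
	{
		int fd = open("/dev/kvm", O_RDWR);
		int r = fd >= 0 &&
			ioctl(fd, KVM_CHECK_EXTENSION, KVM_CAP_SYNC_MMU) > 0;

		if (fd >= 0)
			close(fd);
		return r;
	}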

Signed-off-by: Paul Mackerras <pau...@samba.org>
---
 arch/powerpc/include/asm/kvm_host.h |   13 +++
 arch/powerpc/kvm/Kconfig            |    1 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c |  160 ++++++++++++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv.c        |   25 +++--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c |   34 ++++++-
 arch/powerpc/kvm/powerpc.c          |    3 +
 6 files changed, 218 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 3dfac3d..79bfc69 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -44,6 +44,19 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+#include <linux/mmu_notifier.h>
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+
+struct kvm;
+extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+
+#endif
+
 /* We don't currently support large pages. */
 #define KVM_HPAGE_GFN_SHIFT(x) 0
 #define KVM_NR_PAGE_SIZES      1
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 78133de..8f64709 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -69,6 +69,7 @@ config KVM_BOOK3S_64
 config KVM_BOOK3S_64_HV
        bool "KVM support for POWER7 and PPC970 using hypervisor mode in host"
        depends on KVM_BOOK3S_64
+       select MMU_NOTIFIER
        ---help---
          Support running unmodified book3s_64 guest kernels in
          virtual machines on POWER7 and PPC970 processors that have
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index e93c789..8c497b8 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -138,6 +138,15 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
        hp1 = hpte1_pgsize_encoding(psize) |
                HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
 
+       spin_lock(&kvm->mmu_lock);
+       /* wait until no invalidations are in progress */
+       while (kvm->mmu_notifier_count) {
+               spin_unlock(&kvm->mmu_lock);
+               while (kvm->mmu_notifier_count)
+                       cpu_relax();
+               spin_lock(&kvm->mmu_lock);
+       }
+
        for (i = 0; i < npages; ++i) {
                addr = i << porder;
                if (pfns) {
@@ -185,6 +194,7 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem)
                                KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT;
                }
        }
+       spin_unlock(&kvm->mmu_lock);
 }
 
 int kvmppc_mmu_hv_init(void)
@@ -506,7 +516,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        struct kvm *kvm = vcpu->kvm;
        struct kvmppc_slb *slbe;
        unsigned long *hptep, hpte[3];
-       unsigned long psize, pte_size;
+       unsigned long mmu_seq, psize, pte_size;
        unsigned long gfn, hva, pfn, amr;
        struct kvm_memory_slot *memslot;
        unsigned long *rmap;
@@ -581,6 +591,11 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        if (kvm->arch.slot_pfns[memslot->id])
                return -EFAULT;         /* should never get here */
        hva = gfn_to_hva_memslot(memslot, gfn);
+
+       /* used to check for invalidations in progress */
+       mmu_seq = kvm->mmu_notifier_seq;
+       smp_rmb();
+
        npages = get_user_pages_fast(hva, 1, 1, pages);
        if (npages < 1)
                return -EFAULT;
@@ -596,9 +611,15 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                goto out_put;
        pfn = page_to_pfn(page);
 
+       /* Check if we might have been invalidated; let the guest retry if so */
+       ret = RESUME_GUEST;
+       spin_lock(&kvm->mmu_lock);
+       if (mmu_notifier_retry(vcpu, mmu_seq))
+               goto out_unlock;
+
        /* Set the HPTE to point to pfn */
        ret = RESUME_GUEST;
-       hptep = (unsigned long *)kvm->arch.hpt_virt + (index << 1);
+       hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
        rev = &kvm->arch.revmap[index];
        while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
                cpu_relax();
@@ -606,7 +627,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
            rev->guest_rpte != hpte[2]) {
                /* HPTE has been changed under us; let the guest retry */
                hptep[0] &= ~HPTE_V_HVLOCK;
-               goto out_put;
+               goto out_unlock;
        }
        hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
        hpte[1] = (rev->guest_rpte & ~(HPTE_R_PP0 - pte_size)) |
@@ -617,6 +638,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        if (page)
                SetPageDirty(page);
 
+ out_unlock:
+       spin_unlock(&kvm->mmu_lock);
  out_put:
        if (page)
                put_page(page);
@@ -635,6 +658,137 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        return RESUME_GUEST;
 }
 
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+                         int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+                                        unsigned long gfn))
+{
+       int i;
+       int ret;
+       int retval = 0;
+       struct kvm_memslots *slots;
+
+       slots = kvm_memslots(kvm);
+       for (i = 0; i < slots->nmemslots; i++) {
+               struct kvm_memory_slot *memslot = &slots->memslots[i];
+               unsigned long start = memslot->userspace_addr;
+               unsigned long end;
+
+               end = start + (memslot->npages << PAGE_SHIFT);
+               if (hva >= start && hva < end) {
+                       gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+
+                       ret = handler(kvm, &memslot->rmap[gfn_offset],
+                                     memslot->base_gfn + gfn_offset);
+                       retval |= ret;
+               }
+       }
+
+       return retval;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+                          unsigned long gfn)
+{
+       struct revmap_entry *rev = kvm->arch.revmap;
+       unsigned long h, i, j;
+       unsigned long *hptep, new_hpte[2];
+       unsigned long ptel, psize;
+       int n = 0;
+
+       for (;;) {
+               while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
+                       cpu_relax();
+               if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+                       __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
+                       break;
+               }
+
+               /*
+                * To avoid an ABBA deadlock with the HPTE lock bit,
+                * we have to unlock the rmap chain before locking the HPTE.
+                * Thus we remove the first entry, unlock the rmap chain,
+                * lock the HPTE and then check that it is for the
+                * page we're unmapping before changing it to non-present.
+                */
+               i = *rmapp & KVMPPC_RMAP_INDEX;
+               j = rev[i].forw;
+               if (j == i) {
+                       /* chain is now empty */
+                       j = 0;
+               } else {
+                       /* remove i from chain */
+                       h = rev[i].back;
+                       rev[h].forw = j;
+                       rev[j].back = h;
+                       rev[i].forw = rev[i].back = i;
+                       j |= KVMPPC_RMAP_PRESENT;
+               }
+               smp_wmb();
+               *rmapp = j | (1ul << KVMPPC_RMAP_REF_BIT);
+
+               /* Now lock, check and modify the HPTE */
+               hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
+               while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+                       cpu_relax();
+               ptel = rev[i].guest_rpte;
+               psize = hpte_page_size(hptep[0], ptel);
+               if ((hptep[0] & HPTE_V_VALID) &&
+                   hpte_rpn(ptel, psize) == gfn) {
+                       new_hpte[0] = hptep[0] | HPTE_V_ABSENT;
+                       if ((new_hpte[0] & 0xffffffffff000000ul) ==
+                           (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))))
+                               new_hpte[0] &= ~HPTE_V_VALID;
+                       new_hpte[1] = (ptel & ~(HPTE_R_PP0 - psize)) |
+                               HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
+                       kvmppc_modify_hpte(kvm, hptep, new_hpte, i);
+                       ++n;
+               } else {
+                       hptep[0] &= ~HPTE_V_HVLOCK;
+               }
+       }
+       return 0;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+       kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+       return 0;
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+                        unsigned long gfn)
+{
+       if (!(*rmapp & KVMPPC_RMAP_REFERENCED))
+               return 0;
+       kvm_unmap_rmapp(kvm, rmapp, gfn);
+       while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp))
+               cpu_relax();
+       __clear_bit(KVMPPC_RMAP_REF_BIT, rmapp);
+       __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp);
+       return 1;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+}
+
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+                             unsigned long gfn)
+{
+       return !!(*rmapp & KVMPPC_RMAP_REFERENCED);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+       kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+}
+
 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
                            unsigned long *nb_ret)
 {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 47053e9..9e67320 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1278,10 +1278,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                        ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
        }
 
-       pfns = vzalloc(npages * sizeof(unsigned long));
-       if (!pfns)
-               return -ENOMEM;
-       kvm->arch.slot_pfns[mem->slot] = pfns;
+       if (!cpu_has_feature(CPU_FTR_ARCH_206)) {
+               pfns = vzalloc(npages * sizeof(unsigned long));
+               if (!pfns)
+                       return -ENOMEM;
+               kvm->arch.slot_pfns[mem->slot] = pfns;
+       }
 
        return 0;
 
@@ -1305,12 +1307,14 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
                return;
 
        pfns = kvm->arch.slot_pfns[mem->slot];
-       npages = mem->memory_size >> porder;
-       for (i = 0; i < npages; ++i) {
-               hva = mem->userspace_addr + (i << porder);
-               page = hva_to_page(hva);
-               if (page)
-                       pfns[i] = page_to_pfn(page);
+       if (pfns) {
+               npages = mem->memory_size >> porder;
+               for (i = 0; i < npages; ++i) {
+                       hva = mem->userspace_addr + (i << porder);
+                       page = hva_to_page(hva);
+                       if (page)
+                               pfns[i] = page_to_pfn(page);
+               }
        }
 
        if (mem->guest_phys_addr == 0 && mem->memory_size != 0 &&
@@ -1384,6 +1388,7 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
                                page = pfn_to_page(pfns[j]);
                                if (PageHuge(page))
                                        page = compound_head(page);
+                               SetPageDirty(page);
                                put_page(page);
                        }
                }
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 622bfcd..2cadd06 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -143,11 +143,17 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
        unsigned long *rmap;
        pte_t *ptep;
        unsigned int shift;
+       unsigned long mmu_seq;
+       long err;
 
        psize = hpte_page_size(pteh, ptel);
        if (!psize)
                return H_PARAMETER;
 
+       /* used later to detect if we might have been invalidated */
+       mmu_seq = kvm->mmu_notifier_seq;
+       smp_rmb();
+
        /* Find the memslot (if any) for this address */
        gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
        gfn = gpa >> PAGE_SHIFT;
@@ -212,6 +218,18 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                        return H_PARAMETER;
        }
 
+       /*
+        * Now that we're about to write the HPTE and thus give the guest
+        * access to the page, check for any pending invalidations.
+        * We don't need to worry about that if this is a non-present page.
+        * Note that the HPTE bitlock has to nest inside the kvm->mmu_lock.
+        */
+       spin_lock(&kvm->mmu_lock);
+       if (mmu_notifier_retry(vcpu, mmu_seq))
+               /* inval in progress, write a non-present HPTE */
+               pa = 0;
+
+       err = H_PARAMETER;
        if (!pa) {
                /*
                 * If this is a non-present page for any reason
@@ -222,7 +240,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                 * On 970 we have to have all pages present.
                 */
                if (!cpu_has_feature(CPU_FTR_ARCH_206))
-                       return H_PARAMETER;
+                       goto out;
                pteh |= HPTE_V_ABSENT;
                if ((pteh & 0xffffffffff000000ul) ==
                    (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))))
@@ -231,14 +249,16 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                        ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO | HPTE_R_N;
        }
 
+       /* Find and lock the HPTEG slot to use */
        if (pte_index >= HPT_NPTE)
-               return H_PARAMETER;
+               goto out;
+       err = H_PTEG_FULL;
        if (likely((flags & H_EXACT) == 0)) {
                pte_index &= ~7UL;
                hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
                for (i = 0; ; ++i) {
                        if (i == 8)
-                               return H_PTEG_FULL;
+                               goto out;
                        if ((*hpte & HPTE_V_VALID) == 0 &&
                            try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
                                          HPTE_V_ABSENT))
@@ -250,7 +270,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
                if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
                                   HPTE_V_ABSENT))
-                       return H_PTEG_FULL;
+                       goto out;
        }
 
        /* Save away the guest's idea of the second HPTE dword */
@@ -272,7 +292,11 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
        asm volatile("ptesync" : : : "memory");
 
        vcpu->arch.gpr[4] = pte_index;
-       return H_SUCCESS;
+       err = H_SUCCESS;
+
+ out:
+       spin_unlock(&kvm->mmu_lock);
+       return err;
 }
 
 #define LOCK_TOKEN     (*(u32 *)(&get_paca()->lock_token))
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 084d1c5..0f10a04 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -244,6 +244,9 @@ int kvm_dev_ioctl_check_extension(long ext)
                if (cpu_has_feature(CPU_FTR_ARCH_201))
                        r = 2;
                break;
+       case KVM_CAP_SYNC_MMU:
+               r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
+               break;
 #endif
        default:
                r = 0;
-- 
1.7.7.2
