This patch implements MMU notifiers for KVM RISC-V so that the guest
physical address space stays in sync with the host physical address space.

This will allow swapping, page migration, etc. to work transparently
with KVM RISC-V.
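
The notifier callbacks added here all follow one pattern: the MM core
reports a range of host virtual addresses (HVAs), and KVM walks every
memslot, clamps the range against the slot's userspace mapping, and
translates the overlap into guest physical addresses before invoking a
per-range handler (see handle_hva_to_gpa() below). A minimal standalone
sketch of that translation loop, using simplified stand-in types rather
than the kernel's actual structs:

  #include <stdint.h>
  #include <stddef.h>

  #define PAGE_SHIFT 12

  /* Simplified stand-in for the kernel's struct kvm_memory_slot. */
  struct slot {
          uint64_t userspace_addr;  /* HVA where the slot is mapped */
          uint64_t base_gfn;        /* first guest frame number */
          uint64_t npages;
  };

  /*
   * Clamp [start, end) against each slot and hand the overlap,
   * translated to a guest physical address, to the handler --
   * the same shape as handle_hva_to_gpa() in this patch.
   */
  static int for_each_hva_overlap(struct slot *slots, size_t nr,
                                  uint64_t start, uint64_t end,
                                  int (*handler)(uint64_t gpa,
                                                 uint64_t size))
  {
          int ret = 0;
          size_t i;

          for (i = 0; i < nr; i++) {
                  uint64_t s = slots[i].userspace_addr;
                  uint64_t e = s + (slots[i].npages << PAGE_SHIFT);
                  uint64_t lo = (start > s) ? start : s;
                  uint64_t hi = (end < e) ? end : e;
                  uint64_t gpa;

                  if (lo >= hi)
                          continue;

                  /* HVA -> GPA: slot base GPA plus offset into slot. */
                  gpa = (slots[i].base_gfn << PAGE_SHIFT) + (lo - s);
                  ret |= handler(gpa, hi - lo);
          }

          return ret;
  }

The individual callbacks (kvm_unmap_hva_range(), kvm_age_hva(), and so
on) then differ only in the handler they pass to this loop.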

Signed-off-by: Anup Patel <anup.pa...@wdc.com>
Acked-by: Paolo Bonzini <pbonz...@redhat.com>
Reviewed-by: Paolo Bonzini <pbonz...@redhat.com>
Reviewed-by: Alexander Graf <g...@amazon.com>
---
 arch/riscv/include/asm/kvm_host.h |   7 ++
 arch/riscv/kvm/Kconfig            |   1 +
 arch/riscv/kvm/mmu.c              | 200 +++++++++++++++++++++++++++++-
 arch/riscv/kvm/vm.c               |   1 +
 4 files changed, 208 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index bc27f664b443..79ceb2aa8ae6 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -193,6 +193,13 @@ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 int kvm_riscv_setup_vsip(void);
 void kvm_riscv_cleanup_vsip(void);
 
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva_range(struct kvm *kvm,
+                       unsigned long start, unsigned long end);
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+
 void __kvm_riscv_hfence_gvma_vmid_gpa(unsigned long vmid,
                                      unsigned long gpa);
 void __kvm_riscv_hfence_gvma_vmid(unsigned long vmid);
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index 9cca98c4673b..d8fa13b0da18 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -20,6 +20,7 @@ if VIRTUALIZATION
 config KVM
        tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)"
        depends on OF
+       select MMU_NOTIFIER
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        select KVM_MMIO
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 590669290139..d8a692d3e640 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -67,6 +67,66 @@ static void *stage2_cache_alloc(struct kvm_mmu_page_cache *pcache)
        return p;
 }
 
+static int stage2_pgdp_test_and_clear_young(pgd_t *pgd)
+{
+       return ptep_test_and_clear_young(NULL, 0, (pte_t *)pgd);
+}
+
+static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
+{
+       return ptep_test_and_clear_young(NULL, 0, (pte_t *)pmd);
+}
+
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+       return ptep_test_and_clear_young(NULL, 0, pte);
+}
+
+static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr,
+                                 pgd_t **pgdpp, pmd_t **pmdpp, pte_t **ptepp)
+{
+       pgd_t *pgdp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+
+       *pgdpp = NULL;
+       *pmdpp = NULL;
+       *ptepp = NULL;
+
+       pgdp = &kvm->arch.pgd[pgd_index(addr)];
+       if (!pgd_val(*pgdp))
+               return false;
+       if (pgd_val(*pgdp) & _PAGE_LEAF) {
+               *pgdpp = pgdp;
+               return true;
+       }
+
+       if (stage2_have_pmd) {
+               pmdp = (void *)pgd_page_vaddr(*pgdp);
+               pmdp = &pmdp[pmd_index(addr)];
+               if (!pmd_present(*pmdp))
+                       return false;
+               if (pmd_val(*pmdp) & _PAGE_LEAF) {
+                       *pmdpp = pmdp;
+                       return true;
+               }
+
+               ptep = (void *)pmd_page_vaddr(*pmdp);
+       } else {
+               ptep = (void *)pgd_page_vaddr(*pgdp);
+       }
+
+       ptep = &ptep[pte_index(addr)];
+       if (!pte_present(*ptep))
+               return false;
+       if (pte_val(*ptep) & _PAGE_LEAF) {
+               *ptepp = ptep;
+               return true;
+       }
+
+       return false;
+}
+
 struct local_guest_tlb_info {
        struct kvm_vmid *vmid;
        gpa_t addr;
@@ -450,6 +510,38 @@ int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
 
 }
 
+static int handle_hva_to_gpa(struct kvm *kvm,
+                            unsigned long start,
+                            unsigned long end,
+                            int (*handler)(struct kvm *kvm,
+                                           gpa_t gpa, u64 size,
+                                           void *data),
+                            void *data)
+{
+       struct kvm_memslots *slots;
+       struct kvm_memory_slot *memslot;
+       int ret = 0;
+
+       slots = kvm_memslots(kvm);
+
+       /* we only care about the pages that the guest sees */
+       kvm_for_each_memslot(memslot, slots) {
+               unsigned long hva_start, hva_end;
+               gpa_t gpa;
+
+               hva_start = max(start, memslot->userspace_addr);
+               hva_end = min(end, memslot->userspace_addr +
+                                       (memslot->npages << PAGE_SHIFT));
+               if (hva_start >= hva_end)
+                       continue;
+
+               gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
+               ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
+       }
+
+       return ret;
+}
+
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont)
 {
@@ -582,6 +674,106 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
        return ret;
 }
 
+static int kvm_unmap_hva_handler(struct kvm *kvm,
+                                gpa_t gpa, u64 size, void *data)
+{
+       stage2_unmap_range(kvm, gpa, size);
+       return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm,
+                       unsigned long start, unsigned long end)
+{
+       if (!kvm->arch.pgd)
+               return 0;
+
+       handle_hva_to_gpa(kvm, start, end,
+                         &kvm_unmap_hva_handler, NULL);
+       return 0;
+}
+
+static int kvm_set_spte_handler(struct kvm *kvm,
+                               gpa_t gpa, u64 size, void *data)
+{
+       pte_t *pte = (pte_t *)data;
+
+       WARN_ON(size != PAGE_SIZE);
+       stage2_set_pte(kvm, NULL, gpa, pte);
+
+       return 0;
+}
+
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+       unsigned long end = hva + PAGE_SIZE;
+       kvm_pfn_t pfn = pte_pfn(pte);
+       pte_t stage2_pte;
+
+       if (!kvm->arch.pgd)
+               return 0;
+
+       stage2_pte = pfn_pte(pfn, PAGE_WRITE_EXEC);
+       handle_hva_to_gpa(kvm, hva, end,
+                         &kvm_set_spte_handler, &stage2_pte);
+
+       return 0;
+}
+
+static int kvm_age_hva_handler(struct kvm *kvm,
+                               gpa_t gpa, u64 size, void *data)
+{
+       pgd_t *pgd;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE);
+       if (!stage2_get_leaf_entry(kvm, gpa, &pgd, &pmd, &pte))
+               return 0;
+
+       if (pgd)
+               return stage2_pgdp_test_and_clear_young(pgd);
+       else if (pmd)
+               return stage2_pmdp_test_and_clear_young(pmd);
+       else
+               return stage2_ptep_test_and_clear_young(pte);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+       if (!kvm->arch.pgd)
+               return 0;
+
+       return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
+}
+
+static int kvm_test_age_hva_handler(struct kvm *kvm,
+                                   gpa_t gpa, u64 size, void *data)
+{
+       pgd_t *pgd;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
+       if (!stage2_get_leaf_entry(kvm, gpa, &pgd, &pmd, &pte))
+               return 0;
+
+       if (pgd)
+               return pte_young(*((pte_t *)pgd));
+       else if (pmd)
+               return pte_young(*((pte_t *)pmd));
+       else
+               return pte_young(*pte);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+       if (!kvm->arch.pgd)
+               return 0;
+
+       return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
+                                kvm_test_age_hva_handler, NULL);
+}
+
 int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
                         bool is_write)
 {
@@ -593,7 +785,7 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
        struct vm_area_struct *vma;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page_cache *pcache = &vcpu->arch.mmu_page_cache;
-       unsigned long vma_pagesize;
+       unsigned long vma_pagesize, mmu_seq;
 
        down_read(&current->mm->mmap_sem);
 
@@ -623,6 +815,8 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
                return ret;
        }
 
+       mmu_seq = kvm->mmu_notifier_seq;
+
        hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writeable);
        if (hfn == KVM_PFN_ERR_HWPOISON) {
                if (is_vm_hugetlb_page(vma))
@@ -641,6 +835,9 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
 
        spin_lock(&kvm->mmu_lock);
 
+       if (mmu_notifier_retry(kvm, mmu_seq))
+               goto out_unlock;
+
        if (writeable) {
                kvm_set_pfn_dirty(hfn);
                ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
@@ -653,6 +850,7 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva,
        if (ret)
                kvm_err("Failed to map in stage2\n");
 
+out_unlock:
        spin_unlock(&kvm->mmu_lock);
        kvm_set_pfn_accessed(hfn);
        kvm_release_pfn_clean(hfn);
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
index c5aab5478c38..fd84b4d914dc 100644
--- a/arch/riscv/kvm/vm.c
+++ b/arch/riscv/kvm/vm.c
@@ -54,6 +54,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        switch (ext) {
        case KVM_CAP_DEVICE_CTRL:
        case KVM_CAP_USER_MEMORY:
+       case KVM_CAP_SYNC_MMU:
        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
        case KVM_CAP_ONE_REG:
        case KVM_CAP_READONLY_MEM:
-- 
2.17.1
