Hi Andrea,

> path for kvm so I guess not worth optimizing in the short/mid term,
> but by optimizing the invalidate_page case we may halve the number of
> tlb flushes for some common case. I leave it for later; the swapping
> is heavily I/O bound anyway, so a few more IPIs on an SMP host
> shouldn't be very measurable (on a UP host it makes no difference in
> practice to flush multiple times).
>
> Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 4086080..c527d7d 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -18,6 +18,7 @@ config KVM
>  	tristate "Kernel-based Virtual Machine (KVM) support"
>  	depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL
>  	select PREEMPT_NOTIFIERS
> +	select MMU_NOTIFIER
>  	select ANON_INODES
>  	---help---
>  	  Support hosting fully virtualized guest machines using hardware
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 324ff9a..103c270 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -532,6 +532,36 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
>  		kvm_flush_remote_tlbs(kvm);
>  }
>  
> +static void unmap_spte(struct kvm *kvm, u64 *spte)
> +{
> +	struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
> +	get_page(page);
> +	rmap_remove(kvm, spte);
> +	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
> +	kvm_flush_remote_tlbs(kvm);
> +	__free_page(page);
> +}
> +
> +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn)
> +{
> +	unsigned long *rmapp;
> +	u64 *spte, *curr_spte;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	gfn = unalias_gfn(kvm, gfn);
> +	rmapp = gfn_to_rmap(kvm, gfn);

The alias and memslot maps are protected only by mmap_sem, so you
should make kvm_set_memory_region()/set_memory_alias() grab the mmu
spinlock in addition to taking mmap_sem in write mode.
kvm_mmu_zap_all() already grabs the mmu lock; that should probably
move up into the caller.
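Something like this, maybe (an untested sketch just to make the
suggestion concrete; install_memslot_locked() and the one-line slot
update are placeholders for the real code, not actual kvm helpers):

	/*
	 * Sketch only: the caller already holds mmap_sem for write; take
	 * mmu_lock as well so kvm_rmap_unmap_gfn()/hva_to_gfn() can never
	 * observe a half-updated memslot or alias array.
	 */
	static void install_memslot_locked(struct kvm *kvm, int slot,
					   struct kvm_memory_slot *new)
	{
		spin_lock(&kvm->mmu_lock);
		kvm->memslots[slot] = *new;	/* stands in for the real update */
		spin_unlock(&kvm->mmu_lock);
	}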
> +
> +	spte = rmap_next(kvm, rmapp, NULL);
> +	while (spte) {
> +		BUG_ON(!(*spte & PT_PRESENT_MASK));
> +		rmap_printk("rmap_swap_page: spte %p %llx\n", spte, *spte);
> +		curr_spte = spte;
> +		spte = rmap_next(kvm, rmapp, spte);
> +		unmap_spte(kvm, curr_spte);
> +	}
> +	spin_unlock(&kvm->mmu_lock);
> +}
> +
>  #ifdef MMU_DEBUG
>  static int is_empty_shadow_page(u64 *spt)
>  {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 8a90403..e9a3f6e 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3159,6 +3159,36 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  	free_page((unsigned long)vcpu->arch.pio_data);
>  }
>  
> +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
> +{
> +	return container_of(mn, struct kvm, mmu_notifier);
> +}
> +
> +void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
> +				      struct mm_struct *mm,
> +				      unsigned long address)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	gfn_t gfn = hva_to_gfn(kvm, address);
> +	BUG_ON(mm != kvm->mm);
> +	if (gfn == -1UL)
> +		return;
> +	kvm_rmap_unmap_gfn(kvm, gfn);

And then you also need the hva_to_gfn() lookup itself to happen under
the lock.
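I.e., roughly this (untested; kvm_rmap_unmap_gfn_locked() is a
hypothetical variant of kvm_rmap_unmap_gfn() that expects the caller
to already hold mmu_lock instead of taking it itself):

	void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
	{
		struct kvm *kvm = mmu_notifier_to_kvm(mn);
		gfn_t gfn;

		BUG_ON(mm != kvm->mm);
		spin_lock(&kvm->mmu_lock);
		/* the memslot lookup is now atomic with the rmap teardown */
		gfn = hva_to_gfn(kvm, address);
		if (gfn != -1UL)
			kvm_rmap_unmap_gfn_locked(kvm, gfn);
		spin_unlock(&kvm->mmu_lock);
	}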
> +}
> +
> +void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
> +				       struct mm_struct *mm,
> +				       unsigned long start, unsigned long end)
> +{
> +	for (; start < end; start += PAGE_SIZE)
> +		kvm_mmu_notifier_invalidate_page(mn, mm, start);
> +}
> +
> +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.invalidate_range = kvm_mmu_notifier_invalidate_range,
> +	.invalidate_page = kvm_mmu_notifier_invalidate_page,
> +};
> +
>  struct kvm *kvm_arch_create_vm(void)
>  {
>  	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
> @@ -3167,6 +3197,7 @@ struct kvm *kvm_arch_create_vm(void)
>  		return ERR_PTR(-ENOMEM);
>  
>  	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
> +	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
>  
>  	return kvm;
>  }
> diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
> index d6db0de..feacd77 100644
> --- a/include/asm-x86/kvm_host.h
> +++ b/include/asm-x86/kvm_host.h
> @@ -404,6 +404,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
>  int kvm_mmu_setup(struct kvm_vcpu *vcpu);
>  void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
>  
> +void kvm_rmap_unmap_gfn(struct kvm *kvm, gfn_t gfn);
>  int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
>  void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
>  void kvm_mmu_zap_all(struct kvm *kvm);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 2714068..85da7fa 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -117,6 +117,7 @@ struct kvm {
>  	struct kvm_io_bus pio_bus;
>  	struct kvm_vm_stat stat;
>  	struct kvm_arch arch;
> +	struct mmu_notifier mmu_notifier;
>  };
>  
>  /* The guest did something we don't support. */
> @@ -163,6 +164,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
>  				struct kvm_memory_slot old,
>  				int user_alloc);
>  gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
> +gfn_t hva_to_gfn(struct kvm *kvm, unsigned long addr);
>  struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
>  void kvm_release_page_clean(struct page *page);
>  void kvm_release_page_dirty(struct page *page);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 4295623..8f1dd86 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void)
>  
>  	kvm->mm = current->mm;
>  	atomic_inc(&kvm->mm->mm_count);
> +	mmu_notifier_register(&kvm->mmu_notifier, kvm->mm);
>  	spin_lock_init(&kvm->mmu_lock);
>  	kvm_io_bus_init(&kvm->pio_bus);
>  	mutex_init(&kvm->lock);
> @@ -454,6 +455,23 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
>  	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
>  }
>  
> +gfn_t hva_to_gfn(struct kvm *kvm, unsigned long addr)
> +{
> +	int i;
> +
> +	for (i = 0; i < kvm->nmemslots; i++) {
> +		struct kvm_memory_slot *memslot = &kvm->memslots[i];
> +		unsigned long start = memslot->userspace_addr;
> +		unsigned long end = start + (memslot->npages << PAGE_SHIFT);
> +
> +		if (addr >= start && addr < end) {
> +			gfn_t gfn_offset = (addr - start) >> PAGE_SHIFT;
> +			return memslot->base_gfn + gfn_offset;
> +		}
> +	}
> +	return -1UL;
> +}
> +
>  /*
>   * Requires current->mm->mmap_sem to be held
>   */
>
>
> And here is a compatibility patch for kvm-userland, so the external
> module still compiles and runs on older kernels without the
> MMU_NOTIFIER patch applied.
>
> Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>
>
> diff --git a/kernel/external-module-compat.h b/kernel/external-module-compat.h
> index 67b9cc4..34ef0a5 100644
> --- a/kernel/external-module-compat.h
> +++ b/kernel/external-module-compat.h
> @@ -17,6 +17,28 @@
>  #include <linux/hrtimer.h>
>  #include <asm/bitops.h>
>  
> +#ifndef CONFIG_MMU_NOTIFIER
> +struct mmu_notifier;
> +
> +struct mmu_notifier_ops {
> +	void (*release)(struct mmu_notifier *mn,
> +			struct mm_struct *mm);
> +	void (*invalidate_page)(struct mmu_notifier *mn,
> +				struct mm_struct *mm,
> +				unsigned long address);
> +	void (*invalidate_range)(struct mmu_notifier *mn,
> +				 struct mm_struct *mm,
> +				 unsigned long start, unsigned long end);
> +};
> +
> +struct mmu_notifier {
> +	const struct mmu_notifier_ops *ops;
> +};
> +#define mmu_notifier_register(mn, mm) do {} while (0)
> +#define mmu_notifier_unregister(mn) do {} while (0)
> +#define mmu_notifier_release(mm) do {} while (0)
> +#endif
> +
>  /*
>   * 2.6.16 does not have GFP_NOWAIT
>   */
>
>
> Here is another patch for kvm-userland, where I can't see the
> symmetry: the awk hack comments out the atomic_inc while the mmdrop
> still runs. I can't see how the atomic_inc would be required when
> compiled into the kernel but not as an external module; either it is
> needed in both or in neither. Even if I'm right, this bug wasn't
> destabilizing, because the atomic debug check only fires once on
> underflow, so running one mmdrop more than needed shouldn't matter in
> practice, but the fix is good for correctness.
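For reference, the symmetry being argued for is just this pairing (a
schematic, not the actual kvm-userland code; the elided lines stand
for the real setup/teardown):

	static struct kvm *kvm_create_vm(void)
	{
		/* ... */
		kvm->mm = current->mm;
		atomic_inc(&kvm->mm->mm_count);	/* pin the mm */
		mmu_notifier_register(&kvm->mmu_notifier, kvm->mm);
		/* ... */
	}

	static void kvm_destroy_vm(struct kvm *kvm)
	{
		/* ... */
		mmdrop(kvm->mm);	/* must pair with the atomic_inc above */
	}

If the awk hack comments out the atomic_inc but kvm_destroy_vm() still
runs the mmdrop, mm_count is decremented once more than it was
incremented, which is exactly the asymmetry the patch below removes.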
>
> Signed-off-by: Andrea Arcangeli <[EMAIL PROTECTED]>
>
> diff --git a/kernel/hack-module.awk b/kernel/hack-module.awk
> index 5187c96..884bc50 100644
> --- a/kernel/hack-module.awk
> +++ b/kernel/hack-module.awk
> @@ -33,8 +33,6 @@
>      vmx_load_host_state = 0
>  }
>  
> -/atomic_inc\(&kvm->mm->mm_count\);/ { $0 = "//" $0 }
> -
>  /^\t\.fault = / {
>      fcn = gensub(/,/, "", "g", $3)
>      $0 = "\t.VMA_OPS_FAULT(fault) = VMA_OPS_FAULT_FUNC(" fcn "),"
>
> I'll post the mmu-notifiers patch (required in the host kernel to run
> the above) separately, CCing more mailing lists, because it's not KVM
> code at all, and we hope to get it merged into the mainline kernel
> soon after getting feedback on the interface from the other users of
> the mmu notifiers.
>
> Thanks!
> Andrea