On Tue, Dec 18, 2007 at 05:49:51PM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> > Hi,
> >
> > The following is an improvement on top of an earlier patch by Izik. It
> > increases pagefault scalability for SMP guests by allowing concurrent
> > guest walking, allocation and instruction emulation on the fault path.
> >
> > The test being used is pft, which starts a number of threads that
> > allocate and write malloc()'ed memory. pft.c can be found at
> > http://lkml.org/lkml/2004/8/15/58
> >
> > The script being used is:
> >
> > bytes=$((400*1024*1024))
> > ./pft -t -b$bytes -r10 -f1
> > ./pft -b$bytes -r10 -f2
> > ./pft -b$bytes -r10 -f3
> > ./pft -b$bytes -r10 -f4
> > ./pft -b$bytes -r10 -f8
> >
> > This is a 4-way guest.
> >
> > One important detail in the results is that there is no difference in
> > the two-thread case, but beyond that we see a clear improvement.
> > follow_page() is showing up high in profiling, so I believe this is
> > partly because follow_page() runs twice while the lock is held: once
> > in mmu_set_spte() from walk_addr() and again in mmu_set_spte() from
> > fetch(). I'm looking into removing those duplicated calls for the
> > same gfn.
> >
> > The patch still lacks the copy_from_user_inatomic() change in
> > prefetch_page() to avoid a potential sleep in case the page is
> > swapped out.
> >
> > Another issue is that fetch() now re-reads the ptes after
> > instantiating a shadow page, but in theory they could be swapped out.
> > I believe that is safe, since walk_addr() just touched the ptes,
> > bringing them in from swap.
>
> We need to convert that to kvm_read_guest_atomic() to avoid even that
> theoretical race. If the read fails, we can simply return and let the
> guest retry the faulting instruction.
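The conversion Avi is suggesting boils down to the sketch below (a
minimal illustration only, assuming the usual KVM helpers gfn_to_hva()
and kvm_is_error_hva(); the updated patch implements the real thing as
kvm_read_guest_page_inatomic()). With page faults disabled,
__copy_from_user_inatomic() returns nonzero instead of sleeping when
the guest page is not resident, so the caller can bail out and let the
guest retry the faulting instruction:

	/*
	 * Sketch: read a guest pte without sleeping.  Returns -EFAULT
	 * if the backing page is not resident; the caller just returns
	 * to the guest, which re-executes the faulting instruction.
	 * (Illustrative name, not the helper added by the patch.)
	 */
	static int read_guest_pte_inatomic(struct kvm *kvm, gpa_t gpa, u64 *pte)
	{
		unsigned long addr = gfn_to_hva(kvm, gpa >> PAGE_SHIFT);
		int r;

		if (kvm_is_error_hva(addr))
			return -EFAULT;

		pagefault_disable();	/* fault now fails instead of sleeping */
		r = __copy_from_user_inatomic(pte, (void __user *)addr +
					      offset_in_page(gpa), sizeof(*pte));
		pagefault_enable();

		return r ? -EFAULT : 0;
	}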
Updated patch, now feature complete. Changes from the last version:

- Use __gfn_to_page() in cmpxchg_gpte() to avoid a potential deadlock.
- Add kvm_read_guest_inatomic() and use it in fetch().
- Make prefetch_page() use __copy_from_user_inatomic().
- Pass the grabbed page down to mmu_set_spte() to avoid a potential
  schedule with mmu_lock held (this could happen even without the page
  being swapped out, because get_user_pages() calls cond_resched()).
- Convert a few previously missed kvm->lock users to mmap_sem.
- Grab kvm->lock when calling kvm_iodevice_{read,write}().

Please review. Tests on a 4-way guest:

KVM stock:

 Gb  Rep  Threads   User     System     Wall     flt/cpu/s  fault/wsec
  0   10        1   0.368s    5.440s   6.017s  176297.521  165958.112
  0   10        2   0.520s    7.144s   4.023s  133603.358  241902.916
  0   10        3   0.576s   11.292s   4.061s   86277.053  221972.262
  0   10        4   0.596s   14.996s   4.058s   65670.603  223380.197
  0   10        8   0.916s   14.772s   4.063s   65268.743  220801.490

KVM + scale-2.patch:

 Gb  Rep  Threads   User     System     Wall     flt/cpu/s  fault/wsec
  0   10        1   0.296s    4.976s   6.006s  194221.567  168951.621
  0   10        2   0.408s    6.208s   3.084s  154766.639  266578.709
  0   10        3   0.528s    6.736s   2.093s  140960.353  348877.073
  0   10        4   0.548s    7.988s   2.059s  119955.022  394976.087
  0   10        8   1.596s    7.896s   3.016s  107873.592  323434.429

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 401eb7c..1b375ba 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -810,7 +810,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 	 * number of actived pages , we must to free some mmu pages before we
 	 * change the value
 	 */
-
+	spin_lock(&kvm->mmu_lock);
 	if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
 	    kvm_nr_mmu_pages) {
 		int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
@@ -831,6 +831,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 				 - kvm->arch.n_alloc_mmu_pages;
 
 	kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
+	spin_unlock(&kvm->mmu_lock);
 }
 
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -879,13 +880,13 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	if (gpa == UNMAPPED_GVA)
 		return NULL;
-	return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	return __gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 }
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, gfn_t gfn)
+			 int *ptwrite, gfn_t gfn, struct page *userpage)
 {
 	u64 spte;
 	int was_rmapped = is_rmap_pte(*shadow_pte);
@@ -907,7 +908,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	if (!(pte_access & ACC_EXEC_MASK))
 		spte |= PT64_NX_MASK;
 
-	page = gfn_to_page(vcpu->kvm, gfn);
+	if (userpage) {
+		page = userpage;
+		get_page(page);
+	} else
+		page = __gfn_to_page(vcpu->kvm, gfn);
 
 	spte |= PT_PRESENT_MASK;
 	if (pte_access & ACC_USER_MASK)
@@ -984,7 +989,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 
 		if (level == 1) {
 			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, gfn);
+				     0, write, 1, &pt_write, gfn, NULL);
 			return pt_write || is_io_pte(table[index]);
 		}
 
@@ -1026,6 +1031,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
+	spin_lock(&vcpu->kvm->mmu_lock);
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -1033,6 +1039,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 		sp = page_header(root);
 		--sp->root_count;
 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+		spin_unlock(&vcpu->kvm->mmu_lock);
 		return;
 	}
 #endif
@@ -1047,6 +1054,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 	}
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
@@ -1129,6 +1137,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = nonpaging_page_fault;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
+	context->pte_to_page = NULL;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
 	context->root_level = 0;
@@ -1177,6 +1186,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
+	context->pte_to_page = paging64_pte_to_page;
 	context->prefetch_page = paging64_prefetch_page;
 	context->free = paging_free;
 	context->root_level = level;
@@ -1197,6 +1207,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
+	context->pte_to_page = paging32_pte_to_page;
 	context->free = paging_free;
 	context->prefetch_page = paging32_prefetch_page;
 	context->root_level = PT32_ROOT_LEVEL;
@@ -1245,15 +1256,16 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
 	int r;
 
-	mutex_lock(&vcpu->kvm->lock);
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		goto out;
+
+	spin_lock(&vcpu->kvm->mmu_lock);
 	mmu_alloc_roots(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 	kvm_mmu_flush_tlb(vcpu);
 out:
-	mutex_unlock(&vcpu->kvm->lock);
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_load);
@@ -1286,7 +1298,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp,
 				  u64 *spte,
 				  const void *new, int bytes,
-				  int offset_in_pte)
+				  int offset_in_pte,
+				  struct page *userpage)
 {
 	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
 		++vcpu->kvm->stat.mmu_pde_zapped;
@@ -1295,9 +1308,11 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 
 	++vcpu->kvm->stat.mmu_pte_updated;
 	if (sp->role.glevels == PT32_ROOT_LEVEL)
-		paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+		paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte,
+				    userpage);
 	else
-		paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+		paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte,
+				    userpage);
 }
 
 static bool need_remote_flush(u64 old, u64 new)
@@ -1329,7 +1344,7 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes)
+		       const u8 *new, int bytes, struct page *userpage)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	struct kvm_mmu_page *sp;
@@ -1410,7 +1425,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			entry = *spte;
 			mmu_pte_write_zap_pte(vcpu, sp, spte);
 			mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
-					      page_offset & (pte_size - 1));
+					      page_offset & (pte_size - 1),
+					      userpage);
 			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
 			++spte;
 		}
@@ -1420,13 +1436,22 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 {
-	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+	gpa_t gpa;
+	int r;
+
+	down_read(&current->mm->mmap_sem);
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
+	up_read(&current->mm->mmap_sem);
 
-	return kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	spin_lock(&vcpu->kvm->mmu_lock);
+	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	return r;
 }
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
+	spin_lock(&vcpu->kvm->mmu_lock);
 	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
 		struct kvm_mmu_page *sp;
 
@@ -1435,6 +1460,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
+	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -1442,7 +1468,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 	int r;
 	enum emulation_result er;
 
-	mutex_lock(&vcpu->kvm->lock);
 	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
 	if (r < 0)
 		goto out;
@@ -1457,7 +1482,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 		goto out;
 
 	er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
-	mutex_unlock(&vcpu->kvm->lock);
 
 	switch (er) {
 	case EMULATE_DONE:
@@ -1472,7 +1496,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 		BUG();
 	}
 out:
-	mutex_unlock(&vcpu->kvm->lock);
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -1569,8 +1592,10 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
 
+	spin_lock(&kvm->mmu_lock);
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
 		kvm_mmu_zap_page(kvm, sp);
+	spin_unlock(&kvm->mmu_lock);
 
 	kvm_flush_remote_tlbs(kvm);
 }
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 56b88f7..b02be2e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -68,6 +68,7 @@ struct guest_walker {
 	pt_element_t ptes[PT_MAX_FULL_LEVELS];
 	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
 	unsigned pt_access;
+	struct page *page;
 	unsigned pte_access;
 	gfn_t gfn;
 	u32 error_code;
@@ -91,7 +92,7 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
 	pt_element_t *table;
 	struct page *page;
 
-	page = gfn_to_page(kvm, table_gfn);
+	page = __gfn_to_page(kvm, table_gfn);
 	table = kmap_atomic(page, KM_USER0);
 
 	ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -186,6 +187,7 @@ walk:
 
 		if (walker->level == PT_PAGE_TABLE_LEVEL) {
 			walker->gfn = gpte_to_gfn(pte);
+			walker->page = __gfn_to_page(vcpu->kvm, walker->gfn);
 			break;
 		}
 
@@ -196,6 +198,7 @@ walk:
 			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
 			if (PTTYPE == 32 && is_cpuid_PSE36())
 				walker->gfn += pse36_gfn_delta(pte);
+			walker->page = __gfn_to_page(vcpu->kvm, walker->gfn);
 			break;
 		}
 
@@ -209,10 +212,15 @@ walk:
 		mark_page_dirty(vcpu->kvm, table_gfn);
 		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
 			    pte|PT_DIRTY_MASK);
-		if (ret)
+		if (ret) {
+			kvm_release_page_clean(walker->page);
 			goto walk;
+		}
 		pte |= PT_DIRTY_MASK;
-		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte),
+				  walker->page);
+		spin_unlock(&vcpu->kvm->mmu_lock);
 		walker->ptes[walker->level - 1] = pte;
 	}
 
@@ -241,7 +249,7 @@ err:
 
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 			      u64 *spte, const void *pte, int bytes,
-			      int offset_in_pte)
+			      int offset_in_pte, struct page *userpage)
 {
 	pt_element_t gpte;
 	unsigned pte_access;
@@ -257,7 +265,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
 	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
+		     gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), userpage);
 }
 
 /*
@@ -316,11 +324,16 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 					       metaphysical, access,
 					       shadow_ent, &new_page);
 		if (new_page && !metaphysical) {
+			int r;
 			pt_element_t curr_pte;
-			kvm_read_guest(vcpu->kvm, walker->pte_gpa[level - 2],
-				       &curr_pte, sizeof(curr_pte));
-			if (curr_pte != walker->ptes[level - 2])
-				return NULL;
+			r = kvm_read_guest_inatomic(vcpu->kvm,
+						    walker->pte_gpa[level - 2],
+						    &curr_pte,
+						    sizeof(curr_pte));
+			if (r || curr_pte != walker->ptes[level - 2]) {
+				shadow_ent = NULL;
+				goto out;
+			}
 		}
 		shadow_addr = __pa(shadow_page->spt);
 		shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
@@ -331,8 +344,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
 		     user_fault, write_fault,
 		     walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-		     ptwrite, walker->gfn);
-
+		     ptwrite, walker->gfn, walker->page);
+out:
+	kvm_release_page_clean(walker->page);
 	return shadow_ent;
 }
 
@@ -371,6 +385,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	/*
 	 * Look up the shadow pte for the faulting address.
 	 */
+	down_read(&current->mm->mmap_sem);
 	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
 			     fetch_fault);
 
@@ -378,12 +393,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	 * The page is not mapped by the guest.  Let the guest handle it.
 	 */
 	if (!r) {
+		up_read(&current->mm->mmap_sem);
 		pgprintk("%s: guest page fault\n", __FUNCTION__);
 		inject_page_fault(vcpu, addr, walker.error_code);
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 		return 0;
 	}
-
+	spin_lock(&vcpu->kvm->mmu_lock);
 	shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 				  &write_pt);
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
@@ -395,15 +411,32 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	/*
 	 * mmio: emulate if accessible, otherwise its a guest fault.
 	 */
-	if (shadow_pte && is_io_pte(*shadow_pte))
+	if (shadow_pte && is_io_pte(*shadow_pte)) {
+		spin_unlock(&vcpu->kvm->mmu_lock);
+		up_read(&current->mm->mmap_sem);
 		return 1;
+	}
 
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, "post page fault (fixed)");
-
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	up_read(&current->mm->mmap_sem);
 	return write_pt;
 }
 
+static struct page *FNAME(pte_to_page)(struct kvm_vcpu *vcpu, const void *pte,
+				       int bytes)
+{
+	pt_element_t gpte = *(const pt_element_t *)pte;
+
+	if (bytes < sizeof(pt_element_t))
+		return NULL;
+	if (!is_present_pte(gpte))
+		return NULL;
+
+	return __gfn_to_page(vcpu->kvm, gpte_to_gfn(gpte));
+}
+
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	struct guest_walker walker;
@@ -415,6 +448,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 	if (r) {
 		gpa = gfn_to_gpa(walker.gfn);
 		gpa |= vaddr & ~PAGE_MASK;
+		kvm_release_page_clean(walker.page);
 	}
 
 	return gpa;
@@ -423,27 +457,36 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
 				 struct kvm_mmu_page *sp)
 {
-	int i, offset = 0;
+	int i, r, offset = 0;
 	pt_element_t *gpt;
-	struct page *page;
-
+	void __user *src = (void __user *)gfn_to_hva(vcpu->kvm, sp->gfn);
+	void *dest = (void *)vcpu->kvm->prefetch_tmp_area;
+
 	if (sp->role.metaphysical
 	    || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
 		nonpaging_prefetch_page(vcpu, sp);
 		return;
 	}
 
+	pagefault_disable();
+	r = __copy_from_user_inatomic(dest, src, PAGE_SIZE);
+	pagefault_enable();
+
+	if (r) {
+		nonpaging_prefetch_page(vcpu, sp);
+		return;
+	}
+
+	gpt = (pt_element_t *)dest;
+
 	if (PTTYPE == 32)
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
-	page = gfn_to_page(vcpu->kvm, sp->gfn);
-	gpt = kmap_atomic(page, KM_USER0);
+
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
 		if (is_present_pte(gpt[offset + i]))
 			sp->spt[i] = shadow_trap_nonpresent_pte;
 		else
 			sp->spt[i] = shadow_notrap_nonpresent_pte;
-	kunmap_atomic(gpt, KM_USER0);
-	kvm_release_page_clean(page);
 }
 
 #undef pt_element_t
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 20c0f5e..e5a40dc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1431,27 +1431,34 @@ static int init_rmode_tss(struct kvm *kvm)
 {
 	gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
 	u16 data = 0;
+	int ret = 0;
 	int r;
 
+	down_read(&current->mm->mmap_sem);
 	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
-	if (r < 0)
-		return 0;
+	if (r < 0)
+		goto out;
 	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
 	r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
 	if (r < 0)
-		return 0;
+		goto out;
 	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
 	if (r < 0)
-		return 0;
+		goto out;
 	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
 	if (r < 0)
-		return 0;
+		goto out;
 	data = ~0;
-	r = kvm_write_guest_page(kvm, fn, &data, RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
-			sizeof(u8));
+	r = kvm_write_guest_page(kvm, fn, &data,
+				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+				 sizeof(u8));
 	if (r < 0)
-		return 0;
-	return 1;
+		goto out;
+
+	ret = 1;
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
 }
 
 static void seg_setup(int seg)
@@ -1468,8 +1475,8 @@ static int alloc_apic_access_page(struct kvm *kvm)
 {
 	struct kvm_userspace_memory_region kvm_userspace_mem;
 	int r = 0;
-
-	mutex_lock(&kvm->lock);
+
+	down_write(&current->mm->mmap_sem);
 	if (kvm->arch.apic_access_page)
 		goto out;
 	kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -1481,7 +1488,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
 		goto out;
 	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
 out:
-	mutex_unlock(&kvm->lock);
+	up_write(&current->mm->mmap_sem);
 	return r;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b26270..24d8344 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -180,7 +180,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 	int ret;
 	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
 
-	mutex_lock(&vcpu->kvm->lock);
+	down_read(&current->mm->mmap_sem);
 	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
 				  offset * sizeof(u64), sizeof(pdpte));
 	if (ret < 0) {
@@ -197,7 +197,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 out:
-	mutex_unlock(&vcpu->kvm->lock);
+	up_read(&current->mm->mmap_sem);
 
 	return ret;
 }
 
@@ -211,13 +211,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 	if (is_long_mode(vcpu) || !is_pae(vcpu))
 		return false;
 
-	mutex_lock(&vcpu->kvm->lock);
+	down_read(&current->mm->mmap_sem);
 	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
 	if (r < 0)
 		goto out;
 	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
 out:
-	mutex_unlock(&vcpu->kvm->lock);
+	up_read(&current->mm->mmap_sem);
 
 	return changed;
 }
@@ -277,9 +277,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 	vcpu->arch.cr0 = cr0;
 
-	mutex_lock(&vcpu->kvm->lock);
 	kvm_mmu_reset_context(vcpu);
-	mutex_unlock(&vcpu->kvm->lock);
 	return;
 }
 EXPORT_SYMBOL_GPL(set_cr0);
@@ -319,9 +317,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
-	mutex_lock(&vcpu->kvm->lock);
 	kvm_mmu_reset_context(vcpu);
-	mutex_unlock(&vcpu->kvm->lock);
 }
 EXPORT_SYMBOL_GPL(set_cr4);
 
@@ -359,7 +355,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		 */
 	}
 
-	mutex_lock(&vcpu->kvm->lock);
+	down_read(&current->mm->mmap_sem);
 	/*
 	 * Does the new cr3 value map to physical memory? (Note, we
 	 * catch an invalid cr3 even in real-mode, because it would
@@ -375,7 +371,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		vcpu->arch.cr3 = cr3;
 		vcpu->arch.mmu.new_cr3(vcpu);
 	}
-	mutex_unlock(&vcpu->kvm->lock);
+	up_read(&current->mm->mmap_sem);
 }
 EXPORT_SYMBOL_GPL(set_cr3);
 
@@ -1170,12 +1166,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
 		return -EINVAL;
 
-	mutex_lock(&kvm->lock);
+	down_write(&current->mm->mmap_sem);
 
 	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
 	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
 
-	mutex_unlock(&kvm->lock);
+	up_write(&current->mm->mmap_sem);
 	return 0;
 }
 
@@ -1224,7 +1220,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
 	    < alias->target_phys_addr)
 		goto out;
 
-	mutex_lock(&kvm->lock);
+	down_write(&current->mm->mmap_sem);
 
 	p = &kvm->arch.aliases[alias->slot];
 	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1238,7 +1234,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
 
 	kvm_mmu_zap_all(kvm);
 
-	mutex_unlock(&kvm->lock);
+	up_write(&current->mm->mmap_sem);
 
 	return 0;
 
@@ -1314,7 +1310,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	struct kvm_memory_slot *memslot;
 	int is_dirty = 0;
 
-	mutex_lock(&kvm->lock);
+	down_write(&current->mm->mmap_sem);
 
 	r = kvm_get_dirty_log(kvm, log, &is_dirty);
 	if (r)
@@ -1330,7 +1326,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	}
 	r = 0;
 out:
-	mutex_unlock(&kvm->lock);
+	up_write(&current->mm->mmap_sem);
 	return r;
 }
 
@@ -1524,25 +1520,32 @@ int emulator_read_std(unsigned long addr,
 			 struct kvm_vcpu *vcpu)
 {
 	void *data = val;
+	int r = X86EMUL_CONTINUE;
 
+	down_read(&current->mm->mmap_sem);
 	while (bytes) {
 		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
 
-		if (gpa == UNMAPPED_GVA)
-			return X86EMUL_PROPAGATE_FAULT;
+		if (gpa == UNMAPPED_GVA) {
+			r = X86EMUL_PROPAGATE_FAULT;
+			goto out;
+		}
 		ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
-		if (ret < 0)
-			return X86EMUL_UNHANDLEABLE;
+		if (ret < 0) {
+			r = X86EMUL_UNHANDLEABLE;
+			goto out;
+		}
 
 		bytes -= tocopy;
 		data += tocopy;
 		addr += tocopy;
 	}
-
-	return X86EMUL_CONTINUE;
+out:
+	up_read(&current->mm->mmap_sem);
+	return r;
 }
 EXPORT_SYMBOL_GPL(emulator_read_std);
 
@@ -1560,7 +1563,9 @@ static int emulator_read_emulated(unsigned long addr,
 		return X86EMUL_CONTINUE;
 	}
 
+	down_read(&current->mm->mmap_sem);
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+	up_read(&current->mm->mmap_sem);
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1576,11 +1581,14 @@ mmio:
 	/*
 	 * Is this MMIO handled locally?
 	 */
+	mutex_lock(&vcpu->kvm->lock);
 	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
 	if (mmio_dev) {
 		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
+		mutex_unlock(&vcpu->kvm->lock);
 		return X86EMUL_CONTINUE;
 	}
+	mutex_unlock(&vcpu->kvm->lock);
 
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_phys_addr = gpa;
@@ -1594,11 +1602,21 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			       const void *val, int bytes)
 {
 	int ret;
+	struct page *page;
 
+	down_read(&current->mm->mmap_sem);
 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
-	if (ret < 0)
+	if (ret < 0) {
+		up_read(&current->mm->mmap_sem);
 		return 0;
-	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+	}
+	page = vcpu->arch.mmu.pte_to_page(vcpu, val, bytes);
+	spin_lock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_pte_write(vcpu, gpa, val, bytes, page);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	up_read(&current->mm->mmap_sem);
+	if (page)
+		kvm_release_page_clean(page);
 	return 1;
 }
 
@@ -1608,7 +1626,11 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 					   struct kvm_vcpu *vcpu)
 {
 	struct kvm_io_device *mmio_dev;
-	gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+	gpa_t gpa;
+
+	down_read(&current->mm->mmap_sem);
+	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+	up_read(&current->mm->mmap_sem);
 
 	if (gpa == UNMAPPED_GVA) {
 		kvm_inject_page_fault(vcpu, addr, 2);
@@ -1626,11 +1648,14 @@ mmio:
 	/*
 	 * Is this MMIO handled locally?
 	 */
+	mutex_lock(&vcpu->kvm->lock);
 	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
 	if (mmio_dev) {
 		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
+		mutex_unlock(&vcpu->kvm->lock);
 		return X86EMUL_CONTINUE;
 	}
+	mutex_unlock(&vcpu->kvm->lock);
 
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_phys_addr = gpa;
@@ -1677,11 +1702,15 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 #ifndef CONFIG_X86_64
 	/* guests cmpxchg8b have to be emulated atomically */
 	if (bytes == 8) {
-		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+		gpa_t gpa;
 		struct page *page;
 		char *addr;
 		u64 *val;
 
+		down_read(&current->mm->mmap_sem);
+		gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+		up_read(&current->mm->mmap_sem);
+
 		if (gpa == UNMAPPED_GVA ||
 		   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 			goto emul_write;
@@ -2077,10 +2106,10 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
 
 	for (i = 0; i < nr_pages; ++i) {
-		mutex_lock(&vcpu->kvm->lock);
+		down_read(&current->mm->mmap_sem);
 		page = gva_to_page(vcpu, address + i * PAGE_SIZE);
 		vcpu->arch.pio.guest_pages[i] = page;
-		mutex_unlock(&vcpu->kvm->lock);
+		up_read(&current->mm->mmap_sem);
 		if (!page) {
 			kvm_inject_gp(vcpu, 0);
 			free_pio_guest_pages(vcpu);
@@ -2203,7 +2232,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 	char instruction[3];
 	int ret = 0;
 
-	mutex_lock(&vcpu->kvm->lock);
 
 	/*
 	 * Blow out the MMU to ensure that no other VCPU has an active mapping
@@ -2218,8 +2246,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 	    != X86EMUL_CONTINUE)
 		ret = -EFAULT;
 
-	mutex_unlock(&vcpu->kvm->lock);
-
 	return ret;
 }
 
@@ -2827,13 +2853,13 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	gpa_t gpa;
 
 	vcpu_load(vcpu);
-	mutex_lock(&vcpu->kvm->lock);
+	down_read(&current->mm->mmap_sem);
 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
+	up_read(&current->mm->mmap_sem);
 	tr->physical_address = gpa;
 	tr->valid = gpa != UNMAPPED_GVA;
 	tr->writeable = 1;
 	tr->usermode = 0;
-	mutex_unlock(&vcpu->kvm->lock);
 	vcpu_put(vcpu);
 
 	return 0;
@@ -3102,13 +3128,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	 */
 	if (!user_alloc) {
 		if (npages && !old.rmap) {
-			down_write(&current->mm->mmap_sem);
 			memslot->userspace_addr = do_mmap(NULL, 0,
 						     npages * PAGE_SIZE,
 						     PROT_READ | PROT_WRITE,
 						     MAP_SHARED | MAP_ANONYMOUS,
 						     0);
-			up_write(&current->mm->mmap_sem);
 
 			if (IS_ERR((void *)memslot->userspace_addr))
 				return PTR_ERR((void *)memslot->userspace_addr);
@@ -3116,10 +3140,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 			if (!old.user_alloc && old.rmap) {
 				int ret;
 
-				down_write(&current->mm->mmap_sem);
 				ret = do_munmap(current->mm, old.userspace_addr,
 						old.npages * PAGE_SIZE);
-				up_write(&current->mm->mmap_sem);
 				if (ret < 0)
 					printk(KERN_WARNING
 					       "kvm_vm_ioctl_set_memory_region: "
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 28940e1..fd06723 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -177,6 +177,8 @@ struct kvm_mmu {
 	int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
 	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
+	struct page *(*pte_to_page)(struct kvm_vcpu *vcpu, const void *pte,
+				    int bytes);
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 			      struct kvm_mmu_page *page);
 	hpa_t root_hpa;
@@ -468,7 +470,7 @@ unsigned long segment_base(u16 selector);
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes);
+		       const u8 *new, int bytes, struct page *userpage);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 953b50a..6ca0bdb 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -105,11 +105,13 @@ struct kvm_memory_slot {
 
 struct kvm {
 	struct mutex lock; /* protects everything except vcpus */
+	spinlock_t mmu_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	int nmemslots;
 	struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
 					KVM_PRIVATE_MEM_SLOTS];
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+	unsigned long prefetch_tmp_area;
 	struct list_head vm_list;
 	struct file *filp;
 	struct kvm_io_bus mmio_bus;
@@ -163,11 +165,18 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 				int user_alloc);
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn);
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 			int len);
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
+
+int kvm_read_guest_page_inatomic(struct kvm *kvm, gfn_t gfn, void *data,
+				 int offset, int len);
+int kvm_read_guest_inatomic(struct kvm *kvm, gpa_t gpa, void *data,
+			    unsigned long len);
 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 			 int offset, int len);
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 845beb2..afdb767 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -165,12 +165,14 @@ static struct kvm *kvm_create_vm(void)
 
 	kvm->mm = current->mm;
 	atomic_inc(&kvm->mm->mm_count);
+	spin_lock_init(&kvm->mmu_lock);
 	kvm_io_bus_init(&kvm->pio_bus);
 	mutex_init(&kvm->lock);
 	kvm_io_bus_init(&kvm->mmio_bus);
 	spin_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
 	spin_unlock(&kvm_lock);
+	kvm->prefetch_tmp_area = get_zeroed_page(GFP_KERNEL);
 out:
 	return kvm;
 }
@@ -211,6 +213,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_io_bus_destroy(&kvm->mmio_bus);
 	kvm_arch_destroy_vm(kvm);
 	mmdrop(mm);
+	free_page(kvm->prefetch_tmp_area);
 }
 
 static int kvm_vm_release(struct inode *inode, struct file *filp)
@@ -227,7 +230,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
  *
  * Discontiguous memory is allowed, mostly for framebuffers.
  *
- * Must be called holding kvm->lock.
+ * Must be called holding mmap_sem for write.
  */
 int __kvm_set_memory_region(struct kvm *kvm,
 			    struct kvm_userspace_memory_region *mem,
@@ -338,9 +341,9 @@ int kvm_set_memory_region(struct kvm *kvm,
 {
 	int r;
 
-	mutex_lock(&kvm->lock);
+	down_write(&current->mm->mmap_sem);
 	r = __kvm_set_memory_region(kvm, mem, user_alloc);
-	mutex_unlock(&kvm->lock);
+	up_write(&current->mm->mmap_sem);
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
@@ -442,7 +445,7 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
-static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *slot;
 
@@ -452,11 +455,12 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 		return bad_hva();
 	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
 }
+EXPORT_SYMBOL_GPL(gfn_to_hva);
 
 /*
  * Requires current->mm->mmap_sem to be held
  */
-static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
+struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
 	struct page *page[1];
 	unsigned long addr;
@@ -480,6 +484,7 @@ static struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn)
 
 	return page[0];
 }
+EXPORT_SYMBOL(__gfn_to_page);
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
@@ -552,6 +557,46 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest);
 
+int kvm_read_guest_page_inatomic(struct kvm *kvm, gfn_t gfn, void *data,
+				 int offset, int len)
+{
+	int r;
+	unsigned long addr;
+
+	addr = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(addr))
+		return -EFAULT;
+	pagefault_disable();
+	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+	pagefault_enable();
+	if (r)
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page_inatomic);
+
+int kvm_read_guest_inatomic(struct kvm *kvm, gpa_t gpa, void *data,
+			    unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int seg;
+	int offset = offset_in_page(gpa);
+	int ret;
+
+	while ((seg = next_segment(len, offset)) != 0) {
+		ret = kvm_read_guest_page_inatomic(kvm, gfn, data, offset, seg);
+		if (ret < 0)
+			return ret;
+		offset = 0;
+		len -= seg;
+		data += seg;
+		++gfn;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_inatomic);
+
+
 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 			 int offset, int len)
 {
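For reference, the locking discipline the patch establishes on the
guest page-fault path, condensed into a sketch (illustrative only; a
simplification of FNAME(page_fault) above, not a drop-in function):

	static int page_fault_locking_sketch(struct kvm_vcpu *vcpu)
	{
		/* Anything that can sleep -- walking the guest page
		 * tables, __gfn_to_page()/get_user_pages() -- runs
		 * under mmap_sem held for read, which is what allows
		 * concurrent faults on SMP guests. */
		down_read(&current->mm->mmap_sem);

		/* Shadow pte updates run under the new spinlock; only
		 * non-sleeping helpers (kvm_read_guest_inatomic(), the
		 * pages pinned above) may be used inside. */
		spin_lock(&vcpu->kvm->mmu_lock);
		/* ... FNAME(fetch)() / mmu_set_spte() ... */
		spin_unlock(&vcpu->kvm->mmu_lock);

		up_read(&current->mm->mmap_sem);
		return 0;
	}

The ordering is always mmap_sem (shared) outside and mmu_lock inside;
that is also why the pre-looked-up page is passed down to
mmu_set_spte() instead of letting it call get_user_pages() under the
spinlock.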