Marcelo Tosatti wrote:
> On Wed, Feb 13, 2008 at 08:45:51AM +0200, Avi Kivity wrote:
>
>>> gfn_to_page() needs to grab the struct page corresponding to the large
>>> page, not the offset struct page for the faulting 4k address within
>>> the large frame. Since gfn_to_page can sleep, there is no way to do
>>> that in the mapping logic which happens under mmu_lock protection.
>>> We don't want to grab the large page frame "struct page" unless the
>>> is_largepage_backed() checks are successful.
>>>
>>> The checks could be done in page_fault() if walker->level == 2, before
>>> gfn_to_page()... But I don't see much difference between that and doing
>>> it inside walk_addr(). What do you say?
>>>
>>>
>>>
>> I'd like to keep walk_addr() independent of the rest of the mmu (i.e.
>> walk_addr is 100% guest oriented). Also, the issue you point out is
>> shared by direct_map which doesn't call walk_addr().
>>
>> An unrelated issue (pointed out by Jun Nakajima) is that this kills
>> dirty log tracking (needed for migration). It could be solved simply by
>> not using large page backing if dirty log tracking is enabled for that
>> slot.
>>
>
> Ok, fixed your comments and a bug where a root page was shadowed in the
> large area being mapped. access.flat is happy.
>
> Joerg, can you give this a try on an NPT-enabled system (needs the
> attached qemu-largepage-hack.patch).
>
> Thanks
>
> Index: kvm.largepages/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.largepages.orig/arch/x86/kvm/mmu.c
> +++ kvm.largepages/arch/x86/kvm/mmu.c
> @@ -27,6 +27,7 @@
>  #include <linux/highmem.h>
>  #include <linux/module.h>
>  #include <linux/swap.h>
> +#include <linux/hugetlb.h>
>
>  #include <asm/page.h>
>  #include <asm/cmpxchg.h>
> @@ -211,6 +212,11 @@ static int is_shadow_present_pte(u64 pte
>  		&& pte != shadow_notrap_nonpresent_pte;
>  }
>
> +static int is_large_pte(u64 pte)
> +{
> +	return pte & PT_PAGE_SIZE_MASK;
> +}
> +
>  static int is_writeble_pte(unsigned long pte)
>  {
>  	return pte & PT_WRITABLE_MASK;
> @@ -350,17 +356,120 @@ static void mmu_free_rmap_desc(struct kv
>  	kfree(rd);
>  }
>
> +static int hpage_align_diff(unsigned long gfn)
> +{
> +	return ((gfn+KVM_PAGES_PER_HPAGE-1) & ~(KVM_PAGES_PER_HPAGE-1)) - gfn;
> +}
> +
> +/*
> + * Return the pointer to the largepage write count for a given
> + * gfn, handling slots that are not large page aligned.
> + */
> +static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
> +{
> +	unsigned long idx;
> +
> +	idx = (gfn - slot->base_gfn) + hpage_align_diff(slot->base_gfn);
> +	idx /= KVM_PAGES_PER_HPAGE;
> +	return &slot->lpage_info[idx].write_count;
> +}
>
Can be further simplified to (gfn / KVM_PAGES_PER_HPAGE) -
(slot->base_gfn / KVM_PAGES_PER_HPAGE). Sorry for not noticing earlier.
(A rough sketch follows further down.)

> +
> +static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
> +{
> +	struct kvm_memory_slot *slot;
> +
> +	if (has_wrprotected_page(vcpu->kvm, large_gfn))
> +		return 0;
> +
> +	if (!host_largepage_backed(vcpu->kvm, large_gfn))
> +		return 0;
> +
> +	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
> +	if (slot && slot->dirty_bitmap)
> +		return 0;
> +
> +	/* guest has 4M pages, host 2M */
> +	if (!is_pae(vcpu) && HPAGE_SHIFT == 21)
> +		return 0;
>

Is this check necessary? I think that if we remove it, things will just
work. A 4MB page will have either one or two 2MB sptes (which may even
belong to different slots).

> @@ -894,12 +1030,28 @@ struct page *gva_to_page(struct kvm_vcpu
>  static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
>  			 unsigned pt_access, unsigned pte_access,
>  			 int user_fault, int write_fault, int dirty,
> -			 int *ptwrite, gfn_t gfn, struct page *page)
> +			 int *ptwrite, int largepage, gfn_t gfn,
> +			 struct page *page)
>  {
>  	u64 spte;
>  	int was_rmapped = is_rmap_pte(*shadow_pte);
>  	int was_writeble = is_writeble_pte(*shadow_pte);
>
> +	/*
> +	 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
> +	 * the parent of the now unreachable PTE.
> +	 */
> +	if (largepage) {
> +		if (was_rmapped && !is_large_pte(*shadow_pte)) {
> +			struct kvm_mmu_page *child;
> +			u64 pte = *shadow_pte;
> +
> +			child = page_header(pte & PT64_BASE_ADDR_MASK);
> +			mmu_page_remove_parent_pte(child, shadow_pte);
> +		}
> +		was_rmapped = is_large_pte(*shadow_pte);
> +	}
> +
>  	pgprintk("%s: spte %llx access %x write_fault %d"
>  		 " user_fault %d gfn %lx\n",
>  		 __FUNCTION__, *shadow_pte, pt_access,
> @@ -919,6 +1071,8 @@ static void mmu_set_spte(struct kvm_vcpu
>  	spte |= PT_PRESENT_MASK;
>  	if (pte_access & ACC_USER_MASK)
>  		spte |= PT_USER_MASK;
> +	if (largepage)
> +		spte |= PT_PAGE_SIZE_MASK;
>
>  	spte |= page_to_phys(page);
>
> @@ -933,7 +1087,8 @@ static void mmu_set_spte(struct kvm_vcpu
>  	}
>
>  	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
> -	if (shadow) {
> +	if (shadow ||
> +	   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
>  		pgprintk("%s: found shadow page for %lx, marking ro\n",
>  			 __FUNCTION__, gfn);
>  		pte_access &= ~ACC_WRITE_MASK;
> @@ -941,6 +1096,18 @@ static void mmu_set_spte(struct kvm_vcpu
>  		spte &= ~PT_WRITABLE_MASK;
>  		kvm_x86_ops->tlb_flush(vcpu);
>  	}
> +	/*
> +	 * Largepage creation is susceptible to a upper-level
> +	 * table to be shadowed and write-protected in the
> +	 * area being mapped. If that is the case, invalidate
> +	 * the entry and let the instruction fault again
> +	 * and use 4K mappings.
> +	 */
> +	if (largepage) {
> +		spte = shadow_trap_nonpresent_pte;
> +		kvm_x86_ops->tlb_flush(vcpu);
> +		goto unshadowed;
> +	}
>

Would it not repeat exactly the same code path? Or is this just for the
pte_update path?

> -	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
> +	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))
> +	    && is_physical_memory(vcpu->kvm, gfn)) {
> +		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
> +		largepage = 1;
> +	}
>

Doesn't is_largepage_backed() imply is_physical_memory()?
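Going back to slot_largepage_idx(), the simplification above would look
roughly like this (untested sketch, same lpage_info layout as in the
patch):

static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
{
	unsigned long idx;

	/* index of gfn's large frame minus index of the slot's first one */
	idx = (gfn / KVM_PAGES_PER_HPAGE)
	      - (slot->base_gfn / KVM_PAGES_PER_HPAGE);
	return &slot->lpage_info[idx].write_count;
}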
>
> Index: kvm.largepages/arch/x86/kvm/x86.c
> ===================================================================
> --- kvm.largepages.orig/arch/x86/kvm/x86.c
> +++ kvm.largepages/arch/x86/kvm/x86.c
> @@ -86,6 +86,7 @@ struct kvm_stats_debugfs_item debugfs_en
>  	{ "mmu_recycled", VM_STAT(mmu_recycled) },
>  	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
>  	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
> +	{ "lpages", VM_STAT(lpages) },
>  	{ NULL }
>  };
>

s/lpages/largepages/; this name is user visible.

> +	new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));
> +
> +	if (!new.lpage_info)
> +		goto out_free;
> +
> +	memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));
> +	/* large page crosses memslot boundary */
> +	if (npages % KVM_PAGES_PER_HPAGE) {
> +		new.lpage_info[0].write_count = 1;
>

This seems wrong. Say a 3MB slot at 1GB: you kill the first largepage,
which is good.

> +		new.lpage_info[largepages-1].write_count = 1;
>

OTOH, with a 3MB slot at 3MB, the last page is fine. The check needs to
be against base_gfn and base_gfn + npages, not the number of pages
(rough sketch at the end of this mail).

> +	}
>
> +	}
>
>  	/* Allocate page dirty bitmap if needed */
>  	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
> @@ -444,7 +464,7 @@ int kvm_is_visible_gfn(struct kvm *kvm,
>  }
>  EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
>
> -static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
> +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
>  {
>  	struct kvm_memory_slot *slot;
>
> @@ -454,6 +474,7 @@ static unsigned long gfn_to_hva(struct k
>  		return bad_hva();
>  	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
>  }
> +EXPORT_SYMBOL(gfn_to_hva);
>
>  /*
>   * Requires current->mm->mmap_sem to be held

--
Any sufficiently difficult bug is indistinguishable from a feature.
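P.S. Rough, untested sketch of the boundary check suggested above
(assuming base_gfn, npages and largepages are the slot's base gfn, page
count and lpage_info entry count, as elsewhere in the patch):

	/* force 4k mappings only on ends that are not largepage aligned */
	if (base_gfn % KVM_PAGES_PER_HPAGE)
		new.lpage_info[0].write_count = 1;
	if ((base_gfn + npages) % KVM_PAGES_PER_HPAGE)
		new.lpage_info[largepages - 1].write_count = 1;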