Marcelo Tosatti wrote:
> Ok, how does the following look. Still need to plug in large page
> creation in the nonpaging case, but this should be enough for comments.
> 
Most of the comments are cosmetic, but a couple have some meat.

> +#define HPAGE_ALIGN_OFFSET(x) ((((x)+HPAGE_SIZE-1)&HPAGE_MASK) - (x))

A function please.

> +/*
> + * Return the offset inside the memslot largepage integer map for a given
> + * gfn, handling slots that are not large page aligned.
> + */
> +int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
> +{
> +	unsigned long long idx;
> +	unsigned long memslot_align;
> +
> +	memslot_align = HPAGE_ALIGN_OFFSET(slot->base_gfn << PAGE_SHIFT);
> +	idx = ((gfn - slot->base_gfn) << PAGE_SHIFT) + memslot_align;
> +	idx /= HPAGE_SIZE;

Can probably be reshuffled to avoid the long long.  The compiler may want
to call helpers on i386...

We also need our own HPAGE_SIZE since kvm uses pae on non-pae hosts.

> +
> +static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
> +{
> +	struct kvm_memory_slot *slot;
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (slot) {
> +		int *largepage_idx;
> +		int end_gfn;
> +
> +		largepage_idx = slot_largepage_idx(gfn, slot);
> +		/* check if the largepage crosses a memslot */
> +		end_gfn = slot->base_gfn + slot->npages;
> +		if (gfn + (HPAGE_SIZE/PAGE_SIZE) >= end_gfn)
> +			return 1;
> +		else
> +			return *largepage_idx;

We might initialize the boundary slots to 1 instead of zero and so avoid
this check.

> +static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
> +{
> +	if (has_wrprotected_page(vcpu->kvm, large_gfn))
> +		return 0;
> +
> +	if (!host_largepage_backed(vcpu->kvm, large_gfn))
> +		return 0;
> +
> +	if ((large_gfn << PAGE_SHIFT) & (HPAGE_SIZE-1))
> +		return 0;

I'd drop this check and simply ignore the least significant 9 bits of
large_gfn.

> +
> +	/* guest has 4M pages */
> +	if (!is_pae(vcpu))
> +		return 0;

We can drop this check too.  If we treat a 4MB page as two contiguous 2MB
pages, and don't zero out the large_gfn least significant bits, most of
the code needn't know about it at all.

> @@ -362,6 +469,20 @@ static unsigned long *gfn_to_rmap(struct
> 	return &slot->rmap[gfn - slot->base_gfn];
> }
> /*
>  * Reverse mapping data structures:
>  *
> @@ -371,7 +492,7 @@ static unsigned long *gfn_to_rmap(struct
>  * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
>  * containing more mappings.
>  */
> -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
> +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int hpage)
> {
> 	struct kvm_mmu_page *sp;
> 	struct kvm_rmap_desc *desc;
> @@ -383,7 +504,10 @@ static void rmap_add(struct kvm_vcpu *vc
> 	gfn = unalias_gfn(vcpu->kvm, gfn);
> 	sp = page_header(__pa(spte));
> 	sp->gfns[spte - sp->spt] = gfn;
> -	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
> +	if (!hpage)
> +		rmapp = gfn_to_rmap(vcpu->kvm, gfn);
> +	else
> +		rmapp = gfn_to_rmap_pde(vcpu->kvm, gfn);

This bit can go into a function

> 	if (!*rmapp) {
> 		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
> 		*rmapp = (unsigned long)spte;
> @@ -449,7 +573,10 @@ static void rmap_remove(struct kvm *kvm,
> 		kvm_release_page_dirty(page);
> 	else
> 		kvm_release_page_clean(page);
> -	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
> +	if (is_large_pte(*spte))
> +		rmapp = gfn_to_rmap_pde(kvm, sp->gfns[spte - sp->spt]);
> +	else
> +		rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);

As it is reused here.
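To make the slot_largepage_idx() comment concrete, here is roughly what I
have in mind (untested sketch; KVM_PAGES_PER_HPAGE is an assumed kvm-private
constant, HPAGE_SIZE/PAGE_SIZE for the 2MB case, which also covers the
non-pae host issue).  It indexes by 2MB frame relative to the frame
containing base_gfn, so the array has to be sized that way too (one extra
entry when the slot is not 2MB aligned):

static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
{
	unsigned long idx;

	/*
	 * Dividing by a power-of-two constant compiles to shifts, so no
	 * 64-bit division helpers are needed on i386.
	 */
	idx = (gfn / KVM_PAGES_PER_HPAGE) -
	      (slot->base_gfn / KVM_PAGES_PER_HPAGE);
	return &slot->largepage[idx];
}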
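For the boundary idea, something along these lines (again just a sketch,
the function name is made up): when the largepage array is set up in
__kvm_set_memory_region(), permanently mark the 2MB frames the slot only
partially covers, so has_wrprotected_page() can simply return
*largepage_idx:

static void mark_partial_hpage_frames(struct kvm_memory_slot *slot,
				      int largepages)
{
	/* largepages is the length of slot->largepage */
	if (slot->base_gfn % KVM_PAGES_PER_HPAGE)
		slot->largepage[0] = 1;		/* first frame is partial */
	if ((slot->base_gfn + slot->npages) % KVM_PAGES_PER_HPAGE)
		slot->largepage[largepages - 1] = 1;	/* last frame is partial */
}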
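And for the rmap_add()/rmap_remove() duplication, the helper could be as
simple as the following (sketch, name is illustrative); rmap_remove()
would pass is_large_pte(*spte) as the flag:

static unsigned long *gfn_to_rmap_any(struct kvm *kvm, gfn_t gfn, int lpage)
{
	/* large mappings hang off the per-2MB-frame reverse map */
	if (lpage)
		return gfn_to_rmap_pde(kvm, gfn);
	return gfn_to_rmap(kvm, gfn);
}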
> 
> #ifdef MMU_DEBUG
> @@ -749,11 +895,19 @@ static void kvm_mmu_page_unlink_children
> 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
> 		ent = pt[i];
> 
> +		if (is_shadow_present_pte(pt[i]) && is_large_pte(pt[i])) {
> +			if (is_writeble_pte(pt[i]))
> +				--kvm->stat.lpages;
> +			rmap_remove(kvm, &pt[i]);
> +		}
> +

pt[i] == ent here.  You can also move this code to be the 'else' part of
the if () you introduce below.

> 		pt[i] = shadow_trap_nonpresent_pte;
> 		if (!is_shadow_present_pte(ent))
> 			continue;
> -		ent &= PT64_BASE_ADDR_MASK;
> -		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
> +		if (!is_large_pte(ent)) {
> +			ent &= PT64_BASE_ADDR_MASK;
> +			mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
> +		}
> 	}
> 	kvm_flush_remote_tlbs(kvm);
> }
> 
> @@ -886,12 +1042,21 @@ struct page *gva_to_page(struct kvm_vcpu
> static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
> 			 unsigned pt_access, unsigned pte_access,
> 			 int user_fault, int write_fault, int dirty,
> -			 int *ptwrite, gfn_t gfn, struct page *page)
> +			 int *ptwrite, int largepage, gfn_t gfn,
> +			 struct page *page)
> {
> 	u64 spte;
> 	int was_rmapped = is_rmap_pte(*shadow_pte);
> 	int was_writeble = is_writeble_pte(*shadow_pte);
> 
> +	/*
> +	 * If its a largepage mapping, there could be a previous
> +	 * pointer to a PTE page hanging there, which will falsely
> +	 * set was_rmapped.
> +	 */
> +	if (largepage)
> +		was_rmapped = is_large_pte(*shadow_pte);
> +

But that pte page will have its parent_pte chain pointing to shadow_pte,
no?  Either this can't happen, or we need to unlink that pte page first.

> @@ -951,10 +1119,17 @@ unshadowed:
> 		mark_page_dirty(vcpu->kvm, gfn);
> 
> 	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
> +	pgprintk("instantiating %s PTE (%s) at %d (%llx)\n",
> +		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
> +		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte);
> 	set_shadow_pte(shadow_pte, spte);
> +	if (!was_rmapped && (spte & (PT_PAGE_SIZE_MASK|PT_WRITABLE_MASK))
> +			 == (PT_PAGE_SIZE_MASK|PT_WRITABLE_MASK))
> +		++vcpu->kvm->stat.lpages;
> +

Why do you only account for writable large pages?
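Going back to kvm_mmu_page_unlink_children(), this is the restructuring I
meant, roughly (untested sketch).  Note rmap_remove() looks at *spte, so
the entry can only be cleared afterwards:

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		ent = pt[i];

		if (is_shadow_present_pte(ent)) {
			if (!is_large_pte(ent)) {
				/* ordinary pde: unlink the child page table */
				ent &= PT64_BASE_ADDR_MASK;
				mmu_page_remove_parent_pte(page_header(ent),
							   &pt[i]);
			} else {
				/* large pde: drop the rmap entry instead */
				if (is_writeble_pte(ent))
					--kvm->stat.lpages;
				rmap_remove(kvm, &pt[i]);
			}
		}
		pt[i] = shadow_trap_nonpresent_pte;
	}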
> Index: linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
> ===================================================================
> --- linux-2.6-x86-kvm.orig/arch/x86/kvm/paging_tmpl.h
> +++ linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
> @@ -71,6 +71,7 @@ struct guest_walker {
> 	unsigned pte_access;
> 	gfn_t gfn;
> 	u32 error_code;
> +	int largepage_backed;
> };
> 
> static gfn_t gpte_to_gfn(pt_element_t gpte)
> @@ -120,7 +121,8 @@ static unsigned FNAME(gpte_access)(struc
>  */
> static int FNAME(walk_addr)(struct guest_walker *walker,
> 			    struct kvm_vcpu *vcpu, gva_t addr,
> -			    int write_fault, int user_fault, int fetch_fault)
> +			    int write_fault, int user_fault, int fetch_fault,
> +			    int faulting)
> {
> 	pt_element_t pte;
> 	gfn_t table_gfn;
> @@ -130,6 +132,7 @@ static int FNAME(walk_addr)(struct guest
> 	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
> walk:
> 	walker->level = vcpu->arch.mmu.root_level;
> +	walker->largepage_backed = 0;
> 	pte = vcpu->arch.cr3;
> #if PTTYPE == 64
> 	if (!is_long_mode(vcpu)) {
> @@ -192,10 +195,19 @@ walk:
> 		if (walker->level == PT_DIRECTORY_LEVEL
> 		    && (pte & PT_PAGE_SIZE_MASK)
> 		    && (PTTYPE == 64 || is_pse(vcpu))) {
> -			walker->gfn = gpte_to_gfn_pde(pte);
> +			gfn_t gfn = gpte_to_gfn_pde(pte);
> +			walker->gfn = gfn;
> +
> 			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
> 			if (PTTYPE == 32 && is_cpuid_PSE36())
> 				walker->gfn += pse36_gfn_delta(pte);
> +
> +			if (faulting
> +			    && is_largepage_backed(vcpu, gfn)
> +			    && is_physical_memory(vcpu->kvm, walker->gfn)) {
> +				walker->largepage_backed = 1;
> +				walker->gfn = gfn;
> +			}

I don't like this bit.  So far the walker has been independent of the host
state, and only depended on guest data.  We can set largepage_backed
unconditionally on a large guest pte, leave gfn containing the low-order
bits, and leave the host checks for the actual mapping logic.

> 
>  /*
> @@ -299,6 +313,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
> 		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
> 		if (level == PT_PAGE_TABLE_LEVEL)
> 			break;
> +		if (level == PT_DIRECTORY_LEVEL && walker->largepage_backed)
> +			break;
> +

(here)

> @@ -395,6 +412,13 @@ static int FNAME(page_fault)(struct kvm_
> 
> 	page = gfn_to_page(vcpu->kvm, walker.gfn);
> 
> +	/* shortcut non-RAM accesses to avoid walking over a 2MB PMD entry */
> +	if (is_error_page(page)) {
> +		kvm_release_page_clean(page);
> +		up_read(&current->mm->mmap_sem);
> +		return 1;
> +	}
> +

This bit is already in kvm.git?

> 
>  struct kvm_vcpu_stat {
> Index: linux-2.6-x86-kvm/include/linux/kvm_host.h
> ===================================================================
> --- linux-2.6-x86-kvm.orig/include/linux/kvm_host.h
> +++ linux-2.6-x86-kvm/include/linux/kvm_host.h
> @@ -99,7 +99,9 @@ struct kvm_memory_slot {
> 	unsigned long npages;
> 	unsigned long flags;
> 	unsigned long *rmap;
> +	unsigned long *rmap_pde;
> 	unsigned long *dirty_bitmap;
> +	int *largepage;

We can combine rmap_pde and largepage into an array of
struct { int write_count; unsigned long rmap_pde; } and save some
cacheline accesses.
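Going back to walk_addr(), the sketch I have in mind for the large-pde case
(untested; largepage_backed kept from the patch, though it would now just
mean "the guest pte is large"): the 'faulting' parameter goes away, and the
host checks (is_largepage_backed(), is_physical_memory()) move into the
actual mapping path, fetch()/mmu_set_spte():

		if (walker->level == PT_DIRECTORY_LEVEL
		    && (pte & PT_PAGE_SIZE_MASK)
		    && (PTTYPE == 64 || is_pse(vcpu))) {
			/* record only guest state: the guest pte is large */
			walker->largepage_backed = 1;
			/* keep the low-order bits in gfn */
			walker->gfn = gpte_to_gfn_pde(pte);
			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
			if (PTTYPE == 32 && is_cpuid_PSE36())
				walker->gfn += pse36_gfn_delta(pte);
			break;
		}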
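To make the combined array concrete (the kvm_lpage_info/lpage_info names
are just illustrative): kvm_memory_slot carries a single pointer in place
of rmap_pde and largepage, and the allocation in __kvm_set_memory_region()
(quoted below) sizes by 2MB frames and uses vmalloc().  Untested sketch:

struct kvm_lpage_info {
	unsigned long rmap_pde;	/* reverse map head for the shadow pde */
	int write_count;	/* nonzero: frame has write-protected pages */
};

	/* in __kvm_set_memory_region(), replacing both allocations below */
	if (npages && !new.lpage_info) {
		int largepages = npages / (HPAGE_SIZE/PAGE_SIZE);

		if (npages % (HPAGE_SIZE/PAGE_SIZE))
			largepages++;
		new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));
		if (!new.lpage_info)
			goto out_free;
		memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));
	}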
> 
> 
>  void kvm_free_physmem(struct kvm *kvm)
> @@ -300,6 +308,28 @@ int __kvm_set_memory_region(struct kvm *
> 		new.user_alloc = user_alloc;
> 		new.userspace_addr = mem->userspace_addr;
> 	}
> +	if (npages && !new.rmap_pde) {
> +		int largepages = npages / (HPAGE_SIZE/PAGE_SIZE);
> +		if (npages % (HPAGE_SIZE/PAGE_SIZE))
> +			largepages++;
> +		new.rmap_pde = vmalloc(largepages * sizeof(struct page *));
> +
> +		if (!new.rmap_pde)
> +			goto out_free;
> +
> +		memset(new.rmap_pde, 0, largepages * sizeof(struct page *));
> +	}
> +	if (npages && !new.largepage) {
> +		int largepages = npages / (HPAGE_SIZE/PAGE_SIZE);
> +		if (npages % (HPAGE_SIZE/PAGE_SIZE))
> +			largepages++;
> +		new.largepage = kmalloc(largepages * sizeof(int), GFP_KERNEL);
> +

vmalloc() here too, for very large guests.

Please test the changes with user/test/x86/access.flat, with both normal
and large pages backing.

-- 
error compiling committee.c: too many arguments to function