Marcelo Tosatti wrote:
> Ok, how does the following look. Still need to plug in large page
> creation in the nonpaging case, but this should be enough for comments.
>   

Most of the comments are cosmetic, but a couple have some meat.

>  
> +#define HPAGE_ALIGN_OFFSET(x) ((((x)+HPAGE_SIZE-1)&HPAGE_MASK) - (x))
>   

A function please.
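
Something along these lines, perhaps (untested; the name is just
illustrative):

static inline unsigned long hpage_align_offset(unsigned long addr)
{
	return ((addr + HPAGE_SIZE - 1) & HPAGE_MASK) - addr;
}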

> +/*
> + * Return the offset inside the memslot largepage integer map for a given
> + * gfn, handling slots that are not large page aligned.
> + */
> +int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
> +{
> +     unsigned long long idx;
> +     unsigned long memslot_align;
> +
> +     memslot_align = HPAGE_ALIGN_OFFSET(slot->base_gfn << PAGE_SHIFT);
> +     idx = ((gfn - slot->base_gfn) << PAGE_SHIFT) + memslot_align;
> +     idx /= HPAGE_SIZE;
>   

This can probably be reshuffled to avoid the long long.  The compiler may 
want to call 64-bit division helpers on i386...

We also need our own HPAGE_SIZE since kvm uses pae on non-pae hosts.
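
Untested sketch of the whole helper: stay in page-frame units so the
divisions are by a compile-time power of two and i386 needs no 64-bit
division helper (assuming the function ends by returning
&slot->largepage[idx], as the rest of the patch suggests):

static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
{
	unsigned long idx;

	/*
	 * Index of the large page containing gfn, relative to the large
	 * page containing the slot base; unaligned slot bases come out
	 * the same as with the byte-offset computation above.
	 */
	idx = (gfn / (HPAGE_SIZE / PAGE_SIZE)) -
	      (slot->base_gfn / (HPAGE_SIZE / PAGE_SIZE));
	return &slot->largepage[idx];
}
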
> +
> +static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
> +{
> +     struct kvm_memory_slot *slot;
> +
> +     slot = gfn_to_memslot(kvm, gfn);
> +     if (slot) {
> +             int *largepage_idx;
> +             int end_gfn;
> +
> +             largepage_idx = slot_largepage_idx(gfn, slot);
> +             /* check if the largepage crosses a memslot */
> +             end_gfn = slot->base_gfn + slot->npages;
> +             if (gfn + (HPAGE_SIZE/PAGE_SIZE) >= end_gfn)
> +                     return 1;
> +             else
> +                     return *largepage_idx;
>   

We might initialize the boundary largepage entries to 1 instead of zero and 
so avoid this check.
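
I.e., something like this in the block that allocates new.largepage
(sketch):

	/*
	 * If the slot doesn't begin/end on a large page boundary, mark
	 * the edge entries up front so the fault path never has to
	 * range-check against the end of the slot.
	 */
	if (new.base_gfn % (HPAGE_SIZE / PAGE_SIZE))
		new.largepage[0] = 1;
	if ((new.base_gfn + npages) % (HPAGE_SIZE / PAGE_SIZE))
		new.largepage[largepages - 1] = 1;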

> +static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
> +{
> +     if (has_wrprotected_page(vcpu->kvm, large_gfn))
> +             return 0;
> +
> +     if (!host_largepage_backed(vcpu->kvm, large_gfn))
> +             return 0;
> +
> +     if ((large_gfn << PAGE_SHIFT) & (HPAGE_SIZE-1))
> +             return 0;
>   

I'd drop this check and simply ignore the least significant 9 bits of 
large_gfn.

> +
> +     /* guest has 4M pages */
> +     if (!is_pae(vcpu))
> +             return 0;
>   

We can drop this check too.  If we treat a 4MB page as two contiguous 
2MB pages, and don't zero out the large_gfn least significant bits, most 
of the code needn't know about it at all.

> @@ -362,6 +469,20 @@ static unsigned long *gfn_to_rmap(struct
>       return &slot->rmap[gfn - slot->base_gfn];
>  }
>  /*
>   * Reverse mapping data structures:
>   *
> @@ -371,7 +492,7 @@ static unsigned long *gfn_to_rmap(struct
>   * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
>   * containing more mappings.
>   */
> -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
> +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int hpage)
>  {
>       struct kvm_mmu_page *sp;
>       struct kvm_rmap_desc *desc;
> @@ -383,7 +504,10 @@ static void rmap_add(struct kvm_vcpu *vc
>       gfn = unalias_gfn(vcpu->kvm, gfn);
>       sp = page_header(__pa(spte));
>       sp->gfns[spte - sp->spt] = gfn;
> -     rmapp = gfn_to_rmap(vcpu->kvm, gfn);
> +     if (!hpage)
> +             rmapp = gfn_to_rmap(vcpu->kvm, gfn);
> +     else
> +             rmapp = gfn_to_rmap_pde(vcpu->kvm, gfn);
>   

This bit can go into a function.

>       if (!*rmapp) {
>               rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
>               *rmapp = (unsigned long)spte;
> @@ -449,7 +573,10 @@ static void rmap_remove(struct kvm *kvm,
>               kvm_release_page_dirty(page);
>       else
>               kvm_release_page_clean(page);
> -     rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
> +     if (is_large_pte(*spte))
> +             rmapp = gfn_to_rmap_pde(kvm, sp->gfns[spte - sp->spt]);
> +     else
> +             rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
>   

As it is reused here.
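
Roughly (sketch, helper name made up):

static unsigned long *gfn_to_rmap_any(struct kvm *kvm, gfn_t gfn, int lpage)
{
	/* pick the 4k or the pde rmap array for this gfn */
	if (lpage)
		return gfn_to_rmap_pde(kvm, gfn);
	return gfn_to_rmap(kvm, gfn);
}

rmap_add() would pass its hpage argument, rmap_remove() would pass
is_large_pte(*spte).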

>  
>  #ifdef MMU_DEBUG
> @@ -749,11 +895,19 @@ static void kvm_mmu_page_unlink_children
>       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
>               ent = pt[i];
>  
> +             if (is_shadow_present_pte(pt[i]) && is_large_pte(pt[i])) {
> +                     if (is_writeble_pte(pt[i]))
> +                             --kvm->stat.lpages;
> +                     rmap_remove(kvm, &pt[i]);
> +             }
> +
>   

pt[i] == ent here.  You can also move this code to be the 'else' part of 
the if () you introduce below.
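
Something like this (untested; note rmap_remove() needs to see the old
spte, so the clear moves to the end of the loop body):

		ent = pt[i];
		if (is_shadow_present_pte(ent)) {
			if (!is_large_pte(ent)) {
				ent &= PT64_BASE_ADDR_MASK;
				mmu_page_remove_parent_pte(page_header(ent),
							   &pt[i]);
			} else {
				/* large spte: no child page table to unlink */
				if (is_writeble_pte(ent))
					--kvm->stat.lpages;
				rmap_remove(kvm, &pt[i]);
			}
		}
		pt[i] = shadow_trap_nonpresent_pte;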

>               pt[i] = shadow_trap_nonpresent_pte;
>               if (!is_shadow_present_pte(ent))
>                       continue;
> -             ent &= PT64_BASE_ADDR_MASK;
> -             mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
> +             if (!is_large_pte(ent)) {
> +                     ent &= PT64_BASE_ADDR_MASK;
> +                     mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
> +             }
>       }
>       kvm_flush_remote_tlbs(kvm);
>  }
>   

> @@ -886,12 +1042,21 @@ struct page *gva_to_page(struct kvm_vcpu
>  static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
>                        unsigned pt_access, unsigned pte_access,
>                        int user_fault, int write_fault, int dirty,
> -                      int *ptwrite, gfn_t gfn, struct page *page)
> +                      int *ptwrite, int largepage, gfn_t gfn,
> +                      struct page *page)
>  {
>       u64 spte;
>       int was_rmapped = is_rmap_pte(*shadow_pte);
>       int was_writeble = is_writeble_pte(*shadow_pte);
>  
> +     /*
> +      * If its a largepage mapping, there could be a previous
> +      * pointer to a PTE page hanging there, which will falsely
> +      * set was_rmapped.
> +      */
> +     if (largepage)
> +             was_rmapped = is_large_pte(*shadow_pte);
> +
>   

But that pte page will have its parent_pte chain pointing to shadow_pte, 
no?  Either this can't happen, or we need to unlink that pte page first.

> @@ -951,10 +1119,17 @@ unshadowed:
>               mark_page_dirty(vcpu->kvm, gfn);
>  
>       pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
> +     pgprintk("instantiating %s PTE (%s) at %d (%llx)\n",
> +              (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
> +              (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte);
>       set_shadow_pte(shadow_pte, spte);
> +     if (!was_rmapped && (spte & (PT_PAGE_SIZE_MASK|PT_WRITABLE_MASK))
> +             == (PT_PAGE_SIZE_MASK|PT_WRITABLE_MASK))
> +             ++vcpu->kvm->stat.lpages;
> +
>   

Why do you only account for writable large pages?

> Index: linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
> ===================================================================
> --- linux-2.6-x86-kvm.orig/arch/x86/kvm/paging_tmpl.h
> +++ linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
> @@ -71,6 +71,7 @@ struct guest_walker {
>       unsigned pte_access;
>       gfn_t gfn;
>       u32 error_code;
> +     int largepage_backed;
>  };
>  
>  static gfn_t gpte_to_gfn(pt_element_t gpte)
> @@ -120,7 +121,8 @@ static unsigned FNAME(gpte_access)(struc
>   */
>  static int FNAME(walk_addr)(struct guest_walker *walker,
>                           struct kvm_vcpu *vcpu, gva_t addr,
> -                         int write_fault, int user_fault, int fetch_fault)
> +                         int write_fault, int user_fault, int fetch_fault,
> +                         int faulting)
>  {
>       pt_element_t pte;
>       gfn_t table_gfn;
> @@ -130,6 +132,7 @@ static int FNAME(walk_addr)(struct guest
>       pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
>  walk:
>       walker->level = vcpu->arch.mmu.root_level;
> +     walker->largepage_backed = 0;
>       pte = vcpu->arch.cr3;
>  #if PTTYPE == 64
>       if (!is_long_mode(vcpu)) {
> @@ -192,10 +195,19 @@ walk:
>               if (walker->level == PT_DIRECTORY_LEVEL
>                   && (pte & PT_PAGE_SIZE_MASK)
>                   && (PTTYPE == 64 || is_pse(vcpu))) {
> -                     walker->gfn = gpte_to_gfn_pde(pte);
> +                     gfn_t gfn = gpte_to_gfn_pde(pte);
> +                     walker->gfn = gfn;
> +
>                       walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
>                       if (PTTYPE == 32 && is_cpuid_PSE36())
>                               walker->gfn += pse36_gfn_delta(pte);
> +
> +                     if (faulting
> +                         && is_largepage_backed(vcpu, gfn)
> +                         && is_physical_memory(vcpu->kvm, walker->gfn)) {
> +                             walker->largepage_backed = 1;
> +                             walker->gfn = gfn;
> +                     }
>   

I don't like this bit.  So far the walker has been independent of the 
host state, depending only on guest data.  We can set 
largepage_backed unconditionally on a large guest pte, leave gfn 
containing the low-order bits, and leave the host checks for the actual 
mapping logic.
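
Sketch:

		if (walker->level == PT_DIRECTORY_LEVEL
		    && (pte & PT_PAGE_SIZE_MASK)
		    && (PTTYPE == 64 || is_pse(vcpu))) {
			walker->gfn = gpte_to_gfn_pde(pte);
			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
			if (PTTYPE == 32 && is_cpuid_PSE36())
				walker->gfn += pse36_gfn_delta(pte);
			/*
			 * The guest pte is large; whether the host can back
			 * it with a large page is decided later, in the
			 * mapping path.
			 */
			walker->largepage_backed = 1;
		}

(The field would then probably be better named just 'largepage'.)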

>  
>  /*
> @@ -299,6 +313,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>               shadow_ent = ((u64 *)__va(shadow_addr)) + index;
>               if (level == PT_PAGE_TABLE_LEVEL)
>                       break;
> +             if (level == PT_DIRECTORY_LEVEL && walker->largepage_backed)
> +                     break;
> +
>   

(here)

> @@ -395,6 +412,13 @@ static int FNAME(page_fault)(struct kvm_
>  
>       page = gfn_to_page(vcpu->kvm, walker.gfn);
>  
> +     /* shortcut non-RAM accesses to avoid walking over a 2MB PMD entry */
> +     if (is_error_page(page)) {
> +             kvm_release_page_clean(page);
> +             up_read(&current->mm->mmap_sem);
> +             return 1;
> +     }
> +
>   

This bit is already in kvm.git?

>  
>  struct kvm_vcpu_stat {
> Index: linux-2.6-x86-kvm/include/linux/kvm_host.h
> ===================================================================
> --- linux-2.6-x86-kvm.orig/include/linux/kvm_host.h
> +++ linux-2.6-x86-kvm/include/linux/kvm_host.h
> @@ -99,7 +99,9 @@ struct kvm_memory_slot {
>       unsigned long npages;
>       unsigned long flags;
>       unsigned long *rmap;
> +     unsigned long *rmap_pde;
>       unsigned long *dirty_bitmap;
> +     int *largepage;
>   

We can combine rmap_pde and largepage into an array of struct { int 
write_count; unsigned long rmap_pde; } and save some cacheline accesses.
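
Sketch (names made up):

struct kvm_lpage_info {
	unsigned long rmap_pde;
	int write_count;
};

with a single

	struct kvm_lpage_info *lpage_info;

in struct kvm_memory_slot replacing both rmap_pde and largepage, so a
fault touches one cacheline instead of two.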

>       
>  
>  void kvm_free_physmem(struct kvm *kvm)
> @@ -300,6 +308,28 @@ int __kvm_set_memory_region(struct kvm *
>               new.user_alloc = user_alloc;
>               new.userspace_addr = mem->userspace_addr;
>       }
> +     if (npages && !new.rmap_pde) {
> +             int largepages = npages / (HPAGE_SIZE/PAGE_SIZE);
> +             if (npages % (HPAGE_SIZE/PAGE_SIZE))
> +                     largepages++;
> +             new.rmap_pde = vmalloc(largepages * sizeof(struct page *));
> +
> +             if (!new.rmap_pde)
> +                     goto out_free;
> +
> +             memset(new.rmap_pde, 0, largepages * sizeof(struct page *));
> +     }
> +     if (npages && !new.largepage) {
> +             int largepages = npages / (HPAGE_SIZE/PAGE_SIZE);
> +             if (npages % (HPAGE_SIZE/PAGE_SIZE))
> +                     largepages++;
> +             new.largepage = kmalloc(largepages * sizeof(int), GFP_KERNEL);
> +
> +     

vmalloc() here too, for very large guests.
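
I.e. (sketch):

	if (npages && !new.largepage) {
		int largepages = npages / (HPAGE_SIZE / PAGE_SIZE);
		if (npages % (HPAGE_SIZE / PAGE_SIZE))
			largepages++;
		new.largepage = vmalloc(largepages * sizeof(int));
		if (!new.largepage)
			goto out_free;
		memset(new.largepage, 0, largepages * sizeof(int));
	}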

Please test the changes with user/test/x86/access.flat, with both normal 
and large page backing.

-- 
error compiling committee.c: too many arguments to function

