Marcelo Tosatti wrote:
> Ok, how does the following look. Still need to plug in large page
> creation in the nonpaging case, but this should be enough for comments.
>
Most of the comments are cosmetic, but a couple have some meat.
>
> +#define HPAGE_ALIGN_OFFSET(x) ((((x)+HPAGE_SIZE-1)&HPAGE_MASK) - (x))
>
A function, please.
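Something along these lines (a direct translation of the macro; the
name is mine):

	static inline unsigned long hpage_align_offset(unsigned long addr)
	{
		/* distance from addr up to the next hugepage boundary */
		return ((addr + HPAGE_SIZE - 1) & HPAGE_MASK) - addr;
	}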
> +/*
> + * Return the offset inside the memslot largepage integer map for a given
> + * gfn, handling slots that are not large page aligned.
> + */
> +int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
> +{
> + unsigned long long idx;
> + unsigned long memslot_align;
> +
> + memslot_align = HPAGE_ALIGN_OFFSET(slot->base_gfn << PAGE_SHIFT);
> + idx = ((gfn - slot->base_gfn) << PAGE_SHIFT) + memslot_align;
> + idx /= HPAGE_SIZE;
>
This can probably be reshuffled to avoid the unsigned long long; on i386
the compiler may want to call libgcc helpers for the 64-bit arithmetic.
We also need our own HPAGE_SIZE, since kvm uses pae page tables even on
non-pae hosts (where the host's HPAGE_SIZE is 4MB, not 2MB).
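One possible reshuffle, as an untested sketch: do the arithmetic in gfn
units, so the division is by a compile-time power of two and reduces to
a shift. KVM_PAGES_PER_HPAGE is an assumed name for a kvm-private
KVM_HPAGE_SIZE / PAGE_SIZE:

	static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
	{
		unsigned long idx;

		/* index of gfn's hugepage relative to the hugepage
		 * containing the slot base; handles unaligned base_gfn
		 * without going through byte addresses */
		idx = (gfn / KVM_PAGES_PER_HPAGE)
		      - (slot->base_gfn / KVM_PAGES_PER_HPAGE);
		return &slot->largepage[idx];
	}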
> +
> +static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
> +{
> + struct kvm_memory_slot *slot;
> +
> + slot = gfn_to_memslot(kvm, gfn);
> + if (slot) {
> + int *largepage_idx;
> + int end_gfn;
> +
> + largepage_idx = slot_largepage_idx(gfn, slot);
> + /* check if the largepage crosses a memslot */
> + end_gfn = slot->base_gfn + slot->npages;
> + if (gfn + (HPAGE_SIZE/PAGE_SIZE) >= end_gfn)
> + return 1;
> + else
> + return *largepage_idx;
>
We might initialize the boundary entries to 1 instead of zero when the
slot is created, and so avoid this check on every fault.
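Roughly like this, at slot creation time (a sketch against the patch's
'largepage' array; the helper name is mine):

	static void mark_boundary_hpages(struct kvm_memory_slot *slot,
					 int largepages)
	{
		unsigned long pages_per_hpage = HPAGE_SIZE / PAGE_SIZE;

		/* a hugepage straddling the slot start or end must never
		 * be mapped large, so pre-mark it write-protected */
		if (slot->base_gfn % pages_per_hpage)
			slot->largepage[0] = 1;
		if ((slot->base_gfn + slot->npages) % pages_per_hpage)
			slot->largepage[largepages - 1] = 1;
	}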
> +static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
> +{
> + if (has_wrprotected_page(vcpu->kvm, large_gfn))
> + return 0;
> +
> + if (!host_largepage_backed(vcpu->kvm, large_gfn))
> + return 0;
> +
> + if ((large_gfn << PAGE_SHIFT) & (HPAGE_SIZE-1))
> + return 0;
>
I'd drop this check and simply ignore the least significant 9 bits of
large_gfn.
> +
> + /* guest has 4M pages */
> + if (!is_pae(vcpu))
> + return 0;
>
We can drop this check too. If we treat a 4MB page as two contiguous
2MB pages, and don't zero out the large_gfn least significant bits, most
of the code needn't know about it at all.
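I.e., something like this (a sketch; the two remaining checks keep the
patch's names):

	static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
	{
		/* ignore the sub-hugepage bits rather than failing on them */
		large_gfn &= ~((gfn_t)(HPAGE_SIZE / PAGE_SIZE) - 1);

		if (has_wrprotected_page(vcpu->kvm, large_gfn))
			return 0;
		if (!host_largepage_backed(vcpu->kvm, large_gfn))
			return 0;
		return 1;
	}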
> @@ -362,6 +469,20 @@ static unsigned long *gfn_to_rmap(struct
> return &slot->rmap[gfn - slot->base_gfn];
> }
> /*
> * Reverse mapping data structures:
> *
> @@ -371,7 +492,7 @@ static unsigned long *gfn_to_rmap(struct
> * If rmapp bit zero is one, (then rmap & ~1) points to a struct
> kvm_rmap_desc
> * containing more mappings.
> */
> -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
> +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int hpage)
> {
> struct kvm_mmu_page *sp;
> struct kvm_rmap_desc *desc;
> @@ -383,7 +504,10 @@ static void rmap_add(struct kvm_vcpu *vc
> gfn = unalias_gfn(vcpu->kvm, gfn);
> sp = page_header(__pa(spte));
> sp->gfns[spte - sp->spt] = gfn;
> - rmapp = gfn_to_rmap(vcpu->kvm, gfn);
> + if (!hpage)
> + rmapp = gfn_to_rmap(vcpu->kvm, gfn);
> + else
> + rmapp = gfn_to_rmap_pde(vcpu->kvm, gfn);
>
This bit can go into a helper function,
> if (!*rmapp) {
> rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
> *rmapp = (unsigned long)spte;
> @@ -449,7 +573,10 @@ static void rmap_remove(struct kvm *kvm,
> kvm_release_page_dirty(page);
> else
> kvm_release_page_clean(page);
> - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
> + if (is_large_pte(*spte))
> + rmapp = gfn_to_rmap_pde(kvm, sp->gfns[spte - sp->spt]);
> + else
> + rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
>
As it is reused here.
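E.g. (the helper name is mine):

	static unsigned long *gfn_to_rmap_level(struct kvm *kvm, gfn_t gfn,
						int hpage)
	{
		/* pick the 4k or the 2M rmap chain for this gfn */
		if (hpage)
			return gfn_to_rmap_pde(kvm, gfn);
		return gfn_to_rmap(kvm, gfn);
	}

rmap_remove() would call it with is_large_pte(*spte) as the third
argument.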
>
> #ifdef MMU_DEBUG
> @@ -749,11 +895,19 @@ static void kvm_mmu_page_unlink_children
> for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
> ent = pt[i];
>
> + if (is_shadow_present_pte(pt[i]) && is_large_pte(pt[i])) {
> + if (is_writeble_pte(pt[i]))
> + --kvm->stat.lpages;
> + rmap_remove(kvm, &pt[i]);
> + }
> +
>
pt[i] == ent here, so you can use ent directly. You can also move this
code into the 'else' part of the if () you introduce below, roughly as
in the sketch after this hunk.
> pt[i] = shadow_trap_nonpresent_pte;
> if (!is_shadow_present_pte(ent))
> continue;
> - ent &= PT64_BASE_ADDR_MASK;
> - mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
> + if (!is_large_pte(ent)) {
> + ent &= PT64_BASE_ADDR_MASK;
> + mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
> + }
> }
> kvm_flush_remote_tlbs(kvm);
> }
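Roughly, the loop body would become (untested sketch; note that
rmap_remove() must still run before the pte is cleared, since it looks
at *spte):

	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
		ent = pt[i];

		if (is_shadow_present_pte(ent)) {
			if (!is_large_pte(ent)) {
				mmu_page_remove_parent_pte(
					page_header(ent & PT64_BASE_ADDR_MASK),
					&pt[i]);
			} else {
				if (is_writeble_pte(ent))
					--kvm->stat.lpages;
				rmap_remove(kvm, &pt[i]);
			}
		}
		pt[i] = shadow_trap_nonpresent_pte;
	}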
>
> @@ -886,12 +1042,21 @@ struct page *gva_to_page(struct kvm_vcpu
> static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
> unsigned pt_access, unsigned pte_access,
> int user_fault, int write_fault, int dirty,
> - int *ptwrite, gfn_t gfn, struct page *page)
> + int *ptwrite, int largepage, gfn_t gfn,
> + struct page *page)
> {
> u64 spte;
> int was_rmapped = is_rmap_pte(*shadow_pte);
> int was_writeble = is_writeble_pte(*shadow_pte);
>
> + /*
> + * If its a largepage mapping, there could be a previous
> + * pointer to a PTE page hanging there, which will falsely
> + * set was_rmapped.
> + */
> + if (largepage)
> + was_rmapped = is_large_pte(*shadow_pte);
> +
>
But that pte page will have its parent_pte chain pointing to shadow_pte,
no? Either this can't happen, or we need to unlink that pte page first.
> @@ -951,10 +1119,17 @@ unshadowed:
> mark_page_dirty(vcpu->kvm, gfn);
>
> pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
> + pgprintk("instantiating %s PTE (%s) at %d (%llx)\n",
> + (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
> + (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte);
> set_shadow_pte(shadow_pte, spte);
> + if (!was_rmapped && (spte & (PT_PAGE_SIZE_MASK|PT_WRITABLE_MASK))
> + == (PT_PAGE_SIZE_MASK|PT_WRITABLE_MASK))
> + ++vcpu->kvm->stat.lpages;
> +
>
Why do you only account for writable large pages?
> Index: linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
> ===================================================================
> --- linux-2.6-x86-kvm.orig/arch/x86/kvm/paging_tmpl.h
> +++ linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
> @@ -71,6 +71,7 @@ struct guest_walker {
> unsigned pte_access;
> gfn_t gfn;
> u32 error_code;
> + int largepage_backed;
> };
>
> static gfn_t gpte_to_gfn(pt_element_t gpte)
> @@ -120,7 +121,8 @@ static unsigned FNAME(gpte_access)(struc
> */
> static int FNAME(walk_addr)(struct guest_walker *walker,
> struct kvm_vcpu *vcpu, gva_t addr,
> - int write_fault, int user_fault, int fetch_fault)
> + int write_fault, int user_fault, int fetch_fault,
> + int faulting)
> {
> pt_element_t pte;
> gfn_t table_gfn;
> @@ -130,6 +132,7 @@ static int FNAME(walk_addr)(struct guest
> pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
> walk:
> walker->level = vcpu->arch.mmu.root_level;
> + walker->largepage_backed = 0;
> pte = vcpu->arch.cr3;
> #if PTTYPE == 64
> if (!is_long_mode(vcpu)) {
> @@ -192,10 +195,19 @@ walk:
> if (walker->level == PT_DIRECTORY_LEVEL
> && (pte & PT_PAGE_SIZE_MASK)
> && (PTTYPE == 64 || is_pse(vcpu))) {
> - walker->gfn = gpte_to_gfn_pde(pte);
> + gfn_t gfn = gpte_to_gfn_pde(pte);
> + walker->gfn = gfn;
> +
> walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
> if (PTTYPE == 32 && is_cpuid_PSE36())
> walker->gfn += pse36_gfn_delta(pte);
> +
> + if (faulting
> + && is_largepage_backed(vcpu, gfn)
> + && is_physical_memory(vcpu->kvm, walker->gfn)) {
> + walker->largepage_backed = 1;
> + walker->gfn = gfn;
> + }
>
I don't like this bit. So far the walker has been independent of the
host state, and only depended on guest data. We can set
largepage_backed unconditionally on a large guest pte, leave gfn
containing the low-order bits, and leave the host checks for the actual
mapping logic.
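I.e., something like this ('guest_largepage' is a made-up name for the
guest-only flag):

	if (walker->level == PT_DIRECTORY_LEVEL
	    && (pte & PT_PAGE_SIZE_MASK)
	    && (PTTYPE == 64 || is_pse(vcpu))) {
		walker->gfn = gpte_to_gfn_pde(pte);
		walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
		if (PTTYPE == 32 && is_cpuid_PSE36())
			walker->gfn += pse36_gfn_delta(pte);
		/* record guest state only; is_largepage_backed() and the
		 * physical memory check move to the mapping code */
		walker->guest_largepage = 1;
		break;
	}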
>
> /*
> @@ -299,6 +313,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
> shadow_ent = ((u64 *)__va(shadow_addr)) + index;
> if (level == PT_PAGE_TABLE_LEVEL)
> break;
> + if (level == PT_DIRECTORY_LEVEL && walker->largepage_backed)
> + break;
> +
>
(here, that is; the fetch path is where the host-side checks belong)
> @@ -395,6 +412,13 @@ static int FNAME(page_fault)(struct kvm_
>
> page = gfn_to_page(vcpu->kvm, walker.gfn);
>
> + /* shortcut non-RAM accesses to avoid walking over a 2MB PMD entry */
> + if (is_error_page(page)) {
> + kvm_release_page_clean(page);
> +		up_read(&current->mm->mmap_sem);
> + return 1;
> + }
> +
>
This bit is already in kvm.git?
>
> struct kvm_vcpu_stat {
> Index: linux-2.6-x86-kvm/include/linux/kvm_host.h
> ===================================================================
> --- linux-2.6-x86-kvm.orig/include/linux/kvm_host.h
> +++ linux-2.6-x86-kvm/include/linux/kvm_host.h
> @@ -99,7 +99,9 @@ struct kvm_memory_slot {
> unsigned long npages;
> unsigned long flags;
> unsigned long *rmap;
> + unsigned long *rmap_pde;
> unsigned long *dirty_bitmap;
> + int *largepage;
>
We can combine rmap_pde and largepage into an array of
struct { int write_count; unsigned long rmap_pde; } and save some
cacheline accesses.
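I.e. (struct and field names are mine):

	struct kvm_lpage_info {
		unsigned long rmap_pde;
		int write_count;
	};

and in struct kvm_memory_slot, replacing the two new fields:

	struct kvm_lpage_info *lpage_info;

One allocation, and the fault path touches a single cacheline per
largepage.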
>
>
> void kvm_free_physmem(struct kvm *kvm)
> @@ -300,6 +308,28 @@ int __kvm_set_memory_region(struct kvm *
> new.user_alloc = user_alloc;
> new.userspace_addr = mem->userspace_addr;
> }
> + if (npages && !new.rmap_pde) {
> + int largepages = npages / (HPAGE_SIZE/PAGE_SIZE);
> + if (npages % (HPAGE_SIZE/PAGE_SIZE))
> + largepages++;
> + new.rmap_pde = vmalloc(largepages * sizeof(struct page *));
> +
> + if (!new.rmap_pde)
> + goto out_free;
> +
> + memset(new.rmap_pde, 0, largepages * sizeof(struct page *));
> + }
> + if (npages && !new.largepage) {
> + int largepages = npages / (HPAGE_SIZE/PAGE_SIZE);
> + if (npages % (HPAGE_SIZE/PAGE_SIZE))
> + largepages++;
> + new.largepage = kmalloc(largepages * sizeof(int), GFP_KERNEL);
> +
> +
vmalloc() here too, for very large guests.
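I.e. (sketch, mirroring the rmap_pde block above):

	new.largepage = vmalloc(largepages * sizeof(int));
	if (!new.largepage)
		goto out_free;
	memset(new.largepage, 0, largepages * sizeof(int));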
Please test the changes with user/test/x86/access.flat, with both normal
and large-page backing.
--
error compiling committee.c: too many arguments to function