On Sun, Dec 23, 2007 at 01:01:25PM +0200, Avi Kivity wrote:
> Andrew Morton wrote:
> >On Sun, 23 Dec 2007 12:35:30 +0200 Avi Kivity <[EMAIL PROTECTED]> wrote:
> >
> >>Andrew Morton wrote:
> >>
> >>>On Sun, 23 Dec 2007 10:59:22 +0200 Avi Kivity <[EMAIL PROTECTED]> wrote:
> >>>
> >>>
> >>>>Avi Kivity wrote:
> >>>>
> >>>>>Avi Kivity wrote:
> >>>>>
> >>>>>>Exactly. But it is better to be explicit about it and pass the page
> >>>>>>directly like you did before. I hate to make you go back and forth,
> >>>>>>but I did not understand the issue completely before.
> >>>>>>
> >>>>>btw, the call to gfn_to_page() can happen in page_fault() instead of
> >>>>>walk_addr(); that will reduce the amount of error handling, and will
> >>>>>simplify the callers to walk_addr() that don't need the page.
> >>>>>
> >>>>Note further that all this doesn't obviate the need for follow_page()
> >>>>(or get_user_pages_inatomic()); we still need something in update_pte()
> >>>>for the demand paging case.
> >>>>
> >>>Please review -mm's mm/pagewalk.c for suitability.
> >>>
> >>>If it is unsuitable but repairable then please cc Matt Mackall
> >>><[EMAIL PROTECTED]> on the review.
> >>>
> >>The "no locks are taken" comment is very worrying. We need accurate
> >>results.
> >>
> >
> >take down_read(mm->mmap_sem) before calling it.
> >
> >You have to do that anyway for its results to be meaningful in the caller.
> >Ditto get_user_pages().
> >
>
> Yes, but what about the page table locks?
>
> follow_page() is much more thorough.
It can acquire the page table lock in the callback handler. But then
vm_normal_page() must also be exported.

Are you guys OK with this?
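To make the locking contract concrete, here is a rough sketch of the
intended use (hypothetical caller; walk_page_range() itself takes no
locks, so the caller holds mmap_sem for read around the walk, and the
pte lock is taken inside the .pmd_entry callback, as kvm_pte_range()
does in the second patch below):

	struct page *page = NULL;
	int err;

	/* caller has already resolved the gfn to the host va 'addr' */
	down_read(&current->mm->mmap_sem);
	/* .pmd_entry takes the pte lock itself via pte_offset_map_lock() */
	err = walk_page_range(current->mm, addr, addr + PAGE_SIZE,
			      &kvm_mm_walk, &page);
	up_read(&current->mm->mmap_sem);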
Modular KVM needs walk_page_range(), and also vm_normal_page(), for use
in the pagewalk callback.
Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>
Index: kvm.quilt/mm/pagewalk.c
===================================================================
--- kvm.quilt.orig/mm/pagewalk.c
+++ kvm.quilt/mm/pagewalk.c
@@ -1,6 +1,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
+#include <linux/module.h>
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
const struct mm_walk *walk, void *private)
@@ -129,3 +130,4 @@ int walk_page_range(const struct mm_stru
return err;
}
+EXPORT_SYMBOL(walk_page_range);
Index: kvm.quilt/mm/memory.c
===================================================================
--- kvm.quilt.orig/mm/memory.c
+++ kvm.quilt/mm/memory.c
@@ -412,6 +412,7 @@ struct page *vm_normal_page(struct vm_ar
*/
return pfn_to_page(pfn);
}
+EXPORT_SYMBOL(vm_normal_page);
/*
* copy one vm_area from one task to the other. Assumes the page tables
[PATCH] KVM: add kvm_find_page()
In preparation for an mmu spinlock, avoid schedules in mmu_set_spte()
by using walk_page_range() instead of get_user_pages().
The fault-handling work done by get_user_pages() now happens outside
the lock.
Signed-off-by: Marcelo Tosatti <[EMAIL PROTECTED]>
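(Not part of the patch, just a sketch of the intent: the sleeping
gfn_to_page() call moves out of mmu_set_spte() and into its callers, so
that the planned mmu spinlock, called mmu_lock below purely for
illustration, never encloses anything that can schedule:)

	page = gfn_to_page(vcpu->kvm, gfn);	/* may fault and sleep */
	spin_lock(&vcpu->kvm->mmu_lock);	/* future mmu spinlock */
	mmu_set_spte(vcpu, shadow_pte, pt_access, pte_access, user_fault,
		     write_fault, dirty, ptwrite, gfn, page);
	spin_unlock(&vcpu->kvm->mmu_lock);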
Index: kvm.quilt/arch/x86/kvm/mmu.c
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/mmu.c
+++ kvm.quilt/arch/x86/kvm/mmu.c
@@ -882,14 +882,18 @@ struct page *gva_to_page(struct kvm_vcpu
return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
}
+/*
+ * mmu_set_spte() must be called with an additional reference held on
+ * page; it will drop that reference when necessary, so callers need
+ * not worry about releasing it.
+ */
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, gfn_t gfn)
+ int *ptwrite, gfn_t gfn, struct page *page)
{
u64 spte;
int was_rmapped = is_rmap_pte(*shadow_pte);
- struct page *page;
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
@@ -907,8 +911,6 @@ static void mmu_set_spte(struct kvm_vcpu
if (!(pte_access & ACC_EXEC_MASK))
spte |= PT64_NX_MASK;
- page = gfn_to_page(vcpu->kvm, gfn);
-
spte |= PT_PRESENT_MASK;
if (pte_access & ACC_USER_MASK)
spte |= PT_USER_MASK;
@@ -983,8 +985,10 @@ static int nonpaging_map(struct kvm_vcpu
table = __va(table_addr);
if (level == 1) {
+ struct page *page;
+ page = gfn_to_page(vcpu->kvm, gfn);
mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, gfn);
+ 0, write, 1, &pt_write, gfn, page);
return pt_write || is_io_pte(table[index]);
}
Index: kvm.quilt/include/linux/kvm_host.h
===================================================================
--- kvm.quilt.orig/include/linux/kvm_host.h
+++ kvm.quilt/include/linux/kvm_host.h
@@ -163,6 +163,7 @@ int kvm_arch_set_memory_region(struct kv
int user_alloc);
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+struct page *kvm_find_page(struct kvm *kvm, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
void kvm_release_page_dirty(struct page *page);
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
Index: kvm.quilt/virt/kvm/kvm_main.c
===================================================================
--- kvm.quilt.orig/virt/kvm/kvm_main.c
+++ kvm.quilt/virt/kvm/kvm_main.c
@@ -453,6 +453,54 @@ static unsigned long gfn_to_hva(struct k
return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
+static int kvm_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ void *private)
+{
+ struct page **page = private;
+ struct vm_area_struct *vma;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ int err = -EFAULT;
+
+ vma = find_vma(current->mm, addr);
+ if (!vma)
+ return err;
+
+ ptep = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
+ pte = *ptep;
+ if (!pte_present(pte))
+ goto unlock;
+
+ *page = vm_normal_page(vma, addr, pte);
+ if (!*page)
+ goto unlock;
+
+ get_page(*page);
+ err = 0;
+unlock:
+ pte_unmap_unlock(ptep, ptl);
+ return err;
+}
+
+static struct mm_walk kvm_mm_walk = { .pmd_entry = kvm_pte_range };
+
+struct page *kvm_find_page(struct kvm *kvm, gfn_t gfn)
+{
+ unsigned long addr;
+ struct page *page = NULL;
+
+ addr = gfn_to_hva(kvm, gfn);
+ /* MMIO access */
+ if (kvm_is_error_hva(addr)) {
+ get_page(bad_page);
+ return bad_page;
+ }
+
+ walk_page_range(current->mm, addr, addr+PAGE_SIZE, &kvm_mm_walk,
+ &page);
+ return page;
+}
+
/*
* Requires current->mm->mmap_sem to be held
*/
Index: kvm.quilt/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.quilt.orig/arch/x86/kvm/paging_tmpl.h
+++ kvm.quilt/arch/x86/kvm/paging_tmpl.h
@@ -67,6 +67,7 @@ struct guest_walker {
gfn_t table_gfn[PT_MAX_FULL_LEVELS];
pt_element_t ptes[PT_MAX_FULL_LEVELS];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+ struct page *page;
unsigned pt_access;
unsigned pte_access;
gfn_t gfn;
@@ -203,14 +204,18 @@ walk:
--walker->level;
}
+ walker->page = gfn_to_page(vcpu->kvm, walker->gfn);
+
if (write_fault && !is_dirty_pte(pte)) {
bool ret;
mark_page_dirty(vcpu->kvm, table_gfn);
ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
pte|PT_DIRTY_MASK);
- if (ret)
+ if (ret) {
+ kvm_release_page_clean(walker->page);
goto walk;
+ }
pte |= PT_DIRTY_MASK;
mutex_lock(&vcpu->kvm->lock);
kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
@@ -241,12 +246,13 @@ err:
return 0;
}
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *spage,
u64 *spte, const void *pte, int bytes,
int offset_in_pte)
{
pt_element_t gpte;
unsigned pte_access;
+ struct page *page;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
@@ -256,10 +262,15 @@ static void FNAME(update_pte)(struct kvm
}
if (bytes < sizeof(pt_element_t))
return;
+
+ page = kvm_find_page(vcpu->kvm, gpte_to_gfn(gpte));
+ if (!page)
+ return;
+
pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
- pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
- mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte));
+ pte_access = spage->role.access & FNAME(gpte_access)(vcpu, gpte);
+ mmu_set_spte(vcpu, spte, spage->role.access, pte_access, 0, 0,
+ gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), page);
}
/*
@@ -323,8 +334,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
r = kvm_read_guest_atomic(vcpu->kvm,
walker->pte_gpa[level - 2],
&curr_pte, sizeof(curr_pte));
- if (r || curr_pte != walker->ptes[level - 2])
+ if (r || curr_pte != walker->ptes[level - 2]) {
+ kvm_release_page_clean(walker->page);
return NULL;
+ }
}
shadow_addr = __pa(shadow_page->spt);
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
@@ -335,7 +348,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
user_fault, write_fault,
walker->ptes[walker->level-1] & PT_DIRTY_MASK,
- ptwrite, walker->gfn);
+ ptwrite, walker->gfn, walker->page);
return shadow_ent;
}
@@ -425,6 +438,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
+ kvm_release_page_clean(walker.page);
}
return gpa;