On Monday 31 August 2009 14:40:29 Avi Kivity wrote:
> On 08/31/2009 03:09 PM, Max Laier wrote:
> >>> As you can see, there is less saw-toothing in the after plot and also
> >>> fewer changes overall (because we don't zap mappings that are still in
> >>> use as often). This is with a limit of 64 shadow pages, to amplify the
> >>> effect, and with vmx/ept.
> >>>
> >>> I realize that the list_move and parent walk are quite expensive and
> >>> that hooking kvm_mmu_alloc_page is only half the story. The update
> >>> should really happen every time a new guest page table is mapped -
> >>> maybe via rmap_add. That would obviously kill performance, though.
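For the record, the naive version of this would be to touch the shadow page
on every new mapping, roughly like the sketch below - the helper name is
made up, page_header() is the existing lookup in mmu.c:

static void sketch_touch_shadow_page(struct kvm *kvm, u64 *spte)
{
	/* Rotate the page containing this spte to the LRU front. */
	struct kvm_mmu_page *sp = page_header(__pa(spte));

	list_move(&sp->link, &kvm->arch.active_mmu_pages);
}

Called from rmap_add, that is one list_move per new mapping, which is why
it is a non-starter.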
> >>>
> >>> Another idea would be to improve the reclaim logic so that it prefers
> >>> zapping "old" PT_PAGE_TABLE_LEVEL pages over directories. I'm not sure
> >>> how to code that up sensibly, either.
> >>>
> >>> As I said, this is proof-of-concept and RFC. So any comments welcome.
> >>> For my use case the proof-of-concept diff seems to do well enough,
> >>> though.
> >>
> >> Given that reclaim is fairly rare, we should try to move the cost
> >> there. So how about this:
> >>
> >> - add an 'accessed' flag to struct kvm_mmu_page
> >> - when reclaiming, try to evict pages that were not recently accessed
> >> (but don't overscan - if you scan many recently accessed pages, evict
> >> some of them anyway)
> >
> > - prefer page table level pages over directory level pages in the face of
> > overscan.
>
> I'm hoping that overscan will only occur when we start to feel memory
> pressure, and that once we do a full scan we'll get accurate recency
> information.
>
> >> - when scanning, update the accessed flag with the accessed bit of all
> >> parent_ptes
> >
> > I might be misunderstanding, but I think it should be the other way
> > 'round, i.e. a page is accessed if any of its children have been
> > accessed.
>
> They're both true, but looking at the parents is much more efficient.
> Note that we need to look at the accessed bit of the parent_ptes, not the
> parent kvm_mmu_pages.
>
> >> - when dropping an spte, update the accessed flag of the kvm_mmu_page it
> >> points to
> >> - when reloading cr3, mark the page as accessed (since it has no
> >> parent_ptes)
> >>
> >> This should introduce some LRU-ness that depends not only on fault
> >> behaviour but also on long-term guest access behaviour (which is
> >> important for long-running processes and kernel pages).
> >
> > I'll try to come up with a patch for this later tonight, unless you
> > already have something in the making. Thanks.
>
> Please do, it's an area that needs attention.
Okay ... I have /something/, but I'm certainly not there yet, as I have to
admit that I don't understand your idea completely. In addition, it seems
that EPT doesn't have an accessed bit :-\ Any ideas for this?
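For reference, here is how I read your scheme - only a sketch, none of the
names below (the accessed flag, the two helpers, the budget parameter)
exist in the tree, and the multimapped case is left out:

/* Sketch of the accessed-flag scheme as I understand it.  A one-bit
 * 'accessed' field would be added to struct kvm_mmu_page; it gets set
 * from the parent sptes' accessed bits and consumed at reclaim time.
 * Only the !multimapped case is shown. */
static void sketch_update_accessed(struct kvm_mmu_page *sp)
{
	u64 *spte = sp->parent_pte;

	if (spte && (*spte & PT_ACCESSED_MASK)) {
		sp->accessed = 1;
		clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
	}
}

/* Pick a victim: evict the first page that was not recently accessed,
 * but give up after 'budget' pages and evict in LRU order anyway, so
 * we never overscan. */
static struct kvm_mmu_page *sketch_pick_victim(struct kvm *kvm, int budget)
{
	struct kvm_mmu_page *sp;

	list_for_each_entry_reverse(sp, &kvm->arch.active_mmu_pages, link) {
		sketch_update_accessed(sp);
		if (!sp->accessed)
			return sp;
		sp->accessed = 0;	/* clear for the next scan */
		if (!--budget)
			break;
	}
	return container_of(kvm->arch.active_mmu_pages.prev,
			    struct kvm_mmu_page, link);
}

With that in place, a cr3 reload would just set root->accessed = 1
directly, and dropping an spte would propagate its accessed bit to the
kvm_mmu_page it points to, as you describe.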
Regardless, testing the attached with EPT, it turns out that not zapping
shadow pages with root_count != 0 already makes a big difference. After
all, we don't really zap these pages anyway, but just mark them invalid
after zapping their children. So this could be a first improvement on its
own.
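If someone wants just that part, it amounts to skipping in-use roots when
picking a victim, along these lines (untested sketch, the helper name is
made up):

/* Untested sketch: plain LRU victim selection, except that shadow
 * pages still referenced as roots (root_count != 0) are skipped -
 * zapping them only marks them invalid after zapping their children
 * anyway. */
static struct kvm_mmu_page *sketch_pick_non_root(struct kvm *kvm)
{
	struct kvm_mmu_page *sp;

	list_for_each_entry_reverse(sp, &kvm->arch.active_mmu_pages, link)
		if (!sp->root_count)
			return sp;

	/* Everything is a root: fall back to the LRU tail. */
	return container_of(kvm->arch.active_mmu_pages.prev,
			    struct kvm_mmu_page, link);
}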
In any case, I clearly don't have the right idea here yet, and I don't
really have time to look into this further right now. My hack is "good
enough"[tm] for my testing ... so if anyone more knowledgeable would like
to continue, it would be much appreciated. Maybe some of this can at least
serve as food for thought. Sorry.
--
/"\ Best regards, | [email protected]
\ / Max Laier | ICQ #67774661
X http://pf4freebsd.love2party.net/ | mla...@efnet
/ \ ASCII Ribbon Campaign | Against HTML Mail and News
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a3f637f..089ad0e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -394,6 +394,7 @@ struct kvm_arch{
 	 * Hash table of struct kvm_mmu_page.
 	 */
 	struct list_head active_mmu_pages;
+	struct kvm_mmu_page *scan_hand;
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
 	int iommu_flags;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f76d086..3715242 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -869,6 +869,8 @@ static int is_empty_shadow_page(u64 *spt)
 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
+	if (kvm->arch.scan_hand == sp)
+		kvm->arch.scan_hand = NULL;
 	list_del(&sp->link);
 	__free_page(virt_to_page(sp->spt));
 	__free_page(virt_to_page(sp->gfns));
@@ -1490,6 +1492,81 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	return ret;
 }
 
+/*
+ * Return 1 if any parent spte of @sp has its accessed bit set (clearing
+ * the bits as we go), 0 if the page looks unused.  Root pages have no
+ * parent sptes, so treat them as active for as long as they are in use.
+ */
+static int kvm_mmu_test_and_clear_pte_active(struct kvm_mmu_page *sp)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	int i, accessed = 0;
+
+	if (!sp->multimapped) {
+		if (!sp->parent_pte)
+			return sp->root_count != 0;
+		if (*sp->parent_pte & PT_ACCESSED_MASK) {
+			clear_bit(PT_ACCESSED_SHIFT,
+				  (unsigned long *)sp->parent_pte);
+			return 1;
+		}
+		return 0;
+	}
+	/* Multimapped: test and clear every parent spte. */
+	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+			if (!pte_chain->parent_ptes[i])
+				break;
+			if (*pte_chain->parent_ptes[i] & PT_ACCESSED_MASK) {
+				clear_bit(PT_ACCESSED_SHIFT,
+					  (unsigned long *)
+					  pte_chain->parent_ptes[i]);
+				accessed++;
+			}
+		}
+	return accessed != 0;
+}
+
+/*
+ * Scan up to a quarter of the active list, oldest entries first, for a
+ * shadow page whose parent sptes show no recent access.  The scan
+ * resumes at scan_hand, clock-style, so that successive reclaims make
+ * progress through the whole list.  If everything was accessed, prefer
+ * an old PT_PAGE_TABLE_LEVEL page, then plain LRU order.
+ */
+static struct kvm_mmu_page *kvm_mmu_get_inactive_page(struct kvm *kvm)
+{
+	struct kvm_mmu_page *sp, *prev = NULL;
+	int c = (kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) / 4;
+
+	if (kvm->arch.scan_hand)
+		sp = kvm->arch.scan_hand;
+	else
+		sp = list_entry(&kvm->arch.active_mmu_pages,
+				struct kvm_mmu_page, link);
+
+	/*
+	 * Unlike list_for_each_entry_reverse, the _continue_ variant
+	 * really starts from sp, so the scan hand is not dead weight.
+	 */
+	list_for_each_entry_continue_reverse(sp, &kvm->arch.active_mmu_pages,
+					     link) {
+		if (!kvm_mmu_test_and_clear_pte_active(sp))
+			return sp;
+		/* Remember the oldest accessed leaf page as a fallback. */
+		if (!prev && sp->role.level == PT_PAGE_TABLE_LEVEL)
+			prev = sp;
+		else
+			kvm->arch.scan_hand = sp;
+		if (!--c)
+			break;
+	}
+
+	return prev ? prev : container_of(kvm->arch.active_mmu_pages.prev,
+					  struct kvm_mmu_page, link);
+}
+
 /*
  * Changing the number of mmu pages allocated to the vm
  * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
@@ -1511,8 +1578,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 		while (used_pages > kvm_nr_mmu_pages) {
 			struct kvm_mmu_page *page;
 
-			page = container_of(kvm->arch.active_mmu_pages.prev,
-					    struct kvm_mmu_page, link);
+			page = kvm_mmu_get_inactive_page(kvm);
 			kvm_mmu_zap_page(kvm, page);
 			used_pages--;
 		}
@@ -2712,8 +2778,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
 		struct kvm_mmu_page *sp;
 
-		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
-				  struct kvm_mmu_page, link);
+		sp = kvm_mmu_get_inactive_page(vcpu->kvm);
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
@@ -2871,8 +2936,7 @@ static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
 {
 	struct kvm_mmu_page *page;
 
-	page = container_of(kvm->arch.active_mmu_pages.prev,
-			    struct kvm_mmu_page, link);
+	page = kvm_mmu_get_inactive_page(kvm);
 	kvm_mmu_zap_page(kvm, page);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8b3a169..ccd5bea 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4782,6 +4782,7 @@ struct kvm *kvm_arch_create_vm(void)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	kvm->arch.scan_hand = NULL;
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */