Avi Kivity wrote:

> There are (at least) three options available:
> - detect and special-case this scenario
> - change the flood detector to be per page table instead of per vcpu
> - change the flood detector to look at a list of recently used page tables instead of the last page table
>
> I'm having a hard time trying to pick between the second and third options.


The answer turns out to be "yes", so here's a patch that adds a pte access history table to each shadowed guest page table. Let me know if it helps. Benchmarking a variety of workloads on all guests supported by kvm is left as an exercise for the reader, but I suspect the patch either improves things all around or can be modified to do so.
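In case it helps to see the heuristic in isolation, here is a minimal user-space sketch of the same logic. The names (shadow_page, log_pte_write, page_flooded) are made up for the sketch, the x86 accessed bit (bit 5) is hard-coded, and the types are simplified; the real patch below hangs the history off struct kvm_mmu_page and tests the shadow pte in place:

#include <stdbool.h>
#include <stdio.h>

#define MAX_PTE_HISTORY 4		/* KVM_MAX_PTE_HISTORY in the patch */
#define ACCESSED_BIT	(1ull << 5)	/* PT_ACCESSED_SHIFT on x86 */

struct shadow_page {
	unsigned long long spt[512];	/* shadow page table entries */
	unsigned short history[MAX_PTE_HISTORY]; /* recently written slots */
	unsigned short history_len;
};

/* Record a write to slot 'ent'; the oldest entry falls off when full. */
static void log_pte_write(struct shadow_page *sp, unsigned ent)
{
	int i;

	if (sp->history_len > 0 && sp->history[sp->history_len - 1] == ent)
		return;			/* repeated write to the same slot */
	if (sp->history_len == MAX_PTE_HISTORY) {
		for (i = 1; i < MAX_PTE_HISTORY; ++i)
			sp->history[i - 1] = sp->history[i];
		--sp->history_len;
	}
	sp->history[sp->history_len++] = ent;
}

/*
 * Flooded means the history is full and none of the recently written
 * slots was ever accessed by the guest.  An accessed slot proves the
 * page really is used as a page table, so that entry and everything
 * older are dropped and the page is exonerated.
 */
static bool page_flooded(struct shadow_page *sp)
{
	int i, j, len = sp->history_len;

	for (i = len; i != 0; --i) {
		if (sp->spt[sp->history[i - 1]] & ACCESSED_BIT) {
			for (j = i; j < len; ++j)
				sp->history[j - i] = sp->history[j];
			sp->history_len = len - i;
			return false;
		}
	}
	return sp->history_len == MAX_PTE_HISTORY;
}

int main(void)
{
	struct shadow_page sp = { .history_len = 0 };
	unsigned ent;

	/* Four writes to distinct slots, none ever accessed: flooded. */
	for (ent = 0; ent < 4; ++ent)
		log_pte_write(&sp, ent);
	printf("flooded: %d\n", page_flooded(&sp));	/* 1 */

	/* Had the guest walked through slot 2: not flooded. */
	sp.spt[2] |= ACCESSED_BIT;
	printf("flooded: %d\n", page_flooded(&sp));	/* 0 */
	return 0;
}

Note that the flooded path doesn't reset the history; in the patch a flooded page gets zapped, so the history goes away with the page. KVM_MAX_PTE_HISTORY = 4 presumably plays the same role as the old per-vcpu threshold of three unaccessed writes.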

--
error compiling committee.c: too many arguments to function

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 154727d..1a3d01a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1130,7 +1130,8 @@ unshadowed:
 	if (speculative) {
 		vcpu->arch.last_pte_updated = shadow_pte;
 		vcpu->arch.last_pte_gfn = gfn;
-	}
+	} else
+		page_header(__pa(shadow_pte))->pte_history_len = 0;
 }
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -1616,13 +1617,6 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
 		kvm_mmu_flush_tlb(vcpu);
 }
 
-static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
-{
-	u64 *spte = vcpu->arch.last_pte_updated;
-
-	return !!(spte && (*spte & shadow_accessed_mask));
-}
-
 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 					  const u8 *new, int bytes)
 {
@@ -1679,13 +1673,49 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	u64 *spte = vcpu->arch.last_pte_updated;
+	struct kvm_mmu_page *page;
+
+	if (spte && vcpu->arch.last_pte_gfn == gfn) {
+		page = page_header(__pa(spte));
+		page->pte_history_len = 0;
+		pgprintk("clearing page history, gfn %x ent %lx\n",
+			 page->gfn, spte - page->spt);
+	}
+}
+
+static bool kvm_mmu_page_flooded(struct kvm_mmu_page *page)
+{
+	int i, j, ent, len;
 
-	if (spte
-	    && vcpu->arch.last_pte_gfn == gfn
-	    && shadow_accessed_mask
-	    && !(*spte & shadow_accessed_mask)
-	    && is_shadow_present_pte(*spte))
-		set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+	len = page->pte_history_len;
+	for (i = len; i != 0; --i) {
+		ent = page->pte_history[i - 1];
+		if (test_bit(PT_ACCESSED_SHIFT, (unsigned long *)&page->spt[ent])) {
+			for (j = i; j < len; ++j)
+				page->pte_history[j-i] = page->pte_history[j];
+			page->pte_history_len = len - i;
+			return false;
+		}
+	}
+	if (page->pte_history_len < KVM_MAX_PTE_HISTORY)
+		return false;
+	return true;
+}
+
+static void kvm_mmu_log_pte_history(struct kvm_mmu_page *page, u64 *spte)
+{
+	int i;
+	unsigned ent = spte - page->spt;
+
+	if (page->pte_history_len > 0
+	    && page->pte_history[page->pte_history_len - 1] == ent)
+		return;
+	if (page->pte_history_len == KVM_MAX_PTE_HISTORY) {
+		for (i = 1; i < KVM_MAX_PTE_HISTORY; ++i)
+			page->pte_history[i-1] = page->pte_history[i];
+		--page->pte_history_len;
+	}
+	page->pte_history[page->pte_history_len++] = ent;
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1704,7 +1734,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	unsigned misaligned;
 	unsigned quadrant;
 	int level;
-	int flooded = 0;
 	int npte;
 	int r;
 
@@ -1715,16 +1744,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
 	kvm_mmu_audit(vcpu, "pre pte write");
-	if (gfn == vcpu->arch.last_pt_write_gfn
-	    && !last_updated_pte_accessed(vcpu)) {
-		++vcpu->arch.last_pt_write_count;
-		if (vcpu->arch.last_pt_write_count >= 3)
-			flooded = 1;
-	} else {
-		vcpu->arch.last_pt_write_gfn = gfn;
-		vcpu->arch.last_pt_write_count = 1;
-		vcpu->arch.last_pte_updated = NULL;
-	}
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
@@ -1733,7 +1752,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 		misaligned |= bytes < 4;
-		if (misaligned || flooded) {
+		if (misaligned || kvm_mmu_page_flooded(sp)) {
 			/*
 			 * Misaligned accesses are too much trouble to fix
 			 * up; also, they usually indicate a page is not used
@@ -1785,6 +1804,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			mmu_pte_write_zap_pte(vcpu, sp, spte);
 			if (new)
 				mmu_pte_write_new_pte(vcpu, sp, spte, new);
+			kvm_mmu_log_pte_history(sp, spte);
 			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
 			++spte;
 		}
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index a71f3aa..cbe550e 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -78,6 +78,7 @@
 #define KVM_MIN_FREE_MMU_PAGES 5
 #define KVM_REFILL_PAGES 25
 #define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_MAX_PTE_HISTORY 4
 
 extern spinlock_t kvm_lock;
 extern struct list_head vm_list;
@@ -189,6 +190,9 @@ struct kvm_mmu_page {
 		u64 *parent_pte;               /* !multimapped */
 		struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
 	};
+
+	u16 pte_history_len;
+	u16 pte_history[KVM_MAX_PTE_HISTORY];
 };
 
 /*
