Allow vcpus to pin spte translations by:

1) Creating a per-vcpu list of pinned ranges.
2) On mmu reload request:
   - Faulting in the registered ranges.
   - Marking their sptes with a pinned bit.
   - Marking their shadow pages as pinned.
3) Modifying the following actions:
   - Page ageing => skip spte flush.
   - MMU notifiers => force mmu reload request (which kicks the vcpu out
     of guest mode).
   - GET_DIRTY_LOG => force mmu reload request.
   - SLAB shrinker => skip deletion of pinned shadow pages.

This is TDP-only.
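
As an illustration (not part of this patch), a hypothetical in-kernel user,
for example a future PEBS virtualization series, could pin and unpin a guest
buffer roughly as sketched below. Only kvm_mmu_register_pinned_range() and
kvm_mmu_unregister_pinned_range() are introduced here; the callers, the
"ds_area" naming and the gpa/size arguments are invented for the example.
Note that registering only queues KVM_REQ_MMU_RELOAD; the actual faulting and
pinning happens in kvm_mmu_pin_pages(), called from kvm_mmu_load().

  /*
   * Illustrative sketch only: callers and "ds_area" naming are
   * hypothetical; assumes ds_area is page aligned.
   */
  static int pebs_pin_ds_area(struct kvm_vcpu *vcpu, gpa_t ds_area,
                              unsigned long size)
  {
          gfn_t base_gfn = ds_area >> PAGE_SHIFT;
          unsigned long npages = DIV_ROUND_UP(size, PAGE_SIZE);
          int r;

          /*
           * Queues KVM_REQ_MMU_RELOAD; the range is faulted in and its
           * sptes are marked pinned on the next kvm_mmu_load().
           */
          r = kvm_mmu_register_pinned_range(vcpu, base_gfn, npages);
          if (r == -EEXIST)
                  return 0;       /* range already registered */
          return r;
  }

  static void pebs_unpin_ds_area(struct kvm_vcpu *vcpu, gpa_t ds_area,
                                 unsigned long size)
  {
          kvm_mmu_unregister_pinned_range(vcpu, ds_area >> PAGE_SHIFT,
                                          DIV_ROUND_UP(size, PAGE_SIZE));
  }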
Signed-off-by: Marcelo Tosatti <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 14 ++
arch/x86/kvm/mmu.c | 202 ++++++++++++++++++++++++++++++++++++++--
arch/x86/kvm/mmu.h | 5
arch/x86/kvm/mmutrace.h | 23 ++++
arch/x86/kvm/paging_tmpl.h | 2
arch/x86/kvm/x86.c | 4
6 files changed, 241 insertions(+), 9 deletions(-)
Index: kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/include/asm/kvm_host.h	2014-06-18 17:28:17.549456614 -0300
+++ kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h	2014-06-18 17:28:24.338435658 -0300
@@ -221,6 +221,8 @@
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
bool unsync;
+ bool pinned;
+
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
unsigned long parent_ptes; /* Reverse mapping for parent_pte */
@@ -337,6 +339,14 @@
KVM_DEBUGREG_WONT_EXIT = 2,
};
+struct kvm_pinned_page_range {
+ gfn_t base_gfn;
+ unsigned long npages;
+ struct list_head link;
+};
+
+#define KVM_MAX_PER_VCPU_PINNED_RANGE 10
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -392,6 +402,10 @@
struct kvm_mmu_memory_cache mmu_page_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
+ struct list_head pinned_mmu_pages;
+ struct mutex pinned_mmu_mutex;
+ unsigned int nr_pinned_ranges;
+
struct fpu guest_fpu;
u64 xcr0;
u64 guest_supported_xcr0;
Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-06-18 17:28:17.550456611 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmu.c 2014-06-18 17:28:24.339435654 -0300
@@ -148,6 +148,9 @@
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+#define SPTE_PINNED (1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT))
+
+#define SPTE_PINNED_BIT PT64_SECOND_AVAIL_BITS_SHIFT
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
@@ -327,6 +330,11 @@
return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
}
+static int is_pinned_spte(u64 spte)
+{
+ return spte & SPTE_PINNED && is_shadow_present_pte(spte);
+}
+
static int is_large_pte(u64 pte)
{
return pte & PT_PAGE_SIZE_MASK;
@@ -2818,7 +2826,7 @@
* - false: let the real page fault path to fix it.
*/
static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
- u32 error_code)
+ u32 error_code, bool pin)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -2828,6 +2836,9 @@
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return false;
+ if (pin)
+ return false;
+
if (!page_fault_can_be_fast(error_code))
return false;
@@ -2895,9 +2906,55 @@
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable);
+ gva_t gva, pfn_t *pfn, bool write, bool *writable,
+ bool pin);
static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
+
+static int get_sptep_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes[4])
+{
+ struct kvm_shadow_walk_iterator iterator;
+ int nr_sptes = 0;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return nr_sptes;
+
+ for_each_shadow_entry(vcpu, addr, iterator) {
+ sptes[iterator.level-1] = iterator.sptep;
+ nr_sptes++;
+ if (!is_shadow_present_pte(*iterator.sptep))
+ break;
+ }
+
+ return nr_sptes;
+}
+
+static bool direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u64 *sptes[4];
+ int r, i, level;
+
+ r = get_sptep_hierarchy(vcpu, gfn << PAGE_SHIFT, sptes);
+ if (!r)
+ return false;
+
+ level = 5 - r;
+ if (!is_last_spte(*sptes[r-1], level))
+ return false;
+ if (!is_shadow_present_pte(*sptes[r-1]))
+ return false;
+
+ for (i = 0; i < r; i++) {
+ u64 *sptep = sptes[i];
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ sp->pinned = true;
+ set_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
+ }
+
+ return true;
+}
+
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
gfn_t gfn, bool prefault, bool pin, bool *pinned)
{
@@ -2923,13 +2980,14 @@
} else
level = PT_PAGE_TABLE_LEVEL;
- if (fast_page_fault(vcpu, v, level, error_code))
+ if (fast_page_fault(vcpu, v, level, error_code, pin))
return 0;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
+ if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable,
+ pin))
return 0;
if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
@@ -2943,6 +3001,8 @@
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
prefault);
+ if (pin)
+ *pinned = direct_pin_sptes(vcpu, gfn);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3349,7 +3409,8 @@
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable)
+ gva_t gva, pfn_t *pfn, bool write, bool *writable,
+ bool pin)
{
bool async;
@@ -3358,7 +3419,7 @@
if (!async)
return false; /* *pfn has correct page already */
- if (!prefault && can_do_async_pf(vcpu)) {
+ if (!prefault && !pin && can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(gva, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(gva, gfn);
@@ -3406,13 +3467,14 @@
} else
level = PT_PAGE_TABLE_LEVEL;
- if (fast_page_fault(vcpu, gpa, level, error_code))
+ if (fast_page_fault(vcpu, gpa, level, error_code, pin))
return 0;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable,
+ pin))
return 0;
if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
@@ -3426,6 +3488,8 @@
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, gpa, write, map_writable,
level, gfn, pfn, prefault);
+ if (pin)
+ *pinned = direct_pin_sptes(vcpu, gfn);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -3903,6 +3967,127 @@
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
+ gfn_t base_gfn, unsigned long npages)
+{
+ struct kvm_pinned_page_range *p;
+
+ mutex_lock(&vcpu->arch.pinned_mmu_mutex);
+ list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+ if (p->base_gfn == base_gfn && p->npages == npages) {
+ mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+ return -EEXIST;
+ }
+ }
+ mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+
+ if (vcpu->arch.nr_pinned_ranges >=
+ KVM_MAX_PER_VCPU_PINNED_RANGE)
+ return -ENOSPC;
+
+ p = kzalloc(sizeof(struct kvm_pinned_page_range), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ vcpu->arch.nr_pinned_ranges++;
+
+ trace_kvm_mmu_register_pinned_range(vcpu->vcpu_id, base_gfn, npages);
+
+ INIT_LIST_HEAD(&p->link);
+ p->base_gfn = base_gfn;
+ p->npages = npages;
+ mutex_lock(&vcpu->arch.pinned_mmu_mutex);
+ list_add(&p->link, &vcpu->arch.pinned_mmu_pages);
+ mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+
+ return 0;
+}
+
+int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
+ gfn_t base_gfn, unsigned long npages)
+{
+ struct kvm_pinned_page_range *p;
+
+ mutex_lock(&vcpu->arch.pinned_mmu_mutex);
+ list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+ if (p->base_gfn == base_gfn && p->npages == npages) {
+ list_del(&p->link);
+ vcpu->arch.nr_pinned_ranges--;
+ mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+ kfree(p);
+ return 0;
+ }
+ }
+
+ mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+ return -ENOENT;
+}
+
+void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pinned_page_range *p, *p2;
+
+ mutex_lock(&vcpu->arch.pinned_mmu_mutex);
+ list_for_each_entry_safe(p, p2, &vcpu->arch.pinned_mmu_pages, link) {
+ list_del(&p->link);
+ kfree(p);
+ }
+ mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+}
+
+/*
+ * Pin KVM MMU page translations. This guarantees that, for valid
+ * addresses registered by kvm_mmu_register_pinned_range (that is,
+ * addresses which carry sufficient information for the fault to be
+ * resolved), valid translations exist while in guest mode and
+ * therefore no VM-exits due to faults will occur.
+ *
+ * Failure to instantiate pages aborts guest entry.
+ *
+ * Page frames should be pinned with get_page in advance.
+ *
+ * Pinning is not guaranteed while executing as an L2 guest.
+ *
+ */
+
+static void kvm_mmu_pin_pages(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pinned_page_range *p;
+
+ if (is_guest_mode(vcpu))
+ return;
+
+ if (!vcpu->arch.mmu.direct_map)
+ return;
+
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ mutex_lock(&vcpu->arch.pinned_mmu_mutex);
+ list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+ gfn_t gfn_offset;
+
+ for (gfn_offset = 0; gfn_offset < p->npages; gfn_offset++) {
+ gfn_t gfn = p->base_gfn + gfn_offset;
+ int r;
+ bool pinned = false;
+
+ r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
+ PFERR_WRITE_MASK, false,
+ true, &pinned);
+ /* MMU notifier sequence window: retry */
+ if (!r && !pinned)
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ if (r) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ break;
+ }
+
+ }
+ }
+ mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+}
+
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
@@ -3916,6 +4101,7 @@
goto out;
/* set_cr3() should ensure TLB has been flushed */
vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+ kvm_mmu_pin_pages(vcpu);
out:
return r;
}
Index: kvm.pinned-sptes/arch/x86/kvm/mmu.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.h	2014-06-18 17:27:47.582549238 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmu.h 2014-06-18 17:28:24.339435654 -0300
@@ -178,4 +178,9 @@
}
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
+int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
+ gfn_t base_gfn, unsigned long npages);
+int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
+ gfn_t base_gfn, unsigned long npages);
+void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu);
#endif
Index: kvm.pinned-sptes/arch/x86/kvm/x86.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/x86.c	2014-06-18 17:28:17.552456605 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/x86.c 2014-06-18 17:28:24.340435651 -0300
@@ -7049,6 +7049,8 @@
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
+ INIT_LIST_HEAD(&vcpu->arch.pinned_mmu_pages);
+ mutex_init(&vcpu->arch.pinned_mmu_mutex);
return 0;
fail_free_wbinvd_dirty_mask:
@@ -7069,6 +7071,7 @@
{
int idx;
+ kvm_mmu_free_pinned_ranges(vcpu);
kvm_pmu_destroy(vcpu);
kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu);
@@ -7113,6 +7116,7 @@
int r;
r = vcpu_load(vcpu);
BUG_ON(r);
+ kvm_mmu_free_pinned_ranges(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
}
Index: kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/paging_tmpl.h	2014-06-18 17:28:17.550456611 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h	2014-06-18 17:28:24.340435651 -0300
@@ -747,7 +747,7 @@
smp_rmb();
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
- &map_writable))
+ &map_writable, false))
return 0;
if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
Index: kvm.pinned-sptes/arch/x86/kvm/mmutrace.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmutrace.h	2014-06-18 17:27:47.583549234 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmutrace.h	2014-06-18 17:28:24.340435651 -0300
@@ -322,6 +322,29 @@
__entry->kvm_gen == __entry->spte_gen
)
);
+
+TRACE_EVENT(
+ kvm_mmu_register_pinned_range,
+ TP_PROTO(unsigned int vcpu_id, gfn_t gfn, unsigned long npages),
+ TP_ARGS(vcpu_id, gfn, npages),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( gfn_t, gfn )
+ __field( unsigned long, npages )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->gfn = gfn;
+ __entry->npages = npages;
+ ),
+
+ TP_printk("vcpu_id %u gfn %llx npages %lx",
+ __entry->vcpu_id,
+ __entry->gfn,
+ __entry->npages)
+);
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
--