[PATCH v7 06/11] KVM: MMU: show mmu_valid_gen in shadow page related tracepoints

2013-05-22 Thread Xiao Guangrong
Show sp->mmu_valid_gen

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmutrace.h |   22 --
 1 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b8f6172..697f466 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -7,16 +7,18 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
 
-#define KVM_MMU_PAGE_FIELDS \
-   __field(__u64, gfn) \
-   __field(__u32, role) \
-   __field(__u32, root_count) \
+#define KVM_MMU_PAGE_FIELDS\
+   __field(unsigned long, mmu_valid_gen)   \
+   __field(__u64, gfn) \
+   __field(__u32, role)\
+   __field(__u32, root_count)  \
__field(bool, unsync)
 
-#define KVM_MMU_PAGE_ASSIGN(sp) \
-   __entry->gfn = sp->gfn;  \
-   __entry->role = sp->role.word;   \
-   __entry->root_count = sp->root_count;\
+#define KVM_MMU_PAGE_ASSIGN(sp)\
+   __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+   __entry->gfn = sp->gfn; \
+   __entry->role = sp->role.word;  \
+   __entry->root_count = sp->root_count;   \
__entry->unsync = sp->unsync;
 
 #define KVM_MMU_PAGE_PRINTK() ({   \
@@ -28,8 +30,8 @@
\
role.word = __entry->role;  \
\
-   trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s"   \
-" %snxe root %u %s%c", \
+   trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s"   \
+" %snxe root %u %s%c", __entry->mmu_valid_gen, \
 __entry->gfn, role.level,  \
 role.cr4_pae ? " pae" : "",\
 role.quadrant, \
-- 
1.7.7.6



[PATCH v7 05/11] KVM: x86: use the fast way to invalidate all pages

2013-05-22 Thread Xiao Guangrong
Replace kvm_mmu_zap_all by kvm_mmu_invalidate_zap_all_pages

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   15 ---
 arch/x86/kvm/x86.c |4 ++--
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 688e755..c010ace 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4188,21 +4188,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 
int slot)
spin_unlock(&kvm->mmu_lock);
 }
 
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
-   struct kvm_mmu_page *sp, *node;
-   LIST_HEAD(invalid_list);
-
-   spin_lock(&kvm->mmu_lock);
-restart:
-   list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-   if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-   goto restart;
-
-   kvm_mmu_commit_zap_page(kvm, &invalid_list);
-   spin_unlock(&kvm->mmu_lock);
-}
-
 #define BATCH_ZAP_PAGES10
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3758ff9..15e10f7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7066,13 +7066,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
-   kvm_mmu_zap_all(kvm);
+   kvm_mmu_invalidate_zap_all_pages(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
   struct kvm_memory_slot *slot)
 {
-   kvm_arch_flush_shadow_all(kvm);
+   kvm_mmu_invalidate_zap_all_pages(kvm);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-- 
1.7.7.6



[PATCH v7 04/11] KVM: MMU: zap pages in batch

2013-05-22 Thread Xiao Guangrong
Zap at least 10 pages before releasing mmu-lock, to reduce the overhead
caused by repeatedly acquiring the lock.

After this patch, kvm_zap_obsolete_pages can always make forward progress,
so update the comments accordingly.

[ It improves kernel building 0.6% ~ 1% ]

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   35 +++
 1 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f302540..688e755 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4203,14 +4203,18 @@ restart:
spin_unlock(&kvm->mmu_lock);
 }
 
+#define BATCH_ZAP_PAGES10
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
+   int batch = 0;
 
 restart:
list_for_each_entry_safe_reverse(sp, node,
  &kvm->arch.active_mmu_pages, link) {
+   int ret;
+
/*
 * No obsolete page exists before new created page since
 * active_mmu_pages is the FIFO list.
@@ -4219,28 +4223,6 @@ restart:
break;
 
/*
-* Do not repeatedly zap a root page to avoid unnecessary
-* KVM_REQ_MMU_RELOAD, otherwise we may not be able to
-* progress:
-*vcpu 0vcpu 1
-* call vcpu_enter_guest():
-*1): handle KVM_REQ_MMU_RELOAD
-*and require mmu-lock to
-*load mmu
-* repeat:
-*1): zap root page and
-*send KVM_REQ_MMU_RELOAD
-*
-*2): if (cond_resched_lock(mmu-lock))
-*
-*2): hold mmu-lock and load mmu
-*
-*3): see KVM_REQ_MMU_RELOAD bit
-*on vcpu->requests is set
-*then return 1 to call
-*vcpu_enter_guest() again.
-*goto repeat;
-*
 * Since we are reversely walking the list and the invalid
 * list will be moved to the head, skip the invalid page
 * can help us to avoid the infinity list walking.
@@ -4248,13 +4230,18 @@ restart:
if (sp->role.invalid)
continue;
 
-   if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+   if (batch >= BATCH_ZAP_PAGES &&
+ (need_resched() || spin_needbreak(&kvm->mmu_lock))) {
+   batch = 0;
kvm_mmu_commit_zap_page(kvm, &invalid_list);
cond_resched_lock(&kvm->mmu_lock);
goto restart;
}
 
-   if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+   ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+   batch += ret;
+
+   if (ret)
goto restart;
}
 
-- 
1.7.7.6



[PATCH v7 01/11] KVM: x86: drop calling kvm_mmu_zap_all in emulator_fix_hypercall

2013-05-22 Thread Xiao Guangrong
Quote Gleb's mail:

| Back then kvm->lock protected memslot access so code like:
|
| mutex_lock(&vcpu->kvm->lock);
| kvm_mmu_zap_all(vcpu->kvm);
| mutex_unlock(&vcpu->kvm->lock);
|
| which is what 7aa81cc0 does was enough to guaranty that no vcpu will
| run while code is patched. This is no longer the case and
| mutex_lock(&vcpu->kvm->lock); is gone from that code path long time ago,
| so now kvm_mmu_zap_all() there is useless and the code is incorrect.

So we drop it here; it will be fixed later

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |7 ---
 1 files changed, 0 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8d28810..6739b1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5523,13 +5523,6 @@ static int emulator_fix_hypercall(struct 
x86_emulate_ctxt *ctxt)
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
 
-   /*
-* Blow out the MMU to ensure that no other VCPU has an active mapping
-* to ensure that the updated hypercall appears atomically across all
-* VCPUs.
-*/
-   kvm_mmu_zap_all(vcpu->kvm);
-
kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
-- 
1.7.7.6



[PATCH v7 02/11] KVM: MMU: drop unnecessary kvm_reload_remote_mmus

2013-05-22 Thread Xiao Guangrong
It is the responsibility of kvm_mmu_zap_all to keep the mmu and
tlbs consistent. The reload is also unnecessary after zapping all
mmio sptes, since no mmio spte exists on a root shadow page and
so it cannot be cached in the tlb

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |5 +
 1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6739b1d..3758ff9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7060,16 +7060,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 * If memory slot is created, or moved, we need to clear all
 * mmio sptes.
 */
-   if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
+   if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE))
kvm_mmu_zap_mmio_sptes(kvm);
-   kvm_reload_remote_mmus(kvm);
-   }
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
kvm_mmu_zap_all(kvm);
-   kvm_reload_remote_mmus(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-- 
1.7.7.6



[PATCH v7 00/11] KVM: MMU: fast zap all shadow pages

2013-05-22 Thread Xiao Guangrong
Changelog:
V7:
  1): separate the optimizations into two patches ("do not reuse the
  obsolete page" and "collapse tlb flushes"), as suggested by Marcelo.

  2): base the series on Gleb's diff which reduces KVM_REQ_MMU_RELOAD
  when a root page is being zapped.

  3): remove the call to kvm_mmu_zap_all when patching a hypercall, as
  investigated by Gleb.

  4): drop the patch which deleted the page from the hash list at
  "prepare" time, since it can break walks based on the hash list.

  5): rename kvm_mmu_invalidate_all_pages to kvm_mmu_invalidate_zap_all_pages.

  6): introduce kvm_mmu_prepare_zap_obsolete_page, which is used to zap
  obsolete pages in order to collapse tlb flushes.

V6:
  1): reversely walk active_list to skip the new created pages based
  on the comments from Gleb and Paolo.

  2): completely replace kvm_mmu_zap_all by kvm_mmu_invalidate_all_pages
  based on Gleb's comments.

  3): improve the parameters of kvm_mmu_invalidate_all_pages based on
  Gleb's comments.
 
  4): rename kvm_mmu_invalidate_memslot_pages to kvm_mmu_invalidate_all_pages
  5): rename zap_invalid_pages to kvm_zap_obsolete_pages

V5:
  1): rename is_valid_sp to is_obsolete_sp
  2): use lock-break technique to zap all old pages instead of only pages
  linked on invalid slot's rmap suggested by Marcelo.
  3): trace invalid pages and kvm_mmu_invalidate_memslot_pages()
  4): rename kvm_mmu_invalid_memslot_pages to kvm_mmu_invalidate_memslot_pages
  according to Takuya's comments.

V4:
  1): drop unmapping invalid rmap out of mmu-lock and use lock-break technique
  instead. Thanks to Gleb's comments.

  2): needn't handle invalid-gen pages specially due to page table always
  switched by KVM_REQ_MMU_RELOAD. Thanks to Marcelo's comments.

V3:
  completely redesign the algorithm, please see below.

V2:
  - do not reset n_requested_mmu_pages and n_max_mmu_pages
  - batch free root shadow pages to reduce vcpu notification and mmu-lock
contention
  - remove the first patch that introduce kvm->arch.mmu_cache since we only
'memset zero' on hashtable rather than all mmu cache members in this
version
  - remove unnecessary kvm_reload_remote_mmus after kvm_mmu_zap_all

* Issue
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while
walking and zapping all shadow pages one by one, and it also needs to zap
every guest page's rmap and every shadow page's parent spte list. Things
get particularly bad when the guest uses more memory or vcpus. It does not
scale.

* Idea
KVM maintains a global mmu invalid generation-number which is stored in
kvm->arch.mmu_valid_gen and every shadow page stores the current global
generation-number into sp->mmu_valid_gen when it is created.

When KVM need zap all shadow pages sptes, it just simply increase the
global generation-number then reload root shadow pages on all vcpus.
Vcpu will create a new shadow page table according to current kvm's
generation-number. It ensures the old pages are not used any more.

Then the invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
are zapped by using lock-break technique.
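
As a stand-alone illustration of the idea (not KVM code - the struct and
function names below are invented for the example, only the mmu_valid_gen
fields mirror the real ones), a minimal user-space sketch of the
generation-number scheme looks like this:

#include <stdbool.h>
#include <stdio.h>

/* toy model: only the generation numbers, no real page tables */
struct kvm_model {
	unsigned long mmu_valid_gen;	/* models kvm->arch.mmu_valid_gen */
};

struct shadow_page_model {
	unsigned long mmu_valid_gen;	/* models sp->mmu_valid_gen */
};

/* models is_obsolete_sp(): obsolete once the global generation moves on */
static bool is_obsolete(struct kvm_model *kvm, struct shadow_page_model *sp)
{
	return sp->mmu_valid_gen != kvm->mmu_valid_gen;
}

/* models kvm_mmu_get_page(): new pages are stamped with the current gen */
static void create_page(struct kvm_model *kvm, struct shadow_page_model *sp)
{
	sp->mmu_valid_gen = kvm->mmu_valid_gen;
}

/*
 * models kvm_mmu_invalidate_zap_all_pages(): bumping the counter makes
 * every existing page obsolete in O(1); the real code then reloads the
 * vcpus' roots and zaps the obsolete pages later with lock breaking.
 */
static void invalidate_all(struct kvm_model *kvm)
{
	kvm->mmu_valid_gen++;
}

int main(void)
{
	struct kvm_model kvm = { .mmu_valid_gen = 0 };
	struct shadow_page_model sp;

	create_page(&kvm, &sp);
	printf("obsolete before invalidate: %d\n", is_obsolete(&kvm, &sp));
	invalidate_all(&kvm);
	printf("obsolete after invalidate:  %d\n", is_obsolete(&kvm, &sp));
	return 0;
}

The point of the scheme is that invalidation itself is O(1); the expensive
per-page zapping is deferred and done under the lock-break technique.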

Gleb Natapov (1):
  KVM: MMU: reduce KVM_REQ_MMU_RELOAD when root page is zapped

Xiao Guangrong (10):
  KVM: x86: drop calling kvm_mmu_zap_all in emulator_fix_hypercall
  KVM: MMU: drop unnecessary kvm_reload_remote_mmus
  KVM: MMU: fast invalidate all pages
  KVM: MMU: zap pages in batch
  KVM: x86: use the fast way to invalidate all pages
  KVM: MMU: show mmu_valid_gen in shadow page related tracepoints
  KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages
  KVM: MMU: do not reuse the obsolete page
  KVM: MMU: introduce kvm_mmu_prepare_zap_obsolete_page
  KVM: MMU: collapse TLB flushes when zap all pages

 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/mmu.c  |  134 ---
 arch/x86/kvm/mmu.h  |1 +
 arch/x86/kvm/mmutrace.h |   42 +---
 arch/x86/kvm/x86.c  |   16 +
 5 files changed, 162 insertions(+), 33 deletions(-)

-- 
1.7.7.6



[PATCH v7 09/11] KVM: MMU: introduce kvm_mmu_prepare_zap_obsolete_page

2013-05-22 Thread Xiao Guangrong
It is only used to zap obsolete pages. Since an obsolete page
will not be used again, we need not spend time finding its unsync
children. Also, we delete the page from the shadow page cache so that
the page is completely isolated after calling this function.

A later patch will use it to collapse tlb flushes

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   46 +-
 1 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9b57faa..e676356 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1466,7 +1466,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm 
*kvm, int nr)
 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
ASSERT(is_empty_shadow_page(sp->spt));
-   hlist_del(&sp->hash_link);
+   hlist_del_init(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
if (!sp->role.direct)
@@ -2069,14 +2069,19 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
return zapped;
 }
 
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
-   struct list_head *invalid_list)
+static int
+__kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+  bool zap_unsync_children,
+  struct list_head *invalid_list)
 {
-   int ret;
+   int ret = 0;
 
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
-   ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
+
+   if (likely(zap_unsync_children))
+   ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
+
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
 
@@ -2099,6 +2104,37 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
return ret;
 }
 
+/*
+ * The obsolete page will not be used again, so we need not spend time
+ * finding its unsync children. Also, we delete the page from the shadow
+ * page cache so that the page is completely isolated after calling this
+ * function.
+ *
+ * Note: if we use this function in for_each_gfn_xxx macros, we should
+ * re-walk the list when it successfully zaps one page.
+ */
+static int
+kvm_mmu_prepare_zap_obsolete_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+   int ret;
+
+   WARN_ON(!is_obsolete_sp(kvm, sp));
+
+   ret = __kvm_mmu_prepare_zap_page(kvm, sp, false, invalid_list);
+   if (ret)
+   hlist_del_init(&sp->hash_link);
+
+   WARN_ON(ret > 1);
+   return ret;
+}
+
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+   struct list_head *invalid_list)
+{
+   return __kvm_mmu_prepare_zap_page(kvm, sp, true, invalid_list);
+}
+
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
 {
-- 
1.7.7.6



[PATCH v7 08/11] KVM: MMU: do not reuse the obsolete page

2013-05-22 Thread Xiao Guangrong
The obsolete page will be zapped soon, so do not reuse it;
this reduces future page faults

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3a3e6c5..9b57faa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1869,6 +1869,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
role.quadrant = quadrant;
}
for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+   if (is_obsolete_sp(vcpu->kvm, sp))
+   continue;
+
if (!need_sync && sp->unsync)
need_sync = true;
 
-- 
1.7.7.6



[PATCH v7 10/11] KVM: MMU: collapse TLB flushes when zap all pages

2013-05-22 Thread Xiao Guangrong
kvm_zap_obsolete_pages uses the lock-break technique to zap pages,
and it currently flushes the tlb every time it breaks the lock.

Instead, we can reload the mmu on all vcpus right after updating the
generation number, so that the obsolete pages are not used on any vcpu;
after that we no longer need to flush the tlb while obsolete pages
are being zapped.

Note: kvm_mmu_commit_zap_page is still needed before freeing
the pages, since other vcpus may be doing lockless shadow
page walking

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   32 ++--
 1 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e676356..5e34056 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4237,8 +4237,6 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 restart:
list_for_each_entry_safe_reverse(sp, node,
  &kvm->arch.active_mmu_pages, link) {
-   int ret;
-
/*
 * No obsolete page exists before new created page since
 * active_mmu_pages is the FIFO list.
@@ -4254,21 +4252,24 @@ restart:
if (sp->role.invalid)
continue;
 
+   /*
+* Need not flush tlb since we only zap the sp with invalid
+* generation number.
+*/
if (batch >= BATCH_ZAP_PAGES &&
- (need_resched() || spin_needbreak(&kvm->mmu_lock))) {
+ cond_resched_lock(&kvm->mmu_lock)) {
batch = 0;
-   kvm_mmu_commit_zap_page(kvm, &invalid_list);
-   cond_resched_lock(&kvm->mmu_lock);
goto restart;
}
 
-   ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
-   batch += ret;
-
-   if (ret)
-   goto restart;
+   batch += kvm_mmu_prepare_zap_obsolete_page(kvm, sp,
+ &invalid_list);
}
 
+   /*
+* Should flush tlb before free page tables since lockless-walking
+* may use the pages.
+*/
kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 
@@ -4287,6 +4288,17 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
trace_kvm_mmu_invalidate_zap_all_pages(kvm);
kvm->arch.mmu_valid_gen++;
 
+   /*
+* Notify all vcpus to reload its shadow page table
+* and flush TLB. Then all vcpus will switch to new
+* shadow page table with the new mmu_valid_gen.
+*
+* Note: we should do this under the protection of
+* mmu-lock, otherwise, vcpu would purge shadow page
+* but miss tlb flush.
+*/
+   kvm_reload_remote_mmus(kvm);
+
kvm_zap_obsolete_pages(kvm);
spin_unlock(&kvm->mmu_lock);
 }
-- 
1.7.7.6



[PATCH v7 11/11] KVM: MMU: reduce KVM_REQ_MMU_RELOAD when root page is zapped

2013-05-22 Thread Xiao Guangrong
From: Gleb Natapov 

Quote Gleb's mail:
| why don't we check for sp->role.invalid in
| kvm_mmu_prepare_zap_page before calling kvm_reload_remote_mmus()?

and

| Actually we can add check for is_obsolete_sp() there too since
| kvm_mmu_invalidate_all_pages() already calls kvm_reload_remote_mmus()
| after incrementing mmu_valid_gen.

[ Xiao: add some comments and the check of is_obsolete_sp() ]

Signed-off-by: Gleb Natapov 
Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |8 +++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5e34056..055d675 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2097,7 +2097,13 @@ __kvm_mmu_prepare_zap_page(struct kvm *kvm, struct 
kvm_mmu_page *sp,
kvm_mod_used_mmu_pages(kvm, -1);
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
-   kvm_reload_remote_mmus(kvm);
+
+   /*
+* The obsolete pages can not be used on any vcpus.
+* See the comments in kvm_mmu_invalidate_zap_all_pages().
+*/
+   if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+   kvm_reload_remote_mmus(kvm);
}
 
sp->role.invalid = 1;
-- 
1.7.7.6



[PATCH v7 07/11] KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages

2013-05-22 Thread Xiao Guangrong
It is useful for debugging and development

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c  |1 +
 arch/x86/kvm/mmutrace.h |   20 
 2 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c010ace..3a3e6c5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4245,6 +4245,7 @@ restart:
 void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
 {
spin_lock(&kvm->mmu_lock);
+   trace_kvm_mmu_invalidate_zap_all_pages(kvm);
kvm->arch.mmu_valid_gen++;
 
kvm_zap_obsolete_pages(kvm);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 697f466..eb444dd 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -276,6 +276,26 @@ TRACE_EVENT(
  __spte_satisfied(old_spte), __spte_satisfied(new_spte)
)
 );
+
+TRACE_EVENT(
+   kvm_mmu_invalidate_zap_all_pages,
+   TP_PROTO(struct kvm *kvm),
+   TP_ARGS(kvm),
+
+   TP_STRUCT__entry(
+   __field(unsigned long, mmu_valid_gen)
+   __field(unsigned int, mmu_used_pages)
+   ),
+
+   TP_fast_assign(
+   __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+   __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+   ),
+
+   TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
+ __entry->mmu_valid_gen, __entry->mmu_used_pages
+   )
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
-- 
1.7.7.6



Re: [PATCH v6 3/7] KVM: MMU: fast invalidate all pages

2013-05-22 Thread Xiao Guangrong
On 05/22/2013 09:17 PM, Gleb Natapov wrote:
> On Wed, May 22, 2013 at 05:41:10PM +0800, Xiao Guangrong wrote:
>> On 05/22/2013 04:54 PM, Gleb Natapov wrote:
>>> On Wed, May 22, 2013 at 04:46:04PM +0800, Xiao Guangrong wrote:
>>>> On 05/22/2013 02:34 PM, Gleb Natapov wrote:
>>>>> On Tue, May 21, 2013 at 10:33:30PM -0300, Marcelo Tosatti wrote:
>>>>>> On Tue, May 21, 2013 at 11:39:03AM +0300, Gleb Natapov wrote:
>>>>>>>> Any pages with stale information will be zapped by kvm_mmu_zap_all().
>>>>>>>> When that happens, page faults will take place which will 
>>>>>>>> automatically 
>>>>>>>> use the new generation number.
>>>>>>>>
>>>>>>>> So still not clear why is this necessary.
>>>>>>>>
>>>>>>> This is not, strictly speaking, necessary, but it is the sane thing to 
>>>>>>> do.
>>>>>>> You cannot update page's generation number to prevent it from been
>>>>>>> destroyed since after kvm_mmu_zap_all() completes stale ptes in the
>>>>>>> shadow page may point to now deleted memslot. So why build shadow page
>>>>>>> table with a page that is in a process of been destroyed?
>>>>>>
>>>>>> OK, can this be introduced separately, in a later patch, with separate
>>>>>> justification, then?
>>>>>>
>>>>>> Xiao please have the first patches of the patchset focus on the problem
>>>>>> at hand: fix long mmu_lock hold times.
>>>>>>
>>>>>>> Not sure what you mean again. We flush TLB once before entering this 
>>>>>>> function.
>>>>>>> kvm_reload_remote_mmus() does this for us, no?
>>>>>>
>>>>>> kvm_reload_remote_mmus() is used as an optimization, its separate from 
>>>>>> the
>>>>>> problem solution.
>>>>>>
>>>>>>>>
>>>>>>>> What was suggested was... go to phrase which starts with "The only 
>>>>>>>> purpose
>>>>>>>> of the generation number should be to".
>>>>>>>>
>>>>>>>> The comment quoted here does not match that description.
>>>>>>>>
>>>>>>> The comment describes what code does and in this it is correct.
>>>>>>>
>>>>>>> You propose to not reload roots right away and do it only when root sp
>>>>>>> is encountered, right? So my question is what's the point? There are,
>>>>>>> obviously, root sps with invalid generation number at this point, so
>>>>>>> reload will happen regardless in kvm_mmu_prepare_zap_page(). So why not
>>>>>>> do it here right away and avoid it in kvm_mmu_prepare_zap_page() for
>>>>>>> invalid and obsolete sps as I proposed in one of my email?
>>>>>>
>>>>>> Sure. But Xiao please introduce that TLB collapsing optimization as a
>>>>>> later patch, so we can reason about it in a more organized fashion.
>>>>>
>>>>> So, if I understand correctly, you are asking to move is_obsolete_sp()
>>>>> check from kvm_mmu_get_page() and kvm_reload_remote_mmus() from
>>>>> kvm_mmu_invalidate_all_pages() to a separate patch. Fine by me, but if
>>>>> we drop kvm_reload_remote_mmus() from kvm_mmu_invalidate_all_pages() the
>>>>> call to kvm_mmu_invalidate_all_pages() in emulator_fix_hypercall() will
>>>>> become nop. But I question the need to zap all shadow pages tables there
>>>>> in the first place, why kvm_flush_remote_tlbs() is not enough?
>>>>
>>>> I do not know too... I even do no know why kvm_flush_remote_tlbs
>>>> is needed. :(
>>> We changed the content of an executable page, we need to flush instruction
>>> cache of all vcpus to not use stale data, so my suggestion to call
>>
>> I thought the reason is about icache too but icache is automatically
>> flushed on x86, we only need to invalidate the prefetched instructions by
>> executing a serializing operation.
>>
>> See the SDM in the chapter of
>> "8.1.3 Handling Self- and Cross-Modifying Code"
>>
> Right, so we do cross-modifying code here and we need to make sure no
> vcpu is running in a guest m

Re: [PATCH v6 3/7] KVM: MMU: fast invalidate all pages

2013-05-22 Thread Xiao Guangrong
On 05/22/2013 04:54 PM, Gleb Natapov wrote:
> On Wed, May 22, 2013 at 04:46:04PM +0800, Xiao Guangrong wrote:
>> On 05/22/2013 02:34 PM, Gleb Natapov wrote:
>>> On Tue, May 21, 2013 at 10:33:30PM -0300, Marcelo Tosatti wrote:
>>>> On Tue, May 21, 2013 at 11:39:03AM +0300, Gleb Natapov wrote:
>>>>>> Any pages with stale information will be zapped by kvm_mmu_zap_all().
>>>>>> When that happens, page faults will take place which will automatically 
>>>>>> use the new generation number.
>>>>>>
>>>>>> So still not clear why is this necessary.
>>>>>>
>>>>> This is not, strictly speaking, necessary, but it is the sane thing to do.
>>>>> You cannot update page's generation number to prevent it from been
>>>>> destroyed since after kvm_mmu_zap_all() completes stale ptes in the
>>>>> shadow page may point to now deleted memslot. So why build shadow page
>>>>> table with a page that is in a process of been destroyed?
>>>>
>>>> OK, can this be introduced separately, in a later patch, with separate
>>>> justification, then?
>>>>
>>>> Xiao please have the first patches of the patchset focus on the problem
>>>> at hand: fix long mmu_lock hold times.
>>>>
>>>>> Not sure what you mean again. We flush TLB once before entering this 
>>>>> function.
>>>>> kvm_reload_remote_mmus() does this for us, no?
>>>>
>>>> kvm_reload_remote_mmus() is used as an optimization, its separate from the
>>>> problem solution.
>>>>
>>>>>>
>>>>>> What was suggested was... go to phrase which starts with "The only 
>>>>>> purpose
>>>>>> of the generation number should be to".
>>>>>>
>>>>>> The comment quoted here does not match that description.
>>>>>>
>>>>> The comment describes what code does and in this it is correct.
>>>>>
>>>>> You propose to not reload roots right away and do it only when root sp
>>>>> is encountered, right? So my question is what's the point? There are,
>>>>> obviously, root sps with invalid generation number at this point, so
>>>>> reload will happen regardless in kvm_mmu_prepare_zap_page(). So why not
>>>>> do it here right away and avoid it in kvm_mmu_prepare_zap_page() for
>>>>> invalid and obsolete sps as I proposed in one of my email?
>>>>
>>>> Sure. But Xiao please introduce that TLB collapsing optimization as a
>>>> later patch, so we can reason about it in a more organized fashion.
>>>
>>> So, if I understand correctly, you are asking to move is_obsolete_sp()
>>> check from kvm_mmu_get_page() and kvm_reload_remote_mmus() from
>>> kvm_mmu_invalidate_all_pages() to a separate patch. Fine by me, but if
>>> we drop kvm_reload_remote_mmus() from kvm_mmu_invalidate_all_pages() the
>>> call to kvm_mmu_invalidate_all_pages() in emulator_fix_hypercall() will
>>> become nop. But I question the need to zap all shadow pages tables there
>>> in the first place, why kvm_flush_remote_tlbs() is not enough?
>>
>> I do not know too... I even do no know why kvm_flush_remote_tlbs
>> is needed. :(
> We changed the content of an executable page, we need to flush instruction
> cache of all vcpus to not use stale data, so my suggestion to call

I thought the reason is about icache too but icache is automatically
flushed on x86, we only need to invalidate the prefetched instructions by
executing a serializing operation.

See the SDM in the chapter of
"8.1.3 Handling Self- and Cross-Modifying Code"

> kvm_flush_remote_tlbs() is obviously incorrect since this flushes tlb,
> not instruction cache, but why kvm_reload_remote_mmus() would flush
> instruction cache?

kvm_reload_remote_mmus does not help here, I think.

I found that this change was introduced by commit 7aa81cc0,
and I have added Anthony to the CC.

I also find some discussions related to calling
kvm_reload_remote_mmus():

>
> But if the instruction is architecture dependent, and you run on the
> wrong architecture, now you have to patch many locations at fault time,
> introducing some nasty runtime code / data cache overlap performance
> problems.  Granted, they go away eventually.
>

We're addressing that by blowing away the shadow cache and holding the
big kvm lock to ensure SMP safety.  Not a great thing to do from a
performance perspective but the whole point of patching is that the cost
is amortized.

(http://kerneltrap.org/mailarchive/linux-kernel/2007/9/14/260288)

But I still cannot understand it...




Re: [PATCH v6 3/7] KVM: MMU: fast invalidate all pages

2013-05-22 Thread Xiao Guangrong
On 05/22/2013 02:34 PM, Gleb Natapov wrote:
> On Tue, May 21, 2013 at 10:33:30PM -0300, Marcelo Tosatti wrote:
>> On Tue, May 21, 2013 at 11:39:03AM +0300, Gleb Natapov wrote:
 Any pages with stale information will be zapped by kvm_mmu_zap_all().
 When that happens, page faults will take place which will automatically 
 use the new generation number.

 So still not clear why is this necessary.

>>> This is not, strictly speaking, necessary, but it is the sane thing to do.
>>> You cannot update page's generation number to prevent it from been
>>> destroyed since after kvm_mmu_zap_all() completes stale ptes in the
>>> shadow page may point to now deleted memslot. So why build shadow page
>>> table with a page that is in a process of been destroyed?
>>
>> OK, can this be introduced separately, in a later patch, with separate
>> justification, then?
>>
>> Xiao please have the first patches of the patchset focus on the problem
>> at hand: fix long mmu_lock hold times.
>>
>>> Not sure what you mean again. We flush TLB once before entering this 
>>> function.
>>> kvm_reload_remote_mmus() does this for us, no?
>>
>> kvm_reload_remote_mmus() is used as an optimization, its separate from the
>> problem solution.
>>

 What was suggested was... go to phrase which starts with "The only purpose
 of the generation number should be to".

 The comment quoted here does not match that description.

>>> The comment describes what code does and in this it is correct.
>>>
>>> You propose to not reload roots right away and do it only when root sp
>>> is encountered, right? So my question is what's the point? There are,
>>> obviously, root sps with invalid generation number at this point, so
>>> reload will happen regardless in kvm_mmu_prepare_zap_page(). So why not
>>> do it here right away and avoid it in kvm_mmu_prepare_zap_page() for
>>> invalid and obsolete sps as I proposed in one of my email?
>>
>> Sure. But Xiao please introduce that TLB collapsing optimization as a
>> later patch, so we can reason about it in a more organized fashion.
> 
> So, if I understand correctly, you are asking to move is_obsolete_sp()
> check from kvm_mmu_get_page() and kvm_reload_remote_mmus() from
> kvm_mmu_invalidate_all_pages() to a separate patch. Fine by me, but if
> we drop kvm_reload_remote_mmus() from kvm_mmu_invalidate_all_pages() the
> call to kvm_mmu_invalidate_all_pages() in emulator_fix_hypercall() will
> become nop. But I question the need to zap all shadow pages tables there
> in the first place, why kvm_flush_remote_tlbs() is not enough?

I do not know too... I even do no know why kvm_flush_remote_tlbs
is needed. :(





Re: [PATCH v6 3/7] KVM: MMU: fast invalidate all pages

2013-05-20 Thread Xiao Guangrong
On 05/21/2013 04:40 AM, Marcelo Tosatti wrote:
> On Mon, May 20, 2013 at 11:15:45PM +0300, Gleb Natapov wrote:
>> On Mon, May 20, 2013 at 04:46:24PM -0300, Marcelo Tosatti wrote:
>>> On Fri, May 17, 2013 at 05:12:58AM +0800, Xiao Guangrong wrote:
>>>> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
>>>> walk and zap all shadow pages one by one, also it need to zap all guest
>>>> page's rmap and all shadow page's parent spte list. Particularly, things
>>>> become worse if guest uses more memory or vcpus. It is not good for
>>>> scalability
>>>>
>>>> In this patch, we introduce a faster way to invalidate all shadow pages.
>>>> KVM maintains a global mmu invalid generation-number which is stored in
>>>> kvm->arch.mmu_valid_gen and every shadow page stores the current global
>>>> generation-number into sp->mmu_valid_gen when it is created
>>>>
>>>> When KVM need zap all shadow pages sptes, it just simply increase the
>>>> global generation-number then reload root shadow pages on all vcpus.
>>>> Vcpu will create a new shadow page table according to current kvm's
>>>> generation-number. It ensures the old pages are not used any more.
>>>> Then the invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
>>>> are zapped by using lock-break technique
>>>>
>>>> Signed-off-by: Xiao Guangrong 
>>>> ---
>>>>  arch/x86/include/asm/kvm_host.h |2 +
>>>>  arch/x86/kvm/mmu.c  |  103 
>>>> +++
>>>>  arch/x86/kvm/mmu.h  |1 +
>>>>  3 files changed, 106 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/arch/x86/include/asm/kvm_host.h 
>>>> b/arch/x86/include/asm/kvm_host.h
>>>> index 3741c65..bff7d46 100644
>>>> --- a/arch/x86/include/asm/kvm_host.h
>>>> +++ b/arch/x86/include/asm/kvm_host.h
>>>> @@ -222,6 +222,7 @@ struct kvm_mmu_page {
>>>>int root_count;  /* Currently serving as active root */
>>>>unsigned int unsync_children;
>>>>unsigned long parent_ptes;  /* Reverse mapping for parent_pte */
>>>> +  unsigned long mmu_valid_gen;
>>>>DECLARE_BITMAP(unsync_child_bitmap, 512);
>>>>  
>>>>  #ifdef CONFIG_X86_32
>>>> @@ -529,6 +530,7 @@ struct kvm_arch {
>>>>unsigned int n_requested_mmu_pages;
>>>>unsigned int n_max_mmu_pages;
>>>>unsigned int indirect_shadow_pages;
>>>> +  unsigned long mmu_valid_gen;
>>>>struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
>>>>/*
>>>> * Hash table of struct kvm_mmu_page.
>>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>>> index 682ecb4..891ad2c 100644
>>>> --- a/arch/x86/kvm/mmu.c
>>>> +++ b/arch/x86/kvm/mmu.c
>>>> @@ -1839,6 +1839,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
>>>>__clear_sp_write_flooding_count(sp);
>>>>  }
>>>>  
>>>> +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>>>> +{
>>>> +  return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
>>>> +}
>>>> +
>>>>  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>>>> gfn_t gfn,
>>>> gva_t gaddr,
>>>> @@ -1865,6 +1870,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
>>>> kvm_vcpu *vcpu,
>>>>role.quadrant = quadrant;
>>>>}
>>>>for_each_gfn_sp(vcpu->kvm, sp, gfn) {
>>>> +  if (is_obsolete_sp(vcpu->kvm, sp))
>>>> +  continue;
>>>> +
>>>
>>> Whats the purpose of not using pages which are considered "obsolete" ?
>>>
>> The same as not using page that is invalid, to not reuse stale
>> information. The page may contain ptes that point to invalid slot.
> 
> Any pages with stale information will be zapped by kvm_mmu_zap_all().
> When that happens, page faults will take place which will automatically 
> use the new generation number.

kvm_mmu_zap_all() uses the lock-break technique to zap pages; before it has
zapped all obsolete pages, other vcpus can acquire mmu-lock and call
kvm_mmu_get_page() to install new pages. In this case,

Re: [PATCH v6 2/7] KVM: MMU: delete shadow page from hash list in kvm_mmu_prepare_zap_page

2013-05-20 Thread Xiao Guangrong
On 05/19/2013 06:47 PM, Gleb Natapov wrote:
> On Fri, May 17, 2013 at 05:12:57AM +0800, Xiao Guangrong wrote:
>> Move deletion shadow page from the hash list from kvm_mmu_commit_zap_page to
>> kvm_mmu_prepare_zap_page so that we can call kvm_mmu_commit_zap_page
>> once for multiple kvm_mmu_prepare_zap_page that can help us to avoid
>> unnecessary TLB flush
>>
> Don't we call kvm_mmu_commit_zap_page() once for multiple
> kvm_mmu_prepare_zap_page() now when possible? kvm_mmu_commit_zap_page()
> gets a list as a parameter. I am not against the change, but wish to
> understand it better.

The changelog is not clear enough. I mean we can "call
kvm_mmu_commit_zap_page once for multiple kvm_mmu_prepare_zap_page" when
we use the lock-break technique. If we do not do this, a page can still be
found in the hashtable while it is already linked on another thread's
invalid_list.

> 
>> Signed-off-by: Xiao Guangrong 
>> ---
>>  arch/x86/kvm/mmu.c |8 ++--
>>  1 files changed, 6 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 40d7b2d..682ecb4 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -1466,7 +1466,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm 
>> *kvm, int nr)
>>  static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
>>  {
>>  ASSERT(is_empty_shadow_page(sp->spt));
>> -hlist_del(&sp->hash_link);
>> +
>>  list_del(&sp->link);
>>  free_page((unsigned long)sp->spt);
>>  if (!sp->role.direct)
>> @@ -1655,7 +1655,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>>  
>>  #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) 
>> \
>>  for_each_gfn_sp(_kvm, _sp, _gfn)\
>> -if ((_sp)->role.direct || (_sp)->role.invalid) {} else
>> +if ((_sp)->role.direct ||   \
>> +  ((_sp)->role.invalid && WARN_ON(1))) {} else
>>  
>>  /* @sp->gfn should be write-protected at the call site */
>>  static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>> @@ -2074,6 +2075,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
>> struct kvm_mmu_page *sp,
>>  unaccount_shadowed(kvm, sp->gfn);
>>  if (sp->unsync)
>>  kvm_unlink_unsync_page(kvm, sp);
>> +
>> +hlist_del_init(&sp->hash_link);
>> +
> What about moving this inside if() bellow and making it hlist_del()?
> Leave the page on the hash if root_count is non zero.
> 

It's a good idea. will update.




Re: [PATCH v6 3/7] KVM: MMU: fast invalidate all pages

2013-05-20 Thread Xiao Guangrong
On 05/19/2013 06:04 PM, Gleb Natapov wrote:

>> +/*
>> + * Do not repeatedly zap a root page to avoid unnecessary
>> + * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
>> + * progress:
>> + *vcpu 0vcpu 1
>> + * call vcpu_enter_guest():
>> + *1): handle KVM_REQ_MMU_RELOAD
>> + *and require mmu-lock to
>> + *load mmu
>> + * repeat:
>> + *1): zap root page and
>> + *send KVM_REQ_MMU_RELOAD
>> + *
>> + *2): if (cond_resched_lock(mmu-lock))
>> + *
>> + *2): hold mmu-lock and load mmu
>> + *
>> + *3): see KVM_REQ_MMU_RELOAD bit
>> + *on vcpu->requests is set
>> + *then return 1 to call
>> + *vcpu_enter_guest() again.
>> + *goto repeat;
>> + *
>> + */
> I am not sure why the above scenario will prevent us from progressing.
> There is finite number of root pages with invalid generation number, so
> eventually we will zap them all and vcpu1 will stop seeing KVM_REQ_MMU_RELOAD
> request.

This patch does not "zap pages in batch", so kvm_zap_obsolete_pages() may
just zap invalid root pages and then lock-break due to the lock contention
on the path of handling KVM_REQ_MMU_RELOAD.

Yes, after "zap pages in batch", this issue does not exist any more. I should
fold this explanation into that patch.

> 
> This check here prevent unnecessary KVM_REQ_MMU_RELOAD as you say, but
> this races the question, why don't we check for sp->role.invalid in
> kvm_mmu_prepare_zap_page before calling kvm_reload_remote_mmus()?
> Something like this:
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 40d7b2d..d2ae3a4 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2081,7 +2081,8 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
> struct kvm_mmu_page *sp,
>   kvm_mod_used_mmu_pages(kvm, -1);
>   } else {
>   list_move(&sp->link, &kvm->arch.active_mmu_pages);
> - kvm_reload_remote_mmus(kvm);
> + if (!sp->role.invalid)
> + kvm_reload_remote_mmus(kvm);
>   }
> 
>   sp->role.invalid = 1;

Yes, it is better.

> 
> Actually we can add check for is_obsolete_sp() there too since
> kvm_mmu_invalidate_all_pages() already calls kvm_reload_remote_mmus()
> after incrementing mmu_valid_gen.

Yes, I agree.

> 
> Or do I miss something?

No, you are right. ;)



[PATCH v6 1/7] KVM: MMU: drop unnecessary kvm_reload_remote_mmus

2013-05-16 Thread Xiao Guangrong
It is the responsibility of kvm_mmu_zap_all to keep the mmu and
tlbs consistent. The reload is also unnecessary after zapping all
mmio sptes, since no mmio spte exists on a root shadow page and
so it cannot be cached in the tlb

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |5 +
 1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8d28810..d885418 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7067,16 +7067,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 * If memory slot is created, or moved, we need to clear all
 * mmio sptes.
 */
-   if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
+   if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE))
kvm_mmu_zap_mmio_sptes(kvm);
-   kvm_reload_remote_mmus(kvm);
-   }
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
kvm_mmu_zap_all(kvm);
-   kvm_reload_remote_mmus(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-- 
1.7.7.6



[PATCH v6 4/7] KVM: MMU: zap pages in batch

2013-05-16 Thread Xiao Guangrong
Zap at least 10 pages before releasing mmu-lock, to reduce the overhead
caused by repeatedly acquiring the lock.

[ It improves kernel building 0.6% ~ 1% ]

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   14 --
 1 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 891ad2c..7ad0e50 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4207,14 +4207,18 @@ restart:
spin_unlock(&kvm->mmu_lock);
 }
 
+#define BATCH_ZAP_PAGES10
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
+   int batch = 0;
 
 restart:
list_for_each_entry_safe_reverse(sp, node,
  &kvm->arch.active_mmu_pages, link) {
+   int ret;
+
/*
 * No obsolete page exists before new created page since
 * active_mmu_pages is the FIFO list.
@@ -4252,10 +4256,16 @@ restart:
 * Need not flush tlb since we only zap the sp with invalid
 * generation number.
 */
-   if (cond_resched_lock(&kvm->mmu_lock))
+   if ((batch >= BATCH_ZAP_PAGES) &&
+ cond_resched_lock(&kvm->mmu_lock)) {
+   batch = 0;
goto restart;
+   }
 
-   if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+   ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+   batch += ret;
+
+   if (ret)
goto restart;
}
 
-- 
1.7.7.6



[PATCH v6 7/7] KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages

2013-05-16 Thread Xiao Guangrong
It is useful for debugging and development

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c  |1 +
 arch/x86/kvm/mmutrace.h |   23 +++
 2 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 89b51dc..2c512e8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4276,6 +4276,7 @@ restart:
 void kvm_mmu_invalidate_all_pages(struct kvm *kvm, bool zap_obsolete_pages)
 {
spin_lock(&kvm->mmu_lock);
+   trace_kvm_mmu_invalidate_all_pages(kvm, zap_obsolete_pages);
kvm->arch.mmu_valid_gen++;
 
/*
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 697f466..e13d253 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -276,6 +276,29 @@ TRACE_EVENT(
  __spte_satisfied(old_spte), __spte_satisfied(new_spte)
)
 );
+
+TRACE_EVENT(
+   kvm_mmu_invalidate_all_pages,
+   TP_PROTO(struct kvm *kvm, bool zap_obsolete_pages),
+   TP_ARGS(kvm, zap_obsolete_pages),
+
+   TP_STRUCT__entry(
+   __field(unsigned long, mmu_valid_gen)
+   __field(unsigned int, mmu_used_pages)
+   __field(bool, zap_obsolete_pages)
+   ),
+
+   TP_fast_assign(
+   __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+   __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+   __entry->zap_obsolete_pages = zap_obsolete_pages;
+   ),
+
+   TP_printk("kvm-mmu-valid-gen %lx zap_obsolete_pages %d "
+ "used_pages %x", __entry->mmu_valid_gen,
+ __entry->zap_obsolete_pages, __entry->mmu_used_pages
+   )
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
-- 
1.7.7.6



[PATCH v6 6/7] KVM: MMU: show mmu_valid_gen in shadow page related tracepoints

2013-05-16 Thread Xiao Guangrong
Show sp->mmu_valid_gen

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmutrace.h |   22 --
 1 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b8f6172..697f466 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -7,16 +7,18 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
 
-#define KVM_MMU_PAGE_FIELDS \
-   __field(__u64, gfn) \
-   __field(__u32, role) \
-   __field(__u32, root_count) \
+#define KVM_MMU_PAGE_FIELDS\
+   __field(unsigned long, mmu_valid_gen)   \
+   __field(__u64, gfn) \
+   __field(__u32, role)\
+   __field(__u32, root_count)  \
__field(bool, unsync)
 
-#define KVM_MMU_PAGE_ASSIGN(sp) \
-   __entry->gfn = sp->gfn;  \
-   __entry->role = sp->role.word;   \
-   __entry->root_count = sp->root_count;\
+#define KVM_MMU_PAGE_ASSIGN(sp)\
+   __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+   __entry->gfn = sp->gfn; \
+   __entry->role = sp->role.word;  \
+   __entry->root_count = sp->root_count;   \
__entry->unsync = sp->unsync;
 
 #define KVM_MMU_PAGE_PRINTK() ({   \
@@ -28,8 +30,8 @@
\
role.word = __entry->role;  \
\
-   trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s"   \
-" %snxe root %u %s%c", \
+   trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s"   \
+" %snxe root %u %s%c", __entry->mmu_valid_gen, \
 __entry->gfn, role.level,  \
 role.cr4_pae ? " pae" : "",\
 role.quadrant, \
-- 
1.7.7.6



[PATCH v6 5/7] KVM: x86: use the fast way to invalidate all pages

2013-05-16 Thread Xiao Guangrong
Replace kvm_mmu_zap_all by kvm_mmu_invalidate_all_pages

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   15 ---
 arch/x86/kvm/x86.c |6 +++---
 2 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7ad0e50..89b51dc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4192,21 +4192,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 
int slot)
spin_unlock(&kvm->mmu_lock);
 }
 
-void kvm_mmu_zap_all(struct kvm *kvm)
-{
-   struct kvm_mmu_page *sp, *node;
-   LIST_HEAD(invalid_list);
-
-   spin_lock(&kvm->mmu_lock);
-restart:
-   list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-   if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-   goto restart;
-
-   kvm_mmu_commit_zap_page(kvm, &invalid_list);
-   spin_unlock(&kvm->mmu_lock);
-}
-
 #define BATCH_ZAP_PAGES10
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d885418..30a990c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5528,7 +5528,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt 
*ctxt)
 * to ensure that the updated hypercall appears atomically across all
 * VCPUs.
 */
-   kvm_mmu_zap_all(vcpu->kvm);
+   kvm_mmu_invalidate_all_pages(vcpu->kvm, false);
 
kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
@@ -7073,13 +7073,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
-   kvm_mmu_zap_all(kvm);
+   kvm_mmu_invalidate_all_pages(kvm, true);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
   struct kvm_memory_slot *slot)
 {
-   kvm_arch_flush_shadow_all(kvm);
+   kvm_mmu_invalidate_all_pages(kvm, true);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-- 
1.7.7.6



[PATCH v6 3/7] KVM: MMU: fast invalidate all pages

2013-05-16 Thread Xiao Guangrong
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while
walking and zapping all shadow pages one by one, and it also needs to zap
every guest page's rmap and every shadow page's parent spte list. Things
get particularly bad when the guest uses more memory or vcpus. It does not
scale.

In this patch, we introduce a faster way to invalidate all shadow pages.
KVM maintains a global mmu invalid generation-number, stored in
kvm->arch.mmu_valid_gen, and every shadow page stores the current global
generation-number into sp->mmu_valid_gen when it is created.

When KVM needs to zap all shadow page sptes, it simply increases the
global generation-number and then reloads the root shadow pages on all
vcpus. Each vcpu then creates a new shadow page table according to the
current generation-number, which ensures the old pages are no longer used.
The invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
are then zapped using the lock-break technique.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/mmu.c  |  103 +++
 arch/x86/kvm/mmu.h  |1 +
 3 files changed, 106 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3741c65..bff7d46 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -222,6 +222,7 @@ struct kvm_mmu_page {
int root_count;  /* Currently serving as active root */
unsigned int unsync_children;
unsigned long parent_ptes;  /* Reverse mapping for parent_pte */
+   unsigned long mmu_valid_gen;
DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -529,6 +530,7 @@ struct kvm_arch {
unsigned int n_requested_mmu_pages;
unsigned int n_max_mmu_pages;
unsigned int indirect_shadow_pages;
+   unsigned long mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 682ecb4..891ad2c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1839,6 +1839,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
__clear_sp_write_flooding_count(sp);
 }
 
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+   return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 gfn_t gfn,
 gva_t gaddr,
@@ -1865,6 +1870,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
role.quadrant = quadrant;
}
for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+   if (is_obsolete_sp(vcpu->kvm, sp))
+   continue;
+
if (!need_sync && sp->unsync)
need_sync = true;
 
@@ -1901,6 +1909,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
 
account_shadowed(vcpu->kvm, gfn);
}
+   sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
init_shadow_page_table(sp);
trace_kvm_mmu_get_page(sp, true);
return sp;
@@ -2071,8 +2080,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
+
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
+
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
 
@@ -4196,6 +4207,98 @@ restart:
spin_unlock(&kvm->mmu_lock);
 }
 
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
+{
+   struct kvm_mmu_page *sp, *node;
+   LIST_HEAD(invalid_list);
+
+restart:
+   list_for_each_entry_safe_reverse(sp, node,
+ &kvm->arch.active_mmu_pages, link) {
+   /*
+* No obsolete page exists before new created page since
+* active_mmu_pages is the FIFO list.
+*/
+   if (!is_obsolete_sp(kvm, sp))
+   break;
+
+   /*
+* Do not repeatedly zap a root page to avoid unnecessary
+* KVM_REQ_MMU_RELOAD, otherwise we may not be able to
+* progress:
+*vcpu 0vcpu 1
+* call vcpu_enter_guest():
+*1): handle KVM_REQ_MMU_RELOAD
+*and require mmu-lock to
+*load mmu
+* repea

[PATCH v6 2/7] KVM: MMU: delete shadow page from hash list in kvm_mmu_prepare_zap_page

2013-05-16 Thread Xiao Guangrong
Move deletion shadow page from the hash list from kvm_mmu_commit_zap_page to
kvm_mmu_prepare_zap_page so that we can call kvm_mmu_commit_zap_page
once for multiple kvm_mmu_prepare_zap_page that can help us to avoid
unnecessary TLB flush

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |8 ++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 40d7b2d..682ecb4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1466,7 +1466,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm 
*kvm, int nr)
 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
ASSERT(is_empty_shadow_page(sp->spt));
-   hlist_del(&sp->hash_link);
+
list_del(&sp->link);
free_page((unsigned long)sp->spt);
if (!sp->role.direct)
@@ -1655,7 +1655,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)
\
for_each_gfn_sp(_kvm, _sp, _gfn)\
-   if ((_sp)->role.direct || (_sp)->role.invalid) {} else
+   if ((_sp)->role.direct ||   \
+ ((_sp)->role.invalid && WARN_ON(1))) {} else
 
 /* @sp->gfn should be write-protected at the call site */
 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2074,6 +2075,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
+
+   hlist_del_init(&sp->hash_link);
+
if (!sp->root_count) {
/* Count self */
ret++;
-- 
1.7.7.6



[PATCH v6 0/7] KVM: MMU: fast zap all shadow pages

2013-05-16 Thread Xiao Guangrong
The benchmark and the result can be found at:
http://www.spinics.net/lists/kvm/msg91391.html

Changelog:
V6:
  1): walk active_list in reverse to skip the newly created pages, based
  on the comments from Gleb and Paolo.

  2): completely replace kvm_mmu_zap_all by kvm_mmu_invalidate_all_pages
  based on Gleb's comments.

  3): improve the parameters of kvm_mmu_invalidate_all_pages based on
  Gleb's comments.
 
  4): rename kvm_mmu_invalidate_memslot_pages to kvm_mmu_invalidate_all_pages
  5): rename zap_invalid_pages to kvm_zap_obsolete_pages

V5:
  1): rename is_valid_sp to is_obsolete_sp
  2): use lock-break technique to zap all old pages instead of only pages
  linked on invalid slot's rmap suggested by Marcelo.
  3): trace invalid pages and kvm_mmu_invalidate_memslot_pages()
  4): rename kvm_mmu_invalid_memslot_pages to kvm_mmu_invalidate_memslot_pages
  according to Takuya's comments.

V4:
  1): drop unmapping invalid rmap out of mmu-lock and use lock-break technique
  instead. Thanks to Gleb's comments.

  2): needn't handle invalid-gen pages specially due to page table always
  switched by KVM_REQ_MMU_RELOAD. Thanks to Marcelo's comments.

V3:
  completely redesign the algorithm, please see below.

V2:
  - do not reset n_requested_mmu_pages and n_max_mmu_pages
  - batch free root shadow pages to reduce vcpu notification and mmu-lock
contention
  - remove the first patch that introduce kvm->arch.mmu_cache since we only
'memset zero' on hashtable rather than all mmu cache members in this
version
  - remove unnecessary kvm_reload_remote_mmus after kvm_mmu_zap_all

* Issue
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while it
walks and zaps all shadow pages one by one, and it also needs to zap every
guest page's rmap and every shadow page's parent spte list. Things become
particularly bad if the guest uses more memory or vcpus. It is not good
for scalability.

* Idea
KVM maintains a global mmu invalidation generation-number, which is stored
in kvm->arch.mmu_valid_gen, and every shadow page stores the current global
generation-number in sp->mmu_valid_gen when it is created.

When KVM needs to zap all shadow page sptes, it simply increases the global
generation-number and then reloads the root shadow pages on all vcpus. Each
vcpu then creates a new shadow page table according to the current
generation-number, which ensures the old pages are not used any more.

The invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen) are
then zapped using the lock-break technique.
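
For illustration, here is a stand-alone user-space model of the
generation-number trick; all structure and function names below are
invented and this is not the KVM code, but it shows why the invalidation
itself becomes O(1) while the actual freeing can be deferred.

#include <stdio.h>
#include <stdlib.h>

struct page {
        unsigned long valid_gen;        /* copied from vm->valid_gen at creation */
        struct page *next;
};

struct vm {
        unsigned long valid_gen;        /* global generation number */
        struct page *pages;             /* "active" page list       */
};

static struct page *create_page(struct vm *vm)
{
        struct page *p = calloc(1, sizeof(*p));

        p->valid_gen = vm->valid_gen;   /* stamp the current generation */
        p->next = vm->pages;
        vm->pages = p;
        return p;
}

static int is_obsolete(struct vm *vm, struct page *p)
{
        return p->valid_gen != vm->valid_gen;
}

static void invalidate_all(struct vm *vm)
{
        vm->valid_gen++;        /* O(1): every existing page is now obsolete */
}

static void zap_obsolete(struct vm *vm)
{
        struct page **pp = &vm->pages;

        while (*pp) {
                struct page *p = *pp;

                if (is_obsolete(vm, p)) {
                        *pp = p->next;
                        free(p);
                } else {
                        pp = &p->next;
                }
        }
}

int main(void)
{
        struct vm vm = { 0, NULL };

        create_page(&vm);
        create_page(&vm);
        invalidate_all(&vm);    /* old pages become stale immediately     */
        create_page(&vm);       /* a new page is stamped with the new gen */
        zap_obsolete(&vm);      /* frees only the two stale pages         */
        printf("pages left: %d\n", vm.pages && !vm.pages->next ? 1 : 0);
        return 0;
}

Running it prints "pages left: 1": only the page created after
invalidate_all() survives the deferred zap.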

Xiao Guangrong (7):
  KVM: MMU: drop unnecessary kvm_reload_remote_mmus
  KVM: MMU: delete shadow page from hash list in
kvm_mmu_prepare_zap_page
  KVM: MMU: fast invalidate all pages
  KVM: MMU: zap pages in batch
  KVM: x86: use the fast way to invalidate all pages
  KVM: MMU: show mmu_valid_gen in shadow page related tracepoints
  KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages

 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/mmu.c  |  115 +--
 arch/x86/kvm/mmu.h  |1 +
 arch/x86/kvm/mmutrace.h |   45 
 arch/x86/kvm/x86.c  |   11 ++---
 5 files changed, 151 insertions(+), 23 deletions(-)

-- 
1.7.7.6



Re: [PATCH v5 4/8] KVM: x86: use the fast way to invalidate all pages

2013-05-16 Thread Xiao Guangrong
On 05/17/2013 12:19 AM, Gleb Natapov wrote:
> On Thu, May 16, 2013 at 08:17:49PM +0800, Xiao Guangrong wrote:
>> Replace kvm_mmu_zap_all by kvm_mmu_invalidate_memslot_pages except on
>> the path of mmu_notifier->release() which will be fixed in
>> the later patch
>>
> Why ->release() cannot use kvm_mmu_invalidate_memslot_pages()?

Good eyes.

Forgot to update this part after applying the new approach of
kvm_mmu_invalidate_memslot_pages(). Will update it.



Re: [PATCH v5 3/8] KVM: MMU: fast invalidate all pages

2013-05-16 Thread Xiao Guangrong
On 05/17/2013 12:18 AM, Gleb Natapov wrote:

>> +
>> +/*
>> + * Fast invalidate all shadow pages belong to @slot.
>> + *
>> + * @slot != NULL means the invalidation is caused the memslot specified
>> + * by @slot is being deleted, in this case, we should ensure that rmap
>> + * and lpage-info of the @slot can not be used after calling the function.
>> + *
>> + * @slot == NULL means the invalidation due to other reasons, we need
>> + * not care rmap and lpage-info since they are still valid after calling
>> + * the function.
>> + */
>> +void kvm_mmu_invalidate_memslot_pages(struct kvm *kvm,
>> +  struct kvm_memory_slot *slot)
> 
> Why pass "slot" here? If we want the function to sometimes wait for purge
> and sometimes not the more straightforward way is to have a "bool wait"
> parameter instead.
> 

That's my fault, I forgot to update it. Will use 'bool zap_invalid_pages'
instead.



Re: [PATCH v5 3/8] KVM: MMU: fast invalidate all pages

2013-05-16 Thread Xiao Guangrong
On 05/16/2013 11:57 PM, Gleb Natapov wrote:

> One more thought. With current patch if zap_invalid_page() will be
> called second time while another zap_invalid_page() is still running
> (can that happen?) they will both run concurrently fighting for the

Currently, it can not happen, since zap_invalid_page is only needed when a
slot is being deleted, which is protected by the slot-lock.

But we do allow it to be concurrent, as you commented: we can use it in
->release() instead of calling kvm_mmu_zap_all(), and in that case multiple
calls to zap_invalid_page() can happen.

> mmu_lock. Is this a problem?

Are you worried that it can not make progress due to lock contention while
walking active_list? Zapping at least 10 pages before releasing the lock
should ensure that it makes progress.
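
As a user-space illustration of that batching argument (all names below
are invented; the real code uses cond_resched_lock() on mmu_lock rather
than an explicit unlock/yield/lock sequence):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define BATCH_ZAP_PAGES 10

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int obsolete_pages = 1000;       /* pretend 1000 stale pages exist */

static void zap_one_page(void)
{
        obsolete_pages--;               /* stand-in for the real zap work */
}

static void zap_obsolete_pages(void)
{
        int batch = 0;

        pthread_mutex_lock(&lock);
        while (obsolete_pages > 0) {
                zap_one_page();
                if (++batch < BATCH_ZAP_PAGES)
                        continue;
                /*
                 * Lock break, but only after a full batch: drop the lock
                 * so other lock users (vcpus in the real code) can run,
                 * then continue where we left off.
                 */
                batch = 0;
                pthread_mutex_unlock(&lock);
                sched_yield();
                pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        zap_obsolete_pages();
        printf("obsolete pages left: %d\n", obsolete_pages);
        return 0;
}

The batch counter guarantees at least BATCH_ZAP_PAGES pages are zapped per
lock hold, so contenders cannot starve the walker.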

Do you see any potential issue?





Re: [PATCH v5 0/8] KVM: MMU: fast zap all shadow pages

2013-05-16 Thread Xiao Guangrong
On 05/16/2013 10:36 PM, Takuya Yoshikawa wrote:
> On Thu, 16 May 2013 20:17:45 +0800
> Xiao Guangrong  wrote:
> 
>> Bechmark result:
>> I have tested this patchset and the previous version that only zaps the
>> pages linked on invalid slot's rmap. The benchmark is written by myself
>> which has been attached, it writes large memory when do pci rom read.
>>
>> Host: Intel(R) Xeon(R) CPU X5690  @ 3.47GHz + 36G Memory
>> Guest: 12 VCPU + 32G Memory
>>
>> Current code:   This patchset Previous Version 
>> 2405434959 ns   2323016424 ns 2368810003 ns
>>
>> The interesting thing is, the previous version is slower than this patch,
>> i guess the reason is that the former keeps lots of invalid pages in mmu
>> which cause shadow page to be reclaimed due to used-pages > request-pages
>> or host memory shrink.
> 
> This patch series looks very nice!

Thank you, Takuya!

> 
> Minor issues may still need to be improved, but I really hope to see this
> get merged during this cycle.
> 
> [for the future]  Do you think that postponing some zapping/freeing of
> obsolete(already invalidated) pages to make_mmu_pages_available() time
> can improve the situation more?  -- say, for big guests.

Yes, I think it can. :)

We have made many efforts on this but still lack a straightforward way to
achieve it.

> 
> If accounting kept correct, make_mmu_pages_available() only needs to free
> some obsolete pages instead of valid pages.
> 

Yes.
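
(Purely as an illustration of that direction, and not anything that exists
in the patchset, here is a rough user-space sketch that prefers
already-obsolete pages when a limit is hit; every name in it is made up.)

#include <stdio.h>

#define NR_PAGES        8
#define LIMIT           5

struct page {
        int used;
        unsigned long valid_gen;
};

static struct page pages[NR_PAGES];
static unsigned long current_gen = 1;
static int used_pages;

/* free one page; if prefer_obsolete, skip pages of the current generation */
static int reclaim_one(int prefer_obsolete)
{
        for (int i = 0; i < NR_PAGES; i++) {
                struct page *p = &pages[i];

                if (!p->used)
                        continue;
                if (prefer_obsolete && p->valid_gen == current_gen)
                        continue;
                p->used = 0;
                used_pages--;
                return 1;
        }
        return 0;
}

static void make_pages_available(void)
{
        while (used_pages >= LIMIT) {
                if (reclaim_one(1))     /* obsolete pages first...           */
                        continue;
                if (!reclaim_one(0))    /* ...valid pages only as a fallback */
                        break;
        }
}

int main(void)
{
        for (int i = 0; i < NR_PAGES - 1; i++) {
                pages[i].used = 1;
                pages[i].valid_gen = i < 3 ? current_gen - 1 : current_gen;
                used_pages++;
        }
        make_pages_available();
        printf("used pages after reclaim: %d\n", used_pages);
        return 0;
}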




Re: [PATCH v5 8/8] KVM: MMU: zap pages in batch

2013-05-16 Thread Xiao Guangrong
On 05/16/2013 08:45 PM, Paolo Bonzini wrote:
> Il 16/05/2013 14:17, Xiao Guangrong ha scritto:
>> Zap at lease 10 pages before releasing mmu-lock to reduce the overload
>> caused by requiring lock
>>
>> [ It improves kernel building 0.6% ~ 1% ]
>>
>> Signed-off-by: Xiao Guangrong 
>> ---
>>  arch/x86/kvm/mmu.c |   11 ---
>>  1 files changed, 8 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index e12f431..9c27fda 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -4216,10 +4216,12 @@ restart:
>>  spin_unlock(&kvm->mmu_lock);
>>  }
>>  
>> +#define BATCH_ZAP_PAGES 10
>>  static void zap_invalid_pages(struct kvm *kvm)
>>  {
>>  struct kvm_mmu_page *sp, *node;
>>  LIST_HEAD(invalid_list);
>> +int batch = 0;
>>  
>>  restart:
>>  list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
>> @@ -4256,11 +4258,14 @@ restart:
>>   * Need not flush tlb since we only zap the sp with invalid
>>   * generation number.
>>   */
>> -if (cond_resched_lock(&kvm->mmu_lock))
>> +if ((batch >= BATCH_ZAP_PAGES) &&
>> +  cond_resched_lock(&kvm->mmu_lock)) {
>> +batch = 0;
>>  goto restart;
>> +}
>>  
>> -if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
>> -goto restart;
>> +batch += kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
>> +goto restart;
> 
> Would this look again and again at the same page if
> kvm_mmu_prepare_zap_page returns 0?

We skip invalid pages (sp->role.invalid) before calling
kvm_mmu_prepare_zap_page, so kvm_mmu_prepare_zap_page can not
meet the same page again. ;)





Re: [PATCH v5 3/8] KVM: MMU: fast invalidate all pages

2013-05-16 Thread Xiao Guangrong
On 05/16/2013 08:43 PM, Gleb Natapov wrote:
> On Thu, May 16, 2013 at 08:17:48PM +0800, Xiao Guangrong wrote:
>> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
>> walk and zap all shadow pages one by one, also it need to zap all guest
>> page's rmap and all shadow page's parent spte list. Particularly, things
>> become worse if guest uses more memory or vcpus. It is not good for
>> scalability
>>
>> In this patch, we introduce a faster way to invalidate all shadow pages.
>> KVM maintains a global mmu invalid generation-number which is stored in
>> kvm->arch.mmu_valid_gen and every shadow page stores the current global
>> generation-number into sp->mmu_valid_gen when it is created
>>
>> When KVM need zap all shadow pages sptes, it just simply increase the
>> global generation-number then reload root shadow pages on all vcpus.
>> Vcpu will create a new shadow page table according to current kvm's
>> generation-number. It ensures the old pages are not used any more.
>> Then the invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
>> are zapped by using lock-break technique
>>
>> Signed-off-by: Xiao Guangrong 
>> ---
>>  arch/x86/include/asm/kvm_host.h |2 +
>>  arch/x86/kvm/mmu.c  |   98 
>> +++
>>  arch/x86/kvm/mmu.h  |2 +
>>  3 files changed, 102 insertions(+), 0 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h 
>> b/arch/x86/include/asm/kvm_host.h
>> index 3741c65..bff7d46 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -222,6 +222,7 @@ struct kvm_mmu_page {
>>  int root_count;  /* Currently serving as active root */
>>  unsigned int unsync_children;
>>  unsigned long parent_ptes;  /* Reverse mapping for parent_pte */
>> +unsigned long mmu_valid_gen;
>>  DECLARE_BITMAP(unsync_child_bitmap, 512);
>>  
>>  #ifdef CONFIG_X86_32
>> @@ -529,6 +530,7 @@ struct kvm_arch {
>>  unsigned int n_requested_mmu_pages;
>>  unsigned int n_max_mmu_pages;
>>  unsigned int indirect_shadow_pages;
>> +unsigned long mmu_valid_gen;
>>  struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
>>  /*
>>   * Hash table of struct kvm_mmu_page.
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 682ecb4..d9343fe 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -1839,6 +1839,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
>>  __clear_sp_write_flooding_count(sp);
>>  }
>>  
>> +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>> +{
>> +return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
>> +}
>> +
>>  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>>   gfn_t gfn,
>>   gva_t gaddr,
>> @@ -1865,6 +1870,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
>> kvm_vcpu *vcpu,
>>  role.quadrant = quadrant;
>>  }
>>  for_each_gfn_sp(vcpu->kvm, sp, gfn) {
>> +if (is_obsolete_sp(vcpu->kvm, sp))
>> +continue;
>> +
>>  if (!need_sync && sp->unsync)
>>  need_sync = true;
>>  
>> @@ -1901,6 +1909,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
>> kvm_vcpu *vcpu,
>>  
>>  account_shadowed(vcpu->kvm, gfn);
>>  }
>> +sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
>>  init_shadow_page_table(sp);
>>  trace_kvm_mmu_get_page(sp, true);
>>  return sp;
>> @@ -2071,8 +2080,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
>> struct kvm_mmu_page *sp,
>>  ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
>>  kvm_mmu_page_unlink_children(kvm, sp);
>>  kvm_mmu_unlink_parents(kvm, sp);
>> +
>>  if (!sp->role.invalid && !sp->role.direct)
>>  unaccount_shadowed(kvm, sp->gfn);
>> +
>>  if (sp->unsync)
>>  kvm_unlink_unsync_page(kvm, sp);
>>  
>> @@ -4196,6 +4207,93 @@ restart:
>>  spin_unlock(&kvm->mmu_lock);
>>  }
>>  
>> +static void zap_invalid_pages(struct kvm *kvm)
>> +{
>> +struct kvm_mmu_page *sp, *node;
>> +LIST_HEAD(invalid_list);
>> +
>> +restart:

[PATCH v5 0/8] KVM: MMU: fast zap all shadow pages

2013-05-16 Thread Xiao Guangrong
Benchmark result:
I have tested this patchset and the previous version that only zaps the
pages linked on the invalid slot's rmap. The benchmark, which is attached,
was written by myself; it writes a large amount of memory while doing PCI
ROM reads.

Host: Intel(R) Xeon(R) CPU X5690  @ 3.47GHz + 36G Memory
Guest: 12 VCPU + 32G Memory

Current code:   This patchset Previous Version
2405434959 ns   2323016424 ns 2368810003 ns

The interesting thing is that the previous version is slower than this
patchset; I guess the reason is that the former keeps lots of invalid pages
in the mmu, which causes shadow pages to be reclaimed because of
used-pages > request-pages or host memory shrinking.

Changelog:
V5:
  1): rename is_valid_sp to is_obsolete_sp
  2): use lock-break technique to zap all old pages instead of only pages
  linked on invalid slot's rmap suggested by Marcelo.
  3): trace invalid pages and kvm_mmu_invalidate_memslot_pages() 
  4): rename kvm_mmu_invalid_memslot_pages to kvm_mmu_invalidate_memslot_pages
  according to Takuya's comments. 

V4:
  1): drop unmapping invalid rmap out of mmu-lock and use lock-break technique
  instead. Thanks to Gleb's comments.

  2): needn't handle invalid-gen pages specially due to page table always
  switched by KVM_REQ_MMU_RELOAD. Thanks to Marcelo's comments.

V3:
  completely redesign the algorithm, please see below.

V2:
  - do not reset n_requested_mmu_pages and n_max_mmu_pages
  - batch free root shadow pages to reduce vcpu notification and mmu-lock
contention
  - remove the first patch that introduce kvm->arch.mmu_cache since we only
'memset zero' on hashtable rather than all mmu cache members in this
version
  - remove unnecessary kvm_reload_remote_mmus after kvm_mmu_zap_all

* Issue
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while it
walks and zaps all shadow pages one by one, and it also needs to zap every
guest page's rmap and every shadow page's parent spte list. Things become
particularly bad if the guest uses more memory or vcpus. It is not good
for scalability.

* Idea
KVM maintains a global mmu invalidation generation-number, which is stored
in kvm->arch.mmu_valid_gen, and every shadow page stores the current global
generation-number in sp->mmu_valid_gen when it is created.

When KVM needs to zap all shadow page sptes, it simply increases the global
generation-number and then reloads the root shadow pages on all vcpus. Each
vcpu then creates a new shadow page table according to the current
generation-number, which ensures the old pages are not used any more.

The invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen) are
then zapped using the lock-break technique.

Xiao Guangrong (8):
  KVM: MMU: drop unnecessary kvm_reload_remote_mmus
  KVM: MMU: delete shadow page from hash list in
kvm_mmu_prepare_zap_page
  KVM: MMU: fast invalidate all pages
  KVM: x86: use the fast way to invalidate all pages
  KVM: MMU: make kvm_mmu_zap_all preemptable
  KVM: MMU: show mmu_valid_gen in shadow page related tracepoints
  KVM: MMU: add tracepoint for kvm_mmu_invalidate_memslot_pages
  KVM: MMU: zap pages in batch

 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/mmu.c  |  124 ++-
 arch/x86/kvm/mmu.h  |2 +
 arch/x86/kvm/mmutrace.h |   45 +++---
 arch/x86/kvm/x86.c  |9 +--
 5 files changed, 163 insertions(+), 19 deletions(-)

-- 
1.7.7.6



Re: [PATCH v5 0/8] KVM: MMU: fast zap all shadow pages

2013-05-16 Thread Xiao Guangrong
Attaching the benchmark.

On 05/16/2013 08:17 PM, Xiao Guangrong wrote:
> Bechmark result:
> I have tested this patchset and the previous version that only zaps the
> pages linked on invalid slot's rmap. The benchmark is written by myself
> which has been attached, it writes large memory when do pci rom read.
> 
> Host: Intel(R) Xeon(R) CPU X5690  @ 3.47GHz + 36G Memory
> Guest: 12 VCPU + 32G Memory
> 
> Current code:   This patchset Previous Version 
> 2405434959 ns   2323016424 ns 2368810003 ns
> 
> The interesting thing is, the previous version is slower than this patch,
> i guess the reason is that the former keeps lots of invalid pages in mmu
> which cause shadow page to be reclaimed due to used-pages > request-pages
> or host memory shrink.
> 
> Changlog:
> V5:
>   1): rename is_valid_sp to is_obsolete_sp
>   2): use lock-break technique to zap all old pages instead of only pages
>   linked on invalid slot's rmap suggested by Marcelo.
>   3): trace invalid pages and kvm_mmu_invalidate_memslot_pages() 
>   4): rename kvm_mmu_invalid_memslot_pages to kvm_mmu_invalidate_memslot_pages
>   according to Takuya's comments. 
> 
> V4:
>   1): drop unmapping invalid rmap out of mmu-lock and use lock-break technique
>   instead. Thanks to Gleb's comments.
> 
>   2): needn't handle invalid-gen pages specially due to page table always
>   switched by KVM_REQ_MMU_RELOAD. Thanks to Marcelo's comments.
> 
> V3:
>   completely redesign the algorithm, please see below.
> 
> V2:
>   - do not reset n_requested_mmu_pages and n_max_mmu_pages
>   - batch free root shadow pages to reduce vcpu notification and mmu-lock
> contention
>   - remove the first patch that introduce kvm->arch.mmu_cache since we only
> 'memset zero' on hashtable rather than all mmu cache members in this
> version
>   - remove unnecessary kvm_reload_remote_mmus after kvm_mmu_zap_all
> 
> * Issue
> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
> walk and zap all shadow pages one by one, also it need to zap all guest
> page's rmap and all shadow page's parent spte list. Particularly, things
> become worse if guest uses more memory or vcpus. It is not good for
> scalability.
> 
> * Idea
> KVM maintains a global mmu invalid generation-number which is stored in
> kvm->arch.mmu_valid_gen and every shadow page stores the current global
> generation-number into sp->mmu_valid_gen when it is created.
> 
> When KVM need zap all shadow pages sptes, it just simply increase the
> global generation-number then reload root shadow pages on all vcpus.
> Vcpu will create a new shadow page table according to current kvm's
> generation-number. It ensures the old pages are not used any more.
> 
> Then the invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
> are zapped by using lock-break technique.
> 
> Xiao Guangrong (8):
>   KVM: MMU: drop unnecessary kvm_reload_remote_mmus
>   KVM: MMU: delete shadow page from hash list in
> kvm_mmu_prepare_zap_page
>   KVM: MMU: fast invalidate all pages
>   KVM: x86: use the fast way to invalidate all pages
>   KVM: MMU: make kvm_mmu_zap_all preemptable
>   KVM: MMU: show mmu_valid_gen in shadow page related tracepoints
>   KVM: MMU: add tracepoint for kvm_mmu_invalidate_memslot_pages
>   KVM: MMU: zap pages in batch
> 
>  arch/x86/include/asm/kvm_host.h |2 +
>  arch/x86/kvm/mmu.c  |  124 
> ++-
>  arch/x86/kvm/mmu.h  |2 +
>  arch/x86/kvm/mmutrace.h |   45 +++---
>  arch/x86/kvm/x86.c  |9 +--
>  5 files changed, 163 insertions(+), 19 deletions(-)
> 



mmtest.tar.bz2
Description: application/bzip


[PATCH v5 7/8] KVM: MMU: add tracepoint for kvm_mmu_invalidate_memslot_pages

2013-05-16 Thread Xiao Guangrong
It is good for debugging and development.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c  |2 ++
 arch/x86/kvm/mmutrace.h |   23 +++
 2 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 268b2ff..e12f431 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4300,6 +4300,8 @@ void kvm_mmu_invalidate_memslot_pages(struct kvm *kvm,
 
if (slot)
zap_invalid_pages(kvm);
+
+   trace_kvm_mmu_invalidate_memslot_pages(kvm, slot);
spin_unlock(&kvm->mmu_lock);
 }
 
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 697f466..8ef3e0e 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -276,6 +276,29 @@ TRACE_EVENT(
  __spte_satisfied(old_spte), __spte_satisfied(new_spte)
)
 );
+
+TRACE_EVENT(
+   kvm_mmu_invalidate_memslot_pages,
+   TP_PROTO(struct kvm *kvm, struct kvm_memory_slot *slot),
+   TP_ARGS(kvm, slot),
+
+   TP_STRUCT__entry(
+   __field(unsigned long, mmu_valid_gen)
+   __field(unsigned int, mmu_used_pages)
+   __field(int, slot_id)
+   ),
+
+   TP_fast_assign(
+   __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+   __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+   __entry->slot_id = slot ? slot->id : -1;
+   ),
+
+   TP_printk("kvm-mmu-valid-gen %lx slot_id %d used_pages %x",
+ __entry->mmu_valid_gen, __entry->slot_id,
+ __entry->mmu_used_pages
+   )
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
-- 
1.7.7.6



[PATCH v5 3/8] KVM: MMU: fast invalidate all pages

2013-05-16 Thread Xiao Guangrong
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while it
walks and zaps all shadow pages one by one, and it also needs to zap every
guest page's rmap and every shadow page's parent spte list. Things become
particularly bad if the guest uses more memory or vcpus. It is not good
for scalability.

In this patch, we introduce a faster way to invalidate all shadow pages.
KVM maintains a global mmu invalidation generation-number, which is stored
in kvm->arch.mmu_valid_gen, and every shadow page stores the current global
generation-number in sp->mmu_valid_gen when it is created.

When KVM needs to zap all shadow page sptes, it simply increases the global
generation-number and then reloads the root shadow pages on all vcpus. Each
vcpu then creates a new shadow page table according to the current
generation-number, which ensures the old pages are not used any more.
The invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen) are
then zapped using the lock-break technique.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/mmu.c  |   98 +++
 arch/x86/kvm/mmu.h  |2 +
 3 files changed, 102 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3741c65..bff7d46 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -222,6 +222,7 @@ struct kvm_mmu_page {
int root_count;  /* Currently serving as active root */
unsigned int unsync_children;
unsigned long parent_ptes;  /* Reverse mapping for parent_pte */
+   unsigned long mmu_valid_gen;
DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -529,6 +530,7 @@ struct kvm_arch {
unsigned int n_requested_mmu_pages;
unsigned int n_max_mmu_pages;
unsigned int indirect_shadow_pages;
+   unsigned long mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 682ecb4..d9343fe 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1839,6 +1839,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
__clear_sp_write_flooding_count(sp);
 }
 
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+   return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 gfn_t gfn,
 gva_t gaddr,
@@ -1865,6 +1870,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
role.quadrant = quadrant;
}
for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+   if (is_obsolete_sp(vcpu->kvm, sp))
+   continue;
+
if (!need_sync && sp->unsync)
need_sync = true;
 
@@ -1901,6 +1909,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
 
account_shadowed(vcpu->kvm, gfn);
}
+   sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
init_shadow_page_table(sp);
trace_kvm_mmu_get_page(sp, true);
return sp;
@@ -2071,8 +2080,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
+
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
+
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
 
@@ -4196,6 +4207,93 @@ restart:
spin_unlock(&kvm->mmu_lock);
 }
 
+static void zap_invalid_pages(struct kvm *kvm)
+{
+   struct kvm_mmu_page *sp, *node;
+   LIST_HEAD(invalid_list);
+
+restart:
+   list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+   if (!is_obsolete_sp(kvm, sp))
+   continue;
+
+   /*
+* Do not repeatedly zap a root page to avoid unnecessary
+* KVM_REQ_MMU_RELOAD, otherwise we may not be able to
+* progress:
+*vcpu 0vcpu 1
+* call vcpu_enter_guest():
+*1): handle KVM_REQ_MMU_RELOAD
+*and require mmu-lock to
+*load mmu
+* repeat:
+*1): zap root page and
+*send KVM_REQ_MMU_RELOAD
+*
+*2): if (cond_resched_lock(mmu-lock))

[PATCH v5 5/8] KVM: MMU: make kvm_mmu_zap_all preemptable

2013-05-16 Thread Xiao Guangrong
Now kvm_mmu_zap_all is only called on the mmu_notifier->release path; at
that time the vcpus have stopped, which means no new pages will be created,
so we can use the lock-break technique to avoid a potential soft lockup.

(Note: at this time, the mmu-lock can still be contended between ->release
 and the other mmu-notifier handlers.)

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   11 ++-
 1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d9343fe..268b2ff 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4197,11 +4197,20 @@ void kvm_mmu_zap_all(struct kvm *kvm)
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
 
+   might_sleep();
+
spin_lock(&kvm->mmu_lock);
 restart:
-   list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
+   list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+   if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+   kvm_mmu_commit_zap_page(kvm, &invalid_list);
+   cond_resched_lock(&kvm->mmu_lock);
+   goto restart;
+   }
+
if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
goto restart;
+   }
 
kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
-- 
1.7.7.6



[PATCH v5 4/8] KVM: x86: use the fast way to invalidate all pages

2013-05-16 Thread Xiao Guangrong
Replace kvm_mmu_zap_all with kvm_mmu_invalidate_memslot_pages except on
the mmu_notifier->release() path, which will be fixed in
a later patch.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d885418..c2dd732 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5528,7 +5528,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt 
*ctxt)
 * to ensure that the updated hypercall appears atomically across all
 * VCPUs.
 */
-   kvm_mmu_zap_all(vcpu->kvm);
+   kvm_mmu_invalidate_memslot_pages(vcpu->kvm, NULL);
 
kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
@@ -7079,7 +7079,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
   struct kvm_memory_slot *slot)
 {
-   kvm_arch_flush_shadow_all(kvm);
+   kvm_mmu_invalidate_memslot_pages(kvm, slot);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-- 
1.7.7.6



[PATCH v5 1/8] KVM: MMU: drop unnecessary kvm_reload_remote_mmus

2013-05-16 Thread Xiao Guangrong
It is the responsibility of kvm_mmu_zap_all to keep the mmu and the
tlbs consistent. The reload is also unnecessary after zapping all
mmio sptes, since no mmio spte exists on a root shadow page and so
it can not be cached in the tlb.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |5 +
 1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8d28810..d885418 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7067,16 +7067,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 * If memory slot is created, or moved, we need to clear all
 * mmio sptes.
 */
-   if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
+   if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE))
kvm_mmu_zap_mmio_sptes(kvm);
-   kvm_reload_remote_mmus(kvm);
-   }
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
kvm_mmu_zap_all(kvm);
-   kvm_reload_remote_mmus(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-- 
1.7.7.6



[PATCH v5 8/8] KVM: MMU: zap pages in batch

2013-05-16 Thread Xiao Guangrong
Zap at least 10 pages before releasing mmu-lock to reduce the overhead
caused by re-acquiring the lock.

[ It improves kernel building by 0.6% ~ 1% ]

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   11 ---
 1 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e12f431..9c27fda 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4216,10 +4216,12 @@ restart:
spin_unlock(&kvm->mmu_lock);
 }
 
+#define BATCH_ZAP_PAGES10
 static void zap_invalid_pages(struct kvm *kvm)
 {
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
+   int batch = 0;
 
 restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
@@ -4256,11 +4258,14 @@ restart:
 * Need not flush tlb since we only zap the sp with invalid
 * generation number.
 */
-   if (cond_resched_lock(&kvm->mmu_lock))
+   if ((batch >= BATCH_ZAP_PAGES) &&
+ cond_resched_lock(&kvm->mmu_lock)) {
+   batch = 0;
goto restart;
+   }
 
-   if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-   goto restart;
+   batch += kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+   goto restart;
}
 
/*
-- 
1.7.7.6



[PATCH v5 6/8] KVM: MMU: show mmu_valid_gen in shadow page related tracepoints

2013-05-16 Thread Xiao Guangrong
Show sp->mmu_valid_gen

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmutrace.h |   22 --
 1 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b8f6172..697f466 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -7,16 +7,18 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
 
-#define KVM_MMU_PAGE_FIELDS \
-   __field(__u64, gfn) \
-   __field(__u32, role) \
-   __field(__u32, root_count) \
+#define KVM_MMU_PAGE_FIELDS\
+   __field(unsigned long, mmu_valid_gen)   \
+   __field(__u64, gfn) \
+   __field(__u32, role)\
+   __field(__u32, root_count)  \
__field(bool, unsync)
 
-#define KVM_MMU_PAGE_ASSIGN(sp) \
-   __entry->gfn = sp->gfn;  \
-   __entry->role = sp->role.word;   \
-   __entry->root_count = sp->root_count;\
+#define KVM_MMU_PAGE_ASSIGN(sp)\
+   __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+   __entry->gfn = sp->gfn; \
+   __entry->role = sp->role.word;  \
+   __entry->root_count = sp->root_count;   \
__entry->unsync = sp->unsync;
 
 #define KVM_MMU_PAGE_PRINTK() ({   \
@@ -28,8 +30,8 @@
\
role.word = __entry->role;  \
\
-   trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s"   \
-" %snxe root %u %s%c", \
+   trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s"   \
+" %snxe root %u %s%c", __entry->mmu_valid_gen, \
 __entry->gfn, role.level,  \
 role.cr4_pae ? " pae" : "",\
 role.quadrant, \
-- 
1.7.7.6



[PATCH v5 2/8] KVM: MMU: delete shadow page from hash list in kvm_mmu_prepare_zap_page

2013-05-16 Thread Xiao Guangrong
Move the deletion of the shadow page from the hash list from
kvm_mmu_commit_zap_page to kvm_mmu_prepare_zap_page so that we can call
kvm_mmu_commit_zap_page once for multiple kvm_mmu_prepare_zap_page calls,
which helps us avoid unnecessary TLB flushes.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |8 ++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 40d7b2d..682ecb4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1466,7 +1466,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm 
*kvm, int nr)
 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
ASSERT(is_empty_shadow_page(sp->spt));
-   hlist_del(&sp->hash_link);
+
list_del(&sp->link);
free_page((unsigned long)sp->spt);
if (!sp->role.direct)
@@ -1655,7 +1655,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)
\
for_each_gfn_sp(_kvm, _sp, _gfn)\
-   if ((_sp)->role.direct || (_sp)->role.invalid) {} else
+   if ((_sp)->role.direct ||   \
+ ((_sp)->role.invalid && WARN_ON(1))) {} else
 
 /* @sp->gfn should be write-protected at the call site */
 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2074,6 +2075,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
+
+   hlist_del_init(&sp->hash_link);
+
if (!sp->root_count) {
/* Count self */
ret++;
-- 
1.7.7.6



Re: [PATCH 00/13] perf: kvm live mode

2013-05-09 Thread Xiao Guangrong
CC kvm list.

On 05/09/2013 12:31 PM, David Ahern wrote:
> With the consolidation of the open counters code in December 2012
> (late to the party figuring that out) I think all of the past
> comments on the live mode for perf-kvm have been resolved.

Great work, David! I am playing with it and am glad to see it works fine.
Will review the patches later.



Re: [PATCH] perf tools: Fix perf version generation

2013-05-08 Thread Xiao Guangrong
On 05/08/2013 05:43 PM, Robert Richter wrote:
> From: Robert Richter 
> 
> The tag of the perf version is wrongly determined, always the latest
> tag is taken regardless of the HEAD commit:
> 
>  $ perf --version
>  perf version 3.9.rc8.gd7f5d3
>  $ git describe d7f5d3
>  v3.9-rc7-154-gd7f5d33
>  $ head -n 4 Makefile
>  VERSION = 3
>  PATCHLEVEL = 9
>  SUBLEVEL = 0
>  EXTRAVERSION = -rc7
> 
> In other cases no tag might be found.
> 
> This patch fixes this.
> 
> This new implementation handles also the case if there are no tags at
> all found in the git repo but there is a commit id.

I remember that Wang posted a similar patch to fix this bug several
weeks ago:
http://lkml.org/lkml/2013/4/24/70

But it has had no response yet.



Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-07 Thread Xiao Guangrong
On 05/07/2013 04:58 PM, Gleb Natapov wrote:
> On Tue, May 07, 2013 at 01:45:52AM +0800, Xiao Guangrong wrote:
>> On 05/07/2013 01:24 AM, Gleb Natapov wrote:
>>> On Mon, May 06, 2013 at 09:10:11PM +0800, Xiao Guangrong wrote:
>>>> On 05/06/2013 08:36 PM, Gleb Natapov wrote:
>>>>
>>>>>>> Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via
>>>>>>> spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all 
>>>>>>> releases mmu_lock and reacquires it again, only shadow pages 
>>>>>>> from the generation with which kvm_mmu_zap_all started are zapped (this
>>>>>>> guarantees forward progress and eventual termination).
>>>>>>>
>>>>>>> kvm_mmu_zap_generation()
>>>>>>> spin_lock(mmu_lock)
>>>>>>> int generation = kvm->arch.mmu_generation;
>>>>>>>
>>>>>>> for_each_shadow_page(sp) {
>>>>>>> if (sp->generation == kvm->arch.mmu_generation)
>>>>>>> zap_page(sp)
>>>>>>> if (spin_needbreak(mmu_lock)) {
>>>>>>> kvm->arch.mmu_generation++;
>>>>>>> cond_resched_lock(mmu_lock);
>>>>>>> }
>>>>>>> }
>>>>>>>
>>>>>>> kvm_mmu_zap_all()
>>>>>>> spin_lock(mmu_lock)
>>>>>>> for_each_shadow_page(sp) {
>>>>>>> if (spin_needbreak(mmu_lock)) {
>>>>>>> cond_resched_lock(mmu_lock);
>>>>>>> }
>>>>>>> }
>>>>>>>
>>>>>>> Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot.
>>>>>>> Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm.
>>>>>>>
>>>>>>> This addresses the main problem: excessively long hold times 
>>>>>>> of kvm_mmu_zap_all with very large guests.
>>>>>>>
>>>>>>> Do you see any problem with this logic? This was what i was thinking 
>>>>>>> we agreed.
>>>>>>
>>>>>> No. I understand it and it can work.
>>>>>>
>>>>>> Actually, it is similar with Gleb's idea that "zapping stale shadow pages
>>>>>> (and uses lock break technique)", after some discussion, we thought 
>>>>>> "only zap
>>>>>> shadow pages that are reachable from the slot's rmap" is better, that is 
>>>>>> this
>>>>>> patchset does.
>>>>>> (https://lkml.org/lkml/2013/4/23/73)
>>>>>>
>>>>> But this is not what the patch is doing. Close, but not the same :)
>>>>
>>>> Okay. :)
>>>>
>>>>> Instead of zapping shadow pages reachable from slot's rmap the patch
>>>>> does kvm_unmap_rmapp() which drop all spte without zapping shadow pages.
>>>>> That is why you need special code to re-init lpage_info. What I proposed
>>>>> was to call zap_page() on all shadow pages reachable from rmap. This
>>>>> will take care of lpage_info counters. Does this make sense?
>>>>
>>>> Unfortunately, no! We still need to care lpage_info. lpage_info is used
>>>> to count the number of guest page tables in the memslot.
>>>>
>>>> For example, there is a memslot:
>>>> memslot[0].based_gfn = 0, memslot[0].npages = 100,
>>>>
>>>> and there is a shadow page:
>>>> sp->role.direct =0, sp->role.level = 4, sp->gfn = 10.
>>>>
>>>> this sp is counted in the memslot[0] but it can not be found by walking
>>>> memslot[0]->rmap since there is no last mapping in this shadow page.
>>>>
>>> Right, so what about walking mmu_page_hash for each gfn belonging to the
>>> slot that is in process to be removed to find those?
>>
>> That will cost lots of time. The size of hashtable is 1 << 10. If the
>> memslot has 4M memory, it will walk all the entries, the cost is the same
>> as walking active_list (maybe litter more). And a memslot has 4M memory is
>> the normal case i think.
>>
> Memslots will be much bigger with memory hotplug. Lock break should be
> used while walking mmu_page_hash obviously, but still iterating over
> entire memslot gfn space to find a few gfn that may be there is
> suboptimal. We can keep a list of them in the memslot itself.

It sounds good to me.

BTW, this approach looks more complex and uses more memory (a new list_head
added to every shadow page), so why do you dislike clearing lpage_info? ;)

> 
>> Another point is that lpage_info stops mmu to use large page. If we
>> do not reset lpage_info, mmu is using 4K page until the invalid-sp is
>> zapped.
>>
> I do not think this is a big issue. If lpage_info prevented the use of
> large pages for some memory ranges before we zapped entire shadow pages
> it was probably for a reason, so new shadow page will prevent large
> pages from been created for the same memory ranges.

Still worried, but I will try it if Marcelo does not object.
Thanks a lot for your valuable suggestion, Gleb!

Now, I am trying my best to catch Marcelo's idea of "zapping root
pages", but..





Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-06 Thread Xiao Guangrong
On 05/07/2013 03:50 AM, Marcelo Tosatti wrote:
> On Mon, May 06, 2013 at 11:39:11AM +0800, Xiao Guangrong wrote:
>> On 05/04/2013 08:52 AM, Marcelo Tosatti wrote:
>>> On Sat, May 04, 2013 at 12:51:06AM +0800, Xiao Guangrong wrote:
>>>> On 05/03/2013 11:53 PM, Marcelo Tosatti wrote:
>>>>> On Fri, May 03, 2013 at 01:52:07PM +0800, Xiao Guangrong wrote:
>>>>>> On 05/03/2013 09:05 AM, Marcelo Tosatti wrote:
>>>>>>
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * Fast invalid all shadow pages belong to @slot.
>>>>>>>> + *
>>>>>>>> + * @slot != NULL means the invalidation is caused the memslot 
>>>>>>>> specified
>>>>>>>> + * by @slot is being deleted, in this case, we should ensure that rmap
>>>>>>>> + * and lpage-info of the @slot can not be used after calling the 
>>>>>>>> function.
>>>>>>>> + *
>>>>>>>> + * @slot == NULL means the invalidation due to other reasons, we need
>>>>>>>> + * not care rmap and lpage-info since they are still valid after 
>>>>>>>> calling
>>>>>>>> + * the function.
>>>>>>>> + */
>>>>>>>> +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
>>>>>>>> + struct kvm_memory_slot *slot)
>>>>>>>> +{
>>>>>>>> +  spin_lock(&kvm->mmu_lock);
>>>>>>>> +  kvm->arch.mmu_valid_gen++;
>>>>>>>> +
>>>>>>>> +  /*
>>>>>>>> +   * All shadow paes are invalid, reset the large page info,
>>>>>>>> +   * then we can safely desotry the memslot, it is also good
>>>>>>>> +   * for large page used.
>>>>>>>> +   */
>>>>>>>> +  kvm_clear_all_lpage_info(kvm);
>>>>>>>
>>>>>>> Xiao,
>>>>>>>
>>>>>>> I understood it was agreed that simple mmu_lock lockbreak while
>>>>>>> avoiding zapping of newly instantiated pages upon a
>>>>>>>
>>>>>>> if(spin_needbreak)
>>>>>>> cond_resched_lock()
>>>>>>>
>>>>>>> cycle was enough as a first step? And then later introduce root zapping
>>>>>>> along with measurements.
>>>>>>>
>>>>>>> https://lkml.org/lkml/2013/4/22/544
>>>>>>
>>>>>> Yes, it is.
>>>>>>
>>>>>> See the changelog in 0/0:
>>>>>>
>>>>>> " we use lock-break technique to zap all sptes linked on the
>>>>>> invalid rmap, it is not very effective but good for the first step."
>>>>>>
>>>>>> Thanks!
>>>>>
>>>>> Sure, but what is up with zeroing kvm_clear_all_lpage_info(kvm) and
>>>>> zapping the root? Only lock-break technique along with generation number 
>>>>> was what was agreed.
>>>>
>>>> Marcelo,
>>>>
>>>> Please Wait... I am completely confused. :(
>>>>
>>>> Let's clarify "zeroing kvm_clear_all_lpage_info(kvm) and zapping the root" 
>>>> first.
>>>> Are these changes you wanted?
>>>>
>>>> void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
>>>>   struct kvm_memory_slot *slot)
>>>> {
>>>>spin_lock(&kvm->mmu_lock);
>>>>kvm->arch.mmu_valid_gen++;
>>>>
>>>>/* Zero all root pages.*/
>>>> restart:
>>>>list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
>>>>if (!sp->root_count)
>>>>continue;
>>>>
>>>>if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
>>>>goto restart;
>>>>}
>>>>
>>>>/*
>>>> * All shadow paes are invalid, reset the large page info,
>>>> * then we can safely desotry the memslot, it is also good
>>>> * for large page used.
>>>> */
>>>>kvm_clear_all_lpage

Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-06 Thread Xiao Guangrong
On 05/07/2013 01:24 AM, Gleb Natapov wrote:
> On Mon, May 06, 2013 at 09:10:11PM +0800, Xiao Guangrong wrote:
>> On 05/06/2013 08:36 PM, Gleb Natapov wrote:
>>
>>>>> Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via
>>>>> spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all 
>>>>> releases mmu_lock and reacquires it again, only shadow pages 
>>>>> from the generation with which kvm_mmu_zap_all started are zapped (this
>>>>> guarantees forward progress and eventual termination).
>>>>>
>>>>> kvm_mmu_zap_generation()
>>>>>   spin_lock(mmu_lock)
>>>>>   int generation = kvm->arch.mmu_generation;
>>>>>
>>>>>   for_each_shadow_page(sp) {
>>>>>   if (sp->generation == kvm->arch.mmu_generation)
>>>>>   zap_page(sp)
>>>>>   if (spin_needbreak(mmu_lock)) {
>>>>>   kvm->arch.mmu_generation++;
>>>>>   cond_resched_lock(mmu_lock);
>>>>>   }
>>>>>   }
>>>>>
>>>>> kvm_mmu_zap_all()
>>>>>   spin_lock(mmu_lock)
>>>>>   for_each_shadow_page(sp) {
>>>>>   if (spin_needbreak(mmu_lock)) {
>>>>>   cond_resched_lock(mmu_lock);
>>>>>   }
>>>>>   }
>>>>>
>>>>> Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot.
>>>>> Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm.
>>>>>
>>>>> This addresses the main problem: excessively long hold times 
>>>>> of kvm_mmu_zap_all with very large guests.
>>>>>
>>>>> Do you see any problem with this logic? This was what i was thinking 
>>>>> we agreed.
>>>>
>>>> No. I understand it and it can work.
>>>>
>>>> Actually, it is similar with Gleb's idea that "zapping stale shadow pages
>>>> (and uses lock break technique)", after some discussion, we thought "only 
>>>> zap
>>>> shadow pages that are reachable from the slot's rmap" is better, that is 
>>>> this
>>>> patchset does.
>>>> (https://lkml.org/lkml/2013/4/23/73)
>>>>
>>> But this is not what the patch is doing. Close, but not the same :)
>>
>> Okay. :)
>>
>>> Instead of zapping shadow pages reachable from slot's rmap the patch
>>> does kvm_unmap_rmapp() which drop all spte without zapping shadow pages.
>>> That is why you need special code to re-init lpage_info. What I proposed
>>> was to call zap_page() on all shadow pages reachable from rmap. This
>>> will take care of lpage_info counters. Does this make sense?
>>
>> Unfortunately, no! We still need to care lpage_info. lpage_info is used
>> to count the number of guest page tables in the memslot.
>>
>> For example, there is a memslot:
>> memslot[0].based_gfn = 0, memslot[0].npages = 100,
>>
>> and there is a shadow page:
>> sp->role.direct =0, sp->role.level = 4, sp->gfn = 10.
>>
>> this sp is counted in the memslot[0] but it can not be found by walking
>> memslot[0]->rmap since there is no last mapping in this shadow page.
>>
> Right, so what about walking mmu_page_hash for each gfn belonging to the
> slot that is in process to be removed to find those?

That will cost lots of time. The size of the hashtable is 1 << 10, and if
the memslot has 4M of memory it covers 1024 gfns, so walking the hash for
every gfn costs about the same as walking active_list (maybe a little
more). And a memslot with 4M of memory is the normal case, I think.

Another point is that lpage_info stops the mmu from using large pages. If
we do not reset lpage_info, the mmu keeps using 4K pages until the
invalid-sp is zapped.




Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-06 Thread Xiao Guangrong
On 05/06/2013 08:36 PM, Gleb Natapov wrote:

>>> Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via
>>> spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all 
>>> releases mmu_lock and reacquires it again, only shadow pages 
>>> from the generation with which kvm_mmu_zap_all started are zapped (this
>>> guarantees forward progress and eventual termination).
>>>
>>> kvm_mmu_zap_generation()
>>> spin_lock(mmu_lock)
>>> int generation = kvm->arch.mmu_generation;
>>>
>>> for_each_shadow_page(sp) {
>>> if (sp->generation == kvm->arch.mmu_generation)
>>> zap_page(sp)
>>> if (spin_needbreak(mmu_lock)) {
>>> kvm->arch.mmu_generation++;
>>> cond_resched_lock(mmu_lock);
>>> }
>>> }
>>>
>>> kvm_mmu_zap_all()
>>> spin_lock(mmu_lock)
>>> for_each_shadow_page(sp) {
>>> if (spin_needbreak(mmu_lock)) {
>>> cond_resched_lock(mmu_lock);
>>> }
>>> }
>>>
>>> Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot.
>>> Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm.
>>>
>>> This addresses the main problem: excessively long hold times 
>>> of kvm_mmu_zap_all with very large guests.
>>>
>>> Do you see any problem with this logic? This was what i was thinking 
>>> we agreed.
>>
>> No. I understand it and it can work.
>>
>> Actually, it is similar with Gleb's idea that "zapping stale shadow pages
>> (and uses lock break technique)", after some discussion, we thought "only zap
>> shadow pages that are reachable from the slot's rmap" is better, that is this
>> patchset does.
>> (https://lkml.org/lkml/2013/4/23/73)
>>
> But this is not what the patch is doing. Close, but not the same :)

Okay. :)

> Instead of zapping shadow pages reachable from slot's rmap the patch
> does kvm_unmap_rmapp() which drop all spte without zapping shadow pages.
> That is why you need special code to re-init lpage_info. What I proposed
> was to call zap_page() on all shadow pages reachable from rmap. This
> will take care of lpage_info counters. Does this make sense?

Unfortunately, no! We still need to take care of lpage_info. lpage_info is
used to count the number of guest page tables in the memslot.

For example, there is a memslot:
memslot[0].base_gfn = 0, memslot[0].npages = 100,

and there is a shadow page:
sp->role.direct = 0, sp->role.level = 4, sp->gfn = 10.

This sp is counted in memslot[0], but it can not be found by walking
memslot[0]->rmap since there is no last-level mapping in this shadow page.




[PATCH RESEND] mm: mmu_notifier: re-fix freed page still mapped in secondary MMU

2013-05-06 Thread Xiao Guangrong
Commit 751efd8610d3 (mmu_notifier_unregister NULL Pointer deref
and multiple ->release()) breaks the fix in commit
3ad3d901bbcfb15a5e4690e55350db0899095a68
(mm: mmu_notifier: fix freed page still mapped in secondary MMU).

Since hlist_for_each_entry_rcu() has changed now, we can not revert that
patch directly, so this patch reverts the commit and simply fixes the bug
spotted by that patch.

The bug spotted by commit 751efd8610d3 is:
==
There is a race condition between mmu_notifier_unregister() and
__mmu_notifier_release().

Assume two tasks, one calling mmu_notifier_unregister() as a result of a
filp_close() ->flush() callout (task A), and the other calling
mmu_notifier_release() from an mmput() (task B).

A   B
t1  srcu_read_lock()
t2  if (!hlist_unhashed())
t3  srcu_read_unlock()
t4  srcu_read_lock()
t5  hlist_del_init_rcu()
t6  synchronize_srcu()
t7  srcu_read_unlock()
t8  hlist_del_rcu()  <--- NULL pointer deref.
==

This can be fixed by using hlist_del_init_rcu instead of hlist_del_rcu.
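
To see why that helps, here is a stand-alone user-space model that mirrors
the kernel hlist semantics (RCU and poisoning details left out, names
simplified); with the _init variant a second remover finds the node
unhashed and does nothing, while the plain variant would dereference the
NULL pprev and oops exactly as in the timeline above.

#include <stdio.h>

struct hnode {
        struct hnode *next;
        struct hnode **pprev;   /* points at whatever points at us */
};

struct hhead {
        struct hnode *first;
};

static int unhashed(struct hnode *n)
{
        return !n->pprev;
}

static void __hdel(struct hnode *n)
{
        *n->pprev = n->next;            /* would crash if pprev were NULL */
        if (n->next)
                n->next->pprev = n->pprev;
}

/* like hlist_del_rcu(): unconditional (the kernel also poisons pprev) */
void hdel(struct hnode *n)
{
        __hdel(n);
}

/* like hlist_del_init_rcu(): a no-op on an already-unhashed node */
static void hdel_init(struct hnode *n)
{
        if (!unhashed(n)) {
                __hdel(n);
                n->pprev = NULL;
        }
}

static void hadd(struct hhead *h, struct hnode *n)
{
        n->next = h->first;
        if (h->first)
                h->first->pprev = &n->next;
        h->first = n;
        n->pprev = &h->first;
}

int main(void)
{
        struct hhead head = { NULL };
        struct hnode n = { NULL, NULL };

        hadd(&head, &n);
        hdel_init(&n);  /* first remover                                */
        hdel_init(&n);  /* second remover: sees it unhashed, safe no-op */
        /* hdel(&n) here instead would dereference n.pprev == NULL      */
        printf("list empty: %d\n", head.first == NULL);
        return 0;
}

It prints "list empty: 1" and survives the double removal.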

The other issue spotted in the commit is
"multiple ->release() callouts"; we need not care about it too much because
it is really rare (e.g. it can not happen on kvm since mmu-notify is
unregistered after exit_mmap()) and a later one of the multiple ->release
calls should be fast since all the pages have already been released by the
first call. Anyway, this issue should be fixed in a separate patch.

-stable suggestions:
Any version that has commit 751efd8610d3 needs this backported. The oldest
version I can find that has this commit is 3.0-stable.

Tested-by: Robin Holt 
Cc: 
Signed-off-by: Xiao Guangrong 
---

Andrew, this patch has been tested by Robin and the test shows that the
bug of "NULL Pointer deref" has been fixed. However, there is still an
argument about whether the fix for "multiple ->release" should be merged
into this patch. (This patch just fixes the "NULL Pointer deref" bug.)

Your thoughts?

 mm/mmu_notifier.c |   81 +++--
 1 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index be04122..606777a 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -40,48 +40,45 @@ void __mmu_notifier_release(struct mm_struct *mm)
int id;

/*
-* srcu_read_lock() here will block synchronize_srcu() in
-* mmu_notifier_unregister() until all registered
-* ->release() callouts this function makes have
-* returned.
+* SRCU here will block mmu_notifier_unregister until
+* ->release returns.
 */
id = srcu_read_lock(&srcu);
+   hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+   /*
+* if ->release runs before mmu_notifier_unregister it
+* must be handled as it's the only way for the driver
+* to flush all existing sptes and stop the driver
+* from establishing any more sptes before all the
+* pages in the mm are freed.
+*/
+   if (mn->ops->release)
+   mn->ops->release(mn, mm);
+   srcu_read_unlock(&srcu, id);
+
spin_lock(&mm->mmu_notifier_mm->lock);
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
mn = hlist_entry(mm->mmu_notifier_mm->list.first,
 struct mmu_notifier,
 hlist);
-
/*
-* Unlink.  This will prevent mmu_notifier_unregister()
-* from also making the ->release() callout.
+* We arrived before mmu_notifier_unregister so
+* mmu_notifier_unregister will do nothing other than
+* to wait ->release to finish and
+* mmu_notifier_unregister to return.
 */
hlist_del_init_rcu(&mn->hlist);
-   spin_unlock(&mm->mmu_notifier_mm->lock);
-
-   /*
-* Clear sptes. (see 'release' description in mmu_notifier.h)
-*/
-   if (mn->ops->release)
-   mn->ops->release(mn, mm);
-
-   spin_lock(&mm->mmu_notifier_mm->lock);
}
spin_unlock(&mm->mmu_notifier_mm->lock);

/*
-* All callouts to ->release() which we have done are complete.
-* Allow synchronize_srcu() in mmu_notifier_unregister() to complete
-*/
-   srcu_read_unlock(&srcu, id);
-
-   /*
- 

Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-05 Thread Xiao Guangrong
On 05/04/2013 08:52 AM, Marcelo Tosatti wrote:
> On Sat, May 04, 2013 at 12:51:06AM +0800, Xiao Guangrong wrote:
>> On 05/03/2013 11:53 PM, Marcelo Tosatti wrote:
>>> On Fri, May 03, 2013 at 01:52:07PM +0800, Xiao Guangrong wrote:
>>>> On 05/03/2013 09:05 AM, Marcelo Tosatti wrote:
>>>>
>>>>>> +
>>>>>> +/*
>>>>>> + * Fast invalid all shadow pages belong to @slot.
>>>>>> + *
>>>>>> + * @slot != NULL means the invalidation is caused the memslot specified
>>>>>> + * by @slot is being deleted, in this case, we should ensure that rmap
>>>>>> + * and lpage-info of the @slot can not be used after calling the 
>>>>>> function.
>>>>>> + *
>>>>>> + * @slot == NULL means the invalidation due to other reasons, we need
>>>>>> + * not care rmap and lpage-info since they are still valid after calling
>>>>>> + * the function.
>>>>>> + */
>>>>>> +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
>>>>>> +   struct kvm_memory_slot *slot)
>>>>>> +{
>>>>>> +spin_lock(&kvm->mmu_lock);
>>>>>> +kvm->arch.mmu_valid_gen++;
>>>>>> +
>>>>>> +/*
>>>>>> + * All shadow paes are invalid, reset the large page info,
>>>>>> + * then we can safely desotry the memslot, it is also good
>>>>>> + * for large page used.
>>>>>> + */
>>>>>> +kvm_clear_all_lpage_info(kvm);
>>>>>
>>>>> Xiao,
>>>>>
>>>>> I understood it was agreed that simple mmu_lock lockbreak while
>>>>> avoiding zapping of newly instantiated pages upon a
>>>>>
>>>>>   if(spin_needbreak)
>>>>>   cond_resched_lock()
>>>>>
>>>>> cycle was enough as a first step? And then later introduce root zapping
>>>>> along with measurements.
>>>>>
>>>>> https://lkml.org/lkml/2013/4/22/544
>>>>
>>>> Yes, it is.
>>>>
>>>> See the changelog in 0/0:
>>>>
>>>> " we use lock-break technique to zap all sptes linked on the
>>>> invalid rmap, it is not very effective but good for the first step."
>>>>
>>>> Thanks!
>>>
>>> Sure, but what is up with zeroing kvm_clear_all_lpage_info(kvm) and
>>> zapping the root? Only lock-break technique along with generation number 
>>> was what was agreed.
>>
>> Marcelo,
>>
>> Please Wait... I am completely confused. :(
>>
>> Let's clarify "zeroing kvm_clear_all_lpage_info(kvm) and zapping the root" 
>> first.
>> Are these changes you wanted?
>>
>> void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
>> struct kvm_memory_slot *slot)
>> {
>>  spin_lock(&kvm->mmu_lock);
>>  kvm->arch.mmu_valid_gen++;
>>
>>  /* Zero all root pages.*/
>> restart:
>>  list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
>>  if (!sp->root_count)
>>  continue;
>>
>>  if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
>>  goto restart;
>>  }
>>
>>  /*
>>   * All shadow paes are invalid, reset the large page info,
>>   * then we can safely desotry the memslot, it is also good
>>   * for large page used.
>>   */
>>  kvm_clear_all_lpage_info(kvm);
>>
>>  kvm_mmu_commit_zap_page(kvm, &invalid_list);
>>  spin_unlock(&kvm->mmu_lock);
>> }
>>
>> static void rmap_remove(struct kvm *kvm, u64 *spte)
>> {
>>  struct kvm_mmu_page *sp;
>>  gfn_t gfn;
>>  unsigned long *rmapp;
>>
>>  sp = page_header(__pa(spte));
>> +
>> +   /* Let invalid sp do not access its rmap. */
>> +if (!sp_is_valid(sp))
>> +return;
>> +
>>  gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
>>  rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
>>  pte_list_remove(spte, rmapp);
>> }
>>
>> If yes, there is the reason why we can not do this that i mentioned before:
>>
>> after call kvm_mmu_invalid_memslot_p

Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-03 Thread Xiao Guangrong
On 05/03/2013 11:53 PM, Marcelo Tosatti wrote:
> On Fri, May 03, 2013 at 01:52:07PM +0800, Xiao Guangrong wrote:
>> On 05/03/2013 09:05 AM, Marcelo Tosatti wrote:
>>
>>>> +
>>>> +/*
>>>> + * Fast invalid all shadow pages belong to @slot.
>>>> + *
>>>> + * @slot != NULL means the invalidation is caused the memslot specified
>>>> + * by @slot is being deleted, in this case, we should ensure that rmap
>>>> + * and lpage-info of the @slot can not be used after calling the function.
>>>> + *
>>>> + * @slot == NULL means the invalidation due to other reasons, we need
>>>> + * not care rmap and lpage-info since they are still valid after calling
>>>> + * the function.
>>>> + */
>>>> +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
>>>> + struct kvm_memory_slot *slot)
>>>> +{
>>>> +  spin_lock(&kvm->mmu_lock);
>>>> +  kvm->arch.mmu_valid_gen++;
>>>> +
>>>> +  /*
>>>> +   * All shadow paes are invalid, reset the large page info,
>>>> +   * then we can safely desotry the memslot, it is also good
>>>> +   * for large page used.
>>>> +   */
>>>> +  kvm_clear_all_lpage_info(kvm);
>>>
>>> Xiao,
>>>
>>> I understood it was agreed that simple mmu_lock lockbreak while
>>> avoiding zapping of newly instantiated pages upon a
>>>
>>> if(spin_needbreak)
>>> cond_resched_lock()
>>>
>>> cycle was enough as a first step? And then later introduce root zapping
>>> along with measurements.
>>>
>>> https://lkml.org/lkml/2013/4/22/544
>>
>> Yes, it is.
>>
>> See the changelog in 0/0:
>>
>> " we use lock-break technique to zap all sptes linked on the
>> invalid rmap, it is not very effective but good for the first step."
>>
>> Thanks!
> 
> Sure, but what is up with zeroing kvm_clear_all_lpage_info(kvm) and
> zapping the root? Only lock-break technique along with generation number 
> was what was agreed.

Marcelo,

Please Wait... I am completely confused. :(

Let's clarify "zeroing kvm_clear_all_lpage_info(kvm) and zapping the root" 
first.
Are these the changes you wanted?

void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
   struct kvm_memory_slot *slot)
{
spin_lock(&kvm->mmu_lock);
kvm->arch.mmu_valid_gen++;

/* Zero all root pages.*/
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
if (!sp->root_count)
continue;

if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
goto restart;
}

/*
 * All shadow pages are invalid, so reset the large page info;
 * then we can safely destroy the memslot. This is also good
 * for large page usage.
 */
kvm_clear_all_lpage_info(kvm);

kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct kvm_mmu_page *sp;
gfn_t gfn;
unsigned long *rmapp;

sp = page_header(__pa(spte));
+
+   /* Do not let an invalid sp access its rmap. */
+   if (!sp_is_valid(sp))
+   return;
+
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
pte_list_remove(spte, rmapp);
}

If yes, here is the reason, which I mentioned before, why we can not do this:

after kvm_mmu_invalid_memslot_pages() is called, the memslot->rmap will be destroyed.
Later, when the host reclaims a page, the mmu-notify handlers ->invalidate_page and
->invalidate_range_start can not find any spte using that host page, so the
Accessed/Dirty state of the host page is no longer tracked
(kvm_set_pfn_accessed and kvm_set_pfn_dirty are not called properly).
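
As a rough, hedged illustration of what gets lost (a toy userspace model,
not KVM code; the helper names are invented): when a notifier-driven unmap
can still find the spte through the slot's rmap, it propagates the spte's
accessed/dirty bits to the host page before dropping it; if the rmap is
already gone, that propagation never happens.

#include <stdio.h>
#include <stdbool.h>

/* Toy model: one host page and one spte mapping it. */
struct host_page { bool accessed, dirty; };
struct spte      { bool present, accessed, dirty; struct host_page *pfn; };

/* What an rmap-driven unmap conceptually does for each spte it finds:
 * mirror the spte's A/D bits into the host page, then drop the spte. */
static void drop_spte_via_rmap(struct spte *s)
{
	if (!s->present)
		return;
	if (s->accessed)
		s->pfn->accessed = true;   /* models kvm_set_pfn_accessed() */
	if (s->dirty)
		s->pfn->dirty = true;      /* models kvm_set_pfn_dirty()    */
	s->present = false;
}

int main(void)
{
	struct host_page page_with_rmap = { false, false };
	struct host_page page_without_rmap = { false, false };
	struct spte spte = { true, true, true, &page_with_rmap };

	/* rmap still reachable: A/D bits are propagated before the drop. */
	drop_spte_via_rmap(&spte);

	/* rmap already destroyed: the notifier finds no spte at all, so
	 * the second page's A/D state is silently lost. */
	printf("with rmap:    accessed=%d dirty=%d\n",
	       page_with_rmap.accessed, page_with_rmap.dirty);
	printf("without rmap: accessed=%d dirty=%d\n",
	       page_without_rmap.accessed, page_without_rmap.dirty);
	return 0;
}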

What's your idea?

And I should apologize for my poor communication; I am really sorry about that...




Re: [PATCH 4/5] mm: soft-dirty bits for user memory changes tracking

2013-05-03 Thread Xiao Guangrong
On 05/01/2013 12:12 AM, Pavel Emelyanov wrote:

> +static inline void clear_soft_dirty(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *pte)
> +{
> +#ifdef CONFIG_MEM_SOFT_DIRTY
> + /*
> +  * The soft-dirty tracker uses #PF-s to catch writes
> +  * to pages, so write-protect the pte as well. See the
> +  * Documentation/vm/soft-dirty.txt for full description
> +  * of how soft-dirty works.
> +  */
> + pte_t ptent = *pte;
> + ptent = pte_wrprotect(ptent);
> + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
> + set_pte_at(vma->vm_mm, addr, pte, ptent);
> +#endif

It seems that the TLBs are not flushed and the mmu notifier is not called?
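
For context, a hedged sketch of the kind of follow-up this question is
pointing at (this is not the actual patch, and whether per-page calls or a
batched range flush is preferable is left open here): after write-protecting
the pte one would normally expect the stale writable TLB entry to be flushed
and secondary MMUs to be told to drop their mapping.

/* Hedged sketch only - not the posted patch. */
static inline void clear_soft_dirty_sketch(struct vm_area_struct *vma,
					   unsigned long addr, pte_t *pte)
{
	pte_t ptent = *pte;

	ptent = pte_wrprotect(ptent);
	ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
	set_pte_at(vma->vm_mm, addr, pte, ptent);

	/* drop the stale writable TLB entry */
	flush_tlb_page(vma, addr);
	/* let secondary MMUs (e.g. KVM sptes) drop their mapping too */
	mmu_notifier_invalidate_page(vma->vm_mm, addr);
}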



Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-02 Thread Xiao Guangrong
On 05/03/2013 10:27 AM, Takuya Yoshikawa wrote:
> On Sat, 27 Apr 2013 11:13:20 +0800
> Xiao Guangrong  wrote:
> 
>> +/*
>> + * Fast invalid all shadow pages belong to @slot.
>> + *
>> + * @slot != NULL means the invalidation is caused the memslot specified
>> + * by @slot is being deleted, in this case, we should ensure that rmap
>> + * and lpage-info of the @slot can not be used after calling the function.
>> + *
>> + * @slot == NULL means the invalidation due to other reasons, we need
> 
> The comment should explain what the "other reasons" are.
> But this API may better be split into two separate functions; it depends
> on the "other reasons".

NO.

> 
>> + * not care rmap and lpage-info since they are still valid after calling
>> + * the function.
>> + */
>> +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
>> +   struct kvm_memory_slot *slot)
> 
> You yourself is explaining this as "invalidation" in the comment.
> kvm_mmu_invalidate_shadow_pages_memslot() or something...

Umm, invalidate is a better name. I will update it after collecting comments
from Marcelo, Gleb, and the other reviewers.



Re: [PATCH v4 3/6] KVM: MMU: introduce kvm_clear_all_lpage_info

2013-05-02 Thread Xiao Guangrong
On 05/03/2013 10:15 AM, Takuya Yoshikawa wrote:
> On Sat, 27 Apr 2013 11:13:19 +0800
> Xiao Guangrong  wrote:
> 
>> This function is used to reset the large page info of all guest pages
>> which will be used in later patch
>>
>> Signed-off-by: Xiao Guangrong 
>> ---
>>  arch/x86/kvm/x86.c |   25 +
>>  arch/x86/kvm/x86.h |2 ++
>>  2 files changed, 27 insertions(+), 0 deletions(-)
>>
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 52b4e97..8e4494c 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -6951,6 +6951,31 @@ static void memslot_set_lpage_disallowed(struct 
>> kvm_memory_slot *slot,
>>  }
>>  }
>>  
>> +static void clear_memslot_lpage_info(struct kvm_memory_slot *slot)
>> +{
>> +int i;
>> +
>> +for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
>> +int lpages;
>> +int level = i + 1;
>> +
>> +lpages = gfn_to_index(slot->base_gfn + slot->npages - 1,
>> +  slot->base_gfn, level) + 1;
>> +
>> +memset(slot->arch.lpage_info[i - 1], 0,
>> +   sizeof(*slot->arch.lpage_info[i - 1]));
>> +memslot_set_lpage_disallowed(slot, slot->npages, i, lpages);
> 
> This does something other than clearing.

Aha, this API *clears* the counts set by the KVM MMU. The name is meaningful
enough, I think.





Re: [PATCH v4 2/6] KVM: x86: introduce memslot_set_lpage_disallowed

2013-05-02 Thread Xiao Guangrong
On 05/03/2013 10:10 AM, Takuya Yoshikawa wrote:
> On Sat, 27 Apr 2013 11:13:18 +0800
> Xiao Guangrong  wrote:
> 
>> It is used to set disallowed large page on the specified level, can be
>> used in later patch
>>
>> Signed-off-by: Xiao Guangrong 
>> ---
>>  arch/x86/kvm/x86.c |   53 
>> ++-
>>  1 files changed, 35 insertions(+), 18 deletions(-)
>>
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 91dd9f4..52b4e97 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -6917,12 +6917,45 @@ void kvm_arch_free_memslot(struct kvm_memory_slot 
>> *free,
>>  }
>>  }
>>  
>> +static void memslot_set_lpage_disallowed(struct kvm_memory_slot *slot,
>> + unsigned long npages,
>> + int lpage_size, int lpages)
> 
> What this function does is to disable large page support for this slot
> as can be seen in the comment below.
> 
> Since setting lpage_info to something ("disallowed" ?) is an implementation
> detail, we'd better hide such a thing from the function name.
> 
> Taking into account that we have "kvm_largepages_enabled()", something like
> disable_largepages_memslot() may be a candidate.
> 

No.

kvm_largepages_enabled() acts on largepages_enabled; it is not related
to this function. Actually, I really do not care about the difference between
"disallowed" and "disabled".




Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-02 Thread Xiao Guangrong
On 05/03/2013 09:05 AM, Marcelo Tosatti wrote:

>> +
>> +/*
>> + * Fast invalid all shadow pages belong to @slot.
>> + *
>> + * @slot != NULL means the invalidation is caused the memslot specified
>> + * by @slot is being deleted, in this case, we should ensure that rmap
>> + * and lpage-info of the @slot can not be used after calling the function.
>> + *
>> + * @slot == NULL means the invalidation due to other reasons, we need
>> + * not care rmap and lpage-info since they are still valid after calling
>> + * the function.
>> + */
>> +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
>> +   struct kvm_memory_slot *slot)
>> +{
>> +spin_lock(&kvm->mmu_lock);
>> +kvm->arch.mmu_valid_gen++;
>> +
>> +/*
>> + * All shadow paes are invalid, reset the large page info,
>> + * then we can safely desotry the memslot, it is also good
>> + * for large page used.
>> + */
>> +kvm_clear_all_lpage_info(kvm);
> 
> Xiao,
> 
> I understood it was agreed that simple mmu_lock lockbreak while
> avoiding zapping of newly instantiated pages upon a
> 
>   if(spin_needbreak)
>   cond_resched_lock()
> 
> cycle was enough as a first step? And then later introduce root zapping
> along with measurements.
> 
> https://lkml.org/lkml/2013/4/22/544

Yes, it is.

See the changelog in 0/0:

" we use lock-break technique to zap all sptes linked on the
invalid rmap, it is not very effective but good for the first step."

Thanks!




[PATCH v4 2/6] KVM: x86: introduce memslot_set_lpage_disallowed

2013-04-26 Thread Xiao Guangrong
It is used to mark large pages as disallowed on the specified level; it will
be used in a later patch.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |   53 ++-
 1 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 91dd9f4..52b4e97 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6917,12 +6917,45 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
}
 }
 
+static void memslot_set_lpage_disallowed(struct kvm_memory_slot *slot,
+unsigned long npages,
+int lpage_size, int lpages)
+{
+   struct kvm_lpage_info *lpage_info;
+   unsigned long ugfn;
+   int level = lpage_size + 1;
+
+   WARN_ON(!lpage_size);
+
+   lpage_info = slot->arch.lpage_info[lpage_size - 1];
+
+   if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+   lpage_info[0].write_count = 1;
+
+   if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+   lpage_info[lpages - 1].write_count = 1;
+
+   ugfn = slot->userspace_addr >> PAGE_SHIFT;
+
+   /*
+* If the gfn and userspace address are not aligned wrt each
+* other, or if explicitly asked to, disable large page
+* support for this slot
+*/
+   if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+ !kvm_largepages_enabled()) {
+   unsigned long j;
+
+   for (j = 0; j < lpages; ++j)
+   lpage_info[j].write_count = 1;
+   }
+}
+
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 {
int i;
 
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-   unsigned long ugfn;
int lpages;
int level = i + 1;
 
@@ -6941,23 +6974,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot 
*slot, unsigned long npages)
if (!slot->arch.lpage_info[i - 1])
goto out_free;
 
-   if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-   slot->arch.lpage_info[i - 1][0].write_count = 1;
-   if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 
1))
-   slot->arch.lpage_info[i - 1][lpages - 1].write_count = 
1;
-   ugfn = slot->userspace_addr >> PAGE_SHIFT;
-   /*
-* If the gfn and userspace address are not aligned wrt each
-* other, or if explicitly asked to, disable large page
-* support for this slot
-*/
-   if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) 
||
-   !kvm_largepages_enabled()) {
-   unsigned long j;
-
-   for (j = 0; j < lpages; ++j)
-   slot->arch.lpage_info[i - 1][j].write_count = 1;
-   }
+   memslot_set_lpage_disallowed(slot, npages, i, lpages);
}
 
return 0;
-- 
1.7.7.6



[PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-04-26 Thread Xiao Guangrong
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while it
walks and zaps all shadow pages one by one, and it also needs to zap every
guest page's rmap and every shadow page's parent spte list. Things become
particularly bad if the guest uses more memory or vcpus. It is not good for
scalability.

In this patch, we introduce a faster way to invalidate all shadow pages.
KVM maintains a global mmu invalidation generation-number which is stored in
kvm->arch.mmu_valid_gen, and every shadow page stores the current global
generation-number into sp->mmu_valid_gen when it is created.

When KVM needs to zap all shadow page sptes, it simply increases the
global generation-number and then reloads the root shadow pages on all vcpus.
Each vcpu will create a new shadow page table according to kvm's current
generation-number. This ensures the old pages are not used any more.

The invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
are kept in the mmu-cache until the page allocator reclaims them.

If the invalidation is due to a memslot change, the slot's rmap and lpage-info
will be freed soon; in order to avoid using invalid memory, we unmap
all sptes on its rmap and always reset the lpage-info of all memslots so
that the rmap and lpage-info can be safely freed.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/mmu.c  |   77 ++-
 arch/x86/kvm/mmu.h  |2 +
 3 files changed, 80 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 18635ae..7adf8f8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -220,6 +220,7 @@ struct kvm_mmu_page {
int root_count;  /* Currently serving as active root */
unsigned int unsync_children;
unsigned long parent_ptes;  /* Reverse mapping for parent_pte */
+   unsigned long mmu_valid_gen;
DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -527,6 +528,7 @@ struct kvm_arch {
unsigned int n_requested_mmu_pages;
unsigned int n_max_mmu_pages;
unsigned int indirect_shadow_pages;
+   unsigned long mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87..63110c7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1838,6 +1838,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
__clear_sp_write_flooding_count(sp);
 }
 
+static bool is_valid_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+   return likely(sp->mmu_valid_gen == kvm->arch.mmu_valid_gen);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 gfn_t gfn,
 gva_t gaddr,
@@ -1864,6 +1869,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
role.quadrant = quadrant;
}
for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+   if (!is_valid_sp(vcpu->kvm, sp))
+   continue;
+
if (!need_sync && sp->unsync)
need_sync = true;
 
@@ -1900,6 +1908,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
kvm_vcpu *vcpu,
 
account_shadowed(vcpu->kvm, gfn);
}
+   sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
init_shadow_page_table(sp);
trace_kvm_mmu_get_page(sp, true);
return sp;
@@ -2070,8 +2079,12 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp,
ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
-   if (!sp->role.invalid && !sp->role.direct)
+
+   if (!sp->role.invalid && !sp->role.direct &&
+ /* Invalid-gen pages are not accounted. */
+ is_valid_sp(kvm, sp))
unaccount_shadowed(kvm, sp->gfn);
+
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
@@ -4194,6 +4207,68 @@ restart:
spin_unlock(&kvm->mmu_lock);
 }
 
+static void
+memslot_unmap_rmaps(struct kvm_memory_slot *slot, struct kvm *kvm)
+{
+   int level;
+
+   for (level = PT_PAGE_TABLE_LEVEL;
+ level < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++level) {
+   unsigned long idx, *rmapp;
+
+   rmapp = slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL];
+   idx = gfn_to_index(slot->base_gfn + slot->npages - 1,
+  slot->base_gfn, level) + 1;
+
+   while (idx--) {
+   kvm_unmap_rmapp(kvm, rmapp + idx, slot, 0);
+
+

[PATCH v4 6/6] KVM: MMU: make kvm_mmu_zap_all preemptable

2013-04-26 Thread Xiao Guangrong
Now, kvm_mmu_zap_all is only called in the mmu_notifier->release path.
At that time the vcpus have stopped, which means no new pages will be
created, so we can use the lock-break technique to avoid a potential
soft lockup.

(Note: at this point, the mmu-lock still has contention between ->release
 and the other mmu-notify handlers.)
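
A hedged userspace model of the lock-break pattern used below (names are
invented; the real code checks need_resched()/spin_needbreak() and calls
cond_resched_lock() instead of counting a fixed batch): process some work,
then drop and re-take the lock so waiters can get in, and restart the walk
because the list may have changed while the lock was released.

#include <pthread.h>
#include <stdio.h>

#define NR_PAGES 1000
#define BATCH    10

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int zapped[NR_PAGES];

/* Toy "zap everything" walk with lock breaking. */
static void zap_all_with_lock_break(void)
{
	int done_in_batch = 0;

	pthread_mutex_lock(&lock);
restart:
	for (int i = 0; i < NR_PAGES; i++) {
		if (zapped[i])
			continue;

		zapped[i] = 1;           /* models kvm_mmu_prepare_zap_page() */

		if (++done_in_batch >= BATCH) {
			done_in_batch = 0;
			/* flush pending work, then let waiters in */
			pthread_mutex_unlock(&lock);
			pthread_mutex_lock(&lock);
			goto restart;    /* the list may have changed */
		}
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	int left = 0;

	zap_all_with_lock_break();
	for (int i = 0; i < NR_PAGES; i++)
		left += !zapped[i];
	printf("pages left unzapped: %d\n", left);
	return 0;
}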

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   11 ++-
 1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 63110c7..46d1d47 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4197,12 +4197,21 @@ void kvm_mmu_zap_all(struct kvm *kvm)
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
 
+   might_sleep();
+
spin_lock(&kvm->mmu_lock);
 restart:
-   list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
+   list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
goto restart;
 
+   if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+   kvm_mmu_commit_zap_page(kvm, &invalid_list);
+   cond_resched_lock(&kvm->mmu_lock);
+   goto restart;
+   }
+   }
+
kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
 }
-- 
1.7.7.6



[PATCH v4 3/6] KVM: MMU: introduce kvm_clear_all_lpage_info

2013-04-26 Thread Xiao Guangrong
This function is used to reset the large page info of all guest pages;
it will be used in a later patch.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |   25 +
 arch/x86/kvm/x86.h |2 ++
 2 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 52b4e97..8e4494c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6951,6 +6951,31 @@ static void memslot_set_lpage_disallowed(struct 
kvm_memory_slot *slot,
}
 }
 
+static void clear_memslot_lpage_info(struct kvm_memory_slot *slot)
+{
+   int i;
+
+   for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
+   int lpages;
+   int level = i + 1;
+
+   lpages = gfn_to_index(slot->base_gfn + slot->npages - 1,
+ slot->base_gfn, level) + 1;
+
+   memset(slot->arch.lpage_info[i - 1], 0,
+  sizeof(*slot->arch.lpage_info[i - 1]));
+   memslot_set_lpage_disallowed(slot, slot->npages, i, lpages);
+   }
+}
+
+void kvm_clear_all_lpage_info(struct kvm *kvm)
+{
+   struct kvm_memory_slot *slot;
+
+   kvm_for_each_memslot(slot, kvm->memslots)
+   clear_memslot_lpage_info(slot);
+}
+
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 {
int i;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e224f7a..beae540 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -108,6 +108,8 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu 
*vcpu, gpa_t gpa)
return false;
 }
 
+void kvm_clear_all_lpage_info(struct kvm *kvm);
+
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
-- 
1.7.7.6



[PATCH v4 5/6] KVM: x86: use the fast way to invalid all pages

2013-04-26 Thread Xiao Guangrong
Replace kvm_mmu_zap_all with kvm_mmu_invalid_all_pages, except on
the mmu_notifier->release() path, which will be handled in
a later patch.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8e4494c..809a053 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5483,7 +5483,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt 
*ctxt)
 * to ensure that the updated hypercall appears atomically across all
 * VCPUs.
 */
-   kvm_mmu_zap_all(vcpu->kvm);
+   kvm_mmu_invalid_memslot_pages(vcpu->kvm, NULL);
 
kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
@@ -7093,7 +7093,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
   struct kvm_memory_slot *slot)
 {
-   kvm_arch_flush_shadow_all(kvm);
+   kvm_mmu_invalid_memslot_pages(kvm, slot);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-- 
1.7.7.6



[PATCH v4 1/6] KVM: MMU: drop unnecessary kvm_reload_remote_mmus

2013-04-26 Thread Xiao Guangrong
It is the responsibility of kvm_mmu_zap_all to keep the
mmu and the TLBs consistent. The reload is also unnecessary after
zapping all mmio sptes, since no mmio spte exists on a root shadow
page and such sptes can not be cached in the TLB.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |5 +
 1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2a434bf..91dd9f4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7039,16 +7039,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 * If memory slot is created, or moved, we need to clear all
 * mmio sptes.
 */
-   if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
+   if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE))
kvm_mmu_zap_mmio_sptes(kvm);
-   kvm_reload_remote_mmus(kvm);
-   }
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
kvm_mmu_zap_all(kvm);
-   kvm_reload_remote_mmus(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-- 
1.7.7.6



[PATCH v4 0/6] KVM: MMU: fast zap all shadow pages

2013-04-26 Thread Xiao Guangrong
This patchset is based on current 'queue' branch on kvm tree.

Changelog:

V4:
  1): drop unmapping the invalid rmap out of mmu-lock and use the lock-break
  technique instead. Thanks to Gleb's comments.

  2): no need to handle invalid-gen pages specially, since the page tables are
  always switched by KVM_REQ_MMU_RELOAD. Thanks to Marcelo's comments.

V3:
  completely redesign the algorithm, please see below.

V2:
  - do not reset n_requested_mmu_pages and n_max_mmu_pages
  - batch free root shadow pages to reduce vcpu notification and mmu-lock
contention
  - remove the first patch that introduces kvm->arch.mmu_cache, since we only
'memset zero' the hashtable rather than all mmu cache members in this
version
  - remove unnecessary kvm_reload_remote_mmus after kvm_mmu_zap_all

* Issue
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while it
walks and zaps all shadow pages one by one, and it also needs to zap every
guest page's rmap and every shadow page's parent spte list. Things become
particularly bad if the guest uses more memory or vcpus. It is not good for
scalability.

* Idea
KVM maintains a global mmu invalidation generation-number which is stored in
kvm->arch.mmu_valid_gen, and every shadow page stores the current global
generation-number into sp->mmu_valid_gen when it is created.

When KVM needs to zap all shadow page sptes, it simply increases the
global generation-number and then reloads the root shadow pages on all vcpus.
Each vcpu will create a new shadow page table according to kvm's current
generation-number. This ensures the old pages are not used any more.

The invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
are kept in the mmu-cache until the page allocator reclaims them.
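
A tiny, hedged userspace model of the generation-number idea (the names
loosely mirror the patch; everything here is illustrative only): invalidating
"all" shadow pages is just one counter increment, and validity is checked
lazily when a page is looked up.

#include <stdio.h>
#include <stdbool.h>

struct toy_kvm { unsigned long mmu_valid_gen; };
struct toy_sp  { unsigned long mmu_valid_gen; unsigned long gfn; };

/* A shadow page records the generation it was created in. */
static struct toy_sp new_sp(struct toy_kvm *kvm, unsigned long gfn)
{
	struct toy_sp sp = { kvm->mmu_valid_gen, gfn };
	return sp;
}

static bool is_valid_sp(struct toy_kvm *kvm, struct toy_sp *sp)
{
	return sp->mmu_valid_gen == kvm->mmu_valid_gen;
}

/* "Zap everything" is a single increment: stale pages are skipped by
 * lookups and reclaimed later. */
static void invalidate_all(struct toy_kvm *kvm)
{
	kvm->mmu_valid_gen++;
}

int main(void)
{
	struct toy_kvm kvm = { 0 };
	struct toy_sp sp = new_sp(&kvm, 10);

	printf("before: valid=%d\n", is_valid_sp(&kvm, &sp));
	invalidate_all(&kvm);
	printf("after:  valid=%d\n", is_valid_sp(&kvm, &sp));
	return 0;
}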

* Challenges
Page invalidation is requested when a memslot is moved or deleted and when
kvm is being destroyed; these paths call zap_all_pages, which deletes all sp
using their rmap and lpage-info, and after zap_all_pages the rmap and
lpage-info will be freed.

For the lpage-info, we clear all lpage counts when doing zap-all-pages, so
that no invalid shadow page is counted in lpage-info; after that, the
lpage-info of the invalid memslot can be safely freed. This is also good for
performance - it allows the guest to use hugepages as far as possible.

For the rmap, we use the lock-break technique to zap all sptes linked on the
invalid rmap; it is not very efficient, but it is good enough for a first step.

* TODO
Unmapping the invalid rmap out of mmu-lock in a clean way.

Xiao Guangrong (6):
  KVM: MMU: drop unnecessary kvm_reload_remote_mmus
  KVM: x86: introduce memslot_set_lpage_disallowed
  KVM: MMU: introduce kvm_clear_all_lpage_info
  KVM: MMU: fast invalid all shadow pages
  KVM: x86: use the fast way to invalid all pages
  KVM: MMU: make kvm_mmu_zap_all preemptable

 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/mmu.c  |   88 ++-
 arch/x86/kvm/mmu.h  |2 +
 arch/x86/kvm/x86.c  |   87 ---
 arch/x86/kvm/x86.h  |2 +
 5 files changed, 155 insertions(+), 26 deletions(-)

-- 
1.7.7.6



Re: [PATCH v2 3/6] KVM: MMU: make return value of mmio page fault handler more readable

2013-04-25 Thread Xiao Guangrong
On 04/24/2013 09:34 PM, Gleb Natapov wrote:

>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index 2adcbc2..6b4ba1e 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -52,6 +52,20 @@
>>  
>>  int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 
>> sptes[4]);
>>  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
>> +
>> +/*
>> + * Return values of handle_mmio_page_fault_common:
>> + * RET_MMIO_PF_EMU: it is a real mmio page fault, emulate the instruction
>> + *  directly.
>> + * RET_MMIO_PF_RETRY: let CPU fault again on the address.
>> + * RET_MMIO_PF_BUG: bug is detected.
>> + */
>> +enum {
>> +RET_MMIO_PF_EMU = 1,
> Make it RET_MMIO_PF_EMULATE please.

Good to me, will do.

Thanks!



Re: [PATCH v2 6/6] KVM: MMU: init kvm generation close to mmio wrap-around value

2013-04-25 Thread Xiao Guangrong
On 04/24/2013 08:59 PM, Gleb Natapov wrote:
> On Mon, Apr 01, 2013 at 05:56:49PM +0800, Xiao Guangrong wrote:
>> Then it has chance to trigger mmio generation number wrap-around
>>
>> Signed-off-by: Xiao Guangrong 
>> ---
>>  arch/x86/include/asm/kvm_host.h |1 +
>>  arch/x86/kvm/mmu.c  |8 
>>  virt/kvm/kvm_main.c |6 ++
>>  3 files changed, 15 insertions(+), 0 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h 
>> b/arch/x86/include/asm/kvm_host.h
>> index 6c1e642..4e1f7cb 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -767,6 +767,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>>   struct kvm_memory_slot *slot,
>>   gfn_t gfn_offset, unsigned long mask);
>>  void kvm_mmu_zap_all(struct kvm *kvm);
>> +void kvm_arch_init_generation(struct kvm *kvm);
>>  void kvm_mmu_invalid_mmio_sptes(struct kvm *kvm);
>>  unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
>>  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int 
>> kvm_nr_mmu_pages);
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index d314e21..dcc059c 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -4279,6 +4279,14 @@ restart:
>>  spin_unlock(&kvm->mmu_lock);
>>  }
>>  
>> +void kvm_arch_init_generation(struct kvm *kvm)
>> +{
>> +mutex_lock(&kvm->slots_lock);
>> +/* It is easier to trigger mmio generation-number wrap-around. */
>> +kvm_memslots(kvm)->generation = MMIO_MAX_GEN - 13;
> kvm_memslots(kvm)->generation should never overflow since
> (read|write)_cached mechanism does not handle it. Initialising it to
> anything but 0 makes overflow more likely.
> 
> You can hide mmio overflow trick in kvm_current_mmio_generation():
> 
> static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
> {
>   return (kvm_memslots(kvm)->generation + MMIO_MAX_GEN - 13) & 
> MMIO_GEN_MASK;
> }

Very smart idea. Thank you, Gleb!
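
A small, hedged demo of why the trick works (the constants here are made up,
not the kernel's; only the masking arithmetic matters): biasing the value
inside kvm_current_mmio_generation() makes the masked mmio generation wrap
early, while the real memslots->generation keeps counting from 0 and never
needs to be pre-initialized to a large value.

#include <stdio.h>

/* Illustrative values only - not the real kernel constants. */
#define MMIO_GEN_BITS 19u
#define MMIO_GEN_MASK ((1u << MMIO_GEN_BITS) - 1)
#define MMIO_MAX_GEN  MMIO_GEN_MASK

/* Gleb's suggestion: bias only the *derived* mmio generation. */
static unsigned int current_mmio_generation(unsigned int slots_generation)
{
	return (slots_generation + MMIO_MAX_GEN - 13) & MMIO_GEN_MASK;
}

int main(void)
{
	/* memslots->generation itself starts at 0 and just increments... */
	for (unsigned int gen = 0; gen < 20; gen++)
		printf("slots gen %2u -> mmio gen %#x\n",
		       gen, current_mmio_generation(gen));
	/* ...yet the masked mmio generation wraps after about 13 updates,
	 * exercising the wrap-around path early. */
	return 0;
}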





Re: [PATCH v3 00/15] KVM: MMU: fast zap all shadow pages

2013-04-23 Thread Xiao Guangrong
On 04/23/2013 02:28 PM, Gleb Natapov wrote:
> On Tue, Apr 23, 2013 at 08:19:02AM +0800, Xiao Guangrong wrote:
>> On 04/22/2013 05:21 PM, Gleb Natapov wrote:
>>> On Sun, Apr 21, 2013 at 10:09:29PM +0800, Xiao Guangrong wrote:
>>>> On 04/21/2013 09:03 PM, Gleb Natapov wrote:
>>>>> On Tue, Apr 16, 2013 at 02:32:38PM +0800, Xiao Guangrong wrote:
>>>>>> This patchset is based on my previous two patchset:
>>>>>> [PATCH 0/2] KVM: x86: avoid potential soft lockup and unneeded mmu reload
>>>>>> (https://lkml.org/lkml/2013/4/1/2)
>>>>>>
>>>>>> [PATCH v2 0/6] KVM: MMU: fast invalid all mmio sptes
>>>>>> (https://lkml.org/lkml/2013/4/1/134)
>>>>>>
>>>>>> Changlog:
>>>>>> V3:
>>>>>>   completely redesign the algorithm, please see below.
>>>>>>
>>>>> This looks pretty complicated. Is it still needed in order to avoid soft
>>>>> lockups after "avoid potential soft lockup and unneeded mmu reload" patch?
>>>>
>>>> Yes.
>>>>
>>>> I discussed this point with Marcelo:
>>>>
>>>> ==
>>>> BTW, to my honest, i do not think spin_needbreak is a good way - it does
>>>> not fix the hot-lock contention and it just occupies more cpu time to avoid
>>>> possible soft lock-ups.
>>>>
>>>> Especially, zap-all-shadow-pages can let other vcpus fault and vcpus 
>>>> contest
>>>> mmu-lock, then zap-all-shadow-pages release mmu-lock and wait, other vcpus
>>>> create page tables again. zap-all-shadow-page need long time to be 
>>>> finished,
>>>> the worst case is, it can not completed forever on intensive vcpu and 
>>>> memory
>>>> usage.
>>>>
>>> So what about mixed approach: use generation numbers and reload roots to
>>> quickly invalidate all shadow pages and then do kvm_mmu_zap_all_invalid().
>>> kvm_mmu_zap_all_invalid() is a new function that invalidates only shadow
>>> pages with stale generation number (and uses lock break technique). It
>>> may traverse active_mmu_pages from tail to head since new shadow pages
>>> will be added to the head of the list or it may use invalid slot rmap to
>>> find exactly what should be invalidated.
>>
>> I prefer to unmapping the invalid rmap instead of zapping stale shadow pages
>> in kvm_mmu_zap_all_invalid(), the former is faster.
>>
> Not sure what do you mean here. What is "unmapping the invalid rmap"?

It is like you said below:
==
kvm_mmu_zap_all_invalid(slot) will only zap shadow pages that are
reachable from the slot's rmap
==
My suggestion is to zap the sptes that are linked into the slot's rmap.

> 
>> This way may help but not good, after reload mmu with the new generation 
>> number,
>> all of the vcpu will fault in a long time, try to hold mmu-lock is not good 
>> even
>> if use lock break technique.
> If kvm_mmu_zap_all_invalid(slot) will only zap shadow pages that are
> reachable from the slot's rmap, as opposite to zapping all invalid
> shadow pages, it will have much less work to do. The slots that we
> add/remove during hot plug are usually small. To guaranty reasonable
> forward progress we can break the lock only after certain amount of
> shadow pages are invalidated. All other invalid shadow pages will be
> zapped in make_mmu_pages_available() and zapping will be spread between
> page faults.

Not interested in memory hot-remove?

BTW, could you please review my previous patchsets and apply them if they
look ok? ;)

[PATCH 0/2] KVM: x86: avoid potential soft lockup and unneeded mmu reload
(https://lkml.org/lkml/2013/4/1/2)

[PATCH v2 0/6] KVM: MMU: fast invalid all mmio sptes
(https://lkml.org/lkml/2013/4/1/134)

Thanks!




Re: [PATCH v3 00/15] KVM: MMU: fast zap all shadow pages

2013-04-22 Thread Xiao Guangrong
On 04/22/2013 05:21 PM, Gleb Natapov wrote:
> On Sun, Apr 21, 2013 at 10:09:29PM +0800, Xiao Guangrong wrote:
>> On 04/21/2013 09:03 PM, Gleb Natapov wrote:
>>> On Tue, Apr 16, 2013 at 02:32:38PM +0800, Xiao Guangrong wrote:
>>>> This patchset is based on my previous two patchset:
>>>> [PATCH 0/2] KVM: x86: avoid potential soft lockup and unneeded mmu reload
>>>> (https://lkml.org/lkml/2013/4/1/2)
>>>>
>>>> [PATCH v2 0/6] KVM: MMU: fast invalid all mmio sptes
>>>> (https://lkml.org/lkml/2013/4/1/134)
>>>>
>>>> Changlog:
>>>> V3:
>>>>   completely redesign the algorithm, please see below.
>>>>
>>> This looks pretty complicated. Is it still needed in order to avoid soft
>>> lockups after "avoid potential soft lockup and unneeded mmu reload" patch?
>>
>> Yes.
>>
>> I discussed this point with Marcelo:
>>
>> ==
>> BTW, to my honest, i do not think spin_needbreak is a good way - it does
>> not fix the hot-lock contention and it just occupies more cpu time to avoid
>> possible soft lock-ups.
>>
>> Especially, zap-all-shadow-pages can let other vcpus fault and vcpus contest
>> mmu-lock, then zap-all-shadow-pages release mmu-lock and wait, other vcpus
>> create page tables again. zap-all-shadow-page need long time to be finished,
>> the worst case is, it can not completed forever on intensive vcpu and memory
>> usage.
>>
> So what about mixed approach: use generation numbers and reload roots to
> quickly invalidate all shadow pages and then do kvm_mmu_zap_all_invalid().
> kvm_mmu_zap_all_invalid() is a new function that invalidates only shadow
> pages with stale generation number (and uses lock break technique). It
> may traverse active_mmu_pages from tail to head since new shadow pages
> will be added to the head of the list or it may use invalid slot rmap to
> find exactly what should be invalidated.

I prefer unmapping the invalid rmap to zapping stale shadow pages
in kvm_mmu_zap_all_invalid(); the former is faster.

This way may help, but it is not great: after the mmu is reloaded with the new
generation number, all of the vcpus will be faulting for a long time, and
having them try to take mmu-lock is not good even with the lock-break technique.

I think we can do this step first, then move unmapping of the invalid rmap out
of mmu-lock later.

> 
>> I still think the right way to fix this kind of thing is optimization for
>> mmu-lock.
>> ==
>>
>> Which parts scare you? Let's find a way to optimize for it. ;). For example,
>> if you do not like unmap_memslot_rmap_nolock(), we can simplify it - We can
>> use walk_shadow_page_lockless_begin() and walk_shadow_page_lockless_end() to
>> protect spte instead of kvm->being_unmaped_rmap.
>>
> 
> kvm->being_unmaped_rmap is particularly tricky, although looks

Okay. I will use walk_shadow_page_lockless_begin() and
walk_shadow_page_lockless_end() instead.

> correct. Additional indirection with rmap ops also does not help following
> the code. I'd rather have if(slot is invalid) in a couple of places where
> things should be done differently. In most places it will be WARN_ON(slot
> is invalid).

Fewer changes - fine by me, will do. ;)

Thanks!



Re: [PATCH v3 00/15] KVM: MMU: fast zap all shadow pages

2013-04-21 Thread Xiao Guangrong
On 04/21/2013 11:24 PM, Marcelo Tosatti wrote:
> On Sun, Apr 21, 2013 at 10:09:29PM +0800, Xiao Guangrong wrote:
>> On 04/21/2013 09:03 PM, Gleb Natapov wrote:
>>> On Tue, Apr 16, 2013 at 02:32:38PM +0800, Xiao Guangrong wrote:
>>>> This patchset is based on my previous two patchset:
>>>> [PATCH 0/2] KVM: x86: avoid potential soft lockup and unneeded mmu reload
>>>> (https://lkml.org/lkml/2013/4/1/2)
>>>>
>>>> [PATCH v2 0/6] KVM: MMU: fast invalid all mmio sptes
>>>> (https://lkml.org/lkml/2013/4/1/134)
>>>>
>>>> Changlog:
>>>> V3:
>>>>   completely redesign the algorithm, please see below.
>>>>
>>> This looks pretty complicated. Is it still needed in order to avoid soft
>>> lockups after "avoid potential soft lockup and unneeded mmu reload" patch?
>>
>> Yes.
>>
>> I discussed this point with Marcelo:
>>
>> ==
>> BTW, to my honest, i do not think spin_needbreak is a good way - it does
>> not fix the hot-lock contention and it just occupies more cpu time to avoid
>> possible soft lock-ups.
>>
>> Especially, zap-all-shadow-pages can let other vcpus fault and vcpus contest
>> mmu-lock, then zap-all-shadow-pages release mmu-lock and wait, other vcpus
>> create page tables again. zap-all-shadow-page need long time to be finished,
>> the worst case is, it can not completed forever on intensive vcpu and memory
>> usage.
>>
>> I still think the right way to fix this kind of thing is optimization for
>> mmu-lock.
>> ==
>>
>> Which parts scare you? Let's find a way to optimize for it. ;). For example,
>> if you do not like unmap_memslot_rmap_nolock(), we can simplify it - We can
>> use walk_shadow_page_lockless_begin() and walk_shadow_page_lockless_end() to
>> protect spte instead of kvm->being_unmaped_rmap.
>>
>> Thanks!
> 
> Xiao,
> 
> You can just remove all shadow rmaps now that you've agreed per-memslot
> flushes are not necessary. Which then gets rid of necessity for lockless 
> rmap accesses. Right?

Hi Marcelo,

I am worried about:

==
We can not release all rmaps. If we do this, ->invalidate_page and
->invalidate_range_start can not find any spte using the host page,
that means, Accessed/Dirty for host page is missing tracked.
(missing call kvm_set_pfn_accessed and kvm_set_pfn_dirty properly.)

[https://lkml.org/lkml/2013/4/18/358]
==

Do you think this is an issue? What's your idea?

Thanks!



Re: [PATCH v3 00/15] KVM: MMU: fast zap all shadow pages

2013-04-21 Thread Xiao Guangrong
On 04/21/2013 09:03 PM, Gleb Natapov wrote:
> On Tue, Apr 16, 2013 at 02:32:38PM +0800, Xiao Guangrong wrote:
>> This patchset is based on my previous two patchset:
>> [PATCH 0/2] KVM: x86: avoid potential soft lockup and unneeded mmu reload
>> (https://lkml.org/lkml/2013/4/1/2)
>>
>> [PATCH v2 0/6] KVM: MMU: fast invalid all mmio sptes
>> (https://lkml.org/lkml/2013/4/1/134)
>>
>> Changlog:
>> V3:
>>   completely redesign the algorithm, please see below.
>>
> This looks pretty complicated. Is it still needed in order to avoid soft
> lockups after "avoid potential soft lockup and unneeded mmu reload" patch?

Yes.

I discussed this point with Marcelo:

==
BTW, to my honest, i do not think spin_needbreak is a good way - it does
not fix the hot-lock contention and it just occupies more cpu time to avoid
possible soft lock-ups.

Especially, zap-all-shadow-pages can let other vcpus fault and vcpus contest
mmu-lock, then zap-all-shadow-pages release mmu-lock and wait, other vcpus
create page tables again. zap-all-shadow-page need long time to be finished,
the worst case is, it can not completed forever on intensive vcpu and memory
usage.

I still think the right way to fix this kind of thing is optimization for
mmu-lock.
==

Which parts scare you? Let's find a way to optimize them. ;) For example,
if you do not like unmap_memslot_rmap_nolock(), we can simplify it - we can
use walk_shadow_page_lockless_begin() and walk_shadow_page_lockless_end() to
protect the sptes instead of kvm->being_unmapped_rmap.
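
For reference, a hedged sketch of the pattern being proposed (the begin/end
helpers and the lockless iterator exist in arch/x86/kvm/mmu.c; the loop body
here is illustrative only): the walk is bracketed so that the shadow pages
seen during it can not be freed underneath the walker.

/* Hedged sketch only: how a lockless spte walk is usually bracketed. */
static void sketch_walk_sptes_nolock(struct kvm_vcpu *vcpu, u64 addr)
{
	struct kvm_shadow_walk_iterator iterator;
	u64 spte;

	walk_shadow_page_lockless_begin(vcpu);
	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
		if (!is_shadow_present_pte(spte))
			break;
		/* ... inspect or clear the spte here ... */
	}
	walk_shadow_page_lockless_end(vcpu);
}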

Thanks!



Re: [PATCH v3 15/15] KVM: MMU: replace kvm_zap_all with kvm_mmu_invalid_all_pages

2013-04-21 Thread Xiao Guangrong
On 04/21/2013 01:18 AM, Marcelo Tosatti wrote:
> On Thu, Apr 18, 2013 at 12:03:45PM +0800, Xiao Guangrong wrote:
>> On 04/18/2013 08:08 AM, Marcelo Tosatti wrote:
>>> On Tue, Apr 16, 2013 at 02:32:53PM +0800, Xiao Guangrong wrote:
>>>> Use kvm_mmu_invalid_all_pages in kvm_arch_flush_shadow_all and
>>>> rename kvm_zap_all to kvm_free_all which is used to free all
>>>> memmory used by kvm mmu when vm is being destroyed, at this time,
>>>> no vcpu exists and mmu-notify has been unregistered, so we can
>>>> free the shadow pages out of mmu-lock
>>>
>>> Since there is no contention for mmu-lock its also not a problem to 
>>> grab the lock right?
>>
>> This still has contention. Other mmu-notify can happen when we handle
>> ->release(). On the other handle, spin-lock is not preemptable.
> 
> Don't think so:

Hi Marcelo,

The comment of ->release() says:

/*
 * Called either by mmu_notifier_unregister or when the mm is
 * being destroyed by exit_mmap, always before all pages are
 * freed. This can run concurrently with other mmu notifier
 * methods (the ones invoked outside the mm context)
> 
> kvm_coalesced_mmio_free(kvm);
> #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
> #else
> kvm_arch_flush_shadow_all(kvm);
> #endif
> kvm_arch_destroy_vm(kvm);

The contention does not exist in the code you listed above. It happens when
the VM exits abnormally (for example, when the VM is killed). Please refer to
commit 3ad3d90 (mm: mmu_notifier: fix freed page still mapped in secondary MMU).
The current mmu-notify code is wrong and I have posted a patch to fix it, which
can be found at:
http://marc.info/?l=kvm&m=136609583232031&w=2

Maybe I misunderstood your meaning. This patch tries to use
kvm_mmu_invalid_all_pages in ->release and renames kvm_zap_all to
kvm_free_all. Do you mean we can still use mmu-lock in kvm_free_all()?
If yes, I do not have a strong opinion on this point and will keep
kvm_free_all under the protection of mmu-lock.

> 
>>> Automated verification of locking/srcu might complain.
>>
>> We hold slot-lock to free shadow page out of mmu-lock, it can avoid
>> the complain. No?
> 
> Not if it realizes srcu is required to access the data structures.

It seems that kvm->srcu is only used to protect kvm->memslots; see kvm_memslots():

static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
{
return rcu_dereference_check(kvm->memslots,
srcu_read_lock_held(&kvm->srcu)
|| lockdep_is_held(&kvm->slots_lock));
}

kvm->memslots can be safely accessed while holding kvm->srcu _or_ slots_lock.
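
For reference, a short, hedged sketch of the reader side of that rule (this
is the usual pattern around kvm->srcu; the loop body is illustrative only):

/* Hedged sketch: dereferencing kvm->memslots under kvm->srcu. */
static void sketch_walk_memslots(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(slot, slots)
		/* ... read-only use of @slot ... */;
	srcu_read_unlock(&kvm->srcu, idx);
}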

Thanks!



Re: [PATCH v3 12/15] KVM: MMU: fast invalid all shadow pages

2013-04-18 Thread Xiao Guangrong
On 04/18/2013 09:29 PM, Marcelo Tosatti wrote:
> On Thu, Apr 18, 2013 at 10:03:06AM -0300, Marcelo Tosatti wrote:
>> On Thu, Apr 18, 2013 at 12:00:16PM +0800, Xiao Guangrong wrote:
>>>>
>>>> What is the justification for this? 
>>>
>>> We want the rmap of being deleted memslot is removed-only that is
>>> needed for unmapping rmap out of mmu-lock.
>>>
>>> ==
>>> 1) do not corrupt the rmap
>>> 2) keep pte-list-descs available
>>> 3) keep shadow page available
>>>
>>> Resolve 1):
>>> we make the invalid rmap be remove-only that means we only delete and
>>> clear spte from the rmap, no new sptes can be added to it.
>>> This is reasonable since kvm can not do address translation on invalid rmap
>>> (gfn_to_pfn is failed on invalid memslot) and all sptes on invalid rmap can
>>> not be reused (they belong to invalid shadow page).
>>> ==
>>>
>>> clear_flush_young / test_young / change_pte of mmu-notify can rewrite
>>> rmap with the present-spte (P bit is set), we should umap rmap in
>>> these handlers.
>>>
>>>>
>>>>> +
>>>>> + /*
>>>>> +  * To ensure that all vcpus and mmu-notify are not clearing
>>>>> +  * spte and rmap entry.
>>>>> +  */
>>>>> + synchronize_srcu_expedited(&kvm->srcu);
>>>>> +}
>>>>> +
>>>>>  #ifdef MMU_DEBUG
>>>>>  static int is_empty_shadow_page(u64 *spt)
>>>>>  {
>>>>> @@ -2219,6 +2283,11 @@ static void clear_sp_write_flooding_count(u64 
>>>>> *spte)
>>>>>   __clear_sp_write_flooding_count(sp);
>>>>>  }
>>>>>  
>>>>> +static bool is_valid_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>>>>> +{
>>>>> + return likely(sp->mmu_valid_gen == kvm->arch.mmu_valid_gen);
>>>>> +}
>>>>> +
>>>>>  static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
>>>>>gfn_t gfn,
>>>>>gva_t gaddr,
>>>>> @@ -2245,6 +2314,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
>>>>> kvm_vcpu *vcpu,
>>>>>   role.quadrant = quadrant;
>>>>>   }
>>>>>   for_each_gfn_sp(vcpu->kvm, sp, gfn) {
>>>>> + if (!is_valid_sp(vcpu->kvm, sp))
>>>>> + continue;
>>>>> +
>>>>>   if (!need_sync && sp->unsync)
>>>>>   need_sync = true;
>>>>>  
>>>>> @@ -2281,6 +2353,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
>>>>> kvm_vcpu *vcpu,
>>>>>  
>>>>>   account_shadowed(vcpu->kvm, gfn);
>>>>>   }
>>>>> + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
>>>>>   init_shadow_page_table(sp);
>>>>>   trace_kvm_mmu_get_page(sp, true);
>>>>>   return sp;
>>>>> @@ -2451,8 +2524,12 @@ static int kvm_mmu_prepare_zap_page(struct kvm 
>>>>> *kvm, struct kvm_mmu_page *sp,
>>>>>   ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
>>>>>   kvm_mmu_page_unlink_children(kvm, sp);
>>>>
>>>> The rmaps[] arrays linked to !is_valid_sp() shadow pages should not be
>>>> accessed (as they have been freed already).
>>>>
>>>> I suppose the is_valid_sp() conditional below should be moved earlier,
>>>> before kvm_mmu_unlink_parents or any other rmap access.
>>>>
>>>> This is fine: the !is_valid_sp() shadow pages are only reachable
>>>> by SLAB and the hypervisor itself.
>>>
>>> Unfortunately we can not do this. :(
>>>
>>> The sptes in shadow pape can linked to many slots, if the spte is linked
>>> to the rmap of being deleted memslot, it is ok, otherwise, the rmap of
>>> still used memslot is miss updated.
>>>
>>> For example, slot 0 is being deleted, sp->spte[0] is linked on slot[0].rmap,
>>> sp->spte[1] is linked on slot[1].rmap. If we do not access rmap of this 
>>> 'sp',
>>> the already-freed spte[1] is still linked on slot[1].rmap.
>>>
>>> We can let kvm update the rmap for sp->spte[1] and do not unlink 
>>> sp->spte[0].
>>> This is also not allowed sin

Re: [PATCH v3 08/15] KVM: MMU: allow unmap invalid rmap out of mmu-lock

2013-04-18 Thread Xiao Guangrong
On 04/18/2013 07:38 PM, Gleb Natapov wrote:
> On Thu, Apr 18, 2013 at 07:22:23PM +0800, Xiao Guangrong wrote:
>> On 04/18/2013 07:00 PM, Gleb Natapov wrote:
>>> On Tue, Apr 16, 2013 at 02:32:46PM +0800, Xiao Guangrong wrote:
>>>> pte_list_clear_concurrently allows us to reset pte-desc entry
>>>> out of mmu-lock. We can reset spte out of mmu-lock if we can protect the
>>>> lifecycle of sp, we use this way to achieve the goal:
>>>>
>>>> unmap_memslot_rmap_nolock():
>>>> for-each-rmap-in-slot:
>>>>   preempt_disable
>>>>   kvm->arch.being_unmapped_rmap = rmapp
>>>>   clear spte and reset rmap entry
>>>>   kvm->arch.being_unmapped_rmap = NULL
>>>>   preempt_enable
>>>>
>>>> Other patch like zap-sp and mmu-notify which are protected
>>>> by mmu-lock:
>>>>   clear spte and reset rmap entry
>>>> retry:
>>>>   if (kvm->arch.being_unmapped_rmap == rmap)
>>>>goto retry
>>>> (the wait is very rare and clear one rmap is very fast, it
>>>> is not bad even if wait is needed)
>>>>
>>> I do not understand what how this achieve the goal. Suppose that rmap
>>> == X and kvm->arch.being_unmapped_rmap == NULL so "goto retry" is skipped,
>>> but moment later unmap_memslot_rmap_nolock() does
>>> vm->arch.being_unmapped_rmap = X.
>>
>> Access rmap is always safe since rmap and its entries are valid until
>> memslot is destroyed.
>>
>> This algorithm protects spte since it can be freed in the protection of 
>> mmu-lock.
>>
>> In your scenario:
>>
>> ==
>>CPU 1  CPU 2
>>
>> vcpu / mmu-notify access the RMAP unmap rmap out of mmu-lock which 
>> is under
>> which is under mmu-lock   slot-lock
>>
>> zap spte1
>> clear RMAP entry
>>
>> kvm->arch.being_unmapped_rmap = NULL,
>> do not wait
>>
>> free spte1
>>
>> set kvm->arch.being_unmapped_rmap = 
>> RMAP
>> walking RMAP and do not see spet1 on 
>> RMAP
>> (the entry of spte 1 has been reset 
>> by CPU 1)
> and what prevents this from happening concurrently with "clear RMAP
> entry"? Is it safe?

All the possible changes to a RMAP entry go from a valid spte to
PTE_LIST_SPTE_SKIP
(there is no valid-spte to valid-spte transition, and no new spte is added).

There are three possible cases:
case 1): both paths see the valid spte.
 The worst case is that the host page is A/D tracked twice
 (multiple calls of kvm_set_pfn_accessed/kvm_set_pfn_dirty); that is safe.

case 2): only the path under the protection of mmu-lock sees the valid spte.
 This is safe since the RMAP and the spte are always valid under mmu-lock.

case 3): only the path outside of mmu-lock sees the valid spte.
 Then the path under mmu-lock will wait until the no-lock path has
 finished. The spte is still valid, and the no-lock path is safe to call
 kvm_set_pfn_accessed/kvm_set_pfn_dirty.

Do you see any potential issue?


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 08/15] KVM: MMU: allow unmap invalid rmap out of mmu-lock

2013-04-18 Thread Xiao Guangrong
On 04/18/2013 07:00 PM, Gleb Natapov wrote:
> On Tue, Apr 16, 2013 at 02:32:46PM +0800, Xiao Guangrong wrote:
>> pte_list_clear_concurrently allows us to reset pte-desc entry
>> out of mmu-lock. We can reset spte out of mmu-lock if we can protect the
>> lifecycle of sp, we use this way to achieve the goal:
>>
>> unmap_memslot_rmap_nolock():
>> for-each-rmap-in-slot:
>>   preempt_disable
>>   kvm->arch.being_unmapped_rmap = rmapp
>>   clear spte and reset rmap entry
>>   kvm->arch.being_unmapped_rmap = NULL
>>   preempt_enable
>>
>> Other patch like zap-sp and mmu-notify which are protected
>> by mmu-lock:
>>   clear spte and reset rmap entry
>> retry:
>>   if (kvm->arch.being_unmapped_rmap == rmap)
>>  goto retry
>> (the wait is very rare and clear one rmap is very fast, it
>> is not bad even if wait is needed)
>>
> I do not understand what how this achieve the goal. Suppose that rmap
> == X and kvm->arch.being_unmapped_rmap == NULL so "goto retry" is skipped,
> but moment later unmap_memslot_rmap_nolock() does
> vm->arch.being_unmapped_rmap = X.

Accessing the rmap is always safe since the rmap and its entries are valid
until the memslot is destroyed.

This algorithm protects the spte, since a spte can only be freed under the
protection of mmu-lock.

In your scenario:

==
CPU 1 (vcpu / mmu-notify,             CPU 2 (unmap the rmap out of mmu-lock,
       under mmu-lock)                       under slot-lock)

zap spte1
clear the RMAP entry
see kvm->arch.being_unmapped_rmap == NULL, so do not wait
free spte1
                                      set kvm->arch.being_unmapped_rmap = RMAP
                                      walk the RMAP and do not see spte1 on it
                                      (the entry of spte1 has been reset by CPU 1)
                                      set kvm->arch.being_unmapped_rmap = NULL
==

That protects CPU 2 from accessing the freed spte.
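
To make the handshake concrete, here is a minimal sketch of the two sides
being described in this thread (condensed and illustrative only:
unmap_one_rmap_nolock is a hypothetical wrapper name, while
sync_being_unmapped_rmap is condensed from the helper patch 08/15 adds and
__kvm_unmap_invalid_rmapp appears in patch 07/15):

/* Lock-free side: runs under slot-lock, outside mmu-lock. */
static void unmap_one_rmap_nolock(struct kvm *kvm, unsigned long *rmapp)
{
        preempt_disable();
        kvm->arch.being_unmapped_rmap = rmapp;  /* publish the rmap being walked */
        smp_mb();

        __kvm_unmap_invalid_rmapp(rmapp);       /* clear sptes, reset rmap entries */

        smp_mb();
        kvm->arch.being_unmapped_rmap = NULL;   /* unpublish */
        preempt_enable();
}

/* mmu-lock side: called after zapping a spte and resetting its rmap entry. */
static void sync_being_unmapped_rmap(struct kvm *kvm, unsigned long *rmapp)
{
        smp_mb();
        /* Wait until a concurrent lock-free walk of this rmap has finished. */
        while (ACCESS_ONCE(kvm->arch.being_unmapped_rmap) == rmapp)
                cpu_relax();
}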


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 15/15] KVM: MMU: replace kvm_zap_all with kvm_mmu_invalid_all_pages

2013-04-17 Thread Xiao Guangrong
On 04/18/2013 08:08 AM, Marcelo Tosatti wrote:
> On Tue, Apr 16, 2013 at 02:32:53PM +0800, Xiao Guangrong wrote:
>> Use kvm_mmu_invalid_all_pages in kvm_arch_flush_shadow_all and
>> rename kvm_zap_all to kvm_free_all which is used to free all
>> memmory used by kvm mmu when vm is being destroyed, at this time,
>> no vcpu exists and mmu-notify has been unregistered, so we can
>> free the shadow pages out of mmu-lock
> 
> Since there is no contention for mmu-lock its also not a problem to 
> grab the lock right?

This still has contention: other mmu-notify callbacks can happen while we
handle ->release(). On the other hand, a spin-lock is not preemptible.

> 
> Automated verification of locking/srcu might complain.

We hold the slot-lock to free shadow pages out of mmu-lock, which should avoid
that complaint. No?



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 12/15] KVM: MMU: fast invalid all shadow pages

2013-04-17 Thread Xiao Guangrong
On 04/18/2013 08:05 AM, Marcelo Tosatti wrote:
> On Tue, Apr 16, 2013 at 02:32:50PM +0800, Xiao Guangrong wrote:
>> The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
>> walk and zap all shadow pages one by one, also it need to zap all guest
>> page's rmap and all shadow page's parent spte list. Particularly, things
>> become worse if guest uses more memory or vcpus. It is not good for
>> scalability.
>>
>> In this patch, we introduce a faster way to invalid all shadow pages.
>> KVM maintains a global mmu invalid generation-number which is stored in
>> kvm->arch.mmu_valid_gen and every shadow page stores the current global
>> generation-number into sp->mmu_valid_gen when it is created.
>>
>> When KVM need zap all shadow pages sptes, it just simply increase the
>> global generation-number then reload root shadow pages on all vcpus.
>> Vcpu will create a new shadow page table according to current kvm's
>> generation-number. It ensures the old pages are not used any more.
>>
>> The invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
>> are keeped in mmu-cache until page allocator reclaims page.
>>
>> If the invalidation is due to memslot changed, its rmap amd lpage-info
>> will be freed soon, in order to avoiding use invalid memory, we unmap
>> all sptes on its rmap and always reset the large-info all memslots so
>> that rmap and lpage info can be safely freed.
>>
>> Signed-off-by: Xiao Guangrong 
>> ---
>>  arch/x86/include/asm/kvm_host.h |2 +
>>  arch/x86/kvm/mmu.c  |   85 
>> +-
>>  arch/x86/kvm/mmu.h  |4 ++
>>  arch/x86/kvm/x86.c  |6 +++
>>  4 files changed, 94 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h 
>> b/arch/x86/include/asm/kvm_host.h
>> index 1ad9a34..6f8ee18 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -223,6 +223,7 @@ struct kvm_mmu_page {
>>  int root_count;  /* Currently serving as active root */
>>  unsigned int unsync_children;
>>  unsigned long parent_ptes;  /* Reverse mapping for parent_pte */
>> +unsigned long mmu_valid_gen;
>>  DECLARE_BITMAP(unsync_child_bitmap, 512);
>>  
>>  #ifdef CONFIG_X86_32
>> @@ -531,6 +532,7 @@ struct kvm_arch {
>>  unsigned int n_requested_mmu_pages;
>>  unsigned int n_max_mmu_pages;
>>  unsigned int indirect_shadow_pages;
>> +unsigned long mmu_valid_gen;
>>  struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
>>  /*
>>   * Hash table of struct kvm_mmu_page.
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 9ac584f..12129b7 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -1732,6 +1732,11 @@ static struct rmap_operations invalid_rmap_ops = {
>>  .rmap_unmap = kvm_unmap_invalid_rmapp
>>  };
>>  
>> +static void init_invalid_memslot_rmap_ops(struct kvm_memory_slot *slot)
>> +{
>> +slot->arch.ops = &invalid_rmap_ops;
>> +}
>> +
>>  typedef void (*handle_rmap_fun)(unsigned long *rmapp, void *data);
>>  static void walk_memslot_rmap_nolock(struct kvm_memory_slot *slot,
>>   handle_rmap_fun fun, void *data)
>> @@ -1812,6 +1817,65 @@ void free_meslot_rmap_desc_nolock(struct 
>> kvm_memory_slot *slot)
>>  walk_memslot_rmap_nolock(slot, free_rmap_desc_nolock, NULL);
>>  }
>>  
>> +/*
>> + * Fast invalid all shadow pages belong to @slot.
>> + *
>> + * @slot != NULL means the invalidation is caused the memslot specified
>> + * by @slot is being deleted, in this case, we should ensure that rmap
>> + * and lpage-info of the @slot can not be used after calling the function.
>> + * Specially, if @slot is INVALID_ALL_SLOTS, all slots will be deleted
>> + * soon, it always happens when kvm is being destroyed.
>> + *
>> + * @slot == NULL means the invalidation due to other reasons, we need
>> + * not care rmap and lpage-info since they are still valid after calling
>> + * the function.
>> + */
>> +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
>> +   struct kvm_memory_slot *slot)
>> +{
>> +struct kvm_memory_slot *each_slot;
>> +
>> +spin_lock(&kvm->mmu_lock);
>> +kvm->arch.mmu_valid_gen++;
>> +
>> +if (slot == INVALID_ALL_SLOTS)
>> +kvm_for_each_memslot(eac

Re: [PATCH v3 07/15] KVM: MMU: introduce invalid rmap handlers

2013-04-17 Thread Xiao Guangrong
On 04/18/2013 07:38 AM, Marcelo Tosatti wrote:
> On Tue, Apr 16, 2013 at 02:32:45PM +0800, Xiao Guangrong wrote:
>> Invalid rmaps is the rmap of the invalid memslot which is being
>> deleted, especially, we can treat all rmaps are invalid when
>> kvm is being destroyed since all memslot will be deleted soon.
>> MMU should remove all sptes on these rmaps before the invalid
>> memslot fully deleted
>>
>> The reason why we separately handle invalid rmap is we want to
>> unmap invalid-rmap out of mmu-lock to achieve scale performance
>> on intensive memory and vcpu used guest
> 
> Better try to make the code simpler, and introduce complexity only 
> if necessary.

Marcelo,

This code is necessary to implement "unmap invalid rmap out of mmu-lock",
the reason why we need it is that ...

> 
> The idea to zap the roots is very elegant and apparently effective. What
> are its problems?

I mentioned it in 00/15:

* Challenges
Some page invalidation is requested when a memslot is moved or deleted
or when kvm is being destroyed; these paths call zap_all_pages to delete all
sps using their rmap and lpage-info, and after zap_all_pages returns, the rmap
and lpage-info will be freed. So we should implement a fast way to delete sps
from the rmap and lpage-info.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: mmu_notifier: re-fix freed page still mapped in secondary MMU

2013-04-17 Thread Xiao Guangrong
On 04/18/2013 02:45 AM, Robin Holt wrote:

>>> For the v3.10 release, we should work on making this more
>>> correct and completely documented.
>>
>> Better document is always welcomed.
>>
>> Double call ->release is not bad, like i mentioned it in the changelog:
>>
>> it is really rare (e.g, can not happen on kvm since mmu-notify is 
>> unregistered
>> after exit_mmap()) and the later call of multiple ->release should be
>> fast since all the pages have already been released by the first call.
>>
>> But, of course, it's great if you have a _light_ way to avoid this.
>
> Getting my test environment set back up took longer than I would have 
> liked.
>
> Your patch passed.  I got no NULL-pointer derefs.

 Thanks for your test again.

>
> How would you feel about adding the following to your patch?

 I prefer to make these changes as a separate patch, this change is the
 improvement, please do not mix it with bugfix.
>>>
>>> I think your "improvement" classification is a bit deceiving.  My previous
>>> patch fixed the bug in calling release multiple times.  Your patch without
>>> this will reintroduce that buggy behavior.  Just because the bug is already
>>> worked around by KVM does not mean it is not a bug.
>>
>> As your tested, calling ->release() multiple times can work, but just make 
>> your
>> testcase more _slower_. So your changes is trying to speed it up - it is a
>> improvement.
>>
>> Well, _if_ it is really a bug, could you please do not fix two bugs in one 
>> patch?
> 
> The code, as is, does not call ->release() multiple times.  Your code
> changes the behavior to call it multiple times.  You are introducing the
> bug by your code changes.  Why not fix the bug you create in the patch
> which creates it?

Andrew, what are your thoughts?


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: mmu_notifier: re-fix freed page still mapped in secondary MMU

2013-04-17 Thread Xiao Guangrong
On 04/17/2013 10:10 PM, Robin Holt wrote:
> On Wed, Apr 17, 2013 at 10:55:26AM +0800, Xiao Guangrong wrote:
>> On 04/17/2013 02:08 AM, Robin Holt wrote:
>>> On Tue, Apr 16, 2013 at 09:07:20PM +0800, Xiao Guangrong wrote:
>>>> On 04/16/2013 07:43 PM, Robin Holt wrote:
>>>>> Argh.  Taking a step back helped clear my head.
>>>>>
>>>>> For the -stable releases, I agree we should just go with your
>>>>> revert-plus-hlist_del_init_rcu patch.  I will give it a test
>>>>> when I am in the office.
>>>>
>>>> Okay. Wait for your test report. Thank you in advance.
>>>>
>>>>>
>>>>> For the v3.10 release, we should work on making this more
>>>>> correct and completely documented.
>>>>
>>>> Better document is always welcomed.
>>>>
>>>> Double call ->release is not bad, like i mentioned it in the changelog:
>>>>
>>>> it is really rare (e.g, can not happen on kvm since mmu-notify is 
>>>> unregistered
>>>> after exit_mmap()) and the later call of multiple ->release should be
>>>> fast since all the pages have already been released by the first call.
>>>>
>>>> But, of course, it's great if you have a _light_ way to avoid this.
>>>
>>> Getting my test environment set back up took longer than I would have liked.
>>>
>>> Your patch passed.  I got no NULL-pointer derefs.
>>
>> Thanks for your test again.
>>
>>>
>>> How would you feel about adding the following to your patch?
>>
>> I prefer to make these changes as a separate patch, this change is the
>> improvement, please do not mix it with bugfix.
> 
> I think your "improvement" classification is a bit deceiving.  My previous
> patch fixed the bug in calling release multiple times.  Your patch without
> this will reintroduce that buggy behavior.  Just because the bug is already
> worked around by KVM does not mean it is not a bug.

As your test showed, calling ->release() multiple times does work, it just
makes your testcase slower. So your change is trying to speed it up - it is an
improvement.

Well, _if_ it is really a bug, could you please not fix two bugs in one
patch?

Thanks!

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: mmu_notifier: re-fix freed page still mapped in secondary MMU

2013-04-16 Thread Xiao Guangrong
On 04/17/2013 02:08 AM, Robin Holt wrote:
> On Tue, Apr 16, 2013 at 09:07:20PM +0800, Xiao Guangrong wrote:
>> On 04/16/2013 07:43 PM, Robin Holt wrote:
>>> Argh.  Taking a step back helped clear my head.
>>>
>>> For the -stable releases, I agree we should just go with your
>>> revert-plus-hlist_del_init_rcu patch.  I will give it a test
>>> when I am in the office.
>>
>> Okay. Wait for your test report. Thank you in advance.
>>
>>>
>>> For the v3.10 release, we should work on making this more
>>> correct and completely documented.
>>
>> Better document is always welcomed.
>>
>> Double call ->release is not bad, like i mentioned it in the changelog:
>>
>> it is really rare (e.g, can not happen on kvm since mmu-notify is 
>> unregistered
>> after exit_mmap()) and the later call of multiple ->release should be
>> fast since all the pages have already been released by the first call.
>>
>> But, of course, it's great if you have a _light_ way to avoid this.
> 
> Getting my test environment set back up took longer than I would have liked.
> 
> Your patch passed.  I got no NULL-pointer derefs.

Thanks for your test again.

> 
> How would you feel about adding the following to your patch?

I prefer to make these changes as a separate patch; this change is an
improvement, so please do not mix it with the bugfix.

You can make a patchset (the comment improvements and this change) based on
my fix.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: mmu_notifier: re-fix freed page still mapped in secondary MMU

2013-04-16 Thread Xiao Guangrong
On 04/16/2013 07:43 PM, Robin Holt wrote:
> Argh.  Taking a step back helped clear my head.
> 
> For the -stable releases, I agree we should just go with your
> revert-plus-hlist_del_init_rcu patch.  I will give it a test
> when I am in the office.

Okay, I will wait for your test report. Thank you in advance.

> 
> For the v3.10 release, we should work on making this more
> correct and completely documented.

Better documentation is always welcome.

A double call of ->release is not bad, as I mentioned in the changelog:

it is really rare (e.g. it can not happen on kvm since mmu-notify is
unregistered after exit_mmap()) and the later calls, when ->release is called
multiple times, should be fast since all the pages have already been released
by the first call.

But, of course, it's great if you have a _light_ way to avoid this.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: mmu_notifier: re-fix freed page still mapped in secondary MMU

2013-04-16 Thread Xiao Guangrong
On 04/16/2013 05:31 PM, Robin Holt wrote:
> On Tue, Apr 16, 2013 at 02:39:49PM +0800, Xiao Guangrong wrote:
>> The commit 751efd8610d3 (mmu_notifier_unregister NULL Pointer deref
>> and multiple ->release()) breaks the fix:
>> 3ad3d901bbcfb15a5e4690e55350db0899095a68
>> (mm: mmu_notifier: fix freed page still mapped in secondary MMU)
> 
> Can you describe how the page is still mapped?  I thought I had all
> cases covered.  Whichever call hits first, I thought we had one callout
> to the registered notifiers.  Are you saying we need multiple callouts?

No.

Your patch did this:

hlist_del_init_rcu(&mn->hlist);1 <==
+   spin_unlock(&mm->mmu_notifier_mm->lock);
+
+   /*
+* Clear sptes. (see 'release' description in mmu_notifier.h)
+*/
+   if (mn->ops->release)
+   mn->ops->release(mn, mm);2 <==
+
+   spin_lock(&mm->mmu_notifier_mm->lock);

At point 1, you unlink the notifier, but the pages are still on the LRU. Another
cpu can reclaim a page without ->invalidate_page() being called.

At point 2, you call ->release(); the secondary MMU may mark the page
Accessed/Dirty, but that page is already on the free-list of the page allocator.

> 
> Also, shouldn't you be asking for a revert commit and then supply a
> subsequent commit for the real fix?  I thought that was the process for
> doing a revert.

A pure revert is not possible since your patch moved hlist_for_each_entry_rcu,
which has been modified since then.

Should I do a pure reversion + hlist_for_each_entry_rcu update first?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mm: mmu_notifier: re-fix freed page still mapped in secondary MMU

2013-04-16 Thread Xiao Guangrong
The commit 751efd8610d3 (mmu_notifier_unregister NULL Pointer deref
and multiple ->release()) breaks the fix:
3ad3d901bbcfb15a5e4690e55350db0899095a68
(mm: mmu_notifier: fix freed page still mapped in secondary MMU)

This patch reverts that commit and simply fixes the bug spotted
by it.

The bug spotted by commit 751efd8610d3 is:
==
There is a race condition between mmu_notifier_unregister() and
__mmu_notifier_release().

Assume two tasks, one calling mmu_notifier_unregister() as a result of a
filp_close() ->flush() callout (task A), and the other calling
mmu_notifier_release() from an mmput() (task B).

A   B
t1  srcu_read_lock()
t2  if (!hlist_unhashed())
t3  srcu_read_unlock()
t4  srcu_read_lock()
t5  hlist_del_init_rcu()
t6  synchronize_srcu()
t7  srcu_read_unlock()
t8  hlist_del_rcu()  <--- NULL pointer deref.
==

This can be fixed by using hlist_del_init_rcu instead of hlist_del_rcu.
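
For context, the reason this closes the race is that hlist_del_init_rcu()
leaves the node in the "unhashed" state after removal, so a second deletion
attempt becomes a no-op instead of dereferencing a stale pprev pointer.
Roughly (simplified from include/linux/rculist.h, shown only as background):

static inline void hlist_del_init_rcu(struct hlist_node *n)
{
        if (!hlist_unhashed(n)) {       /* already removed: do nothing */
                __hlist_del(n);
                n->pprev = NULL;        /* mark unhashed instead of poisoning */
        }
}

hlist_del_rcu(), by contrast, unconditionally unlinks the node and poisons
n->pprev, so the loser of the race at t8 above dereferences a NULL/stale
pprev and crashes.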

The other issue spotted in that commit, "multiple ->release() callouts",
does not need much care because it is really rare (e.g. it can not happen on
kvm since mmu-notify is unregistered after exit_mmap()) and the later calls,
when ->release is called multiple times, should be fast since all the pages
have already been released by the first call.

Signed-off-by: Xiao Guangrong 
---
 mm/mmu_notifier.c |   81 +++--
 1 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index be04122..606777a 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -40,48 +40,45 @@ void __mmu_notifier_release(struct mm_struct *mm)
int id;

/*
-* srcu_read_lock() here will block synchronize_srcu() in
-* mmu_notifier_unregister() until all registered
-* ->release() callouts this function makes have
-* returned.
+* SRCU here will block mmu_notifier_unregister until
+* ->release returns.
 */
id = srcu_read_lock(&srcu);
+   hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
+   /*
+* if ->release runs before mmu_notifier_unregister it
+* must be handled as it's the only way for the driver
+* to flush all existing sptes and stop the driver
+* from establishing any more sptes before all the
+* pages in the mm are freed.
+*/
+   if (mn->ops->release)
+   mn->ops->release(mn, mm);
+   srcu_read_unlock(&srcu, id);
+
spin_lock(&mm->mmu_notifier_mm->lock);
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
mn = hlist_entry(mm->mmu_notifier_mm->list.first,
 struct mmu_notifier,
 hlist);
-
/*
-* Unlink.  This will prevent mmu_notifier_unregister()
-* from also making the ->release() callout.
+* We arrived before mmu_notifier_unregister so
+* mmu_notifier_unregister will do nothing other than
+* to wait ->release to finish and
+* mmu_notifier_unregister to return.
 */
hlist_del_init_rcu(&mn->hlist);
-   spin_unlock(&mm->mmu_notifier_mm->lock);
-
-   /*
-* Clear sptes. (see 'release' description in mmu_notifier.h)
-*/
-   if (mn->ops->release)
-   mn->ops->release(mn, mm);
-
-   spin_lock(&mm->mmu_notifier_mm->lock);
}
spin_unlock(&mm->mmu_notifier_mm->lock);

/*
-* All callouts to ->release() which we have done are complete.
-* Allow synchronize_srcu() in mmu_notifier_unregister() to complete
-*/
-   srcu_read_unlock(&srcu, id);
-
-   /*
-* mmu_notifier_unregister() may have unlinked a notifier and may
-* still be calling out to it.  Additionally, other notifiers
-* may have been active via vmtruncate() et. al. Block here
-* to ensure that all notifier callouts for this mm have been
-* completed and the sptes are really cleaned up before returning
-* to exit_mmap().
+* synchronize_srcu here prevents mmu_notifier_release to
+* return to exit_mmap (which would proceed freeing all pages
+* in the mm) until the ->release method returns, if it was
+* invoked by mmu_notifier_unregister.
+*
+   

[PATCH v3 03/15] KVM: x86: do not reuse rmap when memslot is moved

2013-04-15 Thread Xiao Guangrong
Do not let kvm reuse the rmap of a memslot which is being moved;
then the rmap of a moved or deleted memslot can only be unmapped, and no
new spte can be added to it.

This makes it possible to unmap the rmap out of mmu-lock in the later patches.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 447789c..839e666 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6939,7 +6939,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
 {
-   if (change == KVM_MR_CREATE)
+   if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
if (kvm_arch_create_memslot(memslot))
return -ENOMEM;
 
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 07/15] KVM: MMU: introduce invalid rmap handlers

2013-04-15 Thread Xiao Guangrong
Invalid rmaps are the rmaps of an invalid memslot which is being
deleted; in particular, we can treat all rmaps as invalid when
kvm is being destroyed since all memslots will be deleted soon.
The MMU should remove all sptes on these rmaps before the invalid
memslot is fully deleted.

The reason why we handle invalid rmaps separately is that we want to
unmap invalid rmaps out of mmu-lock to achieve scalable performance
on guests with intensive memory and vcpu usage.

This patch makes all operations on an invalid rmap clear the
spte and reset the rmap's entry. In a later patch, we will introduce
the out-of-mmu-lock path to unmap invalid rmaps.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   80 
 1 files changed, 80 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 850eab5..2a7a5d0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1606,6 +1606,86 @@ void init_memslot_rmap_ops(struct kvm_memory_slot *slot)
slot->arch.ops = &normal_rmap_ops;
 }
 
+static int invalid_rmap_add(struct kvm_vcpu *vcpu, u64 *spte,
+   unsigned long *pte_list)
+{
+   WARN_ON(1);
+   return 0;
+}
+
+static void invalid_rmap_remove(u64 *spte, unsigned long *rmapp)
+{
+   pte_list_clear_concurrently(spte, rmapp);
+}
+
+static bool invalid_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+  bool pt_protect)
+{
+   WARN_ON(1);
+   return false;
+}
+
+static int __kvm_unmap_invalid_rmapp(unsigned long *rmapp)
+{
+   u64 *sptep;
+   struct rmap_iterator iter;
+
+   for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+ sptep = rmap_get_next(&iter)) {
+   if (sptep == PTE_LIST_SPTE_SKIP)
+   continue;
+
+   /* Do not call .rmap_remove(). */
+   if (mmu_spte_clear_track_bits(sptep))
+   pte_list_clear_concurrently(sptep, rmapp);
+   }
+
+   return 0;
+}
+
+static int kvm_unmap_invalid_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+   return __kvm_unmap_invalid_rmapp(rmapp);
+}
+
+static int invalid_rmap_set_pte(struct kvm *kvm, unsigned long *rmapp,
+   pte_t *ptep)
+{
+   return kvm_unmap_invalid_rmapp(kvm, rmapp);
+}
+
+/*
+ * Invalid rmaps is the rmap of the invalid memslot which is being
+ * deleted, especially, we can treat all rmaps are invalid when
+ * kvm is being destroyed since all memslot will be deleted soon.
+ * MMU should remove all sptes on these rmaps before the invalid
+ * memslot fully deleted.
+ *
+ * VCPUs can not do address translation on invalid memslots, that
+ * means no sptes can be added to their rmaps and no shadow page
+ * can be created in their memory regions, so rmap_add and
+ * rmap_write_protect on invalid memslot should never happen.
+ * Any sptes on invalid rmaps are stale and can not be reused,
+ * we drop all sptes on any other operations. So, all handlers
+ * on invalid rmap do the same thing - remove and zap sptes on
+ * the rmap.
+ *
+ * KVM use pte_list_clear_concurrently to clear spte on invalid
+ * rmap which resets rmap's entry but keeps rmap's memory. The
+ * rmap is fully destroyed when free the invalid memslot.
+ */
+static struct rmap_operations invalid_rmap_ops = {
+   .rmap_add = invalid_rmap_add,
+   .rmap_remove = invalid_rmap_remove,
+
+   .rmap_write_protect = invalid_rmap_write_protect,
+
+   .rmap_set_pte = invalid_rmap_set_pte,
+   .rmap_age = kvm_unmap_invalid_rmapp,
+   .rmap_test_age = kvm_unmap_invalid_rmapp,
+   .rmap_unmap = kvm_unmap_invalid_rmapp
+};
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 09/15] KVM: MMU: introduce free_meslot_rmap_desc_nolock

2013-04-15 Thread Xiao Guangrong
It frees the pte-list-descs used by the memslot rmap after the
memslot update is completed.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   26 ++
 arch/x86/kvm/mmu.h |1 +
 2 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e6414d2..9ac584f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1049,6 +1049,22 @@ static void pte_list_clear_concurrently(u64 *spte, 
unsigned long *pte_list)
return;
 }
 
+static void pte_list_free_desc(unsigned long *pte_list)
+{
+   struct pte_list_desc *desc, *next;
+   unsigned long pte_value = *pte_list;
+
+   if (!(pte_value & 1))
+   return;
+
+   desc = (struct pte_list_desc *)(pte_value & ~1ul);
+   do {
+   next = desc->more;
+   mmu_free_pte_list_desc(desc);
+   desc = next;
+   } while (desc);
+}
+
 typedef void (*pte_list_walk_fn) (u64 *spte);
 static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 {
@@ -1786,6 +1802,16 @@ unmap_memslot_rmap_nolock(struct kvm *kvm, struct 
kvm_memory_slot *slot)
walk_memslot_rmap_nolock(slot, unmap_invalid_rmap_nolock, kvm);
 }
 
+static void free_rmap_desc_nolock(unsigned long *rmapp, void *data)
+{
+   pte_list_free_desc(rmapp);
+}
+
+void free_meslot_rmap_desc_nolock(struct kvm_memory_slot *slot)
+{
+   walk_memslot_rmap_nolock(slot, free_rmap_desc_nolock, NULL);
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d6aa31a..ab434b7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -130,4 +130,5 @@ struct rmap_operations {
 };
 
 void init_memslot_rmap_ops(struct kvm_memory_slot *slot);
+void free_meslot_rmap_desc_nolock(struct kvm_memory_slot *slot);
 #endif
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 06/15] KVM: MMU: allow concurrently clearing spte on remove-only pte-list

2013-04-15 Thread Xiao Guangrong
This patch introduces PTE_LIST_SPTE_SKIP, a placeholder that
is set in a pte-list entry after removing a spte so that other sptes
on this pte_list are not moved and the pte-list-descs on the pte-list
are not freed.

If a vcpu can not add a spte to the pte-list (e.g. the rmap of an invalid
memslot) and a spte can not be freed during a pte-list walk, we can
concurrently clear sptes on the pte-list; the worst case is that we double
zap a spte, which is safe.

This patch only ensures that concurrently zapping a pte-list is safe;
we will keep sptes available during concurrent clearing in the later
patches.
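
As background on the marker: a pte_list word uses bit 0 to distinguish a
single spte pointer (bit 0 clear) from a pointer to a struct pte_list_desc
(bit 0 set), so the placeholder keeps bit 0 clear and uses an all-ones
pattern that can never match a real spte address. A tiny illustrative
helper (hypothetical, not part of this patch) shows how a walker can
classify a value read from a pte_list slot:

/* Illustrative sketch only: classify one entry read from a pte_list. */
static bool sptep_is_usable(u64 *sptep)
{
        /*
         * NULL means an unused desc slot; PTE_LIST_SPTE_SKIP marks a slot
         * whose spte was concurrently removed; anything else is a real
         * spte pointer that may be inspected.
         */
        return sptep && sptep != PTE_LIST_SPTE_SKIP;
}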

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   62 +++
 1 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 99ad2a4..850eab5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -900,6 +900,18 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t 
large_gfn)
 }
 
 /*
+ * It is the placeholder and it will be set on pte-list after removing
+ * a spte so that other sptes on this pte_list are not moved and the
+ * pte-list-descs on the pte-list are not freed.
+ *
+ * If vcpu can not add spte to the pte-list (e.g. the rmap on invalid
+ * memslot) and spte can not be freed during pte-list walk, we can
+ * cocurrently clear sptes on the pte-list, the worst case is, we double
+ * zap a spte that is safe.
+ */
+#define PTE_LIST_SPTE_SKIP (u64 *)((~0x0ul) & (~1))
+
+/*
  * Pte mapping structures:
  *
  * If pte_list bit zero is zero, then pte_list point to the spte.
@@ -1003,6 +1015,40 @@ static void pte_list_remove(u64 *spte, unsigned long 
*pte_list)
}
 }
 
+static void pte_list_clear_concurrently(u64 *spte, unsigned long *pte_list)
+{
+   struct pte_list_desc *desc;
+   unsigned long pte_value = *pte_list;
+   int i;
+
+   /* Empty pte list stores nothing. */
+   WARN_ON(!pte_value);
+
+   if (!(pte_value & 1)) {
+   if ((u64 *)pte_value == spte) {
+   *pte_list = (unsigned long)PTE_LIST_SPTE_SKIP;
+   return;
+   }
+
+   /* someone has already cleared it. */
+   WARN_ON(pte_value != (unsigned long)PTE_LIST_SPTE_SKIP);
+   return;
+   }
+
+   desc = (struct pte_list_desc *)(pte_value & ~1ul);
+   while (desc) {
+   for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+   if (desc->sptes[i] == spte) {
+   desc->sptes[i] = PTE_LIST_SPTE_SKIP;
+   return;
+   }
+
+   desc = desc->more;
+   }
+
+   return;
+}
+
 typedef void (*pte_list_walk_fn) (u64 *spte);
 static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 {
@@ -1214,6 +1260,12 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool 
*flush, bool pt_protect)
return false;
 }
 
+/* PTE_LIST_SPTE_SKIP is only used on invalid rmap. */
+static void check_valid_sptep(u64 *sptep)
+{
+   WARN_ON(sptep == PTE_LIST_SPTE_SKIP || !is_rmap_spte(*sptep));
+}
+
 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
 bool pt_protect)
 {
@@ -1222,7 +1274,7 @@ static bool __rmap_write_protect(struct kvm *kvm, 
unsigned long *rmapp,
bool flush = false;
 
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-   BUG_ON(!(*sptep & PT_PRESENT_MASK));
+   check_valid_sptep(sptep);
if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
sptep = rmap_get_first(*rmapp, &iter);
continue;
@@ -1293,7 +1345,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long 
*rmapp)
int need_tlb_flush = 0;
 
while ((sptep = rmap_get_first(*rmapp, &iter))) {
-   BUG_ON(!(*sptep & PT_PRESENT_MASK));
+   check_valid_sptep(sptep);
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, 
*sptep);
 
drop_spte(kvm, sptep);
@@ -1322,7 +1374,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned 
long *rmapp,
new_pfn = pte_pfn(*ptep);
 
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-   BUG_ON(!is_shadow_present_pte(*sptep));
+   check_valid_sptep(sptep);
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
 
need_flush = 1;
@@ -1455,7 +1507,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long 
*rmapp)
 
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
 sptep = rmap_get_next(&iter)) {
-   BUG_ON(!is_shadow_present_pte(*sptep));
+   check_valid_sptep(sptep);
 
if (*sptep & shadow_accessed_mask) {
   

[PATCH v3 11/15] KVM: MMU: introduce kvm_clear_all_lpage_info

2013-04-15 Thread Xiao Guangrong
This function is used to reset the large page info of all guest pages;
it will be used in a later patch.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |   25 +
 arch/x86/kvm/x86.h |2 ++
 2 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c5bb2c..fc4956c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6909,6 +6909,31 @@ static void memslot_set_lpage_disallowed(struct 
kvm_memory_slot *slot,
}
 }
 
+static void clear_memslot_lpage_info(struct kvm_memory_slot *slot)
+{
+   int i;
+
+   for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
+   int lpages;
+   int level = i + 1;
+
+   lpages = gfn_to_index(slot->base_gfn + slot->npages - 1,
+ slot->base_gfn, level) + 1;
+
+   memset(slot->arch.lpage_info[i - 1], 0,
+  sizeof(*slot->arch.lpage_info[i - 1]));
+   memslot_set_lpage_disallowed(slot, slot->npages, i, lpages);
+   }
+}
+
+void kvm_clear_all_lpage_info(struct kvm *kvm)
+{
+   struct kvm_memory_slot *slot;
+
+   kvm_for_each_memslot(slot, kvm->memslots)
+   clear_memslot_lpage_info(slot);
+}
+
 static int kvm_arch_create_memslot(struct kvm_memory_slot *slot)
 {
unsigned long npages = slot->npages;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e224f7a..beae540 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -108,6 +108,8 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu 
*vcpu, gpa_t gpa)
return false;
 }
 
+void kvm_clear_all_lpage_info(struct kvm *kvm);
+
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 08/15] KVM: MMU: allow unmap invalid rmap out of mmu-lock

2013-04-15 Thread Xiao Guangrong
pte_list_clear_concurrently allows us to reset a pte-desc entry
out of mmu-lock. We can reset a spte out of mmu-lock if we can protect the
lifecycle of the sp; we use the following scheme to achieve that goal:

unmap_memslot_rmap_nolock():
for-each-rmap-in-slot:
  preempt_disable
  kvm->arch.being_unmapped_rmap = rmapp
  clear spte and reset rmap entry
  kvm->arch.being_unmapped_rmap = NULL
  preempt_enable

Other paths, like zap-sp and mmu-notify, which are protected
by mmu-lock:
  clear spte and reset rmap entry
retry:
  if (kvm->arch.being_unmapped_rmap == rmap)
goto retry
(the wait is very rare and clearing one rmap is very fast, so it
is not bad even if a wait is needed)

Then we can be sure the spte is always available while we do
unmap_memslot_rmap_nolock.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/mmu.c  |  114 ---
 arch/x86/kvm/mmu.h  |2 +-
 3 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5fd6ed1..1ad9a34 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -536,6 +536,8 @@ struct kvm_arch {
 * Hash table of struct kvm_mmu_page.
 */
struct list_head active_mmu_pages;
+   unsigned long *being_unmapped_rmap;
+
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
int iommu_flags;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a7a5d0..e6414d2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1104,10 +1104,10 @@ static int slot_rmap_add(struct kvm_memory_slot *slot,
return slot->arch.ops->rmap_add(vcpu, spte, rmapp);
 }
 
-static void slot_rmap_remove(struct kvm_memory_slot *slot,
+static void slot_rmap_remove(struct kvm_memory_slot *slot, struct kvm *kvm,
 unsigned long *rmapp, u64 *spte)
 {
-   slot->arch.ops->rmap_remove(spte, rmapp);
+   slot->arch.ops->rmap_remove(kvm, spte, rmapp);
 }
 
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -1132,7 +1132,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
sp = page_header(__pa(spte));
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
rmapp = gfn_to_rmap(kvm, &slot, gfn, sp->role.level);
-   slot_rmap_remove(slot, rmapp, spte);
+   slot_rmap_remove(slot, kvm, rmapp, spte);
 }
 
 /*
@@ -1589,9 +1589,14 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, slot_rmap_test_age);
 }
 
+static void rmap_remove_spte(struct kvm *kvm, u64 *spte, unsigned long *rmapp)
+{
+   pte_list_remove(spte, rmapp);
+}
+
 static struct rmap_operations normal_rmap_ops = {
.rmap_add = pte_list_add,
-   .rmap_remove = pte_list_remove,
+   .rmap_remove = rmap_remove_spte,
 
.rmap_write_protect = __rmap_write_protect,
 
@@ -1613,9 +1618,27 @@ static int invalid_rmap_add(struct kvm_vcpu *vcpu, u64 
*spte,
return 0;
 }
 
-static void invalid_rmap_remove(u64 *spte, unsigned long *rmapp)
+static void sync_being_unmapped_rmap(struct kvm *kvm, unsigned long *rmapp)
+{
+   /*
+* Ensure all the sptes on the rmap have been zapped and
+* the rmap's entries have been reset so that
+* unmap_invalid_rmap_nolock can not get any spte from the
+* rmap after calling sync_being_unmapped_rmap().
+*/
+   smp_mb();
+retry:
+   if (unlikely(ACCESS_ONCE(kvm->arch.being_unmapped_rmap) == rmapp)) {
+   cpu_relax();
+   goto retry;
+   }
+}
+
+static void
+invalid_rmap_remove(struct kvm *kvm, u64 *spte, unsigned long *rmapp)
 {
pte_list_clear_concurrently(spte, rmapp);
+   sync_being_unmapped_rmap(kvm, rmapp);
 }
 
 static bool invalid_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
@@ -1635,7 +1658,11 @@ static int __kvm_unmap_invalid_rmapp(unsigned long 
*rmapp)
if (sptep == PTE_LIST_SPTE_SKIP)
continue;
 
-   /* Do not call .rmap_remove(). */
+   /*
+* Do not call .rmap_remove() since we do not want to wait
+* on sync_being_unmapped_rmap() when all sptes should be
+* removed from the rmap.
+*/
if (mmu_spte_clear_track_bits(sptep))
pte_list_clear_concurrently(sptep, rmapp);
}
@@ -1645,7 +1672,10 @@ static int __kvm_unmap_invalid_rmapp(unsigned long 
*rmapp)
 
 static int kvm_unmap_invalid_rmapp(struct kvm *kvm, unsigned long *rmapp)
 {
-   return __kvm_unmap_invalid_rmapp(rmapp);
+   int ret = __kvm_unmap_invalid_rmapp(rmapp);
+
+   sync_being_unmapped_rmap(kvm, rmapp);
+   return ret;
 }
 
 static int invalid_rmap_set_pte(struct kvm *kvm, unsigned long

[PATCH v3 13/15] KVM: x86: use the fast way to invalid all pages

2013-04-15 Thread Xiao Guangrong
Replace kvm_mmu_zap_all with kvm_mmu_invalid_all_pages except on
the path of mmu_notifier->release(), which will be replaced in
a later patch.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6dbb80c..6e7c85b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5465,7 +5465,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt 
*ctxt)
 * to ensure that the updated hypercall appears atomically across all
 * VCPUs.
 */
-   kvm_mmu_zap_all(vcpu->kvm);
+   kvm_mmu_invalid_memslot_pages(vcpu->kvm, NULL);
 
kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
@@ -7062,7 +7062,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
   struct kvm_memory_slot *slot)
 {
-   kvm_arch_flush_shadow_all(kvm);
+   kvm_mmu_invalid_memslot_pages(kvm, slot);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 14/15] KVM: move srcu_read_lock/srcu_read_unlock to arch-specified code

2013-04-15 Thread Xiao Guangrong
Move the srcu_read_lock/srcu_read_unlock in kvm_mmu_notifier_release
into kvm_arch_flush_shadow_all since we will hold the slot-lock instead
of srcu.

Only ARM, POWERPC and x86 use mmu-notify, and
kvm_arch_flush_shadow_all on ARM and POWERPC does nothing, so we
only need to modify the code on x86.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c  |4 
 virt/kvm/kvm_main.c |3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6e7c85b..d3dd0d5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7056,7 +7056,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
+   int idx;
+
+   idx = srcu_read_lock(&kvm->srcu);
kvm_mmu_zap_all(kvm);
+   srcu_read_unlock(&kvm->srcu, idx);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index acc9f30..f48eef9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -418,11 +418,8 @@ static void kvm_mmu_notifier_release(struct mmu_notifier 
*mn,
 struct mm_struct *mm)
 {
struct kvm *kvm = mmu_notifier_to_kvm(mn);
-   int idx;
 
-   idx = srcu_read_lock(&kvm->srcu);
kvm_arch_flush_shadow_all(kvm);
-   srcu_read_unlock(&kvm->srcu, idx);
 }
 
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 12/15] KVM: MMU: fast invalid all shadow pages

2013-04-15 Thread Xiao Guangrong
The current kvm_mmu_zap_all is really slow - it holds mmu-lock to
walk and zap all shadow pages one by one, and it also needs to zap every guest
page's rmap and every shadow page's parent spte list. Things become
particularly bad if the guest uses more memory or vcpus. It is not good for
scalability.

In this patch, we introduce a faster way to invalidate all shadow pages.
KVM maintains a global mmu invalid generation-number which is stored in
kvm->arch.mmu_valid_gen, and every shadow page stores the current global
generation-number into sp->mmu_valid_gen when it is created.

When KVM needs to zap all shadow pages' sptes, it simply increases the
global generation-number and then reloads the root shadow pages on all vcpus.
Each vcpu will create a new shadow page table according to kvm's current
generation-number. This ensures the old pages are not used any more.

The invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
are kept in the mmu-cache until the page allocator reclaims them.

If the invalidation is due to a memslot change, the slot's rmap and lpage-info
will be freed soon; in order to avoid using invalid memory, we unmap
all sptes on its rmap and always reset the lpage-info of all memslots so
that the rmap and lpage-info can be safely freed.
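
To illustrate how the generation number is meant to be consumed, here is a
minimal sketch (is_obsolete_sp() is an illustrative name, not something this
patch adds; the assumption is simply that some later check compares the two
fields introduced here):

/* A shadow page created before the last invalidation is stale. */
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}

The idea described above is that vcpus reload their roots and build fresh
shadow pages tagged with the new generation, so stale pages are never reused
and can be reclaimed lazily instead of being zapped one by one under mmu-lock.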

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/mmu.c  |   85 +-
 arch/x86/kvm/mmu.h  |4 ++
 arch/x86/kvm/x86.c  |6 +++
 4 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1ad9a34..6f8ee18 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -223,6 +223,7 @@ struct kvm_mmu_page {
int root_count;  /* Currently serving as active root */
unsigned int unsync_children;
unsigned long parent_ptes;  /* Reverse mapping for parent_pte */
+   unsigned long mmu_valid_gen;
DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -531,6 +532,7 @@ struct kvm_arch {
unsigned int n_requested_mmu_pages;
unsigned int n_max_mmu_pages;
unsigned int indirect_shadow_pages;
+   unsigned long mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9ac584f..12129b7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1732,6 +1732,11 @@ static struct rmap_operations invalid_rmap_ops = {
.rmap_unmap = kvm_unmap_invalid_rmapp
 };
 
+static void init_invalid_memslot_rmap_ops(struct kvm_memory_slot *slot)
+{
+   slot->arch.ops = &invalid_rmap_ops;
+}
+
 typedef void (*handle_rmap_fun)(unsigned long *rmapp, void *data);
 static void walk_memslot_rmap_nolock(struct kvm_memory_slot *slot,
 handle_rmap_fun fun, void *data)
@@ -1812,6 +1817,65 @@ void free_meslot_rmap_desc_nolock(struct kvm_memory_slot 
*slot)
walk_memslot_rmap_nolock(slot, free_rmap_desc_nolock, NULL);
 }
 
+/*
+ * Fast invalid all shadow pages belong to @slot.
+ *
+ * @slot != NULL means the invalidation is caused the memslot specified
+ * by @slot is being deleted, in this case, we should ensure that rmap
+ * and lpage-info of the @slot can not be used after calling the function.
+ * Specially, if @slot is INVALID_ALL_SLOTS, all slots will be deleted
+ * soon, it always happens when kvm is being destroyed.
+ *
+ * @slot == NULL means the invalidation due to other reasons, we need
+ * not care rmap and lpage-info since they are still valid after calling
+ * the function.
+ */
+void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
+  struct kvm_memory_slot *slot)
+{
+   struct kvm_memory_slot *each_slot;
+
+   spin_lock(&kvm->mmu_lock);
+   kvm->arch.mmu_valid_gen++;
+
+   if (slot == INVALID_ALL_SLOTS)
+   kvm_for_each_memslot(each_slot, kvm_memslots(kvm))
+   init_invalid_memslot_rmap_ops(each_slot);
+   else if (slot)
+   init_invalid_memslot_rmap_ops(slot);
+
+   /*
+* All shadow paes are invalid, reset the large page info,
+* then we can safely desotry the memslot, it is also good
+* for large page used.
+*/
+   kvm_clear_all_lpage_info(kvm);
+
+   /*
+* Notify all vcpus to reload its shadow page table
+* and flush TLB. Then all vcpus will switch to new
+* shadow page table with the new mmu_valid_gen.
+*
+* Note: we should do this under the protection of
+* mmu-lock, otherwise, vcpu would purge shadow page
+* but miss tlb flush.
+*/
+   kvm_reload_remote_mmus(kvm);
+   spin_unlock(&kvm->mmu_lock);
+
+   if (slot == INVALID_ALL_SLOTS)
+

[PATCH v3 15/15] KVM: MMU: replace kvm_zap_all with kvm_mmu_invalid_all_pages

2013-04-15 Thread Xiao Guangrong
Use kvm_mmu_invalid_all_pages in kvm_arch_flush_shadow_all and
rename kvm_zap_all to kvm_free_all, which is used to free all
memory used by the kvm mmu when the vm is being destroyed; at that time,
no vcpu exists and mmu-notify has been unregistered, so we can
free the shadow pages out of mmu-lock.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/kvm_host.h |2 +-
 arch/x86/kvm/mmu.c  |   15 ++-
 arch/x86/kvm/x86.c  |9 -
 3 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6f8ee18..a336055 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -771,7 +771,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int 
slot);
 void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 struct kvm_memory_slot *slot,
 gfn_t gfn_offset, unsigned long mask);
-void kvm_mmu_zap_all(struct kvm *kvm);
+void kvm_mmu_free_all(struct kvm *kvm);
 void kvm_arch_init_generation(struct kvm *kvm);
 void kvm_mmu_invalid_mmio_sptes(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 12129b7..10c43ea 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4639,28 +4639,17 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 
int slot)
spin_unlock(&kvm->mmu_lock);
 }
 
-void kvm_mmu_zap_all(struct kvm *kvm)
+void kvm_mmu_free_all(struct kvm *kvm)
 {
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
 
-   might_sleep();
-
-   spin_lock(&kvm->mmu_lock);
 restart:
-   list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+   list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
goto restart;
 
-   if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
-   kvm_mmu_commit_zap_page(kvm, &invalid_list);
-   cond_resched_lock(&kvm->mmu_lock);
-   goto restart;
-   }
-   }
-
kvm_mmu_commit_zap_page(kvm, &invalid_list);
-   spin_unlock(&kvm->mmu_lock);
 }
 
 static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d3dd0d5..4bb88f5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6840,6 +6840,7 @@ void kvm_arch_sync_events(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+   kvm_mmu_free_all(kvm);
kvm_iommu_unmap_guest(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
@@ -7056,11 +7057,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
-   int idx;
-
-   idx = srcu_read_lock(&kvm->srcu);
-   kvm_mmu_zap_all(kvm);
-   srcu_read_unlock(&kvm->srcu, idx);
+   mutex_lock(&kvm->slots_lock);
+   kvm_mmu_invalid_memslot_pages(kvm, INVALID_ALL_SLOTS);
+   mutex_unlock(&kvm->slots_lock);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 10/15] KVM: x86: introduce memslot_set_lpage_disallowed

2013-04-15 Thread Xiao Guangrong
It is used to mark large pages as disallowed on the specified level; it
can be used in a later patch.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |   53 ++-
 1 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bec83cd..0c5bb2c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6875,13 +6875,46 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
}
 }
 
+static void memslot_set_lpage_disallowed(struct kvm_memory_slot *slot,
+unsigned long npages,
+int lpage_size, int lpages)
+{
+   struct kvm_lpage_info *lpage_info;
+   unsigned long ugfn;
+   int level = lpage_size + 1;
+
+   WARN_ON(!lpage_size);
+
+   lpage_info = slot->arch.lpage_info[lpage_size - 1];
+
+   if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+   lpage_info[0].write_count = 1;
+
+   if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+   lpage_info[lpages - 1].write_count = 1;
+
+   ugfn = slot->userspace_addr >> PAGE_SHIFT;
+
+   /*
+* If the gfn and userspace address are not aligned wrt each
+* other, or if explicitly asked to, disable large page
+* support for this slot
+*/
+   if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+ !kvm_largepages_enabled()) {
+   unsigned long j;
+
+   for (j = 0; j < lpages; ++j)
+   lpage_info[j].write_count = 1;
+   }
+}
+
 static int kvm_arch_create_memslot(struct kvm_memory_slot *slot)
 {
unsigned long npages = slot->npages;
int i;
 
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-   unsigned long ugfn;
int lpages;
int level = i + 1;
 
@@ -6900,23 +6933,7 @@ static int kvm_arch_create_memslot(struct 
kvm_memory_slot *slot)
if (!slot->arch.lpage_info[i - 1])
goto out_free;
 
-   if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-   slot->arch.lpage_info[i - 1][0].write_count = 1;
-   if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 
1))
-   slot->arch.lpage_info[i - 1][lpages - 1].write_count = 
1;
-   ugfn = slot->userspace_addr >> PAGE_SHIFT;
-   /*
-* If the gfn and userspace address are not aligned wrt each
-* other, or if explicitly asked to, disable large page
-* support for this slot
-*/
-   if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) 
||
-   !kvm_largepages_enabled()) {
-   unsigned long j;
-
-   for (j = 0; j < lpages; ++j)
-   slot->arch.lpage_info[i - 1][j].write_count = 1;
-   }
+   memslot_set_lpage_disallowed(slot, npages, i, lpages);
}
 
init_memslot_rmap_ops(slot);
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 04/15] KVM: MMU: abstract memslot rmap related operations

2013-04-15 Thread Xiao Guangrong
Introduce slot_rmap_* functions to abstract memslot rmap related
operations, which makes the later patches clearer.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c   |  108 +-
 arch/x86/kvm/mmu_audit.c |   10 +++--
 2 files changed, 84 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dcc059c..514f5b1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1033,14 +1033,14 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int 
level,
 }
 
 /*
- * Take gfn and return the reverse mapping to it.
+ * Take gfn and return the memslot and reverse mapping to it.
  */
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
+static unsigned long *gfn_to_rmap(struct kvm *kvm,
+ struct kvm_memory_slot **slot,
+ gfn_t gfn, int level)
 {
-   struct kvm_memory_slot *slot;
-
-   slot = gfn_to_memslot(kvm, gfn);
-   return __gfn_to_rmap(gfn, level, slot);
+   *slot = gfn_to_memslot(kvm, gfn);
+   return __gfn_to_rmap(gfn, level, *slot);
 }
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -1051,27 +1051,42 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu)
return mmu_memory_cache_free_objects(cache);
 }
 
+static int slot_rmap_add(struct kvm_memory_slot *slot,
+struct kvm_vcpu *vcpu, unsigned long *rmapp,
+u64 *spte)
+{
+   return pte_list_add(vcpu, spte, rmapp);
+}
+
+static void slot_rmap_remove(struct kvm_memory_slot *slot,
+unsigned long *rmapp, u64 *spte)
+{
+   pte_list_remove(spte, rmapp);
+}
+
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 {
+   struct kvm_memory_slot *slot;
struct kvm_mmu_page *sp;
unsigned long *rmapp;
 
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
-   rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
-   return pte_list_add(vcpu, spte, rmapp);
+   rmapp = gfn_to_rmap(vcpu->kvm,  &slot, gfn, sp->role.level);
+   return slot_rmap_add(slot, vcpu, rmapp, spte);
 }
 
 static void rmap_remove(struct kvm *kvm, u64 *spte)
 {
+   struct kvm_memory_slot *slot;
struct kvm_mmu_page *sp;
gfn_t gfn;
unsigned long *rmapp;
 
sp = page_header(__pa(spte));
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
-   rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
-   pte_list_remove(spte, rmapp);
+   rmapp = gfn_to_rmap(kvm, &slot, gfn, sp->role.level);
+   slot_rmap_remove(slot, rmapp, spte);
 }
 
 /*
@@ -1219,6 +1234,13 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
return flush;
 }
 
+static bool slot_rmap_write_protect(struct kvm_memory_slot *slot,
+   struct kvm *kvm, unsigned long *rmapp,
+   bool pt_protect)
+{
+   return __rmap_write_protect(kvm, rmapp, pt_protect);
+}
+
 /**
  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
  * @kvm: kvm instance
@@ -1238,7 +1260,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
  PT_PAGE_TABLE_LEVEL, slot);
-   __rmap_write_protect(kvm, rmapp, false);
+   slot_rmap_write_protect(slot, kvm, rmapp, false);
 
/* clear the first set bit */
mask &= mask - 1;
@@ -1257,14 +1279,14 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
for (i = PT_PAGE_TABLE_LEVEL;
 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
rmapp = __gfn_to_rmap(gfn, i, slot);
-   write_protected |= __rmap_write_protect(kvm, rmapp, true);
+   write_protected |= slot_rmap_write_protect(slot, kvm, rmapp,
+  true);
}
 
return write_protected;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
-  struct kvm_memory_slot *slot, unsigned long data)
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
 {
u64 *sptep;
struct rmap_iterator iter;
@@ -1281,14 +1303,19 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
return need_tlb_flush;
 }
 
+static int slot_rmap_unmap(struct kvm *kvm, unsigned long *rmapp,
+  struct kvm_memory_slot *slot, unsigned long data)
+{
+   return kvm_unmap_rmapp(kvm, rmapp);
+}
+
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
-struct kvm_memory_slot *slot, unsigned long data)
+pte_t *ptep)
 {
u64 *sptep;

[PATCH v3 01/15] KVM: x86: clean up and optimize for kvm_arch_free_memslot

2013-04-15 Thread Xiao Guangrong
The memslot rmap and lpage-info are never partly reused, and nothing
needs to be freed when a new memslot is created.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |   21 -
 1 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4be4733..b0be7ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6856,19 +6856,22 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 {
int i;
 
+   if (dont && dont->arch.rmap[0] == free->arch.rmap[0])
+   return;
+
+   /* It is an empty memslot. */
+   if (!free->arch.rmap[0])
+   return;
+
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-   if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
-   kvm_kvfree(free->arch.rmap[i]);
-   free->arch.rmap[i] = NULL;
-   }
+   kvm_kvfree(free->arch.rmap[i]);
+   free->arch.rmap[i] = NULL;
+
if (i == 0)
continue;
 
-   if (!dont || free->arch.lpage_info[i - 1] !=
-dont->arch.lpage_info[i - 1]) {
-   kvm_kvfree(free->arch.lpage_info[i - 1]);
-   free->arch.lpage_info[i - 1] = NULL;
-   }
+   kvm_kvfree(free->arch.lpage_info[i - 1]);
+   free->arch.lpage_info[i - 1] = NULL;
}
 }
 
-- 
1.7.7.6



[PATCH v3 00/15] KVM: MMU: fast zap all shadow pages

2013-04-15 Thread Xiao Guangrong
This patchset is based on my previous two patchsets:
[PATCH 0/2] KVM: x86: avoid potential soft lockup and unneeded mmu reload
(https://lkml.org/lkml/2013/4/1/2)

[PATCH v2 0/6] KVM: MMU: fast invalid all mmio sptes
(https://lkml.org/lkml/2013/4/1/134)

Changelog:
V3:
  completely redesign the algorithm, please see below.

V2:
  - do not reset n_requested_mmu_pages and n_max_mmu_pages
  - batch free root shadow pages to reduce vcpu notification and mmu-lock
contention
  - remove the first patch that introduced kvm->arch.mmu_cache since we only
'memset zero' the hashtable rather than all mmu cache members in this
version
  - remove unnecessary kvm_reload_remote_mmus after kvm_mmu_zap_all

* Issue
The current kvm_mmu_zap_all is really slow - it holds mmu-lock while it
walks and zaps all shadow pages one by one, and it also needs to zap all
guest pages' rmaps and all shadow pages' parent spte lists. Things become
even worse if the guest uses more memory or vcpus. It does not scale.

* Idea
KVM maintains a global mmu invalid generation-number which is stored in
kvm->arch.mmu_valid_gen and every shadow page stores the current global
generation-number into sp->mmu_valid_gen when it is created.

When KVM needs to zap all shadow page sptes, it simply increases the
global generation-number and then reloads the root shadow pages on all
vcpus. Each vcpu creates a new shadow page table according to kvm's
current generation-number, which ensures the old pages are not used any
more.

The invalid-gen pages (sp->mmu_valid_gen != kvm->arch.mmu_valid_gen)
are kept in the mmu cache until the page allocator reclaims them.
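
Roughly, the check and the invalidation boil down to the sketch below;
sp->mmu_valid_gen and kvm->arch.mmu_valid_gen are the fields described
above, while the helper names (is_obsolete_sp, mmu_invalidate_all_pages)
are only illustrative and need not match the patches exactly:

static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        /* a page created under an older generation-number is obsolete */
        return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}

static void mmu_invalidate_all_pages(struct kvm *kvm)
{
        spin_lock(&kvm->mmu_lock);
        /* every existing shadow page becomes obsolete at this point */
        kvm->arch.mmu_valid_gen++;
        spin_unlock(&kvm->mmu_lock);

        /* force every vcpu to drop its root and rebuild it from the
         * current generation */
        kvm_reload_remote_mmus(kvm);
}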

* Challenges
Page invalidation is requested when a memslot is moved or deleted and when
kvm is being destroyed; these paths call zap_all_pages, which deletes all
sps using their rmap and lpage-info, and after zap_all_pages returns the
rmap and lpage-info are freed. So we should implement a fast way to delete
sps from the rmap and lpage-info.

For the lpage-info, we clear all lpage counts when doing zap-all-pages, so
invalid shadow pages are no longer counted in lpage-info and the lpage-info
of the invalid memslot can then be safely freed. This is also good for
performance - it allows the guest to use hugepages as far as possible.
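
As a rough sketch, clearing one memslot's lpage counts could look like the
code below. The real helper in this series is kvm_clear_all_lpage_info
(patch 11), so the function name, the gfn_to_index()-based entry count and
the note about memslot_set_lpage_disallowed (patch 10) are assumptions of
the sketch rather than code taken from the patches:

static void clear_memslot_lpage_info(struct kvm_memory_slot *slot)
{
        int i;

        for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
                int level = i + 1;
                int lpages = gfn_to_index(slot->base_gfn + slot->npages - 1,
                                          slot->base_gfn, level) + 1;

                /* forget every write_count so obsolete shadow pages no
                 * longer pin lpage-info */
                memset(slot->arch.lpage_info[i - 1], 0,
                       lpages * sizeof(*slot->arch.lpage_info[i - 1]));
        }

        /* the statically disallowed entries (alignment, largepages off)
         * are then re-marked via memslot_set_lpage_disallowed() */
}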

For the rmap, we introduce a way to unmap the rmap outside of mmu-lock.
In order to do that, we should resolve these problems:
1) do not corrupt the rmap
2) keep pte-list-descs available
3) keep shadow page available

Resolve 1):
we make the invalid rmap remove-only, which means we only delete and
clear sptes from the rmap; no new sptes can be added to it.
This is reasonable since kvm can not do address translation on an invalid
rmap (gfn_to_pfn fails on an invalid memslot) and no spte on an invalid
rmap can be reused (they all belong to invalid shadow pages).

Resolve 2):
We use a placeholder (PTE_LIST_SPTE_SKIP) to indicate that a spte has been
deleted from the rmap, instead of freeing pte-list-descs and moving sptes.
The pte-list-desc entries therefore stay available while the rmap is being
unmapped concurrently. The pte-list-descs are freed once the memslot is no
longer visible to any vcpu.
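
A minimal sketch of the remove-only deletion, assuming the existing
pte_list_desc layout in mmu.c (an sptes[PTE_LIST_EXT] array chained by
->more, with the low bit of the rmap word marking a desc chain); the
helper name and the sentinel value chosen for PTE_LIST_SPTE_SKIP are
illustrative:

#define PTE_LIST_SPTE_SKIP      ((u64 *)-1)     /* illustrative value */

static void pte_list_clear_concurrently(u64 *spte, unsigned long *rmapp)
{
        struct pte_list_desc *desc;
        int i;

        if (!*rmapp)
                return;

        /* single-spte rmap: overwrite the entry with the placeholder */
        if (!(*rmapp & 1)) {
                if (*rmapp == (unsigned long)spte)
                        *rmapp = (unsigned long)PTE_LIST_SPTE_SKIP;
                return;
        }

        /* desc chain: mark the slot as skipped; never move sptes or free
         * descs while lockless walkers may still be looking at them */
        desc = (struct pte_list_desc *)(*rmapp & ~1ul);
        while (desc) {
                for (i = 0; i < PTE_LIST_EXT; ++i) {
                        if (desc->sptes[i] == spte) {
                                desc->sptes[i] = PTE_LIST_SPTE_SKIP;
                                return;
                        }
                }
                desc = desc->more;
        }
}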

Resolve 3):
we protect the lifecycle of the sp with this algorithm:

unmap-rmap-out-of-mmu-lock():
for-each-rmap-in-slot:
  preempt_disable
  kvm->arch.being_unmapped_rmap = rmapp

  clear spte and reset rmap entry

  kvm->arch.being_unmapped_rmap = NULL
  preempt_enable

Other paths, like zap-sp and mmu-notify, which are protected
by mmu-lock:

  clear spte and reset rmap entry
retry:
  if (kvm->arch.being_unmapped_rmap == rmap)
goto retry
(the wait is very rare and clearing one rmap is very fast, so it
is not bad even if waiting is needed)

Then, we can be sure the spte is always available while we concurrently
unmap the rmap.
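
The same protocol rendered as C, just to make the pairing explicit. The
kvm->arch.being_unmapped_rmap field comes from the description above; the
helper names, the __unmap_one_rmap() callee and the memory barriers are
assumptions of this sketch, not code taken from the series:

/* lockless side: walk one invalid rmap outside of mmu-lock */
static void unmap_invalid_rmap_nolock(struct kvm *kvm, unsigned long *rmapp)
{
        preempt_disable();
        kvm->arch.being_unmapped_rmap = rmapp;
        smp_mb();                       /* publish before touching the rmap */

        __unmap_one_rmap(kvm, rmapp);   /* clear sptes, reset rmap entries */

        smp_mb();                       /* finish the rmap before unpublishing */
        kvm->arch.being_unmapped_rmap = NULL;
        preempt_enable();
}

/* mmu-lock side (zap-sp, mmu-notify): after clearing an rmap, wait until
 * no lockless walker is still using it */
static void sync_being_unmapped_rmap(struct kvm *kvm, unsigned long *rmapp)
{
        smp_mb();
        while (ACCESS_ONCE(kvm->arch.being_unmapped_rmap) == rmapp)
                cpu_relax();
}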


* TODO
Use a better algorithm to free pte-list-descs; for example, we can link
them together through desc->more.

* Performance
We observably reduce mmu-lock contention and make the invalidation
preemptible.

Xiao Guangrong (15):
  KVM: x86: clean up and optimize for kvm_arch_free_memslot
  KVM: fold kvm_arch_create_memslot into kvm_arch_prepare_memory_region
  KVM: x86: do not reuse rmap when memslot is moved
  KVM: MMU: abstract memslot rmap related operations
  KVM: MMU: allow per-rmap operations
  KVM: MMU: allow concurrently clearing spte on remove-only pte-list
  KVM: MMU: introduce invalid rmap handlers
  KVM: MMU: allow unmap invalid rmap out of mmu-lock
  KVM: MMU: introduce free_meslot_rmap_desc_nolock
  KVM: x86: introduce memslot_set_lpage_disallowed
  KVM: MMU: introduce kvm_clear_all_lpage_info
  KVM: MMU: fast invalid all shadow pages
  KVM: x86: use the fast way to invalid all pages
  KVM: move srcu_read_lock/srcu_read_unlock to arch-specified code
  KVM: MMU: replace kvm_zap_all with kvm_mmu_invalid_all_pages

 arch/arm/kvm/arm.c  |5 -
 arch/ia64/kvm/kvm-ia64.c|5 -
 arch/powerpc/kvm/powerpc.c  |8 +-

[PATCH v3 02/15] KVM: fold kvm_arch_create_memslot into kvm_arch_prepare_memory_region

2013-04-15 Thread Xiao Guangrong
It removes an arch-specific interface and also removes unnecessary
empty functions on some architectures.

Signed-off-by: Xiao Guangrong 
---
 arch/arm/kvm/arm.c |5 -
 arch/ia64/kvm/kvm-ia64.c   |5 -
 arch/powerpc/kvm/powerpc.c |8 ++--
 arch/s390/kvm/kvm-s390.c   |5 -
 arch/x86/kvm/x86.c |7 ++-
 include/linux/kvm_host.h   |1 -
 virt/kvm/kvm_main.c|8 ++--
 7 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index e4ad0bb..c76e63e 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -159,11 +159,6 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 {
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
-{
-   return 0;
-}
-
 /**
  * kvm_arch_destroy_vm - destroy the VM data structure
  * @kvm:   pointer to the KVM struct
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 7a54455..fcfb03b 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1553,11 +1553,6 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 {
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
-{
-   return 0;
-}
-
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_userspace_memory_region *mem,
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 16b4595..aab8039 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -405,9 +405,9 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
kvmppc_core_free_memslot(free, dont);
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+static int kvm_arch_create_memslot(struct kvm_memory_slot *slot)
 {
-   return kvmppc_core_create_memslot(slot, npages);
+   return kvmppc_core_create_memslot(slot, slot->npages);
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -415,6 +415,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
   struct kvm_userspace_memory_region *mem,
   enum kvm_mr_change change)
 {
+   if (change == KVM_MR_CREATE)
+   if (kvm_arch_create_memslot(memslot))
+   return -ENOMEM;
+
return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 33161b4..7bfd6f6 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -967,11 +967,6 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
 {
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
-{
-   return 0;
-}
-
 /* Section: memory related */
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
   struct kvm_memory_slot *memslot,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b0be7ec..447789c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6875,8 +6875,9 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
}
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+static int kvm_arch_create_memslot(struct kvm_memory_slot *slot)
 {
+   unsigned long npages = slot->npages;
int i;
 
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
@@ -6938,6 +6939,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
 {
+   if (change == KVM_MR_CREATE)
+   if (kvm_arch_create_memslot(memslot))
+   return -ENOMEM;
+
/*
 * Only private memory slots need to be mapped here since
 * KVM_SET_MEMORY_REGION ioctl is no longer supported.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c0be23..f39ec18 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -493,7 +493,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem);
 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
   struct kvm_memory_slot *dont);
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_userspace_memory_region *mem,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d21694a..acc9f30 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -825,13 +825,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
new.dirty_bitmap = NULL;
 
-   r = -ENOMEM;
-   if (change == 

[PATCH v3 05/15] KVM: MMU: allow per-rmap operations

2013-04-15 Thread Xiao Guangrong
Introduce rmap_operations to allow rmaps to have different operations;
this lets us handle invalid rmaps specially.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/mmu.c  |   31 ---
 arch/x86/kvm/mmu.h  |   16 
 arch/x86/kvm/x86.c  |1 +
 4 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4e1f7cb..5fd6ed1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -511,6 +511,7 @@ struct kvm_lpage_info {
 };
 
 struct kvm_arch_memory_slot {
+   struct rmap_operations *ops;
unsigned long *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 };
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 514f5b1..99ad2a4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1055,13 +1055,13 @@ static int slot_rmap_add(struct kvm_memory_slot *slot,
 struct kvm_vcpu *vcpu, unsigned long *rmapp,
 u64 *spte)
 {
-   return pte_list_add(vcpu, spte, rmapp);
+   return slot->arch.ops->rmap_add(vcpu, spte, rmapp);
 }
 
 static void slot_rmap_remove(struct kvm_memory_slot *slot,
 unsigned long *rmapp, u64 *spte)
 {
-   pte_list_remove(spte, rmapp);
+   slot->arch.ops->rmap_remove(spte, rmapp);
 }
 
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -1238,7 +1238,7 @@ static bool slot_rmap_write_protect(struct kvm_memory_slot *slot,
struct kvm *kvm, unsigned long *rmapp,
bool pt_protect)
 {
-   return __rmap_write_protect(kvm, rmapp, pt_protect);
+   return slot->arch.ops->rmap_write_protect(kvm, rmapp, pt_protect);
 }
 
 /**
@@ -1306,7 +1306,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
 static int slot_rmap_unmap(struct kvm *kvm, unsigned long *rmapp,
   struct kvm_memory_slot *slot, unsigned long data)
 {
-   return kvm_unmap_rmapp(kvm, rmapp);
+   return slot->arch.ops->rmap_unmap(kvm, rmapp);
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
@@ -1353,7 +1353,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 static int slot_rmap_set_pte(struct kvm *kvm, unsigned long *rmapp,
 struct kvm_memory_slot *slot, unsigned long data)
 {
-   return kvm_set_pte_rmapp(kvm, rmapp, (pte_t *)data);
+   return slot->arch.ops->rmap_set_pte(kvm, rmapp, (pte_t *)data);
 }
 
 static int kvm_handle_hva_range(struct kvm *kvm,
@@ -1470,7 +1470,7 @@ out:
 static int slot_rmap_age(struct kvm *kvm, unsigned long *rmapp,
 struct kvm_memory_slot *slot, unsigned long data)
 {
-   int young = kvm_age_rmapp(kvm, rmapp);
+   int young = slot->arch.ops->rmap_age(kvm, rmapp);
 
/* @data has hva passed to kvm_age_hva(). */
trace_kvm_age_page(data, slot, young);
@@ -1508,7 +1508,7 @@ static int slot_rmap_test_age(struct kvm *kvm, unsigned long *rmapp,
  struct kvm_memory_slot *slot,
  unsigned long data)
 {
-   return kvm_test_age_rmapp(kvm, rmapp);
+   return slot->arch.ops->rmap_test_age(kvm, rmapp);
 }
 
 #define RMAP_RECYCLE_THRESHOLD 1000
@@ -1537,6 +1537,23 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, slot_rmap_test_age);
 }
 
+static struct rmap_operations normal_rmap_ops = {
+   .rmap_add = pte_list_add,
+   .rmap_remove = pte_list_remove,
+
+   .rmap_write_protect = __rmap_write_protect,
+
+   .rmap_set_pte = kvm_set_pte_rmapp,
+   .rmap_age = kvm_age_rmapp,
+   .rmap_test_age = kvm_test_age_rmapp,
+   .rmap_unmap = kvm_unmap_rmapp
+};
+
+void init_memslot_rmap_ops(struct kvm_memory_slot *slot)
+{
+   slot->arch.ops = &normal_rmap_ops;
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index ffd40d1..bb2b22e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -114,4 +114,20 @@ static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
 }
 
+struct rmap_operations {
+   int (*rmap_add)(struct kvm_vcpu *vcpu, u64 *spte,
+   unsigned long *rmap);
+   void (*rmap_remove)(u64 *spte, unsigned long *rmap);
+
+   bool (*rmap_write_protect)(struct kvm *kvm, unsigned long *rmap,
+  bool pt_protect);
+
+   int (*rmap_set_pte)(struct kvm *kvm, unsigned long *rmap,
+   pte_t *ptep);
+   int (*rmap_age)(struct kvm

Re: [PATCH v2 0/6] KVM: MMU: fast invalid all mmio sptes

2013-04-15 Thread Xiao Guangrong

Hi Marcelo,

On 04/16/2013 08:54 AM, Marcelo Tosatti wrote:
> On Mon, Apr 01, 2013 at 05:56:43PM +0800, Xiao Guangrong wrote:
>> Changelog in v2:
>>   - rename kvm_mmu_invalid_mmio_spte to kvm_mmu_invalid_mmio_sptes
>>   - use kvm->memslots->generation as kvm global generation-number
>>   - fix comment and codestyle
>>   - init kvm generation close to mmio wrap-around value
>>   - keep kvm_mmu_zap_mmio_sptes
>>
>> The current way is holding hot mmu-lock and walking all shadow pages, this
>> is not scale. This patchset tries to introduce a very simple and scale way
>> to fast invalid all mmio sptes - it need not walk any shadow pages and hold
>> any locks.
>>
>> The idea is simple:
>> KVM maintains a global mmio invalid generation-number which is stored in
>> kvm->memslots.generation and every mmio spte stores the current global
>> generation-number into his available bits when it is created
>>
>> When KVM need zap all mmio sptes, it just simply increase the global
>> generation-number. When guests do mmio access, KVM intercepts a MMIO #PF
>> then it walks the shadow page table and get the mmio spte. If the
>> generation-number on the spte does not equal the global generation-number,
>> it will go to the normal #PF handler to update the mmio spte
>>
>> Since 19 bits are used to store generation-number on mmio spte, we zap all
>> mmio sptes when the number is round
> 
> Hi Xiao,
> 
> Is it still necessary with generation numbers at 'struct shadow_page'
> level (which covers the slot deletion case).

Yes.

> 
> That is, once kvm_mmu_zap_all is fixed to increase generation count and
> nuke roots, can't that be used instead with similar effectiveness for
> SLOT_CREATE/SLOT_MOVE cases?

It seems not easy. :(

We can not increase kvm's generation count for SLOT_CREATE since any change
to kvm->generation_count will cause all vcpus to fault on _all_ memory
regions.

We also can not just update the mmio-sp's generation count instead of
zapping it: an sp can contain both mmio sptes and normal sptes, and the
normal sptes on a mmio-sp still need to be zapped.

Thanks!



[PATCH v2 4/6] KVM: MMU: fast invalid all mmio sptes

2013-04-01 Thread Xiao Guangrong
This patch introduces a very simple and scalable way to invalidate all
mmio sptes - it need not walk any shadow pages or hold mmu-lock.

KVM maintains a global mmio invalid generation-number which is stored in
kvm->memslots.generation, and every mmio spte stores the current global
generation-number in its available bits when it is created.

When KVM needs to zap all mmio sptes, it simply increases the global
generation-number. When the guest does an mmio access, KVM intercepts the
MMIO #PF, walks the shadow page table and gets the mmio spte. If the
generation-number on the spte does not equal the global generation-number,
KVM goes to the normal #PF handler to update the mmio spte.

Since 19 bits are used to store the generation-number on the mmio spte, we
zap all mmio sptes when the number wraps around.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/kvm_host.h |2 +-
 arch/x86/kvm/mmu.c  |   54 +--
 arch/x86/kvm/mmu.h  |3 ++
 arch/x86/kvm/paging_tmpl.h  |7 +++-
 arch/x86/kvm/vmx.c  |4 +++
 arch/x86/kvm/x86.c  |3 +-
 6 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b5a6462..6c1e642 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -767,7 +767,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 struct kvm_memory_slot *slot,
 gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm);
+void kvm_mmu_invalid_mmio_sptes(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 31c5586..1020152 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -205,9 +205,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 #define MMIO_SPTE_GEN_LOW_SHIFT3
 #define MMIO_SPTE_GEN_HIGH_SHIFT   52
 
+#define MMIO_GEN_SHIFT 19
 #define MMIO_GEN_LOW_SHIFT 9
 #define MMIO_GEN_LOW_MASK  ((1 << MMIO_GEN_LOW_SHIFT) - 1)
-#define MMIO_MAX_GEN   ((1 << 19) - 1)
+#define MMIO_GEN_MASK  ((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN   ((1 << MMIO_GEN_SHIFT) - 1)
 
 static u64 generation_mmio_spte_mask(unsigned int gen)
 {
@@ -231,15 +233,21 @@ static unsigned int get_mmio_spte_generation(u64 spte)
return gen;
 }
 
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+   return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
+}
+
 static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
   unsigned access)
 {
-   u64 mask = generation_mmio_spte_mask(0);
+   unsigned int gen = kvm_current_mmio_generation(kvm);
+   u64 mask = generation_mmio_spte_mask(gen);
 
access &= ACC_WRITE_MASK | ACC_USER_MASK;
mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
 
-   trace_mark_mmio_spte(sptep, gfn, access, 0);
+   trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
 }
 
@@ -269,6 +277,12 @@ static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
return false;
 }
 
+static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+{
+   return get_mmio_spte_generation(spte) ==
+ kvm_current_mmio_generation(kvm);
+}
+
 static inline u64 rsvd_bits(int s, int e)
 {
return ((1ULL << (e - s + 1)) - 1) << s;
@@ -3195,6 +3209,9 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
 
+   if (unlikely(!check_mmio_spte(vcpu->kvm, spte)))
+   return RET_MMIO_PF_INVALID;
+
if (direct)
addr = 0;
 
@@ -3236,8 +3253,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
 
-   if (unlikely(error_code & PFERR_RSVD_MASK))
-   return handle_mmio_page_fault(vcpu, gva, error_code, true);
+   if (unlikely(error_code & PFERR_RSVD_MASK)) {
+   r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+
+   if (likely(r != RET_MMIO_PF_INVALID))
+   return r;
+   }
 
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3313,8 +3334,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-   if (unlikely(e

[PATCH v2 0/6] KVM: MMU: fast invalid all mmio sptes

2013-04-01 Thread Xiao Guangrong
Changelog in v2:
  - rename kvm_mmu_invalid_mmio_spte to kvm_mmu_invalid_mmio_sptes
  - use kvm->memslots->generation as kvm global generation-number
  - fix comment and codestyle
  - init kvm generation close to mmio wrap-around value
  - keep kvm_mmu_zap_mmio_sptes

The current way holds the hot mmu-lock and walks all shadow pages; this
does not scale. This patchset introduces a very simple and scalable way to
invalidate all mmio sptes - it need not walk any shadow pages or hold any
locks.

The idea is simple:
KVM maintains a global mmio invalid generation-number which is stored in
kvm->memslots.generation, and every mmio spte stores the current global
generation-number in its available bits when it is created.

When KVM needs to zap all mmio sptes, it simply increases the global
generation-number. When the guest does an mmio access, KVM intercepts the
MMIO #PF, walks the shadow page table and gets the mmio spte. If the
generation-number on the spte does not equal the global generation-number,
KVM goes to the normal #PF handler to update the mmio spte.

Since 19 bits are used to store the generation-number on the mmio spte, we
zap all mmio sptes when the number wraps around.
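
In other words, the fast-path test is an equality check between the
generation stored in the spte and the current memslots generation, roughly
as sketched below; get_mmio_spte_generation, MMIO_GEN_MASK and
RET_MMIO_PF_INVALID appear in patch 4 above, while the function name and
RET_MMIO_PF_EMULATE (from patch 3 of the series) are assumed here:

static int mmio_spte_fault(struct kvm_vcpu *vcpu, u64 spte)
{
        unsigned int spte_gen = get_mmio_spte_generation(spte);
        unsigned int cur_gen = kvm_memslots(vcpu->kvm)->generation &
                               MMIO_GEN_MASK;

        /* stale generation: fall back to the normal #PF path, which
         * rebuilds the mmio spte with the current generation-number */
        if (unlikely(spte_gen != cur_gen))
                return RET_MMIO_PF_INVALID;

        /* generation matches: the cached gfn/access bits are still valid */
        return RET_MMIO_PF_EMULATE;
}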

Xiao Guangrong (6):
  KVM: MMU: retain more available bits on mmio spte
  KVM: MMU: store generation-number into mmio spte
  KVM: MMU: make return value of mmio page fault handler more readable
  KVM: MMU: fast invalid all mmio sptes
  KVM: MMU: add tracepoint for check_mmio_spte
  KVM: MMU: init kvm generation close to mmio wrap-around value

 arch/x86/include/asm/kvm_host.h |3 +-
 arch/x86/kvm/mmu.c  |  134 +++
 arch/x86/kvm/mmu.h  |   17 +
 arch/x86/kvm/mmutrace.h |   34 +-
 arch/x86/kvm/paging_tmpl.h  |   10 ++-
 arch/x86/kvm/vmx.c  |   12 +++-
 arch/x86/kvm/x86.c  |   11 +++-
 virt/kvm/kvm_main.c |6 ++
 8 files changed, 186 insertions(+), 41 deletions(-)

-- 
1.7.7.6


