On 31/03/21 23:08, Ben Gardon wrote:
To reduce lock contention and interference with page fault handlers,
allow the TDP MMU function that zaps a GFN range to operate under the
MMU read lock.

Signed-off-by: Ben Gardon <bgar...@google.com>
---
  arch/x86/kvm/mmu/mmu.c     |  15 ++++--
  arch/x86/kvm/mmu/tdp_mmu.c | 102 ++++++++++++++++++++++++++-----------
  arch/x86/kvm/mmu/tdp_mmu.h |   6 ++-
  3 files changed, 87 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 667d64daa82c..dcbfc784cf2f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3155,7 +3155,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
        sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
if (is_tdp_mmu_page(sp))
-               kvm_tdp_mmu_put_root(kvm, sp);
+               kvm_tdp_mmu_put_root(kvm, sp, false);
        else if (!--sp->root_count && sp->role.invalid)
                kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
@@ -5514,13 +5514,17 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                }
        }
+ write_unlock(&kvm->mmu_lock);
+
        if (is_tdp_mmu_enabled(kvm)) {
-               flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
+               read_lock(&kvm->mmu_lock);
+               flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end,
+                                                 true);
                if (flush)
                        kvm_flush_remote_tlbs(kvm);
-       }
- write_unlock(&kvm->mmu_lock);
+               read_unlock(&kvm->mmu_lock);
+       }
  }

This will conflict with Sean's MMU notifier series patches:

KVM: x86/mmu: Pass address space ID to __kvm_tdp_mmu_zap_gfn_range()

What I can do for now is change the mmu.c part of that patch to

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e6e02360ef67..9882bbd9b742 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5510,15 +5510,15 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                }
        }
- if (flush)
-               kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
-
        if (is_tdp_mmu_enabled(kvm)) {
-               flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
-               if (flush)
-                       kvm_flush_remote_tlbs(kvm);
+               for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+                       flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
+                                                         gfn_end, flush);
        }
+ if (flush)
+               kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
        write_unlock(&kvm->mmu_lock);
 }
but you will have to add a separate "if (flush)" when moving the write_unlock
earlier, since there's no downgrade function for rwlocks.  In practice it's
not a huge deal: unless running nested, there will be only one active MMU.
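Concretely, the tail of the function would then look something like this
(completely untested sketch, with kvm_tdp_mmu_zap_gfn_range()'s signature
taken from Sean's series as above):

	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);

	write_unlock(&kvm->mmu_lock);

	if (is_tdp_mmu_enabled(kvm)) {
		flush = false;

		read_lock(&kvm->mmu_lock);
		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
			flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
							  gfn_end, flush);
		if (flush)
			kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
							   gfn_end);

		read_unlock(&kvm->mmu_lock);
	}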

Paolo

  static bool slot_rmap_write_protect(struct kvm *kvm,
@@ -5959,7 +5963,8 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
                WARN_ON_ONCE(!sp->lpage_disallowed);
                if (is_tdp_mmu_page(sp)) {
                        kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
-                               sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
+                               sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level),
+                               false);
                } else {
                        kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
                        WARN_ON_ONCE(sp->lpage_disallowed);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d255125059c4..0e99e4675dd4 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
  }
+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+                                                            bool shared)
+{
+       if (shared)
+               lockdep_assert_held_read(&kvm->mmu_lock);
+       else
+               lockdep_assert_held_write(&kvm->mmu_lock);
+}
+
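(Nice helper, by the way.)  Every function that grows a "shared" argument
in this patch can then open with the same one-liner, as zap_gfn_range()
does below:

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

and lockdep will complain whenever the caller's actual lock mode does not
match the flag it passed.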
  void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
  {
        if (!kvm->arch.tdp_mmu_enabled)
@@ -42,7 +51,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
  }
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield);
+                         gfn_t start, gfn_t end, bool can_yield, bool shared);
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
  {
@@ -66,11 +75,12 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
        tdp_mmu_free_sp(sp);
  }
-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared)
  {
        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
- lockdep_assert_held_write(&kvm->mmu_lock);
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
                return;
@@ -81,7 +91,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
        list_del_rcu(&root->link);
        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
- zap_gfn_range(kvm, root, 0, max_gfn, false);
+       zap_gfn_range(kvm, root, 0, max_gfn, false, shared);
call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
  }
@@ -94,11 +104,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
   * function will return NULL.
   */
  static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
-                                             struct kvm_mmu_page *prev_root)
+                                             struct kvm_mmu_page *prev_root,
+                                             bool shared)
  {
        struct kvm_mmu_page *next_root;
- lockdep_assert_held_write(&kvm->mmu_lock);

        rcu_read_lock();

@@ -117,7 +127,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
        rcu_read_unlock();
if (prev_root)
-               kvm_tdp_mmu_put_root(kvm, prev_root);
+               kvm_tdp_mmu_put_root(kvm, prev_root, shared);
return next_root;
  }
@@ -127,11 +137,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
   * This makes it safe to release the MMU lock and yield within the loop, but
   * if exiting the loop early, the caller must drop the reference to the most
   * recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
   */
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)  \
-       for (_root = tdp_mmu_next_root(_kvm, NULL);     \
-            _root;                                     \
-            _root = tdp_mmu_next_root(_kvm, _root))
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \
+       for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);    \
+            _root;                                             \
+            _root = tdp_mmu_next_root(_kvm, _root, _shared))
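To spell out the usage: a path that can run in either mode just threads
its own flag through the iterator, as the zap code below does:

	for_each_tdp_mmu_root_yield_safe(kvm, root, shared)
		flush |= zap_gfn_range(kvm, root, start, end, true, shared);

while the write-lock-only users in this patch simply pass false.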
/* Only safe under the MMU lock in write mode, without yielding. */
  #define for_each_tdp_mmu_root(_kvm, _root)                            \
@@ -632,7 +646,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
   * Return false if a yield was not needed.
   */
  static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
-                                            struct tdp_iter *iter, bool flush)
+                                            struct tdp_iter *iter, bool flush,
+                                            bool shared)
  {
        /* Ensure forward progress has been made before yielding. */
        if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -644,7 +659,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
                if (flush)
                        kvm_flush_remote_tlbs(kvm);
- cond_resched_rwlock_write(&kvm->mmu_lock);
+               if (shared)
+                       cond_resched_rwlock_read(&kvm->mmu_lock);
+               else
+                       cond_resched_rwlock_write(&kvm->mmu_lock);
+
                rcu_read_lock();
WARN_ON(iter->gfn > iter->next_last_level_gfn);
@@ -662,23 +681,33 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
   * non-root pages mapping GFNs strictly within that range. Returns true if
   * SPTEs have been cleared and a TLB flush is needed before releasing the
   * MMU lock.
+ *
   * If can_yield is true, will release the MMU lock and reschedule if the
   * scheduler needs the CPU or there is contention on the MMU lock. If this
   * function cannot yield, it will not release the MMU lock or reschedule and
   * the caller must ensure it does not supply too large a GFN range, or the
   * operation can cause a soft lockup.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
   */
  static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield)
+                         gfn_t start, gfn_t end, bool can_yield, bool shared)
  {
        struct tdp_iter iter;
        bool flush_needed = false;
+ kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
        rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
+retry:
                if (can_yield &&
-                   tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
+                   tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed,
+                                             shared)) {
                        flush_needed = false;
                        continue;
                }
@@ -696,8 +725,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
-               flush_needed = true;
+               if (!shared) {
+                       tdp_mmu_set_spte(kvm, &iter, 0);
+                       flush_needed = true;
+               } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
        }
rcu_read_unlock();
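And to make the shared case explicit: tdp_mmu_zap_spte_atomic() comes from
earlier in this series; if I am reading that patch correctly, it freezes
the SPTE with a cmpxchg to REMOVED_SPTE, flushes, and only then clears it,
which is why the shared path above never sets flush_needed.  Roughly:

	static bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					    struct tdp_iter *iter)
	{
		/*
		 * Freeze the SPTE so that other threads can neither use
		 * nor modify it while the TLB flush is in progress.
		 */
		if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
			return false;	/* lost a race, the caller retries */

		kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
				KVM_PAGES_PER_HPAGE(iter->level));

		/*
		 * No other thread can overwrite the removed SPTE, so the
		 * non-atomic write below is safe.
		 */
		WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

		return true;
	}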
@@ -709,14 +747,20 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
   * non-root pages mapping GFNs strictly within that range. Returns true if
   * SPTEs have been cleared and a TLB flush is needed before releasing the
   * MMU lock.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
   */
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
+bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
+                              bool shared)
  {
        struct kvm_mmu_page *root;
        bool flush = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root)
-               flush |= zap_gfn_range(kvm, root, start, end, true);
+       for_each_tdp_mmu_root_yield_safe(kvm, root, shared)
+               flush |= zap_gfn_range(kvm, root, start, end, true, shared);
return flush;
  }
@@ -726,7 +770,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
        bool flush;
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
+       flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn, false);
        if (flush)
                kvm_flush_remote_tlbs(kvm);
  }
@@ -893,7 +937,7 @@ static __always_inline int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
        int ret = 0;
        int as_id;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
+       for_each_tdp_mmu_root_yield_safe(kvm, root, false) {
                as_id = kvm_mmu_page_as_id(root);
                slots = __kvm_memslots(kvm, as_id);
                kvm_for_each_memslot(memslot, slots) {
@@ -933,7 +977,7 @@ static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
                                     struct kvm_mmu_page *root, gfn_t start,
                                     gfn_t end, unsigned long unused)
  {
-       return zap_gfn_range(kvm, root, start, end, false);
+       return zap_gfn_range(kvm, root, start, end, false, false);
  }
int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
@@ -1098,7 +1142,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
                                   min_level, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
                        continue;
if (!is_shadow_present_pte(iter.old_spte) ||
@@ -1128,7 +1172,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
        int root_as_id;
        bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
+       for_each_tdp_mmu_root_yield_safe(kvm, root, false) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;
@@ -1157,7 +1201,7 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
        rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
                        continue;
if (spte_ad_need_write_protect(iter.old_spte)) {
@@ -1193,7 +1237,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
        int root_as_id;
        bool spte_set = false;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
+       for_each_tdp_mmu_root_yield_safe(kvm, root, false) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;
@@ -1291,7 +1335,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
        rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set, false)) {
                        spte_set = false;
                        continue;
                }
@@ -1326,7 +1370,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
        struct kvm_mmu_page *root;
        int root_as_id;
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
+       for_each_tdp_mmu_root_yield_safe(kvm, root, false) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 9961df505067..855e58856815 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -13,9 +13,11 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
        return refcount_inc_not_zero(&root->tdp_mmu_root_count);
  }
-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared);
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end);
+bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
+                              bool shared);
  void kvm_tdp_mmu_zap_all(struct kvm *kvm);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,

