Hi,

On 30/06/17 19:18, Bob Peterson wrote:
Hi,

This patch changes GFS2's LRU list from a single linked list with
a much-contended spin_lock to a hash table of linked lists, thus
having more locks with less contention. The key for the table is
set round-robin to evenly distribute the glocks into the buckets.

Signed-off-by: Bob Peterson <[email protected]>
This is a bit confused, I think. There needs to be some more work done here to figure out exactly what the problem is, but I'm not very keen on this solution for a number of reasons...

 - It still claims to be LRU, but in fact it isn't any more
 - The list might have been broken up, but the counter is still global, so that there is still the possibility for contention on that (see the sketch after this list)
 - There is no performance data to say whether or not there is any improvement
 - There should not in general be huge numbers of LRU operations on glocks anyway, so why is the LRU list contended in the first place?
 - It introduces a new global counter at glock creation time, so something else that might land up being a contention point
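
On the counter point above: a per-cpu counter would take the shared
cacheline out of the add/remove paths altogether. A rough sketch of
what I mean (illustrative only; the wrapper names are made up, and the
shrinker would then be reading an approximate value):

#include <linux/percpu_counter.h>

static struct percpu_counter lru_count;

/* percpu_counter_init(&lru_count, 0, GFP_KERNEL) in gfs2_glock_init() */

static inline void lru_count_inc(void)
{
        percpu_counter_inc(&lru_count);
}

static inline void lru_count_dec(void)
{
        percpu_counter_dec(&lru_count);
}

static inline long lru_count_read(void)
{
        /* approximate, which is fine for shrinker accounting */
        return percpu_counter_read_positive(&lru_count);
}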

The important question is what causes the contention on the LRU list in the first place. Ideally we'd look at that issue in order to avoid so many list operations, rather than making it very complicated and non-LRU. At the very least there should be a justification for the new algorithm.
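
If the list really does have to be split up, one way to avoid the new
global counter at glock creation time would be to derive the bucket
from the glock number instead, which should already be reasonably well
distributed. Again, just a sketch (gl_lru_bucket_of() is a made-up
helper; hash_64() is from linux/hash.h):

#include <linux/hash.h>

static inline unsigned int gl_lru_bucket_of(const struct gfs2_glock *gl)
{
        /* no shared state is touched at glock creation time */
        return hash_64(gl->gl_name.ln_number, LRU_HASH_BITS);
}

That would also mean the bucket would not need to be stored in struct
gfs2_glock at all, since it can always be recomputed from the lock name.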

Steve.


---
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6cd71c5..e1e16c4 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -63,10 +63,32 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
static struct dentry *gfs2_root;
  static struct workqueue_struct *glock_workqueue;
+static atomic_t lru_bucket = ATOMIC_INIT(0);
+
  struct workqueue_struct *gfs2_delete_workqueue;
-static LIST_HEAD(lru_list);
+
+#define LRU_HASH_BITS 12
+#define LRU_HASH_BUCKETS BIT(LRU_HASH_BITS)
+#define LRU_HASH_MASK (LRU_HASH_BUCKETS - 1)
+
+struct gfs2_lru_bucket {
+       struct list_head lru_list;
+       rwlock_t lru_lock;
+};
+
+static struct gfs2_lru_bucket lru_locks[LRU_HASH_BUCKETS];
+
+static inline void lock_lru_bucket(unsigned bucket)
+{
+       write_lock(&lru_locks[bucket & LRU_HASH_MASK].lru_lock);
+}
+
+static inline void unlock_lru_bucket(unsigned bucket)
+{
+       write_unlock(&lru_locks[bucket & LRU_HASH_MASK].lru_lock);
+}
+
  static atomic_t lru_count = ATOMIC_INIT(0);
-static DEFINE_SPINLOCK(lru_lock);
#define GFS2_GL_HASH_SHIFT 15
  #define GFS2_GL_HASH_SIZE       BIT(GFS2_GL_HASH_SHIFT)
@@ -126,30 +148,36 @@ static int demote_ok(const struct gfs2_glock *gl)
        return 1;
  }
-
-void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
+static void remove_from_lru_locked(struct gfs2_glock *gl)
  {
-       spin_lock(&lru_lock);
-
-       if (!list_empty(&gl->gl_lru))
+       if (!list_empty(&gl->gl_lru)) {
                list_del_init(&gl->gl_lru);
-       else
-               atomic_inc(&lru_count);
+               atomic_dec(&lru_count);
+       }
+}
+
+static void glock_remove_from_lru(struct gfs2_glock *gl)
+{
+       if (!test_and_clear_bit(GLF_LRU, &gl->gl_flags))
+               return;
-       list_add_tail(&gl->gl_lru, &lru_list);
-       set_bit(GLF_LRU, &gl->gl_flags);
-       spin_unlock(&lru_lock);
+       lock_lru_bucket(gl->gl_lru_bucket);
+       remove_from_lru_locked(gl);
+       unlock_lru_bucket(gl->gl_lru_bucket);
  }
-static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
  {
-       spin_lock(&lru_lock);
-       if (!list_empty(&gl->gl_lru)) {
-               list_del_init(&gl->gl_lru);
-               atomic_dec(&lru_count);
-               clear_bit(GLF_LRU, &gl->gl_flags);
-       }
-       spin_unlock(&lru_lock);
+       glock_remove_from_lru(gl);
+       if (test_and_set_bit(GLF_LRU, &gl->gl_flags))
+               return;
+
+       lock_lru_bucket(gl->gl_lru_bucket);
+       BUG_ON(!list_empty(&gl->gl_lru));
+       atomic_inc(&lru_count);
+       list_add_tail(&gl->gl_lru,
+                     &lru_locks[gl->gl_lru_bucket & LRU_HASH_MASK].lru_list);
+       unlock_lru_bucket(gl->gl_lru_bucket);
  }
/*
@@ -182,7 +210,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
        lockref_mark_dead(&gl->gl_lockref);

-       gfs2_glock_remove_from_lru(gl);
+       glock_remove_from_lru(gl);
        spin_unlock(&gl->gl_lockref.lock);
        rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
        GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
@@ -746,6 +774,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
        gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
        INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
        INIT_WORK(&gl->gl_delete, delete_work_func);
+       gl->gl_lru_bucket = atomic_inc_return(&lru_bucket);
+       if (gl->gl_lru_bucket >= LRU_HASH_BUCKETS) {
+               gl->gl_lru_bucket = 0;
+               atomic_set(&lru_bucket, gl->gl_lru_bucket);
+       }
mapping = gfs2_glock2aspace(gl);
        if (mapping) {
@@ -1010,8 +1043,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
                return -EIO;
-       if (test_bit(GLF_LRU, &gl->gl_flags))
-               gfs2_glock_remove_from_lru(gl);
+       glock_remove_from_lru(gl);
spin_lock(&gl->gl_lockref.lock);
        add_to_queue(gh);
@@ -1343,24 +1375,20 @@ static int glock_cmp(void *priv, struct list_head *a, struct list_head *b)
  }
/**
- * gfs2_dispose_glock_lru - Demote a list of glocks
+ * gfs2_dispose_glock_lru - Demote a list of glocks from the same LRU bucket
   * @list: The list to dispose of
   *
   * Disposing of glocks may involve disk accesses, so that here we sort
   * the glocks by number (i.e. disk location of the inodes) so that if
   * there are any such accesses, they'll be sent in order (mostly).
- *
- * Must be called under the lru_lock, but may drop and retake this
- * lock. While the lru_lock is dropped, entries may vanish from the
- * list, but no new entries will appear on the list (since it is
- * private)
   */
-static void gfs2_dispose_glock_lru(struct list_head *list)
-__releases(&lru_lock)
-__acquires(&lru_lock)
+static long gfs2_dispose_glock_lru(struct list_head *list)
  {
-       struct gfs2_glock *gl;
+       struct gfs2_glock *gl = NULL;
+       LIST_HEAD(rejects);
+       int reject_count = 0;
+       long disposed = 0;
        list_sort(NULL, list, glock_cmp);
@@ -1369,23 +1397,30 @@ __acquires(&lru_lock)
                list_del_init(&gl->gl_lru);
                if (!spin_trylock(&gl->gl_lockref.lock)) {
  add_back_to_lru:
-                       list_add(&gl->gl_lru, &lru_list);
-                       atomic_inc(&lru_count);
+                       list_add_tail(&gl->gl_lru, &rejects);
+                       reject_count++;
                        continue;
                }
                if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
                        spin_unlock(&gl->gl_lockref.lock);
                        goto add_back_to_lru;
                }
-               clear_bit(GLF_LRU, &gl->gl_flags);
+               disposed++;
                gl->gl_lockref.count++;
                if (demote_ok(gl))
                        handle_callback(gl, LM_ST_UNLOCKED, 0, false);
                WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
                __gfs2_glock_queue_work(gl, 0);
                spin_unlock(&gl->gl_lockref.lock);
-               cond_resched_lock(&lru_lock);
        }
+       if (!list_empty(&rejects)) {
+               gl = list_first_entry(&rejects, struct gfs2_glock, gl_lru);
+               lock_lru_bucket(gl->gl_lru_bucket);
+               list_splice(&rejects, &lru_locks[gl->gl_lru_bucket].lru_list);
+               atomic_add(reject_count, &lru_count);
+               unlock_lru_bucket(gl->gl_lru_bucket);
+       }
+       return disposed;
  }
/**
@@ -1399,30 +1434,33 @@ __acquires(&lru_lock)
static long gfs2_scan_glock_lru(int nr)
  {
-       struct gfs2_glock *gl;
-       LIST_HEAD(skipped);
+       struct gfs2_glock *gl, *tmp;
        LIST_HEAD(dispose);
+       int start_bucket, bucket;
        long freed = 0;
+       static int last_lru_scan = 0;
-       spin_lock(&lru_lock);
-       while ((nr-- >= 0) && !list_empty(&lru_list)) {
-               gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
-
-               /* Test for being demotable */
-               if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
-                       list_move(&gl->gl_lru, &dispose);
-                       atomic_dec(&lru_count);
-                       freed++;
-                       continue;
+       start_bucket = bucket = (last_lru_scan & LRU_HASH_MASK);
+       do {
+               lock_lru_bucket(bucket);
+               list_for_each_entry_safe(gl, tmp, &lru_locks[bucket].lru_list,
+                                        gl_lru) {
+                       /* Test for being demotable */
+                       if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
+                               if (test_and_clear_bit(GLF_LRU, &gl->gl_flags))
+                                       remove_from_lru_locked(gl);
+                               list_add_tail(&gl->gl_lru, &dispose);
+                               nr--;
+                               if (nr == 0)
+                                       break;
+                       }
                }
-
-               list_move(&gl->gl_lru, &skipped);
-       }
-       list_splice(&skipped, &lru_list);
-       if (!list_empty(&dispose))
-               gfs2_dispose_glock_lru(&dispose);
-       spin_unlock(&lru_lock);
-
+               unlock_lru_bucket(bucket);
+               if (!list_empty(&dispose))
+                       freed += gfs2_dispose_glock_lru(&dispose);
+               bucket = (bucket + 1) & LRU_HASH_MASK;
+       } while (nr && (bucket != start_bucket));
+       last_lru_scan = bucket;
        return freed;
  }
@@ -1504,7 +1542,7 @@ static void thaw_glock(struct gfs2_glock *gl)
static void clear_glock(struct gfs2_glock *gl)
  {
-       gfs2_glock_remove_from_lru(gl);
+       glock_remove_from_lru(gl);
spin_lock(&gl->gl_lockref.lock);
        if (gl->gl_state != LM_ST_UNLOCKED)
@@ -1704,7 +1742,7 @@ void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
        dtime *= 1000000/HZ; /* demote time in uSec */
        if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
                dtime = 0;
-       gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld\n",
+       gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld l:%d\n",
                  state2str(gl->gl_state),
                  gl->gl_name.ln_type,
                  (unsigned long long)gl->gl_name.ln_number,
@@ -1713,7 +1751,8 @@ void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
                  state2str(gl->gl_demote_state), dtime,
                  atomic_read(&gl->gl_ail_count),
                  atomic_read(&gl->gl_revokes),
-                 (int)gl->gl_lockref.count, gl->gl_hold_time);
+                 (int)gl->gl_lockref.count, gl->gl_hold_time,
+                 gl->gl_lru_bucket);
list_for_each_entry(gh, &gl->gl_holders, gh_list)
                dump_holder(seq, gh);
@@ -1797,6 +1836,12 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
  int __init gfs2_glock_init(void)
  {
        int ret;
+       unsigned i;
+
+       for (i = 0; i < LRU_HASH_BUCKETS; i++) {
+               INIT_LIST_HEAD(&lru_locks[i].lru_list);
+               rwlock_init(&lru_locks[i].lru_lock);
+       }
ret = rhashtable_init(&gl_hash_table, &ht_parms);
        if (ret < 0)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 2ad003b..e8c0f2b 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -374,6 +374,7 @@ struct gfs2_glock {
                } gl_vm;
        };
        struct rhash_head gl_node;
+       int gl_lru_bucket;
  };
#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */

