[Cluster-devel] [GFS2 PATCH] GFS2: Set of distributed preferences for rgrps

Bob Peterson Wed, 08 Oct 2014 20:26:28 -0700

Hi,

This patch uses journal numbers to evenly distribute which node
prefers which resource groups for block allocations. This is to
help performance. The idea is to make each node in the cluster
prefer to use a certain subset of resource groups for its block
allocations. Other nodes use a different subset, thus minimizing
inter-node DLM communications.


GFS1 has a similar scheme, but with GFS1, each node starts out
using an initial section of the file system and works its way
forward. While this works for the most part, simultaneous writes
by different nodes can cause the heads to bounce on some devices.
Also, nodes tend to stray from their initial locations. With this
patch, the preferred resource groups are assigned in a round-robin
fashion.

Tests prove that this patch can be a major performance boost for
some applications. For example, using one particular application,
I posted these run times before applying this patch:

Run 1 time: 2hr 49min 39sec
Run 2 time: 2hr 57min 59sec
Run 3 time: 3hr 1min 10sec

With the patch applied the times are improved by almost a third:

Run 1 time: 2hr 7min 31sec
Run 2 time: 2hr 5min 49sec
Run 3 time: 2hr 5min 1sec
Run 4 time: 2hr 4min 35sec
Run 5 time: 2hr 5min 55sec

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson <rpete...@redhat.com> 
---
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f98fa37..c9f4f4c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -97,6 +97,7 @@ struct gfs2_rgrpd {
 #define GFS2_RDF_CHECK         0x10000000 /* check for unlinked inodes */
 #define GFS2_RDF_UPTODATE      0x20000000 /* rg is up to date */
 #define GFS2_RDF_ERROR         0x40000000 /* error in rg */
+#define GFS2_RDF_PREFERRED     0x80000000 /* This rgrp is preferred */
 #define GFS2_RDF_MASK          0xf0000000 /* mask for internal flags */
        spinlock_t rd_rsspin;           /* protects reservation related vars */
        struct rb_root rd_rstree;       /* multi-block reservation tree */
@@ -809,6 +810,7 @@ struct gfs2_sbd {
        char sd_table_name[GFS2_FSNAME_LEN];
        char sd_proto_name[GFS2_FSNAME_LEN];
 
+       int sd_nodes;
        /* Debugging crud */
 
        unsigned long sd_last_warning;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 641383a..5aeb03a 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1113,6 +1113,8 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
        struct gfs2_sbd *sdp = arg;
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
+       BUG_ON(num_slots == 0);
+       sdp->sd_nodes = num_slots;
        /* ensure the ls jid arrays are large enough */
        set_recover_size(sdp, slots, num_slots);
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index d3eae24..bf3193f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -134,6 +134,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
        atomic_set(&sdp->sd_log_freeze, 0);
        atomic_set(&sdp->sd_frozen_root, 0);
        init_waitqueue_head(&sdp->sd_frozen_root_wait);
+       sdp->sd_nodes = 1;
 
        return sdp;
 }
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 986c33f..bd8bddc 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -936,7 +936,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
        rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
        rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
        rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
-       rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+       rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
        if (rgd->rd_data > sdp->sd_max_rg_data)
                sdp->sd_max_rg_data = rgd->rd_data;
        spin_lock(&sdp->sd_rindex_spin);
@@ -955,6 +955,36 @@ fail:
 }
 
 /**
+ * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
+ * @sdp: the GFS2 superblock
+ *
+ * Starting ls_jid rgrps in, mark every sd_nodes-th rgrp as PREFERRED so that
+ * each node favours its own subset of rgrps, minimizing glock contention.
+ */
+static void set_rgrp_preferences(struct gfs2_sbd *sdp)
+{
+       struct gfs2_rgrpd *rgd, *next, *first;
+       int i;
+
+       /* Skip ls_jid rgrps; stay put if gfs2_rgrpd_get_next() gives NULL. */
+       first = gfs2_rgrpd_get_first(sdp);
+       for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++) {
+               next = gfs2_rgrpd_get_next(first);
+               if (next)
+                       first = next;
+       }
+       rgd = first;
+       do {
+               rgd->rd_flags |= GFS2_RDF_PREFERRED;
+               for (i = 0; i < sdp->sd_nodes; i++) {
+                       rgd = gfs2_rgrpd_get_next(rgd);
+                       if (!rgd || rgd == first) /* no other rgrp */
+                               break;
+               }
+       } while (rgd && rgd != first);
+}
+
+/**
  * gfs2_ri_update - Pull in a new resource index from the disk
  * @ip: pointer to the rindex inode
  *
@@ -973,6 +1003,8 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
        if (error < 0)
                return error;
 
+       set_rgrp_preferences(sdp);
+
        sdp->sd_rindex_uptodate = 1;
        return 0;
 }
@@ -1891,6 +1923,25 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
 }
 
 /**
+ * fast_to_acquire - determine if a resource group will be fast to acquire
+ * @rgd: the resource group to examine
+ *
+ * Returns: 1 if @rgd is PREFERRED (we tried to be its dlm lock master) or its
+ * glock is not unlocked, has no holders and no demote flag set; otherwise 0.
+ */
+static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
+{
+       struct gfs2_glock *glock = rgd->rd_gl;
+
+       if (rgd->rd_flags & GFS2_RDF_PREFERRED)
+               return 1;
+       if (glock->gl_state == LM_ST_UNLOCKED || !list_empty(&glock->gl_holders))
+               return 0;
+       return !test_bit(GLF_DEMOTE_IN_PROGRESS, &glock->gl_flags) &&
+              !test_bit(GLF_DEMOTE, &glock->gl_flags);
+}
+
+/**
  * gfs2_inplace_reserve - Reserve space in the filesystem
  * @ip: the inode to reserve space for
  * @ap: the allocation parameters
@@ -1932,10 +1983,15 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
                        rg_locked = 0;
                        if (skip && skip--)
                                goto next_rgrp;
-                       if (!gfs2_rs_active(rs) && (loops < 2) &&
-                            gfs2_rgrp_used_recently(rs, 1000) &&
-                            gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
-                               goto next_rgrp;
+                       if (!gfs2_rs_active(rs)) {
+                               if (loops == 0 &&
+                                   !fast_to_acquire(rs->rs_rbm.rgd))
+                                       goto next_rgrp;
+                               if ((loops < 3) &&
+                                   gfs2_rgrp_used_recently(rs, 1000) &&
+                                   gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
+                                       goto next_rgrp;
+                       }
                        error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
                                                   LM_ST_EXCLUSIVE, flags,
                                                   &rs->rs_rgd_gh);

[Cluster-devel] [GFS2 PATCH] GFS2: Set of distributed preferences for rgrps

Reply via email to