Hi, This patch uses journal numbers to evenly distribute which node prefers which resource groups for block allocations. This is to help performance. The idea is to make each node in the cluster prefer to use a certain subset of resource groups for its block allocations. Other nodes use a different subset, thus minimizing inter-node DLM communications.
GFS1 has a similar scheme, but with GFS1, each node starts out using an initial section of the file system and works its way forward. While this works for the most part, simultaneous writes by different nodes can cause the heads to bounce on some devices. Also, nodes tend to stray from their initial locations. With this patch, the preferred resource groups are assigned in a round-robin fashion. Tests prove that this patch can be a major performance boost for some applications. For example, using one particular application, I posted these run times before applying this patch: Run 1 time: 2hr 49min 39sec Run 2 time: 2hr 57min 59sec Run 3 time: 3hr 1min 10sec With the patch applied, the times are improved by almost a third: Run 1 time: 2hr 7min 31sec Run 2 time: 2hr 5min 49sec Run 3 time: 2hr 5min 1sec Run 4 time: 2hr 4min 35sec Run 5 time: 2hr 5min 55sec Regards, Bob Peterson Red Hat File Systems Signed-off-by: Bob Peterson <rpete...@redhat.com> --- diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index f98fa37..c9f4f4c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -97,6 +97,7 @@ struct gfs2_rgrpd { #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ +#define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */ #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ spinlock_t rd_rsspin; /* protects reservation related vars */ struct rb_root rd_rstree; /* multi-block reservation tree */ @@ -809,6 +810,7 @@ struct gfs2_sbd { char sd_table_name[GFS2_FSNAME_LEN]; char sd_proto_name[GFS2_FSNAME_LEN]; + int sd_nodes; /* Debugging crud */ unsigned long sd_last_warning; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 641383a..5aeb03a 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1113,6 +1113,8 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, struct gfs2_sbd *sdp = 
arg; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + BUG_ON(num_slots == 0); + sdp->sd_nodes = num_slots; /* ensure the ls jid arrays are large enough */ set_recover_size(sdp, slots, num_slots); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index d3eae24..bf3193f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -134,6 +134,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) atomic_set(&sdp->sd_log_freeze, 0); atomic_set(&sdp->sd_frozen_root, 0); init_waitqueue_head(&sdp->sd_frozen_root_wait); + sdp->sd_nodes = 1; return sdp; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 986c33f..bd8bddc 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -936,7 +936,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize; rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; - rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) sdp->sd_max_rg_data = rgd->rd_data; spin_lock(&sdp->sd_rindex_spin); @@ -955,6 +955,36 @@ fail: } /** + * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use + * @sdp: the GFS2 superblock + * + * The purpose of this function is to select a subset of the resource groups + * and mark them as PREFERRED. We do it in such a way that each node prefers + * to use a unique set of rgrps to minimize glock contention. + */ +static void set_rgrp_preferences(struct gfs2_sbd *sdp) +{ + struct gfs2_rgrpd *rgd, *first; + int i; + + /* Skip an initial number of rgrps, based on this node's journal ID. + That should start each node out on its own set. 
*/ + rgd = gfs2_rgrpd_get_first(sdp); + for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++) + rgd = gfs2_rgrpd_get_next(rgd); + first = rgd; + + do { + rgd->rd_flags |= GFS2_RDF_PREFERRED; + for (i = 0; i < sdp->sd_nodes; i++) { + rgd = gfs2_rgrpd_get_next(rgd); + if (rgd == first) + break; + } + } while (rgd != first); +} + +/** * gfs2_ri_update - Pull in a new resource index from the disk * @ip: pointer to the rindex inode * @@ -973,6 +1003,8 @@ static int gfs2_ri_update(struct gfs2_inode *ip) if (error < 0) return error; + set_rgrp_preferences(sdp); + sdp->sd_rindex_uptodate = 1; return 0; } @@ -1891,6 +1923,25 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b } /** + * fast_to_acquire - determine if a resource group will be fast to acquire + * + * If this is one of our preferred rgrps, it should be quicker to acquire, + * because we tried to set ourselves up as dlm lock master. + */ +static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) +{ + struct gfs2_glock *gl = rgd->rd_gl; + + if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) && + !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + return 1; + if (rgd->rd_flags & GFS2_RDF_PREFERRED) + return 1; + return 0; +} + +/** * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for * @ap: the allocation parameters @@ -1932,10 +1983,15 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a rg_locked = 0; if (skip && skip--) goto next_rgrp; - if (!gfs2_rs_active(rs) && (loops < 2) && - gfs2_rgrp_used_recently(rs, 1000) && - gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) - goto next_rgrp; + if (!gfs2_rs_active(rs)) { + if (loops == 0 && + !fast_to_acquire(rs->rs_rbm.rgd)) + goto next_rgrp; + if ((loops < 3) && + gfs2_rgrp_used_recently(rs, 1000) && + gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + goto next_rgrp; + } error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, 
LM_ST_EXCLUSIVE, flags, &rs->rs_rgd_gh);