Hi Bob, On 15 May 2018 at 20:29, Bob Peterson <[email protected]> wrote: > This patch takes advantage of the new glock holder sharing > feature for rgrps. New functions rgrp_lock and rgrp_unlock > use a new semaphore to gain temporary exclusive access to the > rgrps. This is needed whenever a multi-block reservation is > reserved, and every time a function needs to add an rgrp to a > transaction. > > Since multiple rgrp glocks held by gfs2_rlist_alloc all require > the same access, the state parameter has been removed, and the > new holder flag is specified. > > Also, function gfs2_check_blk_type, which had previously been > using LM_ST_SHARED for access to the rgrp has been switched > to LM_ST_EXCLUSIVE with the new holder flag. This means > multiple nodes no longer hold the rgrp in SH at the same time. > However, with the new flag, local holders will still share the > rgrp, but we avoid the glock state transition from EX to SH > and back, (which may involve both DLM and the workqueue) which > is another performance boost for the block allocator.
this is looking much better than the previous version. > Signed-off-by: Bob Peterson <[email protected]> > --- > fs/gfs2/bmap.c | 2 +- > fs/gfs2/dir.c | 2 +- > fs/gfs2/incore.h | 2 ++ > fs/gfs2/inode.c | 7 ++-- > fs/gfs2/rgrp.c | 86 ++++++++++++++++++++++++++++++++++++++++-------- > fs/gfs2/rgrp.h | 2 +- > fs/gfs2/super.c | 8 +++-- > fs/gfs2/xattr.c | 8 +++-- > 8 files changed, 91 insertions(+), 26 deletions(-) > > diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c > index 0590e93494f7..ac484e6623c9 100644 > --- a/fs/gfs2/bmap.c > +++ b/fs/gfs2/bmap.c > @@ -1108,7 +1108,7 @@ static int sweep_bh_for_rgrps(struct gfs2_inode *ip, > struct gfs2_holder *rd_gh, > goto out; > } > ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, > - 0, rd_gh); > + LM_FLAG_NODE_EX, rd_gh); > if (ret) > goto out; > > diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c > index d9fb0ad6cc30..7327a9d43692 100644 > --- a/fs/gfs2/dir.c > +++ b/fs/gfs2/dir.c > @@ -2020,7 +2020,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 > index, u32 len, > l_blocks++; > } > > - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); > + gfs2_rlist_alloc(&rlist); > > for (x = 0; x < rlist.rl_rgrps; x++) { > struct gfs2_rgrpd *rgd = > gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); > diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h > index 0bbbaa9b05cb..03ddd01d58d4 100644 > --- a/fs/gfs2/incore.h > +++ b/fs/gfs2/incore.h > @@ -23,6 +23,7 @@ > #include <linux/percpu.h> > #include <linux/lockref.h> > #include <linux/rhashtable.h> > +#include <linux/semaphore.h> > > #define DIO_WAIT 0x00000010 > #define DIO_METADATA 0x00000020 > @@ -100,6 +101,7 @@ struct gfs2_rgrpd { > #define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */ > #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ > spinlock_t rd_rsspin; /* protects reservation related vars > */ > + struct semaphore rd_sem; /* ensures local rgrp exclusivity */ > struct rb_root rd_rstree; /* multi-block reservation tree */ > }; > > diff --git a/fs/gfs2/inode.c 
b/fs/gfs2/inode.c > index 8700eb815638..3176cadfc309 100644 > --- a/fs/gfs2/inode.c > +++ b/fs/gfs2/inode.c > @@ -1121,8 +1121,8 @@ static int gfs2_unlink(struct inode *dir, struct dentry > *dentry) > if (!rgd) > goto out_inodes; > > - gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); > - > + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, LM_FLAG_NODE_EX, > + ghs + 2); > > error = gfs2_glock_nq(ghs); /* parent */ > if (error) > @@ -1409,7 +1409,8 @@ static int gfs2_rename(struct inode *odir, struct > dentry *odentry, > */ > nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1); > if (nrgd) > - gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs > + num_gh++); > + gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, > + LM_FLAG_NODE_EX, ghs + num_gh++); > } > > for (x = 0; x < num_gh; x++) { > diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c > index 8b683917a27e..7891229b836d 100644 > --- a/fs/gfs2/rgrp.c > +++ b/fs/gfs2/rgrp.c > @@ -904,6 +904,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) > rgd->rd_data = be32_to_cpu(buf.ri_data); > rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); > spin_lock_init(&rgd->rd_rsspin); > + sema_init(&rgd->rd_sem, 1); > > error = compute_bitstructs(rgd); > if (error) > @@ -1344,6 +1345,25 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 > offset, > return -EIO; > } > > +/** > + * rgrp_lock - gain exclusive access to an rgrp > + * @gl: the glock > + * > + * This function waits for exclusive access to an rgrp whose glock is held in > + * EX, but shared via the LM_FLAG_NODE_EX holder flag. 
> + */ > +static void rgrp_lock(struct gfs2_rgrpd *rgd) > +{ > + BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl)); > + down(&rgd->rd_sem); > +} > + > +static void rgrp_unlock(struct gfs2_rgrpd *rgd) > +{ > + BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl)); > + up(&rgd->rd_sem); > +} > + > /** > * gfs2_fitrim - Generate discard requests for unused bits of the filesystem > * @filp: Any file on the filesystem > @@ -1399,7 +1419,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp) > > while (1) { > > - ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); > + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, > + LM_FLAG_NODE_EX, &gh); > if (ret) > goto out; > > @@ -1420,11 +1441,13 @@ int gfs2_fitrim(struct file *filp, void __user *argp) > /* Mark rgrp as having been trimmed */ > ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0); > if (ret == 0) { > + rgrp_lock(rgd); > bh = rgd->rd_bits[0].bi_bh; > rgd->rd_flags |= GFS2_RGF_TRIMMED; > gfs2_trans_add_meta(rgd->rd_gl, bh); > gfs2_rgrp_out(rgd, bh->b_data); > gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data); > + rgrp_unlock(rgd); > gfs2_trans_end(sdp); > } > } > @@ -1768,6 +1791,16 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 > state, u32 *minext, > * > * Returns: 0 if no error > * The inode, if one has been found, in inode. > + * We must be careful to avoid deadlock here: > + * > + * All transactions expect: sd_log_flush_lock followed by rgrp ex (if > neeeded), > + * but try_rgrp_unlink takes sd_log_flush_lock outside a transaction and > + * therefore must not have the rgrp ex already held. To avoid deadlock, we > + * drop the rgrp ex lock before taking the log_flush_lock, then reacquire it > + * to protect our call to gfs2_rbm_find. > + * > + * Also note that rgrp_unlock must come AFTER the caller does gfs2_rs_deltree > + * because rgrp ex needs to be held before making reservations. 
> */ > > static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 > skip) > @@ -1781,7 +1814,12 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, > u64 *last_unlinked, u64 skip > struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 }; > > while (1) { > + /* As explained above, we need to drop the rgrp ex lock and > + * reacquire it after we get for sd_log_flush_lock. > + */ > + rgrp_unlock(rgd); > down_write(&sdp->sd_log_flush_lock); > + rgrp_lock(rgd); > error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, > true); > up_write(&sdp->sd_log_flush_lock); > @@ -1980,7 +2018,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct > gfs2_alloc_parms *ap) > struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); > struct gfs2_rgrpd *begin = NULL; > struct gfs2_blkreserv *rs = &ip->i_res; > - int error = 0, rg_locked, flags = 0; > + int error = 0, rg_locked, flags = LM_FLAG_NODE_EX; > u64 last_unlinked = NO_BLOCK; > int loops = 0; > u32 skip = 0; > @@ -1991,6 +2029,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct > gfs2_alloc_parms *ap) > return -EINVAL; > if (gfs2_rs_active(rs)) { > begin = rs->rs_rbm.rgd; > + flags = LM_FLAG_NODE_EX; > } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { > rs->rs_rbm.rgd = begin = ip->i_rgd; > } else { > @@ -2023,16 +2062,20 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, > struct gfs2_alloc_parms *ap) > &rs->rs_rgd_gh); > if (unlikely(error)) > return error; > + rgrp_lock(rs->rs_rbm.rgd); > if (!gfs2_rs_active(rs) && (loops < 2) && > gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) > goto skip_rgrp; > if (sdp->sd_args.ar_rgrplvb) { > error = update_rgrp_lvb(rs->rs_rbm.rgd); > if (unlikely(error)) { > + rgrp_unlock(rs->rs_rbm.rgd); > gfs2_glock_dq_uninit(&rs->rs_rgd_gh); > return error; > } > } > + } else { > + rgrp_lock(rs->rs_rbm.rgd); > } > > /* Skip unuseable resource groups */ > @@ -2058,14 +2101,21 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, > struct gfs2_alloc_parms 
*ap) > rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) { > ip->i_rgd = rs->rs_rbm.rgd; > ap->allowed = ip->i_rgd->rd_free_clone; > + rgrp_unlock(rs->rs_rbm.rgd); > return 0; > } > check_rgrp: > /* Check for unlinked inodes which can be reclaimed */ > - if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) > + if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) { > + /* Drop reservation if we couldn't use reserved rgrp > */ > + if (gfs2_rs_active(rs)) > + gfs2_rs_deltree(rs); > try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, > ip->i_no_addr); > + } > skip_rgrp: > + rgrp_unlock(rs->rs_rbm.rgd); > + > /* Drop reservation, if we couldn't use reserved rgrp */ > if (gfs2_rs_active(rs)) > gfs2_rs_deltree(rs); > @@ -2169,7 +2219,7 @@ static void gfs2_alloc_extent(const struct gfs2_rbm > *rbm, bool dinode, > } > > /** > - * rgblk_free - Change alloc state of given block(s) > + * rgblk_free_and_lock - Change alloc state of given block(s) and lock rgrp > ex > * @sdp: the filesystem > * @bstart: the start of a run of blocks to free > * @blen: the length of the block run (all must lie within ONE RG!) 
> @@ -2178,8 +2228,8 @@ static void gfs2_alloc_extent(const struct gfs2_rbm > *rbm, bool dinode, > * Returns: Resource group containing the block(s) > */ > > -static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, > - u32 blen, unsigned char new_state) > +static struct gfs2_rgrpd *rgblk_free_and_lock(struct gfs2_sbd *sdp, u64 > bstart, > + u32 blen, unsigned char > new_state) > { > struct gfs2_rbm rbm; > struct gfs2_bitmap *bi, *bi_prev = NULL; > @@ -2192,6 +2242,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd > *sdp, u64 bstart, > } > > gfs2_rbm_from_block(&rbm, bstart); > + rgrp_lock(rbm.rgd); > while (blen--) { > bi = rbm_bi(&rbm); > if (bi != bi_prev) { > @@ -2341,6 +2392,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, > unsigned int *nblocks, > int error; > > gfs2_set_alloc_start(&rbm, ip, dinode); > + rgrp_lock(rbm.rgd); > error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false); > > if (error == -ENOSPC) { > @@ -2395,6 +2447,8 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, > unsigned int *nblocks, > gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); > gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, > rbm.rgd->rd_bits[0].bi_bh->b_data); > > + rgrp_unlock(rbm.rgd); > + > gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 
1 : 0); > if (dinode) > gfs2_trans_add_unrevoke(sdp, block, *nblocks); > @@ -2408,6 +2462,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, > unsigned int *nblocks, > return 0; > > rgrp_error: > + rgrp_unlock(rbm.rgd); > gfs2_rgrp_error(rbm.rgd); > return -EIO; > } > @@ -2426,7 +2481,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 > bstart, u32 blen, int meta) > struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); > struct gfs2_rgrpd *rgd; > > - rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); > + rgd = rgblk_free_and_lock(sdp, bstart, blen, GFS2_BLKST_FREE); > if (!rgd) > return; > trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); > @@ -2435,6 +2490,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 > bstart, u32 blen, int meta) > gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); > gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); > gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); > + rgrp_unlock(rgd); > > /* Directories keep their data in the metadata address space */ > if (meta || ip->i_depth) > @@ -2465,7 +2521,7 @@ void gfs2_unlink_di(struct inode *inode) > struct gfs2_rgrpd *rgd; > u64 blkno = ip->i_no_addr; > > - rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); > + rgd = rgblk_free_and_lock(sdp, blkno, 1, GFS2_BLKST_UNLINKED); > if (!rgd) > return; > trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); > @@ -2473,6 +2529,7 @@ void gfs2_unlink_di(struct inode *inode) > gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); > gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); > update_rgrp_lvb_unlinked(rgd, 1); > + rgrp_unlock(rgd); > } > > void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) > @@ -2480,7 +2537,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct > gfs2_inode *ip) > struct gfs2_sbd *sdp = rgd->rd_sbd; > struct gfs2_rgrpd *tmp_rgd; > > - tmp_rgd = rgblk_free(sdp, ip->i_no_addr, 1, GFS2_BLKST_FREE); > + tmp_rgd = rgblk_free_and_lock(sdp, 
ip->i_no_addr, 1, GFS2_BLKST_FREE); > if (!tmp_rgd) > return; > gfs2_assert_withdraw(sdp, rgd == tmp_rgd); > @@ -2494,6 +2551,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct > gfs2_inode *ip) > gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); > gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); > update_rgrp_lvb_unlinked(rgd, -1); > + rgrp_unlock(rgd); > > gfs2_statfs_change(sdp, 0, +1, -1); > trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); > @@ -2522,7 +2580,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 > no_addr, unsigned int type) > if (!rgd) > goto fail; > > - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); > + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, > + LM_FLAG_NODE_EX, &rgd_gh); Can you move this into a separate commit? It's not obvious if this change is a winner. > if (error) > goto fail; > > @@ -2601,16 +2660,15 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct > gfs2_rgrp_list *rlist, > * > */ > > -void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) > +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist) > { > unsigned int x; > > rlist->rl_ghs = kmalloc(rlist->rl_rgrps * sizeof(struct gfs2_holder), > GFP_NOFS | __GFP_NOFAIL); > for (x = 0; x < rlist->rl_rgrps; x++) > - gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, > - state, 0, > - &rlist->rl_ghs[x]); > + gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, LM_ST_EXCLUSIVE, > + LM_FLAG_NODE_EX, &rlist->rl_ghs[x]); > } > > /** > diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h > index e90478e2f545..ca40cdcc4b7e 100644 > --- a/fs/gfs2/rgrp.h > +++ b/fs/gfs2/rgrp.h > @@ -68,7 +68,7 @@ struct gfs2_rgrp_list { > > extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list > *rlist, > u64 block); > -extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int > state); > +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist); > extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); > extern 
u64 gfs2_ri_total(struct gfs2_sbd *sdp); > extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock > *gl); > diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c > index cf5c7f3080d2..57f30f96b3b6 100644 > --- a/fs/gfs2/super.c > +++ b/fs/gfs2/super.c > @@ -1131,8 +1131,9 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, > struct gfs2_statfs_change_host > done = 0; > else if (rgd_next && !error) { > error = gfs2_glock_nq_init(rgd_next->rd_gl, > - LM_ST_SHARED, > - GL_ASYNC, > + LM_ST_EXCLUSIVE, > + GL_ASYNC| > + LM_FLAG_NODE_EX, There is no explanation for this change. > gh); > rgd_next = gfs2_rgrpd_get_next(rgd_next); > done = 0; > @@ -1507,7 +1508,8 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) > goto out_qs; > } > > - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); > + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, > + LM_FLAG_NODE_EX, &gh); > if (error) > goto out_qs; > > diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c > index f2bce1e0f6fb..6a698a74dcb5 100644 > --- a/fs/gfs2/xattr.c > +++ b/fs/gfs2/xattr.c > @@ -262,7 +262,8 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, > struct buffer_head *bh, > return -EIO; > } > > - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); > + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, > + LM_FLAG_NODE_EX, &rg_gh); > if (error) > return error; > > @@ -1314,7 +1315,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) > else > goto out; > > - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); > + gfs2_rlist_alloc(&rlist); > > for (x = 0; x < rlist.rl_rgrps; x++) { > struct gfs2_rgrpd *rgd = > gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); > @@ -1397,7 +1398,8 @@ static int ea_dealloc_block(struct gfs2_inode *ip) > return -EIO; > } > > - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); > + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, > + LM_FLAG_NODE_EX, &gh); > if (error) > return error; > > -- > 2.17.0 > Thanks, Andreas
