Add a rs_reserved field to struct gfs2_blkreserv to keep track of the number of blocks reserved by this particular reservation. When making a reservation with gfs2_inplace_reserve, this field is set to somewhere between ap->min_target and ap->target depending on the number of free blocks in the resource group. When allocating blocks with gfs2_alloc_blocks, rs_reserved is decremented accordingly. Eventually, any reserved but not consumed blocks are returned to the resource group by gfs2_inplace_release (via gfs2_adjust_reservation).
The reservation tree (rd_rstree) is unaffected by this change: the reservations it tracks are still advisory, and the sizes of those reservations (rs_free) are still determined by the tentative allocation sizes (i_sizehint). Since rd_reserved now tracks the number of reserved blocks rather than the number of tentatively reserved blocks, we may end up with slightly different allocation patterns, though. The rd_extfail_pt optimization will still cause ill-suited resource groups to be skipped quickly. We expect to augment this with a patch that will reserve an extent of blocks rather than just reserving a number of blocks in gfs2_inplace_reserve. gfs2_alloc_blocks will then be able to consume that reserved extent before scanning for additional available blocks; this should eliminate double bitmap scanning in most cases. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/file.c | 4 +-- fs/gfs2/incore.h | 1 + fs/gfs2/rgrp.c | 73 ++++++++++++++++++++++++++------------------ fs/gfs2/trace_gfs2.h | 8 +++-- 4 files changed, 52 insertions(+), 34 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 45a17b770d97d..0106abe8c6ee0 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1022,8 +1022,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t goto out_qunlock; /* check if the selected rgrp limits our max_blks further */ - if (ap.allowed && ap.allowed < max_blks) - max_blks = ap.allowed; + if (ip->i_res.rs_reserved < max_blks) + max_blks = ip->i_res.rs_reserved; /* Almost done. Calculate bytes that can be written using * max_blks. 
We also recompute max_bytes, data_blocks and diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index aaab2af6a2d98..ca25043fc26df 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -288,6 +288,7 @@ struct gfs2_blkreserv { struct gfs2_rgrpd *rs_rgd; u64 rs_start; /* start of reservation */ u32 rs_free; /* how many blocks are still free */ + u32 rs_reserved; /* number of reserved blocks */ }; /* diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 733e21cd4cf25..1f427459a584d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -675,9 +675,6 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) u64 last_block = rs->rs_start + rs->rs_free - 1; struct gfs2_bitmap *start, *last; - /* return reserved blocks to the rgrp */ - BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free); - rs->rs_rgd->rd_reserved -= rs->rs_free; /* The rgrp extent failure point is likely not to increase; it will only do so if the freed blocks are somehow contiguous with a span of free blocks that follows. Still, @@ -1546,39 +1543,31 @@ static void rs_insert(struct gfs2_inode *ip) rb_link_node(&rs->rs_node, parent, newn); rb_insert_color(&rs->rs_node, &rgd->rd_rstree); - - /* Do our rgrp accounting for the reservation */ - rgd->rd_reserved += rs->rs_free; /* blocks reserved */ spin_unlock(&rgd->rd_rsspin); trace_gfs2_rs(rs, TRACE_RS_INSERT); } /** - * rgd_free - return the number of free blocks we can allocate. + * rgd_free - compute the number of blocks we can allocate * @rgd: the resource group * - * This function returns the number of free blocks for an rgrp. - * That's the clone-free blocks (blocks that are free, not including those - * still being used for unlinked files that haven't been deleted.) - * - * It also subtracts any blocks reserved by someone else, but does not - * include free blocks that are still part of our current reservation, - * because obviously we can (and will) allocate them. + * Compute the number of blocks we can allocate in @rgd. 
That's the clone-free + * blocks (blocks that are free, not including those still being used for + * unlinked files that haven't been deleted) minus the blocks currently + * reserved by any reservations other than @rs. */ static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs) { - u32 tot_reserved, tot_free; - - if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free)) - return 0; - tot_reserved = rgd->rd_reserved - rs->rs_free; - - if (rgd->rd_free_clone < tot_reserved) - tot_reserved = 0; + u32 free = 0; - tot_free = rgd->rd_free_clone - tot_reserved; - - return tot_free; + spin_lock(&rgd->rd_rsspin); + if (!WARN_ON_ONCE(rgd->rd_free_clone < rgd->rd_reserved)) { + free = rgd->rd_free_clone - rgd->rd_reserved; + if (rgd == rs->rs_rgd) + free += rs->rs_reserved; + } + spin_unlock(&rgd->rd_rsspin); + return free; } /** @@ -1606,7 +1595,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target); extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks); } - if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) + if (free_blocks < extlen) return; /* Find bitmap block that contains bits for goal block */ @@ -2061,8 +2050,7 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) * We try our best to find an rgrp that has at least ap->target blocks * available. After a couple of passes (loops == 2), the prospects of finding * such an rgrp diminish. At this stage, we return the first rgrp that has - * at least ap->min_target blocks available. Either way, we set ap->allowed to - * the number of blocks available in the chosen rgrp. + * at least ap->min_target blocks available. 
* * Returns: 0 on success, * -ENOMEM if a suitable rgrp can't be found @@ -2079,6 +2067,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) int loops = 0; u32 free_blocks, skip = 0; + BUG_ON(rs->rs_reserved); + if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; if (gfs2_assert_warn(sdp, ap->target)) @@ -2152,7 +2142,14 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) if (free_blocks >= ap->target || (loops == 2 && ap->min_target && free_blocks >= ap->min_target)) { - ap->allowed = free_blocks; + struct gfs2_rgrpd *rgd = rs->rs_rgd; + + rs->rs_reserved = ap->target; + if (rs->rs_reserved > free_blocks) + rs->rs_reserved = free_blocks; + spin_lock(&rs->rs_rgd->rd_rsspin); + rgd->rd_reserved += rs->rs_reserved; + spin_unlock(&rs->rs_rgd->rd_rsspin); return 0; } check_rgrp: @@ -2204,6 +2201,17 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) void gfs2_inplace_release(struct gfs2_inode *ip) { + struct gfs2_blkreserv *rs = &ip->i_res; + + if (rs->rs_reserved) { + struct gfs2_rgrpd *rgd = rs->rs_rgd; + + spin_lock(&rgd->rd_rsspin); + BUG_ON(rgd->rd_reserved < rs->rs_reserved); + rgd->rd_reserved -= rs->rs_reserved; + spin_unlock(&rs->rs_rgd->rd_rsspin); + rs->rs_reserved = 0; + } if (gfs2_holder_initialized(&ip->i_rgd_gh)) gfs2_glock_dq_uninit(&ip->i_rgd_gh); } @@ -2341,6 +2349,8 @@ static int gfs2_adjust_reservation(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd = rbm->rgd; int error = 0; + BUG_ON(rs->rs_reserved < len); + spin_lock(&rgd->rd_rsspin); if (unlikely(rgd->rd_free < len || rgd->rd_free_clone < len)) { fs_warn(sdp, "rgrp free block accounting error (%u %u %u)\n", @@ -2350,6 +2360,8 @@ static int gfs2_adjust_reservation(struct gfs2_inode *ip, } rgd->rd_free -= len; rgd->rd_free_clone -= len; + rgd->rd_reserved -= len; + rs->rs_reserved -= len; if (gfs2_rs_active(rs)) { u64 start = gfs2_rbm_to_block(rbm); @@ -2359,7 +2371,6 @@ static int gfs2_adjust_reservation(struct 
gfs2_inode *ip, rs->rs_start += len; rlen = min(rs->rs_free, len); rs->rs_free -= rlen; - rgd->rd_reserved -= rlen; trace_gfs2_rs(rs, TRACE_RS_CLAIM); if (rs->rs_start < rgd->rd_data0 + rgd->rd_data && rs->rs_free) @@ -2426,6 +2437,8 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, u64 block; /* block, within the file system scope */ int error; + BUG_ON(ip->i_res.rs_reserved < *nblocks); + gfs2_set_alloc_start(&rbm, ip, dinode); error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false); diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 7586c7629497f..282fcb1a242f9 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -598,6 +598,7 @@ TRACE_EVENT(gfs2_rs, __field( u64, inum ) __field( u64, start ) __field( u32, free ) + __field( u32, reserved ) __field( u8, func ) ), @@ -610,17 +611,20 @@ TRACE_EVENT(gfs2_rs, i_res)->i_no_addr; __entry->start = rs->rs_start; __entry->free = rs->rs_free; + __entry->reserved = rs->rs_reserved; __entry->func = func; ), - TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu", + TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu r:%lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->inum, (unsigned long long)__entry->start, (unsigned long long)__entry->rd_addr, (unsigned long)__entry->rd_free_clone, (unsigned long)__entry->rd_reserved, - rs_func_name(__entry->func), (unsigned long)__entry->free) + rs_func_name(__entry->func), + (unsigned long)__entry->free, + (unsigned long)__entry->reserved) ); #endif /* _TRACE_GFS2_H */ -- 2.19.1.546.g028f9c799.dirty