Add a rs_reserved field to struct gfs2_blkreserv to keep track of the number of blocks reserved by this particular reservation. When making a reservation with gfs2_inplace_reserve, this field is set to a value between ap->min_target and ap->target depending on the number of free blocks in the resource group. When allocating blocks with gfs2_alloc_blocks, rs_reserved is decremented accordingly. Eventually, any reserved but not consumed blocks are returned to the resource group by gfs2_inplace_release.
The reservation tree (rd_rstree) is unaffected by this change: the reservations it tracks are still advisory, and the sizes of those reservations (rs_free) are still determined by the tentative allocation sizes (i_sizehint). Since rd_reserved now tracks the number of reserved blocks rather than the number of tentatively reserved blocks, we may end up with slightly different allocation patterns, though. The rd_extfail_pt optimization will still cause ill-suited resource groups to be skipped quickly. Signed-off-by: Andreas Gruenbacher <[email protected]> --- fs/gfs2/file.c | 4 +- fs/gfs2/incore.h | 1 + fs/gfs2/lops.c | 1 + fs/gfs2/rgrp.c | 110 ++++++++++++++++++++++--------------------- fs/gfs2/trace_gfs2.h | 8 +++- 5 files changed, 66 insertions(+), 58 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index b39b339feddc..7246aaf86862 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1115,8 +1115,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t goto out_qunlock; /* check if the selected rgrp limits our max_blks further */ - if (ap.allowed && ap.allowed < max_blks) - max_blks = ap.allowed; + if (ip->i_res.rs_reserved < max_blks) + max_blks = ip->i_res.rs_reserved; /* Almost done. Calculate bytes that can be written using * max_blks. 
We also recompute max_bytes, data_blocks and diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index fb487e90eac7..6f0790d3f71d 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -287,6 +287,7 @@ struct gfs2_blkreserv { struct gfs2_rgrpd *rs_rgd; u64 rs_start; /* start of reservation */ u32 rs_free; /* how many blocks are still free */ + u32 rs_reserved; /* number of reserved blocks */ }; /* diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ed1da4323967..f621a021e21b 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) bd->bd_bh->b_data + bi->bi_offset, bi->bi_bytes); clear_bit(GBF_FULL, &bi->bi_flags); rgd->rd_free_clone = rgd->rd_free; + BUG_ON(rgd->rd_free_clone < rgd->rd_reserved); rgd->rd_extfail_pt = rgd->rd_free; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c4a19798d3aa..2f6eca015fab 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -646,10 +646,6 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) RB_CLEAR_NODE(&rs->rs_node); if (rs->rs_free) { - /* return reserved blocks to the rgrp */ - BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free); - rs->rs_rgd->rd_reserved -= rs->rs_free; - /* The rgrp extent failure point is likely not to increase; it will only do so if the freed blocks are somehow contiguous with a span of free blocks that follows. 
Still, @@ -1229,6 +1225,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) rgrp_set_bitmap_flags(rgd); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; + BUG_ON(rgd->rd_reserved); /* max out the rgrp allocation failure point */ rgd->rd_extfail_pt = rgd->rd_free; } @@ -1278,6 +1275,7 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd) rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free); rgrp_set_bitmap_flags(rgd); rgd->rd_free_clone = rgd->rd_free; + BUG_ON(rgd->rd_reserved); /* max out the rgrp allocation failure point */ rgd->rd_extfail_pt = rgd->rd_free; rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes); @@ -1515,41 +1513,10 @@ static void rs_insert(struct gfs2_inode *ip) rb_link_node(&rs->rs_node, parent, newn); rb_insert_color(&rs->rs_node, &rgd->rd_rstree); - - /* Do our rgrp accounting for the reservation */ - rgd->rd_reserved += rs->rs_free; /* blocks reserved */ spin_unlock(&rgd->rd_rsspin); trace_gfs2_rs(rs, TRACE_RS_INSERT); } -/** - * rgd_free - return the number of free blocks we can allocate. - * @rgd: the resource group - * - * This function returns the number of free blocks for an rgrp. - * That's the clone-free blocks (blocks that are free, not including those - * still being used for unlinked files that haven't been deleted.) - * - * It also subtracts any blocks reserved by someone else, but does not - * include free blocks that are still part of our current reservation, - * because obviously we can (and will) allocate them. 
- */ -static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs) -{ - u32 tot_reserved, tot_free; - - if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free)) - return 0; - tot_reserved = rgd->rd_reserved - rs->rs_free; - - if (rgd->rd_free_clone < tot_reserved) - tot_reserved = 0; - - tot_free = rgd->rd_free_clone - tot_reserved; - - return tot_free; -} - /** * rg_mblk_search - find a group of multiple free blocks to form a reservation * @rgd: the resource group descriptor @@ -1565,7 +1532,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, u64 goal; struct gfs2_blkreserv *rs = &ip->i_res; u32 extlen; - u32 free_blocks = rgd_free(rgd, rs); + u32 blocks_available; int ret; struct inode *inode = &ip->i_inode; @@ -1575,8 +1542,16 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target); extlen = max_t(u32, extlen, RGRP_RSRV_MINBLKS); } - if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) + + spin_lock(&rgd->rd_rsspin); + blocks_available = rgd->rd_free_clone - rgd->rd_reserved; + if (rgd == rs->rs_rgd) + blocks_available += rs->rs_reserved; + if (blocks_available < extlen) { + spin_unlock(&rgd->rd_rsspin); return; + } + spin_unlock(&rgd->rd_rsspin); /* Find bitmap block that contains bits for goal block */ if (rgrp_contains_block(rgd, ip->i_goal)) @@ -2024,8 +1999,7 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) * We try our best to find an rgrp that has at least ap->target blocks * available. After a couple of passes (loops == 2), the prospects of finding * such an rgrp diminish. At this stage, we return the first rgrp that has - * at least ap->min_target blocks available. Either way, we set ap->allowed to - * the number of blocks available in the chosen rgrp. + * at least ap->min_target blocks available. 
* * Returns: 0 on success, * -ENOMEM if a suitable rgrp can't be found @@ -2041,7 +2015,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) u64 last_unlinked = NO_BLOCK; u32 target = ap->target; int loops = 0; - u32 free_blocks, skip = 0; + u32 blocks_available, skip = 0; + + BUG_ON(rs->rs_reserved); if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; @@ -2062,6 +2038,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) return -EBADSLT; while (loops < 3) { + struct gfs2_rgrpd *rgd; + rg_locked = 1; if (!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) { @@ -2112,11 +2090,19 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) goto check_rgrp; /* If rgrp has enough free space, use it */ - free_blocks = rgd_free(rs->rs_rgd, rs); - if (free_blocks >= target) { - ap->allowed = free_blocks; - return 0; + rgd = rs->rs_rgd; + spin_lock(&rgd->rd_rsspin); + blocks_available = rgd->rd_free_clone - rgd->rd_reserved; + if (blocks_available < target) { + spin_unlock(&rgd->rd_rsspin); + goto check_rgrp; } + rs->rs_reserved = ap->target; + if (rs->rs_reserved > blocks_available) + rs->rs_reserved = blocks_available; + rgd->rd_reserved += rs->rs_reserved; + spin_unlock(&rgd->rd_rsspin); + return 0; check_rgrp: /* Check for unlinked inodes which can be reclaimed */ if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) @@ -2169,6 +2155,17 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) void gfs2_inplace_release(struct gfs2_inode *ip) { + struct gfs2_blkreserv *rs = &ip->i_res; + + if (rs->rs_reserved) { + struct gfs2_rgrpd *rgd = rs->rs_rgd; + + spin_lock(&rgd->rd_rsspin); + BUG_ON(rgd->rd_reserved < rs->rs_reserved); + rgd->rd_reserved -= rs->rs_reserved; + spin_unlock(&rgd->rd_rsspin); + rs->rs_reserved = 0; + } if (gfs2_holder_initialized(&ip->i_rgd_gh)) gfs2_glock_dq_uninit(&ip->i_rgd_gh); } @@ -2310,7 +2307,8 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip, struct 
gfs2_blkreserv *rs = &ip->i_res; struct gfs2_rgrpd *rgd = rbm->rgd; - spin_lock(&rgd->rd_rsspin); + BUG_ON(rs->rs_reserved < len); + rs->rs_reserved -= len; if (gfs2_rs_active(rs)) { u64 start = gfs2_rbm_to_block(rbm); @@ -2320,19 +2318,16 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip, rs->rs_start += len; rlen = min(rs->rs_free, len); rs->rs_free -= rlen; - rgd->rd_reserved -= rlen; trace_gfs2_rs(rs, TRACE_RS_CLAIM); if (rs->rs_start < rgd->rd_data0 + rgd->rd_data && rs->rs_free) - goto out; + return; /* We used up our block reservation, so we should reserve more blocks next time. */ atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint); } __rs_deltree(rs); } -out: - spin_unlock(&rgd->rd_rsspin); } /** @@ -2386,6 +2381,8 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, u32 minext = 1; int error = -ENOSPC; + BUG_ON(ip->i_res.rs_reserved < *nblocks); + if (gfs2_rs_active(&ip->i_res)) { gfs2_set_alloc_start(&rbm, ip, dinode); error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, &ip->i_res, false); @@ -2407,8 +2404,6 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_alloc_extent(&rbm, dinode, nblocks); block = gfs2_rbm_to_block(&rbm); rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0; - if (gfs2_rs_active(&ip->i_res)) - gfs2_adjust_reservation(ip, &rbm, *nblocks); if (!dinode) { ip->i_goal = block + *nblocks - 1; error = gfs2_meta_inode_buffer(ip, &dibh); @@ -2421,12 +2416,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, brelse(dibh); } } - if (rbm.rgd->rd_free < *nblocks) { + spin_lock(&rbm.rgd->rd_rsspin); + gfs2_adjust_reservation(ip, &rbm, *nblocks); + if (rbm.rgd->rd_reserved < *nblocks) { fs_warn(sdp, "nblocks=%u\n", *nblocks); + spin_unlock(&rbm.rgd->rd_rsspin); goto rgrp_error; } - + BUG_ON(rbm.rgd->rd_reserved < *nblocks); + BUG_ON(rbm.rgd->rd_free_clone < *nblocks); + BUG_ON(rbm.rgd->rd_free < *nblocks); + rbm.rgd->rd_reserved -= *nblocks; + 
rbm.rgd->rd_free_clone -= *nblocks; rbm.rgd->rd_free -= *nblocks; + spin_unlock(&rbm.rgd->rd_rsspin); if (dinode) { rbm.rgd->rd_dinodes++; *generation = rbm.rgd->rd_igeneration++; @@ -2443,7 +2446,6 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); - rbm.rgd->rd_free_clone -= *nblocks; trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); *bn = block; diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 7586c7629497..282fcb1a242f 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -598,6 +598,7 @@ TRACE_EVENT(gfs2_rs, __field( u64, inum ) __field( u64, start ) __field( u32, free ) + __field( u32, reserved ) __field( u8, func ) ), @@ -610,17 +611,20 @@ TRACE_EVENT(gfs2_rs, i_res)->i_no_addr; __entry->start = rs->rs_start; __entry->free = rs->rs_free; + __entry->reserved = rs->rs_reserved; __entry->func = func; ), - TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu", + TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu r:%lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->inum, (unsigned long long)__entry->start, (unsigned long long)__entry->rd_addr, (unsigned long)__entry->rd_free_clone, (unsigned long)__entry->rd_reserved, - rs_func_name(__entry->func), (unsigned long)__entry->free) + rs_func_name(__entry->func), + (unsigned long)__entry->free, + (unsigned long)__entry->reserved) ); #endif /* _TRACE_GFS2_H */ -- 2.26.2
