Add two more btrees analogous to the rebalance_work btree: bitset btrees
that refer to extents in the extents and reflink btrees.

rebalance_work_hipri: this is for extents that are being moved off of a
BCH_MEMBER_STATE_failed device (device evacuates; we'll likely rename
that member state).

This is to ensure that evacuates aren't blocked by other work, e.g.
background_compression.

rebalance_work_pending: this is for extents that we'd like to move but
can't - currently this only happens when the target is full. When we
detect this we can set the pending bit in bch_extent_rebalance; extents
in this btree won't be processed until some external event happens (e.g.
a new device is added to the array).

This fixes a bug where rebalance will spin when more data is stored in
the filesystem than fits in the target specified (e.g. a tiered SSD/HDD
setup with more data than fits on background_target).

NOTE - we'd really like extents in rebalance_work_pending to
additionally be indexed by the target we want to move them to; a large
complicated setup can have many targets with different directories on
different targets. We don't want to have to scan all of
rebalance_work_pending, just the extents for the target that now has
free space.

We can't do that yet, though: that will require support for btrees with
larger sized integer keys (currently btree keys are fixed at 160 bit
integers).

Signed-off-by: Kent Overstreet <[email protected]>
---
 fs/bcachefs/bcachefs_format.h |   8 +++
 fs/bcachefs/buckets.c         |  32 +---------
 fs/bcachefs/extents.c         |  10 ----
 fs/bcachefs/extents.h         |  11 ++++
 fs/bcachefs/rebalance.c       | 106 +++++++++++++++++++++++++++++-----
 fs/bcachefs/rebalance.h       |  20 ++++++-
 6 files changed, 129 insertions(+), 58 deletions(-)

diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index b2de993d802b..28c0c876e14b 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1425,6 +1425,14 @@ enum btree_id_flags {
          BTREE_IS_snapshot_field|                                              
\
          BTREE_IS_write_buffer,                                                
\
          BIT_ULL(KEY_TYPE_accounting))                                         
\
+       x(rebalance_work_hipri,         21,                                     
\
+         BTREE_IS_snapshot_field|                                              
\
+         BTREE_IS_write_buffer,                                                
\
+         BIT_ULL(KEY_TYPE_set))                                                
\
+       x(rebalance_work_pending,       22,                                     
\
+         BTREE_IS_snapshot_field|                                              
\
+         BTREE_IS_write_buffer,                                                
\
+         BIT_ULL(KEY_TYPE_set))                                                
\
 
 enum btree_id {
 #define x(name, nr, ...) BTREE_ID_##name = nr,
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 436634a5f77c..afa97af1fb8b 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -901,35 +901,9 @@ int bch2_trigger_extent(struct btree_trans *trans,
                                return ret;
                }
 
-               unsigned old_r = bch2_bkey_needs_rb(old);
-               unsigned new_r = bch2_bkey_needs_rb(new.s_c);
-               if (old_r != new_r) {
-                       /* XXX: slowpath, put in a a separate function */
-                       int delta = (int) !!new_r - (int) !!old_r;
-                       if ((flags & BTREE_TRIGGER_transactional) && delta) {
-                               int ret = bch2_btree_bit_mod_buffered(trans, 
BTREE_ID_rebalance_work,
-                                                                 new.k->p, 
delta > 0);
-                               if (ret)
-                                       return ret;
-                       }
-
-                       s64 v[1] = { 0 };
-#define x(n)                                                                   
        \
-                       if ((old_r ^ new_r) & BIT(BCH_REBALANCE_##n)) {         
        \
-                               v[0] = old_r & BIT(BCH_REBALANCE_##n)           
        \
-                                       ? -(s64) old.k->size                    
        \
-                                       :        new.k->size;                   
        \
-                                                                               
        \
-                               int ret = bch2_disk_accounting_mod2(trans,      
        \
-                                                       flags & 
BTREE_TRIGGER_gc,       \
-                                                       v, rebalance_work,      
        \
-                                                       BCH_REBALANCE_##n);     
        \
-                               if (ret)                                        
        \
-                                       return ret;                             
        \
-                       }
-                       BCH_REBALANCE_OPTS()
-#undef x
-               }
+               int ret = bch2_trigger_extent_rebalance(trans, old, new, flags);
+               if (ret)
+                       return ret;
        }
 
        return 0;
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 016242ffc98d..6c3964d3efca 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -803,16 +803,6 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct 
bkey_s_c k)
        return replicas;
 }
 
-static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct 
extent_ptr_decoded *p)
-{
-       if (p->ptr.cached)
-               return 0;
-
-       return p->has_ec
-               ? p->ec.redundancy + 1
-               : ca->mi.durability;
-}
-
 unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct 
extent_ptr_decoded *p)
 {
        struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->ptr.dev);
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 03ea7c689d9a..934754e36854 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -603,6 +603,17 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c);
 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
 
 unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
+
+static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct 
extent_ptr_decoded *p)
+{
+       if (p->ptr.cached)
+               return 0;
+
+       return p->has_ec
+               ? p->ec.redundancy + 1
+               : ca->mi.durability;
+}
+
 unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct 
extent_ptr_decoded *);
 unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded 
*);
 unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 525e4d1716c5..b5907e72bcfc 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -46,6 +46,60 @@ const struct bch_extent_rebalance 
*bch2_bkey_rebalance_opts(struct bkey_s_c k)
        return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
 }
 
+static enum btree_id rb_work_btree(const struct bch_extent_rebalance *r)
+{
+       if (!r || !r->need_rb)
+               return 0;
+       if (r->hipri)
+               return BTREE_ID_rebalance_work_hipri;
+       if (r->pending)
+               return BTREE_ID_rebalance_work_pending;
+       return BTREE_ID_rebalance_work;
+}
+
+int __bch2_trigger_extent_rebalance(struct btree_trans *trans,
+                                   struct bkey_s_c old, struct bkey_s_c new,
+                                   enum btree_iter_update_trigger_flags flags)
+{
+       const struct bch_extent_rebalance *old_r = 
bch2_bkey_rebalance_opts(old);
+       const struct bch_extent_rebalance *new_r = 
bch2_bkey_rebalance_opts(new);
+
+       enum btree_id old_btree = rb_work_btree(old_r);
+       enum btree_id new_btree = rb_work_btree(new_r);
+
+       if (old_btree && old_btree != new_btree) {
+               int ret = bch2_btree_bit_mod_buffered(trans, old_btree, 
old.k->p, false);
+               if (ret)
+                       return ret;
+       }
+
+       if (new_btree && old_btree != new_btree) {
+               int ret = bch2_btree_bit_mod_buffered(trans, new_btree, 
new.k->p, true);
+               if (ret)
+                       return ret;
+       }
+
+       unsigned old_n = old_r ? old_r->need_rb : 0;
+       unsigned new_n = new_r ? new_r->need_rb : 0;
+
+       s64 v[1] = { 0 };
+#define x(n)                                                                   
\
+       if ((old_n ^ new_n) & BIT(BCH_REBALANCE_##n)) {                         
\
+               v[0] = old_n & BIT(BCH_REBALANCE_##n)                           
\
+                       ? -(s64) old.k->size                                    
\
+                       :        new.k->size;                                   
\
+                                                                               
\
+               int ret = bch2_disk_accounting_mod2(trans,                      
\
+                                                   flags & BTREE_TRIGGER_gc,   
\
+                                                   v, rebalance_work,          
\
+                                                   BCH_REBALANCE_##n);         
\
+               if (ret)                                                        
\
+                       return ret;                                             
\
+       }
+       BCH_REBALANCE_OPTS()
+#undef x
+}
+
 static struct bch_extent_rebalance
 bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
                          struct bch_inode_opts *opts,
@@ -64,10 +118,6 @@ bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c 
k,
        if (bch2_bkey_extent_ptrs_flags(ptrs) & 
BIT_ULL(BCH_EXTENT_FLAG_poisoned))
                return r;
 
-       const struct bch_extent_rebalance *old_r = 
bch2_bkey_ptrs_rebalance_opts(ptrs);
-       if (old_r)
-               r = *old_r;
-
 #define x(_name)                                                       \
        if (k.k->type != KEY_TYPE_reflink_v ||                          \
            may_update_indirect ||                                      \
@@ -96,6 +146,10 @@ bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c 
k,
                incompressible  |= p.crc.compression_type == 
BCH_COMPRESSION_TYPE_incompressible;
                unwritten       |= p.ptr.unwritten;
 
+               struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
+               if (!ca)
+                       goto next;
+
                if (!p.ptr.cached) {
                        if (p.crc.compression_type != compression_type)
                                *compress_ptrs |= ptr_idx;
@@ -106,13 +160,18 @@ bch2_bkey_needs_rebalance(struct bch_fs *c, struct 
bkey_s_c k,
                        if (target && !bch2_dev_in_target(c, p.ptr.dev, target))
                                *move_ptrs |= ptr_idx;
 
-                       unsigned d = bch2_extent_ptr_durability(c, &p);
+                       if (ca->mi.state == BCH_MEMBER_STATE_failed)
+                               r.hipri = 1;
+
+                       unsigned d = ca->mi.state != BCH_MEMBER_STATE_failed
+                               ? __extent_ptr_durability(ca, &p)
+                               : 0;
                        durability += d;
                        min_durability = min(min_durability, d);
 
                        ec |= p.has_ec;
                }
-
+next:
                ptr_idx <<= 1;
        }
 
@@ -131,6 +190,10 @@ bch2_bkey_needs_rebalance(struct bch_fs *c, struct 
bkey_s_c k,
                r.need_rb |= BIT(BCH_REBALANCE_data_replicas);
        if (*move_ptrs)
                r.need_rb |= BIT(BCH_REBALANCE_background_target);
+
+       const struct bch_extent_rebalance *old = 
bch2_bkey_ptrs_rebalance_opts(ptrs);
+       if (old && !(old->need_rb & ~r.need_rb))
+               r.pending = old->pending;
        return r;
 }
 
@@ -1152,25 +1215,29 @@ int bch2_fs_rebalance_init(struct bch_fs *c)
        return 0;
 }
 
+/* need better helpers for iterating in parallel */
+
 static int check_rebalance_work_one(struct btree_trans *trans,
                                    struct btree_iter *extent_iter,
                                    struct btree_iter *rebalance_iter,
+                                   struct btree_iter *rebalance_hipri_iter,
+                                   struct btree_iter *rebalance_pending_iter,
                                    struct per_snapshot_io_opts 
*snapshot_io_opts,
                                    struct bkey_buf *last_flushed)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c extent_k, rebalance_k;
+       struct bkey_s_c extent_k, rb_k;
        CLASS(printbuf, buf)();
 
        int ret = bkey_err(extent_k     = bch2_btree_iter_peek(extent_iter)) ?:
-                 bkey_err(rebalance_k  = bch2_btree_iter_peek(rebalance_iter));
+                 bkey_err(rb_k = bch2_btree_iter_peek(rebalance_iter));
        if (ret)
                return ret;
 
        if (!extent_k.k &&
            extent_iter->btree_id == BTREE_ID_reflink &&
-           (!rebalance_k.k ||
-            rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
+           (!rb_k.k ||
+            rb_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
                bch2_trans_iter_exit(extent_iter);
                bch2_trans_iter_init(trans, extent_iter,
                                     BTREE_ID_extents, POS_MIN,
@@ -1179,25 +1246,25 @@ static int check_rebalance_work_one(struct btree_trans 
*trans,
                return bch_err_throw(c, transaction_restart_nested);
        }
 
-       if (!extent_k.k && !rebalance_k.k)
+       if (!extent_k.k && !rb_k.k)
                return 1;
 
        int cmp = bpos_cmp(extent_k.k    ? extent_k.k->p    : SPOS_MAX,
-                          rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);
+                          rb_k.k       ? rb_k.k->p : SPOS_MAX);
 
        struct bkey deleted;
        bkey_init(&deleted);
 
        if (cmp < 0) {
                deleted.p = extent_k.k->p;
-               rebalance_k.k = &deleted;
+               rb_k.k = &deleted;
        } else if (cmp > 0) {
-               deleted.p = rebalance_k.k->p;
+               deleted.p = rb_k.k->p;
                extent_k.k = &deleted;
        }
 
        bool should_have_rebalance = bch2_bkey_needs_rb(extent_k);
-       bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
+       bool have_rebalance = rb_k.k->type == KEY_TYPE_set;
 
        if (should_have_rebalance != have_rebalance) {
                ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, 
last_flushed);
@@ -1248,6 +1315,10 @@ int bch2_check_rebalance_work(struct bch_fs *c)
                                       BTREE_ITER_prefetch);
        CLASS(btree_iter, rebalance_iter)(trans, BTREE_ID_rebalance_work, 
POS_MIN,
                                          BTREE_ITER_prefetch);
+       CLASS(btree_iter, rebalance_hipri_iter)(trans, 
BTREE_ID_rebalance_work_hipri, POS_MIN,
+                                         BTREE_ITER_prefetch);
+       CLASS(btree_iter, rebalance_pending_iter)(trans, 
BTREE_ID_rebalance_work_pending, POS_MIN,
+                                         BTREE_ITER_prefetch);
 
        struct per_snapshot_io_opts snapshot_io_opts;
        per_snapshot_io_opts_init(&snapshot_io_opts, c);
@@ -1265,7 +1336,10 @@ int bch2_check_rebalance_work(struct bch_fs *c)
 
                bch2_trans_begin(trans);
 
-               ret = check_rebalance_work_one(trans, &extent_iter, 
&rebalance_iter,
+               ret = check_rebalance_work_one(trans, &extent_iter,
+                                              &rebalance_iter,
+                                              &rebalance_hipri_iter,
+                                              &rebalance_pending_iter,
                                               &snapshot_io_opts, 
&last_flushed);
 
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index f6b74d5e1210..27c36568a626 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -22,10 +22,24 @@ static inline struct bch_extent_rebalance 
io_opts_to_rebalance_opts(struct bch_f
 
 const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
 
-static inline int bch2_bkey_needs_rb(struct bkey_s_c k)
+int __bch2_trigger_extent_rebalance(struct btree_trans *,
+                                   struct bkey_s_c, struct bkey_s_c,
+                                   enum btree_iter_update_trigger_flags);
+
+static inline int bch2_trigger_extent_rebalance(struct btree_trans *trans,
+                                 struct bkey_s_c old, struct bkey_s_c new,
+                                 enum btree_iter_update_trigger_flags flags)
 {
-       const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
-       return r ? r->need_rb : 0;
+       const struct bch_extent_rebalance *old_r = 
bch2_bkey_rebalance_opts(old);
+       const struct bch_extent_rebalance *new_r = 
bch2_bkey_rebalance_opts(new);
+
+       if ((!old_r && !new_r) ||
+           (old_r->need_rb     == new_r->need_rb &&
+            old_r->hipri       == new_r->hipri &&
+            old_r->pending     == new_r->pending))
+               return 0;
+
+       return __bch2_trigger_extent_rebalance(trans, old, new, flags);
 }
 
 /* Inodes in different snapshots may have different IO options: */
-- 
2.50.1


Reply via email to