"Extent needs rebalance for data_replicas" cannot be a pure function of
the extent with the current bch_extent_rebalance, which is required for
the triggers - they would create inconsistencies in the rebalance_work
accounting and btrees when e.g. device durability changes.

So: this adds bch_extent_rebalance.needs_rb, which has flags for all the
reasons rebalance may need to process an extent: the trigger now uses
just these flags instead of running the full "does this extent need
rebalance" calculations in bch2_bkey_sectors_need_rebalance().

Additionally:
- Instead of a single accounting counter for pending rebalance work,
  this is now split out into different counters for the different io
  path options rebalance handles (compression, data_checksum, replicas,
  erasure_code, etc.)

- "Does this extent need to be rebalanced?" is now centralized in
  bch2_bkey_set_needs_rebalance()

- "Is new rebalance_work allowed in this context" is
  new_needs_rb_allowed() - this enforces that extents match the
  specified io path options, with clearly defined exceptions (e.g.
  accounting for races with option changes, and foreground writes are
  allowed to add background_compression and background_target work)

XXX: split this patch up more

XXX: define a new on-disk format version, and upgrade/downgrade table
entries

Compatibility notes: still undecided if we'll stick with redefining the
existing bch_extent_rebalance, or add a new extent entry type for
bch_extent_rebalance_v2 - there are pros and cons to both

If we redefine the existing bch_extent_rebalance, on upgrade
check_rebalance_work will correct all the existing bch_extent_rebalance
entries (along with accounting, rebalance_work btrees) - except indirect
extents will need special handling, which we likely need anyways

On downgrade, old versions don't have a recovery pass that
checks/fixes bch_extent_rebalance from the io path options - but they do
that on data move, so we're probably more or less ok; some wonkiness in
rebalance_work accounting would be expected

Adding a bch_extent_rebalance_v2 would be an incompatible upgrade
(adding new extent entry types is always an incompatible upgrade,
unfortunately) - and it'd require keeping around compatibility code for
e.g. the triggers to handle the old bch_extent_rebalance...

Signed-off-by: Kent Overstreet <[email protected]>
---
 fs/bcachefs/buckets.c                |  50 ++--
 fs/bcachefs/data_update.c            |  26 --
 fs/bcachefs/disk_accounting_format.h |   1 +
 fs/bcachefs/rebalance.c              | 403 ++++++++++++++-------------
 fs/bcachefs/rebalance.h              |  21 +-
 fs/bcachefs/rebalance_format.h       |  62 +++--
 fs/bcachefs/trace.h                  |   5 -
 7 files changed, 288 insertions(+), 280 deletions(-)

diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 6be1cc9ba0da..436634a5f77c 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -871,7 +871,6 @@ int bch2_trigger_extent(struct btree_trans *trans,
                        struct bkey_s_c old, struct bkey_s new,
                        enum btree_iter_update_trigger_flags flags)
 {
-       struct bch_fs *c = trans->c;
        struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
        struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
        unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) 
new_ptrs.start;
@@ -902,29 +901,34 @@ int bch2_trigger_extent(struct btree_trans *trans,
                                return ret;
                }
 
-               int need_rebalance_delta = 0;
-               s64 need_rebalance_sectors_delta[1] = { 0 };
-
-               s64 s = bch2_bkey_sectors_need_rebalance(c, old);
-               need_rebalance_delta -= s != 0;
-               need_rebalance_sectors_delta[0] -= s;
-
-               s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
-               need_rebalance_delta += s != 0;
-               need_rebalance_sectors_delta[0] += s;
-
-               if ((flags & BTREE_TRIGGER_transactional) && 
need_rebalance_delta) {
-                       int ret = bch2_btree_bit_mod_buffered(trans, 
BTREE_ID_rebalance_work,
-                                                         new.k->p, 
need_rebalance_delta > 0);
-                       if (ret)
-                               return ret;
-               }
+               unsigned old_r = bch2_bkey_needs_rb(old);
+               unsigned new_r = bch2_bkey_needs_rb(new.s_c);
+               if (old_r != new_r) {
+                       /* XXX: slowpath, put in a separate function */
+                       int delta = (int) !!new_r - (int) !!old_r;
+                       if ((flags & BTREE_TRIGGER_transactional) && delta) {
+                               int ret = bch2_btree_bit_mod_buffered(trans, 
BTREE_ID_rebalance_work,
+                                                                 new.k->p, 
delta > 0);
+                               if (ret)
+                                       return ret;
+                       }
 
-               if (need_rebalance_sectors_delta[0]) {
-                       int ret = bch2_disk_accounting_mod2(trans, flags & 
BTREE_TRIGGER_gc,
-                                                           
need_rebalance_sectors_delta, rebalance_work);
-                       if (ret)
-                               return ret;
+                       s64 v[1] = { 0 };
+#define x(n)                                                                   
        \
+                       if ((old_r ^ new_r) & BIT(BCH_REBALANCE_##n)) {         
        \
+                               v[0] = old_r & BIT(BCH_REBALANCE_##n)           
        \
+                                       ? -(s64) old.k->size                    
        \
+                                       :        new.k->size;                   
        \
+                                                                               
        \
+                               int ret = bch2_disk_accounting_mod2(trans,      
        \
+                                                       flags & 
BTREE_TRIGGER_gc,       \
+                                                       v, rebalance_work,      
        \
+                                                       BCH_REBALANCE_##n);     
        \
+                               if (ret)                                        
        \
+                                       return ret;                             
        \
+                       }
+                       BCH_REBALANCE_OPTS()
+#undef x
                }
        }
 
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 0f968bab7d93..2466f7a1c9e6 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -207,28 +207,6 @@ static void trace_data_update2(struct data_update *m,
        trace_data_update(c, buf.buf);
 }
 
-noinline_for_stack
-static void trace_io_move_created_rebalance2(struct data_update *m,
-                                            struct bkey_s_c old, struct 
bkey_s_c k,
-                                            struct bkey_i *insert)
-{
-       struct bch_fs *c = m->op.c;
-       CLASS(printbuf, buf)();
-
-       bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
-
-       prt_str(&buf, "\nold: ");
-       bch2_bkey_val_to_text(&buf, c, old);
-       prt_str(&buf, "\nk:   ");
-       bch2_bkey_val_to_text(&buf, c, k);
-       prt_str(&buf, "\nnew: ");
-       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
-       trace_io_move_created_rebalance(c, buf.buf);
-
-       count_event(c, io_move_created_rebalance);
-}
-
 noinline_for_stack
 static int data_update_invalid_bkey(struct data_update *m,
                                    struct bkey_s_c old, struct bkey_s_c k,
@@ -449,10 +427,6 @@ static int __bch2_data_update_index_update(struct 
btree_trans *trans,
                if (trace_data_update_enabled())
                        trace_data_update2(m, old, k, insert);
 
-               if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) 
* k.k->size >
-                   bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
-                       trace_io_move_created_rebalance2(m, old, k, insert);
-
                ret =   bch2_trans_commit(trans, &op->res,
                                NULL,
                                BCH_TRANS_COMMIT_no_check_rw|
diff --git a/fs/bcachefs/disk_accounting_format.h 
b/fs/bcachefs/disk_accounting_format.h
index 8269af1dbe2a..4aa7f83f5d75 100644
--- a/fs/bcachefs/disk_accounting_format.h
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -200,6 +200,7 @@ struct bch_acct_inum {
  * move, extents counted here are also in the rebalance_work btree.
  */
 struct bch_acct_rebalance_work {
+       __u8                    opt;
 };
 
 struct disk_accounting_pos {
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 33bddbd33088..3a6cd54613a1 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -40,49 +40,54 @@ static const struct bch_extent_rebalance 
*bch2_bkey_ptrs_rebalance_opts(struct b
        return NULL;
 }
 
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct 
bkey_s_c k)
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
 {
        return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
 }
 
-static void bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
-                                     struct bch_inode_opts *io_opts,
-                                     unsigned *move_ptrs,
-                                     unsigned *compress_ptrs,
-                                     unsigned *csum_ptrs,
-                                     u64 *sectors)
+static struct bch_extent_rebalance
+bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k,
+                         struct bch_inode_opts *opts,
+                         unsigned *move_ptrs,
+                         unsigned *compress_ptrs,
+                         unsigned *csum_ptrs,
+                         bool may_update_indirect)
 {
        *move_ptrs      = 0;
        *compress_ptrs  = 0;
        *csum_ptrs      = 0;
-       *sectors        = 0;
 
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
-       const struct bch_extent_rebalance *rb_opts = 
bch2_bkey_ptrs_rebalance_opts(ptrs);
-       if (!io_opts && !rb_opts)
-               return;
+       struct bch_extent_rebalance r = { .type = 
BIT(BCH_EXTENT_ENTRY_rebalance) };
 
        if (bch2_bkey_extent_ptrs_flags(ptrs) & 
BIT_ULL(BCH_EXTENT_FLAG_poisoned))
-               return;
-
-       unsigned compression_type =
-               bch2_compression_opt_to_type(io_opts
-                                            ? io_opts->background_compression
-                                            : rb_opts->background_compression);
-       unsigned csum_type = bch2_csum_opt_to_type(io_opts
-                                                  ? io_opts->data_checksum
-                                                  : rb_opts->data_checksum, 
true);
-       unsigned target = io_opts
-               ? io_opts->background_target
-               : rb_opts->background_target;
+               return r;
+
+       const struct bch_extent_rebalance *old_r = 
bch2_bkey_ptrs_rebalance_opts(ptrs);
+       if (old_r)
+               r = *old_r;
+
+#define x(_name)                                                       \
+       if (k.k->type != KEY_TYPE_reflink_v ||                          \
+           may_update_indirect ||                                      \
+           (!opts->_name##_from_inode && !r._name##_from_inode)) {     \
+               r._name                 = opts->_name;                  \
+               r._name##_from_inode    = opts->_name##_from_inode;     \
+       }
+       BCH_REBALANCE_OPTS()
+#undef x
+
+       unsigned compression_type = 
bch2_compression_opt_to_type(r.background_compression);
+       unsigned csum_type      = bch2_csum_opt_to_type(r.data_checksum, true);
+       unsigned target         = r.background_target;
        if (target && !bch2_target_accepts_data(c, BCH_DATA_user, target))
                target = 0;
 
+       bool incompressible = false, unwritten = false, ec = false;
+       unsigned durability = 0, min_durability = INT_MAX;
+
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
-       bool incompressible = false, unwritten = false;
-
        unsigned ptr_idx = 1;
 
        guard(rcu)();
@@ -99,6 +104,12 @@ static void bch2_bkey_needs_rebalance(struct bch_fs *c, 
struct bkey_s_c k,
 
                        if (target && !bch2_dev_in_target(c, p.ptr.dev, target))
                                *move_ptrs |= ptr_idx;
+
+                       unsigned d = bch2_extent_ptr_durability(c, &p);
+                       durability += d;
+                       min_durability = min(min_durability, d);
+
+                       ec |= p.has_ec;
                }
 
                ptr_idx <<= 1;
@@ -109,48 +120,123 @@ static void bch2_bkey_needs_rebalance(struct bch_fs *c, 
struct bkey_s_c k,
        if (incompressible)
                *compress_ptrs = 0;
 
-       unsigned rb_ptrs = *move_ptrs | *compress_ptrs | *csum_ptrs;
-
-       if (!rb_ptrs)
-               return;
-
-       ptr_idx = 1;
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               if (rb_ptrs & ptr_idx)
-                       *sectors += p.crc.compressed_size;
-               ptr_idx <<= 1;
-       }
+       if (*csum_ptrs)
+               r.need_rb |= BIT(BCH_REBALANCE_data_checksum);
+       if (*compress_ptrs)
+               r.need_rb |= BIT(BCH_REBALANCE_background_compression);
+       if (r.erasure_code != ec)
+               r.need_rb |= BIT(BCH_REBALANCE_erasure_code);
+       if (durability < r.data_replicas || durability >= r.data_replicas + 
min_durability)
+               r.need_rb |= BIT(BCH_REBALANCE_data_replicas);
+       if (*move_ptrs)
+               r.need_rb |= BIT(BCH_REBALANCE_background_target);
+       return r;
 }
 
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
+static int check_rebalance_scan_cookie(struct btree_trans *trans, u64 inum, 
bool *v)
 {
-       unsigned move_ptrs      = 0;
-       unsigned compress_ptrs  = 0;
-       unsigned csum_ptrs      = 0;
-       u64 sectors             = 0;
+       if (*v)
+               return 0;
 
-       bch2_bkey_needs_rebalance(c, k, NULL, &move_ptrs, &compress_ptrs, 
&csum_ptrs, &sectors);
-       return sectors;
+       /*
+        * If opts need to be propagated to the extent, a scan cookie should be
+        * present:
+        */
+       CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work,
+                               SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+                               BTREE_ITER_intent);
+       struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+       int ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       *v = k.k->type == KEY_TYPE_cookie;
+       return 0;
 }
 
-static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
-                                             struct bch_inode_opts *opts,
-                                             struct bkey_s_c k)
+static int new_needs_rb_allowed(struct btree_trans *trans,
+                               struct per_snapshot_io_opts *s,
+                               struct bkey_s_c k,
+                               enum set_needs_rebalance_ctx ctx,
+                               unsigned opt_change_cookie,
+                               const struct bch_extent_rebalance *old,
+                               const struct bch_extent_rebalance *new,
+                               unsigned new_need_rb)
 {
-       unsigned move_ptrs      = 0;
-       unsigned compress_ptrs  = 0;
-       unsigned csum_ptrs      = 0;
-       u64 sectors             = 0;
+       struct bch_fs *c = trans->c;
+       /*
+        * New need_rb - pointers that don't match the current io path options -
+        * are only allowed in certain situations:
+        *
+        * Propagating new options: from bch2_set_rebalance_needs_scan
+        *
+        * Foreground writes: background_compression and background_target are
+        * allowed
+        *
+        * Foreground writes: we may have raced with an option change:
+        * opt_change_cookie checks for this
+        *
+        * XXX: foreground writes should still match compression,
+        * foreground_target - figure out how to check for this
+        */
+       if (ctx == SET_NEEDS_REBALANCE_opt_change ||
+           ctx == SET_NEEDS_REBALANCE_opt_change_indirect)
+               return 0;
 
-       bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, &compress_ptrs, 
&csum_ptrs, &sectors);
-       return move_ptrs|compress_ptrs|csum_ptrs;
-}
+       if (ctx == SET_NEEDS_REBALANCE_foreground) {
+               new_need_rb &= ~(BIT(BCH_REBALANCE_background_compression)|
+                                BIT(BCH_REBALANCE_background_target));
+               if (!new_need_rb)
+                       return 0;
 
-static inline bool bkey_should_have_rb_opts(struct bch_fs *c,
-                                           struct bch_inode_opts *opts,
-                                           struct bkey_s_c k)
-{
-       return k.k->type == KEY_TYPE_reflink_v || 
bch2_bkey_ptrs_need_rebalance(c, opts, k);
+               if (opt_change_cookie != atomic_read(&c->opt_change_cookie))
+                       return 0;
+       }
+
+       /*
+        * Either the extent data or the extent io options (from
+        * bch_extent_rebalance) should match the io_opts from the
+        * inode/filesystem, unless
+        *
+        * - There's a scan pending to propagate new options
+        * - It's an indirect extent: it may be referenced by inodes
+        *   with inconsistent options
+        *
+        * For efficiency (so that we can cache checking for scan
+        * cookies), only check option consistency when we're called
+        * with snapshot_io_opts - don't bother when we're called from
+        * move_data_phys() -> get_io_opts_one()
+        *
+        * Note that we can cache the existence of a cookie, but not the
+        * non-existence, to avoid spurious false positives.
+        */
+       bool scan_cookie = false;
+       int ret = check_rebalance_scan_cookie(trans, 0,                 s ? 
&s->fs_scan_cookie : &scan_cookie) ?:
+                 check_rebalance_scan_cookie(trans, k.k->p.inode,      s ? 
&s->inum_scan_cookie : &scan_cookie);
+       if (ret)
+               return ret;
+
+       if (scan_cookie)
+               return 0;
+
+       CLASS(printbuf, buf)();
+
+       prt_printf(&buf, "extent with incorrect/missing rebalance opts:\n");
+       bch2_bkey_val_to_text(&buf, c, k);
+
+       const struct bch_extent_rebalance _old = {};
+       if (!old)
+               old = &_old;
+
+#define x(_name)                                                               
\
+       if (new_need_rb & BIT(BCH_REBALANCE_##_name))                           
\
+               prt_printf(&buf, "\n" #_name " %u != %u", old->_name, 
new->_name);
+       BCH_REBALANCE_OPTS()
+#undef x
+
+       fsck_err(trans, extent_io_opts_not_set, "%s", buf.buf);
+fsck_err:
+       return ret;
 }
 
 int bch2_bkey_set_needs_rebalance(struct btree_trans *trans,
@@ -158,7 +244,7 @@ int bch2_bkey_set_needs_rebalance(struct btree_trans *trans,
                                  struct bch_inode_opts *opts,
                                  struct bkey_i *_k,
                                  enum set_needs_rebalance_ctx ctx,
-                                 u32 change_cookie)
+                                 unsigned opt_change_cookie)
 {
        if (!bkey_extent_is_direct_data(&_k->k))
                return 0;
@@ -168,51 +254,44 @@ int bch2_bkey_set_needs_rebalance(struct btree_trans 
*trans,
        struct bch_extent_rebalance *old =
                (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
 
-       if (bkey_should_have_rb_opts(c, opts, k.s_c)) {
-               if (!old) {
-                       old = bkey_val_end(k);
-                       k.k->u64s += sizeof(*old) / sizeof(u64);
-               }
+       unsigned move_ptrs      = 0;
+       unsigned compress_ptrs  = 0;
+       unsigned csum_ptrs      = 0;
+       struct bch_extent_rebalance new =
+               bch2_bkey_needs_rebalance(c, k.s_c, opts, &move_ptrs, 
&compress_ptrs, &csum_ptrs,
+                                         ctx == 
SET_NEEDS_REBALANCE_opt_change_indirect);
 
-               *old = io_opts_to_rebalance_opts(c, opts);
-       } else {
-               if (old)
-                       extent_entry_drop(k, (union bch_extent_entry *) old);
-       }
+       bool should_have_rb = k.k->type == KEY_TYPE_reflink_v || new.need_rb;
 
-       return 0;
-}
+       if (should_have_rb == !!old &&
+           (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old))
+               return 0;
 
-static int have_rebalance_scan_cookie(struct btree_trans *trans, u64 inum)
-{
-       /*
-        * If opts need to be propagated to the extent, a scan cookie should be
-        * present:
-        */
-       CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work,
-                               SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
-                               BTREE_ITER_intent);
-       struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
-       int ret = bkey_err(k);
-       if (ret)
-               return ret;
+       unsigned new_need_rb = new.need_rb & ~(old ? old->need_rb : 0);
 
-       if (k.k->type == KEY_TYPE_cookie)
-               return 1;
+       if (unlikely(new_need_rb)) {
+               int ret = new_needs_rb_allowed(trans, snapshot_io_opts,
+                                              k.s_c, ctx, opt_change_cookie,
+                                              old, &new, new_need_rb);
+               if (ret)
+                       return ret;
+       }
 
-       if (!inum)
-               return 0;
+       if (should_have_rb) {
+               if (!old) {
+                       old = bkey_val_end(k);
+                       k.k->u64s += sizeof(*old) / sizeof(u64);
+               }
 
-       bch2_btree_iter_set_pos(&iter, SPOS(0, REBALANCE_WORK_SCAN_OFFSET, 
U32_MAX));
-       k = bch2_btree_iter_peek_slot(&iter);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
+               *old = new;
+       } else if (old)
+               extent_entry_drop(k, (union bch_extent_entry *) old);
 
-       return k.k->type == KEY_TYPE_cookie;
+       return 0;
 }
 
 static int bch2_get_update_rebalance_opts(struct btree_trans *trans,
+                                         struct per_snapshot_io_opts 
*snapshot_io_opts,
                                          struct bch_inode_opts *io_opts,
                                          struct btree_iter *iter,
                                          struct bkey_s_c k,
@@ -227,59 +306,22 @@ static int bch2_get_update_rebalance_opts(struct 
btree_trans *trans,
        if (!bkey_extent_is_direct_data(k.k))
                return 0;
 
-       bool may_update_indirect = ctx == 
SET_NEEDS_REBALANCE_opt_change_indirect;
+       struct bch_extent_rebalance *old =
+               (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k);
 
-       /*
-        * If it's an indirect extent, and we walked to it directly, we won't
-        * have the options from the inode that were directly applied: options
-        * from the extent take precedence - unless the io_opts option came from
-        * the inode and may_update_indirect is true (walked from a
-        * REFLINK_P_MAY_UPDATE_OPTIONS pointer).
-        */
-       const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
-       if (old && k.k->type == KEY_TYPE_reflink_v) {
-#define x(_name)                                                               
\
-               if (old->_name##_from_inode &&                                  
\
-                   !(may_update_indirect && io_opts->_name##_from_inode)) {    
\
-                       io_opts->_name = old->_name;                            
\
-                       io_opts->_name##_from_inode = true;                     
\
-               }
-               BCH_REBALANCE_OPTS()
-#undef x
-       }
+       unsigned move_ptrs      = 0;
+       unsigned compress_ptrs  = 0;
+       unsigned csum_ptrs      = 0;
+       struct bch_extent_rebalance new =
+               bch2_bkey_needs_rebalance(c, k, io_opts, &move_ptrs, 
&compress_ptrs, &csum_ptrs,
+                                         ctx == 
SET_NEEDS_REBALANCE_opt_change_indirect);
 
-       struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, io_opts);
+       bool should_have_rb = k.k->type == KEY_TYPE_reflink_v || new.need_rb;
 
-       if (bkey_should_have_rb_opts(c, io_opts, k)
-           ? old && !memcmp(old, &new, sizeof(new))
-           : !old)
+       if (should_have_rb == !!old &&
+           (should_have_rb ? !memcmp(old, &new, sizeof(new)) : !old))
                return 0;
 
-       if (k.k->type != KEY_TYPE_reflink_v) {
-               ret = have_rebalance_scan_cookie(trans, k.k->p.inode);
-               if (ret < 0)
-                       return ret;
-
-               if (!ret) {
-                       CLASS(printbuf, buf)();
-
-                       prt_printf(&buf, "extent with incorrect/missing 
rebalance opts:\n");
-                       bch2_bkey_val_to_text(&buf, c, k);
-
-                       const struct bch_extent_rebalance _old = {};
-                       if (!old)
-                               old = &_old;
-#define x(_name)                                                               
\
-                       if (old->_name != new._name)                            
\
-                               prt_printf(&buf, "\n" #_name " %u != %u",       
\
-                                          old->_name, new._name);              
\
-                       BCH_REBALANCE_OPTS()
-#undef x
-
-                       fsck_err(trans, extent_io_opts_not_set, "%s", buf.buf);
-               }
-       }
-
        struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
        ret = PTR_ERR_OR_ZERO(n);
        if (ret)
@@ -289,12 +331,10 @@ static int bch2_get_update_rebalance_opts(struct 
btree_trans *trans,
 
        /* On successfull transaction commit, @k was invalidated: */
 
-       ret = bch2_bkey_set_needs_rebalance(trans, NULL, io_opts, n, ctx, 0) ?:
+       return  bch2_bkey_set_needs_rebalance(trans, snapshot_io_opts, io_opts, 
n, ctx, 0) ?:
                bch2_trans_update(trans, iter, n, 
BTREE_UPDATE_internal_snapshot_node) ?:
                bch2_trans_commit(trans, NULL, NULL, 0) ?:
                bch_err_throw(c, transaction_restart_commit);
-fsck_err:
-       return ret;
 }
 
 static struct bch_inode_opts *bch2_extent_get_io_opts(struct btree_trans 
*trans,
@@ -334,7 +374,8 @@ static struct bch_inode_opts 
*bch2_extent_get_io_opts(struct btree_trans *trans,
 
                        darray_push(&io_opts->d, e);
                }));
-               io_opts->cur_inum = extent_pos.inode;
+               io_opts->cur_inum               = extent_pos.inode;
+               io_opts->inum_scan_cookie       = false;
        }
 
        ret = ret ?: trans_was_restarted(trans, restart_count);
@@ -357,12 +398,13 @@ struct bch_inode_opts 
*bch2_extent_get_apply_io_opts(struct btree_trans *trans,
                          enum set_needs_rebalance_ctx ctx)
 {
        struct bch_inode_opts *opts =
-               bch2_extent_get_io_opts(trans, snapshot_io_opts, extent_pos, 
extent_iter, extent_k);
+               bch2_extent_get_io_opts(trans, snapshot_io_opts,
+                                       extent_pos, extent_iter, extent_k);
        if (IS_ERR(opts) || btree_iter_path(trans, extent_iter)->level)
                return opts;
 
-       int ret = bch2_get_update_rebalance_opts(trans, opts, extent_iter, 
extent_k,
-                                                SET_NEEDS_REBALANCE_other);
+       int ret = bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts,
+                                                extent_iter, extent_k, ctx);
        return ret ? ERR_PTR(ret) : opts;
 }
 
@@ -393,8 +435,7 @@ int bch2_extent_get_io_opts_one(struct btree_trans *trans,
                }
        }
 
-       return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, 
extent_k,
-                                             ctx);
+       return bch2_get_update_rebalance_opts(trans, NULL, io_opts, 
extent_iter, extent_k, ctx);
 }
 
 static const char * const bch2_rebalance_state_strs[] = {
@@ -507,23 +548,6 @@ static struct bkey_i *next_rebalance_entry(struct 
btree_trans *trans,
        return &(&darray_pop(buf))->k_i;
 }
 
-static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
-                                          struct btree_iter *iter,
-                                          struct bkey_s_c k)
-{
-       if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
-               return 0;
-
-       struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
-       int ret = PTR_ERR_OR_ZERO(n);
-       if (ret)
-               return ret;
-
-       extent_entry_drop(bkey_i_to_s(n),
-                         (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
-       return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-}
-
 static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
                        struct per_snapshot_io_opts *snapshot_io_opts,
                        struct bpos work_pos,
@@ -552,22 +576,23 @@ static struct bkey_s_c next_rebalance_extent(struct 
btree_trans *trans,
 
        *opts_ret = opts;
 
+       unsigned move_ptrs      = 0;
+       unsigned compress_ptrs  = 0;
+       unsigned csum_ptrs      = 0;
+       struct bch_extent_rebalance r =
+               bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, 
&compress_ptrs, &csum_ptrs, false);
+
        memset(data_opts, 0, sizeof(*data_opts));
-       data_opts->rewrite_ptrs         = bch2_bkey_ptrs_need_rebalance(c, 
opts, k);
+       data_opts->rewrite_ptrs         = move_ptrs|compress_ptrs|csum_ptrs;
        data_opts->target               = opts->background_target;
        data_opts->write_flags          |= BCH_WRITE_only_specified_devs;
 
-       if (!data_opts->rewrite_ptrs) {
-               /*
-                * device we would want to write to offline? devices in target
-                * changed?
-                *
-                * We'll now need a full scan before this extent is picked up
-                * again:
-                */
-               int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, 
k);
-               if (ret)
-                       return bkey_s_c_err(ret);
+       if (!data_opts->rewrite_ptrs &&
+           !data_opts->kill_ptrs &&
+           !data_opts->kill_ec_ptrs &&
+           !data_opts->extra_replicas) {
+               /* XXX: better error message */
+               bch_err(c, "goto extent to rebalance but nothing to do, 
confused");
                return bkey_s_c_null;
        }
 
@@ -577,13 +602,6 @@ static struct bkey_s_c next_rebalance_extent(struct 
btree_trans *trans,
                bch2_bkey_val_to_text(&buf, c, k);
                prt_newline(&buf);
 
-               unsigned move_ptrs      = 0;
-               unsigned compress_ptrs  = 0;
-               unsigned csum_ptrs      = 0;
-               u64 sectors             = 0;
-
-               bch2_bkey_needs_rebalance(c, k, opts, &move_ptrs, 
&compress_ptrs, &csum_ptrs, &sectors);
-
                if (move_ptrs) {
                        prt_str(&buf, "move=");
                        bch2_target_to_text(&buf, c, opts->background_target);
@@ -671,6 +689,7 @@ static int do_rebalance_extent(struct moving_context *ctxt,
 
 static int do_rebalance_scan_indirect(struct btree_trans *trans,
                                      struct bkey_s_c_reflink_p p,
+                                     struct per_snapshot_io_opts *snapshot_io_opts,
                                      struct bch_inode_opts *opts)
 {
        u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad);
@@ -681,7 +700,7 @@ static int do_rebalance_scan_indirect(struct btree_trans *trans,
                                     POS(0, idx), BTREE_ITER_not_extents, k, ({
                if (bpos_ge(bkey_start_pos(k.k), POS(0, end)))
                        break;
-               bch2_get_update_rebalance_opts(trans, opts, &iter, k,
+               bch2_get_update_rebalance_opts(trans, snapshot_io_opts, opts, &iter, k,
                                                SET_NEEDS_REBALANCE_opt_change_indirect);
        }));
        if (ret)
@@ -726,7 +745,8 @@ static int do_rebalance_scan(struct moving_context *ctxt,
                (inum &&
                 k.k->type == KEY_TYPE_reflink_p &&
                 REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)
-                ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k), opts)
+                ? do_rebalance_scan_indirect(trans, bkey_s_c_to_reflink_p(k),
+                                             snapshot_io_opts, opts)
                 : 0);
        })) ?:
        commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
@@ -1047,8 +1067,7 @@ static int check_rebalance_work_one(struct btree_trans *trans,
                extent_k.k = &deleted;
        }
 
-       bool should_have_rebalance =
-               bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
+       bool should_have_rebalance = bch2_bkey_needs_rb(extent_k);
        bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
 
        if (should_have_rebalance != have_rebalance) {
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index fd873894c8b6..dde7e4cb9533 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -10,7 +10,7 @@
 static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
                                                                     struct bch_inode_opts *opts)
 {
-       struct bch_extent_rebalance r = {
+       return (struct bch_extent_rebalance) {
                .type = BIT(BCH_EXTENT_ENTRY_rebalance),
 #define x(_name)                                                       \
                ._name = opts->_name,                                   \
@@ -18,15 +18,15 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f
                BCH_REBALANCE_OPTS()
 #undef x
        };
-
-       if (r.background_target &&
-           !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
-               r.background_target = 0;
-
-       return r;
 };
 
-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+
+static inline int bch2_bkey_needs_rb(struct bkey_s_c k)
+{
+       const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+       return r ? r->need_rb : 0;
+}
 
 /* Inodes in different snapshots may have different IO options: */
 struct snapshot_io_opts_entry {
@@ -36,6 +36,9 @@ struct snapshot_io_opts_entry {
 
 struct per_snapshot_io_opts {
        u64                     cur_inum;
+       bool                    fs_scan_cookie;
+       bool                    inum_scan_cookie;
+
        struct bch_inode_opts   fs_io_opts;
        DARRAY(struct snapshot_io_opts_entry) d;
 };
@@ -60,7 +63,7 @@ enum set_needs_rebalance_ctx {
 
 int bch2_bkey_set_needs_rebalance(struct btree_trans *,
                                   struct per_snapshot_io_opts *, struct bch_inode_opts *,
-                                 struct bkey_i *, enum set_needs_rebalance_ctx, u32);
+                                 struct bkey_i *, enum set_needs_rebalance_ctx, unsigned);
 
 struct bch_inode_opts *bch2_extent_get_apply_io_opts(struct btree_trans *,
                          struct per_snapshot_io_opts *, struct bpos,
diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h
index ff9a1342a22b..c744a29c8fa5 100644
--- a/fs/bcachefs/rebalance_format.h
+++ b/fs/bcachefs/rebalance_format.h
@@ -2,52 +2,64 @@
 #ifndef _BCACHEFS_REBALANCE_FORMAT_H
 #define _BCACHEFS_REBALANCE_FORMAT_H
 
+/* subset of BCH_INODE_OPTS */
+#define BCH_REBALANCE_OPTS()                   \
+       x(data_replicas)                        \
+       x(data_checksum)                        \
+       x(erasure_code)                         \
+       x(background_compression)               \
+       x(background_target)                    \
+       x(promote_target)
+
+enum bch_rebalance_opts {
+#define x(n)   BCH_REBALANCE_##n,
+       BCH_REBALANCE_OPTS()
+#undef x
+};
+
 struct bch_extent_rebalance {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
        __u64                   type:6,
-                               unused:3,
+                               unused:5,
+                               hipri:1,
+                               pending:1,
+                               need_rb:5,
 
-                               promote_target_from_inode:1,
-                               erasure_code_from_inode:1,
+                               data_replicas_from_inode:1,
                                data_checksum_from_inode:1,
+                               erasure_code_from_inode:1,
                                background_compression_from_inode:1,
-                               data_replicas_from_inode:1,
                                background_target_from_inode:1,
+                               promote_target_from_inode:1,
 
-                               promote_target:16,
-                               erasure_code:1,
+                               data_replicas:3,
                                data_checksum:4,
-                               data_replicas:4,
+                               erasure_code:1,
+                               background_compression:8, /* enum bch_compression_opt */
-                               background_target:16;
+                               background_target:12,
+                               promote_target:12;
 #elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   background_target:16,
+       __u64                   promote_target:12,
+                               background_target:12,
                                background_compression:8,
-                               data_replicas:4,
-                               data_checksum:4,
                                erasure_code:1,
-                               promote_target:16,
+                               data_checksum:4,
+                               data_replicas:3,
 
+                               promote_target_from_inode:1,
                                background_target_from_inode:1,
-                               data_replicas_from_inode:1,
                                background_compression_from_inode:1,
-                               data_checksum_from_inode:1,
                                erasure_code_from_inode:1,
-                               promote_target_from_inode:1,
+                               data_checksum_from_inode:1,
+                               data_replicas_from_inode:1,
 
-                               unused:3,
+                               need_rb:5,
+                               pending:1,
+                               hipri:1,
+                               unused:5,
                                type:6;
 #endif
 };
 
-/* subset of BCH_INODE_OPTS */
-#define BCH_REBALANCE_OPTS()                   \
-       x(data_checksum)                        \
-       x(background_compression)               \
-       x(data_replicas)                        \
-       x(promote_target)                       \
-       x(background_target)                    \
-       x(erasure_code)
-
 #endif /* _BCACHEFS_REBALANCE_FORMAT_H */
 
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 269cdf1a87a4..915c3201fe16 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1331,11 +1331,6 @@ DEFINE_EVENT(fs_str, io_move_pred,
        TP_ARGS(c, str)
 );
 
-DEFINE_EVENT(fs_str, io_move_created_rebalance,
-       TP_PROTO(struct bch_fs *c, const char *str),
-       TP_ARGS(c, str)
-);
-
 DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
        TP_PROTO(struct bch_fs *c, const char *str),
        TP_ARGS(c, str)
-- 
2.50.1


Reply via email to