On Fri, Nov 10, 2023 at 11:31:44AM -0500, Kent Overstreet wrote:
> Journal replay now first attempts to replay keys in sorted order,
> similar to how the btree write buffer flush path works.
>
> Any keys that can not be replayed due to journal deadlock are then left
> for later and replayed in journal order, unpinning journal entries as we
> go.
>
> Signed-off-by: Kent Overstreet <[email protected]>
> ---
Just a question and a nit..
> fs/bcachefs/errcode.h | 1 -
> fs/bcachefs/recovery.c | 84 ++++++++++++++++++++++++++++--------------
> 2 files changed, 56 insertions(+), 29 deletions(-)
>
...
> diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
> index 6eafff2557a4..3524d1e0003e 100644
> --- a/fs/bcachefs/recovery.c
> +++ b/fs/bcachefs/recovery.c
...
> @@ -166,26 +159,61 @@ static int bch2_journal_replay(struct bch_fs *c)
> goto err;
> }
>
> - for (i = 0; i < keys->nr; i++) {
> - k = keys_sorted[i];
> + /*
> + * First, attempt to replay keys in sorted order. This is more
> + * efficient, but some might fail if that would cause a journal
> + * deadlock.
> + */
What does "sorted order" mean here vs. when we sort by journal seq
below?
> + for (size_t i = 0; i < keys->nr; i++) {
> + cond_resched();
> +
> + struct journal_key *k = keys->d + i;
> +
> + ret = commit_do(trans, NULL, NULL,
> + BTREE_INSERT_NOFAIL|
> + BTREE_INSERT_JOURNAL_RECLAIM|
> + (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY :
> 0),
> + bch2_journal_replay_key(trans, k));
> + BUG_ON(!ret && !k->overwritten);
> + if (ret) {
> + ret = darray_push(&keys_sorted, k);
> + if (ret)
> + goto err;
> + }
> + }
> +
> + /*
> + * Now, replay any remaining keys in the order in which they appear in
> + * the journal, unpinning those journal entries as we go:
> + */
> + sort(keys_sorted.data, keys_sorted.nr,
> + sizeof(keys_sorted.data[0]),
> + journal_sort_seq_cmp, NULL);
>
> + darray_for_each(keys_sorted, kp) {
> cond_resched();
>
> + struct journal_key *k = *kp;
> +
> replay_now_at(j, k->journal_seq);
>
> - ret = bch2_trans_do(c, NULL, NULL,
> - BTREE_INSERT_NOFAIL|
> - (!k->allocated
> - ?
> BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
> - : 0),
> + ret = commit_do(trans, NULL, NULL,
> + BTREE_INSERT_NOFAIL|
> + (!k->allocated
> + ?
> BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
> + : 0),
> bch2_journal_replay_key(trans, k));
> - if (ret) {
> - bch_err(c, "journal replay: error while replaying key
> at btree %s level %u: %s",
> - bch2_btree_id_str(k->btree_id), k->level,
> bch2_err_str(ret));
> + bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
> + bch2_btree_id_str(k->btree_id), k->level);
> + if (ret)
> goto err;
> - }
> +
> + BUG_ON(!k->overwritten);
> }
>
> + bch2_trans_put(trans);
> + trans = NULL;
> +
Nit: This looks spurious w/ the exit path, just FWIW.
Brian
> replay_now_at(j, j->replay_journal_seq_end);
> j->replay_journal_seq = 0;
>
> @@ -196,10 +224,10 @@ static int bch2_journal_replay(struct bch_fs *c)
> if (keys->nr && !ret)
> bch2_journal_log_msg(c, "journal replay finished");
> err:
> - kvfree(keys_sorted);
> -
> - if (ret)
> - bch_err_fn(c, ret);
> + if (trans)
> + bch2_trans_put(trans);
> + darray_exit(&keys_sorted);
> + bch_err_fn(c, ret);
> return ret;
> }
>
> --
> 2.42.0
>
>