On Fri, Nov 10, 2023 at 11:31:44AM -0500, Kent Overstreet wrote:
> Journal replay now first attempts to replay keys in sorted order,
> similar to how the btree write buffer flush path works.
> 
> Any keys that can not be replayed due to journal deadlock are then left
> for later and replayed in journal order, unpinning journal entries as we
> go.
> 
> Signed-off-by: Kent Overstreet <[email protected]>
> ---

Just a question and a nit..

>  fs/bcachefs/errcode.h  |  1 -
>  fs/bcachefs/recovery.c | 84 ++++++++++++++++++++++++++++--------------
>  2 files changed, 56 insertions(+), 29 deletions(-)
> 
...
> diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
> index 6eafff2557a4..3524d1e0003e 100644
> --- a/fs/bcachefs/recovery.c
> +++ b/fs/bcachefs/recovery.c
...
> @@ -166,26 +159,61 @@ static int bch2_journal_replay(struct bch_fs *c)
>                       goto err;
>       }
>  
> -     for (i = 0; i < keys->nr; i++) {
> -             k = keys_sorted[i];
> +     /*
> +      * First, attempt to replay keys in sorted order. This is more
> +      * efficient, but some might fail if that would cause a journal
> +      * deadlock.
> +      */

What does "sorted order" mean here vs. when we sort by journal seq
below?

> +     for (size_t i = 0; i < keys->nr; i++) {
> +             cond_resched();
> +
> +             struct journal_key *k = keys->d + i;
> +
> +             ret = commit_do(trans, NULL, NULL,
> +                             BTREE_INSERT_NOFAIL|
> +                             BTREE_INSERT_JOURNAL_RECLAIM|
> +                             (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 
> 0),
> +                          bch2_journal_replay_key(trans, k));
> +             BUG_ON(!ret && !k->overwritten);
> +             if (ret) {
> +                     ret = darray_push(&keys_sorted, k);
> +                     if (ret)
> +                             goto err;
> +             }
> +     }
> +
> +     /*
> +      * Now, replay any remaining keys in the order in which they appear in
> +      * the journal, unpinning those journal entries as we go:
> +      */
> +     sort(keys_sorted.data, keys_sorted.nr,
> +          sizeof(keys_sorted.data[0]),
> +          journal_sort_seq_cmp, NULL);
>  
> +     darray_for_each(keys_sorted, kp) {
>               cond_resched();
>  
> +             struct journal_key *k = *kp;
> +
>               replay_now_at(j, k->journal_seq);
>  
> -             ret = bch2_trans_do(c, NULL, NULL,
> -                                 BTREE_INSERT_NOFAIL|
> -                                 (!k->allocated
> -                                  ? 
> BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
> -                                  : 0),
> +             ret = commit_do(trans, NULL, NULL,
> +                             BTREE_INSERT_NOFAIL|
> +                             (!k->allocated
> +                              ? 
> BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
> +                              : 0),
>                            bch2_journal_replay_key(trans, k));
> -             if (ret) {
> -                     bch_err(c, "journal replay: error while replaying key 
> at btree %s level %u: %s",
> -                             bch2_btree_id_str(k->btree_id), k->level, 
> bch2_err_str(ret));
> +             bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
> +                         bch2_btree_id_str(k->btree_id), k->level);
> +             if (ret)
>                       goto err;
> -             }
> +
> +             BUG_ON(!k->overwritten);
>       }
>  
> +     bch2_trans_put(trans);
> +     trans = NULL;
> +

Nit: This looks spurious w/ the exit path, just FWIW.

Brian

>       replay_now_at(j, j->replay_journal_seq_end);
>       j->replay_journal_seq = 0;
>  
> @@ -196,10 +224,10 @@ static int bch2_journal_replay(struct bch_fs *c)
>       if (keys->nr && !ret)
>               bch2_journal_log_msg(c, "journal replay finished");
>  err:
> -     kvfree(keys_sorted);
> -
> -     if (ret)
> -             bch_err_fn(c, ret);
> +     if (trans)
> +             bch2_trans_put(trans);
> +     darray_exit(&keys_sorted);
> +     bch_err_fn(c, ret);
>       return ret;
>  }
>  
> -- 
> 2.42.0
> 
> 


Reply via email to