Bob, On Thu, May 23, 2019 at 3:05 PM Bob Peterson <rpete...@redhat.com> wrote: > Before this patch, gfs2 kept track of journal io errors in two > places sd_log_error and the SDF_AIL1_IO_ERROR flag in sd_flags. > This patch consolidates the two into sd_log_error so that it > reflects the first error encountered writing to the journal. > In future patches, we will take advantage of this by checking > this value rather than having to check both when reacting to > io errors. > > In addition, this fixes a tight loop in unmount: If buffers > get on the ail1 list and an io error occurs elsewhere, the > ail1 list would never be cleared because they were always busy. > So unmount would hang, waiting for the ail1 list to empty. > > Signed-off-by: Bob Peterson <rpete...@redhat.com> > --- > fs/gfs2/incore.h | 7 +++---- > fs/gfs2/log.c | 20 +++++++++++++++----- > fs/gfs2/quota.c | 2 +- > 3 files changed, 19 insertions(+), 10 deletions(-) > > diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h > index b261168be298..39cec5361ba5 100644 > --- a/fs/gfs2/incore.h > +++ b/fs/gfs2/incore.h > @@ -620,9 +620,8 @@ enum { > SDF_RORECOVERY = 7, /* read only recovery */ > SDF_SKIP_DLM_UNLOCK = 8, > SDF_FORCE_AIL_FLUSH = 9, > - SDF_AIL1_IO_ERROR = 10, > - SDF_FS_FROZEN = 11, > - SDF_WITHDRAWING = 12, /* Will withdraw eventually */ > + SDF_FS_FROZEN = 10, > + SDF_WITHDRAWING = 11, /* Will withdraw eventually */ > }; > > enum gfs2_freeze_state { > @@ -831,7 +830,7 @@ struct gfs2_sbd { > atomic_t sd_log_in_flight; > struct bio *sd_log_bio; > wait_queue_head_t sd_log_flush_wait; > - int sd_log_error; > + int sd_log_error; /* First log error */ > > atomic_t sd_reserving_log; > wait_queue_head_t sd_reserving_log_wait; > diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c > index 0fe11bde796b..9784763fbb4e 100644 > --- a/fs/gfs2/log.c > +++ b/fs/gfs2/log.c > @@ -108,8 +108,7 @@ __acquires(&sdp->sd_ail_lock) > > if (!buffer_busy(bh)) { > if (!buffer_uptodate(bh) && > - !test_and_set_bit(SDF_AIL1_IO_ERROR, > - &sdp->sd_flags)) { > + !cmpxchg(&sdp->sd_log_error, 0, -EIO)) { > gfs2_io_error_bh(sdp, bh); > set_bit(SDF_WITHDRAWING, &sdp->sd_flags); > } > @@ -203,10 +202,21 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, > struct gfs2_trans *tr) > bd_ail_st_list) { > bh = bd->bd_bh; > gfs2_assert(sdp, bd->bd_tr == tr); > - if (buffer_busy(bh)) > + /** > + * If another process flagged an io error, e.g. writing to the > + * journal, error all other bhs and move them off the ail1 to > + * prevent a tight loop when unmount tries to flush ail1, > + * regardless of whether they're still busy. If no outside > + * errors were found and the buffer is busy, move to the next. > + * If the ail buffer is not busy and caught an error, flag it > + * for others. > + */ > + if (sdp->sd_log_error) { > + gfs2_io_error_bh(sdp, bh);
some of the error handling here is still sketchy: the only place where sd_log_error is set without withdrawing the filesystem is quotad_error. If the filesystem has already been marked SDF_WITHDRAWING or SDF_WITHDRAWN, gfs2_io_error_bh will be a no-op. It seems that we want to set SDF_WITHDRAWING here for the quotad_error case instead of calling gfs2_io_error_bh? > + } else if (buffer_busy(bh)) { > continue; > - if (!buffer_uptodate(bh) && > - !test_and_set_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) { > + } else if (!buffer_uptodate(bh) && > + !cmpxchg(&sdp->sd_log_error, 0, -EIO)) { > gfs2_io_error_bh(sdp, bh); > set_bit(SDF_WITHDRAWING, &sdp->sd_flags); > } > diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c > index a8dfc86fd682..8871fca9102f 100644 > --- a/fs/gfs2/quota.c > +++ b/fs/gfs2/quota.c > @@ -1480,7 +1480,7 @@ static void quotad_error(struct gfs2_sbd *sdp, const > char *msg, int error) > return; > if (!gfs2_withdrawn(sdp)) { > fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); > - sdp->sd_log_error = error; > + cmpxchg(&sdp->sd_log_error, 0, error); > wake_up(&sdp->sd_logd_waitq); > } > } > -- > 2.21.0 > Thanks, Andreas