This patch addresses various problems with gfs2/dlm recovery.

For example, suppose a node with a bunch of gfs2 mounts suddenly
reboots due to kernel panic, and dlm determines it should perform
recovery. DLM does so from a pseudo-state machine calling various
callbacks into lock_dlm to perform a sequence of steps. It uses
generation numbers and recover bits in dlm "control" lock lvbs.

Now suppose another node tries to recover the failed node's
journal, but in so doing, encounters an IO error or withdraws
due to unforeseen circumstances, such as an hba driver failure.
In these cases, the recovery would eventually bail out, but it
would still update its generation number in the lvb. The other
nodes would all see the newer generation number and think they
don't need to do recovery because the generation number is newer
than the last one they saw, and therefore someone else has already
taken care of it.

If the file system has an io error or is withdrawn, it cannot
safely replay any journals (its own or others'), but someone else
still needs to do it. Therefore we don't want it messing with
the journal recovery generation numbers: the local generation
numbers eventually get put into the lvb generation numbers to be
seen by all nodes.

This patch adds checks to many of the callbacks used by dlm
in its recovery state machine so that the functions are ignored
and skipped if an io error has occurred or if the file system
was withdrawn.

Signed-off-by: Bob Peterson <rpete...@redhat.com>
---
 fs/gfs2/lock_dlm.c | 36 ++++++++++++++++++++++++++++++++++++
 fs/gfs2/util.c     |  2 +-
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 31df26ed7854..8b94f34c5c0f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1081,6 +1081,14 @@ static void gdlm_recover_prep(void *arg)
        struct gfs2_sbd *sdp = arg;
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
+       if (atomic_read(&sdp->sd_log_errors)) {
+               fs_err(sdp, "recover_prep ignored due to io error.\n");
+               return;
+       }
+       if (withdrawn(sdp)) {
+               fs_err(sdp, "recover_prep ignored due to withdraw.\n");
+               return;
+       }
        spin_lock(&ls->ls_recover_spin);
        ls->ls_recover_block = ls->ls_recover_start;
        set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
@@ -1103,6 +1111,16 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
        int jid = slot->slot - 1;
 
+       if (atomic_read(&sdp->sd_log_errors)) {
+               fs_err(sdp, "recover_slot jid %d ignored due to io error.\n",
+                      jid);
+               return;
+       }
+       if (withdrawn(sdp)) {
+               fs_err(sdp, "recover_slot jid %d ignored due to withdraw.\n",
+                      jid);
+               return;
+       }
        spin_lock(&ls->ls_recover_spin);
        if (ls->ls_recover_size < jid + 1) {
                fs_err(sdp, "recover_slot jid %d gen %u short size %d\n",
@@ -1127,6 +1145,14 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
        struct gfs2_sbd *sdp = arg;
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
+       if (atomic_read(&sdp->sd_log_errors)) {
+               fs_err(sdp, "recover_done ignored due to io error.\n");
+               return;
+       }
+       if (withdrawn(sdp)) {
+               fs_err(sdp, "recover_done ignored due to withdraw.\n");
+               return;
+       }
        /* ensure the ls jid arrays are large enough */
        set_recover_size(sdp, slots, num_slots);
 
@@ -1154,6 +1180,16 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
 {
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
+       if (atomic_read(&sdp->sd_log_errors)) {
+               fs_err(sdp, "recovery_result jid %d ignored due to io error.\n",
+                      jid);
+               return;
+       }
+       if (withdrawn(sdp)) {
+               fs_err(sdp, "recovery_result jid %d ignored due to withdraw.\n",
+                      jid);
+               return;
+       }
        if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
                return;
 
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 717aef772c60..ca6de80b5e8b 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -260,7 +260,7 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
                        const char *function, char *file, unsigned int line,
                        bool withdraw)
 {
-       if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+       if (!withdrawn(sdp))
                fs_err(sdp,
                       "fatal: I/O error\n"
                       "  block = %llu\n"
-- 
2.20.1

Reply via email to