Before this patch, when a node had an io error or was withdrawn,
the dlm would send its normal callbacks to start recovery. But
if the file system has an io error, it cannot safely replay its
journal anyway; someone else needs to do it. Also, if it's withdrawn,
we don't want it messing with the journal recovery generation
numbers. If we try to do journal recovery and update the generation
number, it can confuse the remaining functioning nodes into
believing the recovery is not necessary.

This patch adds checks to the callbacks used by dlm
so that the functions are skipped if an io error has occurred
or if the file system was withdraw.

Signed-off-by: Bob Peterson <[email protected]>
---
 fs/gfs2/lock_dlm.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 31df26ed7854..c75fe5544ffc 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1081,6 +1081,14 @@ static void gdlm_recover_prep(void *arg)
        struct gfs2_sbd *sdp = arg;
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
+       if (test_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) {
+               fs_err(sdp, "recover prep ignored due to io error.\n");
+               return;
+       }
+       if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+               fs_err(sdp, "recover prep ignored due to withdraw.\n");
+               return;
+       }
        spin_lock(&ls->ls_recover_spin);
        ls->ls_recover_block = ls->ls_recover_start;
        set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
@@ -1103,6 +1111,16 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot 
*slot)
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
        int jid = slot->slot - 1;
 
+       if (test_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) {
+               fs_err(sdp, "recover slot jid %d ignored due to io error.\n",
+                      jid);
+               return;
+       }
+       if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+               fs_err(sdp, "recover slot jid %d ignored due to withdraw.\n",
+                      jid);
+               return;
+       }
        spin_lock(&ls->ls_recover_spin);
        if (ls->ls_recover_size < jid + 1) {
                fs_err(sdp, "recover_slot jid %d gen %u short size %d\n",
@@ -1127,6 +1145,14 @@ static void gdlm_recover_done(void *arg, struct dlm_slot 
*slots, int num_slots,
        struct gfs2_sbd *sdp = arg;
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
+       if (test_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) {
+               fs_err(sdp, "recover done ignored due to io error.\n");
+               return;
+       }
+       if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+               fs_err(sdp, "recover done ignored due to withdraw.\n");
+               return;
+       }
        /* ensure the ls jid arrays are large enough */
        set_recover_size(sdp, slots, num_slots);
 
@@ -1154,6 +1180,16 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, 
unsigned int jid,
 {
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
+       if (test_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) {
+               fs_err(sdp, "recovery result jid %d ignored due to io error.\n",
+                      jid);
+               return;
+       }
+       if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+               fs_err(sdp, "recovery result jid %d ignored due to withdraw.\n",
+                      jid);
+               return;
+       }
        if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
                return;
 
-- 
2.17.2

Reply via email to