[Cluster-devel] [GFS2 PATCH 7/9] gfs2: Check for log write errors and withdraw in rgrp_go_inval
Before this patch, function rgrp_go_inval just assumed all the writes submitted to the journal were finished and successful. But if they're not, and a revoke fails to make its way to the journal, a journal replay on another node will cause corruption if we let the go_inval function continue and tell dlm to release the glock to another node. This patch adds a couple assert_withdraws in the rgrp_go_inval function. The assert should cause another node to replay the journal before continuing, thus protecting the rgrp glock and maintaining the integrity of the rgrp. Signed-off-by: Bob Peterson --- fs/gfs2/glops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f372a6f169a2..64b8e5e808d8 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -168,8 +168,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); mapping_set_error(mapping, error); gfs2_ail_empty_gl(gl); - gfs2_assert_withdraw(gl->gl_name.ln_sbd, -gl->gl_name.ln_sbd->sd_log_error == 0); + gfs2_assert_withdraw(sdp, gl->gl_name.ln_sbd->sd_log_error == 0); spin_lock(&gl->gl_lockref.lock); rgd = gl->gl_object; @@ -202,6 +201,8 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); gfs2_ail_empty_gl(gl); + gfs2_assert_withdraw(sdp, sdp->sd_log_error == 0); + if (rgd) rgd->rd_flags &= ~GFS2_RDF_UPTODATE; } -- 2.20.1
[Cluster-devel] [GFS2 PATCH 1/9] gfs2: Introduce concept of a pending withdraw
File system withdraws can be delayed when inconsistencies are discovered when we cannot withdraw immediately, for example, when critical spin_locks are held. But delaying the withdraw can cause gfs2 to ignore the error and keep running for a short period of time. For example, an rgrp glock may be dequeued and demoted while there are still buffers that haven't been properly revoked, due to io errors writing to the journal. This patch introduces a new concept of a delayed withdraw, which means an inconsistency has been discovered and we need to withdraw at the earliest possible opportunity. In these cases, we aren't quite withdrawn yet, but we still need to not dequeue glocks and other critical things. If we dequeue the glocks and the withdraw results in our journal being replayed, the replay could overwrite data that's been modified by a different node that acquired the glock in the meantime. Signed-off-by: Bob Peterson --- fs/gfs2/aops.c | 4 ++-- fs/gfs2/file.c | 2 +- fs/gfs2/glock.c | 7 +++ fs/gfs2/glops.c | 2 +- fs/gfs2/incore.h | 1 + fs/gfs2/log.c| 20 fs/gfs2/meta_io.c| 6 +++--- fs/gfs2/ops_fstype.c | 3 +-- fs/gfs2/quota.c | 2 +- fs/gfs2/super.c | 6 +++--- fs/gfs2/sys.c| 2 +- fs/gfs2/util.c | 1 + fs/gfs2/util.h | 8 13 files changed, 34 insertions(+), 30 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 05dd78f4b2b3..0d3cde8a61cd 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -521,7 +521,7 @@ static int __gfs2_readpage(void *file, struct page *page) error = mpage_readpage(page, gfs2_block_map); } - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + if (unlikely(withdrawn(sdp))) return -EIO; return error; @@ -638,7 +638,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + if (unlikely(withdrawn(sdp))) ret = -EIO; return ret; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 
a2dea5bc0427..4046f6ac7f13 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1169,7 +1169,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) cmd = F_SETLK; fl->fl_type = F_UNLCK; } - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + if (unlikely(withdrawn(sdp))) { if (fl->fl_type == F_UNLCK) locks_lock_file_wait(file, fl); return -EIO; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f66773c71bcd..c6d6e478f5e3 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -542,7 +542,7 @@ __acquires(&gl->gl_lockref.lock) unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0); int ret; - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) && + if (unlikely(withdrawn(sdp)) && target != LM_ST_UNLOCKED) return; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | @@ -579,8 +579,7 @@ __acquires(&gl->gl_lockref.lock) } else if (ret) { fs_err(sdp, "lm_lock ret %d\n", ret); - GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN, - &sdp->sd_flags)); + GLOCK_BUG_ON(gl, !withdrawn(sdp)); } } else { /* lock_nolock */ finish_xmote(gl, target); @@ -1092,7 +1091,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; int error = 0; - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + if (unlikely(withdrawn(sdp))) return -EIO; if (test_bit(GLF_LRU, &gl->gl_flags)) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f15b4c57c4bd..9c86c8004ba7 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -539,7 +539,7 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) gfs2_consist(sdp); /* Initialize some head of the log stuff */ - if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { + if (!withdrawn(sdp)) { sdp->sd_log_sequence = head.lh_sequence + 1; gfs2_log_pointers_init(sdp, head.lh_blkno); } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index cdf07b408f54..8380d4db8be6 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -621,6 +621,7 @@ enum { SDF_SKIP_DLM_UNLOCK = 8, 
SDF_FORCE_AIL_FLUSH = 9, SDF_AIL1_IO_ERROR = 10, + SDF_PENDING_WITHDRAW= 11, /* Will withdraw eventually */ }; enum gfs2_freeze_state { diff --git a/fs/gfs2/log.c b/fs/gfs2/log
[Cluster-devel] [GFS2 PATCH 8/9] gfs2: Do log_flush in gfs2_ail_empty_gl even if ail list is empty
Before this patch, if gfs2_ail_empty_gl saw there was nothing on the ail list, it would return and not flush the log. The problem is that there could still be a revoke for the rgrp sitting on the sd_log_le_revoke list that's been recently taken off the ail list. But that revoke still needs to be written, and the rgrp_go_inval still needs to call log_flush_wait to ensure the revokes are all properly written to the journal before we relinquish control of the glock to another node. If we give the glock to another node before we have this knowledge, the node might crash and its journal replayed, in which case the missing revoke would allow the journal replay to replay the rgrp over top of the rgrp we already gave to another node, thus overwriting its changes and corrupting the file system. This patch makes gfs2_ail_empty_gl still call gfs2_log_flush rather than returning. Signed-off-by: Bob Peterson --- fs/gfs2/glops.c | 23 +-- fs/gfs2/log.c | 2 +- fs/gfs2/log.h | 1 + 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 64b8e5e808d8..adae9ecf8311 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -94,8 +94,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) INIT_LIST_HEAD(&tr.tr_databuf); tr.tr_revokes = atomic_read(&gl->gl_ail_count); - if (!tr.tr_revokes) - return; + if (!tr.tr_revokes) { + /** +* We have nothing on the ail, but there could be revokes on +* the sdp revoke queue, in which case, we still want to flush +* the log and wait for it to finish. +* +* If the sdp revoke list is empty too, we might still have an +* io outstanding for writing revokes, so we should wait for +* it before proceeding. +* +* If none of these conditions are true, our revokes are all +* flushed and we can return. 
+*/ + if (!list_empty(&sdp->sd_log_le_revoke)) + goto flush; + else if (atomic_read(&sdp->sd_log_in_flight)) + log_flush_wait(sdp); + else + return; + } /* A shortened, inline version of gfs2_trans_begin() * tr->alloced is not set since the transaction structure is @@ -110,6 +128,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) __gfs2_ail_flush(gl, 0, tr.tr_revokes); gfs2_trans_end(sdp); +flush: gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_AIL_EMPTY_GL); } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 0d0dec3231c9..610cd2637dc5 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -515,7 +515,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) } -static void log_flush_wait(struct gfs2_sbd *sdp) +void log_flush_wait(struct gfs2_sbd *sdp) { DEFINE_WAIT(wait); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 1bc9bd444b28..bd2d08d0f21c 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -75,6 +75,7 @@ extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 type); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); +extern void log_flush_wait(struct gfs2_sbd *sdp); extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); -- 2.20.1
[Cluster-devel] [GFS2 PATCH 3/9] gfs2: Empty the ail for the glock when rgrps are invalidated
Before this patch, function rgrp_go_inval would not invalidate the ail list, which meant that there might still be buffers outstanding on the ail that had revokes still pending. If the revokes had still not been written when the glock was given to another node, and that node (with outstanding revokes) died for some reason, the resulting journal replay would replay the un-revoked rgrps, thus wiping out changes made by the node who rightfully received the rgrp in EX. This caused metadata corruption. Signed-off-by: Bob Peterson --- fs/gfs2/glops.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 9c86c8004ba7..4b0e52bf5825 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -166,6 +166,8 @@ static void rgrp_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); mapping_set_error(mapping, error); gfs2_ail_empty_gl(gl); + gfs2_assert_withdraw(gl->gl_name.ln_sbd, +gl->gl_name.ln_sbd->sd_log_error == 0); spin_lock(&gl->gl_lockref.lock); rgd = gl->gl_object; @@ -196,6 +198,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) WARN_ON_ONCE(!(flags & DIO_METADATA)); gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + gfs2_ail_empty_gl(gl); if (rgd) rgd->rd_flags &= ~GFS2_RDF_UPTODATE; -- 2.20.1
[Cluster-devel] [GFS2 PATCH 0/9] GFS2: Withdraw corruption patches
I consider this more of a preliminary "collection" of patches rather than a "patch set" per se. In other words, most of these do not rely upon the previous patches, although some do. Some of them may be removed without a lot of difficulty if they are found to be problematic. I thought about sending them out individually, but decided against it. These patches address a bunch of problems related to journal replay overwriting valid gfs2 metadata due to io errors, withdraws and such. These seem to fix several metadata corruption problems I've been able to reliably recreate lately with multi-node multi-file system recovery tests. I'm not convinced we need all these patches, but they seem to work well when put together. I just wanted to throw them out here for review so people could point out if anything doesn't make sense. In other words, there are likely to be revisions. Bob Peterson (9): gfs2: Introduce concept of a pending withdraw gfs2: Ignore recovery attempts if gfs2 has io error or is withdrawn gfs2: Empty the ail for the glock when rgrps are invalidated gfs2: Force withdraw to replay journals and wait for it to finish gfs2: Keep transactions on ail1 list until after issuing revokes gfs2: Make secondary withdrawers wait for first withdrawer gfs2: Check for log write errors and withdraw in rgrp_go_inval gfs2: Do log_flush in gfs2_ail_empty_gl even if ail list is empty dlm: recover slot regardless of whether we still have a connection fs/dlm/member.c | 11 +-- fs/gfs2/aops.c | 4 +- fs/gfs2/file.c | 2 +- fs/gfs2/glock.c | 40 +-- fs/gfs2/glock.h | 1 + fs/gfs2/glops.c | 90 ++-- fs/gfs2/incore.h | 10 +++ fs/gfs2/lock_dlm.c | 68 ++ fs/gfs2/log.c| 74 +++- fs/gfs2/log.h| 1 + fs/gfs2/meta_io.c| 6 +- fs/gfs2/ops_fstype.c | 51 ++ fs/gfs2/quota.c | 2 +- fs/gfs2/super.c | 30 fs/gfs2/super.h | 1 + fs/gfs2/sys.c| 2 +- fs/gfs2/util.c | 160 ++- fs/gfs2/util.h | 11 +++ 18 files changed, 443 insertions(+), 121 deletions(-) -- 2.20.1
[Cluster-devel] [GFS2 PATCH 2/9] gfs2: Ignore recovery attempts if gfs2 has io error or is withdrawn
This patch addresses various problems with gfs2/dlm recovery. For example, suppose a node with a bunch of gfs2 mounts suddenly reboots due to kernel panic, and dlm determines it should perform recovery. DLM does so from a pseudo-state machine calling various callbacks into lock_dlm to perform a sequence of steps. It uses generation numbers and recover bits in dlm "control" lock lvbs. Now suppose another node tries to recover the failed node's journal, but in so doing, encounters an IO error or withdraws due to unforeseen circumstances, such as an hba driver failure. In these cases, the recovery would eventually bail out, but it would still update its generation number in the lvb. The other nodes would all see the newer generation number and think they don't need to do recovery because the generation number is newer than the last one they saw, and therefore someone else has already taken care of it. If the file system has an io error or is withdrawn, it cannot safely replay any journals (its own or others) but someone else still needs to do it. Therefore we don't want it messing with the journal recovery generation numbers: the local generation numbers eventually get put into the lvb generation numbers to be seen by all nodes. This patch adds checks to many of the callbacks used by dlm in its recovery state machine so that the functions are ignored and skipped if an io error has occurred or if the file system was withdrawn. 
Signed-off-by: Bob Peterson --- fs/gfs2/lock_dlm.c | 36 fs/gfs2/util.c | 2 +- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 31df26ed7854..d2cb2fe1c3f3 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1081,6 +1081,14 @@ static void gdlm_recover_prep(void *arg) struct gfs2_sbd *sdp = arg; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + if (test_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) { + fs_err(sdp, "recover_prep ignored due to io error.\n"); + return; + } + if (withdrawn(sdp)) { + fs_err(sdp, "recover_prep ignored due to withdraw.\n"); + return; + } spin_lock(&ls->ls_recover_spin); ls->ls_recover_block = ls->ls_recover_start; set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); @@ -1103,6 +,16 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) struct lm_lockstruct *ls = &sdp->sd_lockstruct; int jid = slot->slot - 1; + if (test_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) { + fs_err(sdp, "recover_slot jid %d ignored due to io error.\n", + jid); + return; + } + if (withdrawn(sdp)) { + fs_err(sdp, "recover_slot jid %d ignored due to withdraw.\n", + jid); + return; + } spin_lock(&ls->ls_recover_spin); if (ls->ls_recover_size < jid + 1) { fs_err(sdp, "recover_slot jid %d gen %u short size %d\n", @@ -1127,6 +1145,14 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, struct gfs2_sbd *sdp = arg; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + if (test_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) { + fs_err(sdp, "recover_done ignored due to io error.\n"); + return; + } + if (withdrawn(sdp)) { + fs_err(sdp, "recover_done ignored due to withdraw.\n"); + return; + } /* ensure the ls jid arrays are large enough */ set_recover_size(sdp, slots, num_slots); @@ -1154,6 +1180,16 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, { struct lm_lockstruct *ls = &sdp->sd_lockstruct; + if (test_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) { + fs_err(sdp, 
"recovery_result jid %d ignored due to io error.\n", + jid); + return; + } + if (withdrawn(sdp)) { + fs_err(sdp, "recovery_result jid %d ignored due to withdraw.\n", + jid); + return; + } if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) return; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 717aef772c60..ca6de80b5e8b 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -260,7 +260,7 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line, bool withdraw) { - if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + if (!withdrawn(sdp)) fs_err(sdp, "fatal: I/O error\n" " block = %llu\n" -- 2.20.1
[Cluster-devel] [GFS2 PATCH 6/9] gfs2: Make secondary withdrawers wait for first withdrawer
Before this patch, if a process encountered an error and decided to withdraw while another process was already in the process of withdrawing, the secondary withdraw would be silently ignored, which set it free to proceed with its processing, unlock any locks, etc. That's correct behavior if the original withdrawer encounters further errors down the road. However, second withdrawers need to wait for the first withdrawer to finish its withdraw before proceeding. If we don't wait we could end up assuming everything is alright, unlock glocks and tell other nodes they can have the glock, despite the fact that a withdraw is still ongoing and may require a journal replay before any locks are released. For example, if an rgrp glock is freed by a process that didn't wait for the withdraw, a journal replay could introduce file system corruption by replaying a rgrp block that has already been granted to another node. This patch makes secondary withdrawers wait until the primary withdrawer is finished with its processing before proceeding. 
Signed-off-by: Bob Peterson --- fs/gfs2/incore.h | 3 +++ fs/gfs2/util.c | 9 - 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 2ddae1326ce2..51ca34594ac2 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -626,6 +626,7 @@ enum { SDF_REMOTE_WITHDRAW = 12, /* Performing remote recovery */ SDF_WITHDRAW_RECOVERY = 13, /* Wait for journal recovery when we are withdrawing */ + SDF_WITHDRAW_COMPLETE = 14, /* Withdraw is compete */ }; enum gfs2_freeze_state { @@ -836,6 +837,8 @@ struct gfs2_sbd { struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; int sd_log_error; + atomic_t sd_withdrawer; + wait_queue_head_t sd_withdraw_wait; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 75f67284bba8..6e0a85fc1d46 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -188,9 +188,15 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) struct va_format vaf; if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && - test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { + fs_warn(sdp, "Waiting for process %d to withdraw.\n", + atomic_read(&sdp->sd_withdrawer)); + wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_COMPLETE, + TASK_UNINTERRUPTIBLE); return 0; + } + atomic_set(&sdp->sd_withdrawer, pid_nr(task_pid(current))); clear_bit(SDF_PENDING_WITHDRAW, &sdp->sd_flags); if (fmt) { va_start(args, fmt); @@ -221,6 +227,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); fs_err(sdp, "File system withdrawn\n"); dump_stack(); + wake_up_bit(&sdp->sd_flags, SDF_WITHDRAW_COMPLETE); } if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) -- 2.20.1
[Cluster-devel] [GFS2 PATCH 9/9] dlm: recover slot regardless of whether we still have a connection
Before this patch dlm would skip the recover_slot phase of recovery if it still had a valid comm connection to the failed node. However, gfs2 still needs to perform journal replay, otherwise we run the risk of journal replay that happens at reboot time overwriting metadata we've since modified after we release the locks. Signed-off-by: Bob Peterson --- fs/dlm/member.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 0bc43b35d2c5..155bd52eb018 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -463,17 +463,12 @@ static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) if (!ls->ls_ops || !ls->ls_ops->recover_slot) return; - /* if there is no comms connection with this node - or the present comms connection is newer - than the one when this member was added, then - we consider the node to have failed (versus - being removed due to dlm_release_lockspace) */ + /* Recover the slot regardless of whether we have a valid connection. +* The node may have simply withdrawn, but still needs its journal +* replayed. */ error = dlm_comm_seq(memb->nodeid, &seq); - if (!error && seq == memb->comm_seq) - return; - slot.nodeid = memb->nodeid; slot.slot = memb->slot; -- 2.20.1
[Cluster-devel] [GFS2 PATCH 4/9] gfs2: Force withdraw to replay journals and wait for it to finish
When a node withdraws from a file system, it often leaves its journal in an incomplete state. This is especially true when the withdraw is caused by io errors writing to the journal. Before this patch, a withdraw would try to write a "shutdown" record to the journal, tell dlm it's done with the file system, and none of the other nodes know about the problem. Later, when the problem is fixed and the withdrawn node is rebooted, it would then discover that its own journal was incomplete, and replay it. However, replaying it at this point is almost guaranteed to introduce corruption because the other nodes are likely to have used affected resource groups that appeared in the journal since the time of the withdraw. Replaying the journal later will overwrite any changes made, and not through any fault of dlm, which was instructed during the withdraw to release those resources. This patch makes file system withdraws seen by the entire cluster. Withdrawing nodes dequeue their journal glock to allow recovery. The remaining nodes check all the journals to see if they are clean or in need of replay. They try to replay dirty journals, but only the journals of withdrawn nodes will be "not busy" and therefore available for replay. Until the journal replay is complete, no i/o related glocks may be given out, to ensure that the replay does not cause the aforementioned corruption: We cannot allow any journal replay to overwrite blocks associated with a glock once it is held. The glocks not affected by a withdraw are permitted to be passed around as normal during a withdraw. A new glops flag, called GLOF_OK_AT_WITHDRAW, indicates glocks that may be passed around freely while a withdraw is taking place. One such glock is the "live" glock which is now used to signal when a withdraw occurs. 
When a withdraw occurs, the node signals its withdraw by dequeueing the "live" glock and trying to enqueue it in EX mode, thus forcing the other nodes to all see a demote request, by way of a "1CB" (one callback) try lock. The "live" glock is not granted in EX; the callback is only just used to indicate a withdraw has occurred. Note that all nodes in the cluster must wait for the recovering node to finish replaying the withdrawing node's journal before continuing. To this end, it checks that the journals are clean multiple times in a retry loop. Signed-off-by: Bob Peterson --- fs/gfs2/glock.c | 35 -- fs/gfs2/glock.h | 1 + fs/gfs2/glops.c | 61 +- fs/gfs2/incore.h | 6 ++ fs/gfs2/lock_dlm.c | 32 ++ fs/gfs2/log.c| 22 +-- fs/gfs2/meta_io.c| 2 +- fs/gfs2/ops_fstype.c | 48 ++ fs/gfs2/super.c | 24 --- fs/gfs2/super.h | 1 + fs/gfs2/util.c | 148 ++- fs/gfs2/util.h | 3 + 12 files changed, 315 insertions(+), 68 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c6d6e478f5e3..20fb6cdf7829 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -242,7 +242,8 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); - GLOCK_BUG_ON(gl, mapping && mapping->nrpages); + GLOCK_BUG_ON(gl, mapping && mapping->nrpages && +!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)); trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } @@ -543,6 +544,8 @@ __acquires(&gl->gl_lockref.lock) int ret; if (unlikely(withdrawn(sdp)) && + !(glops->go_flags & GLOF_OK_AT_WITHDRAW) && + (gh && !(LM_FLAG_NOEXP & gh->gh_flags)) && target != LM_ST_UNLOCKED) return; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | @@ -561,9 +564,10 @@ __acquires(&gl->gl_lockref.lock) (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) clear_bit(GLF_BLOCKING, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); - if (glops->go_sync) + if (glops->go_sync && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 
glops->go_sync(gl); - if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags) && + !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); @@ -1091,7 +1095,8 @@ int gfs2_glock_nq(struct gfs2_holder *gh) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; int error = 0; - if (unlikely(withdrawn(sdp))) + if (unlikely(withdrawn(sdp) && !(LM_FLAG_NOEXP & gh->gh_flags) && +!(gl->gl_ops->go_flags & GLOF_OK_AT_WITHDRAW))) return -EIO; if (test_bit(GLF_LRU, &gl->gl_flags)) @@ -1135,11 +1140,28 @@ int gfs2_glock_poll(struct gf
[Cluster-devel] [GFS2 PATCH 5/9] gfs2: Keep transactions on ail1 list until after issuing revokes
Before this patch, function gfs2_write_revokes would call function gfs2_ail1_empty, then run the ail1 list, issuing revokes. But gfs2_ail1_empty can move transactions to the ail2 list, and thus, their revokes were never issued. This patch adds a new parameter to gfs2_ail1_empty that allows the transactions to remain on the ail1 list until it can issue revokes for them. Then, if they have no more buffers, they're moved to the ail2 list after the revokes are added. Signed-off-by: Bob Peterson --- fs/gfs2/log.c | 30 ++ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 81550038ace3..0d0dec3231c9 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -217,11 +217,12 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) /** * gfs2_ail1_empty - Try to empty the ail1 lists * @sdp: The superblock + * @move_empty_to_ail2: 1 if transaction to be moved to ail2 when empty * * Tries to empty the ail1 lists, starting with the oldest first */ -static int gfs2_ail1_empty(struct gfs2_sbd *sdp) +static int gfs2_ail1_empty(struct gfs2_sbd *sdp, bool move_empty_to_ail2) { struct gfs2_trans *tr, *s; int oldest_tr = 1; @@ -230,10 +231,12 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp) spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { gfs2_ail1_empty_one(sdp, tr); - if (list_empty(&tr->tr_ail1_list) && oldest_tr) - list_move(&tr->tr_list, &sdp->sd_ail2_list); - else + if (list_empty(&tr->tr_ail1_list) && oldest_tr) { + if (move_empty_to_ail2) + list_move(&tr->tr_list, &sdp->sd_ail2_list); + } else { oldest_tr = 0; + } } ret = list_empty(&sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); @@ -609,12 +612,12 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) void gfs2_write_revokes(struct gfs2_sbd *sdp) { - struct gfs2_trans *tr; + struct gfs2_trans *tr, *s; struct gfs2_bufdata *bd, *tmp; int have_revokes = 0; int max_revokes = 
(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); - gfs2_ail1_empty(sdp); + gfs2_ail1_empty(sdp, false); spin_lock(&sdp->sd_ail_lock); list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) { @@ -640,17 +643,20 @@ void gfs2_write_revokes(struct gfs2_sbd *sdp) } gfs2_log_lock(sdp); spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) { if (max_revokes == 0) - goto out_of_blocks; + break; if (!list_empty(&bd->bd_list)) continue; gfs2_add_revoke(sdp, bd); max_revokes--; } + if (list_empty(&tr->tr_ail1_list)) + list_move(&tr->tr_list, &sdp->sd_ail2_list); + if (max_revokes == 0) + break; } -out_of_blocks: spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); @@ -842,7 +848,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) for (;;) { gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); - if (gfs2_ail1_empty(sdp)) + if (gfs2_ail1_empty(sdp, true)) break; } atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ @@ -1008,7 +1014,7 @@ int gfs2_logd(void *data) did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { - gfs2_ail1_empty(sdp); + gfs2_ail1_empty(sdp, true); if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | @@ -1019,7 +1025,7 @@ int gfs2_logd(void *data) if (gfs2_ail_flush_reqd(sdp)) { gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); - gfs2_ail1_empty(sdp); + gfs2_ail1_empty(sdp, true); if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
[Cluster-devel] [GFS2 PATCH] Revert "gfs2: read journal in large chunks to locate the head"
This reverts commit 2a5f14f279f59143139bcd1606903f2f80a34241. This patch causes xfstests generic/311 to fail. Reverting this for now until we have a proper fix. Signed-off-by: Abhi Das --- fs/gfs2/glops.c | 1 - fs/gfs2/log.c| 4 +- fs/gfs2/lops.c | 190 ++- fs/gfs2/lops.h | 4 +- fs/gfs2/ops_fstype.c | 1 - fs/gfs2/recovery.c | 123 fs/gfs2/recovery.h | 2 + fs/gfs2/super.c | 1 - 8 files changed, 134 insertions(+), 192 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f15b4c57c4bd..78510ab91835 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,7 +28,6 @@ #include "util.h" #include "trans.h" #include "dir.h" -#include "lops.h" struct workqueue_struct *gfs2_freeze_wq; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 5bfaf381921a..b8830fda51e8 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -733,7 +733,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags); log_flush_wait(sdp); } @@ -810,7 +810,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 94dcab655bc0..2295042bc625 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -17,9 +17,7 @@ #include #include #include -#include -#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -195,6 +193,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, /** * gfs2_end_log_write - end of i/o to the log * @bio: The bio + * @error: Status of i/o request * * Each bio_vec contains either data from the pagecache or data * 
relating to the log itself. Here we iterate over the bio_vec @@ -231,19 +230,20 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_submit_bio - Submit any pending log bio * @biop: Address of the bio pointer - * @opf: REQ_OP | op_flags + * @op: REQ_OP + * @op_flags: req_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. */ -void gfs2_log_submit_bio(struct bio **biop, int opf) +void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags) { struct bio *bio = *biop; if (bio) { struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio->bi_opf = opf; + bio_set_op_attrs(bio, op, op_flags); submit_bio(bio); *biop = NULL; } @@ -304,7 +304,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk && !flush) return bio; - gfs2_log_submit_bio(biop, op); + gfs2_log_submit_bio(biop, op, 0); } *biop = gfs2_log_alloc_bio(sdp, blkno, end_io); @@ -375,184 +375,6 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } -/** - * gfs2_end_log_read - end I/O callback for reads from the log - * @bio: The bio - * - * Simply unlock the pages in the bio. The main thread will wait on them and - * process them in order as necessary. - */ - -static void gfs2_end_log_read(struct bio *bio) -{ - struct page *page; - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - page = bvec->bv_page; - if (bio->bi_status) { - int err = blk_status_to_errno(bio->bi_status); - - SetPageError(page); - mapping_set_error(page->mapping, err); - } - unlock_page(page); - } - - bio_put(bio); -} - -/** - * gfs2_jhead_pg_srch - Look for the journal head in a given page. - * @jd: The journal descriptor - * @page: The page to look in - * - * Returns: 1 if found, 0 otherwise. 
- */ - -static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head, - struct page *page) -{ - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - struct gfs2_log_header_host uninitialized_var(lh); - void *kaddr = kmap_atomic(page); - unsigned int offset; - bool ret = false; - - for (offset = 0; offset < PA
[Cluster-devel] [PATCH] Revert "gfs2: read journal in large chunks to locate the head"
This reverts commit 2a5f14f279f59143139bcd1606903f2f80a34241. The reverted commit causes xfstests generic/311 to fail, so revert it for now until we have a proper fix. Signed-off-by: Abhi Das Signed-off-by: Bob Peterson --- fs/gfs2/glops.c | 1 - fs/gfs2/log.c | 4 +- fs/gfs2/lops.c | 190 ++- fs/gfs2/lops.h | 4 +- fs/gfs2/ops_fstype.c | 1 - fs/gfs2/recovery.c | 123 + fs/gfs2/recovery.h | 2 + fs/gfs2/super.c | 1 - 8 files changed, 134 insertions(+), 192 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f15b4c57c4bd..78510ab91835 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,7 +28,6 @@ #include "util.h" #include "trans.h" #include "dir.h" -#include "lops.h" struct workqueue_struct *gfs2_freeze_wq; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 5bfaf381921a..b8830fda51e8 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -733,7 +733,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags); log_flush_wait(sdp); } @@ -810,7 +810,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 94dcab655bc0..2295042bc625 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -17,9 +17,7 @@ #include #include #include -#include -#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -195,6 +193,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, /** * gfs2_end_log_write - end of i/o to the log * @bio: The bio + * @error: Status of i/o request * * Each bio_vec contains either 
data from the pagecache or data * relating to the log itself. Here we iterate over the bio_vec @@ -231,19 +230,20 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_submit_bio - Submit any pending log bio * @biop: Address of the bio pointer - * @opf: REQ_OP | op_flags + * @op: REQ_OP + * @op_flags: req_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. */ -void gfs2_log_submit_bio(struct bio **biop, int opf) +void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags) { struct bio *bio = *biop; if (bio) { struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio->bi_opf = opf; + bio_set_op_attrs(bio, op, op_flags); submit_bio(bio); *biop = NULL; } @@ -304,7 +304,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk && !flush) return bio; - gfs2_log_submit_bio(biop, op); + gfs2_log_submit_bio(biop, op, 0); } *biop = gfs2_log_alloc_bio(sdp, blkno, end_io); @@ -375,184 +375,6 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } -/** - * gfs2_end_log_read - end I/O callback for reads from the log - * @bio: The bio - * - * Simply unlock the pages in the bio. The main thread will wait on them and - * process them in order as necessary. - */ - -static void gfs2_end_log_read(struct bio *bio) -{ - struct page *page; - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - page = bvec->bv_page; - if (bio->bi_status) { - int err = blk_status_to_errno(bio->bi_status); - - SetPageError(page); - mapping_set_error(page->mapping, err); - } - unlock_page(page); - } - - bio_put(bio); -} - -/** - * gfs2_jhead_pg_srch - Look for the journal head in a given page. - * @jd: The journal descriptor - * @page: The page to look in - * - * Returns: 1 if found, 0 otherwise. 
- */ - -static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head, - struct page *page) -{ - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - struct gfs2_log_header_host uninitialized_var(lh); - void *kaddr = kmap_atomic(page); - unsigned int offset; - bool ret = fal