Repository: kudu
Updated Branches:
  refs/heads/master c73f023da -> fcb0be638
[consensus] KUDU-2367 fix replica health report

This patch addresses KUDU-2367, where a leader replica under particular
conditions reports a follower's health status as FAILED instead of
FAILED_UNRECOVERABLE. Specifically, a bug in the
PeerMessageQueue::PeerHealthStatus() method caused the leader to keep
reporting the replica's health status as FAILED even after the replica
fell behind the WAL segment GC threshold.

A couple of existing integration tests are also updated to provide
appropriate coverage for this patch.

Change-Id: If77474adc8f618f2cda35f992190133138fdb511
Reviewed-on: http://gerrit.cloudera.org:8080/9755
Tested-by: Alexey Serbin <aser...@cloudera.com>
Reviewed-by: Mike Percy <mpe...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/fcb0be63
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/fcb0be63
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/fcb0be63

Branch: refs/heads/master
Commit: fcb0be6381a47155e171eb50a333af502bbf506f
Parents: c73f023
Author: Alexey Serbin <aser...@cloudera.com>
Authored: Thu Mar 22 01:02:32 2018 -0700
Committer: Mike Percy <mpe...@apache.org>
Committed: Fri Mar 23 05:30:52 2018 +0000

----------------------------------------------------------------------
 src/kudu/consensus/consensus_queue-test.cc | 67 +++++++++++++++++++-
 src/kudu/consensus/consensus_queue.cc      | 42 +++++++++---
 src/kudu/consensus/consensus_queue.h       |  1 +
 .../raft_consensus-itest-base.cc           |  7 +-
 .../raft_consensus-itest-base.h            |  5 +-
 .../raft_consensus_nonvoter-itest.cc       | 40 ++++++++----
 6 files changed, 137 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/consensus/consensus_queue-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue-test.cc b/src/kudu/consensus/consensus_queue-test.cc
index c8dc086..c3cfd24 100644
--- a/src/kudu/consensus/consensus_queue-test.cc
+++ b/src/kudu/consensus/consensus_queue-test.cc
@@ -59,9 +59,9 @@
 #include "kudu/util/threadpool.h"
 
 DECLARE_int32(consensus_max_batch_size_bytes);
+DECLARE_int32(follower_unavailable_considered_failed_sec);
 
-METRIC_DECLARE_entity(tablet);
-
+using kudu::consensus::HealthReportPB;
 using std::vector;
 
 namespace kudu {
@@ -189,7 +189,7 @@ class ConsensusQueueTest : public KuduTest {
 
   void WaitForLocalPeerToAckIndex(int index) {
     while (true) {
-      PeerMessageQueue::TrackedPeer leader = queue_->GetTrackedPeerForTests(kLeaderUuid);
+      const auto leader = queue_->GetTrackedPeerForTests(kLeaderUuid);
       if (leader.last_received.index() >= index) {
         break;
       }
@@ -945,5 +945,66 @@ TEST_F(ConsensusQueueTest, TestFollowerCommittedIndexAndMetrics) {
   ASSERT_EQ(5, queue_->metrics_.num_ops_behind_leader->value());
 }
 
+// Unit test for the PeerMessageQueue::PeerHealthStatus() method.
+TEST(ConsensusQueueUnitTest, PeerHealthStatus) {
+  static constexpr PeerStatus kPeerStatusesForUnknown[] = {
+    PeerStatus::NEW,
+    PeerStatus::REMOTE_ERROR,
+    PeerStatus::RPC_LAYER_ERROR,
+    PeerStatus::TABLET_NOT_FOUND,
+    PeerStatus::INVALID_TERM,
+    PeerStatus::CANNOT_PREPARE,
+    PeerStatus::LMP_MISMATCH,
+  };
+
+  RaftPeerPB peer_pb;
+  PeerMessageQueue::TrackedPeer peer(peer_pb);
+  EXPECT_EQ(HealthReportPB::UNKNOWN, PeerMessageQueue::PeerHealthStatus(peer));
+  for (auto status : kPeerStatusesForUnknown) {
+    peer.last_exchange_status = status;
+    EXPECT_EQ(HealthReportPB::UNKNOWN, PeerMessageQueue::PeerHealthStatus(peer));
+  }
+
+  peer.last_exchange_status = PeerStatus::TABLET_FAILED;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.last_exchange_status = PeerStatus::OK;
+  EXPECT_EQ(HealthReportPB::HEALTHY, PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.wal_catchup_possible = false;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.wal_catchup_possible = true;
+  EXPECT_EQ(HealthReportPB::HEALTHY, PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.last_communication_time -=
+      MonoDelta::FromSeconds(FLAGS_follower_unavailable_considered_failed_sec + 1);
+  EXPECT_EQ(HealthReportPB::FAILED, PeerMessageQueue::PeerHealthStatus(peer));
+  for (auto status : kPeerStatusesForUnknown) {
+    peer.last_exchange_status = status;
+    EXPECT_EQ(HealthReportPB::FAILED, PeerMessageQueue::PeerHealthStatus(peer));
+  }
+
+  peer.last_exchange_status = PeerStatus::TABLET_FAILED;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.last_exchange_status = PeerStatus::OK;
+  EXPECT_EQ(HealthReportPB::FAILED, PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.wal_catchup_possible = false;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, PeerMessageQueue::PeerHealthStatus(peer));
+
+  for (auto status : kPeerStatusesForUnknown) {
+    peer.last_exchange_status = status;
+    EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, PeerMessageQueue::PeerHealthStatus(peer));
+  }
+
+  peer.last_exchange_status = PeerStatus::OK;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.last_exchange_status = PeerStatus::TABLET_FAILED;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, PeerMessageQueue::PeerHealthStatus(peer));
+}
+
 } // namespace consensus
 } // namespace kudu


http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/consensus/consensus_queue.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.cc b/src/kudu/consensus/consensus_queue.cc
index 804739c..63ffd74 100644
--- a/src/kudu/consensus/consensus_queue.cc
+++ b/src/kudu/consensus/consensus_queue.cc
@@ -552,24 +552,48 @@ void PeerMessageQueue::UpdatePeerHealthUnlocked(TrackedPeer* peer) {
   }
 }
 
+// When reporting on replica health, it's important to report the
+// 'definitive' health statuses once they surface. That allows the system
+// to expedite decisions on replica replacement because the more 'definitive'
+// statuses have less uncertainty and provide more information (compared
+// with less 'definitive' statuses). Informally, the level of 'definitiveness'
+// could be measured by the number of possible state transitions on the replica
+// health status state diagram.
+//
+// The health status chain below has an increasing level of 'definitiveness'
+// from left to right:
+//
+//   UNKNOWN --> HEALTHY --> FAILED --> FAILED_UNRECOVERABLE
+//
+// For example, when a replica has been unreachable longer than the
+// time interval specified by the --follower_unavailable_considered_failed_sec
+// flag, the system should start reporting its health status as FAILED.
+// However, once the replica falls behind the WAL segment GC threshold, the
+// system should start reporting its health status as FAILED_UNRECOVERABLE.
+// The code below is written to adhere to that informal policy.
 HealthReportPB::HealthStatus PeerMessageQueue::PeerHealthStatus(const TrackedPeer& peer) {
-  // Replicas which have been unreachable for too long are considered failed.
-  auto max_unreachable = MonoDelta::FromSeconds(FLAGS_follower_unavailable_considered_failed_sec);
-  if (MonoTime::Now() - peer.last_communication_time > max_unreachable) {
-    return HealthReportPB::FAILED;
-  }
-
-  // Replicas that have fallen behind the leader's retained WAL are failed
-  // and have no chance to come back.
+  // Replicas that have fallen behind the leader's retained WAL segments are
+  // failed irrecoverably and will not come back because they cannot ever catch
+  // up with the leader replica.
   if (!peer.wal_catchup_possible) {
     return HealthReportPB::FAILED_UNRECOVERABLE;
   }
 
-  // Replicas returning TABLET_FAILED status are considered failed.
+  // Replicas returning TABLET_FAILED status are considered irrecoverably
+  // failed because the TABLET_FAILED status indicates I/O failures
+  // caused by disk corruption, etc.
   if (peer.last_exchange_status == PeerStatus::TABLET_FAILED) {
     return HealthReportPB::FAILED_UNRECOVERABLE;
   }
 
+  // Replicas which have been unreachable for too long are considered failed,
+  // unless it's known that they have failed irrecoverably (see above). They
+  // might come back at some point and successfully catch up with the leader.
+  auto max_unreachable = MonoDelta::FromSeconds(FLAGS_follower_unavailable_considered_failed_sec);
+  if (MonoTime::Now() - peer.last_communication_time > max_unreachable) {
+    return HealthReportPB::FAILED;
+  }
+
+  // The happy case: replicas that returned OK during the recent exchange are considered healthy.
   if (peer.last_exchange_status == PeerStatus::OK) {
     return HealthReportPB::HEALTHY;
   }


http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/consensus/consensus_queue.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.h b/src/kudu/consensus/consensus_queue.h
index cbbc522..a2b79e2 100644
--- a/src/kudu/consensus/consensus_queue.h
+++ b/src/kudu/consensus/consensus_queue.h
@@ -360,6 +360,7 @@ class PeerMessageQueue {
   FRIEND_TEST(ConsensusQueueTest, TestQueueAdvancesCommittedIndex);
   FRIEND_TEST(ConsensusQueueTest, TestQueueMovesWatermarksBackward);
   FRIEND_TEST(ConsensusQueueTest, TestFollowerCommittedIndexAndMetrics);
+  FRIEND_TEST(ConsensusQueueUnitTest, PeerHealthStatus);
   FRIEND_TEST(RaftConsensusQuorumTest, TestReplicasEnforceTheLogMatchingProperty);
 
   // Mode specifies how the queue currently behaves:


http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/integration-tests/raft_consensus-itest-base.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest-base.cc b/src/kudu/integration-tests/raft_consensus-itest-base.cc
index aab6981..1fee8a8 100644
--- a/src/kudu/integration-tests/raft_consensus-itest-base.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest-base.cc
@@ -197,7 +197,8 @@ void RaftConsensusITestBase::CauseFollowerToFallBehindLogGC(
     string* leader_uuid,
     int64_t* orig_term,
     string* fell_behind_uuid,
-    BehindWalGcBehavior tserver_behavior) {
+    BehindWalGcBehavior tserver_behavior,
+    const MonoDelta& pre_workload_delay) {
   MonoDelta kTimeout = MonoDelta::FromSeconds(10);
   // Wait for all of the replicas to have acknowledged the elected
   // leader and logged the first NO_OP.
@@ -234,6 +235,10 @@ void RaftConsensusITestBase::CauseFollowerToFallBehindLogGC(
   *leader_uuid = leader->uuid();
   int leader_index = cluster_->tablet_server_index_by_uuid(*leader_uuid);
 
+  if (pre_workload_delay.Initialized()) {
+    SleepFor(pre_workload_delay);
+  }
+
   TestWorkload workload(cluster_.get());
   workload.set_table_name(kTableId);
   workload.set_timeout_allowed(true);


http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/integration-tests/raft_consensus-itest-base.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest-base.h b/src/kudu/integration-tests/raft_consensus-itest-base.h
index a4266ad..c00426d 100644
--- a/src/kudu/integration-tests/raft_consensus-itest-base.h
+++ b/src/kudu/integration-tests/raft_consensus-itest-base.h
@@ -28,6 +28,8 @@
 
 namespace kudu {
 
+class MonoDelta;
+
 namespace cluster {
 class ExternalTabletServer;
 }
@@ -84,7 +86,8 @@ class RaftConsensusITestBase : public TabletServerIntegrationTestBase {
       std::string* leader_uuid,
       int64_t* orig_term,
      std::string* fell_behind_uuid,
-      BehindWalGcBehavior tserver_behavior = BehindWalGcBehavior::STOP_CONTINUE);
+      BehindWalGcBehavior tserver_behavior = BehindWalGcBehavior::STOP_CONTINUE,
+      const MonoDelta& pre_workload_delay = {});
 
   CountDownLatch inserters_;
 };


http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc b/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
index 84a6d94..da19cc4 100644
--- a/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
@@ -1927,20 +1927,28 @@ TEST_P(IncompatibleReplicaReplacementSchemesITest, MasterAndTserverMisconfig) {
   }
 }
 
+// This test scenario runs the system with the 3-4-3 replica management scheme,
+// where the total number of tablet servers equals the replication factor
+// of the tablet being tested. The scenario makes one follower replica
+// fall behind the WAL segment GC threshold. The system should be able to
+// replace the failed replica 'in-place', i.e. no additional tablet server is
+// needed for the cluster to recover in such situations. The scenario verifies
+// that re-replication works as expected when the tablet server hosting the
+// failed replica is:
+//   ** paused and then resumed, emulating a lagging tablet server
+//   ** shut down and then started back up
 class ReplicaBehindWalGcThresholdITest :
     public RaftConsensusNonVoterITest,
-    public ::testing::WithParamInterface<RaftConsensusITestBase::BehindWalGcBehavior> {
+    public ::testing::WithParamInterface<
+        std::tuple<RaftConsensusITestBase::BehindWalGcBehavior, bool>> {
 };
 
 INSTANTIATE_TEST_CASE_P(, ReplicaBehindWalGcThresholdITest,
-    ::testing::Values(RaftConsensusITestBase::BehindWalGcBehavior::STOP_CONTINUE,
-                      RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN_RESTART,
-                      RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN));
-
-// Test that the catalog manager running with the 3-4-3 scheme is able to do
-// 'in-place' replica replacement when replica falls behind the WAL segment GC,
-// i.e. no additional tablet server is needed in case of tablet with replication
-// factor 3 when there are just 3 tablet servers in the cluster.
+    ::testing::Combine(
+        ::testing::Values(RaftConsensusITestBase::BehindWalGcBehavior::STOP_CONTINUE,
+                          RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN_RESTART,
+                          RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN),
+        ::testing::Bool()));
 TEST_P(ReplicaBehindWalGcThresholdITest, ReplicaReplacement) {
   if (!AllowSlowTests()) {
     LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
@@ -1953,7 +1961,8 @@ TEST_P(ReplicaBehindWalGcThresholdITest, ReplicaReplacement) {
   const auto kUnavaiableFailedSec = 5;
   FLAGS_num_replicas = kReplicasNum;
   FLAGS_num_tablet_servers = kReplicasNum;
-  const auto tserver_behavior = GetParam();
+  const auto tserver_behavior = std::get<0>(GetParam());
+  const bool do_delay_workload = std::get<1>(GetParam());
 
   vector<string> master_flags = {
     // This scenario runs with the 3-4-3 replica management scheme.
@@ -1981,8 +1990,17 @@ TEST_P(ReplicaBehindWalGcThresholdITest, ReplicaReplacement) {
   string follower_uuid;
   string leader_uuid;
   int64_t orig_term;
+  MonoDelta delay;
+  if (do_delay_workload) {
+    // This delay makes the leader replica first report the follower's health
+    // status as FAILED. Later on, when the replica falls behind the WAL
+    // segment GC threshold, the leader replica should report the follower's
+    // health status as FAILED_UNRECOVERABLE.
+    delay = MonoDelta::FromSeconds(3 * kUnavaiableFailedSec);
+  }
   NO_FATALS(CauseFollowerToFallBehindLogGC(
-      tablet_servers_, &leader_uuid, &orig_term, &follower_uuid, tserver_behavior));
+      tablet_servers_, &leader_uuid, &orig_term, &follower_uuid,
+      tserver_behavior, delay));
 
   // The catalog manager should evict the replicas which fell behind the WAL
   // segment GC threshold right away.
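
----------------------------------------------------------------------

A note on the core of the fix: PeerHealthStatus() now performs the
'irrecoverable' checks before the unreachable-for-too-long check, so a
replica that can no longer catch up with the leader's retained WAL
segments is reported as FAILED_UNRECOVERABLE even while it is also
unreachable. The sketch below distills that decision order into a
self-contained snippet; the types are simplified stand-ins (std::chrono
in place of MonoTime/MonoDelta, pared-down enums), not the actual Kudu
API.

#include <chrono>

// Simplified stand-ins for PeerStatus, HealthReportPB::HealthStatus, and
// PeerMessageQueue::TrackedPeer; illustrative only.
enum class PeerStatus { NEW, OK, TABLET_FAILED };
enum class HealthStatus { UNKNOWN, HEALTHY, FAILED, FAILED_UNRECOVERABLE };

struct TrackedPeer {
  PeerStatus last_exchange_status = PeerStatus::NEW;
  bool wal_catchup_possible = true;
  std::chrono::steady_clock::time_point last_communication_time =
      std::chrono::steady_clock::now();
};

HealthStatus PeerHealthStatus(const TrackedPeer& peer,
                              std::chrono::seconds max_unreachable) {
  // The most definitive statuses are checked first: once WAL catch-up is
  // impossible (or the tablet itself has failed), nothing may downgrade
  // the report back to the weaker FAILED status.
  if (!peer.wal_catchup_possible ||
      peer.last_exchange_status == PeerStatus::TABLET_FAILED) {
    return HealthStatus::FAILED_UNRECOVERABLE;
  }
  // Unreachable for too long: FAILED, but the replica may still recover.
  if (std::chrono::steady_clock::now() - peer.last_communication_time >
      max_unreachable) {
    return HealthStatus::FAILED;
  }
  // The most recent exchange succeeded: healthy.
  if (peer.last_exchange_status == PeerStatus::OK) {
    return HealthStatus::HEALTHY;
  }
  // No conclusive information yet (e.g., a newly tracked peer).
  return HealthStatus::UNKNOWN;
}

Before this patch the unreachable-for-too-long check ran first, so a
replica that was both unreachable and behind the WAL segment GC threshold
kept being reported as FAILED; with the ordering above it is reported as
FAILED_UNRECOVERABLE, which is what the new
ConsensusQueueUnitTest.PeerHealthStatus unit test asserts.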