Repository: kudu Updated Branches: refs/heads/master 1fcce4220 -> f6777db35
[consensus] KUDU-2335 increment term on explicit step down Prior to this patch, the catalog manager could get a tablet report from a former leader replica with empty leader UUID and old term. A dedicated logic in the catalog manager's code (see section 7d(i)) would amend the empty leader UUID to replace it with the previous leader's UUID. As a result of those shenanigans, catalog manager would interpret the incoming report as a report from a leader replica that reports its own health status as UNKNOWN. The TwoConcurrentRebalancers scenario of the recently introduced ConcurrentRebalancersTest reproduces the issue pretty often (about 1 in 100 runs failed), so it was easy to pin-point the problem. Mike sketched the fix and I ran the new code via dist-test about 1K times and verified the problem is gone. As for the test coverage, in addition to the already mentioned would-be-flaky TwoConcurrentRebalancers scenario, I modified RaftConsensusElectionITest.LeaderStepDown to reliably catch regressions. Additionally, I updated the TabletCopyITest.TestRejectRogueLeader scenario not asking rogue leader to step down: otherwise it would fail. Change-Id: I4e1f1446176a78ba04e74dd1153f9048a32d8d5f Reviewed-on: http://gerrit.cloudera.org:8080/11122 Reviewed-by: Mike Percy <mpe...@apache.org> Tested-by: Alexey Serbin <aser...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/f6777db3 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/f6777db3 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/f6777db3 Branch: refs/heads/master Commit: f6777db35dfb3880ea188990809102d167c6d557 Parents: 1fcce42 Author: Alexey Serbin <aser...@cloudera.com> Authored: Fri Aug 3 15:30:22 2018 -0700 Committer: Alexey Serbin <aser...@cloudera.com> Committed: Mon Aug 6 05:58:08 2018 +0000 ---------------------------------------------------------------------- src/kudu/consensus/raft_consensus.cc | 4 +- .../raft_consensus_election-itest.cc | 41 ++++++++++++++------ src/kudu/integration-tests/tablet_copy-itest.cc | 13 ++----- 3 files changed, 35 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/f6777db3/src/kudu/consensus/raft_consensus.cc ---------------------------------------------------------------------- diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc index f81637e..cd58896 100644 --- a/src/kudu/consensus/raft_consensus.cc +++ b/src/kudu/consensus/raft_consensus.cc @@ -552,8 +552,8 @@ Status RaftConsensus::StepDown(LeaderStepDownResponsePB* resp) { return Status::OK(); } LOG_WITH_PREFIX_UNLOCKED(INFO) << "Received request to step down"; - RETURN_NOT_OK(BecomeReplicaUnlocked()); - + RETURN_NOT_OK(HandleTermAdvanceUnlocked(CurrentTermUnlocked() + 1, + SKIP_FLUSH_TO_DISK)); // Snooze the failure detector for an extra leader failure timeout. // This should ensure that a different replica is elected leader after this one steps down. SnoozeFailureDetector(string("explicit stepdown request"), MonoDelta::FromMilliseconds( http://git-wip-us.apache.org/repos/asf/kudu/blob/f6777db3/src/kudu/integration-tests/raft_consensus_election-itest.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/raft_consensus_election-itest.cc b/src/kudu/integration-tests/raft_consensus_election-itest.cc index b10294a..3debfb5 100644 --- a/src/kudu/integration-tests/raft_consensus_election-itest.cc +++ b/src/kudu/integration-tests/raft_consensus_election-itest.cc @@ -26,6 +26,7 @@ #include <gtest/gtest.h> #include "kudu/common/wire_protocol.pb.h" +#include "kudu/consensus/consensus.pb.h" #include "kudu/consensus/metadata.pb.h" #include "kudu/gutil/gscoped_ptr.h" #include "kudu/gutil/map-util.h" @@ -59,8 +60,10 @@ METRIC_DECLARE_counter(transaction_memory_pressure_rejections); METRIC_DECLARE_gauge_int64(raft_term); using kudu::cluster::ExternalTabletServer; +using kudu::consensus::ConsensusStatePB; using kudu::consensus::RaftPeerPB; using kudu::itest::AddServer; +using kudu::itest::GetConsensusState; using kudu::itest::GetReplicaStatusAndCheckIfLeader; using kudu::itest::LeaderStepDown; using kudu::itest::RemoveServer; @@ -278,6 +281,7 @@ TEST_F(RaftConsensusElectionITest, AutomaticLeaderElectionOneReplica) { } TEST_F(RaftConsensusElectionITest, LeaderStepDown) { + const auto kTimeout = MonoDelta::FromSeconds(10); const vector<string> kTsFlags = { "--enable_leader_failure_detection=false" }; @@ -293,25 +297,39 @@ TEST_F(RaftConsensusElectionITest, LeaderStepDown) { AppendValuesFromMap(tablet_servers_, &tservers); // Start with no leader. - Status s = GetReplicaStatusAndCheckIfLeader(tservers[0], tablet_id_, MonoDelta::FromSeconds(10)); + const auto* ts = tservers[0]; + Status s = GetReplicaStatusAndCheckIfLeader(ts, tablet_id_, kTimeout); ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not be leader yet: " << s.ToString(); // Become leader. - ASSERT_OK(StartElection(tservers[0], tablet_id_, MonoDelta::FromSeconds(10))); - ASSERT_OK(WaitUntilLeader(tservers[0], tablet_id_, MonoDelta::FromSeconds(10))); - ASSERT_OK(WriteSimpleTestRow(tservers[0], tablet_id_, RowOperationsPB::INSERT, - kTestRowKey, kTestRowIntVal, "foo", MonoDelta::FromSeconds(10))); - ASSERT_OK(WaitForServersToAgree(MonoDelta::FromSeconds(10), tablet_servers_, tablet_id_, 2)); + ASSERT_OK(StartElection(ts, tablet_id_, kTimeout)); + ASSERT_OK(WaitUntilLeader(ts, tablet_id_, kTimeout)); + ASSERT_OK(WriteSimpleTestRow(ts, tablet_id_, RowOperationsPB::INSERT, + kTestRowKey, kTestRowIntVal, "foo", kTimeout)); + ASSERT_OK(WaitForServersToAgree(kTimeout, tablet_servers_, tablet_id_, 2)); + + // Get the Raft term from the newly established leader. + ConsensusStatePB cstate_before; + ASSERT_OK(GetConsensusState(ts, tablet_id_, kTimeout, + consensus::EXCLUDE_HEALTH_REPORT, &cstate_before)); // Step down and test that a 2nd stepdown returns the expected result. - ASSERT_OK(LeaderStepDown(tservers[0], tablet_id_, MonoDelta::FromSeconds(10))); + ASSERT_OK(LeaderStepDown(ts, tablet_id_, kTimeout)); + + // Get the Raft term from the leader that has just stepped down. + ConsensusStatePB cstate_after; + ASSERT_OK(GetConsensusState(ts, tablet_id_, kTimeout, + consensus::EXCLUDE_HEALTH_REPORT, &cstate_after)); + // The stepped-down leader should increment its run-time Raft term. + EXPECT_GT(cstate_after.current_term(), cstate_before.current_term()); + TabletServerErrorPB error; - s = LeaderStepDown(tservers[0], tablet_id_, MonoDelta::FromSeconds(10), &error); + s = LeaderStepDown(ts, tablet_id_, kTimeout, &error); ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not be leader anymore: " << s.ToString(); ASSERT_EQ(TabletServerErrorPB::NOT_THE_LEADER, error.code()) << SecureShortDebugString(error); - s = WriteSimpleTestRow(tservers[0], tablet_id_, RowOperationsPB::INSERT, - kTestRowKey, kTestRowIntVal, "foo", MonoDelta::FromSeconds(10)); + s = WriteSimpleTestRow(ts, tablet_id_, RowOperationsPB::INSERT, + kTestRowKey, kTestRowIntVal, "foo", kTimeout); ASSERT_TRUE(s.IsIllegalState()) << "TS #0 should not accept writes as follower: " << s.ToString(); } @@ -517,8 +535,7 @@ TEST_F(RaftConsensusElectionITest, TombstonedVoteAfterFailedTabletCopy) { "tablet_copy_fault_crash_on_fetch_all", "1.0")); auto* leader_ts = tablet_servers_[leader_uuid]; auto* follower_ts = tablet_servers_[follower_uuid]; - ASSERT_OK(itest::AddServer(leader_ts, tablet_id_, follower_ts, RaftPeerPB::VOTER, - kTimeout)); + ASSERT_OK(AddServer(leader_ts, tablet_id_, follower_ts, RaftPeerPB::VOTER, kTimeout)); ASSERT_OK(cluster_->tablet_server_by_uuid(follower_uuid)->WaitForInjectedCrash(kTimeout)); // Shut down the rest of the cluster, then only bring back the node we http://git-wip-us.apache.org/repos/asf/kudu/blob/f6777db3/src/kudu/integration-tests/tablet_copy-itest.cc ---------------------------------------------------------------------- diff --git a/src/kudu/integration-tests/tablet_copy-itest.cc b/src/kudu/integration-tests/tablet_copy-itest.cc index e8fc265..d4cf644 100644 --- a/src/kudu/integration-tests/tablet_copy-itest.cc +++ b/src/kudu/integration-tests/tablet_copy-itest.cc @@ -256,8 +256,7 @@ TEST_F(TabletCopyITest, TestRejectRogueLeader) { ASSERT_OK(cluster_->tablet_server(zombie_leader_index)->Resume()); // Loop for a few seconds to ensure that the tablet doesn't transition to READY. - MonoTime deadline = MonoTime::Now(); - deadline.AddDelta(MonoDelta::FromSeconds(5)); + MonoTime deadline = MonoTime::Now() + MonoDelta::FromSeconds(5); while (MonoTime::Now() < deadline) { ASSERT_OK(itest::ListTablets(ts, timeout, &tablets)); ASSERT_EQ(1, tablets.size()); @@ -267,13 +266,9 @@ TEST_F(TabletCopyITest, TestRejectRogueLeader) { LOG(INFO) << "the rogue leader was not able to tablet copy the tombstoned follower"; - // Force the rogue leader to step down. - // Then, send a tablet copy start request from a "fake" leader that - // sends an up-to-date term in the RB request but the actual term stored - // in the copy source's consensus metadata would still be old. - LOG(INFO) << "forcing rogue leader T " << tablet_id << " P " << zombie_leader_uuid - << " to step down..."; - ASSERT_OK(itest::LeaderStepDown(ts_map_[zombie_leader_uuid], tablet_id, timeout)); + // Send a tablet copy start request from a "fake" leader that + // sends an up-to-date term in the tablet copy request but the actual term + // stored in the copy source's consensus metadata would still be old. ExternalTabletServer* zombie_ets = cluster_->tablet_server(zombie_leader_index); // It's not necessarily part of the API but this could return faliure due to // rejecting the remote. We intend to make that part async though, so ignoring