Repository: kudu Updated Branches: refs/heads/master 45c1512fc -> b3ab84979
KUDU-1135 (part 2): avoid flushing metadata twice when starting an election Change-Id: I231273a1cfa92275788dd99c78e284ecd0543d7a Reviewed-on: http://gerrit.cloudera.org:8080/4702 Tested-by: Kudu Jenkins Reviewed-by: Todd Lipcon <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/717c70d5 Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/717c70d5 Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/717c70d5 Branch: refs/heads/master Commit: 717c70d59552195d90960a3e45f9bdcd5c08af98 Parents: 45c1512 Author: Todd Lipcon <[email protected]> Authored: Wed Oct 12 14:59:35 2016 -0700 Committer: Todd Lipcon <[email protected]> Committed: Tue Oct 18 05:19:54 2016 +0000 ---------------------------------------------------------------------- src/kudu/consensus/raft_consensus.cc | 15 ++++++--------- src/kudu/consensus/raft_consensus.h | 3 --- src/kudu/consensus/raft_consensus_quorum-test.cc | 6 ++++++ 3 files changed, 12 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus.cc ---------------------------------------------------------------------- diff --git a/src/kudu/consensus/raft_consensus.cc b/src/kudu/consensus/raft_consensus.cc index 4c3de3d..ac12ec7 100644 --- a/src/kudu/consensus/raft_consensus.cc +++ b/src/kudu/consensus/raft_consensus.cc @@ -350,7 +350,7 @@ Status RaftConsensus::EmulateElection() { LOG_WITH_PREFIX_UNLOCKED(INFO) << "Emulating election..."; // Assume leadership of new term. - RETURN_NOT_OK(IncrementTermUnlocked()); + RETURN_NOT_OK(HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 1)); SetLeaderUuidUnlocked(state_->GetPeerUuid()); return BecomeLeaderUnlocked(); } @@ -419,11 +419,12 @@ Status RaftConsensus::StartElection(ElectionMode mode, ElectionReason reason) { // Increment the term and vote for ourselves, unless it's a pre-election. if (mode != PRE_ELECTION) { - // TODO(todd): the IncrementTermUnlocked call flushes to disk once, and then - // the SetVotedForCurrentTerm flushes again. We should avoid flushing to disk - // on the term bump. // TODO(mpercy): Consider using a separate Mutex for voting, which must sync to disk. - RETURN_NOT_OK(IncrementTermUnlocked()); + + // We skip flushing the term to disk because setting the vote just below also + // flushes to disk, and the double fsync doesn't buy us anything. + RETURN_NOT_OK(HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 1, + ReplicaState::SKIP_FLUSH_TO_DISK)); RETURN_NOT_OK(state_->SetVotedForCurrentTermUnlocked(state_->GetPeerUuid())); } @@ -2086,10 +2087,6 @@ MonoDelta RaftConsensus::LeaderElectionExpBackoffDeltaUnlocked() { return MonoDelta::FromMilliseconds(timeout); } -Status RaftConsensus::IncrementTermUnlocked() { - return HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 1); -} - Status RaftConsensus::HandleTermAdvanceUnlocked(ConsensusTerm new_term, ReplicaState::FlushToDisk flush) { if (new_term <= state_->GetCurrentTermUnlocked()) { http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus.h ---------------------------------------------------------------------- diff --git a/src/kudu/consensus/raft_consensus.h b/src/kudu/consensus/raft_consensus.h index 33bf7da..37f7f83 100644 --- a/src/kudu/consensus/raft_consensus.h +++ b/src/kudu/consensus/raft_consensus.h @@ -373,9 +373,6 @@ class RaftConsensus : public Consensus, // The maximum delta is capped by 'FLAGS_leader_failure_exp_backoff_max_delta_ms'. MonoDelta LeaderElectionExpBackoffDeltaUnlocked(); - // Increment the term to the next term, resetting the current leader, etc. - Status IncrementTermUnlocked(); - // Handle when the term has advanced beyond the current term. // // 'flush' may be used to control whether the term change is flushed to disk. http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus_quorum-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/consensus/raft_consensus_quorum-test.cc b/src/kudu/consensus/raft_consensus_quorum-test.cc index 2ca8e1b..d9336ac 100644 --- a/src/kudu/consensus/raft_consensus_quorum-test.cc +++ b/src/kudu/consensus/raft_consensus_quorum-test.cc @@ -896,11 +896,17 @@ TEST_F(RaftConsensusQuorumTest, TestLeaderElectionWithQuiescedQuorum) { // This will force an election in which we expect to make the last // non-shutdown peer in the list become leader. + int flush_count_before = new_leader->GetReplicaStateForTests() + ->consensus_metadata_for_tests()->flush_count_for_tests(); LOG(INFO) << "Running election for future leader with index " << (current_config_size - 1); ASSERT_OK(new_leader->StartElection(Consensus::ELECT_EVEN_IF_LEADER_IS_ALIVE, Consensus::EXTERNAL_REQUEST)); WaitUntilLeaderForTests(new_leader.get()); LOG(INFO) << "Election won"; + int flush_count_after = new_leader->GetReplicaStateForTests() + ->consensus_metadata_for_tests()->flush_count_for_tests(); + ASSERT_EQ(flush_count_after, flush_count_before + 1) + << "Expected only one consensus metadata flush for a leader election"; // ... replicating a set of messages to the new leader should now be possible. REPLICATE_SEQUENCE_OF_MESSAGES(10,
