Repository: kudu
Updated Branches:
  refs/heads/master 45c1512fc -> b3ab84979


KUDU-1135 (part 2): avoid flushing metadata twice when starting an election

Change-Id: I231273a1cfa92275788dd99c78e284ecd0543d7a
Reviewed-on: http://gerrit.cloudera.org:8080/4702
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <t...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/717c70d5
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/717c70d5
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/717c70d5

Branch: refs/heads/master
Commit: 717c70d59552195d90960a3e45f9bdcd5c08af98
Parents: 45c1512
Author: Todd Lipcon <t...@apache.org>
Authored: Wed Oct 12 14:59:35 2016 -0700
Committer: Todd Lipcon <t...@apache.org>
Committed: Tue Oct 18 05:19:54 2016 +0000

----------------------------------------------------------------------
 src/kudu/consensus/raft_consensus.cc             | 15 ++++++---------
 src/kudu/consensus/raft_consensus.h              |  3 ---
 src/kudu/consensus/raft_consensus_quorum-test.cc |  6 ++++++
 3 files changed, 12 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.cc 
b/src/kudu/consensus/raft_consensus.cc
index 4c3de3d..ac12ec7 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -350,7 +350,7 @@ Status RaftConsensus::EmulateElection() {
   LOG_WITH_PREFIX_UNLOCKED(INFO) << "Emulating election...";
 
   // Assume leadership of new term.
-  RETURN_NOT_OK(IncrementTermUnlocked());
+  RETURN_NOT_OK(HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 
1));
   SetLeaderUuidUnlocked(state_->GetPeerUuid());
   return BecomeLeaderUnlocked();
 }
@@ -419,11 +419,12 @@ Status RaftConsensus::StartElection(ElectionMode mode, 
ElectionReason reason) {
 
     // Increment the term and vote for ourselves, unless it's a pre-election.
     if (mode != PRE_ELECTION) {
-      // TODO(todd): the IncrementTermUnlocked call flushes to disk once, and 
then
-      // the SetVotedForCurrentTerm flushes again. We should avoid flushing to 
disk
-      // on the term bump.
       // TODO(mpercy): Consider using a separate Mutex for voting, which must 
sync to disk.
-      RETURN_NOT_OK(IncrementTermUnlocked());
+
+      // We skip flushing the term to disk because setting the vote just below 
also
+      // flushes to disk, and the double fsync doesn't buy us anything.
+      RETURN_NOT_OK(HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() 
+ 1,
+                                              
ReplicaState::SKIP_FLUSH_TO_DISK));
       
RETURN_NOT_OK(state_->SetVotedForCurrentTermUnlocked(state_->GetPeerUuid()));
     }
 
@@ -2086,10 +2087,6 @@ MonoDelta 
RaftConsensus::LeaderElectionExpBackoffDeltaUnlocked() {
   return MonoDelta::FromMilliseconds(timeout);
 }
 
-Status RaftConsensus::IncrementTermUnlocked() {
-  return HandleTermAdvanceUnlocked(state_->GetCurrentTermUnlocked() + 1);
-}
-
 Status RaftConsensus::HandleTermAdvanceUnlocked(ConsensusTerm new_term,
                                                 ReplicaState::FlushToDisk 
flush) {
   if (new_term <= state_->GetCurrentTermUnlocked()) {

http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus.h 
b/src/kudu/consensus/raft_consensus.h
index 33bf7da..37f7f83 100644
--- a/src/kudu/consensus/raft_consensus.h
+++ b/src/kudu/consensus/raft_consensus.h
@@ -373,9 +373,6 @@ class RaftConsensus : public Consensus,
   // The maximum delta is capped by 
'FLAGS_leader_failure_exp_backoff_max_delta_ms'.
   MonoDelta LeaderElectionExpBackoffDeltaUnlocked();
 
-  // Increment the term to the next term, resetting the current leader, etc.
-  Status IncrementTermUnlocked();
-
   // Handle when the term has advanced beyond the current term.
   //
   // 'flush' may be used to control whether the term change is flushed to disk.

http://git-wip-us.apache.org/repos/asf/kudu/blob/717c70d5/src/kudu/consensus/raft_consensus_quorum-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/raft_consensus_quorum-test.cc 
b/src/kudu/consensus/raft_consensus_quorum-test.cc
index 2ca8e1b..d9336ac 100644
--- a/src/kudu/consensus/raft_consensus_quorum-test.cc
+++ b/src/kudu/consensus/raft_consensus_quorum-test.cc
@@ -896,11 +896,17 @@ TEST_F(RaftConsensusQuorumTest, 
TestLeaderElectionWithQuiescedQuorum) {
 
     // This will force an election in which we expect to make the last
     // non-shutdown peer in the list become leader.
+    int flush_count_before = new_leader->GetReplicaStateForTests()
+        ->consensus_metadata_for_tests()->flush_count_for_tests();
     LOG(INFO) << "Running election for future leader with index " << 
(current_config_size - 1);
     
ASSERT_OK(new_leader->StartElection(Consensus::ELECT_EVEN_IF_LEADER_IS_ALIVE,
                                         Consensus::EXTERNAL_REQUEST));
     WaitUntilLeaderForTests(new_leader.get());
     LOG(INFO) << "Election won";
+    int flush_count_after = new_leader->GetReplicaStateForTests()
+        ->consensus_metadata_for_tests()->flush_count_for_tests();
+    ASSERT_EQ(flush_count_after, flush_count_before + 1)
+        << "Expected only one consensus metadata flush for a leader election";
 
     // ... replicating a set of messages to the new leader should now be 
possible.
     REPLICATE_SEQUENCE_OF_MESSAGES(10,

Reply via email to