This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new b32283d  KUDU-2155: disable failure detector around elections
b32283d is described below

commit b32283d2e5ce3d88e4f6afdeedf1c616721cce3a
Author: Adar Dembo <[email protected]>
AuthorDate: Sat Sep 23 12:58:43 2017 -0700

    KUDU-2155: disable failure detector around elections
    
    This is a more complete fix for KUDU-2149 which disables the failure
    detector completely around a leader election.
    
    There are several changes to make this happen:
    1. The FD is changed to use a one-shot timer, which automatically disables
       upon firing.
    2. Because all elections are guaranteed to reach DoElectionCallback, that's
       where we reenable the FD.
    3. We provide a special case for pre-elections where FD reenabling is
       deferred until after the subsequent real election finishes.
    
    I'm still not convinced this is the cleanest approach, but it seems to work.
    
    Change-Id: Idcd311cee028c48e908f290d60c474e8a4557d97
    Reviewed-on: http://gerrit.cloudera.org:8080/8134
    Tested-by: Kudu Jenkins
    Reviewed-by: Alexey Serbin <[email protected]>
    Reviewed-by: Andrew Wong <[email protected]>
---
 src/kudu/consensus/raft_consensus.cc | 103 ++++++++++++++++++++++-------------
 src/kudu/consensus/raft_consensus.h  |  35 ++++--------
 2 files changed, 77 insertions(+), 61 deletions(-)

diff --git a/src/kudu/consensus/raft_consensus.cc 
b/src/kudu/consensus/raft_consensus.cc
index d9ca4f4..bf10e63 100644
--- a/src/kudu/consensus/raft_consensus.cc
+++ b/src/kudu/consensus/raft_consensus.cc
@@ -67,6 +67,7 @@
 #include "kudu/util/process_memory.h"
 #include "kudu/util/random.h"
 #include "kudu/util/random_util.h"
+#include "kudu/util/scoped_cleanup.h"
 #include "kudu/util/status.h"
 #include "kudu/util/thread_restrictions.h"
 #include "kudu/util/threadpool.h"
@@ -296,6 +297,8 @@ Status RaftConsensus::Start(const ConsensusBootstrapInfo& 
info,
   // Capture a weak_ptr reference into the functor so it can safely handle
   // outliving the consensus instance.
   weak_ptr<RaftConsensus> w = shared_from_this();
+  PeriodicTimer::Options opts;
+  opts.one_shot = true;
   failure_detector_ = PeriodicTimer::Create(
       peer_proxy_factory_->messenger(),
       [w]() {
@@ -303,10 +306,9 @@ Status RaftConsensus::Start(const ConsensusBootstrapInfo& 
info,
           consensus->ReportFailureDetected();
         }
       },
-      MinimumElectionTimeout());
+      MinimumElectionTimeout(),
+      opts);
 
-  PeriodicTimer::Options opts;
-  opts.one_shot = true;
   transfer_period_timer_ = PeriodicTimer::Create(
       peer_proxy_factory_->messenger(),
       [w]() {
@@ -650,20 +652,29 @@ scoped_refptr<ConsensusRound> RaftConsensus::NewRound(
 }
 
 void RaftConsensus::ReportFailureDetectedTask() {
-  std::unique_lock<simple_spinlock> try_lock(failure_detector_election_lock_,
-                                             std::try_to_lock);
-  if (try_lock.owns_lock()) {
-    WARN_NOT_OK_EVERY_N_SECS(StartElection(FLAGS_raft_enable_pre_election ?
-        PRE_ELECTION : NORMAL_ELECTION, ELECTION_TIMEOUT_EXPIRED),
-                             LogPrefixThreadSafe() + "failed to trigger leader 
election", 10);
+  Status s = StartElection(FLAGS_raft_enable_pre_election ?
+      PRE_ELECTION : NORMAL_ELECTION, ELECTION_TIMEOUT_EXPIRED);
+  if (PREDICT_FALSE(!s.ok())) {
+    WARN_NOT_OK_EVERY_N_SECS(
+        s, LogPrefixThreadSafe() + "failed to trigger leader election", 10);
+    // Normally the failure detector would be enabled at the end of the 
election,
+    // but since the election failed to start, we must reenable explicitly.
+    EnableFailureDetector();
   }
 }
 
 void RaftConsensus::ReportFailureDetected() {
-  // We're running on a timer thread; start an election on a different thread 
pool.
-  WARN_NOT_OK(raft_pool_token_->SubmitFunc(std::bind(
-      &RaftConsensus::ReportFailureDetectedTask, shared_from_this())),
-              LogPrefixThreadSafe() + "failed to submit failure detected 
task");
+  // We're running on a timer thread; start an election on a different thread.
+  //
+  // There's no need to reenable the failure detector; if this fails, it's a
+  // sign that RaftConsensus has stopped and we no longer need failure 
detection.
+  Status s = raft_pool_token_->SubmitFunc(std::bind(
+      &RaftConsensus::ReportFailureDetectedTask, shared_from_this()));
+  if (PREDICT_FALSE(!s.ok())) {
+    static const char* msg = "failed to submit failure detected task";
+    CHECK(s.IsServiceUnavailable()) << LogPrefixThreadSafe() << msg;
+    WARN_NOT_OK(s, LogPrefixThreadSafe() + msg);
+  }
 }
 
 Status RaftConsensus::BecomeLeaderUnlocked() {
@@ -2596,14 +2607,17 @@ void RaftConsensus::DumpStatusHtml(std::ostream& out) 
const {
 }
 
 void RaftConsensus::ElectionCallback(ElectionReason reason, const 
ElectionResult& result) {
-  // The election callback runs on a reactor thread, so we need to defer to our
-  // threadpool. If the threadpool is already shut down for some reason, it's 
OK --
-  // we're OK with the callback never running.
-  
WARN_NOT_OK(raft_pool_token_->SubmitFunc(std::bind(&RaftConsensus::DoElectionCallback,
-                                                     shared_from_this(),
-                                                     reason,
-                                                     result)),
-              LogPrefixThreadSafe() + "Unable to run election callback");
+  // We're running on a reactor thread; service the callback on another thread.
+  //
+  // There's no need to reenable the failure detector; if this fails, it's a
+  // sign that RaftConsensus has stopped and we no longer need failure 
detection.
+  Status s = raft_pool_token_->SubmitFunc(std::bind(
+      &RaftConsensus::DoElectionCallback, shared_from_this(), reason, result));
+  if (!s.ok()) {
+    static const char* msg = "unable to run election callback";
+    CHECK(s.IsServiceUnavailable()) << LogPrefixThreadSafe() << msg;
+    WARN_NOT_OK(s, LogPrefixThreadSafe() + msg);
+  }
 }
 
 void RaftConsensus::DoElectionCallback(ElectionReason reason, const 
ElectionResult& result) {
@@ -2611,9 +2625,9 @@ void RaftConsensus::DoElectionCallback(ElectionReason 
reason, const ElectionResu
   const bool was_pre_election = result.vote_request.is_pre_election();
   const char* election_type = was_pre_election ? "pre-election" : "election";
 
-  // The vote was granted, become leader.
   ThreadRestrictions::AssertWaitAllowed();
-  UniqueLock lock(lock_);
+  UniqueLock l(lock_);
+
   Status s = CheckRunningUnlocked();
   if (PREDICT_FALSE(!s.ok())) {
     LOG_WITH_PREFIX_UNLOCKED(INFO) << "Received " << election_type << " 
callback for term "
@@ -2622,13 +2636,23 @@ void RaftConsensus::DoElectionCallback(ElectionReason 
reason, const ElectionResu
     return;
   }
 
-  // Snooze to avoid the election timer firing again as much as possible.
-  // We need to snooze when we win and when we lose:
-  // - When we win because we're about to disable the timer and become leader.
+  // If this election was not triggered by the failure detector, the fd may
+  // still be enabled and needs to be snoozed, both if we win and lose:
+  // - When we win because we're about to disable it and become leader.
   // - When we lose or otherwise we can fall into a cycle, where everyone keeps
   //   triggering elections but no election ever completes because by the time 
they
   //   finish another one is triggered already.
-  SnoozeFailureDetector(string("election complete"), 
LeaderElectionExpBackoffDeltaUnlocked());
+  //
+  // This is a no-op if the failure detector is disabled..
+  MonoDelta snooze_delta = LeaderElectionExpBackoffDeltaUnlocked();
+  SnoozeFailureDetector(string("election complete"), snooze_delta);
+  auto enable_fd = MakeScopedCleanup([&]() {
+    // The failure detector was disabled if it triggered this election. 
Reenable
+    // it when exiting this function.
+    if (reason == ELECTION_TIMEOUT_EXPIRED) {
+      EnableFailureDetector(snooze_delta);
+    }
+  });
 
   if (result.decision == VOTE_DENIED) {
     failed_elections_since_stable_leader_++;
@@ -2665,27 +2689,31 @@ void RaftConsensus::DoElectionCallback(ElectionReason 
reason, const ElectionResu
   if (election_started_in_term != CurrentTermUnlocked()) {
     LOG_WITH_PREFIX_UNLOCKED(INFO)
         << "Leader " << election_type << " decision vote started in "
-        << "defunct term " << election_started_in_term << ": "
-        << (result.decision == VOTE_GRANTED ? "won" : "lost");
+        << "defunct term " << election_started_in_term << ": won";
     return;
   }
 
   if (!cmeta_->IsVoterInConfig(peer_uuid(), ACTIVE_CONFIG)) {
     LOG_WITH_PREFIX_UNLOCKED(WARNING) << "Leader " << election_type
                                       << " decision while not in active 
config. "
-                                      << "Result: Term " << election_term << 
": "
-                                      << (result.decision == VOTE_GRANTED ? 
"won" : "lost")
-                                      << ". RaftConfig: "
+                                      << "Result: Term " << election_term
+                                      << ": won. RaftConfig: "
                                       << 
SecureShortDebugString(cmeta_->ActiveConfig());
     return;
   }
 
+  // At this point we're either already leader, we're going to become leader,
+  // or we're going to run a real election.
+  //
+  // In all cases, do not reenable the failure detector.
+  enable_fd.cancel();
+
   if (cmeta_->active_role() == RaftPeerPB::LEADER) {
     // If this was a pre-election, it's possible to see the following 
interleaving:
     //
-    //  1. Term N (follower): send a real election for term N
-    //  2. Election callback expires again
-    //  3. Term N (follower): send a pre-election for term N+1
+    //  1. Term N (follower): send a real election for term N.
+    //  2. An externally-triggered election is started.
+    //  3. Term N (follower): send a pre-election for term N+1.
     //  4. Election callback for real election from term N completes.
     //     Peer is now leader for term N.
     //  5. Pre-election callback from term N+1 completes, even though
@@ -2695,8 +2723,7 @@ void RaftConsensus::DoElectionCallback(ElectionReason 
reason, const ElectionResu
     if (was_pre_election) return;
     LOG_WITH_PREFIX_UNLOCKED(DFATAL)
         << "Leader " << election_type << " callback while already leader! "
-        << "Result: Term " << election_term << ": "
-        << (result.decision == VOTE_GRANTED ? "won" : "lost");
+        << "Result: Term " << election_term << ": won";
     return;
   }
 
@@ -2704,7 +2731,7 @@ void RaftConsensus::DoElectionCallback(ElectionReason 
reason, const ElectionResu
 
   if (was_pre_election) {
     // We just won the pre-election. So, we need to call a real election.
-    lock.unlock();
+    l.unlock();
     WARN_NOT_OK_EVERY_N_SECS(StartElection(NORMAL_ELECTION, reason),
                              "Couldn't start leader election after successful 
pre-election", 10);
   } else {
diff --git a/src/kudu/consensus/raft_consensus.h 
b/src/kudu/consensus/raft_consensus.h
index 2b3d704..99c5638 100644
--- a/src/kudu/consensus/raft_consensus.h
+++ b/src/kudu/consensus/raft_consensus.h
@@ -639,19 +639,24 @@ class RaftConsensus : public 
std::enable_shared_from_this<RaftConsensus>,
   void ElectionCallback(ElectionReason reason, const ElectionResult& result);
   void DoElectionCallback(ElectionReason reason, const ElectionResult& result);
 
-  // Start tracking the leader for failures. This typically occurs at startup
-  // and when the local peer steps down as leader.
+  // Starts tracking the leader for failures. This occurs at startup, when a
+  // local peer transitions from LEADER to FOLLOWER or from NON_VOTER to VOTER,
+  // or after a failed election.
+  //
+  // If the failure detector is "snoozed" (see SnoozeFailureDetector()), it
+  // means some leader activity was observed and the failure detection period
+  // should be reset.
   //
   // If 'delta' is set, it is used as the initial period for leader failure
   // detection. Otherwise, the minimum election timeout is used.
   //
-  // If the failure detector is already registered, has no effect.
-  void EnableFailureDetector(boost::optional<MonoDelta> delta);
+  // If the failure detector is already enabled, this has no effect.
+  void EnableFailureDetector(boost::optional<MonoDelta> delta = boost::none);
 
-  // Stop tracking the current leader for failures. This typically occurs when
-  // the local peer becomes leader.
+  // Stops tracking the leader for failures. This occurs when a local peer
+  // transitions from FOLLOWER to LEADER or from VOTER to NON_VOTER.
   //
-  // If the failure detector is already disabled, has no effect.
+  // If the failure detector is already disabled, this has no effect.
   void DisableFailureDetector();
 
   // Enables or disables the failure detector based on the role of the local
@@ -890,22 +895,6 @@ class RaftConsensus : public 
std::enable_shared_from_this<RaftConsensus>,
   boost::optional<std::string> designated_successor_uuid_;
   std::shared_ptr<rpc::PeriodicTimer> transfer_period_timer_;
 
-  // Lock held while starting a failure-triggered election.
-  //
-  // After reporting a failure and asynchronously starting an election, the
-  // failure detector immediately rearms. If the election starts slowly (i.e.
-  // there's a lot of contention on the consensus lock, or persisting votes is
-  // really slow due to other I/O), more elections may start and "stack" on
-  // top of the first. Forcing the starting of elections to serialize on this
-  // lock prevents that from happening. See KUDU-2149 for more details.
-  //
-  // Note: the lock is only ever acquired via try_lock(); if it cannot be
-  // acquired, a StartElection() is in progress so the next one is skipped.
-  //
-  // TODO(KUDU-2155): should be replaced with explicit disabling/enabling of
-  // the failure detector during elections.
-  simple_spinlock failure_detector_election_lock_;
-
   // Any RequestVote() arriving before this timestamp is ignored (i.e. 
responded
   // to with NO vote). This prevents abandoned or partitioned nodes from
   // disturbing the healthy leader.

Reply via email to