Repository: kudu
Updated Branches:
  refs/heads/master c73f023da -> fcb0be638


[consensus] KUDU-2367 fix replica health report

This patch addresses KUDU-2367,where a leader replica under particular
conditions reports follower's health status as FAILED instead of
FAILED_UNRECOVERABLE. I.e., the bug in the code of the
PeerMessageQueue::PeerHealthStatus() method resulted in having the
leader reporting the health status of the replica still as FAILED even
after the replica fell behind the WAL segment GC threshold.

Also, a couple of already existing integration tests are updated to
provide appropriate coverage for this patch.

Change-Id: If77474adc8f618f2cda35f992190133138fdb511
Reviewed-on: http://gerrit.cloudera.org:8080/9755
Tested-by: Alexey Serbin <aser...@cloudera.com>
Reviewed-by: Mike Percy <mpe...@apache.org>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/fcb0be63
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/fcb0be63
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/fcb0be63

Branch: refs/heads/master
Commit: fcb0be6381a47155e171eb50a333af502bbf506f
Parents: c73f023
Author: Alexey Serbin <aser...@cloudera.com>
Authored: Thu Mar 22 01:02:32 2018 -0700
Committer: Mike Percy <mpe...@apache.org>
Committed: Fri Mar 23 05:30:52 2018 +0000

----------------------------------------------------------------------
 src/kudu/consensus/consensus_queue-test.cc      | 67 +++++++++++++++++++-
 src/kudu/consensus/consensus_queue.cc           | 42 +++++++++---
 src/kudu/consensus/consensus_queue.h            |  1 +
 .../raft_consensus-itest-base.cc                |  7 +-
 .../raft_consensus-itest-base.h                 |  5 +-
 .../raft_consensus_nonvoter-itest.cc            | 40 ++++++++----
 6 files changed, 137 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/consensus/consensus_queue-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue-test.cc 
b/src/kudu/consensus/consensus_queue-test.cc
index c8dc086..c3cfd24 100644
--- a/src/kudu/consensus/consensus_queue-test.cc
+++ b/src/kudu/consensus/consensus_queue-test.cc
@@ -59,9 +59,9 @@
 #include "kudu/util/threadpool.h"
 
 DECLARE_int32(consensus_max_batch_size_bytes);
+DECLARE_int32(follower_unavailable_considered_failed_sec);
 
-METRIC_DECLARE_entity(tablet);
-
+using kudu::consensus::HealthReportPB;
 using std::vector;
 
 namespace kudu {
@@ -189,7 +189,7 @@ class ConsensusQueueTest : public KuduTest {
 
   void WaitForLocalPeerToAckIndex(int index) {
     while (true) {
-      PeerMessageQueue::TrackedPeer leader = 
queue_->GetTrackedPeerForTests(kLeaderUuid);
+      const auto leader = queue_->GetTrackedPeerForTests(kLeaderUuid);
       if (leader.last_received.index() >= index) {
         break;
       }
@@ -945,5 +945,66 @@ TEST_F(ConsensusQueueTest, 
TestFollowerCommittedIndexAndMetrics) {
   ASSERT_EQ(5, queue_->metrics_.num_ops_behind_leader->value());
 }
 
+// Unit test for the PeerMessageQueue::PeerHealthStatus() method.
+TEST(ConsensusQueueUnitTest, PeerHealthStatus) {
+  static constexpr PeerStatus kPeerStatusesForUnknown[] = {
+     PeerStatus::NEW,
+     PeerStatus::REMOTE_ERROR,
+     PeerStatus::RPC_LAYER_ERROR,
+     PeerStatus::TABLET_NOT_FOUND,
+     PeerStatus::INVALID_TERM,
+     PeerStatus::CANNOT_PREPARE,
+     PeerStatus::LMP_MISMATCH,
+  };
+
+  RaftPeerPB peer_pb;
+  PeerMessageQueue::TrackedPeer peer(peer_pb);
+  EXPECT_EQ(HealthReportPB::UNKNOWN, PeerMessageQueue::PeerHealthStatus(peer));
+  for (auto status : kPeerStatusesForUnknown) {
+    peer.last_exchange_status = status;
+    EXPECT_EQ(HealthReportPB::UNKNOWN, 
PeerMessageQueue::PeerHealthStatus(peer));
+  }
+
+  peer.last_exchange_status = PeerStatus::TABLET_FAILED;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, 
PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.last_exchange_status = PeerStatus::OK;
+  EXPECT_EQ(HealthReportPB::HEALTHY, PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.wal_catchup_possible = false;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, 
PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.wal_catchup_possible = true;
+  EXPECT_EQ(HealthReportPB::HEALTHY, PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.last_communication_time -=
+      MonoDelta::FromSeconds(FLAGS_follower_unavailable_considered_failed_sec 
+ 1);
+  EXPECT_EQ(HealthReportPB::FAILED, PeerMessageQueue::PeerHealthStatus(peer));
+  for (auto status : kPeerStatusesForUnknown) {
+    peer.last_exchange_status = status;
+    EXPECT_EQ(HealthReportPB::FAILED, 
PeerMessageQueue::PeerHealthStatus(peer));
+  }
+
+  peer.last_exchange_status = PeerStatus::TABLET_FAILED;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, 
PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.last_exchange_status = PeerStatus::OK;
+  EXPECT_EQ(HealthReportPB::FAILED, PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.wal_catchup_possible = false;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, 
PeerMessageQueue::PeerHealthStatus(peer));
+
+  for (auto status : kPeerStatusesForUnknown) {
+    peer.last_exchange_status = status;
+    EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, 
PeerMessageQueue::PeerHealthStatus(peer));
+  }
+
+  peer.last_exchange_status = PeerStatus::OK;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, 
PeerMessageQueue::PeerHealthStatus(peer));
+
+  peer.last_exchange_status = PeerStatus::TABLET_FAILED;
+  EXPECT_EQ(HealthReportPB::FAILED_UNRECOVERABLE, 
PeerMessageQueue::PeerHealthStatus(peer));
+}
+
 }  // namespace consensus
 }  // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/consensus/consensus_queue.cc
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.cc 
b/src/kudu/consensus/consensus_queue.cc
index 804739c..63ffd74 100644
--- a/src/kudu/consensus/consensus_queue.cc
+++ b/src/kudu/consensus/consensus_queue.cc
@@ -552,24 +552,48 @@ void 
PeerMessageQueue::UpdatePeerHealthUnlocked(TrackedPeer* peer) {
   }
 }
 
+// While reporting on the replica health status, it's important to report on
+// the 'definitive' health statuses once they surface. That allows the system
+// to expedite decisions on replica replacement because the more 'definitive'
+// statuses have less uncertainty and provide more information (compared
+// with less 'definitive' statuses). Informally, the level of 'definitiveness'
+// could be measured by the number of possible state transitions on the replica
+// health status state diagram.
+//
+// The health status chain below has increasing level of 'definitiveness'
+// left to right:
+//
+//   UNKNOWN --> HEALTHY --> FAILED --> FAILED_UNRECOVERABLE
+//
+// For example, in the case when a replica has been unreachable longer than the
+// time interval specified by the --follower_unavailable_considered_failed_sec
+// flag, the system should start reporting its health status as FAILED.
+// However, once the replica falls behind the WAL log GC threshold, the system
+// should start reporting its healths status as FAILED_UNRECOVERABLE. The code
+// below is written to adhere to that informal policy.
 HealthReportPB::HealthStatus PeerMessageQueue::PeerHealthStatus(const 
TrackedPeer& peer) {
-  // Replicas which have been unreachable for too long are considered failed.
-  auto max_unreachable = 
MonoDelta::FromSeconds(FLAGS_follower_unavailable_considered_failed_sec);
-  if (MonoTime::Now() - peer.last_communication_time > max_unreachable) {
-    return HealthReportPB::FAILED;
-  }
-
-  // Replicas that have fallen behind the leader's retained WAL are failed
-  // and have no chance to come back.
+  // Replicas that have fallen behind the leader's retained WAL segments are
+  // failed irrecoverably and will not come back because they cannot ever catch
+  // up with the leader replica.
   if (!peer.wal_catchup_possible) {
     return HealthReportPB::FAILED_UNRECOVERABLE;
   }
 
-  // Replicas returning TABLET_FAILED status are considered failed.
+  // Replicas returning TABLET_FAILED status are considered irrecoverably
+  // failed because the TABLED_FAILED status manifests about IO failures
+  // caused by disk corruption, etc.
   if (peer.last_exchange_status == PeerStatus::TABLET_FAILED) {
     return HealthReportPB::FAILED_UNRECOVERABLE;
   }
 
+  // Replicas which have been unreachable for too long are considered failed,
+  // unless it's known that they have failed irrecoverably (see above). They
+  // might come back at some point and successfully catch up with the leader.
+  auto max_unreachable = 
MonoDelta::FromSeconds(FLAGS_follower_unavailable_considered_failed_sec);
+  if (MonoTime::Now() - peer.last_communication_time > max_unreachable) {
+    return HealthReportPB::FAILED;
+  }
+
   // The happy case: replicas returned OK during the recent exchange are 
considered healthy.
   if (peer.last_exchange_status == PeerStatus::OK) {
     return HealthReportPB::HEALTHY;

http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/consensus/consensus_queue.h
----------------------------------------------------------------------
diff --git a/src/kudu/consensus/consensus_queue.h 
b/src/kudu/consensus/consensus_queue.h
index cbbc522..a2b79e2 100644
--- a/src/kudu/consensus/consensus_queue.h
+++ b/src/kudu/consensus/consensus_queue.h
@@ -360,6 +360,7 @@ class PeerMessageQueue {
   FRIEND_TEST(ConsensusQueueTest, TestQueueAdvancesCommittedIndex);
   FRIEND_TEST(ConsensusQueueTest, TestQueueMovesWatermarksBackward);
   FRIEND_TEST(ConsensusQueueTest, TestFollowerCommittedIndexAndMetrics);
+  FRIEND_TEST(ConsensusQueueUnitTest, PeerHealthStatus);
   FRIEND_TEST(RaftConsensusQuorumTest, 
TestReplicasEnforceTheLogMatchingProperty);
 
   // Mode specifies how the queue currently behaves:

http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/integration-tests/raft_consensus-itest-base.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest-base.cc 
b/src/kudu/integration-tests/raft_consensus-itest-base.cc
index aab6981..1fee8a8 100644
--- a/src/kudu/integration-tests/raft_consensus-itest-base.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest-base.cc
@@ -197,7 +197,8 @@ void RaftConsensusITestBase::CauseFollowerToFallBehindLogGC(
     string* leader_uuid,
     int64_t* orig_term,
     string* fell_behind_uuid,
-    BehindWalGcBehavior tserver_behavior) {
+    BehindWalGcBehavior tserver_behavior,
+    const MonoDelta& pre_workload_delay) {
   MonoDelta kTimeout = MonoDelta::FromSeconds(10);
   // Wait for all of the replicas to have acknowledged the elected
   // leader and logged the first NO_OP.
@@ -234,6 +235,10 @@ void 
RaftConsensusITestBase::CauseFollowerToFallBehindLogGC(
   *leader_uuid = leader->uuid();
   int leader_index = cluster_->tablet_server_index_by_uuid(*leader_uuid);
 
+  if (pre_workload_delay.Initialized()) {
+    SleepFor(pre_workload_delay);
+  }
+
   TestWorkload workload(cluster_.get());
   workload.set_table_name(kTableId);
   workload.set_timeout_allowed(true);

http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/integration-tests/raft_consensus-itest-base.h
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus-itest-base.h 
b/src/kudu/integration-tests/raft_consensus-itest-base.h
index a4266ad..c00426d 100644
--- a/src/kudu/integration-tests/raft_consensus-itest-base.h
+++ b/src/kudu/integration-tests/raft_consensus-itest-base.h
@@ -28,6 +28,8 @@
 
 namespace kudu {
 
+class MonoDelta;
+
 namespace cluster {
 class ExternalTabletServer;
 }
@@ -84,7 +86,8 @@ class RaftConsensusITestBase : public 
TabletServerIntegrationTestBase {
       std::string* leader_uuid,
       int64_t* orig_term,
       std::string* fell_behind_uuid,
-      BehindWalGcBehavior tserver_behavior = 
BehindWalGcBehavior::STOP_CONTINUE);
+      BehindWalGcBehavior tserver_behavior = 
BehindWalGcBehavior::STOP_CONTINUE,
+      const MonoDelta& pre_workload_delay = {});
 
   CountDownLatch inserters_;
 };

http://git-wip-us.apache.org/repos/asf/kudu/blob/fcb0be63/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc 
b/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
index 84a6d94..da19cc4 100644
--- a/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus_nonvoter-itest.cc
@@ -1927,20 +1927,28 @@ TEST_P(IncompatibleReplicaReplacementSchemesITest, 
MasterAndTserverMisconfig) {
   }
 }
 
+// This test scenario runs the system with the 3-4-3 replica management scheme
+// having the total number of tablet servers equal to the replication factor
+// of the tablet being tested. The scenario makes one follower replica
+// fall behind the WAL segment GC threshold. The system should be able to
+// replace the failed replica 'in-place', i.e. no additional tablet server is
+// needed for the cluster to recover in such situations. The scenario verifies
+// that the re-replication works as expected when the tablet server hosting the
+// failed replica is:
+//   ** paused and then resumed, emulating a lagging tablet server
+//   ** shut down and then started back up
 class ReplicaBehindWalGcThresholdITest :
     public RaftConsensusNonVoterITest,
-    public 
::testing::WithParamInterface<RaftConsensusITestBase::BehindWalGcBehavior> {
+    public ::testing::WithParamInterface<
+        std::tuple<RaftConsensusITestBase::BehindWalGcBehavior, bool>> {
 };
 INSTANTIATE_TEST_CASE_P(,
     ReplicaBehindWalGcThresholdITest,
-    
::testing::Values(RaftConsensusITestBase::BehindWalGcBehavior::STOP_CONTINUE,
-                      
RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN_RESTART,
-                      RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN));
-
-// Test that the catalog manager running with the 3-4-3 scheme is able to do
-// 'in-place' replica replacement when replica falls behind the WAL segment GC,
-// i.e. no additional tablet server is needed in case of tablet with 
replication
-// factor 3 when there are just 3 tablet servers in the cluster.
+    ::testing::Combine(
+        
::testing::Values(RaftConsensusITestBase::BehindWalGcBehavior::STOP_CONTINUE,
+                          
RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN_RESTART,
+                          
RaftConsensusITestBase::BehindWalGcBehavior::SHUTDOWN),
+        ::testing::Bool()));
 TEST_P(ReplicaBehindWalGcThresholdITest, ReplicaReplacement) {
   if (!AllowSlowTests()) {
     LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
@@ -1953,7 +1961,8 @@ TEST_P(ReplicaBehindWalGcThresholdITest, 
ReplicaReplacement) {
   const auto kUnavaiableFailedSec = 5;
   FLAGS_num_replicas = kReplicasNum;
   FLAGS_num_tablet_servers = kReplicasNum;
-  const auto tserver_behavior = GetParam();
+  const auto tserver_behavior = std::get<0>(GetParam());
+  const bool do_delay_workload = std::get<1>(GetParam());
 
   vector<string> master_flags = {
     // This scenario runs with the 3-4-3 replica management scheme.
@@ -1981,8 +1990,17 @@ TEST_P(ReplicaBehindWalGcThresholdITest, 
ReplicaReplacement) {
   string follower_uuid;
   string leader_uuid;
   int64_t orig_term;
+  MonoDelta delay;
+  if (do_delay_workload) {
+    // That's to make the leader replica to report the state of the tablet as
+    // FAILED first. Later on, when the replica falls behind the WAL segment GC
+    // threshold, the leader replica should report the follower's health status
+    // as FAILED_UNRECOVERABLE.
+    delay = MonoDelta::FromSeconds(3 * kUnavaiableFailedSec);
+  }
   NO_FATALS(CauseFollowerToFallBehindLogGC(
-      tablet_servers_, &leader_uuid, &orig_term, &follower_uuid, 
tserver_behavior));
+      tablet_servers_, &leader_uuid, &orig_term, &follower_uuid,
+      tserver_behavior, delay));
 
   // The catalog manager should evict the replicas which fell behing the WAL
   // segment GC threshold right away.

Reply via email to