[kudu] branch master updated: [tests] fix flakiness in RaftConsensusITest.TestSlowFollower

alexey Mon, 22 Nov 2021 19:47:09 -0800

This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git



The following commit(s) were added to refs/heads/master by this push:
     new b6d8c1d  [tests] fix flakiness in RaftConsensusITest.TestSlowFollower
b6d8c1d is described below

commit b6d8c1db5847c636ccc9bb6d801f37da08d4560b
Author: Alexey Serbin <[email protected]>
AuthorDate: Mon Nov 22 16:00:01 2021 -0800

    [tests] fix flakiness in RaftConsensusITest.TestSlowFollower
    
    The results for the DEBUG build running with --stress_cpu_threads=16:
    
      before: 7 out of 256 failed
        http://dist-test.cloudera.org/job?job_id=aserbin.1637624733.12826
    
      after : 0 out of 256 failed
        http://dist-test.cloudera.org/job?job_id=aserbin.1637624357.6197
    
    I also took the liberty of updating a few error messages to be more
    concise (saw those while troubleshooting the flakiness in the test
    scenario).
    
    Change-Id: Id4b4245023935deff58ba60840c9a8a01d1da2b1
    Reviewed-on: http://gerrit.cloudera.org:8080/18047
    Reviewed-by: Alexey Serbin <[email protected]>
    Tested-by: Alexey Serbin <[email protected]>
---
 src/kudu/consensus/time_manager.cc                 | 19 +++++++++++--------
 src/kudu/integration-tests/raft_consensus-itest.cc | 19 +++++++++++++++----
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/src/kudu/consensus/time_manager.cc 
b/src/kudu/consensus/time_manager.cc
index 03418c6..e82b6e8 100644
--- a/src/kudu/consensus/time_manager.cc
+++ b/src/kudu/consensus/time_manager.cc
@@ -175,9 +175,10 @@ bool 
TimeManager::HasAdvancedSafeTimeRecentlyUnlocked(string* error_message) {
   max_last_advanced = std::max<int64_t>(max_last_advanced, 100LL);
   MonoDelta max_delta = MonoDelta::FromMilliseconds(max_last_advanced);
   if (time_since_last_advance > max_delta) {
-    *error_message = Substitute("Tablet hasn't heard from leader, or there 
hasn't been a stable "
-                                "leader for: $0 secs, (max is $1):",
-                                time_since_last_advance.ToString(), 
max_delta.ToString());
+    *error_message = Substitute(
+        "tablet hasn't heard from leader or there hasn't been a stable leader "
+        "for $0 (maximum allowed $1)",
+        time_since_last_advance.ToString(), max_delta.ToString());
     return false;
   }
   return true;
@@ -187,14 +188,16 @@ bool TimeManager::IsSafeTimeLaggingUnlocked(Timestamp 
timestamp, string* error_m
   DCHECK(lock_.is_locked());
 
   // Can't calculate safe time lag for the logical clock.
-  if (PREDICT_FALSE(!clock_->HasPhysicalComponent())) return false;
+  if (PREDICT_FALSE(!clock_->HasPhysicalComponent())) {
+    return false;
+  }
   MonoDelta safe_time_diff = clock_->GetPhysicalComponentDifference(timestamp,
                                                                     
last_safe_ts_);
   if (safe_time_diff.ToMilliseconds() > FLAGS_safe_time_max_lag_ms) {
-    *error_message = Substitute("Tablet is lagging too much to be able to 
serve snapshot scan. "
-                                "Lagging by: $0 ms, (max is $1 ms):",
-                                safe_time_diff.ToMilliseconds(),
-                                FLAGS_safe_time_max_lag_ms);
+    *error_message = Substitute(
+        "tablet is lagging too much to be able to serve a snapshot scan: "
+        "current lag $0 ms (maximum allowed $1 ms)",
+        safe_time_diff.ToMilliseconds(), FLAGS_safe_time_max_lag_ms);
     return true;
   }
   return false;
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc 
b/src/kudu/integration-tests/raft_consensus-itest.cc
index e8902b8..1ee3ecb 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -36,6 +36,7 @@
 #include "kudu/client/write_op.h"
 #include "kudu/common/common.pb.h"
 #include "kudu/common/partial_row.h"
+#include "kudu/common/row_operations.pb.h"
 #include "kudu/common/wire_protocol-test-util.h"
 #include "kudu/common/wire_protocol.h"
 #include "kudu/common/wire_protocol.pb.h"
@@ -2267,7 +2268,17 @@ TEST_F(RaftConsensusITest, 
TestCommitIndexFarBehindAfterLeaderElection) {
 TEST_F(RaftConsensusITest, TestSlowFollower) {
   SKIP_IF_SLOW_NOT_ALLOWED();
 
-  NO_FATALS(BuildAndStart());
+  // Leaving the default 
--missed_heartbeats_before_rejecting_snapshot_scans=1.5
+  // makes the scenario prone to flakiness with the default setting of
+  // --raft_heartbeat_interval_ms=500 because the injected WAL latency of 
1000ms
+  // is above of 750 ms (500 * 1.5 = 750). We don't want too many scan requests
+  // to be rejected because the follower replica hasn't heard from the leader
+  // when the safe time has already been advanced beyond the timestamp of the
+  // snapshot scan request. The customized setting for the
+  // --missed_heartbeats_before_rejecting_snapshot_scans flag helps making the
+  // scenario more stable.
+  NO_FATALS(BuildAndStart(
+      {"--missed_heartbeats_before_rejecting_snapshot_scans=3"}));
 
   TServerDetails* leader;
   ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader));
@@ -2275,9 +2286,9 @@ TEST_F(RaftConsensusITest, TestSlowFollower) {
   for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
     ExternalTabletServer* ts = cluster_->tablet_server(i);
     if (ts->instance_id().permanent_uuid() != leader->uuid()) {
-      TServerDetails* follower;
-      follower = GetReplicaWithUuidOrNull(tablet_id_, 
ts->instance_id().permanent_uuid());
-      ASSERT_TRUE(follower);
+      TServerDetails* follower =
+          GetReplicaWithUuidOrNull(tablet_id_, 
ts->instance_id().permanent_uuid());
+      ASSERT_NE(nullptr, follower);
       NO_FATALS(EnableLogLatency(follower->generic_proxy.get()));
       num_reconfigured++;
       break;

[kudu] branch master updated: [tests] fix flakiness in RaftConsensusITest.TestSlowFollower

Reply via email to