This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new b6d8c1d [tests] fix flakiness in RaftConsensusITest.TestSlowFollower
b6d8c1d is described below
commit b6d8c1db5847c636ccc9bb6d801f37da08d4560b
Author: Alexey Serbin <[email protected]>
AuthorDate: Mon Nov 22 16:00:01 2021 -0800
[tests] fix flakiness in RaftConsensusITest.TestSlowFollower
The results for the DEBUG build running with --stress_cpu_threads=16:
before: 7 out of 256 failed
http://dist-test.cloudera.org/job?job_id=aserbin.1637624733.12826
after : 0 out of 256 failed
http://dist-test.cloudera.org/job?job_id=aserbin.1637624357.6197
I also took the liberty of updating a few error messages to be more
concise (saw those while troubleshooting the flakiness in the test
scenario).
Change-Id: Id4b4245023935deff58ba60840c9a8a01d1da2b1
Reviewed-on: http://gerrit.cloudera.org:8080/18047
Reviewed-by: Alexey Serbin <[email protected]>
Tested-by: Alexey Serbin <[email protected]>
---
src/kudu/consensus/time_manager.cc | 19 +++++++++++--------
src/kudu/integration-tests/raft_consensus-itest.cc | 19 +++++++++++++++----
2 files changed, 26 insertions(+), 12 deletions(-)
diff --git a/src/kudu/consensus/time_manager.cc
b/src/kudu/consensus/time_manager.cc
index 03418c6..e82b6e8 100644
--- a/src/kudu/consensus/time_manager.cc
+++ b/src/kudu/consensus/time_manager.cc
@@ -175,9 +175,10 @@ bool
TimeManager::HasAdvancedSafeTimeRecentlyUnlocked(string* error_message) {
max_last_advanced = std::max<int64_t>(max_last_advanced, 100LL);
MonoDelta max_delta = MonoDelta::FromMilliseconds(max_last_advanced);
if (time_since_last_advance > max_delta) {
- *error_message = Substitute("Tablet hasn't heard from leader, or there
hasn't been a stable "
- "leader for: $0 secs, (max is $1):",
- time_since_last_advance.ToString(),
max_delta.ToString());
+ *error_message = Substitute(
+ "tablet hasn't heard from leader or there hasn't been a stable leader "
+ "for $0 (maximum allowed $1)",
+ time_since_last_advance.ToString(), max_delta.ToString());
return false;
}
return true;
@@ -187,14 +188,16 @@ bool TimeManager::IsSafeTimeLaggingUnlocked(Timestamp
timestamp, string* error_m
DCHECK(lock_.is_locked());
// Can't calculate safe time lag for the logical clock.
- if (PREDICT_FALSE(!clock_->HasPhysicalComponent())) return false;
+ if (PREDICT_FALSE(!clock_->HasPhysicalComponent())) {
+ return false;
+ }
MonoDelta safe_time_diff = clock_->GetPhysicalComponentDifference(timestamp,
last_safe_ts_);
if (safe_time_diff.ToMilliseconds() > FLAGS_safe_time_max_lag_ms) {
- *error_message = Substitute("Tablet is lagging too much to be able to
serve snapshot scan. "
- "Lagging by: $0 ms, (max is $1 ms):",
- safe_time_diff.ToMilliseconds(),
- FLAGS_safe_time_max_lag_ms);
+ *error_message = Substitute(
+ "tablet is lagging too much to be able to serve a snapshot scan: "
+ "current lag $0 ms (maximum allowed $1 ms)",
+ safe_time_diff.ToMilliseconds(), FLAGS_safe_time_max_lag_ms);
return true;
}
return false;
diff --git a/src/kudu/integration-tests/raft_consensus-itest.cc
b/src/kudu/integration-tests/raft_consensus-itest.cc
index e8902b8..1ee3ecb 100644
--- a/src/kudu/integration-tests/raft_consensus-itest.cc
+++ b/src/kudu/integration-tests/raft_consensus-itest.cc
@@ -36,6 +36,7 @@
#include "kudu/client/write_op.h"
#include "kudu/common/common.pb.h"
#include "kudu/common/partial_row.h"
+#include "kudu/common/row_operations.pb.h"
#include "kudu/common/wire_protocol-test-util.h"
#include "kudu/common/wire_protocol.h"
#include "kudu/common/wire_protocol.pb.h"
@@ -2267,7 +2268,17 @@ TEST_F(RaftConsensusITest,
TestCommitIndexFarBehindAfterLeaderElection) {
TEST_F(RaftConsensusITest, TestSlowFollower) {
SKIP_IF_SLOW_NOT_ALLOWED();
- NO_FATALS(BuildAndStart());
+ // Leaving the default
--missed_heartbeats_before_rejecting_snapshot_scans=1.5
+ // makes the scenario prone to flakiness with the default setting of
+ // --raft_heartbeat_interval_ms=500 because the injected WAL latency of
1000ms
+ // is above of 750 ms (500 * 1.5 = 750). We don't want too many scan requests
+ // to be rejected because the follower replica hasn't heard from the leader
+ // when the safe time has already been advanced beyond the timestamp of the
+ // snapshot scan request. The customized setting for the
+ // --missed_heartbeats_before_rejecting_snapshot_scans flag helps making the
+ // scenario more stable.
+ NO_FATALS(BuildAndStart(
+ {"--missed_heartbeats_before_rejecting_snapshot_scans=3"}));
TServerDetails* leader;
ASSERT_OK(GetLeaderReplicaWithRetries(tablet_id_, &leader));
@@ -2275,9 +2286,9 @@ TEST_F(RaftConsensusITest, TestSlowFollower) {
for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
ExternalTabletServer* ts = cluster_->tablet_server(i);
if (ts->instance_id().permanent_uuid() != leader->uuid()) {
- TServerDetails* follower;
- follower = GetReplicaWithUuidOrNull(tablet_id_,
ts->instance_id().permanent_uuid());
- ASSERT_TRUE(follower);
+ TServerDetails* follower =
+ GetReplicaWithUuidOrNull(tablet_id_,
ts->instance_id().permanent_uuid());
+ ASSERT_NE(nullptr, follower);
NO_FATALS(EnableLogLatency(follower->generic_proxy.get()));
num_reconfigured++;
break;