This is an automated email from the ASF dual-hosted git repository.
awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 6e4dd49 KUDU-3046: deflake TabletServerQuiescingITest
6e4dd49 is described below
commit 6e4dd49a4716f1aed4f533a85b794dfd04f3ab96
Author: Andrew Wong <[email protected]>
AuthorDate: Mon Jan 27 18:58:50 2020 -0800
KUDU-3046: deflake TabletServerQuiescingITest
The test was flaky for a number of reasons including:
- Slowness in TSAN mode along with a low Raft timeout meant workloads
would fail to even create tablets.
- Addressed this by increasing the heartbeat interval in TSAN mode.
- Not hitting the exact number of scanners when running the tool because
of a TOCTOU race between checking the number of scanners and running
the tool.
- Addressed this by reducing the number of read threads and thus
reducing the degrees of freedom with which the tool can run (either
0 scanners or 1 scanner).
- TestAbruptStepdownWhileAllQuiescing failed because the test would step
down a leader without the guarantee that it was the latest leader, so
a leader could still exist even after stepping down.
- Addressed this by stepping down on all tablet servers just to be
sure, and retrying if necessary via ASSERT_EVENTUALLY.
There appears to be another source of flakiness that is less specific
to this test, but this patch reduced the failure rate from 4/100 to
9/2000 (all due to a TSAN issue in the TestWorkload that I'm still
getting to the bottom of).
Change-Id: I3f9ef531062c4b66648840e04962070768fbad5d
Reviewed-on: http://gerrit.cloudera.org:8080/15113
Reviewed-by: Adar Dembo <[email protected]>
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <[email protected]>
---
.../tablet_server_quiescing-itest.cc | 54 +++++++++++++++++-----
1 file changed, 43 insertions(+), 11 deletions(-)
diff --git a/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
b/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
index 3b660ea..75352d4 100644
--- a/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
+++ b/src/kudu/integration-tests/tablet_server_quiescing-itest.cc
@@ -75,6 +75,19 @@ using std::unique_ptr;
using std::vector;
using strings::Substitute;
+namespace {
+
+// Some of these tests will set a low Raft timeout to move election traffic
+// along more quickly. When built with TSAN, this can lead to timeouts, so ease
+// up a bit.
+#ifdef THREAD_SANITIZER
+ constexpr int kLowRaftTimeout = 300;
+#else
+ constexpr int kLowRaftTimeout = 100;
+#endif
+
+} // anonymous namespace
+
namespace kudu {
namespace itest {
@@ -126,7 +139,7 @@ TEST_F(TServerQuiescingITest,
TestQuiescingServerDoesntTriggerElections) {
const int kNumReplicas = 3;
const int kNumTablets = 10;
// This test will change leaders frequently, so set a low Raft heartbeat.
- FLAGS_raft_heartbeat_interval_ms = 100;
+ FLAGS_raft_heartbeat_interval_ms = kLowRaftTimeout;
NO_FATALS(StartCluster(kNumReplicas));
// Set up a table with some replicas.
@@ -194,7 +207,7 @@ TEST_F(TServerQuiescingITest,
TestQuiescingLeaderTransfersLeadership) {
// able to elect a leader.
TEST_F(TServerQuiescingITest, TestMajorityQuiescingElectsLeader) {
const int kNumReplicas = 3;
- FLAGS_raft_heartbeat_interval_ms = 100;
+ FLAGS_raft_heartbeat_interval_ms = kLowRaftTimeout;
NO_FATALS(StartCluster(kNumReplicas));
vector<string> tablet_ids;
NO_FATALS(CreateWorkloadTable(/*num_tablets*/1, &tablet_ids));
@@ -253,7 +266,7 @@ TEST_F(TServerQuiescingITest, TestDoesntAllowNewScans) {
TEST_F(TServerQuiescingITest, TestDoesntAllowNewScansLeadersOnly) {
const int kNumReplicas = 3;
// This test will change leaders frequently, so set a low Raft heartbeat.
- FLAGS_raft_heartbeat_interval_ms = 100;
+ FLAGS_raft_heartbeat_interval_ms = kLowRaftTimeout;
// Set a tiny batch size to encourage many batches for a single scan. This
// will emulate long-running scans.
FLAGS_scanner_default_batch_size_bytes = 1;
@@ -294,7 +307,7 @@ TEST_F(TServerQuiescingITest,
TestDoesntAllowNewScansLeadersOnly) {
// the leader, even while quiescing, will remain leader.
TEST_F(TServerQuiescingITest, TestQuiesceLeaderWhileFollowersCatchingUp) {
const int kNumReplicas = 3;
- FLAGS_raft_heartbeat_interval_ms = 100;
+ FLAGS_raft_heartbeat_interval_ms = kLowRaftTimeout;
NO_FATALS(StartCluster(kNumReplicas));
auto rw_workload = CreateFaultIntolerantRWWorkload();
rw_workload->set_num_tablets(1);
@@ -364,6 +377,7 @@ TEST_F(TServerQuiescingITest, TestQuiescingToolBasics) {
auto* ts = cluster_->mini_tablet_server(0);
auto rw_workload = CreateFaultIntolerantRWWorkload();
rw_workload->Setup();
+ rw_workload->set_num_read_threads(1);
ASSERT_FALSE(ts->server()->quiescing());
// First, call the start tool a couple of times.
for (int i = 0; i < 2; i++) {
@@ -670,6 +684,7 @@ TEST_P(TServerQuiescingParamITest,
TestAbruptStepdownWhileAllQuiescing) {
vector<string> tablet_ids;
NO_FATALS(CreateWorkloadTable(/*num_tablets*/1, &tablet_ids));
+ // Ensure we get a leader.
TServerDetails* leader_details;
const auto kLeaderTimeout = MonoDelta::FromSeconds(10);
const auto& tablet_id = tablet_ids[0];
@@ -679,13 +694,30 @@ TEST_P(TServerQuiescingParamITest,
TestAbruptStepdownWhileAllQuiescing) {
for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
*cluster_->mini_tablet_server(i)->server()->mutable_quiescing() = true;
}
- // Once we've stepped down, while quiescing, no new leader should be elected.
- // Wait extra long to be sure.
- ASSERT_OK(LeaderStepDown(leader_details, tablet_id, kLeaderTimeout));
- MonoDelta election_timeout = MonoDelta::FromMilliseconds(
- 2 * FLAGS_raft_heartbeat_interval_ms *
FLAGS_leader_failure_max_missed_heartbeat_periods);
- Status s = FindTabletLeader(ts_map_, tablet_id, election_timeout,
&leader_details);
- ASSERT_TRUE(s.IsTimedOut()) << s.ToString();
+ // Abruptly step down our tablet servers. We could find the leader and just
+ // step it down, but it's hard to guarantee that the found leader is of the
+ // latest term.
+ //
+ // So to be sure, try on all our replicas -- eventually we'll step down on
+ // the latest leader, and we won't be able to elect a new leader since all
+ // servers are quiescing.
+ ASSERT_EVENTUALLY([&] {
+ bool stepped_down = false;
+ for (const auto& ts_and_details : ts_map_) {
+ Status s = LeaderStepDown(ts_and_details.second, tablet_id,
kLeaderTimeout);
+ if (s.ok()) {
+ stepped_down = true;
+ break;
+ }
+ LOG(INFO) << Substitute("Request to step down failed on $0: $1",
+ ts_and_details.first, s.ToString());
+ }
+ ASSERT_TRUE(stepped_down);
+
+ // There should be no leaders.
+ Status s = FindTabletLeader(ts_map_, tablet_id, kLeaderTimeout,
&leader_details);
+ ASSERT_TRUE(s.IsTimedOut()) << s.ToString();
+ });
}
INSTANTIATE_TEST_CASE_P(NumReplicas, TServerQuiescingParamITest,
::testing::Values(1, 3));