This is an automated email from the ASF dual-hosted git repository. awong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 546e68cfd39e2a6f1b16bd1ddb580d1ebc97c9a4 Author: Andrew Wong <[email protected]> AuthorDate: Fri Feb 5 18:04:23 2021 -0800 txn_commit-itest: deflake TestCommitTasksReloadOnLeadershipChange The test shows up on the flaky test dashboard as failing around 20% of the time. As it turns out, transferring leadership by quiescing multiple replicas can lead to flakiness if we happen to pick a lagging replica as the new leader. Instead of targeting a specific tablet server as the host of the new leaders, we'll now just quiesce the old leader tablet server and stop quiescing the other tablet servers. I ran the test in DEBUG mode 100 times. Before this patch, it failed 16 times; with it, it passed 100/100 times. Change-Id: I2b27864e72888367eb0af7de59e044a9e018c31b Reviewed-on: http://gerrit.cloudera.org:8080/17031 Tested-by: Kudu Jenkins Reviewed-by: Hao Hao <[email protected]> --- src/kudu/integration-tests/txn_commit-itest.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/kudu/integration-tests/txn_commit-itest.cc b/src/kudu/integration-tests/txn_commit-itest.cc index 084435e..23fb461 100644 --- a/src/kudu/integration-tests/txn_commit-itest.cc +++ b/src/kudu/integration-tests/txn_commit-itest.cc @@ -749,7 +749,6 @@ class ThreeNodeTxnCommitITest : public TxnCommitITest { *cluster_->mini_tablet_server(i)->server()->mutable_quiescing() = i != leader_idx; } leader_ts_ = cluster_->mini_tablet_server(leader_idx); - non_leader_ts_ = cluster_->mini_tablet_server(leader_idx + 1); // We should have two leaders for our table, and one for the // TxnStatusManager. ASSERT_EVENTUALLY([&] { @@ -758,7 +757,6 @@ class ThreeNodeTxnCommitITest : public TxnCommitITest { } protected: MiniTabletServer* leader_ts_; - MiniTabletServer* non_leader_ts_; }; TEST_F(ThreeNodeTxnCommitITest, TestCommitTasksReloadOnLeadershipChange) { @@ -776,13 +774,16 @@ TEST_F(ThreeNodeTxnCommitITest, TestCommitTasksReloadOnLeadershipChange) { ASSERT_FALSE(is_complete); FLAGS_txn_schedule_background_tasks = true; - // Change our quiescing state and bring the previous leader down so a new - // leader can be elected. - auto* new_leader_ts = non_leader_ts_; - *new_leader_ts->server()->mutable_quiescing() = false; + // Change our quiescing states so a new leader can be elected. *leader_ts_->server()->mutable_quiescing() = true; + for (int i = 0; i < cluster_->num_tablet_servers(); i++) { + auto* mts = cluster_->mini_tablet_server(i); + if (leader_ts_ != mts) { + *mts->server()->mutable_quiescing() = false; + } + } ASSERT_EVENTUALLY([&] { - ASSERT_EQ(3, new_leader_ts->server()->num_raft_leaders()->value()); + ASSERT_EQ(0, leader_ts_->server()->num_raft_leaders()->value()); }); // Upon becoming leader, we should have started our commit task and completed // the commit.
