This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new bd18cd1 [kudu-admin-test] fix
TestUnsafeChangeConfigWithFiveReplicaConfig
bd18cd1 is described below
commit bd18cd1cf196e0fa4b9051e51deed789090b1721
Author: Alexey Serbin <[email protected]>
AuthorDate: Tue Oct 13 23:35:31 2020 -0700
[kudu-admin-test] fix TestUnsafeChangeConfigWithFiveReplicaConfig
I saw the AdminCliTest.TestUnsafeChangeConfigWithFiveReplicaConfig
scenario failing pre-commit TSAN build [1] with the error like below:
Timed out: Committed consensus opid_index does not equal 1
after waiting for 30.014s. Last opid: 2.2
As I understand, the reason for failing the assertion is an extra
election round that has happened after startup of a tablet. This patch
addresses the issue in the TestUnsafeChangeConfigWithFiveReplicaConfig
and other scenarios which might be affected if same ever happens again.
[1] http://jenkins.kudu.apache.org/job/kudu-gerrit/22585/BUILD_TYPE=TSAN
Change-Id: I1e77c440b872a42c68b6d7d1cb0563fee0470fc2
Reviewed-on: http://gerrit.cloudera.org:8080/16601
Tested-by: Alexey Serbin <[email protected]>
Reviewed-by: Andrew Wong <[email protected]>
---
src/kudu/tools/kudu-admin-test.cc | 33 +++++++++++++++++----------------
1 file changed, 17 insertions(+), 16 deletions(-)
diff --git a/src/kudu/tools/kudu-admin-test.cc
b/src/kudu/tools/kudu-admin-test.cc
index 1294377..f549e74 100644
--- a/src/kudu/tools/kudu-admin-test.cc
+++ b/src/kudu/tools/kudu-admin-test.cc
@@ -517,15 +517,14 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigOnSingleFollower) {
LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
- // Wait for initial NO_OP to be committed by the leader.
TServerDetails* leader_ts;
- vector<TServerDetails*> followers;
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id, kTimeout,
&leader_ts));
+ vector<TServerDetails*> followers;
ASSERT_EVENTUALLY([&] {
ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id, kTimeout,
&followers));
});
- OpId opid;
- ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id, COMMITTED_OPID,
kTimeout, &opid));
+ // Wait for initial NO_OP to be committed by the leader.
+ ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id, COMMITTED_OPID,
kTimeout));
// Shut down master so it doesn't interfere while we shut down the leader and
// one of the other followers.
@@ -552,6 +551,7 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigOnSingleFollower) {
3, tablet_id, kTimeout,
WAIT_FOR_LEADER, VOTER_REPLICA,
&has_leader, &tablet_locations));
+ OpId opid;
ASSERT_OK(WaitForOpFromCurrentTerm(followers[0], tablet_id, COMMITTED_OPID,
kTimeout, &opid));
active_tablet_servers.clear();
@@ -608,14 +608,14 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigOnSingleLeader) {
LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
- // Wait for initial NO_OP to be committed by the leader.
TServerDetails* leader_ts;
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
- ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
ASSERT_EVENTUALLY([&] {
ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
});
+ // Wait for initial NO_OP to be committed by the leader.
+ ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID,
kTimeout));
// Shut down servers follower1 and follower2,
// so that we can force new config on remaining leader.
@@ -693,14 +693,14 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigForConfigWithTwoNodes) {
LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
- // Wait for initial NO_OP to be committed by the leader.
TServerDetails* leader_ts;
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
- ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
ASSERT_EVENTUALLY([&] {
ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
});
+ // Wait for initial NO_OP to be committed by the leader.
+ ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID,
kTimeout));
// Shut down leader and prepare 2-node config.
cluster_->tablet_server_by_uuid(leader_ts->uuid())->Shutdown();
@@ -777,15 +777,16 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigWithFiveReplicaConfig) {
LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
- // Wait for initial NO_OP to be committed by the leader.
TServerDetails* leader_ts;
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
- ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
ASSERT_EVENTUALLY([&] {
ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
});
ASSERT_EQ(followers.size(), 4);
+ // Wait for initial NO_OP to be committed by the leader.
+ ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID,
kTimeout));
+
cluster_->tablet_server_by_uuid(followers[2]->uuid())->Shutdown();
cluster_->tablet_server_by_uuid(followers[3]->uuid())->Shutdown();
cluster_->tablet_server_by_uuid(leader_ts->uuid())->Shutdown();
@@ -856,15 +857,15 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigLeaderWithPendingConfig) {
LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
- // Wait for initial NO_OP to be committed by the leader.
TServerDetails* leader_ts;
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
- ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
ASSERT_EVENTUALLY([&] {
ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
});
ASSERT_EQ(followers.size(), 2);
+ // Wait for initial NO_OP to be committed by the leader.
+ ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID,
kTimeout));
// Shut down servers follower1 and follower2,
// so that leader can't replicate future config change ops.
@@ -944,14 +945,14 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigFollowerWithPendingConfig) {
LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
- // Wait for initial NO_OP to be committed by the leader.
TServerDetails* leader_ts;
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
- ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
ASSERT_EVENTUALLY([&] {
ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
});
+ // Wait for initial NO_OP to be committed by the leader.
+ ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID,
kTimeout));
// Shut down servers follower1 and follower2,
// so that leader can't replicate future config change ops.
@@ -1043,14 +1044,14 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigWithPendingConfigsOnWAL) {
LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
- // Wait for initial NO_OP to be committed by the leader.
TServerDetails* leader_ts;
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
- ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
ASSERT_EVENTUALLY([&] {
ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
});
+ // Wait for initial NO_OP to be committed by the leader.
+ ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID,
kTimeout));
// Shut down servers follower1 and follower2,
// so that leader can't replicate future config change ops.