This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new bd18cd1  [kudu-admin-test] fix 
TestUnsafeChangeConfigWithFiveReplicaConfig
bd18cd1 is described below

commit bd18cd1cf196e0fa4b9051e51deed789090b1721
Author: Alexey Serbin <[email protected]>
AuthorDate: Tue Oct 13 23:35:31 2020 -0700

    [kudu-admin-test] fix TestUnsafeChangeConfigWithFiveReplicaConfig
    
    I saw the AdminCliTest.TestUnsafeChangeConfigWithFiveReplicaConfig
    scenario failing pre-commit TSAN build [1] with the error like below:
    
      Timed out: Committed consensus opid_index does not equal 1
          after waiting for 30.014s. Last opid: 2.2
    
    As I understand, the reason for failing the assertion is an extra
    election round that has happened after startup of a tablet.  This patch
    addresses the issue in the TestUnsafeChangeConfigWithFiveReplicaConfig
    and other scenarios which might be affected if same ever happens again.
    
    [1] http://jenkins.kudu.apache.org/job/kudu-gerrit/22585/BUILD_TYPE=TSAN
    
    Change-Id: I1e77c440b872a42c68b6d7d1cb0563fee0470fc2
    Reviewed-on: http://gerrit.cloudera.org:8080/16601
    Tested-by: Alexey Serbin <[email protected]>
    Reviewed-by: Andrew Wong <[email protected]>
---
 src/kudu/tools/kudu-admin-test.cc | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/kudu/tools/kudu-admin-test.cc 
b/src/kudu/tools/kudu-admin-test.cc
index 1294377..f549e74 100644
--- a/src/kudu/tools/kudu-admin-test.cc
+++ b/src/kudu/tools/kudu-admin-test.cc
@@ -517,15 +517,14 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigOnSingleFollower) {
   LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
   ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
 
-  // Wait for initial NO_OP to be committed by the leader.
   TServerDetails* leader_ts;
-  vector<TServerDetails*> followers;
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id, kTimeout, 
&leader_ts));
+  vector<TServerDetails*> followers;
   ASSERT_EVENTUALLY([&] {
     ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id, kTimeout, 
&followers));
   });
-  OpId opid;
-  ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id, COMMITTED_OPID, 
kTimeout, &opid));
+  // Wait for initial NO_OP to be committed by the leader.
+  ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id, COMMITTED_OPID, 
kTimeout));
 
   // Shut down master so it doesn't interfere while we shut down the leader and
   // one of the other followers.
@@ -552,6 +551,7 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigOnSingleFollower) {
                                             3, tablet_id, kTimeout,
                                             WAIT_FOR_LEADER, VOTER_REPLICA,
                                             &has_leader, &tablet_locations));
+  OpId opid;
   ASSERT_OK(WaitForOpFromCurrentTerm(followers[0], tablet_id, COMMITTED_OPID, 
kTimeout, &opid));
 
   active_tablet_servers.clear();
@@ -608,14 +608,14 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigOnSingleLeader) {
   LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
   ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
 
-  // Wait for initial NO_OP to be committed by the leader.
   TServerDetails* leader_ts;
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
-  ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
   ASSERT_EVENTUALLY([&] {
     ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
   });
+  // Wait for initial NO_OP to be committed by the leader.
+  ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID, 
kTimeout));
 
   // Shut down servers follower1 and follower2,
   // so that we can force new config on remaining leader.
@@ -693,14 +693,14 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigForConfigWithTwoNodes) {
   LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
   ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
 
-  // Wait for initial NO_OP to be committed by the leader.
   TServerDetails* leader_ts;
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
-  ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
   ASSERT_EVENTUALLY([&] {
     ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
   });
+  // Wait for initial NO_OP to be committed by the leader.
+  ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID, 
kTimeout));
 
   // Shut down leader and prepare 2-node config.
   cluster_->tablet_server_by_uuid(leader_ts->uuid())->Shutdown();
@@ -777,15 +777,16 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigWithFiveReplicaConfig) {
   LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
   ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
 
-  // Wait for initial NO_OP to be committed by the leader.
   TServerDetails* leader_ts;
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
-  ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
   ASSERT_EVENTUALLY([&] {
     ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
   });
   ASSERT_EQ(followers.size(), 4);
+  // Wait for initial NO_OP to be committed by the leader.
+  ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID, 
kTimeout));
+
   cluster_->tablet_server_by_uuid(followers[2]->uuid())->Shutdown();
   cluster_->tablet_server_by_uuid(followers[3]->uuid())->Shutdown();
   cluster_->tablet_server_by_uuid(leader_ts->uuid())->Shutdown();
@@ -856,15 +857,15 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigLeaderWithPendingConfig) {
   LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
   ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
 
-  // Wait for initial NO_OP to be committed by the leader.
   TServerDetails* leader_ts;
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
-  ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
   ASSERT_EVENTUALLY([&] {
     ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
   });
   ASSERT_EQ(followers.size(), 2);
+  // Wait for initial NO_OP to be committed by the leader.
+  ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID, 
kTimeout));
 
   // Shut down servers follower1 and follower2,
   // so that leader can't replicate future config change ops.
@@ -944,14 +945,14 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigFollowerWithPendingConfig) {
   LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
   ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
 
-  // Wait for initial NO_OP to be committed by the leader.
   TServerDetails* leader_ts;
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
-  ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
   ASSERT_EVENTUALLY([&] {
     ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
   });
+  // Wait for initial NO_OP to be committed by the leader.
+  ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID, 
kTimeout));
 
   // Shut down servers follower1 and follower2,
   // so that leader can't replicate future config change ops.
@@ -1043,14 +1044,14 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigWithPendingConfigsOnWAL) {
   LOG(INFO) << "Tablet locations:\n" << SecureDebugString(tablet_locations);
   ASSERT_TRUE(has_leader) << SecureDebugString(tablet_locations);
 
-  // Wait for initial NO_OP to be committed by the leader.
   TServerDetails* leader_ts;
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
-  ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
   ASSERT_EVENTUALLY([&] {
     ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
   });
+  // Wait for initial NO_OP to be committed by the leader.
+  ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id_, COMMITTED_OPID, 
kTimeout));
 
   // Shut down servers follower1 and follower2,
   // so that leader can't replicate future config change ops.

Reply via email to