This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 5975eb9  KUDU-2962 Fix kudu::itest::FindTabletFollowers() test utility 
function
5975eb9 is described below

commit 5975eb9ace115d8a4783cc4cb7438387397e46da
Author: Bankim Bhavsar <[email protected]>
AuthorDate: Tue Oct 22 17:10:29 2019 -0700

    KUDU-2962 Fix kudu::itest::FindTabletFollowers() test utility function
    
    Fixes the FindTabletFollowers() test utility function to query
    each tablet server and form the list of followers instead
    of simply finding the leader replica and returning supplied
    tablet servers with the leader replica removed.
    
    Earlier implementation is incorrect when one or more of the
    supplied tablet servers does not host the specified tablet_id.
    
    Change-Id: I94146873b09cb95c54175d16305b713365cfeaa8
    Reviewed-on: http://gerrit.cloudera.org:8080/14533
    Reviewed-by: Alexey Serbin <[email protected]>
    Tested-by: Kudu Jenkins
---
 src/kudu/integration-tests/cluster_itest_util.cc   | 101 +++++++++++++++++++--
 .../timestamp_advancement-itest.cc                 |   4 +-
 src/kudu/tools/kudu-admin-test.cc                  |  70 ++++++++++++--
 3 files changed, 160 insertions(+), 15 deletions(-)

diff --git a/src/kudu/integration-tests/cluster_itest_util.cc 
b/src/kudu/integration-tests/cluster_itest_util.cc
index 2d520f6..7e12c56 100644
--- a/src/kudu/integration-tests/cluster_itest_util.cc
+++ b/src/kudu/integration-tests/cluster_itest_util.cc
@@ -19,6 +19,8 @@
 
 #include <algorithm>
 #include <ostream>
+#include <set>
+#include <utility>
 
 #include <boost/optional/optional.hpp>
 #include <glog/logging.h>
@@ -39,6 +41,7 @@
 #include "kudu/gutil/basictypes.h"
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/port.h"
+#include "kudu/gutil/stl_util.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/master/master.pb.h"
 #include "kudu/master/master.proxy.h"
@@ -642,19 +645,103 @@ Status FindTabletLeader(const TabletServerMap& 
tablet_servers,
                                      s.ToString()));
 }
 
+// Fills the out parameter "followers" with the tablet servers hosting the 
"tablet_id".
+// Non-empty "tablet_servers" is expected to include all the tablet servers 
and not subset
+// of tablet servers that host the "tablet_id".
+// Returns an error if the tablet servers do not have consensus or cannot be 
reached.
 Status FindTabletFollowers(const TabletServerMap& tablet_servers,
                            const string& tablet_id,
                            const MonoDelta& timeout,
                            vector<TServerDetails*>* followers) {
-  vector<TServerDetails*> tservers;
-  AppendValuesFromMap(tablet_servers, &tservers);
-  TServerDetails* leader;
-  RETURN_NOT_OK(FindTabletLeader(tablet_servers, tablet_id, timeout, &leader));
-  for (TServerDetails* ts : tservers) {
-    if (ts->uuid() != leader->uuid()) {
-      followers->push_back(ts);
+  DCHECK(!tablet_servers.empty());
+
+  // Sorted sets allow for set intersection needed below.
+  // uuids of all supplied tablet servers.
+  std::set<string> tablet_server_uuids;
+  // uuids of the tablet server peers that host the specified tablet_id.
+  std::set<string> peer_uuids;
+  // uuid of the leader tablet server hosting the specified tablet_id.
+  string leader_uuid;
+  const MonoTime start = MonoTime::Now();
+  const MonoTime deadline = MonoTime::Now() + timeout;
+  bool no_leader_or_peers_found = false;
+  for (const auto& entry : tablet_servers) {
+    const auto now = MonoTime::Now();
+    if (now > deadline) {
+      return Status::TimedOut(Substitute("Unable to find followers for tablet 
$0 after $1.",
+                                         tablet_id, (now - start).ToString()));
     }
+    const auto& tserver_uuid = entry.first;
+    const auto* tserver = entry.second;
+    tablet_server_uuids.emplace(tserver_uuid);
+
+    const MonoDelta remaining_timeout = deadline - now;
+    ConsensusStatePB cstate;
+    Status s = GetConsensusState(tserver, tablet_id, remaining_timeout, 
EXCLUDE_HEALTH_REPORT,
+                                 &cstate);
+    if (!s.ok()) {
+      // Failure could be due to tablet server not hosting the tablet which is 
okay or other issue.
+      // If all the tablet servers return error then the failure is reported 
back. See below.
+      continue;
+    }
+    // At this point tablet server does host the tablet but it's possible 
there is no leader
+    // or peers are unknown.
+    if (cstate.committed_config().peers_size() == 0 || 
!cstate.has_leader_uuid()) {
+      no_leader_or_peers_found = true;
+      continue;
+    }
+
+    std::set<string> curr_peer_uuids;
+    for (const auto& peer : cstate.committed_config().peers()) {
+      curr_peer_uuids.emplace(peer.permanent_uuid());
+    }
+    DCHECK(!curr_peer_uuids.empty());
+    DCHECK(!cstate.leader_uuid().empty());
+    if (!leader_uuid.empty()) {
+      DCHECK(!peer_uuids.empty());
+      // Sanity checking that tablet servers with the specified tablet are 
reporting
+      // the same leader and set of peers.
+      if (leader_uuid != cstate.leader_uuid()) {
+        return Status::IllegalState(Substitute(
+            "Leader $0 reported by tablet server $1 for tablet $2 doesn't 
match with leader $3 "
+            "reported by other tablet servers.", cstate.leader_uuid(), 
tserver_uuid, tablet_id,
+            leader_uuid));
+      }
+      if (peer_uuids != curr_peer_uuids) {
+        return Status::IllegalState(Substitute(
+            "Peers reported by tablet server $0 for tablet $1 don't match with 
peers reported by "
+            "other tablet servers.", tserver_uuid, tablet_id));
+      }
+    } else {
+      DCHECK(peer_uuids.empty());
+      leader_uuid = cstate.leader_uuid();
+      peer_uuids.swap(curr_peer_uuids);
+    }
+  }
+
+  // Unable to get leader and peer information from multiple supplied tablet 
servers.
+  if (leader_uuid.empty()) {
+    DCHECK(peer_uuids.empty());
+
+    return no_leader_or_peers_found ?
+           Status::IllegalState(
+               Substitute("No leader or peers found for tablet $0.", 
tablet_id)) :
+           Status::NotFound(
+               Substitute("No tablet server found with tablet $0 or tablet 
servers not reachable.",
+                          tablet_id));
   }
+
+  if (peer_uuids != STLSetIntersection(peer_uuids, tablet_server_uuids)) {
+    return Status::InvalidArgument(Substitute(
+        "Not all peers reported by tablet servers are part of the supplied 
tablet servers."));
+  }
+
+  for (const auto& tserver_uuid : peer_uuids) {
+    if (tserver_uuid != leader_uuid) {
+      followers->emplace_back(FindOrDie(tablet_servers, tserver_uuid));
+    }
+  }
+
   return Status::OK();
 }
 
diff --git a/src/kudu/integration-tests/timestamp_advancement-itest.cc 
b/src/kudu/integration-tests/timestamp_advancement-itest.cc
index 344bcf6..7cede18 100644
--- a/src/kudu/integration-tests/timestamp_advancement-itest.cc
+++ b/src/kudu/integration-tests/timestamp_advancement-itest.cc
@@ -302,7 +302,9 @@ TEST_F(TimestampAdvancementITest, Kudu2463Test) {
   TServerDetails* leader;
   ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader));
   vector<TServerDetails*> followers;
-  ASSERT_OK(FindTabletFollowers(ts_map_, tablet_id, kTimeout, &followers));
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(FindTabletFollowers(ts_map_, tablet_id, kTimeout, &followers));
+  });
   ASSERT_FALSE(followers.empty());
   for (int i = 0; i < 20; i++) {
     RaftPeerPB::MemberType type = i % 2 == 0 ? RaftPeerPB::NON_VOTER : 
RaftPeerPB::VOTER;
diff --git a/src/kudu/tools/kudu-admin-test.cc 
b/src/kudu/tools/kudu-admin-test.cc
index feb4d35..6670975 100644
--- a/src/kudu/tools/kudu-admin-test.cc
+++ b/src/kudu/tools/kudu-admin-test.cc
@@ -180,6 +180,48 @@ class AdminCliTest : public 
tserver::TabletServerIntegrationTestBase {
   }
 };
 
+TEST_F(AdminCliTest, TestFindTabletFollowers) {
+  FLAGS_num_tablet_servers = 5;
+  FLAGS_num_replicas = 3;
+
+  NO_FATALS(BuildAndStart());
+
+  TServerDetails* leader;
+  const auto timeout = MonoDelta::FromSeconds(30);
+  ASSERT_OK(FindTabletLeader(tablet_servers_, tablet_id_, timeout, &leader));
+  vector<TServerDetails*> followers;
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(FindTabletFollowers(tablet_servers_, tablet_id_, timeout, 
&followers));
+  });
+  ASSERT_EQ(FLAGS_num_replicas - 1, followers.size());
+  for (const auto& follower : followers) {
+    ASSERT_NE(leader->uuid(), follower->uuid());
+  }
+}
+
+TEST_F(AdminCliTest, TestFindTabletFollowersWithIncompleteTabletServers) {
+  FLAGS_num_tablet_servers = 5;
+  FLAGS_num_replicas = 3;
+
+  NO_FATALS(BuildAndStart());
+
+  TServerDetails* leader;
+  const auto timeout = MonoDelta::FromSeconds(30);
+  ASSERT_OK(FindTabletLeader(tablet_servers_, tablet_id_, timeout, &leader));
+
+  // Incomplete tablet servers that contains only the leader tablet server.
+  TabletServerMap incomplete_tablet_servers;
+  InsertOrDie(&incomplete_tablet_servers, leader->uuid(), leader);
+  vector<TServerDetails*> followers;
+  ASSERT_EVENTUALLY([&] {
+    Status s = FindTabletFollowers(incomplete_tablet_servers, tablet_id_, 
timeout, &followers);
+    ASSERT_TRUE(s.IsInvalidArgument());
+    ASSERT_STR_CONTAINS(
+        s.ToString(),
+        "Not all peers reported by tablet servers are part of the supplied 
tablet servers");
+  });
+}
+
 // Test config change while running a workload.
 // 1. Instantiate external mini cluster with 3 TS.
 // 2. Create table with 2 replicas.
@@ -479,7 +521,9 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigOnSingleFollower) {
   TServerDetails* leader_ts;
   vector<TServerDetails*> followers;
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id, kTimeout, 
&leader_ts));
-  ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id, kTimeout, 
&followers));
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id, kTimeout, 
&followers));
+  });
   OpId opid;
   ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id, COMMITTED_OPID, 
kTimeout, &opid));
 
@@ -569,7 +613,9 @@ TEST_F(AdminCliTest, TestUnsafeChangeConfigOnSingleLeader) {
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
   ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
-  ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  });
 
   // Shut down servers follower1 and follower2,
   // so that we can force new config on remaining leader.
@@ -652,7 +698,9 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigForConfigWithTwoNodes) {
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
   ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
-  ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  });
 
   // Shut down leader and prepare 2-node config.
   cluster_->tablet_server_by_uuid(leader_ts->uuid())->Shutdown();
@@ -734,7 +782,9 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigWithFiveReplicaConfig) {
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
   ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
-  ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  });
   ASSERT_EQ(followers.size(), 4);
   cluster_->tablet_server_by_uuid(followers[2]->uuid())->Shutdown();
   cluster_->tablet_server_by_uuid(followers[3]->uuid())->Shutdown();
@@ -811,7 +861,9 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigLeaderWithPendingConfig) {
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
   ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
-  ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  });
   ASSERT_EQ(followers.size(), 2);
 
   // Shut down servers follower1 and follower2,
@@ -897,7 +949,9 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigFollowerWithPendingConfig) {
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
   ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
-  ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  });
 
   // Shut down servers follower1 and follower2,
   // so that leader can't replicate future config change ops.
@@ -994,7 +1048,9 @@ TEST_F(AdminCliTest, 
TestUnsafeChangeConfigWithPendingConfigsOnWAL) {
   ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout, 
&leader_ts));
   ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
   vector<TServerDetails*> followers;
-  ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  ASSERT_EVENTUALLY([&] {
+    ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout, 
&followers));
+  });
 
   // Shut down servers follower1 and follower2,
   // so that leader can't replicate future config change ops.

Reply via email to