This is an automated email from the ASF dual-hosted git repository.
awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 5975eb9 KUDU-2962 Fix kudu::itest::FindTabletFollowers() test utility
function
5975eb9 is described below
commit 5975eb9ace115d8a4783cc4cb7438387397e46da
Author: Bankim Bhavsar <[email protected]>
AuthorDate: Tue Oct 22 17:10:29 2019 -0700
KUDU-2962 Fix kudu::itest::FindTabletFollowers() test utility function
Fixes the FindTabletFollowers() test utility function to query
each tablet server and form the list of followers instead
of simply finding the leader replica and returning supplied
tablet servers with the leader replica removed.
Earlier implementation is incorrect when one or more of the
supplied tablet servers does not host the specified tablet_id.
Change-Id: I94146873b09cb95c54175d16305b713365cfeaa8
Reviewed-on: http://gerrit.cloudera.org:8080/14533
Reviewed-by: Alexey Serbin <[email protected]>
Tested-by: Kudu Jenkins
---
src/kudu/integration-tests/cluster_itest_util.cc | 101 +++++++++++++++++++--
.../timestamp_advancement-itest.cc | 4 +-
src/kudu/tools/kudu-admin-test.cc | 70 ++++++++++++--
3 files changed, 160 insertions(+), 15 deletions(-)
diff --git a/src/kudu/integration-tests/cluster_itest_util.cc
b/src/kudu/integration-tests/cluster_itest_util.cc
index 2d520f6..7e12c56 100644
--- a/src/kudu/integration-tests/cluster_itest_util.cc
+++ b/src/kudu/integration-tests/cluster_itest_util.cc
@@ -19,6 +19,8 @@
#include <algorithm>
#include <ostream>
+#include <set>
+#include <utility>
#include <boost/optional/optional.hpp>
#include <glog/logging.h>
@@ -39,6 +41,7 @@
#include "kudu/gutil/basictypes.h"
#include "kudu/gutil/map-util.h"
#include "kudu/gutil/port.h"
+#include "kudu/gutil/stl_util.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/master/master.pb.h"
#include "kudu/master/master.proxy.h"
@@ -642,19 +645,103 @@ Status FindTabletLeader(const TabletServerMap&
tablet_servers,
s.ToString()));
}
+// Fills the out parameter "followers" with the tablet servers hosting the
"tablet_id".
+// Non-empty "tablet_servers" is expected to include all the tablet servers
and not subset
+// of tablet servers that host the "tablet_id".
+// Returns an error if the tablet servers do not have consensus or cannot be
reached.
Status FindTabletFollowers(const TabletServerMap& tablet_servers,
const string& tablet_id,
const MonoDelta& timeout,
vector<TServerDetails*>* followers) {
- vector<TServerDetails*> tservers;
- AppendValuesFromMap(tablet_servers, &tservers);
- TServerDetails* leader;
- RETURN_NOT_OK(FindTabletLeader(tablet_servers, tablet_id, timeout, &leader));
- for (TServerDetails* ts : tservers) {
- if (ts->uuid() != leader->uuid()) {
- followers->push_back(ts);
+ DCHECK(!tablet_servers.empty());
+
+ // Sorted sets allow for set intersection needed below.
+ // uuids of all supplied tablet servers.
+ std::set<string> tablet_server_uuids;
+ // uuids of the tablet server peers that host the specified tablet_id.
+ std::set<string> peer_uuids;
+ // uuid of the leader tablet server hosting the specified tablet_id.
+ string leader_uuid;
+ const MonoTime start = MonoTime::Now();
+ const MonoTime deadline = MonoTime::Now() + timeout;
+ bool no_leader_or_peers_found = false;
+ for (const auto& entry : tablet_servers) {
+ const auto now = MonoTime::Now();
+ if (now > deadline) {
+ return Status::TimedOut(Substitute("Unable to find followers for tablet
$0 after $1.",
+ tablet_id, (now - start).ToString()));
}
+ const auto& tserver_uuid = entry.first;
+ const auto* tserver = entry.second;
+ tablet_server_uuids.emplace(tserver_uuid);
+
+ const MonoDelta remaining_timeout = deadline - now;
+ ConsensusStatePB cstate;
+ Status s = GetConsensusState(tserver, tablet_id, remaining_timeout,
EXCLUDE_HEALTH_REPORT,
+ &cstate);
+ if (!s.ok()) {
+ // Failure could be due to tablet server not hosting the tablet which is
okay or other issue.
+ // If all the tablet servers return error then the failure is reported
back. See below.
+ continue;
+ }
+ // At this point tablet server does host the tablet but it's possible
there is no leader
+ // or peers are unknown.
+ if (cstate.committed_config().peers_size() == 0 ||
!cstate.has_leader_uuid()) {
+ no_leader_or_peers_found = true;
+ continue;
+ }
+
+ std::set<string> curr_peer_uuids;
+ for (const auto& peer : cstate.committed_config().peers()) {
+ curr_peer_uuids.emplace(peer.permanent_uuid());
+ }
+ DCHECK(!curr_peer_uuids.empty());
+ DCHECK(!cstate.leader_uuid().empty());
+ if (!leader_uuid.empty()) {
+ DCHECK(!peer_uuids.empty());
+ // Sanity checking that tablet servers with the specified tablet are
reporting
+ // the same leader and set of peers.
+ if (leader_uuid != cstate.leader_uuid()) {
+ return Status::IllegalState(Substitute(
+ "Leader $0 reported by tablet server $1 for tablet $2 doesn't
match with leader $3 "
+ "reported by other tablet servers.", cstate.leader_uuid(),
tserver_uuid, tablet_id,
+ leader_uuid));
+ }
+ if (peer_uuids != curr_peer_uuids) {
+ return Status::IllegalState(Substitute(
+ "Peers reported by tablet server $0 for tablet $1 don't match with
peers reported by "
+ "other tablet servers.", tserver_uuid, tablet_id));
+ }
+ } else {
+ DCHECK(peer_uuids.empty());
+ leader_uuid = cstate.leader_uuid();
+ peer_uuids.swap(curr_peer_uuids);
+ }
+ }
+
+ // Unable to get leader and peer information from multiple supplied tablet
servers.
+ if (leader_uuid.empty()) {
+ DCHECK(peer_uuids.empty());
+
+ return no_leader_or_peers_found ?
+ Status::IllegalState(
+ Substitute("No leader or peers found for tablet $0.",
tablet_id)) :
+ Status::NotFound(
+ Substitute("No tablet server found with tablet $0 or tablet
servers not reachable.",
+ tablet_id));
}
+
+ if (peer_uuids != STLSetIntersection(peer_uuids, tablet_server_uuids)) {
+ return Status::InvalidArgument(Substitute(
+ "Not all peers reported by tablet servers are part of the supplied
tablet servers."));
+ }
+
+ for (const auto& tserver_uuid : peer_uuids) {
+ if (tserver_uuid != leader_uuid) {
+ followers->emplace_back(FindOrDie(tablet_servers, tserver_uuid));
+ }
+ }
+
return Status::OK();
}
diff --git a/src/kudu/integration-tests/timestamp_advancement-itest.cc
b/src/kudu/integration-tests/timestamp_advancement-itest.cc
index 344bcf6..7cede18 100644
--- a/src/kudu/integration-tests/timestamp_advancement-itest.cc
+++ b/src/kudu/integration-tests/timestamp_advancement-itest.cc
@@ -302,7 +302,9 @@ TEST_F(TimestampAdvancementITest, Kudu2463Test) {
TServerDetails* leader;
ASSERT_OK(FindTabletLeader(ts_map_, tablet_id, kTimeout, &leader));
vector<TServerDetails*> followers;
- ASSERT_OK(FindTabletFollowers(ts_map_, tablet_id, kTimeout, &followers));
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(FindTabletFollowers(ts_map_, tablet_id, kTimeout, &followers));
+ });
ASSERT_FALSE(followers.empty());
for (int i = 0; i < 20; i++) {
RaftPeerPB::MemberType type = i % 2 == 0 ? RaftPeerPB::NON_VOTER :
RaftPeerPB::VOTER;
diff --git a/src/kudu/tools/kudu-admin-test.cc
b/src/kudu/tools/kudu-admin-test.cc
index feb4d35..6670975 100644
--- a/src/kudu/tools/kudu-admin-test.cc
+++ b/src/kudu/tools/kudu-admin-test.cc
@@ -180,6 +180,48 @@ class AdminCliTest : public
tserver::TabletServerIntegrationTestBase {
}
};
+TEST_F(AdminCliTest, TestFindTabletFollowers) {
+ FLAGS_num_tablet_servers = 5;
+ FLAGS_num_replicas = 3;
+
+ NO_FATALS(BuildAndStart());
+
+ TServerDetails* leader;
+ const auto timeout = MonoDelta::FromSeconds(30);
+ ASSERT_OK(FindTabletLeader(tablet_servers_, tablet_id_, timeout, &leader));
+ vector<TServerDetails*> followers;
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(FindTabletFollowers(tablet_servers_, tablet_id_, timeout,
&followers));
+ });
+ ASSERT_EQ(FLAGS_num_replicas - 1, followers.size());
+ for (const auto& follower : followers) {
+ ASSERT_NE(leader->uuid(), follower->uuid());
+ }
+}
+
+TEST_F(AdminCliTest, TestFindTabletFollowersWithIncompleteTabletServers) {
+ FLAGS_num_tablet_servers = 5;
+ FLAGS_num_replicas = 3;
+
+ NO_FATALS(BuildAndStart());
+
+ TServerDetails* leader;
+ const auto timeout = MonoDelta::FromSeconds(30);
+ ASSERT_OK(FindTabletLeader(tablet_servers_, tablet_id_, timeout, &leader));
+
+ // Incomplete tablet servers that contains only the leader tablet server.
+ TabletServerMap incomplete_tablet_servers;
+ InsertOrDie(&incomplete_tablet_servers, leader->uuid(), leader);
+ vector<TServerDetails*> followers;
+ ASSERT_EVENTUALLY([&] {
+ Status s = FindTabletFollowers(incomplete_tablet_servers, tablet_id_,
timeout, &followers);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_STR_CONTAINS(
+ s.ToString(),
+ "Not all peers reported by tablet servers are part of the supplied
tablet servers");
+ });
+}
+
// Test config change while running a workload.
// 1. Instantiate external mini cluster with 3 TS.
// 2. Create table with 2 replicas.
@@ -479,7 +521,9 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigOnSingleFollower) {
TServerDetails* leader_ts;
vector<TServerDetails*> followers;
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id, kTimeout,
&leader_ts));
- ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id, kTimeout,
&followers));
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id, kTimeout,
&followers));
+ });
OpId opid;
ASSERT_OK(WaitForOpFromCurrentTerm(leader_ts, tablet_id, COMMITTED_OPID,
kTimeout, &opid));
@@ -569,7 +613,9 @@ TEST_F(AdminCliTest, TestUnsafeChangeConfigOnSingleLeader) {
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
- ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ });
// Shut down servers follower1 and follower2,
// so that we can force new config on remaining leader.
@@ -652,7 +698,9 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigForConfigWithTwoNodes) {
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
- ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ });
// Shut down leader and prepare 2-node config.
cluster_->tablet_server_by_uuid(leader_ts->uuid())->Shutdown();
@@ -734,7 +782,9 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigWithFiveReplicaConfig) {
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
- ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ });
ASSERT_EQ(followers.size(), 4);
cluster_->tablet_server_by_uuid(followers[2]->uuid())->Shutdown();
cluster_->tablet_server_by_uuid(followers[3]->uuid())->Shutdown();
@@ -811,7 +861,9 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigLeaderWithPendingConfig) {
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
- ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ });
ASSERT_EQ(followers.size(), 2);
// Shut down servers follower1 and follower2,
@@ -897,7 +949,9 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigFollowerWithPendingConfig) {
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
- ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ });
// Shut down servers follower1 and follower2,
// so that leader can't replicate future config change ops.
@@ -994,7 +1048,9 @@ TEST_F(AdminCliTest,
TestUnsafeChangeConfigWithPendingConfigsOnWAL) {
ASSERT_OK(FindTabletLeader(active_tablet_servers, tablet_id_, kTimeout,
&leader_ts));
ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id_, kTimeout));
vector<TServerDetails*> followers;
- ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ ASSERT_EVENTUALLY([&] {
+ ASSERT_OK(FindTabletFollowers(active_tablet_servers, tablet_id_, kTimeout,
&followers));
+ });
// Shut down servers follower1 and follower2,
// so that leader can't replicate future config change ops.