This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 9f7c80d [rebalancer_tool] refactor on IgnoredTserversRunner
9f7c80d is described below
commit 9f7c80d630ebd7a7e2da2edacd910bcbd00f29ee
Author: zhangyifan27 <[email protected]>
AuthorDate: Thu Dec 30 11:44:16 2021 +0800
[rebalancer_tool] refactor on IgnoredTserversRunner
ClusterInfo::tservers_to_empty is not needed to be populated when
'BuildClusterInfo' called, we use it in two cases:
- print replica count about tservers_to_empty.
- get replicas running on tservers_to_empty (used in IgnoredTserverRunner).
But 'BuildClusterInfo" could be called in many places.
This patch removed it from ClusterInfo so we could get the information
only when needed.
Change-Id: I4e51349d12fffb54458142169b9e7789c8d3987c
Reviewed-on: http://gerrit.cloudera.org:8080/18132
Tested-by: Alexey Serbin <[email protected]>
Reviewed-by: Alexey Serbin <[email protected]>
---
src/kudu/rebalance/rebalance_algo.h | 4 --
src/kudu/rebalance/rebalancer.cc | 69 +++++++++++++++++++++++-------
src/kudu/rebalance/rebalancer.h | 14 +++++++
src/kudu/tools/rebalancer_tool.cc | 83 ++++++++++++++-----------------------
src/kudu/tools/rebalancer_tool.h | 19 +++------
5 files changed, 106 insertions(+), 83 deletions(-)
diff --git a/src/kudu/rebalance/rebalance_algo.h
b/src/kudu/rebalance/rebalance_algo.h
index b827f0a..67f1267 100644
--- a/src/kudu/rebalance/rebalance_algo.h
+++ b/src/kudu/rebalance/rebalance_algo.h
@@ -88,10 +88,6 @@ struct ClusterInfo {
// Locality information for a cluster.
ClusterLocalityInfo locality;
-
- // Mapping tserver identifier --> total replica count on the server.
- // Replicas on these tablet servers need to move to other tservers in the
cluster.
- std::unordered_map<std::string, int> tservers_to_empty;
};
// A directive to move some replica of a table between two tablet servers.
diff --git a/src/kudu/rebalance/rebalancer.cc b/src/kudu/rebalance/rebalancer.cc
index 9f2e0a2..2d5fdc2 100644
--- a/src/kudu/rebalance/rebalancer.cc
+++ b/src/kudu/rebalance/rebalancer.cc
@@ -321,7 +321,8 @@ Status Rebalancer::BuildClusterInfo(const ClusterRawInfo&
raw_info,
skipped_tserver_msgs.emplace_back(
Substitute("skipping tablet server $0 ($1) because of its "
"non-HEALTHY status ($2)",
- s.uuid, s.address,
+ s.uuid,
+ s.address,
ServerHealthToString(s.health)));
unhealthy_tablet_servers.emplace(s.uuid);
continue;
@@ -454,25 +455,65 @@ Status Rebalancer::BuildClusterInfo(const ClusterRawInfo&
raw_info,
table_info_by_skew.emplace(max_count - min_count, std::move(tbi));
}
- // Populate ClusterInfo::tservers_to_empty.
- // TODO(zhangyifan): extract this to a seperate method.
- if (config_.move_replicas_from_ignored_tservers) {
- auto& tservers_to_empty = result_info.tservers_to_empty;
- for (const auto& ignored_tserver : config_.ignored_tservers) {
- if (ContainsKey(unhealthy_tablet_servers, ignored_tserver)) {
- continue;
- }
- int replica_count = FindWithDefault(tserver_replicas_count,
ignored_tserver, 0);
- tservers_to_empty.emplace(ignored_tserver, replica_count);
- }
- }
-
// TODO(aserbin): add sanity checks on the result.
*info = std::move(result_info);
return Status::OK();
}
+void Rebalancer::GetTServersToEmpty(const ClusterRawInfo& raw_info,
+ unordered_set<string>* tservers_to_empty)
const {
+ DCHECK(tservers_to_empty);
+ tservers_to_empty->clear();
+ for (const auto& ts_summary : raw_info.tserver_summaries) {
+ if (ts_summary.health == ServerHealth::HEALTHY &&
+ ContainsKey(config_.ignored_tservers, ts_summary.uuid)) {
+ tservers_to_empty->emplace(ts_summary.uuid);
+ }
+ }
+}
+
+void Rebalancer::BuildTServersToEmptyInfo(const ClusterRawInfo& raw_info,
+ const MovesInProgress&
moves_in_progress,
+ const unordered_set<string>&
tservers_to_empty,
+ TServersToEmptyMap*
tservers_to_empty_map) {
+ DCHECK(tservers_to_empty_map);
+ tservers_to_empty_map->clear();
+ for (const auto& tablet_summary : raw_info.tablet_summaries) {
+ if (ContainsKey(moves_in_progress, tablet_summary.id)) {
+ continue;
+ }
+ if (tablet_summary.result != cluster_summary::HealthCheckResult::HEALTHY &&
+ tablet_summary.result !=
cluster_summary::HealthCheckResult::RECOVERING) {
+ LOG(INFO) << Substitute(
+ "tablet $0: not considering replicas for movement "
+ "since the tablet's status is '$1'",
+ tablet_summary.id,
+ cluster_summary::HealthCheckResultToString(tablet_summary.result));
+ continue;
+ }
+ TabletInfo tablet_info;
+ for (const auto& replica_info : tablet_summary.replicas) {
+ if (replica_info.is_leader && replica_info.consensus_state) {
+ const auto& cstate = *replica_info.consensus_state;
+ if (cstate.opid_index) {
+ tablet_info.tablet_id = tablet_summary.id;
+ tablet_info.config_idx = *cstate.opid_index;
+ break;
+ }
+ }
+ }
+ for (const auto& replica_info : tablet_summary.replicas) {
+ if (!ContainsKey(tservers_to_empty, replica_info.ts_uuid)) {
+ continue;
+ }
+ auto& tablets =
+ LookupOrEmplace(tservers_to_empty_map, replica_info.ts_uuid,
vector<TabletInfo>());
+ tablets.emplace_back(tablet_info);
+ }
+ }
+}
+
void BuildTabletExtraInfoMap(
const ClusterRawInfo& raw_info,
std::unordered_map<std::string, TabletExtraInfo>* extra_info_by_tablet_id)
{
diff --git a/src/kudu/rebalance/rebalancer.h b/src/kudu/rebalance/rebalancer.h
index 31d3364..15cf5bc 100644
--- a/src/kudu/rebalance/rebalancer.h
+++ b/src/kudu/rebalance/rebalancer.h
@@ -193,6 +193,20 @@ class Rebalancer {
const MovesInProgress& moves_in_progress,
ClusterInfo* info) const;
+ struct TabletInfo {
+ std::string tablet_id;
+ boost::optional<int64_t> config_idx; // For CAS-like change of Raft
configs.
+ };
+
+ // Mapping tserver UUID to tablets on it.
+ typedef std::unordered_map<std::string, std::vector<TabletInfo>>
TServersToEmptyMap;
+ void GetTServersToEmpty(const ClusterRawInfo& raw_info,
+ std::unordered_set<std::string>* tservers_to_empty)
const;
+ static void BuildTServersToEmptyInfo(const ClusterRawInfo& raw_info,
+ const MovesInProgress&
moves_in_progress,
+ const std::unordered_set<std::string>&
tservers_to_empty,
+ TServersToEmptyMap*
tservers_to_empty_map);
+
protected:
// Helper class to find and schedule next available rebalancing move
operation
// and track already scheduled ones.
diff --git a/src/kudu/tools/rebalancer_tool.cc
b/src/kudu/tools/rebalancer_tool.cc
index 45d22b4..b69ca05 100644
--- a/src/kudu/tools/rebalancer_tool.cc
+++ b/src/kudu/tools/rebalancer_tool.cc
@@ -110,8 +110,8 @@ Status RebalancerTool::PrintStats(ostream& out) {
return Status::OK();
}
- // Print information about replica count of healthy ignored tservers.
- RETURN_NOT_OK(PrintIgnoredTserversStats(ci, out));
+ // Print information about tservers need to empty.
+ RETURN_NOT_OK(PrintIgnoredTserversStats(raw_info, out));
if (ts_id_by_location.size() == 1) {
// That's about printing information about the whole cluster.
@@ -350,16 +350,20 @@ Status RebalancerTool::KsckResultsToClusterRawInfo(const
boost::optional<string>
return Status::OK();
}
-Status RebalancerTool::PrintIgnoredTserversStats(const ClusterInfo& ci,
+Status RebalancerTool::PrintIgnoredTserversStats(const ClusterRawInfo&
raw_info,
ostream& out) const {
- const auto& tservers_to_empty = ci.tservers_to_empty;
- if (tservers_to_empty.empty()) {
+ if (config_.ignored_tservers.empty() ||
!config_.move_replicas_from_ignored_tservers) {
return Status::OK();
}
+ unordered_set<string> tservers_to_empty;
+ GetTServersToEmpty(raw_info, &tservers_to_empty);
+ TServersToEmptyMap tservers_to_empty_map;
+ BuildTServersToEmptyInfo(raw_info, MovesInProgress(), tservers_to_empty,
&tservers_to_empty_map);
out << "Per-server replica distribution summary for tservers_to_empty:" <<
endl;
DataTable summary({"Server UUID", "Replica Count"});
- for (const auto& elem: tservers_to_empty) {
- summary.AddRow({ elem.first, to_string(elem.second) });
+ for (const auto& ts : tservers_to_empty) {
+ auto* tablets = FindOrNull(tservers_to_empty_map, ts);
+ summary.AddRow({ts, tablets ? to_string(tablets->size()) : "0"});
}
RETURN_NOT_OK(summary.PrintTo(out));
out << endl;
@@ -1376,52 +1380,26 @@ Status
RebalancerTool::IgnoredTserversRunner::GetReplaceMoves(
const rebalance::ClusterInfo& ci,
const rebalance::ClusterRawInfo& raw_info,
vector<Rebalancer::ReplicaMove>* replica_moves) {
- RETURN_NOT_OK(CheckIgnoredTServers(raw_info, ci));
+ unordered_set<string> tservers_to_empty;
+ rebalancer_->GetTServersToEmpty(raw_info, &tservers_to_empty);
+ RETURN_NOT_OK(CheckIgnoredTServers(raw_info, ci, tservers_to_empty));
- // Build IgnoredTserversInfo.
- IgnoredTserversInfo ignored_tservers_info;
- for (const auto& tablet_summary : raw_info.tablet_summaries) {
- if (ContainsKey(scheduled_moves_, tablet_summary.id)) {
- continue;
- }
- if (tablet_summary.result != cluster_summary::HealthCheckResult::HEALTHY &&
- tablet_summary.result !=
cluster_summary::HealthCheckResult::RECOVERING) {
- LOG(INFO) << Substitute("tablet $0: not considering replicas for
movement "
- "since the tablet's status is '$1'",
- tablet_summary.id,
-
cluster_summary::HealthCheckResultToString(tablet_summary.result));
- continue;
- }
- TabletInfo tablet_info;
- for (const auto& replica_info : tablet_summary.replicas) {
- if (replica_info.is_leader && replica_info.consensus_state) {
- const auto& cstate = *replica_info.consensus_state;
- if (cstate.opid_index) {
- tablet_info.tablet_id = tablet_summary.id;
- tablet_info.config_idx = *cstate.opid_index;
- break;
- }
- }
- }
- for (const auto& replica_info : tablet_summary.replicas) {
- if (!ContainsKey(ci.tservers_to_empty, replica_info.ts_uuid)) {
- continue;
- }
- auto& tablets = LookupOrEmplace(
- &ignored_tservers_info, replica_info.ts_uuid, vector<TabletInfo>());
- tablets.emplace_back(tablet_info);
- }
- }
- GetMovesFromIgnoredTservers(ignored_tservers_info, replica_moves);
+ TServersToEmptyMap tservers_to_empty_map;
+ BuildTServersToEmptyInfo(raw_info, scheduled_moves_, tservers_to_empty,
&tservers_to_empty_map);
+
+ GetMovesFromIgnoredTservers(tservers_to_empty_map, replica_moves);
return Status::OK();
}
-Status RebalancerTool::IgnoredTserversRunner::CheckIgnoredTServers(const
ClusterRawInfo& raw_info,
- const
ClusterInfo& ci) {
- if (ci.tservers_to_empty.empty()) {
+Status RebalancerTool::IgnoredTserversRunner::CheckIgnoredTServers(
+ const ClusterRawInfo& raw_info,
+ const ClusterInfo& ci,
+ const unordered_set<string>& tservers_to_empty) {
+ if (tservers_to_empty.empty()) {
return Status::OK();
}
+ // Check whether it is possible to move replicas emptying some tablet
servers.
int remaining_tservers_count =
ci.balance.servers_by_total_replica_count.size();
int max_replication_factor = 0;
for (const auto& s : raw_info.table_summaries) {
@@ -1435,18 +1413,19 @@ Status
RebalancerTool::IgnoredTserversRunner::CheckIgnoredTServers(const Cluster
max_replication_factor));
}
- for (const auto& it : ci.tservers_to_empty) {
- const string& ts_uuid = it.first;
- if (!ContainsKey(raw_info.tservers_in_maintenance_mode, ts_uuid)) {
- return Status::IllegalState(Substitute(
- "You should set maintenance mode for tablet server $0 first",
ts_uuid));
+ // Make sure tablet servers that we need to empty are set maintenance mode.
+ for (const auto& ts : tservers_to_empty) {
+ if (!ContainsKey(raw_info.tservers_in_maintenance_mode, ts)) {
+ return Status::IllegalState(
+ Substitute("You should set maintenance mode for tablet server $0
first", ts));
}
}
+
return Status::OK();
}
void RebalancerTool::IgnoredTserversRunner::GetMovesFromIgnoredTservers(
- const IgnoredTserversInfo& ignored_tservers_info,
+ const TServersToEmptyMap& ignored_tservers_info,
vector<Rebalancer::ReplicaMove>* replica_moves) {
DCHECK(replica_moves);
if (ignored_tservers_info.empty()) {
diff --git a/src/kudu/tools/rebalancer_tool.h b/src/kudu/tools/rebalancer_tool.h
index fb601b4..2beda27 100644
--- a/src/kudu/tools/rebalancer_tool.h
+++ b/src/kudu/tools/rebalancer_tool.h
@@ -343,14 +343,6 @@ class RebalancerTool : public rebalance::Rebalancer {
// Key is tserver UUID which corresponds to value.ts_uuid_from.
typedef std::unordered_multimap<std::string, Rebalancer::ReplicaMove>
MovesToSchedule;
- struct TabletInfo {
- std::string tablet_id;
- boost::optional<int64_t> config_idx; // For CAS-like change of Raft
configs.
- };
-
- // Mapping tserver UUID to tablets on it.
- typedef std::unordered_map<std::string, std::vector<TabletInfo>>
IgnoredTserversInfo;
-
// Get replica moves to move replicas from healthy ignored tservers.
// If returns Status::OK() with replica_moves empty, there would be
// no replica on the healthy ignored tservers.
@@ -360,10 +352,11 @@ class RebalancerTool : public rebalance::Rebalancer {
// Return Status::OK() if it's safe to move all replicas from the ignored
to other servers
// and all tservers that need to empty are in maintenance mode.
- Status CheckIgnoredTServers(const rebalance::ClusterRawInfo& raw_info,
- const rebalance::ClusterInfo& ci);
+ static Status CheckIgnoredTServers(const rebalance::ClusterRawInfo&
raw_info,
+ const rebalance::ClusterInfo& ci,
+ const std::unordered_set<std::string>&
tservers_to_empty);
- void GetMovesFromIgnoredTservers(const IgnoredTserversInfo&
ignored_tservers_info,
+ void GetMovesFromIgnoredTservers(const TServersToEmptyMap&
ignored_tservers_info,
std::vector<Rebalancer::ReplicaMove>*
replica_moves);
// Random device and generator for selecting among multiple choices, when
appropriate.
@@ -380,8 +373,8 @@ class RebalancerTool : public rebalance::Rebalancer {
const KsckResults& ksck_info,
rebalance::ClusterRawInfo* raw_info);
- // Print replica count infomation on ClusterInfo::tservers_to_empty.
- Status PrintIgnoredTserversStats(const rebalance::ClusterInfo& ci,
+ // Print replica count infomation about tservers need to empty.
+ Status PrintIgnoredTserversStats(const rebalance::ClusterRawInfo& raw_info,
std::ostream& out) const;
// Print information on the cross-location balance.