This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new a20089f rebalancer_tool: limit max_moves_per_server for PolicyFixer
a20089f is described below
commit a20089faa0ed4bcc9aa3cf01d5b7d55c273faa12
Author: zhangyifan27 <[email protected]>
AuthorDate: Mon Oct 21 19:44:39 2019 +0800
rebalancer_tool: limit max_moves_per_server for PolicyFixer
This patch completed the TODO in PolicyFixer and did some
other cleanup in rebalancer_tool.
Change-Id: I3499299d1fb6bebae7351a1c7a7c9b114990db89
Reviewed-on: http://gerrit.cloudera.org:8080/14519
Tested-by: Kudu Jenkins
Reviewed-by: Alexey Serbin <[email protected]>
---
src/kudu/tools/rebalancer_tool.cc | 69 ++++++++++++++++++++++++---------------
src/kudu/tools/rebalancer_tool.h | 3 ++
2 files changed, 45 insertions(+), 27 deletions(-)
diff --git a/src/kudu/tools/rebalancer_tool.cc
b/src/kudu/tools/rebalancer_tool.cc
index 60231e1..7aa505f 100644
--- a/src/kudu/tools/rebalancer_tool.cc
+++ b/src/kudu/tools/rebalancer_tool.cc
@@ -779,7 +779,6 @@ bool
RebalancerTool::AlgoBasedRunner::ScheduleNextMove(bool* has_errors,
return true;
}
- DCHECK(!s.ok());
// The source replica is not found in the tablet's consensus config
// or the tablet does not exit anymore. The replica might already
// moved because of some other concurrent activity, e.g.
@@ -830,7 +829,6 @@ bool
RebalancerTool::AlgoBasedRunner::UpdateMovesInProgressStatus(
it = scheduled_moves_.erase(it);
continue;
}
- DCHECK(s.ok());
if (is_complete) {
// The move has completed (success or failure): update the stats on the
// pending operations per server.
@@ -1034,10 +1032,8 @@ void
RebalancerTool::AlgoBasedRunner::UpdateOnMoveScheduled(
bool is_success) {
if (is_success) {
Rebalancer::ReplicaMove move_info = { tablet_uuid, src_ts_uuid,
dst_ts_uuid };
- scheduled_moves_.emplace(tablet_uuid, std::move(move_info));
// Only one replica of a tablet can be moved at a time.
- // TODO(aserbin): clarify on duplicates
- //DCHECK(ins.second);
+ EmplaceOrDie(&scheduled_moves_, tablet_uuid, std::move(move_info));
}
UpdateOnMoveScheduledImpl(idx, src_ts_uuid, is_success, &src_op_indices_);
UpdateOnMoveScheduledImpl(idx, dst_ts_uuid, is_success, &dst_op_indices_);
@@ -1164,25 +1160,9 @@ bool RebalancerTool::PolicyFixer::ScheduleNextMove(bool*
has_errors,
*has_errors = true;
return false;
}
-
- // Remove the element from moves_to_schedule_.
- bool erased = false;
- auto range = moves_to_schedule_.equal_range(move_info.ts_uuid_from);
- for (auto it = range.first; it != range.second; ++it) {
- if (move_info.tablet_uuid == it->second.tablet_uuid) {
- moves_to_schedule_.erase(it);
- erased = true;
- break;
- }
- }
- CHECK(erased) << Substitute("T $0 P $1: move information not found",
- move_info.tablet_uuid, move_info.ts_uuid_from);
LOG(INFO) << Substitute("tablet $0: '$1' -> '?' move scheduled",
move_info.tablet_uuid, move_info.ts_uuid_from);
- // Add information on scheduled move into the scheduled_moves_.
- // Only one replica of a tablet can be moved at a time.
- auto tablet_uuid = move_info.tablet_uuid;
- EmplaceOrDie(&scheduled_moves_, std::move(tablet_uuid),
std::move(move_info));
+ UpdateOnMoveScheduled(std::move(move_info));
return true;
}
@@ -1215,7 +1195,6 @@ bool
RebalancerTool::PolicyFixer::UpdateMovesInProgressStatus(
it = scheduled_moves_.erase(it);
continue;
}
- DCHECK(s.ok());
if (is_complete) {
// The replacement has completed (success or failure): update the stats
// on the pending operations per server.
@@ -1297,13 +1276,14 @@ Status RebalancerTool::PolicyFixer::GetNextMovesImpl(
bool RebalancerTool::PolicyFixer::FindNextMove(Rebalancer::ReplicaMove* move) {
DCHECK(move);
- // TODO(aserbin): use pessimistic /2 limit for max_moves_per_servers_
- // since the desitnation servers for the move of the replica marked with
+ // use pessimistic /2 limit for max_moves_per_server_ since the
+ // desitnation servers for the move of the replica marked with
// the REPLACE attribute is not known.
// Load the least loaded (in terms of scheduled moves) tablet servers first.
- for (const auto& elem : ts_per_op_count_) {
- const auto& ts_uuid = elem.second;
+ for (auto it = ts_per_op_count_.begin(); it != ts_per_op_count_.end() &&
+ it->first <= max_moves_per_server_ / 2; ++it) {
+ const auto& ts_uuid = it->second;
if (FindCopy(moves_to_schedule_, ts_uuid, move)) {
return true;
}
@@ -1311,5 +1291,40 @@ bool
RebalancerTool::PolicyFixer::FindNextMove(Rebalancer::ReplicaMove* move) {
return false;
}
+void
RebalancerTool::PolicyFixer::UpdateOnMoveScheduled(Rebalancer::ReplicaMove
move) {
+ const auto tablet_uuid = move.tablet_uuid;
+ const auto ts_uuid = move.ts_uuid_from;
+
+ // Add information on scheduled move into the scheduled_moves_.
+ // Only one replica of a tablet can be moved at a time.
+ EmplaceOrDie(&scheduled_moves_, tablet_uuid, std::move(move));
+
+ // Remove the element from moves_to_schedule_.
+ bool erased = false;
+ auto range = moves_to_schedule_.equal_range(ts_uuid);
+ for (auto it = range.first; it != range.second; ++it) {
+ if (tablet_uuid == it->second.tablet_uuid) {
+ moves_to_schedule_.erase(it);
+ erased = true;
+ break;
+ }
+ }
+ CHECK(erased) << Substitute("T $0 P $1: move information not found",
tablet_uuid, ts_uuid);
+
+ // Update helper containers.
+ const auto op_count = op_count_per_ts_[ts_uuid]++;
+ const auto op_range = ts_per_op_count_.equal_range(op_count);
+ bool ts_op_count_updated = false;
+ for (auto it = op_range.first; it != op_range.second; ++it) {
+ if (it->second == ts_uuid) {
+ ts_per_op_count_.erase(it);
+ ts_per_op_count_.emplace(op_count + 1, ts_uuid);
+ ts_op_count_updated = true;
+ break;
+ }
+ }
+ DCHECK(ts_op_count_updated);
+}
+
} // namespace tools
} // namespace kudu
diff --git a/src/kudu/tools/rebalancer_tool.h b/src/kudu/tools/rebalancer_tool.h
index 2b1f33f..7747e67 100644
--- a/src/kudu/tools/rebalancer_tool.h
+++ b/src/kudu/tools/rebalancer_tool.h
@@ -288,6 +288,9 @@ class RebalancerTool : public rebalance::Rebalancer {
bool FindNextMove(Rebalancer::ReplicaMove* move);
+ // Update the helper containers once a move operation has been scheduled.
+ void UpdateOnMoveScheduled(Rebalancer::ReplicaMove move);
+
// Moves yet to schedule.
MovesToSchedule moves_to_schedule_;
};