saintstack commented on a change in pull request #3356:
URL: https://github.com/apache/hbase/pull/3356#discussion_r668085556
##########
File path:
hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
##########
@@ -303,54 +303,55 @@ private String getBalanceReason(double total, double
sumMultiplier) {
boolean needsBalance(TableName tableName, BalancerClusterState cluster) {
ClusterLoadState cs = new ClusterLoadState(cluster.clusterState);
if (cs.getNumServers() < MIN_SERVER_BALANCE) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Not running balancer because only " + cs.getNumServers()
+ LOG.info("Not running balancer because only " + cs.getNumServers()
+ " active regionserver(s)");
- }
sendRejectionReasonToRingBuffer(() -> "The number of RegionServers " +
cs.getNumServers() +
" < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null);
return false;
}
if (areSomeRegionReplicasColocated(cluster)) {
+ LOG.info("Running balancer because at least one server hosts replicas of
the same region.");
Review comment:
good
##########
File path:
hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
##########
@@ -303,54 +303,55 @@ private String getBalanceReason(double total, double
sumMultiplier) {
boolean needsBalance(TableName tableName, BalancerClusterState cluster) {
ClusterLoadState cs = new ClusterLoadState(cluster.clusterState);
if (cs.getNumServers() < MIN_SERVER_BALANCE) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Not running balancer because only " + cs.getNumServers()
+ LOG.info("Not running balancer because only " + cs.getNumServers()
+ " active regionserver(s)");
- }
sendRejectionReasonToRingBuffer(() -> "The number of RegionServers " +
cs.getNumServers() +
" < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null);
return false;
}
if (areSomeRegionReplicasColocated(cluster)) {
+ LOG.info("Running balancer because at least one server hosts replicas of
the same region.");
return true;
}
if (idleRegionServerExist(cluster)){
+ LOG.info("Running balancer because cluster has idle server(s).");
Review comment:
And idea is that a rebalance would make it so the load is distributed?
##########
File path:
hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
##########
@@ -423,8 +424,9 @@ private long calculateMaxSteps(BalancerClusterState
cluster) {
maxSteps);
}
}
- LOG.info("start StochasticLoadBalancer.balancer, initCost=" + currentCost
+ ", functionCost="
- + functionCost() + " computedMaxSteps: " + computedMaxSteps);
+ LOG.info("Start StochasticLoadBalancer.balancer, initial weighted average
imbalance={}, "
Review comment:
Could we say anything here on what these values mean ? For the operator?
(Need a ',' after the functionCost).
##########
File path:
hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
##########
@@ -303,54 +303,55 @@ private String getBalanceReason(double total, double
sumMultiplier) {
boolean needsBalance(TableName tableName, BalancerClusterState cluster) {
ClusterLoadState cs = new ClusterLoadState(cluster.clusterState);
if (cs.getNumServers() < MIN_SERVER_BALANCE) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Not running balancer because only " + cs.getNumServers()
+ LOG.info("Not running balancer because only " + cs.getNumServers()
+ " active regionserver(s)");
- }
sendRejectionReasonToRingBuffer(() -> "The number of RegionServers " +
cs.getNumServers() +
" < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null);
return false;
}
if (areSomeRegionReplicasColocated(cluster)) {
+ LOG.info("Running balancer because at least one server hosts replicas of
the same region.");
return true;
}
if (idleRegionServerExist(cluster)){
+ LOG.info("Running balancer because cluster has idle server(s).");
return true;
}
+ sumMultiplier = 0.0f;
double total = 0.0;
- float sumMultiplier = 0.0f;
for (CostFunction c : costFunctions) {
float multiplier = c.getMultiplier();
- if (multiplier <= 0) {
- LOG.trace("{} not needed because multiplier is <= 0",
c.getClass().getSimpleName());
- continue;
- }
+ double cost = c.cost();
if (!c.isNeeded()) {
LOG.trace("{} not needed", c.getClass().getSimpleName());
continue;
}
+ total += cost * multiplier;
sumMultiplier += multiplier;
- total += c.cost() * multiplier;
+ }
+ if (sumMultiplier <= 0) {
+ LOG.error("At least one cost function needs a multiplier > 0. For
example, set "
Review comment:
Good
##########
File path:
hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
##########
@@ -423,8 +424,9 @@ private long calculateMaxSteps(BalancerClusterState
cluster) {
maxSteps);
}
}
- LOG.info("start StochasticLoadBalancer.balancer, initCost=" + currentCost
+ ", functionCost="
- + functionCost() + " computedMaxSteps: " + computedMaxSteps);
+ LOG.info("Start StochasticLoadBalancer.balancer, initial weighted average
imbalance={}, "
Review comment:
imbalance?
(Looked it up... "lack of proportion or relation between corresponding
things." <= Nice)
##########
File path:
hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
##########
@@ -303,54 +303,55 @@ private String getBalanceReason(double total, double
sumMultiplier) {
boolean needsBalance(TableName tableName, BalancerClusterState cluster) {
ClusterLoadState cs = new ClusterLoadState(cluster.clusterState);
if (cs.getNumServers() < MIN_SERVER_BALANCE) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Not running balancer because only " + cs.getNumServers()
+ LOG.info("Not running balancer because only " + cs.getNumServers()
+ " active regionserver(s)");
- }
sendRejectionReasonToRingBuffer(() -> "The number of RegionServers " +
cs.getNumServers() +
" < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null);
return false;
}
if (areSomeRegionReplicasColocated(cluster)) {
+ LOG.info("Running balancer because at least one server hosts replicas of
the same region.");
return true;
}
if (idleRegionServerExist(cluster)){
+ LOG.info("Running balancer because cluster has idle server(s).");
return true;
}
+ sumMultiplier = 0.0f;
double total = 0.0;
- float sumMultiplier = 0.0f;
for (CostFunction c : costFunctions) {
float multiplier = c.getMultiplier();
- if (multiplier <= 0) {
- LOG.trace("{} not needed because multiplier is <= 0",
c.getClass().getSimpleName());
- continue;
- }
+ double cost = c.cost();
if (!c.isNeeded()) {
LOG.trace("{} not needed", c.getClass().getSimpleName());
continue;
}
+ total += cost * multiplier;
sumMultiplier += multiplier;
- total += c.cost() * multiplier;
+ }
+ if (sumMultiplier <= 0) {
+ LOG.error("At least one cost function needs a multiplier > 0. For
example, set "
+ + "hbase.master.balancer.stochastic.regionCountCost to a positive
value or default");
+ return false;
}
- boolean balanced = total <= 0 || sumMultiplier <= 0 ||
- (sumMultiplier > 0 && (total / sumMultiplier) < minCostNeedBalance);
+ boolean balanced = (total / sumMultiplier < minCostNeedBalance);
+
if (balanced) {
final double calculatedTotal = total;
- final double calculatedMultiplier = sumMultiplier;
- sendRejectionReasonToRingBuffer(() -> getBalanceReason(calculatedTotal,
calculatedMultiplier),
- costFunctions);
- }
- if (LOG.isDebugEnabled()) {
- LOG.debug("{} {}; total cost={}, sum multiplier={}; cost/multiplier to
need a balance is {}",
- balanced ? "Skipping load balancing because balanced" : "We need to
load balance",
- isByTable ? String.format("table (%s)", tableName) : "cluster",
- total, sumMultiplier, minCostNeedBalance);
- if (LOG.isTraceEnabled()) {
- LOG.trace("Balance decision detailed function costs={}",
functionCost());
- }
+ sendRejectionReasonToRingBuffer(() ->
+ getBalanceReason(calculatedTotal, sumMultiplier), costFunctions);
+ LOG.info("{} - skipping load balancing because weighted average
imbalance={} <= "
+ + "threshold({}). If you want more aggressive balancing, either
lower "
+ + "hbase.master.balancer.stochastic.minCostNeedBalance from {} or
increase the relative "
+ + "multiplier(s) of the specific cost function(s). functionCost={}",
Review comment:
Good
##########
File path:
hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
##########
@@ -303,54 +302,54 @@ private String getBalanceReason(double total, double
sumMultiplier) {
boolean needsBalance(TableName tableName, BalancerClusterState cluster) {
ClusterLoadState cs = new ClusterLoadState(cluster.clusterState);
if (cs.getNumServers() < MIN_SERVER_BALANCE) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Not running balancer because only " + cs.getNumServers()
+ LOG.info("Not running balancer because only " + cs.getNumServers()
+ " active regionserver(s)");
- }
sendRejectionReasonToRingBuffer(() -> "The number of RegionServers " +
cs.getNumServers() +
" < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null);
return false;
}
if (areSomeRegionReplicasColocated(cluster)) {
+ LOG.info("Running balancer because at least one server hosts replicas of
the same region.");
return true;
}
if (idleRegionServerExist(cluster)){
+ LOG.info("Running balancer because cluster has idle server(s).");
return true;
}
+ sumMultiplier = 0.0f;
double total = 0.0;
- float sumMultiplier = 0.0f;
for (CostFunction c : costFunctions) {
float multiplier = c.getMultiplier();
- if (multiplier <= 0) {
- LOG.trace("{} not needed because multiplier is <= 0",
c.getClass().getSimpleName());
- continue;
- }
+ double cost = c.cost();
if (!c.isNeeded()) {
LOG.trace("{} not needed", c.getClass().getSimpleName());
continue;
}
+ total += cost * multiplier;
sumMultiplier += multiplier;
- total += c.cost() * multiplier;
+ }
+ if (sumMultiplier <= 0)
+ {
+ LOG.error("At least one cost function needs a multiplier > 0");
+ return false;
Review comment:
What about Seans' comment above?
##########
File path:
hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
##########
@@ -470,17 +470,17 @@ private long calculateMaxSteps(BalancerClusterState
cluster) {
updateStochasticCosts(tableName, curOverallCost, curFunctionCosts);
if (initCost > currentCost) {
List<RegionPlan> plans = createRegionPlans(cluster);
- LOG.info("Finished computing new load balance plan. Computation took {}"
+
- " to try {} different iterations. Found a solution that moves " +
- "{} regions; Going from a computed cost of {}" +
- " to a new cost of {}", java.time.Duration.ofMillis(endTime -
startTime),
- step, plans.size(), initCost, currentCost);
+ LOG.info("Finished computing new load balance plan. Computation took {}
ms" +
+ " to try {} different iterations. Found a solution that moves " +
+ "{} regions; Going from a computed cost of {}" +
+ " to a new cost of {}. Move plan to be submitted to master to
execute",
+ endTime - startTime, step, plans.size(), initCost, currentCost);
sendRegionPlansToRingBuffer(plans, currentCost, initCost,
initFunctionTotalCosts, step);
return plans;
}
LOG.info("Could not find a better load balance plan. Tried {} different
configurations in " +
Review comment:
Was this addressed?
##########
File path:
hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
##########
@@ -470,17 +470,17 @@ private long calculateMaxSteps(BalancerClusterState
cluster) {
updateStochasticCosts(tableName, curOverallCost, curFunctionCosts);
if (initCost > currentCost) {
List<RegionPlan> plans = createRegionPlans(cluster);
- LOG.info("Finished computing new load balance plan. Computation took {}"
+
- " to try {} different iterations. Found a solution that moves " +
- "{} regions; Going from a computed cost of {}" +
- " to a new cost of {}", java.time.Duration.ofMillis(endTime -
startTime),
- step, plans.size(), initCost, currentCost);
+ LOG.info("Finished computing new load balance plan. Computation took {}
ms" +
+ " to try {} different iterations. Found a solution that moves " +
+ "{} regions; Going from a computed cost of {}" +
+ " to a new cost of {}. Move plan to be submitted to master to
execute",
Review comment:
This addressed?
##########
File path:
hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
##########
@@ -470,17 +470,17 @@ private long calculateMaxSteps(BalancerClusterState
cluster) {
updateStochasticCosts(tableName, curOverallCost, curFunctionCosts);
if (initCost > currentCost) {
List<RegionPlan> plans = createRegionPlans(cluster);
- LOG.info("Finished computing new load balance plan. Computation took {}"
+
- " to try {} different iterations. Found a solution that moves " +
- "{} regions; Going from a computed cost of {}" +
- " to a new cost of {}", java.time.Duration.ofMillis(endTime -
startTime),
- step, plans.size(), initCost, currentCost);
+ LOG.info("Finished computing new load balance plan. Computation took {}
ms" +
Review comment:
This addressed?
##########
File path:
hbase-balancer/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java
##########
@@ -343,15 +343,15 @@ boolean needsBalance(TableName tableName,
BalancerClusterState cluster) {
sendRejectionReasonToRingBuffer(() -> getBalanceReason(calculatedTotal,
calculatedMultiplier),
costFunctions);
}
- if (LOG.isDebugEnabled()) {
- LOG.debug("{} {}; total cost={}, sum multiplier={}; cost/multiplier to
need a balance is {}",
- balanced ? "Skipping load balancing because balanced" : "We need to
load balance",
- isByTable ? String.format("table (%s)", tableName) : "cluster",
- total, sumMultiplier, minCostNeedBalance);
- if (LOG.isTraceEnabled()) {
- LOG.trace("Balance decision detailed function costs={}",
functionCost());
- }
+ LOG.info("{} {}; total cost={}, sum multiplier={}; cost/multiplier to need
a balance is {}",
+ balanced ? "Skipping load balancing because balanced" :
Review comment:
I was thinking either explaining terms or pointing to doc or elsewhere
where terms are explained and how the balancer machinations engage w/ them. Can
do in a later PR, yes. This one is a big improvement as is.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]