This is an automated email from the ASF dual-hosted git repository.
sammichen pushed a commit to branch HDDS-5713
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/HDDS-5713 by this push:
     new 2fa932c10c9 HDDS-13667. [DiskBalancer] Improve DiskBalancer CLI message for failed commands (#9091)
2fa932c10c9 is described below
commit 2fa932c10c9a1d9d009b4bccea6abc618c1c408f
Author: Gargi Jaiswal <[email protected]>
AuthorDate: Thu Oct 16 13:51:30 2025 +0530
    HDDS-13667. [DiskBalancer] Improve DiskBalancer CLI message for failed commands (#9091)
---
.../hadoop/hdds/scm/node/DiskBalancerManager.java | 34 ++++++++++++++--
.../cli/datanode/DiskBalancerCommonOptions.java | 3 +-
.../cli/datanode/DiskBalancerStartSubcommand.java | 8 ++--
.../cli/datanode/DiskBalancerStopSubcommand.java | 7 ++--
.../cli/datanode/DiskBalancerUpdateSubcommand.java | 8 ++--
.../smoketest/diskbalancer/testdiskbalancer.robot | 2 +-
...skBalancerDuringDecommissionAndMaintenance.java | 45 ++++++++++------------
7 files changed, 65 insertions(+), 42 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DiskBalancerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DiskBalancerManager.java
index 61006ebf3b2..932de3c1aa4 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DiskBalancerManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DiskBalancerManager.java
@@ -110,7 +110,7 @@ public List<HddsProtos.DatanodeDiskBalancerInfoProto> getDiskBalancerStatus(
       try {
         NodeStatus nodeStatus = nodeManager.getNodeStatus(dn);
         if (nodeStatus != NodeStatus.inServiceHealthy()) {
-          LOG.warn("Datanode {} is not in optimal state for disk balancing." +
+          LOG.warn("Datanode {} is not in healthy state for disk balancing." +
               " NodeStatus: {}", dn.getHostName(), nodeStatus);
           return false;
         }
@@ -164,9 +164,7 @@ public List<DatanodeAdminError> startDiskBalancer(
List<DatanodeAdminError> errors = new ArrayList<>();
for (DatanodeDetails dn : dns) {
try {
- if (!nodeManager.getNodeStatus(dn).isHealthy()) {
- errors.add(new DatanodeAdminError(dn.getHostName(),
- "Datanode not in healthy state"));
+ if (!isDatanodeInHealthyState(dn, errors)) {
continue;
}
// If command doesn't have configuration change, then we reuse the
@@ -202,6 +200,9 @@ public List<DatanodeAdminError> stopDiskBalancer(List<String> hosts)
List<DatanodeAdminError> errors = new ArrayList<>();
for (DatanodeDetails dn : dns) {
try {
+ if (!isDatanodeInHealthyState(dn, errors)) {
+ continue;
+ }
DiskBalancerCommand command = new DiskBalancerCommand(
HddsProtos.DiskBalancerOpType.STOP, null);
sendCommand(dn, command);
@@ -237,6 +238,9 @@ public List<DatanodeAdminError> updateDiskBalancerConfiguration(
List<DatanodeAdminError> errors = new ArrayList<>();
for (DatanodeDetails dn : dns) {
try {
+ if (!isDatanodeInHealthyState(dn, errors)) {
+ continue;
+ }
// If command doesn't have configuration change, then we reuse the
       // latest configuration reported from Datanodes
DiskBalancerConfiguration updateConf = attachDiskBalancerConf(dn,
@@ -251,6 +255,28 @@ public List<DatanodeAdminError> updateDiskBalancerConfiguration(
return errors;
}
+  /**
+   * Checks if the datanode is in healthy state for disk balancing operations.
+   * A datanode is considered to be in healthy state when it has both:
+   * - NodeOperationalState.IN_SERVICE (not decommissioning, decommissioned, or in maintenance)
+   * - NodeState.HEALTHY (not stale, dead, or readonly)
+   *
+   * @param dn the DatanodeDetails to check
+   * @param errors list to add any error messages if the datanode is not in healthy state
+   * @return true if the datanode is in healthy state (IN_SERVICE and HEALTHY), false otherwise
+   */
+  private boolean isDatanodeInHealthyState(DatanodeDetails dn,
+      List<DatanodeAdminError> errors) throws NodeNotFoundException {
+    NodeStatus nodeStatus = nodeManager.getNodeStatus(dn);
+    if (!nodeStatus.equals(NodeStatus.inServiceHealthy())) {
+      errors.add(new DatanodeAdminError(dn.getHostName(),
+          "Datanode is not in healthy state for disk balancing. " +
+          "NodeStatus: " + nodeStatus));
+      return false;
+    }
+    return true;
+  }
+
private boolean shouldReturnDatanode(
HddsProtos.DiskBalancerRunningStatus status,
DatanodeDetails datanodeDetails) {
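For review convenience, a minimal, self-contained sketch of the consolidated health check introduced above. The enum and record types below are simplified stand-ins for the real NodeStatus, DatanodeDetails, and DatanodeAdminError classes, which carry more state:

    import java.util.ArrayList;
    import java.util.List;

    public class HealthCheckSketch {

      enum NodeStatus { IN_SERVICE_HEALTHY, DECOMMISSIONING, DEAD }

      record Datanode(String hostName, NodeStatus status) { }

      record AdminError(String hostname, String error) { }

      // Mirrors isDatanodeInHealthyState: record an error and skip the
      // node unless it is both IN_SERVICE and HEALTHY.
      static boolean isHealthy(Datanode dn, List<AdminError> errors) {
        if (dn.status() != NodeStatus.IN_SERVICE_HEALTHY) {
          errors.add(new AdminError(dn.hostName(),
              "Datanode is not in healthy state for disk balancing. NodeStatus: "
                  + dn.status()));
          return false;
        }
        return true;
      }

      public static void main(String[] args) {
        List<AdminError> errors = new ArrayList<>();
        for (Datanode dn : List.of(
            new Datanode("dn1", NodeStatus.IN_SERVICE_HEALTHY),
            new Datanode("dn2", NodeStatus.DECOMMISSIONING))) {
          if (!isHealthy(dn, errors)) {
            continue; // start, stop, and update all short-circuit this way
          }
          System.out.println("would send DiskBalancerCommand to " + dn.hostName());
        }
        errors.forEach(e ->
            System.err.println("Error: " + e.hostname() + ": " + e.error()));
      }
    }

The point of the consolidation is that start, stop, and update now all fail fast with the same per-node error message, instead of start duplicating the check and stop/update omitting it.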
diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerCommonOptions.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerCommonOptions.java
index 2ec295682dc..a5127d29318 100644
--- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerCommonOptions.java
+++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerCommonOptions.java
@@ -57,7 +57,8 @@ public boolean check() {
}
public String getHostString() {
- return isAllHosts() ? "All datanodes" : String.join("\n", getDatanodes());
+    return isAllHosts() ? "which are IN_SERVICE and HEALTHY."
+        : ":\n" + String.join("\n", getDatanodes());
}
public List<String> getSpecifiedDatanodes() {
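To see how the two branches of the new getHostString() read once concatenated into the subcommand output, a small sketch; hostString below is a hypothetical stand-in for the real method, not the actual class:

    import java.util.List;

    public class HostStringSketch {

      // Hypothetical stand-in for DiskBalancerCommonOptions.getHostString().
      static String hostString(boolean allHosts, List<String> datanodes) {
        return allHosts ? "which are IN_SERVICE and HEALTHY."
            : ":\n" + String.join("\n", datanodes);
      }

      public static void main(String[] args) {
        // With -a the message ends in a sentence:
        //   Starting DiskBalancer on datanode(s) which are IN_SERVICE and HEALTHY.
        System.out.println("Starting DiskBalancer on datanode(s) "
            + hostString(true, List.of()));
        // With explicit hosts, the list follows a colon, one host per line.
        System.out.println("Starting DiskBalancer on datanode(s) "
            + hostString(false, List.of("dn1.example.com", "dn2.example.com")));
      }
    }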
diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerStartSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerStartSubcommand.java
index f2731f86b6f..21046791137 100644
--- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerStartSubcommand.java
+++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerStartSubcommand.java
@@ -69,10 +69,10 @@ public void execute(ScmClient scmClient) throws IOException {
scmClient.startDiskBalancer(threshold, bandwidthInMB, parallelThread,
stopAfterDiskEven,
commonOptions.getSpecifiedDatanodes());
- System.out.println("Start DiskBalancer on datanode(s):\n" +
- commonOptions.getHostString());
-
- if (!errors.isEmpty()) {
+ if (errors.isEmpty()) {
+ System.out.println("Starting DiskBalancer on datanode(s) " +
+ commonOptions.getHostString());
+ } else {
for (DatanodeAdminError error : errors) {
System.err.println("Error: " + error.getHostname() + ": "
+ error.getError());
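The same errors-gated reporting appears verbatim in the stop and update subcommands below. A compact sketch of that shared shape, with a hypothetical report helper and a simplified error type:

    import java.util.List;

    public class CliOutputSketch {

      record AdminError(String hostname, String error) { }

      // Hypothetical helper showing the shared shape: the success line goes
      // to stdout only when no node was rejected; otherwise each error goes
      // to stderr.
      static void report(List<AdminError> errors, String successMessage) {
        if (errors.isEmpty()) {
          System.out.println(successMessage);
        } else {
          for (AdminError error : errors) {
            System.err.println("Error: " + error.hostname() + ": "
                + error.error());
          }
        }
      }

      public static void main(String[] args) {
        report(List.of(),
            "Starting DiskBalancer on datanode(s) which are IN_SERVICE and HEALTHY.");
        // Second message is never printed because the error list is non-empty.
        report(List.of(new AdminError("dn2",
            "Datanode is not in healthy state for disk balancing. "
                + "NodeStatus: DECOMMISSIONING")), "not printed");
      }
    }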
diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerStopSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerStopSubcommand.java
index cfb0cd60499..e506ad170bf 100644
--- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerStopSubcommand.java
+++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerStopSubcommand.java
@@ -49,9 +49,10 @@ public void execute(ScmClient scmClient) throws IOException {
List<DatanodeAdminError> errors = scmClient.stopDiskBalancer(
commonOptions.getSpecifiedDatanodes());
- System.out.println("Stopping DiskBalancer on datanode(s):\n" +
- commonOptions.getHostString());
- if (!errors.isEmpty()) {
+ if (errors.isEmpty()) {
+ System.out.println("Stopping DiskBalancer on datanode(s) " +
+ commonOptions.getHostString());
+ } else {
for (DatanodeAdminError error : errors) {
System.err.println("Error: " + error.getHostname() + ": "
+ error.getError());
diff --git a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerUpdateSubcommand.java b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerUpdateSubcommand.java
index 37e50ad6c94..0f1f96e7e04 100644
--- a/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerUpdateSubcommand.java
+++ b/hadoop-ozone/cli-admin/src/main/java/org/apache/hadoop/hdds/scm/cli/datanode/DiskBalancerUpdateSubcommand.java
@@ -69,10 +69,10 @@ public void execute(ScmClient scmClient) throws IOException {
scmClient.updateDiskBalancerConfiguration(threshold, bandwidthInMB,
parallelThread, stopAfterDiskEven,
commonOptions.getSpecifiedDatanodes());
- System.out.println("Update DiskBalancer Configuration on datanode(s):\n" +
- commonOptions.getHostString());
-
- if (!errors.isEmpty()) {
+ if (errors.isEmpty()) {
+ System.out.println("Update DiskBalancer Configuration on datanode(s) " +
+ commonOptions.getHostString());
+ } else {
for (DatanodeAdminError error : errors) {
System.err.println("Error: " + error.getHostname() + ": "
+ error.getError());
diff --git a/hadoop-ozone/dist/src/main/smoketest/diskbalancer/testdiskbalancer.robot b/hadoop-ozone/dist/src/main/smoketest/diskbalancer/testdiskbalancer.robot
index 007ca42edcd..45fd9384e87 100644
index 007ca42edcd..45fd9384e87 100644
--- a/hadoop-ozone/dist/src/main/smoketest/diskbalancer/testdiskbalancer.robot
+++ b/hadoop-ozone/dist/src/main/smoketest/diskbalancer/testdiskbalancer.robot
@@ -27,7 +27,7 @@ Check failure with non-admin user to start, stop and update diskbalancer
Check success with admin user for start, stop and update diskbalancer
     Run Keyword         Kinit test user     testuser     testuser.keytab
     ${result} =         Execute             ozone admin datanode diskbalancer start -a
-    Should Contain      ${result}           Start DiskBalancer on datanode(s)
+    Should Contain      ${result}           Starting DiskBalancer on datanode(s)
     ${result} =         Execute             ozone admin datanode diskbalancer stop -a
     Should Contain      ${result}           Stopping DiskBalancer on datanode(s)
     ${result} =         Execute             ozone admin datanode diskbalancer update -t 0.0002 -a
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestDiskBalancerDuringDecommissionAndMaintenance.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestDiskBalancerDuringDecommissionAndMaintenance.java
index 791182ed1ad..b6a15a268af 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestDiskBalancerDuringDecommissionAndMaintenance.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/node/TestDiskBalancerDuringDecommissionAndMaintenance.java
@@ -36,6 +36,7 @@
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.DatanodeAdminError;
import org.apache.hadoop.hdds.scm.HddsTestUtils;
import org.apache.hadoop.hdds.scm.PlacementPolicy;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
@@ -256,28 +257,26 @@ public void testStopDiskBalancerOnDecommissioningNode() throws Exception {
100, 5000);
// Attempt to stop disk balancer on the decommissioning DN
- diskBalancerManager.stopDiskBalancer(dnAddressList);
+    List<DatanodeAdminError> dnErrors =
+        diskBalancerManager.stopDiskBalancer(dnAddressList);
-    // Verify disk balancer is now explicitly stopped (operationalState becomes STOPPED)
-    final String expectedLogForStop =
-        "DiskBalancer operational state changing from PAUSED_BY_NODE_STATE to STOPPED";
-    GenericTestUtils.waitFor(() -> serviceLog.getOutput().contains(expectedLogForStop),
-        100, 5000);
+ // Verify disk balancer stop command is not sent to decommissioning DN
+ assertEquals(1, dnErrors.size());
+ assertTrue(dnErrors.get(0).getError()
+ .contains("Datanode is not in healthy state for disk balancing"));
//Recommission the node
scmClient.recommissionNodes(dnAddressList);
waitForDnToReachOpState(nm, dn, IN_SERVICE);
-    // Verify it does not automatically restart (since it was explicitly stopped)
-    HddsProtos.DatanodeDiskBalancerInfoProto statusAfterRecommission =
-        diskBalancerManager.getDiskBalancerStatus(dnAddressList, null,
-            ClientVersion.CURRENT_VERSION).stream().findFirst().orElse(null);
-    assertEquals(HddsProtos.DiskBalancerRunningStatus.STOPPED,
-        statusAfterRecommission.getRunningStatus());
+    // Verify it automatically resumes (since explicit stop command was not sent)
+    GenericTestUtils.waitFor(() -> {
+      String dnLogs = serviceLog.getOutput();
+      return dnLogs.contains("Resuming DiskBalancerService to running state as Node state changed to IN_SERVICE.");
+    }, 100, 5000);
}
@Test
public void testStartDiskBalancerOnDecommissioningNode() throws Exception {
-    LogCapturer serviceLog =
-        LogCapturer.captureLogs(DiskBalancerService.class);
     LogCapturer supervisorLog =
         LogCapturer.captureLogs(ReplicationSupervisor.class);
List<HddsDatanodeService> dns = cluster.getHddsDatanodes();
@@ -307,30 +306,26 @@ public void testStartDiskBalancerOnDecommissioningNode() throws Exception {
100, 5000);
// Attempt to start disk balancer on the decommissioning DN
- diskBalancerManager.startDiskBalancer(
- 10.0,
- 10L,
- 1,
- false,
- dnAddressList);
+ List<DatanodeAdminError> dnErrors = diskBalancerManager.startDiskBalancer(
+ 10.0, 10L, 1, false, dnAddressList);
- // Verify disk balancer goes to PAUSED_BY_NODE_STATE
- final String expectedLogForPause =
- "DiskBalancer operational state changing from STOPPED to
PAUSED_BY_NODE_STATE";
- GenericTestUtils.waitFor(() ->
serviceLog.getOutput().contains(expectedLogForPause),
- 100, 5000);
+ // Verify disk balancer start command is not sent to decommissioning DN
+ assertEquals(1, dnErrors.size());
+ assertTrue(dnErrors.get(0).getError()
+ .contains("Datanode is not in healthy state for disk balancing"));
//Recommission the node
scmClient.recommissionNodes(dnAddressList);
waitForDnToReachOpState(nm, dn, IN_SERVICE);
- // Verify it automatically restart (since it was explicitly started)
+    // Verify it does not automatically resume (since the explicit
+    // start command was not sent to the decommissioning DN)
GenericTestUtils.waitFor(() -> {
try {
HddsProtos.DatanodeDiskBalancerInfoProto status =
diskBalancerManager.getDiskBalancerStatus(dnAddressList, null,
ClientVersion.CURRENT_VERSION).stream().findFirst().orElse(null);
-        return status != null && status.getRunningStatus() ==
-            HddsProtos.DiskBalancerRunningStatus.RUNNING;
+        return status != null && status.getRunningStatus() ==
+            HddsProtos.DiskBalancerRunningStatus.STOPPED;
} catch (IOException e) {
return false;
}
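Both tests poll with GenericTestUtils.waitFor(check, checkEveryMillis, waitForMillis). A self-contained sketch of that polling contract, reimplemented here as an assumption rather than the actual Hadoop test utility:

    import java.util.concurrent.TimeoutException;
    import java.util.function.Supplier;

    public class WaitForSketch {

      // Assumption-level reimplementation of the polling contract the tests
      // rely on; the real utility lives in org.apache.hadoop.test.GenericTestUtils.
      static void waitFor(Supplier<Boolean> check, int checkEveryMillis,
          int waitForMillis) throws TimeoutException, InterruptedException {
        long deadline = System.currentTimeMillis() + waitForMillis;
        while (!check.get()) {
          if (System.currentTimeMillis() > deadline) {
            throw new TimeoutException(
                "Condition not met within " + waitForMillis + " ms");
          }
          Thread.sleep(checkEveryMillis); // the tests poll every 100 ms
        }
      }

      public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        // Stand-in condition; the tests instead check captured log output
        // or the reported DiskBalancerRunningStatus.
        waitFor(() -> System.currentTimeMillis() - start > 300, 100, 5000);
        System.out.println("condition reached");
      }
    }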
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]