siddhantsangwan commented on code in PR #6367:
URL: https://github.com/apache/ozone/pull/6367#discussion_r1539064328
##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java:
##########
@@ -368,6 +384,57 @@ public synchronized void startDecommission(DatanodeDetails
dn)
}
}
+ private synchronized boolean
checkIfDecommissionPossible(List<DatanodeDetails> dns, List<DatanodeAdminError>
errors) {
+ int minInService = -1, numDecom = dns.size();
+ int inServiceTotal =
nodeManager.getNodeCount(NodeStatus.inServiceHealthy());
+ for (DatanodeDetails dn : dns) {
+ try {
+ NodeStatus nodeStatus = getNodeStatus(dn);
+ NodeOperationalState opState = nodeStatus.getOperationalState();
+ if (opState != NodeOperationalState.IN_SERVICE) {
+ numDecom--;
+ }
+ } catch (NodeNotFoundException ex) {
+ numDecom--;
+ }
+ }
+
+ for (DatanodeDetails dn : dns) {
+ Set<ContainerID> containers;
+ try {
+ containers = nodeManager.getContainers(dn);
+ } catch (NodeNotFoundException ex) {
+ LOG.warn("The host {} was not found in SCM. Ignoring the request to " +
+ "decommission it", dn.getHostName());
+ errors.add(new DatanodeAdminError(dn.getHostName(),
+ "The host was not found in SCM"));
+ continue; // ignore the DN and continue to next one
+ }
+
+ for (ContainerID cid : containers) {
+ ContainerInfo cif;
+ try {
+ cif = containerManager.getContainer(cid);
+ } catch (ContainerNotFoundException ex) {
+ continue; // ignore the container and continue to next one
+ }
+ if (cif.getState().equals(HddsProtos.LifeCycleState.DELETED) ||
Review Comment:
We'll have to synchronize on the container info object as discussed in the
design doc. For example, in `AbstractContainerReportHandler`:
```
synchronized (containerInfo) {
updateContainerStats(datanodeDetails, containerInfo, replicaProto);
if (!updateContainerState(datanodeDetails, containerInfo, replicaProto,
publisher)) {
updateContainerReplica(datanodeDetails, containerId, replicaProto);
}
}
```
##########
hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/scm/protocolPB/StorageContainerLocationProtocolClientSideTranslatorPB.java:
##########
@@ -528,10 +528,11 @@ public HddsProtos.Node queryNode(UUID uuid) throws
IOException {
/**
* Attempts to decommission the list of nodes.
* @param nodes The list of hostnames or hostname:ports to decommission
+ * @param force boolean flag that skips fail-early checks and tries to
decommission nodes
* @throws IOException
*/
@Override
- public List<DatanodeAdminError> decommissionNodes(List<String> nodes)
+ public List<DatanodeAdminError> decommissionNodes(List<String> nodes,
boolean force)
Review Comment:
We need to send the `force` arguement in the actual proto message as well.
##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java:
##########
@@ -368,6 +384,57 @@ public synchronized void startDecommission(DatanodeDetails
dn)
}
}
+ private synchronized boolean
checkIfDecommissionPossible(List<DatanodeDetails> dns, List<DatanodeAdminError>
errors) {
+ int minInService = -1, numDecom = dns.size();
+ int inServiceTotal =
nodeManager.getNodeCount(NodeStatus.inServiceHealthy());
+ for (DatanodeDetails dn : dns) {
+ try {
+ NodeStatus nodeStatus = getNodeStatus(dn);
+ NodeOperationalState opState = nodeStatus.getOperationalState();
+ if (opState != NodeOperationalState.IN_SERVICE) {
+ numDecom--;
+ }
+ } catch (NodeNotFoundException ex) {
+ numDecom--;
+ }
+ }
+
+ for (DatanodeDetails dn : dns) {
+ Set<ContainerID> containers;
+ try {
+ containers = nodeManager.getContainers(dn);
+ } catch (NodeNotFoundException ex) {
+ LOG.warn("The host {} was not found in SCM. Ignoring the request to " +
+ "decommission it", dn.getHostName());
+ errors.add(new DatanodeAdminError(dn.getHostName(),
+ "The host was not found in SCM"));
+ continue; // ignore the DN and continue to next one
+ }
+
+ for (ContainerID cid : containers) {
+ ContainerInfo cif;
+ try {
+ cif = containerManager.getContainer(cid);
+ } catch (ContainerNotFoundException ex) {
+ continue; // ignore the container and continue to next one
+ }
+ if (cif.getState().equals(HddsProtos.LifeCycleState.DELETED) ||
+ cif.getState().equals(HddsProtos.LifeCycleState.DELETING)) {
+ continue;
+ }
+ int reqNodes = cif.getReplicationConfig().getRequiredNodes();
+ if ((inServiceTotal - numDecom) < reqNodes) {
+ return false;
+ }
+ if (reqNodes > minInService) {
Review Comment:
I think you forgot to remove `minInService`.
##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java:
##########
@@ -305,9 +312,18 @@ public DatanodeAdminMonitor getMonitor() {
}
public synchronized List<DatanodeAdminError> decommissionNodes(
- List<String> nodes) {
+ List<String> nodes, boolean force) {
List<DatanodeAdminError> errors = new ArrayList<>();
List<DatanodeDetails> dns = mapHostnamesToDatanodes(nodes, errors);
+ // add check for fail-early if force flag is not set
+ if (!force) {
+ boolean decommissionPossible = checkIfDecommissionPossible(dns, errors);
+ if (!decommissionPossible) {
+ LOG.error("Cannot decommission nodes as sufficient node are not
available.");
Review Comment:
Let's mention hostnames of the DNs in the log at line 322 as well. Also I
think it's best add a log before line 319 that says we're checking if
decommission is possible and shows the value of `force`.
##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java:
##########
@@ -368,6 +384,57 @@ public synchronized void startDecommission(DatanodeDetails
dn)
}
}
+ private synchronized boolean
checkIfDecommissionPossible(List<DatanodeDetails> dns, List<DatanodeAdminError>
errors) {
+ int minInService = -1, numDecom = dns.size();
+ int inServiceTotal =
nodeManager.getNodeCount(NodeStatus.inServiceHealthy());
+ for (DatanodeDetails dn : dns) {
+ try {
+ NodeStatus nodeStatus = getNodeStatus(dn);
+ NodeOperationalState opState = nodeStatus.getOperationalState();
+ if (opState != NodeOperationalState.IN_SERVICE) {
+ numDecom--;
+ }
+ } catch (NodeNotFoundException ex) {
+ numDecom--;
+ }
+ }
+
+ for (DatanodeDetails dn : dns) {
+ Set<ContainerID> containers;
+ try {
+ containers = nodeManager.getContainers(dn);
+ } catch (NodeNotFoundException ex) {
+ LOG.warn("The host {} was not found in SCM. Ignoring the request to " +
+ "decommission it", dn.getHostName());
+ errors.add(new DatanodeAdminError(dn.getHostName(),
+ "The host was not found in SCM"));
+ continue; // ignore the DN and continue to next one
+ }
+
+ for (ContainerID cid : containers) {
+ ContainerInfo cif;
+ try {
+ cif = containerManager.getContainer(cid);
+ } catch (ContainerNotFoundException ex) {
+ continue; // ignore the container and continue to next one
+ }
+ if (cif.getState().equals(HddsProtos.LifeCycleState.DELETED) ||
+ cif.getState().equals(HddsProtos.LifeCycleState.DELETING)) {
+ continue;
+ }
+ int reqNodes = cif.getReplicationConfig().getRequiredNodes();
+ if ((inServiceTotal - numDecom) < reqNodes) {
+ return false;
Review Comment:
Let's log all necessary details when returning false, at the INFO level.
Such as the total in-service nodes, number of nodes to be decommissioned, valid
nodes out of that, and the number of required nodes. And also the DN and the
ContainerInfo because of which we're failing early.
##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java:
##########
@@ -368,6 +384,57 @@ public synchronized void startDecommission(DatanodeDetails
dn)
}
}
+ private synchronized boolean
checkIfDecommissionPossible(List<DatanodeDetails> dns, List<DatanodeAdminError>
errors) {
+ int minInService = -1, numDecom = dns.size();
+ int inServiceTotal =
nodeManager.getNodeCount(NodeStatus.inServiceHealthy());
+ for (DatanodeDetails dn : dns) {
+ try {
+ NodeStatus nodeStatus = getNodeStatus(dn);
+ NodeOperationalState opState = nodeStatus.getOperationalState();
+ if (opState != NodeOperationalState.IN_SERVICE) {
+ numDecom--;
+ }
+ } catch (NodeNotFoundException ex) {
+ numDecom--;
+ }
+ }
+
+ for (DatanodeDetails dn : dns) {
Review Comment:
We need to ignore non in-service DNs in this loop too. Let's add some tests
for non in-service and `NodeNotFoundException` DNs.
##########
hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/client/ScmClient.java:
##########
@@ -217,11 +217,12 @@ List<HddsProtos.Node>
queryNode(HddsProtos.NodeOperationalState opState,
* Allows a list of hosts to be decommissioned. The hosts are identified
* by their hostname and optionally port in the format foo.com:port.
* @param hosts A list of hostnames, optionally with port
+ * @param force boolean flag that forcefully tries to decommission nodes
Review Comment:
```suggestion
* @param force true to forcefully decommission Datanodes
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]