This is an automated email from the ASF dual-hosted git repository.
sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 70070a9030 HDDS-9555. Decommission should not wait on deleting
containers (#5502)
70070a9030 is described below
commit 70070a9030e2dff86c6f6e53c16a25daeb9d97c8
Author: Stephen O'Donnell <[email protected]>
AuthorDate: Tue Oct 31 08:35:22 2023 +0000
HDDS-9555. Decommission should not wait on deleting containers (#5502)
---
.../hdds/scm/node/DatanodeAdminMonitorImpl.java | 18 +++++++++++---
.../hdds/scm/node/TestDatanodeAdminMonitor.java | 28 ++++++++++++++++++++++
2 files changed, 43 insertions(+), 3 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
index 34a69047c8..59948c0ce8 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
@@ -336,6 +336,7 @@ public class DatanodeAdminMonitorImpl implements
DatanodeAdminMonitor {
private boolean checkContainersReplicatedOnNode(DatanodeDetails dn)
throws NodeNotFoundException {
int sufficientlyReplicated = 0;
+ int deleting = 0;
int underReplicated = 0;
int unhealthy = 0;
List<ContainerID> underReplicatedIDs = new ArrayList<>();
@@ -346,6 +347,17 @@ public class DatanodeAdminMonitorImpl implements
DatanodeAdminMonitor {
try {
ContainerReplicaCount replicaSet =
replicationManager.getContainerReplicaCount(cid);
+
+ // If a container is deleted or deleting, and we have a replica on this
+ // datanode, just ignore it. It should not block decommission.
+ HddsProtos.LifeCycleState containerState
+ = replicaSet.getContainer().getState();
+ if (containerState == HddsProtos.LifeCycleState.DELETED
+ || containerState == HddsProtos.LifeCycleState.DELETING) {
+ deleting++;
+ continue;
+ }
+
if (replicaSet.isSufficientlyReplicatedForOffline(dn, nodeManager)) {
sufficientlyReplicated++;
} else {
@@ -389,9 +401,9 @@ public class DatanodeAdminMonitorImpl implements
DatanodeAdminMonitor {
"in containerManager", cid, dn);
}
}
- LOG.info("{} has {} sufficientlyReplicated, {} underReplicated and {} " +
- "unhealthy containers",
- dn, sufficientlyReplicated, underReplicated, unhealthy);
+ LOG.info("{} has {} sufficientlyReplicated, {} deleting, {} " +
+ "underReplicated and {} unhealthy containers",
+ dn, sufficientlyReplicated, deleting, underReplicated, unhealthy);
containerStateByHost.put(dn.getHostName(),
new ContainerStateInWorkflow(dn.getHostName(),
sufficientlyReplicated,
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
index ac9755e972..48afbe0682 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
@@ -295,6 +295,34 @@ public class TestDatanodeAdminMonitor {
nodeManager.getNodeStatus(dn1).getOperationalState());
}
+ @Test
+ public void testDecommissionNotBlockedByDeletingContainers()
+ throws NodeNotFoundException, ContainerNotFoundException {
+ DatanodeDetails dn1 = MockDatanodeDetails.randomDatanodeDetails();
+ nodeManager.register(dn1,
+ new NodeStatus(HddsProtos.NodeOperationalState.DECOMMISSIONING,
+ HddsProtos.NodeState.HEALTHY));
+
+ nodeManager.setContainers(dn1, generateContainers(3));
+ // Mock the Replication Manager to return ContainerReplicaCounts for
+ // DELETING containers, passing DECOMMISSIONED and IN_SERVICE replica states.
+ DatanodeAdminMonitorTestUtil
+ .mockGetContainerReplicaCount(
+ repManager,
+ HddsProtos.LifeCycleState.DELETING,
+ DECOMMISSIONED,
+ IN_SERVICE);
+
+ // Run the monitor once. The node is expected to transition straight to
+ // DECOMMISSIONED: there are no pipelines to close, and the DELETING
+ // containers should not be waited on for replication.
+ monitor.startMonitoring(dn1);
+ monitor.run();
+ assertEquals(0, monitor.getTrackedNodeCount());
+ assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONED,
+ nodeManager.getNodeStatus(dn1).getOperationalState());
+ }
+
@Test
public void testDecommissionNodeWithUnrecoverableECContainer()
throws NodeNotFoundException, ContainerNotFoundException {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]