This is an automated email from the ASF dual-hosted git repository.

sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 70070a9030 HDDS-9555. Decommission should not wait on deleting containers (#5502)
70070a9030 is described below

commit 70070a9030e2dff86c6f6e53c16a25daeb9d97c8
Author: Stephen O'Donnell <[email protected]>
AuthorDate: Tue Oct 31 08:35:22 2023 +0000

    HDDS-9555. Decommission should not wait on deleting containers (#5502)
---
 .../hdds/scm/node/DatanodeAdminMonitorImpl.java    | 18 +++++++++++---
 .../hdds/scm/node/TestDatanodeAdminMonitor.java    | 28 ++++++++++++++++++++++
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
index 34a69047c8..59948c0ce8 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
@@ -336,6 +336,7 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
   private boolean checkContainersReplicatedOnNode(DatanodeDetails dn)
       throws NodeNotFoundException {
     int sufficientlyReplicated = 0;
+    int deleting = 0;
     int underReplicated = 0;
     int unhealthy = 0;
     List<ContainerID> underReplicatedIDs = new ArrayList<>();
@@ -346,6 +347,17 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
       try {
         ContainerReplicaCount replicaSet =
             replicationManager.getContainerReplicaCount(cid);
+
+        // If a container is deleted or deleting, and we have a replica on this
+        // datanode, just ignore it. It should not block decommission.
+        HddsProtos.LifeCycleState containerState
+            = replicaSet.getContainer().getState();
+        if (containerState == HddsProtos.LifeCycleState.DELETED
+            || containerState == HddsProtos.LifeCycleState.DELETING) {
+          deleting++;
+          continue;
+        }
+
         if (replicaSet.isSufficientlyReplicatedForOffline(dn, nodeManager)) {
           sufficientlyReplicated++;
         } else {
@@ -389,9 +401,9 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
             "in containerManager", cid, dn);
       }
     }
-    LOG.info("{} has {} sufficientlyReplicated, {} underReplicated and {} " +
-        "unhealthy containers",
-        dn, sufficientlyReplicated, underReplicated, unhealthy);
+    LOG.info("{} has {} sufficientlyReplicated, {} deleting, {} " +
+            "underReplicated and {} unhealthy containers",
+        dn, sufficientlyReplicated, deleting, underReplicated, unhealthy);
     containerStateByHost.put(dn.getHostName(),
         new ContainerStateInWorkflow(dn.getHostName(),
             sufficientlyReplicated,
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
index ac9755e972..48afbe0682 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
@@ -295,6 +295,34 @@ public class TestDatanodeAdminMonitor {
         nodeManager.getNodeStatus(dn1).getOperationalState());
   }
 
+  @Test
+  public void testDecommissionNotBlockedByDeletingContainers()
+      throws NodeNotFoundException, ContainerNotFoundException {
+    DatanodeDetails dn1 = MockDatanodeDetails.randomDatanodeDetails();
+    nodeManager.register(dn1,
+        new NodeStatus(HddsProtos.NodeOperationalState.DECOMMISSIONING,
+            HddsProtos.NodeState.HEALTHY));
+
+    nodeManager.setContainers(dn1, generateContainers(3));
+    // Mock Replication Manager to return ContainerReplicaCount's which
+    // is deleting on a decommissioning and an IN_SERVICE node.
+    DatanodeAdminMonitorTestUtil
+        .mockGetContainerReplicaCount(
+            repManager,
+            HddsProtos.LifeCycleState.DELETING,
+            DECOMMISSIONED,
+            IN_SERVICE);
+
+    // Run the monitor for the first time and the node will transition to
+    // DECOMMISSIONED as there are no pipelines to close and no containers to
+    // replicate.
+    monitor.startMonitoring(dn1);
+    monitor.run();
+    assertEquals(0, monitor.getTrackedNodeCount());
+    assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONED,
+        nodeManager.getNodeStatus(dn1).getOperationalState());
+  }
+
   @Test
   public void testDecommissionNodeWithUnrecoverableECContainer()
       throws NodeNotFoundException, ContainerNotFoundException {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to