This is an automated email from the ASF dual-hosted git repository.

sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new ffe2a2e53b HDDS-9737. Legacy Replication Manager should consider that 
UNHEALTHY replicas might be decommissioning (#5674)
ffe2a2e53b is described below

commit ffe2a2e53bea60863d1517d20c3bf4a7b9c4c043
Author: Siddhant Sangwan <[email protected]>
AuthorDate: Mon Nov 27 18:02:36 2023 +0530

    HDDS-9737. Legacy Replication Manager should consider that UNHEALTHY 
replicas might be decommissioning (#5674)
---
 .../LegacyRatisContainerReplicaCount.java          |  8 +++
 .../replication/LegacyReplicationManager.java      | 26 ++++++++-
 .../replication/TestLegacyReplicationManager.java  | 29 ++++++++++
 .../hdds/scm/node/TestDatanodeAdminMonitor.java    | 67 ++++++++++++++++++++++
 4 files changed, 128 insertions(+), 2 deletions(-)

diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyRatisContainerReplicaCount.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyRatisContainerReplicaCount.java
index 90c7afebcc..f708ae1ead 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyRatisContainerReplicaCount.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyRatisContainerReplicaCount.java
@@ -23,6 +23,7 @@ import org.apache.hadoop.hdds.scm.container.ContainerInfo;
 import org.apache.hadoop.hdds.scm.container.ContainerReplica;
 import org.apache.hadoop.hdds.scm.node.NodeManager;
 
+import java.util.List;
 import java.util.Set;
 
 import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
@@ -50,6 +51,13 @@ public class LegacyRatisContainerReplicaCount extends
         minHealthyForMaintenance);
   }
 
+  public LegacyRatisContainerReplicaCount(ContainerInfo container,
+      Set<ContainerReplica> replicas, List<ContainerReplicaOp> pendingOps,
+      int minHealthyForMaintenance, boolean considerUnhealthy) {
+    super(container, replicas, pendingOps, minHealthyForMaintenance,
+        considerUnhealthy);
+  }
+
   @Override
   protected int healthyReplicaCountAdapter() {
     return -getMisMatchedReplicaCount();
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
index 9a51537028..07a8f730ec 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
@@ -1051,7 +1051,7 @@ public class LegacyReplicationManager {
     synchronized (container) {
       final Set<ContainerReplica> replica = containerManager
           .getContainerReplicas(container.containerID());
-      return getContainerReplicaCount(container, replica);
+      return getReplicaCountOptionallyConsiderUnhealthy(container, replica);
     }
   }
 
@@ -1076,6 +1076,28 @@ public class LegacyReplicationManager {
         minHealthyForMaintenance);
   }
 
+  private RatisContainerReplicaCount 
getReplicaCountOptionallyConsiderUnhealthy(
+      ContainerInfo container, Set<ContainerReplica> replicas) {
+    LegacyRatisContainerReplicaCount withUnhealthy =
+        new LegacyRatisContainerReplicaCount(container, replicas,
+            getPendingOps(container.containerID()), minHealthyForMaintenance,
+            true);
+    if (withUnhealthy.getHealthyReplicaCount() == 0 &&
+        withUnhealthy.getUnhealthyReplicaCount() > 0) {
+      // if the container has only UNHEALTHY replicas, return the correct
+      // RatisContainerReplicaCount object which can handle UNHEALTHY replicas
+      return withUnhealthy;
+    }
+
+    return new LegacyRatisContainerReplicaCount(
+        container,
+        replicas,
+        getInflightAdd(container.containerID()),
+        getInflightDel(container.containerID()),
+        container.getReplicationConfig().getRequiredNodes(),
+        minHealthyForMaintenance);
+  }
+
   /**
    * Returns true if more than 50% of the container replicas with unique
    * originNodeId are in QUASI_CLOSED state.
@@ -1345,7 +1367,7 @@ public class LegacyReplicationManager {
     List<ContainerReplica> replicas = replicaSet.getReplicas();
 
     RatisContainerReplicaCount unhealthyReplicaSet =
-        new RatisContainerReplicaCount(container,
+        new LegacyRatisContainerReplicaCount(container,
             new HashSet<>(replicaSet.getReplicas()),
             getPendingOps(container.containerID()),
             minHealthyForMaintenance,
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
index 43a2fb3263..c9bd4bddbd 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
@@ -312,6 +312,35 @@ public class TestLegacyReplicationManager {
       replicationManager.start();
       Assertions.assertTrue(replicationManager.isRunning());
     }
+
+    @Test
+    public void testGetContainerReplicaCount()
+        throws IOException, TimeoutException {
+      ContainerInfo container = createContainer(LifeCycleState.QUASI_CLOSED);
+      addReplica(container, NodeStatus.inServiceHealthy(), UNHEALTHY);
+      addReplica(container, NodeStatus.inServiceHealthy(), UNHEALTHY);
+      ContainerReplica decommissioningReplica =
+          addReplica(container, new NodeStatus(DECOMMISSIONING, HEALTHY),
+              UNHEALTHY);
+
+      ContainerReplicaCount replicaCount =
+          replicationManager.getLegacyReplicationManager()
+              .getContainerReplicaCount(container);
+
+      Assertions.assertTrue(
+          replicaCount instanceof LegacyRatisContainerReplicaCount);
+      Assertions.assertFalse(replicaCount.isSufficientlyReplicated());
+      Assertions.assertFalse(replicaCount.isSufficientlyReplicatedForOffline(
+          decommissioningReplica.getDatanodeDetails(), nodeManager));
+
+      addReplica(container, NodeStatus.inServiceHealthy(), UNHEALTHY);
+      replicaCount = replicationManager.getLegacyReplicationManager()
+          .getContainerReplicaCount(container);
+      Assertions.assertTrue(replicaCount.isSufficientlyReplicated());
+      Assertions.assertTrue(replicaCount.isSufficientlyReplicatedForOffline(
+          decommissioningReplica.getDatanodeDetails(), nodeManager));
+      Assertions.assertTrue(replicaCount.isHealthyEnoughForOffline());
+    }
   }
 
   /**
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
index 48afbe0682..09ff7cb0ff 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
@@ -42,6 +42,7 @@ import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.mockito.Mockito;
 import java.io.IOException;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -295,6 +296,72 @@ public class TestDatanodeAdminMonitor {
         nodeManager.getNodeStatus(dn1).getOperationalState());
   }
 
+  /**
+   * Consider a QUASI_CLOSED container with only UNHEALTHY replicas. If one
+   * of its nodes is decommissioned, the decommissioning should succeed.
+   */
+  @Test
+  public void testQuasiClosedContainerWithAllUnhealthyReplicas()
+      throws NodeNotFoundException, ContainerNotFoundException {
+    conf.setBoolean("hdds.scm.replication.enable.legacy", true);
+
+    DatanodeDetails decommissioningNode =
+        MockDatanodeDetails.randomDatanodeDetails();
+    nodeManager.register(decommissioningNode,
+        new NodeStatus(HddsProtos.NodeOperationalState.DECOMMISSIONING,
+            HddsProtos.NodeState.HEALTHY));
+    ContainerInfo container = ReplicationTestUtil.createContainer(
+        HddsProtos.LifeCycleState.QUASI_CLOSED,
+        RatisReplicationConfig.getInstance(
+            HddsProtos.ReplicationFactor.THREE));
+    Set<ContainerReplica> replicas =
+        ReplicationTestUtil.createReplicas(container.containerID(),
+            State.UNHEALTHY, 0, 0);
+
+    ContainerReplica decommissioningReplica =
+        ReplicationTestUtil.createContainerReplica(container.containerID(), 0,
+            DECOMMISSIONING, State.UNHEALTHY, container.getNumberOfKeys(),
+            container.getUsedBytes(), decommissioningNode,
+            decommissioningNode.getUuid());
+    replicas.add(decommissioningReplica);
+    nodeManager.setContainers(decommissioningNode,
+        ImmutableSet.of(container.containerID()));
+
+    Mockito.when(repManager.getContainerReplicaCount(
+            Mockito.eq(container.containerID())))
+        .thenReturn(new LegacyRatisContainerReplicaCount(container, replicas,
+            Collections.emptyList(), 2, true));
+
+    // start monitoring dn1
+    monitor.startMonitoring(decommissioningNode);
+    monitor.run();
+    assertEquals(1, monitor.getTrackedNodeCount());
+    assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONING,
+        nodeManager.getNodeStatus(decommissioningNode).getOperationalState());
+
+    // Running the monitor again causes it to remain DECOMMISSIONING
+    // as nothing has changed.
+    monitor.run();
+    assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONING,
+        nodeManager.getNodeStatus(decommissioningNode).getOperationalState());
+
+    // add a copy of the UNHEALTHY replica on a new node, decommissioningNode
+    // should get decommissioned now
+    ContainerReplica copyOfUnhealthyOnNewNode =
+        decommissioningReplica.toBuilder()
+            .setDatanodeDetails(MockDatanodeDetails.randomDatanodeDetails())
+            .build();
+    replicas.add(copyOfUnhealthyOnNewNode);
+    Mockito.when(repManager.getContainerReplicaCount(
+            Mockito.eq(container.containerID())))
+        .thenReturn(new LegacyRatisContainerReplicaCount(container, replicas,
+            Collections.emptyList(), 3, true));
+    monitor.run();
+    assertEquals(0, monitor.getTrackedNodeCount());
+    assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONED,
+        nodeManager.getNodeStatus(decommissioningNode).getOperationalState());
+  }
+
   @Test
   public void testDecommissionNotBlockedByDeletingContainers()
       throws NodeNotFoundException, ContainerNotFoundException {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to