This is an automated email from the ASF dual-hosted git repository.

siddhant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 551ef3c020 HDDS-7640. EC: UNHEALTHY replicas not replaced by healthy 
replicas from a CLOSED container by RM (#4083)
551ef3c020 is described below

commit 551ef3c0206b3db3d5ec3250c35abbd5c5826be6
Author: Siddhant Sangwan <[email protected]>
AuthorDate: Wed Dec 14 19:40:30 2022 +0530

    HDDS-7640. EC: UNHEALTHY replicas not replaced by healthy replicas from a 
CLOSED container by RM (#4083)
---
 .../ClosedWithMismatchedReplicasHandler.java       | 61 +++++++--------
 .../replication/TestReplicationManager.java        | 90 ++++++++++++++++++++++
 .../TestClosedWithMismatchedReplicasHandler.java   | 16 +++-
 3 files changed, 133 insertions(+), 34 deletions(-)

diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
index f5bb19323a..eaf6664b5b 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
@@ -23,11 +23,10 @@ import org.apache.hadoop.hdds.scm.container.ContainerInfo;
 import org.apache.hadoop.hdds.scm.container.ContainerReplica;
 import org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest;
 import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
-import java.util.Iterator;
-import java.util.List;
 import java.util.Set;
-import java.util.stream.Collectors;
 
 /**
  * Handler to process containers which are closed, but some replicas are still
@@ -35,6 +34,8 @@ import java.util.stream.Collectors;
  * mis-matched replica to close it.
  */
 public class ClosedWithMismatchedReplicasHandler extends AbstractCheck {
+  public static final Logger LOG =
+      LoggerFactory.getLogger(ClosedWithMismatchedReplicasHandler.class);
 
   private ReplicationManager replicationManager;
 
@@ -43,6 +44,13 @@ public class ClosedWithMismatchedReplicasHandler extends 
AbstractCheck {
     this.replicationManager = replicationManager;
   }
 
+  /**
+   * Handles CLOSED EC or RATIS container. If some replicas are CLOSING or
+   * OPEN, this sends a force-close command for them.
+   * @param request ContainerCheckRequest object representing the container
+   * @return always returns true so that other handlers in the chain can fix
+   * issues such as under replication
+   */
   @Override
   public boolean handle(ContainerCheckRequest request) {
     ContainerInfo containerInfo = request.getContainerInfo();
@@ -51,39 +59,32 @@ public class ClosedWithMismatchedReplicasHandler extends 
AbstractCheck {
       // Handler is only relevant for CLOSED containers.
       return false;
     }
-    List<ContainerReplica> unhealthyReplicas = replicas.stream()
-        .filter(r -> !ReplicationManager
-            .compareState(containerInfo.getState(), r.getState()))
-        .collect(Collectors.toList());
 
-    if (unhealthyReplicas.size() > 0) {
-      handleUnhealthyReplicas(containerInfo, unhealthyReplicas);
-      return true;
+    // close replica if its state is OPEN or CLOSING
+    for (ContainerReplica replica : replicas) {
+      if (isMismatched(replica)) {
+        LOG.debug("Sending close command for mismatched replica {} of " +
+            "container {}.", replica, containerInfo);
+        replicationManager.sendCloseContainerReplicaCommand(
+            containerInfo, replica.getDatanodeDetails(), true);
+      }
     }
+
+    /*
+     This handler is unique because it always returns false. This allows
+     handlers further in the chain to fix issues such as under replication.
+     */
     return false;
   }
 
   /**
-   * Handles unhealthy container.
-   * A container is inconsistent if any of the replica state doesn't
-   * match the container state. We have to take appropriate action
-   * based on state of the replica.
-   *
-   * @param container ContainerInfo
-   * @param unhealthyReplicas List of ContainerReplica
+   * If a CLOSED container has an OPEN or CLOSING replica, there is a state
+   * mismatch.
+   * @param replica replica to check for mismatch
+   * @return true if the replica is in CLOSING or OPEN state, else false
    */
-  private void handleUnhealthyReplicas(final ContainerInfo container,
-      List<ContainerReplica> unhealthyReplicas) {
-    Iterator<ContainerReplica> iterator = unhealthyReplicas.iterator();
-    while (iterator.hasNext()) {
-      final ContainerReplica replica = iterator.next();
-      final ContainerReplicaProto.State state = replica.getState();
-      if (state == ContainerReplicaProto.State.OPEN
-          || state == ContainerReplicaProto.State.CLOSING) {
-        replicationManager.sendCloseContainerReplicaCommand(
-            container, replica.getDatanodeDetails(), true);
-        iterator.remove();
-      }
-    }
+  private boolean isMismatched(ContainerReplica replica) {
+    return replica.getState() == ContainerReplicaProto.State.OPEN ||
+        replica.getState() == ContainerReplicaProto.State.CLOSING;
   }
 }
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
index 92ec499570..f537ec5628 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
@@ -62,9 +62,11 @@ import java.util.Set;
 
 import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT;
 import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.DECOMMISSIONING;
+import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
 import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor.THREE;
 import static 
org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp.PendingOpType.ADD;
 import static 
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerInfo;
+import static 
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerReplica;
 import static 
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createReplicas;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.anyInt;
@@ -245,6 +247,94 @@ public class TestReplicationManager {
         ReplicationManagerReport.HealthState.MISSING));
   }
 
+  /**
+   * A closed EC container with 3 closed and 2 unhealthy replicas is under
+   * replicated. RM should add it to under replicated queue.
+   */
+  @Test
+  public void testUnderReplicatedClosedContainerWithUnhealthyReplicas()
+      throws ContainerNotFoundException {
+    ContainerInfo container = createContainerInfo(repConfig, 1,
+        HddsProtos.LifeCycleState.CLOSED);
+    Set<ContainerReplica> replicas = addReplicas(container,
+        ContainerReplicaProto.State.CLOSED, 1, 2, 3);
+    ContainerReplica unhealthyReplica1 =
+        createContainerReplica(container.containerID(), 4,
+            IN_SERVICE, ContainerReplicaProto.State.UNHEALTHY);
+    ContainerReplica unhealthyReplica2 =
+        createContainerReplica(container.containerID(), 5,
+            IN_SERVICE, ContainerReplicaProto.State.UNHEALTHY);
+    replicas.add(unhealthyReplica1);
+    replicas.add(unhealthyReplica2);
+
+    replicationManager.processContainer(
+        container, repQueue, repReport);
+
+    Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
+    Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+    Assert.assertEquals(1, repReport.getStat(
+        ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+  }
+
+  /**
+   * A closed EC container with 2 closed and 3 unhealthy replicas is
+   * unrecoverable. It should not be queued to under replicated queue but
+   * should be recorded as missing (currently, we're calling an unrecoverable
+   * EC container missing).
+   */
+  @Test
+  public void testUnrecoverableClosedContainerWithUnhealthyReplicas()
+      throws ContainerNotFoundException {
+    ContainerInfo container = createContainerInfo(repConfig, 1,
+        HddsProtos.LifeCycleState.CLOSED);
+    Set<ContainerReplica> replicas = addReplicas(container,
+        ContainerReplicaProto.State.UNHEALTHY, 3, 4, 5);
+    ContainerReplica closedReplica1 =
+        createContainerReplica(container.containerID(), 1,
+            IN_SERVICE, ContainerReplicaProto.State.CLOSED);
+    ContainerReplica closedReplica2 =
+        createContainerReplica(container.containerID(), 2,
+            IN_SERVICE, ContainerReplicaProto.State.CLOSED);
+    replicas.add(closedReplica1);
+    replicas.add(closedReplica2);
+
+    replicationManager.processContainer(
+        container, repQueue, repReport);
+
+    Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
+    Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+    Assert.assertEquals(1, repReport.getStat(
+        ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+    Assert.assertEquals(1, repReport.getStat(
+        ReplicationManagerReport.HealthState.MISSING));
+  }
+
+  @Test
+  public void
+      testUnderReplicatedClosedContainerWithUnHealthyAndClosingReplicas()
+      throws ContainerNotFoundException {
+    ContainerInfo container = createContainerInfo(repConfig, 1,
+        HddsProtos.LifeCycleState.CLOSED);
+    Set<ContainerReplica> replicas = addReplicas(container,
+        ContainerReplicaProto.State.CLOSED, 1, 2, 3);
+    ContainerReplica unhealthyReplica1 =
+        createContainerReplica(container.containerID(), 4,
+            IN_SERVICE, ContainerReplicaProto.State.UNHEALTHY);
+    ContainerReplica unhealthyReplica2 =
+        createContainerReplica(container.containerID(), 5,
+            IN_SERVICE, ContainerReplicaProto.State.CLOSING);
+    replicas.add(unhealthyReplica1);
+    replicas.add(unhealthyReplica2);
+
+    replicationManager.processContainer(
+        container, repQueue, repReport);
+
+    Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
+    Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+    Assert.assertEquals(1, repReport.getStat(
+        ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+  }
+
   @Test
   public void testUnderAndOverReplicated()
       throws ContainerNotFoundException {
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestClosedWithMismatchedReplicasHandler.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestClosedWithMismatchedReplicasHandler.java
index ff2faa0ebc..f2b3a72885 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestClosedWithMismatchedReplicasHandler.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestClosedWithMismatchedReplicasHandler.java
@@ -99,7 +99,7 @@ public class TestClosedWithMismatchedReplicasHandler {
   }
 
   @Test
-  public void testClosedMissMatchContainerReturnsTrue() {
+  public void testCloseCommandSentForMismatchedReplicas() {
     ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo(
         ecReplicationConfig, 1, CLOSED);
     Set<ContainerReplica> containerReplicas = ReplicationTestUtil
@@ -126,7 +126,10 @@ public class TestClosedWithMismatchedReplicasHandler {
         .setContainerInfo(containerInfo)
         .setContainerReplicas(containerReplicas)
         .build();
-    Assertions.assertTrue(handler.handle(request));
+
+    // this handler always returns false so other handlers can fix issues
+    // such as under replication
+    Assertions.assertFalse(handler.handle(request));
 
     Mockito.verify(replicationManager, times(1))
         .sendCloseContainerReplicaCommand(
@@ -134,6 +137,7 @@ public class TestClosedWithMismatchedReplicasHandler {
     Mockito.verify(replicationManager, times(1))
         .sendCloseContainerReplicaCommand(
             containerInfo, mismatch2.getDatanodeDetails(), true);
+    // close command should not be sent for unhealthy replica
     Mockito.verify(replicationManager, times(0))
         .sendCloseContainerReplicaCommand(
             containerInfo, mismatch3.getDatanodeDetails(), true);
@@ -177,7 +181,7 @@ public class TestClosedWithMismatchedReplicasHandler {
   }
 
   @Test
-  public void testClosedMissMatchRatisContainerReturnsTrue() {
+  public void testCloseCommandSentForMismatchedRatisReplicas() {
     ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo(
         ratisReplicationConfig, 1, CLOSED);
     ContainerReplica mismatch1 = ReplicationTestUtil.createContainerReplica(
@@ -202,7 +206,10 @@ public class TestClosedWithMismatchedReplicasHandler {
         .setContainerInfo(containerInfo)
         .setContainerReplicas(containerReplicas)
         .build();
-    Assertions.assertTrue(handler.handle(request));
+
+    // this handler always returns false so other handlers can fix issues
+    // such as under replication
+    Assertions.assertFalse(handler.handle(request));
 
     Mockito.verify(replicationManager, times(1))
         .sendCloseContainerReplicaCommand(
@@ -210,6 +217,7 @@ public class TestClosedWithMismatchedReplicasHandler {
     Mockito.verify(replicationManager, times(1))
         .sendCloseContainerReplicaCommand(
             containerInfo, mismatch2.getDatanodeDetails(), true);
+    // close command should not be sent for unhealthy replica
     Mockito.verify(replicationManager, times(0))
         .sendCloseContainerReplicaCommand(
             containerInfo, mismatch3.getDatanodeDetails(), true);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to