This is an automated email from the ASF dual-hosted git repository.

arp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 52edf7ac17 HDDS-9075. QUASI_CLOSED Replica with incorrect sequenceID 
should be deleted by SCM. (#5120)
52edf7ac17 is described below

commit 52edf7ac17feb22f7fde8ff283bfdf6adcee83d8
Author: Nandakumar <[email protected]>
AuthorDate: Fri Jul 28 08:09:51 2023 +0530

    HDDS-9075. QUASI_CLOSED Replica with incorrect sequenceID should be deleted 
by SCM. (#5120)
---
 .../replication/LegacyReplicationManager.java      | 52 ++++++++++++--
 .../replication/RatisContainerReplicaCount.java    | 10 ++-
 .../replication/RatisOverReplicationHandler.java   |  4 +-
 .../replication/RatisUnderReplicationHandler.java  | 11 +--
 .../container/replication/ReplicationTestUtil.java | 11 +++
 .../replication/TestLegacyReplicationManager.java  | 23 +++----
 .../TestRatisContainerReplicaCount.java            | 79 ++++++++++++++++++++++
 .../TestRatisOverReplicationHandler.java           | 30 ++++++++
 .../TestRatisUnderReplicationHandler.java          | 49 ++++++++++++++
 .../replication/TestReplicationManager.java        | 55 +++++++++++++++
 .../health/TestRatisReplicationCheckHandler.java   | 64 ++++++++++++++++++
 .../TestRatisUnhealthyReplicationCheckHandler.java | 58 ++++++++++++++++
 12 files changed, 418 insertions(+), 28 deletions(-)

diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
index 63fdc95bdf..c851470eae 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
@@ -521,18 +521,23 @@ public class LegacyReplicationManager {
          * Check if the container is over replicated and take appropriate
          * action.
          */
-        if (replicaSet.isOverReplicated()) {
-          handleOverReplicatedHealthy(container, replicaSet, report);
+        if (replicaSet.getReplicas().size() >
+            container.getReplicationConfig().getRequiredNodes()) {
+          if (replicaSet.isHealthy()) {
+            handleOverReplicatedHealthy(container, replicaSet, report);
+          } else {
+            handleOverReplicatedExcessUnhealthy(container, replicaSet, report);
+          }
           return;
         }
 
       /*
-       If we get here, the container is not over replicated or under replicated
-       but it may be "unhealthy", which means it has one or more replica which
-       are not in the same state as the container itself.
+       * If we get here, the container is not over replicated or under
+       * replicated, but it may be "unhealthy", which means it has one or
+       * more replica which are not in the same state as the container itself.
        */
         if (!replicaSet.isHealthy()) {
-          handleOverReplicatedExcessUnhealthy(container, replicaSet, report);
+          handleContainerWithUnhealthyReplica(container, replicaSet);
         }
       }
     } catch (ContainerNotFoundException ex) {
@@ -1268,6 +1273,41 @@ public class LegacyReplicationManager {
     }
   }
 
+  /**
+   * This method handles container with unhealthy replica by over-replicating
+   * the healthy replica. Once the container becomes over-replicated,
+   * we delete the unhealthy replica in the next cycle of replication manager
+   * in handleOverReplicatedExcessUnhealthy method.
+   */
+  private void handleContainerWithUnhealthyReplica(
+      final ContainerInfo container,
+      final RatisContainerReplicaCount replicaSet) {
+    /*
+     * When there is an Unhealthy or Quasi Closed replica with incorrect
+     * sequence id for a Closed container, it should be deleted and one of
+     * the healthy replica has to be re-replicated.
+     *
+     * We first do the re-replication and over replicate the container,
+     * in the next cycle of replication manager the excess unhealthy replica
+     * is deleted.
+     */
+
+    if (container.getState() == LifeCycleState.CLOSED) {
+      final List<ContainerReplica> replicas = replicaSet.getReplicas();
+      final List<ContainerReplica> replicationSources = getReplicationSources(
+          container, replicaSet.getReplicas(), State.CLOSED);
+      if (replicationSources.isEmpty()) {
+        LOG.warn("No healthy CLOSED replica for replication.");
+        return;
+      }
+      final ContainerPlacementStatus placementStatus = getPlacementStatus(
+          new HashSet<>(replicationSources),
+          container.getReplicationConfig().getRequiredNodes());
+      replicateAnyWithTopology(container, replicationSources,
+          placementStatus, replicas.size() - replicationSources.size());
+    }
+  }
+
   /**
    * Returns the replicas from {@code replicas} that:
    *  - Do not have in flight deletions
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisContainerReplicaCount.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisContainerReplicaCount.java
index 47b0da9be6..e8ccaebfbf 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisContainerReplicaCount.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisContainerReplicaCount.java
@@ -137,8 +137,16 @@ public class RatisContainerReplicaCount implements 
ContainerReplicaCount {
     for (ContainerReplica cr : replicas) {
       HddsProtos.NodeOperationalState state =
           cr.getDatanodeDetails().getPersistedOpState();
+
+      /*
+       * When there is Quasi Closed Replica with incorrect sequence id
+       * for a Closed container, it's treated as unhealthy.
+       */
       boolean unhealthy =
-          cr.getState() == ContainerReplicaProto.State.UNHEALTHY;
+          cr.getState() == ContainerReplicaProto.State.UNHEALTHY ||
+              (cr.getState() == ContainerReplicaProto.State.QUASI_CLOSED &&
+                  container.getState() == HddsProtos.LifeCycleState.CLOSED &&
+                  container.getSequenceId() != cr.getSequenceId());
 
       if (state == DECOMMISSIONED || state == DECOMMISSIONING) {
         if (unhealthy) {
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
index 55108ff6b8..dd00e01ba7 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
@@ -83,8 +83,8 @@ public class RatisOverReplicationHandler
 
     // We are going to check for over replication, so we should filter out any
     // replicas that are not in a HEALTHY state. This is because a replica can
-    // be healthy, stale or dead. If it is dead is will be quickly removed from
-    // scm. If it is state, there is a good chance the DN is offline and the
+    // be healthy, stale or dead. If it is dead it will be quickly removed from
+    // scm. If it is stale, there is a good chance the DN is offline and the
     // replica will go away soon. So, if we have a container that is over
     // replicated with a HEALTHY and STALE replica, and we decide to delete the
     // HEALTHY one, and then the STALE ones goes away, we will lose them both.
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
index 0e4c4b1d42..fae435df29 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
@@ -215,12 +215,13 @@ public class RatisUnderReplicationHandler
       }
     }
 
-    Predicate<ContainerReplica> predicate;
+    Predicate<ContainerReplica> predicate =
+        replica -> replica.getState() == State.CLOSED ||
+        replica.getState() == State.QUASI_CLOSED;
+
     if (replicaCount.getHealthyReplicaCount() == 0) {
-      predicate = replica -> replica.getState() == State.UNHEALTHY;
-    } else {
-      predicate = replica -> replica.getState() == State.CLOSED ||
-          replica.getState() == State.QUASI_CLOSED;
+      predicate = predicate.or(
+          replica -> replica.getState() == State.UNHEALTHY);
     }
 
     /*
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
index 1b9cc85e2f..3f0f0e78a2 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
@@ -197,6 +197,17 @@ public final class ReplicationTestUtil {
         .build();
   }
 
+  public static ContainerInfo createContainerInfo(
+      ReplicationConfig replicationConfig, long containerID,
+      HddsProtos.LifeCycleState containerState, long sequenceID) {
+    return TestContainerInfo.newBuilderForTest()
+        .setContainerID(containerID)
+        .setReplicationConfig(replicationConfig)
+        .setState(containerState)
+        .setSequenceId(sequenceID)
+        .build();
+  }
+
   public static ContainerInfo createContainerInfo(
       ReplicationConfig replicationConfig, long containerID,
       HddsProtos.LifeCycleState containerState, long keyCount, long bytesUsed) 
{
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
index f493138e89..8bf22b64a1 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
@@ -67,6 +67,7 @@ import 
org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionExcepti
 import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode;
 import org.apache.ozone.test.GenericTestUtils;
 import org.apache.ozone.test.TestClock;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
@@ -975,7 +976,6 @@ public class TestLegacyReplicationManager {
       assertUnderReplicatedCount(0);
       boolean unhealthyDeleted = false;
       boolean closedDeleted = false;
-      UUID closedDeletedUUID = null;
 
       for (CommandForDatanode<?> command :
           datanodeCommandHandler.getReceivedCommands()) {
@@ -986,19 +986,19 @@ public class TestLegacyReplicationManager {
             unhealthyDeleted = true;
           } else {
             closedDeleted = true;
-            closedDeletedUUID = command.getDatanodeId();
           }
         }
       }
 
-      Assertions.assertFalse(unhealthyDeleted);
-      Assertions.assertTrue(closedDeleted);
+      Assertions.assertTrue(unhealthyDeleted);
+      Assertions.assertFalse(closedDeleted);
+
+      containerStateManager.removeContainerReplica(
+          container.containerID(), unhealthy);
 
       // Do a second run.
       assertReplicaScheduled(0);
       assertUnderReplicatedCount(0);
-      unhealthyDeleted = false;
-      closedDeleted = false;
       for (CommandForDatanode<?> command :
           datanodeCommandHandler.getReceivedCommands()) {
         if (command.getCommand().getType() ==
@@ -1008,8 +1008,6 @@ public class TestLegacyReplicationManager {
             unhealthyDeleted = true;
           } else {
             closedDeleted = true;
-            // The delete command should have been left over from the last run.
-            Assertions.assertEquals(closedDeletedUUID, 
command.getDatanodeId());
           }
         }
       }
@@ -2280,8 +2278,6 @@ public class TestLegacyReplicationManager {
               id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails());
       final ContainerReplica replicaFour = getReplicas(
               id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails());
-      final ContainerReplica replicaFive = getReplicas(
-              id, UNHEALTHY, 1000L, originNodeId, randomDatanodeDetails());
 
       containerStateManager.addContainer(container.getProtobuf());
       containerStateManager.updateContainerReplica(id, replicaOne);
@@ -2289,7 +2285,6 @@ public class TestLegacyReplicationManager {
       containerStateManager.updateContainerReplica(
               id, replicaThree);
       containerStateManager.updateContainerReplica(id, replicaFour);
-      containerStateManager.updateContainerReplica(id, replicaFive);
 
       // Ensure a mis-replicated status is returned for any containers in this
       // test where there are exactly 3 replicas checked.
@@ -2318,9 +2313,6 @@ public class TestLegacyReplicationManager {
       Assertions.assertEquals(currentDeleteCommandCount,
               replicationManager.getMetrics().getDeletionCmdsSentTotal());
 
-      Assertions.assertFalse(datanodeCommandHandler.received(
-              SCMCommandProto.Type.deleteContainerCommand,
-              replicaFive.getDatanodeDetails()));
       Assertions.assertEquals(0, getInflightCount(InflightType.DELETION));
       Assertions.assertEquals(0, replicationManager.getMetrics()
               .getInflightDeletion());
@@ -2372,7 +2364,10 @@ public class TestLegacyReplicationManager {
       assertOverReplicatedCount(1);
     }
 
+
     @Test
+    @Disabled("This test doesn't properly test Rack Placement Policy as" +
+        " LegacyReplicationManager doesn't handle rack aware delete properly.")
     public void testOverReplicatedAndPolicyUnSatisfiedAndDeleted()
             throws IOException, TimeoutException {
       final ContainerInfo container = getContainer(LifeCycleState.CLOSED);
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisContainerReplicaCount.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisContainerReplicaCount.java
index 0380f2b8f8..f4bf6d40b6 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisContainerReplicaCount.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisContainerReplicaCount.java
@@ -40,9 +40,11 @@ import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalSt
 import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE;
 import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_MAINTENANCE;
 import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
+import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State;
 import static 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.CLOSED;
 import static 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.CLOSING;
 import static 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.OPEN;
+import static 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.QUASI_CLOSED;
 import static 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.UNHEALTHY;
 import static 
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerInfo;
 import static 
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerReplica;
@@ -754,6 +756,83 @@ class TestRatisContainerReplicaCount {
 
   }
 
+  @Test
+  void testQuasiClosedReplicaWithCorrectSequenceID() {
+    final ContainerInfo container =
+        createContainerInfo(RatisReplicationConfig.getInstance(
+                HddsProtos.ReplicationFactor.THREE), 1L,
+            HddsProtos.LifeCycleState.CLOSED);
+    final Set<ContainerReplica> replicas =
+        createReplicas(container.containerID(), CLOSED, 0, 0);
+    final Set<ContainerReplica> quasiClosedReplica =
+        createReplicas(container.containerID(), QUASI_CLOSED, 0);
+    replicas.addAll(quasiClosedReplica);
+
+    final RatisContainerReplicaCount crc =
+        new RatisContainerReplicaCount(container, replicas,
+            Collections.emptyList(), 2, false);
+    validate(crc, true, 0, false, false);
+    assertTrue(crc.isSufficientlyReplicated(true));
+    assertEquals(0, crc.getUnhealthyReplicaCount());
+
+    // With additional unhealthy replica
+
+    final Set<ContainerReplica> unhealthyReplica =
+        createReplicas(container.containerID(), UNHEALTHY, 0);
+    replicas.addAll(unhealthyReplica);
+
+    final RatisContainerReplicaCount crcWithUnhealthy =
+        new RatisContainerReplicaCount(container, replicas,
+            Collections.emptyList(), 2, false);
+    validate(crcWithUnhealthy, true, 0, false, false);
+    assertTrue(crcWithUnhealthy.isSufficientlyReplicated(true));
+    assertEquals(1, crcWithUnhealthy.getUnhealthyReplicaCount());
+  }
+
+  @Test
+  void testQuasiClosedReplicaWithInCorrectSequenceID() {
+
+    final long sequenceID = 101;
+    final ContainerInfo container =
+        createContainerInfo(RatisReplicationConfig.getInstance(
+                HddsProtos.ReplicationFactor.THREE), 1L,
+            HddsProtos.LifeCycleState.CLOSED, sequenceID);
+    final ContainerID containerID = container.containerID();
+
+    final ContainerReplica replicaOne = createContainerReplica(
+        containerID, 0, IN_SERVICE, State.CLOSED, sequenceID);
+    final ContainerReplica replicaTwo = createContainerReplica(
+        containerID, 0, IN_SERVICE, State.CLOSED, sequenceID);
+
+    final ContainerReplica quasiCloseReplica = createContainerReplica(
+        containerID, 0, IN_SERVICE, State.QUASI_CLOSED, sequenceID - 5);
+
+    final Set<ContainerReplica> replicas = new HashSet<>();
+    replicas.add(replicaOne);
+    replicas.add(replicaTwo);
+    replicas.add(quasiCloseReplica);
+
+    final RatisContainerReplicaCount crc =
+        new RatisContainerReplicaCount(container, replicas,
+            Collections.emptyList(), 2, false);
+    validate(crc, false, 1, false, false);
+    assertFalse(crc.isSufficientlyReplicated(true));
+    assertEquals(1, crc.getUnhealthyReplicaCount());
+
+    // With additional unhealthy replica
+
+    final Set<ContainerReplica> unhealthyReplica =
+        createReplicas(container.containerID(), UNHEALTHY, 0);
+    replicas.addAll(unhealthyReplica);
+
+    final RatisContainerReplicaCount crcWithUnhealthy =
+        new RatisContainerReplicaCount(container, replicas,
+            Collections.emptyList(), 2, false);
+    validate(crcWithUnhealthy, false, 1, false, false);
+    assertFalse(crcWithUnhealthy.isSufficientlyReplicated(true));
+    assertEquals(2, crcWithUnhealthy.getUnhealthyReplicaCount());
+  }
+
   private void validate(RatisContainerReplicaCount rcnt,
       boolean sufficientlyReplicated, int replicaDelta,
       boolean overReplicated, boolean insufficientDueToOutOfService) {
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
index c83c2a30d9..5a9fd20288 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.hdds.protocol.DatanodeDetails;
 import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto;
+import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State;
 import org.apache.hadoop.hdds.scm.PlacementPolicy;
 import org.apache.hadoop.hdds.scm.container.ContainerInfo;
 import org.apache.hadoop.hdds.scm.container.ContainerReplica;
@@ -50,6 +51,7 @@ import java.util.Set;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.stream.Collectors;
 
+import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
 import static 
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainer;
 import static 
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerReplica;
 import static 
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createReplicas;
@@ -278,6 +280,34 @@ public class TestRatisOverReplicationHandler {
 
     testProcessing(replicas, pendingOps, getOverReplicatedHealthResult(), 0);
   }
+  @Test
+  public void testOverReplicationOfQuasiClosedReplicaWithWrongSequenceID()
+      throws IOException {
+    final long sequenceID = 20;
+    container = ReplicationTestUtil.createContainerInfo(
+        RATIS_REPLICATION_CONFIG, 1,
+        HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+    final Set<ContainerReplica> replicas = new HashSet<>(2);
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+
+    final ContainerReplica quasiClosedReplica =
+        createContainerReplica(container.containerID(), 0,
+            IN_SERVICE, State.QUASI_CLOSED, sequenceID - 1);
+    replicas.add(quasiClosedReplica);
+    testProcessing(replicas, Collections.emptyList(),
+        getOverReplicatedHealthResult(), 0);
+
+    // Add another CLOSED replica
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, ContainerReplicaProto.State.CLOSED, sequenceID));
+
+    testProcessing(replicas, Collections.emptyList(),
+        getOverReplicatedHealthResult(), 1);
+  }
 
   @Test
   public void testDeleteThrottlingMisMatchedReplica() throws IOException {
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
index e2cdea09cc..d83249e46e 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
@@ -57,6 +57,7 @@ import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalSt
 import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE;
 import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_MAINTENANCE;
 import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
+import static 
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerInfo;
 import static 
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerReplica;
 import static 
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createReplicas;
 import static org.mockito.ArgumentMatchers.any;
@@ -384,6 +385,54 @@ public class TestRatisUnderReplicationHandler {
     Assertions.assertTrue(excludedNodes.contains(pendingRemove));
   }
 
+  @Test
+  public void testUnderReplicationDueToQuasiClosedReplicaWithWrongSequenceID()
+      throws IOException {
+    final long sequenceID = 20;
+    container = ReplicationTestUtil.createContainerInfo(
+        RATIS_REPLICATION_CONFIG, 1,
+        HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+    final Set<ContainerReplica> replicas = new HashSet<>(2);
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+
+    final ContainerReplica quasiClosedReplica =
+        createContainerReplica(container.containerID(), 0,
+            IN_SERVICE, State.QUASI_CLOSED, sequenceID - 1);
+    replicas.add(quasiClosedReplica);
+
+    final Set<Pair<DatanodeDetails, SCMCommand<?>>> commands =
+        testProcessing(replicas, Collections.emptyList(),
+            getUnderReplicatedHealthResult(), 2, 2);
+    commands.forEach(
+        command -> Assert.assertNotEquals(
+            quasiClosedReplica.getDatanodeDetails(),
+            command.getKey()));
+  }
+
+  @Test
+  public void testOnlyQuasiClosedReplicaWithWrongSequenceIdIsAvailable()
+      throws IOException {
+    final long sequenceID = 20;
+    container = createContainerInfo(RATIS_REPLICATION_CONFIG, 1,
+        HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+    final Set<ContainerReplica> replicas = new HashSet<>(1);
+    final ContainerReplica quasiClosedReplica =
+        createContainerReplica(container.containerID(), 0,
+            IN_SERVICE, State.QUASI_CLOSED, sequenceID - 1);
+    replicas.add(quasiClosedReplica);
+
+    final Set<Pair<DatanodeDetails, SCMCommand<?>>> commands =
+        testProcessing(replicas, Collections.emptyList(),
+            getUnderReplicatedHealthResult(), 2, 2);
+    commands.forEach(
+        command -> Assert.assertEquals(
+            quasiClosedReplica.getDatanodeDetails(),
+            command.getKey()));
+  }
+
 
   /**
    * Tests whether the specified expectNumCommands number of commands are
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
index e68af943b1..e764db7eff 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
@@ -406,6 +406,61 @@ public class TestReplicationManager {
         commandsSent.iterator().next().getValue().getType());
   }
 
+  /**
+   * When there is Quasi Closed Replica with incorrect sequence id
+   * for a Closed container, it's treated as unhealthy and deleted.
+   * The deletion should happen only after we re-replicate a healthy
+   * replica.
+   */
+  @Test
+  public void testClosedContainerWithQuasiClosedReplicaWithWrongSequence()
+      throws IOException, NodeNotFoundException {
+    final RatisReplicationConfig ratisRepConfig =
+        RatisReplicationConfig.getInstance(THREE);
+    final long sequenceId = 101;
+
+    final ContainerInfo container = createContainerInfo(ratisRepConfig, 1,
+        HddsProtos.LifeCycleState.CLOSED);
+    final ContainerReplica replicaOne = createContainerReplica(
+        ContainerID.valueOf(1), 0, IN_SERVICE,
+        ContainerReplicaProto.State.CLOSED, sequenceId);
+    final ContainerReplica replicaTwo = createContainerReplica(
+        ContainerID.valueOf(1), 0, IN_SERVICE,
+        ContainerReplicaProto.State.CLOSED, sequenceId);
+
+    final ContainerReplica quasiCloseReplica = createContainerReplica(
+        ContainerID.valueOf(1), 0, IN_SERVICE,
+        ContainerReplicaProto.State.QUASI_CLOSED, sequenceId - 5);
+
+    Set<ContainerReplica> replicas = new HashSet<>();
+    replicas.add(replicaOne);
+    replicas.add(replicaTwo);
+    replicas.add(quasiCloseReplica);
+    storeContainerAndReplicas(container, replicas);
+
+    replicationManager.processContainer(container, repQueue, repReport);
+    assertEquals(1, repReport.getStat(
+        ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+    assertEquals(0, repReport.getStat(
+        ReplicationManagerReport.HealthState.OVER_REPLICATED));
+    assertEquals(1, repQueue.underReplicatedQueueSize());
+    assertEquals(0, repQueue.overReplicatedQueueSize());
+
+    // Add the third CLOSED replica
+    replicas.add(createContainerReplica(
+        ContainerID.valueOf(1), 0, IN_SERVICE,
+        ContainerReplicaProto.State.CLOSED, sequenceId));
+    storeContainerAndReplicas(container, replicas);
+
+    replicationManager.processContainer(container, repQueue, repReport);
+    assertEquals(1, repReport.getStat(
+        ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+    assertEquals(1, repReport.getStat(
+        ReplicationManagerReport.HealthState.OVER_REPLICATED));
+    assertEquals(1, repQueue.underReplicatedQueueSize());
+    assertEquals(1, repQueue.overReplicatedQueueSize());
+  }
+
   @Test
   public void testHealthyContainer() throws ContainerNotFoundException {
     ContainerInfo container = createContainerInfo(repConfig, 1,
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
index a960e6549c..e2de891966 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
@@ -37,6 +37,7 @@ import 
org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult.Ov
 import 
org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult.UnderReplicatedHealthResult;
 import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
 import org.apache.hadoop.hdds.scm.container.replication.ReplicationQueue;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
@@ -46,6 +47,7 @@ import org.mockito.Mockito;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
@@ -828,4 +830,66 @@ public class TestRatisReplicationCheckHandler {
         ReplicationManagerReport.HealthState.MIS_REPLICATED));
   }
 
+  @Test
+  public void testWithQuasiClosedReplicas() {
+    final long sequenceID = 20;
+    final ContainerInfo container = ReplicationTestUtil.createContainerInfo(
+        repConfig, 1, HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+    final Set<ContainerReplica> replicas = new HashSet<>(2);
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+
+    final ContainerReplica quasiClosedReplica =
+        createContainerReplica(container.containerID(), 0,
+            IN_SERVICE, State.QUASI_CLOSED, sequenceID);
+    replicas.add(quasiClosedReplica);
+    requestBuilder.setContainerReplicas(replicas)
+        .setContainerInfo(container);
+    final ContainerHealthResult result =
+        healthCheck.checkHealth(requestBuilder.build());
+
+    assertEquals(HealthState.HEALTHY, result.getHealthState());
+
+    assertFalse(healthCheck.handle(requestBuilder.build()));
+    assertEquals(0, repQueue.underReplicatedQueueSize());
+    assertEquals(0, repQueue.overReplicatedQueueSize());
+  }
+
+  @Test
+  public void testWithQuasiClosedReplicasWithWrongSequenceID() {
+    final long sequenceID = 20;
+    final ContainerInfo container = ReplicationTestUtil.createContainerInfo(
+        repConfig, 1, HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+    final Set<ContainerReplica> replicas = new HashSet<>(2);
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+
+    final ContainerReplica quasiClosedReplica =
+        createContainerReplica(container.containerID(), 0,
+            IN_SERVICE, State.QUASI_CLOSED, sequenceID - 1);
+    replicas.add(quasiClosedReplica);
+
+    requestBuilder.setContainerReplicas(replicas)
+        .setContainerInfo(container);
+    UnderReplicatedHealthResult result = (UnderReplicatedHealthResult)
+        healthCheck.checkHealth(requestBuilder.build());
+
+    assertEquals(HealthState.UNDER_REPLICATED, result.getHealthState());
+    assertEquals(1, result.getRemainingRedundancy());
+    assertFalse(result.isReplicatedOkAfterPending());
+    assertFalse(result.underReplicatedDueToOutOfService());
+
+    assertTrue(healthCheck.handle(requestBuilder.build()));
+    assertEquals(1, repQueue.underReplicatedQueueSize());
+    assertEquals(0, repQueue.overReplicatedQueueSize());
+    assertEquals(1, report.getStat(
+        ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+  }
+
 }
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisUnhealthyReplicationCheckHandler.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisUnhealthyReplicationCheckHandler.java
index f17cef2a0d..91b2a143c1 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisUnhealthyReplicationCheckHandler.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisUnhealthyReplicationCheckHandler.java
@@ -24,6 +24,7 @@ import org.apache.hadoop.hdds.client.ReplicationConfig;
 import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto;
+import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State;
 import org.apache.hadoop.hdds.scm.container.ContainerInfo;
 import org.apache.hadoop.hdds.scm.container.ContainerReplica;
 import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
@@ -31,11 +32,13 @@ import 
org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest;
 import org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult;
 import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
 import org.apache.hadoop.hdds.scm.container.replication.ReplicationQueue;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
@@ -333,4 +336,59 @@ public class TestRatisUnhealthyReplicationCheckHandler {
     assertEquals(1,
         report.getStat(ReplicationManagerReport.HealthState.UNHEALTHY));
   }
+
+  @Test
+  public void testWithQuasiClosedReplica() {
+    final long sequenceID = 20;
+    final ContainerInfo container = ReplicationTestUtil.createContainerInfo(
+        repConfig, 1, HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+    final Set<ContainerReplica> replicas = new HashSet<>(2);
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+
+    final ContainerReplica quasiClosedReplica =
+        createContainerReplica(container.containerID(), 0,
+            IN_SERVICE, State.QUASI_CLOSED, sequenceID);
+    replicas.add(quasiClosedReplica);
+
+    requestBuilder.setContainerReplicas(replicas).setContainerInfo(container);
+    assertFalse(handler.handle(requestBuilder.build()));
+    assertEquals(0, repQueue.underReplicatedQueueSize());
+    assertEquals(0, repQueue.overReplicatedQueueSize());
+  }
+  
+  @Test
+  public void overReplicationCheckWithQuasiClosedReplica() {
+    final long sequenceID = 20;
+    final ContainerInfo container = ReplicationTestUtil.createContainerInfo(
+        repConfig, 1, HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+    final Set<ContainerReplica> replicas = new HashSet<>(2);
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+    replicas.add(createContainerReplica(container.containerID(), 0,
+        IN_SERVICE, State.CLOSED, sequenceID));
+
+    final ContainerReplica quasiClosedReplica =
+        createContainerReplica(container.containerID(), 0,
+            IN_SERVICE, State.QUASI_CLOSED, sequenceID - 1);
+    replicas.add(quasiClosedReplica);
+
+    requestBuilder.setContainerReplicas(replicas).setContainerInfo(container);
+    assertTrue(handler.handle(requestBuilder.build()));
+    assertEquals(0, repQueue.underReplicatedQueueSize());
+    assertEquals(1, repQueue.overReplicatedQueueSize());
+    assertEquals(1, report.getStat(
+        ReplicationManagerReport.HealthState.OVER_REPLICATED));
+    assertEquals(1,
+        report.getStat(ReplicationManagerReport.HealthState.UNHEALTHY));
+  }
+
+
+
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to