This is an automated email from the ASF dual-hosted git repository.
arp pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 52edf7ac17 HDDS-9075. QUASI_CLOSED Replica with incorrect sequenceID
should be deleted by SCM. (#5120)
52edf7ac17 is described below
commit 52edf7ac17feb22f7fde8ff283bfdf6adcee83d8
Author: Nandakumar <[email protected]>
AuthorDate: Fri Jul 28 08:09:51 2023 +0530
HDDS-9075. QUASI_CLOSED Replica with incorrect sequenceID should be deleted
by SCM. (#5120)
---
.../replication/LegacyReplicationManager.java | 52 ++++++++++++--
.../replication/RatisContainerReplicaCount.java | 10 ++-
.../replication/RatisOverReplicationHandler.java | 4 +-
.../replication/RatisUnderReplicationHandler.java | 11 +--
.../container/replication/ReplicationTestUtil.java | 11 +++
.../replication/TestLegacyReplicationManager.java | 23 +++----
.../TestRatisContainerReplicaCount.java | 79 ++++++++++++++++++++++
.../TestRatisOverReplicationHandler.java | 30 ++++++++
.../TestRatisUnderReplicationHandler.java | 49 ++++++++++++++
.../replication/TestReplicationManager.java | 55 +++++++++++++++
.../health/TestRatisReplicationCheckHandler.java | 64 ++++++++++++++++++
.../TestRatisUnhealthyReplicationCheckHandler.java | 58 ++++++++++++++++
12 files changed, 418 insertions(+), 28 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
index 63fdc95bdf..c851470eae 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
@@ -521,18 +521,23 @@ public class LegacyReplicationManager {
* Check if the container is over replicated and take appropriate
* action.
*/
- if (replicaSet.isOverReplicated()) {
- handleOverReplicatedHealthy(container, replicaSet, report);
+ if (replicaSet.getReplicas().size() >
+ container.getReplicationConfig().getRequiredNodes()) {
+ if (replicaSet.isHealthy()) {
+ handleOverReplicatedHealthy(container, replicaSet, report);
+ } else {
+ handleOverReplicatedExcessUnhealthy(container, replicaSet, report);
+ }
return;
}
/*
- If we get here, the container is not over replicated or under replicated
- but it may be "unhealthy", which means it has one or more replica which
- are not in the same state as the container itself.
+ * If we get here, the container is not over replicated or under
+ * replicated, but it may be "unhealthy", which means it has one or
+ * more replica which are not in the same state as the container itself.
*/
if (!replicaSet.isHealthy()) {
- handleOverReplicatedExcessUnhealthy(container, replicaSet, report);
+ handleContainerWithUnhealthyReplica(container, replicaSet);
}
}
} catch (ContainerNotFoundException ex) {
@@ -1268,6 +1273,41 @@ public class LegacyReplicationManager {
}
}
+ /**
+ * This method handles container with unhealthy replica by over-replicating
+ * the healthy replica. Once the container becomes over-replicated,
+ * we delete the unhealthy replica in the next cycle of replication manager
+ * in handleOverReplicatedExcessUnhealthy method.
+ */
+ private void handleContainerWithUnhealthyReplica(
+ final ContainerInfo container,
+ final RatisContainerReplicaCount replicaSet) {
+ /*
+ * When there is an Unhealthy or Quasi Closed replica with incorrect
+ * sequence id for a Closed container, it should be deleted and one of
+ * the healthy replica has to be re-replicated.
+ *
+ * We first do the re-replication and over replicate the container,
+ * in the next cycle of replication manager the excess unhealthy replica
+ * is deleted.
+ */
+
+ if (container.getState() == LifeCycleState.CLOSED) {
+ final List<ContainerReplica> replicas = replicaSet.getReplicas();
+ final List<ContainerReplica> replicationSources = getReplicationSources(
+ container, replicaSet.getReplicas(), State.CLOSED);
+ if (replicationSources.isEmpty()) {
+ LOG.warn("No healthy CLOSED replica for replication.");
+ return;
+ }
+ final ContainerPlacementStatus placementStatus = getPlacementStatus(
+ new HashSet<>(replicationSources),
+ container.getReplicationConfig().getRequiredNodes());
+ replicateAnyWithTopology(container, replicationSources,
+ placementStatus, replicas.size() - replicationSources.size());
+ }
+ }
+
/**
* Returns the replicas from {@code replicas} that:
* - Do not have in flight deletions
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisContainerReplicaCount.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisContainerReplicaCount.java
index 47b0da9be6..e8ccaebfbf 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisContainerReplicaCount.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisContainerReplicaCount.java
@@ -137,8 +137,16 @@ public class RatisContainerReplicaCount implements
ContainerReplicaCount {
for (ContainerReplica cr : replicas) {
HddsProtos.NodeOperationalState state =
cr.getDatanodeDetails().getPersistedOpState();
+
+ /*
+ * When there is Quasi Closed Replica with incorrect sequence id
+ * for a Closed container, it's treated as unhealthy.
+ */
boolean unhealthy =
- cr.getState() == ContainerReplicaProto.State.UNHEALTHY;
+ cr.getState() == ContainerReplicaProto.State.UNHEALTHY ||
+ (cr.getState() == ContainerReplicaProto.State.QUASI_CLOSED &&
+ container.getState() == HddsProtos.LifeCycleState.CLOSED &&
+ container.getSequenceId() != cr.getSequenceId());
if (state == DECOMMISSIONED || state == DECOMMISSIONING) {
if (unhealthy) {
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
index 55108ff6b8..dd00e01ba7 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
@@ -83,8 +83,8 @@ public class RatisOverReplicationHandler
// We are going to check for over replication, so we should filter out any
// replicas that are not in a HEALTHY state. This is because a replica can
- // be healthy, stale or dead. If it is dead is will be quickly removed from
- // scm. If it is state, there is a good chance the DN is offline and the
+ // be healthy, stale or dead. If it is dead it will be quickly removed from
+ // scm. If it is stale, there is a good chance the DN is offline and the
// replica will go away soon. So, if we have a container that is over
// replicated with a HEALTHY and STALE replica, and we decide to delete the
// HEALTHY one, and then the STALE ones goes away, we will lose them both.
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
index 0e4c4b1d42..fae435df29 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
@@ -215,12 +215,13 @@ public class RatisUnderReplicationHandler
}
}
- Predicate<ContainerReplica> predicate;
+ Predicate<ContainerReplica> predicate =
+ replica -> replica.getState() == State.CLOSED ||
+ replica.getState() == State.QUASI_CLOSED;
+
if (replicaCount.getHealthyReplicaCount() == 0) {
- predicate = replica -> replica.getState() == State.UNHEALTHY;
- } else {
- predicate = replica -> replica.getState() == State.CLOSED ||
- replica.getState() == State.QUASI_CLOSED;
+ predicate = predicate.or(
+ replica -> replica.getState() == State.UNHEALTHY);
}
/*
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
index 1b9cc85e2f..3f0f0e78a2 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
@@ -197,6 +197,17 @@ public final class ReplicationTestUtil {
.build();
}
+ public static ContainerInfo createContainerInfo(
+ ReplicationConfig replicationConfig, long containerID,
+ HddsProtos.LifeCycleState containerState, long sequenceID) {
+ return TestContainerInfo.newBuilderForTest()
+ .setContainerID(containerID)
+ .setReplicationConfig(replicationConfig)
+ .setState(containerState)
+ .setSequenceId(sequenceID)
+ .build();
+ }
+
public static ContainerInfo createContainerInfo(
ReplicationConfig replicationConfig, long containerID,
HddsProtos.LifeCycleState containerState, long keyCount, long bytesUsed)
{
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
index f493138e89..8bf22b64a1 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
@@ -67,6 +67,7 @@ import
org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionExcepti
import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode;
import org.apache.ozone.test.GenericTestUtils;
import org.apache.ozone.test.TestClock;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
@@ -975,7 +976,6 @@ public class TestLegacyReplicationManager {
assertUnderReplicatedCount(0);
boolean unhealthyDeleted = false;
boolean closedDeleted = false;
- UUID closedDeletedUUID = null;
for (CommandForDatanode<?> command :
datanodeCommandHandler.getReceivedCommands()) {
@@ -986,19 +986,19 @@ public class TestLegacyReplicationManager {
unhealthyDeleted = true;
} else {
closedDeleted = true;
- closedDeletedUUID = command.getDatanodeId();
}
}
}
- Assertions.assertFalse(unhealthyDeleted);
- Assertions.assertTrue(closedDeleted);
+ Assertions.assertTrue(unhealthyDeleted);
+ Assertions.assertFalse(closedDeleted);
+
+ containerStateManager.removeContainerReplica(
+ container.containerID(), unhealthy);
// Do a second run.
assertReplicaScheduled(0);
assertUnderReplicatedCount(0);
- unhealthyDeleted = false;
- closedDeleted = false;
for (CommandForDatanode<?> command :
datanodeCommandHandler.getReceivedCommands()) {
if (command.getCommand().getType() ==
@@ -1008,8 +1008,6 @@ public class TestLegacyReplicationManager {
unhealthyDeleted = true;
} else {
closedDeleted = true;
- // The delete command should have been left over from the last run.
- Assertions.assertEquals(closedDeletedUUID,
command.getDatanodeId());
}
}
}
@@ -2280,8 +2278,6 @@ public class TestLegacyReplicationManager {
id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails());
final ContainerReplica replicaFour = getReplicas(
id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails());
- final ContainerReplica replicaFive = getReplicas(
- id, UNHEALTHY, 1000L, originNodeId, randomDatanodeDetails());
containerStateManager.addContainer(container.getProtobuf());
containerStateManager.updateContainerReplica(id, replicaOne);
@@ -2289,7 +2285,6 @@ public class TestLegacyReplicationManager {
containerStateManager.updateContainerReplica(
id, replicaThree);
containerStateManager.updateContainerReplica(id, replicaFour);
- containerStateManager.updateContainerReplica(id, replicaFive);
// Ensure a mis-replicated status is returned for any containers in this
// test where there are exactly 3 replicas checked.
@@ -2318,9 +2313,6 @@ public class TestLegacyReplicationManager {
Assertions.assertEquals(currentDeleteCommandCount,
replicationManager.getMetrics().getDeletionCmdsSentTotal());
- Assertions.assertFalse(datanodeCommandHandler.received(
- SCMCommandProto.Type.deleteContainerCommand,
- replicaFive.getDatanodeDetails()));
Assertions.assertEquals(0, getInflightCount(InflightType.DELETION));
Assertions.assertEquals(0, replicationManager.getMetrics()
.getInflightDeletion());
@@ -2372,7 +2364,10 @@ public class TestLegacyReplicationManager {
assertOverReplicatedCount(1);
}
+
@Test
+ @Disabled("This test doesn't properly test Rack Placement Policy as" +
+ " LegacyReplicationManager doesn't handle rack aware delete properly.")
public void testOverReplicatedAndPolicyUnSatisfiedAndDeleted()
throws IOException, TimeoutException {
final ContainerInfo container = getContainer(LifeCycleState.CLOSED);
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisContainerReplicaCount.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisContainerReplicaCount.java
index 0380f2b8f8..f4bf6d40b6 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisContainerReplicaCount.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisContainerReplicaCount.java
@@ -40,9 +40,11 @@ import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalSt
import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE;
import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_MAINTENANCE;
import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
+import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State;
import static
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.CLOSED;
import static
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.CLOSING;
import static
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.OPEN;
+import static
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.QUASI_CLOSED;
import static
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.UNHEALTHY;
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerInfo;
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerReplica;
@@ -754,6 +756,83 @@ class TestRatisContainerReplicaCount {
}
+ @Test
+ void testQuasiClosedReplicaWithCorrectSequenceID() {
+ final ContainerInfo container =
+ createContainerInfo(RatisReplicationConfig.getInstance(
+ HddsProtos.ReplicationFactor.THREE), 1L,
+ HddsProtos.LifeCycleState.CLOSED);
+ final Set<ContainerReplica> replicas =
+ createReplicas(container.containerID(), CLOSED, 0, 0);
+ final Set<ContainerReplica> quasiClosedReplica =
+ createReplicas(container.containerID(), QUASI_CLOSED, 0);
+ replicas.addAll(quasiClosedReplica);
+
+ final RatisContainerReplicaCount crc =
+ new RatisContainerReplicaCount(container, replicas,
+ Collections.emptyList(), 2, false);
+ validate(crc, true, 0, false, false);
+ assertTrue(crc.isSufficientlyReplicated(true));
+ assertEquals(0, crc.getUnhealthyReplicaCount());
+
+ // With additional unhealthy replica
+
+ final Set<ContainerReplica> unhealthyReplica =
+ createReplicas(container.containerID(), UNHEALTHY, 0);
+ replicas.addAll(unhealthyReplica);
+
+ final RatisContainerReplicaCount crcWithUnhealthy =
+ new RatisContainerReplicaCount(container, replicas,
+ Collections.emptyList(), 2, false);
+ validate(crcWithUnhealthy, true, 0, false, false);
+ assertTrue(crcWithUnhealthy.isSufficientlyReplicated(true));
+ assertEquals(1, crcWithUnhealthy.getUnhealthyReplicaCount());
+ }
+
+ @Test
+ void testQuasiClosedReplicaWithInCorrectSequenceID() {
+
+ final long sequenceID = 101;
+ final ContainerInfo container =
+ createContainerInfo(RatisReplicationConfig.getInstance(
+ HddsProtos.ReplicationFactor.THREE), 1L,
+ HddsProtos.LifeCycleState.CLOSED, sequenceID);
+ final ContainerID containerID = container.containerID();
+
+ final ContainerReplica replicaOne = createContainerReplica(
+ containerID, 0, IN_SERVICE, State.CLOSED, sequenceID);
+ final ContainerReplica replicaTwo = createContainerReplica(
+ containerID, 0, IN_SERVICE, State.CLOSED, sequenceID);
+
+ final ContainerReplica quasiCloseReplica = createContainerReplica(
+ containerID, 0, IN_SERVICE, State.QUASI_CLOSED, sequenceID - 5);
+
+ final Set<ContainerReplica> replicas = new HashSet<>();
+ replicas.add(replicaOne);
+ replicas.add(replicaTwo);
+ replicas.add(quasiCloseReplica);
+
+ final RatisContainerReplicaCount crc =
+ new RatisContainerReplicaCount(container, replicas,
+ Collections.emptyList(), 2, false);
+ validate(crc, false, 1, false, false);
+ assertFalse(crc.isSufficientlyReplicated(true));
+ assertEquals(1, crc.getUnhealthyReplicaCount());
+
+ // With additional unhealthy replica
+
+ final Set<ContainerReplica> unhealthyReplica =
+ createReplicas(container.containerID(), UNHEALTHY, 0);
+ replicas.addAll(unhealthyReplica);
+
+ final RatisContainerReplicaCount crcWithUnhealthy =
+ new RatisContainerReplicaCount(container, replicas,
+ Collections.emptyList(), 2, false);
+ validate(crcWithUnhealthy, false, 1, false, false);
+ assertFalse(crcWithUnhealthy.isSufficientlyReplicated(true));
+ assertEquals(2, crcWithUnhealthy.getUnhealthyReplicaCount());
+ }
+
private void validate(RatisContainerReplicaCount rcnt,
boolean sufficientlyReplicated, int replicaDelta,
boolean overReplicated, boolean insufficientDueToOutOfService) {
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
index c83c2a30d9..5a9fd20288 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto;
+import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State;
import org.apache.hadoop.hdds.scm.PlacementPolicy;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
@@ -50,6 +51,7 @@ import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;
+import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainer;
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerReplica;
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createReplicas;
@@ -278,6 +280,34 @@ public class TestRatisOverReplicationHandler {
testProcessing(replicas, pendingOps, getOverReplicatedHealthResult(), 0);
}
+ @Test
+ public void testOverReplicationOfQuasiClosedReplicaWithWrongSequenceID()
+ throws IOException {
+ final long sequenceID = 20;
+ container = ReplicationTestUtil.createContainerInfo(
+ RATIS_REPLICATION_CONFIG, 1,
+ HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+ final Set<ContainerReplica> replicas = new HashSet<>(2);
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+
+ final ContainerReplica quasiClosedReplica =
+ createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.QUASI_CLOSED, sequenceID - 1);
+ replicas.add(quasiClosedReplica);
+ testProcessing(replicas, Collections.emptyList(),
+ getOverReplicatedHealthResult(), 0);
+
+ // Add another CLOSED replica
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, ContainerReplicaProto.State.CLOSED, sequenceID));
+
+ testProcessing(replicas, Collections.emptyList(),
+ getOverReplicatedHealthResult(), 1);
+ }
@Test
public void testDeleteThrottlingMisMatchedReplica() throws IOException {
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
index e2cdea09cc..d83249e46e 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
@@ -57,6 +57,7 @@ import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalSt
import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE;
import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_MAINTENANCE;
import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
+import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerInfo;
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerReplica;
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createReplicas;
import static org.mockito.ArgumentMatchers.any;
@@ -384,6 +385,54 @@ public class TestRatisUnderReplicationHandler {
Assertions.assertTrue(excludedNodes.contains(pendingRemove));
}
+ @Test
+ public void testUnderReplicationDueToQuasiClosedReplicaWithWrongSequenceID()
+ throws IOException {
+ final long sequenceID = 20;
+ container = ReplicationTestUtil.createContainerInfo(
+ RATIS_REPLICATION_CONFIG, 1,
+ HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+ final Set<ContainerReplica> replicas = new HashSet<>(2);
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+
+ final ContainerReplica quasiClosedReplica =
+ createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.QUASI_CLOSED, sequenceID - 1);
+ replicas.add(quasiClosedReplica);
+
+ final Set<Pair<DatanodeDetails, SCMCommand<?>>> commands =
+ testProcessing(replicas, Collections.emptyList(),
+ getUnderReplicatedHealthResult(), 2, 2);
+ commands.forEach(
+ command -> Assert.assertNotEquals(
+ quasiClosedReplica.getDatanodeDetails(),
+ command.getKey()));
+ }
+
+ @Test
+ public void testOnlyQuasiClosedReplicaWithWrongSequenceIdIsAvailable()
+ throws IOException {
+ final long sequenceID = 20;
+ container = createContainerInfo(RATIS_REPLICATION_CONFIG, 1,
+ HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+ final Set<ContainerReplica> replicas = new HashSet<>(1);
+ final ContainerReplica quasiClosedReplica =
+ createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.QUASI_CLOSED, sequenceID - 1);
+ replicas.add(quasiClosedReplica);
+
+ final Set<Pair<DatanodeDetails, SCMCommand<?>>> commands =
+ testProcessing(replicas, Collections.emptyList(),
+ getUnderReplicatedHealthResult(), 2, 2);
+ commands.forEach(
+ command -> Assert.assertEquals(
+ quasiClosedReplica.getDatanodeDetails(),
+ command.getKey()));
+ }
+
/**
* Tests whether the specified expectNumCommands number of commands are
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
index e68af943b1..e764db7eff 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
@@ -406,6 +406,61 @@ public class TestReplicationManager {
commandsSent.iterator().next().getValue().getType());
}
+ /**
+ * When there is Quasi Closed Replica with incorrect sequence id
+ * for a Closed container, it's treated as unhealthy and deleted.
+ * The deletion should happen only after we re-replicate a healthy
+ * replica.
+ */
+ @Test
+ public void testClosedContainerWithQuasiClosedReplicaWithWrongSequence()
+ throws IOException, NodeNotFoundException {
+ final RatisReplicationConfig ratisRepConfig =
+ RatisReplicationConfig.getInstance(THREE);
+ final long sequenceId = 101;
+
+ final ContainerInfo container = createContainerInfo(ratisRepConfig, 1,
+ HddsProtos.LifeCycleState.CLOSED);
+ final ContainerReplica replicaOne = createContainerReplica(
+ ContainerID.valueOf(1), 0, IN_SERVICE,
+ ContainerReplicaProto.State.CLOSED, sequenceId);
+ final ContainerReplica replicaTwo = createContainerReplica(
+ ContainerID.valueOf(1), 0, IN_SERVICE,
+ ContainerReplicaProto.State.CLOSED, sequenceId);
+
+ final ContainerReplica quasiCloseReplica = createContainerReplica(
+ ContainerID.valueOf(1), 0, IN_SERVICE,
+ ContainerReplicaProto.State.QUASI_CLOSED, sequenceId - 5);
+
+ Set<ContainerReplica> replicas = new HashSet<>();
+ replicas.add(replicaOne);
+ replicas.add(replicaTwo);
+ replicas.add(quasiCloseReplica);
+ storeContainerAndReplicas(container, replicas);
+
+ replicationManager.processContainer(container, repQueue, repReport);
+ assertEquals(1, repReport.getStat(
+ ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+ assertEquals(0, repReport.getStat(
+ ReplicationManagerReport.HealthState.OVER_REPLICATED));
+ assertEquals(1, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+
+ // Add the third CLOSED replica
+ replicas.add(createContainerReplica(
+ ContainerID.valueOf(1), 0, IN_SERVICE,
+ ContainerReplicaProto.State.CLOSED, sequenceId));
+ storeContainerAndReplicas(container, replicas);
+
+ replicationManager.processContainer(container, repQueue, repReport);
+ assertEquals(1, repReport.getStat(
+ ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+ assertEquals(1, repReport.getStat(
+ ReplicationManagerReport.HealthState.OVER_REPLICATED));
+ assertEquals(1, repQueue.underReplicatedQueueSize());
+ assertEquals(1, repQueue.overReplicatedQueueSize());
+ }
+
@Test
public void testHealthyContainer() throws ContainerNotFoundException {
ContainerInfo container = createContainerInfo(repConfig, 1,
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
index a960e6549c..e2de891966 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
@@ -37,6 +37,7 @@ import
org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult.Ov
import
org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult.UnderReplicatedHealthResult;
import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationQueue;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
@@ -46,6 +47,7 @@ import org.mockito.Mockito;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.HashSet;
import java.util.List;
import java.util.Set;
@@ -828,4 +830,66 @@ public class TestRatisReplicationCheckHandler {
ReplicationManagerReport.HealthState.MIS_REPLICATED));
}
+ @Test
+ public void testWithQuasiClosedReplicas() {
+ final long sequenceID = 20;
+ final ContainerInfo container = ReplicationTestUtil.createContainerInfo(
+ repConfig, 1, HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+ final Set<ContainerReplica> replicas = new HashSet<>(2);
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+
+ final ContainerReplica quasiClosedReplica =
+ createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.QUASI_CLOSED, sequenceID);
+ replicas.add(quasiClosedReplica);
+ requestBuilder.setContainerReplicas(replicas)
+ .setContainerInfo(container);
+ final ContainerHealthResult result =
+ healthCheck.checkHealth(requestBuilder.build());
+
+ assertEquals(HealthState.HEALTHY, result.getHealthState());
+
+ assertFalse(healthCheck.handle(requestBuilder.build()));
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ }
+
+ @Test
+ public void testWithQuasiClosedReplicasWithWrongSequenceID() {
+ final long sequenceID = 20;
+ final ContainerInfo container = ReplicationTestUtil.createContainerInfo(
+ repConfig, 1, HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+ final Set<ContainerReplica> replicas = new HashSet<>(2);
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+
+ final ContainerReplica quasiClosedReplica =
+ createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.QUASI_CLOSED, sequenceID - 1);
+ replicas.add(quasiClosedReplica);
+
+ requestBuilder.setContainerReplicas(replicas)
+ .setContainerInfo(container);
+ UnderReplicatedHealthResult result = (UnderReplicatedHealthResult)
+ healthCheck.checkHealth(requestBuilder.build());
+
+ assertEquals(HealthState.UNDER_REPLICATED, result.getHealthState());
+ assertEquals(1, result.getRemainingRedundancy());
+ assertFalse(result.isReplicatedOkAfterPending());
+ assertFalse(result.underReplicatedDueToOutOfService());
+
+ assertTrue(healthCheck.handle(requestBuilder.build()));
+ assertEquals(1, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(1, report.getStat(
+ ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+ }
+
}
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisUnhealthyReplicationCheckHandler.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisUnhealthyReplicationCheckHandler.java
index f17cef2a0d..91b2a143c1 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisUnhealthyReplicationCheckHandler.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisUnhealthyReplicationCheckHandler.java
@@ -24,6 +24,7 @@ import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto;
+import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
@@ -31,11 +32,13 @@ import
org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest;
import org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult;
import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationQueue;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.Collections;
+import java.util.HashSet;
import java.util.List;
import java.util.Set;
@@ -333,4 +336,59 @@ public class TestRatisUnhealthyReplicationCheckHandler {
assertEquals(1,
report.getStat(ReplicationManagerReport.HealthState.UNHEALTHY));
}
+
+ @Test
+ public void testWithQuasiClosedReplica() {
+ final long sequenceID = 20;
+ final ContainerInfo container = ReplicationTestUtil.createContainerInfo(
+ repConfig, 1, HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+ final Set<ContainerReplica> replicas = new HashSet<>(2);
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+
+ final ContainerReplica quasiClosedReplica =
+ createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.QUASI_CLOSED, sequenceID);
+ replicas.add(quasiClosedReplica);
+
+ requestBuilder.setContainerReplicas(replicas).setContainerInfo(container);
+ assertFalse(handler.handle(requestBuilder.build()));
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ }
+
+ @Test
+ public void overReplicationCheckWithQuasiClosedReplica() {
+ final long sequenceID = 20;
+ final ContainerInfo container = ReplicationTestUtil.createContainerInfo(
+ repConfig, 1, HddsProtos.LifeCycleState.CLOSED, sequenceID);
+
+ final Set<ContainerReplica> replicas = new HashSet<>(2);
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+ replicas.add(createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.CLOSED, sequenceID));
+
+ final ContainerReplica quasiClosedReplica =
+ createContainerReplica(container.containerID(), 0,
+ IN_SERVICE, State.QUASI_CLOSED, sequenceID - 1);
+ replicas.add(quasiClosedReplica);
+
+ requestBuilder.setContainerReplicas(replicas).setContainerInfo(container);
+ assertTrue(handler.handle(requestBuilder.build()));
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(1, repQueue.overReplicatedQueueSize());
+ assertEquals(1, report.getStat(
+ ReplicationManagerReport.HealthState.OVER_REPLICATED));
+ assertEquals(1,
+ report.getStat(ReplicationManagerReport.HealthState.UNHEALTHY));
+ }
+
+
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]