This is an automated email from the ASF dual-hosted git repository.
siddhant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 551ef3c020 HDDS-7640. EC: UNHEALTHY replicas not replaced by healthy
replicas from a CLOSED container by RM (#4083)
551ef3c020 is described below
commit 551ef3c0206b3db3d5ec3250c35abbd5c5826be6
Author: Siddhant Sangwan <[email protected]>
AuthorDate: Wed Dec 14 19:40:30 2022 +0530
HDDS-7640. EC: UNHEALTHY replicas not replaced by healthy replicas from a
CLOSED container by RM (#4083)
---
.../ClosedWithMismatchedReplicasHandler.java | 61 +++++++--------
.../replication/TestReplicationManager.java | 90 ++++++++++++++++++++++
.../TestClosedWithMismatchedReplicasHandler.java | 16 +++-
3 files changed, 133 insertions(+), 34 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
index f5bb19323a..eaf6664b5b 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
@@ -23,11 +23,10 @@ import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-import java.util.Iterator;
-import java.util.List;
import java.util.Set;
-import java.util.stream.Collectors;
/**
* Handler to process containers which are closed, but some replicas are still
@@ -35,6 +34,8 @@ import java.util.stream.Collectors;
* mis-matched replica to close it.
*/
public class ClosedWithMismatchedReplicasHandler extends AbstractCheck {
+ public static final Logger LOG =
+ LoggerFactory.getLogger(ClosedWithMismatchedReplicasHandler.class);
private ReplicationManager replicationManager;
@@ -43,6 +44,13 @@ public class ClosedWithMismatchedReplicasHandler extends
AbstractCheck {
this.replicationManager = replicationManager;
}
+ /**
+ * Handles CLOSED EC or RATIS container. If some replicas are CLOSING or
+ * OPEN, this sends a force-close command for them.
+ * @param request ContainerCheckRequest object representing the container
+ * @return always returns true so that other handlers in the chain can fix
+ * issues such as under replication
+ */
@Override
public boolean handle(ContainerCheckRequest request) {
ContainerInfo containerInfo = request.getContainerInfo();
@@ -51,39 +59,32 @@ public class ClosedWithMismatchedReplicasHandler extends
AbstractCheck {
// Handler is only relevant for CLOSED containers.
return false;
}
- List<ContainerReplica> unhealthyReplicas = replicas.stream()
- .filter(r -> !ReplicationManager
- .compareState(containerInfo.getState(), r.getState()))
- .collect(Collectors.toList());
- if (unhealthyReplicas.size() > 0) {
- handleUnhealthyReplicas(containerInfo, unhealthyReplicas);
- return true;
+ // close replica if its state is OPEN or CLOSING
+ for (ContainerReplica replica : replicas) {
+ if (isMismatched(replica)) {
+ LOG.debug("Sending close command for mismatched replica {} of " +
+ "container {}.", replica, containerInfo);
+ replicationManager.sendCloseContainerReplicaCommand(
+ containerInfo, replica.getDatanodeDetails(), true);
+ }
}
+
+ /*
+ This handler is unique because it always returns false. This allows
+ handlers further in the chain to fix issues such as under replication.
+ */
return false;
}
/**
- * Handles unhealthy container.
- * A container is inconsistent if any of the replica state doesn't
- * match the container state. We have to take appropriate action
- * based on state of the replica.
- *
- * @param container ContainerInfo
- * @param unhealthyReplicas List of ContainerReplica
+ * If a CLOSED container has an OPEN or CLOSING replica, there is a state
+ * mismatch.
+ * @param replica replica to check for mismatch
+ * @return true if the replica is in CLOSING or OPEN state, else false
*/
- private void handleUnhealthyReplicas(final ContainerInfo container,
- List<ContainerReplica> unhealthyReplicas) {
- Iterator<ContainerReplica> iterator = unhealthyReplicas.iterator();
- while (iterator.hasNext()) {
- final ContainerReplica replica = iterator.next();
- final ContainerReplicaProto.State state = replica.getState();
- if (state == ContainerReplicaProto.State.OPEN
- || state == ContainerReplicaProto.State.CLOSING) {
- replicationManager.sendCloseContainerReplicaCommand(
- container, replica.getDatanodeDetails(), true);
- iterator.remove();
- }
- }
+ private boolean isMismatched(ContainerReplica replica) {
+ return replica.getState() == ContainerReplicaProto.State.OPEN ||
+ replica.getState() == ContainerReplicaProto.State.CLOSING;
}
}
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
index 92ec499570..f537ec5628 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
@@ -62,9 +62,11 @@ import java.util.Set;
import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT;
import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.DECOMMISSIONING;
+import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor.THREE;
import static
org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp.PendingOpType.ADD;
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerInfo;
+import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createContainerReplica;
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createReplicas;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyInt;
@@ -245,6 +247,94 @@ public class TestReplicationManager {
ReplicationManagerReport.HealthState.MISSING));
}
+ /**
+ * A closed EC container with 3 closed and 2 unhealthy replicas is under
+ * replicated. RM should add it to under replicated queue.
+ */
+ @Test
+ public void testUnderReplicatedClosedContainerWithUnhealthyReplicas()
+ throws ContainerNotFoundException {
+ ContainerInfo container = createContainerInfo(repConfig, 1,
+ HddsProtos.LifeCycleState.CLOSED);
+ Set<ContainerReplica> replicas = addReplicas(container,
+ ContainerReplicaProto.State.CLOSED, 1, 2, 3);
+ ContainerReplica unhealthyReplica1 =
+ createContainerReplica(container.containerID(), 4,
+ IN_SERVICE, ContainerReplicaProto.State.UNHEALTHY);
+ ContainerReplica unhealthyReplica2 =
+ createContainerReplica(container.containerID(), 5,
+ IN_SERVICE, ContainerReplicaProto.State.UNHEALTHY);
+ replicas.add(unhealthyReplica1);
+ replicas.add(unhealthyReplica2);
+
+ replicationManager.processContainer(
+ container, repQueue, repReport);
+
+ Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
+ Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+ Assert.assertEquals(1, repReport.getStat(
+ ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+ }
+
+ /**
+ * A closed EC container with 2 closed and 3 unhealthy replicas is
+ * unrecoverable. It should not be queued to under replicated queue but
+ * should be recorded as missing (currently, we're calling an unrecoverable
+ * EC container missing).
+ */
+ @Test
+ public void testUnrecoverableClosedContainerWithUnhealthyReplicas()
+ throws ContainerNotFoundException {
+ ContainerInfo container = createContainerInfo(repConfig, 1,
+ HddsProtos.LifeCycleState.CLOSED);
+ Set<ContainerReplica> replicas = addReplicas(container,
+ ContainerReplicaProto.State.UNHEALTHY, 3, 4, 5);
+ ContainerReplica closedReplica1 =
+ createContainerReplica(container.containerID(), 1,
+ IN_SERVICE, ContainerReplicaProto.State.CLOSED);
+ ContainerReplica closedReplica2 =
+ createContainerReplica(container.containerID(), 2,
+ IN_SERVICE, ContainerReplicaProto.State.CLOSED);
+ replicas.add(closedReplica1);
+ replicas.add(closedReplica2);
+
+ replicationManager.processContainer(
+ container, repQueue, repReport);
+
+ Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
+ Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+ Assert.assertEquals(1, repReport.getStat(
+ ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+ Assert.assertEquals(1, repReport.getStat(
+ ReplicationManagerReport.HealthState.MISSING));
+ }
+
+ @Test
+ public void
+ testUnderReplicatedClosedContainerWithUnHealthyAndClosingReplicas()
+ throws ContainerNotFoundException {
+ ContainerInfo container = createContainerInfo(repConfig, 1,
+ HddsProtos.LifeCycleState.CLOSED);
+ Set<ContainerReplica> replicas = addReplicas(container,
+ ContainerReplicaProto.State.CLOSED, 1, 2, 3);
+ ContainerReplica unhealthyReplica1 =
+ createContainerReplica(container.containerID(), 4,
+ IN_SERVICE, ContainerReplicaProto.State.UNHEALTHY);
+ ContainerReplica unhealthyReplica2 =
+ createContainerReplica(container.containerID(), 5,
+ IN_SERVICE, ContainerReplicaProto.State.CLOSING);
+ replicas.add(unhealthyReplica1);
+ replicas.add(unhealthyReplica2);
+
+ replicationManager.processContainer(
+ container, repQueue, repReport);
+
+ Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
+ Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+ Assert.assertEquals(1, repReport.getStat(
+ ReplicationManagerReport.HealthState.UNDER_REPLICATED));
+ }
+
@Test
public void testUnderAndOverReplicated()
throws ContainerNotFoundException {
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestClosedWithMismatchedReplicasHandler.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestClosedWithMismatchedReplicasHandler.java
index ff2faa0ebc..f2b3a72885 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestClosedWithMismatchedReplicasHandler.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestClosedWithMismatchedReplicasHandler.java
@@ -99,7 +99,7 @@ public class TestClosedWithMismatchedReplicasHandler {
}
@Test
- public void testClosedMissMatchContainerReturnsTrue() {
+ public void testCloseCommandSentForMismatchedReplicas() {
ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo(
ecReplicationConfig, 1, CLOSED);
Set<ContainerReplica> containerReplicas = ReplicationTestUtil
@@ -126,7 +126,10 @@ public class TestClosedWithMismatchedReplicasHandler {
.setContainerInfo(containerInfo)
.setContainerReplicas(containerReplicas)
.build();
- Assertions.assertTrue(handler.handle(request));
+
+ // this handler always returns false so other handlers can fix issues
+ // such as under replication
+ Assertions.assertFalse(handler.handle(request));
Mockito.verify(replicationManager, times(1))
.sendCloseContainerReplicaCommand(
@@ -134,6 +137,7 @@ public class TestClosedWithMismatchedReplicasHandler {
Mockito.verify(replicationManager, times(1))
.sendCloseContainerReplicaCommand(
containerInfo, mismatch2.getDatanodeDetails(), true);
+ // close command should not be sent for unhealthy replica
Mockito.verify(replicationManager, times(0))
.sendCloseContainerReplicaCommand(
containerInfo, mismatch3.getDatanodeDetails(), true);
@@ -177,7 +181,7 @@ public class TestClosedWithMismatchedReplicasHandler {
}
@Test
- public void testClosedMissMatchRatisContainerReturnsTrue() {
+ public void testCloseCommandSentForMismatchedRatisReplicas() {
ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo(
ratisReplicationConfig, 1, CLOSED);
ContainerReplica mismatch1 = ReplicationTestUtil.createContainerReplica(
@@ -202,7 +206,10 @@ public class TestClosedWithMismatchedReplicasHandler {
.setContainerInfo(containerInfo)
.setContainerReplicas(containerReplicas)
.build();
- Assertions.assertTrue(handler.handle(request));
+
+ // this handler always returns false so other handlers can fix issues
+ // such as under replication
+ Assertions.assertFalse(handler.handle(request));
Mockito.verify(replicationManager, times(1))
.sendCloseContainerReplicaCommand(
@@ -210,6 +217,7 @@ public class TestClosedWithMismatchedReplicasHandler {
Mockito.verify(replicationManager, times(1))
.sendCloseContainerReplicaCommand(
containerInfo, mismatch2.getDatanodeDetails(), true);
+ // close command should not be sent for unhealthy replica
Mockito.verify(replicationManager, times(0))
.sendCloseContainerReplicaCommand(
containerInfo, mismatch3.getDatanodeDetails(), true);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]