This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 04d6e0bc54 HDDS-10231. ContainerStateManager should not finalize OPEN
containers without a Pipeline. (#6123)
04d6e0bc54 is described below
commit 04d6e0bc5421ac4c666122d6755a4d132fc39cf0
Author: Nandakumar Vadivelu <[email protected]>
AuthorDate: Tue Jan 30 21:50:36 2024 +0530
HDDS-10231. ContainerStateManager should not finalize OPEN containers
without a Pipeline. (#6123)
---
.../scm/container/ReplicationManagerReport.java | 5 +-
.../container/TestReplicationManagerReport.java | 1 +
.../scm/container/ContainerStateManagerImpl.java | 10 ++--
.../container/replication/ReplicationManager.java | 10 ++++
.../replication/health/OpenContainerHandler.java | 24 +++++----
.../replication/TestReplicationManager.java | 14 +++++
.../health/TestOpenContainerHandler.java | 60 ++++++++++++++++++++++
7 files changed, 107 insertions(+), 17 deletions(-)
diff --git
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManagerReport.java
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManagerReport.java
index 1dbbc73843..a10846bd61 100644
---
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManagerReport.java
+++
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManagerReport.java
@@ -74,7 +74,10 @@ public class ReplicationManagerReport {
"OpenUnhealthyContainers"),
QUASI_CLOSED_STUCK(
"Containers QuasiClosed with insufficient datanode origins",
- "StuckQuasiClosedContainers");
+ "StuckQuasiClosedContainers"),
+ OPEN_WITHOUT_PIPELINE(
+ "Containers in OPEN state without any healthy Pipeline",
+ "OpenContainersWithoutPipeline");
private String description;
private String metricName;
diff --git
a/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManagerReport.java
b/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManagerReport.java
index 3bf2ef4023..f022a6030c 100644
---
a/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManagerReport.java
+++
b/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManagerReport.java
@@ -112,6 +112,7 @@ class TestReplicationManagerReport {
assertEquals(0, stats.get("EMPTY").longValue());
assertEquals(0, stats.get("OPEN_UNHEALTHY").longValue());
assertEquals(0, stats.get("QUASI_CLOSED_STUCK").longValue());
+ assertEquals(0, stats.get("OPEN_WITHOUT_PIPELINE").longValue());
JsonNode samples = json.get("samples");
assertEquals(ARRAY, samples.get("UNDER_REPLICATED").getNodeType());
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java
index 72d90abe1f..62e1f01935 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java
@@ -251,16 +251,12 @@ public final class ContainerStateManagerImpl
pipelineManager.addContainerToPipelineSCMStart(
container.getPipelineID(), container.containerID());
} catch (PipelineNotFoundException ex) {
+ // We are ignoring this here. The container will be moved to
+ // CLOSING state by ReplicationManager's OpenContainerHandler
+ // For more info: HDDS-10231
LOG.warn("Found container {} which is in OPEN state with " +
"pipeline {} that does not exist. Marking container for " +
"closing.", container, container.getPipelineID());
- try {
- updateContainerState(container.containerID().getProtobuf(),
- LifeCycleEvent.FINALIZE);
- } catch (InvalidStateTransitionException e) {
- // This cannot happen.
- LOG.warn("Unable to finalize Container {}.", container);
- }
}
}
}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
index 979cff799f..a3661243be 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
@@ -61,6 +61,7 @@ import org.apache.hadoop.hdds.scm.ha.SCMService;
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.node.NodeStatus;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
+import org.apache.hadoop.hdds.scm.pipeline.PipelineNotFoundException;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.hdds.server.events.EventPublisher;
import
org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
@@ -1545,5 +1546,14 @@ public class ReplicationManager implements SCMService {
private static boolean isEC(ReplicationConfig replicationConfig) {
return replicationConfig.getReplicationType() == EC;
}
+
+ public boolean hasHealthyPipeline(ContainerInfo container) {
+ try {
+ return scmContext.getScm().getPipelineManager()
+ .getPipeline(container.getPipelineID()) != null;
+ } catch (PipelineNotFoundException e) {
+ return false;
+ }
+ }
}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java
index 2c0b405db9..21c3c76d3e 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java
@@ -53,20 +53,26 @@ public class OpenContainerHandler extends AbstractCheck {
if (containerInfo.getState() == HddsProtos.LifeCycleState.OPEN) {
LOG.debug("Checking open container {} in OpenContainerHandler",
containerInfo);
- if (!isOpenContainerHealthy(
- containerInfo, request.getContainerReplicas())) {
- // This is an unhealthy open container, so we need to trigger the
- // close process on it.
- LOG.debug("Container {} is open but unhealthy. Triggering close.",
- containerInfo);
- request.getReport().incrementAndSample(
- ReplicationManagerReport.HealthState.OPEN_UNHEALTHY,
+ final boolean noPipeline =
!replicationManager.hasHealthyPipeline(containerInfo);
+ // Minor optimization. If noPipeline is true, isOpenContainerHealthy
will not
+ // be called.
+ final boolean unhealthy = noPipeline ||
!isOpenContainerHealthy(containerInfo,
+ request.getContainerReplicas());
+ if (unhealthy) {
+ // For an OPEN container, we close the container
+ // if the container has no Pipeline or if the container is unhealthy.
+ LOG.info("Container {} is open but {}. Triggering close.",
+ containerInfo, noPipeline ? "has no Pipeline" : "unhealthy");
+
+ request.getReport().incrementAndSample(noPipeline ?
+ ReplicationManagerReport.HealthState.OPEN_WITHOUT_PIPELINE :
+ ReplicationManagerReport.HealthState.OPEN_UNHEALTHY,
containerInfo.containerID());
+
if (!request.isReadOnly()) {
replicationManager
.sendCloseContainerEvent(containerInfo.containerID());
}
- return true;
}
// For open containers we do not want to do any further processing in RM
// so return true to stop the command chain.
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
index c67008c097..47844f32fb 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto;
import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto;
+import org.apache.hadoop.hdds.scm.HddsTestUtils;
import org.apache.hadoop.hdds.scm.PlacementPolicy;
import org.apache.hadoop.hdds.scm.container.ContainerID;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
@@ -43,6 +44,9 @@ import org.apache.hadoop.hdds.scm.ha.SCMServiceManager;
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.node.NodeStatus;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
+import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
+import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
+import org.apache.hadoop.hdds.security.token.ContainerTokenGenerator;
import org.apache.hadoop.hdds.server.events.EventPublisher;
import org.apache.hadoop.ozone.protocol.commands.DeleteContainerCommand;
import
org.apache.hadoop.ozone.protocol.commands.ReconstructECContainersCommand;
@@ -174,6 +178,16 @@ public class TestReplicationManager {
// Ensure that RM will run when asked.
when(scmContext.isLeaderReady()).thenReturn(true);
when(scmContext.isInSafeMode()).thenReturn(false);
+
+ PipelineManager pipelineManager = mock(PipelineManager.class);
+ when(pipelineManager.getPipeline(any()))
+ .thenReturn(HddsTestUtils.getRandomPipeline());
+
+ StorageContainerManager scm = mock(StorageContainerManager.class);
+ when(scm.getPipelineManager()).thenReturn(pipelineManager);
+
when(scm.getContainerTokenGenerator()).thenReturn(ContainerTokenGenerator.DISABLED);
+
+ when(scmContext.getScm()).thenReturn(scm);
}
private ReplicationManager createReplicationManager() throws IOException {
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestOpenContainerHandler.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestOpenContainerHandler.java
index a950008ec9..dec61610d1 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestOpenContainerHandler.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestOpenContainerHandler.java
@@ -24,17 +24,20 @@ import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolPro
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
+import
org.apache.hadoop.hdds.scm.container.ReplicationManagerReport.HealthState;
import org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
import java.util.Collections;
import java.util.Set;
import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.CLOSED;
import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.OPEN;
+import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.mockito.Mockito.mock;
@@ -58,6 +61,7 @@ public class TestOpenContainerHandler {
ratisReplicationConfig = RatisReplicationConfig.getInstance(
HddsProtos.ReplicationFactor.THREE);
replicationManager = mock(ReplicationManager.class);
+
Mockito.when(replicationManager.hasHealthyPipeline(any())).thenReturn(true);
openContainerHandler = new OpenContainerHandler(replicationManager);
}
@@ -119,8 +123,36 @@ public class TestOpenContainerHandler {
assertTrue(openContainerHandler.handle(readRequest));
verify(replicationManager, times(1))
.sendCloseContainerEvent(containerInfo.containerID());
+ assertEquals(1, request.getReport().getStat(HealthState.OPEN_UNHEALTHY));
}
+ @Test
+ public void testOpenContainerWithoutPipelineIsClosed() {
+
Mockito.when(replicationManager.hasHealthyPipeline(any())).thenReturn(false);
+ ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo(
+ ecReplicationConfig, 1, OPEN);
+ Set<ContainerReplica> containerReplicas = ReplicationTestUtil
+ .createReplicas(containerInfo.containerID(),
+ ContainerReplicaProto.State.OPEN, 1, 2, 3, 4);
+ ContainerCheckRequest request = new ContainerCheckRequest.Builder()
+ .setPendingOps(Collections.emptyList())
+ .setReport(new ReplicationManagerReport())
+ .setContainerInfo(containerInfo)
+ .setContainerReplicas(containerReplicas)
+ .build();
+ ContainerCheckRequest readRequest = new ContainerCheckRequest.Builder()
+ .setPendingOps(Collections.emptyList())
+ .setReport(new ReplicationManagerReport())
+ .setContainerInfo(containerInfo)
+ .setContainerReplicas(containerReplicas)
+ .setReadOnly(true)
+ .build();
+ assertTrue(openContainerHandler.handle(request));
+ assertTrue(openContainerHandler.handle(readRequest));
+ verify(replicationManager, times(1))
+ .sendCloseContainerEvent(containerInfo.containerID());
+ assertEquals(1,
request.getReport().getStat(HealthState.OPEN_WITHOUT_PIPELINE));
+ }
@Test
public void testClosedRatisContainerReturnsFalse() {
ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo(
@@ -178,5 +210,33 @@ public class TestOpenContainerHandler {
assertTrue(openContainerHandler.handle(request));
assertTrue(openContainerHandler.handle(readRequest));
verify(replicationManager, times(1)).sendCloseContainerEvent(any());
+ assertEquals(1, request.getReport().getStat(HealthState.OPEN_UNHEALTHY));
+ }
+
+ @Test
+ public void testOpenRatisContainerWithoutPipelineIsClosed() {
+
Mockito.when(replicationManager.hasHealthyPipeline(any())).thenReturn(false);
+ ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo(
+ ratisReplicationConfig, 1, OPEN);
+ Set<ContainerReplica> containerReplicas = ReplicationTestUtil
+ .createReplicas(containerInfo.containerID(),
+ ContainerReplicaProto.State.OPEN, 0, 0, 0);
+ ContainerCheckRequest request = new ContainerCheckRequest.Builder()
+ .setPendingOps(Collections.emptyList())
+ .setReport(new ReplicationManagerReport())
+ .setContainerInfo(containerInfo)
+ .setContainerReplicas(containerReplicas)
+ .build();
+ ContainerCheckRequest readRequest = new ContainerCheckRequest.Builder()
+ .setPendingOps(Collections.emptyList())
+ .setReport(new ReplicationManagerReport())
+ .setContainerInfo(containerInfo)
+ .setContainerReplicas(containerReplicas)
+ .setReadOnly(true)
+ .build();
+ assertTrue(openContainerHandler.handle(request));
+ assertTrue(openContainerHandler.handle(readRequest));
+ verify(replicationManager, times(1)).sendCloseContainerEvent(any());
+ assertEquals(1,
request.getReport().getStat(HealthState.OPEN_WITHOUT_PIPELINE));
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]