This is an automated email from the ASF dual-hosted git repository.
sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 3bc2b79764 HDDS-7788. Ratis OverReplicationHandler should exclude
stale replicas (#4183)
3bc2b79764 is described below
commit 3bc2b79764be1e98968474c49403f49745406290
Author: Stephen O'Donnell <[email protected]>
AuthorDate: Wed Jan 18 12:01:58 2023 +0000
HDDS-7788. Ratis OverReplicationHandler should exclude stale replicas
(#4183)
---
.../replication/RatisOverReplicationHandler.java | 26 +++++++++++++++++++---
.../container/replication/ReplicationManager.java | 2 +-
.../TestRatisOverReplicationHandler.java | 26 +++++++++++++++++++++-
3 files changed, 49 insertions(+), 5 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
index 982325d500..d3bef7d9e2 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisOverReplicationHandler.java
@@ -23,6 +23,7 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.scm.PlacementPolicy;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
+import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.ozone.protocol.commands.DeleteContainerCommand;
import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
import org.slf4j.Logger;
@@ -52,8 +53,12 @@ public class RatisOverReplicationHandler
public static final Logger LOG =
LoggerFactory.getLogger(RatisOverReplicationHandler.class);
- public RatisOverReplicationHandler(PlacementPolicy placementPolicy) {
+ private final NodeManager nodeManager;
+
+ public RatisOverReplicationHandler(PlacementPolicy placementPolicy,
+ NodeManager nodeManager) {
super(placementPolicy);
+ this.nodeManager = nodeManager;
}
/**
@@ -79,6 +84,20 @@ public class RatisOverReplicationHandler
ContainerInfo containerInfo = result.getContainerInfo();
LOG.debug("Handling container {}.", containerInfo);
+ // We are going to check for over replication, so we should filter out any
+ // replicas that are not in a HEALTHY state. This is because a replica can
+ // be healthy, stale or dead. If it is dead is will be quickly removed from
+ // scm. If it is state, there is a good chance the DN is offline and the
+ // replica will go away soon. So, if we have a container that is over
+ // replicated with a HEALTHY and STALE replica, and we decide to delete the
+ // HEALTHY one, and then the STALE ones goes away, we will lose them both.
+ // To avoid this, we will filter out any non-healthy replicas first.
+ Set<ContainerReplica> healthyReplicas = replicas.stream()
+ .filter(r -> ReplicationManager.getNodeStatus(
+ r.getDatanodeDetails(), nodeManager).isHealthy()
+ )
+ .collect(Collectors.toSet());
+
// count pending adds and deletes
int pendingAdd = 0, pendingDelete = 0;
for (ContainerReplicaOp op : pendingOps) {
@@ -89,8 +108,9 @@ public class RatisOverReplicationHandler
}
}
RatisContainerReplicaCount replicaCount =
- new RatisContainerReplicaCount(containerInfo, replicas, pendingAdd,
- pendingDelete, containerInfo.getReplicationFactor().getNumber(),
+ new RatisContainerReplicaCount(containerInfo, healthyReplicas,
+ pendingAdd, pendingDelete,
+ containerInfo.getReplicationFactor().getNumber(),
minHealthyForMaintenance);
// verify that this container is actually over replicated
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
index 6f93e81ead..665f58fa73 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
@@ -229,7 +229,7 @@ public class ReplicationManager implements SCMService {
ratisUnderReplicationHandler = new RatisUnderReplicationHandler(
ratisContainerPlacement, conf, nodeManager);
ratisOverReplicationHandler =
- new RatisOverReplicationHandler(ratisContainerPlacement);
+ new RatisOverReplicationHandler(ratisContainerPlacement, nodeManager);
underReplicatedProcessor =
new UnderReplicatedProcessor(this,
rmConf.getUnderReplicatedInterval());
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
index 46e844d9a7..0bf0f1d746 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisOverReplicationHandler.java
@@ -28,6 +28,8 @@ import org.apache.hadoop.hdds.scm.PlacementPolicy;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import
org.apache.hadoop.hdds.scm.container.placement.algorithms.ContainerPlacementStatusDefault;
+import org.apache.hadoop.hdds.scm.node.NodeManager;
+import org.apache.hadoop.hdds.scm.node.NodeStatus;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
import org.apache.ozone.test.GenericTestUtils;
@@ -56,6 +58,7 @@ public class TestRatisOverReplicationHandler {
private static final RatisReplicationConfig RATIS_REPLICATION_CONFIG =
RatisReplicationConfig.getInstance(HddsProtos.ReplicationFactor.THREE);
private PlacementPolicy policy;
+ private NodeManager nodeManager;
@Before
public void setup() throws NodeNotFoundException {
@@ -67,6 +70,10 @@ public class TestRatisOverReplicationHandler {
Mockito.anyList(), Mockito.anyInt()))
.thenReturn(new ContainerPlacementStatusDefault(2, 2, 3));
+ nodeManager = Mockito.mock(NodeManager.class);
+ Mockito.when(nodeManager.getNodeStatus(Mockito.any()))
+ .thenReturn(NodeStatus.inServiceHealthy());
+
GenericTestUtils.setLogLevel(RatisOverReplicationHandler.LOG, Level.DEBUG);
}
@@ -88,6 +95,23 @@ public class TestRatisOverReplicationHandler {
1);
}
+ /**
+ * Container has 4 replicas and 1 stale so none should be deleted.
+ */
+ @Test
+ public void testOverReplicatedClosedContainerWithStale() throws IOException,
+ NodeNotFoundException {
+ Set<ContainerReplica> replicas = createReplicas(container.containerID(),
+ ContainerReplicaProto.State.CLOSED, 0, 0, 0, 0);
+
+ ContainerReplica stale = replicas.stream().findFirst().get();
+ Mockito.when(nodeManager.getNodeStatus(stale.getDatanodeDetails()))
+ .thenReturn(NodeStatus.inServiceStale());
+
+ testProcessing(replicas, Collections.emptyList(),
+ getOverReplicatedHealthResult(), 0);
+ }
+
/**
* The container is quasi closed. All 4 replicas are quasi closed and
* originate from the same datanode. This container is over replicated.
@@ -261,7 +285,7 @@ public class TestRatisOverReplicationHandler {
ContainerHealthResult healthResult,
int expectNumCommands) throws IOException {
RatisOverReplicationHandler handler =
- new RatisOverReplicationHandler(policy);
+ new RatisOverReplicationHandler(policy, nodeManager);
Map<DatanodeDetails, SCMCommand<?>> commands =
handler.processAndCreateCommands(replicas, pendingOps,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]