This is an automated email from the ASF dual-hosted git repository.
sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new ffe2a2e53b HDDS-9737. Legacy Replication Manager should consider that
UNHEALTHY replicas might be decommissioning (#5674)
ffe2a2e53b is described below
commit ffe2a2e53bea60863d1517d20c3bf4a7b9c4c043
Author: Siddhant Sangwan <[email protected]>
AuthorDate: Mon Nov 27 18:02:36 2023 +0530
HDDS-9737. Legacy Replication Manager should consider that UNHEALTHY
replicas might be decommissioning (#5674)
---
.../LegacyRatisContainerReplicaCount.java | 8 +++
.../replication/LegacyReplicationManager.java | 26 ++++++++-
.../replication/TestLegacyReplicationManager.java | 29 ++++++++++
.../hdds/scm/node/TestDatanodeAdminMonitor.java | 67 ++++++++++++++++++++++
4 files changed, 128 insertions(+), 2 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyRatisContainerReplicaCount.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyRatisContainerReplicaCount.java
index 90c7afebcc..f708ae1ead 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyRatisContainerReplicaCount.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyRatisContainerReplicaCount.java
@@ -23,6 +23,7 @@ import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.node.NodeManager;
+import java.util.List;
import java.util.Set;
import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
@@ -50,6 +51,13 @@ public class LegacyRatisContainerReplicaCount extends
minHealthyForMaintenance);
}
+ public LegacyRatisContainerReplicaCount(ContainerInfo container,
+ Set<ContainerReplica> replicas, List<ContainerReplicaOp> pendingOps,
+ int minHealthyForMaintenance, boolean considerUnhealthy) {
+ super(container, replicas, pendingOps, minHealthyForMaintenance,
+ considerUnhealthy);
+ }
+
@Override
protected int healthyReplicaCountAdapter() {
return -getMisMatchedReplicaCount();
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
index 9a51537028..07a8f730ec 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java
@@ -1051,7 +1051,7 @@ public class LegacyReplicationManager {
synchronized (container) {
final Set<ContainerReplica> replica = containerManager
.getContainerReplicas(container.containerID());
- return getContainerReplicaCount(container, replica);
+ return getReplicaCountOptionallyConsiderUnhealthy(container, replica);
}
}
@@ -1076,6 +1076,28 @@ public class LegacyReplicationManager {
minHealthyForMaintenance);
}
+ private RatisContainerReplicaCount
getReplicaCountOptionallyConsiderUnhealthy(
+ ContainerInfo container, Set<ContainerReplica> replicas) {
+ LegacyRatisContainerReplicaCount withUnhealthy =
+ new LegacyRatisContainerReplicaCount(container, replicas,
+ getPendingOps(container.containerID()), minHealthyForMaintenance,
+ true);
+ if (withUnhealthy.getHealthyReplicaCount() == 0 &&
+ withUnhealthy.getUnhealthyReplicaCount() > 0) {
+ // if the container has only UNHEALTHY replicas, return the correct
+ // RatisContainerReplicaCount object which can handle UNHEALTHY replicas
+ return withUnhealthy;
+ }
+
+ return new LegacyRatisContainerReplicaCount(
+ container,
+ replicas,
+ getInflightAdd(container.containerID()),
+ getInflightDel(container.containerID()),
+ container.getReplicationConfig().getRequiredNodes(),
+ minHealthyForMaintenance);
+ }
+
/**
* Returns true if more than 50% of the container replicas with unique
* originNodeId are in QUASI_CLOSED state.
@@ -1345,7 +1367,7 @@ public class LegacyReplicationManager {
List<ContainerReplica> replicas = replicaSet.getReplicas();
RatisContainerReplicaCount unhealthyReplicaSet =
- new RatisContainerReplicaCount(container,
+ new LegacyRatisContainerReplicaCount(container,
new HashSet<>(replicaSet.getReplicas()),
getPendingOps(container.containerID()),
minHealthyForMaintenance,
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
index 43a2fb3263..c9bd4bddbd 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java
@@ -312,6 +312,35 @@ public class TestLegacyReplicationManager {
replicationManager.start();
Assertions.assertTrue(replicationManager.isRunning());
}
+
+ @Test
+ public void testGetContainerReplicaCount()
+ throws IOException, TimeoutException {
+ ContainerInfo container = createContainer(LifeCycleState.QUASI_CLOSED);
+ addReplica(container, NodeStatus.inServiceHealthy(), UNHEALTHY);
+ addReplica(container, NodeStatus.inServiceHealthy(), UNHEALTHY);
+ ContainerReplica decommissioningReplica =
+ addReplica(container, new NodeStatus(DECOMMISSIONING, HEALTHY),
+ UNHEALTHY);
+
+ ContainerReplicaCount replicaCount =
+ replicationManager.getLegacyReplicationManager()
+ .getContainerReplicaCount(container);
+
+ Assertions.assertTrue(
+ replicaCount instanceof LegacyRatisContainerReplicaCount);
+ Assertions.assertFalse(replicaCount.isSufficientlyReplicated());
+ Assertions.assertFalse(replicaCount.isSufficientlyReplicatedForOffline(
+ decommissioningReplica.getDatanodeDetails(), nodeManager));
+
+ addReplica(container, NodeStatus.inServiceHealthy(), UNHEALTHY);
+ replicaCount = replicationManager.getLegacyReplicationManager()
+ .getContainerReplicaCount(container);
+ Assertions.assertTrue(replicaCount.isSufficientlyReplicated());
+ Assertions.assertTrue(replicaCount.isSufficientlyReplicatedForOffline(
+ decommissioningReplica.getDatanodeDetails(), nodeManager));
+ Assertions.assertTrue(replicaCount.isHealthyEnoughForOffline());
+ }
}
/**
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
index 48afbe0682..09ff7cb0ff 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
@@ -42,6 +42,7 @@ import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.io.IOException;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
@@ -295,6 +296,72 @@ public class TestDatanodeAdminMonitor {
nodeManager.getNodeStatus(dn1).getOperationalState());
}
+ /**
+ * Consider a QUASI_CLOSED container with only UNHEALTHY replicas. If one
+ * of its nodes is decommissioned, the decommissioning should succeed.
+ */
+ @Test
+ public void testQuasiClosedContainerWithAllUnhealthyReplicas()
+ throws NodeNotFoundException, ContainerNotFoundException {
+ conf.setBoolean("hdds.scm.replication.enable.legacy", true);
+
+ DatanodeDetails decommissioningNode =
+ MockDatanodeDetails.randomDatanodeDetails();
+ nodeManager.register(decommissioningNode,
+ new NodeStatus(HddsProtos.NodeOperationalState.DECOMMISSIONING,
+ HddsProtos.NodeState.HEALTHY));
+ ContainerInfo container = ReplicationTestUtil.createContainer(
+ HddsProtos.LifeCycleState.QUASI_CLOSED,
+ RatisReplicationConfig.getInstance(
+ HddsProtos.ReplicationFactor.THREE));
+ Set<ContainerReplica> replicas =
+ ReplicationTestUtil.createReplicas(container.containerID(),
+ State.UNHEALTHY, 0, 0);
+
+ ContainerReplica decommissioningReplica =
+ ReplicationTestUtil.createContainerReplica(container.containerID(), 0,
+ DECOMMISSIONING, State.UNHEALTHY, container.getNumberOfKeys(),
+ container.getUsedBytes(), decommissioningNode,
+ decommissioningNode.getUuid());
+ replicas.add(decommissioningReplica);
+ nodeManager.setContainers(decommissioningNode,
+ ImmutableSet.of(container.containerID()));
+
+ Mockito.when(repManager.getContainerReplicaCount(
+ Mockito.eq(container.containerID())))
+ .thenReturn(new LegacyRatisContainerReplicaCount(container, replicas,
+ Collections.emptyList(), 2, true));
+
+ // start monitoring dn1
+ monitor.startMonitoring(decommissioningNode);
+ monitor.run();
+ assertEquals(1, monitor.getTrackedNodeCount());
+ assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONING,
+ nodeManager.getNodeStatus(decommissioningNode).getOperationalState());
+
+ // Running the monitor again causes it to remain DECOMMISSIONING
+ // as nothing has changed.
+ monitor.run();
+ assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONING,
+ nodeManager.getNodeStatus(decommissioningNode).getOperationalState());
+
+ // add a copy of the UNHEALTHY replica on a new node, decommissioningNode
+ // should get decommissioned now
+ ContainerReplica copyOfUnhealthyOnNewNode =
+ decommissioningReplica.toBuilder()
+ .setDatanodeDetails(MockDatanodeDetails.randomDatanodeDetails())
+ .build();
+ replicas.add(copyOfUnhealthyOnNewNode);
+ Mockito.when(repManager.getContainerReplicaCount(
+ Mockito.eq(container.containerID())))
+ .thenReturn(new LegacyRatisContainerReplicaCount(container, replicas,
+ Collections.emptyList(), 3, true));
+ monitor.run();
+ assertEquals(0, monitor.getTrackedNodeCount());
+ assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONED,
+ nodeManager.getNodeStatus(decommissioningNode).getOperationalState());
+ }
+
@Test
public void testDecommissionNotBlockedByDeletingContainers()
throws NodeNotFoundException, ContainerNotFoundException {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]