This is an automated email from the ASF dual-hosted git repository.
ritesh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new ef480765d8 HDDS-7654. EC: ReplicationManager - merge mis-rep queue
into under replicated queue (#4099)
ef480765d8 is described below
commit ef480765d8eb74fdcdfc9f6a47200c541c60b06a
Author: Stephen O'Donnell <[email protected]>
AuthorDate: Mon Dec 19 17:29:13 2022 +0000
HDDS-7654. EC: ReplicationManager - merge mis-rep queue into under
replicated queue (#4099)
Co-authored-by: S O'Donnell <[email protected]>
---
.../replication/ContainerHealthResult.java | 35 ++++++++++++++++------
.../container/replication/ReplicationManager.java | 17 +++++++++--
.../container/replication/ReplicationQueue.java | 17 -----------
.../replication/TestReplicationManager.java | 15 ++++++++++
.../health/TestECReplicationCheckHandler.java | 6 +---
.../health/TestRatisReplicationCheckHandler.java | 5 +---
6 files changed, 58 insertions(+), 37 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
index 5d80268ef4..6393f0349d 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
@@ -113,7 +113,15 @@ public class ContainerHealthResult {
public UnderReplicatedHealthResult(ContainerInfo containerInfo,
int remainingRedundancy, boolean dueToDecommission,
boolean replicatedOkWithPending, boolean unrecoverable) {
- super(containerInfo, HealthState.UNDER_REPLICATED);
+ this(containerInfo, remainingRedundancy, dueToDecommission,
+ replicatedOkWithPending, unrecoverable,
HealthState.UNDER_REPLICATED);
+ }
+
+ protected UnderReplicatedHealthResult(ContainerInfo containerInfo,
+ int remainingRedundancy, boolean dueToDecommission,
+ boolean replicatedOkWithPending, boolean unrecoverable,
+ HealthState healthState) {
+ super(containerInfo, healthState);
this.remainingRedundancy = remainingRedundancy;
this.dueToDecommission = dueToDecommission;
this.sufficientlyReplicatedAfterPending = replicatedOkWithPending;
@@ -148,7 +156,7 @@ public class ContainerHealthResult {
if (dueToDecommission) {
result += DECOMMISSION_REDUNDANCY;
} else {
- result += remainingRedundancy;
+ result += getRemainingRedundancy();
}
return result;
}
@@ -207,19 +215,28 @@ public class ContainerHealthResult {
* containers are not spread across enough racks.
*/
public static class MisReplicatedHealthResult
- extends ContainerHealthResult {
+ extends UnderReplicatedHealthResult {
- private final boolean replicatedOkAfterPending;
+ /**
+ * In UnderReplicatedHealthState, DECOMMISSION_REDUNDANCY is defined as
+ * 5 so that containers which are really under replicated get fixed as a
+ * priority over decommissioning hosts. We have defined that a container
+ * can only be mis replicated if it is not over or under replicated. Fixing
+ * mis replication is arguably less important than competing a
decommission.
+ * So as a lot of mis replicated container do not block decommission, we
+ * set the redundancy of mis replicated containers to 6 so they sort after
+ * under / over replicated and decommissioning replicas in the under
+ * replication queue.
+ */
+ private static final int MIS_REP_REDUNDANCY = 6;
public MisReplicatedHealthResult(ContainerInfo containerInfo,
boolean replicatedOkAfterPending) {
- super(containerInfo, HealthState.MIS_REPLICATED);
- this.replicatedOkAfterPending = replicatedOkAfterPending;
+ super(containerInfo, MIS_REP_REDUNDANCY, false,
+ replicatedOkAfterPending, false,
+ HealthState.MIS_REPLICATED);
}
- public boolean isReplicatedOkAfterPending() {
- return replicatedOkAfterPending;
- }
}
/**
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
index 351eec7178..5fd0ad2ea2 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
@@ -159,6 +159,7 @@ public class ReplicationManager implements SCMService {
private ReplicationQueue replicationQueue;
private final ECUnderReplicationHandler ecUnderReplicationHandler;
private final ECOverReplicationHandler ecOverReplicationHandler;
+ private final ECMisReplicationHandler ecMisReplicationHandler;
private final RatisUnderReplicationHandler ratisUnderReplicationHandler;
private final RatisOverReplicationHandler ratisOverReplicationHandler;
private final int maintenanceRedundancy;
@@ -223,6 +224,8 @@ public class ReplicationManager implements SCMService {
ecContainerPlacement, conf, nodeManager, this);
ecOverReplicationHandler =
new ECOverReplicationHandler(ecContainerPlacement, nodeManager);
+ ecMisReplicationHandler = new ECMisReplicationHandler(ecContainerPlacement,
+ conf, nodeManager);
ratisUnderReplicationHandler = new RatisUnderReplicationHandler(
ratisContainerPlacement, conf, nodeManager);
ratisOverReplicationHandler =
@@ -525,8 +528,18 @@ public class ReplicationManager implements SCMService {
List<ContainerReplicaOp> pendingOps =
containerReplicaPendingOps.getPendingOps(containerID);
if (result.getContainerInfo().getReplicationType() == EC) {
- return ecUnderReplicationHandler.processAndCreateCommands(replicas,
- pendingOps, result, maintenanceRedundancy);
+ if (result.getHealthState()
+ == ContainerHealthResult.HealthState.UNDER_REPLICATED) {
+ return ecUnderReplicationHandler.processAndCreateCommands(replicas,
+ pendingOps, result, maintenanceRedundancy);
+ } else if (result.getHealthState()
+ == ContainerHealthResult.HealthState.MIS_REPLICATED) {
+ return ecMisReplicationHandler.processAndCreateCommands(replicas,
+ pendingOps, result, maintenanceRedundancy);
+ } else {
+ throw new IllegalArgumentException("Unexpected health state: "
+ + result.getHealthState());
+ }
}
return ratisUnderReplicationHandler.processAndCreateCommands(replicas,
pendingOps, result, ratisMaintenanceMinReplicas);
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationQueue.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationQueue.java
index a14d21c76c..d27c1d9c61 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationQueue.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationQueue.java
@@ -32,8 +32,6 @@ public class ReplicationQueue {
underRepQueue;
private final Queue<ContainerHealthResult.OverReplicatedHealthResult>
overRepQueue;
- private final Queue<ContainerHealthResult.MisReplicatedHealthResult>
- misRepQueue;
public ReplicationQueue() {
underRepQueue = new PriorityQueue<>(
@@ -42,7 +40,6 @@ public class ReplicationQueue {
.thenComparing(ContainerHealthResult
.UnderReplicatedHealthResult::getRequeueCount));
overRepQueue = new LinkedList<>();
- misRepQueue = new LinkedList<>();
}
public void enqueue(ContainerHealthResult.UnderReplicatedHealthResult
@@ -55,11 +52,6 @@ public class ReplicationQueue {
overRepQueue.add(overReplicatedHealthResult);
}
- public void enqueue(ContainerHealthResult.MisReplicatedHealthResult
- misReplicatedHealthResult) {
- misRepQueue.add(misReplicatedHealthResult);
- }
-
public ContainerHealthResult.UnderReplicatedHealthResult
dequeueUnderReplicatedContainer() {
return underRepQueue.poll();
@@ -70,11 +62,6 @@ public class ReplicationQueue {
return overRepQueue.poll();
}
- public ContainerHealthResult.MisReplicatedHealthResult
- dequeueMisReplicatedContainer() {
- return misRepQueue.poll();
- }
-
public int underReplicatedQueueSize() {
return underRepQueue.size();
}
@@ -83,8 +70,4 @@ public class ReplicationQueue {
return overRepQueue.size();
}
- public int misReplicatedQueueSize() {
- return misRepQueue.size();
- }
-
}
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
index f537ec5628..dd4a9e385c 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
@@ -390,6 +390,13 @@ public class TestReplicationManager {
@Test
public void testUnderReplicationQueuePopulated() {
+ // Make it always return mis-replicated. Only a perfectly replicated
+ // container should make it the mis-replicated state as under / over
+ // replicated take precedence.
+ Mockito.when(ecPlacementPolicy.validateContainerPlacement(
+ anyList(), anyInt()))
+ .thenReturn(new ContainerPlacementStatusDefault(1, 2, 3));
+
ContainerInfo decomContainer = createContainerInfo(repConfig, 1,
HddsProtos.LifeCycleState.CLOSED);
addReplicas(decomContainer, ContainerReplicaProto.State.CLOSED,
@@ -404,6 +411,10 @@ public class TestReplicationManager {
HddsProtos.LifeCycleState.CLOSED);
addReplicas(underRep0, ContainerReplicaProto.State.CLOSED, 1, 2, 3);
+ ContainerInfo misRep = createContainerInfo(repConfig, 4,
+ HddsProtos.LifeCycleState.CLOSED);
+ addReplicas(misRep, ContainerReplicaProto.State.CLOSED, 1, 2, 3, 4, 5);
+
enableProcessAll();
replicationManager.processAll();
@@ -438,6 +449,10 @@ public class TestReplicationManager {
res = replicationManager.dequeueUnderReplicatedContainer();
Assert.assertEquals(underRep0, res.getContainerInfo());
+ // Next is the mis-rep container, which has a remaining redundancy of 6.
+ res = replicationManager.dequeueUnderReplicatedContainer();
+ Assert.assertEquals(misRep, res.getContainerInfo());
+
res = replicationManager.dequeueUnderReplicatedContainer();
Assert.assertNull(res);
}
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
index 06f68ee91c..8b7aa665c8 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
@@ -481,9 +481,8 @@ public class TestECReplicationCheckHandler {
Assert.assertEquals(HealthState.MIS_REPLICATED, result.getHealthState());
Assert.assertTrue(healthCheck.handle(request));
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
+ Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(1, repQueue.misReplicatedQueueSize());
Assert.assertEquals(0, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
Assert.assertEquals(0, report.getStat(
@@ -531,7 +530,6 @@ public class TestECReplicationCheckHandler {
Assert.assertTrue(healthCheck.handle(request));
Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.misReplicatedQueueSize());
Assert.assertEquals(0, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
Assert.assertEquals(0, report.getStat(
@@ -567,7 +565,6 @@ public class TestECReplicationCheckHandler {
Assert.assertTrue(healthCheck.handle(request));
Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.misReplicatedQueueSize());
Assert.assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
Assert.assertEquals(0, report.getStat(
@@ -604,7 +601,6 @@ public class TestECReplicationCheckHandler {
Assert.assertTrue(healthCheck.handle(request));
Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
Assert.assertEquals(1, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.misReplicatedQueueSize());
Assert.assertEquals(0, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
Assert.assertEquals(1, report.getStat(
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
index f6493b7817..9cc2c6263d 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestRatisReplicationCheckHandler.java
@@ -423,7 +423,6 @@ public class TestRatisReplicationCheckHandler {
Assert.assertTrue(healthCheck.handle(requestBuilder.build()));
Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.misReplicatedQueueSize());
Assert.assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
Assert.assertEquals(0, report.getStat(
@@ -468,7 +467,6 @@ public class TestRatisReplicationCheckHandler {
Assert.assertTrue(healthCheck.handle(requestBuilder.build()));
Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.misReplicatedQueueSize());
Assert.assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
Assert.assertEquals(0, report.getStat(
@@ -494,9 +492,8 @@ public class TestRatisReplicationCheckHandler {
Assert.assertFalse(result.isReplicatedOkAfterPending());
Assert.assertTrue(healthCheck.handle(requestBuilder.build()));
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
+ Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(1, repQueue.misReplicatedQueueSize());
Assert.assertEquals(0, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
Assert.assertEquals(1, report.getStat(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]