This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new d77df14822 HDDS-8801. ReplicationManager: Add metric to count how
often replication is throttled (#4864)
d77df14822 is described below
commit d77df1482261a60a55ecc193a3f0208a8a3adf2d
Author: Stephen O'Donnell <[email protected]>
AuthorDate: Fri Jun 9 21:00:30 2023 +0100
HDDS-8801. ReplicationManager: Add metric to count how often replication is
throttled (#4864)
---
.../container/replication/ReplicationManager.java | 5 +
.../replication/ReplicationManagerMetrics.java | 41 ++++
.../replication/TestReplicationManager.java | 232 +++++++++++----------
3 files changed, 173 insertions(+), 105 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
index aab18b1bc5..460655ba39 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
@@ -19,6 +19,7 @@
package org.apache.hadoop.hdds.scm.container.replication;
import com.google.common.annotations.VisibleForTesting;
+import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.Config;
@@ -502,6 +503,7 @@ public class ReplicationManager implements SCMService {
int commandCount = nodeManager.getTotalDatanodeCommandCount(datanode,
Type.deleteContainerCommand);
if (commandCount >= datanodeDeleteLimit) {
+ metrics.incrDeleteContainerCmdsDeferredTotal();
throw new CommandTargetOverloadedException("Cannot schedule a delete "
+
"container command for container " + container.containerID() +
" on datanode " + datanode + " as it has too many pending delete "
+
@@ -533,6 +535,7 @@ public class ReplicationManager implements SCMService {
List<Pair<Integer, DatanodeDetails>> sourceWithCmds =
getAvailableDatanodesForReplication(sources);
if (sourceWithCmds.isEmpty()) {
+ metrics.incrReplicateContainerCmdsDeferredTotal();
throw new CommandTargetOverloadedException("No sources with capacity " +
"available for replication of container " + containerID + " to " +
target);
@@ -553,6 +556,7 @@ public class ReplicationManager implements SCMService {
List<Pair<Integer, DatanodeDetails>> targetWithCmds =
getAvailableDatanodesForReplication(targets);
if (targetWithCmds.isEmpty()) {
+ metrics.incrECReconstructionCmdsDeferredTotal();
throw new CommandTargetOverloadedException("No target with capacity " +
"available for reconstruction of " + containerInfo.getContainerID());
}
@@ -1407,6 +1411,7 @@ public class ReplicationManager implements SCMService {
return ReplicationManager.class.getSimpleName();
}
+ @SuppressFBWarnings("IS2_INCONSISTENT_SYNC")
public ReplicationManagerMetrics getMetrics() {
return metrics;
}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
index 40922c84af..9c62761a61 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
@@ -171,6 +171,19 @@ public final class ReplicationManagerMetrics implements
MetricsSource {
@Metric("Number of EC replicas scheduled for delete which timed out.")
private MutableCounterLong ecReplicaDeleteTimeoutTotal;
+ @Metric("Number of Reconstruct EC Container commands that could not be sent "
+ + "due to the pending commands on the target datanode")
+ private MutableCounterLong ecReconstructionCmdsDeferredTotal;
+
+ @Metric("Number of delete container commands that could not be sent due "
+ + "to the pending commands on the target datanode")
+ private MutableCounterLong deleteContainerCmdsDeferredTotal;
+
+ @Metric("Number of replicate container commands that could not be sent due "
+ + "to the pending commands on all source datanodes")
+ private MutableCounterLong replicateContainerCmdsDeferredTotal;
+
+
public ReplicationManagerMetrics(ReplicationManager manager) {
this.registry = new MetricsRegistry(METRICS_SOURCE_NAME);
this.replicationManager = manager;
@@ -223,6 +236,9 @@ public final class ReplicationManagerMetrics implements
MetricsSource {
ecReconstructionCmdsSentTotal.snapshot(builder, all);
ecReplicaCreateTimeoutTotal.snapshot(builder, all);
ecReplicasDeletedTotal.snapshot(builder, all);
+ ecReconstructionCmdsDeferredTotal.snapshot(builder, all);
+ deleteContainerCmdsDeferredTotal.snapshot(builder, all);
+ replicateContainerCmdsDeferredTotal.snapshot(builder, all);
}
public void unRegister() {
@@ -372,6 +388,18 @@ public final class ReplicationManagerMetrics implements
MetricsSource {
this.ecReconstructionCmdsSentTotal.incr();
}
+ public void incrECReconstructionCmdsDeferredTotal() {
+ this.ecReconstructionCmdsDeferredTotal.incr();
+ }
+
+ public void incrDeleteContainerCmdsDeferredTotal() {
+ this.deleteContainerCmdsDeferredTotal.incr();
+ }
+
+ public void incrReplicateContainerCmdsDeferredTotal() {
+ this.replicateContainerCmdsDeferredTotal.incr();
+ }
+
public long getEcReplication() {
return replicationManager.getContainerReplicaPendingOps()
.getPendingOpCount(ContainerReplicaOp.PendingOpType.ADD);
@@ -417,4 +445,17 @@ public final class ReplicationManagerMetrics implements
MetricsSource {
public long getEcReplicasDeletedTotal() {
return ecReplicasDeletedTotal.value();
}
+
+ public long getEcReconstructionCmdsDeferredTotal() {
+ return ecReconstructionCmdsDeferredTotal.value();
+ }
+
+ public long getDeleteContainerCmdsDeferredTotal() {
+ return deleteContainerCmdsDeferredTotal.value();
+ }
+
+ public long getReplicateContainerCmdsDeferredTotal() {
+ return replicateContainerCmdsDeferredTotal.value();
+ }
+
}
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
index e3311edfe7..d9c5e2f60c 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java
@@ -82,6 +82,7 @@ import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUt
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createReplicas;
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.createReplicasWithSameOrigin;
import static
org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil.getNoNodesTestPlacementPolicy;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThrows;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyInt;
@@ -193,18 +194,18 @@ public class TestReplicationManager {
MockDatanodeDetails.randomDatanodeDetails(), 1, Integer.MAX_VALUE);
containerReplicaPendingOps.scheduleDeleteReplica(ContainerID.valueOf(2),
MockDatanodeDetails.randomDatanodeDetails(), 1, Integer.MAX_VALUE);
- Assert.assertEquals(1, containerReplicaPendingOps
+ assertEquals(1, containerReplicaPendingOps
.getPendingOpCount(ContainerReplicaOp.PendingOpType.ADD));
- Assert.assertEquals(1, containerReplicaPendingOps
+ assertEquals(1, containerReplicaPendingOps
.getPendingOpCount(ContainerReplicaOp.PendingOpType.DELETE));
// Registers against serviceManager and notifies the status has changed.
enableProcessAll();
// Pending ops should be cleared.
- Assert.assertEquals(0, containerReplicaPendingOps
+ assertEquals(0, containerReplicaPendingOps
.getPendingOpCount(ContainerReplicaOp.PendingOpType.ADD));
- Assert.assertEquals(0, containerReplicaPendingOps
+ assertEquals(0, containerReplicaPendingOps
.getPendingOpCount(ContainerReplicaOp.PendingOpType.DELETE));
}
@@ -216,8 +217,8 @@ public class TestReplicationManager {
addReplicas(container, ContainerReplicaProto.State.OPEN, 1, 2, 3, 4);
replicationManager.processContainer(
container, repQueue, repReport);
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
}
@Test
@@ -231,10 +232,10 @@ public class TestReplicationManager {
container, repQueue, repReport);
Mockito.verify(eventPublisher, Mockito.times(1))
.fireEvent(SCMEvents.CLOSE_CONTAINER, container.containerID());
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.OPEN_UNHEALTHY));
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
}
@Test
@@ -256,14 +257,14 @@ public class TestReplicationManager {
Mockito.verify(nodeManager, Mockito.times(3))
.addDatanodeCommand(any(), any());
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.OVER_REPLICATED));
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.underReplicatedQueueSize());
/*
Though over replicated, this container should not be added to over
replicated queue until all replicas are closed.
*/
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
}
@Test
@@ -289,10 +290,10 @@ public class TestReplicationManager {
Mockito.verify(nodeManager, Mockito.times(1))
.addDatanodeCommand(any(), any());
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
- Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(1, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
}
@Test
@@ -311,12 +312,12 @@ public class TestReplicationManager {
replicas.add(unhealthyOnDecommissioning);
replicationManager.processContainer(container, repQueue, repReport);
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.UNHEALTHY));
- Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(1, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
}
/**
@@ -343,12 +344,12 @@ public class TestReplicationManager {
storeContainerAndReplicas(container, replicas);
replicationManager.processContainer(container, repQueue, repReport);
- Assert.assertEquals(0, repReport.getStat(
+ assertEquals(0, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.OVER_REPLICATED));
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(1, repQueue.overReplicatedQueueSize());
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(1, repQueue.overReplicatedQueueSize());
RatisOverReplicationHandler handler = new RatisOverReplicationHandler(
ratisPlacementPolicy, replicationManager);
@@ -358,9 +359,9 @@ public class TestReplicationManager {
handler.processAndSendCommands(replicas, Collections.emptyList(),
repQueue.dequeueOverReplicatedContainer(), 2);
Assert.assertTrue(commandsSent.iterator().hasNext());
- Assert.assertEquals(unhealthy.getDatanodeDetails().getUuid(),
+ assertEquals(unhealthy.getDatanodeDetails().getUuid(),
commandsSent.iterator().next().getKey());
- Assert.assertEquals(SCMCommandProto.Type.deleteContainerCommand,
+ assertEquals(SCMCommandProto.Type.deleteContainerCommand,
commandsSent.iterator().next().getValue().getType());
}
@@ -382,12 +383,12 @@ public class TestReplicationManager {
storeContainerAndReplicas(container, replicas);
replicationManager.processContainer(container, repQueue, repReport);
- Assert.assertEquals(0, repReport.getStat(
+ assertEquals(0, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.OVER_REPLICATED));
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(1, repQueue.overReplicatedQueueSize());
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(1, repQueue.overReplicatedQueueSize());
RatisOverReplicationHandler handler = new RatisOverReplicationHandler(
ratisPlacementPolicy, replicationManager);
@@ -401,7 +402,7 @@ public class TestReplicationManager {
// unhealthy replica can't be deleted because it has a unique origin DN
Assert.assertNotEquals(unhealthy.getDatanodeDetails().getUuid(),
commandsSent.iterator().next().getKey());
- Assert.assertEquals(SCMCommandProto.Type.deleteContainerCommand,
+ assertEquals(SCMCommandProto.Type.deleteContainerCommand,
commandsSent.iterator().next().getValue().getType());
}
@@ -413,8 +414,8 @@ public class TestReplicationManager {
replicationManager.processContainer(
container, repQueue, repReport);
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
}
@Test
@@ -425,9 +426,9 @@ public class TestReplicationManager {
replicationManager.processContainer(
container, repQueue, repReport);
- Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
}
@@ -445,7 +446,7 @@ public class TestReplicationManager {
ContainerHealthResult result =
replicationManager.getContainerReplicationHealth(container, replicas);
- Assert.assertEquals(ContainerHealthResult.HealthState.UNDER_REPLICATED,
+ assertEquals(ContainerHealthResult.HealthState.UNDER_REPLICATED,
result.getHealthState());
// Test the same for a RATIS container
@@ -458,7 +459,7 @@ public class TestReplicationManager {
result =
replicationManager.getContainerReplicationHealth(container, replicas);
- Assert.assertEquals(ContainerHealthResult.HealthState.UNDER_REPLICATED,
+ assertEquals(ContainerHealthResult.HealthState.UNDER_REPLICATED,
result.getHealthState());
}
@@ -476,12 +477,12 @@ public class TestReplicationManager {
container, repQueue, repReport);
// As the pending replication fixes the under replication, nothing is added
// to the under replication list.
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
// As the container is still under replicated, as the pending have not
// completed yet, the container is still marked as under-replicated in the
// report.
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
}
@@ -496,11 +497,11 @@ public class TestReplicationManager {
container, repQueue, repReport);
// If it is unrecoverable, there is no point in putting it into the under
// replication list. It will be checked again on the next RM run.
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(0, repReport.getStat(
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(0, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.MISSING));
}
@@ -519,15 +520,15 @@ public class TestReplicationManager {
replicationManager.processContainer(container, repQueue, repReport);
// If it is unrecoverable, there is no point in putting it into the under
// replication list. It will be checked again on the next RM run.
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(0, repReport.getStat(
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(0, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
- Assert.assertEquals(0, repReport.getStat(
+ assertEquals(0, repReport.getStat(
ReplicationManagerReport.HealthState.MISSING));
// As it is marked empty in the report, it must have gone through the
// empty container handler, indicating it was handled as empty.
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.EMPTY));
}
@@ -555,9 +556,9 @@ public class TestReplicationManager {
replicationManager.processContainer(
container, repQueue, repReport);
- Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
}
@@ -586,11 +587,11 @@ public class TestReplicationManager {
replicationManager.processContainer(
container, repQueue, repReport);
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(0, repReport.getStat(
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(0, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.MISSING));
}
@@ -614,9 +615,9 @@ public class TestReplicationManager {
replicationManager.processContainer(
container, repQueue, repReport);
- Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
}
@@ -632,11 +633,11 @@ public class TestReplicationManager {
// If it is both under and over replicated, we set it to the most important
// state, which is under-replicated. When that is fixed, over replication
// will be handled.
- Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
- Assert.assertEquals(0, repReport.getStat(
+ assertEquals(0, repReport.getStat(
ReplicationManagerReport.HealthState.OVER_REPLICATED));
}
@@ -668,11 +669,11 @@ public class TestReplicationManager {
// assert that this container is seen as under replicated
replicationManager.processContainer(container, repQueue, repReport);
- Assert.assertEquals(1, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(1, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
- Assert.assertEquals(0, repReport.getStat(
+ assertEquals(0, repReport.getStat(
ReplicationManagerReport.HealthState.OVER_REPLICATED));
// now, pass this container to ec under replication handling
@@ -689,17 +690,17 @@ public class TestReplicationManager {
repQueue.dequeueUnderReplicatedContainer(), 1));
// a delete command should also have been sent for UNHEALTHY replica of
// index 1
- Assert.assertEquals(1, commandsSent.size());
+ assertEquals(1, commandsSent.size());
Pair<UUID, SCMCommand<?>> command = commandsSent.iterator().next();
Assertions.assertEquals(SCMCommandProto.Type.deleteContainerCommand,
command.getValue().getType());
DeleteContainerCommand deleteCommand =
(DeleteContainerCommand) command.getValue();
- Assert.assertEquals(unhealthyReplica1.getDatanodeDetails().getUuid(),
+ assertEquals(unhealthyReplica1.getDatanodeDetails().getUuid(),
command.getKey());
- Assert.assertEquals(container.containerID(),
+ assertEquals(container.containerID(),
ContainerID.valueOf(deleteCommand.getContainerID()));
- Assert.assertEquals(unhealthyReplica1.getReplicaIndex(),
+ assertEquals(unhealthyReplica1.getReplicaIndex(),
deleteCommand.getReplicaIndex());
}
@@ -711,9 +712,9 @@ public class TestReplicationManager {
1, 2, 3, 4, 5, 5);
replicationManager.processContainer(
container, repQueue, repReport);
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
- Assert.assertEquals(1, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(1, repQueue.overReplicatedQueueSize());
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.OVER_REPLICATED));
}
@@ -729,11 +730,11 @@ public class TestReplicationManager {
clock.millis() + 10000);
replicationManager.processContainer(
container, repQueue, repReport);
- Assert.assertEquals(0, repQueue.underReplicatedQueueSize());
+ assertEquals(0, repQueue.underReplicatedQueueSize());
// If the pending replication fixes the over-replication, nothing is added
// to the over replication list.
- Assert.assertEquals(0, repQueue.overReplicatedQueueSize());
- Assert.assertEquals(1, repReport.getStat(
+ assertEquals(0, repQueue.overReplicatedQueueSize());
+ assertEquals(1, repReport.getStat(
ReplicationManagerReport.HealthState.OVER_REPLICATED));
}
@@ -772,7 +773,7 @@ public class TestReplicationManager {
// Get the first message off the queue - it should be underRep0.
ContainerHealthResult.UnderReplicatedHealthResult res
= queue.dequeueUnderReplicatedContainer();
- Assert.assertEquals(underRep0, res.getContainerInfo());
+ assertEquals(underRep0, res.getContainerInfo());
// Now requeue it
queue.enqueue(res);
@@ -782,7 +783,7 @@ public class TestReplicationManager {
// and 1 retry. They will have the same weighted redundancy so lesser
// retries should come first
res = queue.dequeueUnderReplicatedContainer();
- Assert.assertEquals(underRep1, res.getContainerInfo());
+ assertEquals(underRep1, res.getContainerInfo());
// Next message is underRep0. It starts with a weighted redundancy of 0 + 1
// retry. The other message on the queue is a decommission only with a
@@ -791,18 +792,18 @@ public class TestReplicationManager {
// one will be next due to having less retries.
for (int i = 0; i < 4; i++) {
res = queue.dequeueUnderReplicatedContainer();
- Assert.assertEquals(underRep0, res.getContainerInfo());
+ assertEquals(underRep0, res.getContainerInfo());
queue.enqueue(res);
}
res = queue.dequeueUnderReplicatedContainer();
- Assert.assertEquals(decomContainer, res.getContainerInfo());
+ assertEquals(decomContainer, res.getContainerInfo());
res = queue.dequeueUnderReplicatedContainer();
- Assert.assertEquals(underRep0, res.getContainerInfo());
+ assertEquals(underRep0, res.getContainerInfo());
// Next is the mis-rep container, which has a remaining redundancy of 6.
res = queue.dequeueUnderReplicatedContainer();
- Assert.assertEquals(misRep, res.getContainerInfo());
+ assertEquals(misRep, res.getContainerInfo());
res = queue.dequeueUnderReplicatedContainer();
Assert.assertNull(res);
@@ -939,7 +940,7 @@ public class TestReplicationManager {
.getObject(ReplicationManager.ReplicationManagerConfiguration.class);
long expectedDeadline = clock.millis() + rmConf.getEventTimeout() -
rmConf.getDatanodeTimeoutOffset();
- Assert.assertEquals(expectedDeadline, command.getDeadline());
+ assertEquals(expectedDeadline, command.getDeadline());
List<ContainerReplicaOp> ops = containerReplicaPendingOps.getPendingOps(
containerInfo.containerID());
@@ -1005,7 +1006,7 @@ public class TestReplicationManager {
.getObject(ReplicationManager.ReplicationManagerConfiguration.class);
long expectedDeadline = clock.millis() + rmConf.getEventTimeout() -
rmConf.getDatanodeTimeoutOffset();
- Assert.assertEquals(expectedDeadline, command.getDeadline());
+ assertEquals(expectedDeadline, command.getDeadline());
List<ContainerReplicaOp> ops = containerReplicaPendingOps.getPendingOps(
containerInfo.containerID());
@@ -1067,6 +1068,8 @@ public class TestReplicationManager {
testReplicationCommand(cmdTarget, sourceNodes.keySet(), 0,
MockDatanodeDetails.randomDatanodeDetails());
+ assertEquals(0, replicationManager.getMetrics()
+ .getReplicateContainerCmdsDeferredTotal());
}
@Test
@@ -1112,10 +1115,9 @@ public class TestReplicationManager {
Assertions.assertEquals(replicaIndex, cmd.getReplicaIndex());
}
- @Test(expected = CommandTargetOverloadedException.class)
+ @Test
public void testSendThrottledReplicateContainerCommandThrowsWhenNoSources()
- throws CommandTargetOverloadedException, NodeNotFoundException,
- NotLeaderException {
+ throws NodeNotFoundException {
// Reconstruction commands also count toward the limit, so set things up
// so that the nodes are at the limit caused by 1 reconstruction command
// and the remaining replication commands
@@ -1135,8 +1137,14 @@ public class TestReplicationManager {
DatanodeDetails destination = MockDatanodeDetails.randomDatanodeDetails();
ContainerInfo container = ReplicationTestUtil.createContainerInfo(
repConfig, 1, HddsProtos.LifeCycleState.CLOSED, 10, 20);
- replicationManager.sendThrottledReplicationCommand(
- container, sourceNodes, destination, 0);
+
+ long overLoadedCount = replicationManager.getMetrics()
+ .getReplicateContainerCmdsDeferredTotal();
+ assertThrows(CommandTargetOverloadedException.class,
+ () -> replicationManager.sendThrottledReplicationCommand(
+ container, sourceNodes, destination, 0));
+ assertEquals(overLoadedCount + 1, replicationManager.getMetrics()
+ .getReplicateContainerCmdsDeferredTotal());
}
@Test
@@ -1161,12 +1169,13 @@ public class TestReplicationManager {
Assertions.assertEquals(1, commandsSent.size());
Pair<UUID, SCMCommand<?>> cmd = commandsSent.iterator().next();
Assertions.assertEquals(cmdTarget.getUuid(), cmd.getLeft());
+ assertEquals(0, replicationManager.getMetrics()
+ .getEcReconstructionCmdsDeferredTotal());
}
- @Test(expected = CommandTargetOverloadedException.class)
+ @Test
public void testSendThrottledReconstructionCommandThrowsWhenNoTargets()
- throws CommandTargetOverloadedException, NodeNotFoundException,
- NotLeaderException {
+ throws NodeNotFoundException {
int limit = replicationManager.getConfig().getDatanodeReplicationLimit();
int reconstructionWeight = replicationManager.getConfig()
.getReconstructionCommandWeight();
@@ -1185,7 +1194,13 @@ public class TestReplicationManager {
ReconstructECContainersCommand command = createReconstructionCommand(
container, MockDatanodeDetails.randomDatanodeDetails(),
MockDatanodeDetails.randomDatanodeDetails());
- replicationManager.sendThrottledReconstructionCommand(container, command);
+ long overLoadedCount = replicationManager.getMetrics()
+ .getEcReconstructionCmdsDeferredTotal();
+ assertThrows(CommandTargetOverloadedException.class,
+ () -> replicationManager.sendThrottledReconstructionCommand(
+ container, command));
+ assertEquals(overLoadedCount + 1, replicationManager.getMetrics()
+ .getEcReconstructionCmdsDeferredTotal());
}
private ReconstructECContainersCommand createReconstructionCommand(
@@ -1216,13 +1231,14 @@ public class TestReplicationManager {
ContainerInfo container = ReplicationTestUtil.createContainerInfo(
repConfig, 1, HddsProtos.LifeCycleState.CLOSED, 10, 20);
replicationManager.sendThrottledDeleteCommand(container, 1, target, true);
- Assert.assertEquals(commandsSent.size(), 1);
+ assertEquals(commandsSent.size(), 1);
+ assertEquals(0, replicationManager.getMetrics()
+ .getDeleteContainerCmdsDeferredTotal());
}
- @Test(expected = CommandTargetOverloadedException.class)
+ @Test
public void testCreateThrottledDeleteContainerCommandThrowsWhenNoSources()
- throws CommandTargetOverloadedException, NodeNotFoundException,
- NotLeaderException {
+ throws NodeNotFoundException {
int limit = replicationManager.getConfig().getDatanodeDeleteLimit();
Mockito.when(nodeManager.getTotalDatanodeCommandCount(any(),
@@ -1232,7 +1248,13 @@ public class TestReplicationManager {
DatanodeDetails target = MockDatanodeDetails.randomDatanodeDetails();
ContainerInfo container = ReplicationTestUtil.createContainerInfo(
repConfig, 1, HddsProtos.LifeCycleState.CLOSED, 10, 20);
- replicationManager.sendThrottledDeleteCommand(container, 1, target, true);
+ long overLoadedCount = replicationManager.getMetrics()
+ .getDeleteContainerCmdsDeferredTotal();
+ assertThrows(CommandTargetOverloadedException.class,
+ () -> replicationManager.sendThrottledDeleteCommand(
+ container, 1, target, true));
+ assertEquals(overLoadedCount + 1, replicationManager.getMetrics()
+ .getDeleteContainerCmdsDeferredTotal());
}
@Test
@@ -1259,7 +1281,7 @@ public class TestReplicationManager {
MockDatanodeDetails.randomDatanodeDetails(), 1);
Set<DatanodeDetails> excluded = replicationManager.getExcludedNodes();
- Assert.assertEquals(1, excluded.size());
+ assertEquals(1, excluded.size());
// dn 3 was at the limit already, so should be added when filtering the
// nodes
Assert.assertTrue(excluded.contains(dn3));
@@ -1267,13 +1289,13 @@ public class TestReplicationManager {
// Trigger an update for dn3, but it should stay in the excluded list as
its
// count is still at the limit.
replicationManager.datanodeCommandCountUpdated(dn3);
- Assert.assertEquals(replicationManager.getExcludedNodes().size(), 1);
+ assertEquals(replicationManager.getExcludedNodes().size(), 1);
// Starting maintenance on dn3 increases its limits, so it should no longer
// be excluded
dn3.setPersistedOpState(ENTERING_MAINTENANCE);
replicationManager.datanodeCommandCountUpdated(dn3);
- Assert.assertEquals(0, replicationManager.getExcludedNodes().size());
+ assertEquals(0, replicationManager.getExcludedNodes().size());
// now send a reconstruction command. It should be sent to dn2, which is
// at the lowest count, but this command should push it to the limit and
@@ -1282,19 +1304,19 @@ public class TestReplicationManager {
container, dn1, dn2);
replicationManager.sendThrottledReconstructionCommand(container, command);
excluded = replicationManager.getExcludedNodes();
- Assert.assertEquals(1, excluded.size());
+ assertEquals(1, excluded.size());
// dn 2 reached the limit from the reconstruction command
Assert.assertTrue(excluded.contains(dn2));
// Update received for DN2, it should be cleared from the excluded list.
replicationManager.datanodeCommandCountUpdated(dn2);
excluded = replicationManager.getExcludedNodes();
- Assert.assertEquals(0, excluded.size());
+ assertEquals(0, excluded.size());
// Finally, update received for DN1 - it is not excluded and should not
// be added or cause any problems by not being there
replicationManager.datanodeCommandCountUpdated(dn1);
- Assert.assertEquals(0, excluded.size());
+ assertEquals(0, excluded.size());
}
@Test
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]