This is an automated email from the ASF dual-hosted git repository.
siddhant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new fd0eddda8c HDDS-7642. EC: Add debug logging to the Replication Manager check handlers (#4087)
fd0eddda8c is described below
commit fd0eddda8ce051188e694abb367cf14868d284ad
Author: Stephen O'Donnell <[email protected]>
AuthorDate: Thu Dec 15 07:00:10 2022 +0000
HDDS-7642. EC: Add debug logging to the Replication Manager check handlers (#4087)
---
.../scm/container/replication/ReplicationManager.java | 2 ++
.../health/ClosedWithMismatchedReplicasHandler.java | 3 +++
.../health/ClosedWithUnhealthyReplicasHandler.java | 4 +++-
.../replication/health/ClosingContainerHandler.java | 7 +++++++
.../replication/health/DeletingContainerHandler.java | 5 +++++
.../replication/health/ECReplicationCheckHandler.java | 18 +++++++++++++++++-
.../replication/health/EmptyContainerHandler.java | 6 ++----
.../replication/health/OpenContainerHandler.java | 11 ++++++++++-
.../health/QuasiClosedContainerHandler.java | 4 ++++
.../health/RatisReplicationCheckHandler.java | 14 ++++++++++++++
10 files changed, 67 insertions(+), 7 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
index 21e08f460c..351eec7178 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
@@ -409,6 +409,8 @@ public class ReplicationManager implements SCMService {
*/
public void sendDeleteCommand(final ContainerInfo container, int replicaIndex,
final DatanodeDetails datanode) throws NotLeaderException {
+ LOG.debug("Sending delete command for container {} and index {} on {}",
+ container, replicaIndex, datanode);
final DeleteContainerCommand deleteCommand =
new DeleteContainerCommand(container.containerID(), false);
deleteCommand.setReplicaIndex(replicaIndex);
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
index eaf6664b5b..4428428d17 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithMismatchedReplicasHandler.java
@@ -34,6 +34,7 @@ import java.util.Set;
* mis-matched replica to close it.
*/
public class ClosedWithMismatchedReplicasHandler extends AbstractCheck {
+
public static final Logger LOG =
LoggerFactory.getLogger(ClosedWithMismatchedReplicasHandler.class);
@@ -59,6 +60,8 @@ public class ClosedWithMismatchedReplicasHandler extends AbstractCheck {
// Handler is only relevant for CLOSED containers.
return false;
}
+ LOG.debug("Checking container {} in ClosedWithMismatchedReplicasHandler",
+ containerInfo);
// close replica if its state is OPEN or CLOSING
for (ContainerReplica replica : replicas) {
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithUnhealthyReplicasHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithUnhealthyReplicasHandler.java
index fd4e3005ca..3b9b30d3a4 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithUnhealthyReplicasHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosedWithUnhealthyReplicasHandler.java
@@ -77,7 +77,8 @@ public class ClosedWithUnhealthyReplicasHandler extends AbstractCheck {
if (containerInfo.getState() != HddsProtos.LifeCycleState.CLOSED) {
return false;
}
-
+ LOG.debug("Checking container {} in ClosedWithUnhealthyReplicasHandler",
+ containerInfo);
Set<ContainerReplica> replicas = request.getContainerReplicas();
// create a set of indexes that are closed
Set<Integer> closedIndexes = replicas.stream()
@@ -111,6 +112,7 @@ public class ClosedWithUnhealthyReplicasHandler extends AbstractCheck {
ReplicationManagerReport.HealthState.UNHEALTHY,
containerInfo.containerID());
}
+ LOG.debug("Returning {} for container {}", foundUnhealthy, containerInfo);
return foundUnhealthy;
}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosingContainerHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosingContainerHandler.java
index 103f7d6646..c06581bf2a 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosingContainerHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ClosingContainerHandler.java
@@ -24,11 +24,16 @@ import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Class used in Replication Manager to close replicas of CLOSING containers.
*/
public class ClosingContainerHandler extends AbstractCheck {
+ private static final Logger LOG =
+ LoggerFactory.getLogger(ClosingContainerHandler.class);
+
private final ReplicationManager replicationManager;
public ClosingContainerHandler(ReplicationManager replicationManager) {
@@ -50,6 +55,8 @@ public class ClosingContainerHandler extends AbstractCheck {
if (containerInfo.getState() != HddsProtos.LifeCycleState.CLOSING) {
return false;
}
+ LOG.debug("Checking container {} in ClosingContainerHandler",
+ containerInfo);
boolean forceClose = request.getContainerInfo().getReplicationConfig()
.getReplicationType() != HddsProtos.ReplicationType.RATIS;
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/DeletingContainerHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/DeletingContainerHandler.java
index a31e5ecb16..9bc0160a3b 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/DeletingContainerHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/DeletingContainerHandler.java
@@ -60,14 +60,19 @@ public class DeletingContainerHandler extends AbstractCheck {
HddsProtos.LifeCycleState containerState = containerInfo.getState();
if (containerState == HddsProtos.LifeCycleState.DELETED) {
+ LOG.debug("Container {} is DELETED so returning true", containerInfo);
return true;
}
if (containerState != HddsProtos.LifeCycleState.DELETING) {
return false;
}
+ LOG.debug("Checking container {} in DeletingContainerHandler",
+ containerInfo);
if (request.getContainerReplicas().size() == 0) {
+ LOG.debug("Deleting Container {} has no replicas so marking for cleanup" +
+ " and returning true", containerInfo);
replicationManager.updateContainerState(
cID, HddsProtos.LifeCycleEvent.CLEANUP);
return true;
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java
index 2aba554498..cdd2f565ea 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java
@@ -28,6 +28,8 @@ import org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest;
import org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult;
import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaOp;
import org.apache.hadoop.hdds.scm.container.replication.ECContainerReplicaCount;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
@@ -44,6 +46,9 @@ import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType.E
*/
public class ECReplicationCheckHandler extends AbstractCheck {
+ private static final Logger LOG =
+ LoggerFactory.getLogger(ECReplicationCheckHandler.class);
+
private final PlacementPolicy placementPolicy;
public ECReplicationCheckHandler(PlacementPolicy placementPolicy) {
@@ -60,6 +65,7 @@ public class ECReplicationCheckHandler extends AbstractCheck {
ContainerInfo container = request.getContainerInfo();
ContainerID containerID = container.containerID();
ContainerHealthResult health = checkHealth(request);
+ LOG.debug("Checking container {} in ECReplicationCheckHandler", container);
if (health.getHealthState() == ContainerHealthResult.HealthState.HEALTHY) {
// If the container is healthy, there is nothing else to do in this
// handler so return as unhandled so any further handlers will be tried.
@@ -82,6 +88,10 @@ public class ECReplicationCheckHandler extends AbstractCheck {
!underHealth.isUnrecoverable()) {
request.getReplicationQueue().enqueue(underHealth);
}
+ LOG.debug("Container {} is Under Replicated. isReplicatedOkAfterPending "
+ + "is [{}]. isUnrecoverable is [{}]", container,
+ underHealth.isReplicatedOkAfterPending(),
+ underHealth.isUnrecoverable());
return true;
} else if (health.getHealthState()
== ContainerHealthResult.HealthState.OVER_REPLICATED) {
@@ -92,6 +102,8 @@ public class ECReplicationCheckHandler extends AbstractCheck {
if (!overHealth.isReplicatedOkAfterPending()) {
request.getReplicationQueue().enqueue(overHealth);
}
+ LOG.debug("Container {} is Over Replicated. isReplicatedOkAfterPending "
+ + "is [{}]", container, overHealth.isReplicatedOkAfterPending());
return true;
} else if (health.getHealthState() ==
ContainerHealthResult.HealthState.MIS_REPLICATED) {
@@ -102,10 +114,14 @@ public class ECReplicationCheckHandler extends AbstractCheck {
if (!misRepHealth.isReplicatedOkAfterPending()) {
request.getReplicationQueue().enqueue(misRepHealth);
}
+ LOG.debug("Container {} is Mis Replicated. isReplicatedOkAfterPending "
+ + "is [{}]", container, misRepHealth.isReplicatedOkAfterPending());
return true;
}
- // Should not get here, but incase it does the container is not healthy,
+ // Should not get here, but in case it does the container is not healthy,
// but is also not under or over replicated.
+ LOG.warn("Container {} is not healthy but is not under, over or "
+ + " mis-replicated. Should not happen.", container);
return false;
}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/EmptyContainerHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/EmptyContainerHandler.java
index 19c8d0a93a..808f3f4165 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/EmptyContainerHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/EmptyContainerHandler.java
@@ -60,7 +60,8 @@ public class EmptyContainerHandler extends AbstractCheck {
request.getReport()
.incrementAndSample(ReplicationManagerReport.HealthState.EMPTY,
containerInfo.containerID());
-
+ LOG.debug("Container {} is empty and closed, marking as DELETING",
+ containerInfo);
// delete replicas if they are closed and empty
deleteContainerReplicas(containerInfo, replicas);
@@ -109,9 +110,6 @@ public class EmptyContainerHandler extends AbstractCheck {
rp.getState() == ContainerReplicaProto.State.CLOSED);
Preconditions.assertTrue(rp.getKeyCount() == 0);
- LOG.debug("Trying to delete empty replica with index {} for container " +
- "{} on datanode {}", rp.getReplicaIndex(),
- containerInfo.containerID(), rp.getDatanodeDetails().getUuidString());
try {
replicationManager.sendDeleteCommand(containerInfo,
rp.getReplicaIndex(), rp.getDatanodeDetails());
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java
index 666b1a2287..a644f5e834 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java
@@ -23,6 +23,8 @@ import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
import org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.util.Set;
@@ -36,7 +38,10 @@ import static org.apache.hadoop.hdds.scm.container.replication.ReplicationManage
*/
public class OpenContainerHandler extends AbstractCheck {
- private ReplicationManager replicationManager;
+ private static final Logger LOG =
+ LoggerFactory.getLogger(OpenContainerHandler.class);
+
+ private final ReplicationManager replicationManager;
public OpenContainerHandler(ReplicationManager replicationManager) {
this.replicationManager = replicationManager;
@@ -46,10 +51,14 @@ public class OpenContainerHandler extends AbstractCheck {
public boolean handle(ContainerCheckRequest request) {
ContainerInfo containerInfo = request.getContainerInfo();
if (containerInfo.getState() == HddsProtos.LifeCycleState.OPEN) {
+ LOG.debug("Checking open container {} in OpenContainerHandler",
+ containerInfo);
if (!isOpenContainerHealthy(
containerInfo, request.getContainerReplicas())) {
// This is an unhealthy open container, so we need to trigger the
// close process on it.
+ LOG.debug("Container {} is open but unhealthy. Triggering close.",
+ containerInfo);
request.getReport().incrementAndSample(
ReplicationManagerReport.HealthState.OPEN_UNHEALTHY,
containerInfo.containerID());
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java
index 449d0776e1..01f6a05d77 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java
@@ -64,12 +64,16 @@ public class QuasiClosedContainerHandler extends AbstractCheck {
if (containerInfo.getState() != HddsProtos.LifeCycleState.QUASI_CLOSED) {
return false;
}
+ LOG.debug("Checking container {} in QuasiClosedContainerHandler",
+ containerInfo);
Set<ContainerReplica> replicas = request.getContainerReplicas();
if (canForceCloseContainer(containerInfo, replicas)) {
forceCloseContainer(containerInfo, replicas);
return true;
} else {
+ LOG.debug("Container {} cannot be force closed and is stuck in " +
+ "QUASI_CLOSED", containerInfo);
request.getReport().incrementAndSample(
ReplicationManagerReport.HealthState.QUASI_CLOSED_STUCK,
containerInfo.containerID());
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/RatisReplicationCheckHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/RatisReplicationCheckHandler.java
index 9d33498dda..91dd51a607 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/RatisReplicationCheckHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/RatisReplicationCheckHandler.java
@@ -66,6 +66,8 @@ public class RatisReplicationCheckHandler extends AbstractCheck {
ReplicationManagerReport report = request.getReport();
ContainerInfo container = request.getContainerInfo();
ContainerHealthResult health = checkHealth(request);
+ LOG.debug("Checking container {} in RatisReplicationCheckHandler",
+ container);
if (health.getHealthState() == ContainerHealthResult.HealthState.HEALTHY) {
// If the container is healthy, there is nothing else to do in this
// handler so return as unhandled so any further handlers will be tried.
@@ -88,6 +90,10 @@ public class RatisReplicationCheckHandler extends AbstractCheck {
!underHealth.isReplicatedOkAfterPending()) {
request.getReplicationQueue().enqueue(underHealth);
}
+ LOG.debug("Container {} is Under Replicated. isReplicatedOkAfterPending" +
+ " is [{}]. isUnrecoverable is [{}]", container,
+ underHealth.isReplicatedOkAfterPending(),
+ underHealth.isUnrecoverable());
return true;
}
@@ -101,6 +107,8 @@ public class RatisReplicationCheckHandler extends AbstractCheck {
if (!overHealth.isReplicatedOkAfterPending()) {
request.getReplicationQueue().enqueue(overHealth);
}
+ LOG.debug("Container {} is Over Replicated. isReplicatedOkAfterPending" +
+ " is [{}]", container, overHealth.isReplicatedOkAfterPending());
return true;
}
if (health.getHealthState() ==
@@ -113,8 +121,14 @@ public class RatisReplicationCheckHandler extends AbstractCheck {
if (!misRepHealth.isReplicatedOkAfterPending()) {
request.getReplicationQueue().enqueue(misRepHealth);
}
+ LOG.debug("Container {} is Mid Replicated. isReplicatedOkAfterPending" +
+ " is [{}]", container, misRepHealth.isReplicatedOkAfterPending());
return true;
}
+ // Should not get here, but in case it does the container is not healthy,
+ // but is also not under, over or mis replicated.
+ LOG.warn("Container {} is not healthy but is not under, over or "
+ + " mis-replicated. Should not happen.", container);
return false;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]