This is an automated email from the ASF dual-hosted git repository.

adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 9036e1ab6e8 HDDS-13535. Show under/over-replication in `replicas verify --container-state` results (#9135)
9036e1ab6e8 is described below

commit 9036e1ab6e87f5fb848bed2a29cc7d4f5ba51897
Author: ChenChen Lai <[email protected]>
AuthorDate: Tue Dec 9 18:42:57 2025 +0800

    HDDS-13535. Show under/over-replication in `replicas verify --container-state` results (#9135)
---
 .../smoketest/debug/ozone-debug-keywords.robot     |  20 ++++
 .../debug/replicas/ContainerStateVerifier.java     | 105 ++++++++++++++++-----
 2 files changed, 102 insertions(+), 23 deletions(-)

diff --git a/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot b/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot
index d75bdd20607..aa51febb318 100644
--- a/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot
+++ b/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-keywords.robot
@@ -43,6 +43,11 @@ Parse replicas verify JSON output
 Check to Verify Replicas
     [Arguments]    ${json}  ${check_type}  ${faulty_datanode}  
${expected_message}
     ${replicas} =    Get From Dictionary    ${json['keys'][0]['blocks'][0]}    
replicas
+    Run Keyword If    '${check_type}' == 'containerState'    Check Container 
State Replicas    ${replicas}  ${faulty_datanode}  ${expected_message}
+    ...    ELSE    Check Standard Replicas    ${replicas}  ${check_type}  
${faulty_datanode}  ${expected_message}
+
+Check Standard Replicas
+    [Arguments]    ${replicas}  ${check_type}  ${faulty_datanode}  
${expected_message}
     FOR    ${replica}    IN    @{replicas}
         ${datanode} =     Get From Dictionary    ${replica}    datanode
         ${hostname} =     Get From Dictionary    ${datanode}   hostname
@@ -50,6 +55,21 @@ Check to Verify Replicas
         Run Keyword If    '${hostname}' != '${faulty_datanode}'    Check 
Replica Passed    ${replica}  ${check_type}
     END
 
+Check Container State Replicas
+    [Arguments]    ${replicas}  ${faulty_datanode}  ${expected_message}
+    FOR    ${replica}    IN    @{replicas}
+        ${datanode} =     Get From Dictionary    ${replica}    datanode
+        ${hostname} =     Get From Dictionary    ${datanode}   hostname
+        ${checks} =       Get From Dictionary    ${replica}    checks
+        ${check} =        Get From List          ${checks}     0
+        Should Be Equal    ${check['type']}    containerState
+        Should Be Equal    ${check['pass']}    ${False}
+        ${actual_message} =    Set Variable    
${check['failures'][0]['message']}
+
+        Run Keyword If    '${hostname}' == '${faulty_datanode}'    Should 
Contain    ${actual_message}    ${expected_message}
+        ...    ELSE    Should Match Regexp    ${actual_message}    Replica 
state is (OPEN|CLOSING|QUASI_CLOSED|CLOSED)
+    END
+
 Check Replica Failed
     [Arguments]    ${replica}  ${check_type}  ${expected_message}
     ${checks} =     Get From Dictionary    ${replica}    checks
diff --git a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
index 465f1e99c94..0ed4cb10898 100644
--- a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
+++ b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
@@ -21,6 +21,7 @@
 import com.google.common.cache.CacheBuilder;
 import java.io.IOException;
 import java.util.EnumSet;
+import java.util.List;
 import java.util.Objects;
 import java.util.Set;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
@@ -32,6 +33,8 @@
 import org.apache.hadoop.hdds.scm.XceiverClientSpi;
 import org.apache.hadoop.hdds.scm.cli.ContainerOperationClient;
 import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerReplicaInfo;
+import org.apache.hadoop.hdds.scm.container.replication.ContainerHealthResult;
 import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
 import org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls;
 import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
@@ -45,8 +48,8 @@ public class ContainerStateVerifier implements 
ReplicaVerifier {
   private static final long DEFAULT_CONTAINER_CACHE_SIZE = 1000000;
   private final ContainerOperationClient containerOperationClient;
   private final XceiverClientManager xceiverClientManager;
-  // cache for container info and encodedToken from the SCM
-  private final Cache<Long, ContainerInfoToken> encodedTokenCache;
+  // cache for information about the container from SCM
+  private final Cache<Long, ContainerInformation> containerCache;
 
   private static final Set<ContainerDataProto.State> GOOD_REPLICA_STATES =
       EnumSet.of(
@@ -73,7 +76,9 @@ public ContainerStateVerifier(OzoneConfiguration conf, long 
containerCacheSize)
               ". Falling back to default: " + DEFAULT_CONTAINER_CACHE_SIZE);
       containerCacheSize = DEFAULT_CONTAINER_CACHE_SIZE;
     }
-    encodedTokenCache = 
CacheBuilder.newBuilder().maximumSize(containerCacheSize).build();
+    containerCache = CacheBuilder.newBuilder()
+        .maximumSize(containerCacheSize)
+        .build();
   }
 
   @Override
@@ -87,19 +92,29 @@ public BlockVerificationResult verifyBlock(DatanodeDetails 
datanode, OmKeyLocati
       StringBuilder replicaCheckMsg = new StringBuilder().append("Replica 
state is ");
       boolean pass = false;
 
-      ContainerInfoToken containerInfoToken = 
getContainerInfoToken(keyLocation.getContainerID());
-      ContainerDataProto containerData = 
fetchContainerDataFromDatanode(datanode, keyLocation.getContainerID(),
-          keyLocation, containerInfoToken);
+      long containerID = keyLocation.getContainerID();
+      ContainerInformation containerInformation = 
fetchContainerInformationFromSCM(containerID);
+      ContainerDataProto containerData = 
fetchContainerDataFromDatanode(datanode, containerID,
+          keyLocation, containerInformation.getEncodedToken());
 
       if (containerData == null) {
         return BlockVerificationResult.failIncomplete("No container data 
returned from DN.");
       }
       ContainerDataProto.State state = containerData.getState();
       replicaCheckMsg.append(state.name());
-      if (areContainerAndReplicasInGoodState(state, 
containerInfoToken.getContainerState())) {
+      boolean replicaStateGood = areContainerAndReplicasInGoodState(state, 
containerInformation.getContainerState());
+      replicaCheckMsg.append(", Container state in SCM is 
").append(containerInformation.getContainerState());
+
+      String replicationStatus = containerInformation.getReplicationStatus();
+      replicaCheckMsg.append(", ").append(replicationStatus);
+
+      // Replication status check evaluates container-level health by counting healthy replicas
+      // across all datanodes. Therefore, when a container is UNDER_REPLICATED or OVER_REPLICATED,
+      // this information should be reflected in all replica outputs, not just the unhealthy ones.
+      if 
(replicationStatus.startsWith(ContainerHealthResult.HealthState.HEALTHY.name())
+          && replicaStateGood) {
         pass = true;
       }
-      replicaCheckMsg.append(", Container state in SCM is 
").append(containerInfoToken.getContainerState());
 
       if (pass) {
         return BlockVerificationResult.pass();
@@ -123,13 +138,12 @@ private boolean 
areContainerAndReplicasInGoodState(ContainerDataProto.State repl
 
   private ContainerDataProto fetchContainerDataFromDatanode(DatanodeDetails 
dn, long containerId,
                                                             OmKeyLocationInfo 
keyLocation,
-                                                            ContainerInfoToken 
containerInfoToken)
+                                                            String 
encodedToken)
       throws IOException {
     XceiverClientSpi client = null;
     ReadContainerResponseProto response;
     try {
       Pipeline pipeline = keyLocation.getPipeline().copyForReadFromNode(dn);
-      String encodedToken = containerInfoToken.getEncodedToken();
 
       client = xceiverClientManager.acquireClientForReadData(pipeline);
       response = ContainerProtocolCalls
@@ -146,27 +160,67 @@ private ContainerDataProto 
fetchContainerDataFromDatanode(DatanodeDetails dn, lo
     return response.getContainerData();
   }
 
-  private ContainerInfoToken getContainerInfoToken(long containerId)
+  private ContainerInformation fetchContainerInformationFromSCM(long 
containerId)
       throws IOException {
-    ContainerInfoToken cachedData = 
encodedTokenCache.getIfPresent(containerId);
+    ContainerInformation cachedData = containerCache.getIfPresent(containerId);
     if (cachedData != null) {
       return cachedData;
     }
-    // Cache miss - fetch and store
-    ContainerInfo info = containerOperationClient.getContainer(containerId);
+    // Cache miss - fetch container info, token, and compute replication status
+    ContainerInfo containerInfo = 
containerOperationClient.getContainer(containerId);
     String encodeToken = 
containerOperationClient.getEncodedContainerToken(containerId);
-    cachedData = new ContainerInfoToken(info.getState(), encodeToken);
-    encodedTokenCache.put(containerId, cachedData);
+    String replicationStatus = computeReplicationStatus(containerId, 
containerInfo);
+    cachedData = new ContainerInformation(containerInfo.getState(), 
encodeToken, replicationStatus);
+    containerCache.put(containerId, cachedData);
     return cachedData;
   }
 
-  private static class ContainerInfoToken {
-    private HddsProtos.LifeCycleState state;
+  private String computeReplicationStatus(long containerId, ContainerInfo 
containerInfo) {
+    try {
+      List<ContainerReplicaInfo> replicaInfos =
+          containerOperationClient.getContainerReplicas(containerId);
+
+      if (replicaInfos.isEmpty()) {
+        return ContainerHealthResult.HealthState.UNDER_REPLICATED
+            + ": no replicas found";
+      }
+
+      int requiredNodes =
+          containerInfo.getReplicationConfig().getRequiredNodes();
+      int healthyReplicas = 0;
+
+      for (ContainerReplicaInfo replicaInfo : replicaInfos) {
+        if (!"UNHEALTHY".equals(replicaInfo.getState())) {
+          healthyReplicas++;
+        }
+      }
+
+      if (healthyReplicas == requiredNodes) {
+        return ContainerHealthResult.HealthState.HEALTHY.toString();
+      }
+
+      ContainerHealthResult.HealthState status =
+          healthyReplicas < requiredNodes
+              ? ContainerHealthResult.HealthState.UNDER_REPLICATED
+              : ContainerHealthResult.HealthState.OVER_REPLICATED;
+
+      return String.format("%s: %d/%d healthy replicas",
+          status, healthyReplicas, requiredNodes);
+    } catch (Exception e) {
+      return "REPLICATION_CHECK_FAILED: " + e.getMessage();
+    }
+  }
+
+  /** Information from SCM about the container needed for each replica. */
+  private static class ContainerInformation {
+    private final HddsProtos.LifeCycleState state;
     private final String encodedToken;
+    private final String replicationStatus;
 
-    ContainerInfoToken(HddsProtos.LifeCycleState lifeState, String token) {
+    ContainerInformation(HddsProtos.LifeCycleState lifeState, String token, 
String replicationStatus) {
       this.state = lifeState;
       this.encodedToken = token;
+      this.replicationStatus = replicationStatus;
     }
 
     @Override
@@ -174,17 +228,18 @@ public boolean equals(Object o) {
       if (this == o) {
         return true;
       }
-      if (!(o instanceof ContainerInfoToken)) {
+      if (!(o instanceof ContainerInformation)) {
         return false;
       }
-      ContainerInfoToken key = (ContainerInfoToken) o;
+      ContainerInformation key = (ContainerInformation) o;
       return Objects.equals(state, key.state) &&
-          Objects.equals(encodedToken, key.encodedToken);
+          Objects.equals(encodedToken, key.encodedToken) &&
+          Objects.equals(replicationStatus, key.replicationStatus);
     }
 
     @Override
     public int hashCode() {
-      return Objects.hash(state, encodedToken);
+      return Objects.hash(state, encodedToken, replicationStatus);
     }
 
     public HddsProtos.LifeCycleState getContainerState() {
@@ -194,6 +249,10 @@ public HddsProtos.LifeCycleState getContainerState() {
     public String getEncodedToken() {
       return encodedToken;
     }
+
+    public String getReplicationStatus() {
+      return replicationStatus;
+    }
   }
 
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to