This is an automated email from the ASF dual-hosted git repository.

tejaskriya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 3c25e7d634 HDDS-12595. Add verifier for container replica states 
(#8422)
3c25e7d634 is described below

commit 3c25e7d63441b6c281391c44202eaca179440ab6
Author: Tejaskriya <[email protected]>
AuthorDate: Tue Jun 3 13:53:42 2025 +0530

    HDDS-12595. Add verifier for container replica states (#8422)
---
 .../main/smoketest/debug/ozone-debug-tests.robot   |   3 +-
 .../hadoop/ozone/shell/TestOzoneDebugShell.java    |   3 +-
 .../debug/replicas/ContainerStateVerifier.java     | 194 +++++++++++++++++++++
 .../ozone/debug/replicas/ReplicasVerify.java       |  18 ++
 4 files changed, 215 insertions(+), 3 deletions(-)

diff --git a/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-tests.robot 
b/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-tests.robot
index 532f931bb5..f402e29d78 100644
--- a/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-tests.robot
+++ b/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-tests.robot
@@ -27,7 +27,6 @@ ${VOLUME}           cli-debug-volume${PREFIX}
 ${BUCKET}           cli-debug-bucket
 ${DEBUGKEY}         debugKey
 ${TESTFILE}         testfile
-${OM_SERVICE_ID}    %{OM_SERVICE_ID}
 
 *** Keywords ***
 Write keys
@@ -38,7 +37,7 @@ Write keys
 
 *** Test Cases ***
 Test ozone debug replicas verify checksums
-    ${output} =    Execute   ozone debug replicas verify --checksums 
o3://${OM_SERVICE_ID}/${VOLUME}/${BUCKET}/${TESTFILE} --output-dir ${TEMP_DIR}
+    ${output} =    Execute   ozone debug replicas verify --checksums 
--block-existence --container-state 
o3://${OM_SERVICE_ID}/${VOLUME}/${BUCKET}/${TESTFILE} --output-dir ${TEMP_DIR}
     ${json} =      Evaluate  json.loads('''${output}''')      json
 
     # 'keys' array should be empty if all keys and their replicas passed 
checksum verification
diff --git 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneDebugShell.java
 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneDebugShell.java
index 6fc9db29e8..a9662abdea 100644
--- 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneDebugShell.java
+++ 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneDebugShell.java
@@ -103,7 +103,8 @@ public void testReplicasVerifyCmd(boolean isEcKey) throws 
Exception {
     String[] args = new String[] {
         getSetConfStringFromConf(OMConfigKeys.OZONE_OM_ADDRESS_KEY),
         getSetConfStringFromConf(ScmConfigKeys.OZONE_SCM_CLIENT_ADDRESS_KEY),
-        "replicas", "verify", "--checksums", "--block-existence", fullKeyPath, 
"--output-dir", "/"//, "--all-results"
+        "replicas", "verify", "--checksums", "--block-existence", 
"--container-state", fullKeyPath,
+        "--output-dir", "/"//, "--all-results"
     };
 
     int exitCode = ozoneDebugShell.execute(args);
diff --git 
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
 
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
new file mode 100644
index 0000000000..c34d2e17d3
--- /dev/null
+++ 
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.debug.replicas;
+
+import static 
org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor.ONE;
+
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Objects;
+import org.apache.hadoop.hdds.client.StandaloneReplicationConfig;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import 
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto;
+import 
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ReadContainerResponseProto;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.XceiverClientManager;
+import org.apache.hadoop.hdds.scm.XceiverClientSpi;
+import org.apache.hadoop.hdds.scm.cli.ContainerOperationClient;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
+import org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls;
+import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
+
+/**
+ * Verifies the state of a container replica as reported by the datanode, together with
+ * the lifecycle state of the container as known to SCM.
+ * Replica states [DELETED, UNHEALTHY, INVALID] and container states [DELETING, DELETED]
+ * are considered bad states.
+ */
+public class ContainerStateVerifier implements ReplicaVerifier {
+  private static final String CHECK_TYPE = "containerState";
+  private static final long DEFAULT_CONTAINER_CACHE_SIZE = 1000000;
+  // NOTE(review): this client is created here and never closed within this class;
+  // confirm whether the caller is expected to release it.
+  private final ContainerOperationClient containerOperationClient;
+  private final XceiverClientManager xceiverClientManager;
+  // cache for container info and encodedToken from the SCM
+  private final Cache<Long, ContainerInfoToken> encodedTokenCache;
+
+  /**
+   * Creates a verifier backed by a new {@link ContainerOperationClient}.
+   *
+   * @param conf configuration passed to the container operation client
+   * @param containerCacheSize maximum number of containers kept in the in-memory cache;
+   *                           values below 1 are rejected with a warning on stderr and
+   *                           replaced by the default size
+   * @throws IOException if setting up the container operation client fails
+   */
+  public ContainerStateVerifier(OzoneConfiguration conf, long containerCacheSize) throws IOException {
+    containerOperationClient = new ContainerOperationClient(conf);
+    xceiverClientManager = containerOperationClient.getXceiverClientManager();
+
+    long effectiveCacheSize = containerCacheSize;
+    if (effectiveCacheSize < 1) {
+      System.err.println("Invalid cache size provided: " + containerCacheSize +
+              ". Falling back to default: " + DEFAULT_CONTAINER_CACHE_SIZE);
+      effectiveCacheSize = DEFAULT_CONTAINER_CACHE_SIZE;
+    }
+    encodedTokenCache = CacheBuilder.newBuilder().maximumSize(effectiveCacheSize).build();
+  }
+
+  @Override
+  public String getType() {
+    return CHECK_TYPE;
+  }
+
+  /**
+   * Reads the container data of the given replica from the datanode and checks both the
+   * replica state and the container state obtained from SCM.
+   *
+   * @param datanode datanode hosting the replica
+   * @param keyLocation block location whose container is checked
+   * @param replicaIndex replica index registered for the datanode in the single-node pipeline
+   * @return a passing result when neither state is a bad state; a failed check when a bad
+   *         state is found or the container is reported as not existing; an incomplete
+   *         failure when no container data is returned or any other I/O error occurs
+   */
+  @Override
+  public BlockVerificationResult verifyBlock(DatanodeDetails datanode, OmKeyLocationInfo keyLocation,
+                                             int replicaIndex) {
+    try {
+      long containerId = keyLocation.getContainerID();
+      ContainerInfoToken containerInfoToken = getContainerInfoToken(containerId);
+      ContainerDataProto containerData = fetchContainerDataFromDatanode(datanode, containerId,
+          keyLocation, replicaIndex, containerInfoToken);
+
+      if (containerData == null) {
+        return BlockVerificationResult.failIncomplete("No container data returned from DN.");
+      }
+
+      ContainerDataProto.State replicaState = containerData.getState();
+      HddsProtos.LifeCycleState containerState = containerInfoToken.getContainerState();
+      if (areContainerAndReplicasInGoodState(replicaState, containerState)) {
+        return BlockVerificationResult.pass();
+      }
+      return BlockVerificationResult.failCheck("Replica state is " + replicaState.name() +
+          ", Container state in SCM is " + containerState);
+    } catch (IOException e) {
+      // getMessage() may be null for exceptions created without a detail message;
+      // fall back to toString() so the result always carries a description.
+      String message = e.getMessage() != null ? e.getMessage() : e.toString();
+      if (message.contains("ContainerID") && message.contains("does not exist")) {
+        // if container "does not exist", mark it as failed instead of incomplete
+        return BlockVerificationResult.failCheck(message);
+      }
+      return BlockVerificationResult.failIncomplete(message);
+    }
+  }
+
+  /**
+   * Returns true when the replica is not UNHEALTHY, INVALID or DELETED and the container
+   * is not DELETING or DELETED.
+   */
+  private boolean areContainerAndReplicasInGoodState(ContainerDataProto.State replicaState,
+      HddsProtos.LifeCycleState containerState) {
+    return (replicaState != ContainerDataProto.State.UNHEALTHY &&
+        replicaState != ContainerDataProto.State.INVALID &&
+        replicaState != ContainerDataProto.State.DELETED &&
+        containerState != HddsProtos.LifeCycleState.DELETING &&
+        containerState != HddsProtos.LifeCycleState.DELETED);
+  }
+
+  /**
+   * Issues a readContainer call against a single datanode.
+   *
+   * @return the container data from the response, or null when the response carries none
+   * @throws IOException propagated from acquiring the client or from the read call
+   */
+  private ContainerDataProto fetchContainerDataFromDatanode(DatanodeDetails dn, long containerId,
+                                                            OmKeyLocationInfo keyLocation, int replicaIndex,
+                                                            ContainerInfoToken containerInfoToken)
+      throws IOException {
+    // Build a one-node standalone pipeline so the read is served by exactly this datanode.
+    Pipeline pipeline = Pipeline.newBuilder(keyLocation.getPipeline())
+        .setId(dn.getID())
+        .setReplicationConfig(StandaloneReplicationConfig.getInstance(ONE))
+        .setNodes(Collections.singletonList(dn))
+        .setReplicaIndexes(Collections.singletonMap(dn, replicaIndex))
+        .build();
+    String encodedToken = containerInfoToken.getEncodedToken();
+
+    XceiverClientSpi client = null;
+    ReadContainerResponseProto response;
+    try {
+      client = xceiverClientManager.acquireClientForReadData(pipeline);
+      response = ContainerProtocolCalls
+          .readContainer(client, containerId, encodedToken);
+    } finally {
+      if (client != null) {
+        xceiverClientManager.releaseClient(client, false);
+      }
+    }
+
+    return response.hasContainerData() ? response.getContainerData() : null;
+  }
+
+  /**
+   * Returns the cached container state and encoded token for the container, fetching both
+   * through the container operation client on a cache miss.
+   */
+  private ContainerInfoToken getContainerInfoToken(long containerId)
+      throws IOException {
+    ContainerInfoToken cachedData = encodedTokenCache.getIfPresent(containerId);
+    if (cachedData != null) {
+      return cachedData;
+    }
+    // Cache miss - fetch and store
+    ContainerInfo info = containerOperationClient.getContainer(containerId);
+    String encodeToken = containerOperationClient.getEncodedContainerToken(containerId);
+    cachedData = new ContainerInfoToken(info.getState(), encodeToken);
+    encodedTokenCache.put(containerId, cachedData);
+    return cachedData;
+  }
+
+  /**
+   * Immutable pair of a container's lifecycle state and its encoded container token.
+   */
+  private static class ContainerInfoToken {
+    private final HddsProtos.LifeCycleState state;
+    private final String encodedToken;
+
+    ContainerInfoToken(HddsProtos.LifeCycleState lifeState, String token) {
+      this.state = lifeState;
+      this.encodedToken = token;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (this == o) {
+        return true;
+      }
+      if (!(o instanceof ContainerInfoToken)) {
+        return false;
+      }
+      ContainerInfoToken key = (ContainerInfoToken) o;
+      return Objects.equals(state, key.state) &&
+          Objects.equals(encodedToken, key.encodedToken);
+    }
+
+    @Override
+    public int hashCode() {
+      return Objects.hash(state, encodedToken);
+    }
+
+    public HddsProtos.LifeCycleState getContainerState() {
+      return state;
+    }
+
+    public String getEncodedToken() {
+      return encodedToken;
+    }
+  }
+
+}
diff --git 
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java
 
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java
index 38ebee0e8a..2397f8c3d1 100644
--- 
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java
+++ 
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java
@@ -67,6 +67,14 @@ public class ReplicasVerify extends Handler {
   @CommandLine.ArgGroup(exclusive = false, multiplicity = "1")
   private Verification verification;
 
+  @CommandLine.Option(names = {"--container-cache-size"},
+      description = "Size (in number of containers) of the in-memory cache for 
container state verification " +
+          "'--container-state'. Default is 1 million containers (which takes 
around 43MB). " +
+          "Value must be greater than zero, otherwise the default of 1 million 
is considered. " +
+          "Note: This option is ignored if '--container-state' option is not 
used.",
+      defaultValue = "1000000")
+  private long containerCacheSize;
+
   private List<ReplicaVerifier> replicaVerifiers;
 
   @Override
@@ -80,6 +88,9 @@ protected void execute(OzoneClient client, OzoneAddress 
address) throws IOExcept
     if (verification.doExecuteBlockExistence) {
       replicaVerifiers.add(new BlockExistenceVerifier(getConf()));
     }
+    if (verification.doExecuteReplicaState) {
+      replicaVerifiers.add(new ContainerStateVerifier(getConf(), 
containerCacheSize));
+    }
 
     findCandidateKeys(client, address);
   }
@@ -222,5 +233,12 @@ static class Verification {
         defaultValue = "false")
     private boolean doExecuteBlockExistence;
 
+    // Opt-in flag: when set, the container and replica state check is run.
+    @CommandLine.Option(names = "--container-state",
+        description = "Check the container and replica states. " +
+            "Containers in [DELETING, DELETED] states, or " +
+            "their replicas in [DELETED, UNHEALTHY, INVALID] states fail the check.",
+        defaultValue = "false")
+    private boolean doExecuteReplicaState;
+
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to