This is an automated email from the ASF dual-hosted git repository.
tejaskriya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 3c25e7d634 HDDS-12595. Add verifier for container replica states
(#8422)
3c25e7d634 is described below
commit 3c25e7d63441b6c281391c44202eaca179440ab6
Author: Tejaskriya <[email protected]>
AuthorDate: Tue Jun 3 13:53:42 2025 +0530
HDDS-12595. Add verifier for container replica states (#8422)
---
.../main/smoketest/debug/ozone-debug-tests.robot | 3 +-
.../hadoop/ozone/shell/TestOzoneDebugShell.java | 3 +-
.../debug/replicas/ContainerStateVerifier.java | 194 +++++++++++++++++++++
.../ozone/debug/replicas/ReplicasVerify.java | 18 ++
4 files changed, 215 insertions(+), 3 deletions(-)
diff --git a/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-tests.robot
b/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-tests.robot
index 532f931bb5..f402e29d78 100644
--- a/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-tests.robot
+++ b/hadoop-ozone/dist/src/main/smoketest/debug/ozone-debug-tests.robot
@@ -27,7 +27,6 @@ ${VOLUME} cli-debug-volume${PREFIX}
${BUCKET} cli-debug-bucket
${DEBUGKEY} debugKey
${TESTFILE} testfile
-${OM_SERVICE_ID} %{OM_SERVICE_ID}
*** Keywords ***
Write keys
@@ -38,7 +37,7 @@ Write keys
*** Test Cases ***
Test ozone debug replicas verify checksums
- ${output} = Execute ozone debug replicas verify --checksums
o3://${OM_SERVICE_ID}/${VOLUME}/${BUCKET}/${TESTFILE} --output-dir ${TEMP_DIR}
+ ${output} = Execute ozone debug replicas verify --checksums
--block-existence --container-state
o3://${OM_SERVICE_ID}/${VOLUME}/${BUCKET}/${TESTFILE} --output-dir ${TEMP_DIR}
${json} = Evaluate json.loads('''${output}''') json
# 'keys' array should be empty if all keys and their replicas passed
checksum verification
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneDebugShell.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneDebugShell.java
index 6fc9db29e8..a9662abdea 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneDebugShell.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/shell/TestOzoneDebugShell.java
@@ -103,7 +103,8 @@ public void testReplicasVerifyCmd(boolean isEcKey) throws
Exception {
String[] args = new String[] {
getSetConfStringFromConf(OMConfigKeys.OZONE_OM_ADDRESS_KEY),
getSetConfStringFromConf(ScmConfigKeys.OZONE_SCM_CLIENT_ADDRESS_KEY),
- "replicas", "verify", "--checksums", "--block-existence", fullKeyPath,
"--output-dir", "/"//, "--all-results"
+ "replicas", "verify", "--checksums", "--block-existence",
"--container-state", fullKeyPath,
+ "--output-dir", "/"//, "--all-results"
};
int exitCode = ozoneDebugShell.execute(args);
diff --git
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
new file mode 100644
index 0000000000..c34d2e17d3
--- /dev/null
+++
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ContainerStateVerifier.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.debug.replicas;
+
+import static
org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor.ONE;
+
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Objects;
+import org.apache.hadoop.hdds.client.StandaloneReplicationConfig;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto;
+import
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ReadContainerResponseProto;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.XceiverClientManager;
+import org.apache.hadoop.hdds.scm.XceiverClientSpi;
+import org.apache.hadoop.hdds.scm.cli.ContainerOperationClient;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
+import org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls;
+import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
+
+/**
+ * Verifies the state of a replica from the DN.
+ * [DELETED, UNHEALTHY, INVALID] are considered bad states.
+ */
+public class ContainerStateVerifier implements ReplicaVerifier {
+ private static final String CHECK_TYPE = "containerState";
+ private static final long DEFAULT_CONTAINER_CACHE_SIZE = 1000000;
+ private final ContainerOperationClient containerOperationClient;
+ private final XceiverClientManager xceiverClientManager;
+ // cache for container info and encodedToken from the SCM
+ private final Cache<Long, ContainerInfoToken> encodedTokenCache;
+
+ public ContainerStateVerifier(OzoneConfiguration conf, long
containerCacheSize) throws IOException {
+ containerOperationClient = new ContainerOperationClient(conf);
+ xceiverClientManager = containerOperationClient.getXceiverClientManager();
+
+ if (containerCacheSize < 1) {
+ System.err.println("Invalid cache size provided: " + containerCacheSize +
+ ". Falling back to default: " + DEFAULT_CONTAINER_CACHE_SIZE);
+ containerCacheSize = DEFAULT_CONTAINER_CACHE_SIZE;
+ }
+ encodedTokenCache =
CacheBuilder.newBuilder().maximumSize(containerCacheSize).build();
+ }
+
+ @Override
+ public String getType() {
+ return CHECK_TYPE;
+ }
+
+ @Override
+ public BlockVerificationResult verifyBlock(DatanodeDetails datanode,
OmKeyLocationInfo keyLocation,
+ int replicaIndex) {
+ try {
+ StringBuilder replicaCheckMsg = new StringBuilder().append("Replica
state is ");
+ boolean pass = false;
+
+ ContainerInfoToken containerInfoToken =
getContainerInfoToken(keyLocation.getContainerID());
+ ContainerDataProto containerData =
fetchContainerDataFromDatanode(datanode, keyLocation.getContainerID(),
+ keyLocation, replicaIndex, containerInfoToken);
+
+ if (containerData == null) {
+ return BlockVerificationResult.failIncomplete("No container data
returned from DN.");
+ }
+ ContainerDataProto.State state = containerData.getState();
+ replicaCheckMsg.append(state.name());
+ if (areContainerAndReplicasInGoodState(state,
containerInfoToken.getContainerState())) {
+ pass = true;
+ }
+ replicaCheckMsg.append(", Container state in SCM is
").append(containerInfoToken.getContainerState());
+
+ if (pass) {
+ return BlockVerificationResult.pass();
+ } else {
+ return BlockVerificationResult.failCheck(replicaCheckMsg.toString());
+ }
+ } catch (IOException e) {
+ if (e.getMessage().contains("ContainerID") &&
e.getMessage().contains("does not exist")) {
+ // if container "does not exist", mark it as failed instead of
incomplete
+ return BlockVerificationResult.failCheck(e.getMessage());
+ }
+ return BlockVerificationResult.failIncomplete(e.getMessage());
+ }
+ }
+
+ private boolean areContainerAndReplicasInGoodState(ContainerDataProto.State
replicaState,
+ HddsProtos.LifeCycleState containerState) {
+ return (replicaState != ContainerDataProto.State.UNHEALTHY &&
+ replicaState != ContainerDataProto.State.INVALID &&
+ replicaState != ContainerDataProto.State.DELETED &&
+ containerState != HddsProtos.LifeCycleState.DELETING &&
+ containerState != HddsProtos.LifeCycleState.DELETED);
+ }
+
+ private ContainerDataProto fetchContainerDataFromDatanode(DatanodeDetails
dn, long containerId,
+ OmKeyLocationInfo
keyLocation, int replicaIndex,
+ ContainerInfoToken
containerInfoToken)
+ throws IOException {
+ XceiverClientSpi client = null;
+ ReadContainerResponseProto response;
+ try {
+ Pipeline pipeline = Pipeline.newBuilder(keyLocation.getPipeline())
+ .setId(dn.getID())
+ .setReplicationConfig(StandaloneReplicationConfig.getInstance(ONE))
+ .setNodes(Collections.singletonList(dn))
+ .setReplicaIndexes(Collections.singletonMap(dn, replicaIndex))
+ .build();
+ String encodedToken = containerInfoToken.getEncodedToken();
+
+ client = xceiverClientManager.acquireClientForReadData(pipeline);
+ response = ContainerProtocolCalls
+ .readContainer(client, containerId, encodedToken);
+ } finally {
+ if (client != null) {
+ xceiverClientManager.releaseClient(client, false);
+ }
+ }
+
+ if (!response.hasContainerData()) {
+ return null;
+ }
+ return response.getContainerData();
+ }
+
+ private ContainerInfoToken getContainerInfoToken(long containerId)
+ throws IOException {
+ ContainerInfoToken cachedData =
encodedTokenCache.getIfPresent(containerId);
+ if (cachedData != null) {
+ return cachedData;
+ }
+ // Cache miss - fetch and store
+ ContainerInfo info = containerOperationClient.getContainer(containerId);
+ String encodeToken =
containerOperationClient.getEncodedContainerToken(containerId);
+ cachedData = new ContainerInfoToken(info.getState(), encodeToken);
+ encodedTokenCache.put(containerId, cachedData);
+ return cachedData;
+ }
+
+ private static class ContainerInfoToken {
+ private HddsProtos.LifeCycleState state;
+ private final String encodedToken;
+
+ ContainerInfoToken(HddsProtos.LifeCycleState lifeState, String token) {
+ this.state = lifeState;
+ this.encodedToken = token;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (!(o instanceof ContainerInfoToken)) {
+ return false;
+ }
+ ContainerInfoToken key = (ContainerInfoToken) o;
+ return Objects.equals(state, key.state) &&
+ Objects.equals(encodedToken, key.encodedToken);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(state, encodedToken);
+ }
+
+ public HddsProtos.LifeCycleState getContainerState() {
+ return state;
+ }
+
+ public String getEncodedToken() {
+ return encodedToken;
+ }
+ }
+
+}
diff --git
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java
index 38ebee0e8a..2397f8c3d1 100644
---
a/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java
+++
b/hadoop-ozone/tools/src/main/java/org/apache/hadoop/ozone/debug/replicas/ReplicasVerify.java
@@ -67,6 +67,14 @@ public class ReplicasVerify extends Handler {
@CommandLine.ArgGroup(exclusive = false, multiplicity = "1")
private Verification verification;
+ @CommandLine.Option(names = {"--container-cache-size"},
+ description = "Size (in number of containers) of the in-memory cache for
container state verification " +
+ "'--container-state'. Default is 1 million containers (which takes
around 43MB). " +
+ "Value must be greater than zero, otherwise the default of 1 million
is considered. " +
+ "Note: This option is ignored if '--container-state' option is not
used.",
+ defaultValue = "1000000")
+ private long containerCacheSize;
+
private List<ReplicaVerifier> replicaVerifiers;
@Override
@@ -80,6 +88,9 @@ protected void execute(OzoneClient client, OzoneAddress
address) throws IOExcept
if (verification.doExecuteBlockExistence) {
replicaVerifiers.add(new BlockExistenceVerifier(getConf()));
}
+ if (verification.doExecuteReplicaState) {
+ replicaVerifiers.add(new ContainerStateVerifier(getConf(),
containerCacheSize));
+ }
findCandidateKeys(client, address);
}
@@ -222,5 +233,12 @@ static class Verification {
defaultValue = "false")
private boolean doExecuteBlockExistence;
+    // Enables the container/replica state check; off unless '--container-state' is passed.
+    @CommandLine.Option(names = "--container-state",
+        description = "Check the container and replica states. " +
+            "Containers in [DELETING, DELETED] states, or " +
+            "their replicas in [DELETED, UNHEALTHY, INVALID] states fail the check.",
+        defaultValue = "false")
+    private boolean doExecuteReplicaState;
+
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]