This is an automated email from the ASF dual-hosted git repository.

tejaskriya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new e06c19312ac HDDS-13123. Add testing for the `ozone repair om skip-ratis-transaction` command (#8810)
e06c19312ac is described below

commit e06c19312ac742406b1f4ff07d4395c67dc6e443
Author: Tejaskriya <87555809+tejaskr...@users.noreply.github.com>
AuthorDate: Wed Jul 30 14:03:57 2025 +0530

    HDDS-13123. Add testing for the `ozone repair om skip-ratis-transaction` command (#8810)
---
 dev-support/byteman/fail-create-bucket.btm         | 28 +++++++++++++
 .../compose/ozonesecure-ha/test-repair-tools.sh    | 48 ++++++++++++++++++++++
 .../repair/ratis-transaction-repair.robot          | 35 ++++++++++++++++
 3 files changed, 111 insertions(+)

diff --git a/dev-support/byteman/fail-create-bucket.btm b/dev-support/byteman/fail-create-bucket.btm
new file mode 100644
index 00000000000..f624e45717c
--- /dev/null
+++ b/dev-support/byteman/fail-create-bucket.btm
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# This script instruments ozone manager to fail every CreateBucket request
+# (the rule condition is IF TRUE, so it is not limited to a specific bucket name)
+#
+
+# Fires on entry to OMBucketCreateRequest.validateAndUpdateCache. The condition
+# is IF TRUE, so every invocation prints a trace line and then throws a
+# RuntimeException with the message "Byteman crashes OM".
+RULE Crash OM with CreateBucket
+CLASS org.apache.hadoop.ozone.om.request.bucket.OMBucketCreateRequest
+METHOD validateAndUpdateCache
+AT ENTRY
+IF TRUE
+DO
+  traceln("--> crashing CreateBucket request");
+  THROW new RuntimeException("Byteman crashes OM");
+ENDRULE
\ No newline at end of file
diff --git a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh
index f181f5c6570..da54e913deb 100644
--- a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh
+++ b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh
@@ -40,6 +40,54 @@ create_data_dirs dn{1..5} kms om{1..3} recon s3g scm{1..3}
 
 start_docker_env
 
+# Waits for an OM container to stop, runs `ozone repair om srt` with --index=2
+# against that OM's in-progress Ratis log segment, starts the container again
+# and checks that 'bucket-crash-1' is absent from the OM's bucketTable.
+# Globals:
+#   SCM - container in which the find / repair / ldb commands are executed
+# Arguments:
+#   $1 - docker container name of the OM
+#   $2 - OM id, used in the paths under /opt/hadoop/compose/ozonesecure-ha/data
+# Exits the script with status 1 when the container does not stop within the
+# timeout, no Ratis log segment is found, the container cannot be started, or
+# the bucket is present in the bucketTable.
+repair_and_restart_om() {
+  local om_container="$1"
+  local om_id="$2"
+  # Working variables are local so they do not leak into the calling script.
+  local timeout=60  # seconds
+  local start_time current_time elapsed
+  local logpath bucketTable
+
+  echo "Waiting for container '${om_container}' to stop..."
+  # Loop until the container is not running
+  start_time=$(date +%s)
+  while [[ "$(docker inspect -f '{{.State.Running}}' "${om_container}" 2>/dev/null)" == "true" ]]; do
+    current_time=$(date +%s)
+    elapsed=$((current_time - start_time))
+
+    if (( elapsed >= timeout )); then
+      echo "Timeout: Container '${om_container}' did not stop within ${timeout} seconds."
+      exit 1
+    fi
+    sleep 1
+  done
+  echo "Container '${om_container}' has stopped."
+
+  logpath=$(execute_command_in_container "${SCM}" bash -c "find / -type f -path '/*/$om_id/*/log_inprogress_0' 2>/dev/null | head -n 1")
+  echo "Ratis log segment file path: ${logpath}"
+  # Without a segment path the repair command would be invoked with an empty -s=.
+  if [[ -z "${logpath}" ]]; then
+    echo "No Ratis log segment (log_inprogress_0) found for ${om_id}; cannot run the repair command." >&2
+    exit 1
+  fi
+
+  execute_command_in_container "${SCM}" bash -c "ozone repair om srt -b=/opt/hadoop/compose/ozonesecure-ha/data/$om_id/backup1 --index=2 -s=${logpath}"
+  echo "Repair command executed for ${om_id}."
+  if ! docker start "${om_container}"; then
+    echo "Failed to start container '${om_container}'." >&2
+    exit 1
+  fi
+  echo "Container '${om_container}' started again."
+  bucketTable=$(execute_command_in_container "${SCM}" bash -c "ozone debug ldb --db=/opt/hadoop/compose/ozonesecure-ha/data/$om_id/metadata/om.db scan --cf=bucketTable")
+  echo "Bucket table for ${om_id}:"
+  # A here-string is used instead of `echo | grep -q`: if pipefail is enabled,
+  # grep -q exiting early can make the pipeline report failure despite a match.
+  if grep -q "bucket-crash-1" <<<"${bucketTable}"; then
+    echo "bucket 'bucket-crash-1' should not have been created, but it is present in the bucketTable of $om_id"
+    exit 1
+  else
+    echo "bucket 'bucket-crash-1' is not present in the bucketTable of $om_id as expected."
+  fi
+}
+
+echo "Testing ratis transaction repair on all OMs"
+execute_robot_test ${SCM} kinit.robot
+execute_robot_test ${SCM} repair/ratis-transaction-repair.robot
+repair_and_restart_om "ozonesecure-ha-om1-1" "om1"
+repair_and_restart_om "ozonesecure-ha-om2-1" "om2"
+repair_and_restart_om "ozonesecure-ha-om3-1" "om3"
+if ! execute_command_in_container scm1.org timeout 15s ozone sh volume list 
1>/dev/null; then
+  echo "Command timed out or failed => OMs are not running as expected. Test 
for repairing ratis transaction failed."
+  exit 1
+fi
+echo "Testing ratis transaction repair completed successfully."
+
 execute_robot_test ${OM} kinit.robot
 
 echo "Creating test keys to verify om compaction"
diff --git a/hadoop-ozone/dist/src/main/smoketest/repair/ratis-transaction-repair.robot b/hadoop-ozone/dist/src/main/smoketest/repair/ratis-transaction-repair.robot
new file mode 100644
index 00000000000..e3a8d7cd997
--- /dev/null
+++ b/hadoop-ozone/dist/src/main/smoketest/repair/ratis-transaction-repair.robot
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+*** Settings ***
+Documentation       Test recovering from OM crash due to transaction failure
+Library             OperatingSystem
+Library             BuiltIn
+Library             Process
+Resource            ../lib/os.robot
+Resource            ../ozone-fi/BytemanKeywords.robot
+
+*** Variables ***
+${VOLUME}               test-txn-vol
+${BAD_BUCKET}           bucket-crash-1
+${CRASH_RULE}           /opt/hadoop/share/ozone/byteman/fail-create-bucket.btm
+${TIMEOUT}              10 seconds
+
+*** Test Cases ***
+Verify OM crash at bucket create
+    # Load the rule file named by ${CRASH_RULE} before issuing any request.
+    # NOTE(review): the Inject/Remove Fault keywords presumably come from
+    # BytemanKeywords.robot - confirm.
+    Inject Fault Into OMs Only      ${CRASH_RULE}
+    Execute         ozone sh volume create o3://${OM_SERVICE_ID}/${VOLUME}
+    # The bucket create is run through Run Process with a timeout of ${TIMEOUT};
+    # its result is not captured or asserted in this test case.
+    Run Process     ozone sh bucket create 
o3://${OM_SERVICE_ID}/${VOLUME}/${BAD_BUCKET}    timeout=${TIMEOUT}    
shell=True
+    Remove Fault From OMs Only      ${CRASH_RULE}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@ozone.apache.org
For additional commands, e-mail: commits-h...@ozone.apache.org

Reply via email to