This is an automated email from the ASF dual-hosted git repository.

tejaskriya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
     new e06c19312ac HDDS-13123. Add testing for the `ozone repair om skip-ratis-transaction` command (#8810)
e06c19312ac is described below

commit e06c19312ac742406b1f4ff07d4395c67dc6e443
Author: Tejaskriya <87555809+tejaskr...@users.noreply.github.com>
AuthorDate: Wed Jul 30 14:03:57 2025 +0530

    HDDS-13123. Add testing for the `ozone repair om skip-ratis-transaction` command (#8810)
---
 dev-support/byteman/fail-create-bucket.btm         | 28 +++++++++++++
 .../compose/ozonesecure-ha/test-repair-tools.sh    | 48 ++++++++++++++++++++++
 .../repair/ratis-transaction-repair.robot          | 35 ++++++++++++++++
 3 files changed, 111 insertions(+)

diff --git a/dev-support/byteman/fail-create-bucket.btm b/dev-support/byteman/fail-create-bucket.btm
new file mode 100644
index 00000000000..f624e45717c
--- /dev/null
+++ b/dev-support/byteman/fail-create-bucket.btm
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +# +# This script instruments ozone manager to fail a CreateBucket request for a specific name +# + +RULE Crash OM with CreateBucket +CLASS org.apache.hadoop.ozone.om.request.bucket.OMBucketCreateRequest +METHOD validateAndUpdateCache +AT ENTRY +IF TRUE +DO + traceln("--> crashing CreateBucket request"); + THROW new RuntimeException("Byteman crashes OM"); +ENDRULE \ No newline at end of file diff --git a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh index f181f5c6570..da54e913deb 100644 --- a/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh +++ b/hadoop-ozone/dist/src/main/compose/ozonesecure-ha/test-repair-tools.sh @@ -40,6 +40,54 @@ create_data_dirs dn{1..5} kms om{1..3} recon s3g scm{1..3} start_docker_env +repair_and_restart_om() { + local om_container="$1" + local om_id="$2" + echo "Waiting for container '${om_container}' to stop..." + # Loop until the container is not running + timeout=60 # seconds + start_time=$(date +%s) + while [ "$(docker inspect -f '{{.State.Running}}' "${om_container}" 2>/dev/null)" == "true" ]; do + current_time=$(date +%s) + elapsed=$((current_time - start_time)) + + if [ "$elapsed" -ge "$timeout" ]; then + echo "Timeout: Container '${om_container}' did not stop within ${timeout} seconds." + exit 1 + fi + sleep 1 + done + echo "Container '${om_container}' has stopped." + + logpath=$(execute_command_in_container ${SCM} bash -c "find / -type f -path '/*/$om_id/*/log_inprogress_0' 2>/dev/null | head -n 1") + echo "Ratis log segment file path: ${logpath}" + + execute_command_in_container ${SCM} bash -c "ozone repair om srt -b=/opt/hadoop/compose/ozonesecure-ha/data/$om_id/backup1 --index=2 -s=${logpath}" + echo "Repair command executed for ${om_id}." + docker start "${om_container}" + echo "Container '${om_container}' started again." 
+ bucketTable=$(execute_command_in_container ${SCM} bash -c "ozone debug ldb --db=/opt/hadoop/compose/ozonesecure-ha/data/$om_id/metadata/om.db scan --cf=bucketTable") + echo "Bucket table for ${om_id}:" + if echo "$bucketTable" | grep -q "bucket-crash-1"; then + echo "bucket 'bucket-crash-1' should not have been created, but it is present in the bucketTable of $om_id" + exit 1 + else + echo "bucket 'bucket-crash-1' is not present in the bucketTable of $om_id as expected." + fi +} + +echo "Testing ratis transaction repair on all OMs" +execute_robot_test ${SCM} kinit.robot +execute_robot_test ${SCM} repair/ratis-transaction-repair.robot +repair_and_restart_om "ozonesecure-ha-om1-1" "om1" +repair_and_restart_om "ozonesecure-ha-om2-1" "om2" +repair_and_restart_om "ozonesecure-ha-om3-1" "om3" +if ! execute_command_in_container scm1.org timeout 15s ozone sh volume list 1>/dev/null; then + echo "Command timed out or failed => OMs are not running as expected. Test for repairing ratis transaction failed." + exit 1 +fi +echo "Testing ratis transaction repair completed successfully." + execute_robot_test ${OM} kinit.robot echo "Creating test keys to verify om compaction" diff --git a/hadoop-ozone/dist/src/main/smoketest/repair/ratis-transaction-repair.robot b/hadoop-ozone/dist/src/main/smoketest/repair/ratis-transaction-repair.robot new file mode 100644 index 00000000000..e3a8d7cd997 --- /dev/null +++ b/hadoop-ozone/dist/src/main/smoketest/repair/ratis-transaction-repair.robot @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +*** Settings *** +Documentation Test recovering from OM crash due to transaction failure +Library OperatingSystem +Library BuiltIn +Library Process +Resource ../lib/os.robot +Resource ../ozone-fi/BytemanKeywords.robot + +*** Variables *** +${VOLUME} test-txn-vol +${BAD_BUCKET} bucket-crash-1 +${CRASH_RULE} /opt/hadoop/share/ozone/byteman/fail-create-bucket.btm +${TIMEOUT} 10 seconds + +*** Test Cases *** +Verify OM crash at bucket create + Inject Fault Into OMs Only ${CRASH_RULE} + Execute ozone sh volume create o3://${OM_SERVICE_ID}/${VOLUME} + Run Process ozone sh bucket create o3://${OM_SERVICE_ID}/${VOLUME}/${BAD_BUCKET} timeout=${TIMEOUT} shell=True + Remove Fault From OMs Only ${CRASH_RULE} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@ozone.apache.org For additional commands, e-mail: commits-h...@ozone.apache.org