This is an automated email from the ASF dual-hosted git repository.
davidarthur pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/kafka.git
The following commit(s) were added to refs/heads/trunk by this push:
new 420f69abbd1 MINOR Add a thread dump on build timeout (#17181)
420f69abbd1 is described below
commit 420f69abbd1e8537edee0c8f78a51b93744bafb7
Author: David Arthur <[email protected]>
AuthorDate: Fri Sep 13 11:16:50 2024 -0400
MINOR Add a thread dump on build timeout (#17181)
In the case of a CI timeout, this patch uses jstack to capture thread dumps
from the Gradle test workers.
These thread dumps are stored in files which are later archived by the CI
workflow.
This patch also increases the compression level to 9 for our
"actions/upload-artifact" steps to save a bit of storage space.
Reviewers: Chia-Ping Tsai <[email protected]>
---
.github/scripts/junit.py | 10 ++++++++--
.github/scripts/thread-dump.sh | 35 +++++++++++++++++++++++++++++++++++
.github/workflows/build.yml | 22 ++++++++++++++++++++--
3 files changed, 63 insertions(+), 4 deletions(-)
diff --git a/.github/scripts/junit.py b/.github/scripts/junit.py
index 4864c45196a..60561cbf136 100644
--- a/.github/scripts/junit.py
+++ b/.github/scripts/junit.py
@@ -219,7 +219,7 @@ if __name__ == "__main__":
logger.info(f"Finished processing {len(reports)} reports")
# Print summary
- report_url = get_env("REPORT_URL")
+ report_url = get_env("JUNIT_REPORT_URL")
report_md = f"Download [HTML report]({report_url})."
summary = (f"{total_run} tests cases run in {duration}. "
f"{total_success} {PASSED}, {total_failures} {FAILED}, "
@@ -259,9 +259,15 @@ if __name__ == "__main__":
# Print special message if there was a timeout
exit_code = get_env("GRADLE_EXIT_CODE", int)
if exit_code == 124:
+ thread_dump_url = get_env("THREAD_DUMP_URL")
logger.debug(f"Gradle command timed out. These are partial results!")
logger.debug(summary)
- logger.debug("Failing this step because the tests timed out.")
+ if thread_dump_url:
+ print(f"\nThe JUnit tests were cancelled due to a timeout. Thread dumps were generated before the job was cancelled. "
+ f"Download [thread dumps]({thread_dump_url}).\n")
+ logger.debug(f"Failing this step because the tests timed out. Thread dumps were taken and archived here: {thread_dump_url}")
+ else:
+ logger.debug(f"Failing this step because the tests timed out. Thread dumps were not archived, check logs in JUnit step.")
exit(1)
elif exit_code in (0, 1):
logger.debug(summary)
diff --git a/.github/scripts/thread-dump.sh b/.github/scripts/thread-dump.sh
new file mode 100755
index 00000000000..8f387a3974c
--- /dev/null
+++ b/.github/scripts/thread-dump.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SLEEP_MINUTES=$(($TIMEOUT_MINUTES-5))
+echo "Dumping threads in $SLEEP_MINUTES minutes"
+sleep $(($SLEEP_MINUTES*60));
+
+echo "Timed out after $SLEEP_MINUTES minutes. Dumping threads now..."
+mkdir thread-dumps
+sleep 5;
+
+for GRADLE_WORKER_PID in `jps | grep GradleWorkerMain | awk -F" " '{print $1}'`;
+do
+ echo "Dumping threads for GradleWorkerMain pid $GRADLE_WORKER_PID into $FILENAME";
+ FILENAME="thread-dumps/GradleWorkerMain-$GRADLE_WORKER_PID.txt"
+ jstack $GRADLE_WORKER_PID > $FILENAME
+ if ! grep -q "kafka" $FILENAME; then
+ echo "No match for 'kafka' in thread dump file $FILENAME, discarding it."
+ rm $FILENAME;
+ fi;
+ sleep 5;
+done;
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 226d8e2fb16..666e0a92f87 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -75,6 +75,7 @@ jobs:
name: check-reports-${{ matrix.java }}
path: |
**/build/**/*.html
+ compression-level: 9
if-no-files-found: ignore
- name: Annotate checkstyle errors
# Avoid duplicate annotations, only run on java 21
@@ -110,9 +111,12 @@ jobs:
# --continue: Keep running even if a test fails
# -PcommitId Prevent the Git SHA being written into the jar files (which breaks caching)
id: junit-test
+ env:
+ TIMEOUT_MINUTES: 180 # 3 hours
run: |
set +e
- timeout 180m ./gradlew --build-cache --continue \
+ ./.github/scripts/thread-dump.sh &
+ timeout ${TIMEOUT_MINUTES}m ./gradlew --build-cache --continue \
${{ inputs.is-public-fork == 'true' && '--no-scan' || '--scan' }} \
-PtestLoggingEvents=started,passed,skipped,failed \
-PmaxParallelForks=2 \
@@ -128,12 +132,24 @@ jobs:
name: junit-reports-${{ matrix.java }}
path: |
**/build/reports/tests/test/*
+ compression-level: 9
+ if-no-files-found: ignore
+ - name: Archive Thread Dumps
+ id: thread-dump-upload-artifact
+ if: always() && steps.junit-test.outputs.exitcode == '124'
+ uses: actions/upload-artifact@v4
+ with:
+ name: junit-thread-dumps-${{ matrix.java }}
+ path: |
+ thread-dumps/*
+ compression-level: 9
if-no-files-found: ignore
- name: Parse JUnit tests
run: python .github/scripts/junit.py >> $GITHUB_STEP_SUMMARY
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- REPORT_URL: ${{ steps.junit-upload-artifact.outputs.artifact-url }}
+ JUNIT_REPORT_URL: ${{ steps.junit-upload-artifact.outputs.artifact-url }}
+ THREAD_DUMP_URL: ${{ steps.thread-dump-upload-artifact.outputs.artifact-url }}
GRADLE_EXIT_CODE: ${{ steps.junit-test.outputs.exitcode }}
- name: Archive Build Scan
if: always()
@@ -141,3 +157,5 @@ jobs:
with:
name: build-scan-test-${{ matrix.java }}
path: ~/.gradle/build-scan-data
+ compression-level: 9
+ if-no-files-found: ignore