This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new bbdb45dcb9c Better handle timeouts on test failures (#46993)
bbdb45dcb9c is described below
commit bbdb45dcb9cf0b837936fe10ebb7c1fd2e852f69
Author: Jarek Potiuk <[email protected]>
AuthorDate: Sun Feb 23 14:38:58 2025 +0100
Better handle timeouts on test failures (#46993)
We used to send SIGQUIT to running containers in tests if they
were running for too long - that was supposed to handle the case
when a test was hanging in the way that pytest timeout handling could
not break individual tests. That signal was set about 10 minutes before
the total job timeout to give the tests time to upload log files and
print diagnostics information before the runner was killed by GitHub
Actions mechanisms.
This however was not enough - we had recently a number of hanging
tests that were failing after 1h 58 minutes - which means that our
signal sending to docker containers was not effective - and our
SIGQUIT was not effective.
This PR makes the timeout code more resilient and possibly gives us
a chance to see what is going on:
1) we are printing more diagnostics information when we attempt to
send the signals
2) we are sending SIGTERM instead of SIGQUIT as this seems to be
more standard way of stopping containers (SIGQUIT was default
STOPSIGNAL in the past but SIGTERM is now more commonly used)
3) we wait 10 seconds after sending the signal and if the containers
are still running we send SIGKILL to the containers which - in
theory kills the containers always, unconditionally - and gives
us a chance to print and upload diagnostics information.
---
.../airflow_breeze/commands/testing_commands.py | 44 +++++++++++++++++++---
1 file changed, 39 insertions(+), 5 deletions(-)
diff --git a/dev/breeze/src/airflow_breeze/commands/testing_commands.py
b/dev/breeze/src/airflow_breeze/commands/testing_commands.py
index 288b4f8ef64..b1fd3e81276 100644
--- a/dev/breeze/src/airflow_breeze/commands/testing_commands.py
+++ b/dev/breeze/src/airflow_breeze/commands/testing_commands.py
@@ -21,6 +21,7 @@ import os
import signal
import sys
from datetime import datetime
+from subprocess import CalledProcessError, CompletedProcess
from time import sleep
import click
@@ -100,6 +101,8 @@ from airflow_breeze.utils.run_tests import (
from airflow_breeze.utils.run_utils import run_command
from airflow_breeze.utils.selective_checks import ALL_CI_SELECTIVE_TEST_TYPES
+GRACE_CONTAINER_STOP_TIMEOUT = 10 # Timeout in seconds to wait for containers
to get killed
+
LOW_MEMORY_CONDITION = 8 * 1024 * 1024 * 1024
DEFAULT_TOTAL_TEST_TIMEOUT = 6500 # 6500 seconds = 1h 48 minutes
@@ -1100,18 +1103,49 @@ def python_api_client_tests(
@contextlib.contextmanager
def run_with_timeout(timeout: int):
def timeout_handler(signum, frame):
- get_console().print("[error]Timeout reached. Killing the
container(s)[/]")
- list_of_containers = run_command(
+ get_console().print("[warning]Timeout reached. Killing the
container(s)[/]:")
+ _print_all_containers()
+ list_of_containers = _get_running_containers().stdout.splitlines()
+ get_console().print("[warning]Attempting to send TERM signal to all
remaining containers:")
+ get_console().print(list_of_containers)
+ _send_signal_to_containers(list_of_containers, "SIGTERM")
+ get_console().print(f"[warning]Waiting {GRACE_CONTAINER_STOP_TIMEOUT}
seconds for containers to stop")
+ sleep(GRACE_CONTAINER_STOP_TIMEOUT)
+ containers_left = _get_running_containers().stdout.splitlines()
+ if containers_left:
+ get_console().print("[warning]Some containers are still running.
Killing them with SIGKILL:")
+ get_console().print(containers_left)
+ _send_signal_to_containers(list_of_containers, "SIGKILL")
+ get_console().print(
+ f"[warning]Waiting {GRACE_CONTAINER_STOP_TIMEOUT} seconds for
containers to stop"
+ )
+ sleep(GRACE_CONTAINER_STOP_TIMEOUT)
+ containers_left = _get_running_containers().stdout.splitlines()
+ if containers_left:
+ get_console().print("[error]Some containers are still running.
Exiting anyway.")
+ get_console().print(containers_left)
+ sys.exit(1)
+
+ def _send_signal_to_containers(list_of_containers: list[str], signal: str):
+ run_command(
+ ["docker", "kill", "--signal", signal, *list_of_containers],
+ check=True,
+ capture_output=False,
+ text=True,
+ )
+
+ def _get_running_containers() -> CompletedProcess | CalledProcessError:
+ return run_command(
["docker", "ps", "-q"],
check=True,
capture_output=True,
text=True,
)
+
+ def _print_all_containers():
run_command(
- ["docker", "kill", "--signal", "SIGQUIT"] +
list_of_containers.stdout.splitlines(),
+ ["docker", "ps"],
check=True,
- capture_output=False,
- text=True,
)
signal.signal(signal.SIGALRM, timeout_handler)