This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/main by this push:
     new bbdb45dcb9c Better handle timeouts on test failures (#46993)
bbdb45dcb9c is described below

commit bbdb45dcb9cf0b837936fe10ebb7c1fd2e852f69
Author: Jarek Potiuk <[email protected]>
AuthorDate: Sun Feb 23 14:38:58 2025 +0100

    Better handle timeouts on test failures (#46993)
    
    We used to send SIGQUIT to running containers in tests if they
    were running for too long - that was supposed to handle the case
    when a test was hanging in the way that pytest timeout handling could
    not break individual tests. That signal was set about 10 minutes before
    the total job timeout to give the tests time to upload log files and
    print diagnostic information before the runner was killed by GitHub
    Actions mechanisms.
    
    This however was not enough - we recently had a number of hanging
    tests that were failing after 1h 58 minutes - which means that our
    signal sending to docker containers was not effective - our
    SIGQUIT did not stop them.
    
    This PR makes the timeout code more resilient and possibly gives us
    a chance to see what is going on:
    
    1) we are printing more diagnostics information when we attempt to
       send the signals
    2) we are sending SIGTERM instead of SIGQUIT as this seems to be
       more standard way of stopping containers (SIGQUIT was default
       STOPSIGNAL in the past but SIGTERM is now more commonly used)
    3) we wait 10 seconds after sending the signal and if the containers
       are still running we send SIGKILL to them - which, in theory,
       always kills the containers unconditionally - giving us a chance
       to print and upload diagnostic information.
---
 .../airflow_breeze/commands/testing_commands.py    | 44 +++++++++++++++++++---
 1 file changed, 39 insertions(+), 5 deletions(-)

diff --git a/dev/breeze/src/airflow_breeze/commands/testing_commands.py 
b/dev/breeze/src/airflow_breeze/commands/testing_commands.py
index 288b4f8ef64..b1fd3e81276 100644
--- a/dev/breeze/src/airflow_breeze/commands/testing_commands.py
+++ b/dev/breeze/src/airflow_breeze/commands/testing_commands.py
@@ -21,6 +21,7 @@ import os
 import signal
 import sys
 from datetime import datetime
+from subprocess import CalledProcessError, CompletedProcess
 from time import sleep
 
 import click
@@ -100,6 +101,8 @@ from airflow_breeze.utils.run_tests import (
 from airflow_breeze.utils.run_utils import run_command
 from airflow_breeze.utils.selective_checks import ALL_CI_SELECTIVE_TEST_TYPES
 
+GRACE_CONTAINER_STOP_TIMEOUT = 10  # Timeout in seconds to wait for containers 
to get killed
+
 LOW_MEMORY_CONDITION = 8 * 1024 * 1024 * 1024
 DEFAULT_TOTAL_TEST_TIMEOUT = 6500  # 6500 seconds = 1h 48 minutes
 
@@ -1100,18 +1103,49 @@ def python_api_client_tests(
 @contextlib.contextmanager
 def run_with_timeout(timeout: int):
     def timeout_handler(signum, frame):
-        get_console().print("[error]Timeout reached. Killing the 
container(s)[/]")
-        list_of_containers = run_command(
+        get_console().print("[warning]Timeout reached. Killing the 
container(s)[/]:")
+        _print_all_containers()
+        list_of_containers = _get_running_containers().stdout.splitlines()
+        get_console().print("[warning]Attempting to send TERM signal to all 
remaining containers:")
+        get_console().print(list_of_containers)
+        _send_signal_to_containers(list_of_containers, "SIGTERM")
+        get_console().print(f"[warning]Waiting {GRACE_CONTAINER_STOP_TIMEOUT} 
seconds for containers to stop")
+        sleep(GRACE_CONTAINER_STOP_TIMEOUT)
+        containers_left = _get_running_containers().stdout.splitlines()
+        if containers_left:
+            get_console().print("[warning]Some containers are still running. 
Killing them with SIGKILL:")
+            get_console().print(containers_left)
+            _send_signal_to_containers(list_of_containers, "SIGKILL")
+            get_console().print(
+                f"[warning]Waiting {GRACE_CONTAINER_STOP_TIMEOUT} seconds for 
containers to stop"
+            )
+            sleep(GRACE_CONTAINER_STOP_TIMEOUT)
+            containers_left = _get_running_containers().stdout.splitlines()
+            if containers_left:
+                get_console().print("[error]Some containers are still running. 
Exiting anyway.")
+                get_console().print(containers_left)
+                sys.exit(1)
+
+    def _send_signal_to_containers(list_of_containers: list[str], signal: str):
+        run_command(
+            ["docker", "kill", "--signal", signal, *list_of_containers],
+            check=True,
+            capture_output=False,
+            text=True,
+        )
+
+    def _get_running_containers() -> CompletedProcess | CalledProcessError:
+        return run_command(
             ["docker", "ps", "-q"],
             check=True,
             capture_output=True,
             text=True,
         )
+
+    def _print_all_containers():
         run_command(
-            ["docker", "kill", "--signal", "SIGQUIT"] + 
list_of_containers.stdout.splitlines(),
+            ["docker", "ps"],
             check=True,
-            capture_output=False,
-            text=True,
         )
 
     signal.signal(signal.SIGALRM, timeout_handler)

Reply via email to