This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch v3-1-test
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/v3-1-test by this push:
     new 208603bcd06 [v3-1-test] Reduct k8s test flakiness (#59885) (#59913)
208603bcd06 is described below

commit 208603bcd068ab9dd7d10a7c4382664c1d52a035
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Mon Dec 29 22:05:12 2025 +0100

    [v3-1-test] Reduct k8s test flakiness (#59885) (#59913)
    
    - Increase timeout from 10 to 20min
    - Add 2 auto-retries (so 3 attempts in total) for timeout errors
    (cherry picked from commit 15f3212fa2c0b4c28733947de67c3576c46757df)
    
    Co-authored-by: Dev-iL <[email protected]>
---
 .../airflow_breeze/commands/kubernetes_commands.py | 76 ++++++++++++++++++----
 1 file changed, 64 insertions(+), 12 deletions(-)

diff --git a/dev/breeze/src/airflow_breeze/commands/kubernetes_commands.py 
b/dev/breeze/src/airflow_breeze/commands/kubernetes_commands.py
index d8c68cb03d7..9f5a4a7f9bc 100644
--- a/dev/breeze/src/airflow_breeze/commands/kubernetes_commands.py
+++ b/dev/breeze/src/airflow_breeze/commands/kubernetes_commands.py
@@ -1021,7 +1021,7 @@ def _deploy_helm_chart(
             "--kube-context",
             kubectl_context,
             "--timeout",
-            "10m0s",
+            "20m0s",
             "--namespace",
             HELM_AIRFLOW_NAMESPACE,
             "--set",
@@ -1067,12 +1067,30 @@ def _deploy_helm_chart(
             kubernetes_version=kubernetes_version,
             output=output,
             check=False,
+            capture_output=True,
+            text=True,
         )
+        # Print captured output to the console/output file
+        if result.stdout:
+            get_console(output=output).print(result.stdout)
+        if result.stderr:
+            get_console(output=output).print(result.stderr)
         if result.returncode == 0:
             get_console(output=output).print(f"[success]Deployed 
{cluster_name} with airflow Helm Chart.")
         return result
 
 
+def _is_helm_timeout_error(result: RunCommandResult) -> bool:
+    """Check if the Helm command failed due to a timeout."""
+    # Check stderr and stdout for timeout-related messages
+    error_output = ""
+    if hasattr(result, "stderr") and result.stderr:
+        error_output += result.stderr if isinstance(result.stderr, str) else 
result.stderr.decode()
+    if hasattr(result, "stdout") and result.stdout:
+        error_output += result.stdout if isinstance(result.stdout, str) else 
result.stdout.decode()
+    return "timed out waiting for the condition" in error_output
+
+
 def _deploy_airflow(
     python: str,
     kubernetes_version: str,
@@ -1083,20 +1101,52 @@ def _deploy_airflow(
     use_standard_naming: bool,
     extra_options: tuple[str, ...] | None = None,
     multi_namespace_mode: bool = False,
+    num_tries: int = 1,
 ) -> tuple[int, str]:
     action = "Deploying" if not upgrade else "Upgrading"
     cluster_name = get_kind_cluster_name(python=python, 
kubernetes_version=kubernetes_version)
-    get_console(output=output).print(f"[info]{action} Airflow for cluster 
{cluster_name}")
-    result = _deploy_helm_chart(
-        python=python,
-        kubernetes_version=kubernetes_version,
-        output=output,
-        upgrade=upgrade,
-        executor=executor,
-        use_standard_naming=use_standard_naming,
-        extra_options=extra_options,
-        multi_namespace_mode=multi_namespace_mode,
-    )
+    kubectl_context = get_kubectl_cluster_name(python=python, 
kubernetes_version=kubernetes_version)
+    while True:
+        get_console(output=output).print(f"[info]{action} Airflow for cluster 
{cluster_name}")
+        result = _deploy_helm_chart(
+            python=python,
+            kubernetes_version=kubernetes_version,
+            output=output,
+            upgrade=upgrade,
+            executor=executor,
+            use_standard_naming=use_standard_naming,
+            extra_options=extra_options,
+            multi_namespace_mode=multi_namespace_mode,
+        )
+        if result.returncode == 0:
+            break
+        # Only retry on timeout errors, fail immediately for other errors
+        if not _is_helm_timeout_error(result):
+            return result.returncode, f"{action} Airflow to {cluster_name}"
+        num_tries -= 1
+        if num_tries == 0:
+            return result.returncode, f"{action} Airflow to {cluster_name}"
+        get_console(output=output).print(
+            f"[warning]Helm deployment timed out for {cluster_name}. "
+            f"Retrying! There are {num_tries} tries left.\n"
+        )
+        # Uninstall the failed release before retrying
+        run_command_with_k8s_env(
+            [
+                "helm",
+                "uninstall",
+                "airflow",
+                "--kube-context",
+                kubectl_context,
+                "--namespace",
+                HELM_AIRFLOW_NAMESPACE,
+                "--ignore-not-found",
+            ],
+            python=python,
+            kubernetes_version=kubernetes_version,
+            output=output,
+            check=False,
+        )
     if result.returncode == 0:
         if multi_namespace_mode:
             # duplicate Airflow configmaps, secrets and service accounts to 
test namespace
@@ -1660,6 +1710,7 @@ def _run_complete_tests(
             wait_time_in_seconds=wait_time_in_seconds,
             extra_options=extra_options,
             multi_namespace_mode=True,
+            num_tries=3,
         )
         if returncode != 0:
             _logs(python=python, kubernetes_version=kubernetes_version)
@@ -1693,6 +1744,7 @@ def _run_complete_tests(
                 wait_time_in_seconds=wait_time_in_seconds,
                 extra_options=extra_options,
                 multi_namespace_mode=True,
+                num_tries=3,
             )
             if returncode != 0 or include_success_outputs:
                 _logs(python=python, kubernetes_version=kubernetes_version)

Reply via email to