This is an automated email from the ASF dual-hosted git repository.

shahar pushed a commit to branch v3-1-test
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/v3-1-test by this push:
     new e5d67c4b1dd [v3-1-test] Fix flaky OTel integration test with DNS health check (#61070) (#61242) (#61286)
e5d67c4b1dd is described below

commit e5d67c4b1dd0070a762801f43fe2e142ded1848e
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sat Jan 31 16:49:48 2026 +0200

    [v3-1-test] Fix flaky OTel integration test with DNS health check (#61070) (#61242) (#61286)
    
    * Fix flaky OTel integration test with DNS health check (#61070)
    
    * Update airflow-core/tests/integration/otel/test_otel.py
    
    
    
    ---------
    (cherry picked from commit 8ac25dd658e9ea6a29aaa85215e17c180765c821)
    
    Co-authored-by: Abhishek Mishra <[email protected]>
    Co-authored-by: Henry Chen <[email protected]>
---
 airflow-core/tests/integration/otel/test_otel.py | 51 +++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/airflow-core/tests/integration/otel/test_otel.py b/airflow-core/tests/integration/otel/test_otel.py
index e218820b292..51f70c51291 100644
--- a/airflow-core/tests/integration/otel/test_otel.py
+++ b/airflow-core/tests/integration/otel/test_otel.py
@@ -20,6 +20,7 @@ import json
 import logging
 import os
 import signal
+import socket
 import subprocess
 import time
 
@@ -52,6 +53,44 @@ from tests_common.test_utils.version_compat import AIRFLOW_V_3_0_PLUS, AIRFLOW_V
 log = logging.getLogger("integration.otel.test_otel")
 
 
+def wait_for_otel_collector(host: str, port: int, timeout: int = 120) -> None:
+    """
+    Wait for the OTel collector to be reachable before running tests.
+
+    This prevents flaky test failures caused by transient DNS resolution issues
+    (e.g., 'Temporary failure in name resolution' for breeze-otel-collector).
+
+    Note: If the collector is not reachable after timeout, logs a warning but
+    does not fail - allows tests to run and fail naturally if needed.
+    """
+    deadline = time.monotonic() + timeout
+    last_error = None
+    while time.monotonic() < deadline:
+        try:
+            # Test DNS resolution and TCP connectivity
+            with socket.create_connection((host, port), timeout=5):
+                pass
+            log.info("OTel collector at %s:%d is reachable.", host, port)
+            return
+        except (socket.gaierror, TimeoutError, OSError) as e:
+            last_error = e
+            log.debug(
+                "OTel collector at %s:%d not reachable: %s. Retrying...",
+                host,
+                port,
+                e,
+            )
+            time.sleep(2)
+    log.warning(
+        "OTel collector at %s:%d is not reachable after %ds. Last error: %s. "
+        "Tests will proceed but may fail if collector is required.",
+        host,
+        port,
+        timeout,
+        last_error,
+    )
+
+
 def unpause_trigger_dag_and_get_run_id(dag_id: str) -> str:
     unpause_command = ["airflow", "dags", "unpause", dag_id]
 
@@ -611,9 +650,17 @@ class TestOtelIntegration:
 
     @classmethod
     def setup_class(cls):
+        otel_host = "breeze-otel-collector"
+        otel_port = 4318
+
+        # Wait for OTel collector to be reachable before running tests.
+        # This prevents flaky test failures caused by transient DNS resolution issues
+        # during scheduler handoff (see https://github.com/apache/airflow/issues/61070).
+        wait_for_otel_collector(otel_host, otel_port)
+
         os.environ["AIRFLOW__TRACES__OTEL_ON"] = "True"
-        os.environ["AIRFLOW__TRACES__OTEL_HOST"] = "breeze-otel-collector"
-        os.environ["AIRFLOW__TRACES__OTEL_PORT"] = "4318"
+        os.environ["AIRFLOW__TRACES__OTEL_HOST"] = otel_host
+        os.environ["AIRFLOW__TRACES__OTEL_PORT"] = str(otel_port)
         if cls.use_otel != "true":
             os.environ["AIRFLOW__TRACES__OTEL_DEBUGGING_ON"] = "True"
 

Reply via email to