jscheffl commented on code in PR #58397:
URL: https://github.com/apache/airflow/pull/58397#discussion_r2540646860


##########
providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/kubernetes_helper_functions.py:
##########
@@ -39,6 +42,61 @@
 POD_NAME_MAX_LENGTH = 63  # Matches Linux kernel's HOST_NAME_MAX default value minus 1.
 
 
+class PodLaunchFailedException(AirflowException):
+    """When pod launching fails in KubernetesPodOperator."""
+
+
+class KubernetesApiException(AirflowException):
+    """When communication with kubernetes API fails."""
+
+
+API_RETRIES = conf.getint("workers", "api_retries", fallback=5)
+API_RETRY_WAIT_MIN = conf.getfloat("workers", "api_retry_wait_min", fallback=1)
+API_RETRY_WAIT_MAX = conf.getfloat("workers", "api_retry_wait_max", fallback=15)
+
+_default_wait = tenacity.wait_exponential(min=API_RETRY_WAIT_MIN, max=API_RETRY_WAIT_MAX)
+
+TRANSIENT_STATUS_CODES = {409, 429, 500, 502, 503, 504}
+
+
+def _should_retry_api(exc: BaseException) -> bool:
+    """Retry on selected ApiException status codes, plus plain HTTP/timeout 
errors."""
+    if isinstance(exc, ApiException):
+        return exc.status in TRANSIENT_STATUS_CODES
+    return isinstance(exc, (HTTPError, KubernetesApiException))
+
+
+class WaitRetryAfterOrExponential(tenacity.wait.wait_base):
+    """Wait strategy that honors Retry-After header on 429, else falls back to 
exponential backoff."""
+
+    def __call__(self, retry_state):
+        exc = retry_state.outcome.exception() if retry_state.outcome else None
+        if isinstance(exc, ApiException) and exc.status == 429:
+            retry_after = (exc.headers or {}).get("Retry-After")
+            if retry_after:
+                try:
+                    return float(int(retry_after))
+                except ValueError:
+                    pass
+        # Inline exponential fallback
+        return _default_wait(retry_state)
+
+
+def generic_api_retry(func):
+    """
+    Retry decorator for Kubernetes API calls.
+
+    - Retries only transient ApiException status codes.
+    - Honors Retry-After on 429.
+    """
+    return tenacity.retry(

Review Comment:
   A thought that came to me after sleeping on this: with the current (centralized) handling of retries, connectivity problems are (hopefully) all masked now. But the masking is totally silent, so an admin/operator will not see that there are underlying problems until things totally break; calls will just take longer due to the retries.
   
   Can you add some logging for the failed attempts / retries, for example like in https://github.com/apache/airflow/blob/main/providers/ssh/src/airflow/providers/ssh/hooks/ssh.py#L349 ?
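   
   A minimal sketch of what such logging could look like, assuming the decorator keeps using tenacity as shown above. Since the `tenacity.retry(` call is truncated in this snippet, the `stop`/`wait`/`retry` arguments below are assumptions based on the surrounding code (they reuse `API_RETRIES`, `WaitRetryAfterOrExponential`, and `_should_retry_api` from the diff), and `_log_api_retry` / `log` are hypothetical names; tenacity's built-in `before_sleep_log(log, logging.WARNING)` would do the job as well:
   
```python
import logging

import tenacity

log = logging.getLogger(__name__)


def _log_api_retry(retry_state: tenacity.RetryCallState) -> None:
    # Hypothetical helper: log each failed attempt before tenacity sleeps and
    # retries, so connectivity problems show up in the worker logs instead of
    # being silently masked by the retry decorator.
    exc = retry_state.outcome.exception() if retry_state.outcome else None
    log.warning(
        "Kubernetes API call %s failed (attempt %d of %d), retrying in %.1fs: %s",
        retry_state.fn.__name__ if retry_state.fn else "<unknown>",
        retry_state.attempt_number,
        API_RETRIES,
        retry_state.next_action.sleep if retry_state.next_action else 0.0,
        exc,
    )


def generic_api_retry(func):
    """Retry decorator for Kubernetes API calls, logging every retried attempt."""
    return tenacity.retry(
        stop=tenacity.stop_after_attempt(API_RETRIES),          # assumed stop condition
        wait=WaitRetryAfterOrExponential(),                     # from the diff above
        retry=tenacity.retry_if_exception(_should_retry_api),   # from the diff above
        before_sleep=_log_api_retry,                            # the new logging hook
        reraise=True,
    )(func)
```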



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
