Nataneljpwd commented on code in PR #61778:
URL: https://github.com/apache/airflow/pull/61778#discussion_r2819170532
##########
providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/utils/pod_manager.py:
##########
@@ -188,19 +188,43 @@ def detect_pod_terminate_early_issues(pod: V1Pod) -> str
| None:
"""
Identify issues that justify terminating the pod early.
+ This method distinguishes between permanent failures (e.g., invalid image
names)
+ and transient errors (e.g., rate limits) that should be retried by
Kubernetes.
+
:param pod: The pod object to check.
:return: An error message if an issue is detected; otherwise, None.
"""
+ # Indicators in error messages that suggest transient issues
+ TRANSIENT_ERROR_PATTERNS = [
+ "pull qps exceeded",
+ "rate limit",
+ "too many requests",
+ "quota exceeded",
+ "temporarily unavailable",
+ "timeout",
+ ]
+
pod_status = pod.status
if pod_status.container_statuses:
for container_status in pod_status.container_statuses:
container_state: V1ContainerState = container_status.state
container_waiting: V1ContainerStateWaiting | None =
container_state.waiting
- if container_waiting:
- if container_waiting.reason in ["ErrImagePull",
"ImagePullBackOff", "InvalidImageName"]:
+ if not container_waiting:
+ continue
+
+ if container_waiting.reason in ["InvalidImageName",
"ErrImageNeverPull"]:
+ return (
+ f"Pod docker image cannot be pulled, unable to start:
{container_waiting.reason}"
+ f"\n{container_waiting.message or ''}"
+ )
+
+ if container_waiting.reason in ["ErrImagePull",
"ImagePullBackOff"]:
Review Comment:
Maybe the array here should be given a name to explain why only these 2
states are checked
And I think this check, as it is more specific should come before the upper
one, while the upper one is more of a catch all and does not even need an if,
rather just return
##########
providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/utils/pod_manager.py:
##########
@@ -188,19 +188,43 @@ def detect_pod_terminate_early_issues(pod: V1Pod) -> str
| None:
"""
Identify issues that justify terminating the pod early.
+ This method distinguishes between permanent failures (e.g., invalid image
names)
+ and transient errors (e.g., rate limits) that should be retried by
Kubernetes.
+
:param pod: The pod object to check.
:return: An error message if an issue is detected; otherwise, None.
"""
+ # Indicators in error messages that suggest transient issues
+ TRANSIENT_ERROR_PATTERNS = [
+ "pull qps exceeded",
+ "rate limit",
+ "too many requests",
+ "quota exceeded",
+ "temporarily unavailable",
+ "timeout",
+ ]
+
pod_status = pod.status
if pod_status.container_statuses:
for container_status in pod_status.container_statuses:
container_state: V1ContainerState = container_status.state
container_waiting: V1ContainerStateWaiting | None =
container_state.waiting
- if container_waiting:
- if container_waiting.reason in ["ErrImagePull",
"ImagePullBackOff", "InvalidImageName"]:
+ if not container_waiting:
+ continue
+
+ if container_waiting.reason in ["InvalidImageName",
"ErrImageNeverPull"]:
Review Comment:
I think with this change weight miss some cases, maybe it is better to move
it under the later if and don't check for the reason, as I would rather crash
when I can't know why a pod is waiting rather than silently ignore it
As what if the registry is unavailable ?
Will this be caught later on?
##########
providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/utils/pod_manager.py:
##########
@@ -188,19 +188,43 @@ def detect_pod_terminate_early_issues(pod: V1Pod) -> str
| None:
"""
Identify issues that justify terminating the pod early.
+ This method distinguishes between permanent failures (e.g., invalid image
names)
+ and transient errors (e.g., rate limits) that should be retried by
Kubernetes.
+
:param pod: The pod object to check.
:return: An error message if an issue is detected; otherwise, None.
"""
+ # Indicators in error messages that suggest transient issues
+ TRANSIENT_ERROR_PATTERNS = [
+ "pull qps exceeded",
+ "rate limit",
+ "too many requests",
+ "quota exceeded",
+ "temporarily unavailable",
+ "timeout",
+ ]
+
pod_status = pod.status
if pod_status.container_statuses:
for container_status in pod_status.container_statuses:
container_state: V1ContainerState = container_status.state
container_waiting: V1ContainerStateWaiting | None =
container_state.waiting
- if container_waiting:
- if container_waiting.reason in ["ErrImagePull",
"ImagePullBackOff", "InvalidImageName"]:
+ if not container_waiting:
+ continue
+
+ if container_waiting.reason in ["InvalidImageName",
"ErrImageNeverPull"]:
+ return (
+ f"Pod docker image cannot be pulled, unable to start:
{container_waiting.reason}"
Review Comment:
Minor nit, we can remove the words "Pod docker"
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]