This is an automated email from the ASF dual-hosted git repository.
onikolas pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new e00771d3408 remove ECS Operator retry mechanism on task failed to
start (#53083)
e00771d3408 is described below
commit e00771d340812330ffd13d1e335951b251f1c72a
Author: Vladimir Binshtok <[email protected]>
AuthorDate: Wed Jul 16 00:55:47 2025 +0300
remove ECS Operator retry mechanism on task failed to start (#53083)
Previously, when an EcsTaskFailToStart error was raised, the EcsOperator
retried the _check_success_task function (not the task itself) with self.arn
reset to None. The retried call returned None immediately, which led Airflow
to mark the DAG as successful despite the task failing to start.
Since no actual task retry was performed, this commit removes all the
"retry" logic, which will result in Airflow marking the task as failed.
---
.../src/airflow/providers/amazon/aws/hooks/ecs.py | 12 +----------
.../airflow/providers/amazon/aws/operators/ecs.py | 9 +--------
.../amazon/tests/unit/amazon/aws/hooks/test_ecs.py | 23 ++--------------------
3 files changed, 4 insertions(+), 40 deletions(-)
diff --git a/providers/amazon/src/airflow/providers/amazon/aws/hooks/ecs.py
b/providers/amazon/src/airflow/providers/amazon/aws/hooks/ecs.py
index 05a94d48cea..70c10ced8f1 100644
--- a/providers/amazon/src/airflow/providers/amazon/aws/hooks/ecs.py
+++ b/providers/amazon/src/airflow/providers/amazon/aws/hooks/ecs.py
@@ -19,7 +19,7 @@ from __future__ import annotations
from typing import TYPE_CHECKING, Protocol, runtime_checkable
-from airflow.providers.amazon.aws.exceptions import EcsOperatorError,
EcsTaskFailToStart
+from airflow.providers.amazon.aws.exceptions import EcsOperatorError
from airflow.providers.amazon.aws.hooks.base_aws import AwsGenericHook
from airflow.providers.amazon.aws.utils import _StringCompareEnum
@@ -38,16 +38,6 @@ def should_retry(exception: Exception):
return False
-def should_retry_eni(exception: Exception):
- """Check if exception is related to ENI (Elastic Network Interfaces)."""
- if isinstance(exception, EcsTaskFailToStart):
- return any(
- eni_reason in exception.message
- for eni_reason in ["network interface provisioning",
"ResourceInitializationError"]
- )
- return False
-
-
class EcsClusterStates(_StringCompareEnum):
"""Contains the possible State values of an ECS Cluster."""
diff --git a/providers/amazon/src/airflow/providers/amazon/aws/operators/ecs.py
b/providers/amazon/src/airflow/providers/amazon/aws/operators/ecs.py
index f7d74eee0dd..30baf423986 100644
--- a/providers/amazon/src/airflow/providers/amazon/aws/operators/ecs.py
+++ b/providers/amazon/src/airflow/providers/amazon/aws/operators/ecs.py
@@ -26,8 +26,7 @@ from typing import TYPE_CHECKING, Any
from airflow.configuration import conf
from airflow.exceptions import AirflowException
from airflow.providers.amazon.aws.exceptions import EcsOperatorError,
EcsTaskFailToStart
-from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
-from airflow.providers.amazon.aws.hooks.ecs import EcsClusterStates, EcsHook,
should_retry_eni
+from airflow.providers.amazon.aws.hooks.ecs import EcsClusterStates, EcsHook
from airflow.providers.amazon.aws.hooks.logs import AwsLogsHook
from airflow.providers.amazon.aws.operators.base_aws import AwsBaseOperator
from airflow.providers.amazon.aws.triggers.ecs import (
@@ -688,7 +687,6 @@ class EcsRunTaskOperator(EcsBaseOperator):
logger=self.log,
)
- @AwsBaseHook.retry(should_retry_eni)
def _check_success_task(self) -> None:
if not self.client or not self.arn:
return
@@ -701,11 +699,6 @@ class EcsRunTaskOperator(EcsBaseOperator):
for task in response["tasks"]:
if task.get("stopCode", "") == "TaskFailedToStart":
- # Reset task arn here otherwise the retry run will not start
- # a new task but keep polling the old dead one
- # I'm not resetting it for other exceptions here because
- # EcsTaskFailToStart is the only exception that's being
retried at the moment
- self.arn = None
raise EcsTaskFailToStart(f"The task failed to start due to:
{task.get('stoppedReason', '')}")
# This is a `stoppedReason` that indicates a task has not
diff --git a/providers/amazon/tests/unit/amazon/aws/hooks/test_ecs.py
b/providers/amazon/tests/unit/amazon/aws/hooks/test_ecs.py
index db1e494f949..20350ae2938 100644
--- a/providers/amazon/tests/unit/amazon/aws/hooks/test_ecs.py
+++ b/providers/amazon/tests/unit/amazon/aws/hooks/test_ecs.py
@@ -20,8 +20,8 @@ from unittest import mock
import pytest
-from airflow.providers.amazon.aws.exceptions import EcsOperatorError,
EcsTaskFailToStart
-from airflow.providers.amazon.aws.hooks.ecs import EcsHook, should_retry,
should_retry_eni
+from airflow.providers.amazon.aws.exceptions import EcsOperatorError
+from airflow.providers.amazon.aws.hooks.ecs import EcsHook, should_retry
DEFAULT_CONN_ID: str = "aws_default"
REGION: str = "us-east-1"
@@ -59,22 +59,3 @@ class TestShouldRetry:
def test_return_false_on_invalid_reason(self):
assert not should_retry(EcsOperatorError([{"reason":
"CLUSTER_NOT_FOUND"}], "Foo"))
-
-
-class TestShouldRetryEni:
- def test_return_true_on_valid_reason(self):
- assert should_retry_eni(
- EcsTaskFailToStart(
- "The task failed to start due to: "
- "Timeout waiting for network interface provisioning to
complete."
- )
- )
-
- def test_return_false_on_invalid_reason(self):
- assert not should_retry_eni(
- EcsTaskFailToStart(
- "The task failed to start due to: "
- "CannotPullContainerError: "
- "ref pull has been retried 5 time(s): failed to resolve
reference"
- )
- )