[ https://issues.apache.org/jira/browse/AIRFLOW-3046?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16654033#comment-16654033 ]

ASF GitHub Bot commented on AIRFLOW-3046:
-----------------------------------------

ashb closed pull request #4039: [AIRFLOW-3046] Fix ECS Operator mistakenly reports success
URL: https://github.com/apache/incubator-airflow/pull/4039
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:


diff --git a/airflow/contrib/operators/ecs_operator.py b/airflow/contrib/operators/ecs_operator.py
index 8bad285ffd..2217398512 100644
--- a/airflow/contrib/operators/ecs_operator.py
+++ b/airflow/contrib/operators/ecs_operator.py
@@ -17,6 +17,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import sys
+import re
 
 from airflow.exceptions import AirflowException
 from airflow.models import BaseOperator
@@ -139,6 +140,15 @@ def _check_success_task(self):
             raise AirflowException(response)
 
         for task in response['tasks']:
+            # This is a `stoppedReason` that indicates a task has not
+            # successfully finished, but there is no other indication of failure
+            # in the response.
+            # See, https://docs.aws.amazon.com/AmazonECS/latest/developerguide/stopped-task-errors.html  # noqa E501
+            if re.match(r'Host EC2 \(instance .+?\) (stopped|terminated)\.',
+                        task.get('stoppedReason', '')):
+                raise AirflowException(
+                    'The task was stopped because the host instance terminated: {}'.
+                    format(task.get('stoppedReason', '')))
             containers = task['containers']
             for container in containers:
                 if container.get('lastStatus') == 'STOPPED' and \
diff --git a/tests/contrib/operators/test_ecs_operator.py b/tests/contrib/operators/test_ecs_operator.py
index 5f8c220260..51391c7b1e 100644
--- a/tests/contrib/operators/test_ecs_operator.py
+++ b/tests/contrib/operators/test_ecs_operator.py
@@ -172,8 +172,10 @@ def test_wait_end_tasks(self):
 
         self.ecs._wait_for_task_ended()
         client_mock.get_waiter.assert_called_once_with('tasks_stopped')
-        client_mock.get_waiter.return_value.wait.assert_called_once_with(cluster='c', tasks=['arn'])
-        self.assertEquals(sys.maxsize, client_mock.get_waiter.return_value.config.max_attempts)
+        client_mock.get_waiter.return_value.wait.assert_called_once_with(
+            cluster='c', tasks=['arn'])
+        self.assertEquals(
+            sys.maxsize, client_mock.get_waiter.return_value.config.max_attempts)
 
     def test_check_success_tasks_raises(self):
         client_mock = mock.Mock()
@@ -197,7 +199,8 @@ def test_check_success_tasks_raises(self):
         self.assertIn("'name': 'foo'", str(e.exception))
         self.assertIn("'lastStatus': 'STOPPED'", str(e.exception))
         self.assertIn("'exitCode': 1", str(e.exception))
-        client_mock.describe_tasks.assert_called_once_with(cluster='c', tasks=['arn'])
+        client_mock.describe_tasks.assert_called_once_with(
+            cluster='c', tasks=['arn'])
 
     def test_check_success_tasks_raises_pending(self):
         client_mock = mock.Mock()
@@ -217,7 +220,8 @@ def test_check_success_tasks_raises_pending(self):
         self.assertIn("This task is still pending ", str(e.exception))
         self.assertIn("'name': 'container-name'", str(e.exception))
         self.assertIn("'lastStatus': 'PENDING'", str(e.exception))
-        client_mock.describe_tasks.assert_called_once_with(cluster='c', tasks=['arn'])
+        client_mock.describe_tasks.assert_called_once_with(
+            cluster='c', tasks=['arn'])
 
     def test_check_success_tasks_raises_multiple(self):
         client_mock = mock.Mock()
@@ -236,7 +240,42 @@ def test_check_success_tasks_raises_multiple(self):
             }]
         }
         self.ecs._check_success_task()
-        client_mock.describe_tasks.assert_called_once_with(cluster='c', tasks=['arn'])
+        client_mock.describe_tasks.assert_called_once_with(
+            cluster='c', tasks=['arn'])
+
+    def test_host_terminated_raises(self):
+        client_mock = mock.Mock()
+        self.ecs.client = client_mock
+        self.ecs.arn = 'arn'
+        client_mock.describe_tasks.return_value = {
+            'tasks': [{
+                'stoppedReason': 'Host EC2 (instance i-1234567890abcdef) terminated.',
+                "containers": [
+                    {
+                        "containerArn": "arn:aws:ecs:us-east-1:012345678910:container/e1ed7aac-d9b2-4315-8726-d2432bf11868",  # noqa: E501
+                        "lastStatus": "RUNNING",
+                        "name": "wordpress",
+                        "taskArn": "arn:aws:ecs:us-east-1:012345678910:task/d8c67b3c-ac87-4ffe-a847-4785bc3a8b55"  # noqa: E501
+                    }
+                ],
+                "desiredStatus": "STOPPED",
+                "lastStatus": "STOPPED",
+                "taskArn": "arn:aws:ecs:us-east-1:012345678910:task/d8c67b3c-ac87-4ffe-a847-4785bc3a8b55",  # noqa: E501
+                "taskDefinitionArn": "arn:aws:ecs:us-east-1:012345678910:task-definition/hello_world:11"  # noqa: E501
+
+            }]
+        }
+
+        with self.assertRaises(AirflowException) as e:
+            self.ecs._check_success_task()
+
+        self.assertIn(
+            "The task was stopped because the host instance terminated:",
+            str(e.exception))
+        self.assertIn("Host EC2 (", str(e.exception))
+        self.assertIn(") terminated", str(e.exception))
+        client_mock.describe_tasks.assert_called_once_with(
+            cluster='c', tasks=['arn'])
 
     def test_check_success_task_not_raises(self):
         client_mock = mock.Mock()
@@ -252,7 +291,8 @@ def test_check_success_task_not_raises(self):
             }]
         }
         self.ecs._check_success_task()
-        client_mock.describe_tasks.assert_called_once_with(cluster='c', tasks=['arn'])
+        client_mock.describe_tasks.assert_called_once_with(
+            cluster='c', tasks=['arn'])
 
 
 if __name__ == '__main__':
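
For reference, a minimal standalone sketch (not part of the PR) of how the regex added in _check_success_task above classifies `stoppedReason` values. The first string is taken from the log excerpt quoted in the issue below; the other strings and the constant name are illustrative only:

{code}
import re

# Pattern added by this PR to _check_success_task.
HOST_STOPPED_PATTERN = r'Host EC2 \(instance .+?\) (stopped|terminated)\.'

examples = [
    # From the issue's log excerpt: the operator should now raise AirflowException.
    'Host EC2 (instance i-02cf23bbd5ae26194) terminated.',
    # Illustrative non-host-failure reasons: no match, so the existing
    # per-container exit-code checks still decide success or failure.
    'Essential container in task exited',
    '',
]

for reason in examples:
    print('{!r:<55} -> host failure: {}'.format(
        reason, bool(re.match(HOST_STOPPED_PATTERN, reason))))
{code}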


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> ECS Operator mistakenly reports success when task is killed due to EC2 host 
> termination
> ---------------------------------------------------------------------------------------
>
>                 Key: AIRFLOW-3046
>                 URL: https://issues.apache.org/jira/browse/AIRFLOW-3046
>             Project: Apache Airflow
>          Issue Type: Bug
>          Components: contrib, operators
>            Reporter: Dan MacTough
>            Priority: Major
>             Fix For: 1.10.1
>
>
> We have ECS clusters made up of EC2 spot fleets. Among other things, this 
> means hosts can be terminated on short notice. When this happens, all tasks 
> (and associated containers) get terminated, as well.
> We expect that when that happens for Airflow task instances using the ECS 
> Operator, those instances will be marked as failures and retried.
> Instead, they are marked as successful.
> As a result, the immediate downstream task fails, causing the scheduled DAG 
> run to fail.
> Here's an example of the Airflow log output when this happens:
> {noformat}
> [2018-09-12 01:02:02,712] {ecs_operator.py:112} INFO - ECS Task stopped, 
> check status: {'tasks': [{'taskArn': 
> 'arn:aws:ecs:us-east-1:111111111111:task/32d43a1d-fbc7-4659-815d-9133bde11cdc',
>  'clusterArn': 'arn:aws:ecs:us-east-1:111111111111:cluster/processing', 
> 'taskDefinitionArn': 
> 'arn:aws:ecs:us-east-1:111111111111:task-definition/foobar-testing_dataEngineering_rd:76',
>  'containerInstanceArn': 
> 'arn:aws:ecs:us-east-1:111111111111:container-instance/7431f0a6-8fc5-4eff-8196-32f77d286a61',
>  'overrides': {'containerOverrides': [{'name': 'foobar-testing', 'command': 
> ['./bin/generate-features.sh', '2018-09-11']}]}, 'lastStatus': 'STOPPED', 
> 'desiredStatus': 'STOPPED', 'cpu': '4096', 'memory': '60000', 'containers': 
> [{'containerArn': 
> 'arn:aws:ecs:us-east-1:111111111111:container/0d5cc553-f894-4f9a-b17c-9f80f7ce8d0a',
>  'taskArn': 
> 'arn:aws:ecs:us-east-1:111111111111:task/32d43a1d-fbc7-4659-815d-9133bde11cdc',
>  'name': 'foobar-testing', 'lastStatus': 'RUNNING', 'networkBindings': [], 
> 'networkInterfaces': [], 'healthStatus': 'UNKNOWN'}], 'startedBy': 'Airflow', 
> 'version': 3, 'stoppedReason': 'Host EC2 (instance i-02cf23bbd5ae26194) 
> terminated.', 'connectivity': 'CONNECTED', 'connectivityAt': 
> datetime.datetime(2018, 9, 12, 0, 6, 30, 245000, tzinfo=tzlocal()), 
> 'pullStartedAt': datetime.datetime(2018, 9, 12, 0, 6, 32, 748000, 
> tzinfo=tzlocal()), 'pullStoppedAt': datetime.datetime(2018, 9, 12, 0, 6, 59, 
> 748000, tzinfo=tzlocal()), 'createdAt': datetime.datetime(2018, 9, 12, 0, 6, 
> 30, 245000, tzinfo=tzlocal()), 'startedAt': datetime.datetime(2018, 9, 12, 0, 
> 7, 0, 748000, tzinfo=tzlocal()), 'stoppingAt': datetime.datetime(2018, 9, 12, 
> 1, 2, 0, 91000, tzinfo=tzlocal()), 'stoppedAt': datetime.datetime(2018, 9, 
> 12, 1, 2, 0, 91000, tzinfo=tzlocal()), 'group': 
> 'family:foobar-testing_dataEngineering_rd', 'launchType': 'EC2', 
> 'attachments': [], 'healthStatus': 'UNKNOWN'}], 'failures': [], 
> 'ResponseMetadata': {'RequestId': '758c791f-b627-11e8-83f7-2b76f4796ed2', 
> 'HTTPStatusCode': 200, 'HTTPHeaders': {'server': 'Server', 'date': 'Wed, 12 
> Sep 2018 01:02:02 GMT', 'content-type': 'application/x-amz-json-1.1', 
> 'content-length': '1412', 'connection': 'keep-alive', 'x-amzn-requestid': 
> '758c791f-b627-11e8-83f7-2b76f4796ed2'}, 'RetryAttempts': 0}}{noformat}
> I believe the function that checks whether the task is successful needs at 
> least one more check. 
> We are currently running a modified version of the ECS Operator that contains 
> the following {{_check_success_task}} function to address this failure 
> condition:
> {code}
>     def _check_success_task(self):
>         response = self.client.describe_tasks(
>             cluster=self.cluster,
>             tasks=[self.arn]
>         )
>         self.log.info('ECS Task stopped, check status: %s', response)
>         if len(response.get('failures', [])) > 0:
>             raise AirflowException(response)
>         for task in response['tasks']:
>             if 'terminated' in task.get('stoppedReason', '').lower():
>                 raise AirflowException('The task was stopped because the host instance terminated: {}'.format(
>                     task.get('stoppedReason', '')))
>             containers = task['containers']
>             for container in containers:
>                 if container.get('lastStatus') == 'STOPPED' and \
>                         container['exitCode'] != 0:
>                     raise AirflowException(
>                         'This task is not in success state {}'.format(task))
>                 elif container.get('lastStatus') == 'PENDING':
>                     raise AirflowException(
>                         'This task is still pending {}'.format(task))
>                 elif 'error' in container.get('reason', '').lower():
>                     raise AirflowException(
>                         'This containers encounter an error during launching : {}'.
>                         format(container.get('reason', '').lower()))
> {code}
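
For comparison, the workaround above treats any `stoppedReason` containing "terminated" as a failure, whereas the check merged in PR #4039 matches only the documented host stopped/terminated message. A minimal sketch using the reason string from the log excerpt above (variable names are illustrative only):

{code}
import re

# stoppedReason reported in the log excerpt above.
reason = 'Host EC2 (instance i-02cf23bbd5ae26194) terminated.'

# Workaround quoted above: broad substring test.
workaround_flags_failure = 'terminated' in reason.lower()

# Check merged in PR #4039: specific host stopped/terminated message.
merged_flags_failure = bool(re.match(
    r'Host EC2 \(instance .+?\) (stopped|terminated)\.', reason))

print(workaround_flags_failure, merged_flags_failure)  # True True for this reason
{code}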



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
