Repository: aurora Updated Branches: refs/heads/master efe865651 -> f054e9b10
Unhandled exception should not strand runner in STARTING state. If the ThermosTaskRunner encounters an Exception when trying to fork the process, it bubbles this up to the Executor which does not handle exceptions other than TaskError. This leads to the executor leaving the task in STARTING state and we end up with tasks that get stranded in this state. Fix it so that any unknown exception that is thrown when starting a runner leads to task failure and gets marked as FAILED. Testing Done: ./gradlew test ./pants test src/test/python/apache:: Reviewed at https://reviews.apache.org/r/67967/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/f054e9b1 Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/f054e9b1 Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/f054e9b1 Branch: refs/heads/master Commit: f054e9b1095a7ecacbbc2fa72ce0a842a3297859 Parents: efe8656 Author: Santhosh Kumar Shanmugham <[email protected]> Authored: Wed Jul 18 15:23:27 2018 -0700 Committer: Santhosh Kumar <[email protected]> Committed: Wed Jul 18 15:23:27 2018 -0700 ---------------------------------------------------------------------- .../apache/aurora/executor/aurora_executor.py | 3 +++ .../aurora/executor/test_thermos_executor.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/f054e9b1/src/main/python/apache/aurora/executor/aurora_executor.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/executor/aurora_executor.py b/src/main/python/apache/aurora/executor/aurora_executor.py index 8a9958f..94f58a1 100644 --- a/src/main/python/apache/aurora/executor/aurora_executor.py +++ b/src/main/python/apache/aurora/executor/aurora_executor.py @@ -155,6 +155,9 @@ class AuroraExecutor(ExecutorBase, Observable): except Timeout: 
self._die(driver, mesos_pb2.TASK_LOST, 'Timed out waiting for task to start!') return False + except Exception as e: + self._die(driver, mesos_pb2.TASK_FAILED, 'Unknown exception starting runner: %s' % e) + return False self.runner_started.set() log.debug('Task started.') http://git-wip-us.apache.org/repos/asf/aurora/blob/f054e9b1/src/test/python/apache/aurora/executor/test_thermos_executor.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/executor/test_thermos_executor.py b/src/test/python/apache/aurora/executor/test_thermos_executor.py index f6ae1be..09f286c 100644 --- a/src/test/python/apache/aurora/executor/test_thermos_executor.py +++ b/src/test/python/apache/aurora/executor/test_thermos_executor.py @@ -83,6 +83,11 @@ class FailingStartingTaskRunner(ThermosTaskRunner): raise TaskError('I am an idiot!') +class ErroringStartingTaskRunner(ThermosTaskRunner): + def start(self): + raise Exception('I am an idiot!') + + class FailingSandbox(DirectorySandbox): def __init__(self, root, exception_type, **kwargs): self._exception_type = exception_type @@ -513,6 +518,20 @@ class TestThermosExecutor(object): updates = proxy_driver.method_calls['sendStatusUpdate'] assert updates[-1][0][0].state == mesos_pb2.TASK_FAILED + def test_unknown_exception_runner_start(self): + proxy_driver = ProxyDriver() + + with temporary_dir() as td: + runner_provider = make_provider(td, ErroringStartingTaskRunner) + te = FastThermosExecutor( + runner_provider=runner_provider, + sandbox_provider=DefaultTestSandboxProvider()) + te.launchTask(proxy_driver, make_task(HELLO_WORLD_MTI)) + proxy_driver.wait_stopped() + + updates = proxy_driver.method_calls['sendStatusUpdate'] + assert updates[-1][0][0].state == mesos_pb2.TASK_FAILED + def test_failing_runner_initialize(self): proxy_driver = ProxyDriver()
