Repository: aurora Updated Branches: refs/heads/master 827b9abea -> 73ceeb22a
Daemonize all deadline calls in aurora executor. If we do not daemonize, it's possible for the aurora executor to send TASK_KILLED and then block indefinitely on shutdown. This way the aurora executor process will at least exit, allowing the cgroup to tear down all active processes. Testing Done: ./pants test src/test/python/apache/aurora/executor:: Bugs closed: AURORA-698 Reviewed at https://reviews.apache.org/r/34484/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/73ceeb22 Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/73ceeb22 Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/73ceeb22 Branch: refs/heads/master Commit: 73ceeb22a18e4b3df3bffb04cf7d58527066fb5a Parents: 827b9ab Author: Brian Wickman <[email protected]> Authored: Mon Jun 1 15:20:25 2015 -0700 Committer: Brian Wickman <[email protected]> Committed: Mon Jun 1 15:20:25 2015 -0700 ---------------------------------------------------------------------- .../apache/aurora/executor/aurora_executor.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/73ceeb22/src/main/python/apache/aurora/executor/aurora_executor.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/executor/aurora_executor.py b/src/main/python/apache/aurora/executor/aurora_executor.py index df0df0c..7ad179e 100644 --- a/src/main/python/apache/aurora/executor/aurora_executor.py +++ b/src/main/python/apache/aurora/executor/aurora_executor.py @@ -31,6 +31,10 @@ from .executor_base import ExecutorBase from .status_manager import StatusManager +def propagate_deadline(*args, **kw): + return deadline(*args, daemon=True, propagate=True, **kw) + + class AuroraExecutor(ExecutorBase, Observable): PERSISTENCE_WAIT = Amount(5, Time.SECONDS) 
SANDBOX_INITIALIZATION_TIMEOUT = Amount(10, Time.MINUTES) @@ -118,8 +122,7 @@ class AuroraExecutor(ExecutorBase, Observable): self._sandbox = self._sandbox_provider.from_assigned_task(assigned_task) self.sandbox_initialized.set() try: - deadline(self._sandbox.create, timeout=self.SANDBOX_INITIALIZATION_TIMEOUT, - daemon=True, propagate=True) + propagate_deadline(self._sandbox.create, timeout=self.SANDBOX_INITIALIZATION_TIMEOUT) except Timeout: self._die(driver, mesos_pb2.TASK_FAILED, 'Timed out waiting for sandbox to initialize!') return @@ -134,7 +137,7 @@ class AuroraExecutor(ExecutorBase, Observable): self._die(driver, mesos_pb2.TASK_KILLED, 'Task killed during initialization.') try: - deadline(self._runner.start, timeout=self.START_TIMEOUT, propagate=True) + propagate_deadline(self._runner.start, timeout=self.START_TIMEOUT) except TaskError as e: self._die(driver, mesos_pb2.TASK_FAILED, 'Task initialization failed: %s' % e) return False @@ -186,14 +189,20 @@ class AuroraExecutor(ExecutorBase, Observable): runner_status = self._runner.status try: - deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT) + propagate_deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT) except Timeout: log.error('Failed to stop all checkers within deadline.') + except Exception: + log.error('Failed to stop health checkers:') + log.error(traceback.format_exc()) try: - deadline(self._runner.stop, timeout=self.STOP_TIMEOUT) + propagate_deadline(self._runner.stop, timeout=self.STOP_TIMEOUT) except Timeout: log.error('Failed to stop runner within deadline.') + except Exception: + log.error('Failed to stop runner:') + log.error(traceback.format_exc()) # If the runner was alive when _shutdown was called, defer to the status_result, # otherwise the runner's terminal state is the preferred state.
