Repository: aurora Updated Branches: refs/heads/master d106b4ecc -> 06d25fd4d
Terminate the executor on unhandled errors This commit consits of two independent parts: a) ensure we interrupt the main thread when there are unhandled exceptions b) ensure the main thread of the executor can be interrupted Testing Done: This bug is pretty hard to reproduce and test. I therefore opted for a manual verification and injected an exception throw shortly before the last statement of the `AuroraExecutor._shutdown` method. Without this patch, this resulted in hanging executors on the host. With this patch everything is terminated as expected. For details of the suffessful run, please see the executor logs below. Please note that the `apport.fileutils` is due to Ubuntu messing with its Python installation. This is not critical. ``` twitter.common.app debug: Initializing: apache.thermos.common.excepthook (Exception termination handler.) I1031 15:59:37.188621 25437 exec.cpp:162] Version: 1.2.0 I1031 15:59:37.192201 25429 exec.cpp:237] Executor registered on agent 93259518-14f4-4956-a39c-aa615bff9a5e-S0 Writing log files to disk in /var/lib/mesos/slaves/93259518-14f4-4956-a39c-aa615bff9a5e-S0/frameworks/7b202c2e-8796-4f27-afeb-8b76ba4b3037-0000/executors/thermos-www-data-prod-hello-0-d8d50c2f-e79b-467d-8c65-cca3cb44cf9c/runs/54a5ed51-aa9b-476f-9f75-0b42bd6dfa8d ERROR] Unhandled error in <StatusManager(Thread-7 [TID=25450], started daemon 139968452134656)>. Interrupting main thread. Traceback (most recent call last): File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 126, in _excepting_run self.__real_run(*args, **kw) File "apache/aurora/executor/status_manager.py", line 62, in run File "apache/aurora/executor/aurora_executor.py", line 236, in _shutdown RuntimeError: Woops! Exception in thread Thread-7 [TID=25450]: Traceback (most recent call last): File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner self.run() File "/root/.pex/install/twitter.common.decorators-0.3.7-py2-none-any.whl.b23f2874a4392741fca582d9e0528c08e0335c68/twitter.common.decorators-0.3.7-py2-none-any.whl/twitter/common/decorators/threads.py", line 115, in identified return instancemethod(self, *args, **kwargs) File "/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py", line 130, in _excepting_run sys.excepthook(*sys.exc_info()) File "apache/thermos/common/excepthook.py", line 41, in teardown_handler self._former_hook()(exc_type, value, trace) File "/usr/lib/python2.7/dist-packages/apport_python_hook.py", line 63, in apport_excepthook from apport.fileutils import likely_packaged, get_recent_crashes ImportError: No module named apport.fileutils twitter.common.app debug: main exited with ^C twitter.common.app debug: Shutting application down. twitter.common.app debug: Running exit function for apache.thermos.common.excepthook (Exception termination handler.) twitter.common.app debug: Running exit function for twitter.common.log (Logging subsystem.) twitter.common.app debug: Finishing up module teardown. twitter.common.app debug: Active thread: <_MainThread(MainThread, started 139968622749504)> twitter.common.app debug: Active thread (daemon): <TaskResourceMonitor(TaskResourceMonitor[www-data-prod-hello-0-d8d50c2f-e79b-467d-8c65-cca3cb44cf9c] [TID=25449], started daemon 139967951009536)> twitter.common.app debug: Active thread (daemon): <_DummyThread(Dummy-13, started daemon 139968485705472)> twitter.common.app debug: Active thread (daemon): <WaitThread(Thread-9, started daemon 139967934224128)> twitter.common.app debug: Active thread (daemon): <WaitThread(Thread-12, started daemon 139967942616832)> twitter.common.app debug: Active thread (daemon): <_DummyThread(Dummy-3, started daemon 139968510883584)> twitter.common.app debug: Active thread (daemon): <WaitThread(Thread-11, started daemon 139967925831424)> twitter.common.app debug: Exiting cleanly. ``` Corresponding agent logs, indicating that Mesos knows about the crash on teardown: ``` I1031 15:59:54.692739 1956 slave.cpp:4769] Executor 'thermos-www-data-prod-hello-0-d8d50c2f-e79b-467d-8c65-cca3cb44cf9c' of framework 7b202c2e-8796-4f27-afeb-8b76ba4b3037-0000 exited with status 130 I1031 15:59:54.692834 1956 slave.cpp:4869] Cleaning up executor 'thermos-www-data-prod-hello-0-d8d50c2f-e79b-467d-8c65-cca3cb44cf9c' of framework 7b202c2e-8796-4f27-afeb-8b76ba4b3037-0000 at executor(1)@192.168.33.7:48931 I1031 15:59:54.692996 1956 slave.cpp:4957] Cleaning up framework 7b202c2e-8796-4f27-afeb-8b76ba4b3037-0000 ``` Bugs closed: AURORA-1955 Reviewed at https://reviews.apache.org/r/63443/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/06d25fd4 Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/06d25fd4 Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/06d25fd4 Branch: refs/heads/master Commit: 06d25fd4d5abbf1e5e59683bfa488b176b3721e8 Parents: d106b4e Author: Stephan Erb <s...@apache.org> Authored: Thu Nov 2 12:01:40 2017 +0100 Committer: Stephan Erb <s...@apache.org> Committed: Thu Nov 2 12:01:40 2017 +0100 ---------------------------------------------------------------------- .../executor/bin/thermos_executor_main.py | 29 ++++++++++- src/main/python/apache/aurora/tools/thermos.py | 2 + .../apache/aurora/tools/thermos_observer.py | 20 ++------ .../python/apache/thermos/common/excepthook.py | 51 ++++++++++++++++++++ .../apache/thermos/runner/thermos_runner.py | 2 + 5 files changed, 85 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/06d25fd4/src/main/python/apache/aurora/executor/bin/thermos_executor_main.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/executor/bin/thermos_executor_main.py b/src/main/python/apache/aurora/executor/bin/thermos_executor_main.py index a191cf9..1f0cf98 100644 --- a/src/main/python/apache/aurora/executor/bin/thermos_executor_main.py +++ b/src/main/python/apache/aurora/executor/bin/thermos_executor_main.py @@ -26,6 +26,7 @@ import sys import traceback from twitter.common import app, log +from twitter.common.exceptions import ExceptionalThread from twitter.common.log.options import LogOptions from apache.aurora.config.schema.base import LoggerDestination, LoggerMode @@ -40,6 +41,7 @@ from apache.aurora.executor.thermos_task_runner import ( DefaultThermosTaskRunnerProvider, UserOverrideThermosTaskRunnerProvider ) +from apache.thermos.common.excepthook import ExceptionTerminationHandler try: from mesos.executor import MesosExecutorDriver @@ -290,6 +292,21 @@ def initialize(options): return thermos_executor +class ExecutorDriverThread(ExceptionalThread): + """ Start the executor and wait until it is stopped. + + This is decoupled from the main thread, because only the main thread can receive signals + and we would miss those when blocking in the driver run method. + """ + + def __init__(self, driver): + self._driver = driver + super(ExecutorDriverThread, self).__init__() + + def run(self): + self._driver.run() + + def proxy_main(): def main(args, options): if MesosExecutorDriver is None: @@ -304,9 +321,17 @@ def proxy_main(): # time period ExecutorTimeout(thermos_executor.launched, driver).start() - # Start executor - driver.run() + # Start executor and wait until it is stopped. + driver_thread = ExecutorDriverThread(driver) + driver_thread.start() + try: + while driver_thread.isAlive(): + driver_thread.join(5) + except (KeyboardInterrupt, SystemExit): + driver.stop() + raise log.info('MesosExecutorDriver.run() has finished.') + app.register_module(ExceptionTerminationHandler()) app.main() http://git-wip-us.apache.org/repos/asf/aurora/blob/06d25fd4/src/main/python/apache/aurora/tools/thermos.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/tools/thermos.py b/src/main/python/apache/aurora/tools/thermos.py index de20c06..5dab1d2 100644 --- a/src/main/python/apache/aurora/tools/thermos.py +++ b/src/main/python/apache/aurora/tools/thermos.py @@ -20,6 +20,7 @@ from twitter.common.log.options import LogOptions from apache.aurora.executor.common.path_detector import MesosPathDetector from apache.thermos.cli.common import register_path_detector from apache.thermos.cli.main import register_commands, register_options +from apache.thermos.common.excepthook import ExceptionTerminationHandler register_commands(app) register_options(app) @@ -46,4 +47,5 @@ register_path_detector(MesosPathDetector()) LogOptions.disable_disk_logging() LogOptions.set_stderr_log_level('google:INFO') +app.register_module(ExceptionTerminationHandler()) app.main() http://git-wip-us.apache.org/repos/asf/aurora/blob/06d25fd4/src/main/python/apache/aurora/tools/thermos_observer.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/tools/thermos_observer.py b/src/main/python/apache/aurora/tools/thermos_observer.py index 0318f99..dd9f0c4 100644 --- a/src/main/python/apache/aurora/tools/thermos_observer.py +++ b/src/main/python/apache/aurora/tools/thermos_observer.py @@ -14,17 +14,15 @@ """A Mesos-customized entry point to the thermos_observer webserver.""" -import sys -import thread -import threading import time -from twitter.common import app, log +from twitter.common import app from twitter.common.exceptions import ExceptionalThread from twitter.common.log.options import LogOptions from twitter.common.quantity import Amount, Time from apache.aurora.executor.common.path_detector import MesosPathDetector +from apache.thermos.common.excepthook import ExceptionTerminationHandler from apache.thermos.monitoring.resource import TaskResourceMonitor from apache.thermos.observer.http.configure import configure_server from apache.thermos.observer.task_observer import TaskObserver @@ -92,18 +90,6 @@ def initialize(options): Amount(options.task_disk_collection_interval_secs, Time.SECONDS)) -def handle_error(exc_type, value, traceback): - """ Tear down the observer in case of unhandled errors. - - By using ExceptionalThread throughout the observer we have ensured that sys.excepthook will - be called for every unhandled exception, even for those not originating in the main thread. - """ - log.error("An unhandled error occured. Tearing down.", exc_info=(exc_type, value, traceback)) - # TODO: In Python 3.4 we will be able to use threading.main_thread() - if not isinstance(threading.current_thread(), threading._MainThread): - thread.interrupt_main() - - def main(_, options): observer = initialize(options) observer.start() @@ -116,6 +102,6 @@ def main(_, options): sleep_forever() -sys.excepthook = handle_error LogOptions.set_stderr_log_level('google:INFO') +app.register_module(ExceptionTerminationHandler()) app.main() http://git-wip-us.apache.org/repos/asf/aurora/blob/06d25fd4/src/main/python/apache/thermos/common/excepthook.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/thermos/common/excepthook.py b/src/main/python/apache/thermos/common/excepthook.py new file mode 100644 index 0000000..0b470fd --- /dev/null +++ b/src/main/python/apache/thermos/common/excepthook.py @@ -0,0 +1,51 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import thread +import threading + +from twitter.common import app, log + + +class ExceptionTerminationHandler(app.Module): + """ An application module tha will terminate the app process in case of unhandled errors. + + By using ExceptionalThread throughout the codebase we have ensured that sys.excepthook will + be called for every unhandled exception, even for those not originating in the main thread. + """ + + def __init__(self): + app.Module.__init__(self, __name__, description="Exception termination handler.") + + def setup_function(self): + self._saved_hook = sys.excepthook + + def teardown_handler(exc_type, value, trace): + if isinstance(threading.current_thread(), threading._MainThread): + self._former_hook()(exc_type, value, trace) + else: + try: + log.error("Unhandled error in %s. Interrupting main thread.", threading.current_thread()) + self._former_hook()(exc_type, value, trace) + finally: + thread.interrupt_main() + + sys.excepthook = teardown_handler + + def _former_hook(self): + return getattr(self, '_saved_hook', sys.__excepthook__) + + def teardown_function(self): + sys.excepthook = self._former_hook() http://git-wip-us.apache.org/repos/asf/aurora/blob/06d25fd4/src/main/python/apache/thermos/runner/thermos_runner.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/thermos/runner/thermos_runner.py b/src/main/python/apache/thermos/runner/thermos_runner.py index 847f51e..fa4f0fb 100644 --- a/src/main/python/apache/thermos/runner/thermos_runner.py +++ b/src/main/python/apache/thermos/runner/thermos_runner.py @@ -22,6 +22,7 @@ import traceback from twitter.common import app, log +from apache.thermos.common.excepthook import ExceptionTerminationHandler from apache.thermos.common.options import add_port_to from apache.thermos.common.planner import TaskPlanner from apache.thermos.common.statuses import ( @@ -264,4 +265,5 @@ def main(args, opts): return proxy_main(args, opts) +app.register_module(ExceptionTerminationHandler()) app.main()