We found several zombie executors on a cluster. The Thermos logs indicate
that the executor hit system limits (it could not start new threads),
apparently while trying to shut down. The Mesos agent is unable to get the
status of this container from the Docker daemon (`docker inspect` fails).
Shouldn't Thermos exit in such a case?


 22 WARNING: Your kernel does not support swap limit capabilities,
memory limited without swap.
 23 twitter.common.app debug: Initializing: twitter.common.log
(Logging subsystem.)
 24 Writing log files to disk in /mnt/mesos/sandbox
 25 I1023 19:04:32.261165     7 exec.cpp:162] Version: 1.2.0
 26 I1023 19:04:32.264870    42 exec.cpp:237] Executor registered on
agent b4fff262-c925-4edf-a2ef-2a5bbe89c42b-S3295
 27 Writing log files to disk in /mnt/mesos/sandbox
 28 Traceback (most recent call last):
 29   File 
"/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py",
line 126, in _excepting_run
 30     self.__real_run(*args, **kw)
 31   File "apache/thermos/monitoring/resource.py", line 243, in run
 32   File 
"/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/event_muxer.py",
line 79, in wait
 33     thread.start()
 34   File "/usr/lib/python2.7/threading.py", line 745, in start
 35     _start_new_thread(self.__bootstrap, ())
 36 thread.error: can't start new thread
 37 ERROR] *Failed to stop health checkers:*
 38 ERROR] Traceback (most recent call last):
 39   File "apache/aurora/executor/aurora_executor.py", line 209, in _shutdown
 40     propagate_deadline(self._chained_checker.stop,
timeout=self.STOP_TIMEOUT)
 41   File "apache/aurora/executor/aurora_executor.py", line 35, in
propagate_deadline
 42     return deadline(*args, daemon=True, propagate=True, **kw)
 43   File 
"/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py",
line 61, in deadline
 44     AnonymousThread().start()
 45   File "/usr/lib/python2.7/threading.py", line 745, in start
 46     _start_new_thread(self.__bootstrap, ())
 47 *error: can't start new thread*

48

 49 ERROR] *Failed to stop runner:*
 50 ERROR] Traceback (most recent call last):
 51   File "apache/aurora/executor/aurora_executor.py", line 217, in _shutdown
 52     propagate_deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
 53   File "apache/aurora/executor/aurora_executor.py", line 35, in
propagate_deadline
 54     return deadline(*args, daemon=True, propagate=True, **kw)
 55   File 
"/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deadline.py",
line 61, in deadline
 56     AnonymousThread().start()
 57   File "/usr/lib/python2.7/threading.py", line 745, in start
 58     _start_new_thread(self.__bootstrap, ())
 59 *error: can't start new thread*
 60
 61 Traceback (most recent call last):
 62   File 
"/root/.pex/install/twitter.common.exceptions-0.3.7-py2-none-any.whl.f6376bcca9bfda5eba4396de2676af5dfe36237d/twitter.common.exceptions-0.3.7-py2-none-any.whl/twitter/common/exceptions/__init__.py",
line 126, in _excepting_run
 63     self.__real_run(*args, **kw)
 64   File "apache/aurora/executor/status_manager.py", line 62, in run
 65   File "apache/aurora/executor/aurora_executor.py", line 235, in _shutdown
 66   File 
"/root/.pex/install/twitter.common.concurrent-0.3.7-py2-none-any.whl.f1ab836a5554c86d07fa3f075905c95fb20c78dd/twitter.common.concurrent-0.3.7-py2-none-any.whl/twitter/common/concurrent/deferred.py",
line 56, in defer
 67     deferred.start()
 68   File "/usr/lib/python2.7/threading.py", line 745, in start
 69     _start_new_thread(self.__bootstrap, ())
 70 *thread.error: can't start new thread*

Reply via email to