[
https://issues.apache.org/jira/browse/AURORA-1799?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15827017#comment-15827017
]
Zameer Manji commented on AURORA-1799:
--------------------------------------
Today [~benley] reported something similar in Slack:
{noformat}
ERROR] Failed to stop health checkers:
ERROR] Traceback (most recent call last):
File "apache/aurora/executor/aurora_executor.py", line 192, in _shutdown
propagate_deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
File "apache/aurora/executor/aurora_executor.py", line 35, in
propagate_deadline
return deadline(*args, daemon=True, propagate=True, **kw)
File
"/root/.pex/install/twitter.common.concurrent-0.3.3-py2-none-any.whl.33d9c24da69d7478b4aa6d76f474f3773a61f6f9/twitter.common.concurrent-0.3.3-py2-none-any.whl/twitter/common/concurrent/dead
line.py", line 61, in deadline
AnonymousThread().start()
File "/usr/lib/python2.7/threading.py", line 745, in start
_start_new_thread(self.__bootstrap, ())
error: can't start new thread
ERROR] Failed to stop runner:
ERROR] Traceback (most recent call last):
File "apache/aurora/executor/aurora_executor.py", line 200, in _shutdown
propagate_deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
File "apache/aurora/executor/aurora_executor.py", line 35, in
propagate_deadline
return deadline(*args, daemon=True, propagate=True, **kw)
File
"/root/.pex/install/twitter.common.concurrent-0.3.3-py2-none-any.whl.33d9c24da69d7478b4aa6d76f474f3773a61f6f9/twitter.common.concurrent-0.3.3-py2-none-any.whl/twitter/common/concurrent/dead
line.py", line 61, in deadline
AnonymousThread().start()
File "/usr/lib/python2.7/threading.py", line 745, in start
_start_new_thread(self.__bootstrap, ())
error: can't start new thread
Traceback (most recent call last):
File
"/root/.pex/install/twitter.common.exceptions-0.3.3-py2-none-any.whl.57572b1f0a301c36c91adf2c704d0e8dd4d48429/twitter.common.exceptions-0.3.3-py2-none-any.whl/twitter/common/exceptions/__in
it__.py", line 126, in _excepting_run
self.__real_run(*args, **kw)
File "apache/aurora/executor/status_manager.py", line 50, in run
File "apache/aurora/executor/aurora_executor.py", line 218, in _shutdown
File
"/root/.pex/install/twitter.common.concurrent-0.3.3-py2-none-any.whl.33d9c24da69d7478b4aa6d76f474f3773a61f6f9/twitter.common.concurrent-0.3.3-py2-none-any.whl/twitter/common/concurrent/defe
rred.py", line 56, in defer
deferred.start()
File "/usr/lib/python2.7/threading.py", line 745, in start
_start_new_thread(self.__bootstrap, ())
thread.error: can't start new thread
Traceback (most recent call last):
File
"/root/.pex/install/twitter.common.exceptions-0.3.3-py2-none-any.whl.57572b1f0a301c36c91adf2c704d0e8dd4d48429/twitter.common.exceptions-0.3.3-py2-none-any.whl/twitter/common/exceptions/__in
it__.py", line 126, in _excepting_run
self.__real_run(*args, **kw)
File "apache/thermos/monitoring/resource.py", line 239, in run
File
"/root/.pex/install/twitter.common.concurrent-0.3.3-py2-none-any.whl.33d9c24da69d7478b4aa6d76f474f3773a61f6f9/twitter.common.concurrent-0.3.3-py2-none-any.whl/twitter/common/concurrent/even
t_muxer.py", line 79, in wait
thread.start()
File "/usr/lib/python2.7/threading.py", line 745, in start
_start_new_thread(self.__bootstrap, ())
thread.error: can't start new thread
E0116 20:46:46.568775 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:46:51.789016 34 socket.hpp:174] Shutdown failed on fd=14: Transport
endpoint is not connected [107]
E0116 20:50:47.904999 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:50:48.097457 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:50:50.277053 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:50:51.006816 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:50:51.022123 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:50:51.244179 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:50:55.407006 34 socket.hpp:174] Shutdown failed on fd=14: Transport
endpoint is not connected [107]
E0116 20:50:55.410759 34 socket.hpp:174] Shutdown failed on fd=15: Transport
endpoint is not connected [107]
E0116 20:50:56.703348 34 socket.hpp:174] Shutdown failed on fd=14: Transport
endpoint is not connected [107]
E0116 20:50:56.707471 34 socket.hpp:174] Shutdown failed on fd=15: Transport
endpoint is not connected [107]
E0116 20:50:56.712406 34 socket.hpp:174] Shutdown failed on fd=16: Transport
endpoint is not connected [107]
E0116 20:50:57.053045 34 socket.hpp:174] Shutdown failed on fd=14: Transport
endpoint is not connected [107]
E0116 20:50:57.379636 34 socket.hpp:174] Shutdown failed on fd=14: Transport
endpoint is not connected [107]
E0116 20:50:57.454205 34 socket.hpp:174] Shutdown failed on fd=14: Transport
endpoint is not connected [107]
E0116 20:50:57.848105 34 socket.hpp:174] Shutdown failed on fd=14: Transport
endpoint is not connected [107]
E0116 20:50:59.661581 34 socket.hpp:174] Shutdown failed on fd=15: Transport
endpoint is not connected [107]
E0116 20:51:03.007069 34 socket.hpp:174] Shutdown failed on fd=14: Transport
endpoint is not connected [107]
E0116 20:51:08.751930 34 socket.hpp:174] Shutdown failed on fd=14: Transport
endpoint is not connected [107]
E0116 20:51:08.833519 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:51:09.091882 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:51:09.166265 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:51:09.275421 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:51:09.322955 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:51:09.434495 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
E0116 20:51:09.495900 34 socket.hpp:174] Shutdown failed on fd=13: Transport
endpoint is not connected [107]
{noformat}
> Thermos does not handle low memory scenarios gracefully
> -------------------------------------------------------
>
> Key: AURORA-1799
> URL: https://issues.apache.org/jira/browse/AURORA-1799
> Project: Aurora
> Issue Type: Bug
> Reporter: Zameer Manji
>
> Background:
> In an environment where Aurora is used to launch Docker containers via the
> DockerContainerizer, it was observed that some tasks would not be killed.
> What happened is that a task was allocated a low amount of memory but
> demanded a lot. This caused the Linux OOM killer to be invoked. Unlike the
> MesosContainerizer, the agent doesn't tear down the container when the OOM
> killer is invoked. Instead the OOM killer just kills a process in the
> container and thermos and mesos are unaware (unless a process directly
> launched by thermos is killed).
> I observed in the scheduler logs that the scheduler was trying to kill a
> container every reconciliation period but it never died. The slave had the
> logs indicating it received the killTask RPC and forwarded it to Thermos.
> The thermos logs had several entries like these every hour:
> {noformat}
> I1018 20:39:18.102894 6 executor_base.py:45] Executor
> [aaeac4c8-2b2f-4351-874b-a16bea1b36b0-S147]: Activating kill manager.
> I1018 20:39:18.103034 6 executor_base.py:45] Executor
> [aaeac4c8-2b2f-4351-874b-a16bea1b36b0-S147]: killTask returned.
> I1018 21:39:17.859935 6 executor_base.py:45] Executor
> [aaeac4c8-2b2f-4351-874b-a16bea1b36b0-S147]: killTask got task_id: value:
> "<task_id>"
> {noformat}
> However, the task was never killed. Looking at the stderr of thermos I saw
> the following entries:
> {noformat}
> Logged from file resource.py, line 155
> Traceback (most recent call last):
> File "/usr/lib/python2.7/logging/__init__.py", line 883, in emit
> self.flush()
> File "/usr/lib/python2.7/logging/__init__.py", line 843, in flush
> self.stream.flush()
> IOError: [Errno 12] Cannot allocate memory
> {noformat}
> and
> {noformat}
> Logged from file thermos_task_runner.py, line 171
> Traceback (most recent call last):
> File
> "/root/.pex/install/twitter.common.exceptions-0.3.3-py2-none-any.whl.2a67b833b1517d179ef1c8dc6f2dac1023d51e3c/twitter.common.exceptions-0.3.3-py2-none-any.whl/twitter/common/exceptions/__init__.py",
> line 126, in _excepting_run
> File "apache/aurora/executor/status_manager.py", line 47, in run
> File "apache/aurora/executor/common/status_checker.py", line 97, in status
> File "apache/aurora/executor/thermos_task_runner.py", line 358, in status
> File "apache/aurora/executor/thermos_task_runner.py", line 186, in
> compute_status
> File "apache/aurora/executor/thermos_task_runner.py", line 136, in
> task_state
> File "apache/thermos/monitoring/monitor.py", line 118, in task_state
> File "apache/thermos/monitoring/monitor.py", line 114, in get_state
> File "apache/thermos/monitoring/monitor.py", line 77, in _apply_states
> File
> "/root/.pex/install/twitter.common.recordio-0.3.3-py2-none-any.whl.9f1e9394eca1bc33ad7d10ae3025301866824139/twitter.common.recordio-0.3.3-py2-none-any.whl/twitter/common/recordio/recordio.py",
> line 182, in try_read
> class InvalidTypeException(Error): pass
> File
> "/root/.pex/install/twitter.common.recordio-0.3.3-py2-none-any.whl.9f1e9394eca1bc33ad7d10ae3025301866824139/twitter.common.recordio-0.3.3-py2-none-any.whl/twitter/common/recordio/recordio.py",
> line 168, in read
> return RecordIO.Reader.do_read(self._fp, self._codec)
> File
> "/root/.pex/install/twitter.common.recordio-0.3.3-py2-none-any.whl.9f1e9394eca1bc33ad7d10ae3025301866824139/twitter.common.recordio-0.3.3-py2-none-any.whl/twitter/common/recordio/recordio.py",
> line 135, in do_read
> header = fp.read(RecordIO.RECORD_HEADER_SIZE)
> File
> "/root/.pex/install/twitter.common.recordio-0.3.3-py2-none-any.whl.9f1e9394eca1bc33ad7d10ae3025301866824139/twitter.common.recordio-0.3.3-py2-none-any.whl/twitter/common/recordio/filelike.py",
> line 81, in read
> return self._fp.read(length)
> IOError: [Errno 12] Cannot allocate memory
> {noformat}
> It seems that through the regular avenues of reading checkpoints or logging
> data, thermos would get an IOError. Some part of twitter common installs an
> excepthook to log the exception, but we don't seem to do anything else.
> I think we should probably install our own exception hook to send a
> {{LOST_TASK}} with the exception information instead of failing to kill the
> task.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)