[ 
https://issues.apache.org/jira/browse/AURORA-1799?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15827017#comment-15827017
 ] 

Zameer Manji commented on AURORA-1799:
--------------------------------------

Today [~benley] reported something similar in Slack:

{noformat}
ERROR] Failed to stop health checkers:
ERROR] Traceback (most recent call last):
  File "apache/aurora/executor/aurora_executor.py", line 192, in _shutdown
    propagate_deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
  File "apache/aurora/executor/aurora_executor.py", line 35, in 
propagate_deadline
    return deadline(*args, daemon=True, propagate=True, **kw)
  File 
"/root/.pex/install/twitter.common.concurrent-0.3.3-py2-none-any.whl.33d9c24da69d7478b4aa6d76f474f3773a61f6f9/twitter.common.concurrent-0.3.3-py2-none-any.whl/twitter/common/concurrent/dead
line.py", line 61, in deadline
    AnonymousThread().start()
  File "/usr/lib/python2.7/threading.py", line 745, in start
    _start_new_thread(self.__bootstrap, ())
error: can't start new thread
ERROR] Failed to stop runner:
ERROR] Traceback (most recent call last):
  File "apache/aurora/executor/aurora_executor.py", line 200, in _shutdown
    propagate_deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
  File "apache/aurora/executor/aurora_executor.py", line 35, in 
propagate_deadline
    return deadline(*args, daemon=True, propagate=True, **kw)
  File 
"/root/.pex/install/twitter.common.concurrent-0.3.3-py2-none-any.whl.33d9c24da69d7478b4aa6d76f474f3773a61f6f9/twitter.common.concurrent-0.3.3-py2-none-any.whl/twitter/common/concurrent/dead
line.py", line 61, in deadline
    AnonymousThread().start()
  File "/usr/lib/python2.7/threading.py", line 745, in start
    _start_new_thread(self.__bootstrap, ())
error: can't start new thread
Traceback (most recent call last):
  File 
"/root/.pex/install/twitter.common.exceptions-0.3.3-py2-none-any.whl.57572b1f0a301c36c91adf2c704d0e8dd4d48429/twitter.common.exceptions-0.3.3-py2-none-any.whl/twitter/common/exceptions/__in
it__.py", line 126, in _excepting_run
    self.__real_run(*args, **kw)
  File "apache/aurora/executor/status_manager.py", line 50, in run
  File "apache/aurora/executor/aurora_executor.py", line 218, in _shutdown
  File 
"/root/.pex/install/twitter.common.concurrent-0.3.3-py2-none-any.whl.33d9c24da69d7478b4aa6d76f474f3773a61f6f9/twitter.common.concurrent-0.3.3-py2-none-any.whl/twitter/common/concurrent/defe
rred.py", line 56, in defer
    deferred.start()
  File "/usr/lib/python2.7/threading.py", line 745, in start
    _start_new_thread(self.__bootstrap, ())
thread.error: can't start new thread
Traceback (most recent call last):
  File 
"/root/.pex/install/twitter.common.exceptions-0.3.3-py2-none-any.whl.57572b1f0a301c36c91adf2c704d0e8dd4d48429/twitter.common.exceptions-0.3.3-py2-none-any.whl/twitter/common/exceptions/__in
it__.py", line 126, in _excepting_run
    self.__real_run(*args, **kw)
  File "apache/thermos/monitoring/resource.py", line 239, in run
  File 
"/root/.pex/install/twitter.common.concurrent-0.3.3-py2-none-any.whl.33d9c24da69d7478b4aa6d76f474f3773a61f6f9/twitter.common.concurrent-0.3.3-py2-none-any.whl/twitter/common/concurrent/even
t_muxer.py", line 79, in wait
    thread.start()
  File "/usr/lib/python2.7/threading.py", line 745, in start
    _start_new_thread(self.__bootstrap, ())
thread.error: can't start new thread

E0116 20:46:46.568775    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:46:51.789016    34 socket.hpp:174] Shutdown failed on fd=14: Transport 
endpoint is not connected [107]
E0116 20:50:47.904999    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:50:48.097457    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:50:50.277053    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:50:51.006816    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:50:51.022123    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:50:51.244179    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:50:55.407006    34 socket.hpp:174] Shutdown failed on fd=14: Transport 
endpoint is not connected [107]
E0116 20:50:55.410759    34 socket.hpp:174] Shutdown failed on fd=15: Transport 
endpoint is not connected [107]
E0116 20:50:56.703348    34 socket.hpp:174] Shutdown failed on fd=14: Transport 
endpoint is not connected [107]
E0116 20:50:56.707471    34 socket.hpp:174] Shutdown failed on fd=15: Transport 
endpoint is not connected [107]
E0116 20:50:56.712406    34 socket.hpp:174] Shutdown failed on fd=16: Transport 
endpoint is not connected [107]
E0116 20:50:57.053045    34 socket.hpp:174] Shutdown failed on fd=14: Transport 
endpoint is not connected [107]
E0116 20:50:57.379636    34 socket.hpp:174] Shutdown failed on fd=14: Transport 
endpoint is not connected [107]
E0116 20:50:57.454205    34 socket.hpp:174] Shutdown failed on fd=14: Transport 
endpoint is not connected [107]
E0116 20:50:57.848105    34 socket.hpp:174] Shutdown failed on fd=14: Transport 
endpoint is not connected [107]
E0116 20:50:59.661581    34 socket.hpp:174] Shutdown failed on fd=15: Transport 
endpoint is not connected [107]
E0116 20:51:03.007069    34 socket.hpp:174] Shutdown failed on fd=14: Transport 
endpoint is not connected [107]
E0116 20:51:08.751930    34 socket.hpp:174] Shutdown failed on fd=14: Transport 
endpoint is not connected [107]
E0116 20:51:08.833519    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:51:09.091882    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:51:09.166265    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:51:09.275421    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:51:09.322955    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:51:09.434495    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
E0116 20:51:09.495900    34 socket.hpp:174] Shutdown failed on fd=13: Transport 
endpoint is not connected [107]
{noformat}

> Thermos does not handle low memory scenarios gracefully
> -------------------------------------------------------
>
>                 Key: AURORA-1799
>                 URL: https://issues.apache.org/jira/browse/AURORA-1799
>             Project: Aurora
>          Issue Type: Bug
>            Reporter: Zameer Manji
>
> Background:
> In an environment where Aurora is used to launch Docker containers via the 
> DockerContainerizer, it was observed that some tasks would not be killed.
> What happened is that a task was allocated with a low amount of memory but 
> demanded a lot. This caused the linux OOM killer to be invoked. Unlike the 
> MesosContainerizer, the agent doesn't tear down the container when the OOM 
> killer is invoked. Instead the OOM killer just kills a process in the 
> container and thermos and mesos are unaware (unless a process directly 
> launched by thermos is killed).
> I observed in the scheduler logs that the scheduler was trying to kill a 
> container every reconciliation period but it never died. The slave had the 
> logs indicating it received the killTask RPC and forwarded it to Thermos.
> The thermos logs had several entries like these every hour:
> {noformat}
> I1018 20:39:18.102894 6 executor_base.py:45] Executor 
> [aaeac4c8-2b2f-4351-874b-a16bea1b36b0-S147]: Activating kill manager.
> I1018 20:39:18.103034 6 executor_base.py:45] Executor 
> [aaeac4c8-2b2f-4351-874b-a16bea1b36b0-S147]: killTask returned.
> I1018 21:39:17.859935 6 executor_base.py:45] Executor 
> [aaeac4c8-2b2f-4351-874b-a16bea1b36b0-S147]: killTask got task_id: value: 
> "<task_id>"
> {noformat}
> However, the task was never killed. Looking at the stderr of thermos, I saw 
> the following entries:
> {noformat}
> Logged from file resource.py, line 155
> Traceback (most recent call last):
>   File "/usr/lib/python2.7/logging/__init__.py", line 883, in emit
>     self.flush()
>   File "/usr/lib/python2.7/logging/__init__.py", line 843, in flush
>     self.stream.flush()
> IOError: [Errno 12] Cannot allocate memory
> {noformat}
> and 
> {noformat}
> Logged from file thermos_task_runner.py, line 171
> Traceback (most recent call last):
>   File 
> "/root/.pex/install/twitter.common.exceptions-0.3.3-py2-none-any.whl.2a67b833b1517d179ef1c8dc6f2dac1023d51e3c/twitter.common.exceptions-0.3.3-py2-none-any.whl/twitter/common/exceptions/__init__.py",
>  line 126, in _excepting_run
>   File "apache/aurora/executor/status_manager.py", line 47, in run
>   File "apache/aurora/executor/common/status_checker.py", line 97, in status
>   File "apache/aurora/executor/thermos_task_runner.py", line 358, in status
>   File "apache/aurora/executor/thermos_task_runner.py", line 186, in 
> compute_status
>   File "apache/aurora/executor/thermos_task_runner.py", line 136, in 
> task_state
>   File "apache/thermos/monitoring/monitor.py", line 118, in task_state
>   File "apache/thermos/monitoring/monitor.py", line 114, in get_state
>   File "apache/thermos/monitoring/monitor.py", line 77, in _apply_states
>   File 
> "/root/.pex/install/twitter.common.recordio-0.3.3-py2-none-any.whl.9f1e9394eca1bc33ad7d10ae3025301866824139/twitter.common.recordio-0.3.3-py2-none-any.whl/twitter/common/recordio/recordio.py",
>  line 182, in try_read
>     class InvalidTypeException(Error): pass
>   File 
> "/root/.pex/install/twitter.common.recordio-0.3.3-py2-none-any.whl.9f1e9394eca1bc33ad7d10ae3025301866824139/twitter.common.recordio-0.3.3-py2-none-any.whl/twitter/common/recordio/recordio.py",
>  line 168, in read
>     return RecordIO.Reader.do_read(self._fp, self._codec)
>   File 
> "/root/.pex/install/twitter.common.recordio-0.3.3-py2-none-any.whl.9f1e9394eca1bc33ad7d10ae3025301866824139/twitter.common.recordio-0.3.3-py2-none-any.whl/twitter/common/recordio/recordio.py",
>  line 135, in do_read
>     header = fp.read(RecordIO.RECORD_HEADER_SIZE)
>   File 
> "/root/.pex/install/twitter.common.recordio-0.3.3-py2-none-any.whl.9f1e9394eca1bc33ad7d10ae3025301866824139/twitter.common.recordio-0.3.3-py2-none-any.whl/twitter/common/recordio/filelike.py",
>  line 81, in read
>     return self._fp.read(length)
> IOError: [Errno 12] Cannot allocate memory
> {noformat}
> It seems that, via the regular avenues of reading checkpoints or logging data, 
> thermos would get an IOError. Some part of twitter common installs an excepthook 
> to log the exception, but we don't seem to do anything else.
> I think we should probably install our own exception hook to send a 
> {{LOST_TASK}} with the exception information instead of failing to kill the 
> task.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to