[
https://issues.apache.org/jira/browse/AURORA-1724?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15353291#comment-15353291
]
John Sirois commented on AURORA-1724:
-------------------------------------
I've reproed, but with some debug logging added - the mode ends up as the wrong
type leading to this issue.
>From master here:
{noformat}
$ git log -1
commit 73dd2a867fc977323164b34b3001452f94750300
Author: Mehrdad Nurolahzade <[email protected]>
Date: Tue Jun 21 09:40:15 2016 -0700
AURORA-1458 Add tier into the UI "show config" summary
Reviewed at https://reviews.apache.org/r/48796/
{noformat}
The debug logging added:
{noformat}
$ git diff -U1
diff --git a/src/main/python/apache/thermos/core/process.py
b/src/main/python/apache/thermos/core/process.py
index 1791b5f..7a007ee 100644
--- a/src/main/python/apache/thermos/core/process.py
+++ b/src/main/python/apache/thermos/core/process.py
@@ -134,2 +134,7 @@ class ProcessBase(object):
self._logger_mode = logger_mode
+
+ log.debug('>>> ProcessBase logging:')
+ log.debug(' destination: %s %s' % (self._logger_destination,
type(self._logger_destination)))
+ log.debug(' mode: %s %s' % (self._logger_mode, type(self._logger_mode)))
+
self._rotate_log_size = rotate_log_size
diff --git a/src/main/python/apache/thermos/core/runner.py
b/src/main/python/apache/thermos/core/runner.py
index 3ebf86e..1c1a5c1 100644
--- a/src/main/python/apache/thermos/core/runner.py
+++ b/src/main/python/apache/thermos/core/runner.py
@@ -475,2 +475,8 @@ class TaskRunner(object):
self._process_logger_mode = process_logger_mode
+
+ log.debug('>>> TaskRunner logging:')
+ log.debug(' destination: %s %s' % (self._process_logger_destination,
+
type(self._process_logger_destination)))
+ log.debug(' mode: %s %s' % (self._process_logger_mode,
type(self._process_logger_mode)))
+
self._rotate_log_size_mb = rotate_log_size_mb
{noformat}
The vagrant cluster change:
{noformat}
vagrant@aurora:~/aurora/examples/jobs$ diff
/etc/init/aurora-scheduler.conf.orig /etc/init/aurora-scheduler.conf
45c45
< -thermos_executor_flags="--announcer-ensemble localhost:2181
--announcer-zookeeper-auth-config
/home/vagrant/aurora/examples/vagrant/config/announcer-auth.json" \
---
> -thermos_executor_flags="--announcer-ensemble localhost:2181
> --announcer-zookeeper-auth-config
> /home/vagrant/aurora/examples/vagrant/config/announcer-auth.json
> --runner-logger-mode=rotate --runner-rotate-log-size-mb=100
> --runner-rotate-log-backups=10" \
{noformat}
I get the same backtrace in stderr:
{noformat}
vagrant@aurora:~/aurora/examples/jobs$ cat
/var/lib/mesos/slaves/ec508867-ab19-472d-99b9-d3cb4751b3f4-S0/frameworks/ec508867-ab19-472d-99b9-d3cb4751b3f4-0000/executors/thermos-www-data-prod-hello-0-85cac21a-2fcc-4d10-9ff4-a7259327d784/runs/latest/stderr
I0628 16:06:37.745342 9407 fetcher.cpp:424] Fetcher Info:
{"cache_directory":"\/tmp\/mesos\/fetch\/slaves\/ec508867-ab19-472d-99b9-d3cb4751b3f4-S0\/root","items":[{"action":"BYPASS_CACHE","uri":{"executable":true,"extract":true,"value":"\/home\/vagrant\/aurora\/dist\/thermos_executor.pex"}}],"sandbox_directory":"\/var\/lib\/mesos\/slaves\/ec508867-ab19-472d-99b9-d3cb4751b3f4-S0\/frameworks\/ec508867-ab19-472d-99b9-d3cb4751b3f4-0000\/executors\/thermos-www-data-prod-hello-0-85cac21a-2fcc-4d10-9ff4-a7259327d784\/runs\/ea3cb76d-9204-4086-bfbc-4a1a0bc8de78","user":"root"}
I0628 16:06:37.749626 9407 fetcher.cpp:379] Fetching URI
'/home/vagrant/aurora/dist/thermos_executor.pex'
I0628 16:06:37.749681 9407 fetcher.cpp:250] Fetching directly into the sandbox
directory
I0628 16:06:37.749730 9407 fetcher.cpp:187] Fetching URI
'/home/vagrant/aurora/dist/thermos_executor.pex'
I0628 16:06:37.749775 9407 fetcher.cpp:167] Copying resource with command:cp
'/home/vagrant/aurora/dist/thermos_executor.pex'
'/var/lib/mesos/slaves/ec508867-ab19-472d-99b9-d3cb4751b3f4-S0/frameworks/ec508867-ab19-472d-99b9-d3cb4751b3f4-0000/executors/thermos-www-data-prod-hello-0-85cac21a-2fcc-4d10-9ff4-a7259327d784/runs/ea3cb76d-9204-4086-bfbc-4a1a0bc8de78/thermos_executor.pex'
I0628 16:06:37.850827 9407 fetcher.cpp:456] Fetched
'/home/vagrant/aurora/dist/thermos_executor.pex' to
'/var/lib/mesos/slaves/ec508867-ab19-472d-99b9-d3cb4751b3f4-S0/frameworks/ec508867-ab19-472d-99b9-d3cb4751b3f4-0000/executors/thermos-www-data-prod-hello-0-85cac21a-2fcc-4d10-9ff4-a7259327d784/runs/ea3cb76d-9204-4086-bfbc-4a1a0bc8de78/thermos_executor.pex'
+ /usr/libexec/mesos/mesos-containerizer mount --help=false
--operation=make-rslave --path=/
+ grep -E /var/lib/mesos/.+ /proc/self/mountinfo
+ xargs --no-run-if-empty umount -l
+ + grepcut -v -d ea3cb76d-9204-4086-bfbc-4a1a0bc8de78 -f5
twitter.common.app debug: Initializing: twitter.common.log (Logging subsystem.)
Writing log files to disk in
/var/lib/mesos/slaves/ec508867-ab19-472d-99b9-d3cb4751b3f4-S0/frameworks/ec508867-ab19-472d-99b9-d3cb4751b3f4-0000/executors/thermos-www-data-prod-hello-0-85cac21a-2fcc-4d10-9ff4-a7259327d784/runs/ea3cb76d-9204-4086-bfbc-4a1a0bc8de78
I0628 16:06:39.311792 9418 exec.cpp:134] Version: 0.27.2
I0628 16:06:39.327440 9428 exec.cpp:208] Executor registered on slave
ec508867-ab19-472d-99b9-d3cb4751b3f4-S0
Writing log files to disk in
/var/lib/mesos/slaves/ec508867-ab19-472d-99b9-d3cb4751b3f4-S0/frameworks/ec508867-ab19-472d-99b9-d3cb4751b3f4-0000/executors/thermos-www-data-prod-hello-0-85cac21a-2fcc-4d10-9ff4-a7259327d784/runs/ea3cb76d-9204-4086-bfbc-4a1a0bc8de78
ERROR] Caught exception in self.control(): Logger mode rotate is invalid.
ERROR] Traceback (most recent call last):
File "apache/thermos/core/runner.py", line 578, in control
yield
File "apache/thermos/core/runner.py", line 922, in run
self._run()
File "apache/thermos/core/runner.py", line 929, in _run
iteration_wait = runner.run()
File "apache/thermos/core/runner.py", line 291, in run
launched = self.runner._run_plan(self.runner._regular_plan)
File "apache/thermos/core/runner.py", line 849, in _run_plan
self._set_process_status(process_name, ProcessState.WAITING)
File "apache/thermos/core/runner.py", line 693, in _set_process_status
self._dispatcher.dispatch(self._state, runner_ckpt, self._recovery)
File "apache/thermos/common/ckpt.py", line 384, in dispatch
self._run_process_dispatch(process_update.state, process_update)
File "apache/thermos/common/ckpt.py", line 215, in _run_process_dispatch
getattr(handler, handler_function)(process_update)
File "apache/thermos/core/runner.py", line 106, in on_waiting
process_update.process, process_update.seq + 1))
File "apache/thermos/core/runner.py", line 729, in
_task_process_from_process_name
preserve_env=self._preserve_env)
File "apache/thermos/core/process.py", line 353, in __init__
ProcessBase.__init__(self, *args, **kw)
File "apache/thermos/core/process.py", line 147, in __init__
raise ValueError("Logger mode %s is invalid." % self._logger_mode)
ValueError: Logger mode rotate is invalid.
twitter.common.app debug: Shutting application down.
twitter.common.app debug: Running exit function for twitter.common.log (Logging
subsystem.)
twitter.common.app debug: Finishing up module teardown.
twitter.common.app debug: Active thread: <_MainThread(MainThread, started
140311295522624)>
twitter.common.app debug: Active thread (daemon):
<TaskResourceMonitor(TaskResourceMonitor[www-data-prod-hello-0-85cac21a-2fcc-4d10-9ff4-a7259327d784]
[TID=9442], started daemon 140310956054272)>
twitter.common.app debug: Active thread (daemon): <WaitThread(Thread-13,
started daemon 140310441080576)>
twitter.common.app debug: Active thread (daemon): <WaitThread(Thread-12,
started daemon 140310449473280)>
twitter.common.app debug: Active thread (daemon): <_DummyThread(Dummy-2,
started daemon 140310989625088)>
twitter.common.app debug: Active thread (daemon): <WaitThread(Thread-8,
started daemon 140310474651392)>
twitter.common.app debug: Exiting cleanly.
{noformat}
And find from the new logging:
{noformat}
vagrant@aurora:~/aurora/examples/jobs$ grep -A2 ">>>"
/var/lib/mesos/slaves/ec508867-ab19-472d-99b9-d3cb4751b3f4-S0/frameworks/ec508867-ab19-472d-99b9-d3cb4751b3f4-0000/executors/thermos-www-data-prod-hello-0-85cac21a-2fcc-4d10-9ff4-a7259327d784/runs/latest/thermos_runner.DEBUG
D0628 16:06:40.411820 9436 runner.py:477] >>> TaskRunner logging:
D0628 16:06:40.412385 9436 runner.py:479] destination: file <type 'str'>
D0628 16:06:40.412695 9436 runner.py:480] mode: rotate <type 'str'>
--
D0628 16:06:40.608428 9436 process.py:136] >>> ProcessBase logging:
D0628 16:06:40.608752 9436 process.py:137] destination: file <type 'str'>
D0628 16:06:40.608982 9436 process.py:138] mode: ('rotate',) <type 'tuple'>
{noformat}
> Thermos runner fails if log rotation is enabled
> -----------------------------------------------
>
> Key: AURORA-1724
> URL: https://issues.apache.org/jira/browse/AURORA-1724
> Project: Aurora
> Issue Type: Bug
> Reporter: Stephan Erb
> Priority: Critical
>
> Thermos is failing to start when log rotation is enabled as [described in our
> documentation|https://github.com/apache/aurora/blob/master/docs/operations/configuration.md#log-rotation]
> Steps to reproduce:
> * start the latest developer vagrant box (or even then released 0.14)
> * update to scheduler wide {{thermos_executor_flags}} to include
> {{--runner-logger-mode=rotate --runner-rotate-log-size-mb=100
> --runner-rotate-log-backups=10}} as documented (see above)
> * launch a task
> * observe that the task has failed with {{LOST : Runner died while task was
> active.}}
> The thermos stderr then contains this traceback:
> {code}
> ERROR] Caught exception in self.control(): Logger mode rotate is invalid.
> ERROR] Traceback (most recent call last):
> File "apache/thermos/core/runner.py", line 572, in control
> yield
> File "apache/thermos/core/runner.py", line 916, in run
> self._run()
> File "apache/thermos/core/runner.py", line 923, in _run
> iteration_wait = runner.run()
> File "apache/thermos/core/runner.py", line 291, in run
> launched = self.runner._run_plan(self.runner._regular_plan)
> File "apache/thermos/core/runner.py", line 843, in _run_plan
> self._set_process_status(process_name, ProcessState.WAITING)
> File "apache/thermos/core/runner.py", line 687, in _set_process_status
> self._dispatcher.dispatch(self._state, runner_ckpt, self._recovery)
> File "apache/thermos/common/ckpt.py", line 384, in dispatch
> self._run_process_dispatch(process_update.state, process_update)
> File "apache/thermos/common/ckpt.py", line 215, in _run_process_dispatch
> getattr(handler, handler_function)(process_update)
> File "apache/thermos/core/runner.py", line 106, in on_waiting
> process_update.process, process_update.seq + 1))
> File "apache/thermos/core/runner.py", line 723, in
> _task_process_from_process_name
> preserve_env=self._preserve_env)
> File "apache/thermos/core/process.py", line 348, in __init__
> ProcessBase.__init__(self, *args, **kw)
> File "apache/thermos/core/process.py", line 142, in __init__
> raise ValueError("Logger mode %s is invalid." % self._logger_mode)
> ValueError: Logger mode rotate is invalid.
> {code}
> What's kind of obscure here: "rotate" is one of the [supported
> modes|https://github.com/apache/aurora/blob/master/src/main/python/apache/thermos/core/process.py#L73-L81].
>
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)