[
https://issues.apache.org/jira/browse/AURORA-175?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14144104#comment-14144104
]
brian wickman commented on AURORA-175:
--------------------------------------
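This seems to be an issue in psutil: the process username lookup raises a KeyError when the process's uid has no passwd entry (see the traceback below).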
{noformat}
I0922 17:07:00.212234 27053 exec.cpp:251] Received reconnect request from slave
20140729-023029-1890854154-5050-33440-170
I0922 17:07:00.223006 27047 exec.cpp:228] Executor re-registered on slave
20140729-023029-1890854154-5050-33440-170
ERROR] Caught exception in self.control(): 'getpwuid(): uid not found: 13241'
ERROR] Traceback (most recent call last):
File
"/var/lib/mesos/slaves/20140729-023029-1890854154-5050-33440-170/frameworks/201103282247-0000000019-0000/executors/thermos-1409459211102-drobinson-test-example-0-3d048ca5-29a4-47b0-824e-fd6c7c9cf9e8/runs/020cc0ec-03cf-4a7a-8b2b-8fcc21a75d21/thermos_runner.pex/.deps/apache.thermos.core-0.5.1_DEV1407435006-py2.6.egg/apache/thermos/core/runner.py",
line 556, in control
yield
File
"/var/lib/mesos/slaves/20140729-023029-1890854154-5050-33440-170/frameworks/201103282247-0000000019-0000/executors/thermos-1409459211102-drobinson-test-example-0-3d048ca5-29a4-47b0-824e-fd6c7c9cf9e8/runs/020cc0ec-03cf-4a7a-8b2b-8fcc21a75d21/thermos_runner.pex/.deps/apache.thermos.core-0.5.1_DEV1407435006-py2.6.egg/apache/thermos/core/runner.py",
line 843, in run
self._run()
File
"/var/lib/mesos/slaves/20140729-023029-1890854154-5050-33440-170/frameworks/201103282247-0000000019-0000/executors/thermos-1409459211102-drobinson-test-example-0-3d048ca5-29a4-47b0-824e-fd6c7c9cf9e8/runs/020cc0ec-03cf-4a7a-8b2b-8fcc21a75d21/thermos_runner.pex/.deps/apache.thermos.core-0.5.1_DEV1407435006-py2.6.egg/apache/thermos/core/runner.py",
line 850, in _run
iteration_wait = runner.run()
File
"/var/lib/mesos/slaves/20140729-023029-1890854154-5050-33440-170/frameworks/201103282247-0000000019-0000/executors/thermos-1409459211102-drobinson-test-example-0-3d048ca5-29a4-47b0-824e-fd6c7c9cf9e8/runs/020cc0ec-03cf-4a7a-8b2b-8fcc21a75d21/thermos_runner.pex/.deps/apache.thermos.core-0.5.1_DEV1407435006-py2.6.egg/apache/thermos/core/runner.py",
line 290, in run
launched = self.runner._run_plan(self.runner._regular_plan)
File
"/var/lib/mesos/slaves/20140729-023029-1890854154-5050-33440-170/frameworks/201103282247-0000000019-0000/executors/thermos-1409459211102-drobinson-test-example-0-3d048ca5-29a4-47b0-824e-fd6c7c9cf9e8/runs/020cc0ec-03cf-4a7a-8b2b-8fcc21a75d21/thermos_runner.pex/.deps/apache.thermos.core-0.5.1_DEV1407435006-py2.6.egg/apache/thermos/core/runner.py",
line 754, in _run_plan
if self.is_process_lost(process_name):
File
"/var/lib/mesos/slaves/20140729-023029-1890854154-5050-33440-170/frameworks/201103282247-0000000019-0000/executors/thermos-1409459211102-drobinson-test-example-0-3d048ca5-29a4-47b0-824e-fd6c7c9cf9e8/runs/020cc0ec-03cf-4a7a-8b2b-8fcc21a75d21/thermos_runner.pex/.deps/apache.thermos.core-0.5.1_DEV1407435006-py2.6.egg/apache/thermos/core/runner.py",
line 737, in is_process_lost
if forked_but_never_came_up() or running_but_coordinator_died():
File
"/var/lib/mesos/slaves/20140729-023029-1890854154-5050-33440-170/frameworks/201103282247-0000000019-0000/executors/thermos-1409459211102-drobinson-test-example-0-3d048ca5-29a4-47b0-824e-fd6c7c9cf9e8/runs/020cc0ec-03cf-4a7a-8b2b-8fcc21a75d21/thermos_runner.pex/.deps/apache.thermos.core-0.5.1_DEV1407435006-py2.6.egg/apache/thermos/core/runner.py",
line 730, in running_but_coordinator_died
coordinator_pid, _, _ = TaskRunnerHelper.scan_process(self.state,
process_name)
File
"/var/lib/mesos/slaves/20140729-023029-1890854154-5050-33440-170/frameworks/201103282247-0000000019-0000/executors/thermos-1409459211102-drobinson-test-example-0-3d048ca5-29a4-47b0-824e-fd6c7c9cf9e8/runs/020cc0ec-03cf-4a7a-8b2b-8fcc21a75d21/thermos_runner.pex/.deps/apache.thermos.core-0.5.1_DEV1407435006-py2.6.egg/apache/thermos/core/helper.py",
line 130, in scan_process
if cls.this_is_really_our_pid(coordinator_process, process_owner,
process_run.fork_time):
File
"/var/lib/mesos/slaves/20140729-023029-1890854154-5050-33440-170/frameworks/201103282247-0000000019-0000/executors/thermos-1409459211102-drobinson-test-example-0-3d048ca5-29a4-47b0-824e-fd6c7c9cf9e8/runs/020cc0ec-03cf-4a7a-8b2b-8fcc21a75d21/thermos_runner.pex/.deps/apache.thermos.core-0.5.1_DEV1407435006-py2.6.egg/apache/thermos/core/helper.py",
line 102, in this_is_really_our_pid
if process.username != current_user:
File
"/root/.pex/install/psutil-1.1.2-py2.6-linux-x86_64.egg.3c8008ea22662d5ffd7a5f0b056c41fdc13c1c49/psutil-1.1.2-py2.6-linux-x86_64.egg/psutil/__init__.py",
line 437, in username
return pwd.getpwuid(self.uids.real).pw_name
KeyError: 'getpwuid(): uid not found: 13241'
{noformat}
> thermos runner should discriminate failures using exit status
> -------------------------------------------------------------
>
> Key: AURORA-175
> URL: https://issues.apache.org/jira/browse/AURORA-175
> Project: Aurora
> Issue Type: Task
> Components: Executor, Thermos
> Reporter: brian wickman
> Priority: Critical
>
> We do the correct thing on the executor side when there is a
> configuration/interpolation problem (we report FAILURE).
> On the thermos_runner side, we don't exit with a separate exit status for bad
> configuration, nor do we even pay attention to the exit status from the
> thermos executor. So when the runner exits unexpectedly, that is always
> treated as LOST. Instead, there should be a contract between the
> thermos_runner and thermos_executor about certain classes of failures,
> indicated by exit statuses, so that we can differentiate between a legitimate
> LOST and a FAILURE, for example if the user no longer exists on the box.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)