Repository: aurora Updated Branches: refs/heads/master 193f17ee4 -> 17ade117b
Execute shell-based health checks as the task user. Bugs closed: AURORA-1641 Reviewed at https://reviews.apache.org/r/45506/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/17ade117 Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/17ade117 Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/17ade117 Branch: refs/heads/master Commit: 17ade117b8171d4ef4be62fd4b617a024658551d Parents: 193f17e Author: Bill Farner <[email protected]> Authored: Thu Mar 31 14:48:15 2016 -0700 Committer: Bill Farner <[email protected]> Committed: Thu Mar 31 14:48:15 2016 -0700 ---------------------------------------------------------------------- .../apache/aurora/common/health_check/shell.py | 23 ++++-- .../aurora/executor/common/health_checker.py | 14 +++- .../aurora/common/health_check/test_shell.py | 79 +++++++++----------- .../executor/common/test_health_checker.py | 7 +- .../http/http_example_bad_healthcheck.aurora | 21 ++++-- .../sh/org/apache/aurora/e2e/test_end_to_end.sh | 2 +- 6 files changed, 79 insertions(+), 67 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/17ade117/src/main/python/apache/aurora/common/health_check/shell.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/common/health_check/shell.py b/src/main/python/apache/aurora/common/health_check/shell.py index 6cb7dfc..3575082 100644 --- a/src/main/python/apache/aurora/common/health_check/shell.py +++ b/src/main/python/apache/aurora/common/health_check/shell.py @@ -26,16 +26,19 @@ else: class ShellHealthCheck(object): - def __init__(self, cmd, timeout_secs=None): + def __init__(self, cmd, preexec_fn=None, timeout_secs=None): """ Initialize with the commmand we would like to call. :param cmd: Command to execute that is expected to have a 0 return code on success. :type cmd: str + :param preexec_fn: Callable to invoke just before the child shell process is executed. + :type preexec_fn: callable :param timeout_secs: Timeout in seconds. :type timeout_secs: int """ - self.cmd = cmd - self.timeout_secs = timeout_secs + self._cmd = cmd + self._preexec_fn = preexec_fn + self._timeout_secs = timeout_secs def __call__(self): """ @@ -45,14 +48,18 @@ class ShellHealthCheck(object): :rtype tuple: """ try: - subprocess.check_call(self.cmd, timeout=self.timeout_secs, shell=True) + subprocess.check_call( + self._cmd, + timeout=self._timeout_secs, + shell=True, + preexec_fn=self._preexec_fn) return True, None except subprocess.CalledProcessError as reason: # The command didn't return a 0 so provide reason for failure. return False, str(reason) + except subprocess.TimeoutExpired: + return False, 'Health check timed out.' except OSError as e: - reason = 'OSError: {error}'.format(error=e.strerror) - return False, reason + return False, 'OSError: %s' % e.strerror except ValueError: - reason = 'Invalid commmand.' - return False, reason + return False, 'Invalid commmand.' http://git-wip-us.apache.org/repos/asf/aurora/blob/17ade117/src/main/python/apache/aurora/executor/common/health_checker.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/executor/common/health_checker.py b/src/main/python/apache/aurora/executor/common/health_checker.py index 28fd3ec..d8675be 100644 --- a/src/main/python/apache/aurora/executor/common/health_checker.py +++ b/src/main/python/apache/aurora/executor/common/health_checker.py @@ -13,6 +13,7 @@ # import os.path +import pwd import threading import time import traceback @@ -243,10 +244,15 @@ class HealthCheckerProvider(StatusCheckerProvider): task=assigned_task, cmd=shell_command ) - shell_signaler = ShellHealthCheck( - cmd=interpolated_command, - timeout_secs=timeout_secs, - ) + + pw_entry = pwd.getpwnam(assigned_task.task.job.role) + def demote_to_user(): + os.setgid(pw_entry.pw_gid) + os.setuid(pw_entry.pw_uid) + + shell_signaler = ShellHealthCheck(cmd=interpolated_command, + preexec_fn=demote_to_user, + timeout_secs=timeout_secs) a_health_checker = lambda: shell_signaler() else: portmap = resolve_ports(mesos_task, assigned_task.assignedPorts) http://git-wip-us.apache.org/repos/asf/aurora/blob/17ade117/src/test/python/apache/aurora/common/health_check/test_shell.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/common/health_check/test_shell.py b/src/test/python/apache/aurora/common/health_check/test_shell.py index 7026af8..011464c 100644 --- a/src/test/python/apache/aurora/common/health_check/test_shell.py +++ b/src/test/python/apache/aurora/common/health_check/test_shell.py @@ -31,65 +31,56 @@ else: class TestHealthChecker(unittest.TestCase): - @mock.patch('subprocess32.check_call') - def test_health_check_ok(self, mock_sub): - timeout = 30 - cmd = 'success cmd' - shell = ShellHealthCheck(cmd, timeout_secs=timeout) + @mock.patch('subprocess32.check_call', autospec=True) + def test_health_check_ok(self, mock_popen): + shell = ShellHealthCheck('cmd', timeout_secs=30) success, msg = shell() self.assertTrue(success) self.assertIsNone(msg) - mock_sub.assert_called_once_with( - 'success cmd', - timeout=30, - shell=True, - ) - - @mock.patch('subprocess32.check_call') - def test_health_check_failed(self, mock_sub): - timeout = 30 + mock_popen.assert_called_once_with('cmd', shell=True, timeout=30, preexec_fn=mock.ANY) + + @mock.patch('subprocess32.check_call', autospec=True) + def test_health_check_failed(self, mock_popen): # Fail due to command returning a non-0 exit status. - mock_sub.side_effect = subprocess.CalledProcessError(1, 'failed') - cmd = 'cmd to fail' - shell = ShellHealthCheck(cmd, timeout_secs=timeout) + mock_popen.side_effect = subprocess.CalledProcessError(1, 'failed') + + shell = ShellHealthCheck('cmd', timeout_secs=30) success, msg = shell() - mock_sub.assert_called_once_with( - 'cmd to fail', - timeout=30, - shell=True, - ) + mock_popen.assert_called_once_with('cmd', shell=True, timeout=30, preexec_fn=mock.ANY) + self.assertFalse(success) self.assertEqual(msg, "Command 'failed' returned non-zero exit status 1") - @mock.patch('subprocess32.check_call') - def test_health_check_os_error(self, mock_sub): - timeout = 30 + @mock.patch('subprocess32.check_call', autospec=True) + def test_health_check_timeout(self, mock_popen): + # Fail due to command returning a non-0 exit status. + mock_popen.side_effect = subprocess.TimeoutExpired('failed', timeout=30) + + shell = ShellHealthCheck('cmd', timeout_secs=30) + success, msg = shell() + mock_popen.assert_called_once_with('cmd', shell=True, timeout=30, preexec_fn=mock.ANY) + + self.assertFalse(success) + self.assertEqual(msg, 'Health check timed out.') + + @mock.patch('subprocess32.check_call', autospec=True) + def test_health_check_os_error(self, mock_popen): # Fail due to command not existing. - mock_sub.side_effect = OSError(1, 'failed') - cmd = 'cmd to not exist' - shell = ShellHealthCheck(cmd, timeout_secs=timeout) + mock_popen.side_effect = OSError(1, 'failed') + + shell = ShellHealthCheck('cmd', timeout_secs=30) success, msg = shell() - mock_sub.assert_called_once_with( - 'cmd to not exist', - timeout=30, - shell=True, - ) + mock_popen.assert_called_once_with('cmd', shell=True, timeout=30, preexec_fn=mock.ANY) self.assertFalse(success) self.assertEqual(msg, 'OSError: failed') - @mock.patch('subprocess32.check_call') - def test_health_check_value_error(self, mock_sub): - timeout = 30 + @mock.patch('subprocess32.check_call', autospec=True) + def test_health_check_value_error(self, mock_popen): # Invalid commmand passed in raises ValueError. - mock_sub.side_effect = ValueError('Could not read command.') - cmd = 'defensive cmd' + mock_popen.side_effect = ValueError('Could not read command.') timeout = 10 - shell = ShellHealthCheck(cmd, timeout_secs=timeout) + shell = ShellHealthCheck('cmd', timeout_secs=timeout) success, msg = shell() - mock_sub.assert_called_once_with( - 'defensive cmd', - timeout=10, - shell=True, - ) + mock_popen.assert_called_once_with('cmd', shell=True, timeout=10, preexec_fn=mock.ANY) self.assertFalse(success) self.assertEqual(msg, 'Invalid commmand.') http://git-wip-us.apache.org/repos/asf/aurora/blob/17ade117/src/test/python/apache/aurora/executor/common/test_health_checker.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/executor/common/test_health_checker.py b/src/test/python/apache/aurora/executor/common/test_health_checker.py index 19c4f76..b0ce81a 100644 --- a/src/test/python/apache/aurora/executor/common/test_health_checker.py +++ b/src/test/python/apache/aurora/executor/common/test_health_checker.py @@ -39,7 +39,7 @@ from apache.aurora.executor.common.sandbox import SandboxInterface from .fixtures import HELLO_WORLD, MESOS_JOB -from gen.apache.aurora.api.ttypes import AssignedTask, ExecutorConfig, TaskConfig +from gen.apache.aurora.api.ttypes import AssignedTask, ExecutorConfig, JobKey, TaskConfig class TestHealthChecker(unittest.TestCase): @@ -246,13 +246,15 @@ class TestHealthCheckerProvider(unittest.TestCase): assert health_checker.threaded_health_checker.interval == interval_secs assert health_checker.threaded_health_checker.initial_interval == initial_interval_secs - def test_from_assigned_task_shell(self): + @mock.patch('pwd.getpwnam') + def test_from_assigned_task_shell(self, mock_getpwnam): interval_secs = 17 initial_interval_secs = 3 max_consecutive_failures = 2 timeout_secs = 5 shell_config = ShellHealthChecker(shell_command='failed command') task_config = TaskConfig( + job=JobKey(role='role', environment='env', name='name'), executorConfig=ExecutorConfig( name='thermos-generic', data=MESOS_JOB( @@ -276,6 +278,7 @@ class TestHealthCheckerProvider(unittest.TestCase): assert health_checker.threaded_health_checker.initial_interval == initial_interval_secs hct_max_fail = health_checker.threaded_health_checker.max_consecutive_failures assert hct_max_fail == max_consecutive_failures + mock_getpwnam.assert_called_once_with(task_config.job.role) def test_interpolate_cmd(self): """Making sure thermos.ports[foo] gets correctly substituted with assignedPorts info.""" http://git-wip-us.apache.org/repos/asf/aurora/blob/17ade117/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora ---------------------------------------------------------------------- diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora b/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora index c709b03..0534c9e 100644 --- a/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora +++ b/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora @@ -18,7 +18,8 @@ DEFAULT_CMD = 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .' run_server = Process( name = 'run_server', - cmdline = 'python http_example.py {{thermos.ports[http]}}') + cmdline = 'python http_example.py {{thermos.ports[http]}}' +) stage_server = Process( name = 'stage_server', @@ -29,18 +30,22 @@ test_task = Task( name = 'http_example', resources = Resources(cpu=0.5, ram=32*MB, disk=64*MB), processes = [stage_server, run_server], - constraints = order(stage_server, run_server)) + constraints = order(stage_server, run_server) +) update_config = UpdateConfig(watch_secs=10, batch_size=2) # "I am going to fail" config. shell_config = ShellHealthChecker( - shell_command='grep foo' - ) + # This shell validates two things: + # 1. the shell command is not run as a privileged user + # 2. a failed shell command results in a health check failure + shell_command='rm -f /etc/passwd' +) health_check_config = HealthCheckConfig( - health_checker=HealthCheckerConfig(shell=shell_config), - initial_interval_secs=5, - interval_secs=1, - ) + health_checker=HealthCheckerConfig(shell=shell_config), + initial_interval_secs=5, + interval_secs=1, +) job = Service( cluster = 'devcluster', http://git-wip-us.apache.org/repos/asf/aurora/blob/17ade117/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh ---------------------------------------------------------------------- diff --git a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh index e1c12bb..77a4c36 100755 --- a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh +++ b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh @@ -202,7 +202,7 @@ test_update_fail() { | tail -n +2 | awk '{print $2}') # || is so that we don't return an EXIT so that `trap collect_result` doesn't get triggered. aurora update wait $_jobkey $_update_id || echo $? - # MAKING SURE WE ROLLED BACK. + # Making sure we rolled back. local status=$(aurora update info $_jobkey $_update_id | grep 'Current status' | awk '{print $NF}') if [[ $status != "ROLLED_BACK" ]]; then echo "Update should have completed in ROLLED_BACK state due to failed healthcheck."
