Repository: aurora Updated Branches: refs/heads/master f91ecd107 -> 0d7f946f7
Add support for performing health checks with a shell command. Bugs closed: AURORA-1551 Reviewed at https://reviews.apache.org/r/41154/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/0d7f946f Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/0d7f946f Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/0d7f946f Branch: refs/heads/master Commit: 0d7f946f76e4600cec878e23d5b2a44702f9b4f6 Parents: f91ecd1 Author: Dmitriy Shirchenko <[email protected]> Authored: Mon Dec 14 21:37:44 2015 -0800 Committer: Maxim Khutornenko <[email protected]> Committed: Mon Dec 14 21:37:44 2015 -0800 ---------------------------------------------------------------------- 3rdparty/python/requirements.txt | 1 + docs/configuration-reference.md | 16 +-- src/main/python/apache/aurora/client/config.py | 44 ++++++++ src/main/python/apache/aurora/common/BUILD | 1 + .../aurora/common/health_check/__init__.py | 13 +++ .../aurora/common/health_check/http_signaler.py | 112 +++++++++++++++++++ .../apache/aurora/common/health_check/shell.py | 60 ++++++++++ .../apache/aurora/common/http_signaler.py | 105 ----------------- .../python/apache/aurora/config/schema/base.py | 10 +- .../aurora/executor/common/health_checker.py | 53 ++++++--- .../apache/aurora/executor/http_lifecycle.py | 2 +- src/test/python/apache/aurora/client/BUILD | 1 + .../python/apache/aurora/client/test_config.py | 65 +++++++++++ src/test/python/apache/aurora/common/BUILD | 10 -- .../apache/aurora/common/health_check/BUILD | 39 +++++++ .../aurora/common/health_check/__init__.py | 13 +++ .../common/health_check/test_http_signaler.py | 111 ++++++++++++++++++ .../aurora/common/health_check/test_shell.py | 91 +++++++++++++++ .../apache/aurora/common/test_http_signaler.py | 111 ------------------ .../executor/common/test_health_checker.py | 56 +++++++++- .../apache/aurora/e2e/http/http_example.aurora | 2 +- .../http/http_example_bad_healthcheck.aurora | 73 
++++++++++++ .../aurora/e2e/http/http_example_updated.aurora | 2 +- .../sh/org/apache/aurora/e2e/test_end_to_end.sh | 38 ++++++- 24 files changed, 769 insertions(+), 260 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/3rdparty/python/requirements.txt ---------------------------------------------------------------------- diff --git a/3rdparty/python/requirements.txt b/3rdparty/python/requirements.txt index 16fa35b..8c72880 100644 --- a/3rdparty/python/requirements.txt +++ b/3rdparty/python/requirements.txt @@ -24,6 +24,7 @@ psutil==3.2.2 pystachio==0.8.0 requests==2.7.0 requests-kerberos==0.7.0 +subprocess32==3.2.7 thrift==0.9.1 twitter.common.app==0.3.3 twitter.common.collections==0.3.3 http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/docs/configuration-reference.md ---------------------------------------------------------------------- diff --git a/docs/configuration-reference.md b/docs/configuration-reference.md index 077979a..07149c7 100644 --- a/docs/configuration-reference.md +++ b/docs/configuration-reference.md @@ -380,17 +380,19 @@ Parameters for controlling the rate and policy of rolling updates. ### HealthCheckConfig Objects -Parameters for controlling a task's health checks via HTTP. +Parameters for controlling a task's health checks via HTTP or a shell command. | object | type | description | ------- | :-------: | -------- -| ```initial_interval_secs``` | Integer | Initial delay for performing an HTTP health check. (Default: 15) -| ```interval_secs``` | Integer | Interval on which to check the task's health via HTTP. (Default: 10) -| ```max_consecutive_failures``` | Integer | Maximum number of consecutive failures that tolerated before considering a task unhealthy (Default: 0) -| ```timeout_secs``` | Integer | HTTP request timeout. 
(Default: 1) | ```endpoint``` | String | HTTP endpoint to check (Default: /health) -| ```expected_response``` | String | If not empty, fail the health check if the response differs. Case insensitive. (Default: ok) -| ```expected_response_code``` | Integer | If not zero, fail the health check if the response code differs. (Default: 0) +| ```expected_response``` | String | If not empty, fail the HTTP health check if the response differs. Case insensitive. (Default: ok) +| ```expected_response_code``` | Integer | If not zero, fail the HTTP health check if the response code differs. (Default: 0) +| ```initial_interval_secs``` | Integer | Initial delay for performing a health check. (Default: 15) +| ```interval_secs``` | Integer | Interval on which to check the task's health. (Default: 10) +| ```max_consecutive_failures``` | Integer | Maximum number of consecutive failures that will be tolerated before considering a task unhealthy (Default: 0) +| ```shell_command``` | String | An alternative to HTTP health checking. Specifies a shell command that will be executed. Any non-zero exit status will be interpreted as a health check failure. +| ```type``` | String | 'http' or 'shell'. (Default: 'http') +| ```timeout_secs``` | Integer | Health check timeout in seconds; applies to the HTTP request or to the shell command. 
(Default: 1) ### Announcer Objects http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/client/config.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/client/config.py b/src/main/python/apache/aurora/client/config.py index 2fc1255..161c362 100644 --- a/src/main/python/apache/aurora/client/config.py +++ b/src/main/python/apache/aurora/client/config.py @@ -22,6 +22,8 @@ import math import re import sys +from pystachio.composite import Empty + from apache.aurora.client import binding_helper from apache.aurora.client.base import die from apache.aurora.config import AuroraConfig @@ -68,6 +70,47 @@ def _validate_environment_name(config): __validate_env(env_name, 'Environment') +CANNOT_HAVE_HTTP_ARGS_WITH_SHELL_ERROR = ''' +shell_command does not support supplied http arguments. +''' + +CANNOT_HAVE_SHELL_ARGS_WITH_HTTP_ERROR = ''' +Cannot define shell_commmand for HTTP health check. +''' + +INVALID_HEALTH_CHECK_TYPE = ''' +Invalid health check type {health_check_type}. +''' + +MUST_PROVIDE_SHELL_COMMAND_ERROR = ''' +Must provide a shell command for shell type. +''' + + +HTTP_HEALTH_CHECK = 'http' +SHELL_HEALTH_CHECK = 'shell' + + +# TODO (AURORA-1552): Add config validation to the executor +def _validate_health_check_config(config): + health_check_config = config.health_check_config() + health_check_type = health_check_config.type().get() + + # Make sure we either have HTTP or SHELL. + if health_check_type not in {HTTP_HEALTH_CHECK, SHELL_HEALTH_CHECK}: + die(INVALID_HEALTH_CHECK_TYPE.format(health_check_type=health_check_type)) + if health_check_type == SHELL_HEALTH_CHECK: + # SHELL options + shell_command = health_check_config.shell_command() + if shell_command == Empty: + # Must define a command. + die(MUST_PROVIDE_SHELL_COMMAND_ERROR) + elif health_check_type == HTTP_HEALTH_CHECK: + if health_check_config.shell_command() != Empty: + # No shell_command for HTTP. 
+ die(CANNOT_HAVE_SHELL_ARGS_WITH_HTTP_ERROR) + + UPDATE_CONFIG_MAX_FAILURES_ERROR = ''' max_total_failures in update_config must be lesser than the job size. Based on your job size (%s) you should use max_total_failures <= %s. @@ -118,6 +161,7 @@ def validate_config(config, env=None): _validate_update_config(config) _validate_announce_configuration(config) _validate_environment_name(config) + _validate_health_check_config(config) class GlobalHookRegistry(object): http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/common/BUILD ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/common/BUILD b/src/main/python/apache/aurora/common/BUILD index 5fce3d0..0e4c510 100644 --- a/src/main/python/apache/aurora/common/BUILD +++ b/src/main/python/apache/aurora/common/BUILD @@ -21,6 +21,7 @@ python_library( '3rdparty/python:pex', '3rdparty/python:pystachio', '3rdparty/python:requests', + '3rdparty/python:subprocess32', '3rdparty/python:thrift', '3rdparty/python:twitter.common.collections', '3rdparty/python:twitter.common.lang', http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/common/health_check/__init__.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/common/health_check/__init__.py b/src/main/python/apache/aurora/common/health_check/__init__.py new file mode 100644 index 0000000..0663a9a --- /dev/null +++ b/src/main/python/apache/aurora/common/health_check/__init__.py @@ -0,0 +1,13 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/common/health_check/http_signaler.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/common/health_check/http_signaler.py b/src/main/python/apache/aurora/common/health_check/http_signaler.py new file mode 100644 index 0000000..41b6bfb --- /dev/null +++ b/src/main/python/apache/aurora/common/health_check/http_signaler.py @@ -0,0 +1,112 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import contextlib +import os +from socket import timeout as SocketTimeout + +from twitter.common import log +from twitter.common.lang import Compatibility + +if Compatibility.PY3: + from http.client import HTTPException + import urllib.request as urllib_request + from urllib.error import URLError, HTTPError +else: + from httplib import HTTPException + import urllib2 as urllib_request + from urllib2 import URLError, HTTPError + + +class HttpSignaler(object): + """Simple HTTP endpoint wrapper to check health or trigger quitquitquit/abortabortabort""" + TIMEOUT_SECS = 1.0 + FAILURE_REASON_LENGTH = 10 + + class Error(Exception): pass + class QueryError(Error): pass + + def __init__(self, port, host='localhost', timeout_secs=None): + self._host = host + self._url_base = 'http://%s:%d' % (host, port) + if timeout_secs is None: + env_timeout = os.getenv('AURORA_HTTP_SIGNALER_TIMEOUT_SECS') + if env_timeout is not None: + log.info('Using timeout %s secs (from AURORA_HTTP_SIGNALER_TIMEOUT_SECS).' % env_timeout) + self._timeout_secs = float(env_timeout) + else: + log.debug('Using timeout %s secs (default).' % self.TIMEOUT_SECS) + self._timeout_secs = self.TIMEOUT_SECS + else: + log.debug('Using timeout %s secs.' 
% timeout_secs) + self._timeout_secs = timeout_secs + + def url(self, endpoint): + return self._url_base + endpoint + + @property + def opener(self): + return urllib_request.urlopen + + def query(self, endpoint, data=None): + """Request an HTTP endpoint with a GET request (or POST if data is not None)""" + url = self.url(endpoint) + log.debug("%s: %s %s" % (self.__class__.__name__, 'GET' if data is None else 'POST', url)) + + def raise_error(reason): + raise self.QueryError('Failed to signal %s: %s' % (self.url(endpoint), reason)) + + try: + with contextlib.closing( + self.opener(url, data, timeout=self._timeout_secs)) as fp: + return (fp.read(), fp.getcode()) + except (HTTPException, SocketTimeout) as e: + # the type of an HTTPException is typically more useful than its contents (since for example + # BadStatusLines are often empty). likewise with socket.timeout. + raise_error('Error within %s' % e.__class__.__name__) + except HTTPError as e: + return ('', e.code) + except URLError as e: + raise_error(e) + except Exception as e: + raise_error('Unexpected error: %s' % e) + + def __call__(self, endpoint, use_post_method=False, expected_response=None, + expected_response_code=None): + """ + Returns a (boolean, string|None) tuple of (call success, failure reason) + :type endpoint: str + :type use_post_method: bool + :type expected_response: str + :type expected_response_code: int + :rtype (bool, str): + """ + try: + response, response_code = self.query(endpoint, '' if use_post_method else None) + response = response.strip().lower() + if expected_response and response != expected_response.lower(): + reason = 'Response differs from expected response (expected "%s", got "%s")' + def shorten(string): + return (string if len(string) < self.FAILURE_REASON_LENGTH + else "%s..." 
% string[:self.FAILURE_REASON_LENGTH - 3]) + log.warning(reason % (expected_response, response)) + return (False, reason % (shorten(str(expected_response)), shorten(str(response)))) + elif expected_response_code and response_code != expected_response_code: + reason = 'Response code differs from expected response (expected %i, got %i)' + log.warning(reason % (expected_response_code, response_code)) + return (False, reason % (expected_response_code, response_code)) + else: + return (True, None) + except self.QueryError as e: + return (False, str(e)) http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/common/health_check/shell.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/common/health_check/shell.py b/src/main/python/apache/aurora/common/health_check/shell.py new file mode 100644 index 0000000..890bf0c --- /dev/null +++ b/src/main/python/apache/aurora/common/health_check/shell.py @@ -0,0 +1,60 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import shlex +import sys + +# Recommended pattern for Python 2 and 3 support from https://github.com/google/python-subprocess32 +# Backport which adds bug fixes and timeout support for Python 2.7 +if os.name == 'posix' and sys.version_info[0] < 3: + import subprocess32 as subprocess +else: + # subprocess is included as part of Python standard lib in Python 3+. 
+ import subprocess + + +class ShellHealthCheck(object): + + def __init__(self, cmd, timeout_secs=None): + """ + Initialize with the commmand we would like to call. + :param cmd: Command to execute that is expected to have a 0 return code on success. + :type cmd: str + :param timeout_secs: Timeout in seconds. + :type timeout_secs: int + """ + self.cmd = cmd + self.timeout_secs = timeout_secs + + def __call__(self): + """ + Call a shell command line health check. + + :return: A tuple of (bool, str) + :rtype tuple: + """ + cmd = shlex.split(self.cmd) + try: + subprocess.check_call(cmd, timeout=self.timeout_secs) + return True, None + except subprocess.CalledProcessError as reason: + # The command didn't return a 0 so provide reason for failure. + return False, str(reason) + except OSError as e: + reason = 'OSError: {error}'.format(error=e.strerror) + return False, reason + except ValueError: + reason = 'Invalid commmand.' + return False, reason http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/common/http_signaler.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/common/http_signaler.py b/src/main/python/apache/aurora/common/http_signaler.py deleted file mode 100644 index a3193f3..0000000 --- a/src/main/python/apache/aurora/common/http_signaler.py +++ /dev/null @@ -1,105 +0,0 @@ -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import contextlib -import os -from socket import timeout as SocketTimeout - -from twitter.common import log -from twitter.common.lang import Compatibility - -if Compatibility.PY3: - from http.client import HTTPException - import urllib.request as urllib_request - from urllib.error import URLError, HTTPError -else: - from httplib import HTTPException - import urllib2 as urllib_request - from urllib2 import URLError, HTTPError - - -class HttpSignaler(object): - """Simple HTTP endpoint wrapper to check health or trigger quitquitquit/abortabortabort""" - TIMEOUT_SECS = 1.0 - FAILURE_REASON_LENGTH = 10 - - class Error(Exception): pass - class QueryError(Error): pass - - def __init__(self, port, host='localhost', timeout_secs=None): - self._host = host - self._url_base = 'http://%s:%d' % (host, port) - if timeout_secs is None: - env_timeout = os.getenv('AURORA_HTTP_SIGNALER_TIMEOUT_SECS') - if env_timeout is not None: - log.info('Using timeout %s secs (from AURORA_HTTP_SIGNALER_TIMEOUT_SECS).' % env_timeout) - self._timeout_secs = float(env_timeout) - else: - log.debug('Using timeout %s secs (default).' % self.TIMEOUT_SECS) - self._timeout_secs = self.TIMEOUT_SECS - else: - log.debug('Using timeout %s secs.' 
% timeout_secs) - self._timeout_secs = timeout_secs - - def url(self, endpoint): - return self._url_base + endpoint - - @property - def opener(self): - return urllib_request.urlopen - - def query(self, endpoint, data=None): - """Request an HTTP endpoint with a GET request (or POST if data is not None)""" - url = self.url(endpoint) - log.debug("%s: %s %s" % (self.__class__.__name__, 'GET' if data is None else 'POST', url)) - - def raise_error(reason): - raise self.QueryError('Failed to signal %s: %s' % (self.url(endpoint), reason)) - - try: - with contextlib.closing( - self.opener(url, data, timeout=self._timeout_secs)) as fp: - return (fp.read(), fp.getcode()) - except (HTTPException, SocketTimeout) as e: - # the type of an HTTPException is typically more useful than its contents (since for example - # BadStatusLines are often empty). likewise with socket.timeout. - raise_error('Error within %s' % e.__class__.__name__) - except HTTPError as e: - return ('', e.code) - except URLError as e: - raise_error(e) - except Exception as e: - raise_error('Unexpected error: %s' % e) - - def __call__(self, endpoint, use_post_method=False, expected_response=None, - expected_response_code=None): - """Returns a (boolean, string|None) tuple of (call success, failure reason)""" - try: - response, response_code = self.query(endpoint, '' if use_post_method else None) - response = response.strip().lower() - if expected_response and response != expected_response.lower(): - reason = 'Response differs from expected response (expected "%s", got "%s")' - def shorten(string): - return (string if len(string) < self.FAILURE_REASON_LENGTH - else "%s..." 
% string[:self.FAILURE_REASON_LENGTH - 3]) - log.warning(reason % (expected_response, response)) - return (False, reason % (shorten(str(expected_response)), shorten(str(response)))) - elif expected_response_code and response_code != expected_response_code: - reason = 'Response code differs from expected response (expected %i, got %i)' - log.warning(reason % (expected_response_code, response_code)) - return (False, reason % (expected_response_code, response_code)) - else: - return (True, None) - except self.QueryError as e: - return (False, str(e)) http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/config/schema/base.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/config/schema/base.py b/src/main/python/apache/aurora/config/schema/base.py index 398f737..e752482 100644 --- a/src/main/python/apache/aurora/config/schema/base.py +++ b/src/main/python/apache/aurora/config/schema/base.py @@ -37,13 +37,15 @@ class UpdateConfig(Struct): class HealthCheckConfig(Struct): - initial_interval_secs = Default(Float, 15.0) - interval_secs = Default(Float, 10.0) - timeout_secs = Default(Float, 1.0) - max_consecutive_failures = Default(Integer, 0) endpoint = Default(String, '/health') expected_response = Default(String, 'ok') expected_response_code = Default(Integer, 0) + initial_interval_secs = Default(Float, 15.0) + interval_secs = Default(Float, 10.0) + max_consecutive_failures = Default(Integer, 0) + shell_command = String + type = Default(String, 'http') + timeout_secs = Default(Float, 1.0) class HttpLifecycleConfig(Struct): http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/executor/common/health_checker.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/executor/common/health_checker.py b/src/main/python/apache/aurora/executor/common/health_checker.py index 
03fdf0a..cba4e8c 100644 --- a/src/main/python/apache/aurora/executor/common/health_checker.py +++ b/src/main/python/apache/aurora/executor/common/health_checker.py @@ -22,11 +22,15 @@ from twitter.common import log from twitter.common.exceptions import ExceptionalThread from twitter.common.metrics import LambdaGauge -from apache.aurora.common.http_signaler import HttpSignaler +from apache.aurora.common.health_check.http_signaler import HttpSignaler +from apache.aurora.common.health_check.shell import ShellHealthCheck from .status_checker import StatusChecker, StatusCheckerProvider, StatusResult from .task_info import mesos_task_instance_from_assigned_task, resolve_ports +HTTP_HEALTH_CHECK = 'http' +SHELL_HEALTH_CHECK = 'shell' + class ThreadedHealthChecker(ExceptionalThread): """Perform a health check to determine if a service is healthy or not @@ -200,23 +204,44 @@ class HealthChecker(StatusChecker): class HealthCheckerProvider(StatusCheckerProvider): def from_assigned_task(self, assigned_task, sandbox): + """ + :param assigned_task: + :param sandbox: + :return: Instance of a HealthChecker. + """ mesos_task = mesos_task_instance_from_assigned_task(assigned_task) portmap = resolve_ports(mesos_task, assigned_task.assignedPorts) - if 'health' not in portmap: - return None - health_check_config = mesos_task.health_check_config().get() - http_signaler = HttpSignaler( + health_check_type = health_check_config.get('type') + + # We don't need a port if we are running a shell command. 
+ if health_check_type == HTTP_HEALTH_CHECK and 'health' not in portmap: + return None + timeout_secs = health_check_config.get('timeout_secs') + + if health_check_type == SHELL_HEALTH_CHECK: + shell_command = health_check_config.get('shell_command') + shell_signaler = ShellHealthCheck( + cmd=shell_command, + timeout_secs=timeout_secs + ) + a_health_checker = lambda: shell_signaler() + else: + http_signaler = HttpSignaler( portmap['health'], - timeout_secs=health_check_config.get('timeout_secs')) + timeout_secs=timeout_secs) + a_health_checker = lambda: http_signaler( + endpoint=health_check_config.get('endpoint'), + expected_response=health_check_config.get('expected_response'), + expected_response_code=health_check_config.get('expected_response_code') + ) + health_checker = HealthChecker( - lambda: http_signaler( - endpoint=health_check_config.get('endpoint'), - expected_response=health_check_config.get('expected_response'), - expected_response_code=health_check_config.get('expected_response_code')), - sandbox, - interval_secs=health_check_config.get('interval_secs'), - initial_interval_secs=health_check_config.get('initial_interval_secs'), - max_consecutive_failures=health_check_config.get('max_consecutive_failures')) + a_health_checker, + sandbox, + interval_secs=health_check_config.get('interval_secs'), + initial_interval_secs=health_check_config.get('initial_interval_secs'), + max_consecutive_failures=health_check_config.get('max_consecutive_failures')) + return health_checker http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/executor/http_lifecycle.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/executor/http_lifecycle.py b/src/main/python/apache/aurora/executor/http_lifecycle.py index 6d578cc..9280bf2 100644 --- a/src/main/python/apache/aurora/executor/http_lifecycle.py +++ b/src/main/python/apache/aurora/executor/http_lifecycle.py @@ -17,7 +17,7 @@ 
import time from twitter.common import log from twitter.common.quantity import Amount, Time -from apache.aurora.common.http_signaler import HttpSignaler +from apache.aurora.common.health_check.http_signaler import HttpSignaler from .common.task_runner import TaskError, TaskRunner http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/client/BUILD ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/client/BUILD b/src/test/python/apache/aurora/client/BUILD index 1ead9ae..c9d7616 100644 --- a/src/test/python/apache/aurora/client/BUILD +++ b/src/test/python/apache/aurora/client/BUILD @@ -46,6 +46,7 @@ python_tests(name = 'config', sources = ['test_config.py'], dependencies = [ '3rdparty/python:mox', + '3rdparty/python:pystachio', 'src/main/python/apache/aurora/client', ], ) http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/client/test_config.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/client/test_config.py b/src/test/python/apache/aurora/client/test_config.py index b1a3c18..8fd112f 100644 --- a/src/test/python/apache/aurora/client/test_config.py +++ b/src/test/python/apache/aurora/client/test_config.py @@ -180,6 +180,71 @@ def test_dedicated_portmap(): constraints={'foo': 'bar'}))) +def test_health_check_config_http_ok(): + base_job = Job( + name='hello_bond', role='james', cluster='marine-cluster', + health_check_config=HealthCheckConfig( + max_consecutive_failures=1, + type='http', + ), + task=Task(name='main', processes=[], + resources=Resources(cpu=0.1, ram=64 * MB, disk=64 * MB))) + config._validate_health_check_config(AuroraConfig(base_job)) + + +def test_health_check_config_shell_ok(): + base_job = Job( + name='hello_bond', role='james', cluster='marine-cluster', + health_check_config=HealthCheckConfig( + max_consecutive_failures=1, + type='shell', + 
shell_command='foo bar' + ), + task=Task(name='main', processes=[], + resources=Resources(cpu=0.1, ram=64 * MB, disk=64 * MB))) + config._validate_health_check_config(AuroraConfig(base_job)) + + +def test_health_check_config_invalid_type(): + base_job = Job( + name='hello_bond', role='james', cluster='marine-cluster', + health_check_config=HealthCheckConfig( + max_consecutive_failures=1, + type='foo', + ), + task=Task(name='main', processes=[], + resources=Resources(cpu=0.1, ram=64 * MB, disk=64 * MB))) + with pytest.raises(SystemExit): + config._validate_health_check_config(AuroraConfig(base_job)) + + +def test_health_check_config_http_and_shell_defined(): + base_job = Job( + name='hello_bond', role='james', cluster='marine-cluster', + health_check_config=HealthCheckConfig( + max_consecutive_failures=1, + type='http', + shell_command='foo bar' + ), + task=Task(name='main', processes=[], + resources=Resources(cpu=0.1, ram=64 * MB, disk=64 * MB))) + with pytest.raises(SystemExit): + config._validate_health_check_config(AuroraConfig(base_job)) + + +def test_health_check_config_shell_no_command(): + base_job = Job( + name='hello_bond', role='james', cluster='marine-cluster', + health_check_config=HealthCheckConfig( + max_consecutive_failures=1, + type='shell', + ), + task=Task(name='main', processes=[], + resources=Resources(cpu=0.1, ram=64 * MB, disk=64 * MB))) + with pytest.raises(SystemExit): + config._validate_health_check_config(AuroraConfig(base_job)) + + def test_update_config_passes_with_default_values(): base_job = Job( name='hello_world', role='john_doe', cluster='test-cluster', http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/BUILD ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/common/BUILD b/src/test/python/apache/aurora/common/BUILD index f903c19..7909ab5 100644 --- a/src/test/python/apache/aurora/common/BUILD +++ 
b/src/test/python/apache/aurora/common/BUILD @@ -19,7 +19,6 @@ target( ':test_cluster', ':test_clusters', ':test_cluster_option', - ':test_http_signaler', ':test_pex_version', ':test_shellify', ':test_transport', @@ -69,15 +68,6 @@ python_tests( ) python_tests( - name = 'test_http_signaler', - sources = ['test_http_signaler.py'], - dependencies = [ - '3rdparty/python:mox', - 'src/main/python/apache/aurora/common', - ] -) - -python_tests( name = 'test_shellify', sources = ['test_shellify.py'], dependencies = [ http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/health_check/BUILD ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/common/health_check/BUILD b/src/test/python/apache/aurora/common/health_check/BUILD new file mode 100644 index 0000000..98a2481 --- /dev/null +++ b/src/test/python/apache/aurora/common/health_check/BUILD @@ -0,0 +1,39 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +target( + name = 'all', + dependencies = [ + ':test_http_signaler', + ':test_shell', + ] +) + +python_tests( + name = 'test_http_signaler', + sources = ['test_http_signaler.py'], + dependencies = [ + '3rdparty/python:mox', + 'src/main/python/apache/aurora/common', + ] +) + +python_tests( + name = 'test_shell', + sources = ['test_shell.py'], + dependencies = [ + '3rdparty/python:mock', + 'src/main/python/apache/aurora/common', + ] +) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/health_check/__init__.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/common/health_check/__init__.py b/src/test/python/apache/aurora/common/health_check/__init__.py new file mode 100644 index 0000000..0663a9a --- /dev/null +++ b/src/test/python/apache/aurora/common/health_check/__init__.py @@ -0,0 +1,13 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/health_check/test_http_signaler.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/common/health_check/test_http_signaler.py b/src/test/python/apache/aurora/common/health_check/test_http_signaler.py new file mode 100644 index 0000000..0338b81 --- /dev/null +++ b/src/test/python/apache/aurora/common/health_check/test_http_signaler.py @@ -0,0 +1,111 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import unittest +from socket import timeout as SocketTimeout + +import mox +from twitter.common.lang import Compatibility + +from apache.aurora.common.health_check.http_signaler import HttpSignaler + +if Compatibility.PY3: + import urllib.request as urllib_request +else: + import urllib2 as urllib_request + + +class OpenedURL(object): + def __init__(self, content, code=200): + self.content = content + self.code = code + + def read(self): + return self.content + + def close(self): + pass + + def getcode(self): + return self.code + + +class TestHttpSignaler(unittest.TestCase): + PORT = 12345 + + def setUp(self): + self._mox = mox.Mox() + + def tearDown(self): + self._mox.UnsetStubs() + self._mox.VerifyAll() + + def test_all_calls_ok(self): + self._mox.StubOutWithMock(urllib_request, 'urlopen') + urllib_request.urlopen( + 'http://localhost:%s/quitquitquit' % self.PORT, '', timeout=1.0).AndReturn(OpenedURL('')) + urllib_request.urlopen( + 'http://localhost:%s/abortabortabort' % self.PORT, '', timeout=1.0).AndReturn(OpenedURL('')) + + self._mox.ReplayAll() + + signaler = HttpSignaler(self.PORT) + assert signaler('/quitquitquit', use_post_method=True) == (True, None) + assert signaler('/abortabortabort', use_post_method=True) == (True, None) + + def test_health_checks(self): + self._mox.StubOutWithMock(urllib_request, 'urlopen') + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(OpenedURL('ok')) + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(OpenedURL('not ok')) + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn( + OpenedURL('not ok', code=200)) + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn( + OpenedURL('ok', code=400)) + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndRaise( + urllib_request.HTTPError('', 501, '', None, 
None)) + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn( + OpenedURL('ok', code=200)) + urllib_request.urlopen( + 'http://localhost:%s/random/endpoint' % self.PORT, None, timeout=1.0).AndReturn( + OpenedURL('ok')) + + self._mox.ReplayAll() + + signaler = HttpSignaler(self.PORT) + assert signaler('/health', expected_response='ok') == (True, None) + assert signaler('/health', expected_response='ok') == ( + False, 'Response differs from expected response (expected "ok", got "not ok")') + assert signaler('/health', expected_response_code=200) == (True, None) + assert signaler('/health', expected_response_code=200) == ( + False, 'Response code differs from expected response (expected 200, got 400)') + assert signaler('/health', expected_response_code=200) == ( + False, 'Response code differs from expected response (expected 200, got 501)') + assert signaler('/health', expected_response='ok', expected_response_code=200) == (True, None) + assert signaler('/random/endpoint', expected_response='ok') == (True, None) + + def test_exception(self): + self._mox.StubOutWithMock(urllib_request, 'urlopen') + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndRaise( + SocketTimeout('Timed out')) + + self._mox.ReplayAll() + + assert not HttpSignaler(self.PORT)('/health', expected_response='ok')[0] http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/health_check/test_shell.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/common/health_check/test_shell.py b/src/test/python/apache/aurora/common/health_check/test_shell.py new file mode 100644 index 0000000..84f717f --- /dev/null +++ b/src/test/python/apache/aurora/common/health_check/test_shell.py @@ -0,0 +1,91 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with 
the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import sys +import unittest + +import mock + +from apache.aurora.common.health_check.shell import ShellHealthCheck + +# Recommended pattern for Python 2 and 3 support from https://github.com/google/python-subprocess32 +# Backport which adds bug fixes and timeout support for Python 2.7 +if os.name == 'posix' and sys.version_info[0] < 3: + import subprocess32 as subprocess +else: + # subprocess is included as part of Python standard lib in Python 3+. + import subprocess + + +class TestHealthChecker(unittest.TestCase): + + @mock.patch('subprocess32.check_call') + def test_health_check_ok(self, mock_sub): + timeout = 30 + cmd = 'success cmd' + shell = ShellHealthCheck(cmd, timeout_secs=timeout) + success, msg = shell() + self.assertTrue(success) + self.assertIsNone(msg) + mock_sub.assert_called_once_with( + ['success', 'cmd'], + timeout=30 + ) + + @mock.patch('subprocess32.check_call') + def test_health_check_failed(self, mock_sub): + timeout = 30 + # Fail due to command returning a non-0 exit status. + mock_sub.side_effect = subprocess.CalledProcessError(1, 'failed') + cmd = 'cmd to fail' + shell = ShellHealthCheck(cmd, timeout_secs=timeout) + success, msg = shell() + mock_sub.assert_called_once_with( + ['cmd', 'to', 'fail'], + timeout=30 + ) + self.assertFalse(success) + self.assertEqual(msg, "Command 'failed' returned non-zero exit status 1") + + @mock.patch('subprocess32.check_call') + def test_health_check_os_error(self, mock_sub): + timeout = 30 + # Fail due to command not existing. 
+ mock_sub.side_effect = OSError(1, 'failed') + cmd = 'cmd to not exist' + shell = ShellHealthCheck(cmd, timeout_secs=timeout) + success, msg = shell() + mock_sub.assert_called_once_with( + ['cmd', 'to', 'not', 'exist'], + timeout=30 + ) + self.assertFalse(success) + self.assertEqual(msg, 'OSError: failed') + + @mock.patch('subprocess32.check_call') + def test_health_check_value_error(self, mock_sub): + timeout = 30 + # Invalid command passed in raises ValueError. + mock_sub.side_effect = ValueError('Could not read command.') + cmd = 'defensive cmd' + timeout = 10 + shell = ShellHealthCheck(cmd, timeout_secs=timeout) + success, msg = shell() + mock_sub.assert_called_once_with( + ['defensive', 'cmd'], + timeout=10 + ) + self.assertFalse(success) + self.assertEqual(msg, 'Invalid commmand.') http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/test_http_signaler.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/common/test_http_signaler.py b/src/test/python/apache/aurora/common/test_http_signaler.py deleted file mode 100644 index f68c71a..0000000 --- a/src/test/python/apache/aurora/common/test_http_signaler.py +++ /dev/null @@ -1,111 +0,0 @@ -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-# - -import unittest -from socket import timeout as SocketTimeout - -import mox -from twitter.common.lang import Compatibility - -from apache.aurora.common.http_signaler import HttpSignaler - -if Compatibility.PY3: - import urllib.request as urllib_request -else: - import urllib2 as urllib_request - - -class OpenedURL(object): - def __init__(self, content, code=200): - self.content = content - self.code = code - - def read(self): - return self.content - - def close(self): - pass - - def getcode(self): - return self.code - - -class TestHttpSignaler(unittest.TestCase): - PORT = 12345 - - def setUp(self): - self._mox = mox.Mox() - - def tearDown(self): - self._mox.UnsetStubs() - self._mox.VerifyAll() - - def test_all_calls_ok(self): - self._mox.StubOutWithMock(urllib_request, 'urlopen') - urllib_request.urlopen( - 'http://localhost:%s/quitquitquit' % self.PORT, '', timeout=1.0).AndReturn(OpenedURL('')) - urllib_request.urlopen( - 'http://localhost:%s/abortabortabort' % self.PORT, '', timeout=1.0).AndReturn(OpenedURL('')) - - self._mox.ReplayAll() - - signaler = HttpSignaler(self.PORT) - assert signaler('/quitquitquit', use_post_method=True) == (True, None) - assert signaler('/abortabortabort', use_post_method=True) == (True, None) - - def test_health_checks(self): - self._mox.StubOutWithMock(urllib_request, 'urlopen') - urllib_request.urlopen( - 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(OpenedURL('ok')) - urllib_request.urlopen( - 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(OpenedURL('not ok')) - urllib_request.urlopen( - 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn( - OpenedURL('not ok', code=200)) - urllib_request.urlopen( - 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn( - OpenedURL('ok', code=400)) - urllib_request.urlopen( - 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndRaise( - urllib_request.HTTPError('', 501, '', None, None)) - 
urllib_request.urlopen( - 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn( - OpenedURL('ok', code=200)) - urllib_request.urlopen( - 'http://localhost:%s/random/endpoint' % self.PORT, None, timeout=1.0).AndReturn( - OpenedURL('ok')) - - self._mox.ReplayAll() - - signaler = HttpSignaler(self.PORT) - assert signaler('/health', expected_response='ok') == (True, None) - assert signaler('/health', expected_response='ok') == ( - False, 'Response differs from expected response (expected "ok", got "not ok")') - assert signaler('/health', expected_response_code=200) == (True, None) - assert signaler('/health', expected_response_code=200) == ( - False, 'Response code differs from expected response (expected 200, got 400)') - assert signaler('/health', expected_response_code=200) == ( - False, 'Response code differs from expected response (expected 200, got 501)') - assert signaler('/health', expected_response='ok', expected_response_code=200) == (True, None) - assert signaler('/random/endpoint', expected_response='ok') == (True, None) - - def test_exception(self): - self._mox.StubOutWithMock(urllib_request, 'urlopen') - urllib_request.urlopen( - 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndRaise( - SocketTimeout('Timed out')) - - self._mox.ReplayAll() - - assert not HttpSignaler(self.PORT)('/health', expected_response='ok')[0] http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/executor/common/test_health_checker.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/executor/common/test_health_checker.py b/src/test/python/apache/aurora/executor/common/test_health_checker.py index 27c7171..8561abc 100644 --- a/src/test/python/apache/aurora/executor/common/test_health_checker.py +++ b/src/test/python/apache/aurora/executor/common/test_health_checker.py @@ -22,7 +22,7 @@ from mesos.interface.mesos_pb2 import TaskState from 
twitter.common.exceptions import ExceptionalThread from twitter.common.testing.clock import ThreadedClock -from apache.aurora.common.http_signaler import HttpSignaler +from apache.aurora.common.health_check.http_signaler import HttpSignaler from apache.aurora.config.schema.base import HealthCheckConfig from apache.aurora.executor.common.health_checker import ( HealthChecker, @@ -181,7 +181,7 @@ class TestHealthChecker(unittest.TestCase): class TestHealthCheckerProvider(unittest.TestCase): - def test_from_assigned_task(self): + def test_from_assigned_task_http(self): interval_secs = 17 initial_interval_secs = 3 max_consecutive_failures = 2 @@ -206,6 +206,58 @@ class TestHealthCheckerProvider(unittest.TestCase): hct_max_fail = health_checker.threaded_health_checker.max_consecutive_failures assert hct_max_fail == max_consecutive_failures + def test_from_assigned_task_generic(self): + interval_secs = 17 + initial_interval_secs = 3 + max_consecutive_failures = 2 + timeout_secs = 5 + task_config = TaskConfig( + executorConfig=ExecutorConfig( + name='thermos-generic', + data=MESOS_JOB( + task=HELLO_WORLD, + health_check_config=HealthCheckConfig( + interval_secs=interval_secs, + initial_interval_secs=initial_interval_secs, + max_consecutive_failures=max_consecutive_failures, + timeout_secs=timeout_secs, + type='shell', + shell_command='failed command' + ) + ).json_dumps() + ) + ) + assigned_task = AssignedTask(task=task_config, instanceId=1, assignedPorts={'health': 9001}) + health_checker = HealthCheckerProvider().from_assigned_task(assigned_task, None) + assert health_checker.threaded_health_checker.interval == interval_secs + assert health_checker.threaded_health_checker.initial_interval == initial_interval_secs + hct_max_fail = health_checker.threaded_health_checker.max_consecutive_failures + assert hct_max_fail == max_consecutive_failures + + def test_from_assigned_task_no_health_port(self): + interval_secs = 17 + initial_interval_secs = 3 + max_consecutive_failures = 
2 + timeout_secs = 5 + task_config = TaskConfig( + executorConfig=ExecutorConfig( + name='thermos-generic', + data=MESOS_JOB( + task=HELLO_WORLD, + health_check_config=HealthCheckConfig( + interval_secs=interval_secs, + initial_interval_secs=initial_interval_secs, + max_consecutive_failures=max_consecutive_failures, + timeout_secs=timeout_secs, + ) + ).json_dumps() + ) + ) + # No health port and we don't have a shell_command. + assigned_task = AssignedTask(task=task_config, instanceId=1, assignedPorts={'http': 9001}) + health_checker = HealthCheckerProvider().from_assigned_task(assigned_task, None) + self.assertIsNone(health_checker) + class TestThreadedHealthChecker(unittest.TestCase): def setUp(self): http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora ---------------------------------------------------------------------- diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora b/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora index dc55109..bb4fdec 100644 --- a/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora +++ b/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora @@ -27,7 +27,7 @@ stage_server = Process( test_task = Task( name = 'http_example', - resources = Resources(cpu=0.5, ram=32*MB, disk=64*MB), + resources = Resources(cpu=0.4, ram=32*MB, disk=64*MB), processes = [stage_server, run_server], constraints = order(stage_server, run_server)) http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora ---------------------------------------------------------------------- diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora b/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora new file mode 100644 index 0000000..37f2e9c --- /dev/null +++ b/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora @@ -0,0 
+1,73 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import getpass + +DEFAULT_CMD = 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .' + +run_server = Process( + name = 'run_server', + cmdline = 'python http_example.py {{thermos.ports[http]}}') + +stage_server = Process( + name = 'stage_server', + cmdline = '{{cmd}}' +) + +test_task = Task( + name = 'http_example', + resources = Resources(cpu=0.5, ram=32*MB, disk=64*MB), + processes = [stage_server, run_server], + constraints = order(stage_server, run_server)) + +update_config = UpdateConfig(watch_secs=10, batch_size=2) +# "I am going to fail" config. +health_check_config = HealthCheckConfig( + initial_interval_secs=5, + interval_secs=1, + type='shell', + shell_command='grep foo' + ) + +job = Service( + cluster = 'devcluster', + instances = 2, + update_config = update_config, + health_check_config = health_check_config, + task = test_task, + role = getpass.getuser(), + environment = 'test', + contact = '{{role}}@localhost', + announce = Announcer(), +) + +jobs = [ + job( + name = 'http_example' + ).bind( + cmd = DEFAULT_CMD + ), + job( + name = 'http_example_revocable', + tier = 'revocable' + ).bind( + cmd = DEFAULT_CMD + ), + job( + name = 'http_example_docker', + container = Container(docker=Docker(image = 'http_example')) + ).bind( + cmd = 'cp /tmp/http_example.py .' 
+ ) +] http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora ---------------------------------------------------------------------- diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora b/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora index f098de9..b33e8f5 100644 --- a/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora +++ b/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora @@ -27,7 +27,7 @@ stage_server = Process( test_task = SequentialTask( name = 'http_example', - resources = Resources(cpu=0.5, ram=34*MB, disk=64*MB), + resources = Resources(cpu=0.4, ram=34*MB, disk=64*MB), processes = [stage_server, run_server]) update_config = UpdateConfig(watch_secs=10, batch_size=3) http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh ---------------------------------------------------------------------- diff --git a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh index d7c61e2..9ccf6dc 100755 --- a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh +++ b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh @@ -45,7 +45,7 @@ collect_result() { then echo "OK (all tests passed)" else - echo "!!! FAIL (something returned non-zero)" + echo "!!! FAIL (something returned non-zero) for $BASH_COMMAND" # Attempt to clean up any state we left behind. tear_down fi @@ -173,6 +173,30 @@ test_update() { fi } +test_update_fail() { + local _jobkey=$1 _config=$2 _cluster=$3 _bad_healthcheck_config=$4 + # Make sure our update works. + aurora update start $_jobkey $_config + assert_update_state $_jobkey 'ROLLING_FORWARD' + local _update_id=$(aurora update list $_jobkey --status ROLLING_FORWARD \ + | tail -n +2 | awk '{print $2}') + # Need to wait until update finishes before we can start one that we want to fail.
+ aurora update wait $_jobkey $_update_id + + # Starting update with a health check that is meant to fail. Expected behavior is roll back. + aurora update start $_jobkey $_bad_healthcheck_config + local _update_id=$(aurora update list $_jobkey --status active \ + | tail -n +2 | awk '{print $2}') + # || is so that we don't return an EXIT so that `trap collect_result` doesn't get triggered. + aurora update wait $_jobkey $_update_id || echo $? + # MAKING SURE WE ROLLED BACK. + local status=$(aurora update info $_jobkey $_update_id | grep 'Current status' | awk '{print $NF}') + if [[ $status != "ROLLED_BACK" ]]; then + echo "Update should have completed in ROLLED_BACK state due to failed healthcheck." + exit 1 + fi +} + test_announce() { local _role=$1 _env=$2 _job=$3 @@ -231,7 +255,8 @@ test_quota() { test_http_example() { local _cluster=$1 _role=$2 _env=$3 local _base_config=$4 _updated_config=$5 - local _job=$6 + local _bad_healthcheck_config=$6 + local _job=$7 local _jobkey="$_cluster/$_role/$_env/$_job" test_config $_base_config $_jobkey @@ -242,6 +267,9 @@ test_http_example() { test_observer_ui $_cluster $_role $_job test_restart $_jobkey test_update $_jobkey $_updated_config $_cluster + test_update_fail $_jobkey $_base_config $_cluster $_bad_healthcheck_config + # Running test_update second time to change state to success. 
+ test_update $_jobkey $_updated_config $_cluster test_announce $_role $_env $_job test_run $_jobkey test_legacy_update $_jobkey $_base_config @@ -252,7 +280,7 @@ test_http_example() { test_http_revocable_example() { local _cluster=$1 _role=$2 _env=$3 local _base_config=$4 - local _job=$6 + local _job=$7 local _jobkey="$_cluster/$_role/$_env/$_job" test_create $_jobkey $_base_config @@ -274,7 +302,7 @@ restore_netrc() { test_basic_auth_unauthenticated() { local _cluster=$1 _role=$2 _env=$3 local _config=$4 - local _job=$6 + local _job=$7 local _jobkey="$_cluster/$_role/$_env/$_job" mv ~/.netrc ~/.netrc.bak @@ -301,6 +329,7 @@ TEST_JOB_REVOCABLE=http_example_revocable TEST_JOB_DOCKER=http_example_docker TEST_CONFIG_FILE=$EXAMPLE_DIR/http_example.aurora TEST_CONFIG_UPDATED_FILE=$EXAMPLE_DIR/http_example_updated.aurora +TEST_BAD_HEALTHCHECK_CONFIG_UPDATED_FILE=$EXAMPLE_DIR/http_example_bad_healthcheck.aurora BASE_ARGS=( $TEST_CLUSTER @@ -308,6 +337,7 @@ BASE_ARGS=( $TEST_ENV $TEST_CONFIG_FILE $TEST_CONFIG_UPDATED_FILE + $TEST_BAD_HEALTHCHECK_CONFIG_UPDATED_FILE ) TEST_JOB_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB")
