Repository: aurora Updated Branches: refs/heads/master 05d75e5dc -> 27a602d2c
Make health check configurable This allows the endpoint, expected response and expected response code to be configured by the user. Testing Done: Unittests added and manurally verified in vagrant enviroment. Bugs closed: AURORA-316 Reviewed at https://reviews.apache.org/r/32295/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/27a602d2 Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/27a602d2 Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/27a602d2 Branch: refs/heads/master Commit: 27a602d2c9efdd1cd2591c9c754f086c04ad0eb9 Parents: 05d75e5 Author: Brian Brazil <[email protected]> Authored: Fri Apr 3 15:33:05 2015 -0700 Committer: Brian Wickman <[email protected]> Committed: Fri Apr 3 15:33:05 2015 -0700 ---------------------------------------------------------------------- docs/configuration-reference.md | 5 +- .../apache/aurora/common/http_signaler.py | 29 ++++++---- .../python/apache/aurora/config/schema/base.py | 3 + .../aurora/executor/common/health_checker.py | 5 +- .../apache/aurora/common/test_http_signaler.py | 59 ++++++++++++++++---- .../executor/common/test_health_checker.py | 22 ++++---- 6 files changed, 86 insertions(+), 37 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/27a602d2/docs/configuration-reference.md ---------------------------------------------------------------------- diff --git a/docs/configuration-reference.md b/docs/configuration-reference.md index af332f2..fb753ea 100644 --- a/docs/configuration-reference.md +++ b/docs/configuration-reference.md @@ -359,8 +359,11 @@ Parameters for controlling a task's health checks via HTTP. | ------- | :-------: | -------- | ```initial_interval_secs``` | Integer | Initial delay for performing an HTTP health check. (Default: 15) | ```interval_secs``` | Integer | Interval on which to check the task's health via HTTP. (Default: 10) -| ```timeout_secs``` | Integer | HTTP request timeout. (Default: 1) | ```max_consecutive_failures``` | Integer | Maximum number of consecutive failures that tolerated before considering a task unhealthy (Default: 0) +| ```timeout_secs``` | Integer | HTTP request timeout. (Default: 1) +| ```endpoint``` | String | HTTP endpoint to check (Default: /health) +| ```expected_response``` | String | If not empty, fail the health check if the response differs. Case insensitive. (Default: ok) +| ```expected_response_code``` | Integer | If not zero, fail the health check if the response code differs. (Default: 0) ### Announcer Objects http://git-wip-us.apache.org/repos/asf/aurora/blob/27a602d2/src/main/python/apache/aurora/common/http_signaler.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/common/http_signaler.py b/src/main/python/apache/aurora/common/http_signaler.py index e3e819d..531f1fe 100644 --- a/src/main/python/apache/aurora/common/http_signaler.py +++ b/src/main/python/apache/aurora/common/http_signaler.py @@ -39,7 +39,7 @@ class HttpSignaler(object): def __init__(self, port, host='localhost', timeout_secs=None): self._host = host - self._url_base = 'http://%s:%d/' % (host, port) + self._url_base = 'http://%s:%d' % (host, port) if timeout_secs is None: env_timeout = os.getenv('AURORA_HTTP_SIGNALER_TIMEOUT_SECS') if env_timeout is not None: @@ -70,37 +70,42 @@ class HttpSignaler(object): try: with contextlib.closing( self.opener(url, data, timeout=self._timeout_secs)) as fp: - return fp.read() + return (fp.read(), fp.getcode()) except (HTTPException, SocketTimeout) as e: # the type of an HTTPException is typically more useful than its contents (since for example # BadStatusLines are often empty). likewise with socket.timeout. raise_error('Error within %s' % e.__class__.__name__) - except (URLError, HTTPError) as e: + except HTTPError as e: + return ('', e.code) + except URLError as e: raise_error(e) except Exception as e: raise_error('Unexpected error: %s' % e) - def __call__(self, endpoint, use_post_method=False, expected_response=None): + def __call__(self, endpoint, use_post_method=False, expected_response=None, + expected_response_code=None): """Returns a (boolean, string|None) tuple of (call success, failure reason)""" try: - response = self.query(endpoint, '' if use_post_method else None).strip().lower() - if expected_response is not None and response != expected_response: + response, response_code = self.query(endpoint, '' if use_post_method else None) + response = response.strip().lower() + if expected_response and response != expected_response.lower(): + reason = 'Response differs from expected response (expected "%s", got "%s")' def shorten(string): return (string if len(string) < self.FAILURE_REASON_LENGTH else "%s..." % string[:self.FAILURE_REASON_LENGTH - 3]) - reason = 'Response differs from expected response (expected "%s", got "%s")' log.warning(reason % (expected_response, response)) return (False, reason % (shorten(str(expected_response)), shorten(str(response)))) + elif expected_response_code and response_code != expected_response_code: + reason = 'Response code differs from expected response (expected %i, got %i)' + log.warning(reason % (expected_response_code, response_code)) + return (False, reason % (expected_response_code, response_code)) else: return (True, None) except self.QueryError as e: return (False, str(e)) - def health(self): - return self('health', use_post_method=False, expected_response='ok') - def quitquitquit(self): - return self('quitquitquit', use_post_method=True) + return self('/quitquitquit', use_post_method=True) def abortabortabort(self): - return self('abortabortabort', use_post_method=True) + return self('/abortabortabort', use_post_method=True) http://git-wip-us.apache.org/repos/asf/aurora/blob/27a602d2/src/main/python/apache/aurora/config/schema/base.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/config/schema/base.py b/src/main/python/apache/aurora/config/schema/base.py index a87524a..ec9f983 100644 --- a/src/main/python/apache/aurora/config/schema/base.py +++ b/src/main/python/apache/aurora/config/schema/base.py @@ -40,6 +40,9 @@ class HealthCheckConfig(Struct): interval_secs = Default(Float, 10.0) timeout_secs = Default(Float, 1.0) max_consecutive_failures = Default(Integer, 0) + endpoint = Default(String, '/health') + expected_response = Default(String, 'ok') + expected_response_code = Default(Integer, 0) class Announcer(Struct): http://git-wip-us.apache.org/repos/asf/aurora/blob/27a602d2/src/main/python/apache/aurora/executor/common/health_checker.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/executor/common/health_checker.py b/src/main/python/apache/aurora/executor/common/health_checker.py index cfc29c3..03fdf0a 100644 --- a/src/main/python/apache/aurora/executor/common/health_checker.py +++ b/src/main/python/apache/aurora/executor/common/health_checker.py @@ -211,7 +211,10 @@ class HealthCheckerProvider(StatusCheckerProvider): portmap['health'], timeout_secs=health_check_config.get('timeout_secs')) health_checker = HealthChecker( - http_signaler.health, + lambda: http_signaler( + endpoint=health_check_config.get('endpoint'), + expected_response=health_check_config.get('expected_response'), + expected_response_code=health_check_config.get('expected_response_code')), sandbox, interval_secs=health_check_config.get('interval_secs'), initial_interval_secs=health_check_config.get('initial_interval_secs'), http://git-wip-us.apache.org/repos/asf/aurora/blob/27a602d2/src/test/python/apache/aurora/common/test_http_signaler.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/common/test_http_signaler.py b/src/test/python/apache/aurora/common/test_http_signaler.py index f5f8419..c6a2170 100644 --- a/src/test/python/apache/aurora/common/test_http_signaler.py +++ b/src/test/python/apache/aurora/common/test_http_signaler.py @@ -25,7 +25,20 @@ if Compatibility.PY3: else: import urllib2 as urllib_request -StringIO = Compatibility.StringIO + +class OpenedURL(object): + def __init__(self, content, code=200): + self.content = content + self.code = code + + def read(self): + return self.content + + def close(self): + pass + + def getcode(self): + return self.code class TestHttpSignaler(unittest.TestCase): @@ -41,29 +54,51 @@ class TestHttpSignaler(unittest.TestCase): def test_all_calls_ok(self): self._mox.StubOutWithMock(urllib_request, 'urlopen') urllib_request.urlopen( - 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(StringIO('ok')) - urllib_request.urlopen( - 'http://localhost:%s/quitquitquit' % self.PORT, '', timeout=1.0).AndReturn(StringIO('')) + 'http://localhost:%s/quitquitquit' % self.PORT, '', timeout=1.0).AndReturn(OpenedURL('')) urllib_request.urlopen( - 'http://localhost:%s/abortabortabort' % self.PORT, '', timeout=1.0).AndReturn(StringIO('')) + 'http://localhost:%s/abortabortabort' % self.PORT, '', timeout=1.0).AndReturn(OpenedURL('')) self._mox.ReplayAll() signaler = HttpSignaler(self.PORT) - assert signaler.health() == (True, None) assert signaler.quitquitquit() == (True, None) assert signaler.abortabortabort() == (True, None) - def test_health_not_ok(self): + def test_health_checks(self): self._mox.StubOutWithMock(urllib_request, 'urlopen') urllib_request.urlopen( - 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(StringIO('not ok')) + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(OpenedURL('ok')) + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(OpenedURL('not ok')) + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn( + OpenedURL('not ok', code=200)) + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn( + OpenedURL('ok', code=400)) + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndRaise( + urllib_request.HTTPError('', 501, '', None, None)) + urllib_request.urlopen( + 'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn( + OpenedURL('ok', code=200)) + urllib_request.urlopen( + 'http://localhost:%s/random/endpoint' % self.PORT, None, timeout=1.0).AndReturn( + OpenedURL('ok')) self._mox.ReplayAll() - health, reason = HttpSignaler(self.PORT).health() - assert not health - assert reason.startswith('Response differs from expected response') + signaler = HttpSignaler(self.PORT) + assert signaler('/health', expected_response='ok') == (True, None) + assert signaler('/health', expected_response='ok') == ( + False, 'Response differs from expected response (expected "ok", got "not ok")') + assert signaler('/health', expected_response_code=200) == (True, None) + assert signaler('/health', expected_response_code=200) == ( + False, 'Response code differs from expected response (expected 200, got 400)') + assert signaler('/health', expected_response_code=200) == ( + False, 'Response code differs from expected response (expected 200, got 501)') + assert signaler('/health', expected_response='ok', expected_response_code=200) == (True, None) + assert signaler('/random/endpoint', expected_response='ok') == (True, None) def test_exception(self): self._mox.StubOutWithMock(urllib_request, 'urlopen') @@ -73,4 +108,4 @@ class TestHttpSignaler(unittest.TestCase): self._mox.ReplayAll() - assert not HttpSignaler(self.PORT).health()[0] + assert not HttpSignaler(self.PORT)('/health', expected_response='ok')[0] http://git-wip-us.apache.org/repos/asf/aurora/blob/27a602d2/src/test/python/apache/aurora/executor/common/test_health_checker.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/executor/common/test_health_checker.py b/src/test/python/apache/aurora/executor/common/test_health_checker.py index 1b4423a..27c7171 100644 --- a/src/test/python/apache/aurora/executor/common/test_health_checker.py +++ b/src/test/python/apache/aurora/executor/common/test_health_checker.py @@ -44,7 +44,7 @@ class TestHealthChecker(unittest.TestCase): self.fake_health_checks = [] def mock_health_check(): return self.fake_health_checks.pop(0) - self._checker.health = mock.Mock(spec=self._checker.health) + self._checker.health = mock.Mock(spec=self._checker.__call__) self._checker.health.side_effect = mock_health_check def append_health_checks(self, status, num_calls=1): @@ -209,8 +209,8 @@ class TestHealthCheckerProvider(unittest.TestCase): class TestThreadedHealthChecker(unittest.TestCase): def setUp(self): - self.signaler = mock.Mock(spec=HttpSignaler) - self.signaler.health.return_value = (True, 'Fake') + self.health = mock.Mock() + self.health.return_value = (True, 'Fake') self.sandbox = mock.Mock(spec_set=SandboxInterface) self.sandbox.exists.return_value = True @@ -222,14 +222,14 @@ class TestThreadedHealthChecker(unittest.TestCase): self.clock = mock.Mock(spec=time) self.clock.time.return_value = 1.0 self.health_checker = HealthChecker( - self.signaler.health, + self.health, None, self.interval_secs, self.initial_interval_secs, self.max_consecutive_failures, self.clock) self.health_checker_sandbox_exists = HealthChecker( - self.signaler.health, + self.health, self.sandbox, self.interval_secs, self.initial_interval_secs, @@ -238,29 +238,29 @@ class TestThreadedHealthChecker(unittest.TestCase): def test_perform_check_if_not_disabled_snooze_file_is_none(self): self.health_checker.threaded_health_checker.snooze_file = None - assert self.signaler.health.call_count == 0 + assert self.health.call_count == 0 assert self.health_checker_sandbox_exists.metrics.sample()['snoozed'] == 0 self.health_checker.threaded_health_checker._perform_check_if_not_disabled() - assert self.signaler.health.call_count == 1 + assert self.health.call_count == 1 assert self.health_checker_sandbox_exists.metrics.sample()['snoozed'] == 0 @mock.patch('os.path', spec_set=os.path) def test_perform_check_if_not_disabled_no_snooze_file(self, mock_os_path): mock_os_path.isfile.return_value = False - assert self.signaler.health.call_count == 0 + assert self.health.call_count == 0 assert self.health_checker_sandbox_exists.metrics.sample()['snoozed'] == 0 self.health_checker_sandbox_exists.threaded_health_checker._perform_check_if_not_disabled() - assert self.signaler.health.call_count == 1 + assert self.health.call_count == 1 assert self.health_checker_sandbox_exists.metrics.sample()['snoozed'] == 0 @mock.patch('os.path', spec_set=os.path) def test_perform_check_if_not_disabled_snooze_file_exists(self, mock_os_path): mock_os_path.isfile.return_value = True - assert self.signaler.health.call_count == 0 + assert self.health.call_count == 0 assert self.health_checker_sandbox_exists.metrics.sample()['snoozed'] == 0 result = ( self.health_checker_sandbox_exists.threaded_health_checker._perform_check_if_not_disabled()) - assert self.signaler.health.call_count == 0 + assert self.health.call_count == 0 assert self.health_checker_sandbox_exists.metrics.sample()['snoozed'] == 1 assert result == (True, None)
