Repository: aurora Updated Branches: refs/heads/master 0c1770581 -> d05ea2bcc
Extend warm-up time by `max_consecutive_failures` attempts. It is possible to set the health checks such that a task can continually fail health checks with intermittent successes and still succeed an update. Essentially a task fails health checks during the `initial_interval_secs` and an additional `max_consecutive_failures`, and then perform a successful health check to become healthy. To be backward compatible to the above configuration, include the `max_consecutive_failures` when computing `max_attempts_to_running`. Bugs closed: AURORA-1841 Reviewed at https://reviews.apache.org/r/54299/ Project: http://git-wip-us.apache.org/repos/asf/aurora/repo Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/d05ea2bc Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/d05ea2bc Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/d05ea2bc Branch: refs/heads/master Commit: d05ea2bcc4af20af0b5e0a7f3995d02ff25a1626 Parents: 0c17705 Author: Santhosh Kumar Shanmugham <[email protected]> Authored: Wed Dec 7 20:40:59 2016 -0600 Committer: Joshua Cohen <[email protected]> Committed: Wed Dec 7 20:40:59 2016 -0600 ---------------------------------------------------------------------- docs/features/services.md | 6 +- .../apache/aurora/executor/aurora_executor.py | 6 +- .../aurora/executor/common/health_checker.py | 27 +- .../executor/common/test_health_checker.py | 271 ++++++++++++++++--- 4 files changed, 262 insertions(+), 48 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/aurora/blob/d05ea2bc/docs/features/services.md ---------------------------------------------------------------------- diff --git a/docs/features/services.md b/docs/features/services.md index 50189ee..f26b4cc 100644 --- a/docs/features/services.md +++ b/docs/features/services.md @@ -94,9 +94,9 @@ Starting with the 0.17.0 release, job updates rely only on task health-checks by a `min_consecutive_successes` parameter on the HealthCheckConfig object. This parameter represents the number of successful health checks needed before a task is moved into the `RUNNING` state. Tasks that do not have enough successful health checks within the first `n` attempts, are moved to the -`FAILED` state, where `n = ceil(initial_interval_secs/interval_secs) + min_consecutive_successes`. -In order to accommodate variability during task warm up, `initial_interval_secs` will -act as a grace period. Any health-check failures during the first `m` attempts are ignored and +`FAILED` state, where `n = ceil(initial_interval_secs/interval_secs) + max_consecutive_failures + +min_consecutive_successes`. In order to accommodate variability during task warm up, `initial_interval_secs` +will act as a grace period. Any health-check failures during the first `m` attempts are ignored and do not count towards `max_consecutive_failures`, where `m = ceil(initial_interval_secs/interval_secs)`. As [job updates](job-updates.md) are based only on health-checks, it is not necessary to set http://git-wip-us.apache.org/repos/asf/aurora/blob/d05ea2bc/src/main/python/apache/aurora/executor/aurora_executor.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/executor/aurora_executor.py b/src/main/python/apache/aurora/executor/aurora_executor.py index d01fcb9..81461cb 100644 --- a/src/main/python/apache/aurora/executor/aurora_executor.py +++ b/src/main/python/apache/aurora/executor/aurora_executor.py @@ -184,9 +184,9 @@ class AuroraExecutor(ExecutorBase, Observable): self._status_manager.start() self.status_manager_started.set() - def _signal_running(self, reason): - log.info('Send TASK_RUNNING status update. reason: %s' % reason) - self.send_update(self._driver, self._task_id, mesos_pb2.TASK_RUNNING, reason) + def _signal_running(self, status_result): + log.info('Send TASK_RUNNING status update. status: %s' % status_result) + self.send_update(self._driver, self._task_id, mesos_pb2.TASK_RUNNING, status_result.reason) def _signal_kill_manager(self, driver, task_id, reason): if self._task_id is None: http://git-wip-us.apache.org/repos/asf/aurora/blob/d05ea2bc/src/main/python/apache/aurora/executor/common/health_checker.py ---------------------------------------------------------------------- diff --git a/src/main/python/apache/aurora/executor/common/health_checker.py b/src/main/python/apache/aurora/executor/common/health_checker.py index 12af9d8..a5fb18f 100644 --- a/src/main/python/apache/aurora/executor/common/health_checker.py +++ b/src/main/python/apache/aurora/executor/common/health_checker.py @@ -102,9 +102,19 @@ class ThreadedHealthChecker(ExceptionalThread): # to guarantee the number of health checks during the grace period. # Relying on time might cause non-deterministic behavior since the # health checks can be spaced apart by interval_secs + epsilon. - self.forgiving_attempts = math.ceil(self.grace_period_secs / self.interval) - - self.max_attempts_to_running = self.forgiving_attempts + self.min_consecutive_successes + self.forgiving_attempts = math.ceil(self.grace_period_secs / float(self.interval)) + + # In the older version (without min_consecutive_successes) it is possible for a task + # to make limping progress where the health checks fail all the time but never breach + # the max_consecutive_failures limit and end up updated successfully. + # Also a task can survive failures during initial_interval_secs and an additional + # max_consecutive_failures and still update successfully. + + # Although initial_interval_secs is supposed to count for the task warm up time, to be + # backward compatible add max_consecutive_failures to the max_attempts_to_running. + self.max_attempts_to_running = (self.forgiving_attempts + + self.max_consecutive_failures + + self.min_consecutive_successes) self.running = False self.healthy, self.reason = True, None @@ -164,15 +174,14 @@ class ThreadedHealthChecker(ExceptionalThread): if not self.running: attempts_remaining = self.max_attempts_to_running - self.attempts successes_needed = self.min_consecutive_successes - self.current_consecutive_successes - if successes_needed > attempts_remaining: + if attempts_remaining > 1 and successes_needed > attempts_remaining: return True return False def _should_ignore_failure(self): - if not self.running: - if self.attempts <= self.forgiving_attempts: - log.warning('Ignoring failure of attempt: %s' % self.attempts) - return True + if self.attempts <= self.forgiving_attempts: + log.warning('Ignoring failure of attempt: %s' % self.attempts) + return True return False def _should_enforce_deadline(self): @@ -192,7 +201,7 @@ class ThreadedHealthChecker(ExceptionalThread): return self.healthy, self.reason is_healthy, reason = self._perform_check_if_not_disabled() - if not self.running: + if self.attempts <= self.max_attempts_to_running: self.attempts += 1 self._maybe_update_health_check_count(is_healthy, reason) return is_healthy, reason http://git-wip-us.apache.org/repos/asf/aurora/blob/d05ea2bc/src/test/python/apache/aurora/executor/common/test_health_checker.py ---------------------------------------------------------------------- diff --git a/src/test/python/apache/aurora/executor/common/test_health_checker.py b/src/test/python/apache/aurora/executor/common/test_health_checker.py index e2a7f16..86a71c7 100644 --- a/src/test/python/apache/aurora/executor/common/test_health_checker.py +++ b/src/test/python/apache/aurora/executor/common/test_health_checker.py @@ -98,6 +98,60 @@ class TestHealthChecker(unittest.TestCase): hct.stop() assert self._checker.health.call_count == 2 + def test_ignore_failures_after_running_inside_grace_period(self): + '''Grace period is 2 x interval and health checks succeed then fail.''' + + self.append_health_checks(True) + self.append_health_checks(False) + hct = HealthChecker( + self._checker.health, + interval_secs=self.interval_secs, + clock=self._clock) + hct.start() + assert self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, self.interval_secs) + assert hct.status == StatusResult('Task is healthy.', TaskState.Value('TASK_RUNNING')) + assert hct.threaded_health_checker.running is True + assert hct.threaded_health_checker.current_consecutive_failures == 0 + self._clock.tick(self.interval_secs) + assert self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, self.interval_secs) + assert hct.status == StatusResult('Task is healthy.', TaskState.Value('TASK_RUNNING')) + assert hct.threaded_health_checker.running is True + assert hct.threaded_health_checker.current_consecutive_failures == 0 + hct.stop() + assert self._checker.health.call_count == 2 + + def test_does_not_ignores_failures_after_running_outside_grace_period(self): + '''Grace period is 2 x interval and health checks succeed then fail.''' + + self.append_health_checks(True) + self.append_health_checks(False, num_calls=2) + hct = HealthChecker( + self._checker.health, + interval_secs=self.interval_secs, + clock=self._clock) + hct.start() + assert self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, self.interval_secs) + assert hct.status == StatusResult('Task is healthy.', TaskState.Value('TASK_RUNNING')) + assert hct.threaded_health_checker.running is True + assert hct.threaded_health_checker.current_consecutive_failures == 0 + self._clock.tick(self.interval_secs) + assert self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, self.interval_secs) + assert hct.status == StatusResult('Task is healthy.', TaskState.Value('TASK_RUNNING')) + assert hct.threaded_health_checker.running is True + assert hct.threaded_health_checker.current_consecutive_failures == 0 + self._clock.tick(self.interval_secs) + assert self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, self.interval_secs) + assert hct.status == StatusResult('Failed health check! reason', TaskState.Value('TASK_FAILED')) + assert hct.threaded_health_checker.running is True + assert hct.threaded_health_checker.current_consecutive_failures == 1 + hct.stop() + assert self._checker.health.call_count == 3 + def test_grace_period_2x_failure(self): ''' Grace period is 2 x interval and all health checks fail. @@ -156,6 +210,68 @@ class TestHealthChecker(unittest.TestCase): hct.stop() assert self._checker.health.call_count == 3 + def test_include_max_failure_to_forgiving_attempts(self): + ''' + Health checks fail but never breaches `max_consecutive_failures` + ''' + max_consecutive_failures = 4 + + # health checks fail within grace period + self.append_health_checks(False, num_calls=2) + + # health checks fails max_consecutive_failures times and then succeeds + self.append_health_checks(False, num_calls=max_consecutive_failures) + self.append_health_checks(True) + + # health checks fails max_consecutive_failures times and then succeeds + self.append_health_checks(False, num_calls=max_consecutive_failures) + self.append_health_checks(False) + + hct = HealthChecker( + self._checker.health, + interval_secs=self.interval_secs, + max_consecutive_failures=max_consecutive_failures, + clock=self._clock) + hct.start() + + # failures ignored inside grace period + for _ in range(2): + assert self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, self.interval_secs) + assert hct.status == StatusResult(None, TaskState.Value('TASK_STARTING')) + assert hct.threaded_health_checker.running is False + self._clock.tick(self.interval_secs) + + # failures never breach max + for _ in range(max_consecutive_failures): + assert self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, self.interval_secs) + assert hct.status == StatusResult(None, TaskState.Value('TASK_STARTING')) + assert hct.threaded_health_checker.running is False + self._clock.tick(self.interval_secs) + + assert self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, self.interval_secs) + assert hct.status == StatusResult('Task is healthy.', TaskState.Value('TASK_RUNNING')) + assert hct.threaded_health_checker.running is True + self._clock.tick(self.interval_secs) + + # failures breach max, causes task failure + for _ in range(max_consecutive_failures): + assert self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, self.interval_secs) + assert hct.status == StatusResult('Task is healthy.', TaskState.Value('TASK_RUNNING')) + assert hct.threaded_health_checker.running is True + self._clock.tick(self.interval_secs) + + assert self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, self.interval_secs) + assert hct.status == StatusResult('Failed health check! reason', TaskState.Value('TASK_FAILED')) + assert hct.threaded_health_checker.running is True + + hct.stop() + assert self._checker.health.call_count == 12 + def test_initial_interval_whatev(self): self.append_health_checks(False, 2) hct = HealthChecker( @@ -217,29 +333,39 @@ class TestHealthChecker(unittest.TestCase): '''Verify that health check is failed fast''' grace_period_secs = self.initial_interval_secs interval_secs = self.interval_secs - self.append_health_checks(False, num_calls=3) + self.append_health_checks(False, num_calls=5) hct = HealthChecker( self._checker.health, interval_secs=interval_secs, grace_period_secs=grace_period_secs, max_consecutive_failures=2, - min_consecutive_successes=2, + min_consecutive_successes=4, clock=self._clock) hct.start() - # 3 consecutive health check failures causes fail-fast - self._clock.converge(threads=[hct.threaded_health_checker]) - self._clock.assert_waiting(hct.threaded_health_checker, interval_secs) - assert hct.status == StatusResult(None, TaskState.Value('TASK_STARTING')) # failure is ignored inside grace_period_secs - assert hct.metrics.sample()['consecutive_failures'] == 0 - self._clock.tick(interval_secs) + for _ in range(2): + self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, interval_secs) + assert hct.status == StatusResult(None, TaskState.Value('TASK_STARTING')) + assert hct.metrics.sample()['consecutive_failures'] == 0 + self._clock.tick(interval_secs) + + # 3 consecutive health check failures causes fail-fast + for attempt in range(2): + self._clock.converge(threads=[hct.threaded_health_checker]) + self._clock.assert_waiting(hct.threaded_health_checker, interval_secs) + assert hct.status == StatusResult(None, TaskState.Value('TASK_STARTING')) + # failure is not ignored outside grace_period_secs + assert hct.metrics.sample()['consecutive_failures'] == (attempt + 1) + self._clock.tick(interval_secs) + self._clock.converge(threads=[hct.threaded_health_checker]) self._clock.assert_waiting(hct.threaded_health_checker, interval_secs) assert hct.status == StatusResult('Failed health check! reason', TaskState.Value('TASK_FAILED')) - assert hct.metrics.sample()['consecutive_failures'] == 1 + assert hct.metrics.sample()['consecutive_failures'] == 3 hct.stop() - assert self._checker.health.call_count == 2 + assert self._checker.health.call_count == 5 @pytest.mark.skipif('True', reason='Flaky test (AURORA-1182)') def test_health_checker_metrics(self): @@ -607,48 +733,65 @@ class TestThreadedHealthChecker(unittest.TestCase): def test_maybe_update_health_check_count_reset_count(self): hc = self.health_checker.threaded_health_checker - hc.running = True + hc.attempts = hc.forgiving_attempts assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(True, 'reason-1') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 1 + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-2') assert hc.current_consecutive_failures == 1 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(True, 'reason-3') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 1 - def test_maybe_update_health_check_count_ignore_failures_before_callback(self): + def test_maybe_update_health_check_count_ignore_failures_within_grace_period(self): hc = self.health_checker.threaded_health_checker - hc.running = False + assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-1') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-2') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 0 - def test_maybe_update_health_check_count_dont_ignore_failures_after_callback(self): + hc.attempts += 1 + hc._maybe_update_health_check_count(False, 'reason-3') + assert hc.current_consecutive_failures == 1 + assert hc.current_consecutive_successes == 0 + + hc.attempts += 1 + hc._maybe_update_health_check_count(False, 'reason-4') + assert hc.current_consecutive_failures == 2 + assert hc.current_consecutive_successes == 0 + + def test_maybe_update_health_check_count_dont_ignore_failures_after_grace_period(self): hc = self.health_checker.threaded_health_checker - hc.running = True + hc.attempts = hc.forgiving_attempts assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-1') assert hc.current_consecutive_failures == 1 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-2') assert hc.current_consecutive_failures == 2 assert hc.current_consecutive_successes == 0 @@ -668,25 +811,40 @@ class TestThreadedHealthChecker(unittest.TestCase): hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-2') + assert hc.current_consecutive_failures == 0 + assert hc.current_consecutive_successes == 0 + assert hc.running is False + + hc.attempts += 1 + hc._maybe_update_health_check_count(False, 'reason-3') assert hc.current_consecutive_failures == 1 assert hc.current_consecutive_successes == 0 assert hc.running is False + + hc.attempts += 1 + hc._maybe_update_health_check_count(False, 'reason-4') + assert hc.current_consecutive_failures == 2 + assert hc.current_consecutive_successes == 0 + assert hc.running is False assert hc.healthy is False - assert hc.reason == 'reason-2' + assert hc.reason == 'reason-4' - def test_maybe_update_health_check_count_max_failures(self): + def test_maybe_update_health_check_count_max_failures_1(self): hc = self.health_checker.threaded_health_checker - hc.running = True + hc.current_consecutive_successes = 1 + hc.attempts = hc.forgiving_attempts assert hc.current_consecutive_failures == 0 - assert hc.current_consecutive_successes == 0 + assert hc.current_consecutive_successes == 1 assert hc.healthy is True + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-1') assert hc.current_consecutive_failures == 1 assert hc.current_consecutive_successes == 0 assert hc.healthy is True + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-2') assert hc.current_consecutive_failures == 2 assert hc.current_consecutive_successes == 0 @@ -700,18 +858,21 @@ class TestThreadedHealthChecker(unittest.TestCase): assert hc.running is False assert hc.healthy is True + hc.attempts += 1 hc._maybe_update_health_check_count(True, 'reason') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 1 assert hc.running is False assert hc.healthy is True + hc.attempts += 1 hc._maybe_update_health_check_count(True, 'reason') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 2 assert hc.running is True assert hc.healthy is True + hc.attempts += 1 hc._maybe_update_health_check_count(True, 'reason') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 3 @@ -735,11 +896,11 @@ class TestThreadedHealthChecker(unittest.TestCase): def test_run_failure(self): self.health.return_value = (False, 'failure') mock_is_set = mock.Mock(spec=threading._Event.is_set) - liveness = [False, False, False, True] + liveness = [False, False, False, False, True] mock_is_set.side_effect = lambda: liveness.pop(0) self.health_checker.threaded_health_checker.dead.is_set = mock_is_set self.health_checker.threaded_health_checker.run() - assert self.clock.sleep.call_count == 3 + assert self.clock.sleep.call_count == 4 assert self.health_checker.threaded_health_checker.current_consecutive_failures == 2 assert self.health_checker.threaded_health_checker.current_consecutive_successes == 0 assert self.health_checker.threaded_health_checker.running is False @@ -747,19 +908,19 @@ class TestThreadedHealthChecker(unittest.TestCase): assert self.health_checker.threaded_health_checker.reason == 'failure' def test_run_failure_unhealthy_when_failfast(self): - health_status = [(False, 'failure-1'), (True, None), (False, 'failure-3')] + health_status = [(False, 'failure-1'), (True, None), (False, 'failure-3'), (False, 'failure-4')] self.health.side_effect = lambda: health_status.pop(0) mock_is_set = mock.Mock(spec=threading._Event.is_set) - liveness = [False, False, False, True] + liveness = [False, False, False, False, True] mock_is_set.side_effect = lambda: liveness.pop(0) self.health_checker.threaded_health_checker.dead.is_set = mock_is_set self.health_checker.threaded_health_checker.run() - assert self.clock.sleep.call_count == 3 - assert self.health_checker.threaded_health_checker.current_consecutive_failures == 1 + assert self.clock.sleep.call_count == 4 + assert self.health_checker.threaded_health_checker.current_consecutive_failures == 2 assert self.health_checker.threaded_health_checker.current_consecutive_successes == 0 assert self.health_checker.threaded_health_checker.running is False assert self.health_checker.threaded_health_checker.healthy is False - assert self.health_checker.threaded_health_checker.reason == 'failure-3' + assert self.health_checker.threaded_health_checker.reason == 'failure-4' def test_run_unhealthy_after_callback(self): health_status = [(True, None), (True, None), (False, 'failure-4'), (False, 'failure-5')] @@ -845,48 +1006,55 @@ class TestThreadedHealthCheckerWithDefaults(unittest.TestCase): def test_maybe_update_health_check_count_reset_count(self): hc = self.health_checker.threaded_health_checker - hc.running = True + hc.attempts = hc.forgiving_attempts assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(True, 'reason-1') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 1 + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-2') assert hc.current_consecutive_failures == 1 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(True, 'reason-3') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 1 - def test_maybe_update_health_check_count_ignore_failures_before_callback(self): + def test_maybe_update_health_check_count_ignore_failures_inside_grace_period(self): hc = self.health_checker.threaded_health_checker - hc.running = False + assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-1') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-2') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 0 - def test_maybe_update_health_check_count_dont_ignore_failures_after_callback(self): + def test_maybe_update_health_check_count_dont_ignore_failures_after_grace_period(self): hc = self.health_checker.threaded_health_checker - hc.running = True + hc.attempts = hc.forgiving_attempts assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-1') assert hc.current_consecutive_failures == 1 assert hc.current_consecutive_successes == 0 + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-2') assert hc.current_consecutive_failures == 2 assert hc.current_consecutive_successes == 0 @@ -920,12 +1088,14 @@ class TestThreadedHealthCheckerWithDefaults(unittest.TestCase): def test_maybe_update_health_check_count_max_failures(self): hc = self.health_checker.threaded_health_checker - hc.running = True + hc.attempts = hc.forgiving_attempts + hc.current_consecutive_successes = 1 assert hc.current_consecutive_failures == 0 - assert hc.current_consecutive_successes == 0 + assert hc.current_consecutive_successes == 1 assert hc.healthy is True + hc.attempts += 1 hc._maybe_update_health_check_count(False, 'reason-1') assert hc.current_consecutive_failures == 1 assert hc.current_consecutive_successes == 0 @@ -939,6 +1109,7 @@ class TestThreadedHealthCheckerWithDefaults(unittest.TestCase): assert hc.healthy is True assert hc.running is False + hc.attempts += 1 hc._maybe_update_health_check_count(True, 'reason') assert hc.current_consecutive_failures == 0 assert hc.current_consecutive_successes == 1 @@ -978,6 +1149,40 @@ class TestThreadedHealthCheckerWithDefaults(unittest.TestCase): assert self.health_checker.threaded_health_checker.reason == 'failure' @mock.patch('apache.aurora.executor.common.health_checker.time.sleep', spec=time.sleep) + def test_first_success_after_grace_period_and_max_consecutive_failures(self, mock_sleep): + mock_sleep.return_value = None + health_status = [(False, 'failure-1'), (False, 'failure-2'), (True, None)] + self.health.side_effect = lambda: health_status.pop(0) + mock_is_set = mock.Mock(spec=threading._Event.is_set) + liveness = [False, False, False, True] + mock_is_set.side_effect = lambda: liveness.pop(0) + self.health_checker.threaded_health_checker.dead.is_set = mock_is_set + self.health_checker.threaded_health_checker.run() + assert mock_sleep.call_count == 3 + assert self.health_checker.threaded_health_checker.current_consecutive_failures == 0 + assert self.health_checker.threaded_health_checker.current_consecutive_successes == 1 + assert self.health_checker.threaded_health_checker.running is True + assert self.health_checker.threaded_health_checker.healthy is True + assert self.health_checker.threaded_health_checker.reason is None + + @mock.patch('apache.aurora.executor.common.health_checker.time.sleep', spec=time.sleep) + def test_success_then_failures_ignored_till_grace_period_ends(self, mock_sleep): + mock_sleep.return_value = None + health_status = [(True, None), (False, 'failure-2'), (False, 'failure-3')] + self.health.side_effect = lambda: health_status.pop(0) + mock_is_set = mock.Mock(spec=threading._Event.is_set) + liveness = [False, False, False, True] + mock_is_set.side_effect = lambda: liveness.pop(0) + self.health_checker.threaded_health_checker.dead.is_set = mock_is_set + self.health_checker.threaded_health_checker.run() + assert mock_sleep.call_count == 3 + assert self.health_checker.threaded_health_checker.current_consecutive_failures == 1 + assert self.health_checker.threaded_health_checker.current_consecutive_successes == 0 + assert self.health_checker.threaded_health_checker.running is True + assert self.health_checker.threaded_health_checker.healthy is False + assert self.health_checker.threaded_health_checker.reason == 'failure-3' + + @mock.patch('apache.aurora.executor.common.health_checker.time.sleep', spec=time.sleep) def test_run_failure_unhealthy_when_failfast(self, mock_sleep): mock_sleep.return_value = None health_status = [(False, 'failure-1'), (False, 'failure-2'), (False, 'failure-3')]
