Repository: aurora
Updated Branches:
  refs/heads/master f91ecd107 -> 0d7f946f7


Add support for performing health checks with a shell command.

Bugs closed: AURORA-1551

Reviewed at https://reviews.apache.org/r/41154/


Project: http://git-wip-us.apache.org/repos/asf/aurora/repo
Commit: http://git-wip-us.apache.org/repos/asf/aurora/commit/0d7f946f
Tree: http://git-wip-us.apache.org/repos/asf/aurora/tree/0d7f946f
Diff: http://git-wip-us.apache.org/repos/asf/aurora/diff/0d7f946f

Branch: refs/heads/master
Commit: 0d7f946f76e4600cec878e23d5b2a44702f9b4f6
Parents: f91ecd1
Author: Dmitriy Shirchenko <[email protected]>
Authored: Mon Dec 14 21:37:44 2015 -0800
Committer: Maxim Khutornenko <[email protected]>
Committed: Mon Dec 14 21:37:44 2015 -0800

----------------------------------------------------------------------
 3rdparty/python/requirements.txt                |   1 +
 docs/configuration-reference.md                 |  16 +--
 src/main/python/apache/aurora/client/config.py  |  44 ++++++++
 src/main/python/apache/aurora/common/BUILD      |   1 +
 .../aurora/common/health_check/__init__.py      |  13 +++
 .../aurora/common/health_check/http_signaler.py | 112 +++++++++++++++++++
 .../apache/aurora/common/health_check/shell.py  |  60 ++++++++++
 .../apache/aurora/common/http_signaler.py       | 105 -----------------
 .../python/apache/aurora/config/schema/base.py  |  10 +-
 .../aurora/executor/common/health_checker.py    |  53 ++++++---
 .../apache/aurora/executor/http_lifecycle.py    |   2 +-
 src/test/python/apache/aurora/client/BUILD      |   1 +
 .../python/apache/aurora/client/test_config.py  |  65 +++++++++++
 src/test/python/apache/aurora/common/BUILD      |  10 --
 .../apache/aurora/common/health_check/BUILD     |  39 +++++++
 .../aurora/common/health_check/__init__.py      |  13 +++
 .../common/health_check/test_http_signaler.py   | 111 ++++++++++++++++++
 .../aurora/common/health_check/test_shell.py    |  91 +++++++++++++++
 .../apache/aurora/common/test_http_signaler.py  | 111 ------------------
 .../executor/common/test_health_checker.py      |  56 +++++++++-
 .../apache/aurora/e2e/http/http_example.aurora  |   2 +-
 .../http/http_example_bad_healthcheck.aurora    |  73 ++++++++++++
 .../aurora/e2e/http/http_example_updated.aurora |   2 +-
 .../sh/org/apache/aurora/e2e/test_end_to_end.sh |  38 ++++++-
 24 files changed, 769 insertions(+), 260 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/3rdparty/python/requirements.txt
----------------------------------------------------------------------
diff --git a/3rdparty/python/requirements.txt b/3rdparty/python/requirements.txt
index 16fa35b..8c72880 100644
--- a/3rdparty/python/requirements.txt
+++ b/3rdparty/python/requirements.txt
@@ -24,6 +24,7 @@ psutil==3.2.2
 pystachio==0.8.0
 requests==2.7.0
 requests-kerberos==0.7.0
+subprocess32==3.2.7
 thrift==0.9.1
 twitter.common.app==0.3.3
 twitter.common.collections==0.3.3

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/docs/configuration-reference.md
----------------------------------------------------------------------
diff --git a/docs/configuration-reference.md b/docs/configuration-reference.md
index 077979a..07149c7 100644
--- a/docs/configuration-reference.md
+++ b/docs/configuration-reference.md
@@ -380,17 +380,19 @@ Parameters for controlling the rate and policy of rolling 
updates.
 
 ### HealthCheckConfig Objects
 
-Parameters for controlling a task's health checks via HTTP.
+Parameters for controlling a task's health checks via HTTP or a shell command.
 
 | object                         | type      | description
 | -------                        | :-------: | --------
-| ```initial_interval_secs```    | Integer   | Initial delay for performing an 
HTTP health check. (Default: 15)
-| ```interval_secs```            | Integer   | Interval on which to check the 
task's health via HTTP. (Default: 10)
-| ```max_consecutive_failures``` | Integer   | Maximum number of consecutive 
failures that tolerated before considering a task unhealthy (Default: 0)
-| ```timeout_secs```             | Integer   | HTTP request timeout. (Default: 
1)
 | ```endpoint```                 | String    | HTTP endpoint to check 
(Default: /health)
-| ```expected_response```        | String    | If not empty, fail the health 
check if the response differs. Case insensitive. (Default: ok)
-| ```expected_response_code```   | Integer   | If not zero, fail the health 
check if the response code differs. (Default: 0)
+| ```expected_response```        | String    | If not empty, fail the HTTP 
health check if the response differs. Case insensitive. (Default: ok)
+| ```expected_response_code```   | Integer   | If not zero, fail the HTTP 
health check if the response code differs. (Default: 0)
+| ```initial_interval_secs```    | Integer   | Initial delay for performing a 
health check. (Default: 15)
+| ```interval_secs```            | Integer   | Interval on which to check the 
task's health. (Default: 10)
+| ```max_consecutive_failures``` | Integer   | Maximum number of consecutive 
failures that will be tolerated before considering a task unhealthy (Default: 0)
+| ```shell_command```            | String    | An alternative to HTTP health 
checking. Specifies a shell command that will be executed. Any non-zero exit 
status will be interpreted as a health check failure.
+| ```type```                     | String    | 'http' or 'shell'. (Default: 
'http')
+| ```timeout_secs```             | Integer   | Health check timeout (HTTP request or shell command). (Default: 
1)
 
 ### Announcer Objects
 

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/client/config.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/client/config.py 
b/src/main/python/apache/aurora/client/config.py
index 2fc1255..161c362 100644
--- a/src/main/python/apache/aurora/client/config.py
+++ b/src/main/python/apache/aurora/client/config.py
@@ -22,6 +22,8 @@ import math
 import re
 import sys
 
+from pystachio.composite import Empty
+
 from apache.aurora.client import binding_helper
 from apache.aurora.client.base import die
 from apache.aurora.config import AuroraConfig
@@ -68,6 +70,47 @@ def _validate_environment_name(config):
   __validate_env(env_name, 'Environment')
 
 
+CANNOT_HAVE_HTTP_ARGS_WITH_SHELL_ERROR = '''
+shell_command does not support supplied http arguments.
+'''
+
+CANNOT_HAVE_SHELL_ARGS_WITH_HTTP_ERROR = '''
+Cannot define shell_commmand for HTTP health check.
+'''
+
+INVALID_HEALTH_CHECK_TYPE = '''
+Invalid health check type {health_check_type}.
+'''
+
+MUST_PROVIDE_SHELL_COMMAND_ERROR = '''
+Must provide a shell command for shell type.
+'''
+
+
+HTTP_HEALTH_CHECK = 'http'
+SHELL_HEALTH_CHECK = 'shell'
+
+
+# TODO (AURORA-1552): Add config validation to the executor
+def _validate_health_check_config(config):
+  health_check_config = config.health_check_config()
+  health_check_type = health_check_config.type().get()
+
+  # Make sure we either have HTTP or SHELL.
+  if health_check_type not in {HTTP_HEALTH_CHECK, SHELL_HEALTH_CHECK}:
+    die(INVALID_HEALTH_CHECK_TYPE.format(health_check_type=health_check_type))
+  if health_check_type == SHELL_HEALTH_CHECK:
+    # SHELL options
+    shell_command = health_check_config.shell_command()
+    if shell_command == Empty:
+      # Must define a command.
+      die(MUST_PROVIDE_SHELL_COMMAND_ERROR)
+  elif health_check_type == HTTP_HEALTH_CHECK:
+    if health_check_config.shell_command() != Empty:
+      # No shell_command for HTTP.
+      die(CANNOT_HAVE_SHELL_ARGS_WITH_HTTP_ERROR)
+
+
 UPDATE_CONFIG_MAX_FAILURES_ERROR = '''
 max_total_failures in update_config must be lesser than the job size.
 Based on your job size (%s) you should use max_total_failures <= %s.
@@ -118,6 +161,7 @@ def validate_config(config, env=None):
   _validate_update_config(config)
   _validate_announce_configuration(config)
   _validate_environment_name(config)
+  _validate_health_check_config(config)
 
 
 class GlobalHookRegistry(object):

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/common/BUILD
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/common/BUILD 
b/src/main/python/apache/aurora/common/BUILD
index 5fce3d0..0e4c510 100644
--- a/src/main/python/apache/aurora/common/BUILD
+++ b/src/main/python/apache/aurora/common/BUILD
@@ -21,6 +21,7 @@ python_library(
     '3rdparty/python:pex',
     '3rdparty/python:pystachio',
     '3rdparty/python:requests',
+    '3rdparty/python:subprocess32',
     '3rdparty/python:thrift',
     '3rdparty/python:twitter.common.collections',
     '3rdparty/python:twitter.common.lang',

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/common/health_check/__init__.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/common/health_check/__init__.py 
b/src/main/python/apache/aurora/common/health_check/__init__.py
new file mode 100644
index 0000000..0663a9a
--- /dev/null
+++ b/src/main/python/apache/aurora/common/health_check/__init__.py
@@ -0,0 +1,13 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/common/health_check/http_signaler.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/common/health_check/http_signaler.py 
b/src/main/python/apache/aurora/common/health_check/http_signaler.py
new file mode 100644
index 0000000..41b6bfb
--- /dev/null
+++ b/src/main/python/apache/aurora/common/health_check/http_signaler.py
@@ -0,0 +1,112 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import contextlib
+import os
+from socket import timeout as SocketTimeout
+
+from twitter.common import log
+from twitter.common.lang import Compatibility
+
+if Compatibility.PY3:
+  from http.client import HTTPException
+  import urllib.request as urllib_request
+  from urllib.error import URLError, HTTPError
+else:
+  from httplib import HTTPException
+  import urllib2 as urllib_request
+  from urllib2 import URLError, HTTPError
+
+
+class HttpSignaler(object):
+  """Simple HTTP endpoint wrapper to check health or trigger 
quitquitquit/abortabortabort"""
+  TIMEOUT_SECS = 1.0
+  FAILURE_REASON_LENGTH = 10
+
+  class Error(Exception): pass
+  class QueryError(Error): pass
+
+  def __init__(self, port, host='localhost', timeout_secs=None):
+    self._host = host
+    self._url_base = 'http://%s:%d' % (host, port)
+    if timeout_secs is None:
+      env_timeout = os.getenv('AURORA_HTTP_SIGNALER_TIMEOUT_SECS')
+      if env_timeout is not None:
+        log.info('Using timeout %s secs (from 
AURORA_HTTP_SIGNALER_TIMEOUT_SECS).' % env_timeout)
+        self._timeout_secs = float(env_timeout)
+      else:
+        log.debug('Using timeout %s secs (default).' % self.TIMEOUT_SECS)
+        self._timeout_secs = self.TIMEOUT_SECS
+    else:
+      log.debug('Using timeout %s secs.' % timeout_secs)
+      self._timeout_secs = timeout_secs
+
+  def url(self, endpoint):
+    return self._url_base + endpoint
+
+  @property
+  def opener(self):
+    return urllib_request.urlopen
+
+  def query(self, endpoint, data=None):
+    """Request an HTTP endpoint with a GET request (or POST if data is not 
None)"""
+    url = self.url(endpoint)
+    log.debug("%s: %s %s" % (self.__class__.__name__, 'GET' if data is None 
else 'POST', url))
+
+    def raise_error(reason):
+      raise self.QueryError('Failed to signal %s: %s' % (self.url(endpoint), 
reason))
+
+    try:
+      with contextlib.closing(
+          self.opener(url, data, timeout=self._timeout_secs)) as fp:
+        return (fp.read(), fp.getcode())
+    except (HTTPException, SocketTimeout) as e:
+      # the type of an HTTPException is typically more useful than its 
contents (since for example
+      # BadStatusLines are often empty). likewise with socket.timeout.
+      raise_error('Error within %s' % e.__class__.__name__)
+    except HTTPError as e:
+      return ('', e.code)
+    except URLError as e:
+      raise_error(e)
+    except Exception as e:
+      raise_error('Unexpected error: %s' % e)
+
+  def __call__(self, endpoint, use_post_method=False, expected_response=None,
+      expected_response_code=None):
+    """
+    Returns a (boolean, string|None) tuple of (call success, failure reason)
+    :type endpoint: str
+    :type use_post_method: bool
+    :type expected_response: str
+    :type expected_response_code: int
+    :rtype (bool, str):
+    """
+    try:
+      response, response_code = self.query(endpoint, '' if use_post_method 
else None)
+      response = response.strip().lower()
+      if expected_response and response != expected_response.lower():
+        reason = 'Response differs from expected response (expected "%s", got 
"%s")'
+        def shorten(string):
+          return (string if len(string) < self.FAILURE_REASON_LENGTH
+                         else "%s..." % string[:self.FAILURE_REASON_LENGTH - 
3])
+        log.warning(reason % (expected_response, response))
+        return (False, reason % (shorten(str(expected_response)), 
shorten(str(response))))
+      elif expected_response_code and response_code != expected_response_code:
+        reason = 'Response code differs from expected response (expected %i, 
got %i)'
+        log.warning(reason % (expected_response_code, response_code))
+        return (False, reason % (expected_response_code, response_code))
+      else:
+        return (True, None)
+    except self.QueryError as e:
+      return (False, str(e))

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/common/health_check/shell.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/common/health_check/shell.py 
b/src/main/python/apache/aurora/common/health_check/shell.py
new file mode 100644
index 0000000..890bf0c
--- /dev/null
+++ b/src/main/python/apache/aurora/common/health_check/shell.py
@@ -0,0 +1,60 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import shlex
+import sys
+
+# Recommended pattern for Python 2 and 3 support from 
https://github.com/google/python-subprocess32
+# Backport which adds bug fixes and timeout support for Python 2.7
+if os.name == 'posix' and sys.version_info[0] < 3:
+  import subprocess32 as subprocess
+else:
+  # subprocess is included as part of Python standard lib in Python 3+.
+  import subprocess
+
+
+class ShellHealthCheck(object):
+
+  def __init__(self, cmd, timeout_secs=None):
+    """
+    Initialize with the command we would like to call.
+    :param cmd: Command to execute that is expected to have a 0 return code on 
success.
+    :type cmd: str
+    :param timeout_secs: Timeout in seconds.
+    :type timeout_secs: int
+    """
+    self.cmd = cmd
+    self.timeout_secs = timeout_secs
+
+  def __call__(self):
+    """
+    Call a shell command line health check.
+
+    :return: A tuple of (bool, str)
+    :rtype tuple:
+    """
+    cmd = shlex.split(self.cmd)
+    try:
+      subprocess.check_call(cmd, timeout=self.timeout_secs)
+      return True, None
+    except subprocess.CalledProcessError as reason:
+      # The command didn't return a 0 so provide reason for failure.
+      return False, str(reason)
+    except OSError as e:
+      reason = 'OSError: {error}'.format(error=e.strerror)
+      return False, reason
+    except ValueError:
+      reason = 'Invalid commmand.'
+      return False, reason

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/common/http_signaler.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/common/http_signaler.py 
b/src/main/python/apache/aurora/common/http_signaler.py
deleted file mode 100644
index a3193f3..0000000
--- a/src/main/python/apache/aurora/common/http_signaler.py
+++ /dev/null
@@ -1,105 +0,0 @@
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import contextlib
-import os
-from socket import timeout as SocketTimeout
-
-from twitter.common import log
-from twitter.common.lang import Compatibility
-
-if Compatibility.PY3:
-  from http.client import HTTPException
-  import urllib.request as urllib_request
-  from urllib.error import URLError, HTTPError
-else:
-  from httplib import HTTPException
-  import urllib2 as urllib_request
-  from urllib2 import URLError, HTTPError
-
-
-class HttpSignaler(object):
-  """Simple HTTP endpoint wrapper to check health or trigger 
quitquitquit/abortabortabort"""
-  TIMEOUT_SECS = 1.0
-  FAILURE_REASON_LENGTH = 10
-
-  class Error(Exception): pass
-  class QueryError(Error): pass
-
-  def __init__(self, port, host='localhost', timeout_secs=None):
-    self._host = host
-    self._url_base = 'http://%s:%d' % (host, port)
-    if timeout_secs is None:
-      env_timeout = os.getenv('AURORA_HTTP_SIGNALER_TIMEOUT_SECS')
-      if env_timeout is not None:
-        log.info('Using timeout %s secs (from 
AURORA_HTTP_SIGNALER_TIMEOUT_SECS).' % env_timeout)
-        self._timeout_secs = float(env_timeout)
-      else:
-        log.debug('Using timeout %s secs (default).' % self.TIMEOUT_SECS)
-        self._timeout_secs = self.TIMEOUT_SECS
-    else:
-      log.debug('Using timeout %s secs.' % timeout_secs)
-      self._timeout_secs = timeout_secs
-
-  def url(self, endpoint):
-    return self._url_base + endpoint
-
-  @property
-  def opener(self):
-    return urllib_request.urlopen
-
-  def query(self, endpoint, data=None):
-    """Request an HTTP endpoint with a GET request (or POST if data is not 
None)"""
-    url = self.url(endpoint)
-    log.debug("%s: %s %s" % (self.__class__.__name__, 'GET' if data is None 
else 'POST', url))
-
-    def raise_error(reason):
-      raise self.QueryError('Failed to signal %s: %s' % (self.url(endpoint), 
reason))
-
-    try:
-      with contextlib.closing(
-          self.opener(url, data, timeout=self._timeout_secs)) as fp:
-        return (fp.read(), fp.getcode())
-    except (HTTPException, SocketTimeout) as e:
-      # the type of an HTTPException is typically more useful than its 
contents (since for example
-      # BadStatusLines are often empty). likewise with socket.timeout.
-      raise_error('Error within %s' % e.__class__.__name__)
-    except HTTPError as e:
-      return ('', e.code)
-    except URLError as e:
-      raise_error(e)
-    except Exception as e:
-      raise_error('Unexpected error: %s' % e)
-
-  def __call__(self, endpoint, use_post_method=False, expected_response=None,
-      expected_response_code=None):
-    """Returns a (boolean, string|None) tuple of (call success, failure 
reason)"""
-    try:
-      response, response_code = self.query(endpoint, '' if use_post_method 
else None)
-      response = response.strip().lower()
-      if expected_response and response != expected_response.lower():
-        reason = 'Response differs from expected response (expected "%s", got 
"%s")'
-        def shorten(string):
-          return (string if len(string) < self.FAILURE_REASON_LENGTH
-                         else "%s..." % string[:self.FAILURE_REASON_LENGTH - 
3])
-        log.warning(reason % (expected_response, response))
-        return (False, reason % (shorten(str(expected_response)), 
shorten(str(response))))
-      elif expected_response_code and response_code != expected_response_code:
-        reason = 'Response code differs from expected response (expected %i, 
got %i)'
-        log.warning(reason % (expected_response_code, response_code))
-        return (False, reason % (expected_response_code, response_code))
-      else:
-        return (True, None)
-    except self.QueryError as e:
-      return (False, str(e))

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/config/schema/base.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/config/schema/base.py 
b/src/main/python/apache/aurora/config/schema/base.py
index 398f737..e752482 100644
--- a/src/main/python/apache/aurora/config/schema/base.py
+++ b/src/main/python/apache/aurora/config/schema/base.py
@@ -37,13 +37,15 @@ class UpdateConfig(Struct):
 
 
 class HealthCheckConfig(Struct):
-  initial_interval_secs    = Default(Float, 15.0)
-  interval_secs            = Default(Float, 10.0)
-  timeout_secs             = Default(Float, 1.0)
-  max_consecutive_failures = Default(Integer, 0)
   endpoint                 = Default(String, '/health')
   expected_response        = Default(String, 'ok')
   expected_response_code   = Default(Integer, 0)
+  initial_interval_secs    = Default(Float, 15.0)
+  interval_secs            = Default(Float, 10.0)
+  max_consecutive_failures = Default(Integer, 0)
+  shell_command            = String
+  type                     = Default(String, 'http')
+  timeout_secs             = Default(Float, 1.0)
 
 
 class HttpLifecycleConfig(Struct):

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/executor/common/health_checker.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/executor/common/health_checker.py 
b/src/main/python/apache/aurora/executor/common/health_checker.py
index 03fdf0a..cba4e8c 100644
--- a/src/main/python/apache/aurora/executor/common/health_checker.py
+++ b/src/main/python/apache/aurora/executor/common/health_checker.py
@@ -22,11 +22,15 @@ from twitter.common import log
 from twitter.common.exceptions import ExceptionalThread
 from twitter.common.metrics import LambdaGauge
 
-from apache.aurora.common.http_signaler import HttpSignaler
+from apache.aurora.common.health_check.http_signaler import HttpSignaler
+from apache.aurora.common.health_check.shell import ShellHealthCheck
 
 from .status_checker import StatusChecker, StatusCheckerProvider, StatusResult
 from .task_info import mesos_task_instance_from_assigned_task, resolve_ports
 
+HTTP_HEALTH_CHECK = 'http'
+SHELL_HEALTH_CHECK = 'shell'
+
 
 class ThreadedHealthChecker(ExceptionalThread):
   """Perform a health check to determine if a service is healthy or not
@@ -200,23 +204,44 @@ class HealthChecker(StatusChecker):
 
 class HealthCheckerProvider(StatusCheckerProvider):
   def from_assigned_task(self, assigned_task, sandbox):
+    """
+    :param assigned_task:
+    :param sandbox:
+    :return: Instance of a HealthChecker.
+    """
     mesos_task = mesos_task_instance_from_assigned_task(assigned_task)
     portmap = resolve_ports(mesos_task, assigned_task.assignedPorts)
 
-    if 'health' not in portmap:
-      return None
-
     health_check_config = mesos_task.health_check_config().get()
-    http_signaler = HttpSignaler(
+    health_check_type = health_check_config.get('type')
+
+    # We don't need a port if we are running a shell command.
+    if health_check_type == HTTP_HEALTH_CHECK and 'health' not in portmap:
+      return None
+    timeout_secs = health_check_config.get('timeout_secs')
+
+    if health_check_type == SHELL_HEALTH_CHECK:
+      shell_command = health_check_config.get('shell_command')
+      shell_signaler = ShellHealthCheck(
+        cmd=shell_command,
+        timeout_secs=timeout_secs
+      )
+      a_health_checker = lambda: shell_signaler()
+    else:
+      http_signaler = HttpSignaler(
         portmap['health'],
-        timeout_secs=health_check_config.get('timeout_secs'))
+        timeout_secs=timeout_secs)
+      a_health_checker = lambda: http_signaler(
+        endpoint=health_check_config.get('endpoint'),
+        expected_response=health_check_config.get('expected_response'),
+        
expected_response_code=health_check_config.get('expected_response_code')
+      )
+
     health_checker = HealthChecker(
-        lambda: http_signaler(
-            endpoint=health_check_config.get('endpoint'),
-            expected_response=health_check_config.get('expected_response'),
-            
expected_response_code=health_check_config.get('expected_response_code')),
-        sandbox,
-        interval_secs=health_check_config.get('interval_secs'),
-        initial_interval_secs=health_check_config.get('initial_interval_secs'),
-        
max_consecutive_failures=health_check_config.get('max_consecutive_failures'))
+      a_health_checker,
+      sandbox,
+      interval_secs=health_check_config.get('interval_secs'),
+      initial_interval_secs=health_check_config.get('initial_interval_secs'),
+      
max_consecutive_failures=health_check_config.get('max_consecutive_failures'))
+
     return health_checker

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/main/python/apache/aurora/executor/http_lifecycle.py
----------------------------------------------------------------------
diff --git a/src/main/python/apache/aurora/executor/http_lifecycle.py 
b/src/main/python/apache/aurora/executor/http_lifecycle.py
index 6d578cc..9280bf2 100644
--- a/src/main/python/apache/aurora/executor/http_lifecycle.py
+++ b/src/main/python/apache/aurora/executor/http_lifecycle.py
@@ -17,7 +17,7 @@ import time
 from twitter.common import log
 from twitter.common.quantity import Amount, Time
 
-from apache.aurora.common.http_signaler import HttpSignaler
+from apache.aurora.common.health_check.http_signaler import HttpSignaler
 
 from .common.task_runner import TaskError, TaskRunner
 

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/client/BUILD
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/client/BUILD 
b/src/test/python/apache/aurora/client/BUILD
index 1ead9ae..c9d7616 100644
--- a/src/test/python/apache/aurora/client/BUILD
+++ b/src/test/python/apache/aurora/client/BUILD
@@ -46,6 +46,7 @@ python_tests(name = 'config',
   sources = ['test_config.py'],
   dependencies = [
     '3rdparty/python:mox',
+    '3rdparty/python:pystachio',
     'src/main/python/apache/aurora/client',
   ],
 )

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/client/test_config.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/client/test_config.py 
b/src/test/python/apache/aurora/client/test_config.py
index b1a3c18..8fd112f 100644
--- a/src/test/python/apache/aurora/client/test_config.py
+++ b/src/test/python/apache/aurora/client/test_config.py
@@ -180,6 +180,71 @@ def test_dedicated_portmap():
                               constraints={'foo': 'bar'})))
 
 
+def test_health_check_config_http_ok():
+  base_job = Job(
+    name='hello_bond', role='james', cluster='marine-cluster',
+    health_check_config=HealthCheckConfig(
+      max_consecutive_failures=1,
+      type='http',
+    ),
+    task=Task(name='main', processes=[],
+              resources=Resources(cpu=0.1, ram=64 * MB, disk=64 * MB)))
+  config._validate_health_check_config(AuroraConfig(base_job))
+
+
+def test_health_check_config_shell_ok():
+  base_job = Job(
+    name='hello_bond', role='james', cluster='marine-cluster',
+    health_check_config=HealthCheckConfig(
+      max_consecutive_failures=1,
+      type='shell',
+      shell_command='foo bar'
+    ),
+    task=Task(name='main', processes=[],
+              resources=Resources(cpu=0.1, ram=64 * MB, disk=64 * MB)))
+  config._validate_health_check_config(AuroraConfig(base_job))
+
+
+def test_health_check_config_invalid_type():
+  base_job = Job(
+    name='hello_bond', role='james', cluster='marine-cluster',
+    health_check_config=HealthCheckConfig(
+      max_consecutive_failures=1,
+      type='foo',
+    ),
+    task=Task(name='main', processes=[],
+              resources=Resources(cpu=0.1, ram=64 * MB, disk=64 * MB)))
+  with pytest.raises(SystemExit):
+    config._validate_health_check_config(AuroraConfig(base_job))
+
+
+def test_health_check_config_http_and_shell_defined():
+  base_job = Job(
+    name='hello_bond', role='james', cluster='marine-cluster',
+    health_check_config=HealthCheckConfig(
+      max_consecutive_failures=1,
+      type='http',
+      shell_command='foo bar'
+    ),
+    task=Task(name='main', processes=[],
+              resources=Resources(cpu=0.1, ram=64 * MB, disk=64 * MB)))
+  with pytest.raises(SystemExit):
+    config._validate_health_check_config(AuroraConfig(base_job))
+
+
+def test_health_check_config_shell_no_command():
+  base_job = Job(
+    name='hello_bond', role='james', cluster='marine-cluster',
+    health_check_config=HealthCheckConfig(
+      max_consecutive_failures=1,
+      type='shell',
+    ),
+    task=Task(name='main', processes=[],
+              resources=Resources(cpu=0.1, ram=64 * MB, disk=64 * MB)))
+  with pytest.raises(SystemExit):
+    config._validate_health_check_config(AuroraConfig(base_job))
+
+
 def test_update_config_passes_with_default_values():
   base_job = Job(
     name='hello_world', role='john_doe', cluster='test-cluster',

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/BUILD
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/common/BUILD 
b/src/test/python/apache/aurora/common/BUILD
index f903c19..7909ab5 100644
--- a/src/test/python/apache/aurora/common/BUILD
+++ b/src/test/python/apache/aurora/common/BUILD
@@ -19,7 +19,6 @@ target(
     ':test_cluster',
     ':test_clusters',
     ':test_cluster_option',
-    ':test_http_signaler',
     ':test_pex_version',
     ':test_shellify',
     ':test_transport',
@@ -69,15 +68,6 @@ python_tests(
 )
 
 python_tests(
-  name = 'test_http_signaler',
-  sources = ['test_http_signaler.py'],
-  dependencies = [
-    '3rdparty/python:mox',
-    'src/main/python/apache/aurora/common',
-  ]
-)
-
-python_tests(
   name = 'test_shellify',
   sources = ['test_shellify.py'],
   dependencies = [

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/health_check/BUILD
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/common/health_check/BUILD 
b/src/test/python/apache/aurora/common/health_check/BUILD
new file mode 100644
index 0000000..98a2481
--- /dev/null
+++ b/src/test/python/apache/aurora/common/health_check/BUILD
@@ -0,0 +1,39 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Aggregate target depending on both test targets declared in this file.
+target(
+  name = 'all',
+  dependencies = [
+    ':test_http_signaler',
+    ':test_shell',
+  ]
+)
+
+# Tests for the HTTP signaler; mox is used to stub urlopen.
+python_tests(
+  name = 'test_http_signaler',
+  sources = ['test_http_signaler.py'],
+  dependencies = [
+    '3rdparty/python:mox',
+    'src/main/python/apache/aurora/common',
+  ]
+)
+
+# Tests for the shell health check; mock is used to patch the subprocess call.
+python_tests(
+  name = 'test_shell',
+  sources = ['test_shell.py'],
+  dependencies = [
+    '3rdparty/python:mock',
+    'src/main/python/apache/aurora/common',
+  ]
+)

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/health_check/__init__.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/common/health_check/__init__.py 
b/src/test/python/apache/aurora/common/health_check/__init__.py
new file mode 100644
index 0000000..0663a9a
--- /dev/null
+++ b/src/test/python/apache/aurora/common/health_check/__init__.py
@@ -0,0 +1,13 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/health_check/test_http_signaler.py
----------------------------------------------------------------------
diff --git 
a/src/test/python/apache/aurora/common/health_check/test_http_signaler.py 
b/src/test/python/apache/aurora/common/health_check/test_http_signaler.py
new file mode 100644
index 0000000..0338b81
--- /dev/null
+++ b/src/test/python/apache/aurora/common/health_check/test_http_signaler.py
@@ -0,0 +1,111 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import unittest
+from socket import timeout as SocketTimeout
+
+import mox
+from twitter.common.lang import Compatibility
+
+from apache.aurora.common.health_check.http_signaler import HttpSignaler
+
+if Compatibility.PY3:
+  import urllib.request as urllib_request
+else:
+  import urllib2 as urllib_request
+
+
+class OpenedURL(object):
+  """Minimal stand-in for the object returned by urlopen, serving canned data."""
+
+  def __init__(self, content, code=200):
+    # Canned status code and body handed back by getcode() / read().
+    self.code = code
+    self.content = content
+
+  def getcode(self):
+    return self.code
+
+  def read(self):
+    return self.content
+
+  def close(self):
+    # Nothing to release: this fake holds no underlying connection.
+    pass
+
+
+class TestHttpSignaler(unittest.TestCase):
+  """Exercises HttpSignaler against a mox-stubbed urlopen; no real sockets are opened."""
+
+  PORT = 12345
+
+  def setUp(self):
+    self._mox = mox.Mox()
+
+  def tearDown(self):
+    # Restore the real urlopen, then check that every recorded call was replayed.
+    self._mox.UnsetStubs()
+    self._mox.VerifyAll()
+
+  def test_all_calls_ok(self):
+    # With use_post_method=True the signaler is expected to pass '' as the
+    # request body (rather than None, as in the health-check calls below).
+    self._mox.StubOutWithMock(urllib_request, 'urlopen')
+    urllib_request.urlopen(
+      'http://localhost:%s/quitquitquit' % self.PORT, '', timeout=1.0).AndReturn(OpenedURL(''))
+    urllib_request.urlopen(
+      'http://localhost:%s/abortabortabort' % self.PORT, '', timeout=1.0).AndReturn(OpenedURL(''))
+
+    self._mox.ReplayAll()
+
+    signaler = HttpSignaler(self.PORT)
+    assert signaler('/quitquitquit', use_post_method=True) == (True, None)
+    assert signaler('/abortabortabort', use_post_method=True) == (True, None)
+
+  def test_health_checks(self):
+    # The recorded urlopen calls below are consumed in order by the
+    # signaler(...) assertions that follow ReplayAll(), one call per assertion.
+    self._mox.StubOutWithMock(urllib_request, 'urlopen')
+    urllib_request.urlopen(
+      'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(OpenedURL('ok'))
+    urllib_request.urlopen(
+      'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(OpenedURL('not ok'))
+    urllib_request.urlopen(
+      'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(
+          OpenedURL('not ok', code=200))
+    urllib_request.urlopen(
+      'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(
+          OpenedURL('ok', code=400))
+    urllib_request.urlopen(
+      'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndRaise(
+          urllib_request.HTTPError('', 501, '', None, None))
+    urllib_request.urlopen(
+      'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(
+          OpenedURL('ok', code=200))
+    urllib_request.urlopen(
+      'http://localhost:%s/random/endpoint' % self.PORT, None, timeout=1.0).AndReturn(
+          OpenedURL('ok'))
+
+    self._mox.ReplayAll()
+
+    signaler = HttpSignaler(self.PORT)
+    # Body matches / body differs.
+    assert signaler('/health', expected_response='ok') == (True, None)
+    assert signaler('/health', expected_response='ok') == (
+        False, 'Response differs from expected response (expected "ok", got "not ok")')
+    # Only the status code is checked here, so the 'not ok' body is irrelevant.
+    assert signaler('/health', expected_response_code=200) == (True, None)
+    assert signaler('/health', expected_response_code=200) == (
+        False, 'Response code differs from expected response (expected 200, got 400)')
+    # An HTTPError raised by urlopen is reported through the same message as a code mismatch.
+    assert signaler('/health', expected_response_code=200) == (
+        False, 'Response code differs from expected response (expected 200, got 501)')
+    assert signaler('/health', expected_response='ok', expected_response_code=200) == (True, None)
+    assert signaler('/random/endpoint', expected_response='ok') == (True, None)
+
+  def test_exception(self):
+    # A socket timeout must surface as a failed check, not as a raised exception.
+    self._mox.StubOutWithMock(urllib_request, 'urlopen')
+    urllib_request.urlopen(
+        'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndRaise(
+            SocketTimeout('Timed out'))
+
+    self._mox.ReplayAll()
+
+    assert not HttpSignaler(self.PORT)('/health', expected_response='ok')[0]

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/health_check/test_shell.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/common/health_check/test_shell.py 
b/src/test/python/apache/aurora/common/health_check/test_shell.py
new file mode 100644
index 0000000..84f717f
--- /dev/null
+++ b/src/test/python/apache/aurora/common/health_check/test_shell.py
@@ -0,0 +1,91 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import sys
+import unittest
+
+import mock
+
+from apache.aurora.common.health_check.shell import ShellHealthCheck
+
+# Recommended pattern for Python 2 and 3 support, taken from
+# https://github.com/google/python-subprocess32
+# subprocess32 is a backport which adds bug fixes and timeout support for Python 2.7.
+if os.name == 'posix' and sys.version_info[0] < 3:
+  import subprocess32 as subprocess
+else:
+  # subprocess is included as part of Python standard lib in Python 3+.
+  import subprocess
+
+
+class TestHealthChecker(unittest.TestCase):
+  """Unit tests for ShellHealthCheck with the subprocess call mocked out.
+
+  check_call is patched on whichever module object this file imported as
+  `subprocess` (subprocess32 on POSIX Python 2, the standard library module
+  otherwise), so the patch target always exists. This assumes the module under
+  test resolves `subprocess` the same way and calls `subprocess.check_call`
+  through the module attribute -- TODO confirm against shell.py.
+  """
+
+  @mock.patch.object(subprocess, 'check_call')
+  def test_health_check_ok(self, mock_sub):
+    # check_call returns without raising: the check reports success and no message.
+    timeout = 30
+    cmd = 'success cmd'
+    shell = ShellHealthCheck(cmd, timeout_secs=timeout)
+    success, msg = shell()
+    self.assertTrue(success)
+    self.assertIsNone(msg)
+    # The command string is expected to be split into an argv list.
+    mock_sub.assert_called_once_with(
+      ['success', 'cmd'],
+      timeout=30
+    )
+
+  @mock.patch.object(subprocess, 'check_call')
+  def test_health_check_failed(self, mock_sub):
+    timeout = 30
+    # Fail due to command returning a non-0 exit status.
+    mock_sub.side_effect = subprocess.CalledProcessError(1, 'failed')
+    cmd = 'cmd to fail'
+    shell = ShellHealthCheck(cmd, timeout_secs=timeout)
+    success, msg = shell()
+    mock_sub.assert_called_once_with(
+      ['cmd', 'to', 'fail'],
+      timeout=30
+    )
+    self.assertFalse(success)
+    self.assertEqual(msg, "Command 'failed' returned non-zero exit status 1")
+
+  @mock.patch.object(subprocess, 'check_call')
+  def test_health_check_os_error(self, mock_sub):
+    timeout = 30
+    # Fail due to command not existing.
+    mock_sub.side_effect = OSError(1, 'failed')
+    cmd = 'cmd to not exist'
+    shell = ShellHealthCheck(cmd, timeout_secs=timeout)
+    success, msg = shell()
+    mock_sub.assert_called_once_with(
+      ['cmd', 'to', 'not', 'exist'],
+      timeout=30
+    )
+    self.assertFalse(success)
+    self.assertEqual(msg, 'OSError: failed')
+
+  @mock.patch.object(subprocess, 'check_call')
+  def test_health_check_value_error(self, mock_sub):
+    # Invalid command passed in raises ValueError.
+    mock_sub.side_effect = ValueError('Could not read command.')
+    cmd = 'defensive cmd'
+    timeout = 10
+    shell = ShellHealthCheck(cmd, timeout_secs=timeout)
+    success, msg = shell()
+    mock_sub.assert_called_once_with(
+      ['defensive', 'cmd'],
+      timeout=10
+    )
+    self.assertFalse(success)
+    # NOTE(review): the expected text, spelling included, presumably mirrors the
+    # message produced by ShellHealthCheck verbatim -- do not correct it here alone.
+    self.assertEqual(msg, 'Invalid commmand.')

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/common/test_http_signaler.py
----------------------------------------------------------------------
diff --git a/src/test/python/apache/aurora/common/test_http_signaler.py 
b/src/test/python/apache/aurora/common/test_http_signaler.py
deleted file mode 100644
index f68c71a..0000000
--- a/src/test/python/apache/aurora/common/test_http_signaler.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import unittest
-from socket import timeout as SocketTimeout
-
-import mox
-from twitter.common.lang import Compatibility
-
-from apache.aurora.common.http_signaler import HttpSignaler
-
-if Compatibility.PY3:
-  import urllib.request as urllib_request
-else:
-  import urllib2 as urllib_request
-
-
-class OpenedURL(object):
-  def __init__(self, content, code=200):
-    self.content = content
-    self.code = code
-
-  def read(self):
-    return self.content
-
-  def close(self):
-    pass
-
-  def getcode(self):
-    return self.code
-
-
-class TestHttpSignaler(unittest.TestCase):
-  PORT = 12345
-
-  def setUp(self):
-    self._mox = mox.Mox()
-
-  def tearDown(self):
-    self._mox.UnsetStubs()
-    self._mox.VerifyAll()
-
-  def test_all_calls_ok(self):
-    self._mox.StubOutWithMock(urllib_request, 'urlopen')
-    urllib_request.urlopen(
-      'http://localhost:%s/quitquitquit' % self.PORT, '', 
timeout=1.0).AndReturn(OpenedURL(''))
-    urllib_request.urlopen(
-      'http://localhost:%s/abortabortabort' % self.PORT, '', 
timeout=1.0).AndReturn(OpenedURL(''))
-
-    self._mox.ReplayAll()
-
-    signaler = HttpSignaler(self.PORT)
-    assert signaler('/quitquitquit', use_post_method=True) == (True, None)
-    assert signaler('/abortabortabort', use_post_method=True) == (True, None)
-
-  def test_health_checks(self):
-    self._mox.StubOutWithMock(urllib_request, 'urlopen')
-    urllib_request.urlopen(
-      'http://localhost:%s/health' % self.PORT, None, 
timeout=1.0).AndReturn(OpenedURL('ok'))
-    urllib_request.urlopen(
-      'http://localhost:%s/health' % self.PORT, None, 
timeout=1.0).AndReturn(OpenedURL('not ok'))
-    urllib_request.urlopen(
-      'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(
-          OpenedURL('not ok', code=200))
-    urllib_request.urlopen(
-      'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(
-          OpenedURL('ok', code=400))
-    urllib_request.urlopen(
-      'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndRaise(
-          urllib_request.HTTPError('', 501, '', None, None))
-    urllib_request.urlopen(
-      'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndReturn(
-          OpenedURL('ok', code=200))
-    urllib_request.urlopen(
-      'http://localhost:%s/random/endpoint' % self.PORT, None, 
timeout=1.0).AndReturn(
-          OpenedURL('ok'))
-
-    self._mox.ReplayAll()
-
-    signaler = HttpSignaler(self.PORT)
-    assert signaler('/health', expected_response='ok') == (True, None)
-    assert signaler('/health', expected_response='ok') == (
-        False, 'Response differs from expected response (expected "ok", got 
"not ok")')
-    assert signaler('/health', expected_response_code=200) == (True, None)
-    assert signaler('/health', expected_response_code=200) == (
-        False, 'Response code differs from expected response (expected 200, 
got 400)')
-    assert signaler('/health', expected_response_code=200) == (
-        False, 'Response code differs from expected response (expected 200, 
got 501)')
-    assert signaler('/health', expected_response='ok', 
expected_response_code=200) == (True, None)
-    assert signaler('/random/endpoint', expected_response='ok') == (True, None)
-
-  def test_exception(self):
-    self._mox.StubOutWithMock(urllib_request, 'urlopen')
-    urllib_request.urlopen(
-        'http://localhost:%s/health' % self.PORT, None, timeout=1.0).AndRaise(
-            SocketTimeout('Timed out'))
-
-    self._mox.ReplayAll()
-
-    assert not HttpSignaler(self.PORT)('/health', expected_response='ok')[0]

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/python/apache/aurora/executor/common/test_health_checker.py
----------------------------------------------------------------------
diff --git 
a/src/test/python/apache/aurora/executor/common/test_health_checker.py 
b/src/test/python/apache/aurora/executor/common/test_health_checker.py
index 27c7171..8561abc 100644
--- a/src/test/python/apache/aurora/executor/common/test_health_checker.py
+++ b/src/test/python/apache/aurora/executor/common/test_health_checker.py
@@ -22,7 +22,7 @@ from mesos.interface.mesos_pb2 import TaskState
 from twitter.common.exceptions import ExceptionalThread
 from twitter.common.testing.clock import ThreadedClock
 
-from apache.aurora.common.http_signaler import HttpSignaler
+from apache.aurora.common.health_check.http_signaler import HttpSignaler
 from apache.aurora.config.schema.base import HealthCheckConfig
 from apache.aurora.executor.common.health_checker import (
     HealthChecker,
@@ -181,7 +181,7 @@ class TestHealthChecker(unittest.TestCase):
 
 
 class TestHealthCheckerProvider(unittest.TestCase):
-  def test_from_assigned_task(self):
+  def test_from_assigned_task_http(self):
     interval_secs = 17
     initial_interval_secs = 3
     max_consecutive_failures = 2
@@ -206,6 +206,58 @@ class TestHealthCheckerProvider(unittest.TestCase):
     hct_max_fail = 
health_checker.threaded_health_checker.max_consecutive_failures
     assert hct_max_fail == max_consecutive_failures
 
+  def test_from_assigned_task_generic(self):
+    # A job configured with a 'shell' health check should yield a checker whose
+    # threaded checker carries over the configured intervals and failure limit.
+    interval_secs = 17
+    initial_interval_secs = 3
+    max_consecutive_failures = 2
+    timeout_secs = 5
+    task_config = TaskConfig(
+        executorConfig=ExecutorConfig(
+            name='thermos-generic',
+            data=MESOS_JOB(
+                task=HELLO_WORLD,
+                health_check_config=HealthCheckConfig(
+                    interval_secs=interval_secs,
+                    initial_interval_secs=initial_interval_secs,
+                    max_consecutive_failures=max_consecutive_failures,
+                    timeout_secs=timeout_secs,
+                    type='shell',
+                    shell_command='failed command'
+                )
+            ).json_dumps()
+        )
+    )
+    assigned_task = AssignedTask(task=task_config, instanceId=1, assignedPorts={'health': 9001})
+    health_checker = HealthCheckerProvider().from_assigned_task(assigned_task, None)
+    assert health_checker.threaded_health_checker.interval == interval_secs
+    assert health_checker.threaded_health_checker.initial_interval == initial_interval_secs
+    hct_max_fail = health_checker.threaded_health_checker.max_consecutive_failures
+    assert hct_max_fail == max_consecutive_failures
+
+  def test_from_assigned_task_no_health_port(self):
+    # With neither a 'health' port assigned nor a shell_command configured,
+    # the provider is expected to return no health checker at all.
+    interval_secs = 17
+    initial_interval_secs = 3
+    max_consecutive_failures = 2
+    timeout_secs = 5
+    task_config = TaskConfig(
+        executorConfig=ExecutorConfig(
+            name='thermos-generic',
+            data=MESOS_JOB(
+                task=HELLO_WORLD,
+                health_check_config=HealthCheckConfig(
+                    interval_secs=interval_secs,
+                    initial_interval_secs=initial_interval_secs,
+                    max_consecutive_failures=max_consecutive_failures,
+                    timeout_secs=timeout_secs,
+                )
+            ).json_dumps()
+        )
+    )
+    # No health port and we don't have a shell_command.
+    assigned_task = AssignedTask(task=task_config, instanceId=1, assignedPorts={'http': 9001})
+    health_checker = HealthCheckerProvider().from_assigned_task(assigned_task, None)
+    self.assertIsNone(health_checker)
+
 
 class TestThreadedHealthChecker(unittest.TestCase):
   def setUp(self):

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora 
b/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora
index dc55109..bb4fdec 100644
--- a/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora
+++ b/src/test/sh/org/apache/aurora/e2e/http/http_example.aurora
@@ -27,7 +27,7 @@ stage_server = Process(
 
 test_task = Task(
   name = 'http_example',
-  resources = Resources(cpu=0.5, ram=32*MB, disk=64*MB),
+  resources = Resources(cpu=0.4, ram=32*MB, disk=64*MB),
   processes = [stage_server, run_server],
   constraints = order(stage_server, run_server))
 

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora
----------------------------------------------------------------------
diff --git 
a/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora 
b/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora
new file mode 100644
index 0000000..37f2e9c
--- /dev/null
+++ b/src/test/sh/org/apache/aurora/e2e/http/http_example_bad_healthcheck.aurora
@@ -0,0 +1,73 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import getpass
+
+# Default staging command: copies the example server script into the current directory.
+DEFAULT_CMD = 'cp /vagrant/src/test/sh/org/apache/aurora/e2e/http_example.py .'
+
+# Staging step; the concrete command is bound per job through {{cmd}}.
+stage_server = Process(name='stage_server', cmdline='{{cmd}}')
+
+# Runs http_example.py, passing it the port bound to the name 'http'.
+run_server = Process(
+  name='run_server',
+  cmdline='python http_example.py {{thermos.ports[http]}}')
+
+# Staging must finish before the server is started.
+test_task = Task(
+  name='http_example',
+  processes=[stage_server, run_server],
+  constraints=order(stage_server, run_server),
+  resources=Resources(cpu=0.5, ram=32*MB, disk=64*MB))
+
+update_config = UpdateConfig(watch_secs=10, batch_size=2)
+
+# "I am going to fail" config.
+health_check_config = HealthCheckConfig(
+  type='shell',
+  shell_command='grep foo',
+  initial_interval_secs=5,
+  interval_secs=1)
+
+# Template service; each entry in `jobs` derives from it by name and bound command.
+job = Service(
+  cluster='devcluster',
+  role=getpass.getuser(),
+  environment='test',
+  contact='{{role}}@localhost',
+  instances=2,
+  task=test_task,
+  update_config=update_config,
+  health_check_config=health_check_config,
+  announce=Announcer(),
+)
+
+jobs = [
+  job(name='http_example').bind(cmd=DEFAULT_CMD),
+  job(name='http_example_revocable', tier='revocable').bind(cmd=DEFAULT_CMD),
+  job(
+    name='http_example_docker',
+    container=Container(docker=Docker(image='http_example'))
+  ).bind(cmd='cp /tmp/http_example.py .'),
+]

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora 
b/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora
index f098de9..b33e8f5 100644
--- a/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora
+++ b/src/test/sh/org/apache/aurora/e2e/http/http_example_updated.aurora
@@ -27,7 +27,7 @@ stage_server = Process(
 
 test_task = SequentialTask(
   name = 'http_example',
-  resources = Resources(cpu=0.5, ram=34*MB, disk=64*MB),
+  resources = Resources(cpu=0.4, ram=34*MB, disk=64*MB),
   processes = [stage_server, run_server])
 
 update_config = UpdateConfig(watch_secs=10, batch_size=3)

http://git-wip-us.apache.org/repos/asf/aurora/blob/0d7f946f/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
----------------------------------------------------------------------
diff --git a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh 
b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
index d7c61e2..9ccf6dc 100755
--- a/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
+++ b/src/test/sh/org/apache/aurora/e2e/test_end_to_end.sh
@@ -45,7 +45,7 @@ collect_result() {
   then
     echo "OK (all tests passed)"
   else
-    echo "!!! FAIL (something returned non-zero)"
+    echo "!!! FAIL (something returned non-zero) for $BASH_COMMAND"
     # Attempt to clean up any state we left behind.
     tear_down
   fi
@@ -173,6 +173,30 @@ test_update() {
   fi
 }
 
+# Verifies that an update whose health check fails ends up rolled back.
+# Arguments:
+#   $1 - job key
+#   $2 - config for the initial update, which is expected to go through
+#   $3 - cluster name (accepted for call-site compatibility; not used here)
+#   $4 - config whose health check is meant to fail
+# Exits the script with status 1 if the failing update does not end in ROLLED_BACK.
+test_update_fail() {
+  local _jobkey=$1 _config=$2 _cluster=$3 _bad_healthcheck_config=$4
+  # Declared separately from assignment so `local` does not mask the
+  # exit status of the command substitutions below.
+  local _update_id _status
+
+  # Make sure a regular update works first.
+  aurora update start "$_jobkey" "$_config"
+  assert_update_state "$_jobkey" 'ROLLING_FORWARD'
+  _update_id=$(aurora update list "$_jobkey" --status ROLLING_FORWARD \
+      | tail -n +2 | awk '{print $2}')
+  # Need to wait until the update finishes before we can start one that we want to fail.
+  aurora update wait "$_jobkey" "$_update_id"
+
+  # Starting update with a health check that is meant to fail. Expected behavior is roll back.
+  aurora update start "$_jobkey" "$_bad_healthcheck_config"
+  _update_id=$(aurora update list "$_jobkey" --status active \
+      | tail -n +2 | awk '{print $2}')
+  # `|| echo $?` keeps a failed wait from aborting the script, so that
+  # `trap collect_result` doesn't get triggered at this point.
+  aurora update wait "$_jobkey" "$_update_id" || echo $?
+  # Making sure we rolled back.
+  _status=$(aurora update info "$_jobkey" "$_update_id" \
+      | grep 'Current status' | awk '{print $NF}')
+  if [[ "$_status" != "ROLLED_BACK" ]]; then
+    echo "Update should have completed in ROLLED_BACK state due to failed healthcheck."
+    exit 1
+  fi
+}
+
 test_announce() {
   local _role=$1 _env=$2 _job=$3
 
@@ -231,7 +255,8 @@ test_quota() {
 test_http_example() {
   local _cluster=$1 _role=$2 _env=$3
   local _base_config=$4 _updated_config=$5
-  local _job=$6
+  local _bad_healthcheck_config=$6
+  local _job=$7
   local _jobkey="$_cluster/$_role/$_env/$_job"
 
   test_config $_base_config $_jobkey
@@ -242,6 +267,9 @@ test_http_example() {
   test_observer_ui $_cluster $_role $_job
   test_restart $_jobkey
   test_update $_jobkey $_updated_config $_cluster
+  test_update_fail $_jobkey $_base_config  $_cluster $_bad_healthcheck_config
+  # Running test_update second time to change state to success.
+  test_update $_jobkey $_updated_config $_cluster
   test_announce $_role $_env $_job
   test_run $_jobkey
   test_legacy_update $_jobkey $_base_config
@@ -252,7 +280,7 @@ test_http_example() {
 test_http_revocable_example() {
   local _cluster=$1 _role=$2 _env=$3
   local _base_config=$4
-  local _job=$6
+  local _job=$7
   local _jobkey="$_cluster/$_role/$_env/$_job"
 
   test_create $_jobkey $_base_config
@@ -274,7 +302,7 @@ restore_netrc() {
 test_basic_auth_unauthenticated() {
   local _cluster=$1 _role=$2 _env=$3
   local _config=$4
-  local _job=$6
+  local _job=$7
   local _jobkey="$_cluster/$_role/$_env/$_job"
 
   mv ~/.netrc ~/.netrc.bak
@@ -301,6 +329,7 @@ TEST_JOB_REVOCABLE=http_example_revocable
 TEST_JOB_DOCKER=http_example_docker
 TEST_CONFIG_FILE=$EXAMPLE_DIR/http_example.aurora
 TEST_CONFIG_UPDATED_FILE=$EXAMPLE_DIR/http_example_updated.aurora
+TEST_BAD_HEALTHCHECK_CONFIG_UPDATED_FILE=$EXAMPLE_DIR/http_example_bad_healthcheck.aurora
 
 BASE_ARGS=(
   $TEST_CLUSTER
@@ -308,6 +337,7 @@ BASE_ARGS=(
   $TEST_ENV
   $TEST_CONFIG_FILE
   $TEST_CONFIG_UPDATED_FILE
+  $TEST_BAD_HEALTHCHECK_CONFIG_UPDATED_FILE
 )
 
 TEST_JOB_ARGS=("${BASE_ARGS[@]}" "$TEST_JOB")

Reply via email to