Repository: ambari
Updated Branches:
  refs/heads/branch-2.4 cd71f8d9a -> c2e741e52
AMBARI-16910. Hive Server Interactive. Change the timeout to 120 secs for LLAP alert command.

Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/04e23a4c
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/04e23a4c
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/04e23a4c

Branch: refs/heads/branch-2.4
Commit: 04e23a4cfa60528deb164028ebafb06fdcdadbde
Parents: cd71f8d
Author: Swapan Shridhar <[email protected]>
Authored: Thu May 26 19:59:34 2016 -0700
Committer: Swapan Shridhar <[email protected]>
Committed: Thu May 26 20:00:52 2016 -0700

----------------------------------------------------------------------
 .../common-services/HIVE/0.12.0.2.0/alerts.json | 2 +-
 .../package/alerts/alert_llap_app_status.py | 41 ++++++++++++--------
 2 files changed, 25 insertions(+), 18 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/ambari/blob/04e23a4c/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
index 0fad732..bffc030 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
@@ -161,7 +161,7 @@
   {
     "name": "check.command.timeout",
     "display_name": "Command Timeout",
-    "value": 15.0,
+    "value": 120.0,
     "type": "NUMERIC",
     "description": "The maximum time before check command will be killed by timeout",
     "units": "seconds",

http://git-wip-us.apache.org/repos/asf/ambari/blob/04e23a4c/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py index b18c366..12c5c19 100644 --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py @@ -33,10 +33,10 @@ from resource_management.core.resources import Execute from resource_management.core import global_lock -OK_MESSAGE = "APP is in : '{0}' state. Check took {1:.3f}s" -MESSAGE_WITH_STATE_AND_INSTANCES = "APP is in : '{0}' state. Instances 'live' : {1}, 'desired' : {2}. Check took {3:.3f}s" -CRITICAL_MESSAGE_WITH_STATE = "APP is in : '{0}' state. Check took {1:.3f}s" -CRITICAL_MESSAGE = "APP information couldn't be retrieved. Check took {0:.3f}s" +OK_MESSAGE = "The application reported a '{0}' state in {1:.3f}s" +MESSAGE_WITH_STATE_AND_INSTANCES = "The application reported a '{0}' state in {1:.3f}s. [Live: {2}, Desired: {3}]" +CRITICAL_MESSAGE_WITH_STATE = "The application reported a '{0}' state. Check took {1:.3f}s" +CRITICAL_MESSAGE = "Application information could not be retrieved" # results codes CRITICAL_RESULT_CODE = 'CRITICAL' @@ -68,9 +68,15 @@ KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}} CHECK_COMMAND_TIMEOUT_KEY = 'check.command.timeout' -CHECK_COMMAND_TIMEOUT_DEFAULT = 15.0 +CHECK_COMMAND_TIMEOUT_DEFAULT = 120.0 +# Mapping of LLAP app states to 'user friendly' state names. 
+llap_app_state_dict = {'RUNNING_ALL': 'RUNNING', + 'RUNNING_PARTIAL': 'RUNNING', + 'COMPLETE': 'NOT RUNNING', + 'LAUNCHING': 'LAUNCHING', + 'APP_NOT_FOUND': 'APP NOT FOUND'} logger = logging.getLogger('ambari_alerts') @@ -163,11 +169,12 @@ def execute(configurations={}, parameters={}, host_name=None): result_code = UKNOWN_STATUS_CODE return (result_code, [alert_label]) - if llap_app_info['state'].upper() in ['RUNNING_ALL']: + retrieved_llap_app_state = llap_app_info['state'].upper() + if retrieved_llap_app_state in ['RUNNING_ALL']: result_code = OK_RESULT_CODE total_time = time.time() - start_time - alert_label = OK_MESSAGE.format(llap_app_info['state'], total_time) - elif llap_app_info['state'].upper() in ['RUNNING_PARTIAL']: + alert_label = OK_MESSAGE.format(llap_app_state_dict.get(retrieved_llap_app_state, retrieved_llap_app_state), total_time) + elif retrieved_llap_app_state in ['RUNNING_PARTIAL']: live_instances = 0 desired_instances = 0 percentInstancesUp = 0 @@ -176,7 +183,7 @@ def execute(configurations={}, parameters={}, host_name=None): if 'liveInstances' not in llap_app_info or 'desiredInstances' not in llap_app_info: result_code = CRITICAL_RESULT_CODE total_time = time.time() - start_time - alert_label = CRITICAL_MESSAGE_WITH_STATE.format(llap_app_info['state'], total_time) + alert_label = CRITICAL_MESSAGE_WITH_STATE.format(llap_app_state_dict.get(retrieved_llap_app_state, retrieved_llap_app_state), total_time) return (result_code, [alert_label]) live_instances = llap_app_info['liveInstances'] @@ -184,28 +191,28 @@ def execute(configurations={}, parameters={}, host_name=None): if live_instances < 0 or desired_instances <= 0: result_code = CRITICAL_RESULT_CODE total_time = time.time() - start_time - alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(llap_app_info['state'], total_time) + alert_label = CRITICAL_MESSAGE_WITH_STATE.format(llap_app_state_dict.get(retrieved_llap_app_state, retrieved_llap_app_state), total_time) return (result_code, 
[alert_label]) percentInstancesUp = float(live_instances) / desired_instances * 100 if percentInstancesUp >= percent_desired_instances_to_be_up: result_code = OK_RESULT_CODE total_time = time.time() - start_time - alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(llap_app_info['state'], + alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(llap_app_state_dict.get(retrieved_llap_app_state, retrieved_llap_app_state), + total_time, llap_app_info['liveInstances'], - llap_app_info['desiredInstances'], - total_time) + llap_app_info['desiredInstances']) else: result_code = CRITICAL_RESULT_CODE total_time = time.time() - start_time - alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(llap_app_info['state'], + alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(llap_app_state_dict.get(retrieved_llap_app_state, retrieved_llap_app_state), + total_time, llap_app_info['liveInstances'], - llap_app_info['desiredInstances'], - total_time) + llap_app_info['desiredInstances']) else: result_code = CRITICAL_RESULT_CODE total_time = time.time() - start_time - alert_label = CRITICAL_MESSAGE_WITH_STATE.format(llap_app_info['state'], total_time) + alert_label = CRITICAL_MESSAGE_WITH_STATE.format(llap_app_state_dict.get(retrieved_llap_app_state, retrieved_llap_app_state), total_time) except: alert_label = traceback.format_exc() traceback.format_exc()
