Repository: ambari Updated Branches: refs/heads/trunk 9bef76ba0 -> 82bea1cbf
AMBARI-16149. Support for LLAP alert in Ambari. Also fixes Hive Metastore alert failure. Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/82bea1cb Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/82bea1cb Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/82bea1cb Branch: refs/heads/trunk Commit: 82bea1cbfde826d13579b516f19e730e051adf72 Parents: 9bef76b Author: Swapan Shridhar <[email protected]> Authored: Wed Apr 27 18:17:41 2016 -0700 Committer: Swapan Shridhar <[email protected]> Committed: Thu Apr 28 02:41:45 2016 -0700 ---------------------------------------------------------------------- .../common-services/HIVE/0.12.0.2.0/alerts.json | 47 ++++ .../package/alerts/alert_hive_metastore.py | 22 +- .../package/alerts/alert_llap_app_status.py | 213 +++++++++++++++++++ .../0.12.0.2.0/package/scripts/params_linux.py | 1 + .../HIVE/configuration/hive-interactive-env.xml | 6 + 5 files changed, 280 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json index 9f0466c..0fad732 100644 --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json @@ -146,6 +146,53 @@ } ] } + }, + { + "name": "llap_application", + "label": "LLAP Application", + "description": "This alert is triggered if the LLAP Application cannot be determined to be up and responding to requests.", + "interval": 3, + "scope": "ANY", + "enabled": true, + "source": { + "type": "SCRIPT", + "path": 
"HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py", + "parameters": [ + { + "name": "check.command.timeout", + "display_name": "Command Timeout", + "value": 15.0, + "type": "NUMERIC", + "description": "The maximum time before check command will be killed by timeout", + "units": "seconds", + "threshold": "CRITICAL" + }, + { + "name": "default.hive.user", + "display_name": "Default HIVE User", + "value": "hive", + "type": "STRING", + "description": "The user that will run the Hive commands if not specified in cluster-env", + "visibility": "HIDDEN" + }, + { + "name": "default.hive.principal", + "display_name": "Default HIVE Principal", + "value": "[email protected]", + "type": "STRING", + "description": "The principal to use when retrieving the kerberos ticket if not specified in cluster-env", + "visibility": "HIDDEN" + }, + { + "name": "default.hive.keytab", + "display_name": "Default HIVE Keytab", + "value": "/etc/security/keytabs/hive.llap.zk.sm.keytab", + "type": "STRING", + "description": "The keytab to use when retrieving the kerberos ticket if not specified in cluster-env.", + "visibility": "HIDDEN" + } + ] + } } ], "WEBHCAT_SERVER": [ http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py index a556410..e02ed5a 100644 --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py @@ -31,9 +31,6 @@ from resource_management.core.resources import Execute from ambari_commons.os_check import OSConst from 
ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl -import params - -stack_root = params.stack_root OK_MESSAGE = "Metastore OK - Hive command took {0:.3f}s" CRITICAL_MESSAGE = "Metastore on {0} failed ({1})" @@ -58,10 +55,10 @@ SMOKEUSER_PRINCIPAL_DEFAULT = '[email protected]' SMOKEUSER_SCRIPT_PARAM_KEY = 'default.smoke.user' SMOKEUSER_DEFAULT = 'ambari-qa' -HIVE_CONF_DIR = format("{stack_root}/current/hive-metastore/conf/conf.server") +STACK_ROOT = '{{cluster-env/stack_root}}' + HIVE_CONF_DIR_LEGACY = '/etc/hive/conf.server' -HIVE_BIN_DIR = format("{stack_root}/current/hive-metastore/bin") HIVE_BIN_DIR_LEGACY = '/usr/lib/hive/bin' CHECK_COMMAND_TIMEOUT_KEY = 'check.command.timeout' @@ -69,6 +66,7 @@ CHECK_COMMAND_TIMEOUT_DEFAULT = 60.0 HADOOPUSER_KEY = '{{cluster-env/hadoop.user.name}}' HADOOPUSER_DEFAULT = 'hadoop' + logger = logging.getLogger('ambari_alerts') @OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT) @@ -78,7 +76,8 @@ def get_tokens(): to build the dictionary passed into execute """ return (SECURITY_ENABLED_KEY,SMOKEUSER_KEYTAB_KEY,SMOKEUSER_PRINCIPAL_KEY, - HIVE_METASTORE_URIS_KEY, SMOKEUSER_KEY, KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY) + HIVE_METASTORE_URIS_KEY, SMOKEUSER_KEY, KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY, + STACK_ROOT) @OsFamilyFuncImpl(os_family=OSConst.WINSRV_FAMILY) def get_tokens(): @@ -174,9 +173,14 @@ def execute(configurations={}, parameters={}, host_name=None): conf_dir = HIVE_CONF_DIR_LEGACY bin_dir = HIVE_BIN_DIR_LEGACY - if os.path.exists(HIVE_CONF_DIR): - conf_dir = HIVE_CONF_DIR - bin_dir = HIVE_BIN_DIR + + if STACK_ROOT in configurations: + hive_conf_dir = configurations[STACK_ROOT] + format("/current/hive-metastore/conf/conf.server") + hive_bin_dir = configurations[STACK_ROOT] + format("/current/hive-metastore/bin") + + if os.path.exists(hive_conf_dir): + conf_dir = hive_conf_dir + bin_dir = hive_bin_dir cmd = format("export HIVE_CONF_DIR='{conf_dir}' ; " "hive --hiveconf hive.metastore.uris={metastore_uri}\ 
# Patch context carried over verbatim from the commit notification (kept as
# comments so that no information from the original message is lost):
#   http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py
#   diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py
#   new file mode 100644
#   index 0000000..b18c366
#   --- /dev/null
#   +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py
#   @@ -0,0 +1,213 @@
#!/usr/bin/env python

"""
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import time
import logging
import traceback
import json
import subprocess

from resource_management.libraries.functions import format
from resource_management.libraries.functions import get_kinit_path
from ambari_commons.os_check import OSConst
from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl
from resource_management.core import shell
from resource_management.core.resources import Execute
from resource_management.core import global_lock


OK_MESSAGE = "APP is in : '{0}' state. Check took {1:.3f}s"
MESSAGE_WITH_STATE_AND_INSTANCES = "APP is in : '{0}' state. Instances 'live' : {1}, 'desired' : {2}. Check took {3:.3f}s"
CRITICAL_MESSAGE_WITH_STATE = "APP is in : '{0}' state. Check took {1:.3f}s"
CRITICAL_MESSAGE = "APP information couldn't be retrieved. Check took {0:.3f}s"

# results codes
CRITICAL_RESULT_CODE = 'CRITICAL'
OK_RESULT_CODE = 'OK'
UKNOWN_STATUS_CODE = 'UNKNOWN'


SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'

# NOTE(review): the three *_DEFAULT constants below hold alert *parameter names*
# rather than usable values. They are left untouched and are only used as the
# last-resort fallback (the previous behaviour); the script parameters are now
# consulted first through the *_SCRIPT_PARAM_KEY names.
HIVE_PRINCIPAL_KEY = '{{hive-interactive-site/hive.llap.zk.sm.principal}}'
HIVE_PRINCIPAL_SCRIPT_PARAM_KEY = 'default.hive.principal'
HIVE_PRINCIPAL_DEFAULT = 'default.hive.principal'

HIVE_PRINCIPAL_KEYTAB_KEY = '{{hive-interactive-site/hive.llap.zk.sm.keytab.file}}'
HIVE_PRINCIPAL_KEYTAB_SCRIPT_PARAM_KEY = 'default.hive.keytab'
HIVE_PRINCIPAL_KEYTAB_DEFAULT = 'default.hive.keytab'

HIVE_AUTHENTICATION_DEFAULT = 'NOSASL'

HIVE_USER_KEY = '{{hive-env/hive_user}}'
HIVE_USER_SCRIPT_PARAM_KEY = 'default.hive.user'
HIVE_USER_DEFAULT = 'default.smoke.user'

STACK_ROOT = '{{cluster-env/stack_root}}'
STACK_ROOT_DEFAULT = "/usr/hdp"

LLAP_APP_NAME_KEY = '{{hive-interactive-env/llap_app_name}}'
LLAP_APP_NAME_DEFAULT = 'llap0'

# The configured Kerberos executable search paths, if any
KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}}'


CHECK_COMMAND_TIMEOUT_KEY = 'check.command.timeout'
CHECK_COMMAND_TIMEOUT_DEFAULT = 15.0

# A RUNNING_PARTIAL app is reported OK when at least this percentage of the
# desired instances is live.
PERCENT_DESIRED_INSTANCES_TO_BE_UP = 80


logger = logging.getLogger('ambari_alerts')


@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
def get_tokens():
  """
  Returns a tuple of tokens in the format {{site/property}} that will be used
  to build the dictionary passed into execute
  """
  return (SECURITY_ENABLED_KEY, KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY, HIVE_PRINCIPAL_KEY, HIVE_PRINCIPAL_KEYTAB_KEY,
          HIVE_USER_KEY, STACK_ROOT, LLAP_APP_NAME_KEY)


def _resolve_setting(configurations, parameters, config_key, param_key, fallback):
  """
  Returns the value stored under config_key in configurations; if absent, the
  value stored under param_key in parameters; if that is absent too, fallback.
  """
  if config_key in configurations:
    return configurations[config_key]
  if param_key in parameters:
    return parameters[param_key]
  return fallback


def _kinit_as_llap_principal(configurations, parameters, hive_user):
  """
  Runs kinit as hive_user with the LLAP keytab and principal while holding the
  global Kerberos lock.
  """
  llap_principal = _resolve_setting(configurations, parameters, HIVE_PRINCIPAL_KEY,
                                    HIVE_PRINCIPAL_SCRIPT_PARAM_KEY, HIVE_PRINCIPAL_DEFAULT)
  llap_keytab = _resolve_setting(configurations, parameters, HIVE_PRINCIPAL_KEYTAB_KEY,
                                 HIVE_PRINCIPAL_KEYTAB_SCRIPT_PARAM_KEY, HIVE_PRINCIPAL_KEYTAB_DEFAULT)

  # Get the configured Kerberos executable search paths, if any
  if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
    kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
  else:
    kerberos_executable_search_paths = None

  kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
  # The placeholders name local variables of this function; do not rename them
  # without updating the template.
  kinitcmd = format("{kinit_path_local} -kt {llap_keytab} {llap_principal}; ")

  # prevent concurrent kinit
  kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
  kinit_lock.acquire()
  try:
    Execute(kinitcmd, user=hive_user,
            path=["/bin/", "/usr/bin/", "/usr/lib/hive/bin/", "/usr/sbin/"],
            timeout=10)
  finally:
    kinit_lock.release()


def _build_result(llap_app_info, total_time):
  """
  Maps the parsed 'llapstatus' output to an alert result.

  Keyword arguments:
  llap_app_info: the object decoded from the status command's JSON output
  total_time (float): seconds spent on the check, used in the label

  Returns a tuple (result code, [label]):
    - UNKNOWN when the output is not a mapping or carries no 'state'
    - OK for state RUNNING_ALL
    - for state RUNNING_PARTIAL: OK when the live/desired ratio reaches
      PERCENT_DESIRED_INSTANCES_TO_BE_UP, otherwise CRITICAL
    - CRITICAL for every other state
  """
  if not isinstance(llap_app_info, dict) or 'state' not in llap_app_info:
    # No exception is active at this point, so a traceback would carry no
    # information; report the dedicated message instead.
    return (UKNOWN_STATUS_CODE, [CRITICAL_MESSAGE.format(total_time)])

  state = llap_app_info['state']
  normalized_state = state.upper()

  if normalized_state == 'RUNNING_ALL':
    return (OK_RESULT_CODE, [OK_MESSAGE.format(state, total_time)])

  if normalized_state != 'RUNNING_PARTIAL':
    return (CRITICAL_RESULT_CODE, [CRITICAL_MESSAGE_WITH_STATE.format(state, total_time)])

  # Get 'live' and 'desired' instances
  if 'liveInstances' not in llap_app_info or 'desiredInstances' not in llap_app_info:
    return (CRITICAL_RESULT_CODE, [CRITICAL_MESSAGE_WITH_STATE.format(state, total_time)])

  live_instances = llap_app_info['liveInstances']
  desired_instances = llap_app_info['desiredInstances']
  # The template has four placeholders; all four values must be supplied.
  alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(state, live_instances, desired_instances, total_time)

  if live_instances < 0 or desired_instances <= 0:
    return (CRITICAL_RESULT_CODE, [alert_label])

  percent_instances_up = float(live_instances) / desired_instances * 100
  if percent_instances_up >= PERCENT_DESIRED_INSTANCES_TO_BE_UP:
    return (OK_RESULT_CODE, [alert_label])
  return (CRITICAL_RESULT_CODE, [alert_label])


@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Runs 'hive --service llapstatus --name <app name>' as the Hive user (after a
  kinit when security is enabled) and translates the reported state into an
  alert result. Any exception raised on the way yields UNKNOWN with the
  traceback as the label.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  # The dictionary defaults are part of the existing signature; neither
  # argument is mutated here, so sharing them across calls is harmless.
  if configurations is None:
    return ('UNKNOWN', ['There were no configurations supplied to the script.'])

  if parameters is None:
    parameters = {}

  try:
    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
      security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    # The timeout is a script parameter, so it has to be looked up in
    # 'parameters' (it never appears in 'configurations').
    check_command_timeout = CHECK_COMMAND_TIMEOUT_DEFAULT
    if CHECK_COMMAND_TIMEOUT_KEY in parameters:
      check_command_timeout = float(parameters[CHECK_COMMAND_TIMEOUT_KEY])

    hive_user = _resolve_setting(configurations, parameters, HIVE_USER_KEY,
                                 HIVE_USER_SCRIPT_PARAM_KEY, HIVE_USER_DEFAULT)

    llap_app_name = LLAP_APP_NAME_DEFAULT
    if LLAP_APP_NAME_KEY in configurations:
      llap_app_name = configurations[LLAP_APP_NAME_KEY]

    stack_root = STACK_ROOT_DEFAULT
    if STACK_ROOT in configurations:
      stack_root = configurations[STACK_ROOT]

    if security_enabled:
      _kinit_as_llap_principal(configurations, parameters, hive_user)

    start_time = time.time()
    # The stack root is concatenated rather than templated so that its value
    # is never interpreted as a format string.
    llap_status_cmd = stack_root + format("/current/hive-server2-hive2/bin/hive --service llapstatus --name {llap_app_name}")

    _code, output, _error = shell.checked_call(llap_status_cmd, user=hive_user, stderr=subprocess.PIPE,
                                               timeout=check_command_timeout,
                                               logoutput=False)
    llap_app_info = json.loads(output)
    total_time = time.time() - start_time

    return _build_result(llap_app_info, total_time)
  except Exception:
    return (UKNOWN_STATUS_CODE, [traceback.format_exc()])

# Patch context carried over verbatim from the commit notification (kept as
# comments so that no information from the original message is lost):
#   \ No newline at end of file
#   http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
#   diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
#   index 22e1b55..a4f5378 100644
#   --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
#   +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
#   @@ -559,6 +559,7 @@ if has_hive_interactive:
#      llap_log_level = config['configurations']['hive-interactive-env']['llap_log_level']
#      hive_llap_io_mem_size = config['configurations']['hive-interactive-site']['hive.llap.io.memory.size']
#      llap_heap_size = config['configurations']['hive-interactive-env']['llap_heap_size']
#   +  llap_app_name = config['configurations']['hive-interactive-env']['llap_app_name']
#      if security_enabled:
#        hive_llap_keytab_file = config['configurations']['hive-interactive-site']['hive.llap.zk.sm.keytab.file']
#        hive_headless_keytab = config['configurations']['hive-interactive-site']['hive.llap.zk.sm.principal']
http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml b/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml index aad9c47..a4d39e1 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml +++ b/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml @@ -184,6 +184,12 @@ <description>LLAP app logging level</description> <display-name>LLAP app logging level</display-name> </property> + <property> + <name>llap_app_name</name> + <value>llap0</value> + <description>LLAP app name</description> + <display-name>LLAP app name</display-name> + </property> <!-- hive-env.sh --> <property>
