AMBARI-8755. Oozie server check alert fails in secured mode (aonishuk)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/ec37c603 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/ec37c603 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/ec37c603 Branch: refs/heads/trunk Commit: ec37c603c92969251f6c89318d403452f555b7ba Parents: 79cffa1 Author: Andrew Onishuk <[email protected]> Authored: Thu Dec 25 16:28:34 2014 +0200 Committer: Andrew Onishuk <[email protected]> Committed: Thu Dec 25 16:28:34 2014 +0200 ---------------------------------------------------------------------- .../python/ambari_agent/alerts/script_alert.py | 43 +++-- .../src/test/python/ambari_agent/TestAlerts.py | 4 +- .../common-services/AMS/0.1.0/alerts.json | 2 +- .../alerts/alert_ambari_metrics_monitor.py | 80 +++++++++ .../files/alert_ambari_metrics_monitor.py | 80 --------- .../common-services/FLUME/1.4.0.2.0/alerts.json | 2 +- .../package/alerts/alert_flume_agent_status.py | 99 +++++++++++ .../package/files/alert_flume_agent_status.py | 99 ----------- .../common-services/HDFS/2.1.0.2.0/alerts.json | 4 +- .../package/alerts/alert_checkpoint_time.py | 136 +++++++++++++++ .../package/alerts/alert_ha_namenode_health.py | 166 +++++++++++++++++++ .../package/files/alert_checkpoint_time.py | 136 --------------- .../package/files/alert_ha_namenode_health.py | 166 ------------------- .../common-services/HIVE/0.12.0.2.0/alerts.json | 4 +- .../package/alerts/alert_hive_thrift_port.py | 124 ++++++++++++++ .../package/alerts/alert_webhcat_server.py | 111 +++++++++++++ .../package/files/alert_hive_thrift_port.py | 127 -------------- .../package/files/alert_webhcat_server.py | 111 ------------- .../common-services/OOZIE/4.0.0.2.0/alerts.json | 2 +- .../package/alerts/alert_check_oozie_server.py | 81 +++++++++ .../package/files/alert_check_oozie_server.py | 74 --------- .../stacks/HDP/1.3.2/services/HIVE/alerts.json | 4 +- .../package/alerts/alert_hive_thrift_port.py | 124 ++++++++++++++ 
.../HIVE/package/alerts/alert_webhcat_server.py | 111 +++++++++++++ .../package/files/alert_hive_thrift_port.py | 127 -------------- .../HIVE/package/files/alert_webhcat_server.py | 111 ------------- .../HDP/1.3.2/services/MAPREDUCE/alerts.json | 2 +- .../alerts/alert_mapreduce_directory_space.py | 93 +++++++++++ .../files/alert_mapreduce_directory_space.py | 95 ----------- .../stacks/HDP/1.3.2/services/OOZIE/alerts.json | 2 +- .../package/alerts/alert_check_oozie_server.py | 81 +++++++++ .../package/files/alert_check_oozie_server.py | 74 --------- .../stacks/HDP/2.0.6/services/YARN/alerts.json | 2 +- .../package/alerts/alert_nodemanager_health.py | 123 ++++++++++++++ .../package/files/alert_nodemanager_health.py | 123 -------------- 35 files changed, 1371 insertions(+), 1352 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py b/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py index f39ab6f..660bddf 100644 --- a/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py +++ b/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py @@ -21,7 +21,9 @@ limitations under the License. 
import imp import logging import os +import re from alerts.base_alert import BaseAlert +from resource_management.core.environment import Environment from symbol import parameters logger = logging.getLogger() @@ -43,6 +45,7 @@ class ScriptAlert(BaseAlert): self.stacks_dir = None self.common_services_dir = None self.host_scripts_dir = None + self.path_to_script = None if 'path' in alert_source_meta: self.path = alert_source_meta['path'] @@ -81,8 +84,16 @@ class ScriptAlert(BaseAlert): parameters = {} for key in self.config_value_dict: parameters['{{' + key + '}}'] = self.config_value_dict[key] - - return cmd_module.execute(parameters, self.host_name) + + # try to get basedir for scripts + # it's needed for server side scripts to properly use resource management + matchObj = re.match( r'((.*)services\/(.*)\/package\/)', self.path_to_script) + if matchObj: + basedir = matchObj.group(1) + with Environment(basedir) as env: + return cmd_module.execute(parameters, self.host_name) + else: + return cmd_module.execute(parameters, self.host_name) else: return (self.RESULT_UNKNOWN, ["Unable to execute script {0}".format(self.path)]) @@ -92,35 +103,35 @@ class ScriptAlert(BaseAlert): raise Exception("The attribute 'path' must be specified") paths = self.path.split('/') - path_to_script = self.path + self.path_to_script = self.path # if the path doesn't exist and stacks dir is defined, try that - if not os.path.exists(path_to_script) and self.stacks_dir is not None: - path_to_script = os.path.join(self.stacks_dir, *paths) + if not os.path.exists(self.path_to_script) and self.stacks_dir is not None: + self.path_to_script = os.path.join(self.stacks_dir, *paths) # if the path doesn't exist and common services dir is defined, try that - if not os.path.exists(path_to_script) and self.common_services_dir is not None: - path_to_script = os.path.join(self.common_services_dir, *paths) + if not os.path.exists(self.path_to_script) and self.common_services_dir is not None: + 
self.path_to_script = os.path.join(self.common_services_dir, *paths) # if the path doesn't exist and the host script dir is defined, try that - if not os.path.exists(path_to_script) and self.host_scripts_dir is not None: - path_to_script = os.path.join(self.host_scripts_dir, *paths) + if not os.path.exists(self.path_to_script) and self.host_scripts_dir is not None: + self.path_to_script = os.path.join(self.host_scripts_dir, *paths) # if the path can't be evaluated, throw exception - if not os.path.exists(path_to_script) or not os.path.isfile(path_to_script): + if not os.path.exists(self.path_to_script) or not os.path.isfile(self.path_to_script): raise Exception( "Unable to find '{0}' as an absolute path or part of {1} or {2}".format(self.path, self.stacks_dir, self.host_scripts_dir)) if logger.isEnabledFor(logging.DEBUG): - logger.debug("Executing script check {0}".format(path_to_script)) + logger.debug("Executing script check {0}".format(self.path_to_script)) - if (not path_to_script.endswith('.py')): - logger.error("Unable to execute script {0}".format(path_to_script)) + if (not self.path_to_script.endswith('.py')): + logger.error("Unable to execute script {0}".format(self.path_to_script)) return None - - return imp.load_source(self._find_value('name'), path_to_script) + + return imp.load_source(self._find_value('name'), self.path_to_script) def _get_reporting_text(self, state): @@ -132,4 +143,4 @@ class ScriptAlert(BaseAlert): :param state: the state of the alert in uppercase (such as OK, WARNING, etc) :return: the parameterized text ''' - return '{0}' \ No newline at end of file + return '{0}' http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-agent/src/test/python/ambari_agent/TestAlerts.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py index 34911b5..0ac1e00 100644 --- 
a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py +++ b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py @@ -21,6 +21,7 @@ limitations under the License. import os import socket import sys +import re from ambari_agent.AlertSchedulerHandler import AlertSchedulerHandler from ambari_agent.alerts.collector import AlertCollector @@ -31,7 +32,7 @@ from ambari_agent.alerts.web_alert import WebAlert from ambari_agent.apscheduler.scheduler import Scheduler from collections import namedtuple -from mock.mock import patch +from mock.mock import MagicMock, patch from unittest import TestCase class TestAlerts(TestCase): @@ -195,6 +196,7 @@ class TestAlerts(TestCase): pa.collect() + @patch.object(re, 'match', new = MagicMock()) def test_script_alert(self): json = { "name": "namenode_process", http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/AMS/0.1.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/AMS/0.1.0/alerts.json b/ambari-server/src/main/resources/common-services/AMS/0.1.0/alerts.json index 700f021..93b224a 100644 --- a/ambari-server/src/main/resources/common-services/AMS/0.1.0/alerts.json +++ b/ambari-server/src/main/resources/common-services/AMS/0.1.0/alerts.json @@ -147,7 +147,7 @@ "scope": "ANY", "source": { "type": "SCRIPT", - "path": "AMS/0.1.0/package/files/alert_ambari_metrics_monitor.py" + "path": "AMS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py" } } ] http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/AMS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/AMS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py 
b/ambari-server/src/main/resources/common-services/AMS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py new file mode 100644 index 0000000..5841267 --- /dev/null +++ b/ambari-server/src/main/resources/common-services/AMS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import socket + +from resource_management.libraries.functions.check_process_status import check_process_status +from resource_management.core.exceptions import ComponentIsNotRunning + +RESULT_CODE_OK = 'OK' +RESULT_CODE_CRITICAL = 'CRITICAL' +RESULT_CODE_UNKNOWN = 'UNKNOWN' + +AMS_MONITOR_PID_PATH = '/var/run/ambari-metrics-monitor/ambari-metrics-monitor.pid' + + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return () + + +def is_monitor_process_live(pid_file): + """ + Gets whether the AMS monitor represented by the specified file is running. 
+ :param pid_file: the PID file of the monitor to check + :return: True if the monitor is running, False otherwise + """ + live = False + + try: + check_process_status(pid_file) + live = True + except ComponentIsNotRunning: + pass + + return live + + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of parameter key to value + host_name (string): the name of this host where the alert is running + """ + + if parameters is None: + return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.']) + + if host_name is None: + host_name = socket.getfqdn() + + ams_monitor_process_running = is_monitor_process_live(AMS_MONITOR_PID_PATH) + + alert_state = RESULT_CODE_OK if ams_monitor_process_running else RESULT_CODE_CRITICAL + + alert_label = 'Ambari Monitor is running on {0}' if ams_monitor_process_running else 'Ambari Monitor is NOT running on {0}' + alert_label = alert_label.format(host_name) + + return (alert_state, [alert_label]) http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/AMS/0.1.0/package/files/alert_ambari_metrics_monitor.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/AMS/0.1.0/package/files/alert_ambari_metrics_monitor.py b/ambari-server/src/main/resources/common-services/AMS/0.1.0/package/files/alert_ambari_metrics_monitor.py deleted file mode 100644 index 5841267..0000000 --- a/ambari-server/src/main/resources/common-services/AMS/0.1.0/package/files/alert_ambari_metrics_monitor.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python - -""" -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. 
The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import socket - -from resource_management.libraries.functions.check_process_status import check_process_status -from resource_management.core.exceptions import ComponentIsNotRunning - -RESULT_CODE_OK = 'OK' -RESULT_CODE_CRITICAL = 'CRITICAL' -RESULT_CODE_UNKNOWN = 'UNKNOWN' - -AMS_MONITOR_PID_PATH = '/var/run/ambari-metrics-monitor/ambari-metrics-monitor.pid' - - -def get_tokens(): - """ - Returns a tuple of tokens in the format {{site/property}} that will be used - to build the dictionary passed into execute - """ - return () - - -def is_monitor_process_live(pid_file): - """ - Gets whether the AMS monitor represented by the specified file is running. 
- :param pid_file: the PID file of the monitor to check - :return: True if the monitor is running, False otherwise - """ - live = False - - try: - check_process_status(pid_file) - live = True - except ComponentIsNotRunning: - pass - - return live - - -def execute(parameters=None, host_name=None): - """ - Returns a tuple containing the result code and a pre-formatted result label - - Keyword arguments: - parameters (dictionary): a mapping of parameter key to value - host_name (string): the name of this host where the alert is running - """ - - if parameters is None: - return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.']) - - if host_name is None: - host_name = socket.getfqdn() - - ams_monitor_process_running = is_monitor_process_live(AMS_MONITOR_PID_PATH) - - alert_state = RESULT_CODE_OK if ams_monitor_process_running else RESULT_CODE_CRITICAL - - alert_label = 'Ambari Monitor is running on {0}' if ams_monitor_process_running else 'Ambari Monitor is NOT running on {0}' - alert_label = alert_label.format(host_name) - - return (alert_state, [alert_label]) http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/alerts.json index b40afd9..865c471 100644 --- a/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/alerts.json +++ b/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/alerts.json @@ -10,7 +10,7 @@ "scope": "ANY", "source": { "type": "SCRIPT", - "path": "FLUME/1.4.0.2.0/package/files/alert_flume_agent_status.py" + "path": "FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py" } } ] 
http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py b/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py new file mode 100644 index 0000000..b183bbc --- /dev/null +++ b/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import socket + +from resource_management.libraries.functions.flume_agent_helper import find_expected_agent_names +from resource_management.libraries.functions.flume_agent_helper import get_flume_status + +RESULT_CODE_OK = 'OK' +RESULT_CODE_CRITICAL = 'CRITICAL' +RESULT_CODE_UNKNOWN = 'UNKNOWN' + +FLUME_CONF_DIR_KEY = '{{flume-env/flume_conf_dir}}' + +FLUME_RUN_DIR = '/var/run/flume' + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (FLUME_CONF_DIR_KEY,) + + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of parameter key to value + host_name (string): the name of this host where the alert is running + """ + + if parameters is None: + return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.']) + + flume_conf_directory = None + if FLUME_CONF_DIR_KEY in parameters: + flume_conf_directory = parameters[FLUME_CONF_DIR_KEY] + + if flume_conf_directory is None: + return (RESULT_CODE_UNKNOWN, ['The Flume configuration directory is a required parameter.']) + + if host_name is None: + host_name = socket.getfqdn() + + processes = get_flume_status(flume_conf_directory, FLUME_RUN_DIR) + expected_agents = find_expected_agent_names(flume_conf_directory) + + alert_label = '' + alert_state = RESULT_CODE_OK + + if len(processes) == 0 and len(expected_agents) == 0: + alert_label = 'No agents defined on {0}'.format(host_name) + else: + ok = [] + critical = [] + text_arr = [] + + for process in processes: + if not process.has_key('status') or process['status'] == 'NOT_RUNNING': + critical.append(process['name']) + else: + ok.append(process['name']) + + if len(critical) > 0: + text_arr.append("{0} {1} NOT running".format(", ".join(critical), + "is" if len(critical) == 1 else "are")) + + if len(ok) > 0: 
+ text_arr.append("{0} {1} running".format(", ".join(ok), + "is" if len(ok) == 1 else "are")) + + plural = len(critical) > 1 or len(ok) > 1 + alert_label = "Agent{0} {1} {2}".format( + "s" if plural else "", + " and ".join(text_arr), + "on " + host_name) + + alert_state = RESULT_CODE_CRITICAL if len(critical) > 0 else RESULT_CODE_OK + + return (alert_state, [alert_label]) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/files/alert_flume_agent_status.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/files/alert_flume_agent_status.py b/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/files/alert_flume_agent_status.py deleted file mode 100644 index b183bbc..0000000 --- a/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/files/alert_flume_agent_status.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python - -""" -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" - -import socket - -from resource_management.libraries.functions.flume_agent_helper import find_expected_agent_names -from resource_management.libraries.functions.flume_agent_helper import get_flume_status - -RESULT_CODE_OK = 'OK' -RESULT_CODE_CRITICAL = 'CRITICAL' -RESULT_CODE_UNKNOWN = 'UNKNOWN' - -FLUME_CONF_DIR_KEY = '{{flume-env/flume_conf_dir}}' - -FLUME_RUN_DIR = '/var/run/flume' - -def get_tokens(): - """ - Returns a tuple of tokens in the format {{site/property}} that will be used - to build the dictionary passed into execute - """ - return (FLUME_CONF_DIR_KEY,) - - -def execute(parameters=None, host_name=None): - """ - Returns a tuple containing the result code and a pre-formatted result label - - Keyword arguments: - parameters (dictionary): a mapping of parameter key to value - host_name (string): the name of this host where the alert is running - """ - - if parameters is None: - return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.']) - - flume_conf_directory = None - if FLUME_CONF_DIR_KEY in parameters: - flume_conf_directory = parameters[FLUME_CONF_DIR_KEY] - - if flume_conf_directory is None: - return (RESULT_CODE_UNKNOWN, ['The Flume configuration directory is a required parameter.']) - - if host_name is None: - host_name = socket.getfqdn() - - processes = get_flume_status(flume_conf_directory, FLUME_RUN_DIR) - expected_agents = find_expected_agent_names(flume_conf_directory) - - alert_label = '' - alert_state = RESULT_CODE_OK - - if len(processes) == 0 and len(expected_agents) == 0: - alert_label = 'No agents defined on {0}'.format(host_name) - else: - ok = [] - critical = [] - text_arr = [] - - for process in processes: - if not process.has_key('status') or process['status'] == 'NOT_RUNNING': - critical.append(process['name']) - else: - ok.append(process['name']) - - if len(critical) > 0: - text_arr.append("{0} {1} NOT running".format(", ".join(critical), - "is" if len(critical) == 1 else "are")) - - if len(ok) > 0: 
- text_arr.append("{0} {1} running".format(", ".join(ok), - "is" if len(ok) == 1 else "are")) - - plural = len(critical) > 1 or len(ok) > 1 - alert_label = "Agent{0} {1} {2}".format( - "s" if plural else "", - " and ".join(text_arr), - "on " + host_name) - - alert_state = RESULT_CODE_CRITICAL if len(critical) > 0 else RESULT_CODE_OK - - return (alert_state, [alert_label]) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json index e2db2f2..8de4b3a 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json @@ -330,7 +330,7 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "HDFS/2.1.0.2.0/package/files/alert_checkpoint_time.py" + "path": "HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py" } }, { @@ -343,7 +343,7 @@ "ignore_host": true, "source": { "type": "SCRIPT", - "path": "HDFS/2.1.0.2.0/package/files/alert_ha_namenode_health.py" + "path": "HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py" } } ], http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py new file mode 100644 index 0000000..410608f --- /dev/null +++ 
b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import time +import urllib2 +import json + +LABEL = 'Last Checkpoint: [{h} hours, {m} minutes, {tx} transactions]' + +NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}' +NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}' +NN_HTTP_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}' +NN_CHECKPOINT_TX_KEY = '{{hdfs-site/dfs.namenode.checkpoint.txns}}' +NN_CHECKPOINT_PERIOD_KEY = '{{hdfs-site/dfs.namenode.checkpoint.period}}' + +PERCENT_WARNING = 200 +PERCENT_CRITICAL = 200 + +CHECKPOINT_TX_DEFAULT = 1000000 +CHECKPOINT_PERIOD_DEFAULT = 21600 + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (NN_HTTP_ADDRESS_KEY, NN_HTTPS_ADDRESS_KEY, NN_HTTP_POLICY_KEY, + NN_CHECKPOINT_TX_KEY, NN_CHECKPOINT_PERIOD_KEY) + + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of parameter key to value + 
host_name (string): the name of this host where the alert is running + """ + + if parameters is None: + return (('UNKNOWN', ['There were no parameters supplied to the script.'])) + + uri = None + scheme = 'http' + http_uri = None + https_uri = None + http_policy = 'HTTP_ONLY' + percent_warning = PERCENT_WARNING + percent_critical = PERCENT_CRITICAL + checkpoint_tx = CHECKPOINT_TX_DEFAULT + checkpoint_period = CHECKPOINT_PERIOD_DEFAULT + + if NN_HTTP_ADDRESS_KEY in parameters: + http_uri = parameters[NN_HTTP_ADDRESS_KEY] + + if NN_HTTPS_ADDRESS_KEY in parameters: + https_uri = parameters[NN_HTTPS_ADDRESS_KEY] + + if NN_HTTP_POLICY_KEY in parameters: + http_policy = parameters[NN_HTTP_POLICY_KEY] + + if NN_CHECKPOINT_TX_KEY in parameters: + checkpoint_tx = parameters[NN_CHECKPOINT_TX_KEY] + + if NN_CHECKPOINT_PERIOD_KEY in parameters: + checkpoint_period = parameters[NN_CHECKPOINT_PERIOD_KEY] + + # determine the right URI and whether to use SSL + uri = http_uri + if http_policy == 'HTTPS_ONLY': + scheme = 'https' + + if https_uri is not None: + uri = https_uri + + current_time = int(round(time.time() * 1000)) + + last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme,uri) + journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme,uri) + + # start out assuming an OK status + label = None + result_code = "OK" + + try: + last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,"LastCheckpointTime")) + journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,"JournalTransactionInfo") + journal_transaction_info_dict = json.loads(journal_transaction_info) + + last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId']) + most_recent_tx = int(journal_transaction_info_dict['MostRecentCheckpointTxId']) + transaction_difference = last_tx - most_recent_tx + + delta = (current_time - last_checkpoint_time)/1000 + + label = 
LABEL.format(h=get_time(delta)['h'], m=get_time(delta)['m'], tx=transaction_difference) + + if (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period)*100 >= int(percent_critical)): + result_code = 'CRITICAL' + elif (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period)*100 >= int(percent_warning)): + result_code = 'WARNING' + + except Exception, e: + label = str(e) + result_code = 'UNKNOWN' + + return ((result_code, [label])) + +def get_time(delta): + h = int(delta/3600) + m = int((delta % 3600)/60) + return {'h':h, 'm':m} + + +def get_value_from_jmx(qry, property): + response = urllib2.urlopen(qry) + data=response.read() + data_dict = json.loads(data) + return data_dict["beans"][0][property] http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py new file mode 100644 index 0000000..fc1541d --- /dev/null +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import urllib2 +import json + +RESULT_STATE_OK = 'OK' +RESULT_STATE_CRITICAL = 'CRITICAL' +RESULT_STATE_UNKNOWN = 'UNKNOWN' +RESULT_STATE_SKIPPED = 'SKIPPED' + +HDFS_NN_STATE_ACTIVE = 'active' +HDFS_NN_STATE_STANDBY = 'standby' + +HDFS_SITE_KEY = '{{hdfs-site}}' +NAMESERVICE_KEY = '{{hdfs-site/dfs.nameservices}}' +NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}' +NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}' +DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}' + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (HDFS_SITE_KEY, NAMESERVICE_KEY, NN_HTTP_ADDRESS_KEY, + NN_HTTPS_ADDRESS_KEY, DFS_POLICY_KEY) + + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of parameter key to value + host_name (string): the name of this host where the alert is running + """ + if parameters is None: + return (RESULT_STATE_UNKNOWN, ['There were no parameters supplied to the script.']) + + # if not in HA mode, then SKIP + if not NAMESERVICE_KEY in parameters: + return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled']) + + # hdfs-site is required + if not HDFS_SITE_KEY in parameters: + return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)]) + + # determine whether or not SSL is enabled + is_ssl_enabled = False + if DFS_POLICY_KEY in parameters: + dfs_policy 
= parameters[DFS_POLICY_KEY] + if dfs_policy == "HTTPS_ONLY": + is_ssl_enabled = True + + name_service = parameters[NAMESERVICE_KEY] + hdfs_site = parameters[HDFS_SITE_KEY] + + # look for dfs.ha.namenodes.foo + nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service + if not nn_unique_ids_key in hdfs_site: + return (RESULT_STATE_UNKNOWN, ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)]) + + namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}' + jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus" + + if is_ssl_enabled: + namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}' + jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus" + + + active_namenodes = [] + standby_namenodes = [] + unknown_namenodes = [] + + # now we have something like 'nn1,nn2,nn3,nn4' + # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id] + # ie dfs.namenode.http-address.hacluster.nn1 + nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',') + for nn_unique_id in nn_unique_ids: + key = namenode_http_fragment.format(name_service,nn_unique_id) + + if key in hdfs_site: + # use str() to ensure that unicode strings do not have the u' in them + value = str(hdfs_site[key]) + + try: + jmx_uri = jmx_uri_fragment.format(value) + state = get_value_from_jmx(jmx_uri,'State') + + if state == HDFS_NN_STATE_ACTIVE: + active_namenodes.append(value) + elif state == HDFS_NN_STATE_STANDBY: + standby_namenodes.append(value) + else: + unknown_namenodes.append(value) + except: + unknown_namenodes.append(value) + + # now that the request is done, determine if this host is the host that + # should report the status of the HA topology + is_active_namenode = False + for active_namenode in active_namenodes: + if active_namenode.startswith(host_name): + is_active_namenode = True + + # there's only one scenario here; there is exactly 1 active and 1 standby + is_topology_healthy = len(active_namenodes) 
== 1 and len(standby_namenodes) == 1 + + result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(str(active_namenodes), + str(standby_namenodes), str(unknown_namenodes)) + + # Healthy Topology: + # - Active NN reports the alert, standby does not + # + # Unhealthy Topology: + # - Report the alert if this is the first named host + # - Report the alert if not the first named host, but the other host + # could not report its status + if is_topology_healthy: + if is_active_namenode is True: + return (RESULT_STATE_OK, [result_label]) + else: + return (RESULT_STATE_SKIPPED, ['Another host will report this alert']) + else: + # dfs.namenode.rpc-address.service.alias is guaranteed in HA mode + first_listed_host_key = 'dfs.namenode.rpc-address.{0}.{1}'.format( + name_service, nn_unique_ids[0]) + + first_listed_host = '' + if first_listed_host_key in hdfs_site: + first_listed_host = hdfs_site[first_listed_host_key] + + is_first_listed_host = False + if first_listed_host.startswith(host_name): + is_first_listed_host = True + + if is_first_listed_host: + return (RESULT_STATE_CRITICAL, [result_label]) + else: + # not the first listed host, but the first host might be in the unknown + return (RESULT_STATE_SKIPPED, ['Another host will report this alert']) + + +def get_value_from_jmx(qry, property): + response = urllib2.urlopen(qry) + data=response.read() + data_dict = json.loads(data) + return data_dict["beans"][0][property] http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/files/alert_checkpoint_time.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/files/alert_checkpoint_time.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/files/alert_checkpoint_time.py deleted file mode 100644 index 410608f..0000000 --- 
a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/files/alert_checkpoint_time.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python - -""" -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import time -import urllib2 -import json - -LABEL = 'Last Checkpoint: [{h} hours, {m} minutes, {tx} transactions]' - -NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}' -NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}' -NN_HTTP_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}' -NN_CHECKPOINT_TX_KEY = '{{hdfs-site/dfs.namenode.checkpoint.txns}}' -NN_CHECKPOINT_PERIOD_KEY = '{{hdfs-site/dfs.namenode.checkpoint.period}}' - -PERCENT_WARNING = 200 -PERCENT_CRITICAL = 200 - -CHECKPOINT_TX_DEFAULT = 1000000 -CHECKPOINT_PERIOD_DEFAULT = 21600 - -def get_tokens(): - """ - Returns a tuple of tokens in the format {{site/property}} that will be used - to build the dictionary passed into execute - """ - return (NN_HTTP_ADDRESS_KEY, NN_HTTPS_ADDRESS_KEY, NN_HTTP_POLICY_KEY, - NN_CHECKPOINT_TX_KEY, NN_CHECKPOINT_PERIOD_KEY) - - -def execute(parameters=None, host_name=None): - """ - Returns a tuple containing the result code and a pre-formatted result label - - Keyword arguments: - parameters (dictionary): a mapping of parameter key 
to value - host_name (string): the name of this host where the alert is running - """ - - if parameters is None: - return (('UNKNOWN', ['There were no parameters supplied to the script.'])) - - uri = None - scheme = 'http' - http_uri = None - https_uri = None - http_policy = 'HTTP_ONLY' - percent_warning = PERCENT_WARNING - percent_critical = PERCENT_CRITICAL - checkpoint_tx = CHECKPOINT_TX_DEFAULT - checkpoint_period = CHECKPOINT_PERIOD_DEFAULT - - if NN_HTTP_ADDRESS_KEY in parameters: - http_uri = parameters[NN_HTTP_ADDRESS_KEY] - - if NN_HTTPS_ADDRESS_KEY in parameters: - https_uri = parameters[NN_HTTPS_ADDRESS_KEY] - - if NN_HTTP_POLICY_KEY in parameters: - http_policy = parameters[NN_HTTP_POLICY_KEY] - - if NN_CHECKPOINT_TX_KEY in parameters: - checkpoint_tx = parameters[NN_CHECKPOINT_TX_KEY] - - if NN_CHECKPOINT_PERIOD_KEY in parameters: - checkpoint_period = parameters[NN_CHECKPOINT_PERIOD_KEY] - - # determine the right URI and whether to use SSL - uri = http_uri - if http_policy == 'HTTPS_ONLY': - scheme = 'https' - - if https_uri is not None: - uri = https_uri - - current_time = int(round(time.time() * 1000)) - - last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme,uri) - journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme,uri) - - # start out assuming an OK status - label = None - result_code = "OK" - - try: - last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,"LastCheckpointTime")) - journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,"JournalTransactionInfo") - journal_transaction_info_dict = json.loads(journal_transaction_info) - - last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId']) - most_recent_tx = int(journal_transaction_info_dict['MostRecentCheckpointTxId']) - transaction_difference = last_tx - most_recent_tx - - delta = (current_time - last_checkpoint_time)/1000 - - label = 
LABEL.format(h=get_time(delta)['h'], m=get_time(delta)['m'], tx=transaction_difference) - - if (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period)*100 >= int(percent_critical)): - result_code = 'CRITICAL' - elif (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period)*100 >= int(percent_warning)): - result_code = 'WARNING' - - except Exception, e: - label = str(e) - result_code = 'UNKNOWN' - - return ((result_code, [label])) - -def get_time(delta): - h = int(delta/3600) - m = int((delta % 3600)/60) - return {'h':h, 'm':m} - - -def get_value_from_jmx(qry, property): - response = urllib2.urlopen(qry) - data=response.read() - data_dict = json.loads(data) - return data_dict["beans"][0][property] http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/files/alert_ha_namenode_health.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/files/alert_ha_namenode_health.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/files/alert_ha_namenode_health.py deleted file mode 100644 index fc1541d..0000000 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/files/alert_ha_namenode_health.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python - -""" -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import urllib2 -import json - -RESULT_STATE_OK = 'OK' -RESULT_STATE_CRITICAL = 'CRITICAL' -RESULT_STATE_UNKNOWN = 'UNKNOWN' -RESULT_STATE_SKIPPED = 'SKIPPED' - -HDFS_NN_STATE_ACTIVE = 'active' -HDFS_NN_STATE_STANDBY = 'standby' - -HDFS_SITE_KEY = '{{hdfs-site}}' -NAMESERVICE_KEY = '{{hdfs-site/dfs.nameservices}}' -NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}' -NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}' -DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}' - -def get_tokens(): - """ - Returns a tuple of tokens in the format {{site/property}} that will be used - to build the dictionary passed into execute - """ - return (HDFS_SITE_KEY, NAMESERVICE_KEY, NN_HTTP_ADDRESS_KEY, - NN_HTTPS_ADDRESS_KEY, DFS_POLICY_KEY) - - -def execute(parameters=None, host_name=None): - """ - Returns a tuple containing the result code and a pre-formatted result label - - Keyword arguments: - parameters (dictionary): a mapping of parameter key to value - host_name (string): the name of this host where the alert is running - """ - if parameters is None: - return (RESULT_STATE_UNKNOWN, ['There were no parameters supplied to the script.']) - - # if not in HA mode, then SKIP - if not NAMESERVICE_KEY in parameters: - return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled']) - - # hdfs-site is required - if not HDFS_SITE_KEY in parameters: - return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)]) - - # determine whether or not SSL is enabled - is_ssl_enabled = False - if DFS_POLICY_KEY in parameters: - dfs_policy 
= parameters[DFS_POLICY_KEY] - if dfs_policy == "HTTPS_ONLY": - is_ssl_enabled = True - - name_service = parameters[NAMESERVICE_KEY] - hdfs_site = parameters[HDFS_SITE_KEY] - - # look for dfs.ha.namenodes.foo - nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service - if not nn_unique_ids_key in hdfs_site: - return (RESULT_STATE_UNKNOWN, ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)]) - - namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}' - jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus" - - if is_ssl_enabled: - namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}' - jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus" - - - active_namenodes = [] - standby_namenodes = [] - unknown_namenodes = [] - - # now we have something like 'nn1,nn2,nn3,nn4' - # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id] - # ie dfs.namenode.http-address.hacluster.nn1 - nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',') - for nn_unique_id in nn_unique_ids: - key = namenode_http_fragment.format(name_service,nn_unique_id) - - if key in hdfs_site: - # use str() to ensure that unicode strings do not have the u' in them - value = str(hdfs_site[key]) - - try: - jmx_uri = jmx_uri_fragment.format(value) - state = get_value_from_jmx(jmx_uri,'State') - - if state == HDFS_NN_STATE_ACTIVE: - active_namenodes.append(value) - elif state == HDFS_NN_STATE_STANDBY: - standby_namenodes.append(value) - else: - unknown_namenodes.append(value) - except: - unknown_namenodes.append(value) - - # now that the request is done, determine if this host is the host that - # should report the status of the HA topology - is_active_namenode = False - for active_namenode in active_namenodes: - if active_namenode.startswith(host_name): - is_active_namenode = True - - # there's only one scenario here; there is exactly 1 active and 1 standby - is_topology_healthy = len(active_namenodes) 
== 1 and len(standby_namenodes) == 1 - - result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(str(active_namenodes), - str(standby_namenodes), str(unknown_namenodes)) - - # Healthy Topology: - # - Active NN reports the alert, standby does not - # - # Unhealthy Topology: - # - Report the alert if this is the first named host - # - Report the alert if not the first named host, but the other host - # could not report its status - if is_topology_healthy: - if is_active_namenode is True: - return (RESULT_STATE_OK, [result_label]) - else: - return (RESULT_STATE_SKIPPED, ['Another host will report this alert']) - else: - # dfs.namenode.rpc-address.service.alias is guaranteed in HA mode - first_listed_host_key = 'dfs.namenode.rpc-address.{0}.{1}'.format( - name_service, nn_unique_ids[0]) - - first_listed_host = '' - if first_listed_host_key in hdfs_site: - first_listed_host = hdfs_site[first_listed_host_key] - - is_first_listed_host = False - if first_listed_host.startswith(host_name): - is_first_listed_host = True - - if is_first_listed_host: - return (RESULT_STATE_CRITICAL, [result_label]) - else: - # not the first listed host, but the first host might be in the unknown - return (RESULT_STATE_SKIPPED, ['Another host will report this alert']) - - -def get_value_from_jmx(qry, property): - response = urllib2.urlopen(qry) - data=response.read() - data_dict = json.loads(data) - return data_dict["beans"][0][property] http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json index 2dedbe9..750f586 100644 --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json +++ 
b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json @@ -38,7 +38,7 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "HIVE/0.12.0.2.0/package/files/alert_hive_thrift_port.py" + "path": "HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py" } } ], @@ -52,7 +52,7 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "HIVE/0.12.0.2.0/package/files/alert_webhcat_server.py" + "path": "HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py" } } ] http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py new file mode 100644 index 0000000..ebfbf55 --- /dev/null +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import json +import socket +import time +import traceback +import urllib2 +from resource_management.libraries.functions import hive_check +from resource_management.libraries.functions import format +from resource_management.libraries.functions import get_kinit_path + +OK_MESSAGE = "TCP OK - %.4f response on port %s" +CRITICAL_MESSAGE = "Connection failed on host {0}:{1}" + +HIVE_SERVER_THRIFT_PORT_KEY = '{{hive-site/hive.server2.thrift.port}}' +SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}' +HIVE_SERVER2_AUTHENTICATION_KEY = '{{hive-site/hive.server2.authentication}}' +HIVE_SERVER_PRINCIPAL_KEY = '{{hive-site/hive.server2.authentication.kerberos.principal}}' +SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}' +SMOKEUSER_KEY = '{{cluster-env/smokeuser}}' + +PERCENT_WARNING = 200 +PERCENT_CRITICAL = 200 + +THRIFT_PORT_DEFAULT = 10000 +HIVE_SERVER_PRINCIPAL_DEFAULT = 'hive/_HOST@EXAMPLE.COM' +HIVE_SERVER2_AUTHENTICATION_DEFAULT = 'NOSASL' +SMOKEUSER_KEYTAB_DEFAULT = '/etc/security/keytabs/smokeuser.headless.keytab' +SMOKEUSER_DEFAULT = 'ambari-qa' + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (HIVE_SERVER_THRIFT_PORT_KEY,SECURITY_ENABLED_KEY,HIVE_SERVER2_AUTHENTICATION_KEY,HIVE_SERVER_PRINCIPAL_KEY,SMOKEUSER_KEYTAB_KEY,SMOKEUSER_KEY) + + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of parameter key to value + host_name (string): the name of this host where the alert is running + """ + + if parameters is None: + return (('UNKNOWN', ['There were no parameters supplied to the script.'])) + + thrift_port = THRIFT_PORT_DEFAULT + if HIVE_SERVER_THRIFT_PORT_KEY in parameters: + thrift_port = int(parameters[HIVE_SERVER_THRIFT_PORT_KEY]) + + security_enabled = False + if 
SECURITY_ENABLED_KEY in parameters: + security_enabled = str(parameters[SECURITY_ENABLED_KEY]).upper() == 'TRUE' + + hive_server2_authentication = HIVE_SERVER2_AUTHENTICATION_DEFAULT + if HIVE_SERVER2_AUTHENTICATION_KEY in parameters: + hive_server2_authentication = parameters[HIVE_SERVER2_AUTHENTICATION_KEY] + + smokeuser = SMOKEUSER_DEFAULT + if SMOKEUSER_KEY in parameters: + smokeuser = parameters[SMOKEUSER_KEY] + + result_code = None + + if security_enabled: + hive_server_principal = HIVE_SERVER_PRINCIPAL_DEFAULT + if HIVE_SERVER_PRINCIPAL_KEY in parameters: + hive_server_principal = parameters[HIVE_SERVER_PRINCIPAL_KEY] + smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT + if SMOKEUSER_KEYTAB_KEY in parameters: + smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_KEY] + kinit_path_local = get_kinit_path(["/usr/bin", "/usr/kerberos/bin", "/usr/sbin"]) + kinitcmd=format("{kinit_path_local} -kt {smokeuser_keytab} {smokeuser}; ") + else: + hive_server_principal = None + kinitcmd=None + + try: + if host_name is None: + host_name = socket.getfqdn() + + start_time = time.time() + try: + hive_check.check_thrift_port_sasl(host_name, thrift_port, hive_server2_authentication, + hive_server_principal, kinitcmd, smokeuser) + is_thrift_port_ok = True + except: + is_thrift_port_ok = False + + if is_thrift_port_ok == True: + result_code = 'OK' + total_time = time.time() - start_time + label = OK_MESSAGE % (total_time, thrift_port) + else: + result_code = 'CRITICAL' + label = CRITICAL_MESSAGE.format(host_name,thrift_port) + + except Exception, e: + label = str(e) + result_code = 'UNKNOWN' + + return ((result_code, [label])) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py ---------------------------------------------------------------------- diff --git 
a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py new file mode 100644 index 0000000..44840de --- /dev/null +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import json +import socket +import time +import urllib2 + +RESULT_CODE_OK = 'OK' +RESULT_CODE_CRITICAL = 'CRITICAL' +RESULT_CODE_UNKNOWN = 'UNKNOWN' + +OK_MESSAGE = 'TCP OK - {0:.4f} response on port {1}' +CRITICAL_CONNECTION_MESSAGE = 'Connection failed on host {0}:{1}' +CRITICAL_TEMPLETON_STATUS_MESSAGE = 'WebHCat returned an unexpected status of "{0}"' +CRITICAL_TEMPLETON_UNKNOWN_JSON_MESSAGE = 'Unable to determine WebHCat health from unexpected JSON response' + +TEMPLETON_PORT_KEY = '{{webhcat-site/templeton.port}}' +SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}' + +TEMPLETON_OK_RESPONSE = 'ok' +TEMPLETON_PORT_DEFAULT = 50111 + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (TEMPLETON_PORT_KEY,SECURITY_ENABLED_KEY) + + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of parameter key to value + host_name (string): the name of this host where the alert is running + """ + + result_code = RESULT_CODE_UNKNOWN + + if parameters is None: + return (result_code, ['There were no parameters supplied to the script.']) + + templeton_port = TEMPLETON_PORT_DEFAULT + if TEMPLETON_PORT_KEY in parameters: + templeton_port = int(parameters[TEMPLETON_PORT_KEY]) + + security_enabled = False + if SECURITY_ENABLED_KEY in parameters: + security_enabled = parameters[SECURITY_ENABLED_KEY].lower() == 'true' + + scheme = 'http' + if security_enabled is True: + scheme = 'https' + + label = '' + url_response = None + templeton_status = '' + total_time = 0 + + try: + # the alert will always run on the webhcat host + if host_name is None: + host_name = socket.getfqdn() + + query = "{0}://{1}:{2}/templeton/v1/status".format(scheme, host_name, + templeton_port) + + # execute the query for the JSON that includes templeton 
status + start_time = time.time() + url_response = urllib2.urlopen(query) + total_time = time.time() - start_time + except: + label = CRITICAL_CONNECTION_MESSAGE.format(host_name,templeton_port) + return (RESULT_CODE_CRITICAL, [label]) + + # URL response received, parse it + try: + json_response = json.loads(url_response.read()) + templeton_status = json_response['status'] + except: + return (RESULT_CODE_CRITICAL, [CRITICAL_TEMPLETON_UNKNOWN_JSON_MESSAGE]) + + # proper JSON received, compare against known value + if templeton_status.lower() == TEMPLETON_OK_RESPONSE: + result_code = RESULT_CODE_OK + label = OK_MESSAGE.format(total_time, templeton_port) + else: + result_code = RESULT_CODE_CRITICAL + label = CRITICAL_TEMPLETON_STATUS_MESSAGE.format(templeton_status) + + return (result_code, [label]) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/files/alert_hive_thrift_port.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/files/alert_hive_thrift_port.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/files/alert_hive_thrift_port.py deleted file mode 100644 index 499640f..0000000 --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/files/alert_hive_thrift_port.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python - -""" -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import json -import socket -import time -import traceback -import urllib2 -from resource_management.libraries.functions import hive_check -from resource_management.libraries.functions import format -from resource_management.libraries.functions import get_kinit_path -from resource_management.core.environment import Environment - -OK_MESSAGE = "TCP OK - %.4f response on port %s" -CRITICAL_MESSAGE = "Connection failed on host {0}:{1}" - -HIVE_SERVER_THRIFT_PORT_KEY = '{{hive-site/hive.server2.thrift.port}}' -SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}' -HIVE_SERVER2_AUTHENTICATION_KEY = '{{hive-site/hive.server2.authentication}}' -HIVE_SERVER_PRINCIPAL_KEY = '{{hive-site/hive.server2.authentication.kerberos.principal}}' -SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}' -SMOKEUSER_KEY = '{{cluster-env/smokeuser}}' - -PERCENT_WARNING = 200 -PERCENT_CRITICAL = 200 - -THRIFT_PORT_DEFAULT = 10000 -HIVE_SERVER_PRINCIPAL_DEFAULT = 'hive/_HOST@EXAMPLE.COM' -HIVE_SERVER2_AUTHENTICATION_DEFAULT = 'NOSASL' -SMOKEUSER_KEYTAB_DEFAULT = '/etc/security/keytabs/smokeuser.headless.keytab' -SMOKEUSER_DEFAULT = 'ambari-qa' - -def get_tokens(): - """ - Returns a tuple of tokens in the format {{site/property}} that will be used - to build the dictionary passed into execute - """ - return (HIVE_SERVER_THRIFT_PORT_KEY,SECURITY_ENABLED_KEY,HIVE_SERVER2_AUTHENTICATION_KEY,HIVE_SERVER_PRINCIPAL_KEY,SMOKEUSER_KEYTAB_KEY,SMOKEUSER_KEY) - - -def execute(parameters=None, host_name=None): - """ - Returns a tuple containing the result code and a pre-formatted result label - - 
Keyword arguments: - parameters (dictionary): a mapping of parameter key to value - host_name (string): the name of this host where the alert is running - """ - - if parameters is None: - return (('UNKNOWN', ['There were no parameters supplied to the script.'])) - - thrift_port = THRIFT_PORT_DEFAULT - if HIVE_SERVER_THRIFT_PORT_KEY in parameters: - thrift_port = int(parameters[HIVE_SERVER_THRIFT_PORT_KEY]) - - security_enabled = False - if SECURITY_ENABLED_KEY in parameters: - security_enabled = str(parameters[SECURITY_ENABLED_KEY]).upper() == 'TRUE' - - hive_server2_authentication = HIVE_SERVER2_AUTHENTICATION_DEFAULT - if HIVE_SERVER2_AUTHENTICATION_KEY in parameters: - hive_server2_authentication = parameters[HIVE_SERVER2_AUTHENTICATION_KEY] - - smokeuser = SMOKEUSER_DEFAULT - if SMOKEUSER_KEY in parameters: - smokeuser = parameters[SMOKEUSER_KEY] - - result_code = None - - if security_enabled: - hive_server_principal = HIVE_SERVER_PRINCIPAL_DEFAULT - if HIVE_SERVER_PRINCIPAL_KEY in parameters: - hive_server_principal = parameters[HIVE_SERVER_PRINCIPAL_KEY] - smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT - if SMOKEUSER_KEYTAB_KEY in parameters: - smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_KEY] - with Environment() as env: - kinit_path_local = get_kinit_path(["/usr/bin", "/usr/kerberos/bin", "/usr/sbin"]) - kinitcmd=format("{kinit_path_local} -kt {smokeuser_keytab} {smokeuser}; ") - else: - hive_server_principal = None - kinitcmd=None - - try: - if host_name is None: - host_name = socket.getfqdn() - - start_time = time.time() - try: - with Environment() as env: - hive_check.check_thrift_port_sasl(host_name, thrift_port, hive_server2_authentication, - hive_server_principal, kinitcmd, smokeuser) - is_thrift_port_ok = True - except: - is_thrift_port_ok = False - - if is_thrift_port_ok == True: - result_code = 'OK' - total_time = time.time() - start_time - label = OK_MESSAGE % (total_time, thrift_port) - else: - result_code = 'CRITICAL' - label = 
CRITICAL_MESSAGE.format(host_name,thrift_port) - - except Exception, e: - label = str(e) - result_code = 'UNKNOWN' - - return ((result_code, [label])) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/files/alert_webhcat_server.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/files/alert_webhcat_server.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/files/alert_webhcat_server.py deleted file mode 100644 index 44840de..0000000 --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/files/alert_webhcat_server.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python - -""" -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" - -import json -import socket -import time -import urllib2 - -RESULT_CODE_OK = 'OK' -RESULT_CODE_CRITICAL = 'CRITICAL' -RESULT_CODE_UNKNOWN = 'UNKNOWN' - -OK_MESSAGE = 'TCP OK - {0:.4f} response on port {1}' -CRITICAL_CONNECTION_MESSAGE = 'Connection failed on host {0}:{1}' -CRITICAL_TEMPLETON_STATUS_MESSAGE = 'WebHCat returned an unexpected status of "{0}"' -CRITICAL_TEMPLETON_UNKNOWN_JSON_MESSAGE = 'Unable to determine WebHCat health from unexpected JSON response' - -TEMPLETON_PORT_KEY = '{{webhcat-site/templeton.port}}' -SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}' - -TEMPLETON_OK_RESPONSE = 'ok' -TEMPLETON_PORT_DEFAULT = 50111 - -def get_tokens(): - """ - Returns a tuple of tokens in the format {{site/property}} that will be used - to build the dictionary passed into execute - """ - return (TEMPLETON_PORT_KEY,SECURITY_ENABLED_KEY) - - -def execute(parameters=None, host_name=None): - """ - Returns a tuple containing the result code and a pre-formatted result label - - Keyword arguments: - parameters (dictionary): a mapping of parameter key to value - host_name (string): the name of this host where the alert is running - """ - - result_code = RESULT_CODE_UNKNOWN - - if parameters is None: - return (result_code, ['There were no parameters supplied to the script.']) - - templeton_port = TEMPLETON_PORT_DEFAULT - if TEMPLETON_PORT_KEY in parameters: - templeton_port = int(parameters[TEMPLETON_PORT_KEY]) - - security_enabled = False - if SECURITY_ENABLED_KEY in parameters: - security_enabled = parameters[SECURITY_ENABLED_KEY].lower() == 'true' - - scheme = 'http' - if security_enabled is True: - scheme = 'https' - - label = '' - url_response = None - templeton_status = '' - total_time = 0 - - try: - # the alert will always run on the webhcat host - if host_name is None: - host_name = socket.getfqdn() - - query = "{0}://{1}:{2}/templeton/v1/status".format(scheme, host_name, - templeton_port) - - # execute the query for the JSON that includes templeton 
status - start_time = time.time() - url_response = urllib2.urlopen(query) - total_time = time.time() - start_time - except: - label = CRITICAL_CONNECTION_MESSAGE.format(host_name,templeton_port) - return (RESULT_CODE_CRITICAL, [label]) - - # URL response received, parse it - try: - json_response = json.loads(url_response.read()) - templeton_status = json_response['status'] - except: - return (RESULT_CODE_CRITICAL, [CRITICAL_TEMPLETON_UNKNOWN_JSON_MESSAGE]) - - # proper JSON received, compare against known value - if templeton_status.lower() == TEMPLETON_OK_RESPONSE: - result_code = RESULT_CODE_OK - label = OK_MESSAGE.format(total_time, templeton_port) - else: - result_code = RESULT_CODE_CRITICAL - label = CRITICAL_TEMPLETON_STATUS_MESSAGE.format(templeton_status) - - return (result_code, [label]) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/alerts.json index 9e74cdd..970c9d4 100644 --- a/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/alerts.json +++ b/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/alerts.json @@ -34,7 +34,7 @@ "scope": "ANY", "source": { "type": "SCRIPT", - "path": "OOZIE/4.0.0.2.0/package/files/alert_check_oozie_server.py" + "path": "OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py" } } ] http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py 
b/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py new file mode 100644 index 0000000..4e3e6ae --- /dev/null +++ b/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from resource_management import * +from resource_management.libraries.functions import format +from resource_management.libraries.functions import get_kinit_path +from resource_management.core.environment import Environment + +RESULT_CODE_OK = 'OK' +RESULT_CODE_CRITICAL = 'CRITICAL' +RESULT_CODE_UNKNOWN = 'UNKNOWN' + +OOZIE_URL_KEY = '{{oozie-site/oozie.base.url}}' +SECURITY_ENABLED = '{{cluster-env/security_enabled}}' +SMOKEUSER_KEY = '{{cluster-env/smokeuser}}' +SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}' + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (OOZIE_URL_KEY, SMOKEUSER_KEY, SECURITY_ENABLED,SMOKEUSER_KEYTAB_KEY) + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of parameter key to value + host_name (string): the name of this host where the alert is running + """ + + if parameters is None: + return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.']) + + security_enabled = False + if set([OOZIE_URL_KEY, SMOKEUSER_KEY, SECURITY_ENABLED]).issubset(parameters): + oozie_url = parameters[OOZIE_URL_KEY] + smokeuser = parameters[SMOKEUSER_KEY] + security_enabled = str(parameters[SECURITY_ENABLED]).upper() == 'TRUE' + else: + return (RESULT_CODE_UNKNOWN, ['The Oozie URL and Smokeuser are a required parameters.']) + + try: + if security_enabled: + if set([SMOKEUSER_KEYTAB_KEY]).issubset(parameters): + smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_KEY] + else: + return (RESULT_CODE_UNKNOWN, ['The Smokeuser keytab is required when security is enabled.']) + kinit_path_local = get_kinit_path(["/usr/bin", "/usr/kerberos/bin", "/usr/sbin"]) + kinitcmd = format("{kinit_path_local} -kt {smokeuser_keytab} {smokeuser}; ") + + Execute(kinitcmd, + user=smokeuser, + ) + + 
Execute(format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie {oozie_url} -status"), + user=smokeuser, + ) + return (RESULT_CODE_OK, ["Oozie check success"]) + + except Exception, ex: + return (RESULT_CODE_CRITICAL, [str(ex)]) http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/files/alert_check_oozie_server.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/files/alert_check_oozie_server.py b/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/files/alert_check_oozie_server.py deleted file mode 100644 index 7bf1255..0000000 --- a/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/files/alert_check_oozie_server.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python - -""" -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-""" - -import subprocess -from subprocess import CalledProcessError - -RESULT_CODE_OK = 'OK' -RESULT_CODE_CRITICAL = 'CRITICAL' -RESULT_CODE_UNKNOWN = 'UNKNOWN' - -OOZIE_URL_KEY = '{{oozie-site/oozie.base.url}}' - -def get_tokens(): - """ - Returns a tuple of tokens in the format {{site/property}} that will be used - to build the dictionary passed into execute - """ - return (OOZIE_URL_KEY) - - -def execute(parameters=None, host_name=None): - """ - Returns a tuple containing the result code and a pre-formatted result label - - Keyword arguments: - parameters (dictionary): a mapping of parameter key to value - host_name (string): the name of this host where the alert is running - """ - - if parameters is None: - return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.']) - - oozie_url = None - if OOZIE_URL_KEY in parameters: - oozie_url = parameters[OOZIE_URL_KEY] - - if oozie_url is None: - return (RESULT_CODE_UNKNOWN, ['The Oozie URL is a required parameter.']) - - try: - # oozie admin -oozie http://server:11000/oozie -status - oozie_process = subprocess.Popen(['oozie', 'admin', '-oozie', - oozie_url, '-status'], stderr=subprocess.PIPE, stdout=subprocess.PIPE) - - oozie_output, oozie_error = oozie_process.communicate() - oozie_return_code = oozie_process.returncode - - if oozie_return_code == 0: - # strip trailing newlines - oozie_output = str(oozie_output).strip('\n') - return (RESULT_CODE_OK, [oozie_output]) - else: - oozie_error = str(oozie_error).strip('\n') - return (RESULT_CODE_CRITICAL, [oozie_error]) - - except CalledProcessError, cpe: - return (RESULT_CODE_CRITICAL, [str(cpe)]) http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/alerts.json 
b/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/alerts.json index a16ef62..ffad3ce 100644 --- a/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/alerts.json +++ b/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/alerts.json @@ -38,7 +38,7 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "HDP/1.3.2/services/HIVE/package/files/alert_hive_thrift_port.py" + "path": "HDP/1.3.2/services/HIVE/package/alerts/alert_hive_thrift_port.py" } } ], @@ -52,7 +52,7 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "HDP/1.3.2/services/HIVE/package/files/alert_webhcat_server.py" + "path": "HDP/1.3.2/services/HIVE/package/alerts/alert_webhcat_server.py" } } ] http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/alerts/alert_hive_thrift_port.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/alerts/alert_hive_thrift_port.py b/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/alerts/alert_hive_thrift_port.py new file mode 100644 index 0000000..ebfbf55 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/alerts/alert_hive_thrift_port.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import json +import socket +import time +import traceback +import urllib2 +from resource_management.libraries.functions import hive_check +from resource_management.libraries.functions import format +from resource_management.libraries.functions import get_kinit_path + +OK_MESSAGE = "TCP OK - %.4f response on port %s" +CRITICAL_MESSAGE = "Connection failed on host {0}:{1}" + +HIVE_SERVER_THRIFT_PORT_KEY = '{{hive-site/hive.server2.thrift.port}}' +SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}' +HIVE_SERVER2_AUTHENTICATION_KEY = '{{hive-site/hive.server2.authentication}}' +HIVE_SERVER_PRINCIPAL_KEY = '{{hive-site/hive.server2.authentication.kerberos.principal}}' +SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}' +SMOKEUSER_KEY = '{{cluster-env/smokeuser}}' + +PERCENT_WARNING = 200 +PERCENT_CRITICAL = 200 + +THRIFT_PORT_DEFAULT = 10000 +HIVE_SERVER_PRINCIPAL_DEFAULT = 'hive/_HOST@EXAMPLE.COM' +HIVE_SERVER2_AUTHENTICATION_DEFAULT = 'NOSASL' +SMOKEUSER_KEYTAB_DEFAULT = '/etc/security/keytabs/smokeuser.headless.keytab' +SMOKEUSER_DEFAULT = 'ambari-qa' + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (HIVE_SERVER_THRIFT_PORT_KEY,SECURITY_ENABLED_KEY,HIVE_SERVER2_AUTHENTICATION_KEY,HIVE_SERVER_PRINCIPAL_KEY,SMOKEUSER_KEYTAB_KEY,SMOKEUSER_KEY) + + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of
parameter key to value + host_name (string): the name of this host where the alert is running + """ + + if parameters is None: + return (('UNKNOWN', ['There were no parameters supplied to the script.'])) + + thrift_port = THRIFT_PORT_DEFAULT + if HIVE_SERVER_THRIFT_PORT_KEY in parameters: + thrift_port = int(parameters[HIVE_SERVER_THRIFT_PORT_KEY]) + + security_enabled = False + if SECURITY_ENABLED_KEY in parameters: + security_enabled = str(parameters[SECURITY_ENABLED_KEY]).upper() == 'TRUE' + + hive_server2_authentication = HIVE_SERVER2_AUTHENTICATION_DEFAULT + if HIVE_SERVER2_AUTHENTICATION_KEY in parameters: + hive_server2_authentication = parameters[HIVE_SERVER2_AUTHENTICATION_KEY] + + smokeuser = SMOKEUSER_DEFAULT + if SMOKEUSER_KEY in parameters: + smokeuser = parameters[SMOKEUSER_KEY] + + result_code = None + + if security_enabled: + hive_server_principal = HIVE_SERVER_PRINCIPAL_DEFAULT + if HIVE_SERVER_PRINCIPAL_KEY in parameters: + hive_server_principal = parameters[HIVE_SERVER_PRINCIPAL_KEY] + smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT + if SMOKEUSER_KEYTAB_KEY in parameters: + smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_KEY] + kinit_path_local = get_kinit_path(["/usr/bin", "/usr/kerberos/bin", "/usr/sbin"]) + kinitcmd=format("{kinit_path_local} -kt {smokeuser_keytab} {smokeuser}; ") + else: + hive_server_principal = None + kinitcmd=None + + try: + if host_name is None: + host_name = socket.getfqdn() + + start_time = time.time() + try: + hive_check.check_thrift_port_sasl(host_name, thrift_port, hive_server2_authentication, + hive_server_principal, kinitcmd, smokeuser) + is_thrift_port_ok = True + except: + is_thrift_port_ok = False + + if is_thrift_port_ok == True: + result_code = 'OK' + total_time = time.time() - start_time + label = OK_MESSAGE % (total_time, thrift_port) + else: + result_code = 'CRITICAL' + label = CRITICAL_MESSAGE.format(host_name,thrift_port) + + except Exception, e: + label = str(e) + result_code = 'UNKNOWN' + + return 
((result_code, [label])) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/ec37c603/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/alerts/alert_webhcat_server.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/alerts/alert_webhcat_server.py b/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/alerts/alert_webhcat_server.py new file mode 100644 index 0000000..44840de --- /dev/null +++ b/ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/alerts/alert_webhcat_server.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import json +import socket +import time +import urllib2 + +RESULT_CODE_OK = 'OK' +RESULT_CODE_CRITICAL = 'CRITICAL' +RESULT_CODE_UNKNOWN = 'UNKNOWN' + +OK_MESSAGE = 'TCP OK - {0:.4f} response on port {1}' +CRITICAL_CONNECTION_MESSAGE = 'Connection failed on host {0}:{1}' +CRITICAL_TEMPLETON_STATUS_MESSAGE = 'WebHCat returned an unexpected status of "{0}"' +CRITICAL_TEMPLETON_UNKNOWN_JSON_MESSAGE = 'Unable to determine WebHCat health from unexpected JSON response' + +TEMPLETON_PORT_KEY = '{{webhcat-site/templeton.port}}' +SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}' + +TEMPLETON_OK_RESPONSE = 'ok' +TEMPLETON_PORT_DEFAULT = 50111 + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (TEMPLETON_PORT_KEY,SECURITY_ENABLED_KEY) + + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of parameter key to value + host_name (string): the name of this host where the alert is running + """ + + result_code = RESULT_CODE_UNKNOWN + + if parameters is None: + return (result_code, ['There were no parameters supplied to the script.']) + + templeton_port = TEMPLETON_PORT_DEFAULT + if TEMPLETON_PORT_KEY in parameters: + templeton_port = int(parameters[TEMPLETON_PORT_KEY]) + + security_enabled = False + if SECURITY_ENABLED_KEY in parameters: + security_enabled = parameters[SECURITY_ENABLED_KEY].lower() == 'true' + + scheme = 'http' + if security_enabled is True: + scheme = 'https' + + label = '' + url_response = None + templeton_status = '' + total_time = 0 + + try: + # the alert will always run on the webhcat host + if host_name is None: + host_name = socket.getfqdn() + + query = "{0}://{1}:{2}/templeton/v1/status".format(scheme, host_name, + templeton_port) + + # execute the query for the JSON that includes templeton 
status + start_time = time.time() + url_response = urllib2.urlopen(query) + total_time = time.time() - start_time + except: + label = CRITICAL_CONNECTION_MESSAGE.format(host_name,templeton_port) + return (RESULT_CODE_CRITICAL, [label]) + + # URL response received, parse it + try: + json_response = json.loads(url_response.read()) + templeton_status = json_response['status'] + except: + return (RESULT_CODE_CRITICAL, [CRITICAL_TEMPLETON_UNKNOWN_JSON_MESSAGE]) + + # proper JSON received, compare against known value + if templeton_status.lower() == TEMPLETON_OK_RESPONSE: + result_code = RESULT_CODE_OK + label = OK_MESSAGE.format(total_time, templeton_port) + else: + result_code = RESULT_CODE_CRITICAL + label = CRITICAL_TEMPLETON_STATUS_MESSAGE.format(templeton_status) + + return (result_code, [label]) \ No newline at end of file
