AMBARI-10816 - Expose Customizable Parameters For SCRIPT Alerts (jonathanhurley)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/6727c1dc Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/6727c1dc Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/6727c1dc Branch: refs/heads/trunk Commit: 6727c1dc8c32abbc4ea1a7e3c9652707bb4fd23d Parents: bc55f2d Author: Jonathan Hurley <[email protected]> Authored: Wed Apr 29 11:51:57 2015 -0400 Committer: Jonathan Hurley <[email protected]> Committed: Wed Apr 29 11:52:04 2015 -0400 ---------------------------------------------------------------------- .../python/ambari_agent/alerts/script_alert.py | 22 ++- .../src/test/python/ambari_agent/TestAlerts.py | 55 ++++++ .../ambari_agent/dummy_files/test_script.py | 19 +- .../ambari/server/state/alert/ScriptSource.java | 197 ++++++++++++++++++- ambari-server/src/main/resources/alerts.json | 31 ++- .../alerts/alert_ambari_metrics_monitor.py | 13 +- .../common-services/FLUME/1.4.0.2.0/alerts.json | 11 +- .../package/alerts/alert_flume_agent_status.py | 23 ++- .../common-services/HDFS/2.1.0.2.0/alerts.json | 44 ++++- .../package/alerts/alert_checkpoint_time.py | 68 ++++--- .../package/alerts/alert_ha_namenode_health.py | 36 ++-- .../common-services/HIVE/0.12.0.2.0/alerts.json | 63 +++++- .../package/alerts/alert_hive_metastore.py | 66 ++++--- .../package/alerts/alert_hive_thrift_port.py | 81 +++++--- .../package/alerts/alert_webhcat_server.py | 49 +++-- .../package/alerts/alert_check_oozie_server.py | 35 ++-- .../alerts/check_supervisor_process_win.py | 5 +- .../common-services/YARN/2.1.0.2.0/alerts.json | 26 ++- .../package/alerts/alert_nodemanager_health.py | 33 ++-- .../alerts/alert_nodemanagers_summary.py | 38 ++-- .../resources/host_scripts/alert_disk_space.py | 55 ++++-- .../BIGTOP/0.8/services/FLUME/alerts.json | 11 +- .../package/files/alert_flume_agent_status.py | 25 ++- .../stacks/BIGTOP/0.8/services/HDFS/alerts.json | 44 ++++- .../HDFS/package/files/alert_checkpoint_time.py 
| 68 ++++--- .../package/files/alert_ha_namenode_health.py | 40 ++-- .../package/files/alert_hive_thrift_port.py | 82 +++++--- .../package/files/alert_check_oozie_server.py | 130 +++++++----- .../package/files/alert_webhcat_server.py | 49 +++-- .../stacks/BIGTOP/0.8/services/YARN/alerts.json | 13 +- .../package/files/alert_nodemanager_health.py | 37 ++-- 31 files changed, 1096 insertions(+), 373 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py b/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py index d79e0a7..b8b4daf 100644 --- a/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py +++ b/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py @@ -46,6 +46,7 @@ class ScriptAlert(BaseAlert): self.common_services_dir = None self.host_scripts_dir = None self.path_to_script = None + self.parameters = {} if 'path' in alert_source_meta: self.path = alert_source_meta['path'] @@ -59,11 +60,24 @@ class ScriptAlert(BaseAlert): if 'host_scripts_directory' in alert_source_meta: self.host_scripts_dir = alert_source_meta['host_scripts_directory'] + # convert a list of script parameters, like timeouts, into a dictionary + # so the the scripts can easily lookup the data + if 'parameters' in alert_source_meta: + parameters = alert_source_meta['parameters'] + for parameter in parameters: + if 'name' not in parameter or 'value' not in parameter: + continue + + # create the dictionary value + parameter_name = parameter['name'] + parameter_value = parameter['value'] + self.parameters[parameter_name] = parameter_value + def _collect(self): cmd_module = self._load_source() if cmd_module is not None: - parameters = {} + configurations = {} try: tokens 
= cmd_module.get_tokens() @@ -73,7 +87,7 @@ class ScriptAlert(BaseAlert): for token in tokens: value = self._get_configuration_value(token) if value is not None: - parameters[token] = value + configurations[token] = value except AttributeError: # it's OK if the module doesn't have get_tokens() ; no tokens will # be passed in so hopefully the script doesn't need any @@ -85,9 +99,9 @@ class ScriptAlert(BaseAlert): if matchObj: basedir = matchObj.group(1) with Environment(basedir, tmp_dir=self.config.get('agent', 'tmp_dir')) as env: - return cmd_module.execute(parameters, self.host_name) + return cmd_module.execute(configurations, self.parameters, self.host_name) else: - return cmd_module.execute(parameters, self.host_name) + return cmd_module.execute(configurations, self.parameters, self.host_name) else: return (self.RESULT_UNKNOWN, ["Unable to execute script {0}".format(self.path)]) http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-agent/src/test/python/ambari_agent/TestAlerts.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py index a574422..af357bc 100644 --- a/ambari-agent/src/test/python/ambari_agent/TestAlerts.py +++ b/ambari-agent/src/test/python/ambari_agent/TestAlerts.py @@ -210,6 +210,40 @@ class TestAlerts(TestCase): self.assertEquals('bar is rendered-bar, baz is rendered-baz', alerts[0]['text']) + def test_script_alert_with_parameters(self): + definition_json = self._get_script_alert_definition_with_parameters() + + # normally set by AlertSchedulerHandler + definition_json['source']['stacks_directory'] = os.path.join('ambari_agent', 'dummy_files') + definition_json['source']['common_services_directory'] = os.path.join('ambari_agent', 'common-services') + definition_json['source']['host_scripts_directory'] = os.path.join('ambari_agent', 'host_scripts') + + configuration = {'foo-site' : + { 
'bar': 'rendered-bar', 'baz' : 'rendered-baz' } + } + + collector = AlertCollector() + cluster_configuration = self.__get_cluster_configuration() + self.__update_cluster_configuration(cluster_configuration, configuration) + + alert = ScriptAlert(definition_json, definition_json['source'], MagicMock()) + alert.set_helpers(collector, cluster_configuration ) + alert.set_cluster("c1", "c6401.ambari.apache.org") + + self.assertEquals(definition_json['source']['path'], alert.path) + self.assertEquals(definition_json['source']['stacks_directory'], alert.stacks_dir) + self.assertEquals(definition_json['source']['common_services_directory'], alert.common_services_dir) + self.assertEquals(definition_json['source']['host_scripts_directory'], alert.host_scripts_dir) + + alert.collect() + + alerts = collector.alerts() + self.assertEquals(0, len(collector.alerts())) + + self.assertEquals('OK', alerts[0]['state']) + self.assertEquals('Script parameter detected: foo bar baz', alerts[0]['text']) + + @patch.object(MetricAlert, "_load_jmx") def test_metric_alert(self, ma_load_jmx_mock): definition_json = self._get_metric_alert_definition() @@ -969,6 +1003,27 @@ class TestAlerts(TestCase): } } + def _get_script_alert_definition_with_parameters(self): + return { + "name": "namenode_process", + "service": "HDFS", + "component": "NAMENODE", + "label": "NameNode process", + "interval": 6, + "scope": "host", + "enabled": True, + "uuid": "c1f73191-4481-4435-8dae-fd380e4c0be1", + "source": { + "type": "SCRIPT", + "path": "test_script.py", + "parameters": [ + { + "name": "script.parameter.foo", + "value": "foo bar baz" + } + ] + } + } def _get_port_alert_definition(self): return { "name": "namenode_process", http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-agent/src/test/python/ambari_agent/dummy_files/test_script.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/test/python/ambari_agent/dummy_files/test_script.py 
b/ambari-agent/src/test/python/ambari_agent/dummy_files/test_script.py index 3598d86..de4f9b9 100644 --- a/ambari-agent/src/test/python/ambari_agent/dummy_files/test_script.py +++ b/ambari-agent/src/test/python/ambari_agent/dummy_files/test_script.py @@ -26,18 +26,23 @@ def get_tokens(): return ('{{foo-site/bar}}','{{foo-site/baz}}') -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): ''' returns a tuple containing the result code and a pre-formatted result label ''' - if parameters is not None: - if '{{foo-site/bar}}' in parameters: - bar = parameters['{{foo-site/bar}}'] + + # short circuit the script when a parameter is present + if "script.parameter.foo" in parameters: + return "OK", ["Script parameter detected: " + parameters["script.parameter.foo"]] + + if configurations is not None: + if '{{foo-site/bar}}' in configurations: + bar = configurations['{{foo-site/bar}}'] - if '{{foo-site/baz}}' in parameters: - baz = parameters['{{foo-site/baz}}'] + if '{{foo-site/baz}}' in configurations: + baz = configurations['{{foo-site/baz}}'] - if '{{foo-site/skip}}' in parameters: + if '{{foo-site/skip}}' in configurations: return ('SKIPPED', ['This alert is skipped and will not be in the collector']) label = "bar is {0}, baz is {1}".format(bar, baz) http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/java/org/apache/ambari/server/state/alert/ScriptSource.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/state/alert/ScriptSource.java b/ambari-server/src/main/java/org/apache/ambari/server/state/alert/ScriptSource.java index 4c84bcf..f67a135 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/state/alert/ScriptSource.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/state/alert/ScriptSource.java @@ -17,6 +17,10 @@ */ package org.apache.ambari.server.state.alert; 
+import java.util.List; + +import org.apache.ambari.server.state.AlertState; + import com.google.gson.annotations.SerializedName; /** @@ -31,6 +35,12 @@ public class ScriptSource extends Source { private String m_path = null; /** + * A list of all of the script parameters, if any. + */ + @SerializedName("parameters") + private List<ScriptParameter> m_parameters; + + /** * @return the path to the script file. */ public String getPath() { @@ -38,7 +48,7 @@ public class ScriptSource extends Source { } /** - * + * {@inheritDoc} */ @Override public int hashCode() { @@ -50,7 +60,7 @@ public class ScriptSource extends Source { } /** - * + * {@inheritDoc} */ @Override public boolean equals(Object obj) { @@ -78,4 +88,187 @@ public class ScriptSource extends Source { return true; } + + /** + * The {@link ScriptParameter} class represents a single parameter that can be + * passed into a script alert. + */ + public static class ScriptParameter { + @SerializedName("name") + private String m_name; + + @SerializedName("display_name") + private String m_displayName; + + @SerializedName("units") + private String m_units; + + @SerializedName("value") + private Object m_value; + + @SerializedName("description") + private String m_description; + + @SerializedName("type") + private ScriptParameterType m_type; + + /** + * If this script parameter controls a threshold, then it's specified here, + * otherwise it's {@code null}. + */ + @SerializedName("threshold") + private AlertState m_threshold; + + /** + * Gets the unique name of the parameter. + * + * @return the name + */ + public String getName() { + return m_name; + } + + /** + * Gets the human readable name of the parameter. + * + * @return the displayName + */ + public String getDisplayName() { + return m_displayName; + } + + /** + * Gets the display units of the parameter. + * + * @return the units + */ + public String getUnits() { + return m_units; + } + + /** + * Gets the value of the parameter. 
+ * + * @return the value + */ + public Object getValue() { + return m_value; + } + + /** + * Gets the description of the parameter. + * + * @return the description + */ + public String getDescription() { + return m_description; + } + + /** + * Gets the threshold that this parameter directly controls, or {@code null} + * for none. + * + * @return the threshold, or {@code null}. + */ + public AlertState getThreshold() { + return m_threshold; + } + + /** + * {@inheritDoc} + */ + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((m_description == null) ? 0 : m_description.hashCode()); + result = prime * result + ((m_displayName == null) ? 0 : m_displayName.hashCode()); + result = prime * result + ((m_name == null) ? 0 : m_name.hashCode()); + result = prime * result + ((m_threshold == null) ? 0 : m_threshold.hashCode()); + result = prime * result + ((m_type == null) ? 0 : m_type.hashCode()); + result = prime * result + ((m_units == null) ? 0 : m_units.hashCode()); + result = prime * result + ((m_value == null) ? 
0 : m_value.hashCode()); + return result; + } + + /** + * {@inheritDoc} + */ + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + ScriptParameter other = (ScriptParameter) obj; + if (m_description == null) { + if (other.m_description != null) { + return false; + } + } else if (!m_description.equals(other.m_description)) { + return false; + } + if (m_displayName == null) { + if (other.m_displayName != null) { + return false; + } + } else if (!m_displayName.equals(other.m_displayName)) { + return false; + } + if (m_name == null) { + if (other.m_name != null) { + return false; + } + } else if (!m_name.equals(other.m_name)) { + return false; + } + if (m_threshold != other.m_threshold) { + return false; + } + if (m_type != other.m_type) { + return false; + } + if (m_units == null) { + if (other.m_units != null) { + return false; + } + } else if (!m_units.equals(other.m_units)) { + return false; + } + if (m_value == null) { + if (other.m_value != null) { + return false; + } + } else if (!m_value.equals(other.m_value)) { + return false; + } + return true; + } + + + /** + * The {@link ScriptParameterType} enum represents the value type. + */ + public enum ScriptParameterType { + /** + * String + */ + STRING, + + /** + * Integers, longs, floats, etc. + */ + NUMERIC, + + /** + * A percent value, expressed as a float. 
+ */ + PERCENT + } + } } http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/alerts.json b/ambari-server/src/main/resources/alerts.json index 0d19f42..a7cef7f 100644 --- a/ambari-server/src/main/resources/alerts.json +++ b/ambari-server/src/main/resources/alerts.json @@ -38,7 +38,36 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "alert_disk_space.py" + "path": "alert_disk_space.py", + "parameters": [ + { + "name": "minimum.free.space", + "display_name": "Minimum Free Space", + "value": 5000000000, + "type": "NUMERIC", + "description": "The overall amount of free disk space left before an alert is triggered.", + "units": "bytes", + "threshold": "WARNING" + }, + { + "name": "percent.used.space.warning.threshold", + "display_name": "Warning", + "value": 0.5, + "type": "PERCENT", + "description": "The percent of disk space consumed before a warning is triggered.", + "units": "%", + "threshold": "WARNING" + }, + { + "name": "percent.free.space.critical.threshold", + "display_name": "Critical", + "value": 0.8, + "type": "PERCENT", + "description": "The percent of disk space consumed before a critical alert is triggered.", + "units": "%", + "threshold": "CRITICAL" + } + ] } } ] http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py index 04a2e01..fa44a7f 100644 --- 
a/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py +++ b/ambari-server/src/main/resources/common-services/AMBARI_METRICS/0.1.0/package/alerts/alert_ambari_metrics_monitor.py @@ -73,20 +73,21 @@ def is_monitor_process_live(pid_file): return live -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ - if parameters is None: - return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.']) + if configurations is None: + return (RESULT_CODE_UNKNOWN, ['There were no configurations supplied to the script.']) - if set([AMS_MONITOR_PID_DIR]).issubset(parameters): - AMS_MONITOR_PID_PATH = os.path.join(parameters[AMS_MONITOR_PID_DIR], 'ambari-metrics-monitor.pid') + if set([AMS_MONITOR_PID_DIR]).issubset(configurations): + AMS_MONITOR_PID_PATH = os.path.join(configurations[AMS_MONITOR_PID_DIR], 'ambari-metrics-monitor.pid') else: return (RESULT_CODE_UNKNOWN, ['The ams_monitor_pid_dir is a required parameter.']) http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/alerts.json index e23e77b..80ebb91 100644 --- a/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/alerts.json +++ 
b/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/alerts.json @@ -10,7 +10,16 @@ "scope": "ANY", "source": { "type": "SCRIPT", - "path": "FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py" + "path": "FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py", + "parameters": [ + { + "name": "run.directory", + "display_name": "Run Directory", + "value": "/var/run/flume", + "type": "STRING", + "description": "The directory where flume agent processes will place their PID files." + } + ] } } ] http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py b/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py index b183bbc..5236d37 100644 --- a/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py +++ b/ambari-server/src/main/resources/common-services/FLUME/1.4.0.2.0/package/alerts/alert_flume_agent_status.py @@ -29,7 +29,8 @@ RESULT_CODE_UNKNOWN = 'UNKNOWN' FLUME_CONF_DIR_KEY = '{{flume-env/flume_conf_dir}}' -FLUME_RUN_DIR = '/var/run/flume' +FLUME_RUN_DIR_KEY = "run.directory" +FLUME_RUN_DIR_DEFAULT = '/var/run/flume' def get_tokens(): """ @@ -39,21 +40,22 @@ def get_tokens(): return (FLUME_CONF_DIR_KEY,) -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name 
of this host where the alert is running """ - if parameters is None: - return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.']) + if configurations is None: + return (RESULT_CODE_UNKNOWN, ['There were no configurations supplied to the script.']) flume_conf_directory = None - if FLUME_CONF_DIR_KEY in parameters: - flume_conf_directory = parameters[FLUME_CONF_DIR_KEY] + if FLUME_CONF_DIR_KEY in configurations: + flume_conf_directory = configurations[FLUME_CONF_DIR_KEY] if flume_conf_directory is None: return (RESULT_CODE_UNKNOWN, ['The Flume configuration directory is a required parameter.']) @@ -61,7 +63,12 @@ def execute(parameters=None, host_name=None): if host_name is None: host_name = socket.getfqdn() - processes = get_flume_status(flume_conf_directory, FLUME_RUN_DIR) + # parse script arguments + flume_run_directory = FLUME_RUN_DIR_DEFAULT + if FLUME_RUN_DIR_KEY in parameters: + flume_run_directory = parameters[FLUME_RUN_DIR_KEY] + + processes = get_flume_status(flume_conf_directory, flume_run_directory) expected_agents = find_expected_agent_names(flume_conf_directory) alert_label = '' http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json index b67c306..9502e14 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json @@ -387,7 +387,36 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py" + "path": "HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py", + "parameters": [ + { + "name": "connection.timeout", + "display_name": "Connection Timeout", + 
"value": 5.0, + "type": "NUMERIC", + "description": "The maximum time before this alert is considered to be CRITICAL", + "units": "seconds", + "threshold": "CRITICAL" + }, + { + "name": "checkpoint.time.warning.threshold", + "display_name": "Checkpoint Warning", + "value": 2.0, + "type": "PERCENT", + "description": "The percentage of the last checkpoint time greater than the interval in order to trigger a warning alert.", + "units": "%", + "threshold": "WARNING" + }, + { + "name": "checkpoint.time.critical.threshold", + "display_name": "Checkpoint Critical", + "value": 2.0, + "type": "PERCENT", + "description": "The percentage of the last checkpoint time greater than the interval in order to trigger a critical alert.", + "units": "%", + "threshold": "CRITICAL" + } + ] } }, { @@ -400,7 +429,18 @@ "ignore_host": true, "source": { "type": "SCRIPT", - "path": "HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py" + "path": "HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py", + "parameters": [ + { + "name": "connection.timeout", + "display_name": "Connection Timeout", + "value": 5.0, + "type": "NUMERIC", + "description": "The maximum time before this alert is considered to be CRITICAL", + "units": "seconds", + "threshold": "CRITICAL" + } + ] } } ], http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py index 2455d3b..b44c2d4 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py @@ -30,13 
+30,17 @@ NN_HTTP_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}' NN_CHECKPOINT_TX_KEY = '{{hdfs-site/dfs.namenode.checkpoint.txns}}' NN_CHECKPOINT_PERIOD_KEY = '{{hdfs-site/dfs.namenode.checkpoint.period}}' -PERCENT_WARNING = 200 -PERCENT_CRITICAL = 200 +PERCENT_WARNING_KEY = 'checkpoint.time.warning.threshold' +PERCENT_WARNING_DEFAULT = 200 + +PERCENT_CRITICAL_KEY = 'checkpoint.time.critical.threshold' +PERCENT_CRITICAL_DEFAULT = 200 CHECKPOINT_TX_DEFAULT = 1000000 CHECKPOINT_PERIOD_DEFAULT = 21600 -CONNECTION_TIMEOUT = 5.0 +CONNECTION_TIMEOUT_KEY = 'connection.timeout' +CONNECTION_TIMEOUT_DEFAULT = 5.0 def get_tokens(): """ @@ -47,43 +51,55 @@ def get_tokens(): NN_CHECKPOINT_TX_KEY, NN_CHECKPOINT_PERIOD_KEY) -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ - if parameters is None: - return (('UNKNOWN', ['There were no parameters supplied to the script.'])) + if configurations is None: + return (('UNKNOWN', ['There were no configurations supplied to the script.'])) uri = None scheme = 'http' http_uri = None https_uri = None http_policy = 'HTTP_ONLY' - percent_warning = PERCENT_WARNING - percent_critical = PERCENT_CRITICAL checkpoint_tx = CHECKPOINT_TX_DEFAULT checkpoint_period = CHECKPOINT_PERIOD_DEFAULT - if NN_HTTP_ADDRESS_KEY in parameters: - http_uri = parameters[NN_HTTP_ADDRESS_KEY] + if NN_HTTP_ADDRESS_KEY in configurations: + http_uri = configurations[NN_HTTP_ADDRESS_KEY] - if NN_HTTPS_ADDRESS_KEY in parameters: - https_uri = parameters[NN_HTTPS_ADDRESS_KEY] + if NN_HTTPS_ADDRESS_KEY in configurations: + https_uri = 
configurations[NN_HTTPS_ADDRESS_KEY] - if NN_HTTP_POLICY_KEY in parameters: - http_policy = parameters[NN_HTTP_POLICY_KEY] + if NN_HTTP_POLICY_KEY in configurations: + http_policy = configurations[NN_HTTP_POLICY_KEY] - if NN_CHECKPOINT_TX_KEY in parameters: - checkpoint_tx = parameters[NN_CHECKPOINT_TX_KEY] + if NN_CHECKPOINT_TX_KEY in configurations: + checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY] + + if NN_CHECKPOINT_PERIOD_KEY in configurations: + checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY] + + # parse script arguments + connection_timeout = CONNECTION_TIMEOUT_DEFAULT + if CONNECTION_TIMEOUT_KEY in parameters: + connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY]) + + percent_warning = PERCENT_WARNING_DEFAULT + if PERCENT_WARNING_KEY in parameters: + percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100 + + percent_critical = PERCENT_CRITICAL_DEFAULT + if PERCENT_CRITICAL_KEY in parameters: + percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100 - if NN_CHECKPOINT_PERIOD_KEY in parameters: - checkpoint_period = parameters[NN_CHECKPOINT_PERIOD_KEY] - # determine the right URI and whether to use SSL uri = http_uri if http_policy == 'HTTPS_ONLY': @@ -102,8 +118,12 @@ def execute(parameters=None, host_name=None): result_code = "OK" try: - last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,"LastCheckpointTime")) - journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,"JournalTransactionInfo") + last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry, + "LastCheckpointTime", connection_timeout)) + + journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry, + "JournalTransactionInfo", connection_timeout) + journal_transaction_info_dict = json.loads(journal_transaction_info) last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId']) @@ -131,11 +151,11 @@ def get_time(delta): return {'h':h, 'm':m} -def get_value_from_jmx(query, 
jmx_property): +def get_value_from_jmx(query, jmx_property, connection_timeout): response = None try: - response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT) + response = urllib2.urlopen(query, timeout=connection_timeout) data = response.read() data_dict = json.loads(data) http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py index 2066d46..0d0e473 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py @@ -35,7 +35,8 @@ NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}' NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}' DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}' -CONNECTION_TIMEOUT = 5.0 +CONNECTION_TIMEOUT_KEY = 'connection.timeout' +CONNECTION_TIMEOUT_DEFAULT = 5.0 def get_tokens(): """ @@ -46,34 +47,41 @@ def get_tokens(): NN_HTTPS_ADDRESS_KEY, DFS_POLICY_KEY) -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ - if parameters is None: - return (RESULT_STATE_UNKNOWN, ['There were no parameters supplied to the 
script.']) + if configurations is None: + return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.']) # if not in HA mode, then SKIP - if not NAMESERVICE_KEY in parameters: + if not NAMESERVICE_KEY in configurations: return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled']) # hdfs-site is required - if not HDFS_SITE_KEY in parameters: + if not HDFS_SITE_KEY in configurations: return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)]) + # parse script arguments + connection_timeout = CONNECTION_TIMEOUT_DEFAULT + if CONNECTION_TIMEOUT_KEY in parameters: + connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY]) + + # determine whether or not SSL is enabled is_ssl_enabled = False - if DFS_POLICY_KEY in parameters: - dfs_policy = parameters[DFS_POLICY_KEY] + if DFS_POLICY_KEY in configurations: + dfs_policy = configurations[DFS_POLICY_KEY] if dfs_policy == "HTTPS_ONLY": is_ssl_enabled = True - name_service = parameters[NAMESERVICE_KEY] - hdfs_site = parameters[HDFS_SITE_KEY] + name_service = configurations[NAMESERVICE_KEY] + hdfs_site = configurations[HDFS_SITE_KEY] # look for dfs.ha.namenodes.foo nn_unique_ids_key = 'dfs.ha.namenodes.' 
+ name_service @@ -105,7 +113,7 @@ def execute(parameters=None, host_name=None): try: jmx_uri = jmx_uri_fragment.format(value) - state = get_value_from_jmx(jmx_uri,'State') + state = get_value_from_jmx(jmx_uri, 'State', connection_timeout) if state == HDFS_NN_STATE_ACTIVE: active_namenodes.append(value) @@ -161,11 +169,11 @@ def execute(parameters=None, host_name=None): return (RESULT_STATE_SKIPPED, ['Another host will report this alert']) -def get_value_from_jmx(query, jmx_property): +def get_value_from_jmx(query, jmx_property, connection_timeout): response = None try: - response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT) + response = urllib2.urlopen(query, timeout=connection_timeout) data = response.read() data_dict = json.loads(data) http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json index 9170b8f..2bdc532 100644 --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json @@ -11,7 +11,30 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py" + "path": "HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py", + "parameters": [ + { + "name": "default.smoke.user", + "display_name": "Default Smoke User", + "value": "ambari-qa", + "type": "STRING", + "description": "The user that will run the Hive commands if not specified in cluster-env/smokeuser" + }, + { + "name": "default.smoke.principal", + "display_name": "Default Smoke Principal", + "value": "[email protected]", + "type": "STRING", + "description": "The principal to use when retrieving the kerberos ticket if not 
specified in cluster-env/smokeuser_principal_name" + }, + { + "name": "default.smoke.keytab", + "display_name": "Default Smoke Keytab", + "value": "/etc/security/keytabs/smokeuser.headless.keytab", + "type": "STRING", + "description": "The keytab to use when retrieving the kerberos ticket if not specified in cluster-env/smokeuser_keytab" + } + ] } } ], @@ -25,7 +48,30 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py" + "path": "HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py", + "parameters": [ + { + "name": "default.smoke.user", + "display_name": "Default Smoke User", + "value": "ambari-qa", + "type": "STRING", + "description": "The user that will run the Hive commands if not specified in cluster-env/smokeuser" + }, + { + "name": "default.smoke.principal", + "display_name": "Default Smoke Principal", + "value": "ambari-qa@EXAMPLE.COM", + "type": "STRING", + "description": "The principal to use when retrieving the kerberos ticket if not specified in cluster-env/smokeuser_principal_name" + }, + { + "name": "default.smoke.keytab", + "display_name": "Default Smoke Keytab", + "value": "/etc/security/keytabs/smokeuser.headless.keytab", + "type": "STRING", + "description": "The keytab to use when retrieving the kerberos ticket if not specified in cluster-env/smokeuser_keytab" + } + ] } } ], @@ -39,7 +85,18 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py" + "path": "HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py", + "parameters": [ + { + "name": "connection.timeout", + "display_name": "Connection Timeout", + "value": 5.0, + "type": "NUMERIC", + "description": "The maximum time before this alert is considered to be CRITICAL", + "units": "seconds", + "threshold": "CRITICAL" + } + ] } } ]
http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py index 9f871be..a7ed54f 100644 --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py @@ -33,15 +33,20 @@ SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}' SMOKEUSER_PRINCIPAL_KEY = '{{cluster-env/smokeuser_principal_name}}' SMOKEUSER_KEY = '{{cluster-env/smokeuser}}' HIVE_METASTORE_URIS_KEY = '{{hive-site/hive.metastore.uris}}' + # The configured Kerberos executable search paths, if any KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}}' -PERCENT_WARNING = 200 -PERCENT_CRITICAL = 200 - - +# default keytab location +SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY = 'default.smoke.keytab' SMOKEUSER_KEYTAB_DEFAULT = '/etc/security/keytabs/smokeuser.headless.keytab' + +# default smoke principal +SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY = 'default.smoke.principal' SMOKEUSER_PRINCIPAL_DEFAULT = 'ambari-qa@EXAMPLE.COM' + +# default smoke user +SMOKEUSER_SCRIPT_PARAM_KEY = 'default.smoke.user' SMOKEUSER_DEFAULT = 'ambari-qa' def get_tokens(): @@ -53,46 +58,61 @@ def get_tokens(): HIVE_METASTORE_URIS_KEY, SMOKEUSER_KEY, KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY) -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations
(dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ - if parameters is None: - return (('UNKNOWN', ['There were no parameters supplied to the script.'])) + if configurations is None: + return (('UNKNOWN', ['There were no configurations supplied to the script.'])) - if not HIVE_METASTORE_URIS_KEY in parameters: + if not HIVE_METASTORE_URIS_KEY in configurations: return (('UNKNOWN', ['Hive metastore uris were not supplied to the script.'])) - metastore_uris = parameters[HIVE_METASTORE_URIS_KEY].split(',') + + metastore_uris = configurations[HIVE_METASTORE_URIS_KEY].split(',') security_enabled = False - if SECURITY_ENABLED_KEY in parameters: - security_enabled = str(parameters[SECURITY_ENABLED_KEY]).upper() == 'TRUE' + if SECURITY_ENABLED_KEY in configurations: + security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE' + # defaults + smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT - if SMOKEUSER_PRINCIPAL_KEY in parameters: - smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_KEY] - smokeuser = SMOKEUSER_DEFAULT - if SMOKEUSER_KEY in parameters: - smokeuser = parameters[SMOKEUSER_KEY] + + # check script params + if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters: + smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY] + + if SMOKEUSER_SCRIPT_PARAM_KEY in parameters: + smokeuser = parameters[SMOKEUSER_SCRIPT_PARAM_KEY] + + if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters: + smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY] + + + # check configurations last as they should always take precedence + if SMOKEUSER_PRINCIPAL_KEY in configurations: + smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY] + + if SMOKEUSER_KEY in configurations: + smokeuser = configurations[SMOKEUSER_KEY] result_code = None try: if 
security_enabled: - smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT - - if SMOKEUSER_KEYTAB_KEY in parameters: - smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_KEY] + if SMOKEUSER_KEYTAB_KEY in configurations: + smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY] # Get the configured Kerberos executable search paths, if any - if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in parameters: - kerberos_executable_search_paths = parameters[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY] + if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations: + kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY] else: kerberos_executable_search_paths = None http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py index 3e32db9..35217fc 100644 --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py @@ -36,18 +36,25 @@ HIVE_SERVER_PRINCIPAL_KEY = '{{hive-site/hive.server2.authentication.kerberos.pr SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}' SMOKEUSER_PRINCIPAL_KEY = '{{cluster-env/smokeuser_principal_name}}' SMOKEUSER_KEY = '{{cluster-env/smokeuser}}' + # The configured Kerberos executable search paths, if any KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}}' -PERCENT_WARNING = 200 -PERCENT_CRITICAL = 200 - THRIFT_PORT_DEFAULT = 10000 HIVE_SERVER_TRANSPORT_MODE_DEFAULT = 'binary' HIVE_SERVER_PRINCIPAL_DEFAULT = 'hive/_HOST@EXAMPLE.COM'
HIVE_SERVER2_AUTHENTICATION_DEFAULT = 'NOSASL' + +# default keytab location +SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY = 'default.smoke.keytab' SMOKEUSER_KEYTAB_DEFAULT = '/etc/security/keytabs/smokeuser.headless.keytab' + +# default smoke principal +SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY = 'default.smoke.principal' SMOKEUSER_PRINCIPAL_DEFAULT = 'ambari-qa@EXAMPLE.COM' + +# default smoke user +SMOKEUSER_SCRIPT_PARAM_KEY = 'default.smoke.user' SMOKEUSER_DEFAULT = 'ambari-qa' def get_tokens(): @@ -61,59 +68,73 @@ def get_tokens(): HIVE_SERVER_TRANSPORT_MODE_KEY,KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY) -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ - if parameters is None: - return ('UNKNOWN', ['There were no parameters supplied to the script.']) + if configurations is None: + return ('UNKNOWN', ['There were no configurations supplied to the script.']) transport_mode = HIVE_SERVER_TRANSPORT_MODE_DEFAULT - if HIVE_SERVER_TRANSPORT_MODE_KEY in parameters: - transport_mode = parameters[HIVE_SERVER_TRANSPORT_MODE_KEY] + if HIVE_SERVER_TRANSPORT_MODE_KEY in configurations: + transport_mode = configurations[HIVE_SERVER_TRANSPORT_MODE_KEY] port = THRIFT_PORT_DEFAULT - if transport_mode.lower() == 'binary' and HIVE_SERVER_THRIFT_PORT_KEY in parameters: - port = int(parameters[HIVE_SERVER_THRIFT_PORT_KEY]) - elif transport_mode.lower() == 'http' and HIVE_SERVER_THRIFT_HTTP_PORT_KEY in parameters: - port = int(parameters[HIVE_SERVER_THRIFT_HTTP_PORT_KEY]) + if transport_mode.lower() == 'binary' and HIVE_SERVER_THRIFT_PORT_KEY in configurations: + port =
int(configurations[HIVE_SERVER_THRIFT_PORT_KEY]) + elif transport_mode.lower() == 'http' and HIVE_SERVER_THRIFT_HTTP_PORT_KEY in configurations: + port = int(configurations[HIVE_SERVER_THRIFT_HTTP_PORT_KEY]) security_enabled = False - if SECURITY_ENABLED_KEY in parameters: - security_enabled = str(parameters[SECURITY_ENABLED_KEY]).upper() == 'TRUE' + if SECURITY_ENABLED_KEY in configurations: + security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE' hive_server2_authentication = HIVE_SERVER2_AUTHENTICATION_DEFAULT - if HIVE_SERVER2_AUTHENTICATION_KEY in parameters: - hive_server2_authentication = parameters[HIVE_SERVER2_AUTHENTICATION_KEY] + if HIVE_SERVER2_AUTHENTICATION_KEY in configurations: + hive_server2_authentication = configurations[HIVE_SERVER2_AUTHENTICATION_KEY] + # defaults + smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT - if SMOKEUSER_PRINCIPAL_KEY in parameters: - smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_KEY] - smokeuser = SMOKEUSER_DEFAULT - if SMOKEUSER_KEY in parameters: - smokeuser = parameters[SMOKEUSER_KEY] + + # check script params + if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters: + smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY] + + if SMOKEUSER_SCRIPT_PARAM_KEY in parameters: + smokeuser = parameters[SMOKEUSER_SCRIPT_PARAM_KEY] + + if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters: + smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY] + + + # check configurations last as they should always take precedence + if SMOKEUSER_PRINCIPAL_KEY in configurations: + smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY] + + if SMOKEUSER_KEY in configurations: + smokeuser = configurations[SMOKEUSER_KEY] result_code = None if security_enabled: hive_server_principal = HIVE_SERVER_PRINCIPAL_DEFAULT - if HIVE_SERVER_PRINCIPAL_KEY in parameters: - hive_server_principal = parameters[HIVE_SERVER_PRINCIPAL_KEY] - - smokeuser_keytab 
= SMOKEUSER_KEYTAB_DEFAULT + if HIVE_SERVER_PRINCIPAL_KEY in configurations: + hive_server_principal = configurations[HIVE_SERVER_PRINCIPAL_KEY] - if SMOKEUSER_KEYTAB_KEY in parameters: - smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_KEY] + if SMOKEUSER_KEYTAB_KEY in configurations: + smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY] # Get the configured Kerberos executable search paths, if any - if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in parameters: - kerberos_executable_search_paths = parameters[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY] + if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations: + kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY] else: kerberos_executable_search_paths = None http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py index dd20be4..7ee375e 100644 --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py @@ -53,8 +53,10 @@ KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}} WEBHCAT_OK_RESPONSE = 'ok' WEBHCAT_PORT_DEFAULT = 50111 -CURL_CONNECTION_TIMEOUT = '5' -CONNECTION_TIMEOUT = 5.0 +CONNECTION_TIMEOUT_KEY = 'connection.timeout' +CONNECTION_TIMEOUT_DEFAULT = 5.0 +CURL_CONNECTION_TIMEOUT_DEFAULT = str(int(CONNECTION_TIMEOUT_DEFAULT)) + def get_tokens(): """ @@ -64,27 +66,36 @@ def get_tokens(): return (TEMPLETON_PORT_KEY, SECURITY_ENABLED_KEY, WEBHCAT_KEYTAB_KEY, WEBHCAT_PRINCIPAL_KEY, 
KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY) -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ result_code = RESULT_CODE_UNKNOWN - if parameters is None: - return (result_code, ['There were no parameters supplied to the script.']) + if configurations is None: + return (result_code, ['There were no configurations supplied to the script.']) webhcat_port = WEBHCAT_PORT_DEFAULT - if TEMPLETON_PORT_KEY in parameters: - webhcat_port = int(parameters[TEMPLETON_PORT_KEY]) + if TEMPLETON_PORT_KEY in configurations: + webhcat_port = int(configurations[TEMPLETON_PORT_KEY]) security_enabled = False - if SECURITY_ENABLED_KEY in parameters: - security_enabled = parameters[SECURITY_ENABLED_KEY].lower() == 'true' + if SECURITY_ENABLED_KEY in configurations: + security_enabled = configurations[SECURITY_ENABLED_KEY].lower() == 'true' + + # parse script arguments + connection_timeout = CONNECTION_TIMEOUT_DEFAULT + curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT + if CONNECTION_TIMEOUT_KEY in parameters: + connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY]) + curl_connection_timeout = str(int(connection_timeout)) + # the alert will always run on the webhcat host if host_name is None: @@ -98,12 +109,12 @@ def execute(parameters=None, host_name=None): json_response = {} if security_enabled: - if WEBHCAT_KEYTAB_KEY not in parameters or WEBHCAT_PRINCIPAL_KEY not in parameters: - return (RESULT_CODE_UNKNOWN, [str(parameters)]) + if WEBHCAT_KEYTAB_KEY not in configurations or WEBHCAT_PRINCIPAL_KEY not in configurations: + return 
(RESULT_CODE_UNKNOWN, [str(configurations)]) try: - webhcat_keytab = parameters[WEBHCAT_KEYTAB_KEY] - webhcat_principal = parameters[WEBHCAT_PRINCIPAL_KEY] + webhcat_keytab = configurations[WEBHCAT_KEYTAB_KEY] + webhcat_principal = configurations[WEBHCAT_PRINCIPAL_KEY] # substitute _HOST in kerberos principal with actual fqdn webhcat_principal = webhcat_principal.replace('_HOST', host_name) @@ -115,8 +126,8 @@ def execute(parameters=None, host_name=None): kerberos_env = {'KRB5CCNAME': ccache_file} # Get the configured Kerberos executable search paths, if any - if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in parameters: - kerberos_executable_search_paths = parameters[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY] + if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations: + kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY] else: kerberos_executable_search_paths = None @@ -136,7 +147,7 @@ def execute(parameters=None, host_name=None): # make a single curl call to get just the http code curl = subprocess.Popen(['curl', '--negotiate', '-u', ':', '-sL', '-w', - '%{http_code}', '--connect-timeout', CURL_CONNECTION_TIMEOUT, + '%{http_code}', '--connect-timeout', curl_connection_timeout, '-o', '/dev/null', query_url], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env) stdout, stderr = curl.communicate() @@ -160,7 +171,7 @@ def execute(parameters=None, host_name=None): # now that we have the http status and it was 200, get the content start_time = time.time() curl = subprocess.Popen(['curl', '--negotiate', '-u', ':', '-sL', - '--connect-timeout', CURL_CONNECTION_TIMEOUT, query_url, ], + '--connect-timeout', curl_connection_timeout, query_url, ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env) stdout, stderr = curl.communicate() @@ -178,7 +189,7 @@ def execute(parameters=None, host_name=None): try: # execute the query for the JSON that includes WebHCat status start_time = time.time() - url_response = 
urllib2.urlopen(query_url, timeout=CONNECTION_TIMEOUT) + url_response = urllib2.urlopen(query_url, timeout=connection_timeout) total_time = time.time() - start_time json_response = json.loads(url_response.read()) http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py b/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py index cbb6299..c0dc18a 100644 --- a/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py +++ b/ambari-server/src/main/resources/common-services/OOZIE/4.0.0.2.0/package/alerts/alert_check_oozie_server.py @@ -59,7 +59,7 @@ def get_tokens(): return (OOZIE_URL_KEY, OOZIE_PRINCIPAL, SECURITY_ENABLED, OOZIE_KEYTAB, KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY) @OsFamilyFuncImpl(os_family=OSConst.WINSRV_FAMILY) -def get_check_command(oozie_url, host_name, parameters): +def get_check_command(oozie_url, host_name, configurations): from resource_management.libraries.functions import reload_windows_env reload_windows_env() oozie_home = os.environ['OOZIE_HOME'] @@ -67,20 +67,20 @@ def get_check_command(oozie_url, host_name, parameters): return (command, None) @OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT) -def get_check_command(oozie_url, host_name, parameters): +def get_check_command(oozie_url, host_name, configurations): security_enabled = False - if SECURITY_ENABLED in parameters: - security_enabled = str(parameters[SECURITY_ENABLED]).upper() == 'TRUE' + if SECURITY_ENABLED in configurations: + security_enabled = str(configurations[SECURITY_ENABLED]).upper() == 'TRUE' kerberos_env = None if security_enabled: - if OOZIE_KEYTAB in parameters and 
OOZIE_PRINCIPAL in parameters: - oozie_keytab = parameters[OOZIE_KEYTAB] - oozie_principal = parameters[OOZIE_PRINCIPAL] + if OOZIE_KEYTAB in configurations and OOZIE_PRINCIPAL in configurations: + oozie_keytab = configurations[OOZIE_KEYTAB] + oozie_principal = configurations[OOZIE_PRINCIPAL] # substitute _HOST in kerberos principal with actual fqdn oozie_principal = oozie_principal.replace('_HOST', host_name) else: - raise KerberosPropertiesNotFound('The Oozie keytab and principal are required parameters when security is enabled.') + raise KerberosPropertiesNotFound('The Oozie keytab and principal are required configurations when security is enabled.') # Create the kerberos credentials cache (ccache) file and set it in the environment to use # when executing curl @@ -89,8 +89,8 @@ def get_check_command(oozie_url, host_name, parameters): kerberos_env = {'KRB5CCNAME': ccache_file} # Get the configured Kerberos executable search paths, if any - if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in parameters: - kerberos_executable_search_paths = parameters[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY] + if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations: + kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY] else: kerberos_executable_search_paths = None @@ -110,30 +110,31 @@ def get_check_command(oozie_url, host_name, parameters): command = format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie {oozie_url} -status") return (command, kerberos_env) -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ - if 
parameters is None: - return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.']) + if configurations is None: + return (RESULT_CODE_UNKNOWN, ['There were no configurations supplied to the script.']) - if not OOZIE_URL_KEY in parameters: + if not OOZIE_URL_KEY in configurations: return (RESULT_CODE_UNKNOWN, ['The Oozie URL is a required parameter.']) # use localhost on Windows, 0.0.0.0 on others; 0.0.0.0 means bind to all # interfaces, which doesn't work on Windows localhost_address = 'localhost' if OSCheck.get_os_family() == OSConst.WINSRV_FAMILY else '0.0.0.0' - oozie_url = parameters[OOZIE_URL_KEY] + oozie_url = configurations[OOZIE_URL_KEY] oozie_url = oozie_url.replace(urlparse(oozie_url).hostname,localhost_address) try: - command, env = get_check_command(oozie_url, host_name, parameters) + command, env = get_check_command(oozie_url, host_name, configurations) # execute the command Execute(command, environment=env) http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/STORM/0.9.1.2.1/package/alerts/check_supervisor_process_win.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/STORM/0.9.1.2.1/package/alerts/check_supervisor_process_win.py b/ambari-server/src/main/resources/common-services/STORM/0.9.1.2.1/package/alerts/check_supervisor_process_win.py index dcae64a..a698415 100644 --- a/ambari-server/src/main/resources/common-services/STORM/0.9.1.2.1/package/alerts/check_supervisor_process_win.py +++ b/ambari-server/src/main/resources/common-services/STORM/0.9.1.2.1/package/alerts/check_supervisor_process_win.py @@ -33,12 +33,13 @@ def get_tokens(): """ return () -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a 
mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/alerts.json index a446808..d25dd78 100644 --- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/alerts.json +++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/alerts.json @@ -203,7 +203,18 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py" + "path": "YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py", + "parameters": [ + { + "name": "connection.timeout", + "display_name": "Connection Timeout", + "value": 5.0, + "type": "NUMERIC", + "description": "The maximum time before this alert is considered to be CRITICAL", + "units": "seconds", + "threshold": "CRITICAL" + } + ] } } ], @@ -337,7 +348,18 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py" + "path": "YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py", + "parameters": [ + { + "name": "connection.timeout", + "display_name": "Connection Timeout", + "value": 5.0, + "type": "NUMERIC", + "description": "The maximum time before this alert is considered to be CRITICAL", + "units": "seconds", + "threshold": "CRITICAL" + } + ] } } ], http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py 
---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py index 26bf2a0..1cdeb97 100644 --- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py +++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py @@ -40,7 +40,8 @@ CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager hea NODEMANAGER_DEFAULT_PORT = 8042 -CONNECTION_TIMEOUT = 5.0 +CONNECTION_TIMEOUT_KEY = 'connection.timeout' +CONNECTION_TIMEOUT_DEFAULT = 5.0 def get_tokens(): """ @@ -51,32 +52,40 @@ def get_tokens(): YARN_HTTP_POLICY_KEY) -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ result_code = RESULT_CODE_UNKNOWN - if parameters is None: - return (result_code, ['There were no parameters supplied to the script.']) + if configurations is None: + return (result_code, ['There were no configurations supplied to the script.']) scheme = 'http' http_uri = None https_uri = None http_policy = 'HTTP_ONLY' - if NODEMANAGER_HTTP_ADDRESS_KEY in parameters: - http_uri = parameters[NODEMANAGER_HTTP_ADDRESS_KEY] + if NODEMANAGER_HTTP_ADDRESS_KEY in configurations: + http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY] - if NODEMANAGER_HTTPS_ADDRESS_KEY in parameters: - https_uri = 
parameters[NODEMANAGER_HTTPS_ADDRESS_KEY] + if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations: + https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY] + + if YARN_HTTP_POLICY_KEY in configurations: + http_policy = configurations[YARN_HTTP_POLICY_KEY] + + + # parse script arguments + connection_timeout = CONNECTION_TIMEOUT_DEFAULT + if CONNECTION_TIMEOUT_KEY in parameters: + connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY]) - if YARN_HTTP_POLICY_KEY in parameters: - http_policy = parameters[YARN_HTTP_POLICY_KEY] # determine the right URI and whether to use SSL uri = http_uri @@ -108,7 +117,7 @@ def execute(parameters=None, host_name=None): try: # execute the query for the JSON that includes templeton status - url_response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT) + url_response = urllib2.urlopen(query, timeout=connection_timeout) except urllib2.HTTPError, httpError: label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query, str(httpError)) http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py index 7c00625..eafdbd0 100644 --- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py +++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py @@ -30,7 +30,8 @@ NODEMANAGER_HTTP_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.address} NODEMANAGER_HTTPS_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.https.address}}' YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}' -CONNECTION_TIMEOUT = 5.0 
+CONNECTION_TIMEOUT_KEY = 'connection.timeout' +CONNECTION_TIMEOUT_DEFAULT = 5.0 def get_tokens(): """ @@ -41,32 +42,38 @@ def get_tokens(): YARN_HTTP_POLICY_KEY -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ - if parameters is None: - return (('UNKNOWN', ['There were no parameters supplied to the script.'])) + if configurations is None: + return (('UNKNOWN', ['There were no configurations supplied to the script.'])) scheme = 'http' http_uri = None https_uri = None http_policy = 'HTTP_ONLY' - if NODEMANAGER_HTTP_ADDRESS_KEY in parameters: - http_uri = parameters[NODEMANAGER_HTTP_ADDRESS_KEY] + if NODEMANAGER_HTTP_ADDRESS_KEY in configurations: + http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY] - if NODEMANAGER_HTTPS_ADDRESS_KEY in parameters: - https_uri = parameters[NODEMANAGER_HTTPS_ADDRESS_KEY] + if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations: + https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY] + + if YARN_HTTP_POLICY_KEY in configurations: + http_policy = configurations[YARN_HTTP_POLICY_KEY] + + # parse script arguments + connection_timeout = CONNECTION_TIMEOUT_DEFAULT + if CONNECTION_TIMEOUT_KEY in parameters: + connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY]) - if YARN_HTTP_POLICY_KEY in parameters: - http_policy = parameters[YARN_HTTP_POLICY_KEY] - # determine the right URI and whether to use SSL uri = http_uri if http_policy == 'HTTPS_ONLY': @@ -78,7 +85,8 @@ def execute(parameters=None, host_name=None): live_nodemanagers_qry = 
"{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(scheme, uri) try: - live_nodemanagers = json.loads(get_value_from_jmx(live_nodemanagers_qry, "LiveNodeManagers")) + live_nodemanagers = json.loads(get_value_from_jmx(live_nodemanagers_qry, + "LiveNodeManagers", connection_timeout)) unhealthy_count = 0 @@ -104,14 +112,14 @@ def execute(parameters=None, host_name=None): return (result_code, [label]) -def get_value_from_jmx(query, jmx_property): +def get_value_from_jmx(query, jmx_property, connection_timeout): response = None try: # use a customer header process that will look for the non-standard # "Refresh" header and attempt to follow the redirect url_opener = urllib2.build_opener(RefreshHeaderProcessor()) - response = url_opener.open(query, timeout=CONNECTION_TIMEOUT) + response = url_opener.open(query, timeout=connection_timeout) data = response.read() data_dict = json.loads(data) http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/host_scripts/alert_disk_space.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/host_scripts/alert_disk_space.py b/ambari-server/src/main/resources/host_scripts/alert_disk_space.py index 68e22df..94f19d6 100644 --- a/ambari-server/src/main/resources/host_scripts/alert_disk_space.py +++ b/ambari-server/src/main/resources/host_scripts/alert_disk_space.py @@ -25,7 +25,16 @@ from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl from ambari_commons import OSConst DiskInfo = collections.namedtuple('DiskInfo', 'total used free path') -MIN_FREE_SPACE = 5000000000L # 5GB + +# script parameter keys +MIN_FREE_SPACE_KEY = "minimum.free.space" +PERCENT_USED_WARNING_KEY = "percent.used.space.warning.threshold" +PERCENT_USED_CRITICAL_KEY = "percent.free.space.critical.threshold" + +# defaults in case no script parameters are passed +MIN_FREE_SPACE_DEFAULT = 5000000000L +PERCENT_USED_WARNING_DEFAULT = 
50 +PERCENT_USED_CRITICAL_DEFAULT = 80 # the location where HDP installs components when using HDP 2.2+ HDP_HOME_DIR = "/usr/hdp" @@ -40,8 +49,9 @@ def get_tokens(): """ return None + @OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT) -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Performs advanced disk checks under Linux. This will first attempt to check the HDP installation directories if they exist. If they do not exist, @@ -50,7 +60,8 @@ def execute(parameters=None, host_name=None): Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ @@ -66,21 +77,40 @@ def execute(parameters=None, host_name=None): try: disk_usage = _get_disk_usage(path) - result_code, label = _get_warnings_for_partition(disk_usage) + result_code, label = _get_warnings_for_partition(parameters, disk_usage) except NotImplementedError, platform_error: return 'CRITICAL', [str(platform_error)] return result_code, [label] -def _get_warnings_for_partition(disk_usage): + +def _get_warnings_for_partition(parameters, disk_usage): + + # start with hard coded defaults + min_free_space = MIN_FREE_SPACE_DEFAULT + warning_percent = PERCENT_USED_WARNING_DEFAULT + critical_percent = PERCENT_USED_CRITICAL_DEFAULT + + # parse script parameters + if MIN_FREE_SPACE_KEY in parameters: + # long(float(5e9)) seems like gson likes scientific notation + min_free_space = long(float(parameters[MIN_FREE_SPACE_KEY])) + + if PERCENT_USED_WARNING_KEY in parameters: + warning_percent = float(parameters[PERCENT_USED_WARNING_KEY]) * 100 + + if PERCENT_USED_CRITICAL_KEY in parameters: + critical_percent = float(parameters[PERCENT_USED_CRITICAL_KEY]) 
* 100 + + if disk_usage is None or disk_usage.total == 0: return 'CRITICAL', ['Unable to determine the disk usage'] result_code = 'OK' percent = disk_usage.used / float(disk_usage.total) * 100 - if percent > 80: + if percent > critical_percent: result_code = 'CRITICAL' - elif percent > 50: + elif percent > warning_percent: result_code = 'WARNING' label = 'Capacity Used: [{0:.2f}%, {1}], Capacity Total: [{2}]'.format( @@ -92,26 +122,27 @@ def _get_warnings_for_partition(disk_usage): if result_code == 'OK': # Check absolute disk space value - if disk_usage.free < MIN_FREE_SPACE: + if disk_usage.free < min_free_space: result_code = 'WARNING' - label += '. Total free space is less than {0}'.format(_get_formatted_size(MIN_FREE_SPACE)) + label += '. Total free space is less than {0}'.format(_get_formatted_size(min_free_space)) return result_code, label @OsFamilyFuncImpl(os_family=OSConst.WINSRV_FAMILY) -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Performs simplified disk checks under Windows Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ try: disk_usage = _get_disk_usage() - result = _get_warnings_for_partition(disk_usage) + result = _get_warnings_for_partition(parameters, disk_usage) except NotImplementedError, platform_error: result = ('CRITICAL', [str(platform_error)]) return result http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/alerts.json 
b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/alerts.json index 3b117d0..cac5855 100644 --- a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/alerts.json +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/alerts.json @@ -10,7 +10,16 @@ "scope": "ANY", "source": { "type": "SCRIPT", - "path": "BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py" + "path": "BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py", + "parameters": [ + { + "name": "run.directory", + "display_name": "Run Directory", + "value": "/var/run/flume", + "type": "STRING", + "description": "The directory where flume agent processes will place their PID files." + } + ] } } ] http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py index b183bbc..0838b4c 100644 --- a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py @@ -29,7 +29,8 @@ RESULT_CODE_UNKNOWN = 'UNKNOWN' FLUME_CONF_DIR_KEY = '{{flume-env/flume_conf_dir}}' -FLUME_RUN_DIR = '/var/run/flume' +FLUME_RUN_DIR_KEY = "run.directory" +FLUME_RUN_DIR_DEFAULT = '/var/run/flume' def get_tokens(): """ @@ -37,23 +38,24 @@ def get_tokens(): to build the dictionary passed into execute """ return (FLUME_CONF_DIR_KEY,) - -def execute(parameters=None, host_name=None): + +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a 
pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ - if parameters is None: - return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.']) + if configurations is None: + return (RESULT_CODE_UNKNOWN, ['There were no configurations supplied to the script.']) flume_conf_directory = None - if FLUME_CONF_DIR_KEY in parameters: - flume_conf_directory = parameters[FLUME_CONF_DIR_KEY] + if FLUME_CONF_DIR_KEY in configurations: + flume_conf_directory = configurations[FLUME_CONF_DIR_KEY] if flume_conf_directory is None: return (RESULT_CODE_UNKNOWN, ['The Flume configuration directory is a required parameter.']) @@ -61,7 +63,12 @@ def execute(parameters=None, host_name=None): if host_name is None: host_name = socket.getfqdn() - processes = get_flume_status(flume_conf_directory, FLUME_RUN_DIR) + # parse script arguments + flume_run_directory = FLUME_RUN_DIR_DEFAULT + if FLUME_RUN_DIR_KEY in parameters: + flume_run_directory = parameters[FLUME_RUN_DIR_KEY] + + processes = get_flume_status(flume_conf_directory, flume_run_directory) expected_agents = find_expected_agent_names(flume_conf_directory) alert_label = '' http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json index 8e08ef2..c0b9b0b 100644 --- a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json @@ -387,7 
+387,36 @@ "enabled": true, "source": { "type": "SCRIPT", - "path": "BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py" + "path": "BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py", + "parameters": [ + { + "name": "connection.timeout", + "display_name": "Connection Timeout", + "value": 5.0, + "type": "NUMERIC", + "description": "The maximum time before this alert is considered to be CRITICAL", + "units": "seconds", + "threshold": "CRITICAL" + }, + { + "name": "checkpoint_time_warning_threshold", + "display_name": "Checkpoint Warning", + "value": 2.0, + "type": "PERCENT", + "description": "The percentage of the last checkpoint time greater than the interval in order to trigger a warning alert.", + "units": "%", + "threshold": "WARNING" + }, + { + "name": "checkpoint_time_critical_threshold", + "display_name": "Checkpoint Critical", + "value": 2.0, + "type": "PERCENT", + "description": "The percentage of the last checkpoint time greater than the interval in order to trigger a critical alert.", + "units": "%", + "threshold": "CRITICAL" + } + ] } }, { @@ -400,7 +429,18 @@ "ignore_host": true, "source": { "type": "SCRIPT", - "path": "BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py" + "path": "BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py", + "parameters": [ + { + "name": "connection.timeout", + "display_name": "Connection Timeout", + "value": 5.0, + "type": "NUMERIC", + "description": "The maximum time before this alert is considered to be CRITICAL", + "units": "seconds", + "threshold": "CRITICAL" + } + ] } } ], http://git-wip-us.apache.org/repos/asf/ambari/blob/6727c1dc/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py 
b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py index 2455d3b..b44c2d4 100644 --- a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py @@ -30,13 +30,17 @@ NN_HTTP_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}' NN_CHECKPOINT_TX_KEY = '{{hdfs-site/dfs.namenode.checkpoint.txns}}' NN_CHECKPOINT_PERIOD_KEY = '{{hdfs-site/dfs.namenode.checkpoint.period}}' -PERCENT_WARNING = 200 -PERCENT_CRITICAL = 200 +PERCENT_WARNING_KEY = 'checkpoint.time.warning.threshold' +PERCENT_WARNING_DEFAULT = 200 + +PERCENT_CRITICAL_KEY = 'checkpoint.time.critical.threshold' +PERCENT_CRITICAL_DEFAULT = 200 CHECKPOINT_TX_DEFAULT = 1000000 CHECKPOINT_PERIOD_DEFAULT = 21600 -CONNECTION_TIMEOUT = 5.0 +CONNECTION_TIMEOUT_KEY = 'connection.timeout' +CONNECTION_TIMEOUT_DEFAULT = 5.0 def get_tokens(): """ @@ -47,43 +51,55 @@ def get_tokens(): NN_CHECKPOINT_TX_KEY, NN_CHECKPOINT_PERIOD_KEY) -def execute(parameters=None, host_name=None): +def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: - parameters (dictionary): a mapping of parameter key to value + configurations (dictionary): a mapping of configuration key to value + parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ - if parameters is None: - return (('UNKNOWN', ['There were no parameters supplied to the script.'])) + if configurations is None: + return (('UNKNOWN', ['There were no configurations supplied to the script.'])) uri = None scheme = 'http' http_uri = None https_uri = None http_policy = 'HTTP_ONLY' - percent_warning = PERCENT_WARNING - percent_critical = PERCENT_CRITICAL checkpoint_tx = CHECKPOINT_TX_DEFAULT checkpoint_period 
= CHECKPOINT_PERIOD_DEFAULT - if NN_HTTP_ADDRESS_KEY in parameters: - http_uri = parameters[NN_HTTP_ADDRESS_KEY] + if NN_HTTP_ADDRESS_KEY in configurations: + http_uri = configurations[NN_HTTP_ADDRESS_KEY] - if NN_HTTPS_ADDRESS_KEY in parameters: - https_uri = parameters[NN_HTTPS_ADDRESS_KEY] + if NN_HTTPS_ADDRESS_KEY in configurations: + https_uri = configurations[NN_HTTPS_ADDRESS_KEY] - if NN_HTTP_POLICY_KEY in parameters: - http_policy = parameters[NN_HTTP_POLICY_KEY] + if NN_HTTP_POLICY_KEY in configurations: + http_policy = configurations[NN_HTTP_POLICY_KEY] - if NN_CHECKPOINT_TX_KEY in parameters: - checkpoint_tx = parameters[NN_CHECKPOINT_TX_KEY] + if NN_CHECKPOINT_TX_KEY in configurations: + checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY] + + if NN_CHECKPOINT_PERIOD_KEY in configurations: + checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY] + + # parse script arguments + connection_timeout = CONNECTION_TIMEOUT_DEFAULT + if CONNECTION_TIMEOUT_KEY in parameters: + connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY]) + + percent_warning = PERCENT_WARNING_DEFAULT + if PERCENT_WARNING_KEY in parameters: + percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100 + + percent_critical = PERCENT_CRITICAL_DEFAULT + if PERCENT_CRITICAL_KEY in parameters: + percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100 - if NN_CHECKPOINT_PERIOD_KEY in parameters: - checkpoint_period = parameters[NN_CHECKPOINT_PERIOD_KEY] - # determine the right URI and whether to use SSL uri = http_uri if http_policy == 'HTTPS_ONLY': @@ -102,8 +118,12 @@ def execute(parameters=None, host_name=None): result_code = "OK" try: - last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,"LastCheckpointTime")) - journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,"JournalTransactionInfo") + last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry, + "LastCheckpointTime", connection_timeout)) + + 
journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry, + "JournalTransactionInfo", connection_timeout) + journal_transaction_info_dict = json.loads(journal_transaction_info) last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId']) @@ -131,11 +151,11 @@ def get_time(delta): return {'h':h, 'm':m} -def get_value_from_jmx(query, jmx_property): +def get_value_from_jmx(query, jmx_property, connection_timeout): response = None try: - response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT) + response = urllib2.urlopen(query, timeout=connection_timeout) data = response.read() data_dict = json.loads(data)
