Repository: ambari Updated Branches: refs/heads/trunk f4af798a5 -> d348f5db5
AMBARI-8084 - Alerts: Convert Bigtop Stack Nagios Alerts (jonathanhurley) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/d348f5db Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/d348f5db Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/d348f5db Branch: refs/heads/trunk Commit: d348f5db54e3a59a7a228fa698a8eb6d05712d6c Parents: f4af798 Author: Jonathan Hurley <jhur...@hortonworks.com> Authored: Thu Nov 6 12:24:38 2014 -0500 Committer: Jonathan Hurley <jhur...@hortonworks.com> Committed: Fri Nov 7 16:12:17 2014 -0500 ---------------------------------------------------------------------- .../server/api/services/AmbariMetaInfo.java | 6 - .../AlertDefinitionResourceProvider.java | 1 + .../internal/AlertGroupResourceProvider.java | 7 + .../BIGTOP/0.8/services/FLUME/alerts.json | 17 + .../package/files/alert_flume_agent_status.py | 99 ++++ .../BIGTOP/0.8/services/GANGLIA/alerts.json | 107 +++++ .../BIGTOP/0.8/services/HBASE/alerts.json | 109 +++++ .../stacks/BIGTOP/0.8/services/HDFS/alerts.json | 480 +++++++++++++++++++ .../HDFS/package/files/alert_checkpoint_time.py | 136 ++++++ .../package/files/alert_ha_namenode_health.py | 166 +++++++ .../stacks/BIGTOP/0.8/services/HIVE/alerts.json | 39 ++ .../package/files/alert_hive_thrift_port.py | 89 ++++ .../BIGTOP/0.8/services/OOZIE/alerts.json | 40 ++ .../package/files/alert_check_oozie_server.py | 74 +++ .../BIGTOP/0.8/services/WEBHCAT/alerts.json | 18 + .../package/files/alert_webhcat_server.py | 111 +++++ .../stacks/BIGTOP/0.8/services/YARN/alerts.json | 321 +++++++++++++ .../package/files/alert_nodemanager_health.py | 123 +++++ .../BIGTOP/0.8/services/ZOOKEEPER/alerts.json | 51 ++ .../AlertDefinitionResourceProviderTest.java | 5 + .../AlertGroupResourceProviderTest.java | 31 ++ 21 files changed, 2024 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/java/org/apache/ambari/server/api/services/AmbariMetaInfo.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/api/services/AmbariMetaInfo.java b/ambari-server/src/main/java/org/apache/ambari/server/api/services/AmbariMetaInfo.java index 3d67fe9..bb4c569 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/api/services/AmbariMetaInfo.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/api/services/AmbariMetaInfo.java @@ -1275,12 +1275,6 @@ public class AmbariMetaInfo { stackDefinitions.addAll(serviceDefinitions); } - // if there are no alert definitions defined for the cluster services - // then don't do anything and go to the next cluster - if (null == stackDefinitions || stackDefinitions.size() == 0) { - continue; - } - List<AlertDefinitionEntity> persist = new ArrayList<AlertDefinitionEntity>(); List<AlertDefinitionEntity> entities = alertDefinitionDao.findAll(clusterId); http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertDefinitionResourceProvider.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertDefinitionResourceProvider.java b/ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertDefinitionResourceProvider.java index a8a7f67..f66fc1d 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertDefinitionResourceProvider.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertDefinitionResourceProvider.java @@ -616,6 +616,7 @@ public class AlertDefinitionResourceProvider extends AbstractControllerResourceP setResourceProperty(resource, ALERT_DEF_SERVICE_NAME, 
entity.getServiceName(), requestedIds); setResourceProperty(resource, ALERT_DEF_COMPONENT_NAME, entity.getComponentName(), requestedIds); setResourceProperty(resource, ALERT_DEF_ENABLED, Boolean.valueOf(entity.getEnabled()), requestedIds); + setResourceProperty(resource, ALERT_DEF_IGNORE_HOST, Boolean.valueOf(entity.isHostIgnored()), requestedIds); setResourceProperty(resource, ALERT_DEF_SCOPE, entity.getScope(), requestedIds); boolean sourceTypeRequested = setResourceProperty(resource, http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertGroupResourceProvider.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertGroupResourceProvider.java b/ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertGroupResourceProvider.java index 50820a7..c93ef29 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertGroupResourceProvider.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AlertGroupResourceProvider.java @@ -223,6 +223,13 @@ public class AlertGroupResourceProvider extends LOG.info("Deleting alert target {}", groupId); final AlertGroupEntity entity = s_dao.findGroupById(groupId.longValue()); + if (entity.isDefault()) { + // default groups cannot be removed + LOG.warn("The default alert group for {} cannot be removed", + entity.getServiceName()); + + continue; + } modifyResources(new Command<Void>() { @Override http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/alerts.json b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/alerts.json 
new file mode 100644 index 0000000..86fb854 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/alerts.json @@ -0,0 +1,17 @@ +{ + "FLUME": { + "service": [], + "FLUME_HANDLER": [ + { + "name": "flume_agent_status", + "label": "Flume Agent Status", + "interval": 1, + "scope": "ANY", + "source": { + "type": "SCRIPT", + "path": "BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py" + } + } + ] + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py new file mode 100644 index 0000000..b183bbc --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/FLUME/package/files/alert_flume_agent_status.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import socket + +from resource_management.libraries.functions.flume_agent_helper import find_expected_agent_names +from resource_management.libraries.functions.flume_agent_helper import get_flume_status + +RESULT_CODE_OK = 'OK' +RESULT_CODE_CRITICAL = 'CRITICAL' +RESULT_CODE_UNKNOWN = 'UNKNOWN' + +FLUME_CONF_DIR_KEY = '{{flume-env/flume_conf_dir}}' + +FLUME_RUN_DIR = '/var/run/flume' + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (FLUME_CONF_DIR_KEY,) + + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of parameter key to value + host_name (string): the name of this host where the alert is running + """ + + if parameters is None: + return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.']) + + flume_conf_directory = None + if FLUME_CONF_DIR_KEY in parameters: + flume_conf_directory = parameters[FLUME_CONF_DIR_KEY] + + if flume_conf_directory is None: + return (RESULT_CODE_UNKNOWN, ['The Flume configuration directory is a required parameter.']) + + if host_name is None: + host_name = socket.getfqdn() + + processes = get_flume_status(flume_conf_directory, FLUME_RUN_DIR) + expected_agents = find_expected_agent_names(flume_conf_directory) + + alert_label = '' + alert_state = RESULT_CODE_OK + + if len(processes) == 0 and len(expected_agents) == 0: + alert_label = 'No agents defined on {0}'.format(host_name) + else: + ok = [] + critical = [] + text_arr = [] + + for process in processes: + if not process.has_key('status') or process['status'] == 'NOT_RUNNING': + critical.append(process['name']) + else: + ok.append(process['name']) + + if len(critical) > 0: + text_arr.append("{0} {1} NOT running".format(", ".join(critical), + "is" if len(critical) == 1 else "are")) + + if len(ok) > 0: 
+ text_arr.append("{0} {1} running".format(", ".join(ok), + "is" if len(ok) == 1 else "are")) + + plural = len(critical) > 1 or len(ok) > 1 + alert_label = "Agent{0} {1} {2}".format( + "s" if plural else "", + " and ".join(text_arr), + "on " + host_name) + + alert_state = RESULT_CODE_CRITICAL if len(critical) > 0 else RESULT_CODE_OK + + return (alert_state, [alert_label]) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/GANGLIA/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/GANGLIA/alerts.json b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/GANGLIA/alerts.json new file mode 100644 index 0000000..05053a3 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/GANGLIA/alerts.json @@ -0,0 +1,107 @@ +{ + "GANGLIA": { + "service": [], + "GANGLIA_SERVER": [ + { + "name": "ganglia_server_process", + "label": "Ganglia Server Process", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "8651", + "default_port": 8651, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + }, + { + "name": "ganglia_monitor_hdfs_namenode", + "label": "Ganglia NameNode Process Monitor", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "8661", + "default_port": 8661, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + }, + { + "name": "ganglia_monitor_hbase_master", + "label": "Ganglia HBase Master Process Monitor", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "8663", + "default_port": 8663, + "reporting": { + 
"ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + }, + { + "name": "ganglia_monitor_yarn_resourcemanager", + "label": "Ganglia ResourceManager Process Monitor", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "8664", + "default_port": 8664, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + }, + { + "name": "ganglia_monitor_mapreduce_history_server", + "label": "Ganglia History Server Process Monitor", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "8666", + "default_port": 8666, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + } + ] + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HBASE/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HBASE/alerts.json b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HBASE/alerts.json new file mode 100644 index 0000000..b8b8cab --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HBASE/alerts.json @@ -0,0 +1,109 @@ +{ + "HBASE": { + "service": [ + { + "name": "hbase_regionserver_process_percent", + "label": "Percent RegionServers Available", + "interval": 1, + "scope": "SERVICE", + "enabled": true, + "source": { + "type": "AGGREGATE", + "alert_name": "hbase_regionserver_process", + "reporting": { + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.1 + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + 
"value": 0.3 + } + } + } + } + ], + "HBASE_MASTER": [ + { + "name": "hbase_master_process", + "label": "HBase Master Process", + "interval": 1, + "scope": "ANY", + "source": { + "type": "PORT", + "uri": "{{hbase-site/hbase.master.port}}", + "default_port": 60000, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + }, + { + "name": "hbase_master_cpu", + "label": "HBase Maser CPU Utilization", + "interval": 5, + "scope": "ANY", + "enabled": true, + "source": { + "type": "METRIC", + "uri": { + "http": "{{hbase-site/hbase.master.info.port}}", + "https": "{{hbase-site/hbase.master.info.port}}", + "https_property": "{{cluster-env/security_enabled}}", + "https_property_value": "true", + "default_port": 60010 + }, + "reporting": { + "ok": { + "text": "{1} CPU, load {0:.1%}" + }, + "warning": { + "text": "{1} CPU, load {0:.1%}", + "value": 200 + }, + "critical": { + "text": "{1} CPU, load {0:.1%}", + "value": 250 + } + }, + "jmx": { + "property_list": [ + "java.lang:type=OperatingSystem/SystemCpuLoad", + "java.lang:type=OperatingSystem/AvailableProcessors" + ], + "value": "{0} * 100" + } + } + } + ], + "HBASE_REGIONSERVER": [ + { + "name": "hbase_regionserver_process", + "label": "HBase RegionServer Process", + "interval": 1, + "scope": "HOST", + "source": { + "type": "PORT", + "uri": "{{hbase-site/hbase.regionserver.info.port}}", + "default_port": 60030, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + } + ] + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json 
b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json new file mode 100644 index 0000000..96cb931 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/alerts.json @@ -0,0 +1,480 @@ +{ + "HDFS":{ + "service": [ + { + "name": "datanode_process_percent", + "label": "Percent DataNodes Available", + "interval": 1, + "scope": "SERVICE", + "enabled": true, + "source": { + "type": "AGGREGATE", + "alert_name": "datanode_process", + "reporting": { + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.1 + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.3 + } + } + } + }, + { + "name": "datanode_storage_percent", + "label": "Percent DataNodes With Available Space", + "interval": 1, + "scope": "SERVICE", + "enabled": true, + "source": { + "type": "AGGREGATE", + "alert_name": "datanode_storage", + "reporting": { + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.1 + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.3 + } + } + } + }, + { + "name": "journalnode_process_percent", + "label": "Percent JournalNodes Available", + "interval": 1, + "scope": "SERVICE", + "enabled": true, + "source": { + "type": "AGGREGATE", + "alert_name": "journalnode_process", + "reporting": { + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.33 + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.50 + } + } + } + } + ], + "NAMENODE": [ + { + "name": "namenode_webui", + "label": "NameNode Web UI", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "WEB", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "https_property": 
"{{hdfs-site/dfs.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "warning":{ + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "critical": { + "text": "Connection failed to {1}" + } + } + } + }, + { + "name": "namenode_cpu", + "label": "NameNode Host CPU Utilization", + "interval": 5, + "scope": "ANY", + "enabled": true, + "source": { + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "{1} CPU, load {0:.1%}" + }, + "warning": { + "text": "{1} CPU, load {0:.1%}", + "value": 200 + }, + "critical": { + "text": "{1} CPU, load {0:.1%}", + "value": 250 + } + }, + "jmx": { + "property_list": [ + "java.lang:type=OperatingSystem/SystemCpuLoad", + "java.lang:type=OperatingSystem/AvailableProcessors" + ], + "value": "{0} * 100" + } + } + }, + { + "name": "namenode_hdfs_blocks_health", + "label": "NameNode Blocks Health", + "interval": 2, + "scope": "ANY", + "enabled": true, + "source": { + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "Total Blocks:[{1}], Missing Blocks:[{0}]" + }, + "warning": { + "text": "Total Blocks:[{1}], Missing Blocks:[{0}]", + "value": 1 + }, + "critical": { + "text": "Total Blocks:[{1}], Missing Blocks:[{0}]", + "value": 1 + } + }, + "jmx": { + "property_list": [ + "Hadoop:service=NameNode,name=FSNamesystem/MissingBlocks", + "Hadoop:service=NameNode,name=FSNamesystem/BlocksTotal" + ], + "value": "{0}" + } + } + }, + { + "name": "namenode_hdfs_capacity_utilization", + "label": "HDFS Capacity Utilization", + 
"interval": 2, + "scope": "ANY", + "enabled": true, + "source": { + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]" + }, + "warning": { + "text": "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]", + "value": 80 + }, + "critical": { + "text": "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]", + "value": 90 + } + }, + "jmx": { + "property_list": [ + "Hadoop:service=NameNode,name=FSNamesystemState/CapacityUsed", + "Hadoop:service=NameNode,name=FSNamesystemState/CapacityRemaining" + ], + "value": "{0}/({0} + {1}) * 100" + } + } + }, + { + "name": "namenode_rpc_latency", + "label": "NameNode RPC Latency", + "interval": 2, + "scope": "ANY", + "enabled": true, + "source": { + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https": "{{hdfs-site/dfs.namenode.https-address}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]" + }, + "warning": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]", + "value": 3000 + }, + "critical": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]", + "value": 5000 + } + }, + "jmx": { + "property_list": [ + "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcQueueTimeAvgTime", + "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcProcessingTimeAvgTime" + ], + "value": "{0}" + } + } + }, + { + "name": "namenode_directory_status", + "label": "NameNode Directory Status", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.namenode.http-address}}", + "https": 
"{{hdfs-site/dfs.namenode.https-address}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "Directories are healthy" + }, + "warning": { + "text": "Failed directory count: {1}", + "value": 1 + }, + "critical": { + "text": "Failed directory count: {1}", + "value": 1 + } + }, + "jmx": { + "property_list": [ + "Hadoop:service=NameNode,name=NameNodeInfo/NameDirStatuses" + ], + "value": "calculate(args)\ndef calculate(args):\n import json\n json_statuses = json.loads({0})\n return len(json_statuses['failed']) if 'failed' in json_statuses else 0" + } + } + }, + { + "name": "namenode_process", + "label": "NameNode Process", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "{{hdfs-site/dfs.namenode.http-address}}", + "default_port": 50070, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + }, + { + "name": "namenode_last_checkpoint", + "label": "NameNode Last Checkpoint", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "SCRIPT", + "path": "BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py" + } + }, + { + "name": "namenode_ha_health", + "label": "NameNode High Availability Health", + "interval": 1, + "scope": "ANY", + "enabled": true, + "ignore_host": true, + "source": { + "type": "SCRIPT", + "path": "BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py" + } + } + ], + "SECONDARY_NAMENODE": [ + { + "name": "secondary_namenode_process", + "label": "Secondary NameNode Process", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "{{hdfs-site/dfs.namenode.secondary.http-address}}", + "default_port": 50071, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to 
{1}:{2}" + } + } + } + } + ], + "JOURNALNODE": [ + { + "name": "journalnode_process", + "label": "JournalNode Process", + "interval": 1, + "scope": "HOST", + "enabled": true, + "source": { + "type": "PORT", + "uri": "{{hdfs-site/dfs.journalnode.http-address}}", + "default_port": 8480, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + } + ], + "DATANODE": [ + { + "name": "datanode_process", + "label": "DataNode Process", + "interval": 1, + "scope": "HOST", + "enabled": true, + "source": { + "type": "PORT", + "uri": "{{hdfs-site/dfs.datanode.address}}", + "default_port": 50010, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + }, + { + "name": "datanode_webui", + "label": "DataNode Web UI", + "interval": 1, + "scope": "HOST", + "enabled": true, + "source": { + "type": "WEB", + "uri": { + "http": "{{hdfs-site/dfs.datanode.http.address}}", + "https": "{{hdfs-site/dfs.datanode.https.address}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "warning":{ + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "critical": { + "text": "Connection failed to {1}" + } + } + } + }, + { + "name": "datanode_storage", + "label": "DataNode Storage", + "interval": 2, + "scope": "HOST", + "enabled": true, + "source": { + "type": "METRIC", + "uri": { + "http": "{{hdfs-site/dfs.datanode.http.address}}", + "https": "{{hdfs-site/dfs.datanode.https.address}}", + "https_property": "{{hdfs-site/dfs.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]" + }, + "warning": { + "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]", + 
"value": 80 + }, + "critical": { + "text": "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]", + "value": 90 + } + }, + "jmx": { + "property_list": [ + "Hadoop:service=DataNode,name=FSDatasetState-*/Remaining", + "Hadoop:service=DataNode,name=FSDatasetState-*/Capacity" + ], + "value": "({1} - {0})/{1} * 100" + } + } + } + ], + "ZKFC": [ + { + "name": "hdfs_zookeeper_failover_controller_process", + "label": "ZooKeeper Failover Controller Process", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "{{core-site/ha.zookeeper.quorum}}", + "default_port": 2181, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} on host {1}:{2}" + } + } + } + } + ] + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py new file mode 100644 index 0000000..410608f --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +import time +import urllib2 +import json + +LABEL = 'Last Checkpoint: [{h} hours, {m} minutes, {tx} transactions]' + +NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}' +NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}' +NN_HTTP_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}' +NN_CHECKPOINT_TX_KEY = '{{hdfs-site/dfs.namenode.checkpoint.txns}}' +NN_CHECKPOINT_PERIOD_KEY = '{{hdfs-site/dfs.namenode.checkpoint.period}}' + +PERCENT_WARNING = 200 +PERCENT_CRITICAL = 200 + +CHECKPOINT_TX_DEFAULT = 1000000 +CHECKPOINT_PERIOD_DEFAULT = 21600 + +def get_tokens(): + """ + Returns a tuple of tokens in the format {{site/property}} that will be used + to build the dictionary passed into execute + """ + return (NN_HTTP_ADDRESS_KEY, NN_HTTPS_ADDRESS_KEY, NN_HTTP_POLICY_KEY, + NN_CHECKPOINT_TX_KEY, NN_CHECKPOINT_PERIOD_KEY) + + +def execute(parameters=None, host_name=None): + """ + Returns a tuple containing the result code and a pre-formatted result label + + Keyword arguments: + parameters (dictionary): a mapping of parameter key to value + host_name (string): the name of this host where the alert is running + """ + + if parameters is None: + return (('UNKNOWN', ['There were no parameters supplied to the script.'])) + + uri = None + scheme = 'http' + http_uri = None + https_uri = None + http_policy = 'HTTP_ONLY' + percent_warning = PERCENT_WARNING + percent_critical = PERCENT_CRITICAL + checkpoint_tx = CHECKPOINT_TX_DEFAULT + checkpoint_period = CHECKPOINT_PERIOD_DEFAULT + + if NN_HTTP_ADDRESS_KEY in parameters: + http_uri = 
parameters[NN_HTTP_ADDRESS_KEY] + + if NN_HTTPS_ADDRESS_KEY in parameters: + https_uri = parameters[NN_HTTPS_ADDRESS_KEY] + + if NN_HTTP_POLICY_KEY in parameters: + http_policy = parameters[NN_HTTP_POLICY_KEY] + + if NN_CHECKPOINT_TX_KEY in parameters: + checkpoint_tx = parameters[NN_CHECKPOINT_TX_KEY] + + if NN_CHECKPOINT_PERIOD_KEY in parameters: + checkpoint_period = parameters[NN_CHECKPOINT_PERIOD_KEY] + + # determine the right URI and whether to use SSL + uri = http_uri + if http_policy == 'HTTPS_ONLY': + scheme = 'https' + + if https_uri is not None: + uri = https_uri + + current_time = int(round(time.time() * 1000)) + + last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme,uri) + journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme,uri) + + # start out assuming an OK status + label = None + result_code = "OK" + + try: + last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,"LastCheckpointTime")) + journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,"JournalTransactionInfo") + journal_transaction_info_dict = json.loads(journal_transaction_info) + + last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId']) + most_recent_tx = int(journal_transaction_info_dict['MostRecentCheckpointTxId']) + transaction_difference = last_tx - most_recent_tx + + delta = (current_time - last_checkpoint_time)/1000 + + label = LABEL.format(h=get_time(delta)['h'], m=get_time(delta)['m'], tx=transaction_difference) + + if (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period)*100 >= int(percent_critical)): + result_code = 'CRITICAL' + elif (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period)*100 >= int(percent_warning)): + result_code = 'WARNING' + + except Exception, e: + label = str(e) + result_code = 'UNKNOWN' + + return ((result_code, [label])) + +def 
get_time(delta): + h = int(delta/3600) + m = int((delta % 3600)/60) + return {'h':h, 'm':m} + + +def get_value_from_jmx(qry, property): + response = urllib2.urlopen(qry) + data=response.read() + data_dict = json.loads(data) + return data_dict["beans"][0][property] http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py new file mode 100644 index 0000000..fc1541d --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
RESULT_STATE_OK = 'OK'
RESULT_STATE_CRITICAL = 'CRITICAL'
RESULT_STATE_UNKNOWN = 'UNKNOWN'
RESULT_STATE_SKIPPED = 'SKIPPED'

HDFS_NN_STATE_ACTIVE = 'active'
HDFS_NN_STATE_STANDBY = 'standby'

HDFS_SITE_KEY = '{{hdfs-site}}'
NAMESERVICE_KEY = '{{hdfs-site/dfs.nameservices}}'
NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}'

def get_tokens():
  """
  Returns a tuple of tokens in the format {{site/property}} that will be used
  to build the dictionary passed into execute
  """
  return (HDFS_SITE_KEY, NAMESERVICE_KEY, NN_HTTP_ADDRESS_KEY,
    NN_HTTPS_ADDRESS_KEY, DFS_POLICY_KEY)


def _is_same_host(address, host_name):
  """
  Returns True when the host part of a 'host:port' address names host_name,
  either exactly or as the fully qualified form of a short host name.
  A plain prefix test is not used because 'host1' is a prefix of 'host10'.
  """
  host = address.rsplit(':', 1)[0]
  return host == host_name or host.startswith(host_name + '.')


def execute(parameters=None, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  The topology is healthy when exactly one NameNode reports 'active' and
  exactly one reports 'standby'. In a healthy topology only the active
  NameNode reports (OK); in an unhealthy topology only the first listed
  NameNode reports (CRITICAL). Every other host returns SKIPPED.

  Keyword arguments:
  parameters (dictionary): a mapping of parameter key to value
  host_name (string): the name of this host where the alert is running;
    when omitted, the local fully qualified domain name is used
  """
  if parameters is None:
    return (RESULT_STATE_UNKNOWN, ['There were no parameters supplied to the script.'])

  # if not in HA mode, then SKIP
  if not NAMESERVICE_KEY in parameters:
    return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])

  # hdfs-site is required
  if not HDFS_SITE_KEY in parameters:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  # the host comparisons below need a real name; None used to raise TypeError
  if host_name is None:
    import socket
    host_name = socket.getfqdn()

  # determine whether or not SSL is enabled
  is_ssl_enabled = False
  if DFS_POLICY_KEY in parameters:
    dfs_policy = parameters[DFS_POLICY_KEY]
    if dfs_policy == "HTTPS_ONLY":
      is_ssl_enabled = True

  name_service = parameters[NAMESERVICE_KEY]
  hdfs_site = parameters[HDFS_SITE_KEY]

  # look for dfs.ha.namenodes.foo
  nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
  if not nn_unique_ids_key in hdfs_site:
    return (RESULT_STATE_UNKNOWN, ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)])

  namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
  jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"

  if is_ssl_enabled:
    namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
    jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"

  active_namenodes = []
  standby_namenodes = []
  unknown_namenodes = []

  # now we have something like 'nn1,nn2,nn3,nn4'
  # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
  # ie dfs.namenode.http-address.hacluster.nn1
  # surrounding whitespace is trimmed so that 'nn1, nn2' still resolves nn2
  nn_unique_ids = [nn_id.strip() for nn_id in hdfs_site[nn_unique_ids_key].split(',')]
  for nn_unique_id in nn_unique_ids:
    key = namenode_http_fragment.format(name_service,nn_unique_id)

    if key in hdfs_site:
      # use str() to ensure that unicode strings do not have the u' in them
      value = str(hdfs_site[key])

      try:
        jmx_uri = jmx_uri_fragment.format(value)
        state = get_value_from_jmx(jmx_uri,'State')

        if state == HDFS_NN_STATE_ACTIVE:
          active_namenodes.append(value)
        elif state == HDFS_NN_STATE_STANDBY:
          standby_namenodes.append(value)
        else:
          unknown_namenodes.append(value)
      except Exception:
        # an unreachable or malformed endpoint is an unknown NameNode
        unknown_namenodes.append(value)

  # now that the request is done, determine if this host is the host that
  # should report the status of the HA topology
  is_active_namenode = any(_is_same_host(active_namenode, host_name)
    for active_namenode in active_namenodes)

  # there's only one scenario here; there is exactly 1 active and 1 standby
  is_topology_healthy = len(active_namenodes) == 1 and len(standby_namenodes) == 1

  result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(str(active_namenodes),
    str(standby_namenodes), str(unknown_namenodes))

  # Healthy Topology:
  #   - Active NN reports the alert, standby does not
  #
  # Unhealthy Topology:
  #   - Report the alert if this is the first named host
  #   - Every other host skips, even when the first named host might itself
  #     be among the unknown NameNodes
  if is_topology_healthy:
    if is_active_namenode is True:
      return (RESULT_STATE_OK, [result_label])
    else:
      return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
  else:
    # dfs.namenode.rpc-address.service.alias is guaranteed in HA mode
    first_listed_host_key = 'dfs.namenode.rpc-address.{0}.{1}'.format(
      name_service, nn_unique_ids[0])

    first_listed_host = ''
    if first_listed_host_key in hdfs_site:
      first_listed_host = hdfs_site[first_listed_host_key]

    if _is_same_host(first_listed_host, host_name):
      return (RESULT_STATE_CRITICAL, [result_label])
    else:
      # not the first listed host, but the first host might be in the unknown
      return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])


def get_value_from_jmx(qry, property):
  """
  Fetches a JMX query URL and returns one property of the first bean found.
  Errors from the request, the JSON parsing or a missing bean/property are
  propagated to the caller.
  """
  response = urllib2.urlopen(qry)
  try:
    data = response.read()
  finally:
    # always release the HTTP connection, even when the read fails
    response.close()

  data_dict = json.loads(data)
  return data_dict["beans"][0][property]
{ + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + } + ], + "HIVE_SERVER": [ + { + "name": "hive_server_process", + "label": "HiveServer2 Process", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "SCRIPT", + "path": "BIGTOP/0.8/services/HIVE/package/files/alert_hive_thrift_port.py" + } + } + ] + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HIVE/package/files/alert_hive_thrift_port.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HIVE/package/files/alert_hive_thrift_port.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HIVE/package/files/alert_hive_thrift_port.py new file mode 100644 index 0000000..bd3f276 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HIVE/package/files/alert_hive_thrift_port.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
OK_MESSAGE = "TCP OK - %.4f response on port %s"
CRITICAL_MESSAGE = "Connection failed on host {0}:{1}"

HIVE_SERVER_THRIFT_PORT_KEY = '{{hive-site/hive.server2.thrift.port}}'
SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'

PERCENT_WARNING = 200
PERCENT_CRITICAL = 200

THRIFT_PORT_DEFAULT = 10000

def get_tokens():
  """
  Returns a tuple of tokens in the format {{site/property}} that will be used
  to build the dictionary passed into execute
  """
  return (HIVE_SERVER_THRIFT_PORT_KEY,SECURITY_ENABLED_KEY)


def execute(parameters=None, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  The result is OK when the thrift port check succeeds, CRITICAL when it
  reports failure and UNKNOWN when the check (or the port parsing) raises.

  Keyword arguments:
  parameters (dictionary): a mapping of parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  if parameters is None:
    return (('UNKNOWN', ['There were no parameters supplied to the script.']))

  security_enabled = False
  if SECURITY_ENABLED_KEY in parameters:
    # the value may arrive as the text 'true'/'false'; bool('false') would be True
    security_enabled = str(parameters[SECURITY_ENABLED_KEY]).lower() == 'true'

  result_code = None
  label = None

  try:
    # parsed inside the try so that a malformed port is reported as UNKNOWN
    thrift_port = THRIFT_PORT_DEFAULT
    if HIVE_SERVER_THRIFT_PORT_KEY in parameters:
      thrift_port = int(parameters[HIVE_SERVER_THRIFT_PORT_KEY])

    if host_name is None:
      host_name = socket.getfqdn()

    start_time = time.time()
    is_thrift_port_ok = hive_check.check_thrift_port_sasl(host_name,
      thrift_port, security_enabled=security_enabled)

    if is_thrift_port_ok == True:
      result_code = 'OK'
      total_time = time.time() - start_time
      label = OK_MESSAGE % (total_time, thrift_port)
    else:
      result_code = 'CRITICAL'
      label = CRITICAL_MESSAGE.format(host_name,thrift_port)

  except Exception as e:
    label = str(e)
    result_code = 'UNKNOWN'

  return ((result_code, [label]))
http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/OOZIE/alerts.json ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/OOZIE/alerts.json b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/OOZIE/alerts.json new file mode 100644 index 0000000..478f887 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/OOZIE/alerts.json @@ -0,0 +1,40 @@ +{ + "OOZIE": { + "service": [], + "OOZIE_SERVER": [ + { + "name": "oozie_server_webui", + "label": "Oozie Server Web UI", + "interval": 1, + "scope": "ANY", + "source": { + "type": "WEB", + "uri": { + "http": "{{oozie-site/oozie.base.url}}/oozie" + }, + "reporting": { + "ok": { + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "warning":{ + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "critical": { + "text": "Connection failed to {1}" + } + } + } + }, + { + "name": "oozie_server_status", + "label": "Oozie Server Status", + "interval": 1, + "scope": "ANY", + "source": { + "type": "SCRIPT", + "path": "BIGTOP/0.8/services/OOZIE/package/files/alert_check_oozie_server.py" + } + } + ] + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/OOZIE/package/files/alert_check_oozie_server.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/OOZIE/package/files/alert_check_oozie_server.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/OOZIE/package/files/alert_check_oozie_server.py new file mode 100644 index 0000000..7bf1255 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/OOZIE/package/files/alert_check_oozie_server.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software 
RESULT_CODE_OK = 'OK'
RESULT_CODE_CRITICAL = 'CRITICAL'
RESULT_CODE_UNKNOWN = 'UNKNOWN'

OOZIE_URL_KEY = '{{oozie-site/oozie.base.url}}'

def get_tokens():
  """
  Returns a tuple of tokens in the format {{site/property}} that will be used
  to build the dictionary passed into execute
  """
  # the trailing comma is required; (OOZIE_URL_KEY) is just a string
  return (OOZIE_URL_KEY,)


def execute(parameters=None, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Runs 'oozie admin -oozie <url> -status'. The result is OK with the command
  output when it exits with 0, and CRITICAL with the error output (or the
  exception text when the command cannot be run) otherwise.

  Keyword arguments:
  parameters (dictionary): a mapping of parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  if parameters is None:
    return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.'])

  oozie_url = None
  if OOZIE_URL_KEY in parameters:
    oozie_url = parameters[OOZIE_URL_KEY]

  if oozie_url is None:
    return (RESULT_CODE_UNKNOWN, ['The Oozie URL is a required parameter.'])

  try:
    # oozie admin -oozie http://server:11000/oozie -status
    oozie_process = subprocess.Popen(['oozie', 'admin', '-oozie',
      oozie_url, '-status'], stderr=subprocess.PIPE, stdout=subprocess.PIPE)

    oozie_output, oozie_error = oozie_process.communicate()
    oozie_return_code = oozie_process.returncode

    if oozie_return_code == 0:
      # strip trailing newlines
      oozie_output = str(oozie_output).strip('\n')
      return (RESULT_CODE_OK, [oozie_output])
    else:
      oozie_error = str(oozie_error).strip('\n')
      return (RESULT_CODE_CRITICAL, [oozie_error])

  except (CalledProcessError, OSError) as error:
    # Popen raises OSError when the oozie client cannot be started
    return (RESULT_CODE_CRITICAL, [str(error)])
RESULT_CODE_OK = 'OK'
RESULT_CODE_CRITICAL = 'CRITICAL'
RESULT_CODE_UNKNOWN = 'UNKNOWN'

OK_MESSAGE = 'TCP OK - {0:.4f} response on port {1}'
CRITICAL_CONNECTION_MESSAGE = 'Connection failed on host {0}:{1}'
CRITICAL_TEMPLETON_STATUS_MESSAGE = 'WebHCat returned an unexpected status of "{0}"'
CRITICAL_TEMPLETON_UNKNOWN_JSON_MESSAGE = 'Unable to determine WebHCat health from unexpected JSON response'

TEMPLETON_PORT_KEY = '{{webhcat-site/templeton.port}}'
SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'

TEMPLETON_OK_RESPONSE = 'ok'
TEMPLETON_PORT_DEFAULT = 50111

def get_tokens():
  """
  Returns a tuple of tokens in the format {{site/property}} that will be used
  to build the dictionary passed into execute
  """
  return (TEMPLETON_PORT_KEY,SECURITY_ENABLED_KEY)


def execute(parameters=None, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Queries /templeton/v1/status on this host. The result is OK when the
  returned JSON carries a status of 'ok' (case-insensitive) and CRITICAL when
  the connection fails, the JSON cannot be interpreted, or the status differs.

  Keyword arguments:
  parameters (dictionary): a mapping of parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  result_code = RESULT_CODE_UNKNOWN

  if parameters is None:
    return (result_code, ['There were no parameters supplied to the script.'])

  templeton_port = TEMPLETON_PORT_DEFAULT
  if TEMPLETON_PORT_KEY in parameters:
    templeton_port = int(parameters[TEMPLETON_PORT_KEY])

  security_enabled = False
  if SECURITY_ENABLED_KEY in parameters:
    # str() keeps a real boolean value from failing on .lower()
    security_enabled = str(parameters[SECURITY_ENABLED_KEY]).lower() == 'true'

  # NOTE(review): https is selected from the cluster security flag, not from
  # an SSL setting - confirm this is intended
  scheme = 'http'
  if security_enabled is True:
    scheme = 'https'

  label = ''
  url_response = None
  templeton_status = ''
  total_time = 0

  try:
    # the alert will always run on the webhcat host
    if host_name is None:
      host_name = socket.getfqdn()

    query = "{0}://{1}:{2}/templeton/v1/status".format(scheme, host_name,
        templeton_port)

    # execute the query for the JSON that includes templeton status
    start_time = time.time()
    url_response = urllib2.urlopen(query)
    total_time = time.time() - start_time
  except Exception:
    label = CRITICAL_CONNECTION_MESSAGE.format(host_name,templeton_port)
    return (RESULT_CODE_CRITICAL, [label])

  # URL response received, parse it
  try:
    try:
      json_response = json.loads(url_response.read())
    finally:
      # release the HTTP connection whether or not the body was usable
      url_response.close()

    # str() so that a non-text status is reported instead of raising below
    templeton_status = str(json_response['status'])
  except Exception:
    return (RESULT_CODE_CRITICAL, [CRITICAL_TEMPLETON_UNKNOWN_JSON_MESSAGE])

  # proper JSON received, compare against known value
  if templeton_status.lower() == TEMPLETON_OK_RESPONSE:
    result_code = RESULT_CODE_OK
    label = OK_MESSAGE.format(total_time, templeton_port)
  else:
    result_code = RESULT_CODE_CRITICAL
    label = CRITICAL_TEMPLETON_STATUS_MESSAGE.format(templeton_status)

  return (result_code, [label])
b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/alerts.json @@ -0,0 +1,321 @@ +{ + "MAPREDUCE2": { + "service": [], + "HISTORYSERVER": [ + { + "name": "mapreduce_history_server_webui", + "label": "History Server Web UI", + "interval": 1, + "scope": "ANY", + "source": { + "type": "WEB", + "uri": { + "http": "{{mapred-site/mapreduce.jobhistory.webapp.address}}", + "https": "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}", + "https_property": "{{mapred-site/mapreduce.jobhistory.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "warning":{ + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "critical": { + "text": "Connection failed to {1}" + } + } + } + }, + { + "name": "mapreduce_history_server_cpu", + "label": "History Server CPU Utilization", + "interval": 5, + "scope": "ANY", + "enabled": true, + "source": { + "type": "METRIC", + "uri": { + "http": "{{mapred-site/mapreduce.jobhistory.webapp.address}}", + "https": "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}", + "https_property": "{{mapred-site/mapreduce.jobhistory.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "{1} CPU, load {0:.1%}" + }, + "warning": { + "text": "{1} CPU, load {0:.1%}", + "value": 200 + }, + "critical": { + "text": "{1} CPU, load {0:.1%}", + "value": 250 + } + }, + "jmx": { + "property_list": [ + "java.lang:type=OperatingSystem/SystemCpuLoad", + "java.lang:type=OperatingSystem/AvailableProcessors" + ], + "value": "{0} * 100" + } + } + }, + { + "name": "mapreduce_history_server_rpc_latency", + "label": "History Server RPC Latency", + "interval": 5, + "scope": "ANY", + "enabled": true, + "source": { + "type": "METRIC", + "uri": { + "http": "{{mapred-site/mapreduce.jobhistory.webapp.address}}", + "https": "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}", + "https_property": 
"{{mapred-site/mapreduce.jobhistory.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]" + }, + "warning": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]", + "value": 3000 + }, + "critical": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]", + "value": 5000 + } + }, + "jmx": { + "property_list": [ + "Hadoop:service=JobHistoryServer,name=RpcActivityForPort*/RpcQueueTimeAvgTime", + "Hadoop:service=JobHistoryServer,name=RpcActivityForPort*/RpcProcessingTimeAvgTime" + ], + "value": "{0}" + } + } + }, + { + "name": "mapreduce_history_server_process", + "label": "History Server Process", + "interval": 1, + "scope": "ANY", + "enabled": true, + "source": { + "type": "PORT", + "uri": "{{mapred-site/mapreduce.jobhistory.webapp.address}}", + "default_port": 19888, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + } + ] + }, + "YARN": { + "service": [ + { + "name": "yarn_nodemanager_webui_percent", + "label": "Percent NodeManagers Available", + "interval": 1, + "scope": "SERVICE", + "enabled": true, + "source": { + "type": "AGGREGATE", + "alert_name": "yarn_nodemanager_webui", + "reporting": { + "ok": { + "text": "affected: [{1}], total: [{0}]" + }, + "warning": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.1 + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.3 + } + } + } + } + ], + "NODEMANAGER": [ + { + "name": "yarn_nodemanager_webui", + "label": "NodeManager Web UI", + "interval": 1, + "scope": "HOST", + "source": { + "type": "WEB", + "uri": { + "http": "{{yarn-site/yarn.nodemanager.webapp.address}}", + "https": "{{yarn-site/yarn.nodemanager.webapp.https.address}}", + "https_property": "{{yarn-site/yarn.http.policy}}", + "https_property_value": "HTTPS_ONLY", + "default_port": 8042 + }, + 
"reporting": { + "ok": { + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "warning":{ + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "critical": { + "text": "Connection failed to {1}" + } + } + } + }, + { + "name": "yarn_nodemanager_health", + "label": "NodeManager Health", + "interval": 1, + "scope": "HOST", + "enabled": true, + "source": { + "type": "SCRIPT", + "path": "BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py" + } + } + ], + "RESOURCEMANAGER": [ + { + "name": "yarn_resourcemanager_webui", + "label": "ResourceManager Web UI", + "interval": 1, + "scope": "ANY", + "source": { + "type": "WEB", + "uri": { + "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}", + "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}", + "https_property": "{{yarn-site/yarn.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "warning":{ + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "critical": { + "text": "Connection failed to {1}" + } + } + } + }, + { + "name": "yarn_resourcemanager_cpu", + "label": "ResourceManager CPU Utilization", + "interval": 5, + "scope": "ANY", + "enabled": true, + "source": { + "type": "METRIC", + "uri": { + "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}", + "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}", + "https_property": "{{yarn-site/yarn.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "{1} CPU, load {0:.1%}" + }, + "warning": { + "text": "{1} CPU, load {0:.1%}", + "value": 200 + }, + "critical": { + "text": "{1} CPU, load {0:.1%}", + "value": 250 + } + }, + "jmx": { + "property_list": [ + "java.lang:type=OperatingSystem/SystemCpuLoad", + "java.lang:type=OperatingSystem/AvailableProcessors" + ], + "value": "{0} * 100" + } + } + }, + { + "name": "yarn_resourcemanager_rpc_latency", + "label": "ResourceManager RPC 
Latency", + "interval": 5, + "scope": "ANY", + "enabled": true, + "source": { + "type": "METRIC", + "uri": { + "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}", + "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}", + "https_property": "{{yarn-site/yarn.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]" + }, + "warning": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]", + "value": 3000 + }, + "critical": { + "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]", + "value": 5000 + } + }, + "jmx": { + "property_list": [ + "Hadoop:service=ResourceManager,name=RpcActivityForPort*/RpcQueueTimeAvgTime", + "Hadoop:service=ResourceManager,name=RpcActivityForPort*/RpcProcessingTimeAvgTime" + ], + "value": "{0}" + } + } + } + ], + "APP_TIMELINE_SERVER": [ + { + "name": "yarn_app_timeline_server_webui", + "label": "App Timeline Web UI", + "interval": 1, + "scope": "ANY", + "source": { + "type": "WEB", + "uri": { + "http": "{{yarn-site/yarn.timeline-service.webapp.address}}", + "https": "{{yarn-site/yarn.timeline-service.webapp.https.address}}", + "https_property": "{{yarn-site/yarn.http.policy}}", + "https_property_value": "HTTPS_ONLY" + }, + "reporting": { + "ok": { + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "warning":{ + "text": "HTTP {0} response in {2:.4f} seconds" + }, + "critical": { + "text": "Connection failed to {1}" + } + } + } + } + ] + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py 
b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py new file mode 100644 index 0000000..b1de951 --- /dev/null +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python + +""" +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
RESULT_CODE_OK = 'OK'
RESULT_CODE_CRITICAL = 'CRITICAL'
RESULT_CODE_UNKNOWN = 'UNKNOWN'

NODEMANAGER_HTTP_ADDRESS_KEY = '{{yarn-site/yarn.nodemanager.webapp.address}}'
NODEMANAGER_HTTPS_ADDRESS_KEY = '{{yarn-site/yarn.nodemanager.webapp.https.address}}'
YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'

OK_MESSAGE = 'NodeManager Healthy'
CRITICAL_CONNECTION_MESSAGE = 'Connection failed to {0}'
CRITICAL_NODEMANAGER_STATUS_MESSAGE = 'NodeManager returned an unexpected status of "{0}"'
CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager health from unexpected JSON response'

NODEMANAGER_DEFAULT_PORT = 8042

def get_tokens():
  """
  Returns a tuple of tokens in the format {{site/property}} that will be used
  to build the dictionary passed into execute
  """
  return (NODEMANAGER_HTTP_ADDRESS_KEY,NODEMANAGER_HTTPS_ADDRESS_KEY,
  YARN_HTTP_POLICY_KEY)


def execute(parameters=None, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Queries /ws/v1/node/info on the NodeManager web address. The result is OK
  when nodeInfo/nodeHealthy is true and CRITICAL when the connection fails,
  the JSON cannot be interpreted, or the node reports itself unhealthy.

  Keyword arguments:
  parameters (dictionary): a mapping of parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  result_code = RESULT_CODE_UNKNOWN

  if parameters is None:
    return (result_code, ['There were no parameters supplied to the script.'])

  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  if NODEMANAGER_HTTP_ADDRESS_KEY in parameters:
    http_uri = parameters[NODEMANAGER_HTTP_ADDRESS_KEY]

  if NODEMANAGER_HTTPS_ADDRESS_KEY in parameters:
    https_uri = parameters[NODEMANAGER_HTTPS_ADDRESS_KEY]

  if YARN_HTTP_POLICY_KEY in parameters:
    http_policy = parameters[YARN_HTTP_POLICY_KEY]

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  label = ''
  url_response = None
  node_healthy = 'false'

  # some yarn-site structures don't have the web ui address
  if uri is None:
    if host_name is None:
      host_name = socket.getfqdn()

    uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

  try:
    query = "{0}://{1}/ws/v1/node/info".format(scheme,uri)

    # execute the query for the JSON that includes the node health
    url_response = urllib2.urlopen(query)
  except Exception:
    label = CRITICAL_CONNECTION_MESSAGE.format(uri)
    return (RESULT_CODE_CRITICAL, [label])

  # URL response received, parse it
  try:
    try:
      json_response = json.loads(url_response.read())
    finally:
      # release the HTTP connection whether or not the body was usable
      url_response.close()

    # convert boolean to string
    node_healthy = str(json_response['nodeInfo']['nodeHealthy'])
  except Exception:
    # report the dedicated message instead of the raw query URL
    return (RESULT_CODE_CRITICAL, [CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE])

  # proper JSON received, compare against known value
  if node_healthy.lower() == 'true':
    result_code = RESULT_CODE_OK
    label = OK_MESSAGE
  else:
    result_code = RESULT_CODE_CRITICAL
    label = CRITICAL_NODEMANAGER_STATUS_MESSAGE.format(node_healthy)

  return (result_code, [label])
"text": "affected: [{1}], total: [{0}]", + "value": 0.35 + }, + "critical": { + "text": "affected: [{1}], total: [{0}]", + "value": 0.70 + } + } + } + } + ], + "ZOOKEEPER_SERVER": [ + { + "name": "zookeeper_server_process", + "label": "ZooKeeper Server Process", + "interval": 1, + "scope": "ANY", + "source": { + "type": "PORT", + "uri": "{{zookeeper-env/clientPort}}", + "default_port": 2181, + "reporting": { + "ok": { + "text": "TCP OK - {0:.4f} response on port {1}" + }, + "critical": { + "text": "Connection failed: {0} to {1}:{2}" + } + } + } + } + ] + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/test/java/org/apache/ambari/server/controller/internal/AlertDefinitionResourceProviderTest.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/java/org/apache/ambari/server/controller/internal/AlertDefinitionResourceProviderTest.java b/ambari-server/src/test/java/org/apache/ambari/server/controller/internal/AlertDefinitionResourceProviderTest.java index d4f5fb4..b7e8ced 100644 --- a/ambari-server/src/test/java/org/apache/ambari/server/controller/internal/AlertDefinitionResourceProviderTest.java +++ b/ambari-server/src/test/java/org/apache/ambari/server/controller/internal/AlertDefinitionResourceProviderTest.java @@ -161,6 +161,7 @@ public class AlertDefinitionResourceProviderTest { AlertDefinitionResourceProvider.ALERT_DEF_ID, AlertDefinitionResourceProvider.ALERT_DEF_NAME, AlertDefinitionResourceProvider.ALERT_DEF_LABEL, + AlertDefinitionResourceProvider.ALERT_DEF_IGNORE_HOST, AlertDefinitionResourceProvider.ALERT_DEF_SOURCE, AlertDefinitionResourceProvider.ALERT_DEF_SOURCE_TYPE); @@ -201,6 +202,10 @@ public class AlertDefinitionResourceProviderTest { Assert.assertEquals("Mock Label", r.getPropertyValue(AlertDefinitionResourceProvider.ALERT_DEF_LABEL)); + Assert.assertEquals( + Boolean.FALSE, + 
r.getPropertyValue(AlertDefinitionResourceProvider.ALERT_DEF_IGNORE_HOST)); + Assert.assertNotNull(r.getPropertyValue("AlertDefinition/source/type")); } http://git-wip-us.apache.org/repos/asf/ambari/blob/d348f5db/ambari-server/src/test/java/org/apache/ambari/server/controller/internal/AlertGroupResourceProviderTest.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/java/org/apache/ambari/server/controller/internal/AlertGroupResourceProviderTest.java b/ambari-server/src/test/java/org/apache/ambari/server/controller/internal/AlertGroupResourceProviderTest.java index 9aad9a1..3d9f331 100644 --- a/ambari-server/src/test/java/org/apache/ambari/server/controller/internal/AlertGroupResourceProviderTest.java +++ b/ambari-server/src/test/java/org/apache/ambari/server/controller/internal/AlertGroupResourceProviderTest.java @@ -522,6 +522,37 @@ public class AlertGroupResourceProviderTest { } /** + * Tests that a default group cannot be deleted via the resource provider. + * + * @throws Exception + */ + @Test + public void testDeleteDefaultGroup() throws Exception { + AlertGroupEntity group = new AlertGroupEntity(); + group.setGroupId(ALERT_GROUP_ID); + group.setDefault(true); + group.setGroupName(ALERT_GROUP_NAME); + group.setAlertDefinitions(getMockDefinitions()); + group.setAlertTargets(getMockTargets()); + + resetToStrict(m_dao); + expect(m_dao.findGroupById(ALERT_GROUP_ID)).andReturn(group).anyTimes(); + + replay(m_dao); + + AlertGroupResourceProvider provider = createProvider(m_amc); + + Predicate predicate = new PredicateBuilder().property( + AlertGroupResourceProvider.ALERT_GROUP_CLUSTER_NAME).equals( + ALERT_GROUP_CLUSTER_NAME).and().property( + AlertGroupResourceProvider.ALERT_GROUP_ID).equals( + ALERT_GROUP_ID.toString()).toPredicate(); + + provider.deleteResources(predicate); + verify(m_dao); + } + + /** * @param amc * @return */