AMBARI-7284 - (Apache AMBARI-7284) Hadoop cluster alerts need updates for Hadoop 2.4 and 2.5
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/a14ca238 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/a14ca238 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/a14ca238 Branch: refs/heads/branch-alerts-dev Commit: a14ca23882a76c1daa27038c8e00a20d231ce55f Parents: 3f932cf Author: Artem Baranchuk <abaranc...@hortonworks.com> Authored: Mon Sep 15 19:50:29 2014 +0300 Committer: Artem Baranchuk <abaranc...@hortonworks.com> Committed: Thu Sep 18 13:47:48 2014 +0300 ---------------------------------------------------------------------- .../services/NAGIOS/package/files/sys_logger.py | 30 +++++--- .../test/nagios/plugins/test_sys_logger.py | 77 ++++++++++++++++++-- 2 files changed, 91 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/a14ca238/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py index 8f0a415..e86a8fb 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py +++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py @@ -114,14 +114,24 @@ msg_ids = {'Host::Ping':'host_down', 'GANGLIA::Ganglia Monitor process for ResourceManager':'ganglia_monitor_process', 'GANGLIA::Ganglia Monitor process for HistoryServer':'ganglia_monitor_process', 'HBASEMASTER::HBase Master process':'hbase_master_process', + 'HBASE::Percent RegionServers live':'regionservers_down', 'REGIONSERVER::RegionServer process':'regionserver_process', 'NAGIOS::Nagios status log freshness':'nagios_process', 'FLUME::Flume Agent process':'flume_agent_process', 'OOZIE::Oozie Server status':'oozie_server_process', 'HIVE-METASTORE::Hive Metastore status':'hive_metastore_process', - 'WEBHCAT::WebHCat Server status':'webhcat_server_process', - 'RESOURCEMANAGER::ResourceManager process':'resourcemanager_process', - 'NODEMANAGER::NodeManager process':'nodemanager_process', + 'WEBHCAT::WebHCat Server status':'webhcat_down', + 'RESOURCEMANAGER::ResourceManager process':'resourcemanager_process_down', + 'RESOURCEMANAGER::ResourceManager RPC latency':'resourcemanager_rpc_latency', + 'RESOURCEMANAGER::ResourceManager CPU utilization':'resourcemanager_cpu_utilization', + 'RESOURCEMANAGER::ResourceManager Web UI':'recourcemanager_ui', + 'NODEMANAGER::NodeManager process':'nodemanager_process_down', + 'NODEMANAGER::NodeManager health':'nodemanager_health', + 'NODEMANAGER::Percent NodeManagers live':'nodemanagers_down', + 'APP_TIMELINE_SERVER::App Timeline Server process':'timelineserver_process', + 'JOBHISTORY::HistoryServer RPC latency':'historyserver_rpc_latency', + 'JOBHISTORY::HistoryServer CPU utilization':'historyserver_cpu_utilization', + 'JOBHISTORY::HistoryServer Web UI':'historyserver_ui', 'JOBHISTORY::HistoryServer process':'historyserver_process'} # Determine the severity of the TVI alert based on the Nagios alert state. @@ -142,13 +152,13 @@ def determine_severity(state, service): # Determine the msg id for the TVI alert from based on the service which generates the Nagios alert. # The msg id is used to correlate a log msg to a TVI rule. def determine_msg_id(service, severity): - if msg_ids.has_key(service): - msg_id = msg_ids[service] - if severity == 'OK': - msg_id = '{0}_ok'.format(msg_id) - - return msg_id - else: return 'HADOOP_UNKNOWN_MSG' + for k, v in msg_ids.iteritems(): + if(k in service): + msg_id = v + if severity == 'OK': + msg_id = '{0}_ok'.format(msg_id) + return msg_id + return 'HADOOP_UNKNOWN_MSG' # Determine the domain. Currently the domain is always 'Hadoop'. http://git-wip-us.apache.org/repos/asf/ambari/blob/a14ca238/contrib/addons/test/nagios/plugins/test_sys_logger.py ---------------------------------------------------------------------- diff --git a/contrib/addons/test/nagios/plugins/test_sys_logger.py b/contrib/addons/test/nagios/plugins/test_sys_logger.py index eb7a8fe..49c5de8 100644 --- a/contrib/addons/test/nagios/plugins/test_sys_logger.py +++ b/contrib/addons/test/nagios/plugins/test_sys_logger.py @@ -259,6 +259,13 @@ test('Hadoop_RegionServer_Down:OK', 'OK: Hadoop: regionservers_down_ok# SERVICE MSG', 'HARD', '1', 'OK', 'HBASE::Percent region servers down', 'SERVICE MSG') +test('HBASE_RegionServer_live', + 'Critical: Hadoop: regionservers_down# SERVICE MSG', + 'HARD', '1', 'CRITICAL', 'HBASE::Percent RegionServers live', 'SERVICE MSG') +test('HBASE_RegionServer_live:OK', + 'OK: Hadoop: regionservers_down_ok# SERVICE MSG', + 'HARD', '1', 'OK', 'HBASE::Percent RegionServers live', 'SERVICE MSG') + # Hadoop_Hive_Metastore_Process_Down test('Hadoop_Hive_Metastore_Process_Down', 'Critical: Hadoop: hive_metastore_process_down# SERVICE MSG', @@ -548,26 +555,48 @@ test('Hive_Metastore_status:OK', 'HARD', '1', 'OK', 'HIVE-METASTORE::Hive Metastore status', 'SERVICE MSG') test('WebHCat_Server_status', - 'Critical: Hadoop: webhcat_server_process# SERVICE MSG', + 'Critical: Hadoop: webhcat_down# SERVICE MSG', 'HARD', '1', 'CRITICAL', 'WEBHCAT::WebHCat Server status', 'SERVICE MSG') test('WebHCat_Server_status:OK', - 'OK: Hadoop: webhcat_server_process_ok# SERVICE MSG', + 'OK: Hadoop: webhcat_down_ok# SERVICE MSG', 'HARD', '1', 'OK', 'WEBHCAT::WebHCat Server status', 'SERVICE MSG') test('ResourceManager_process', - 'Critical: Hadoop: resourcemanager_process# SERVICE MSG', + 'Critical: Hadoop: resourcemanager_process_down# SERVICE MSG', 'HARD', '1', 'CRITICAL', 'RESOURCEMANAGER::ResourceManager process', 'SERVICE MSG') test('ResourceManager_process:OK', - 'OK: Hadoop: resourcemanager_process_ok# SERVICE MSG', + 'OK: Hadoop: resourcemanager_process_down_ok# SERVICE MSG', 'HARD', '1', 'OK', 'RESOURCEMANAGER::ResourceManager process', 'SERVICE MSG') +test('AppTimeline_process', + 'Critical: Hadoop: timelineserver_process# SERVICE MSG', + 'HARD', '1', 'CRITICAL', 'APP_TIMELINE_SERVER::App Timeline Server process', 'SERVICE MSG') +test('AppTimeline_process:OK', + 'OK: Hadoop: timelineserver_process_ok# SERVICE MSG', + 'HARD', '1', 'OK', 'APP_TIMELINE_SERVER::App Timeline Server process', 'SERVICE MSG') + test('NodeManager_process', - 'Critical: Hadoop: nodemanager_process# SERVICE MSG', + 'Critical: Hadoop: nodemanager_process_down# SERVICE MSG', 'HARD', '1', 'CRITICAL', 'NODEMANAGER::NodeManager process', 'SERVICE MSG') test('NodeManager_process:OK', - 'OK: Hadoop: nodemanager_process_ok# SERVICE MSG', + 'OK: Hadoop: nodemanager_process_down_ok# SERVICE MSG', 'HARD', '1', 'OK', 'NODEMANAGER::NodeManager process', 'SERVICE MSG') +test('NodeManager_health', + 'Critical: Hadoop: nodemanager_health# SERVICE MSG', + 'HARD', '1', 'CRITICAL', 'NODEMANAGER::NodeManager health', 'SERVICE MSG') +test('NodeManager_health:OK', + 'OK: Hadoop: nodemanager_health_ok# SERVICE MSG', + 'HARD', '1', 'OK', 'NODEMANAGER::NodeManager health', 'SERVICE MSG') + +test('NodeManager_live', + 'Critical: Hadoop: nodemanagers_down# SERVICE MSG', + 'HARD', '1', 'CRITICAL', 'NODEMANAGER::Percent NodeManagers live', 'SERVICE MSG') +test('NodeManager_live:OK', + 'OK: Hadoop: nodemanagers_down_ok# SERVICE MSG', + 'HARD', '1', 'OK', 'NODEMANAGER::Percent NodeManagers live', 'SERVICE MSG') + + test('HistoryServer_process', 'Critical: Hadoop: historyserver_process# SERVICE MSG', 'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer process', 'SERVICE MSG') @@ -575,5 +604,41 @@ test('HistoryServer_process:OK', 'OK: Hadoop: historyserver_process_ok# SERVICE MSG', 'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer process', 'SERVICE MSG') +test('HistoryServer_RPC_latency', + 'Critical: Hadoop: historyserver_rpc_latency# SERVICE MSG', + 'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer RPC latency', 'SERVICE MSG') +test('HistoryServer_RPC_latency:OK', + 'OK: Hadoop: historyserver_rpc_latency_ok# SERVICE MSG', + 'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer RPC latency', 'SERVICE MSG') + +test('HistoryServer_CPU_utilization', + 'Critical: Hadoop: historyserver_cpu_utilization# SERVICE MSG', + 'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer CPU utilization', 'SERVICE MSG') +test('HistoryServer_CPU_utilization:OK', + 'OK: Hadoop: historyserver_cpu_utilization_ok# SERVICE MSG', + 'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer CPU utilization', 'SERVICE MSG') + +test('HistoryServer_Web_UI', + 'Critical: Hadoop: historyserver_ui# SERVICE MSG', + 'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer Web UI', 'SERVICE MSG') +test('HistoryServer_Web_UI:OK', + 'OK: Hadoop: historyserver_ui_ok# SERVICE MSG', + 'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer Web UI', 'SERVICE MSG') + +test('ResourceManager_rpc_latency', + 'Critical: Hadoop: resourcemanager_rpc_latency# SERVICE MSG', + 'HARD', '1', 'CRITICAL', 'RESOURCEMANAGER::ResourceManager RPC latency', 'SERVICE MSG') +test('ResourceManager_rpc_latency:OK', + 'OK: Hadoop: resourcemanager_rpc_latency_ok# SERVICE MSG', + 'HARD', '1', 'OK', 'RESOURCEMANAGER::ResourceManager RPC latency', 'SERVICE MSG') + +test('ResourceManager_cpu_utilization', + 'Critical: Hadoop: resourcemanager_cpu_utilization# SERVICE MSG', + 'HARD', '1', 'CRITICAL', 'RESOURCEMANAGER::ResourceManager CPU utilization', 'SERVICE MSG') +test('ResourceManager_cpu_utilization:OK', + 'OK: Hadoop: resourcemanager_cpu_utilization_ok# SERVICE MSG', + 'HARD', '1', 'OK', 'RESOURCEMANAGER::ResourceManager CPU utilization', 'SERVICE MSG') + + summary()