Repository: ambari
Updated Branches:
  refs/heads/trunk 61c8f5f98 -> 3e873a950
AMBARI-7700 - Ambari alerts for Hive metastore, Oozie, and ZK not right (AMBARI-7700, 7701, 7703) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/3e873a95 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/3e873a95 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/3e873a95 Branch: refs/heads/trunk Commit: 3e873a95037adb2f49f83c74cc0a44fcd2fbac6d Parents: 61c8f5f Author: Artem Baranchuk <[email protected]> Authored: Tue Oct 14 16:20:56 2014 +0300 Committer: Artem Baranchuk <[email protected]> Committed: Tue Oct 14 19:14:09 2014 +0300 ---------------------------------------------------------------------- .../services/NAGIOS/package/files/sys_logger.py | 5 +- .../src/addOns/nagios/plugins/sys_logger.py | 81 +++++++++++--------- .../test/nagios/plugins/test_sys_logger.py | 20 +++-- 3 files changed, 63 insertions(+), 43 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/3e873a95/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py index e86a8fb..6683342 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py +++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py @@ -82,6 +82,7 @@ msg_ids = {'Host::Ping':'host_down', 'REGIONSERVER::RegionServer process down':'regionserver_process_down', 'HBASE::Percent RegionServers down':'regionservers_down', 'HIVE-METASTORE::Hive Metastore status check':'hive_metastore_process_down', + 'HIVE-METASTORE::Hive Metastore process':'hive_metastore_process_down', 
'ZOOKEEPER::Percent ZooKeeper Servers down':'zookeepers_down', 'ZOOKEEPER::ZooKeeper Server process down':'zookeeper_process_down', 'OOZIE::Oozie Server status check':'oozie_down', @@ -103,7 +104,7 @@ msg_ids = {'Host::Ping':'host_down', 'NAMENODE::NameNode process':'namenode_process', 'NAMENODE::Secondary NameNode process':'secondary_namenode_process', 'JOURNALNODE::JournalNode process':'journalnode_process', - 'ZOOKEEPER::ZooKeeper Server process':'zookeeper_server_process', + 'ZOOKEEPER::ZooKeeper Server process':'zookeeper_process_down', 'JOBTRACKER::JobTracker process':'jobtracker_process', 'TASKTRACKER::TaskTracker process':'tasktracker_process', 'GANGLIA::Ganglia Server process':'ganglia_server_process', @@ -118,7 +119,7 @@ msg_ids = {'Host::Ping':'host_down', 'REGIONSERVER::RegionServer process':'regionserver_process', 'NAGIOS::Nagios status log freshness':'nagios_process', 'FLUME::Flume Agent process':'flume_agent_process', - 'OOZIE::Oozie Server status':'oozie_server_process', + 'OOZIE::Oozie Server status':'oozie_down', 'HIVE-METASTORE::Hive Metastore status':'hive_metastore_process', 'WEBHCAT::WebHCat Server status':'webhcat_down', 'RESOURCEMANAGER::ResourceManager process':'resourcemanager_process_down', http://git-wip-us.apache.org/repos/asf/ambari/blob/3e873a95/contrib/addons/src/addOns/nagios/plugins/sys_logger.py ---------------------------------------------------------------------- diff --git a/contrib/addons/src/addOns/nagios/plugins/sys_logger.py b/contrib/addons/src/addOns/nagios/plugins/sys_logger.py index 8f0a415..2e353f4 100644 --- a/contrib/addons/src/addOns/nagios/plugins/sys_logger.py +++ b/contrib/addons/src/addOns/nagios/plugins/sys_logger.py @@ -82,6 +82,7 @@ msg_ids = {'Host::Ping':'host_down', 'REGIONSERVER::RegionServer process down':'regionserver_process_down', 'HBASE::Percent RegionServers down':'regionservers_down', 'HIVE-METASTORE::Hive Metastore status check':'hive_metastore_process_down', + 'HIVE-METASTORE::Hive Metastore 
process':'hive_metastore_process_down', 'ZOOKEEPER::Percent ZooKeeper Servers down':'zookeepers_down', 'ZOOKEEPER::ZooKeeper Server process down':'zookeeper_process_down', 'OOZIE::Oozie Server status check':'oozie_down', @@ -103,7 +104,7 @@ msg_ids = {'Host::Ping':'host_down', 'NAMENODE::NameNode process':'namenode_process', 'NAMENODE::Secondary NameNode process':'secondary_namenode_process', 'JOURNALNODE::JournalNode process':'journalnode_process', - 'ZOOKEEPER::ZooKeeper Server process':'zookeeper_server_process', + 'ZOOKEEPER::ZooKeeper Server process':'zookeeper_process_down', 'JOBTRACKER::JobTracker process':'jobtracker_process', 'TASKTRACKER::TaskTracker process':'tasktracker_process', 'GANGLIA::Ganglia Server process':'ganglia_server_process', @@ -114,73 +115,83 @@ msg_ids = {'Host::Ping':'host_down', 'GANGLIA::Ganglia Monitor process for ResourceManager':'ganglia_monitor_process', 'GANGLIA::Ganglia Monitor process for HistoryServer':'ganglia_monitor_process', 'HBASEMASTER::HBase Master process':'hbase_master_process', + 'HBASE::Percent RegionServers live':'regionservers_down', 'REGIONSERVER::RegionServer process':'regionserver_process', 'NAGIOS::Nagios status log freshness':'nagios_process', 'FLUME::Flume Agent process':'flume_agent_process', - 'OOZIE::Oozie Server status':'oozie_server_process', + 'OOZIE::Oozie Server status':'oozie_down', 'HIVE-METASTORE::Hive Metastore status':'hive_metastore_process', - 'WEBHCAT::WebHCat Server status':'webhcat_server_process', - 'RESOURCEMANAGER::ResourceManager process':'resourcemanager_process', - 'NODEMANAGER::NodeManager process':'nodemanager_process', + 'WEBHCAT::WebHCat Server status':'webhcat_down', + 'RESOURCEMANAGER::ResourceManager process':'resourcemanager_process_down', + 'RESOURCEMANAGER::ResourceManager RPC latency':'resourcemanager_rpc_latency', + 'RESOURCEMANAGER::ResourceManager CPU utilization':'resourcemanager_cpu_utilization', + 'RESOURCEMANAGER::ResourceManager Web UI':'recourcemanager_ui', + 
'NODEMANAGER::NodeManager process':'nodemanager_process_down', + 'NODEMANAGER::NodeManager health':'nodemanager_health', + 'NODEMANAGER::Percent NodeManagers live':'nodemanagers_down', + 'APP_TIMELINE_SERVER::App Timeline Server process':'timelineserver_process', + 'JOBHISTORY::HistoryServer RPC latency':'historyserver_rpc_latency', + 'JOBHISTORY::HistoryServer CPU utilization':'historyserver_cpu_utilization', + 'JOBHISTORY::HistoryServer Web UI':'historyserver_ui', 'JOBHISTORY::HistoryServer process':'historyserver_process'} # Determine the severity of the TVI alert based on the Nagios alert state. def determine_severity(state, service): - if severities.has_key(state): - severity = severities[state] - else: severity = 'Warning' + if severities.has_key(state): + severity = severities[state] + else: severity = 'Warning' - # For some alerts, warning should be converted to Degraded - if severity == 'Warning' and service in degraded_alert_services: - severity = 'Degraded' - elif severity != 'OK' and service in fatal_alert_services: - severity = 'Fatal' + # For some alerts, warning should be converted to Degraded + if severity == 'Warning' and service in degraded_alert_services: + severity = 'Degraded' + elif severity != 'OK' and service in fatal_alert_services: + severity = 'Fatal' - return severity + return severity # Determine the msg id for the TVI alert from based on the service which generates the Nagios alert. # The msg id is used to correlate a log msg to a TVI rule. def determine_msg_id(service, severity): - if msg_ids.has_key(service): - msg_id = msg_ids[service] - if severity == 'OK': - msg_id = '{0}_ok'.format(msg_id) - - return msg_id - else: return 'HADOOP_UNKNOWN_MSG' + for k, v in msg_ids.iteritems(): + if(k in service): + msg_id = v + if severity == 'OK': + msg_id = '{0}_ok'.format(msg_id) + return msg_id + return 'HADOOP_UNKNOWN_MSG' # Determine the domain. Currently the domain is always 'Hadoop'. 
def determine_domain(): - return 'Hadoop' + return 'Hadoop' # log the TVI msg to the syslog def log_tvi_msg(msg): - syslog.openlog('nagios', syslog.LOG_PID) - syslog.syslog(msg) + syslog.openlog('nagios', syslog.LOG_PID) + syslog.syslog(msg) # generate a tvi log msg from a Hadoop alert def generate_tvi_log_msg(alert_type, attempt, state, service, msg): - # Determine the TVI msg contents - severity = determine_severity(state, service) # The TVI alert severity. - domain = determine_domain() # The domain specified in the TVI alert. - msg_id = determine_msg_id(service, severity) # The msg_id used to correlate to a TVI rule. + # Determine the TVI msg contents + severity = determine_severity(state, service) # The TVI alert severity. + domain = determine_domain() # The domain specified in the TVI alert. + msg_id = determine_msg_id(service, severity) # The msg_id used to correlate to a TVI rule. - # Only log HARD alerts - if alert_type == 'HARD': - # Format and log msg - log_tvi_msg('{0}: {1}: {2}# {3}'.format(severity, domain, msg_id, msg)) + # Only log HARD alerts + if alert_type == 'HARD': + # Format and log msg + log_tvi_msg('{0}: {1}: {2}# {3}'.format(severity, domain, msg_id, msg)) # main method which is called when invoked on the command line def main(): - generate_tvi_log_msg(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5]) + generate_tvi_log_msg(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5]) # run the main method if __name__ == '__main__': - main() - sys.exit(0) \ No newline at end of file + main() + sys.exit(0) \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/3e873a95/contrib/addons/test/nagios/plugins/test_sys_logger.py ---------------------------------------------------------------------- diff --git a/contrib/addons/test/nagios/plugins/test_sys_logger.py b/contrib/addons/test/nagios/plugins/test_sys_logger.py index 49c5de8..08339e3 100644 --- a/contrib/addons/test/nagios/plugins/test_sys_logger.py 
+++ b/contrib/addons/test/nagios/plugins/test_sys_logger.py @@ -267,14 +267,22 @@ test('HBASE_RegionServer_live:OK', 'HARD', '1', 'OK', 'HBASE::Percent RegionServers live', 'SERVICE MSG') # Hadoop_Hive_Metastore_Process_Down -test('Hadoop_Hive_Metastore_Process_Down', +test('Hadoop_Hive_Metastore_Status_Check_Down', 'Critical: Hadoop: hive_metastore_process_down# SERVICE MSG', 'HARD', '1', 'CRITICAL', 'HIVE-METASTORE::HIVE-METASTORE status check', 'SERVICE MSG') -test('Hadoop_Hive_Metastore_Process_Down:OK', +test('Hadoop_Hive_Metastore_Status_Check_Down:OK', 'OK: Hadoop: hive_metastore_process_down_ok# SERVICE MSG', 'HARD', '1', 'OK', 'HIVE-METASTORE::HIVE-METASTORE status check', 'SERVICE MSG') +test('Hadoop_Hive_Metastore_Process_Down', + 'Critical: Hadoop: hive_metastore_process_down# SERVICE MSG', + 'HARD', '1', 'CRITICAL', 'HIVE-METASTORE::Hive Metastore process', 'SERVICE MSG') + +test('Hadoop_Hive_Metastore_Process_Down:OK', + 'OK: Hadoop: hive_metastore_process_down_ok# SERVICE MSG', + 'HARD', '1', 'OK', 'HIVE-METASTORE::Hive Metastore process', 'SERVICE MSG') + # Hadoop_Zookeeper_Down test('Hadoop_Zookeeper_Down', 'Critical: Hadoop: zookeepers_down# SERVICE MSG', @@ -448,10 +456,10 @@ test('JournalNode_process:OK', 'HARD', '1', 'OK', 'JOURNALNODE::JournalNode process', 'SERVICE MSG') test('ZooKeeper_Server_process', - 'Critical: Hadoop: zookeeper_server_process# SERVICE MSG', + 'Critical: Hadoop: zookeeper_process_down# SERVICE MSG', 'HARD', '1', 'CRITICAL', 'ZOOKEEPER::ZooKeeper Server process', 'SERVICE MSG') test('ZooKeeper_Server_process:OK', - 'OK: Hadoop: zookeeper_server_process_ok# SERVICE MSG', + 'OK: Hadoop: zookeeper_process_down_ok# SERVICE MSG', 'HARD', '1', 'OK', 'ZOOKEEPER::ZooKeeper Server process', 'SERVICE MSG') test('JobTracker_process', @@ -541,10 +549,10 @@ test('Flume_Agent_process:OK', 'HARD', '1', 'OK', 'FLUME::Flume Agent process', 'SERVICE MSG') test('Oozie_Server_status', - 'Critical: Hadoop: oozie_server_process# SERVICE MSG', 
+ 'Critical: Hadoop: oozie_down# SERVICE MSG', 'HARD', '1', 'CRITICAL', 'OOZIE::Oozie Server status', 'SERVICE MSG') test('Oozie_Server_status:OK', - 'OK: Hadoop: oozie_server_process_ok# SERVICE MSG', + 'OK: Hadoop: oozie_down_ok# SERVICE MSG', 'HARD', '1', 'OK', 'OOZIE::Oozie Server status', 'SERVICE MSG') test('Hive_Metastore_status',
