Repository: ambari
Updated Branches:
  refs/heads/trunk 61c8f5f98 -> 3e873a950


AMBARI-7700 - Ambari alerts for Hive metastore, Oozie, and ZK not right 
(AMBARI-7700, 7701, 7703)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/3e873a95
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/3e873a95
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/3e873a95

Branch: refs/heads/trunk
Commit: 3e873a95037adb2f49f83c74cc0a44fcd2fbac6d
Parents: 61c8f5f
Author: Artem Baranchuk <[email protected]>
Authored: Tue Oct 14 16:20:56 2014 +0300
Committer: Artem Baranchuk <[email protected]>
Committed: Tue Oct 14 19:14:09 2014 +0300

----------------------------------------------------------------------
 .../services/NAGIOS/package/files/sys_logger.py |  5 +-
 .../src/addOns/nagios/plugins/sys_logger.py     | 81 +++++++++++---------
 .../test/nagios/plugins/test_sys_logger.py      | 20 +++--
 3 files changed, 63 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/3e873a95/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py
index e86a8fb..6683342 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/sys_logger.py
@@ -82,6 +82,7 @@ msg_ids = {'Host::Ping':'host_down',
           'REGIONSERVER::RegionServer process down':'regionserver_process_down',
           'HBASE::Percent RegionServers down':'regionservers_down',
           'HIVE-METASTORE::Hive Metastore status check':'hive_metastore_process_down',
+           'HIVE-METASTORE::Hive Metastore process':'hive_metastore_process_down',
            'ZOOKEEPER::Percent ZooKeeper Servers down':'zookeepers_down',
            'ZOOKEEPER::ZooKeeper Server process down':'zookeeper_process_down',
            'OOZIE::Oozie Server status check':'oozie_down',
@@ -103,7 +104,7 @@ msg_ids = {'Host::Ping':'host_down',
            'NAMENODE::NameNode process':'namenode_process',
            'NAMENODE::Secondary NameNode process':'secondary_namenode_process',
            'JOURNALNODE::JournalNode process':'journalnode_process',
-           'ZOOKEEPER::ZooKeeper Server process':'zookeeper_server_process',
+           'ZOOKEEPER::ZooKeeper Server process':'zookeeper_process_down',
            'JOBTRACKER::JobTracker process':'jobtracker_process',
            'TASKTRACKER::TaskTracker process':'tasktracker_process',
            'GANGLIA::Ganglia Server process':'ganglia_server_process',
@@ -118,7 +119,7 @@ msg_ids = {'Host::Ping':'host_down',
            'REGIONSERVER::RegionServer process':'regionserver_process',
            'NAGIOS::Nagios status log freshness':'nagios_process',
            'FLUME::Flume Agent process':'flume_agent_process',
-           'OOZIE::Oozie Server status':'oozie_server_process',
+           'OOZIE::Oozie Server status':'oozie_down',
            'HIVE-METASTORE::Hive Metastore status':'hive_metastore_process',
            'WEBHCAT::WebHCat Server status':'webhcat_down',
           'RESOURCEMANAGER::ResourceManager process':'resourcemanager_process_down',

http://git-wip-us.apache.org/repos/asf/ambari/blob/3e873a95/contrib/addons/src/addOns/nagios/plugins/sys_logger.py
----------------------------------------------------------------------
diff --git a/contrib/addons/src/addOns/nagios/plugins/sys_logger.py b/contrib/addons/src/addOns/nagios/plugins/sys_logger.py
index 8f0a415..2e353f4 100644
--- a/contrib/addons/src/addOns/nagios/plugins/sys_logger.py
+++ b/contrib/addons/src/addOns/nagios/plugins/sys_logger.py
@@ -82,6 +82,7 @@ msg_ids = {'Host::Ping':'host_down',
           'REGIONSERVER::RegionServer process down':'regionserver_process_down',
           'HBASE::Percent RegionServers down':'regionservers_down',
           'HIVE-METASTORE::Hive Metastore status check':'hive_metastore_process_down',
+           'HIVE-METASTORE::Hive Metastore process':'hive_metastore_process_down',
            'ZOOKEEPER::Percent ZooKeeper Servers down':'zookeepers_down',
            'ZOOKEEPER::ZooKeeper Server process down':'zookeeper_process_down',
            'OOZIE::Oozie Server status check':'oozie_down',
@@ -103,7 +104,7 @@ msg_ids = {'Host::Ping':'host_down',
            'NAMENODE::NameNode process':'namenode_process',
            'NAMENODE::Secondary NameNode process':'secondary_namenode_process',
            'JOURNALNODE::JournalNode process':'journalnode_process',
-           'ZOOKEEPER::ZooKeeper Server process':'zookeeper_server_process',
+           'ZOOKEEPER::ZooKeeper Server process':'zookeeper_process_down',
            'JOBTRACKER::JobTracker process':'jobtracker_process',
            'TASKTRACKER::TaskTracker process':'tasktracker_process',
            'GANGLIA::Ganglia Server process':'ganglia_server_process',
@@ -114,73 +115,83 @@ msg_ids = {'Host::Ping':'host_down',
           'GANGLIA::Ganglia Monitor process for ResourceManager':'ganglia_monitor_process',
           'GANGLIA::Ganglia Monitor process for HistoryServer':'ganglia_monitor_process',
            'HBASEMASTER::HBase Master process':'hbase_master_process',
+           'HBASE::Percent RegionServers live':'regionservers_down',
            'REGIONSERVER::RegionServer process':'regionserver_process',
            'NAGIOS::Nagios status log freshness':'nagios_process',
            'FLUME::Flume Agent process':'flume_agent_process',
-           'OOZIE::Oozie Server status':'oozie_server_process',
+           'OOZIE::Oozie Server status':'oozie_down',
            'HIVE-METASTORE::Hive Metastore status':'hive_metastore_process',
-           'WEBHCAT::WebHCat Server status':'webhcat_server_process',
-           'RESOURCEMANAGER::ResourceManager process':'resourcemanager_process',
-           'NODEMANAGER::NodeManager process':'nodemanager_process',
+           'WEBHCAT::WebHCat Server status':'webhcat_down',
+           'RESOURCEMANAGER::ResourceManager process':'resourcemanager_process_down',
+           'RESOURCEMANAGER::ResourceManager RPC latency':'resourcemanager_rpc_latency',
+           'RESOURCEMANAGER::ResourceManager CPU utilization':'resourcemanager_cpu_utilization',
+           'RESOURCEMANAGER::ResourceManager Web UI':'recourcemanager_ui',
+           'NODEMANAGER::NodeManager process':'nodemanager_process_down',
+           'NODEMANAGER::NodeManager health':'nodemanager_health',
+           'NODEMANAGER::Percent NodeManagers live':'nodemanagers_down',
+           'APP_TIMELINE_SERVER::App Timeline Server process':'timelineserver_process',
+           'JOBHISTORY::HistoryServer RPC latency':'historyserver_rpc_latency',
+           'JOBHISTORY::HistoryServer CPU utilization':'historyserver_cpu_utilization',
+           'JOBHISTORY::HistoryServer Web UI':'historyserver_ui',
            'JOBHISTORY::HistoryServer process':'historyserver_process'}
 
 # Determine the severity of the TVI alert based on the Nagios alert state.
 def determine_severity(state, service):
-    if severities.has_key(state):
-        severity = severities[state]
-    else: severity = 'Warning'
+  if severities.has_key(state):
+    severity = severities[state]
+  else: severity = 'Warning'
 
-    # For some alerts, warning should be converted to Degraded
-    if severity == 'Warning' and service in degraded_alert_services:
-        severity = 'Degraded'
-    elif severity != 'OK' and service in fatal_alert_services:
-        severity = 'Fatal'
+  # For some alerts, warning should be converted to Degraded
+  if severity == 'Warning' and service in degraded_alert_services:
+    severity = 'Degraded'
+  elif severity != 'OK' and service in fatal_alert_services:
+    severity = 'Fatal'
 
-    return severity
+  return severity
 
 
 # Determine the msg id for the TVI alert from based on the service which generates the Nagios alert.
 # The msg id is used to correlate a log msg to a TVI rule.
 def determine_msg_id(service, severity):
-    if msg_ids.has_key(service):
-        msg_id = msg_ids[service]
-        if severity == 'OK':
-            msg_id = '{0}_ok'.format(msg_id)
-
-        return msg_id
-    else: return 'HADOOP_UNKNOWN_MSG'
+  for k, v in msg_ids.iteritems():
+    if(k in service):
+      msg_id = v
+      if severity == 'OK':
+        msg_id = '{0}_ok'.format(msg_id)
+      return msg_id
+  return 'HADOOP_UNKNOWN_MSG'
 
 
 # Determine the domain.  Currently the domain is always 'Hadoop'.
 def determine_domain():
-    return 'Hadoop'
+  return 'Hadoop'
 
 
 # log the TVI msg to the syslog
 def log_tvi_msg(msg):
-    syslog.openlog('nagios', syslog.LOG_PID)
-    syslog.syslog(msg)
+  syslog.openlog('nagios', syslog.LOG_PID)
+  syslog.syslog(msg)
 
 
 # generate a tvi log msg from a Hadoop alert
 def generate_tvi_log_msg(alert_type, attempt, state, service, msg):
-    # Determine the TVI msg contents
-    severity = determine_severity(state, service)  # The TVI alert severity.
-    domain   = determine_domain()                  # The domain specified in the TVI alert.
-    msg_id   = determine_msg_id(service, severity) # The msg_id used to correlate to a TVI rule.
+  # Determine the TVI msg contents
+  severity = determine_severity(state, service)  # The TVI alert severity.
+  domain   = determine_domain()                  # The domain specified in the TVI alert.
+  msg_id   = determine_msg_id(service, severity) # The msg_id used to correlate to a TVI rule.
 
-    # Only log HARD alerts
-    if alert_type == 'HARD':
-        # Format and log msg
-        log_tvi_msg('{0}: {1}: {2}# {3}'.format(severity, domain, msg_id, msg))
+  # Only log HARD alerts
+  if alert_type == 'HARD':
+    # Format and log msg
+    log_tvi_msg('{0}: {1}: {2}# {3}'.format(severity, domain, msg_id, msg))
 
 
 # main method which is called when invoked on the command line
 def main():
-    generate_tvi_log_msg(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
+  generate_tvi_log_msg(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
 
 
 # run the main method
 if __name__ == '__main__':
-    main()
-    sys.exit(0)
\ No newline at end of file
+  main()
+  sys.exit(0)
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/ambari/blob/3e873a95/contrib/addons/test/nagios/plugins/test_sys_logger.py
----------------------------------------------------------------------
diff --git a/contrib/addons/test/nagios/plugins/test_sys_logger.py b/contrib/addons/test/nagios/plugins/test_sys_logger.py
index 49c5de8..08339e3 100644
--- a/contrib/addons/test/nagios/plugins/test_sys_logger.py
+++ b/contrib/addons/test/nagios/plugins/test_sys_logger.py
@@ -267,14 +267,22 @@ test('HBASE_RegionServer_live:OK',
      'HARD', '1', 'OK', 'HBASE::Percent RegionServers live', 'SERVICE MSG')
 
 # Hadoop_Hive_Metastore_Process_Down
-test('Hadoop_Hive_Metastore_Process_Down',
+test('Hadoop_Hive_Metastore_Status_Check_Down',
      'Critical: Hadoop: hive_metastore_process_down# SERVICE MSG',
      'HARD', '1', 'CRITICAL', 'HIVE-METASTORE::HIVE-METASTORE status check', 'SERVICE MSG')
 
-test('Hadoop_Hive_Metastore_Process_Down:OK',
+test('Hadoop_Hive_Metastore_Status_Check_Down:OK',
     'OK: Hadoop: hive_metastore_process_down_ok# SERVICE MSG',
     'HARD', '1', 'OK', 'HIVE-METASTORE::HIVE-METASTORE status check', 'SERVICE MSG')
 
+test('Hadoop_Hive_Metastore_Process_Down',
+     'Critical: Hadoop: hive_metastore_process_down# SERVICE MSG',
+     'HARD', '1', 'CRITICAL', 'HIVE-METASTORE::Hive Metastore process', 'SERVICE MSG')
+
+test('Hadoop_Hive_Metastore_Process_Down:OK',
+     'OK: Hadoop: hive_metastore_process_down_ok# SERVICE MSG',
+     'HARD', '1', 'OK', 'HIVE-METASTORE::Hive Metastore process', 'SERVICE MSG')
+
 # Hadoop_Zookeeper_Down
 test('Hadoop_Zookeeper_Down',
      'Critical: Hadoop: zookeepers_down# SERVICE MSG',
@@ -448,10 +456,10 @@ test('JournalNode_process:OK',
      'HARD', '1', 'OK', 'JOURNALNODE::JournalNode process', 'SERVICE MSG')
 
 test('ZooKeeper_Server_process',
-     'Critical: Hadoop: zookeeper_server_process# SERVICE MSG',
+     'Critical: Hadoop: zookeeper_process_down# SERVICE MSG',
      'HARD', '1', 'CRITICAL', 'ZOOKEEPER::ZooKeeper Server process', 'SERVICE MSG')
 test('ZooKeeper_Server_process:OK',
-     'OK: Hadoop: zookeeper_server_process_ok# SERVICE MSG',
+     'OK: Hadoop: zookeeper_process_down_ok# SERVICE MSG',
      'HARD', '1', 'OK', 'ZOOKEEPER::ZooKeeper Server process', 'SERVICE MSG')
 
 test('JobTracker_process',
@@ -541,10 +549,10 @@ test('Flume_Agent_process:OK',
      'HARD', '1', 'OK', 'FLUME::Flume Agent process', 'SERVICE MSG')
 
 test('Oozie_Server_status',
-     'Critical: Hadoop: oozie_server_process# SERVICE MSG',
+     'Critical: Hadoop: oozie_down# SERVICE MSG',
      'HARD', '1', 'CRITICAL', 'OOZIE::Oozie Server status', 'SERVICE MSG')
 test('Oozie_Server_status:OK',
-     'OK: Hadoop: oozie_server_process_ok# SERVICE MSG',
+     'OK: Hadoop: oozie_down_ok# SERVICE MSG',
      'HARD', '1', 'OK', 'OOZIE::Oozie Server status', 'SERVICE MSG')
 
 test('Hive_Metastore_status',

Reply via email to