Updated Branches: refs/heads/trunk d44c1c2ba -> a2b675c6a
AMBARI-2920. Rename alert titles and descriptions. (yusaku) Project: http://git-wip-us.apache.org/repos/asf/incubator-ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-ambari/commit/a2b675c6 Tree: http://git-wip-us.apache.org/repos/asf/incubator-ambari/tree/a2b675c6 Diff: http://git-wip-us.apache.org/repos/asf/incubator-ambari/diff/a2b675c6 Branch: refs/heads/trunk Commit: a2b675c6ada585a830ad928f6aefe426acb9dc03 Parents: d44c1c2 Author: Yusaku Sako <[email protected]> Authored: Thu Aug 15 14:59:48 2013 -0700 Committer: Yusaku Sako <[email protected]> Committed: Thu Aug 15 15:22:02 2013 -0700 ---------------------------------------------------------------------- .../files/check_ambari_agent_status.sh | 4 +- .../files/check_hive_metastore_status.sh | 4 +- .../files/check_mapred_local_dir_used.sh | 4 +- .../hdp-nagios/files/check_name_dir_status.php | 6 +- .../files/check_nodemanager_health.sh | 4 +- .../hdp-nagios/files/check_oozie_status.sh | 4 +- .../hdp-nagios/files/check_templeton_status.sh | 4 +- .../modules/hdp-nagios/files/check_webui.sh | 10 +- .../templates/hadoop-services.cfg.erb | 96 ++++++++++---------- 9 files changed, 68 insertions(+), 68 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_ambari_agent_status.sh ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_ambari_agent_status.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_ambari_agent_status.sh index dd67496..a8b510a 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_ambari_agent_status.sh +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_ambari_agent_status.sh @@ -31,9 +31,9 @@ fi if [ $RES -eq "2" ] then - echo "OK: Ambari agent is running [PID:$AMBARI_AGENT_PID]"; + 
echo "OK: Ambari Agent is running [PID:$AMBARI_AGENT_PID]"; exit 0; else - echo "CRITICAL: Ambari agent is not running [$AMBARI_AGENT_PID_PATH not found]"; + echo "CRITICAL: Ambari Agent is not running [$AMBARI_AGENT_PID_PATH not found]"; exit 2; fi \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_hive_metastore_status.sh ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_hive_metastore_status.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_hive_metastore_status.sh index 0140958..640c077 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_hive_metastore_status.sh +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_hive_metastore_status.sh @@ -38,8 +38,8 @@ HCAT_URL=-Dhive.metastore.uris="thrift://$HOST:$PORT" export JAVA_HOME=$JAVA_HOME out=`hcat $HCAT_URL -e "show databases" 2>&1` if [[ "$?" 
-ne 0 ]]; then - echo "CRITICAL: Error accessing hive-metaserver status [$out]"; + echo "CRITICAL: Error accessing Hive Metastore status [$out]"; exit 2; fi -echo "OK: Hive metaserver status OK"; +echo "OK: Hive Metastore status OK"; exit 0; http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_mapred_local_dir_used.sh ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_mapred_local_dir_used.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_mapred_local_dir_used.sh index e91cb66..15c85eb 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_mapred_local_dir_used.sh +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_mapred_local_dir_used.sh @@ -26,9 +26,9 @@ for mapred_dir in $MAPRED_LOCAL_DIRS do percent=`df -hl $mapred_dir | awk '{percent=$5;} END{print percent}' | cut -d % -f 1` if [ $percent -ge $CRITICAL ]; then - echo "CRITICAL: Mapreduce local dir is full." + echo "CRITICAL: MapReduce local dir is full." exit 2 fi done -echo "OK: Mapreduce local dir space is available." +echo "OK: MapReduce local dir space is available." 
exit 0 http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_name_dir_status.php ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_name_dir_status.php b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_name_dir_status.php index 3f38c98..db2b491 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_name_dir_status.php +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_name_dir_status.php @@ -36,12 +36,12 @@ $json_array = json_decode($json_string, true); $object = $json_array['beans'][0]; if ($object['NameDirStatuses'] == "") { - echo "WARNING: Namenode directory status not available via http://".$host.":".$port."/jmx url" . "\n"; + echo "WARNING: NameNode directory status not available via http://".$host.":".$port."/jmx url" . "\n"; exit(1); } $NameDirStatuses = json_decode($object['NameDirStatuses'], true); $failed_dir_count = count($NameDirStatuses['failed']); - $out_msg = "CRITICAL: Offline Namenode directories: "; + $out_msg = "CRITICAL: Offline NameNode directories: "; if ($failed_dir_count > 0) { foreach ($NameDirStatuses['failed'] as $key => $value) { $out_msg = $out_msg . $key . ":" . $value . ", "; @@ -49,7 +49,7 @@ echo $out_msg . "\n"; exit (2); } - echo "OK: All Namenode directories are active" . "\n"; + echo "OK: All NameNode directories are active" . 
"\n"; exit(0); /* print usage */ http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh index 2a26f4e..82b8a3d 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh @@ -25,8 +25,8 @@ NODEMANAGER_URL="http://$HOST:$PORT/ws/v1/node/info" export PATH="/usr/bin:$PATH" RESPONSE=`curl -s $NODEMANAGER_URL` if [[ "$RESPONSE" == *'"nodeHealthy":true'* ]]; then - echo "OK: nodemanager healthy true"; + echo "OK: NodeManager healthy"; exit 0; fi -echo "CRITICAL: nodemanager healthy false"; +echo "CRITICAL: NodeManager unhealthy"; exit 2; http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_oozie_status.sh ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_oozie_status.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_oozie_status.sh index bfd9d75..820ee99 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_oozie_status.sh +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_oozie_status.sh @@ -38,8 +38,8 @@ OOZIE_URL="http://$HOST:$PORT/oozie" export JAVA_HOME=$JAVA_HOME out=`oozie admin -oozie ${OOZIE_URL} -status 2>&1` if [[ "$?" 
-ne 0 ]]; then - echo "CRITICAL: Error accessing oozie server status [$out]"; + echo "CRITICAL: Error accessing Oozie Server status [$out]"; exit 2; fi -echo "OK: Oozie server status [$out]"; +echo "OK: Oozie Server status [$out]"; exit 0; http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_templeton_status.sh ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_templeton_status.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_templeton_status.sh index 7190956..79424be 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_templeton_status.sh +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_templeton_status.sh @@ -37,8 +37,8 @@ fi regex="^.*\"status\":\"ok\".*<status_code:200>$" out=`curl --negotiate -u : -s -w '<status_code:%{http_code}>' http://$HOST:$PORT/templeton/$VERSION/status 2>&1` if [[ $out =~ $regex ]]; then - echo "OK: Templeton server status [$out]"; + echo "OK: WebHCat Server status [$out]"; exit 0; fi -echo "CRITICAL: Error accessing Templeton server, status [$out]"; +echo "CRITICAL: Error accessing WebHCat Server, status [$out]"; exit 2; http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh index 57381e4..b23045e 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh @@ -46,35 +46,35 @@ jobtracker) namenode) nnweburl="http://$host:$port" if [[ `checkurl "$nnweburl"` -ne 0 ]] ; then - echo "WARNING: NameNode web UI not accessible : 
$nnweburl"; + echo "WARNING: NameNode Web UI not accessible : $nnweburl"; exit 1; fi ;; jobhistory) jhweburl="http://$host:$port/jobhistoryhome.jsp" if [[ `checkurl "$jhweburl"` -ne 0 ]]; then - echo "WARNING: Jobhistory web UI not accessible : $jhweburl"; + echo "WARNING: HistoryServer Web UI not accessible : $jhweburl"; exit 1; fi ;; hbase) hbaseweburl="http://$host:$port/master-status" if [[ `checkurl "$hbaseweburl"` -ne 0 ]]; then - echo "WARNING: HBase Master web UI not accessible : $hbaseweburl"; + echo "WARNING: HBase Master Web UI not accessible : $hbaseweburl"; exit 1; fi ;; resourcemanager) rmweburl="http://$host:$port/cluster" if [[ `checkurl "$rmweburl"` -ne 0 ]]; then - echo "WARNING: ResourceManager web UI not accessible : $rmweburl"; + echo "WARNING: ResourceManager Web UI not accessible : $rmweburl"; exit 1; fi ;; historyserver2) hsweburl="http://$host:$port/jobhistory" if [[ `checkurl "$hsweburl"` -ne 0 ]]; then - echo "WARNING: HistoryServer2 web UI not accessible : $hsweburl"; + echo "WARNING: HistoryServer Web UI not accessible : $hsweburl"; exit 1; fi ;; http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/a2b675c6/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb index d35b982..3256eb4 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb @@ -33,7 +33,7 @@ define service { define service { hostgroup_name nagios-server use hadoop-service - service_description NAGIOS::Nagios status log staleness + service_description NAGIOS::Nagios status log freshness servicegroups NAGIOS check_command 
check_nagios!10!/var/nagios/status.dat!<%=scope.function_hdp_template_var("::hdp-nagios::server::config::nagios_lookup_daemon_str")%> normal_check_interval 5 @@ -45,9 +45,9 @@ define service { define service { hostgroup_name nagios-server use hadoop-service - service_description HDFS::Percent DataNodes storage full + service_description HDFS::Percent DataNodes with space available servicegroups HDFS - check_command check_aggregate!"DATANODE::DataNode storage full"!10%!30% + check_command check_aggregate!"DATANODE::DataNode space"!10%!30% normal_check_interval 2 retry_check_interval 1 max_check_attempts 1 @@ -56,9 +56,9 @@ define service { define service { hostgroup_name nagios-server use hadoop-service - service_description HDFS::Percent DataNodes down + service_description HDFS::Percent DataNodes live servicegroups HDFS - check_command check_aggregate!"DATANODE::DataNode process down"!10%!30% + check_command check_aggregate!"DATANODE::DataNode process"!10%!30% normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -69,7 +69,7 @@ define service { define service { hostgroup_name agent-servers use hadoop-service - service_description AMBARI::Check ambari-agent process + service_description AMBARI::Ambari Agent process servicegroups AMBARI check_command check_ambari_agent_status normal_check_interval 5 @@ -82,9 +82,9 @@ define service { define service { hostgroup_name nagios-server use hadoop-service - service_description ZOOKEEPER::Percent ZooKeeper Servers down + service_description ZOOKEEPER::Percent ZooKeeper Servers live servicegroups ZOOKEEPER - check_command check_aggregate!"ZOOKEEPER::ZooKeeper Server process down"!35%!70% + check_command check_aggregate!"ZOOKEEPER::ZooKeeper Server process"!35%!70% normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -96,9 +96,9 @@ define service { define service { hostgroup_name nagios-server use hadoop-service - service_description HBASE::Percent RegionServers down + 
service_description HBASE::Percent RegionServers live servicegroups HBASE - check_command check_aggregate!"REGIONSERVER::RegionServer process down"!10%!30% + check_command check_aggregate!"REGIONSERVER::RegionServer process"!10%!30% normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -113,7 +113,7 @@ define service { define service { hostgroup_name ganglia-server use hadoop-service - service_description GANGLIA::Ganglia [gmetad] process down + service_description GANGLIA::Ganglia Server process servicegroups GANGLIA check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_port")%>!-w 1 -c 1 normal_check_interval 0.25 @@ -124,7 +124,7 @@ define service { define service { hostgroup_name ganglia-server use hadoop-service - service_description GANGLIA::Ganglia Collector [gmond] process down alert for slaves + service_description GANGLIA::Ganglia Monitor process for Slaves servicegroups GANGLIA check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_slaves_port")%>!-w 1 -c 1 normal_check_interval 0.25 @@ -135,7 +135,7 @@ define service { define service { hostgroup_name ganglia-server use hadoop-service - service_description GANGLIA::Ganglia Collector [gmond] process down alert for NameNode + service_description GANGLIA::Ganglia Monitor process for NameNode servicegroups GANGLIA check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_namenode_port")%>!-w 1 -c 1 normal_check_interval 0.25 @@ -147,7 +147,7 @@ define service { define service { hostgroup_name ganglia-server use hadoop-service - service_description GANGLIA::Ganglia Collector [gmond] process down alert for JobTracker + service_description GANGLIA::Ganglia Monitor process for JobTracker servicegroups GANGLIA check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_jobtracker_port")%>!-w 1 -c 1 normal_check_interval 0.25 @@ -160,7 +160,7 @@ define service { define service { 
hostgroup_name ganglia-server use hadoop-service - service_description GANGLIA::Ganglia Collector [gmond] process down alert for HBase Master + service_description GANGLIA::Ganglia Monitor process for HBase Master servicegroups GANGLIA check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_hbase_port")%>!-w 1 -c 1 normal_check_interval 0.25 @@ -173,7 +173,7 @@ define service { define service { hostgroup_name ganglia-server use hadoop-service - service_description GANGLIA::Ganglia Collector [gmond] process down alert for ResourceManager + service_description GANGLIA::Ganglia Monitor process for ResourceManager servicegroups GANGLIA check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_rm_port")%>!-w 1 -c 1 normal_check_interval 0.25 @@ -186,7 +186,7 @@ define service { define service { hostgroup_name ganglia-server use hadoop-service - service_description GANGLIA::Ganglia Collector [gmond] process down alert for NodeManager + service_description GANGLIA::Ganglia Monitor process for NodeManager servicegroups GANGLIA check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_nm_port")%>!-w 1 -c 1 normal_check_interval 0.25 @@ -199,7 +199,7 @@ define service { define service { hostgroup_name ganglia-server use hadoop-service - service_description GANGLIA::Ganglia Collector [gmond] process down alert for History Server 2 + service_description GANGLIA::Ganglia Monitor process for HistoryServer servicegroups GANGLIA check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::ganglia_collector_hs_port")%>!-w 1 -c 1 normal_check_interval 0.25 @@ -215,7 +215,7 @@ define service { define service { hostgroup_name snamenode use hadoop-service - service_description NAMENODE::Secondary NameNode process down + service_description NAMENODE::Secondary NameNode process servicegroups HDFS check_command check_tcp!<%=scope.function_hdp_template_var("snamenode_port")%>!-w 1 -c 1 
normal_check_interval 0.5 @@ -228,7 +228,7 @@ define service { define service { hostgroup_name namenode use hadoop-service - service_description NAMENODE::NameNode Web UI down + service_description NAMENODE::NameNode Web UI servicegroups HDFS check_command check_webui!namenode!<%=scope.function_hdp_template_var("::hdp::namenode_port")%> normal_check_interval 1 @@ -262,7 +262,7 @@ define service { define service { hostgroup_name namenode use hadoop-service - service_description NAMENODE::NameNode process down + service_description NAMENODE::NameNode process servicegroups HDFS check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::namenode_metadata_port")%>!-w 1 -c 1 normal_check_interval 0.5 @@ -273,7 +273,7 @@ define service { define service { hostgroup_name namenode use hadoop-service - service_description HDFS::Corrupt/Missing blocks + service_description HDFS::Blocks health servicegroups HDFS check_command check_hdfs_blocks!<%=scope.function_hdp_template_var("::hdp::namenode_port")%>!0%!0% normal_check_interval 2 @@ -309,7 +309,7 @@ define service { define service { hostgroup_name jobtracker use hadoop-service - service_description JOBTRACKER::JobTracker Web UI down + service_description JOBTRACKER::JobTracker Web UI servicegroups MAPREDUCE check_command check_webui!jobtracker!<%=scope.function_hdp_template_var("::hdp::jtnode_port")%> normal_check_interval 1 @@ -320,7 +320,7 @@ define service { define service { hostgroup_name jobtracker use hadoop-service - service_description JOBTRACKER::JobHistory Web UI down + service_description JOBTRACKER::HistoryServer Web UI servicegroups MAPREDUCE check_command check_webui!jobhistory!<%=scope.function_hdp_template_var("::hdp::jobhistory_port")%> normal_check_interval 1 @@ -344,7 +344,7 @@ define service { hostgroup_name jobtracker use hadoop-service use hadoop-service - service_description JOBTRACKER::JobTracker process down + service_description JOBTRACKER::JobTracker process servicegroups MAPREDUCE 
check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::jtnode_port")%>!-w 1 -c 1 normal_check_interval 0.5 @@ -369,9 +369,9 @@ define service { define service { hostgroup_name nagios-server use hadoop-service - service_description MAPREDUCE::Percent TaskTrackers down + service_description MAPREDUCE::Percent TaskTrackers live servicegroups MAPREDUCE - check_command check_aggregate!"TASKTRACKER::TaskTracker process down"!10%!30% + check_command check_aggregate!"TASKTRACKER::TaskTracker process"!10%!30% normal_check_interval 0.5 retry_check_interval 0.25 max_check_attempts 3 @@ -381,7 +381,7 @@ define service { define service { hostgroup_name tasktracker-servers use hadoop-service - service_description TASKTRACKER::TaskTracker process down + service_description TASKTRACKER::TaskTracker process servicegroups MAPREDUCE check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::tasktracker_port")%>!-w 1 -c 1 normal_check_interval 1 @@ -393,7 +393,7 @@ define service { define service { hostgroup_name tasktracker-servers use hadoop-service - service_description TASKTRACKER::Mapreduce local dir used space + service_description TASKTRACKER::MapReduce local dir space servicegroups MAPREDUCE check_command check_mapred_local_dir_used_space!<%=scope.function_hdp_default("::hdp::mapred-site/mapred.local.dir")%>!85% normal_check_interval 0.5 @@ -409,7 +409,7 @@ define service { define service { hostgroup_name resourcemanager use hadoop-service - service_description RESOURCEMANAGER::ResourceManager Web UI down + service_description RESOURCEMANAGER::ResourceManager Web UI servicegroups YARN check_command check_webui!resourcemanager!<%=scope.function_hdp_template_var("::hdp::rm_port")%> normal_check_interval 1 @@ -442,7 +442,7 @@ define service { define service { hostgroup_name resourcemanager use hadoop-service - service_description RESOURCEMANAGER::Percent NodeManager down + service_description RESOURCEMANAGER::Percent NodeManager live servicegroups YARN 
check_command check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!lost!10!30 normal_check_interval 1 @@ -453,7 +453,7 @@ define service { define service { hostgroup_name resourcemanager use hadoop-service - service_description RESOURCEMANAGER::Percent NodeManager unhealthy + service_description RESOURCEMANAGER::Percent NodeManager healthy servicegroups YARN check_command check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!unhealthy!10!30 normal_check_interval 1 @@ -467,7 +467,7 @@ define service { define service { hostgroup_name nodemanagers use hadoop-service - service_description NODEMANAGER::NodeManager process down + service_description NODEMANAGER::NodeManager process servicegroups YARN check_command check_tcp!<%=scope.function_hdp_template_var("nm_port")%>!-w 1 -c 1 normal_check_interval 1 @@ -478,7 +478,7 @@ define service { define service { hostgroup_name nodemanagers use hadoop-service - service_description NODEMANAGER::NodeManager unhealthy + service_description NODEMANAGER::NodeManager health servicegroups YARN check_command check_nodemanager_health!<%=scope.function_hdp_template_var("nm_port")%> normal_check_interval 1 @@ -492,7 +492,7 @@ define service { define service { hostgroup_name historyserver2 use hadoop-service - service_description JOBHISTORY::History Server 2 Web UI down + service_description JOBHISTORY::HistoryServer Web UI servicegroups MAPREDUCE check_command check_webui!historyserver2!<%=scope.function_hdp_template_var("::hdp::hs_port")%> normal_check_interval 1 @@ -503,7 +503,7 @@ define service { define service { hostgroup_name historyserver2 use hadoop-service - service_description JOBHISTORY::History Server 2 CPU utilization + service_description JOBHISTORY::HistoryServer CPU utilization servicegroups MAPREDUCE check_command check_cpu!200%!250% normal_check_interval 5 @@ -514,7 +514,7 @@ define service { define service { hostgroup_name historyserver2 
use hadoop-service - service_description JOBHISTORY::History Server 2 RPC latency + service_description JOBHISTORY::HistoryServer RPC latency servicegroups MAPREDUCE check_command check_rpcq_latency!JobHistoryServer!<%=scope.function_hdp_template_var("::hdp::hs_port")%>!3000!5000 normal_check_interval 5 @@ -529,7 +529,7 @@ define service { define service { hostgroup_name slaves use hadoop-service - service_description DATANODE::DataNode process down + service_description DATANODE::DataNode process servicegroups HDFS check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::datanode_port")%>!-w 1 -c 1 normal_check_interval 1 @@ -540,7 +540,7 @@ define service { define service { hostgroup_name slaves use hadoop-service - service_description DATANODE::DataNode storage full + service_description DATANODE::DataNode space servicegroups HDFS check_command check_datanode_storage!<%=scope.function_hdp_template_var("::hdp::datanode_port")%>!90%!90% normal_check_interval 5 @@ -555,7 +555,7 @@ define service { define service { hostgroup_name flume-servers use hadoop-service - service_description FLUME::Flume Agent process down + service_description FLUME::Flume Agent process servicegroups FLUME check_command check_tcp!<%=scope.function_hdp_template_var("flume_port")%>!-w 1 -c 1 normal_check_interval 1 @@ -570,7 +570,7 @@ define service { define service { hostgroup_name zookeeper-servers use hadoop-service - service_description ZOOKEEPER::ZooKeeper Server process down + service_description ZOOKEEPER::ZooKeeper Server process servicegroups ZOOKEEPER check_command check_tcp!<%=scope.function_hdp_template_var("::clientPort")%>!-w 1 -c 1 normal_check_interval 1 @@ -584,7 +584,7 @@ define service { define service { hostgroup_name region-servers use hadoop-service - service_description REGIONSERVER::RegionServer process down + service_description REGIONSERVER::RegionServer process servicegroups HBASE check_command 
check_tcp!<%=scope.function_hdp_template_var("::hdp::hbase_rs_port")%>!-w 1 -c 1 normal_check_interval 1 @@ -596,7 +596,7 @@ define service { define service { hostgroup_name hbasemasters use hadoop-service - service_description HBASEMASTER::HBase Master Web UI down + service_description HBASEMASTER::HBase Master Web UI servicegroups HBASE check_command check_webui!hbase!<%=scope.function_hdp_template_var("::hdp::hbase_master_port")%> normal_check_interval 1 @@ -618,7 +618,7 @@ define service { define service { hostgroup_name hbasemasters use hadoop-service - service_description HBASEMASTER::HBase Master process down + service_description HBASEMASTER::HBase Master process servicegroups HBASE check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::hbase_master_port")%>!-w 1 -c 1 normal_check_interval 0.5 @@ -632,7 +632,7 @@ define service { define service { hostgroup_name hiveserver use hadoop-service - service_description HIVE-METASTORE::Hive Metastore status check + service_description HIVE-METASTORE::Hive Metastore status servicegroups HIVE-METASTORE <%if scope.function_hdp_template_var("security_enabled")-%> check_command check_hive_metastore_status!<%=scope.function_hdp_template_var("::hive_metastore_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%> @@ -649,7 +649,7 @@ define service { define service { hostgroup_name oozie-server use hadoop-service - service_description OOZIE::Oozie Server status check + service_description OOZIE::Oozie Server status servicegroups OOZIE <%if scope.function_hdp_template_var("security_enabled")-%> check_command 
check_oozie_status!<%=scope.function_hdp_template_var("::hdp::oozie_server_port")%>!<%=scope.function_hdp_template_var("java64_home")%>!true!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%> @@ -666,7 +666,7 @@ define service { define service { hostgroup_name webhcat-server use hadoop-service - service_description WEBHCAT::WebHCat Server status check + service_description WEBHCAT::WebHCat Server status servicegroups WEBHCAT <%if scope.function_hdp_template_var("security_enabled")-%> check_command check_templeton_status!<%=scope.function_hdp_template_var("::hdp::templeton_port")%>!v1!true!<%=scope.function_hdp_template_var("nagios_keytab_path")%>!<%=scope.function_hdp_template_var("nagios_principal_name")%>!<%=scope.function_hdp_template_var("kinit_path_local")%> @@ -683,7 +683,7 @@ define service { define service { hostgroup_name hue-server use hadoop-service - service_description HUE::Hue Server status check + service_description HUE::Hue Server status servicegroups HUE check_command check_hue_status normal_check_interval 100
