Author: swagle
Date: Tue May 14 20:56:41 2013
New Revision: 1482589
URL: http://svn.apache.org/r1482589
Log:
AMBARI-2133. Add Nagios alerts for Hadoop 2.0 in Ambari. (swagle)
Modified:
incubator/ambari/trunk/CHANGES.txt
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_rpcq_latency.php
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/params.pp
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-servicegroups.cfg.erb
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp/manifests/init.pp
incubator/ambari/trunk/ambari-server/src/main/resources/stacks/HDP/2.0.1/services/GANGLIA/metainfo.xml
Modified: incubator/ambari/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/ambari/trunk/CHANGES.txt?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
--- incubator/ambari/trunk/CHANGES.txt (original)
+++ incubator/ambari/trunk/CHANGES.txt Tue May 14 20:56:41 2013
@@ -12,6 +12,8 @@ Trunk (unreleased changes):
NEW FEATURES
+ AMBARI-2133. Add Nagios alerts for Hadoop 2.0 in Ambari. (swagle)
+
AMBARI-2123. Allow the user to specify a non-root ssh user in Install Options.
(yusaku)
Modified:
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_rpcq_latency.php
URL:
http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_rpcq_latency.php?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
---
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_rpcq_latency.php
(original)
+++
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_rpcq_latency.php
Tue May 14 20:56:41 2013
@@ -21,7 +21,7 @@
* It checks the rpc wait time in the queue, RpcQueueTime_avg_time
* check_rpcq_latency -h hostaddress -p port -t ServiceName -w 1 -c 1
* Warning and Critical values are in seconds
- * Service Name = JobTracker, NameNode
+ * Service Name = JobTracker, NameNode, JobHistoryServer
*/
$options = getopt ("h:p:w:c:n:");
@@ -62,6 +62,6 @@
/* print usage */
function usage () {
- echo "Usage: $0 -h <host> -p port -n <JobTracker/NameNode> -w
<warn_in_sec> -c <crit_in_sec>\n";
+ echo "Usage: $0 -h <host> -p port -n
<JobTracker/NameNode/JobHistoryServer> -w <warn_in_sec> -c <crit_in_sec>\n";
}
?>
Modified:
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh
URL:
http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
---
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh
(original)
+++
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_webui.sh
Tue May 14 20:56:41 2013
@@ -64,7 +64,21 @@ hbase)
exit 1;
fi
;;
-*) echo "UNKNOWN: Invalid service name [$service], valid options
[jobtracker|jobhistory|hbase|namenode]"
+resorcemanager)
+ rmweburl="http://$host:$port/cluster"
+ if [[ `checkurl "$rmweburl"` -ne 0 ]]; then
+ echo "WARNING: ResourceManager web UI not accessible : $rmweburl";
+ exit 1;
+ fi
+ ;;
+historyserver2)
+ hsweburl="http://$host:$port/jobhistory"
+ if [[ `checkurl "$hsweburl"` -ne 0 ]]; then
+ echo "WARNING: HistoryServer2 web UI not accessible : $hsweburl";
+ exit 1;
+ fi
+ ;;
+*) echo "UNKNOWN: Invalid service name [$service], valid options
[jobtracker|jobhistory|hbase|namenode|resorcemanager|historyserver2]"
exit 3
;;
esac
Modified:
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/params.pp
URL:
http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/params.pp?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
---
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/params.pp
(original)
+++
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/params.pp
Tue May 14 20:56:41 2013
@@ -86,6 +86,9 @@ class hdp-nagios::params() inherits hdp:
region-servers => {host_member_info => 'hbase_rs_hosts'},
oozie-server => {host_member_info => 'oozie_server'},
webhcat-server => {host_member_info => 'webhcat_server_host'},
- hue-server => {host_member_info => 'hue_server_host'}
+ hue-server => {host_member_info => 'hue_server_host'},
+ resorcemanager => {host_member_info => 'rm_host'},
+ nodemanagers => {host_member_info => 'nm_hosts'},
+ historyserver2 => {host_member_info => 'hs_host'}
}
}
Modified:
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-servicegroups.cfg.erb
URL:
http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-servicegroups.cfg.erb?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
---
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-servicegroups.cfg.erb
(original)
+++
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-servicegroups.cfg.erb
Tue May 14 20:56:41 2013
@@ -7,6 +7,11 @@ define servicegroup {
alias MAPREDUCE Checks
}
define servicegroup {
+ servicegroup_name YARN
+ alias YARN Checks
+}
+
+define servicegroup {
servicegroup_name HBASE
alias HBASE Checks
}
Modified:
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
URL:
http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
---
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
(original)
+++
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb
Tue May 14 20:56:41 2013
@@ -175,6 +175,46 @@ define service {
max_check_attempts 4
}
<%end-%>
+
+<%if scope.function_hdp_nagios_members_exist('resorcemanager')-%>
+define service {
+ hostgroup_name resorcemanager
+ use hadoop-service
+ service_description GANGLIA::Ganglia Collector [gmond] process
down alert for Resource Manager
+ servicegroups GANGLIA
+ check_command
check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_rm_port")%>!-w
1 -c 1
+ normal_check_interval 0.25
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+<%end-%>
+
+<%if scope.function_hdp_nagios_members_exist('nodemanagers')-%>
+define service {
+ hostgroup_name nodemanagers
+ use hadoop-service
+ service_description GANGLIA::Ganglia Collector [gmond] process
down alert for Node Manager
+ servicegroups GANGLIA
+ check_command
check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_nm_port")%>!-w
1 -c 1
+ normal_check_interval 0.25
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+<%end-%>
+
+<%if scope.function_hdp_nagios_members_exist('historyserver2')-%>
+define service {
+ hostgroup_name historyserver2
+ use hadoop-service
+ service_description GANGLIA::Ganglia Collector [gmond] process
down alert for History Server 2
+ servicegroups GANGLIA
+ check_command
check_tcp!<%=scope.function_hdp_template_var("ganglia_collector_hs_port")%>!-w
1 -c 1
+ normal_check_interval 0.25
+ retry_check_interval 0.25
+ max_check_attempts 4
+}
+<%end-%>
+
<%end-%>
<%if scope.function_hdp_nagios_members_exist('snamenode')-%>
@@ -344,6 +384,94 @@ define service {
<%end-%>
+<%if scope.function_hdp_nagios_members_exist('resorcemanager')-%>
+# YARN::RESOURCEMANAGER Checks
+define service {
+ hostgroup_name resorcemanager
+ use hadoop-service
+ service_description RESOURCEMANAGER::Resource Manager Web UI down
+ servicegroups YARN
+ check_command
check_webui!resorcemanager!<%=scope.function_hdp_template_var("rm_port")%>
+ normal_check_interval 1
+ retry_check_interval 1
+ max_check_attempts 3
+}
+
+define service {
+ hostgroup_name resorcemanager
+ use hadoop-service
+ service_description RESOURCEMANAGER::Resource Manager CPU
utilization
+ servicegroups YARN
+ check_command check_cpu!200%!250%
+ normal_check_interval 5
+ retry_check_interval 2
+ max_check_attempts 5
+}
+
+define service {
+ hostgroup_name resorcemanager
+ use hadoop-service
+ service_description RESOURCEMANAGER::Resource Manager RPC latency
+ servicegroups YARN
+ check_command
check_rpcq_latency!ResorceManager!<%=scope.function_hdp_template_var("rm_port")%>!3000!5000
+ normal_check_interval 5
+ retry_check_interval 1
+ max_check_attempts 5
+}
+
+<% end %>
+
+<%if scope.function_hdp_nagios_members_exist('nodemanagers')-%>
+# YARN::NODEMANAGER Checks
+define service {
+ hostgroup_name nodemanagers
+ use hadoop-service
+ service_description NODEMANAGER::Node Manager process down
+ servicegroups YARN
+ check_command
check_tcp!<%=scope.function_hdp_template_var("nm_port")%>!-w 1 -c 1
+ normal_check_interval 1
+ retry_check_interval 0.5
+ max_check_attempts 3
+}
+<% end %>
+
+<%if scope.function_hdp_nagios_members_exist('historyserver2')-%>
+# MAPREDUCE::HISTORYSERVER2 Checks
+define service {
+ hostgroup_name historyserver2
+ use hadoop-service
+ service_description HISTORYSERVER2::History Server 2 Web UI down
+ servicegroups MAPREDUCE
+ check_command
check_webui!historyserver2!<%=scope.function_hdp_template_var("hs_port")%>
+ normal_check_interval 1
+ retry_check_interval 1
+ max_check_attempts 3
+}
+
+define service {
+ hostgroup_name historyserver2
+ use hadoop-service
+ service_description HISTORYSERVER::History Server 2 CPU utilization
+ servicegroups MAPREDUCE
+ check_command check_cpu!200%!250%
+ normal_check_interval 5
+ retry_check_interval 2
+ max_check_attempts 5
+}
+
+define service {
+ hostgroup_name historyserver2
+ use hadoop-service
+ service_description HISTORYSERVER::History Server 2 RPC latency
+ servicegroups MAPREDUCE
+ check_command
check_rpcq_latency!JobHistoryServer!<%=scope.function_hdp_template_var("hs_port")%>!3000!5000
+ normal_check_interval 5
+ retry_check_interval 1
+ max_check_attempts 5
+}
+
+<% end %>
+
<%if scope.function_hdp_nagios_members_exist('slaves')-%>
# HDFS::DATANODE Checks
define service {
Modified:
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp/manifests/init.pp
URL:
http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp/manifests/init.pp?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
---
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp/manifests/init.pp
(original)
+++
incubator/ambari/trunk/ambari-agent/src/main/puppet/modules/hdp/manifests/init.pp
Tue May 14 20:56:41 2013
@@ -50,6 +50,14 @@ class hdp(
$jtnode_port =
hdp_get_port_from_url($mapred-site["mapred.job.tracker.http.address"],"50030")
$tasktracker_port =
hdp_get_port_from_url($mapred-site["mapred.task.tracker.http.address"],"50060")
$jobhistory_port =
hdp_get_port_from_url($mapred-site["mapreduce.history.server.http.address"],"51111")
+
+ $hs_port =
hdp_get_port_from_url($mapred-site["mapreduce.jobhistory.webapp.address"],"19888")
+ }
+
+ if has_key($configuration, 'yarn-site') {
+ $yarn-site = $configuration['yarn-site']
+ $rm_port =
hdp_get_port_from_url($yarn-site["yarn.resourcemanager.webapp.address"],"8088")
+ $nm_port =
hdp_get_port_from_url($yarn-site["yarn.nodemanager.webapp.address"],"8042")
}
$hbase_master_port = hdp_default("hbase-site/hbase.master.info.port","60010")
@@ -60,6 +68,9 @@ class hdp(
$ganglia_collector_namenode_port =
hdp_default("ganglia_collector_namenode_port","8661")
$ganglia_collector_jobtracker_port =
hdp_default("ganglia_collector_jobtracker_port","8662")
$ganglia_collector_hbase_port =
hdp_default("ganglia_collector_hbase_port","8663")
+ $ganglia_collector_rm_port = hdp_default("ganglia_collector_rm_port","8664")
+ $ganglia_collector_nm_port = hdp_default("ganglia_collector_nm_port","8665")
+ $ganglia_collector_hs_port = hdp_default("ganglia_collector_hs_port","8666")
$oozie_server_port = hdp_default("oozie_server_port","11000")
Modified:
incubator/ambari/trunk/ambari-server/src/main/resources/stacks/HDP/2.0.1/services/GANGLIA/metainfo.xml
URL:
http://svn.apache.org/viewvc/incubator/ambari/trunk/ambari-server/src/main/resources/stacks/HDP/2.0.1/services/GANGLIA/metainfo.xml?rev=1482589&r1=1482588&r2=1482589&view=diff
==============================================================================
---
incubator/ambari/trunk/ambari-server/src/main/resources/stacks/HDP/2.0.1/services/GANGLIA/metainfo.xml
(original)
+++
incubator/ambari/trunk/ambari-server/src/main/resources/stacks/HDP/2.0.1/services/GANGLIA/metainfo.xml
Tue May 14 20:56:41 2013
@@ -31,10 +31,6 @@
<category>SLAVE</category>
</component>
- <component>
- <name>MONITOR_WEBSERVER</name>
- <category>MASTER</category>
- </component>
</components>
</metainfo>