Updated Branches: refs/heads/trunk f4cc4c887 -> 4be888c57
AMBARI-2861. YARN RM/NM alerts need to be generated. (Vitaly Brodetskyi via swagle) Project: http://git-wip-us.apache.org/repos/asf/incubator-ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-ambari/commit/4be888c5 Tree: http://git-wip-us.apache.org/repos/asf/incubator-ambari/tree/4be888c5 Diff: http://git-wip-us.apache.org/repos/asf/incubator-ambari/diff/4be888c5 Branch: refs/heads/trunk Commit: 4be888c577e71cef7ada6f68548bff436ac10d50 Parents: f4cc4c8 Author: Siddharth Wagle <[email protected]> Authored: Fri Aug 9 14:46:31 2013 -0700 Committer: Siddharth Wagle <[email protected]> Committed: Fri Aug 9 14:46:31 2013 -0700 ---------------------------------------------------------------------- .../files/check_nodemanager_health.sh | 32 ++++++++++++++ .../check_resourcemanager_nodes_percentage.sh | 45 ++++++++++++++++++++ .../hdp-nagios/manifests/server/config.pp | 2 + .../templates/hadoop-commands.cfg.erb | 10 +++++ .../templates/hadoop-services.cfg.erb | 32 ++++++++++++++ 5 files changed, 121 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh new file mode 100644 index 0000000..ca13909 --- /dev/null +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_nodemanager_health.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# +HOST=$1 +PORT=$2 +NODEMANAGER_URL="http://$HOST:$PORT/ws/v1/node/info" +export PATH="/usr/bin:$PATH" +RESPONSE=`curl $NODEMANAGER_URL` +if [[ "$RESPONSE" == *'"nodeHealthy":true'* ]]; then + echo "OK: nodemanager healthy true"; + exit 0; +fi +echo "CRITICAL: nodemanager healthy false"; +exit 2; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh new file mode 100644 index 0000000..48a2aae --- /dev/null +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +# +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# +HOST=$1 +PORT=$2 +#Resource manager nodes, with selected status, which number we want to know +NODE_STATUS=$3 +WARN_PERCENT=$4 +CRIT_PERCENT=$5 +NODES="Nodes" +RESOURCEMANAGER_URL="http://$HOST:$PORT/ws/v1/cluster/metrics" +export PATH="/usr/bin:$PATH" +RESPONSE=`curl $RESOURCEMANAGER_URL` +#code below is parsing RESPONSE that we get from resourcemanager api, for number between "totalNodes": and ',' +TOTAL_NODES_NUM=`echo "$RESPONSE" | sed -nre 's/^.*"totalNodes":([[:digit:]]+).*$/\1/gp'` +NODES_NUM=`echo "$RESPONSE" | sed -nre "s/^.*\"$NODE_STATUS$NODES\":([[:digit:]]+).*$/\1/gp"` +PERCENT=$(($NODES_NUM*100/$TOTAL_NODES_NUM)) +if [[ "$PERCENT" -lt "$WARN_PERCENT" ]]; then + echo "OK: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>" + exit 0; +elif [[ "$PERCENT" -lt "$CRIT_PERCENT" ]]; then + echo "WARN: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>" + exit 1; +else + echo "CRITICAL: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>" + exit 2; +fi http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp b/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp index 025bcd7..598a8f5 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp @@ -50,6 +50,8 @@ class hdp-nagios::server::config() hdp-nagios::server::check { 'check_ambari_agent_status.sh': } hdp-nagios::server::check { 'check_hue_status.sh': } hdp-nagios::server::check { 'check_mapred_local_dir_used.sh': } + hdp-nagios::server::check { 'check_nodemanager_health.sh': } + hdp-nagios::server::check { 'check_resourcemanager_nodes_percentage.sh': } anchor{'hdp-nagios::server::config::begin':} -> Hdp-nagios::Server::Configfile<||> -> anchor{'hdp-nagios::server::config::end':} Anchor['hdp-nagios::server::config::begin'] -> Hdp-nagios::Server::Check<||> -> Anchor['hdp-nagios::server::config::end'] http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb index 1233e18..4dbc398 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb @@ -96,3 +96,13 @@ define command{ command_name check_mapred_local_dir_used_space command_line $USER1$/check_mapred_local_dir_used.sh $ARG1$ $ARG2$ } + +define command{ + command_name check_nodemanager_health + command_line $USER1$/check_nodemanager_health.sh $HOSTADDRESS$ $ARG1$ + } + +define command{ + command_name check_resourcemanager_nodes_percentage + command_line $USER1$/check_resourcemanager_nodes_percentage.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$ + } http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/4be888c5/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb index 8e29808..401c79f 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb @@ -435,6 +435,27 @@ define service { max_check_attempts 5 } +define service { + hostgroup_name resourcemanager + use hadoop-service + service_description RESOURCEMANAGER::Resource Manager percent nodemanager down + servicegroups YARN + check_command check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!lost!10!30 + normal_check_interval 1 + retry_check_interval 1 + max_check_attempts 3 +} + +define service { + hostgroup_name resourcemanager + use hadoop-service + service_description RESOURCEMANAGER::Resource Manager percent nodemanager unhealthy + servicegroups YARN + check_command check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!unhealthy!10!30 + normal_check_interval 1 + retry_check_interval 1 + max_check_attempts 3 +} <% end %> <%if scope.function_hdp_nagios_members_exist('nodemanagers')-%> @@ -449,6 +470,17 @@ define service { retry_check_interval 0.5 max_check_attempts 3 } + +define service { + hostgroup_name nodemanagers + use hadoop-service + service_description NODEMANAGER::Node Manager unhealthy + servicegroups YARN + check_command check_nodemanager_health!<%=scope.function_hdp_template_var("nm_port")%> + normal_check_interval 1 + retry_check_interval 1 + max_check_attempts 3 +} <% end %> <%if scope.function_hdp_nagios_members_exist('historyserver2')-%>
