Updated Branches: refs/heads/trunk 062480094 -> 82d4a5438
AMBARI-3256. 'Percent NodeManager Live' alert and 'Percent NodeManager healthy' alert for YARN service work with a considerable delay (Andrew Onischuk via dlysnichenko) Project: http://git-wip-us.apache.org/repos/asf/incubator-ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-ambari/commit/82d4a543 Tree: http://git-wip-us.apache.org/repos/asf/incubator-ambari/tree/82d4a543 Diff: http://git-wip-us.apache.org/repos/asf/incubator-ambari/diff/82d4a543 Branch: refs/heads/trunk Commit: 82d4a5438afc8ed8cda46dbfa09d819efe703106 Parents: 0624800 Author: Lisnichenko Dmitro <[email protected]> Authored: Wed Sep 18 15:17:43 2013 +0300 Committer: Lisnichenko Dmitro <[email protected]> Committed: Wed Sep 18 15:17:43 2013 +0300 ---------------------------------------------------------------------- .../check_resourcemanager_nodes_percentage.sh | 59 -------------------- .../hdp-nagios/manifests/server/config.pp | 1 - .../templates/hadoop-commands.cfg.erb | 5 -- .../templates/hadoop-services.cfg.erb | 32 ++++------- 4 files changed, 10 insertions(+), 87 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/82d4a543/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh deleted file mode 100644 index 358da57..0000000 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env bash -# -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# -HOST=$1 -PORT=$2 -#Resource manager nodes, with selected status, which number we want to know -NODE_STATUS=$3 -WARN_PERCENT=$4 -CRIT_PERCENT=$5 -NODES="Nodes" - -RESOURCEMANAGER_URL="http://$HOST:$PORT/ws/v1/cluster/metrics" -export PATH="/usr/bin:$PATH" -RESPONSE=`curl -s $RESOURCEMANAGER_URL` - -if [ -z "$RESPONSE" ]; then - echo "CRITICAL: Can't get data from http://$HOST:$PORT/ws/v1/cluster/metrics" - exit 2; -fi - -#code below is parsing RESPONSE that we get from resourcemanager api, for number between "activeNodes": and ',' -ACTIVE_NODES=`echo "$RESPONSE" | sed -nre 's/^.*"activeNodes":([[:digit:]]+).*$/\1/gp'` -LOST_NODES=`echo "$RESPONSE" | sed -nre 's/^.*"lostNodes":([[:digit:]]+).*$/\1/gp'` -UNHEALTHY_NODES=`echo "$RESPONSE" | sed -nre 's/^.*"unhealthyNodes":([[:digit:]]+).*$/\1/gp'` -DECOMMISSIONED_NODES=`echo "$RESPONSE" | sed -nre 's/^.*"decommissionedNodes":([[:digit:]]+).*$/\1/gp'` -REBOOTED_NODES=`echo "$RESPONSE" | sed -nre 's/^.*"rebootedNodes":([[:digit:]]+).*$/\1/gp'` - -TOTAL_NODES_NUM=$(($ACTIVE_NODES+$LOST_NODES+$UNHEALTHY_NODES+$DECOMMISSIONED_NODES+$REBOOTED_NODES)) -NODES_NUM=`echo "$RESPONSE" | sed -nre "s/^.*\"$NODE_STATUS$NODES\":([[:digit:]]+).*$/\1/gp"` -PERCENT=$(($NODES_NUM*100/$TOTAL_NODES_NUM)) - -if [[ "$PERCENT" -lt "$WARN_PERCENT" ]]; then - echo "OK: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>" - exit 0; -elif [[ "$PERCENT" -lt "$CRIT_PERCENT" ]]; then - echo "WARNING: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>" - exit 1; -else - echo "CRITICAL: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>" - exit 2; -fi http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/82d4a543/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp b/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp index cdae953..c527e1f 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp @@ -50,7 +50,6 @@ class hdp-nagios::server::config() hdp-nagios::server::check { 'check_hue_status.sh': } hdp-nagios::server::check { 'check_mapred_local_dir_used.sh': } hdp-nagios::server::check { 'check_nodemanager_health.sh': } - hdp-nagios::server::check { 'check_resourcemanager_nodes_percentage.sh': } hdp-nagios::server::check { 'check_namenodes_ha.sh': } hdp-nagios::server::check { 'hdp_nagios_init.php': } http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/82d4a543/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb index 7a8e293..c532adb 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb @@ -102,8 +102,3 @@ define command{ command_name check_nodemanager_health command_line $USER1$/check_nodemanager_health.sh $HOSTADDRESS$ $ARG1$ } - -define command{ - command_name check_resourcemanager_nodes_percentage - command_line $USER1$/check_resourcemanager_nodes_percentage.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$ - } http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/82d4a543/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb index f1d4b3b..4ecfbb2 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb @@ -458,28 +458,6 @@ define service { define service { hostgroup_name resourcemanager use hadoop-service - service_description RESOURCEMANAGER::Percent NodeManager live - servicegroups YARN - check_command check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!lost!10!30 - normal_check_interval 1 - retry_check_interval 1 - max_check_attempts 3 -} - -define service { - hostgroup_name resourcemanager - use hadoop-service - service_description RESOURCEMANAGER::Percent NodeManager healthy - servicegroups YARN - check_command check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!unhealthy!10!30 - normal_check_interval 1 - retry_check_interval 1 - max_check_attempts 3 -} - -define service { - hostgroup_name resourcemanager - use hadoop-service service_description RESOURCEMANAGER::ResourceManager process servicegroups YARN check_command check_tcp!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!-w 1 -c 1 @@ -512,6 +490,16 @@ define service { retry_check_interval 1 max_check_attempts 3 } +define service { + hostgroup_name nagios-server + use hadoop-service + service_description NODEMANAGER::Percent NodeManager process + servicegroups YARN + check_command check_aggregate!"NODEMANAGER::NodeManager process"!10%!30% + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 3 +} <% end %> <%if scope.function_hdp_nagios_members_exist('historyserver2')-%>
