Updated Branches: refs/heads/trunk 631ae2b5b -> ddfbda054
AMBARI-2928. Add a Nagios alert to check state of NN HA. (Dmitry Sen via odiachenko) Project: http://git-wip-us.apache.org/repos/asf/incubator-ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-ambari/commit/ddfbda05 Tree: http://git-wip-us.apache.org/repos/asf/incubator-ambari/tree/ddfbda05 Diff: http://git-wip-us.apache.org/repos/asf/incubator-ambari/diff/ddfbda05 Branch: refs/heads/trunk Commit: ddfbda054fd2c7cf063c2a1ef8189e880c091a4a Parents: 631ae2b Author: Oleksandr Diachenko <[email protected]> Authored: Wed Sep 4 18:18:41 2013 +0300 Committer: Oleksandr Diachenko <[email protected]> Committed: Wed Sep 4 18:19:01 2013 +0300 ---------------------------------------------------------------------- .../hdp-nagios/files/check_namenodes_ha.sh | 82 ++++++++++++++++++++ .../hdp-nagios/manifests/server/config.pp | 1 + .../templates/hadoop-commands.cfg.erb | 5 ++ .../templates/hadoop-services.cfg.erb | 13 +++- 4 files changed, 100 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/ddfbda05/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_namenodes_ha.sh ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_namenodes_ha.sh b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_namenodes_ha.sh new file mode 100644 index 0000000..e476def --- /dev/null +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_namenodes_ha.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +# +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# + +IFS=',' read -a namenodes <<< "$1" +port=$2 +totalNN=${#namenodes[@]} +activeNN=() +standbyNN=() +unavailableNN=() + +for nn in "${namenodes[@]}" +do + status=$(curl -m 5 -s http://$nn:$port/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem | grep -i "tag.HAState" | grep -o -E "standby|active") + if [ "$status" == "active" ]; then + activeNN[${#activeNN[*]}]="$nn" + elif [ "$status" == "standby" ]; then + standbyNN[${#standbyNN[*]}]="$nn" + elif [ "$status" == "" ]; then + unavailableNN[${#unavailableNN[*]}]="$nn" + fi +done + +message="" +critical=false + +if [ ${#activeNN[@]} -gt 1 ]; then + critical=true + message=$message" Only one NN can have HAState=active;" +elif [ ${#activeNN[@]} == 0 ]; then + critical=true + message=$message" No Active NN available;" +elif [ ${#standbyNN[@]} == 0 ]; then + critical=true + message=$message" No failover NN available;" +fi + +NNstats=" Active<" +for nn in "${activeNN[@]}" +do + NNstats="$NNstats$nn;" +done +NNstats=${NNstats%\;} +NNstats=$NNstats">, Standby<" +for nn in "${standbyNN[@]}" +do + NNstats="$NNstats$nn;" +done +NNstats=${NNstats%\;} +NNstats=$NNstats">, Unavailable<" +for nn in "${unavailableNN[@]}" +do + NNstats="$NNstats$nn;" +done +NNstats=${NNstats%\;} +NNstats=$NNstats">" + +if [ $critical == false ]; then + echo "OK: NameNode HA healthy;"$NNstats + exit 0 +fi + +echo "CRITICAL:"$message$NNstats +exit 2 http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/ddfbda05/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp b/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp index 598a8f5..f013c58 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp @@ -52,6 +52,7 @@ class hdp-nagios::server::config() hdp-nagios::server::check { 'check_mapred_local_dir_used.sh': } hdp-nagios::server::check { 'check_nodemanager_health.sh': } hdp-nagios::server::check { 'check_resourcemanager_nodes_percentage.sh': } + hdp-nagios::server::check { 'check_namenodes_ha.sh': } anchor{'hdp-nagios::server::config::begin':} -> Hdp-nagios::Server::Configfile<||> -> anchor{'hdp-nagios::server::config::end':} Anchor['hdp-nagios::server::config::begin'] -> Hdp-nagios::Server::Check<||> -> Anchor['hdp-nagios::server::config::end'] http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/ddfbda05/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb index 09fe235..f3aec98 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb @@ -98,6 +98,11 @@ define command{ } define command{ + command_name check_namenodes_ha + command_line $USER1$/check_namenodes_ha.sh $ARG1$ $ARG2$ + } + +define command{ command_name check_nodemanager_health command_line $USER1$/check_nodemanager_health.sh $HOSTADDRESS$ $ARG1$ } http://git-wip-us.apache.org/repos/asf/incubator-ambari/blob/ddfbda05/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb index 8d55338..4f50b67 100644 --- a/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb +++ b/ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb @@ -63,7 +63,18 @@ define service { retry_check_interval 0.25 max_check_attempts 3 } - +<% if scope.function_hdp_nagios_members_exist('namenode') && (scope.function_hdp_get_major_stack_version([scope.function_hdp_template_var("stack_version")]) >= 2)%> +define service { + hostgroup_name nagios-server + use hadoop-service + service_description HDFS::NameNode HA Healthy + servicegroups HDFS + check_command check_namenodes_ha!$HOSTGROUPMEMBERS:namenode$!<%=scope.function_hdp_template_var("::hdp::namenode_port")%> + normal_check_interval 0.5 + retry_check_interval 0.25 + max_check_attempts 5 +} +<%end-%> # AMBARI AGENT Checks define service {
