Ottomata has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/399255 )

Change subject: Add nrpe check_newest_file_age; monitor some analytics file 
backups
......................................................................

Add nrpe check_newest_file_age; monitor some analytics file backups

Bug: T182327
Change-Id: I785a5f601bfb3f0b48a604a8e39991cac256b8a5
---
A modules/nrpe/files/plugins/check_newest_file_age
M modules/profile/manifests/analytics/database/meta/backup_dest.pp
M modules/profile/manifests/hadoop/backup/namenode.pp
3 files changed, 385 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/55/399255/1

diff --git a/modules/nrpe/files/plugins/check_newest_file_age 
b/modules/nrpe/files/plugins/check_newest_file_age
new file mode 100755
index 0000000..0b7a4f8
--- /dev/null
+++ b/modules/nrpe/files/plugins/check_newest_file_age
@@ -0,0 +1,367 @@
+#! /bin/sh
+#
+# NOTE: This file is managed by Puppet.
+# Originally copied from 
https://github.com/thehunmonkgroup/nagios-plugin-newest-file-age
+#
+# Newest file in a directory plugin for Nagios.
+# Written by Chad Phillips ([email protected])
+# Last Modified: 2009-02-12
+#
+# The MIT License (MIT)
+#
+# Copyright (c) 2015 thehunmonkgroup
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in 
all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+PROGPATH=`dirname $0`
+
+STATE_OK=0
+STATE_WARNING=1
+STATE_CRITICAL=2
+STATE_UNKNOWN=3
+STATE_DEPENDENT=4
+
+print_usage() {
+    echo "
+Usage: check_newest_file_age --dirs | -d <directories> [-w <max_age>] [-c 
<max_age>] [-W] [-C] [-t <time_unit>] [-V] [--check-dirs] [--base-dir 
<directory>]
+Usage: check_newest_file_age --help | -h
+
+Description:
+
+This plugin pulls the most recently created file in each specified directory,
+and checks it's created time against the current time.  If the maximum age of
+the file is exceeded, a warning/critical message is returned as appropriate.
+
+This is useful for examining backup directories for freshness.
+
+Tested to work on Linux/FreeBSD/OS X.
+
+The following arguments are accepted:
+
+  --dirs | -d     A space separated list of directories to examine.  Each
+                  directory will be checked for the newest created file in that
+                  directory.
+
+  -w              (Optional) Generate a warning message if the last created
+                  file is older than this value.  Defaults to 26 hours.
+
+  -c            (Optional) Generate a critical message if the last created
+                  file is older than this value.  Defaults to 52 hours.
+
+  -W              (Optional) If set, a warning message will be returned if the
+                  specified directory doesn't exist, or there are no checkable
+                  files in the specified directory.
+
+  -C              (Optional) If set, a critical message will be returned if the
+                  specified directory doesn't exist, or there are no checkable
+                  files in the specified directory.
+
+  -t              (Optional) The time unit used for the -w and -c values.  Must
+                  be one of: seconds, minutes, hours, days.  Defaults to hours.
+
+  -V              (Optional) Output verbose information about all checked
+                  directories.  Default is only to print verbose information
+                  for directories with non-OK states.
+
+  --check-dirs    (Optional) If set, directories inside the specified directory
+                  will also be checked for their creation time. Note that this
+                  check is not recursive.  Without this option, only real files
+                  inside the specified directory will be checked.
+
+  --base-dir      (Optional) If set, this path will be prepended to all
+                  checked directories.
+
+  --help | -h     Print this help and exit.
+
+Examples:
+
+Generate a warning if the newest file in /backups is more than 26 hours old,
+and a critical if it's more than 52 hours old...
+
+  check_newest_file_age -d \"/backups\"
+
+Generate a warning if the newest file in /backups/bill or /backups/dave is more
+than one week old, or a critical if it's more than two weeks old...
+
+  check_newest_file_age -d \"/backups/bill /backups/dave\" -w 7 -c 14 -t days
+
+Caveats:
+
+Although multiple directories can be specified, only one set of
+warning/critical times can be supplied.
+
+Linux doesn't seem to have an easy way to check file/directory creation time,
+so file/directory last modification time is used instead.
+"
+}
+
+print_help() {
+    print_usage
+    echo "Newest file in a directory plugin for Nagios."
+    echo ""
+}
+
+# Sets the exit status for the plugin.  This is done in such a way that the
+# status can only go in one direction: OK -> WARNING -> CRITICAL.
+set_exit_status() {
+  new_status=$1
+  # Nothing needs to be done if the state is already critical, so exclude
+  # that case.
+  case $exitstatus
+  in
+    $STATE_WARNING)
+      # Only upgrade from warning to critical.
+      if [ "$new_status" = "$STATE_CRITICAL" ]; then
+        exitstatus=$new_status;
+      fi
+    ;;
+    $STATE_OK)
+      # Always update state if current state is OK.
+      exitstatus=$new_status;
+    ;;
+  esac
+}
+
+# Make sure the correct number of command line
+# arguments have been supplied
+if [ $# -lt 1 ]; then
+    print_usage
+    exit $STATE_UNKNOWN
+fi
+
+# Defaults.
+exitstatus=$STATE_OK
+warning=26
+critical=52
+time_unit=hours
+verbose=
+on_empty=$STATE_OK
+check_dirs=
+base_dir=
+
+# Grab the command line arguments.
+while test -n "$1"; do
+    case "$1" in
+        --help)
+            print_help
+            exit $STATE_OK
+            ;;
+        -h)
+            print_help
+            exit $STATE_OK
+            ;;
+        --dirs)
+            dirs=$2
+            shift
+            ;;
+        -d)
+            dirs=$2
+            shift
+            ;;
+        -w)
+            warning=$2
+            shift
+            ;;
+        -c)
+            critical=$2
+            shift
+            ;;
+    -W)
+      on_empty=$STATE_WARNING
+            ;;
+    -C)
+      on_empty=$STATE_CRITICAL
+            ;;
+        -t)
+            time_unit=$2
+            shift
+            ;;
+    -V)
+      verbose=1
+            ;;
+        --check-dirs)
+            check_dirs=1
+            ;;
+    --base-dir)
+          base_dir=$2
+      shift
+      ;;
+        -x)
+            exitstatus=$2
+            shift
+            ;;
+        --exitstatus)
+            exitstatus=$2
+            shift
+            ;;
+        *)
+            echo "Unknown argument: $1"
+            print_usage
+            exit $STATE_UNKNOWN
+            ;;
+    esac
+    shift
+done
+
+if [ ! "$dirs" ]; then
+  echo "No directories provided."
+  exit $STATE_UNKNOWN
+fi
+
+if [ `echo "$warning" | grep [^0-9]` ] || [ ! "$warning" ]; then
+  echo "Warning value must be a number."
+  exit $STATE_UNKNOWN
+fi
+
+if [ `echo "$critical" | grep [^0-9]` ] || [ ! "$critical" ]; then
+  echo "Critical value must be a number."
+  exit $STATE_UNKNOWN
+fi
+
+if [ ! `echo "$time_unit" | grep "seconds\|minutes\|hours\|days"` ]; then
+  echo "Time unit must be one of: seconds, minutes, hours, days."
+  exit $STATE_UNKNOWN
+fi
+
+if [ "$warning" -ge "$critical" ]; then
+  echo "Critical time must be greater than warning time."
+  exit $STATE_UNKNOWN
+fi
+
+case $time_unit
+in
+  days)
+    multiplier=86400;
+    abbreviation="days";
+  ;;
+  hours)
+    multiplier=3600;
+    abbreviation="hrs";
+  ;;
+  minutes)
+    multiplier=60;
+    abbreviation="mins";
+  ;;
+  *)
+    multiplier=1
+    abbreviation="secs";
+  ;;
+esac
+
+# Starting values.
+DIR_COUNT=0
+OK_FILE_COUNT=0
+OUTPUT=
+CURRENT_TIME=`date +%s`
+OS_DISTRO=`uname -s`
+
+# Loop through each provided directory.
+for dir in $dirs
+do
+  check_file=
+  DIR_COUNT=$(($DIR_COUNT + 1))
+
+  # Check if dir exists.
+  full_path=${base_dir}${dir}
+  if [ -d "$full_path" ]; then
+    file_list=`ls -t $full_path`
+    # Cycle through files, looking for a checkable file.
+    for next_file in $file_list
+    do
+      next_filepath=$full_path/$next_file
+      if [ "$check_dirs" ]; then
+        # Check if it's a file or directory.
+        if [ -f "$next_filepath" ] || [ -d "$next_filepath" ]; then
+          check_file=1
+        fi
+      else
+        # Check if it's a file.
+        if [ -f "$next_filepath" ]; then
+          check_file=1
+        fi
+      fi
+      if [ "$check_file" ]; then
+        # stat doesn't work the same on Linux and FreeBSD/Darwin, so
+        # make adjustments here.
+        if [ "$OS_DISTRO" = "Linux" ]; then
+          st_ctime=`stat --printf=%Y ${next_filepath}`
+        else
+          eval $(stat -s ${next_filepath})
+        fi
+
+        FILE_AGE=$(($CURRENT_TIME - $st_ctime))
+        FILE_AGE_UNITS=$(($FILE_AGE / $multiplier))
+        MAX_WARN_AGE=$(($warning * $multiplier))
+        MAX_CRIT_AGE=$(($critical * $multiplier))
+        if [ $FILE_AGE -gt $MAX_CRIT_AGE ]; then
+          OUTPUT="$OUTPUT ${dir}: ${FILE_AGE_UNITS}${abbreviation}"
+          set_exit_status $STATE_CRITICAL
+        elif [ $FILE_AGE -gt $MAX_WARN_AGE ]; then
+          OUTPUT="$OUTPUT ${dir}: ${FILE_AGE_UNITS}${abbreviation}"
+          set_exit_status $STATE_WARNING
+        else
+          OK_FILE_COUNT=$(($OK_FILE_COUNT + 1))
+          if [ "$verbose" ]; then
+            OUTPUT="$OUTPUT ${dir}: ${FILE_AGE_UNITS}${abbreviation}"
+          fi
+        fi
+        break
+      fi
+    done
+    # Check here to see if any files got tested in the directory.
+    if [ ! "$check_file" ]; then
+      set_exit_status $on_empty
+      OUTPUT="$OUTPUT ${dir}: No files"
+      # If empty is an OK state, then increment the ok file count.
+      if [ "$on_empty" = "$STATE_OK" ]; then
+        OK_FILE_COUNT=$(($OK_FILE_COUNT + 1))
+      fi
+    fi
+  else
+    set_exit_status $on_empty
+    OUTPUT="$OUTPUT ${dir}: Does not exist"
+  fi
+done
+
+case $exitstatus
+in
+  $STATE_CRITICAL)
+    exit_message="CRITICAL";
+  ;;
+  $STATE_WARNING)
+    exit_message="WARNING";
+  ;;
+  $STATE_OK)
+    exit_message="OK";
+  ;;
+  *)
+    exitstatus=$STATE_UNKNOWN;
+    exit_message="UNKNOWN";
+  ;;
+esac
+
+exit_message="${exit_message}: ${OK_FILE_COUNT}/${DIR_COUNT}"
+
+if [ "$OUTPUT" ]; then
+  exit_message="${exit_message} --${OUTPUT}"
+fi
+
+echo "$exit_message"
+exit $exitstatus
\ No newline at end of file
diff --git a/modules/profile/manifests/analytics/database/meta/backup_dest.pp 
b/modules/profile/manifests/analytics/database/meta/backup_dest.pp
index 872d9f9..5be8007 100644
--- a/modules/profile/manifests/analytics/database/meta/backup_dest.pp
+++ b/modules/profile/manifests/analytics/database/meta/backup_dest.pp
@@ -50,4 +50,13 @@
         port   => '873',
         srange => "@resolve((${rsync_clients_ferm}))",
     }
+
+    # Alert if backup gets stale.
+    $warning_threshold_hours = 26
+    $critical_threshold_hours = 48
+    nrpe::monitor_service { 'analytics-database-meta-backup-age':
+        description   => 'Age of most recent Analytics meta MySQL database 
backup files',
+        nrpe_command  => "/usr/local/lib/nagios/plugins/check_newest_file_age 
-C -d /srv/backup/mysql/analytics-meta -w ${$warning_threshold_hours} -c 
${critical_threshold_hours}",
+        contact_group => 'analytics',
+    }
 }
diff --git a/modules/profile/manifests/hadoop/backup/namenode.pp 
b/modules/profile/manifests/hadoop/backup/namenode.pp
index 5afd377..1f98d4e 100644
--- a/modules/profile/manifests/hadoop/backup/namenode.pp
+++ b/modules/profile/manifests/hadoop/backup/namenode.pp
@@ -44,6 +44,15 @@
         minute  => 0,
     }
 
+    # Alert if backup gets stale.
+    $warning_threshold_hours = 26
+    $critical_threshold_hours = 48
+    nrpe::monitor_service { 'hadoop-namenode-backup-age':
+        description   => 'Age of most recent Hadoop NameNode backup files',
+        nrpe_command  => "/usr/local/lib/nagios/plugins/check_newest_file_age 
-C -d ${destination} -w ${$warning_threshold_hours} -c 
${critical_threshold_hours}",
+        contact_group => 'analytics',
+    }
+
     # Bacula will also back up this directory.
     # See: bacula::director::fileset { 'hadoop-namenode-backup'
     # in profile::backup::director

-- 
To view, visit https://gerrit.wikimedia.org/r/399255
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I785a5f601bfb3f0b48a604a8e39991cac256b8a5
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to