Dzahn has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/373687 )

Change subject: icinga: enhance check for screen sessions, also detect tmux
......................................................................

icinga: enhance check for screen sessions, also detect tmux

Enhance the plugin script to not just detect SCREEN but also
tmux sessions. Use a more generic function to check processes
to make it easy to add others when needed.

Consider different types of OK (none running vs running but
not longer than crit/warn time thresholds).

Bug: T165348
Change-Id: I44ae0a6fedab4075353c73319eb5f4a4fc1f8d76
---
A modules/icinga/files/check_long_procs
D modules/icinga/files/check_long_screens
2 files changed, 132 insertions(+), 64 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/87/373687/1

diff --git a/modules/icinga/files/check_long_procs 
b/modules/icinga/files/check_long_procs
new file mode 100755
index 0000000..f82b142
--- /dev/null
+++ b/modules/icinga/files/check_long_procs
@@ -0,0 +1,132 @@
+#!/bin/bash
+# icinga/nagios plugin to detect long-running screen and tmux processes
+#
+# https://phabricator.wikimedia.org/T165348
+#
+# Daniel Zahn (<[email protected]>) - Wikimedia Foundation Inc.
+#
+
+set -eu
+
+usage() { echo "Usage: $0 -w <warn> -c <crit>" 1>&2; exit 1; }
+
+declare -i warn_time=0
+declare -i crit_time=0
+
+debug=false
+
+while getopts "w:c:" o; do
+    case "${o}" in
+    w)
+       warn_time=${OPTARG}
+       ;;
+    c)
+       crit_time=${OPTARG}
+       ;;
+    *)
+       usage
+       ;;
+    esac
+done
+
+if [ $warn_time == 0 ] || [ $crit_time == 0 ]; then
+    usage
+fi
+
+if [ $crit_time -le $warn_time ]; then
+    echo "Error. The value for CRIT must be higher than the value for WARN."
+    exit 1
+fi
+
+# check for long running processes
+# checkprocess <pattern>
+# example: checkprocess SCREEN
+#
+# Looks up every process whose command line matches <pattern> (pgrep -f),
+# picks the one that has been running longest and classifies it against the
+# global thresholds warn_time and crit_time (seconds).
+#
+# Prints one line to stdout: "<result>,<pid>,<run_time>"
+# possible results:
+# 0 = no process (pattern) found (OK); pid is empty and run_time is 0
+# 1 = process found and running longer than warn_time, but not longer than crit_time (WARN)
+# 2 = process found and running longer than crit_time (CRIT)
+# 4 = process found but not running longer than warn_time (OK)
+function checkprocess {
+
+    local pattern=$1
+    local check_pid=""
+    local run_time=0
+    local worst_pid=""
+    local -i worst_time=-1
+    local -i result=0
+
+    # pgrep exits non-zero when nothing matches; that just means an empty
+    # list here, so do not let it count as a failure.
+    for check_pid in $(/usr/bin/pgrep -f "${pattern}" || true); do
+        # xargs is only used to strip the padding ps puts around the number.
+        run_time=$(/bin/ps -o etimes= -p "${check_pid}" | /usr/bin/xargs)
+
+        # The process may have exited between pgrep and ps; ignore it.
+        if [ -z "${run_time}" ]; then
+            continue
+        fi
+
+        # Debug output goes to stderr so it cannot corrupt the result line
+        # that callers capture from stdout.
+        if $debug; then
+            echo "Found a '${pattern}'-process with PID ${check_pid}. It has been running ${run_time} seconds." 1>&2
+        fi
+
+        # Remember the longest-running match so that one old session is not
+        # hidden by a newer one that happens to be listed later.
+        if [ "${run_time}" -gt "${worst_time}" ]; then
+            worst_time=${run_time}
+            worst_pid=${check_pid}
+        fi
+    done
+
+    if [ -z "${worst_pid}" ]; then
+        echo "0,,0"
+        return 0
+    fi
+
+    # Test the higher threshold first; anything above crit_time is also
+    # above warn_time and must not be downgraded to WARN.
+    if [ "${worst_time}" -gt "${crit_time}" ]; then
+        result=2
+    elif [ "${worst_time}" -gt "${warn_time}" ]; then
+        result=1
+    else
+        result=4
+    fi
+
+    echo "${result},${worst_pid},${worst_time}"
+
+}
+
+# Run both checks. If the helper itself fails, report UNKNOWN explicitly
+# instead of letting "set -e" end the script silently with status 1, which
+# Icinga would display as WARNING.
+if ! result_screen=$(checkprocess SCREEN) || ! result_tmux=$(checkprocess tmux); then
+    echo "UNKNOWN: This should not happen. check ${0}."
+    exit 3
+fi
+
+# checkprocess prints "<result>,<pid>,<run_time>"; split it into its fields.
+IFS=, read -r return_code_screen pid_screen run_time_screen <<< "${result_screen}"
+IFS=, read -r return_code_tmux pid_tmux run_time_tmux <<< "${result_tmux}"
+
+# Report the most severe state first: CRIT before WARN, SCREEN before tmux.
+if [ "${return_code_screen}" = 2 ]; then
+    echo "CRIT: Long running SCREEN process. (PID: ${pid_screen}, ${run_time_screen}s > ${crit_time}s)."
+    exit 2
+fi
+
+if [ "${return_code_tmux}" = 2 ]; then
+    echo "CRIT: Long running tmux process. (PID: ${pid_tmux}, ${run_time_tmux}s > ${crit_time}s)."
+    exit 2
+fi
+
+if [ "${return_code_screen}" = 1 ]; then
+    echo "WARN: Long running SCREEN process. (PID: ${pid_screen}, ${run_time_screen}s > ${warn_time}s)."
+    exit 1
+fi
+
+if [ "${return_code_tmux}" = 1 ]; then
+    echo "WARN: Long running tmux process. (PID: ${pid_tmux}, ${run_time_tmux}s > ${warn_time}s)."
+    exit 1
+fi
+
+# Only the two OK codes are left: 0 (nothing found) and 4 (found, but not
+# running longer than warn_time). Concatenate them to pick the message.
+return_code_global="${return_code_screen}${return_code_tmux}"
+case "$return_code_global" in
+    00)
+        echo "OK: No SCREEN or tmux processes detected."
+        exit 0
+        ;;
+    04)
+        echo "OK: Tmux detected but not long running."
+        exit 0
+        ;;
+    40)
+        echo "OK: SCREEN detected but not long running."
+        exit 0
+        ;;
+    44)
+        echo "OK: SCREEN and tmux detected but not long running."
+        exit 0
+        ;;
+    *)
+        # Any other combination means checkprocess printed something
+        # unexpected.
+        echo "UNKNOWN: This should not happen. check ${0}."
+        exit 3
+        ;;
+esac
+
diff --git a/modules/icinga/files/check_long_screens 
b/modules/icinga/files/check_long_screens
deleted file mode 100755
index df821ad..0000000
--- a/modules/icinga/files/check_long_screens
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-# icinga/nagios plugin to detect long-running screen sessions
-#
-# https://phabricator.wikimedia.org/T165348
-#
-# Daniel Zahn (<[email protected]>) - Wikimedia Foundation Inc.
-#
-
-set -eu
-usage() { echo "Usage: $0 -w <warn> -c <crit>" 1>&2; exit 1; }
-
-declare -i WARN_TIME="60"
-declare -i CRIT_TIME="120"
-
-DEBUG=false
-
-while getopts "w:c:" o; do
-    case "${o}" in
-    w)
-       WARN_TIME=${OPTARG}
-       ;;
-    c)
-       CRIT_TIME=${OPTARG}
-       ;;
-    *)
-       usage
-       ;;
-    esac
-done
-
-if [ $WARN_TIME == 0 ] || [ $CRIT_TIME == 0 ]; then
-    usage
-fi
-
-SCREEN_PID=""
-RUN_TIME=0
-PGREP=$(which pgrep)
-PS=$(which ps)
-XARGS=$(which xargs)
-
-for SCREEN_PID in $(${PGREP} -f SCREEN)
-  do
-     RUN_TIME=$(${PS} -o etimes= -p ${SCREEN_PID} | ${XARGS})
-     if $DEBUG; then
-         echo "Found a screen with PID ${SCREEN_PID}. It's been running since 
${RUN_TIME} seconds."
-     fi
-     if [ $RUN_TIME -gt $CRIT_TIME ]; then
-         echo "CRIT: Long running screen session. (PID ${SCREEN_PID}, 
${RUN_TIME}s > ${CRIT_TIME}s)"
-         exit 2
-     fi
-     if [ $RUN_TIME -gt $WARN_TIME ]; then
-         echo "WARN: Long running screen session. (PID ${SCREEN_PID}, 
${RUN_TIME}s > ${WARN_TIME}s)"
-         exit 1
-     fi
-     if [ $RUN_TIME -le $WARN_TIME ] && [ $RUN_TIME -le $CRIT_TIME ] ; then
-         echo "OK: No long running screen sessions detected."
-         exit 0
-     fi
-done
-
-echo "UNKNOWN: something went wrong with the plugin. check $0"
-exit 3
-
-

-- 
To view, visit https://gerrit.wikimedia.org/r/373687
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I44ae0a6fedab4075353c73319eb5f4a4fc1f8d76
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Dzahn <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to