Dzahn has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/373687 )
Change subject: icinga: enhance check for screen sessions, also detect tmux ...................................................................... icinga: enhance check for screen sessions, also detect tmux Enhance the plugin script to not just detect SCREEN but also tmux sessions. Use a more generic function to check processes to make it easy to add others when needed. Consider different types of OK (none running vs running but not longer than crit/warn time thresholds). Bug: T165348 Change-Id: I44ae0a6fedab4075353c73319eb5f4a4fc1f8d76 --- A modules/icinga/files/check_long_procs D modules/icinga/files/check_long_screens 2 files changed, 132 insertions(+), 64 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/87/373687/1 diff --git a/modules/icinga/files/check_long_procs b/modules/icinga/files/check_long_procs new file mode 100755 index 0000000..f82b142 --- /dev/null +++ b/modules/icinga/files/check_long_procs @@ -0,0 +1,132 @@ +#!/bin/bash +# icinga/nagios plugin to detect long-running screen and tmux processes +# +# https://phabricator.wikimedia.org/T165348 +# +# Daniel Zahn (<[email protected]>) - Wikimedia Foundation Inc. +# + +set -eu + +usage() { echo "Usage: $0 -w <warn> -c <crit>" 1>&2; exit 1; } + +declare -i warn_time=0 +declare -i crit_time=0 + +debug=false + +while getopts "w:c:" o; do + case "${o}" in + w) + warn_time=${OPTARG} + ;; + c) + crit_time=${OPTARG} + ;; + *) + usage + ;; + esac +done + +if [ $warn_time == 0 ] || [ $crit_time == 0 ]; then + usage +fi + +if [ $crit_time -le $warn_time ]; then + echo "Error. The value for CRIT must be higher than the value for WARN." + exit 1 +fi + +# check for long running processes +# checkprocess <pattern> +# example: checkprocess SCREEN +# possible results: +# 0 = no process (pattern) found (OK) +# 1 = process found and running longer than warn_time, but not longer than crit_time (WARN) +# 2 = process found and running longer than crit_time (CRIT) +# 4 = process found but not running longer than warn_time (OK) +function checkprocess { + + pattern=$1 + check_pid="" + run_time=0 + declare -A result + + if /usr/bin/pgrep -fc $pattern > /dev/null; then + for check_pid in $(/usr/bin/pgrep -f ${pattern}) ; do + run_time=$(/bin/ps -o etimes= -p ${check_pid} | /usr/bin/xargs) + if $debug; then + echo "Found a '${pattern}'-process with PID ${check_pid}. It has been running ${run_time} seconds." + fi + if [ $run_time -gt $crit_time ]; then + result[$pattern]=2 + fi + if [ $run_time -gt $warn_time ]; then + result[$pattern]=1 + fi + if [ $run_time -le $warn_time ] && [ $run_time -le $crit_time ] ; then + result[$pattern]=4 + fi + done + else + result[$pattern]=0 + fi + +echo "${result["${pattern}"]},${check_pid},${run_time}" + +} + +result_screen=$(checkprocess SCREEN) +result_tmux=$(checkprocess tmux) + +return_code_screen=$(echo $result_screen | cut -d, -f1) +pid_screen=$(echo $result_screen | cut -d, -f2) +run_time_screen=$(echo $result_screen | cut -d, -f3) + +return_code_tmux=$(echo $result_tmux | cut -d, -f1) +pid_tmux=$(echo $result_tmux | cut -d, -f2) +run_time_tmux=$(echo $result_tmux | cut -d, -f3) + +if [ $return_code_screen -eq 2 ]; then + echo "CRIT: Long running SCREEN process. (PID: ${pid_screen}, ${run_time_screen}s > ${crit_time}s)." + exit 2 +fi + +if [ $return_code_tmux -eq 2 ]; then + echo "CRIT: Long running tmux process. (PID: ${pid_tmux}, ${run_time_tmux}s > ${crit_time}s)." + exit 2 +fi + +if [ $return_code_screen -eq 1 ]; then + echo "WARN: Long running SCREEN process. (PID: ${pid_screen}, ${run_time_screen}s > ${warn_time}s)." + exit 1 +fi + +if [ $return_code_tmux -eq 1 ]; then + echo "WARN: Long running tmux process. (PID: ${pid_tmux}, ${run_time_tmux}s > ${warn_time}s)." + exit 1 +fi + +return_code_global="${return_code_screen}${return_code_tmux}" +case "$return_code_global" in + 00) + echo "OK: No SCREEN or tmux processes detected." + exit 0 + ;; + 04) echo "OK: Tmux detected but not long running." + exit 0 + ;; + 40) echo "OK: SCREEN detected but not long running." + exit 0 + ;; + 44) echo "OK: SCREEN and tmux detected but not long running." + exit 0 + ;; + *) echo "UNKNOWN: This should not happen. check ${0}." + exit 3 + ;; +esac + +echo "UNKNOWN: This should not happen. check ${0}." + diff --git a/modules/icinga/files/check_long_screens b/modules/icinga/files/check_long_screens deleted file mode 100755 index df821ad..0000000 --- a/modules/icinga/files/check_long_screens +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -# icinga/nagios plugin to detect long-running screen sessions -# -# https://phabricator.wikimedia.org/T165348 -# -# Daniel Zahn (<[email protected]>) - Wikimedia Foundation Inc. -# - -set -eu -usage() { echo "Usage: $0 -w <warn> -c <crit>" 1>&2; exit 1; } - -declare -i WARN_TIME="60" -declare -i CRIT_TIME="120" - -DEBUG=false - -while getopts "w:c:" o; do - case "${o}" in - w) - WARN_TIME=${OPTARG} - ;; - c) - CRIT_TIME=${OPTARG} - ;; - *) - usage - ;; - esac -done - -if [ $WARN_TIME == 0 ] || [ $CRIT_TIME == 0 ]; then - usage -fi - -SCREEN_PID="" -RUN_TIME=0 -PGREP=$(which pgrep) -PS=$(which ps) -XARGS=$(which xargs) - -for SCREEN_PID in $(${PGREP} -f SCREEN) - do - RUN_TIME=$(${PS} -o etimes= -p ${SCREEN_PID} | ${XARGS}) - if $DEBUG; then - echo "Found a screen with PID ${SCREEN_PID}. It's been running since ${RUN_TIME} seconds." - fi - if [ $RUN_TIME -gt $CRIT_TIME ]; then - echo "CRIT: Long running screen session. (PID ${SCREEN_PID}, ${RUN_TIME}s > ${CRIT_TIME}s)" - exit 2 - fi - if [ $RUN_TIME -gt $WARN_TIME ]; then - echo "WARN: Long running screen session. (PID ${SCREEN_PID}, ${RUN_TIME}s > ${WARN_TIME}s)" - exit 1 - fi - if [ $RUN_TIME -le $WARN_TIME ] && [ $RUN_TIME -le $CRIT_TIME ] ; then - echo "OK: No long running screen sessions detected." - exit 0 - fi -done - -echo "UNKNOWN: something went wrong with the plugin. check $0" -exit 3 - - -- To view, visit https://gerrit.wikimedia.org/r/373687 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I44ae0a6fedab4075353c73319eb5f4a4fc1f8d76 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Dzahn <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
