coren has uploaded a new change for review.
https://gerrit.wikimedia.org/r/230555
Change subject: Revert "nrpe: Merge check_systemd_unit_lastrun into _state"
......................................................................
Revert "nrpe: Merge check_systemd_unit_lastrun into _state"
This reverts commit ac35c601ee4447baf2e1a21812856d78afb2ab19.
Broken.
Change-Id: I92857030fd47c771b9a419c55f863a61c9b4f868
---
M modules/labstore/manifests/fileserver/replicate.pp
A modules/nrpe/files/plugins/check_systemd_unit_lastrun
M modules/nrpe/files/plugins/check_systemd_unit_state
A modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
M modules/nrpe/manifests/monitor_systemd_unit_state.pp
M modules/nrpe/manifests/systemd_scripts.pp
6 files changed, 221 insertions(+), 111 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/55/230555/1
diff --git a/modules/labstore/manifests/fileserver/replicate.pp b/modules/labstore/manifests/fileserver/replicate.pp
index c516f6e..903d833 100644
--- a/modules/labstore/manifests/fileserver/replicate.pp
+++ b/modules/labstore/manifests/fileserver/replicate.pp
@@ -13,8 +13,14 @@
declare_service => false,
}
+ nrpe::monitor_systemd_unit_lastrun { "replicate-${title}":
+ description => "Last backup of the ${title} filesystem",
+ warn_secs => 60*60*1,
+ crit_secs => 60*60*2,
+ }
+
nrpe::monitor_systemd_unit_state { "replicate-${title}":
- description => "Last backup of the ${title} filesystem",
- expected_state => "periodic 3600",
+ description => "Backup of ${title} filesystem",
+ expected_state => "success",
}
}
diff --git a/modules/nrpe/files/plugins/check_systemd_unit_lastrun b/modules/nrpe/files/plugins/check_systemd_unit_lastrun
new file mode 100755
index 0000000..201a992
--- /dev/null
+++ b/modules/nrpe/files/plugins/check_systemd_unit_lastrun
@@ -0,0 +1,100 @@
+#! /usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2015 Marc-André Pelletier <[email protected]>
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+#
+# THIS FILE IS MANAGED BY PUPPET
+#
+# Source: modules/nrpe/files/plugins/check_systemd_unit_lastrun
+# From: modules/nrpe/manifests/systemd_scripts.pp
+#
+
+"""
+check_systemd_unit_lastrun
+
+usage: check_systemd_unit_lastrun <unit> <warn> <crit>
+
+Checks that the systemd unit has been run recently
+enough. Emits a warning if the last start/stop activity
+is older than <warn> seconds, and a critical if it is
+older than <crit> seconds.
+"""
+
+import argparse
+import time
+import datetime
+import subprocess
+import logging
+import json
+import sys
+
+def main():
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('unit', help='Systemd unit to check')
+ parser.add_argument('warn', help='Number of seconds past which a warning
should be emitted')
+ parser.add_argument('crit', help='Number of seconds past which a critical
should be emitted')
+ args = parser.parse_args()
+
+ warn = datetime.timedelta(seconds=int(args.warn))
+ crit = datetime.timedelta(seconds=int(args.crit))
+
+ logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+ log = []
+
+ try:
+ raw = subprocess.check_output(
+ ['/bin/journalctl', '--output=json', '--reverse', '--unit',
args.unit],
+ stderr=subprocess.STDOUT).decode()
+ for entry in raw.splitlines():
+ log.append(json.loads(entry))
+ except subprocess.CalledProcessError:
+ print('LASTRUN UNKNOWN - Unable to get systemd journal for unit "%s"'
% args.unit)
+ sys.exit(3)
+ except ValueError:
+ print('LASTRUN UNKNOWN - Unable to parse systemd journal for unit
"%s"' % args.unit)
+ sys.exit(3)
+
+ lastrun = None
+ for entry in log:
+ try:
+ if entry['CODE_FUNCTION'] ==
'unit_status_log_starting_stopping_reloading':
+ lastrun = int(float(entry['__REALTIME_TIMESTAMP'])/1000000) #
because microseconds
+ break
+ except (KeyError):
+ pass
+
+ if not lastrun:
+ print('LASTRUN UNKNOWN - No start/stop information for unit "%s"' %
args.unit)
+ sys.exit(3)
+
+ age = datetime.timedelta(seconds=int(time.time()) - lastrun)
+
+ if age > crit:
+ print('LASTRUN CRITICAL - Last run more than %s ago' % crit)
+ sys.exit(2)
+
+ if age > warn:
+ print('LASTRUN WARNING - Last run more than %s ago' % warn)
+ sys.exit(1)
+
+ print('LASTRUN OK - Last run %s ago' % age)
+ sys.exit(0)
+
+if __name__ == "__main__":
+ main()
+
diff --git a/modules/nrpe/files/plugins/check_systemd_unit_state b/modules/nrpe/files/plugins/check_systemd_unit_state
index cea8908..bffe462 100755
--- a/modules/nrpe/files/plugins/check_systemd_unit_state
+++ b/modules/nrpe/files/plugins/check_systemd_unit_state
@@ -1,126 +1,85 @@
-#! /usr/bin/python3
-# -*- coding: utf-8 -*-
+#!/usr/bin/perl
+
+# Copyright 2015 Giuseppe Lavagetto
+# Copyright 2015 Wikimedia Foundation, Inc.
#
-# Copyright © 2015 Marc-André Pelletier <[email protected]>
+# This nagios plugin is free software, and comes with ABSOLUTELY NO WARRANTY.
+# It may be used, redistributed and/or modified under the terms of the GNU
+# General Public Licence (see http://www.fsf.org/licensing/licenses/gpl.txt).
#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
+# Example usage:
+# check_systemd_unit_state -s apache2 [ -e <active|inactive|success> ]
#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+# Checks the state of a systemd unit and raises an error unless:
#
-#
-# THIS FILE IS MANAGED BY PUPPET
-#
-# Source: modules/nrpe/files/plugins/check_systemd_unit_state
-# From: modules/nrpe/manifests/systemd_scripts.pp
-#
+# active: the unit is currently running
+# inactive: the unit is not currently running
+# success: the unit is currently running OR its last result is success
-"""
-check_systemd_unit_state
+use strict;
+use Nagios::Plugin;
-usage: check_systemd_unit_state <unit> <expect> [<lastrun>]
+sub setup{
+ my $np = Nagios::Plugin->new(usage => "Usage: %s -s <service> -e
<active|inactive>");
+ $np->add_arg(
+ spec => 'service|s=s',
+ help => '-s SERVICE',
+ required => 1,
+ );
+ $np->add_arg(
+ spec => 'expect|e=s',
+ help => '-e active|inactive|success',
+ default => 'active',
+ );
+ $np->getopts;
+ my @expected = ('active', 'inactive', 'success');
-Checks that the systemd unit <unit> is in the correct state according
-to <expect>:
+ $np->nagios_exit(UNKNOWN, "Valid expected states are 'active', 'inactive'
or 'success'")
+ unless (grep {$_ eq $np->opts->expect} @expected);
- active - Ok if the unit is active and running
- inactive - Ok if the unit is inactive and dead
- periodic - Ok if the unit is either:
- (a) active and running
- (b) inactive, dead and the last result was success
- In addition, if <lastrun> is specified, the checks
- returns Ok iff the unit was started no more than
- <lastrun> seconds ago (and this information is only
- valid when a timer exists for the unit)
-"""
+ return $np;
+}
-import time
-import datetime
-import subprocess
-import re
-import sys
+sub get_data {
+ my $service = shift;
+ open(STATUS, "/bin/systemctl show '$service' |");
+ my %service_data = {};
-def unknown(msg):
- print("UNKNOWN - %s" % msg)
- sys.exit(3)
+ # Parse the systemctl output.
+ # Yes this is going to be crude.
+ while (<STATUS>) {
+ next if /^#/;
+ chomp;
+ my ($k, $v) = split /=/, $_, 2;
+ $service_data{$k} = $v;
+ }
+ return \%service_data;
+}
-def crit(msg):
- print("CRITICAL - %s" % msg)
- sys.exit(2)
+my $plugin = setup();
+my $service = $plugin->opts->service;
+my $expect = $plugin->opts->expect;
+my %expected_substates = ( 'active' => 'running', 'inactive' => 'dead' );
-def ok(msg):
- print("OK - %s" % msg)
- sys.exit(0)
+my $service_status = get_data($service);
-def main():
+$plugin->nagios_exit(UNKNOWN, "Service $service is not loaded") unless
$service_status->{LoadState} eq 'loaded';
- try:
- lastrun = None
- unit = sys.argv[1]
- expect = sys.argv[2]
- if expect not in ['active', 'inactive', 'periodic']:
- unknown("Must expect one of 'active', 'inactive', or 'periodic'")
- if expect == 'periodic' and len(sys.argv) > 3:
- lastrun = datetime.timedelta(seconds=int(sys.argv[3]))
- except (IndexError, ValueError):
- unknown("Bad arguments to %s (%s)" % (sys.argv[0], ",
".join(sys.argv[1:])))
+if ($service_status->{SubState} ne
$expected_substates{$service_status->{ActiveState}}) {
+ $plugin->nagios_exit(CRITICAL, "Service $service is
$service_status->{ActiveState}, but its last recorded state is:
$service_status->{SubState}");
+}
- state = {}
- try:
- raw = subprocess.check_output(['/bin/systemctl', 'show', unit],
stderr=subprocess.STDOUT).decode()
- for entry in raw.splitlines():
- kv = entry.split('=', 1)
- state[kv[0]] = kv[1]
- except IndexError:
- unknown("Unable to parse status of unit %s" % unit)
+if ($service_status->{ActiveState} eq 'active') {
+ $plugin->nagios_exit(CRITICAL, "Service $service is active (expected
inactive)") if $expect eq 'inactive';
+ $plugin->nagios_exit(OK, "Service $service is active (expected success)")
if $expect eq 'success';
+} else {
+ $plugin->nagios_exit(CRITICAL, "Service $service is inactive (expected
active)") if $expect eq 'active';
+ if($expect eq 'success') {
+ $plugin->nagios_exit(OK, "Last run of service $service was succesful")
if $service_status->{Result} eq 'success';
+ $plugin->nagios_exit(CRITICAL, "Service $service failed
($service_status->{Result})");
+ }
+}
- if expect == 'active':
-
- if state['ActiveState'] != 'active':
- crit("Expecting active but unit is %s" % state['ActiveState'])
- if state['SubState'] != 'running':
- crit("Unit is active but reported %s'" % state['SubState'])
- ok("%s is active" % unit)
-
- elif expect == 'inactive':
-
- if state['ActiveState'] != 'inactive':
- crit("Expecting inactive but unit is %s" % state['ActiveState'])
- if state['SubState'] != 'dead':
- crit("Unit is inactive but reported %s'" % state['SubState'])
- ok("%s is inactive" % unit)
-
- # else periodic
-
- if state['ActiveState'] == 'active':
- ok("Unit is currently active")
- if state['ActiveState'] != 'inactive':
- crit("Unit is in state %s" % state['ActiveState'])
- if state['Result'] != 'success':
- crit("Last run result was %s" % state['Result'])
-
- if lastrun:
- try:
- # Timestamps in systemctl show are in format 'Thu 2015-07-30
16:56:59 UTC'
- started =
datetime.datetime.strptime(state['ExecMainStartTimestamp'], '%a %Y-%m-%d
%H:%M:%S %Z')
- age = datetime.datetime.fromtimestamp(int(time.time())) - started
-
- if age > lastrun:
- crit("Last run was over %s ago" % lastrun)
-
- except (KeyError, ValueError):
- unknown("Unit has no usable last run information (not a timer?)")
-
- ok("Last run successful")
-
-if __name__ == "__main__":
- main()
+$plugin->nagios_exit(OK, "Service $service is in the desired state ($expect)");
diff --git a/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp b/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
new file mode 100644
index 0000000..7a6fe41
--- /dev/null
+++ b/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
@@ -0,0 +1,36 @@
+# === Define: nrpe::monitor_systemd_unit_lastrun
+#
+# Installs a check for last run time of a systemd unit using journalctl
+define nrpe::monitor_systemd_unit_lastrun(
+ $unit = $title,
+ $description = "${unit} last run",
+ $contact_group = 'admins',
+ $retries = 3,
+ $timeout = 10,
+ $critical = false,
+ $ensure = 'present',
+ $warn_secs = 60*60*25,
+ $crit_secs = 60*60*49,
+ ){
+
+ if $::initsystem != 'systemd' {
+ fail('nrpe::monitor_systemd_unit_lastrun can only work on
systemd-enabled systems')
+ }
+ require nrpe::systemd_scripts
+
+ # Temporary hack until we fix the downstream modules
+ if $critical {
+ $nagios_critical = 'true'
+ } else {
+ $nagios_critical = 'false'
+ }
+
+ nrpe::monitor_service { "${unit}-lastrun":
+ ensure => $ensure,
+ description => $description,
+ nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_lastrun
'${unit}' ${warn_secs} ${crit_secs}",
+ retries => $retries,
+ timeout => $timeout,
+ critical => $nagios_critical,
+ }
+}
diff --git a/modules/nrpe/manifests/monitor_systemd_unit_state.pp b/modules/nrpe/manifests/monitor_systemd_unit_state.pp
index 66de6a5..5f1d855 100644
--- a/modules/nrpe/manifests/monitor_systemd_unit_state.pp
+++ b/modules/nrpe/manifests/monitor_systemd_unit_state.pp
@@ -27,7 +27,7 @@
nrpe::monitor_service { "${unit}-state":
ensure => $ensure,
description => $description,
- nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_state
'${unit}' ${expected_state}",
+ nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_state -s
'${unit}' -e ${expected_state}",
retries => $retries,
timeout => $timeout,
critical => $nagios_critical,
diff --git a/modules/nrpe/manifests/systemd_scripts.pp b/modules/nrpe/manifests/systemd_scripts.pp
index 0f2ee2c..0f355b5 100644
--- a/modules/nrpe/manifests/systemd_scripts.pp
+++ b/modules/nrpe/manifests/systemd_scripts.pp
@@ -4,6 +4,7 @@
#
class nrpe::systemd_scripts {
+ require_package 'libnagios-plugin-perl'
# These scripts allows monitoring of systemd services
file { '/usr/local/bin/nrpe_check_systemd_unit_state':
@@ -14,4 +15,12 @@
mode => '0555',
}
+ file { '/usr/local/bin/nrpe_check_systemd_unit_lastrun':
+ ensure => present,
+ source => 'puppet:///modules/nrpe/plugins/check_systemd_unit_lastrun',
+ owner => 'root',
+ group => 'root',
+ mode => '0555',
+ }
+
}
--
To view, visit https://gerrit.wikimedia.org/r/230555
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I92857030fd47c771b9a419c55f863a61c9b4f868
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: coren <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits