coren has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/230555

Change subject: Revert "nrpe: Merge check_systemd_unit_lastrun into _state"
......................................................................

Revert "nrpe: Merge check_systemd_unit_lastrun into _state"

This reverts commit ac35c601ee4447baf2e1a21812856d78afb2ab19.

Broken.

Change-Id: I92857030fd47c771b9a419c55f863a61c9b4f868
---
M modules/labstore/manifests/fileserver/replicate.pp
A modules/nrpe/files/plugins/check_systemd_unit_lastrun
M modules/nrpe/files/plugins/check_systemd_unit_state
A modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
M modules/nrpe/manifests/monitor_systemd_unit_state.pp
M modules/nrpe/manifests/systemd_scripts.pp
6 files changed, 221 insertions(+), 111 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/55/230555/1

diff --git a/modules/labstore/manifests/fileserver/replicate.pp 
b/modules/labstore/manifests/fileserver/replicate.pp
index c516f6e..903d833 100644
--- a/modules/labstore/manifests/fileserver/replicate.pp
+++ b/modules/labstore/manifests/fileserver/replicate.pp
@@ -13,8 +13,14 @@
         declare_service => false,
     }
 
+    nrpe::monitor_systemd_unit_lastrun { "replicate-${title}":
+        description => "Last backup of the ${title} filesystem",
+        warn_secs   => 60*60*1,
+        crit_secs   => 60*60*2,
+    }
+
     nrpe::monitor_systemd_unit_state { "replicate-${title}":
-        description    => "Last backup of the ${title} filesystem",
-        expected_state => "periodic 3600",
+        description    => "Backup of ${title} filesystem",
+        expected_state => "success",
     }
 }
diff --git a/modules/nrpe/files/plugins/check_systemd_unit_lastrun 
b/modules/nrpe/files/plugins/check_systemd_unit_lastrun
new file mode 100755
index 0000000..201a992
--- /dev/null
+++ b/modules/nrpe/files/plugins/check_systemd_unit_lastrun
@@ -0,0 +1,100 @@
+#! /usr/bin/python3
+# -*- coding: utf-8 -*-
+#
+#  Copyright © 2015 Marc-André Pelletier <[email protected]>
+#
+#  Permission to use, copy, modify, and/or distribute this software for any
+#  purpose with or without fee is hereby granted, provided that the above
+#  copyright notice and this permission notice appear in all copies.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+#  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+#  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+#  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+#  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+#  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+#  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+#
+#  THIS FILE IS MANAGED BY PUPPET
+#
+#  Source: modules/nrpe/files/plugins/check_systemd_unit_lastrun
+#  From:   modules/nrpe/manifests/systemd_scripts.pp
+#
+
+"""
+check_systemd_unit_lastrun
+
+usage: check_systemd_unit_lastrun <unit> <warn> <crit>
+
+Checks that the systemd unit has been run recently
+enough.  Warns if the last start/stop activity is older
+than <warn> seconds, and goes critical if it is older than
+<crit> seconds.
+"""
+
+import argparse
+import time
+import datetime
+import subprocess
+import logging
+import json
+import sys
+
+def main():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('unit', help='Systemd unit to check')
+    parser.add_argument('warn', help='Number of seconds past which a warning 
should be emitted')
+    parser.add_argument('crit', help='Number of seconds past which a critical 
should be emitted')
+    args = parser.parse_args()
+
+    warn = datetime.timedelta(seconds=int(args.warn))
+    crit = datetime.timedelta(seconds=int(args.crit))
+
+    logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+    log = []
+
+    try:
+        raw = subprocess.check_output(
+            ['/bin/journalctl', '--output=json', '--reverse', '--unit', 
args.unit],
+            stderr=subprocess.STDOUT).decode()
+        for entry in raw.splitlines():
+            log.append(json.loads(entry))
+    except subprocess.CalledProcessError:
+        print('LASTRUN UNKNOWN - Unable to get systemd journal for unit "%s"' 
% args.unit)
+        sys.exit(3)
+    except ValueError:
+        print('LASTRUN UNKNOWN - Unable to parse systemd journal for unit 
"%s"' % args.unit)
+        sys.exit(3)
+
+    lastrun = None
+    for entry in log:
+        try:
+            if entry['CODE_FUNCTION'] == 
'unit_status_log_starting_stopping_reloading':
+                lastrun = int(float(entry['__REALTIME_TIMESTAMP'])/1000000) # 
because microseconds
+                break
+        except (KeyError):
+            pass
+
+    if not lastrun:
+        print('LASTRUN UNKNOWN - No start/stop information for unit "%s"' % 
args.unit)
+        sys.exit(3)
+
+    age = datetime.timedelta(seconds=int(time.time()) - lastrun)
+
+    if age > crit:
+        print('LASTRUN CRITICAL - Last run more than %s ago' % crit)
+        sys.exit(2)
+
+    if age > warn:
+        print('LASTRUN WARNING - Last run more than %s ago' % warn)
+        sys.exit(1)
+
+    print('LASTRUN OK - Last run %s ago' % age)
+    sys.exit(0)
+
+if __name__ == "__main__":
+    main()
+
diff --git a/modules/nrpe/files/plugins/check_systemd_unit_state 
b/modules/nrpe/files/plugins/check_systemd_unit_state
index cea8908..bffe462 100755
--- a/modules/nrpe/files/plugins/check_systemd_unit_state
+++ b/modules/nrpe/files/plugins/check_systemd_unit_state
@@ -1,126 +1,85 @@
-#! /usr/bin/python3
-# -*- coding: utf-8 -*-
+#!/usr/bin/perl
+
+# Copyright 2015 Giuseppe Lavagetto
+# Copyright 2015 Wikimedia Foundation, Inc.
 #
-#  Copyright © 2015 Marc-André Pelletier <[email protected]>
+# This nagios plugin is free software, and comes with ABSOLUTELY NO WARRANTY.
+# It may be used, redistributed and/or modified under the terms of the GNU
+# General Public Licence (see http://www.fsf.org/licensing/licenses/gpl.txt).
 #
-#  Permission to use, copy, modify, and/or distribute this software for any
-#  purpose with or without fee is hereby granted, provided that the above
-#  copyright notice and this permission notice appear in all copies.
+# Example usage:
+#   check_systemd_unit_state -s apache2 [ -e <active|inactive|success> ]
 #
-#  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-#  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-#  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-#  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-#  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-#  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-#  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+# Checks the state of a systemd unit and raises an error unless:
 #
-#
-#  THIS FILE IS MANAGED BY PUPPET
-#
-#  Source: modules/nrpe/files/plugins/check_systemd_unit_state
-#  From:   modules/nrpe/manifests/systemd_scripts.pp
-#
+#    active: the unit is currently running
+#  inactive: the unit is not currently running
+#   success: the unit is currently running OR its last result is success
 
-"""
-check_systemd_unit_state
+use strict;
+use Nagios::Plugin;
 
-usage: check_systemd_unit_state <unit> <expect> [<lastrun>]
+sub setup{
+    my $np = Nagios::Plugin->new(usage => "Usage: %s -s <service> -e 
<active|inactive>");
+    $np->add_arg(
+                 spec     => 'service|s=s',
+                 help     => '-s SERVICE',
+                 required => 1,
+    );
+    $np->add_arg(
+                 spec    => 'expect|e=s',
+                 help    => '-e active|inactive|success',
+                 default => 'active',
+    );
+    $np->getopts;
+    my @expected = ('active', 'inactive', 'success');
 
-Checks that the systemd unit <unit> is in the correct state according
-to <expect>:
+    $np->nagios_exit(UNKNOWN, "Valid expected states are 'active', 'inactive' 
or 'success'")
+            unless (grep {$_ eq $np->opts->expect} @expected);
 
-    active   - Ok if the unit is active and running
-    inactive - Ok if the unit is inactive and dead
-    periodic - Ok if the unit is either:
-                 (a) active and running
-                 (b) inactive, dead and the last result was success
-               In addition, if <lastrun> is specified, the checks
-               returns Ok iff the unit was started no more than
-               <lastrun> seconds ago (and this information is only
-               valid when a timer exists for the unit)
-"""
+    return $np;
+}
 
-import time
-import datetime
-import subprocess
-import re
-import sys
+sub get_data {
+    my $service = shift;
+    open(STATUS, "/bin/systemctl show '$service' |");
 
+    my %service_data = {};
 
-def unknown(msg):
-    print("UNKNOWN - %s" % msg)
-    sys.exit(3)
+    # Parse the systemctl output.
+    # Yes this is going to be crude.
+    while (<STATUS>) {
+        next if /^#/;
+        chomp;
+        my ($k, $v) = split /=/, $_, 2;
+        $service_data{$k} = $v;
+    }
+    return \%service_data;
+}
 
-def crit(msg):
-    print("CRITICAL - %s" % msg)
-    sys.exit(2)
+my $plugin = setup();
+my $service = $plugin->opts->service;
+my $expect = $plugin->opts->expect;
+my %expected_substates = ( 'active' => 'running', 'inactive' => 'dead' );
 
-def ok(msg):
-    print("OK - %s" % msg)
-    sys.exit(0)
+my $service_status = get_data($service);
 
-def main():
+$plugin->nagios_exit(UNKNOWN, "Service $service is not loaded") unless 
$service_status->{LoadState} eq 'loaded';
 
-    try:
-        lastrun = None
-        unit = sys.argv[1]
-        expect = sys.argv[2]
-        if expect not in ['active', 'inactive', 'periodic']:
-            unknown("Must expect one of 'active', 'inactive', or 'periodic'")
-        if expect == 'periodic' and len(sys.argv) > 3:
-            lastrun = datetime.timedelta(seconds=int(sys.argv[3]))
-    except (IndexError, ValueError):
-        unknown("Bad arguments to %s (%s)" % (sys.argv[0], ", 
".join(sys.argv[1:])))
+if ($service_status->{SubState} ne 
$expected_substates{$service_status->{ActiveState}}) {
+    $plugin->nagios_exit(CRITICAL, "Service $service is 
$service_status->{ActiveState}, but its last recorded state is: 
$service_status->{SubState}");
+}
 
-    state = {}
-    try:
-        raw = subprocess.check_output(['/bin/systemctl', 'show', unit], 
stderr=subprocess.STDOUT).decode()
-        for entry in raw.splitlines():
-            kv = entry.split('=', 1)
-            state[kv[0]] = kv[1]
-    except IndexError:
-        unknown("Unable to parse status of unit %s" % unit)
+if ($service_status->{ActiveState} eq 'active') {
+    $plugin->nagios_exit(CRITICAL, "Service $service is active (expected 
inactive)") if $expect eq 'inactive';
+    $plugin->nagios_exit(OK, "Service $service is active (expected success)") 
if $expect eq 'success';
+} else {
+    $plugin->nagios_exit(CRITICAL, "Service $service is inactive (expected 
active)") if $expect eq 'active';
+    if($expect eq 'success') {
+        $plugin->nagios_exit(OK, "Last run of service $service was succesful") 
if $service_status->{Result} eq 'success';
+        $plugin->nagios_exit(CRITICAL, "Service $service failed 
($service_status->{Result})");
+    }
+}
 
-    if expect == 'active':
-
-        if state['ActiveState'] != 'active':
-            crit("Expecting active but unit is %s" % state['ActiveState'])
-        if state['SubState'] != 'running':
-            crit("Unit is active but reported %s'" % state['SubState'])
-        ok("%s is active" % unit)
-
-    elif expect == 'inactive':
-
-        if state['ActiveState'] != 'inactive':
-            crit("Expecting inactive but unit is %s" % state['ActiveState'])
-        if state['SubState'] != 'dead':
-            crit("Unit is inactive but reported %s'" % state['SubState'])
-        ok("%s is inactive" % unit)
-
-    # else periodic
-
-    if state['ActiveState'] == 'active':
-        ok("Unit is currently active")
-    if state['ActiveState'] != 'inactive':
-        crit("Unit is in state %s" % state['ActiveState'])
-    if state['Result'] != 'success':
-        crit("Last run result was %s" % state['Result'])
-
-    if lastrun:
-        try:
-            # Timestamps in systemctl show are in format 'Thu 2015-07-30 
16:56:59 UTC'
-            started = 
datetime.datetime.strptime(state['ExecMainStartTimestamp'], '%a %Y-%m-%d 
%H:%M:%S %Z')
-            age = datetime.datetime.fromtimestamp(int(time.time())) - started
-
-            if age > lastrun:
-                crit("Last run was over %s ago" % lastrun)
-
-        except (KeyError, ValueError):
-            unknown("Unit has no usable last run information (not a timer?)")
-
-    ok("Last run successful")
-
-if __name__ == "__main__":
-    main()
+$plugin->nagios_exit(OK, "Service $service is in the desired state ($expect)");
 
diff --git a/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp 
b/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
new file mode 100644
index 0000000..7a6fe41
--- /dev/null
+++ b/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
@@ -0,0 +1,36 @@
+# === Define: nrpe::monitor_systemd_unit_lastrun
+#
+# Installs a check for last run time of a systemd unit using journalctl
+define nrpe::monitor_systemd_unit_lastrun(
+    $unit = $title,
+    $description = "${unit} last run",
+    $contact_group = 'admins',
+    $retries = 3,
+    $timeout = 10,
+    $critical = false,
+    $ensure = 'present',
+    $warn_secs = 60*60*25,
+    $crit_secs = 60*60*49,
+    ){
+
+    if $::initsystem != 'systemd' {
+        fail('nrpe::monitor_systemd_unit_lastrun can only work on 
systemd-enabled systems')
+    }
+    require nrpe::systemd_scripts
+
+    # Temporary hack until we fix the downstream modules
+    if $critical {
+        $nagios_critical = 'true'
+    } else {
+        $nagios_critical = 'false'
+    }
+
+    nrpe::monitor_service { "${unit}-lastrun":
+        ensure       => $ensure,
+        description  => $description,
+        nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_lastrun 
'${unit}' ${warn_secs} ${crit_secs}",
+        retries      => $retries,
+        timeout      => $timeout,
+        critical     => $nagios_critical,
+    }
+}
diff --git a/modules/nrpe/manifests/monitor_systemd_unit_state.pp 
b/modules/nrpe/manifests/monitor_systemd_unit_state.pp
index 66de6a5..5f1d855 100644
--- a/modules/nrpe/manifests/monitor_systemd_unit_state.pp
+++ b/modules/nrpe/manifests/monitor_systemd_unit_state.pp
@@ -27,7 +27,7 @@
     nrpe::monitor_service { "${unit}-state":
         ensure       => $ensure,
         description  => $description,
-        nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_state 
'${unit}' ${expected_state}",
+        nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_state -s 
'${unit}' -e ${expected_state}",
         retries      => $retries,
         timeout      => $timeout,
         critical     => $nagios_critical,
diff --git a/modules/nrpe/manifests/systemd_scripts.pp 
b/modules/nrpe/manifests/systemd_scripts.pp
index 0f2ee2c..0f355b5 100644
--- a/modules/nrpe/manifests/systemd_scripts.pp
+++ b/modules/nrpe/manifests/systemd_scripts.pp
@@ -4,6 +4,7 @@
 #
 
 class nrpe::systemd_scripts {
+    require_package 'libnagios-plugin-perl'
 
     # These scripts allows monitoring of systemd services
     file { '/usr/local/bin/nrpe_check_systemd_unit_state':
@@ -14,4 +15,12 @@
         mode   => '0555',
     }
 
+    file { '/usr/local/bin/nrpe_check_systemd_unit_lastrun':
+        ensure => present,
+        source => 'puppet:///modules/nrpe/plugins/check_systemd_unit_lastrun',
+        owner  => 'root',
+        group  => 'root',
+        mode   => '0555',
+    }
+
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/230555
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I92857030fd47c771b9a419c55f863a61c9b4f868
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: coren <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to