Giuseppe Lavagetto has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/130582

Change subject: Add another check using graphite, small fixes.
......................................................................

Add another check using graphite, small fixes.

This commit includes a few small things:
- Corrected the output of check_graphite (now we show two decimal digits
  instead of an arbitrary number)
- Now check_graphite_anomaly is able to consider either only the upper
  bound or the lower bound of metrics (as this makes sense in most cases)
- Shortened the window of the reqstats.5xx check to the last 15 minutes
- Added a first (experimental) anomaly detection on the error ratio.
- Made the 5xx check actually page us as it's usually fired off when
  something is wrong.

Change-Id: Idea08c16164e0280f861b920155f139395eacf8d
Signed-off-by: Giuseppe Lavagetto <glavage...@wikimedia.org>
---
M files/icinga/check_graphite
M manifests/nagios.pp
M manifests/role/graphite.pp
M templates/icinga/checkcommands.cfg.erb
4 files changed, 66 insertions(+), 10 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/82/130582/1

diff --git a/files/icinga/check_graphite b/files/icinga/check_graphite
index a01676a..f3ac06d 100755
--- a/files/icinga/check_graphite
+++ b/files/icinga/check_graphite
@@ -267,9 +267,9 @@
             if lengths[key] >= t * self.perc / 100.0:
                 perc = lengths[key] * 100.0 / t
                 raise NagiosException(
-                    key, '%s%% of data exceeded the %s threshold [%s]' %
+                    key, '%3.2f%% of data exceeded the %s threshold [%s]' %
                     (perc, key.lower(), self.limits[key]))
-        return 'Less than %s%% data above the threshold [%s]' % \
+        return 'Less than %3.2f%% data above the threshold [%s]' % \
             (self.perc, self.limits['WARNING'])
 
 
@@ -290,6 +290,19 @@
             help='''How many datapoints to consider in the anomaly detection
 sampling (we will still require 1w of data)''',
             default=20)
+        p.add_argument(
+            '--over',
+            dest="over",
+            action='store_true',
+            default=False,
+            help='If alarms should happen when we are above normal values')
+        p.add_argument(
+            '--under',
+            dest="under",
+            action='store_true',
+            default=False,
+            help='If alarms should happen when we are below normal values')
+
         return p
 
     def get_all(self, args):
@@ -307,6 +320,8 @@
         self.check_window = args.check_window
         self.warn = args.warn
         self.crit = args.crit
+        self.over = args.over
+        self.under = args.under
 
     def parse_result(self, result):
         """
@@ -363,6 +378,14 @@
                 raise NagiosException(
                     'UNKNOWN',
                     'Service is critically flapping: %s data below and %s 
above the confidence bounds' % (h, l))
+
+            if self.over and h < self.crit:
+                return 'No anomaly detected'
+
+            if self.under and l < self.crit:
+                return 'No anomaly detected'
+
+
             raise NagiosException(
                 'CRITICAL', 'Anomaly detected: %s data above and %s below the 
confidence bounds' %
                 (h, l))
@@ -372,6 +395,12 @@
                 raise NagiosException(
                     'UNKNOWN',
                     'Service is flapping: %s data below and %s above the 
confidence bounds' % (h, l))
+            if self.over and h < self.warn:
+                return 'No anomaly detected'
+
+            if self.under and l < self.warn:
+                return 'No anomaly detected'
+
             raise NagiosException(
                 'WARNING', 'Anomaly detected: %s data above and %s below the 
confidence bounds' %
                 (h, l))
diff --git a/manifests/nagios.pp b/manifests/nagios.pp
index 771b142..0d070f1 100644
--- a/manifests/nagios.pp
+++ b/manifests/nagios.pp
@@ -474,6 +474,7 @@
 #       metric               => 'reqstats.5xx',
 #       warning              => 5,
 #       critical             => 10,
+#       over                 => true
 #   }
 #
 # == Parameters
@@ -486,6 +487,8 @@
 # $graphite_url         - URL of the graphite server.
 # $timeout              - Timeout for the http query to
 #                         graphite. Defaults to 10 seconds
+# over                  - check only for values above the limit
+# under                 - check only for values below the limit
 # $host
 # $retries
 # $group
@@ -504,6 +507,8 @@
     $check_window          = 100,
     $graphite_url          = 'http://graphite.wikimedia.org',
     $timeout               = 10,
+    $over                  = false,
+    $under                 = false,
     $host                  = $::hostname,
     $retries               = 3,
     $group                 = $nagios_group,
@@ -517,6 +522,15 @@
 )
 {
 
+    if $over == true {
+        $modifier = '--over'
+    }
+    elsif $under == true {
+        $modifier = '--under'
+    }
+    else {
+        $modifier = ''
+    }
     # checkcommands.cfg's check_graphite_anomaly command has
     # many positional arguments that
     # are passed to the check_graphite script:
@@ -526,10 +540,11 @@
     #   $ARG4$  -W warning threshold
     #   $ARG5$  -C critical threshold
     #   $ARG6$  --check_window sampling size
+    #   $ARG7$  --over or --under
     monitor_service { $title:
         ensure                => $ensure,
         description           => $description,
-        check_command         => 
"check_graphite_anomaly!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${check_window}",
+        check_command         => 
"check_graphite_anomaly!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${check_window}!modifier",
         retries               => $retries,
         group                 => $group,
         critical              => $nagios_critical,
diff --git a/manifests/role/graphite.pp b/manifests/role/graphite.pp
index df34cfb..a207d60 100644
--- a/manifests/role/graphite.pp
+++ b/manifests/role/graphite.pp
@@ -186,11 +186,23 @@
         $apache_auth   = template('graphite/apache-auth-ldap.erb')
 
         monitor_graphite_threshold {'reqstats_5xx':
-            description  => 'HTTP 5xx req/min',
-            metric       => 'reqstats.5xx',
-            warning      => 250,
-            critical     => 500,
-            from         => '1hours'
+            description     => 'HTTP 5xx req/min',
+            metric          => 'reqstats.5xx',
+            warning         => 250,
+            critical        => 500,
+            from            => '15min',
+            nagios_critical => 'true'
+        }
+
+        # Will try to detect anomalies in the requests error ratio;
+        # if 10% of the last 100 checks is out of forecasted bounds
+        monitor_graphite_anomaly {'requests_error_ratio':
+            description  => 'HTTP error ratio anomaly detection',
+            metric       => 'divideSeries(reqstats.5xx,reqstats.requests)',
+            warning      => 5,
+            critical     => 10,
+            check_window => 100,
+            over         => true
         }
     }
 
diff --git a/templates/icinga/checkcommands.cfg.erb 
b/templates/icinga/checkcommands.cfg.erb
index 5edfdbf..7119aac 100644
--- a/templates/icinga/checkcommands.cfg.erb
+++ b/templates/icinga/checkcommands.cfg.erb
@@ -1,4 +1,4 @@
-################################################################################
+o################################################################################
 # Sample object config file for Nagios
 #
 # Read the documentation for more information on this configuration file.  I've
@@ -512,7 +512,7 @@
 
 define command{
     command_name    check_graphite_anomaly
-    command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_anomaly 
'$ARG3$' -W $ARG4$ -C $ARG5$ --check_window $ARG6$
+    command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_anomaly 
'$ARG3$' -W $ARG4$ -C $ARG5$ --check_window $ARG6$ $ARG7$
 }
 
 

-- 
To view, visit https://gerrit.wikimedia.org/r/130582
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Idea08c16164e0280f861b920155f139395eacf8d
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Giuseppe Lavagetto <glavage...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to