Giuseppe Lavagetto has uploaded a new change for review. https://gerrit.wikimedia.org/r/130582
Change subject: Add another check using graphite, small fixes. ...................................................................... Add another check using graphite, small fixes. This commit includes a few small things: - Corrected the output of check_graphite (now we show two digits instead of an arbitrary number) - Now check_graphite_anomaly is able to consider either only the upper bound or the lower bound of metrics (as this makes sense in most cases) - Shortened the window of the reqstats.5xx check to the last 15 minutes - Added a first (experimental) anomaly detection on the error ratio. - Made the 5xx check actually page us as it's usually fired off when something is wrong. Change-Id: Idea08c16164e0280f861b920155f139395eacf8d Signed-off-by: Giuseppe Lavagetto <glavage...@wikimedia.org> --- M files/icinga/check_graphite M manifests/nagios.pp M manifests/role/graphite.pp M templates/icinga/checkcommands.cfg.erb 4 files changed, 66 insertions(+), 10 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/82/130582/1 diff --git a/files/icinga/check_graphite b/files/icinga/check_graphite index a01676a..f3ac06d 100755 --- a/files/icinga/check_graphite +++ b/files/icinga/check_graphite @@ -267,9 +267,9 @@ if lengths[key] >= t * self.perc / 100.0: perc = lengths[key] * 100.0 / t raise NagiosException( - key, '%s%% of data exceeded the %s threshold [%s]' % + key, '%3.2f%% of data exceeded the %s threshold [%s]' % (perc, key.lower(), self.limits[key])) - return 'Less than %s%% data above the threshold [%s]' % \ + return 'Less than %3.2f%% data above the threshold [%s]' % \ (self.perc, self.limits['WARNING']) @@ -290,6 +290,19 @@ help='''How many datapoints to consider in the anomaly detection sampling (we will still require 1w of data)''', default=20) + p.add_argument( + '--over', + dest="over", + action='store_true', + default=False, + help='If alarms should happen when we are above normal values') + p.add_argument( + '--under', 
dest="under", + action='store_true', + default=False, + help='If alarms should happen when we are below normal values') + return p def get_all(self, args): @@ -307,6 +320,8 @@ self.check_window = args.check_window self.warn = args.warn self.crit = args.crit + self.over = args.over + self.under = args.under def parse_result(self, result): """ @@ -363,6 +378,14 @@ raise NagiosException( 'UNKNOWN', 'Service is critically flapping: %s data below and %s above the confidence bounds' % (h, l)) + + if self.over and h < self.crit: + return 'No anomaly detected' + + if self.under and l < self.crit: + return 'No anomaly detected' + + raise NagiosException( 'CRITICAL', 'Anomaly detected: %s data above and %s below the confidence bounds' % (h, l)) @@ -372,6 +395,12 @@ raise NagiosException( 'UNKNOWN', 'Service is flapping: %s data below and %s above the confidence bounds' % (h, l)) + if self.over and h < self.warn: + return 'No anomaly detected' + + if self.under and l < self.warn: + return 'No anomaly detected' + raise NagiosException( 'WARNING', 'Anomaly detected: %s data above and %s below the confidence bounds' % (h, l)) diff --git a/manifests/nagios.pp b/manifests/nagios.pp index 771b142..0d070f1 100644 --- a/manifests/nagios.pp +++ b/manifests/nagios.pp @@ -474,6 +474,7 @@ # metric => 'reqstats.5xx', # warning => 5, # critical => 10, +# over => true # } # # == Parameters @@ -486,6 +487,8 @@ # $graphite_url - URL of the graphite server. # $timeout - Timeout for the http query to # graphite. 
Defaults to 10 seconds +# over - check only for values above the limit +# under - check only for values below the limit # $host # $retries # $group @@ -504,6 +507,8 @@ $check_window = 100, $graphite_url = 'http://graphite.wikimedia.org', $timeout = 10, + $over = false, + $under = false, $host = $::hostname, $retries = 3, $group = $nagios_group, @@ -517,6 +522,15 @@ ) { + if $over == true { + $modifier = '--over' + } + elsif $under == true { + $modifier = '--under' + } + else { + $modifier = '' + } # checkcommands.cfg's check_graphite_anomaly command has # many positional arguments that # are passed to the check_graphite script: @@ -526,10 +540,11 @@ # $ARG4$ -W warning threshold # $ARG5$ -C critical threshold # $ARG6$ --check_window sampling size + # $ARG7$ --over or --under monitor_service { $title: ensure => $ensure, description => $description, - check_command => "check_graphite_anomaly!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${check_window}", + check_command => "check_graphite_anomaly!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${check_window}!modifier", retries => $retries, group => $group, critical => $nagios_critical, diff --git a/manifests/role/graphite.pp b/manifests/role/graphite.pp index df34cfb..a207d60 100644 --- a/manifests/role/graphite.pp +++ b/manifests/role/graphite.pp @@ -186,11 +186,23 @@ $apache_auth = template('graphite/apache-auth-ldap.erb') monitor_graphite_threshold {'reqstats_5xx': - description => 'HTTP 5xx req/min', - metric => 'reqstats.5xx', - warning => 250, - critical => 500, - from => '1hours' + description => 'HTTP 5xx req/min', + metric => 'reqstats.5xx', + warning => 250, + critical => 500, + from => '15min', + nagios_critical => 'true' + } + + # Will try to detect anomalies in the requests error ratio; + # if 10% of the last 100 checks is out of forecasted bounds + monitor_graphite_anomaly {'requests_error_ratio': + description => 'HTTP error ratio anomaly detection', + metric => 
'divideSeries(reqstats.5xx,reqstats.requests)', + warning => 5, + critical => 10, + check_window => 100, + over => true } } diff --git a/templates/icinga/checkcommands.cfg.erb b/templates/icinga/checkcommands.cfg.erb index 5edfdbf..7119aac 100644 --- a/templates/icinga/checkcommands.cfg.erb +++ b/templates/icinga/checkcommands.cfg.erb @@ -1,4 +1,4 @@ -################################################################################ +o################################################################################ # Sample object config file for Nagios # # Read the documentation for more information on this configuration file. I've @@ -512,7 +512,7 @@ define command{ command_name check_graphite_anomaly - command_line $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_anomaly '$ARG3$' -W $ARG4$ -C $ARG5$ --check_window $ARG6$ + command_line $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_anomaly '$ARG3$' -W $ARG4$ -C $ARG5$ --check_window $ARG6$ $ARG7$ } -- To view, visit https://gerrit.wikimedia.org/r/130582 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Idea08c16164e0280f861b920155f139395eacf8d Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Giuseppe Lavagetto <glavage...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits