Ottomata has submitted this change and it was merged.

Change subject: Finish adding --until param to check_graphite script
......................................................................


Finish adding --until param to check_graphite script

Cleanup follow up of https://gerrit.wikimedia.org/r/#/c/255415/

This has been sitting for a while.  The check_graphite_until_temp
script that added the --until paramater has been running just fine
for eventlogging and varnishkafka checks.  I tested this script manually,
and it does what is expected.

e.g.
./check_graphite --url http://graphite.wikimedia.org check_threshold   
'kafka.cluster.analytics-eqiad.kafka.kafka1012_eqiad_wmnet_9999.kafka.server.BrokerTopicMetrics-AllTopics.MessagesInPerSec.OneMinuteRate'
 --from 20min --until=-0min  --under -C 15000 -W 18000

Varying the --until (including for the default of 0min) works as expected.

Bug: T116035
Change-Id: I863367b8b5e97f501ebabc4eacbac081262462d6
---
M modules/monitoring/manifests/graphite_threshold.pp
M modules/nagios_common/files/check_commands/check_graphite
M modules/nagios_common/files/check_commands/check_graphite.cfg
3 files changed, 49 insertions(+), 64 deletions(-)

Approvals:
  Ottomata: Looks good to me, approved
  Filippo Giunchedi: Looks good to me, but someone else must approve
  jenkins-bot: Verified



diff --git a/modules/monitoring/manifests/graphite_threshold.pp 
b/modules/monitoring/manifests/graphite_threshold.pp
index bcd64a2..917421d 100644
--- a/modules/monitoring/manifests/graphite_threshold.pp
+++ b/modules/monitoring/manifests/graphite_threshold.pp
@@ -30,6 +30,8 @@
 #                         datapoints that should be checked individually
 # $from                 - Date from which to fetch data.
 #                         Examples: '1hours','10min' (default), '2w'
+# $until                - end sampling date (negative relative time from
+#                         now.  Default: '0min'
 # $percentage           - Number of datapoints exceeding the
 #                         threshold. Defaults to 1%.
 # $under                - If true, the threshold is a lower limit.
@@ -54,8 +56,7 @@
     $critical,
     $series                = false,
     $from                  = '10min',
-    # temporarly use $until to conditionally use check_graphite_until command
-    $until                 = undef,
+    $until                 = '0min',
     $percentage            = 1,
     $under                 = false,
     $graphite_url          = 'https://graphite.wikimedia.org',
@@ -83,7 +84,7 @@
     #   $ARG4$  -W warning threshold
     #   $ARG5$  -C critical threshold
     #   $ARG6$  --from start sampling date (negative relative time from now)
-    #####   $ARG7$  --until end sampling date (negative relative time from now)
+    #   $ARG7$  --until end sampling date (negative relative time from now)
     #   $ARG8$  --perc percentage of exceeding datapoints
     #   $ARG9$  --over or --under
     $modifier = $under ? {
@@ -95,43 +96,22 @@
         fail("single quotes will be stripped from graphite metric ${metric}, 
consider using double quotes")
     }
 
-    # TEMPORARY conditional to test the --until arg without affecting all
-    # alerts. This conditional will be removed once we are sure until works.
-    if $until and !$series {
-        $command = 'check_graphite_threshold_until_temp'
-
-        monitoring::service { $title:
-            ensure                => $ensure,
-            description           => $description,
-            check_command         => 
"${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${until}!${percentage}!${modifier}",
-            retries               => $retries,
-            group                 => $group,
-            critical              => $nagios_critical,
-            passive               => $passive,
-            freshness             => $freshness,
-            normal_check_interval => $normal_check_interval,
-            retry_check_interval  => $retry_check_interval,
-            contact_group         => $contact_group,
-        }
+    $command = $series ? {
+        true    => 'check_graphite_series_threshold',
+        default => 'check_graphite_threshold'
     }
-    else {
-        $command = $series ? {
-            true    => 'check_graphite_series_threshold',
-            default => 'check_graphite_threshold'
-        }
 
-        monitoring::service { $title:
-            ensure                => $ensure,
-            description           => $description,
-            check_command         => 
"${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${percentage}!${modifier}",
-            retries               => $retries,
-            group                 => $group,
-            critical              => $nagios_critical,
-            passive               => $passive,
-            freshness             => $freshness,
-            normal_check_interval => $normal_check_interval,
-            retry_check_interval  => $retry_check_interval,
-            contact_group         => $contact_group,
-        }
+    monitoring::service { $title:
+        ensure                => $ensure,
+        description           => $description,
+        check_command         => 
"${command}!${graphite_url}!${timeout}!${metric}!${warning}!${critical}!${from}!${until}!${percentage}!${modifier}",
+        retries               => $retries,
+        group                 => $group,
+        critical              => $nagios_critical,
+        passive               => $passive,
+        freshness             => $freshness,
+        normal_check_interval => $normal_check_interval,
+        retry_check_interval  => $retry_check_interval,
+        contact_group         => $contact_group,
     }
 }
diff --git a/modules/nagios_common/files/check_commands/check_graphite 
b/modules/nagios_common/files/check_commands/check_graphite
index 75e3c45..87184af 100755
--- a/modules/nagios_common/files/check_commands/check_graphite
+++ b/modules/nagios_common/files/check_commands/check_graphite
@@ -160,7 +160,7 @@
     Checks if the metric exceeds the desired threshold
     '''
     parser_name = 'check_threshold'
-    from_regex = re.compile('^-(\d+)(\w+)$')
+    timespec_regex = re.compile('^-(\d+)(\w+)$')
     _accepted_time_defs = ['hours', 'hour', 'h',
                            'weeks', 'week', 'w',
                            'days', 'day', 'd',
@@ -172,8 +172,13 @@
         p.add_argument(
             '--from',
             dest='_from',
-            help='When to fetch the metric from (date or "-1d")',
+            help='When to fetch the metric from (date or "-1d") (sign does not 
matter)',
             default='-1h')
+        p.add_argument(
+            '--until',
+            dest='_until',
+            help='When to fetch the metric until (date or "-1d") (sign does 
not matter)',
+            default='-0min')
         p.add_argument(
             '--over',
             dest='over',
@@ -194,6 +199,17 @@
             help='Number of datapoints above or below threshold that will 
raise the alarm')
         return p
 
+    def check_time_parameter(self, param_name, param_value):
+        m = self.timespec_regex.match(param_value)
+        if not m:
+            raise ValueError(
+                'the value of the %s argument is invalid: %s' %
+                (param_name, param_value))
+        if m.group(2) not in self._accepted_time_defs:
+            raise ValueError('The unit specification for %s' % param_name +
+                             'should be one of the following: %s' %
+                             ','.join(self._accepted_time_defs))
+
     def get_all(self, args):
         '''
         Gets additional data from the command-line args.
@@ -204,18 +220,14 @@
         _from = args._from
         if not args._from.startswith('-'):
             _from = '-%s' % args._from
-        m = self.from_regex.match(_from)
-        if not m:
-            raise ValueError(
-                'the value of the --from argument is invalid: %s' %
-                args._from)
-        if m.group(2) not in self._accepted_time_defs:
-            raise ValueError(
-                'The unit specification for --from should be one of the 
following: %s' %
-                ','.join(
-                    self._accepted_time_defs))
+        self.check_time_parameter('--from', _from)
 
-        self.params = [('format', 'json'), ('from', _from)]
+        _until = args._until
+        if not args._until.startswith('-'):
+            _until = '-%s' % args._until
+        self.check_time_parameter('--until', _until)
+
+        self.params = [('format', 'json'), ('from', _from), ('until', _until)]
         for target in self.targets:
             self.params.append(('target', target))
         if args.under:
@@ -544,11 +556,12 @@
 
     Examples:
 
-    Check if a metric exceeds a certain value 10 times in the last 20 minutes:
+    Check if a metric exceeds a certain value 10 times in the last 20 minutes
+    with a 5 minutes lag:
 
     ./check_graphite --url http://some-graphite-host \
            check_threshold my.beloved.metric  --from -20minutes \
-           --threshold 100 --over -C 10 -W 5
+           --until -5minutes --threshold 100 --over -C 10 -W 5
 
     Check if a metric has exceeded its holt-winters confidence bands 5% of the
     times over the last 500 checks
diff --git a/modules/nagios_common/files/check_commands/check_graphite.cfg 
b/modules/nagios_common/files/check_commands/check_graphite.cfg
index 68c60ac..e2d5421 100644
--- a/modules/nagios_common/files/check_commands/check_graphite.cfg
+++ b/modules/nagios_common/files/check_commands/check_graphite.cfg
@@ -2,23 +2,15 @@
 # Generic checks for graphite
 define command{
     command_name    check_graphite_threshold
-    command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_threshold 
'$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --perc $ARG7$ $ARG8$
+    command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_threshold 
'$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --until $ARG7$ --perc $ARG8$ $ARG9$
 }
 
 define command{
     command_name    check_graphite_series_threshold
-    command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ 
check_series_threshold '$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --perc $ARG7$ 
$ARG8$
+    command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ 
check_series_threshold '$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --until 
$ARG7$ --perc $ARG8$ $ARG9$
 }
 
 define command{
     command_name    check_graphite_anomaly
     command_line    $USER1$/check_graphite -U $ARG1$ -T $ARG2$ check_anomaly 
'$ARG3$' -W $ARG4$ -C $ARG5$ --check_window $ARG6$ $ARG7$
-}
-
-# NOTE: This is a temporary check command that will be removed once it is
-# verified to not break things and can be integrated into the regular
-# check_graphite_threshold.
-define command{
-    command_name    check_graphite_threshold_until_temp
-    command_line    $USER1$/check_graphite_until_temp -U $ARG1$ -T $ARG2$ 
check_threshold '$ARG3$' -W $ARG4$ -C $ARG5$ --from $ARG6$ --until $ARG7$ 
--perc $ARG8$ $ARG9$
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/300548
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I863367b8b5e97f501ebabc4eacbac081262462d6
Gerrit-PatchSet: 7
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: Elukey <ltosc...@wikimedia.org>
Gerrit-Reviewer: Filippo Giunchedi <fgiunch...@wikimedia.org>
Gerrit-Reviewer: Joal <j...@wikimedia.org>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to