Giuseppe Lavagetto has submitted this change and it was merged.

Change subject: alerts: move icinga alerts away from reqstats.*
......................................................................


alerts: move icinga alerts away from reqstats.*

Since we're decommissioning udp2log globally, we are also going to stop
relying on reqstats.* for any alarms.

We also add initial alarms for 5xx spikes per-datacenter and
per-cache-type, with the hope it will help debugging issues faster.

Bug: T118979
Change-Id: Ia5026cb25d67a9844947acc23ee7a457d0769549
---
M manifests/role/graphite.pp
M manifests/site.pp
2 files changed, 109 insertions(+), 14 deletions(-)

Approvals:
  Giuseppe Lavagetto: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/manifests/role/graphite.pp b/manifests/role/graphite.pp
index c76fddd..a30d7b2 100644
--- a/manifests/role/graphite.pp
+++ b/manifests/role/graphite.pp
@@ -246,13 +246,13 @@
     }
 }
 
-# == Class: role::graphite::production::alerts
+# == Class: role::graphite::alerts
 #
 # Install icinga alerts on graphite metrics.
 # NOTE to be included only from one host, icinga will generate different alerts
 # for all hosts that include this class.
 #
-class role::graphite::production::alerts {
+class role::graphite::alerts {
     # Infer Kafka cluster configuration from this class
     include ::role::analytics::kafka::config
 
@@ -266,17 +266,6 @@
 
     swift::monitoring::graphite_alerts { 'eqiad-prod': }
     swift::monitoring::graphite_alerts { 'codfw-prod': }
-
-    # Monitor production 5xx rates
-    monitoring::graphite_threshold { 'reqstats-5xx-threshold':
-        description     => 'HTTP 5xx reqs/min threshold',
-        metric          => 'reqstats.5xx',
-        warning         => 250,
-        critical        => 500,
-        from            => '15min',
-        nagios_critical => false
-    }
-
 
     # Use graphite's anomaly detection support.
     monitoring::graphite_anomaly { 'kafka-broker-MessagesIn-anomaly':
@@ -306,3 +295,109 @@
     }
 
 }
+
+class role::graphite::alerts::reqstats {
+
+    # Global threshold alarm as we had with reqstats.5xx
+    # Monitor production 5xx rates
+    monitoring::graphite_threshold { 'reqstats-5xx-global':
+        description     => 'HTTP 5xx reqs/min (https://grafana.wikimedia.org/dashboard/db/varnish-http-errors)',
+        metric          => 'sumSeries(varnish.*.*.frontend.request.client.status.5xx.sum)',
+        warning         => 250,
+        critical        => 500,
+        from            => '15min',
+        nagios_critical => false,
+    }
+
+    # sites aggregates
+    monitoring::graphite_threshold { 'reqstats-5xx-eqiad':
+        description     => 'Eqiad HTTP 5xx reqs/min',
+        metric          => 'sumSeries(varnish.eqiad.*.frontend.request.client.status.5xx.sum)',
+        warning         => 75,
+        critical        => 150,
+        from            => '15min',
+        nagios_critical => false,
+    }
+
+    monitoring::graphite_threshold { 'reqstats-5xx-esams':
+        description     => 'Esams HTTP 5xx reqs/min',
+        metric          => 'sumSeries(varnish.esams.*.frontend.request.client.status.5xx.sum)',
+        warning         => 75,
+        critical        => 150,
+        from            => '15min',
+        nagios_critical => false,
+    }
+
+    monitoring::graphite_threshold { 'reqstats-5xx-codfw':
+        description     => 'Codfw HTTP 5xx reqs/min',
+        metric          => 'sumSeries(varnish.codfw.*.frontend.request.client.status.5xx.sum)',
+        warning         => 75,
+        critical        => 150,
+        from            => '15min',
+        nagios_critical => false,
+    }
+
+    monitoring::graphite_threshold { 'reqstats-5xx-ulsfo':
+        description     => 'Ulsfo HTTP 5xx reqs/min',
+        metric          => 'sumSeries(varnish.ulsfo.*.frontend.request.client.status.5xx.sum)',
+        warning         => 75,
+        critical        => 150,
+        from            => '15min',
+        nagios_critical => false,
+    }
+
+    # per-cache aggregates
+    monitoring::graphite_threshold { 'reqstats-5xx-text':
+        description     => 'Text HTTP 5xx reqs/min',
+        metric          => 'sumSeries(varnish.*.text.frontend.request.client.status.5xx.sum)',
+        warning         => 75,
+        critical        => 150,
+        from            => '15min',
+        nagios_critical => false,
+    }
+
+    monitoring::graphite_threshold { 'reqstats-5xx-mobile':
+        description     => 'Mobile HTTP 5xx reqs/min',
+        metric          => 'sumSeries(varnish.*.mobile.frontend.request.client.status.5xx.sum)',
+        warning         => 75,
+        critical        => 150,
+        from            => '15min',
+        nagios_critical => false,
+    }
+
+    monitoring::graphite_threshold { 'reqstats-5xx-uploads':
+        description     => 'Uploads HTTP 5xx reqs/min',
+        metric          => 'sumSeries(varnish.*.uploads.frontend.request.client.status.5xx.sum)',
+        warning         => 75,
+        critical        => 150,
+        from            => '15min',
+        nagios_critical => false,
+    }
+
+    monitoring::graphite_threshold { 'reqstats-5xx-misc':
+        description     => 'Misc HTTP 5xx reqs/min',
+        metric          => 'sumSeries(varnish.*.misc.frontend.request.client.status.5xx.sum)',
+        warning         => 75,
+        critical        => 150,
+        from            => '15min',
+        nagios_critical => false,
+    }
+
+    monitoring::graphite_threshold { 'reqstats-5xx-parsoid':
+        description     => 'Parsoid HTTP 5xx reqs/min',
+        metric          => 'sumSeries(varnish.*.parsoid.frontend.request.client.status.5xx.sum)',
+        warning         => 75,
+        critical        => 150,
+        from            => '15min',
+        nagios_critical => false,
+    }
+
+    monitoring::graphite_threshold { 'reqstats-5xx-maps':
+        description     => 'Maps HTTP 5xx reqs/min',
+        metric          => 'sumSeries(varnish.*.maps.frontend.request.client.status.5xx.sum)',
+        warning         => 75,
+        critical        => 150,
+        from            => '15min',
+        nagios_critical => false,
+    }
+}
diff --git a/manifests/site.pp b/manifests/site.pp
index 31714c3..ab6cd9f 100644
--- a/manifests/site.pp
+++ b/manifests/site.pp
@@ -1051,7 +1051,7 @@
 
 # Primary graphite machines
 node 'graphite1001.eqiad.wmnet' {
-    role graphite::production, statsdlb, performance, graphite::production::alerts, restbase::alerts
+    role graphite::production, statsdlb, performance, graphite::alerts, restbase::alerts, graphite::alerts::reqstats
     include standard
 }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/255347
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ia5026cb25d67a9844947acc23ee7a457d0769549
Gerrit-PatchSet: 4
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Giuseppe Lavagetto <glavage...@wikimedia.org>
Gerrit-Reviewer: Alexandros Kosiaris <akosia...@wikimedia.org>
Gerrit-Reviewer: Faidon Liambotis <fai...@wikimedia.org>
Gerrit-Reviewer: Filippo Giunchedi <fgiunch...@wikimedia.org>
Gerrit-Reviewer: Giuseppe Lavagetto <glavage...@wikimedia.org>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to