Giuseppe Lavagetto has submitted this change and it was merged. Change subject: alerts: move icinga alerts away from reqstats.* ......................................................................
alerts: move icinga alerts away from reqstats.* Since we're decommissioning udp2log globally, we are also going to stop relying on reqstats.* for any alarms. We also add initial alarms for 5xx spikes per-datacenter and per-cache-type, with the hope it will help debugging issues faster. Bug: T118979 Change-Id: Ia5026cb25d67a9844947acc23ee7a457d0769549 --- M manifests/role/graphite.pp M manifests/site.pp 2 files changed, 109 insertions(+), 14 deletions(-) Approvals: Giuseppe Lavagetto: Looks good to me, approved jenkins-bot: Verified diff --git a/manifests/role/graphite.pp b/manifests/role/graphite.pp index c76fddd..a30d7b2 100644 --- a/manifests/role/graphite.pp +++ b/manifests/role/graphite.pp @@ -246,13 +246,13 @@ } } -# == Class: role::graphite::production::alerts +# == Class: role::graphite::alerts # # Install icinga alerts on graphite metrics. # NOTE to be included only from one host, icinga will generate different alerts # for all hosts that include this class. # -class role::graphite::production::alerts { +class role::graphite::alerts { # Infer Kafka cluster configuration from this class include ::role::analytics::kafka::config @@ -266,17 +266,6 @@ swift::monitoring::graphite_alerts { 'eqiad-prod': } swift::monitoring::graphite_alerts { 'codfw-prod': } - - # Monitor production 5xx rates - monitoring::graphite_threshold { 'reqstats-5xx-threshold': - description => 'HTTP 5xx reqs/min threshold', - metric => 'reqstats.5xx', - warning => 250, - critical => 500, - from => '15min', - nagios_critical => false - } - # Use graphite's anomaly detection support. monitoring::graphite_anomaly { 'kafka-broker-MessagesIn-anomaly': @@ -306,3 +295,109 @@ } } + +class role::graphite::alerts::reqstats { + + # Global threshold alarm as we had with reqstats.5xx + # Monitor production 5xx rates + monitoring::graphite_threshold { 'reqstats-5xx-global': + description => 'HTTP 5xx reqs/min (https://grafana.wikimedia.org/dashboard/db/varnish-http-errors)', + metric => 'sumSeries(varnish.*.*.frontend.request.client.status.5xx.sum)', + warning => 250, + critical => 500, + from => '15min', + nagios_critical => false, + } + + # sites aggregates + monitoring::graphite_threshold { 'reqstats-5xx-eqiad': + description => 'Eqiad HTTP 5xx reqs/min', + metric => 'sumSeries(varnish.eqiad.*.frontend.request.client.status.5xx.sum)', + warning => 75, + critical => 150, + from => '15min', + nagios_critical => false, + } + + monitoring::graphite_threshold { 'reqstats-5xx-esams': + description => 'Esams HTTP 5xx reqs/min', + metric => 'sumSeries(varnish.esams.*.frontend.request.client.status.5xx.sum)', + warning => 75, + critical => 150, + from => '15min', + nagios_critical => false, + } + + monitoring::graphite_threshold { 'reqstats-5xx-codfw': + description => 'Codfw HTTP 5xx reqs/min', + metric => 'sumSeries(varnish.codfw.*.frontend.request.client.status.5xx.sum)', + warning => 75, + critical => 150, + from => '15min', + nagios_critical => false, + } + + monitoring::graphite_threshold { 'reqstats-5xx-ulsfo': + description => 'Ulsfo HTTP 5xx reqs/min', + metric => 'sumSeries(varnish.ulsfo.*.frontend.request.client.status.5xx.sum)', + warning => 75, + critical => 150, + from => '15min', + nagios_critical => false, + } + + # per-cache aggregates + monitoring::graphite_threshold { 'reqstats-5xx-text': + description => 'Text HTTP 5xx reqs/min', + metric => 'sumSeries(varnish.*.text.frontend.request.client.status.5xx.sum)', + warning => 75, + critical => 150, + from => '15min', + nagios_critical => false, + } + + monitoring::graphite_threshold { 'reqstats-5xx-mobile': + description => 'Mobile HTTP 5xx reqs/min', + metric => 'sumSeries(varnish.*.mobile.frontend.request.client.status.5xx.sum)', + warning => 75, + critical => 150, + from => '15min', + nagios_critical => false, + } + + monitoring::graphite_threshold { 'reqstats-5xx-uploads': + description => 'Uploads HTTP 5xx reqs/min', + metric => 'sumSeries(varnish.*.uploads.frontend.request.client.status.5xx.sum)', + warning => 75, + critical => 150, + from => '15min', + nagios_critical => false, + } + + monitoring::graphite_threshold { 'reqstats-5xx-misc': + description => 'Misc HTTP 5xx reqs/min', + metric => 'sumSeries(varnish.*.misc.frontend.request.client.status.5xx.sum)', + warning => 75, + critical => 150, + from => '15min', + nagios_critical => false, + } + + monitoring::graphite_threshold { 'reqstats-5xx-parsoid': + description => 'Parsoid HTTP 5xx reqs/min', + metric => 'sumSeries(varnish.*.parsoid.frontend.request.client.status.5xx.sum)', + warning => 75, + critical => 150, + from => '15min', + nagios_critical => false, + } + + monitoring::graphite_threshold { 'reqstats-5xx-maps': + description => 'Maps HTTP 5xx reqs/min', + metric => 'sumSeries(varnish.*.maps.frontend.request.client.status.5xx.sum)', + warning => 75, + critical => 150, + from => '15min', + nagios_critical => false, + } +} diff --git a/manifests/site.pp b/manifests/site.pp index 31714c3..ab6cd9f 100644 --- a/manifests/site.pp +++ b/manifests/site.pp @@ -1051,7 +1051,7 @@ # Primary graphite machines node 'graphite1001.eqiad.wmnet' { - role graphite::production, statsdlb, performance, graphite::production::alerts, restbase::alerts + role graphite::production, statsdlb, performance, graphite::alerts, restbase::alerts, graphite::alerts::reqstats include standard } -- To view, visit https://gerrit.wikimedia.org/r/255347 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ia5026cb25d67a9844947acc23ee7a457d0769549 Gerrit-PatchSet: 4 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Giuseppe Lavagetto <glavage...@wikimedia.org> Gerrit-Reviewer: Alexandros Kosiaris <akosia...@wikimedia.org> Gerrit-Reviewer: Faidon Liambotis <fai...@wikimedia.org> Gerrit-Reviewer: Filippo Giunchedi <fgiunch...@wikimedia.org> Gerrit-Reviewer: Giuseppe Lavagetto <glavage...@wikimedia.org> Gerrit-Reviewer: Ottomata <o...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits