Volans has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/392607 )
Change subject: Metric alarms: convert dashboard_link to array ...................................................................... Metric alarms: convert dashboard_link to array * Convert the dashboard_link parameter to dashboard_links, which must be an array. * Ensure that there is at least one item and that the first item is a link to Grafana. * Populate Icinga's notes_url parameter with all the URLs in dashboard_links, using the single-quoted syntax even when only one link is present, to force Icinga to use the 1-notes.gif image instead of the default notes.gif one. Bug: T170353 Change-Id: I9d1f8b440844ad556281c0c30eac2c98422fe4ef --- M modules/confluent/manifests/kafka/broker/alerts.pp M modules/eventlogging/manifests/monitoring/graphite.pp M modules/graphite/manifests/monitoring/graphite.pp M modules/icinga/manifests/monitor/wdqs.pp M modules/labstore/manifests/monitoring/interfaces.pp M modules/mediawiki/manifests/monitoring/graphite.pp M modules/monitoring/manifests/check_prometheus.pp M modules/monitoring/manifests/graphite_anomaly.pp M modules/monitoring/manifests/graphite_threshold.pp M modules/profile/manifests/cache/kafka/webrequest.pp M modules/profile/manifests/kafka/broker/monitoring.pp M modules/profile/manifests/zookeeper/server.pp M modules/role/manifests/analytics_cluster/hadoop/master.pp M modules/role/manifests/analytics_cluster/hadoop/standby.pp M modules/role/manifests/analytics_cluster/hadoop/worker.pp M modules/role/manifests/elasticsearch/alerts.pp M modules/role/manifests/graphite/alerts.pp M modules/role/manifests/graphite/alerts/reqstats.pp M modules/role/manifests/restbase/alerts.pp M modules/swift/manifests/monitoring/graphite_alerts.pp M modules/varnish/manifests/instance.pp M modules/wdqs/manifests/monitor/services.pp M modules/zuul/manifests/monitoring/server.pp 23 files changed, 461 insertions(+), 431 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/07/392607/1 diff --git a/modules/confluent/manifests/kafka/broker/alerts.pp b/modules/confluent/manifests/kafka/broker/alerts.pp index 66844b8..8844987 100644 --- a/modules/confluent/manifests/kafka/broker/alerts.pp +++ b/modules/confluent/manifests/kafka/broker/alerts.pp @@ -40,29 +40,29 @@ # If it does, this means a broker replica is falling behind # and will be removed from the ISR. monitoring::graphite_threshold { 'kafka-broker-UnderReplicatedPartitions': - description => 'Kafka Broker Under Replicated Partitions', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/kafka?panelId=29&fullscreen&orgId=1', - metric => "${group_prefix}kafka.${graphite_broker_key}.kafka.server.ReplicaManager.UnderReplicatedPartitions.Value", - warning => '1', - critical => '10', + description => 'Kafka Broker Under Replicated Partitions', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/kafka?panelId=29&fullscreen&orgId=1'], + metric => "${group_prefix}kafka.${graphite_broker_key}.kafka.server.ReplicaManager.UnderReplicatedPartitions.Value", + warning => '1', + critical => '10', # Alert if any undereplicated for more than 50% # of the time in the last 30 minutes.
- from => '30min', - percentage => 50, - group => $nagios_servicegroup, + from => '30min', + percentage => 50, + group => $nagios_servicegroup, } # Alert if any Kafka Broker replica lag is too high monitoring::graphite_threshold { 'kafka-broker-Replica-MaxLag': - description => 'Kafka Broker Replica Max Lag', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/kafka?panelId=16&fullscreen&orgId=1', - metric => "${group_prefix}kafka.${graphite_broker_key}.kafka.server.ReplicaFetcherManager.MaxLag.Value", - warning => $replica_maxlag_warning, - critical => $replica_maxlag_critical, + description => 'Kafka Broker Replica Max Lag', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/kafka?panelId=16&fullscreen&orgId=1'], + metric => "${group_prefix}kafka.${graphite_broker_key}.kafka.server.ReplicaFetcherManager.MaxLag.Value", + warning => $replica_maxlag_warning, + critical => $replica_maxlag_critical, # Alert if large replica lag for more than 50% # of the time in the last 30 minutes. - from => '30min', - percentage => 50, - group => $nagios_servicegroup, + from => '30min', + percentage => 50, + group => $nagios_servicegroup, } } diff --git a/modules/eventlogging/manifests/monitoring/graphite.pp b/modules/eventlogging/manifests/monitoring/graphite.pp index 590b657..d27269d 100644 --- a/modules/eventlogging/manifests/monitoring/graphite.pp +++ b/modules/eventlogging/manifests/monitoring/graphite.pp @@ -17,29 +17,29 @@ # in a 15 min period. # These thresholds are somewhat arbtirary. monitoring::graphite_threshold { 'eventlogging_throughput': - description => 'Throughput of EventLogging events', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/eventlogging?panelId=6&fullscreen&orgId=1', - metric => $raw_events_rate_metric, - warning => 1000, - critical => 5000, - percentage => 15, # At least 3 of the 15 readings - from => '15min', - contact_group => 'analytics', + description => 'Throughput of EventLogging events', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/eventlogging?panelId=6&fullscreen&orgId=1'], + metric => $raw_events_rate_metric, + warning => 1000, + critical => 5000, + percentage => 15, # At least 3 of the 15 readings + from => '15min', + contact_group => 'analytics', } # Alarms if 15% of Navigation Timing event throughput goes under 1 req/sec # in a 15 min period # https://meta.wikimedia.org/wiki/Schema:NavigationTiming monitoring::graphite_threshold { 'eventlogging_NavigationTiming_throughput': - description => 'Throughput of EventLogging NavigationTiming events', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/eventlogging?panelId=6&fullscreen&orgId=1', - metric => $navigation_timing_events_rate_metric, - warning => 1, - critical => 0, - percentage => 15, # At least 3 of the 15 readings - from => '15min', - contact_group => 'analytics', - under => true, + description => 'Throughput of EventLogging NavigationTiming events', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/eventlogging?panelId=6&fullscreen&orgId=1'], + metric => $navigation_timing_events_rate_metric, + warning => 1, + critical => 0, + percentage => 15, # At least 3 of the 15 readings + from => '15min', + contact_group => 'analytics', + under => true, } # Warn if 15% of overall error event throughput goes above 20 events/s @@ -47,14 +47,14 @@ # The EventError topic counted here includes both events that do not # validate and events that can not be processed for other reasons monitoring::graphite_threshold { 
'eventlogging_EventError_throughput': - description => 'Throughput of EventLogging EventError events', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/eventlogging?panelId=13&fullscreen&orgId=1', - metric => $error_events_rate_metric, - warning => 20, - critical => 30, - percentage => 15, # At least 3 of the 15 readings - from => '15min', - contact_group => 'analytics', + description => 'Throughput of EventLogging EventError events', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/eventlogging?panelId=13&fullscreen&orgId=1'], + metric => $error_events_rate_metric, + warning => 20, + critical => 30, + percentage => 15, # At least 3 of the 15 readings + from => '15min', + contact_group => 'analytics', } @@ -63,15 +63,15 @@ # this metric is a good proxy to make sure events are flowing through the # kafka pipeline monitoring::graphite_threshold { 'eventlogging_overall_inserted_rate': - description => 'EventLogging overall insertion rate from MySQL consumer', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/eventlogging?panelId=12&fullscreen&orgId=1', - metric => 'movingAverage(eventlogging.overall.inserted.rate, "10min")', - warning => 50, - critical => 10, - percentage => 20, # At least 3 of the (25 - 10) = 15 readings - from => '25min', - until => '10min', - contact_group => 'analytics', - under => true, + description => 'EventLogging overall insertion rate from MySQL consumer', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/eventlogging?panelId=12&fullscreen&orgId=1'], + metric => 'movingAverage(eventlogging.overall.inserted.rate, "10min")', + warning => 50, + critical => 10, + percentage => 20, # At least 3 of the (25 - 10) = 15 readings + from => '25min', + until => '10min', + contact_group => 'analytics', + under => true, } } diff --git a/modules/graphite/manifests/monitoring/graphite.pp b/modules/graphite/manifests/monitoring/graphite.pp index dca903b..764b96a 100644 --- a/modules/graphite/manifests/monitoring/graphite.pp +++ b/modules/graphite/manifests/monitoring/graphite.pp @@ -9,7 +9,10 @@ class graphite::monitoring::graphite { monitoring::graphite_threshold { 'carbon-frontend-relay_drops': description => 'carbon-frontend-relay metric drops', - dashboard_link => "'https://grafana.wikimedia.org/dashboard/db/graphite-eqiad?orgId=1&panelId=21&fullscreen' 'https://grafana.wikimedia.org/dashboard/db/graphite-codfw?orgId=1&panelId=21&fullscreen'", + dashboard_links => [ + 'https://grafana.wikimedia.org/dashboard/db/graphite-eqiad?orgId=1&panelId=21&fullscreen', + 'https://grafana.wikimedia.org/dashboard/db/graphite-codfw?orgId=1&panelId=21&fullscreen', + ], metric => 'sumSeries(transformNull(perSecond(carbon.relays.graphite*_frontend.destinations.*.dropped)))', from => '5minutes', warning => 25, @@ -20,7 +23,10 @@ monitoring::graphite_threshold { 'carbon-local-relay_drops': description => 'carbon-local-relay metric drops', - dashboard_link => "'https://grafana.wikimedia.org/dashboard/db/graphite-eqiad?orgId=1&panelId=29&fullscreen' 'https://grafana.wikimedia.org/dashboard/db/graphite-codfw?orgId=1&panelId=29&fullscreen'", + dashboard_links => [ + 'https://grafana.wikimedia.org/dashboard/db/graphite-eqiad?orgId=1&panelId=29&fullscreen', + 'https://grafana.wikimedia.org/dashboard/db/graphite-codfw?orgId=1&panelId=29&fullscreen', + ], metric => 'sumSeries(transformNull(perSecond(carbon.relays.graphite*_local.destinations.*.dropped)))', from => '5minutes', warning => 25, @@ -32,7 +38,7 @@ # is carbon-cache able to write to 
disk (e.g. permissions) monitoring::graphite_threshold { 'carbon-cache_write_error': description => 'carbon-cache write error', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/graphite-eqiad?orgId=1&panelId=30&fullscreen', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/graphite-eqiad?orgId=1&panelId=30&fullscreen'], metric => 'secondYAxis(sumSeries(carbon.agents.graphite1001-*.errors))', from => '10minutes', warning => 1, @@ -44,7 +50,7 @@ # are carbon-cache queues overflowing their capacity? monitoring::graphite_threshold { 'carbon-cache_overflow': description => 'carbon-cache queues overflow', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/graphite-eqiad?orgId=1&panelId=8&fullscreen', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/graphite-eqiad?orgId=1&panelId=8&fullscreen'], metric => 'secondYAxis(sumSeries(carbon.agents.graphite1001-*.cache.overflow))', from => '10minutes', warning => 1, @@ -56,7 +62,7 @@ # are we creating too many metrics? monitoring::graphite_threshold { 'carbon-cache_many_creates': description => 'carbon-cache too many creates', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/graphite-eqiad?orgId=1&panelId=9&fullscreen', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/graphite-eqiad?orgId=1&panelId=9&fullscreen'], metric => 'sumSeries(carbon.agents.graphite1001-*.creates)', from => '30min', warning => 500, diff --git a/modules/icinga/manifests/monitor/wdqs.pp b/modules/icinga/manifests/monitor/wdqs.pp index 6b24fb6..4f52073 100644 --- a/modules/icinga/manifests/monitor/wdqs.pp +++ b/modules/icinga/manifests/monitor/wdqs.pp @@ -4,27 +4,27 @@ # raise a warning / critical alert if response time was over 2 minutes / 5 minutes # more than 5% of the time during the last minute monitoring::graphite_threshold { 'wdqs-response-time-codfw': - description => 'Response time of WDQS codfw', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/wikidata-query-service?orgId=1&panelId=13&fullscreen', - host => 'wdqs.svc.codfw.wmnet', - metric => 'varnish.codfw.backends.be_wdqs_svc_codfw_wmnet.GET.p99', - warning => 120000, # 2 minutes - critical => 300000, # 5 minutes - from => '10min', - percentage => 5, - contact_group => 'wdqs-admins', + description => 'Response time of WDQS codfw', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/wikidata-query-service?orgId=1&panelId=13&fullscreen'], + host => 'wdqs.svc.codfw.wmnet', + metric => 'varnish.codfw.backends.be_wdqs_svc_codfw_wmnet.GET.p99', + warning => 120000, # 2 minutes + critical => 300000, # 5 minutes + from => '10min', + percentage => 5, + contact_group => 'wdqs-admins', } monitoring::graphite_threshold { 'wdqs-response-time-eqiad': - description => 'Response time of WDQS eqiad', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/wikidata-query-service?orgId=1&panelId=13&fullscreen', - host => 'wdqs.svc.eqiad.wmnet', - metric => 'varnish.eqiad.backends.be_wdqs_svc_eqiad_wmnet.GET.p99', - warning => 120000, # 2 minutes - critical => 300000, # 5 minutes - from => '10min', - percentage => 5, - contact_group => 'wdqs-admins', + description => 'Response time of WDQS eqiad', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/wikidata-query-service?orgId=1&panelId=13&fullscreen'], + host => 'wdqs.svc.eqiad.wmnet', + metric => 'varnish.eqiad.backends.be_wdqs_svc_eqiad_wmnet.GET.p99', + warning => 120000, # 2 minutes + critical => 300000, # 5 minutes + from => '10min', + 
percentage => 5, + contact_group => 'wdqs-admins', } } diff --git a/modules/labstore/manifests/monitoring/interfaces.pp b/modules/labstore/manifests/monitoring/interfaces.pp index e2b2d6e..12a716e 100644 --- a/modules/labstore/manifests/monitoring/interfaces.pp +++ b/modules/labstore/manifests/monitoring/interfaces.pp @@ -16,43 +16,43 @@ $int_throughput_crit = '106250000' # 850Mbps monitoring::graphite_threshold { 'network_out_saturated': - description => 'Outgoing network saturation', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/labs-monitoring', - metric => "servers.${::hostname}.network.${monitor_iface}.tx_byte", - from => '30min', - warning => $int_throughput_warn, - critical => $int_throughput_crit, - percentage => '10', # smooth over peaks + description => 'Outgoing network saturation', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/labs-monitoring'], + metric => "servers.${::hostname}.network.${monitor_iface}.tx_byte", + from => '30min', + warning => $int_throughput_warn, + critical => $int_throughput_crit, + percentage => '10', # smooth over peaks } monitoring::graphite_threshold { 'network_in_saturated': - description => 'Incoming network saturation', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/labs-monitoring', - metric => "servers.${::hostname}.network.${monitor_iface}.rx_byte", - from => '30min', - warning => $int_throughput_warn, - critical => $int_throughput_crit, - percentage => '10', # smooth over peaks + description => 'Incoming network saturation', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/labs-monitoring'], + metric => "servers.${::hostname}.network.${monitor_iface}.rx_byte", + from => '30min', + warning => $int_throughput_warn, + critical => $int_throughput_crit, + percentage => '10', # smooth over peaks } monitoring::graphite_threshold { 'high_iowait_stalling': - description => 'Persistent high iowait', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/labs-monitoring', - metric => "servers.${::hostname}.cpu.total.iowait", - from => '10min', - warning => '40', # Based off looking at history of metric - critical => '60', - percentage => '50', # Ignore small spikes + description => 'Persistent high iowait', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/labs-monitoring'], + metric => "servers.${::hostname}.cpu.total.iowait", + from => '10min', + warning => '40', # Based off looking at history of metric + critical => '60', + percentage => '50', # Ignore small spikes } # Monitor for high load consistently, is a 'catchall' monitoring::graphite_threshold { 'high_load': - description => 'High load average', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/labs-monitoring', - metric => "servers.${::hostname}.loadavg.01", - from => '10min', - warning => '16', - critical => '24', - percentage => '50', # Don't freak out on spikes + description => 'High load average', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/labs-monitoring'], + metric => "servers.${::hostname}.loadavg.01", + from => '10min', + warning => '16', + critical => '24', + percentage => '50', # Don't freak out on spikes } } diff --git a/modules/mediawiki/manifests/monitoring/graphite.pp b/modules/mediawiki/manifests/monitoring/graphite.pp index 87ea43e..3fd2e8b 100644 --- a/modules/mediawiki/manifests/monitoring/graphite.pp +++ b/modules/mediawiki/manifests/monitoring/graphite.pp @@ -6,7 +6,7 @@ # Also check that the metric is being collected 
monitoring::graphite_threshold { 'mediawiki_job_insert_rate': description => 'MediaWiki jobs not being inserted', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/mediawiki-job-queue?panelId=2&fullscreen&orgId=1', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/mediawiki-job-queue?panelId=2&fullscreen&orgId=1'], metric => 'MediaWiki.jobqueue.inserts.all.rate', from => '1hours', warning => 1, @@ -19,7 +19,7 @@ # Also check that the metric is being collected monitoring::graphite_threshold { 'mediawiki_job_pop_rate': description => 'MediaWiki jobs not dequeued', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/mediawiki-job-queue?panelId=2&fullscreen&orgId=1', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/mediawiki-job-queue?panelId=2&fullscreen&orgId=1'], metric => 'MediaWiki.jobqueue.pops.all.rate', from => '1hours', warning => 1, diff --git a/modules/monitoring/manifests/check_prometheus.pp b/modules/monitoring/manifests/check_prometheus.pp index 9625e40..0c53291 100644 --- a/modules/monitoring/manifests/check_prometheus.pp +++ b/modules/monitoring/manifests/check_prometheus.pp @@ -69,8 +69,8 @@ # [*contact_group*] # What contact groups to use for notifications # -# [*dashboard_link*] -# Link to the Grafana dashboard for this alarm +# [*dashboard_links*] +# Links to the Grafana dashboards for this alarm # define monitoring::check_prometheus( $description, @@ -78,7 +78,7 @@ $prometheus_url, $warning, $critical, - $dashboard_link, + $dashboard_links, $method = 'ge', $nan_ok = false, $retries = 5, @@ -90,7 +90,15 @@ { validate_re($method, '^(gt|ge|lt|le|eq|ne)$') validate_bool($nan_ok) - validate_re($dashboard_link, '^https:\/\/grafana\.wikimedia\.org') + validate_array($dashboard_links) + if size($dashboard_links) < 1 { + fail('The $dashboard_links array cannot be empty') + } + validate_re($dashboard_links[0], '^https:\/\/grafana\.wikimedia\.org') + + # Single-quote each link and join with spaces; the quoting forces Icinga + # to use the 1-notes.gif icon even when only one link is present + $notes_urls = join($dashboard_links.map |$link| { "'${link}'" }, ' ') $command = $nan_ok ? { true => 'check_prometheus_nan_ok', @@ -105,6 +113,6 @@ group => $group, critical => $nagios_critical, contact_group => $contact_group, - notes_url => $dashboard_link, + notes_url => $notes_urls, } } diff --git a/modules/monitoring/manifests/graphite_anomaly.pp b/modules/monitoring/manifests/graphite_anomaly.pp index 1d785c2..c96df28 100644 --- a/modules/monitoring/manifests/graphite_anomaly.pp +++ b/modules/monitoring/manifests/graphite_anomaly.pp @@ -23,18 +23,18 @@ # } # # == Parameters -# $description - Description of icinga alert -# $metric - graphite metric name -# $warning - alert warning datapoints -# $critical - alert critical datapoints -# $check_window - the number of datapoints on which the check -# is performed. Defaults to 100. -# $graphite_url - URL of the graphite server. -# $timeout - Timeout for the http query to -# graphite. Defaults to 10 seconds -# $over - check only for values above the limit -# $under - check only for values below the limit -# $dashboard_link - Link to the Grafana dashboard for this alarm +# $description - Description of icinga alert +# $metric - graphite metric name +# $warning - alert warning datapoints +# $critical - alert critical datapoints +# $check_window - the number of datapoints on which the check +# is performed. Defaults to 100.
+# $graphite_url - URL of the graphite server. +# $timeout - Timeout for the http query to +# graphite. Defaults to 10 seconds +# $over - check only for values above the limit +# $under - check only for values below the limit +# $dashboard_links - Links to the Grafana dashboards for this alarm # $host # $retries # $group @@ -50,7 +50,7 @@ $metric, $warning, $critical, - $dashboard_link, + $dashboard_links, $check_window = 100, $graphite_url = 'https://graphite.wikimedia.org', $timeout = 10, @@ -68,7 +68,15 @@ $contact_group = 'admins', ) { - validate_re($dashboard_link, '^https:\/\/grafana\.wikimedia\.org') + validate_array($dashboard_links) + if size($dashboard_links) < 1 { + fail('The $dashboard_links array cannot be empty') + } + validate_re($dashboard_links[0], '^https:\/\/grafana\.wikimedia\.org') + + # Single-quote each link and join with spaces; the quoting forces Icinga + # to use the 1-notes.gif icon even when only one link is present + $notes_urls = join($dashboard_links.map |$link| { "'${link}'" }, ' ') if $over == true { $modifier = '--over' @@ -106,6 +114,6 @@ check_interval => $check_interval, retry_interval => $retry_interval, contact_group => $contact_group, - notes_url => $dashboard_link, + notes_url => $notes_urls, } } diff --git a/modules/monitoring/manifests/graphite_threshold.pp b/modules/monitoring/manifests/graphite_threshold.pp index b9839c9..935b7bf 100644 --- a/modules/monitoring/manifests/graphite_threshold.pp +++ b/modules/monitoring/manifests/graphite_threshold.pp @@ -22,24 +22,24 @@ # percentage => 5, # } # == Parameters -# $description - Description of icinga alert -# $metric - graphite metric name -# $warning - alert warning threshold -# $critical - alert critical threshold -# $series - true if the metric refers to a series of graphite -# datapoints that should be checked individually -# $from - Date from which to fetch data. -# Examples: '1hours','10min' (default), '2w' -# $until - end sampling date (negative relative time from -# now. Default: '0min' -# $percentage - Number of datapoints exceeding the -# threshold. Defaults to 1%. -# $under - If true, the threshold is a lower limit. -# Defaults to false. -# $graphite_url - URL of the graphite server. -# $timeout - Timeout for the http query to -# graphite. Defaults to 10 seconds -# $dashboard_link - Link to the Grafana dashboard for this alarm +# $description - Description of icinga alert +# $metric - graphite metric name +# $warning - alert warning threshold +# $critical - alert critical threshold +# $series - true if the metric refers to a series of graphite +# datapoints that should be checked individually +# $from - Date from which to fetch data. +# Examples: '1hours','10min' (default), '2w' +# $until - end sampling date (negative relative time from +# now. Default: '0min' +# $percentage - Number of datapoints exceeding the +# threshold. Defaults to 1%. +# $under - If true, the threshold is a lower limit. +# Defaults to false. +# $graphite_url - URL of the graphite server. +# $timeout - Timeout for the http query to +# graphite.
Defaults to 10 seconds +# $dashboard_links - Links to the Grafana dashboards for this alarm # $host # $retries # $group @@ -55,7 +55,7 @@ $metric, $warning, $critical, - $dashboard_link, + $dashboard_links, $series = false, $from = '10min', $until = '0min', @@ -75,7 +75,15 @@ $contact_group = 'admins', ) { - validate_re($dashboard_link, '^https:\/\/grafana\.wikimedia\.org') + validate_array($dashboard_links) + if size($dashboard_links) < 1 { + fail('The $dashboard_links array cannot be empty') + } + validate_re($dashboard_links[0], '^https:\/\/grafana\.wikimedia\.org') + + # Single-quote each link and join with spaces; the quoting forces Icinga + # to use the 1-notes.gif icon even when only one link is present + $notes_urls = join($dashboard_links.map |$link| { "'${link}'" }, ' ') # checkcommands.cfg's check_graphite_threshold command has # many positional arguments that @@ -115,6 +123,6 @@ check_interval => $check_interval, retry_interval => $retry_interval, contact_group => $contact_group, - notes_url => $dashboard_link, + notes_url => $notes_urls, } } diff --git a/modules/profile/manifests/cache/kafka/webrequest.pp b/modules/profile/manifests/cache/kafka/webrequest.pp index 8a10f31..754f1d5 100644 --- a/modules/profile/manifests/cache/kafka/webrequest.pp +++ b/modules/profile/manifests/cache/kafka/webrequest.pp @@ -142,17 +142,17 @@ # Generate an alert if too many delivery report errors per minute # (logster only reports once a minute) monitoring::graphite_threshold { 'varnishkafka-kafka_drerr': - ensure => 'present', - description => 'Varnishkafka Delivery Errors per minute', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/varnishkafka?panelId=20&fullscreen&orgId=1', - metric => "derivative(transformNull(${graphite_metric_prefix}.varnishkafka.kafka_drerr, 0))", - warning => 0, - critical => 5000, + ensure => 'present', + description => 'Varnishkafka Delivery Errors per minute', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/varnishkafka?panelId=20&fullscreen&orgId=1'], + metric => "derivative(transformNull(${graphite_metric_prefix}.varnishkafka.kafka_drerr, 0))", + warning => 0, + critical => 5000, # But only alert if a large percentage of the examined datapoints # are over the threshold. - percentage => 80, - from => '10min', - require => Logster::Job['varnishkafka-webrequest'], + percentage => 80, + from => '10min', + require => Logster::Job['varnishkafka-webrequest'], } # Make sure varnishes are configured and started for the first time # before the instances as well, or they fail to start initially... diff --git a/modules/profile/manifests/kafka/broker/monitoring.pp b/modules/profile/manifests/kafka/broker/monitoring.pp index fcb6661..1221b6f 100644 --- a/modules/profile/manifests/kafka/broker/monitoring.pp +++ b/modules/profile/manifests/kafka/broker/monitoring.pp @@ -46,21 +46,21 @@ # Alert on the average number of under replicated partitions over the last 30 minutes.
monitoring::check_prometheus { 'kafka_broker_under_replicated_partitions': - description => 'Kafka Broker Under Replicated Partitions', - dashboard_link => "https://grafana.wikimedia.org/dashboard/db/prometheus-kafka?panelId=29&fullscreen&orgId=1&var-datasource=${::site}%20prometheus%2Fops&var-cluster=${cluster}&var-kafka_brokers=${::hostname}", - query => "scalar(avg_over_time(kafka_server_ReplicaManager_UnderReplicatedPartitions{${prometheus_labels}}[30m]))", - warning => 5, - critical => 10, - prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops", + description => 'Kafka Broker Under Replicated Partitions', + dashboard_links => ["https://grafana.wikimedia.org/dashboard/db/prometheus-kafka?panelId=29&fullscreen&orgId=1&var-datasource=${::site}%20prometheus%2Fops&var-cluster=${cluster}&var-kafka_brokers=${::hostname}"], + query => "scalar(avg_over_time(kafka_server_ReplicaManager_UnderReplicatedPartitions{${prometheus_labels}}[30m]))", + warning => 5, + critical => 10, + prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops", } # Alert on the average max replica lag over the last 30 minutes. monitoring::check_prometheus { 'kafka_broker_replica_max_lag': - description => 'Kafka Broker Replica Max Lag', - dashboard_link => "https://grafana.wikimedia.org/dashboard/db/prometheus-kafka?panelId=16&fullscreen&orgId=1&var-datasource=${::site}%20prometheus%2Fops&var-cluster=${cluster}&var-kafka_brokers=${::hostname}", - query => "scalar(avg_over_time(kafka_server_ReplicaFetcherManager_MaxLag{${prometheus_labels}}[30m]))", - warning => $replica_maxlag_warning, - critical => $replica_maxlag_critical, - prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops", + description => 'Kafka Broker Replica Max Lag', + dashboard_links => ["https://grafana.wikimedia.org/dashboard/db/prometheus-kafka?panelId=16&fullscreen&orgId=1&var-datasource=${::site}%20prometheus%2Fops&var-cluster=${cluster}&var-kafka_brokers=${::hostname}"], + query => "scalar(avg_over_time(kafka_server_ReplicaFetcherManager_MaxLag{${prometheus_labels}}[30m]))", + warning => $replica_maxlag_warning, + critical => $replica_maxlag_critical, + prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops", } } diff --git a/modules/profile/manifests/zookeeper/server.pp b/modules/profile/manifests/zookeeper/server.pp index c63413d..40f8566 100644 --- a/modules/profile/manifests/zookeeper/server.pp +++ b/modules/profile/manifests/zookeeper/server.pp @@ -64,27 +64,27 @@ # Alert if NumAliveConnections approaches max client connections # Alert if any Kafka Broker replica lag is too high monitoring::graphite_threshold { 'zookeeper-client-connections': - description => 'Zookeeper Alive Client Connections too high', - dashboard_link => "https://grafana.wikimedia.org/dashboard/db/zookeeper?orgId=1&panelId=6&fullscreen&var-cluster=${cluster_name}&var-zookeeper_hosts=All", - metric => "${group_prefix}zookeeper.${graphite_broker_key}.zookeeper.NumAliveConnections", + description => 'Zookeeper Alive Client Connections too high', + dashboard_links => ["https://grafana.wikimedia.org/dashboard/db/zookeeper?orgId=1&panelId=6&fullscreen&var-cluster=${cluster_name}&var-zookeeper_hosts=All"], + metric => "${group_prefix}zookeeper.${graphite_broker_key}.zookeeper.NumAliveConnections", # Warn if we go over 50% of max - warning => $max_client_connections * 0.5, + warning => $max_client_connections * 0.5, # Critical if we go over 90% of max - critical => $max_client_connections * 0.9, + critical => $max_client_connections * 0.9, } # Experimental 
Analytics alarms on JVM usage # These alarms are not really generic and the thresholds are based # on a fixed Max Heap size of 1G. monitoring::graphite_threshold { 'zookeeper-server-heap-usage': - description => 'Zookeeper node JVM Heap usage', - dashboard_link => "https://grafana.wikimedia.org/dashboard/db/zookeeper?panelId=40&fullscreen&orgId=1&var-cluster=${cluster_name}&var-zookeeper_hosts=All", - metric => "${group_prefix}jvm_memory.${::hostname}_${::site}_wmnet_${jmxtrans_port}.memory.HeapMemoryUsage_used.upper", - from => '60min', - warning => '921000000', # 90% of the Heap used - critical => '972000000', # 95% of the Heap used - percentage => '60', - contact_group => 'analytics', + description => 'Zookeeper node JVM Heap usage', + dashboard_links => ["https://grafana.wikimedia.org/dashboard/db/zookeeper?panelId=40&fullscreen&orgId=1&var-cluster=${cluster_name}&var-zookeeper_hosts=All"], + metric => "${group_prefix}jvm_memory.${::hostname}_${::site}_wmnet_${jmxtrans_port}.memory.HeapMemoryUsage_used.upper", + from => '60min', + warning => '921000000', # 90% of the Heap used + critical => '972000000', # 95% of the Heap used + percentage => '60', + contact_group => 'analytics', } } } diff --git a/modules/role/manifests/analytics_cluster/hadoop/master.pp b/modules/role/manifests/analytics_cluster/hadoop/master.pp index 2705780..f77d012 100644 --- a/modules/role/manifests/analytics_cluster/hadoop/master.pp +++ b/modules/role/manifests/analytics_cluster/hadoop/master.pp @@ -117,38 +117,38 @@ # Alert if the HDFS space consumption raises above a safe threshold. monitoring::graphite_threshold { 'hadoop-hdfs-percent-used': - description => 'HDFS capacity used percentage', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=47&fullscreen', - metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.NameNodeInfo.PercentUsed.mean", - from => '30min', - warning => 85, - critical => 90, - percentage => '60', - contact_group => 'analytics', + description => 'HDFS capacity used percentage', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=47&fullscreen'], + metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.NameNodeInfo.PercentUsed.mean", + from => '30min', + warning => 85, + critical => 90, + percentage => '60', + contact_group => 'analytics', } # Alert in case of HDFS currupted or missing blocks. In the ideal state # these values should always be 0. 
monitoring::graphite_threshold { 'hadoop-hdfs-corrupt-blocks': - description => 'HDFS corrupt blocks', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=39&fullscreen', - metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.CorruptBlocks.mean", - from => '30min', - warning => 2, - critical => 5, - percentage => '60', - contact_group => 'analytics', + description => 'HDFS corrupt blocks', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=39&fullscreen'], + metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.CorruptBlocks.mean", + from => '30min', + warning => 2, + critical => 5, + percentage => '60', + contact_group => 'analytics', } monitoring::graphite_threshold { 'hadoop-hdfs-missing-blocks': - description => 'HDFS missing blocks', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=40&fullscreen', - metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.MissingBlocks.mean", - from => '180min', - warning => 2, - critical => 5, - percentage => '60', - contact_group => 'analytics', + description => 'HDFS missing blocks', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=40&fullscreen'], + metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.FSNamesystem.MissingBlocks.mean", + from => '180min', + warning => 2, + critical => 5, + percentage => '60', + contact_group => 'analytics', } # Java heap space used alerts. @@ -159,14 +159,14 @@ $nn_jvm_warning_threshold = $hadoop_namenode_heapsize * 0.9 $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.95 monitoring::graphite_threshold { 'hadoop-hdfs-namenode-heap-usaage': - description => 'HDFS active Namenode JVM Heap usage', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=4&fullscreen&orgId=1', - metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper", - from => '60min', - warning => $nn_jvm_warning_threshold, - critical => $nn_jvm_critical_threshold, - percentage => '60', - contact_group => 'analytics', + description => 'HDFS active Namenode JVM Heap usage', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=4&fullscreen&orgId=1'], + metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper", + from => '60min', + warning => $nn_jvm_warning_threshold, + critical => $nn_jvm_critical_threshold, + percentage => '60', + contact_group => 'analytics', } } @@ -175,14 +175,14 @@ $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize * 0.9 $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize * 0.95 monitoring::graphite_threshold { 'hadoop-yarn-resourcemananager-heap-usage': - description => 'YARN active ResourceManager JVM Heap usage', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=12&fullscreen&orgId=1', - metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper", - from => '60min', - warning => $rm_jvm_warning_threshold, - critical => $rm_jvm_critical_threshold, - percentage => '60', - contact_group => 'analytics', + description => 'YARN active ResourceManager JVM Heap usage', + dashboard_links => 
['https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=12&fullscreen&orgId=1'], + metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper", + from => '60min', + warning => $rm_jvm_warning_threshold, + critical => $rm_jvm_critical_threshold, + percentage => '60', + contact_group => 'analytics', } } } diff --git a/modules/role/manifests/analytics_cluster/hadoop/standby.pp b/modules/role/manifests/analytics_cluster/hadoop/standby.pp index 6d51ba6..bc7f2d7 100644 --- a/modules/role/manifests/analytics_cluster/hadoop/standby.pp +++ b/modules/role/manifests/analytics_cluster/hadoop/standby.pp @@ -43,14 +43,14 @@ $nn_jvm_warning_threshold = $hadoop_namenode_heapsize * 0.9 $nn_jvm_critical_threshold = $hadoop_namenode_heapsize * 0.95 monitoring::graphite_threshold { 'hadoop-hdfs-namenode-heap-usaage': - description => 'HDFS standby Namenode JVM Heap usage', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=4&fullscreen', - metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper", - from => '60min', - warning => $nn_jvm_warning_threshold, - critical => $nn_jvm_critical_threshold, - percentage => '60', - contact_group => 'analytics', + description => 'HDFS standby Namenode JVM Heap usage', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=4&fullscreen'], + metric => "Hadoop.NameNode.${::hostname}_eqiad_wmnet_9980.Hadoop.NameNode.JvmMetrics.MemHeapUsedM.upper", + from => '60min', + warning => $nn_jvm_warning_threshold, + critical => $nn_jvm_critical_threshold, + percentage => '60', + contact_group => 'analytics', } } } @@ -81,14 +81,14 @@ $rm_jvm_warning_threshold = $hadoop_resourcemanager_heapsize * 0.9 $rm_jvm_critical_threshold = $hadoop_resourcemanager_heapsize * 0.95 monitoring::graphite_threshold { 'hadoop-yarn-resourcemananager-heap-usage': - description => 'YARN standby Resource Manager JVM Heap usage', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=12&fullscreen', - metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper", - from => '60min', - warning => $rm_jvm_warning_threshold, - critical => $rm_jvm_critical_threshold, - percentage => '60', - contact_group => 'analytics', + description => 'YARN standby Resource Manager JVM Heap usage', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=12&fullscreen'], + metric => "Hadoop.ResourceManager.${::hostname}_eqiad_wmnet_9983.Hadoop.ResourceManager.JvmMetrics.MemHeapUsedM.upper", + from => '60min', + warning => $rm_jvm_warning_threshold, + critical => $rm_jvm_critical_threshold, + percentage => '60', + contact_group => 'analytics', } } } diff --git a/modules/role/manifests/analytics_cluster/hadoop/worker.pp b/modules/role/manifests/analytics_cluster/hadoop/worker.pp index 14d3ba0..8080aca 100644 --- a/modules/role/manifests/analytics_cluster/hadoop/worker.pp +++ b/modules/role/manifests/analytics_cluster/hadoop/worker.pp @@ -64,14 +64,14 @@ $dn_jvm_warning_threshold = $hadoop_datanode_heapsize * 0.9 $dn_jvm_critical_threshold = $hadoop_datanode_heapsize * 0.95 monitoring::graphite_threshold { 'analytics_hadoop_hdfs_datanode': - description => 'HDFS DataNode JVM Heap usage', - dashboard_link => 
'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=1&fullscreen&orgId=1', - metric => "Hadoop.DataNode.${::hostname}_eqiad_wmnet_9981.Hadoop.DataNode.JvmMetrics.MemHeapUsedM.upper", - from => '60min', - warning => $dn_jvm_critical_threshold, - critical => $dn_jvm_critical_threshold, - percentage => '60', - contact_group => 'analytics', + description => 'HDFS DataNode JVM Heap usage', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?panelId=1&fullscreen&orgId=1'], + metric => "Hadoop.DataNode.${::hostname}_eqiad_wmnet_9981.Hadoop.DataNode.JvmMetrics.MemHeapUsedM.upper", + from => '60min', + warning => $dn_jvm_critical_threshold, + critical => $dn_jvm_critical_threshold, + percentage => '60', + contact_group => 'analytics', } } @@ -80,14 +80,14 @@ $nm_jvm_warning_threshold = $hadoop_nodemanager_heapsize * 0.9 $nm_jvm_critical_threshold = $hadoop_nodemanager_heapsize * 0.95 monitoring::graphite_threshold { 'analytics_hadoop_yarn_nodemanager': - description => 'YARN NodeManager JVM Heap usage', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=17&fullscreen', - metric => "Hadoop.NodeManager.${::hostname}_eqiad_wmnet_9984.Hadoop.NodeManager.JvmMetrics.MemHeapUsedM.upper", - from => '60min', - warning => $nm_jvm_critical_threshold, - critical => $nm_jvm_critical_threshold, - percentage => '60', - contact_group => 'analytics', + description => 'YARN NodeManager JVM Heap usage', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/analytics-hadoop?orgId=1&panelId=17&fullscreen'], + metric => "Hadoop.NodeManager.${::hostname}_eqiad_wmnet_9984.Hadoop.NodeManager.JvmMetrics.MemHeapUsedM.upper", + from => '60min', + warning => $nm_jvm_critical_threshold, + critical => $nm_jvm_critical_threshold, + percentage => '60', + contact_group => 'analytics', } } diff --git a/modules/role/manifests/elasticsearch/alerts.pp b/modules/role/manifests/elasticsearch/alerts.pp index 3499dec..8d32616 100644 --- a/modules/role/manifests/elasticsearch/alerts.pp +++ b/modules/role/manifests/elasticsearch/alerts.pp @@ -1,35 +1,35 @@ class role::elasticsearch::alerts { monitoring::graphite_threshold { 'cirrussearch_eqiad_95th_percentile': - description => 'CirrusSearch eqiad 95th percentile latency', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/elasticsearch-percentiles?panelId=19&fullscreen&orgId=1&var-cluster=eqiad&var-smoothing=1', - metric => 'transformNull(MediaWiki.CirrusSearch.eqiad.requestTime.p95, 0)', - from => '10min', - warning => '500', - critical => '1000', - percentage => '20', - contact_group => 'team-discovery', + description => 'CirrusSearch eqiad 95th percentile latency', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/elasticsearch-percentiles?panelId=19&fullscreen&orgId=1&var-cluster=eqiad&var-smoothing=1'], + metric => 'transformNull(MediaWiki.CirrusSearch.eqiad.requestTime.p95, 0)', + from => '10min', + warning => '500', + critical => '1000', + percentage => '20', + contact_group => 'team-discovery', } monitoring::graphite_threshold { 'cirrussearch_codfw_95th_percentile': - description => 'CirrusSearch codfw 95th percentile latency - more_like', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/elasticsearch-percentiles?panelId=39&fullscreen&orgId=1&var-cluster=codfw&var-smoothing=1', - metric => 'transformNull(MediaWiki.CirrusSearch.codfw.requestTimeMs.more_like.p95, 0)', - from => '10min', - warning => '1200', - critical => '2000', - 
percentage => '20', - contact_group => 'team-discovery', + description => 'CirrusSearch codfw 95th percentile latency - more_like', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/elasticsearch-percentiles?panelId=39&fullscreen&orgId=1&var-cluster=codfw&var-smoothing=1'], + metric => 'transformNull(MediaWiki.CirrusSearch.codfw.requestTimeMs.more_like.p95, 0)', + from => '10min', + warning => '1200', + critical => '2000', + percentage => '20', + contact_group => 'team-discovery', } # warning level is ~1% of peak traffic failing monitoring::graphite_threshold { 'search_backend_failure_count': - description => 'Number of backend failures per minute from CirrusSearch', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/elasticsearch-percentiles?orgId=1&var-cluster=eqiad&var-smoothing=1&panelId=9&fullscreen', - metric => 'transformNull(MediaWiki.CirrusSearch.eqiad.backend_failure.failed.count, 0)', - from => '10min', - warning => '300', - critical => '600', - percentage => '20', - contact_group => 'team-discovery', + description => 'Number of backend failures per minute from CirrusSearch', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/elasticsearch-percentiles?orgId=1&var-cluster=eqiad&var-smoothing=1&panelId=9&fullscreen'], + metric => 'transformNull(MediaWiki.CirrusSearch.eqiad.backend_failure.failed.count, 0)', + from => '10min', + warning => '300', + critical => '600', + percentage => '20', + contact_group => 'team-discovery', } } diff --git a/modules/role/manifests/graphite/alerts.pp b/modules/role/manifests/graphite/alerts.pp index 802ce94..8a2a317 100644 --- a/modules/role/manifests/graphite/alerts.pp +++ b/modules/role/manifests/graphite/alerts.pp @@ -21,86 +21,86 @@ # Use graphite's anomaly detection support. monitoring::graphite_anomaly { 'kafka-analytics-eqiad-broker-MessagesIn-anomaly': - description => 'Kafka Cluster analytics-eqiad Broker Messages In Per Second', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/kafka?panelId=6&fullscreen&orgId=1&var-cluster=analytics-eqiad&var-kafka_brokers=All&var-kafka_servers=All', - metric => 'sumSeries(kafka.cluster.analytics-eqiad.kafka.*.kafka.server.BrokerTopicMetrics-AllTopics.MessagesInPerSec.OneMinuteRate)', + description => 'Kafka Cluster analytics-eqiad Broker Messages In Per Second', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/kafka?panelId=6&fullscreen&orgId=1&var-cluster=analytics-eqiad&var-kafka_brokers=All&var-kafka_servers=All'], + metric => 'sumSeries(kafka.cluster.analytics-eqiad.kafka.*.kafka.server.BrokerTopicMetrics-AllTopics.MessagesInPerSec.OneMinuteRate)', # check over the 60 data points (an hour?) and: # - alert warn if more than 30 are under the confidence band # - alert critical if more than 45 are under the confidecne band - check_window => 60, - warning => 30, - critical => 45, - under => true, - group => 'analytics_eqiad', + check_window => 60, + warning => 30, + critical => 45, + under => true, + group => 'analytics_eqiad', } # Monitor memcached error rate from MediaWiki. 
This is commonly a sign of # a failing nutcracker instance that can be tracked down via # https://logstash.wikimedia.org/#/dashboard/elasticsearch/memcached monitoring::graphite_threshold { 'mediawiki-memcached-threshold': - description => 'MediaWiki memcached error rate', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/mediawiki-graphite-alerts?orgId=1&panelId=1&fullscreen', - metric => 'transformNull(logstash.rate.mediawiki.memcached.ERROR.sum, 0)', + description => 'MediaWiki memcached error rate', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/mediawiki-graphite-alerts?orgId=1&panelId=1&fullscreen'], + metric => 'transformNull(logstash.rate.mediawiki.memcached.ERROR.sum, 0)', # Nominal error rate in production is <150/min - warning => 1000, - critical => 5000, - from => '5min', - percentage => 40, + warning => 1000, + critical => 5000, + from => '5min', + percentage => 40, } # Monitor MediaWiki fatals and exceptions. monitoring::graphite_threshold { 'mediawiki_error_rate': - description => 'MediaWiki exceptions and fatals per minute', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/mediawiki-graphite-alerts?orgId=1&panelId=2&fullscreen', - metric => 'transformNull(sumSeries(logstash.rate.mediawiki.fatal.ERROR.sum, logstash.rate.mediawiki.exception.ERROR.sum), 0)', - warning => 25, - critical => 50, - from => '10min', - percentage => 70, + description => 'MediaWiki exceptions and fatals per minute', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/mediawiki-graphite-alerts?orgId=1&panelId=2&fullscreen'], + metric => 'transformNull(sumSeries(logstash.rate.mediawiki.fatal.ERROR.sum, logstash.rate.mediawiki.exception.ERROR.sum), 0)', + warning => 25, + critical => 50, + from => '10min', + percentage => 70, } # Monitor MediaWiki session failures # See https://grafana.wikimedia.org/dashboard/db/edit-count monitoring::graphite_threshold { 'mediawiki_session_loss': - description => 'MediaWiki edit session loss', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/edit-count?panelId=13&fullscreen&orgId=1', - metric => 'transformNull(scale(consolidateBy(MediaWiki.edit.failures.session_loss.rate, "max"), 60), 0)', - warning => 10, - critical => 50, - from => '15min', - percentage => 30, + description => 'MediaWiki edit session loss', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/edit-count?panelId=13&fullscreen&orgId=1'], + metric => 'transformNull(scale(consolidateBy(MediaWiki.edit.failures.session_loss.rate, "max"), 60), 0)', + warning => 10, + critical => 50, + from => '15min', + percentage => 30, } monitoring::graphite_threshold { 'mediawiki_bad_token': - description => 'MediaWiki edit failure due to bad token', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/edit-count?panelId=13&fullscreen&orgId=1', - metric => 'transformNull(scale(consolidateBy(MediaWiki.edit.failures.bad_token.rate, "max"), 60), 0)', - warning => 10, - critical => 50, - from => '15min', - percentage => 30, + description => 'MediaWiki edit failure due to bad token', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/edit-count?panelId=13&fullscreen&orgId=1'], + metric => 'transformNull(scale(consolidateBy(MediaWiki.edit.failures.bad_token.rate, "max"), 60), 0)', + warning => 10, + critical => 50, + from => '15min', + percentage => 30, } # Monitor MediaWiki CentralAuth bad tokens monitoring::graphite_threshold { 'mediawiki_centralauth_errors': - description => 'MediaWiki centralauth errors', - 
dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/mediawiki-graphite-alerts?orgId=1&panelId=3&fullscreen', - metric => 'transformNull(sumSeries(MediaWiki.centralauth.centrallogin_errors.*.rate), 0)', - warning => 0.5, - critical => 1, - from => '15min', - percentage => 30, + description => 'MediaWiki centralauth errors', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/mediawiki-graphite-alerts?orgId=1&panelId=3&fullscreen'], + metric => 'transformNull(sumSeries(MediaWiki.centralauth.centrallogin_errors.*.rate), 0)', + warning => 0.5, + critical => 1, + from => '15min', + percentage => 30, } # Monitor EventBus 4xx and 5xx HTTP response rate. monitoring::graphite_threshold { 'eventbus_http_error_rate': - description => 'EventBus HTTP Error Rate (4xx + 5xx)', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/eventbus?panelId=1&fullscreen&orgId=1', - metric => 'transformNull(sumSeries(eventbus.counters.eventlogging.service.EventHandler.POST.[45]*.rate))', + description => 'EventBus HTTP Error Rate (4xx + 5xx)', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/eventbus?panelId=1&fullscreen&orgId=1'], + metric => 'transformNull(sumSeries(eventbus.counters.eventlogging.service.EventHandler.POST.[45]*.rate))', # If > 50% of datapoints over last 10 minutes is over thresholds, then alert. - warning => 1, - critical => 10, - from => '10min', - percentage => 50, + warning => 1, + critical => 10, + from => '10min', + percentage => 50, } } diff --git a/modules/role/manifests/graphite/alerts/reqstats.pp b/modules/role/manifests/graphite/alerts/reqstats.pp index 9b15b01..32a33d3 100644 --- a/modules/role/manifests/graphite/alerts/reqstats.pp +++ b/modules/role/manifests/graphite/alerts/reqstats.pp @@ -7,7 +7,7 @@ # sites aggregates monitoring::graphite_threshold { 'reqstats-5xx-eqiad': description => 'Eqiad HTTP 5xx reqs/min', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=eqiad&var-cache_type=All&var-status_type=5', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=eqiad&var-cache_type=All&var-status_type=5'], metric => 'sumSeries(varnish.eqiad.*.frontend.request.client.status.5xx.sum)', warning => $settings['warning'], critical => $settings['critical'], @@ -17,7 +17,7 @@ monitoring::graphite_threshold { 'reqstats-5xx-esams': description => 'Esams HTTP 5xx reqs/min', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=esams&var-cache_type=All&var-status_type=5', + dashboard_links => ['https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=esams&var-cache_type=All&var-status_type=5'], metric => 'sumSeries(varnish.esams.*.frontend.request.client.status.5xx.sum)', warning => $settings['warning'], critical => $settings['critical'], @@ -27,7 +27,7 @@ monitoring::graphite_threshold { 'reqstats-5xx-codfw': description => 'Codfw HTTP 5xx reqs/min', - dashboard_link => 'https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=codfw&var-cache_type=All&var-status_type=5', + dashboard_links => 
['https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=codfw&var-cache_type=All&var-status_type=5'],
         metric         => 'sumSeries(varnish.codfw.*.frontend.request.client.status.5xx.sum)',
         warning        => $settings['warning'],
         critical       => $settings['critical'],
@@ -37,7 +37,7 @@
 
     monitoring::graphite_threshold { 'reqstats-5xx-ulsfo':
         description    => 'Ulsfo HTTP 5xx reqs/min',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=ulsfo&var-cache_type=All&var-status_type=5',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=ulsfo&var-cache_type=All&var-status_type=5'],
         metric         => 'sumSeries(varnish.ulsfo.*.frontend.request.client.status.5xx.sum)',
         warning        => $settings['warning'],
         critical       => $settings['critical'],
@@ -48,7 +48,7 @@
     # per-cache aggregates
     monitoring::graphite_threshold { 'reqstats-5xx-text':
         description    => 'Text HTTP 5xx reqs/min',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=All&var-cache_type=text&var-status_type=5',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=All&var-cache_type=text&var-status_type=5'],
         metric         => 'sumSeries(varnish.*.text.frontend.request.client.status.5xx.sum)',
         warning        => $settings['warning'],
         critical       => $settings['critical'],
@@ -58,7 +58,7 @@
 
     monitoring::graphite_threshold { 'reqstats-5xx-upload':
         description    => 'Upload HTTP 5xx reqs/min',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=All&var-cache_type=upload&var-status_type=5',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=All&var-cache_type=upload&var-status_type=5'],
         metric         => 'sumSeries(varnish.*.upload.frontend.request.client.status.5xx.sum)',
         warning        => $settings['warning'],
         critical       => $settings['critical'],
@@ -68,7 +68,7 @@
 
     monitoring::graphite_threshold { 'reqstats-5xx-misc':
         description    => 'Misc HTTP 5xx reqs/min',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=All&var-cache_type=misc&var-status_type=5',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/file/varnish-aggregate-client-status-codes.json?panelId=3&fullscreen&orgId=1&var-site=All&var-cache_type=misc&var-status_type=5'],
         metric         => 'sumSeries(varnish.*.misc.frontend.request.client.status.5xx.sum)',
         warning        => $settings['warning'],
         critical       => $settings['critical'],
diff --git a/modules/role/manifests/restbase/alerts.pp b/modules/role/manifests/restbase/alerts.pp
index 264ad8d..75d1a27 100644
--- a/modules/role/manifests/restbase/alerts.pp
+++ b/modules/role/manifests/restbase/alerts.pp
@@ -1,68 +1,68 @@
 class role::restbase::alerts {
     monitoring::graphite_threshold { 'restbase_request_5xx_rate':
-        description    => 'RESTBase html revision 5xx req/s',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/restbase?panelId=18&fullscreen&orgId=1&var-cluster=restbase',
-        metric         => 'transformNull(restbase.external.v1_page_html_-title-_-revision--_tid-.GET.5xx.sample_rate, 0)',
-        from           => '10min',
-        warning        => '1', # 1 5xx/s
-        critical       => '3', # 5 5xx/s
-        percentage     => '20',
-        contact_group  => 'team-services',
+        description     => 'RESTBase html revision 5xx req/s',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/restbase?panelId=18&fullscreen&orgId=1&var-cluster=restbase'],
+        metric          => 'transformNull(restbase.external.v1_page_html_-title-_-revision--_tid-.GET.5xx.sample_rate, 0)',
+        from            => '10min',
+        warning         => '1', # 1 5xx/s
+        critical        => '3', # 5 5xx/s
+        percentage      => '20',
+        contact_group   => 'team-services',
     }
 
     monitoring::graphite_threshold { 'restbase_html_storage_hit_latency':
-        description    => 'RESTBase HTML revision request mean storage latency ms',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/restbase?panelId=11&fullscreen&orgId=1&var-cluster=restbase',
-        metric         => 'movingMedian(restbase.external.sys_key-rev-value_-bucket-_-key--_revision--_tid-.GET.2xx.mean, 15)',
-        from           => '10min',
-        warning        => '25', # 25ms
-        critical       => '50', # 50ms
-        percentage     => '50',
-        contact_group  => 'team-services',
+        description     => 'RESTBase HTML revision request mean storage latency ms',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/restbase?panelId=11&fullscreen&orgId=1&var-cluster=restbase'],
+        metric          => 'movingMedian(restbase.external.sys_key-rev-value_-bucket-_-key--_revision--_tid-.GET.2xx.mean, 15)',
+        from            => '10min',
+        warning         => '25', # 25ms
+        critical        => '50', # 50ms
+        percentage      => '50',
+        contact_group   => 'team-services',
     }
 
     monitoring::graphite_threshold { 'restbase_html_storage_hit_latency_99p':
-        description    => 'RESTBase HTML revision request 99p storage latency ms',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/restbase?panelId=11&fullscreen&orgId=1&var-cluster=restbase',
-        metric         => 'movingMedian(restbase.external.sys_key-rev-value_-bucket-_-key--_revision--_tid-.GET.2xx.p99, 15)',
-        from           => '10min',
-        warning        => '1500', # 1.5s
-        critical       => '3000', # 3s
-        percentage     => '50',
-        contact_group  => 'team-services',
+        description     => 'RESTBase HTML revision request 99p storage latency ms',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/restbase?panelId=11&fullscreen&orgId=1&var-cluster=restbase'],
+        metric          => 'movingMedian(restbase.external.sys_key-rev-value_-bucket-_-key--_revision--_tid-.GET.2xx.p99, 15)',
+        from            => '10min',
+        warning         => '1500', # 1.5s
+        critical        => '3000', # 3s
+        percentage      => '50',
+        contact_group   => 'team-services',
     }
 
     monitoring::graphite_threshold { 'restbase_cassandra_highest_storage_exceptions':
-        description    => 'RESTBase Cassandra highest storage exceptions',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-storage?panelId=5&fullscreen&orgId=1&var-datacenter=1&var-node=All&var-keyspace=all',
-        metric         => 'highestMax(nonNegativeDerivative(cassandra.restbase10*.org.apache.cassandra.metrics.Storage.Exceptions.count), 1)',
-        from           => '10min',
-        warning        => '5',
-        critical       => '10',
-        percentage     => '50',
-        contact_group  => 'team-services',
+        description     => 'RESTBase Cassandra highest storage exceptions',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-storage?panelId=5&fullscreen&orgId=1&var-datacenter=1&var-node=All&var-keyspace=all'],
+        metric          => 'highestMax(nonNegativeDerivative(cassandra.restbase10*.org.apache.cassandra.metrics.Storage.Exceptions.count), 1)',
+        from            => '10min',
+        warning         => '5',
+        critical        => '10',
+        percentage      => '50',
+        contact_group   => 'team-services',
     }
 
     monitoring::graphite_threshold { 'restbase_cassandra_highest_total_hints':
-        description    => 'RESTBase Cassandra highest total hints',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-storage?panelId=28&fullscreen&orgId=1&var-datacenter=1&var-node=All&var-keyspace=all',
-        metric         => 'highestMax(nonNegativeDerivative(cassandra.restbase10*.org.apache.cassandra.metrics.Storage.TotalHints.count), 1)',
-        from           => '10min',
-        warning        => '600',
-        critical       => '1000',
-        percentage     => '50',
-        contact_group  => 'team-services',
+        description     => 'RESTBase Cassandra highest total hints',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-storage?panelId=28&fullscreen&orgId=1&var-datacenter=1&var-node=All&var-keyspace=all'],
+        metric          => 'highestMax(nonNegativeDerivative(cassandra.restbase10*.org.apache.cassandra.metrics.Storage.TotalHints.count), 1)',
+        from            => '10min',
+        warning         => '600',
+        critical        => '1000',
+        percentage      => '50',
+        contact_group   => 'team-services',
     }
 
     monitoring::graphite_threshold { 'restbase_cassandra_highest_pending_compactions':
-        description    => 'RESTBase Cassandra highest pending compactions',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-compaction?orgId=1&panelId=5&fullscreen&var-datacenter=1&var-node=All',
-        metric         => 'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.Compaction.PendingTasks.value, 1)',
-        from           => '60min',
-        warning        => '4000',
-        critical       => '5000',
-        percentage     => '50',
-        contact_group  => 'team-services',
+        description     => 'RESTBase Cassandra highest pending compactions',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-compaction?orgId=1&panelId=5&fullscreen&var-datacenter=1&var-node=All'],
+        metric          => 'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.Compaction.PendingTasks.value, 1)',
+        from            => '60min',
+        warning         => '4000',
+        critical        => '5000',
+        percentage      => '50',
+        contact_group   => 'team-services',
     }
 
     # With instance sizes in-flux, and expansions taking place, it is proving
@@ -81,35 +81,35 @@
     # }
 
     monitoring::graphite_threshold { 'restbase_cassandra_highest_tombstones_scanned':
-        description    => 'RESTBase Cassandra highest tombstones scanned',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-cf-tombstones-scanned?panelId=5&fullscreen&orgId=1&var-datacenter=1&var-node=All&var-quantiles=99percentile',
-        metric         => 'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.ColumnFamily.all.TombstoneScannedHistogram.99percentile, 1)',
-        from           => '10min',
-        warning        => '1000',
-        critical       => '1500',
-        percentage     => '50',
-        contact_group  => 'team-services',
+        description     => 'RESTBase Cassandra highest tombstones scanned',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-cf-tombstones-scanned?panelId=5&fullscreen&orgId=1&var-datacenter=1&var-node=All&var-quantiles=99percentile'],
+        metric          => 'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.ColumnFamily.all.TombstoneScannedHistogram.99percentile, 1)',
+        from            => '10min',
+        warning         => '1000',
+        critical        => '1500',
+        percentage      => '50',
+        contact_group   => 'team-services',
     }
 
     monitoring::graphite_threshold { 'restbase_cassandra_highest_pending_internal':
-        description    => 'RESTBase Cassandra highest pending internal thread pool tasks',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-thread-pools?panelId=34&fullscreen&orgId=1&var-datacenter=1&var-node=All',
-        metric         => 'highestMax(exclude(cassandra.restbase10*.org.apache.cassandra.metrics.ThreadPools.internal.*.PendingTasks.value, "CompactionExecutor"), 1)',
-        from           => '10min',
-        warning        => '500',
-        critical       => '1000',
-        percentage     => '50',
-        contact_group  => 'team-services',
+        description     => 'RESTBase Cassandra highest pending internal thread pool tasks',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-thread-pools?panelId=34&fullscreen&orgId=1&var-datacenter=1&var-node=All'],
+        metric          => 'highestMax(exclude(cassandra.restbase10*.org.apache.cassandra.metrics.ThreadPools.internal.*.PendingTasks.value, "CompactionExecutor"), 1)',
+        from            => '10min',
+        warning         => '500',
+        critical        => '1000',
+        percentage      => '50',
+        contact_group   => 'team-services',
     }
 
     monitoring::graphite_threshold { 'restbase_cassandra_highest_dropped_messages':
-        description    => 'RESTBase Cassandra highest dropped message rate',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-dropped-messages?panelId=35&fullscreen&orgId=1&var-datacenter=1',
-        metric         => 'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.DroppedMessage.*.Dropped.1MinuteRate, 1)',
-        from           => '10min',
-        warning        => '50',
-        critical       => '100',
-        percentage     => '50',
-        contact_group  => 'team-services',
+        description     => 'RESTBase Cassandra highest dropped message rate',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/restbase-cassandra-dropped-messages?panelId=35&fullscreen&orgId=1&var-datacenter=1'],
+        metric          => 'highestMax(cassandra.restbase10*.org.apache.cassandra.metrics.DroppedMessage.*.Dropped.1MinuteRate, 1)',
+        from            => '10min',
+        warning         => '50',
+        critical        => '100',
+        percentage      => '50',
+        contact_group   => 'team-services',
     }
 }
diff --git a/modules/swift/manifests/monitoring/graphite_alerts.pp b/modules/swift/manifests/monitoring/graphite_alerts.pp
index 18a6554..d3113c7 100644
--- a/modules/swift/manifests/monitoring/graphite_alerts.pp
+++ b/modules/swift/manifests/monitoring/graphite_alerts.pp
@@ -3,7 +3,7 @@
 ) {
     monitoring::graphite_threshold { "swift_${cluster}_dispersion_object":
         description    => "swift ${cluster} object availability",
-        dashboard_link => "https://grafana.wikimedia.org/dashboard/file/swift.json?panelId=8&fullscreen&orgId=1&var-DC=${cluster}",
+        dashboard_links => ["https://grafana.wikimedia.org/dashboard/file/swift.json?panelId=8&fullscreen&orgId=1&var-DC=${cluster}"],
         metric         => "keepLastValue(swift.${cluster}.dispersion.object.pct_found)",
         from           => '1hours',
         warning        => 95,
@@ -14,7 +14,7 @@
 
     monitoring::graphite_threshold { "swift_${cluster}_dispersion_container}":
         description    => "swift ${cluster} container availability",
-        dashboard_link => "https://grafana.wikimedia.org/dashboard/file/swift.json?panelId=8&fullscreen&orgId=1&var-DC=${cluster}",
+        dashboard_links => ["https://grafana.wikimedia.org/dashboard/file/swift.json?panelId=8&fullscreen&orgId=1&var-DC=${cluster}"],
         metric         => "keepLastValue(swift.${cluster}.dispersion.container.pct_found)",
         from           => '30min',
         warning        => 92,
@@ -25,7 +25,7 @@
 
     monitoring::graphite_threshold { "mediawiki_${cluster}_media_uploads":
         description    => "mediawiki originals uploads (hourly) for ${cluster}",
-        dashboard_link => "https://grafana.wikimedia.org/dashboard/file/swift.json?panelId=9&fullscreen&orgId=1&var-DC=${cluster}",
+        dashboard_links => ["https://grafana.wikimedia.org/dashboard/file/swift.json?panelId=9&fullscreen&orgId=1&var-DC=${cluster}"],
         metric         => "summarize(nonNegativeDerivative(keepLastValue(swift.${cluster}.containers.mw-media.originals.objects)), \"1h\")",
         from           => '5h',
         warning        => 2000,
diff --git a/modules/varnish/manifests/instance.pp b/modules/varnish/manifests/instance.pp
index b39e9f3..640cab4 100644
--- a/modules/varnish/manifests/instance.pp
+++ b/modules/varnish/manifests/instance.pp
@@ -50,13 +50,13 @@
     $prometheus_labels = "instance=~\"${::hostname}:.*\",layer=\"${inst}\""
 
     monitoring::check_prometheus { "varnish-${inst}-check-child-start":
-        description    => 'Varnish child restarted',
-        dashboard_link => "https://grafana.wikimedia.org/dashboard/db/varnish-machine-stats?orgId=1&var-server=${::hostname}&var-datasource=${::site}%20prometheus%2Fops",
-        query          => "scalar(varnish_mgt_child_start{${prometheus_labels}})",
-        method         => 'gt',
-        warning        => 1,
-        critical       => 3,
-        prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops",
+        description     => 'Varnish child restarted',
+        dashboard_links => ["https://grafana.wikimedia.org/dashboard/db/varnish-machine-stats?orgId=1&var-server=${::hostname}&var-datasource=${::site}%20prometheus%2Fops"],
+        query           => "scalar(varnish_mgt_child_start{${prometheus_labels}})",
+        method          => 'gt',
+        warning         => 1,
+        critical        => 3,
+        prometheus_url  => "http://prometheus.svc.${::site}.wmnet/ops",
     }
 
     $runtime_params = join(prefix($runtime_parameters, '-p '), ' ')
diff --git a/modules/wdqs/manifests/monitor/services.pp b/modules/wdqs/manifests/monitor/services.pp
index 29cf7a3..845cd82 100644
--- a/modules/wdqs/manifests/monitor/services.pp
+++ b/modules/wdqs/manifests/monitor/services.pp
@@ -37,13 +37,13 @@
     }
 
     monitoring::graphite_threshold { 'WDQS_Lag':
-        description    => 'High lag',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/wikidata-query-service?orgId=1&panelId=8&fullscreen',
-        metric         => "servers.${::hostname}.BlazegraphCollector.lag",
-        from           => '30min',
-        warning        => '600', # 10 minutes
-        critical       => '1800', # 30 minutes
-        percentage     => '30', # Don't freak out on spikes
-        contact_group  => hiera('contactgroups', 'admins'),
+        description     => 'High lag',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/wikidata-query-service?orgId=1&panelId=8&fullscreen'],
+        metric          => "servers.${::hostname}.BlazegraphCollector.lag",
+        from            => '30min',
+        warning         => '600', # 10 minutes
+        critical        => '1800', # 30 minutes
+        percentage      => '30', # Don't freak out on spikes
+        contact_group   => hiera('contactgroups', 'admins'),
     }
 }
diff --git a/modules/zuul/manifests/monitoring/server.pp b/modules/zuul/manifests/monitoring/server.pp
index 6773612..1b564b0 100644
--- a/modules/zuul/manifests/monitoring/server.pp
+++ b/modules/zuul/manifests/monitoring/server.pp
@@ -30,14 +30,14 @@
     }
 
     monitoring::graphite_threshold{ 'zuul_gearman_wait_queue':
-        ensure         => $ensure,
-        description    => 'Work requests waiting in Zuul Gearman server',
-        dashboard_link => 'https://grafana.wikimedia.org/dashboard/db/zuul-gearman?panelId=10&fullscreen&orgId=1',
-        metric         => 'zuul.geard.queue.waiting',
-        contact_group  => 'contint',
-        from           => '15min',
-        percentage     => 30,
-        warning        => 90,
-        critical       => 140,
+        ensure          => $ensure,
+        description     => 'Work requests waiting in Zuul Gearman server',
+        dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/zuul-gearman?panelId=10&fullscreen&orgId=1'],
+        metric          => 'zuul.geard.queue.waiting',
+        contact_group   => 'contint',
+        from            => '15min',
+        percentage      => 30,
+        warning         => 90,
+        critical        => 140,
     }
 }
--
To view, visit https://gerrit.wikimedia.org/r/392607

Gerrit-MessageType: newchange
Gerrit-Change-Id: I9d1f8b440844ad556281c0c30eac2c98422fe4ef
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Volans <rcocci...@wikimedia.org>
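
For reviewers who want the shape of the conversion in one place: every hunk above turns a scalar dashboard_link => 'url' into a single-item array dashboard_links => ['url']. Below is a minimal sketch of how a receiving define can enforce the new contract; it is an illustrative assumption, not the actual contents of modules/monitoring/manifests/graphite_threshold.pp. The define name, the Grafana regex, and the quoting helper are invented for this sketch, and empty()/join() are assumed to come from puppetlabs-stdlib.

# Illustrative sketch only -- not the real monitoring::graphite_threshold.
define monitoring::dashboard_links_sketch (
    Array[String] $dashboard_links = [],
) {
    # Refuse an empty array: every alarm must link to at least one dashboard.
    if empty($dashboard_links) {
        fail('dashboard_links requires at least one URL')
    }

    # Require the first entry to point at Grafana (assumed convention).
    if $dashboard_links[0] !~ /^https:\/\/grafana\.wikimedia\.org\// {
        fail('the first dashboard_links item must be a Grafana URL')
    }

    # Single-quote every URL, even a lone one, so Icinga renders the
    # 1-notes.gif icon instead of the default notes.gif.
    $notes_url = join($dashboard_links.map |$url| { "'${url}'" }, ' ')

    notify { "${title}: notes_url=${notes_url}": }
}

A hypothetical caller then looks exactly like the converted resources in the diff:

monitoring::dashboard_links_sketch { 'example':
    dashboard_links => ['https://grafana.wikimedia.org/dashboard/db/zuul-gearman?panelId=10&fullscreen&orgId=1'],
}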