Filippo Giunchedi has uploaded a new change for review.
https://gerrit.wikimedia.org/r/197352
Change subject: graphite: add error alerts
......................................................................
graphite: add error alerts
provide alerts for error situations, e.g. queue dropping datapoints, too many
metric creations, file write errors
Bug: T92965
Change-Id: I0d3816e922bb749f0750d49299c56d0d2e34034c
---
M manifests/role/graphite.pp
A modules/graphite/manifests/monitoring/graphite.pp
2 files changed, 38 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/52/197352/1
diff --git a/manifests/role/graphite.pp b/manifests/role/graphite.pp
index 4fce5df..315539f 100644
--- a/manifests/role/graphite.pp
+++ b/manifests/role/graphite.pp
@@ -228,6 +228,7 @@
include ::eventlogging::monitoring::graphite
include ::swift::monitoring::graphite
include ::swift_new::monitoring::graphite
+ include ::graphite::monitoring::graphite
# Monitor production 5xx rates
monitoring::graphite_threshold { 'reqstats_5xx':
diff --git a/modules/graphite/manifests/monitoring/graphite.pp b/modules/graphite/manifests/monitoring/graphite.pp
new file mode 100644
index 0000000..d4ef199
--- /dev/null
+++ b/modules/graphite/manifests/monitoring/graphite.pp
@@ -0,0 +1,37 @@
+class graphite::monitoring::graphite {
+ monitoring::graphite_threshold { 'carbon-relay queue full':
+ description => 'carbon-relay queue full',
+ metric => 'sumSeries(carbon.relays.graphite1001-*.destinations.*.fullQueueDrops)',
+ from => '10minutes',
+ warning => 200,
+ critical => 1000,
+ nagios_critical => false
+ }
+
+ monitoring::graphite_threshold { 'carbon-cache write error':
+ description => 'carbon-cache write error',
+ metric => 'secondYAxis(sumSeries(carbon.agents.graphite1001-*.errors))',
+ from => '10minutes',
+ warning => 1,
+ critical => 8,
+ nagios_critical => false
+ }
+
+ monitoring::graphite_threshold { 'carbon-cache overflows':
+ description => 'carbon-cache queues overflow',
+ metric => 'secondYAxis(sumSeries(carbon.agents.graphite1001-*.cache.overflow))',
+ from => '10minutes',
+ warning => 1,
+ critical => 8,
+ nagios_critical => false
+ }
+
+ monitoring::graphite_threshold { 'carbon-cache creates':
+ description => 'carbon-cache too many creates',
+ metric => 'sumSeries(carbon.agents.graphite1001-*.creates)',
+ from => '1hour',
+ warning => 200,
+ critical => 1000,
+ nagios_critical => false
+ }
+}
--
To view, visit https://gerrit.wikimedia.org/r/197352
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I0d3816e922bb749f0750d49299c56d0d2e34034c
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Filippo Giunchedi <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits