Filippo Giunchedi has submitted this change and it was merged.
Change subject: graphite: add error alerts
......................................................................
graphite: add error alerts
provide alerts for error situations, e.g. queue dropping datapoints, too many
metric creations, file write errors
Bug: T92965
Change-Id: I0d3816e922bb749f0750d49299c56d0d2e34034c
---
M manifests/role/graphite.pp
A modules/graphite/manifests/monitoring/graphite.pp
2 files changed, 50 insertions(+), 0 deletions(-)
Approvals:
Ori.livneh: Looks good to me, but someone else must approve
Filippo Giunchedi: Verified; Looks good to me, approved
diff --git a/manifests/role/graphite.pp b/manifests/role/graphite.pp
index 4fce5df..315539f 100644
--- a/manifests/role/graphite.pp
+++ b/manifests/role/graphite.pp
@@ -228,6 +228,7 @@
include ::eventlogging::monitoring::graphite
include ::swift::monitoring::graphite
include ::swift_new::monitoring::graphite
+ include ::graphite::monitoring::graphite
# Monitor production 5xx rates
monitoring::graphite_threshold { 'reqstats_5xx':
diff --git a/modules/graphite/manifests/monitoring/graphite.pp
b/modules/graphite/manifests/monitoring/graphite.pp
new file mode 100644
index 0000000..0d4da01
--- /dev/null
+++ b/modules/graphite/manifests/monitoring/graphite.pp
@@ -0,0 +1,49 @@
+# == Class: graphite::monitoring::graphite
+#
+# Monitor a graphite stack for important vitals, namely whether we are losing
+# data and, if so, how much.
+# To that end, both "carbon-relay" (what accepts metrics from the outside) and
+# "carbon-cache" (what reads/writes datapoints from/to disk) are monitored, e.g.
+# if there is any dropping of datapoints in their queues or errors otherwise.
+
+class graphite::monitoring::graphite {
+ # is carbon-relay queue full? (i.e. dropping data)
+ monitoring::graphite_threshold { 'carbon-relay_queue_full':
+ description => 'carbon-relay queue full',
+ metric =>
'sumSeries(carbon.relays.graphite1001-*.destinations.*.fullQueueDrops)',
+ from => '10minutes',
+ warning => 200,
+ critical => 1000,
+ nagios_critical => false
+ }
+
+ # is carbon-cache able to write to disk (e.g. permissions)
+ monitoring::graphite_threshold { 'carbon-cache_write_error':
+ description => 'carbon-cache write error',
+ metric =>
'secondYAxis(sumSeries(carbon.agents.graphite1001-*.errors))',
+ from => '10minutes',
+ warning => 1,
+ critical => 8,
+ nagios_critical => false
+ }
+
+ # are carbon-cache queues overflowing their capacity?
+ monitoring::graphite_threshold { 'carbon-cache_overflow':
+ description => 'carbon-cache queues overflow',
+ metric =>
'secondYAxis(sumSeries(carbon.agents.graphite1001-*.cache.overflow))',
+ from => '10minutes',
+ warning => 1,
+ critical => 8,
+ nagios_critical => false
+ }
+
+ # are we creating too many metrics?
+ monitoring::graphite_threshold { 'carbon-cache_many_creates':
+ description => 'carbon-cache too many creates',
+ metric => 'sumSeries(carbon.agents.graphite1001-*.creates)',
+ from => '1hour',
+ warning => 500,
+ critical => 1000,
+ nagios_critical => false
+ }
+}
--
To view, visit https://gerrit.wikimedia.org/r/197352
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I0d3816e922bb749f0750d49299c56d0d2e34034c
Gerrit-PatchSet: 3
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Filippo Giunchedi <[email protected]>
Gerrit-Reviewer: Chasemp <[email protected]>
Gerrit-Reviewer: Filippo Giunchedi <[email protected]>
Gerrit-Reviewer: Ori.livneh <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits