Filippo Giunchedi has submitted this change and it was merged.

Change subject: graphite: add error alerts
......................................................................


graphite: add error alerts

provide alerts for error situations, e.g. queue dropping datapoints, too many
metric creations, file write errors

Bug: T92965
Change-Id: I0d3816e922bb749f0750d49299c56d0d2e34034c
---
M manifests/role/graphite.pp
A modules/graphite/manifests/monitoring/graphite.pp
2 files changed, 50 insertions(+), 0 deletions(-)

Approvals:
  Ori.livneh: Looks good to me, but someone else must approve
  Filippo Giunchedi: Verified; Looks good to me, approved



diff --git a/manifests/role/graphite.pp b/manifests/role/graphite.pp
index 4fce5df..315539f 100644
--- a/manifests/role/graphite.pp
+++ b/manifests/role/graphite.pp
@@ -228,6 +228,7 @@
     include ::eventlogging::monitoring::graphite
     include ::swift::monitoring::graphite
     include ::swift_new::monitoring::graphite
+    include ::graphite::monitoring::graphite
 
     # Monitor production 5xx rates
     monitoring::graphite_threshold { 'reqstats_5xx':
diff --git a/modules/graphite/manifests/monitoring/graphite.pp 
b/modules/graphite/manifests/monitoring/graphite.pp
new file mode 100644
index 0000000..0d4da01
--- /dev/null
+++ b/modules/graphite/manifests/monitoring/graphite.pp
@@ -0,0 +1,49 @@
+# == Class: graphite::monitoring::graphite
+#
+# Monitor a graphite stack for important vitals, namely whether we are losing
+# data and how much.
+# To that end, both "carbon-relay" (which accepts metrics from the outside) and
+# "carbon-cache" (which reads/writes datapoints from/to disk) are monitored,
+# e.g. for dropped datapoints in their queues or other errors.
+
+class graphite::monitoring::graphite {
+    # Is carbon-relay dropping datapoints because a destination queue is full (fullQueueDrops)?
+    monitoring::graphite_threshold { 'carbon-relay_queue_full':
+        description     => 'carbon-relay queue full',
+        metric          => 
'sumSeries(carbon.relays.graphite1001-*.destinations.*.fullQueueDrops)',
+        from            => '10minutes',
+        warning         => 200,
+        critical        => 1000,
+        nagios_critical => false
+    }
+
+    # Is carbon-cache reporting errors (e.g. unable to write to disk due to permissions)?
+    monitoring::graphite_threshold { 'carbon-cache_write_error':
+        description     => 'carbon-cache write error',
+        metric          => 
'secondYAxis(sumSeries(carbon.agents.graphite1001-*.errors))',
+        from            => '10minutes',
+        warning         => 1,
+        critical        => 8,
+        nagios_critical => false
+    }
+
+    # Are the carbon-cache queues overflowing their capacity (summed cache.overflow)?
+    monitoring::graphite_threshold { 'carbon-cache_overflow':
+        description     => 'carbon-cache queues overflow',
+        metric          => 
'secondYAxis(sumSeries(carbon.agents.graphite1001-*.cache.overflow))',
+        from            => '10minutes',
+        warning         => 1,
+        critical        => 8,
+        nagios_critical => false
+    }
+
+    # Are too many new metrics being created (summed creates, from => '1hour')?
+    monitoring::graphite_threshold { 'carbon-cache_many_creates':
+        description     => 'carbon-cache too many creates',
+        metric          => 'sumSeries(carbon.agents.graphite1001-*.creates)',
+        from            => '1hour',
+        warning         => 500,
+        critical        => 1000,
+        nagios_critical => false
+    }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/197352
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I0d3816e922bb749f0750d49299c56d0d2e34034c
Gerrit-PatchSet: 3
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Filippo Giunchedi <[email protected]>
Gerrit-Reviewer: Chasemp <[email protected]>
Gerrit-Reviewer: Filippo Giunchedi <[email protected]>
Gerrit-Reviewer: Ori.livneh <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to