Ottomata has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/219465

Change subject: Use graphite_threshold instead of ganglia for Kafka alerts
......................................................................

Use graphite_threshold instead of ganglia for Kafka alerts

Change-Id: I7fbf2826f73b04b2a8d642b9bd253829754db5d2
---
M manifests/role/analytics/kafka.pp
1 file changed, 8 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/65/219465/1

diff --git a/manifests/role/analytics/kafka.pp b/manifests/role/analytics/kafka.pp
index b77e7d8..f5b4feb 100644
--- a/manifests/role/analytics/kafka.pp
+++ b/manifests/role/analytics/kafka.pp
@@ -211,6 +211,8 @@
         $nagios_servicegroup = 'analytics_eqiad'
 
         monitoring::ganglia { 'kafka-broker-MessagesIn':
+            # This alert is handled globally across all brokers via graphite_anomaly
+            ensure      => 'absent',
             description => 'Kafka Broker Messages In',
             metric      => 'kafka.server.BrokerTopicMetrics.AllTopicsMessagesInPerSec.FifteenMinuteRate',
             warning     => ':1500.0',
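
For reference: the graphite_threshold checks introduced below pass description,
metric, warning, critical, percentage, and group. The following is only a sketch
of the define's interface as inferred from that usage, not the actual definition
in modules/monitoring; the default values and the percentage semantics are
assumptions:

    # Sketch only: inferred from usage in this change, not the real
    # modules/monitoring code. Defaults and comments are assumptions.
    define monitoring::graphite_threshold(
        $description,        # human-readable service description for Icinga
        $metric,             # Graphite metric path to evaluate
        $warning,            # WARNING threshold
        $critical,           # CRITICAL threshold
        $percentage = 5,     # assumed: percent of sampled datapoints that must
                             # exceed the threshold before the check triggers
        $group      = undef, # Nagios servicegroup for the check
    ) {
        # Assumed to define an Icinga service that queries Graphite for
        # $metric and compares recent datapoints against the thresholds.
    }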
@@ -241,27 +243,29 @@
         # Alert if any Kafka has under replicated partitions.
         # If it does, this means a broker replica is falling behind
         # and will be removed from the ISR.
-        monitoring::ganglia { 'kafka-broker-UnderReplicatedPartitions':
+        monitoring::graphite_threshold { 'kafka-broker-UnderReplicatedPartitions':
             description => 'Kafka Broker Under Replicated Partitions',
-            metric      => 'kafka.server.ReplicaManager.UnderReplicatedPartitions.Value',
+            metric      => "kafka.${::hostname}_${::site}_wmnet_${::kafka::server::jmx_port}.kafka.server.ReplicaManager.UnderReplicatedPartitions.Value",
             # Any under replicated partitions are bad.
             # Over 10 means (probably) that at least an entire topic
             # is under replicated.
             warning     => '1',
             critical    => '10',
+            percentage  => 10,
             require     => Class['::kafka::server::jmxtrans'],
             group       => $nagios_servicegroup,
         }
 
         # Alert if any Kafka Broker replica lag is too high
-        monitoring::ganglia { 'kafka-broker-Replica-MaxLag':
+        monitoring::graphite_threshold { 'kafka-broker-Replica-MaxLag':
             description => 'Kafka Broker Replica Lag',
-            metric      => 'kafka.server.ReplicaFetcherManager.Replica-MaxLag.Value',
+            metric      => "kafka.${::hostname}_${::site}_wmnet_${::kafka::server::jmx_port}.kafka.server.ReplicaFetcherManager.Replica-MaxLag.Value",
             # As of 2014-02, replag could catch up at more than 1000 msgs / sec
             # (probably more like 2 or 3 K / second). At that rate, 1M messages
             # behind should catch back up in under 30 minutes.
             warning     => '1000000',
             critical    => '5000000',
+            percentage  => 10,
             require     => Class['::kafka::server::jmxtrans'],
             group       => $nagios_servicegroup,
         }
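
The new metric paths embed each broker's hostname, site, and JMX port, matching
how the jmxtrans-reported Kafka metrics are namespaced in Graphite. With
hypothetical fact values, chosen only for illustration, the
UnderReplicatedPartitions path resolves like this:

    # Hypothetical values, for illustration only:
    #   $::hostname                = 'analytics1021'
    #   $::site                    = 'eqiad'
    #   $::kafka::server::jmx_port = 9999
    $metric = "kafka.${::hostname}_${::site}_wmnet_${::kafka::server::jmx_port}.kafka.server.ReplicaManager.UnderReplicatedPartitions.Value"
    # => 'kafka.analytics1021_eqiad_wmnet_9999.kafka.server.ReplicaManager.UnderReplicatedPartitions.Value'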

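The catch-up reasoning behind the Replica-MaxLag thresholds, made explicit using
the conservative 1000 msgs / sec rate cited in the comment above:

    t_{\text{catchup}} = \frac{\text{lag}}{\text{rate}}

    \text{warning:}\  \frac{1\,000\,000\ \text{msgs}}{1000\ \text{msgs/s}} = 1000\ \text{s} \approx 17\ \text{min}

    \text{critical:}\ \frac{5\,000\,000\ \text{msgs}}{1000\ \text{msgs/s}} = 5000\ \text{s} \approx 83\ \text{min}

So even the critical threshold represents under an hour and a half of catch-up
at the slowest observed rate.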
-- 
To view, visit https://gerrit.wikimedia.org/r/219465
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7fbf2826f73b04b2a8d642b9bd253829754db5d2
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <o...@wikimedia.org>
