Ottomata has uploaded a new change for review. https://gerrit.wikimedia.org/r/219465
Change subject: Use graphite_threshold instead of ganglia for Kafka alerts ...................................................................... Use graphite_threshold instead of ganglia for Kafka alerts Change-Id: I7fbf2826f73b04b2a8d642b9bd253829754db5d2 --- M manifests/role/analytics/kafka.pp 1 file changed, 8 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/65/219465/1 diff --git a/manifests/role/analytics/kafka.pp b/manifests/role/analytics/kafka.pp index b77e7d8..f5b4feb 100644 --- a/manifests/role/analytics/kafka.pp +++ b/manifests/role/analytics/kafka.pp @@ -211,6 +211,8 @@ $nagios_servicegroup = 'analytics_eqiad' monitoring::ganglia { 'kafka-broker-MessagesIn': + # This alert is handled globally across all brokers via graphite_anomaly + ensure => 'absent', description => 'Kafka Broker Messages In', metric => 'kafka.server.BrokerTopicMetrics.AllTopicsMessagesInPerSec.FifteenMinuteRate', warning => ':1500.0', @@ -241,27 +243,29 @@ # Alert if any Kafka has under replicated partitions. # If it does, this means a broker replica is falling behind # and will be removed from the ISR. - monitoring::ganglia { 'kafka-broker-UnderReplicatedPartitions': + monitoring::graphite_threshold { 'kafka-broker-UnderReplicatedPartitions': description => 'Kafka Broker Under Replicated Partitions', - metric => 'kafka.server.ReplicaManager.UnderReplicatedPartitions.Value', + metric => "kafka.${::hostname}_${::site}_wmnet_${::kafka::server::jmx_port}.kafka.server.ReplicaManager.UnderReplicatedPartitions.Value", # Any under replicated partitions are bad. # Over 10 means (probably) that at least an entire topic # is under replicated. 
warning => '1', critical => '10', + percentage => 10, require => Class['::kafka::server::jmxtrans'], group => $nagios_servicegroup, } # Alert if any Kafka Broker replica lag is too high - monitoring::ganglia { 'kafka-broker-Replica-MaxLag': + monitoring::graphite_threshold { 'kafka-broker-Replica-MaxLag': description => 'Kafka Broker Replica Lag', - metric => 'kafka.server.ReplicaFetcherManager.Replica-MaxLag.Value', + metric => "kafka.${::hostname}_${::site}_wmnet_${::kafka::server::jmx_port}.kafka.server.ReplicaFetcherManager.Replica-MaxLag.Value", # As of 2014-02 replag could catch up at more than 1000 msgs / sec, # (probably more like 2 or 3 K / second). At that rate, 1M messages # behind should catch back up in at least 30 minutes. warning => '1000000', critical => '5000000', + percentage => 10, require => Class['::kafka::server::jmxtrans'], group => $nagios_servicegroup, } -- To view, visit https://gerrit.wikimedia.org/r/219465 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7fbf2826f73b04b2a8d642b9bd253829754db5d2 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Ottomata <o...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits