Milimetric has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/316567

Change subject: Correct and simplify EventLogging monitoring
......................................................................

Correct and simplify EventLogging monitoring

EventLogging monitoring was incorrectly calculating the difference
between valid and invalid events.  The valid event metric included
EventError, which is a Kafka topic where invalid events are sent.  This
fixes that and also removes server-side event monitoring, along with
any mention of server-side events in role::eventlogging.

Bug: T147321
Change-Id: I8b7aadecb9cf2ef43f2b7a4a638d797271dfac9e
---
M manifests/role/eventlogging.pp
M modules/eventlogging/manifests/monitoring/graphite.pp
2 files changed, 12 insertions(+), 18 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/67/316567/1

diff --git a/manifests/role/eventlogging.pp b/manifests/role/eventlogging.pp
index 2845f8d..f3086ce 100644
--- a/manifests/role/eventlogging.pp
+++ b/manifests/role/eventlogging.pp
@@ -55,9 +55,8 @@
     # to your query params.
     $kafka_base_uri    = inline_template('kafka:///<%= 
@kafka_brokers_array.join(":9092,") + ":9092" %>')
 
-    # Read in server side and client side raw events from
-    # Kafka, process them, and send events to schema
-    # based topics in Kafka.
+    # Read in raw events from Kafka, process them, and send them to
+    # the Kafka topic corresponding to their schema.
     $kafka_schema_uri  = "${kafka_base_uri}?topic=eventlogging_{schema}"
 
     # The downstream eventlogging MySQL consumer expects schemas to be
@@ -70,7 +69,6 @@
         default => 
"${kafka_base_uri}?topic=eventlogging-valid-mixed&blacklist=${mixed_schema_blacklist}"
     }
 
-    $kafka_server_side_raw_uri = 
"${kafka_base_uri}?topic=eventlogging-server-side"
     $kafka_client_side_raw_uri = 
"${kafka_base_uri}?topic=eventlogging-client-side"
 
     # This check was written for eventlog1001, so only include it there.,
diff --git a/modules/eventlogging/manifests/monitoring/graphite.pp 
b/modules/eventlogging/manifests/monitoring/graphite.pp
index 5fc9dd0..67dc0a3 100644
--- a/modules/eventlogging/manifests/monitoring/graphite.pp
+++ b/modules/eventlogging/manifests/monitoring/graphite.pp
@@ -9,8 +9,9 @@
 #                                    kafka::server::jmxtrans
 #
 class eventlogging::monitoring::graphite($kafka_brokers_graphite_wildcard) {
-    $raw_events_rate_metric   = 
"sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.{eventlogging-client-side,eventlogging-server-side}.OneMinuteRate)"
-    $valid_events_rate_metric = 
"sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.eventlogging_*.OneMinuteRate)"
+    $raw_events_rate_metric   = 
"sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.eventlogging-client-side.OneMinuteRate)"
+    $invalid_events_rate_metric = 
"sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.eventlogging_EventError.OneMinuteRate)"
+    $navigation_timing_events_rate_metric = 
"sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.eventlogging_NavigationTiming.OneMinuteRate)"
 
     # Warn if 15% of overall event throughput goes beyond 1000 events/s
     # in a 15 min period.
@@ -28,7 +29,6 @@
     # Alarms if 15% of Navigation Timing event throughput goes under 1 req/sec
     # in a 15 min period
     # https://meta.wikimedia.org/wiki/Schema:NavigationTiming
-    $navigation_timing_events_rate_metric = 
"sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.eventlogging_NavigationTiming.OneMinuteRate)"
     monitoring::graphite_threshold { 
'eventlogging_NavigationTiming_throughput':
         description   => 'Throughput of EventLogging NavigationTiming events',
         metric        => $navigation_timing_events_rate_metric,
@@ -40,19 +40,15 @@
         under         => true
     }
 
-    # Warn/Alert if the difference between raw and valid EventLogging
-    # alerts gets too big.  We put a 10 minute lag because of metrics
-    # not being correct in graphite before.
-    # If the difference gets too big, either the validation step is
-    # overloaded, or high volume schemas are failing validation.
-    monitoring::graphite_threshold { 'eventlogging_difference_raw_validated':
-        description   => 'Difference between raw and validated EventLogging 
overall message rates',
-        metric        => 
"absolute(diffSeries(${raw_events_rate_metric},${valid_events_rate_metric}))",
+    # Warn if 15% of overall invalid event throughput goes above 20 events/s
+    # in a 15 minute period.
+    monitoring::graphite_threshold { 'eventlogging_EventError_throughput':
+        description   => 'Throughput of invalid EventLogging events',
+        metric        => $invalid_events_rate_metric,
         warning       => 20,
         critical      => 30,
-        percentage    => 20, # At least 3 of the (25 - 10) = 15 readings
-        from          => '25min',
-        until         => '10min',
+        percentage    => 15, # 15% of the readings in the 15 minute window
+        from          => '15min',
         contact_group => 'analytics',
     }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/316567
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I8b7aadecb9cf2ef43f2b7a4a638d797271dfac9e
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Milimetric <dandree...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to