Milimetric has uploaded a new change for review. https://gerrit.wikimedia.org/r/316567
Change subject: Correct and simplify EventLogging monitoring ...................................................................... Correct and simplify EventLogging monitoring EventLogging monitoring was incorrectly calculating the difference between valid and invalid events. The valid-event metric included EventError, which is the Kafka topic where invalid events are sent. This change fixes that, and also removes server-side events from the monitoring and their mention in role::eventlogging. Bug: T147321 Change-Id: I8b7aadecb9cf2ef43f2b7a4a638d797271dfac9e --- M manifests/role/eventlogging.pp M modules/eventlogging/manifests/monitoring/graphite.pp 2 files changed, 12 insertions(+), 18 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/67/316567/1 diff --git a/manifests/role/eventlogging.pp b/manifests/role/eventlogging.pp index 2845f8d..f3086ce 100644 --- a/manifests/role/eventlogging.pp +++ b/manifests/role/eventlogging.pp @@ -55,9 +55,8 @@ # to your query params. $kafka_base_uri = inline_template('kafka:///<%= @kafka_brokers_array.join(":9092,") + ":9092" %>') - # Read in server side and client side raw events from - # Kafka, process them, and send events to schema - # based topics in Kafka. + # Read in raw events from Kafka, process them, and send them to + # the schema corresponding to their topic in Kafka. 
$kafka_schema_uri = "${kafka_base_uri}?topic=eventlogging_{schema}" # The downstream eventlogging MySQL consumer expects schemas to be @@ -70,7 +69,6 @@ default => "${kafka_base_uri}?topic=eventlogging-valid-mixed&blacklist=${mixed_schema_blacklist}" } - $kafka_server_side_raw_uri = "${kafka_base_uri}?topic=eventlogging-server-side" $kafka_client_side_raw_uri = "${kafka_base_uri}?topic=eventlogging-client-side" # This check was written for eventlog1001, so only include it there., diff --git a/modules/eventlogging/manifests/monitoring/graphite.pp b/modules/eventlogging/manifests/monitoring/graphite.pp index 5fc9dd0..67dc0a3 100644 --- a/modules/eventlogging/manifests/monitoring/graphite.pp +++ b/modules/eventlogging/manifests/monitoring/graphite.pp @@ -9,8 +9,9 @@ # kafka::server::jmxtrans # class eventlogging::monitoring::graphite($kafka_brokers_graphite_wildcard) { - $raw_events_rate_metric = "sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.{eventlogging-client-side,eventlogging-server-side}.OneMinuteRate)" - $valid_events_rate_metric = "sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.eventlogging_*.OneMinuteRate)" + $raw_events_rate_metric = "sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.eventlogging-client-side.OneMinuteRate)" + $invalid_events_rate_metric = "sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.eventlogging_EventError.OneMinuteRate)" + $navigation_timing_events_rate_metric = "sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.eventlogging_NavigationTiming.OneMinuteRate)" # Warn if 15% of overall event throughput goes beyond 1000 events/s # in a 15 
min period. @@ -28,7 +29,6 @@ # Alarms if 15% of Navigation Timing event throughput goes under 1 req/sec # in a 15 min period # https://meta.wikimedia.org/wiki/Schema:NavigationTiming - $navigation_timing_events_rate_metric = "sumSeries(kafka.cluster.analytics-eqiad.kafka.${kafka_brokers_graphite_wildcard}.kafka.server.BrokerTopicMetrics.MessagesInPerSec.eventlogging_NavigationTiming.OneMinuteRate)" monitoring::graphite_threshold { 'eventlogging_NavigationTiming_throughput': description => 'Throughput of EventLogging NavigationTiming events', metric => $navigation_timing_events_rate_metric, @@ -40,19 +40,15 @@ under => true } - # Warn/Alert if the difference between raw and valid EventLogging - # alerts gets too big. We put a 10 minute lag because of metrics - # not being correct in graphite before. - # If the difference gets too big, either the validation step is - # overloaded, or high volume schemas are failing validation. - monitoring::graphite_threshold { 'eventlogging_difference_raw_validated': - description => 'Difference between raw and validated EventLogging overall message rates', - metric => "absolute(diffSeries(${raw_events_rate_metric},${valid_events_rate_metric}))", + # Warn if 15% of overall invalid event throughput goes above 20 events/s + # in a 15 minute period. 
+ monitoring::graphite_threshold { 'eventlogging_EventError_throughput': + description => 'Throughput of invalid EventLogging events', + metric => $invalid_events_rate_metric, warning => 20, critical => 30, - percentage => 20, # At least 3 of the (25 - 10) = 15 readings - from => '25min', - until => '10min', + percentage => 15, # At least 3 of the (25 - 10) = 15 readings + from => '15min', contact_group => 'analytics', } -- To view, visit https://gerrit.wikimedia.org/r/316567 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I8b7aadecb9cf2ef43f2b7a4a638d797271dfac9e Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Milimetric <dandree...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits