Ottomata has uploaded a new change for review.
https://gerrit.wikimedia.org/r/66241
Change subject: Adding alerts for webrequest data loss in HDFS
......................................................................
Adding alerts for webrequest data loss in HDFS
Change-Id: If91ce8badded15a2d15e8a0be42735ebe80f5968
---
M manifests/misc/analytics.pp
M manifests/misc/monitoring.pp
M templates/icinga/checkcommands.cfg.erb
3 files changed, 28 insertions(+), 1 deletion(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/41/66241/1
diff --git a/manifests/misc/analytics.pp b/manifests/misc/analytics.pp
index 66655da..bea13d8 100644
--- a/manifests/misc/analytics.pp
+++ b/manifests/misc/analytics.pp
@@ -35,4 +35,4 @@
check_command =>
"check_kafka_broker_produce_requests!3!2",
contact_group => "analytics",
}
-}
\ No newline at end of file
+}
diff --git a/manifests/misc/monitoring.pp b/manifests/misc/monitoring.pp
index 5f6c6c1..a2ec64c 100644
--- a/manifests/misc/monitoring.pp
+++ b/manifests/misc/monitoring.pp
@@ -58,6 +58,20 @@
source =>
"puppet:///files/ganglia/plugins/kraken_webrequest_loss.pyconf",
notify => Service[gmond];
}
+
+ # Set up icinga monitoring of Kraken HDFS data loss.
+ monitor_service { "kraken_webrequest_loss_average_positive":
+ description => "webrequest_loss_average_positive",
+ check_command =>
"check_kraken_webrequest_loss_positive!2!8",
+ contact_group => "analytics",
+ }
+ # It is possible to have negative data loss. This would mean that
+ # we are receiving duplicate log lines. We need alerts for this too.
+ monitor_service { "kraken_webrequest_loss_average_negative":
+ description => "webrequest_loss_average_negative",
+ check_command =>
"check_kraken_webrequest_loss_negative!-2!-8",
+ contact_group => "analytics",
+ }
}
# Ganglia views that should be
diff --git a/templates/icinga/checkcommands.cfg.erb
b/templates/icinga/checkcommands.cfg.erb
index 830ba4a..b156f7e 100644
--- a/templates/icinga/checkcommands.cfg.erb
+++ b/templates/icinga/checkcommands.cfg.erb
@@ -621,4 +621,17 @@
command_line $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$
-m kafka_network_SocketServerStats.ProduceRequestsPerSecond -w $ARG1$ -c $ARG2$
-o lt
}
+# Alerts for data loss in Kraken HDFS.
+define command{
+ command_name check_kraken_webrequest_loss_positive
+ command_line $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$
-m webrequest_loss_average -w $ARG1$ -c $ARG2$ -o gt
+}
+
+# Data loss percentage CAN be negative if we receive duplicate traffic
+# (this has happened before). We need an extra alert if the percentage goes
negative.
+define command{
+ command_name check_kraken_webrequest_loss_negative
+ command_line $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$
-m webrequest_loss_average -w $ARG1$ -c $ARG2$ -o lt
+}
+
--
To view, visit https://gerrit.wikimedia.org/r/66241
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: If91ce8badded15a2d15e8a0be42735ebe80f5968
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits