Ottomata has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/381489 )
Change subject: Prometheus based Kafka broker alerts, take 1
......................................................................
Prometheus based Kafka broker alerts, take 1
This refactors the Prometheus JMX exporter just a bit, moving
it to a separate profile::kafka::broker::monitoring class,
along with icinga alerts.
Bug: T175923
Change-Id: I839d5de4110da245f712e23285280c2fd546fe8f
---
M hieradata/role/common/kafka/jumbo/broker.yaml
M modules/profile/manifests/kafka/broker.pp
A modules/profile/manifests/kafka/broker/monitoring.pp
3 files changed, 97 insertions(+), 48 deletions(-)
Approvals:
Ottomata: Verified; Looks good to me, approved
Elukey: Looks good to me, but someone else must approve
diff --git a/hieradata/role/common/kafka/jumbo/broker.yaml
b/hieradata/role/common/kafka/jumbo/broker.yaml
index 5fb6770..5b18607 100644
--- a/hieradata/role/common/kafka/jumbo/broker.yaml
+++ b/hieradata/role/common/kafka/jumbo/broker.yaml
@@ -2,8 +2,8 @@
profile::kafka::broker::kafka_cluster_name: jumbo
-# Enable the Prometheus JMX Exporter
-profile::kafka::broker::prometheus_monitoring_enabled: true
+# Enable Monitoring (via Prometheus) and icinga alerts
+profile::kafka::broker::monitoring_enabled: true
profile::kafka::broker::log_dirs: [/srv/kafka/data]
profile::kafka::broker::plaintext: true
@@ -28,5 +28,5 @@
profile::kafka::broker::num_recovery_threads_per_data_dir: 12
profile::kafka::broker::num_io_threads: 12
-profile::kafka::broker::replica_maxlag_warning: "1000000"
-profile::kafka::broker::replica_maxlag_critical: "5000000"
+profile::kafka::broker::monitoring::replica_maxlag_warning: 1000000
+profile::kafka::broker::monitoring::replica_maxlag_critical: 5000000
diff --git a/modules/profile/manifests/kafka/broker.pp
b/modules/profile/manifests/kafka/broker.pp
index 0ce4f1a..cb7c072 100644
--- a/modules/profile/manifests/kafka/broker.pp
+++ b/modules/profile/manifests/kafka/broker.pp
@@ -51,22 +51,14 @@
# [*nofiles_ulimit*]
# Hiera: profile::kafka::broker::nofiles_ulimit
#
-# [*replica_maxlag_warning*]
-# Max messages a replica can lag before a warning alert is generated.
-# Hiera: profile::kafka::broker::replica_maxlag_warning
-#
-# [*replica_maxlag_critical*]
-# Mac messages a replica can lag before a critical alert is generated.
-# Hiera: profile::kafka::broker::replica_maxlag_critical
-#
# [*message_max_bytes*]
# The largest record batch size allowed by Kafka.
# If this is increased and there are consumers older
# than 0.10.2, the consumers' fetch size must also be increased
# so that the they can fetch record batches this large.
#
-# [*prometheus_monitoring_enabled*]
-# Enable the Prometheus jmx exporter.
+# [*monitoring_enabled*]
+# Enable monitoring and alerts for this broker.
#
class profile::kafka::broker(
$kafka_cluster_name =
hiera('profile::kafka::broker::kafka_cluster_name'),
@@ -83,12 +75,9 @@
$num_io_threads =
hiera('profile::kafka::broker::num_io_threads'),
$num_replica_fetchers =
hiera('profile::kafka::broker::num_replica_fetchers'),
$nofiles_ulimit =
hiera('profile::kafka::broker::nofiles_ulimit'),
- $replica_maxlag_warning =
hiera('profile::kafka::broker::replica_maxlag_warning'),
- $replica_maxlag_critical =
hiera('profile::kafka::broker::replica_maxlag_critical'),
# This is set via top level hiera variable so it can be synchronized
between roles and clients.
$message_max_bytes = hiera('kafka_message_max_bytes'),
- $prometheus_monitoring_enabled =
hiera('profile::kafka::broker::prometheus_monitoring_enabled'),
- $prometheus_nodes = hiera('prometheus_nodes'),
+ $monitoring_enabled =
hiera('profile::kafka::broker::monitoring_enabled'),
) {
# TODO: WIP
$tls_secrets_path = undef
@@ -185,36 +174,13 @@
java_home => '/usr/lib/jvm/java-8-openjdk-amd64',
}
- if $prometheus_monitoring_enabled {
- # Allow automatic generation of config on the
- # Prometheus master
- prometheus::jmx_exporter_instance { $::hostname:
- address => $::ipaddress,
- port => 7800,
- }
-
- $prometheus_nodes_ferm = join($prometheus_nodes, ' ')
- ferm::service { 'kafka-broker-jmx_exporter':
- proto => 'tcp',
- port => '7800',
- srange => "@resolve((${prometheus_nodes_ferm}))",
- }
-
- require_package('prometheus-jmx-exporter')
-
- $jmx_exporter_config_file =
'/etc/kafka/broker_prometheus_jmx_exporter.yaml'
- $java_opts =
"-javaagent:/usr/share/java/prometheus/jmx_prometheus_javaagent.jar=${::ipaddress}:7800:${jmx_exporter_config_file}"
-
- # Create the Prometheus JMX Exporter configuration
- file { $jmx_exporter_config_file:
- ensure => present,
- source =>
'puppet:///modules/profile/kafka/broker_prometheus_jmx_exporter.yaml',
- owner => 'kafka',
- group => 'kafka',
- mode => '0400',
- require => Class['::confluent::kafka::broker'],
- }
- } else {
+ # If monitoring is enabled, then include the monitoring profile and set
$java_opts
+ # for exposing the Prometheus JMX Exporter in the Kafka Broker process.
+ if $monitoring_enabled {
+ include ::profile::kafka::broker::monitoring
+ $java_opts = $::profile::kafka::broker::monitoring::java_opts
+ }
+ else {
$java_opts = undef
}
diff --git a/modules/profile/manifests/kafka/broker/monitoring.pp
b/modules/profile/manifests/kafka/broker/monitoring.pp
new file mode 100644
index 0000000..8b0a096
--- /dev/null
+++ b/modules/profile/manifests/kafka/broker/monitoring.pp
@@ -0,0 +1,83 @@
+# Class: profile::kafka::broker::monitoring
+#
+# Sets up Prometheus based monitoring and icinga alerts.
+#
+# [*replica_maxlag_warning*]
+# Max messages a replica can lag before a warning alert is generated.
+# Hiera: profile::kafka::broker::monitoring::replica_maxlag_warning
+#
+# [*replica_maxlag_critical*]
+# Max messages a replica can lag before a critical alert is generated.
+# Hiera: profile::kafka::broker::monitoring::replica_maxlag_critical
+#
+class profile::kafka::broker::monitoring (
+ $cluster = hiera('cluster'),
+ $prometheus_nodes = hiera('prometheus_nodes'),
+ $replica_maxlag_warning =
hiera('profile::kafka::broker::monitoring::replica_maxlag_warning'),
+ $replica_maxlag_critical =
hiera('profile::kafka::broker::monitoring::replica_maxlag_critical'),
+) {
+ ### Expose Kafka Broker JMX metrics to Prometheus
+ require_package('prometheus-jmx-exporter')
+
+ $prometheus_jmx_exporter_port = 7800
+ $jmx_exporter_config_file =
'/etc/kafka/broker_prometheus_jmx_exporter.yaml'
+
+ # Use this in the JAVA_OPTS you pass to the Kafka broker process
+ $java_opts =
"-javaagent:/usr/share/java/prometheus/jmx_prometheus_javaagent.jar=${::ipaddress}:${prometheus_jmx_exporter_port}:${jmx_exporter_config_file}"
+
+ # Create the Prometheus JMX Exporter configuration
+ file { $jmx_exporter_config_file:
+ ensure => present,
+ source =>
'puppet:///modules/profile/kafka/broker_prometheus_jmx_exporter.yaml',
+ owner => 'kafka',
+ group => 'kafka',
+ mode => '0400',
+ # Require this to make sure that kafka user and group are already
created.
+ require => Class['::confluent::kafka::broker'],
+ }
+
+ # Allow automatic generation of config on the Prometheus master
+ prometheus::jmx_exporter_instance { $::hostname:
+ address => $::ipaddress,
+ port => $prometheus_jmx_exporter_port,
+ }
+
+ $prometheus_nodes_ferm = join($prometheus_nodes, ' ')
+ ferm::service { 'kafka-broker-jmx_exporter':
+ proto => 'tcp',
+ port => '7800',
+ srange => "@resolve((${prometheus_nodes_ferm}))",
+ }
+
+
+ ### Icinga alerts
+ # Generate icinga alert if Kafka Broker Server is not running.
+ nrpe::monitor_service { 'kafka':
+ description => 'Kafka Broker Server',
+ nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a
"Kafka /etc/kafka/server.properties"',
+ critical => true,
+ }
+
+ # Prometheus labels for this Kafka Broker instance
+ $prometheus_labels =
"cluster=${cluster},instance=${::hostname}:${prometheus_jmx_exporter_port},job=jmx_kafka"
+
+ # Alert on the average number of under replicated partitions over the last
30 minutes.
+ #
https://grafana.wikimedia.org/dashboard/db/prometheus-kafka?panelId=29&fullscreen
+ monitoring::check_prometheus { 'kafka_broker_under_replicated_partitions':
+ description => 'Kafka Broker Under Replicated Partitions',
+ query =>
"scalar(avg_over_time(kafka_server_replicamanager_underreplicatedpartitions{${prometheus_labels}}[30m]))",
+ warning => 5,
+ critical => 10,
+ prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops",
+ }
+
+ # Alert on the average max replica lag over the last 30 minutes.
+ #
https://grafana.wikimedia.org/dashboard/db/prometheus-kafka?panelId=16&fullscreen
+ monitoring::check_prometheus { 'kafka_broker_replica_max_lag':
+ description => 'Kafka Broker Replica Max Lag',
+ query =>
"scalar(avg_over_time(kafka_server_replicafetchermanager_maxlag{${prometheus_labels}}[30m]))",
+ warning => $replica_maxlag_warning,
+ critical => $replica_maxlag_critical,
+ prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops",
+ }
+}
\ No newline at end of file
--
To view, visit https://gerrit.wikimedia.org/r/381489
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I839d5de4110da245f712e23285280c2fd546fe8f
Gerrit-PatchSet: 12
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Filippo Giunchedi <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits