Ottomata has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/381489 )

Change subject: Prometheus based Kafka broker alerts, take 1
......................................................................


Prometheus based Kafka broker alerts, take 1

This refactors the Prometheus JMX exporter just a bit, moving
it to a separate profile::kafka::broker::monitoring class,
along with icinga alerts.

Bug: T175923
Change-Id: I839d5de4110da245f712e23285280c2fd546fe8f
---
M hieradata/role/common/kafka/jumbo/broker.yaml
M modules/profile/manifests/kafka/broker.pp
A modules/profile/manifests/kafka/broker/monitoring.pp
3 files changed, 97 insertions(+), 48 deletions(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved
  Elukey: Looks good to me, but someone else must approve



diff --git a/hieradata/role/common/kafka/jumbo/broker.yaml 
b/hieradata/role/common/kafka/jumbo/broker.yaml
index 5fb6770..5b18607 100644
--- a/hieradata/role/common/kafka/jumbo/broker.yaml
+++ b/hieradata/role/common/kafka/jumbo/broker.yaml
@@ -2,8 +2,8 @@
 
 profile::kafka::broker::kafka_cluster_name: jumbo
 
-# Enable the Prometheus JMX Exporter
-profile::kafka::broker::prometheus_monitoring_enabled: true
+# Enable Monitoring (via Prometheus) and icinga alerts
+profile::kafka::broker::monitoring_enabled: true
 
 profile::kafka::broker::log_dirs: [/srv/kafka/data]
 profile::kafka::broker::plaintext: true
@@ -28,5 +28,5 @@
 profile::kafka::broker::num_recovery_threads_per_data_dir: 12
 profile::kafka::broker::num_io_threads: 12
 
-profile::kafka::broker::replica_maxlag_warning: "1000000"
-profile::kafka::broker::replica_maxlag_critical: "5000000"
+profile::kafka::broker::monitoring::replica_maxlag_warning: 1000000
+profile::kafka::broker::monitoring::replica_maxlag_critical: 5000000
diff --git a/modules/profile/manifests/kafka/broker.pp 
b/modules/profile/manifests/kafka/broker.pp
index 0ce4f1a..cb7c072 100644
--- a/modules/profile/manifests/kafka/broker.pp
+++ b/modules/profile/manifests/kafka/broker.pp
@@ -51,22 +51,14 @@
 # [*nofiles_ulimit*]
 #   Hiera: profile::kafka::broker::nofiles_ulimit
 #
-# [*replica_maxlag_warning*]
-#   Max messages a replica can lag before a warning alert is generated.
-#   Hiera: profile::kafka::broker::replica_maxlag_warning
-#
-# [*replica_maxlag_critical*]
-#   Mac messages a replica can lag before a critical alert is generated.
-#   Hiera: profile::kafka::broker::replica_maxlag_critical
-#
 # [*message_max_bytes*]
 #   The largest record batch size allowed by Kafka.
 #   If this is increased and there are consumers older
 #   than 0.10.2, the consumers' fetch size must also be increased
 #   so that the they can fetch record batches this large.
 #
-# [*prometheus_monitoring_enabled*]
-#   Enable the Prometheus jmx exporter.
+# [*monitoring_enabled*]
+#   Enable monitoring and alerts for this broker.
 #
 class profile::kafka::broker(
     $kafka_cluster_name                = 
hiera('profile::kafka::broker::kafka_cluster_name'),
@@ -83,12 +75,9 @@
     $num_io_threads                    = 
hiera('profile::kafka::broker::num_io_threads'),
     $num_replica_fetchers              = 
hiera('profile::kafka::broker::num_replica_fetchers'),
     $nofiles_ulimit                    = 
hiera('profile::kafka::broker::nofiles_ulimit'),
-    $replica_maxlag_warning            = 
hiera('profile::kafka::broker::replica_maxlag_warning'),
-    $replica_maxlag_critical           = 
hiera('profile::kafka::broker::replica_maxlag_critical'),
     # This is set via top level hiera variable so it can be synchronized 
between roles and clients.
     $message_max_bytes                 = hiera('kafka_message_max_bytes'),
-    $prometheus_monitoring_enabled     = 
hiera('profile::kafka::broker::prometheus_monitoring_enabled'),
-    $prometheus_nodes                  = hiera('prometheus_nodes'),
+    $monitoring_enabled                = 
hiera('profile::kafka::broker::monitoring_enabled'),
 ) {
     # TODO: WIP
     $tls_secrets_path = undef
@@ -185,36 +174,13 @@
         java_home     => '/usr/lib/jvm/java-8-openjdk-amd64',
     }
 
-    if $prometheus_monitoring_enabled {
-        # Allow automatic generation of config on the
-        # Prometheus master
-        prometheus::jmx_exporter_instance { $::hostname:
-            address => $::ipaddress,
-            port    => 7800,
-        }
-
-        $prometheus_nodes_ferm = join($prometheus_nodes, ' ')
-        ferm::service { 'kafka-broker-jmx_exporter':
-            proto  => 'tcp',
-            port   => '7800',
-            srange => "@resolve((${prometheus_nodes_ferm}))",
-        }
-
-        require_package('prometheus-jmx-exporter')
-
-        $jmx_exporter_config_file = 
'/etc/kafka/broker_prometheus_jmx_exporter.yaml'
-        $java_opts = 
"-javaagent:/usr/share/java/prometheus/jmx_prometheus_javaagent.jar=${::ipaddress}:7800:${jmx_exporter_config_file}"
-
-        # Create the Prometheus JMX Exporter configuration
-        file { $jmx_exporter_config_file:
-            ensure  => present,
-            source  => 
'puppet:///modules/profile/kafka/broker_prometheus_jmx_exporter.yaml',
-            owner   => 'kafka',
-            group   => 'kafka',
-            mode    => '0400',
-            require => Class['::confluent::kafka::broker'],
-        }
-    } else {
+    # If monitoring is enabled, then include the monitoring profile and set 
$java_opts
+    # for exposing the Prometheus JMX Exporter in the Kafka Broker process.
+    if $monitoring_enabled {
+        include ::profile::kafka::broker::monitoring
+        $java_opts = $::profile::kafka::broker::monitoring::java_opts
+    }
+    else {
         $java_opts = undef
     }
 
diff --git a/modules/profile/manifests/kafka/broker/monitoring.pp 
b/modules/profile/manifests/kafka/broker/monitoring.pp
new file mode 100644
index 0000000..8b0a096
--- /dev/null
+++ b/modules/profile/manifests/kafka/broker/monitoring.pp
@@ -0,0 +1,83 @@
+# Class: profile::kafka::broker::monitoring
+#
+# Sets up Prometheus based monitoring and icinga alerts.
+#
+# [*replica_maxlag_warning*]
+#   Max messages a replica can lag before a warning alert is generated.
+#   Hiera: profile::kafka::broker::monitoring::replica_maxlag_warning
+#
+# [*replica_maxlag_critical*]
+#   Max messages a replica can lag before a critical alert is generated.
+#   Hiera: profile::kafka::broker::monitoring::replica_maxlag_critical
+#
+class profile::kafka::broker::monitoring (
+    $cluster                 = hiera('cluster'),
+    $prometheus_nodes        = hiera('prometheus_nodes'),
+    $replica_maxlag_warning  = 
hiera('profile::kafka::broker::monitoring::replica_maxlag_warning'),
+    $replica_maxlag_critical = 
hiera('profile::kafka::broker::monitoring::replica_maxlag_critical'),
+) {
+    ### Expose Kafka Broker JMX metrics to Prometheus
+    require_package('prometheus-jmx-exporter')
+
+    $prometheus_jmx_exporter_port = 7800
+    $jmx_exporter_config_file = 
'/etc/kafka/broker_prometheus_jmx_exporter.yaml'
+
+    # Use this in the JAVA_OPTS you pass to the Kafka broker process
+    $java_opts = 
"-javaagent:/usr/share/java/prometheus/jmx_prometheus_javaagent.jar=${::ipaddress}:${prometheus_jmx_exporter_port}:${jmx_exporter_config_file}"
+
+    # Create the Prometheus JMX Exporter configuration
+    file { $jmx_exporter_config_file:
+        ensure  => present,
+        source  => 
'puppet:///modules/profile/kafka/broker_prometheus_jmx_exporter.yaml',
+        owner   => 'kafka',
+        group   => 'kafka',
+        mode    => '0400',
+        # Require this to make sure that kafka user and group are already 
created.
+        require => Class['::confluent::kafka::broker'],
+    }
+
+    # Allow automatic generation of config on the Prometheus master
+    prometheus::jmx_exporter_instance { $::hostname:
+        address => $::ipaddress,
+        port    => $prometheus_jmx_exporter_port,
+    }
+
+    $prometheus_nodes_ferm = join($prometheus_nodes, ' ')
+    ferm::service { 'kafka-broker-jmx_exporter':
+        proto  => 'tcp',
+        port   => '7800',
+        srange => "@resolve((${prometheus_nodes_ferm}))",
+    }
+
+
+    ### Icinga alerts
+    # Generate icinga alert if Kafka Broker Server is not running.
+    nrpe::monitor_service { 'kafka':
+        description  => 'Kafka Broker Server',
+        nrpe_command => '/usr/lib/nagios/plugins/check_procs -c 1:1 -C java -a 
"Kafka /etc/kafka/server.properties"',
+        critical     => true,
+    }
+
+    # Prometheus labels for this Kafka Broker instance
+    $prometheus_labels = 
"cluster=${cluster},instance=${::hostname}:${prometheus_jmx_exporter_port},job=jmx_kafka"
+
+    # Alert on the average number of under replicated partitions over the last 
30 minutes.
+    # 
https://grafana.wikimedia.org/dashboard/db/prometheus-kafka?panelId=29&fullscreen
+    monitoring::check_prometheus { 'kafka_broker_under_replicated_partitions':
+        description    => 'Kafka Broker Under Replicated Partitions',
+        query          => 
"scalar(avg_over_time(kafka_server_replicamanager_underreplicatedpartitions{${prometheus_labels}}[30m]))",
+        warning        => 5,
+        critical       => 10,
+        prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops",
+    }
+
+    # Alert on the average max replica lag over the last 30 minutes.
+    # 
https://grafana.wikimedia.org/dashboard/db/prometheus-kafka?panelId=16&fullscreen
+    monitoring::check_prometheus { 'kafka_broker_replica_max_lag':
+        description    => 'Kafka Broker Replica Max Lag',
+        query          => 
"scalar(avg_over_time(kafka_server_replicafetchermanager_maxlag{${prometheus_labels}}[30m]))",
+        warning        => $replica_maxlag_warning,
+        critical       => $replica_maxlag_critical,
+        prometheus_url => "http://prometheus.svc.${::site}.wmnet/ops",
+    }
+}
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/381489
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I839d5de4110da245f712e23285280c2fd546fe8f
Gerrit-PatchSet: 12
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ottomata <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Filippo Giunchedi <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to