Ottomata has submitted this change and it was merged.

Change subject: Fix for check_ganglia, removing unused checkcommands
......................................................................
Fix for check_ganglia, removing unused checkcommands Cannot use $HOSTADDRESS$ as argument to check_ganglia's -H flag. Ganglia IDs hosts by fqdn. Manually passing in fqdn as a positional argument macro. Also renamed threshold parameters. Change-Id: Ie52bd80b37159b0e074fe37514b7fe79373c27b7 --- M manifests/ganglia.pp M manifests/misc/icinga.pp M manifests/misc/udp2log.pp M manifests/nagios.pp M manifests/role/analytics/kafka.pp M manifests/role/cache.pp M templates/icinga/checkcommands.cfg.erb 7 files changed, 26 insertions(+), 44 deletions(-) Approvals: Ottomata: Verified; Looks good to me, approved jenkins-bot: Verified diff --git a/manifests/ganglia.pp b/manifests/ganglia.pp index 6c859f8..9bf2f72 100644 --- a/manifests/ganglia.pp +++ b/manifests/ganglia.pp @@ -264,14 +264,13 @@ case $::hostname { # manutius runs gmetad to get varnish data into torrus # unlike other servers, manutius uses the default rrd_rootdir - # neon needs gmetad for ganglios /^manutius$/: { $data_sources = { "Upload caches eqiad" => "cp1048.eqiad.wmnet cp1061.eqiad.wmnet" } $rra_sizes = '"RRA:AVERAGE:0:1:4032" "RRA:AVERAGE:0.17:6:2016" "RRA:MAX:0.17:6:2016" "RRA:AVERAGE:0.042:288:732" "RRA:MAX:0.042:288:732"' } - # neon runs gmetad for ganglios + # neon needs gmetad config for ganglios /^neon$/: { $data_sources = { "Miscellaneous" => "tarin.pmtpa.wmnet", diff --git a/manifests/misc/icinga.pp b/manifests/misc/icinga.pp index 93ad0e3..e22954a 100644 --- a/manifests/misc/icinga.pp +++ b/manifests/misc/icinga.pp @@ -893,8 +893,6 @@ # directly, rather than downloading and mangling # xmlfiles from each aggregator. # -# TODO: will deprectate and remove ganglios soon. 
-# class icinga::ganglia::check { package { 'check-ganglia': ensure => 'installed', diff --git a/manifests/misc/udp2log.pp b/manifests/misc/udp2log.pp index ee941f0..db31cc0 100644 --- a/manifests/misc/udp2log.pp +++ b/manifests/misc/udp2log.pp @@ -256,8 +256,8 @@ monitor_ganglia{ "udp2log-${name}-packetloss": description => 'Packetloss_Average', metric => 'packet_loss_average', - warning_threshold => '4', - critical_threshold => '8', + warning => '4', + critical => '8', contact_group => "admins,analytics", # ganglia-logtailer only runs every 5. # let's make nagios check every 2 minutes (to match ganglia_parser) diff --git a/manifests/nagios.pp b/manifests/nagios.pp index 311559e..c6282c1 100644 --- a/manifests/nagios.pp +++ b/manifests/nagios.pp @@ -307,15 +307,19 @@ # monitor_ganglia { 'hdfs-capacity-remaining': # description => 'GB free in HDFS', # metric => 'Hadoop.NameNode.FSNamesystem.CapacityRemainingGB', -# warning_threshold => ':1024', -# critical_threshold => ':512, +# warning => ':1024', +# critical => ':512, # } # # == Parameters # $description - Description of icinga alert # $metric - ganglia metric name # $warning - alert warning threshold -# $critical_threshold - alert critical threshold +# $critical - alert critical threshold +# $metric_host - hostname in ganglia we want to monitor. +# Can't use nagios macro in checkcommands.cfg +# because fqdn is not available. +# Default: $::fqdn of this node # $gmetad_host - Default: 'nickel.wikimedia.org' # $gmetad_query_port - gmetad XML query interface port. 
Default: 8654 # $host @@ -332,8 +336,9 @@ define monitor_ganglia( $description, $metric, - $warning_threshold, - $critical_threshold, + $warning, + $critical, + $metric_host = $::fqdn, $gmetad_host = 'nickel.wikimedia.org', $gmetad_query_port = 8654, $host = $::hostname, @@ -353,6 +358,7 @@ # are passed to check_ganglia script: # $ARG1$ -g gmetad host # $ARG2$ -p gmetad xml query port + # $ARG3$ -H Host for which we want metrics # $ARG3$ -m ganglia metric name # $ARG4$ -w warning threshold # $ARG5$ -c critical threshold @@ -360,7 +366,7 @@ monitor_service { $title: ensure => $ensure, description => $description, - check_command => "check_ganglia!${gmetad_host}!${gmetad_query_port}!${metric}!${warning_threshold}!${critical_threshold}", + check_command => "check_ganglia!${gmetad_host}!${gmetad_query_port}!${metric_host}!${metric}!${warning}!${critical}", retries => $retries, group => $group, critical => $critical, diff --git a/manifests/role/analytics/kafka.pp b/manifests/role/analytics/kafka.pp index 1750fca..22f30fe 100644 --- a/manifests/role/analytics/kafka.pp +++ b/manifests/role/analytics/kafka.pp @@ -155,11 +155,11 @@ # These thresholds have to be manually set. # adjust them if you add or remove data from Kafka topics. 
monitor_ganglia { 'kafka-broker-MessagesIn': - description => 'Kafka Broker Messages In', - metric => 'kafka.server.BrokerTopicMetrics.AllTopicsMessagesInPerSec.FifteenMinuteRate', - warning_threshold => ':1500.0', - critical_threshold => ':1000.0', - require => Class['::kafka::server::jmxtrans'], + description => 'Kafka Broker Messages In', + metric => 'kafka.server.BrokerTopicMetrics.AllTopicsMessagesInPerSec.FifteenMinuteRate', + warning => ':1500.0', + critical => ':1000.0', + require => Class['::kafka::server::jmxtrans'], } } diff --git a/manifests/role/cache.pp b/manifests/role/cache.pp index a12ef1c..0490b96 100644 --- a/manifests/role/cache.pp +++ b/manifests/role/cache.pp @@ -448,11 +448,11 @@ # Generate an alert if we ever see any delivery report errors monitor_ganglia { 'varnishkafka-drerr': - description => 'Varnishkafka Delivery Errors', - metric => 'kafka.varnishkafka.kafka_drerr.per_second', - warning_threshold => '0.0', - critical_threshold => '0.0', - require => Class['::varnishkafka::monitoring'], + description => 'Varnishkafka Delivery Errors', + metric => 'kafka.varnishkafka.kafka_drerr.per_second', + warning => '0.0', + critical => '0.0', + require => Class['::varnishkafka::monitoring'], } } } diff --git a/templates/icinga/checkcommands.cfg.erb b/templates/icinga/checkcommands.cfg.erb index 45b6aa1..6a148d3 100644 --- a/templates/icinga/checkcommands.cfg.erb +++ b/templates/icinga/checkcommands.cfg.erb @@ -495,12 +495,6 @@ command_line $USER1$/check_procs -w $ARG1$:$ARG2$ -c $ARG3$:$ARG4$ -C $ARG5$ } -# check that logging packet loss is not too high -define command{ - command_name check_packet_loss_ave - command_line $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$ -m packet_loss_average -w $ARG1$ -c $ARG2$ -o gt -} - define command{ command_name check_memory_used command_line $USER3$/check_ganglios_memory_v2 -H $HOSTADDRESS -w $ARG1$ -c $ARG2$ @@ -509,7 +503,7 @@ # check arbitrary ganglia metric values define command{ command_name 
check_ganglia - command_line $USER1$/check_ganglia -q -g $ARG1$ -p $ARG2$ -H $HOSTADDRESS$ -m '$ARG3$' -w '$ARG4$' -c '$ARG5$' + command_line $USER1$/check_ganglia -q -g $ARG1$ -p $ARG2$ -H $ARG3$ -m '$ARG4$' -w '$ARG5$' -c '$ARG6$' } # percona mysql checks @@ -559,21 +553,6 @@ command_name check_to_check_nagios_paging command_line $USER1$/check_to_check_nagios_paging } - - -# Analytics checks - -# Check that Kafka Brokers are getting messages produced to them. -define command{ - command_name check_kafka_broker_messages_in - command_line $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$ -m kafka.server.BrokerTopicMetrics.AllTopicsMessagesInPerSec.FifteenMinuteRate -w $ARG1$ -c $ARG2$ -o lt -} - -define command{ - command_name check_varnishkafka_drerr - command_line $USER1$/check_ganglios_generic_value -H $HOSTADDRESS$ -m kafka.varnishkafka.kafka_drerr.per_second -w $ARG1$ -c $ARG2$ -o gt -} - # Elasticsearch Checks -- To view, visit https://gerrit.wikimedia.org/r/107896 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ie52bd80b37159b0e074fe37514b7fe79373c27b7 Gerrit-PatchSet: 3 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Ottomata <o...@wikimedia.org> Gerrit-Reviewer: Ottomata <o...@wikimedia.org> Gerrit-Reviewer: jenkins-bot _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits