Alexandros Kosiaris has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/373291 )
Change subject: WIP: Allow silencing notifications for hosts
......................................................................
WIP: Allow silencing notifications for hosts
Add a notifications_enabled parameter to monitoring::host and
monitoring::service, defaulting to 1. This is passed straight to the
nagios_host and nagios_service resources respectively, allowing
notifications to be selectively disabled on a per-host level. In the case
of monitoring::host the aforementioned parameter gets passed directly
from profile::base, effectively curbing the extent of this patch only to
puppet enabled hosts. This protects all "virtual" hosts (e.g. LVS) from
being accidentally silenced. On the monitoring::service side, things are
not so easy since that define is used in a myriad of places. Instead of
violating our puppet coding policy and adding one more hiera lookup in a
module class, look up $::profile::base::notifications_enabled directly,
then safeguard against the case where it's not defined.
There is one caveat with this approach. Setting the hiera parameter on
the icinga host level will cause all services bound to virtual hosts
(e.g. LVS) to be silenced.
TODO: Actually create an event handler
Bug: T151632
Change-Id: I3047d9421c035c7004ca42b248f584216b8e5bdb
---
M modules/monitoring/manifests/host.pp
M modules/monitoring/manifests/service.pp
M modules/profile/manifests/base.pp
3 files changed, 53 insertions(+), 30 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/91/373291/1
diff --git a/modules/monitoring/manifests/host.pp
b/modules/monitoring/manifests/host.pp
index 9e99a6d..ce02077 100644
--- a/modules/monitoring/manifests/host.pp
+++ b/modules/monitoring/manifests/host.pp
@@ -2,14 +2,15 @@
# Exports the resource that monitors hosts in icinga/shinken
#
define monitoring::host (
- $ip_address = $facts['ipaddress'],
- $os = $facts['operatingsystem'],
- $host_fqdn = undef,
- $group = undef,
- $ensure = present,
- $critical = false,
- $parents = undef,
- $contact_group = hiera('contactgroups', 'admins'),
+ $ip_address = $facts['ipaddress'],
+ $os = $facts['operatingsystem'],
+ $host_fqdn = undef,
+ $group = undef,
+ $ensure = present,
+ $critical = false,
+ $parents = undef,
+ $contact_group = hiera('contactgroups', 'admins'),
+ $notifications_enabled = 1,
) {
$nagios_address = $host_fqdn ? {
@@ -64,6 +65,7 @@
$real_parents = undef
}
# We have a BMC, and the BMC is configured and it has an IP address
+ # We always monitor the BMC so never skip notifications
if $facts['has_ipmi'] and $facts['ipmi_lan'] and 'ipaddress' in
$facts['ipmi_lan'] {
$mgmt_host = {
"${title}.mgmt" => {
@@ -100,6 +102,7 @@
check_command => 'check_ping!500,20%!2000,100%',
check_period => '24x7',
max_check_attempts => 2,
+ notifications_enabled => $notifications_enabled,
contact_groups => $real_contact_groups,
notification_interval => 0,
notification_period => '24x7',
@@ -119,21 +122,24 @@
create_resources($rtype, $host)
if $mgmt_host {
create_resources($rtype, $mgmt_host)
+ # We always monitor the BMC so never skip notifications
monitoring::service { "dns_${title}.mgmt":
- description => "DNS ${title}.mgmt",
- host => "${title}.mgmt",
- check_command => "check_fqdn!${title}.mgmt.${::site}.wmnet",
- group => 'mgmt',
- check_interval => 60,
- retry_interval => 60,
+ description => "DNS ${title}.mgmt",
+ host => "${title}.mgmt",
+ check_command =>
"check_fqdn!${title}.mgmt.${::site}.wmnet",
+ notifications_enabled => 1,
+ group => 'mgmt',
+ check_interval => 60,
+ retry_interval => 60,
}
monitoring::service { "ssh_${title}.mgmt":
- description => "SSH ${title}.mgmt",
- host => "${title}.mgmt",
- check_command => 'check_ssh',
- group => 'mgmt',
- check_interval => 60,
- retry_interval => 60,
+ description => "SSH ${title}.mgmt",
+ host => "${title}.mgmt",
+ check_command => 'check_ssh',
+ notifications_enabled => 1,
+ group => 'mgmt',
+ check_interval => 60,
+ retry_interval => 60,
}
}
}
diff --git a/modules/monitoring/manifests/service.pp
b/modules/monitoring/manifests/service.pp
index 3ddcfb0..63fae93 100644
--- a/modules/monitoring/manifests/service.pp
+++ b/modules/monitoring/manifests/service.pp
@@ -13,6 +13,7 @@
$contact_group = hiera('contactgroups', 'admins'),
$config_dir = '/etc/nagios',
$event_handler = undef,
+ $notifications_enabled = $::profile::base::notifications_enabled,
)
{
# the list of characters is the default for illegal_object_name_chars
@@ -71,6 +72,23 @@
default => undef,
}
+ # Safeguard against notifications enabled not being defined due to class
+ # declarations
+ if $notifications_enabled {
+ $real_notifications_enabled = $notifications_enabled
+ } else {
+ $real_notifications_enabled = '1'
+ }
+ # XXX: Actually set up an event handler
+ # Pick a dedicated event handler for services that have notifications
+ # disabled and do not already have an event handler defined; otherwise keep
+ # the caller-supplied handler, which maintains backwards compatibility
+ if $real_notifications_enabled == '0' and !$event_handler {
+ $real_event_handler = 'XXX: Figure this out'
+ } else {
+ $real_event_handler = $event_handler
+ }
+
# the nagios service instance
$service = {
"${::hostname} ${title}" => {
@@ -86,13 +104,14 @@
notification_interval => $notification_interval,
notification_period => '24x7',
notification_options => 'c,r,f',
+ notifications_enabled => $real_notifications_enabled,
contact_groups => $real_contact_groups,
passive_checks_enabled => 1,
active_checks_enabled => $is_active,
is_volatile => $check_volatile,
check_freshness => $check_fresh,
freshness_threshold => $is_fresh,
- event_handler => $event_handler,
+ event_handler => $real_event_handler,
},
}
# This is a hack. We detect if we are running on the scope of an icinga
diff --git a/modules/profile/manifests/base.pp
b/modules/profile/manifests/base.pp
index b973b5b..7f1c192 100644
--- a/modules/profile/manifests/base.pp
+++ b/modules/profile/manifests/base.pp
@@ -5,7 +5,7 @@
$use_apt_proxy = hiera('profile::base::use_apt_proxy', true),
$domain_search = hiera('profile::base::domain_search', $::domain),
$remote_syslog = hiera('profile::base:remote_syslog',
['syslog.eqiad.wmnet', 'syslog.codfw.wmnet']),
- $monitoring = hiera('profile::base::monitoring', true),
+ $notifications_enabled = hiera('profile::base::notifications_enabled',
'1'),
$core_dump_pattern = hiera('profile::base::core_dump_pattern',
'/var/tmp/core/core.%h.%e.%p.%t'),
$ssh_server_settings = hiera('profile::base::ssh_server_settings', {}),
$nrpe_allowed_hosts = hiera('profile::base::nrpe_allowed_hosts',
'127.0.0.1,208.80.154.14,208.80.153.74,208.80.155.119'),
@@ -92,14 +92,12 @@
class { '::base::initramfs': }
}
- # unless disabled in Hiera, have Icinga monitoring (T151632)
- if $monitoring {
- class { '::base::monitoring::host':
- contact_group => $group_contact,
- nrpe_check_disk_options => $check_disk_options,
- nrpe_check_disk_critical => $check_disk_critical,
- raid_write_cache_policy => $check_raid_policy,
- }
+ class { '::base::monitoring::host':
+ contact_group => $group_contact,
+ nrpe_check_disk_options => $check_disk_options,
+ nrpe_check_disk_critical => $check_disk_critical,
+ raid_write_cache_policy => $check_raid_policy,
+ notifications_enabled => $notifications_enabled,
}
if os_version('ubuntu == trusty') {
--
To view, visit https://gerrit.wikimedia.org/r/373291
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I3047d9421c035c7004ca42b248f584216b8e5bdb
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Alexandros Kosiaris <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits