Volans has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/321642 )
Change subject: icinga: raid_handler improvements ...................................................................... icinga: raid_handler improvements - cleanup default ArgumentParser action (store) - use Nagios variable for service description, cleanup Puppet code to not pass it as parameter - add option --message-remain to include the remaining lines of the Nagios status message, if multiline - add option --skip-nrpe to not perform the NRPE call to gather the RAID status but instead use the complete Nagios status message in the task. This option is required for frack instances where an NRPE call is not possible - allow passing 'n/a' as RAID type, if the information is not available. In this case it behaves as if --skip-nrpe was set. Once merged it requires a change in secret('nagios/nsca_frack.cfg') to add the event handler for the check_raid checks with something like: event_handler raid_handler_skip_nrpe!raid-type!dcname where: - raid-type is one of ('megacli', 'hpssacli', 'mpt', 'md', 'n/a'). When using raid_handler_skip_nrpe it is only used in messages and logging, so 'n/a' can also be provided if the information is not available. - dcname is the datacenter where the host is located (e.g. eqiad), and has to match the equivalent Phabricator component (ops-$dcname). 
Bug: T149913 Change-Id: I1b052f01d887908d25f9555cb1ae549728c835a9 --- M modules/icinga/files/raid_handler.py M modules/icinga/templates/event_handlers/raid_handler.cfg.erb M modules/raid/manifests/init.pp 3 files changed, 46 insertions(+), 30 deletions(-) Approvals: Faidon Liambotis: Looks good to me, but someone else must approve jenkins-bot: Verified Jgreen: Looks good to me, but someone else must approve Volans: Looks good to me, approved diff --git a/modules/icinga/files/raid_handler.py b/modules/icinga/files/raid_handler.py index 0700433..de144e3 100644 --- a/modules/icinga/files/raid_handler.py +++ b/modules/icinga/files/raid_handler.py @@ -5,6 +5,7 @@ import ConfigParser import logging import subprocess +import sys import time import zlib @@ -15,7 +16,7 @@ SERVICE_STATES = ('OK', 'UNKNOWN', 'WARNING', 'CRITICAL') SERVICE_STATE_TYPES = ('SOFT', 'HARD') -RAID_TYPES = ('megacli', 'hpssacli', 'mpt', 'md') +RAID_TYPES = ('megacli', 'hpssacli', 'mpt', 'md', 'n/a') COMPRESSED_RAID_TYPES = ('megacli', 'hpssacli') SKIP_STRINGS = ('timeout', 'timed out', 'connection refused', 'out of bounds') @@ -34,8 +35,8 @@ PHABRICATOR_TASK_TITLE = "Degraded RAID on {host}" PHABRICATOR_TASK_DESCRIPTION_PREFIX = ( "TASK AUTO-GENERATED by Nagios/Icinga RAID event handler\n\n" - "A degraded RAID [[ {url} | was detected ]] on host `{host}`. An " - "automatic snapshot of the current RAID status is attached below.\n\n" + "A degraded RAID ({type}) [[ {url} | was detected ]] on host `{host}`. " + "An automatic snapshot of the current RAID status is attached below.\n\n" "Please **sync with the service owner** to find the appropriate time " "window before actually replacing any failed hardware." 
) @@ -49,29 +50,35 @@ parser = argparse.ArgumentParser( description='Nagios/Icinga event handler for RAID checks') parser.add_argument( - '-s', dest='service_state', action='store', required=True, + '-s', dest='service_state', required=True, choices=SERVICE_STATES, help='Nagios/Icinga service state') parser.add_argument( - '-t', dest='service_state_type', action='store', required=True, + '-t', dest='service_state_type', required=True, choices=SERVICE_STATE_TYPES, help='Nagios/Icinga service state type') parser.add_argument( - '-a', dest='service_attempts', action='store', required=True, type=int, + '-a', dest='service_attempts', required=True, type=int, help='Nagios/Icinga service retry attemp counter') parser.add_argument( - '-H', dest='host_address', action='store', required=True, + '-H', dest='host_address', required=True, help='Hostname/address of the monitored host') parser.add_argument( - '-r', dest='raid_type', action='store', required=True, - choices=RAID_TYPES, help='The RAID type') + '-r', dest='raid_type', required=True, choices=RAID_TYPES, + help='The RAID type, if n/a behaves as --skip-nrpe was set') parser.add_argument( - '-D', dest='service_description', action='store', required=True, + '-D', dest='service_description', required=True, help='The Nagios/Icinga service description') parser.add_argument( - '-c', dest='datacenter', action='store', required=True, + '-c', dest='datacenter', required=True, help='The name of the datacenter the host is located in') parser.add_argument( - '-m', dest='message', action='store', required=True, + '-m', dest='message', required=True, help='The service Status information output (first line)') + parser.add_argument( + '--message-remain', default='', + help='The service Status information output (remaining lines)') + parser.add_argument( + '--skip-nrpe', action='store_true', + help='Do not get the RAID status via NRPE, rely only on the message') parser.add_argument( '-d', dest='debug', action='store_true', help='Debug 
level logging') @@ -152,19 +159,20 @@ def open_phabricator_task( - phab_client, project_ids, host, raid_status, icinga_url): + phab_client, project_ids, host, raid_type, raid_status, icinga_url): """ Open a task on Phabricator and return it Arguments: phab_client -- a Phabricator client instance project_ids -- the PHIDs to tag the task with host -- the hostname of the affected host + raid_type -- the RAID type, one of RAID_TYPES raid_status -- the RAID status message to include in the task icinga_url -- the URL of the Icinga alarm that triggered this handler """ description_prefix = PHABRICATOR_TASK_DESCRIPTION_PREFIX.format( - host=host, url=icinga_url) + type=raid_type, host=host, url=icinga_url) description = '{description_prefix}\n```\n{raid_status}\n```'.format( description_prefix=description_prefix, raid_status=raid_status) @@ -231,14 +239,19 @@ args.message)) return - raid_status = get_raid_status(args.host_address, args.raid_type) + if args.skip_nrpe or args.raid_type == 'n/a': + logger.debug('Skipping NRPE RAID status gathering') + raid_status = '{}\n{}'.format(args.message, args.message_remain) + else: + raid_status = get_raid_status(args.host_address, args.raid_type) + phab_client = get_phabricator_client() project_ids = get_phabricator_project_ids(phab_client, args.datacenter) icinga_url = ICINGA_URL.format( host=args.host_address, service=args.service_description) - task = open_phabricator_task( - phab_client, project_ids, args.host_address, raid_status, icinga_url) + task = open_phabricator_task(phab_client, project_ids, args.host_address, + args.raid_type, raid_status, icinga_url) acknowledge_nagios_alert( args.host_address, args.service_description, task['uri']) @@ -256,3 +269,5 @@ main() except Exception: logger.exception("Unable to handle RAID check alert") + if sys.stdout.isatty(): + raise diff --git a/modules/icinga/templates/event_handlers/raid_handler.cfg.erb b/modules/icinga/templates/event_handlers/raid_handler.cfg.erb index 8104f04..7469197 
100644 --- a/modules/icinga/templates/event_handlers/raid_handler.cfg.erb +++ b/modules/icinga/templates/event_handlers/raid_handler.cfg.erb @@ -1,4 +1,9 @@ define command{ command_name raid_handler - command_line $USER1$/eventhandlers/raid_handler -d -s $SERVICESTATE$ -t $SERVICESTATETYPE$ -a $SERVICEATTEMPT$ -H $HOSTNAME$ -m "$SERVICEOUTPUT$" -r $ARG1$ -D "$ARG2$" -c $ARG3$ + command_line $USER1$/eventhandlers/raid_handler -d -s $SERVICESTATE$ -t $SERVICESTATETYPE$ -a $SERVICEATTEMPT$ -H $HOSTNAME$ -D "$SERVICEDESC$" -m "$SERVICEOUTPUT$" -r "$ARG1$" -c "$ARG2$" + } + +define command{ + command_name raid_handler_skip_nrpe + command_line $USER1$/eventhandlers/raid_handler -d -s $SERVICESTATE$ -t $SERVICESTATETYPE$ -a $SERVICEATTEMPT$ -H $HOSTNAME$ -D "$SERVICEDESC$" -m "$SERVICEOUTPUT$" --message-remain="$LONGSERVICEOUTPUT$" --skip-nrpe -r "$ARG1$" -c "$ARG2$" } diff --git a/modules/raid/manifests/init.pp b/modules/raid/manifests/init.pp index 71fcd0d..fd5a071 100644 --- a/modules/raid/manifests/init.pp +++ b/modules/raid/manifests/init.pp @@ -44,13 +44,12 @@ command => "/usr/bin/sudo ${get_raid_status_megacli} -c", } - $service_description_megaraid = 'MegaRAID' nrpe::monitor_service { 'raid_megaraid': - description => $service_description_megaraid, + description => 'MegaRAID', nrpe_command => "${check_raid} megacli", check_interval => $check_interval, retry_interval => $retry_interval, - event_handler => "raid_handler!megacli!${service_description_megaraid}!${::site}", + event_handler => "raid_handler!megacli!${::site}", } } @@ -88,14 +87,13 @@ ], } - $service_description_hp = 'HP RAID' nrpe::monitor_service { 'raid_hpssacli': - description => $service_description_hp, + description => 'HP RAID', nrpe_command => '/usr/local/lib/nagios/plugins/check_hpssacli', timeout => 50, # can take > 10s on servers with lots of disks check_interval => $check_interval, retry_interval => $retry_interval, - event_handler => 
"raid_handler!hpssacli!${service_description_hp}!${::site}", + event_handler => "raid_handler!hpssacli!${::site}", } $get_raid_status_hpssacli = '/usr/local/lib/nagios/plugins/get-raid-status-hpssacli' @@ -127,13 +125,12 @@ before => Package['mpt-status'], } - $service_description_mpt = 'MPT RAID' nrpe::monitor_service { 'raid_mpt': - description => $service_description_mpt, + description => 'MPT RAID', nrpe_command => "${check_raid} mpt", check_interval => $check_interval, retry_interval => $retry_interval, - event_handler => "raid_handler!mpt!${service_description_mpt}!${::site}", + event_handler => "raid_handler!mpt!${::site}", } nrpe::check { 'get_raid_status_mpt': @@ -144,11 +141,10 @@ if 'md' in $raid { # if there is an "md" RAID configured, mdadm is already installed - $service_description_md = 'MD RAID' nrpe::monitor_service { 'raid_md': - description => $service_description_md, + description => 'MD RAID', nrpe_command => "${check_raid} md", - event_handler => "raid_handler!md!${service_description_md}!${::site}", + event_handler => "raid_handler!md!${::site}", } nrpe::check { 'get_raid_status_md': -- To view, visit https://gerrit.wikimedia.org/r/321642 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I1b052f01d887908d25f9555cb1ae549728c835a9 Gerrit-PatchSet: 4 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Volans <rcocci...@wikimedia.org> Gerrit-Reviewer: Alexandros Kosiaris <akosia...@wikimedia.org> Gerrit-Reviewer: Faidon Liambotis <fai...@wikimedia.org> Gerrit-Reviewer: Filippo Giunchedi <fgiunch...@wikimedia.org> Gerrit-Reviewer: Jgreen <jgr...@wikimedia.org> Gerrit-Reviewer: Volans <rcocci...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits