Lcarr has uploaded a new change for review. https://gerrit.wikimedia.org/r/55785
Change subject: big nagios cleanup, part 2 ...................................................................... big nagios cleanup, part 2 Change-Id: I768cb221b9b5e6b4f03f32f01cebe47bea8e0ba0 --- R files/icinga/check_dpkg R files/icinga/check_ram.sh R files/icinga/check_subdir_limit D files/nagios/cgi.cfg D files/nagios/check-raid.py M manifests/misc/icinga.pp M manifests/nagios.pp 7 files changed, 5 insertions(+), 645 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/85/55785/1 diff --git a/files/nagios/check_dpkg b/files/icinga/check_dpkg similarity index 100% rename from files/nagios/check_dpkg rename to files/icinga/check_dpkg diff --git a/files/nagios/check_ram.sh b/files/icinga/check_ram.sh similarity index 100% rename from files/nagios/check_ram.sh rename to files/icinga/check_ram.sh diff --git a/files/nagios/check_subdir_limit b/files/icinga/check_subdir_limit similarity index 100% rename from files/nagios/check_subdir_limit rename to files/icinga/check_subdir_limit diff --git a/files/nagios/cgi.cfg b/files/nagios/cgi.cfg deleted file mode 100644 index 521b00c..0000000 --- a/files/nagios/cgi.cfg +++ /dev/null @@ -1,282 +0,0 @@ -################################################################# -# -# CGI.CFG - Sample CGI Configuration File for Nagios -# -# Last Modified: 05-05-2005 -# -################################################################# - - -# MAIN CONFIGURATION FILE -# This tells the CGIs where to find your main configuration file. -# The CGIs will read the main and host config files for any other -# data they might need. - -main_config_file=/etc/nagios/nagios.cfg - - - -# PHYSICAL HTML PATH -# This is the path where the HTML files for Nagios reside. This -# value is used to locate the logo images needed by the statusmap -# and statuswrl CGIs. - -physical_html_path=/usr/share/nagios - - - -# URL HTML PATH -# This is the path portion of the URL that corresponds to the -# physical location of the Nagios HTML files (as defined above). -# This value is used by the CGIs to locate the online documentation -# and graphics. If you access the Nagios pages with an URL like -# http://www.myhost.com/nagios, this value should be '/nagios' -# (without the quotes). - -url_html_path=/ - - - -# CONTEXT-SENSITIVE HELP -# This option determines whether or not a context-sensitive -# help icon will be displayed for most of the CGIs. -# Values: 0 = disables context-sensitive help -# 1 = enables context-sensitive help - -show_context_help=0 - - - -# NAGIOS PROCESS CHECK COMMAND -# This is the full path and filename of the program used to check -# the status of the Nagios process. It is used only by the CGIs -# and is completely optional. However, if you don't use it, you'll -# see warning messages in the CGIs about the Nagios process -# not running and you won't be able to execute any commands from -# the web interface. The program should follow the same rules -# as plugins; the return codes are the same as for the plugins, -# it should have timeout protection, it should output something -# to STDIO, etc. -# -# Note: The command line for the check_nagios plugin below may -# have to be tweaked a bit, as different versions of the plugin -# use different command line arguments/syntaxes. - -#nagios_check_command=/usr/lib/nagios/plugins/check_nagios /var/log/nagios/status.dat 5 '/usr/bin/nagios' - - - -# AUTHENTICATION USAGE -# This option controls whether or not the CGIs will use any -# authentication when displaying host and service information, as -# well as committing commands to Nagios for processing. -# -# Read the HTML documentation to learn how the authorization works! -# -# NOTE: It is a really *bad* idea to disable authorization, unless -# you plan on removing the command CGI (cmd.cgi)! Failure to do -# so will leave you wide open to kiddies messing with Nagios and -# possibly hitting you with a denial of service attack by filling up -# your drive by continuously writing to your command file! -# -# Setting this value to 0 will cause the CGIs to *not* use -# authentication (bad idea), while any other value will make them -# use the authentication functions (the default). - -use_authentication=1 - - - -# DEFAULT USER -# Setting this variable will define a default user name that can -# access pages without authentication. This allows people within a -# secure domain (i.e., behind a firewall) to see the current status -# without authenticating. You may want to use this to avoid basic -# authentication if you are not using a sercure server since basic -# authentication transmits passwords in the clear. -# -# Important: Do not define a default username unless you are -# running a secure web server and are sure that everyone who has -# access to the CGIs has been authenticated in some manner! If you -# define this variable, anyone who has not authenticated to the web -# server will inherit all rights you assign to this user! - -default_user_name=guest - - - -# SYSTEM/PROCESS INFORMATION ACCESS -# This option is a comma-delimited list of all usernames that -# have access to viewing the Nagios process information as -# provided by the Extended Information CGI (extinfo.cgi). By -# default, *no one* has access to this unless you choose to -# not use authorization. You may use an asterisk (*) to -# authorize any user who has authenticated to the web server. - -authorized_for_system_information=tstarling,RobH,mark,midom,laner,ariel,py,asher,dzahn,lcarr,jgreen - - - -# CONFIGURATION INFORMATION ACCESS -# This option is a comma-delimited list of all usernames that -# can view ALL configuration information (hosts, commands, etc). -# By default, users can only view configuration information -# for the hosts and services they are contacts for. You may use -# an asterisk (*) to authorize any user who has authenticated -# to the web server. - -authorized_for_configuration_information=tstarling,RobH,mark,midom,laner,ariel,py,asher,dzahn,lcarr,jgreen - - - -# SYSTEM/PROCESS COMMAND ACCESS -# This option is a comma-delimited list of all usernames that -# can issue shutdown and restart commands to Nagios via the -# command CGI (cmd.cgi). Users in this list can also change -# the program mode to active or standby. By default, *no one* -# has access to this unless you choose to not use authorization. -# You may use an asterisk (*) to authorize any user who has -# authenticated to the web server. - -authorized_for_system_commands=tstarling,RobH,mark,midom,laner,ariel,py,asher,dzahn,lcarr,jgreen - - - -# GLOBAL HOST/SERVICE VIEW ACCESS -# These two options are comma-delimited lists of all usernames that -# can view information for all hosts and services that are being -# monitored. By default, users can only view information -# for hosts or services that they are contacts for (unless you -# you choose to not use authorization). You may use an asterisk (*) -# to authorize any user who has authenticated to the web server. - - -#authorized_for_all_services=nagiosadmin,guest -#authorized_for_all_hosts=nagiosadmin,guest - -authorized_for_all_services=* -authorized_for_all_hosts=* - - -# GLOBAL HOST/SERVICE COMMAND ACCESS -# These two options are comma-delimited lists of all usernames that -# can issue host or service related commands via the command -# CGI (cmd.cgi) for all hosts and services that are being monitored. -# By default, users can only issue commands for hosts or services -# that they are contacts for (unless you you choose to not use -# authorization). You may use an asterisk (*) to authorize any -# user who has authenticated to the web server. - -#authorized_for_all_service_commands=nagiosadmin -#authorized_for_all_host_commands=nagiosadmin - -authorized_for_all_service_commands=tstarling,RobH,mark,midom,laner,ariel,py,asher,dzahn,lcarr,jgreen -authorized_for_all_host_commands=tstarling,RobH,mark,midom,laner,ariel,py,asher,dzahn,lcarr,jgreen - - -# STATUSMAP BACKGROUND IMAGE -# This option allows you to specify an image to be used as a -# background in the statusmap CGI. It is assumed that the image -# resides in the HTML images path (i.e. /usr/local/nagios/share/images). -# This path is automatically determined by appending "/images" -# to the path specified by the 'physical_html_path' directive. -# Note: The image file may be in GIF, PNG, JPEG, or GD2 format. -# However, I recommend that you convert your image to GD2 format -# (uncompressed), as this will cause less CPU load when the CGI -# generates the image. - -#statusmap_background_image=smbackground.gd2 - - - -# DEFAULT STATUSMAP LAYOUT METHOD -# This option allows you to specify the default layout method -# the statusmap CGI should use for drawing hosts. If you do -# not use this option, the default is to use user-defined -# coordinates. Valid options are as follows: -# 0 = User-defined coordinates -# 1 = Depth layers -# 2 = Collapsed tree -# 3 = Balanced tree -# 4 = Circular -# 5 = Circular (Marked Up) - -default_statusmap_layout=5 - - - -# DEFAULT STATUSWRL LAYOUT METHOD -# This option allows you to specify the default layout method -# the statuswrl (VRML) CGI should use for drawing hosts. If you -# do not use this option, the default is to use user-defined -# coordinates. Valid options are as follows: -# 0 = User-defined coordinates -# 2 = Collapsed tree -# 3 = Balanced tree -# 4 = Circular - -default_statuswrl_layout=4 - - - -# STATUSWRL INCLUDE -# This option allows you to include your own objects in the -# generated VRML world. It is assumed that the file -# resides in the HTML path (i.e. /usr/local/nagios/share). - -#statuswrl_include=myworld.wrl - - - -# PING SYNTAX -# This option determines what syntax should be used when -# attempting to ping a host from the WAP interface (using -# the statuswml CGI. You must include the full path to -# the ping binary, along with all required options. The -# $HOSTADDRESS$ macro is substituted with the address of -# the host before the command is executed. -# Please note that the syntax for the ping binary is -# notorious for being different on virtually ever *NIX -# OS and distribution, so you may have to tweak this to -# work on your system. - -ping_syntax=/bin/ping -n -U -c 5 $HOSTADDRESS$ - - - -# REFRESH RATE -# This option allows you to specify the refresh rate in seconds -# of various CGIs (status, statusmap, extinfo, and outages). - -refresh_rate=90 - - - -# SOUND OPTIONS -# These options allow you to specify an optional audio file -# that should be played in your browser window when there are -# problems on the network. The audio files are used only in -# the status CGI. Only the sound for the most critical problem -# will be played. Order of importance (higher to lower) is as -# follows: unreachable hosts, down hosts, critical services, -# warning services, and unknown services. If there are no -# visible problems, the sound file optionally specified by -# 'normal_sound' variable will be played. -# -# -# <varname>=<sound_file> -# -# Note: All audio files must be placed in the /media subdirectory -# under the HTML path (i.e. /usr/local/nagios/share/media/). - -host_unreachable_sound=hostdown.wav -host_down_sound=hostdown.wav -service_critical_sound=critical.wav -service_warning_sound=warning.wav -service_unknown_sound=warning.wav -#normal_sound=noproblem.wav - - -notes_url_target=main -action_url_target=main -lock_author_names=1 diff --git a/files/nagios/check-raid.py b/files/nagios/check-raid.py deleted file mode 100755 index 116ac8f..0000000 --- a/files/nagios/check-raid.py +++ /dev/null @@ -1,341 +0,0 @@ -#!/usr/bin/python - -import sys, os, os.path, re, subprocess - -def main(): - osName = os.uname()[0] - if osName == 'SunOS': - utility = 'zpool' - elif osName == 'Linux': - utility = getLinuxUtility() - else: - print 'WARNING: Operating system "%s" is not supported by this check script' % (osName) - sys.exit(1) - - try: - if utility == None: - print 'OK: no RAID installed' - status = 0 - elif utility == 'arcconf': - status = checkAdaptec() - elif utility == 'tw_cli': - status = check3ware() - elif utility == 'MegaCli': - status = checkMegaSas() - elif utility == 'zpool': - status = checkZfs() - elif utility == 'mdadm': - status = checkSoftwareRaid() - else: - print 'WARNING: %s is not yet supported by this check script' % (utility) - status = 1 - except: - error = sys.exc_info()[1] - print 'WARNING: check-raid.py encountered exception: ' + str(error) - status = 1 - - sys.exit(status) - -def getLinuxUtility(): - f = open("/proc/devices", "r") - regex = re.compile('^\s*\d+\s+(\w+)') - utility = None - for line in f: - m = regex.match(line) - if m == None: - continue - name = m.group(1) - - if name == 'aac': - utility = 'arcconf' - break - elif name == 'twe': - utility = 'tw_cli' - break - elif name == 'megadev': - utility = 'megarc' - break - elif name == 'megaraid_sas_ioctl': - utility = 'MegaCli' - break - - f.close() - if utility != None: - return utility - - # Try mdadm - devices = getSoftwareRaidDevices() - if len(devices): - return 'mdadm' - - return None - -def getSoftwareRaidDevices(): - if not os.path.exists('/sbin/mdadm'): - return [] - - try: - proc = subprocess.Popen(['/sbin/mdadm', '--detail', '--scan'], - stdout=subprocess.PIPE) - except: - return [] - - regex = re.compile('^ARRAY\s+([^ ]*) ') - devices = [] - for line in proc.stdout: - m = regex.match(line) - if m != None: - devices.append(m.group(1)) - proc.wait() - - return devices - -def checkAdaptec(): - # Need to change directory so that the log file goes to the right place - oldDir = os.getcwd() - os.chdir('/var/log') - devNull = open('/dev/null', 'w') - - # Check if we need to run arcconf using sudo - try: - os.stat('/etc/sudoers.d/nrpe') - cmd = ['sudo'] - except: - cmd = [] - - # Run the command - try: - proc = subprocess.Popen(cmd + ['/usr/bin/arcconf', 'getconfig', '1'], - stdout = subprocess.PIPE, stderr = devNull) - except: - print 'WARNING: Unable to execute arcconf' - os.chdir(oldDir) - return 1 - - defunctRegex = re.compile('^\s*Defunct disk drive count\s*:\s*(\d+)') - logicalRegex = re.compile('^\s*Logical devices/Failed/Degraded\s*:\s*(\d+)/(\d+)/(\d+)') - status = 0 - numLogical = None - for line in proc.stdout: - m = defunctRegex.match(line) - if m != None and m.group(1) != '0': - print 'CRITICAL: Defunct disk drive count: ' + m.group(1) - status = 2 - break - - m = logicalRegex.match(line) - if m != None: - numLogical = int(m.group(1)) - if m.group(2) != '0' and m.group(3) != '0': - print 'CRITICAL: logical devices: %s failed and %s defunct' % \ - (m.group(2), m.group(3)) - status = 2 - break - if m.group(2) != '0': - print 'CRITICAL: logical devices: %s failed' % \ - (m.group(2)) - status = 2 - break - if m.group(3) != '0': - print 'CRITICAL: logical devices: %s defunct' % \ - (m.group(3)) - status = 2 - break - - ret = proc.wait() - if status == 0 and ret != 0: - print 'WARNING: arcconf returned exit status %d' % (ret) - status = 1 - - if status == 0 and numLogical == None: - print 'WARNING: unable to parse output from arcconf' - status = 1 - - if status == 0: - print 'OK: %d logical device(s) checked' % numLogical - - os.chdir(oldDir) - return status - - -def check3ware(): - # Get the list of controllers - try: - proc = subprocess.Popen(['/usr/bin/tw_cli', 'show'], stdout = subprocess.PIPE) - except: - print 'WARNING: error executing tw_cli' - return 1 - - regex = re.compile('^(c\d+)') - controllers = [] - for line in proc.stdout: - m = regex.match(line) - if m != None: - controllers.push('/' + m.group(1)) - - ret = proc.wait() - if ret != 0: - print 'WARNING: tw_cli returned exit status %d' % (ret) - return 1 - - # Check each controller - regex = re.compile('^(p\d+)\s+([\w-]+)') - failedDrives = [] - numDrives = 0 - for controller in controllers: - proc = subprocess.Popen(['/usr/bin/tw_cli', controller, 'show'], - stdout = subprocess.PIPE) - for line in proc.stdout(): - m = regex.match(line) - if m != None: - numDrives += 1 - if m.group(2) != 'OK': - failedDrives.push(controller + '/' + m.group(1)) - - proc.wait() - - if len(failedDrives) != 0: - print 'CRITICAL: %d failed drive(s): %s' % \ - (len(failedDrives), ', '.join(failedDrives) ) - return 2 - - if numDrives == 0: - print 'WARNING: no physical drives found, tw_cli parse error?' - return 1 - else: - print 'OK: %d drives checked' % numDrives - return 0 - -def checkMegaSas(): - try: - proc = subprocess.Popen(['/usr/bin/MegaCli64', '-LDInfo', '-LALL', '-aALL'], - stdout=subprocess.PIPE) - except: - error = sys.exc_info()[1] - print 'WARNING: error executing MegaCli64: %s' % str(error) - return 1 - - stateRegex = re.compile('^State:\s*([^\n]*)') - drivesRegex = re.compile('^Number Of Drives( per span)?:\s*([^\n]*)') - state = None - numDrives = None - for line in proc.stdout: - m = stateRegex.match(line) - if m != None: - state = m.group(1) - continue - - m = drivesRegex.match(line) - if m != None: - numDrives = int(m.group(2)) - continue - - ret = proc.wait() - if ret != 0: - print 'WARNING: MegaCli64 returned exit status %d' % (ret) - return 1 - - if numDrives == None: - print 'WARNING: Parse error processing MegaCli64 output' - return 1 - - if state != 'Optimal': - print 'CRITICAL: %s' % (state) - return 2 - - print 'OK: State is %s, checked %d logical device(s)' % (state, numDrives) - return 0 - -def checkZfs(): - try: - proc = subprocess.Popen(['/sbin/zpool', 'list', '-Honame,health'], - stdout=subprocess.PIPE) - except: - error = sys.exc_info()[1] - print 'WARNING: error executing zpool: %s' % str(error) - return 1 - - regex = re.compile('^(\S+)\s+(\S+)') - status = 0 - msg = '' - for line in proc.stdout: - m = regex.match(line) - if m != None: - name = m.group(1) - health = m.group(2) - if health != 'ONLINE': - status = 2 - - if msg != '': - msg += ', ' - msg += name + ': ' + health - - ret = proc.wait() - if ret != 0: - print 'WARNING: zpool returned exit status %d' % (ret) - return 1 - - if status: - print 'CRITICAL: ' + msg - else: - print 'OK: ' + msg - return status - -def checkSoftwareRaid(): - devices = getSoftwareRaidDevices() - if len(devices) == 0: - print 'WARNING: Unexpectedly checked no devices' - return 1 - - args = ['/sbin/mdadm', '--detail'] - args.extend(devices) - try: - proc = subprocess.Popen(args, stdout = subprocess.PIPE) - except: - error = sys.exc_info()[1] - print 'WARNING: error executing mdadm: %s' % str(error) - return 1 - - deviceRegex = re.compile('^(/[^ ]*):$') - statRegex = re.compile('^ *(Active|Working|Failed|Spare) Devices *: *(\d+)') - currentDevice = None - stats = { - 'Active': 0, - 'Working': 0, - 'Failed': 0, - 'Spare': 0 - } - for line in proc.stdout: - m = deviceRegex.match(line) - if m == None: - if currentDevice == None: - continue - else: - currentDevice = m.group(1) - continue - - m = statRegex.match(line) - if m == None: - continue - - stats[m.group(1)] += int(m.group(2)) - - ret = proc.wait() - if ret != 0: - print 'WARNING: mdadm returned exit status %d' % (ret) - return 1 - - msg = '' - for name in ('Active', 'Working', 'Failed', 'Spare'): - if msg != '': - msg += ', ' - msg += name + ': ' + str(stats[name]) - - if stats['Failed'] > 0: - print 'CRITICAL: ' + msg - return 2 - else: - print 'OK: ' + msg - return 0 - -main() diff --git a/manifests/misc/icinga.pp b/manifests/misc/icinga.pp index 866e8aa..554b2d5 100644 --- a/manifests/misc/icinga.pp +++ b/manifests/misc/icinga.pp @@ -576,6 +576,11 @@ owner => 'root', group => 'root', mode => '0755'; + '/usr/lib/nagios/plugins/check_ram.sh': + source => 'puppet:///files/icinga/check_ram.sh', + owner => 'root', + group => 'root', + mode => '0755'; } # some default configuration files conflict and should be removed diff --git a/manifests/nagios.pp b/manifests/nagios.pp index 5e67167..11fed05 100644 --- a/manifests/nagios.pp +++ b/manifests/nagios.pp @@ -176,28 +176,6 @@ } - -class nagios::ganglia::ganglios { - include generic::mysql::packages::client, - ganglia::collector - - package { "ganglios": - ensure => latest; - } - cron { "ganglios-cron": - command => "test -w /var/log/ganglia/ganglia_parser.log && /usr/sbin/ganglia_parser", - user => nagios, - minute => "*/2", - ensure => present; - } - file { "/var/lib/ganglia/xmlcache": - ensure => directory, - mode => 0755, - owner => nagios; - } -} - - class nagios::gsbmonitoring { @monitor_host { "google": ip_address => "74.125.225.84" } -- To view, visit https://gerrit.wikimedia.org/r/55785 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I768cb221b9b5e6b4f03f32f01cebe47bea8e0ba0 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Lcarr <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
