Lcarr has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/55785


Change subject: big nagios cleanup, part 2
......................................................................

big nagios cleanup, part 2

Change-Id: I768cb221b9b5e6b4f03f32f01cebe47bea8e0ba0
---
R files/icinga/check_dpkg
R files/icinga/check_ram.sh
R files/icinga/check_subdir_limit
D files/nagios/cgi.cfg
D files/nagios/check-raid.py
M manifests/misc/icinga.pp
M manifests/nagios.pp
7 files changed, 5 insertions(+), 645 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/85/55785/1

diff --git a/files/nagios/check_dpkg b/files/icinga/check_dpkg
similarity index 100%
rename from files/nagios/check_dpkg
rename to files/icinga/check_dpkg
diff --git a/files/nagios/check_ram.sh b/files/icinga/check_ram.sh
similarity index 100%
rename from files/nagios/check_ram.sh
rename to files/icinga/check_ram.sh
diff --git a/files/nagios/check_subdir_limit b/files/icinga/check_subdir_limit
similarity index 100%
rename from files/nagios/check_subdir_limit
rename to files/icinga/check_subdir_limit
diff --git a/files/nagios/cgi.cfg b/files/nagios/cgi.cfg
deleted file mode 100644
index 521b00c..0000000
--- a/files/nagios/cgi.cfg
+++ /dev/null
@@ -1,282 +0,0 @@
-#################################################################
-#
-# CGI.CFG - Sample CGI Configuration File for Nagios 
-#
-# Last Modified: 05-05-2005
-#
-#################################################################
-
-
-# MAIN CONFIGURATION FILE
-# This tells the CGIs where to find your main configuration file.
-# The CGIs will read the main and host config files for any other
-# data they might need.
-
-main_config_file=/etc/nagios/nagios.cfg
-
-
-
-# PHYSICAL HTML PATH
-# This is the path where the HTML files for Nagios reside.  This
-# value is used to locate the logo images needed by the statusmap
-# and statuswrl CGIs.
-
-physical_html_path=/usr/share/nagios
-
-
-
-# URL HTML PATH
-# This is the path portion of the URL that corresponds to the
-# physical location of the Nagios HTML files (as defined above).
-# This value is used by the CGIs to locate the online documentation
-# and graphics.  If you access the Nagios pages with an URL like
-# http://www.myhost.com/nagios, this value should be '/nagios'
-# (without the quotes).
-
-url_html_path=/
-
-
-
-# CONTEXT-SENSITIVE HELP
-# This option determines whether or not a context-sensitive
-# help icon will be displayed for most of the CGIs.
-# Values: 0 = disables context-sensitive help
-#         1 = enables context-sensitive help
-
-show_context_help=0
-
-
-
-# NAGIOS PROCESS CHECK COMMAND
-# This is the full path and filename of the program used to check
-# the status of the Nagios process.  It is used only by the CGIs
-# and is completely optional.  However, if you don't use it, you'll
-# see warning messages in the CGIs about the Nagios process
-# not running and you won't be able to execute any commands from
-# the web interface.  The program should follow the same rules
-# as plugins; the return codes are the same as for the plugins,
-# it should have timeout protection, it should output something
-# to STDIO, etc.
-#
-# Note: The command line for the check_nagios plugin below may
-# have to be tweaked a bit, as different versions of the plugin
-# use different command line arguments/syntaxes.
-
-#nagios_check_command=/usr/lib/nagios/plugins/check_nagios /var/log/nagios/status.dat 5 '/usr/bin/nagios'
-
-
-
-# AUTHENTICATION USAGE
-# This option controls whether or not the CGIs will use any 
-# authentication when displaying host and service information, as
-# well as committing commands to Nagios for processing.  
-#
-# Read the HTML documentation to learn how the authorization works!
-#
-# NOTE: It is a really *bad* idea to disable authorization, unless
-# you plan on removing the command CGI (cmd.cgi)!  Failure to do
-# so will leave you wide open to kiddies messing with Nagios and
-# possibly hitting you with a denial of service attack by filling up
-# your drive by continuously writing to your command file!
-#
-# Setting this value to 0 will cause the CGIs to *not* use
-# authentication (bad idea), while any other value will make them
-# use the authentication functions (the default).
-
-use_authentication=1
-
-
-
-# DEFAULT USER
-# Setting this variable will define a default user name that can
-# access pages without authentication.  This allows people within a
-# secure domain (i.e., behind a firewall) to see the current status
-# without authenticating.  You may want to use this to avoid basic
-# authentication if you are not using a sercure server since basic
-# authentication transmits passwords in the clear.
-#
-# Important:  Do not define a default username unless you are
-# running a secure web server and are sure that everyone who has
-# access to the CGIs has been authenticated in some manner!  If you
-# define this variable, anyone who has not authenticated to the web
-# server will inherit all rights you assign to this user!
- 
-default_user_name=guest
-
-
-
-# SYSTEM/PROCESS INFORMATION ACCESS
-# This option is a comma-delimited list of all usernames that
-# have access to viewing the Nagios process information as
-# provided by the Extended Information CGI (extinfo.cgi).  By
-# default, *no one* has access to this unless you choose to
-# not use authorization.  You may use an asterisk (*) to
-# authorize any user who has authenticated to the web server.
-
-authorized_for_system_information=tstarling,RobH,mark,midom,laner,ariel,py,asher,dzahn,lcarr,jgreen
-
-
-
-# CONFIGURATION INFORMATION ACCESS
-# This option is a comma-delimited list of all usernames that
-# can view ALL configuration information (hosts, commands, etc).
-# By default, users can only view configuration information
-# for the hosts and services they are contacts for. You may use
-# an asterisk (*) to authorize any user who has authenticated
-# to the web server.
-
-authorized_for_configuration_information=tstarling,RobH,mark,midom,laner,ariel,py,asher,dzahn,lcarr,jgreen
-
-
-
-# SYSTEM/PROCESS COMMAND ACCESS
-# This option is a comma-delimited list of all usernames that
-# can issue shutdown and restart commands to Nagios via the
-# command CGI (cmd.cgi).  Users in this list can also change
-# the program mode to active or standby. By default, *no one*
-# has access to this unless you choose to not use authorization.
-# You may use an asterisk (*) to authorize any user who has
-# authenticated to the web server.
-
-authorized_for_system_commands=tstarling,RobH,mark,midom,laner,ariel,py,asher,dzahn,lcarr,jgreen
-
-
-
-# GLOBAL HOST/SERVICE VIEW ACCESS
-# These two options are comma-delimited lists of all usernames that
-# can view information for all hosts and services that are being
-# monitored.  By default, users can only view information
-# for hosts or services that they are contacts for (unless you
-# you choose to not use authorization). You may use an asterisk (*)
-# to authorize any user who has authenticated to the web server.
-
-
-#authorized_for_all_services=nagiosadmin,guest
-#authorized_for_all_hosts=nagiosadmin,guest
-
-authorized_for_all_services=*
-authorized_for_all_hosts=*
-
-
-# GLOBAL HOST/SERVICE COMMAND ACCESS
-# These two options are comma-delimited lists of all usernames that
-# can issue host or service related commands via the command
-# CGI (cmd.cgi) for all hosts and services that are being monitored. 
-# By default, users can only issue commands for hosts or services 
-# that they are contacts for (unless you you choose to not use 
-# authorization).  You may use an asterisk (*) to authorize any
-# user who has authenticated to the web server.
-
-#authorized_for_all_service_commands=nagiosadmin
-#authorized_for_all_host_commands=nagiosadmin
-
-authorized_for_all_service_commands=tstarling,RobH,mark,midom,laner,ariel,py,asher,dzahn,lcarr,jgreen
-authorized_for_all_host_commands=tstarling,RobH,mark,midom,laner,ariel,py,asher,dzahn,lcarr,jgreen
-
-
-# STATUSMAP BACKGROUND IMAGE
-# This option allows you to specify an image to be used as a 
-# background in the statusmap CGI.  It is assumed that the image
-# resides in the HTML images path (i.e. /usr/local/nagios/share/images).
-# This path is automatically determined by appending "/images"
-# to the path specified by the 'physical_html_path' directive.
-# Note:  The image file may be in GIF, PNG, JPEG, or GD2 format.
-# However, I recommend that you convert your image to GD2 format
-# (uncompressed), as this will cause less CPU load when the CGI
-# generates the image.
-
-#statusmap_background_image=smbackground.gd2
-
-
-
-# DEFAULT STATUSMAP LAYOUT METHOD
-# This option allows you to specify the default layout method
-# the statusmap CGI should use for drawing hosts.  If you do
-# not use this option, the default is to use user-defined
-# coordinates.  Valid options are as follows:
-#      0 = User-defined coordinates
-#      1 = Depth layers
-#       2 = Collapsed tree
-#       3 = Balanced tree
-#       4 = Circular
-#       5 = Circular (Marked Up)
-
-default_statusmap_layout=5
-
-
-
-# DEFAULT STATUSWRL LAYOUT METHOD
-# This option allows you to specify the default layout method
-# the statuswrl (VRML) CGI should use for drawing hosts.  If you
-# do not use this option, the default is to use user-defined
-# coordinates.  Valid options are as follows:
-#      0 = User-defined coordinates
-#       2 = Collapsed tree
-#       3 = Balanced tree
-#       4 = Circular
-
-default_statuswrl_layout=4
-
-
-
-# STATUSWRL INCLUDE
-# This option allows you to include your own objects in the 
-# generated VRML world.  It is assumed that the file
-# resides in the HTML path (i.e. /usr/local/nagios/share).
-
-#statuswrl_include=myworld.wrl
-
-
-
-# PING SYNTAX
-# This option determines what syntax should be used when
-# attempting to ping a host from the WAP interface (using
-# the statuswml CGI.  You must include the full path to
-# the ping binary, along with all required options.  The
-# $HOSTADDRESS$ macro is substituted with the address of
-# the host before the command is executed.
-# Please note that the syntax for the ping binary is
-# notorious for being different on virtually ever *NIX
-# OS and distribution, so you may have to tweak this to
-# work on your system.
-
-ping_syntax=/bin/ping -n -U -c 5 $HOSTADDRESS$
-
-
-
-# REFRESH RATE
-# This option allows you to specify the refresh rate in seconds
-# of various CGIs (status, statusmap, extinfo, and outages).  
-
-refresh_rate=90
-
-
-
-# SOUND OPTIONS
-# These options allow you to specify an optional audio file
-# that should be played in your browser window when there are
-# problems on the network.  The audio files are used only in
-# the status CGI.  Only the sound for the most critical problem
-# will be played.  Order of importance (higher to lower) is as
-# follows: unreachable hosts, down hosts, critical services,
-# warning services, and unknown services. If there are no
-# visible problems, the sound file optionally specified by
-# 'normal_sound' variable will be played.
-#
-#
-# <varname>=<sound_file>
-#
-# Note: All audio files must be placed in the /media subdirectory
-# under the HTML path (i.e. /usr/local/nagios/share/media/).
-
-host_unreachable_sound=hostdown.wav
-host_down_sound=hostdown.wav
-service_critical_sound=critical.wav
-service_warning_sound=warning.wav
-service_unknown_sound=warning.wav
-#normal_sound=noproblem.wav
-
-
-notes_url_target=main
-action_url_target=main
-lock_author_names=1
diff --git a/files/nagios/check-raid.py b/files/nagios/check-raid.py
deleted file mode 100755
index 116ac8f..0000000
--- a/files/nagios/check-raid.py
+++ /dev/null
@@ -1,341 +0,0 @@
-#!/usr/bin/python
-
-import sys, os, os.path, re, subprocess
-
-def main():
-       osName = os.uname()[0]
-       if osName == 'SunOS':
-               utility = 'zpool'
-       elif osName == 'Linux':
-               utility = getLinuxUtility()
-       else:
-               print 'WARNING: Operating system "%s" is not supported by this check script' % (osName)
-               sys.exit(1)
-
-       try:
-               if utility == None:
-                       print 'OK: no RAID installed'
-                       status = 0
-               elif utility == 'arcconf':
-                       status = checkAdaptec()
-               elif utility == 'tw_cli':
-                       status = check3ware()
-               elif utility == 'MegaCli':
-                       status = checkMegaSas()
-               elif utility == 'zpool':
-                       status = checkZfs()
-               elif utility == 'mdadm':
-                       status = checkSoftwareRaid()
-               else:
-                       print 'WARNING: %s is not yet supported by this check script' % (utility)
-                       status = 1
-       except:
-               error = sys.exc_info()[1]
-               print 'WARNING: check-raid.py encountered exception: ' + str(error)
-               status = 1
-       
-       sys.exit(status)
-
-def getLinuxUtility():
-       f = open("/proc/devices", "r")
-       regex = re.compile('^\s*\d+\s+(\w+)')
-       utility = None
-       for line in f:
-               m = regex.match(line)
-               if m == None:
-                       continue
-               name = m.group(1)
-               
-               if name == 'aac':
-                       utility = 'arcconf'
-                       break
-               elif name == 'twe':
-                       utility = 'tw_cli'
-                       break
-               elif name == 'megadev':
-                       utility = 'megarc'
-                       break
-               elif name == 'megaraid_sas_ioctl':
-                       utility = 'MegaCli'
-                       break
-       
-       f.close()
-       if utility != None:
-               return utility
-
-       # Try mdadm
-       devices = getSoftwareRaidDevices()
-       if len(devices):
-               return 'mdadm'
-
-       return None
-
-def getSoftwareRaidDevices():
-       if not os.path.exists('/sbin/mdadm'):
-               return []
-
-       try:
-               proc = subprocess.Popen(['/sbin/mdadm', '--detail', '--scan'], 
-                               stdout=subprocess.PIPE)
-       except:
-               return []
-
-       regex = re.compile('^ARRAY\s+([^ ]*) ')
-       devices = []
-       for line in proc.stdout:
-               m = regex.match(line)
-               if m != None:
-                       devices.append(m.group(1))
-       proc.wait()
-
-       return devices
-
-def checkAdaptec():
-       # Need to change directory so that the log file goes to the right place
-       oldDir = os.getcwd()
-       os.chdir('/var/log')
-       devNull = open('/dev/null', 'w')
-
-       # Check if we need to run arcconf using sudo
-       try:
-               os.stat('/etc/sudoers.d/nrpe')
-               cmd = ['sudo']
-       except:
-               cmd = []
-
-       # Run the command
-       try:
-               proc = subprocess.Popen(cmd + ['/usr/bin/arcconf', 'getconfig', '1'], 
-                               stdout = subprocess.PIPE, stderr = devNull)
-       except:
-               print 'WARNING: Unable to execute arcconf'
-               os.chdir(oldDir)
-               return 1
-
-       defunctRegex = re.compile('^\s*Defunct disk drive count\s*:\s*(\d+)')
-       logicalRegex = re.compile('^\s*Logical devices/Failed/Degraded\s*:\s*(\d+)/(\d+)/(\d+)')
-       status = 0
-       numLogical = None
-       for line in proc.stdout:
-               m = defunctRegex.match(line)
-               if m != None and m.group(1) != '0':
-                       print 'CRITICAL: Defunct disk drive count: ' + m.group(1)
-                       status = 2
-                       break
-
-               m = logicalRegex.match(line)
-               if m != None:
-                       numLogical = int(m.group(1))
-                       if m.group(2) != '0' and m.group(3) != '0':
-                               print 'CRITICAL: logical devices: %s failed and %s defunct' % \
-                                       (m.group(2), m.group(3))
-                               status = 2
-                               break
-                       if m.group(2) != '0':
-                               print 'CRITICAL: logical devices: %s failed' % \
-                                       (m.group(2))
-                               status = 2
-                               break
-                       if m.group(3) != '0':
-                               print 'CRITICAL: logical devices: %s defunct' % \
-                                       (m.group(3))
-                               status = 2
-                               break
-
-       ret = proc.wait()
-       if status == 0 and ret != 0:
-               print 'WARNING: arcconf returned exit status %d' % (ret)
-               status = 1
-
-       if status == 0 and numLogical == None:
-               print 'WARNING: unable to parse output from arcconf'
-               status = 1
-       
-       if status == 0:
-               print 'OK: %d logical device(s) checked' % numLogical
-
-       os.chdir(oldDir)
-       return status
-
-
-def check3ware():
-       # Get the list of controllers
-       try:
-               proc = subprocess.Popen(['/usr/bin/tw_cli', 'show'], stdout = subprocess.PIPE)
-       except:
-               print 'WARNING: error executing tw_cli'
-               return 1
-
-       regex = re.compile('^(c\d+)')
-       controllers = []
-       for line in proc.stdout:
-               m = regex.match(line)
-               if m != None:
-                       controllers.push('/' + m.group(1))
-       
-       ret = proc.wait()
-       if ret != 0:
-               print 'WARNING: tw_cli returned exit status %d' % (ret)
-               return 1
-
-       # Check each controller
-       regex = re.compile('^(p\d+)\s+([\w-]+)')
-       failedDrives = []
-       numDrives = 0
-       for controller in controllers:
-               proc = subprocess.Popen(['/usr/bin/tw_cli', controller, 'show'],
-                               stdout = subprocess.PIPE)
-               for line in proc.stdout():
-                       m = regex.match(line)
-                       if m != None:
-                               numDrives += 1
-                               if m.group(2) != 'OK':
-                                       failedDrives.push(controller + '/' + m.group(1))
-
-               proc.wait()
-       
-       if len(failedDrives) != 0:
-               print 'CRITICAL: %d failed drive(s): %s' % \
-                               (len(failedDrives), ', '.join(failedDrives) )
-               return 2
-
-       if numDrives == 0:
-               print 'WARNING: no physical drives found, tw_cli parse error?'
-               return 1
-       else:
-               print 'OK: %d drives checked' % numDrives
-               return 0
-
-def checkMegaSas():
-       try:
-               proc = subprocess.Popen(['/usr/bin/MegaCli64', '-LDInfo', '-LALL', '-aALL'], 
-                               stdout=subprocess.PIPE)
-       except:
-               error = sys.exc_info()[1]
-               print 'WARNING: error executing MegaCli64: %s' % str(error)
-               return 1
-       
-       stateRegex = re.compile('^State:\s*([^\n]*)')
-       drivesRegex = re.compile('^Number Of Drives( per span)?:\s*([^\n]*)')
-       state = None
-       numDrives = None
-       for line in proc.stdout:
-               m = stateRegex.match(line)
-               if m != None:
-                       state = m.group(1)
-                       continue
-               
-               m = drivesRegex.match(line)
-               if m != None:
-                       numDrives = int(m.group(2))
-                       continue
-       
-       ret = proc.wait()
-       if ret != 0:
-               print 'WARNING: MegaCli64 returned exit status %d' % (ret)
-               return 1
-
-       if numDrives == None:
-               print 'WARNING: Parse error processing MegaCli64 output'
-               return 1
-
-       if state != 'Optimal':
-               print 'CRITICAL: %s' % (state)
-               return 2
-
-       print 'OK: State is %s, checked %d logical device(s)' % (state, numDrives)
-       return 0
-
-def checkZfs():
-       try:
-               proc = subprocess.Popen(['/sbin/zpool', 'list', '-Honame,health'],
-                               stdout=subprocess.PIPE)
-       except:
-               error = sys.exc_info()[1]
-               print 'WARNING: error executing zpool: %s' % str(error)
-               return 1
-
-       regex = re.compile('^(\S+)\s+(\S+)')
-       status = 0
-       msg = ''
-       for line in proc.stdout:
-               m = regex.match(line)
-               if m != None:
-                       name = m.group(1)
-                       health = m.group(2)
-                       if health != 'ONLINE':
-                               status = 2
-
-                       if msg != '':
-                               msg += ', '
-                       msg += name + ': ' + health
-
-       ret = proc.wait()
-       if ret != 0:
-               print 'WARNING: zpool returned exit status %d' % (ret)
-               return 1
-       
-       if status:
-               print 'CRITICAL: ' + msg
-       else:
-               print 'OK: ' + msg
-       return status
-
-def checkSoftwareRaid():
-       devices = getSoftwareRaidDevices()
-       if len(devices) == 0:
-               print 'WARNING: Unexpectedly checked no devices'
-               return 1
-
-       args = ['/sbin/mdadm', '--detail']
-       args.extend(devices)
-       try:
-               proc = subprocess.Popen(args, stdout = subprocess.PIPE)
-       except:
-               error = sys.exc_info()[1]
-               print 'WARNING: error executing mdadm: %s' % str(error)
-               return 1
-
-       deviceRegex = re.compile('^(/[^ ]*):$')
-       statRegex = re.compile('^ *(Active|Working|Failed|Spare) Devices *: *(\d+)')
-       currentDevice = None
-       stats = {
-               'Active': 0,
-               'Working': 0,
-               'Failed': 0,
-               'Spare': 0
-       }
-       for line in proc.stdout:
-               m = deviceRegex.match(line)
-               if m == None:
-                       if currentDevice == None:
-                               continue
-               else:
-                       currentDevice = m.group(1)
-                       continue
-               
-               m = statRegex.match(line)
-               if m == None:
-                       continue
-
-               stats[m.group(1)] += int(m.group(2))
-
-       ret = proc.wait()
-       if ret != 0:
-               print 'WARNING: mdadm returned exit status %d' % (ret)
-               return 1
-
-       msg = ''
-       for name in ('Active', 'Working', 'Failed', 'Spare'):
-               if msg != '':
-                       msg += ', '
-               msg += name + ': ' + str(stats[name])
-       
-       if stats['Failed'] > 0:
-               print 'CRITICAL: ' + msg
-               return 2
-       else:
-               print 'OK: ' + msg
-               return 0
-
-main()
diff --git a/manifests/misc/icinga.pp b/manifests/misc/icinga.pp
index 866e8aa..554b2d5 100644
--- a/manifests/misc/icinga.pp
+++ b/manifests/misc/icinga.pp
@@ -576,6 +576,11 @@
       owner => 'root',
       group => 'root',
       mode => '0755';
+    '/usr/lib/nagios/plugins/check_ram.sh':
+      source => 'puppet:///files/icinga/check_ram.sh',
+      owner => 'root',
+      group => 'root',
+      mode => '0755';
   }
 
   # some default configuration files conflict and should be removed
diff --git a/manifests/nagios.pp b/manifests/nagios.pp
index 5e67167..11fed05 100644
--- a/manifests/nagios.pp
+++ b/manifests/nagios.pp
@@ -176,28 +176,6 @@
 }
 
 
-
-class nagios::ganglia::ganglios {
-       include generic::mysql::packages::client,
-               ganglia::collector
-
-       package { "ganglios":
-               ensure => latest;
-       }
-       cron { "ganglios-cron":
-               command => "test -w /var/log/ganglia/ganglia_parser.log && /usr/sbin/ganglia_parser",
-               user => nagios,
-               minute => "*/2",
-               ensure => present;
-       }
-       file { "/var/lib/ganglia/xmlcache":
-               ensure => directory,
-               mode => 0755,
-               owner => nagios;
-       }
-}
-
-
 class nagios::gsbmonitoring {
        @monitor_host { "google": ip_address => "74.125.225.84" }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/55785
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I768cb221b9b5e6b4f03f32f01cebe47bea8e0ba0
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Lcarr <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to