MaxSem has uploaded a new change for review.
https://gerrit.wikimedia.org/r/49372
Change subject: check_solr, attempt 2
......................................................................
check_solr, attempt 2
Change-Id: I65a14f8f0783040ff2869953fb111e72decb842c
---
A files/nagios/check_solr
1 file changed, 137 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/72/49372/1
diff --git a/files/nagios/check_solr b/files/nagios/check_solr
new file mode 100755
index 0000000..fe7e49a
--- /dev/null
+++ b/files/nagios/check_solr
@@ -0,0 +1,137 @@
+#!/usr/bin/env python2.7
+
+import urllib2
+import sqlite3
+import xml.etree.ElementTree as ET
+from optparse import OptionParser
+import json
+from datetime import datetime
+
+cmd_parser = OptionParser(usage='Usage: %prog [options] [host[:port]]')
+cmd_parser.add_option('-a', '--avgtime', action='store',\
+ dest='request_time', help='Check average request time',\
+ metavar='error_threshold[:warning_threshold]')
+cmd_parser.add_option('-q', '--qps', action='store', dest='qps',\
+ help='Check requests per second',\
+ metavar='error_threshold[:warning_threshold]')
+cmd_parser.add_option('-r', '--check-replication', action='store_true',\
+ dest='check_replication', default=False, help='Check replication')
+cmd_parser.add_option('-t', '--timeout', type='int', action='store',\
+ dest='timeout', help='Timeout in seconds', default=3)
+
+(cmd_options, cmd_args) = cmd_parser.parse_args()
+if len(cmd_args) > 0:
+ host = cmd_args[0]
+else:
+ host = 'localhost'
+
+if len(host.split(':')) < 2:
+ host += ':8983'
+
+errors = []
+warnings = []
+unknowns = []
+
+def http_get(path):
+ url = 'http://' + host + '/solr/' + path
+ return urllib2.urlopen(url, None, cmd_options.timeout).read()
+
+
+def check_value(value, limits, message):
+ value = float(value)
+ limits = limits.split(':')
+ if value > float(limits[0]):
+ errors.append(message % (value, limits[0]))
+ elif len(limits) > 1 and value > float(limits[1]):
+ warnings.append(message % (value, limits[1]))
+
+
+def check_stat(entry, stat, limits, stat_name):
+ if limits == None:
+ return
+ node = entry.find(".//stats/stat[@name='%s']" % stat)
+ if node != None:
+ check_value(node.text, limits, stat_name + ' is %s (gt %s)')
+ else:
+ unknowns.append('Parameter "%s" not found in response' % stat)
+
+
+def check_select(entry):
+ check_stat(entry, 'avgTimePerRequest', cmd_options.request_time,\
+ 'Average request time')
+ check_stat(entry, 'avgRequestsPerSecond', cmd_options.request_time,\
+ 'Average requests per second')
+
+
+def parse_datetime(str):
+ # e.g. 'Thu Feb 07 00:28:00 UTC 2013'
+ return datetime.strptime(str, '%a %b %d %H:%M:%S UTC %Y')
+
+
+def check_all_stats():
+ text = http_get('admin/stats.jsp')
+ xml = ET.fromstring(text)
+
+ for entry in xml.iter('entry'):
+ name = entry.find('name').text.strip()
+ if name == '/select':
+ check_select(entry)
+ break
+
+
+def check_core_replication(core_name):
+ url_start = core_name
+ if url_start != '':
+ url_start += '/'
+ text = http_get(url_start + 'replication?command=details&wt=json')
+ response = json.loads(text)
+ details = response['details']
+ if details['isMaster'] == 'true':
+ if details['master']['replicationEnabled'] != 'true':
+ errors.append('Core "%s" is a master, but replication \
+ is disabled on it' % core_name)
+ elif details['isSlave'] == 'true':
+ if details['slave']['isPollingDisabled'] != 'false':
+ errors.append('Core "%s" is a slave, but polling is \
+ disabled on it' % core_name)
+ last_error = details['slave']['replicationFailedAt']
+ last_success = parse_datetime(details['slave']['indexReplicatedAt'])
+ if parse_datetime(last_error) > last_success:
+ errors.append('Core "%s": replication failed at %s'\
+ % (core_name, last_error))
+
+
+def check_replication():
+ if not cmd_options.check_replication:
+ return
+ # todo: multicore support
+ check_core_replication('')
+
+
+def process_results():
+ if len(errors):
+ print '; '.join(errors)
+ return 2
+ elif len(unknowns):
+ print '; '.join(unknowns)
+ return 3
+ elif len(warnings):
+ print '; '.join(warnings)
+ return 1
+ print 'All OK'
+ return 0
+
+
+try:
+ check_all_stats()
+ check_replication()
+
+except Exception as err:
+ (e, ) = err.args
+ if isinstance(err, urllib2.URLError) and e.errno == 115:
+ msg = 'Request timeout after %ds' % cmd_options.timeout
+ else:
+ msg = '%s: %s' % (type(err).__name__, e)
+ unknowns.append(msg)
+
+exit(process_results())
--
To view, visit https://gerrit.wikimedia.org/r/49372
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I65a14f8f0783040ff2869953fb111e72decb842c
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: MaxSem <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits