MaxSem has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/49372


Change subject: check_solr, attempt 2
......................................................................

check_solr, attempt 2

Change-Id: I65a14f8f0783040ff2869953fb111e72decb842c
---
A files/nagios/check_solr
1 file changed, 137 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/72/49372/1

diff --git a/files/nagios/check_solr b/files/nagios/check_solr
new file mode 100755
index 0000000..fe7e49a
--- /dev/null
+++ b/files/nagios/check_solr
@@ -0,0 +1,137 @@
+#!/usr/bin/env python2.7
+
+import urllib2
+import sqlite3
+import xml.etree.ElementTree as ET
+from optparse import OptionParser
+import json
+from datetime import datetime
+
+cmd_parser = OptionParser(usage='Usage: %prog [options] [host[:port]]')
+cmd_parser.add_option('-a', '--avgtime', action='store',\
+    dest='request_time', help='Check average request time',\
+    metavar='error_threshold[:warning_threshold]')
+cmd_parser.add_option('-q', '--qps', action='store', dest='qps',\
+    help='Check requests per second',\
+    metavar='error_threshold[:warning_threshold]')
+cmd_parser.add_option('-r', '--check-replication', action='store_true',\
+    dest='check_replication', default=False, help='Check replication')
+cmd_parser.add_option('-t', '--timeout', type='int', action='store',\
+    dest='timeout', help='Timeout in seconds', default=3)
+
+(cmd_options, cmd_args) = cmd_parser.parse_args()
+if len(cmd_args) > 0:
+    host = cmd_args[0]
+else:
+    host = 'localhost'
+
+if len(host.split(':')) < 2:
+    host += ':8983'
+
+errors = []
+warnings = []
+unknowns = []
+
+def http_get(path):
+    url = 'http://' + host + '/solr/' + path
+    return urllib2.urlopen(url, None, cmd_options.timeout).read()
+
+
+def check_value(value, limits, message):
+    value = float(value)
+    limits = limits.split(':')
+    if value > float(limits[0]):
+        errors.append(message % (value, limits[0]))
+    elif len(limits) > 1 and value > float(limits[1]):
+        warnings.append(message % (value, limits[1]))
+
+
+def check_stat(entry, stat, limits, stat_name):
+    if limits == None:
+        return
+    node = entry.find(".//stats/stat[@name='%s']" % stat)
+    if node != None:
+        check_value(node.text, limits, stat_name + ' is %s (gt %s)')
+    else:
+        unknowns.append('Parameter "%s" not found in response' % stat)
+
+
+def check_select(entry):
+    check_stat(entry, 'avgTimePerRequest', cmd_options.request_time,\
+        'Average request time')
+    check_stat(entry, 'avgRequestsPerSecond', cmd_options.request_time,\
+        'Average requests per second')
+
+
+def parse_datetime(str):
+    # e.g. 'Thu Feb 07 00:28:00 UTC 2013'
+    return datetime.strptime(str, '%a %b %d %H:%M:%S UTC %Y')
+
+
+def check_all_stats():
+    text = http_get('admin/stats.jsp')
+    xml = ET.fromstring(text)
+
+    for entry in xml.iter('entry'):
+        name = entry.find('name').text.strip()
+        if name == '/select':
+            check_select(entry)
+            break
+
+
+def check_core_replication(core_name):
+    url_start = core_name
+    if url_start != '':
+        url_start += '/'
+    text = http_get(url_start + 'replication?command=details&wt=json')
+    response = json.loads(text)
+    details = response['details']
+    if details['isMaster'] == 'true':
+        if details['master']['replicationEnabled'] != 'true':
+            errors.append('Core "%s" is a master, but replication \
+                is disabled on it' % core_name)
+    elif details['isSlave'] == 'true':
+        if details['slave']['isPollingDisabled'] != 'false':
+            errors.append('Core "%s" is a slave, but polling is \
+                disabled on it' % core_name)
+        last_error = details['slave']['replicationFailedAt']
+        last_success = parse_datetime(details['slave']['indexReplicatedAt'])
+        if parse_datetime(last_error) > last_success:
+            errors.append('Core "%s": replication failed at %s'\
+                % (core_name, last_error))
+
+
+def check_replication():
+    if not cmd_options.check_replication:
+        return
+    # todo: multicore support
+    check_core_replication('')
+
+
+def process_results():
+    if len(errors):
+        print '; '.join(errors)
+        return 2
+    elif len(unknowns):
+        print '; '.join(unknowns)
+        return 3
+    elif len(warnings):
+        print '; '.join(warnings)
+        return 1
+    print 'All OK'
+    return 0
+
+
+try:
+    check_all_stats()
+    check_replication()
+
+except Exception as err:
+    (e, ) = err.args
+    if isinstance(err, urllib2.URLError) and e.errno == 115:
+        msg = 'Request timeout after %ds' % cmd_options.timeout
+    else:
+        msg = '%s: %s' % (type(err).__name__, e)
+    unknowns.append(msg)
+
+exit(process_results())

-- 
To view, visit https://gerrit.wikimedia.org/r/49372
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I65a14f8f0783040ff2869953fb111e72decb842c
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: MaxSem <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to