Filippo Giunchedi has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/154786

Change subject: elasticsearch: add percent-based shard check
......................................................................

elasticsearch: add percent-based shard check

One of the problems that have been observed is the cluster going temporarily
red while elasticsearch shuffles shards around and the nagios check firing a
false positive.

This plugin adds functionality to check non-active shards (i.e. everything that
ES reports as not in its place), and critically the ability to check
percentages of the total shards.

This should reduce the number of false positives. The plan is to extend the
plugin with more in-depth checks on the index/shard/etc. status.

The plugin by default also checks the cluster status as a whole if it is
non-green, however that can be disabled to avoid false positives for the
reasons explained above.

Change-Id: Icf4240e08dd612ab43825db63b0eae3608b73225
---
A modules/elasticsearch/files/nagios/check_elasticsearch.py
A modules/elasticsearch/files/nagios/check_elasticsearch_test.py
2 files changed, 181 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/86/154786/1

diff --git a/modules/elasticsearch/files/nagios/check_elasticsearch.py 
b/modules/elasticsearch/files/nagios/check_elasticsearch.py
new file mode 100644
index 0000000..461a4ed
--- /dev/null
+++ b/modules/elasticsearch/files/nagios/check_elasticsearch.py
@@ -0,0 +1,157 @@
+#!/usr/bin/python
+
+# Author: Filippo Giunchedi <[email protected]>
+# Copyright 2014 Wikimedia Foundation
+# Copyright 2014 Filippo Giunchedi
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+import argparse
+import re
+import operator
+import sys
+
+import requests
+
+
+EX_OK = 0  # returned when every enabled check passes
+EX_WARNING = 1  # defined but not returned by any code path in this script
+EX_CRITICAL = 2  # returned on fetch/decode errors, breach or non-green status
+EX_UNKNOWN = 3  # defined but not returned by any code path in this script
+
+
+class Threshold(object):
+    '''Implement a simple threshold parser/checker with common predicates and
+    percentages.'''
+
+    PREDICATES = {
+        '<=': operator.le,
+        '>=': operator.ge,
+        '>': operator.gt,
+        '<': operator.lt,
+        '==': operator.eq,
+        }
+
+    def __init__(self, threshold):
+        self.threshold_string = threshold
+        self.predicate = None
+        self.threshold = None
+        self.percent = None
+        self.FORMAT_RE = re.compile(r'^(%s)?\s*([\d.]+)\s*(%%)?' % 
'|'.join(self.PREDICATES))
+        self._parse(threshold)
+
+    def breach(self, value, total=None):
+        if total is None and self.percent is not None:
+            raise ValueError('threshold %r has percent but no total provided' %
+                    self.threshold_string)
+        if total is not None:
+            value = float(value) / total
+        return self.predicate(value, self.threshold)
+
+    def _parse(self, threshold):
+        m = self.FORMAT_RE.match(threshold)
+        if not m:
+            raise ValueError('unable to parse threshold: %r' % threshold)
+        predicate, value, percent = m.groups()
+        try:
+            value = float(value)
+        except ValueError, e:
+            raise ValueError('unable to parse as float: %r' % value)
+        self.predicate = self.PREDICATES.get(predicate, operator.eq)
+        self.threshold = value
+        self.percent = percent
+
+
+def check_status(health):
+    if health['status'] != 'green':
+        return EX_CRITICAL
+    return EX_OK
+
+
+def log_critical(log):
+    print 'CRITICAL - elasticsearch %s' % log
+
+
+def log_ok(log):
+    print 'OK - elasticsearch %s' % log
+
+
+def check_shards_inactive(health, threshold):
+    total_shards = 0
+    inactive_shards = 0
+    for s in 'relocating', 'initializing', 'unassigned':
+        inactive_shards += health['%s_shards' % s]
+        total_shards += health['%s_shards' % s]
+    total_shards += health['active_shards']
+    t = Threshold(threshold)
+    if not t.breach(inactive_shards, total_shards):
+        return EX_OK
+
+    log_critical('inactive shards %s threshold %s breach: %r' % (
+            inactive_shards, threshold, health))
+    return EX_CRITICAL
+
+
+def check_elasticsearch(options):
+    try:
+        cluster_health_url = options.url + '/_cluster/health'
+        response = requests.get(cluster_health_url,
+                timeout=options.timeout)
+        response.raise_for_status()
+    except requests.exceptions.RequestException, e:
+        log_critical('%s error while fetching: %s' % (cluster_health_url, e))
+        return EX_CRITICAL
+
+    try:
+        cluster_health = response.json()
+    except ValueError, e:
+        log_critical('%s error while decoding json: %s' % (cluster_health_url,
+            e))
+        return EX_CRITICAL
+
+    r = check_shards_inactive(cluster_health, options.shards_inactive)
+    if r != EX_OK:
+        return r
+
+    if not options.ignore_status:
+        r = check_status(cluster_health)
+        if r != EX_OK:
+            return r
+
+    log_ok('status %s: %r' % (cluster_health['cluster_name'], cluster_health))
+    return EX_OK
+
+
+def main():
+    parser = argparse.ArgumentParser(
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--url', default='http://localhost:9200',
+            help='Elasticsearch endpoint')
+    parser.add_argument('--timeout', default=2, type=int, metavar='SECONDS',
+            help='Timeout for the request to complete')
+    parser.add_argument('--shards-inactive', default='>=0.1%',
+            dest='shards_inactive', metavar='THRESHOLD',
+            help='Threshold to check for inactive shards '
+                 '(i.e. initializing/relocating/unassigned)')
+    parser.add_argument('--ignore-status', default=False, action='store_true',
+            dest='ignore_status',
+            help='Do not check elasticsearch cluster status')
+    options = parser.parse_args()
+
+    return check_elasticsearch(options)
+
+
+if __name__ == '__main__':
+    sys.exit(main())  # main() returns the plugin's exit status
diff --git a/modules/elasticsearch/files/nagios/check_elasticsearch_test.py 
b/modules/elasticsearch/files/nagios/check_elasticsearch_test.py
new file mode 100644
index 0000000..89214ba
--- /dev/null
+++ b/modules/elasticsearch/files/nagios/check_elasticsearch_test.py
@@ -0,0 +1,24 @@
+import unittest
+
+from check_elasticsearch import Threshold
+
+
+class ThresholdTest(unittest.TestCase):
+    def testBasicThreshold(self):
+        self.assertFalse(self._breach('>0', 0))
+        self.assertTrue(self._breach('0', 0))
+        self.assertFalse(self._breach('>=2', 0))
+        self.assertFalse(self._breach('2', 0))
+
+    def testInvalidThreshold(self):
+        self.assertRaises(ValueError, self._breach, '')
+        self.assertRaises(ValueError, self._breach, '>')
+        self.assertRaises(ValueError, self._breach, '%123')
+
+    def testPercentThreshold(self):
+        self.assertRaises(ValueError, self._breach, '>0%', 0)
+        self.assertTrue(self._breach('>=0.34%', 42, 123))
+        self.assertFalse(self._breach('1%', 1, 100))
+
+    def _breach(self, threshold, *args):
+        return Threshold(threshold).breach(*args)

-- 
To view, visit https://gerrit.wikimedia.org/r/154786
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Icf4240e08dd612ab43825db63b0eae3608b73225
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Filippo Giunchedi <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to