Gehel has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/349168 )
Change subject: logstash - raise elasticsearch shard alert threshold to 34 ...................................................................... logstash - raise elasticsearch shard alert threshold to 34 After some discussion, this seems to be a good compromise. It allows to loose one node (1/3 of the shards) without raising an alert. Some cleanup of the check_elasticsearch.py script is done as well: * drop the support of "%" * treat all threshold as ratio Previously, that script treated ">=0.1%" as a check that more than 10% of shards were unassigned, which is a bit misleading. Change-Id: I352e7230c6bd42f5d1a2fa84f33675e0a6ce8225 --- M modules/elasticsearch/files/nagios/check_elasticsearch.py M modules/elasticsearch/files/nagios/check_elasticsearch_test.py M modules/elasticsearch/manifests/nagios/check.pp M modules/nagios_common/files/checkcommands.cfg M modules/role/manifests/logstash/elasticsearch.pp 5 files changed, 43 insertions(+), 25 deletions(-) Approvals: Muehlenhoff: Looks good to me, but someone else must approve jenkins-bot: Verified DCausse: Looks good to me, but someone else must approve Gehel: Looks good to me, approved diff --git a/modules/elasticsearch/files/nagios/check_elasticsearch.py b/modules/elasticsearch/files/nagios/check_elasticsearch.py index e67ce4d..62c47a9 100755 --- a/modules/elasticsearch/files/nagios/check_elasticsearch.py +++ b/modules/elasticsearch/files/nagios/check_elasticsearch.py @@ -34,8 +34,7 @@ class Threshold(object): - '''Implement a simple threshold parser/checker with common predicates and - percentages.''' + '''Implement a simple threshold parser/checker with common predicates.''' PREDICATES = { '<=': operator.le, @@ -49,31 +48,27 @@ self.threshold_string = threshold self.predicate = None self.threshold = None - self.percent = None self.FORMAT_RE = re.compile( - r'^(%s)?\s*([\d.]+)\s*(%%)?' % '|'.join(self.PREDICATES)) + r'^(%s)?\s*([\d.]+)$' % '|'.join(self.PREDICATES)) self._parse(threshold) - def breach(self, value, total=None): - if total is None and self.percent is not None: - raise ValueError('threshold %r has percent but no total provided' % - self.threshold_string) - if total not in [None, 0]: - value = float(value) / total - return self.predicate(value, self.threshold) + def breach(self, value, total): + ratio = float(value) / total + return self.predicate(ratio, self.threshold) def _parse(self, threshold): m = self.FORMAT_RE.match(threshold) if not m: raise ValueError('unable to parse threshold: %r' % threshold) - predicate, value, percent = m.groups() + predicate, value = m.groups() try: value = float(value) except ValueError: raise ValueError('unable to parse as float: %r' % value) + if 0.0 > value or value > 1.0: + raise ValueError('threshold should be a ratio between 0.0 and 1.0: %r' % value) self.predicate = self.PREDICATES.get(predicate, operator.eq) self.threshold = value - self.percent = percent def _format_health(health): @@ -168,9 +163,9 @@ help='Timeout for the request to complete') parser.add_argument('--retries', default=2, type=int, metavar='INTEGER', help='How many times to retry a request on timeout') - parser.add_argument('--shards-inactive', default='>=0.1%', + parser.add_argument('--shards-inactive', default='>=0.1', dest='shards_inactive', metavar='THRESHOLD', - help='Threshold to check for inactive shards ' + help='Threshold to check the ratio of inactive shards ' '(i.e. initializing/relocating/unassigned)') parser.add_argument('--ignore-status', default=False, action='store_true', dest='ignore_status', diff --git a/modules/elasticsearch/files/nagios/check_elasticsearch_test.py b/modules/elasticsearch/files/nagios/check_elasticsearch_test.py index 89214ba..6373d7c 100644 --- a/modules/elasticsearch/files/nagios/check_elasticsearch_test.py +++ b/modules/elasticsearch/files/nagios/check_elasticsearch_test.py @@ -5,20 +5,25 @@ class ThresholdTest(unittest.TestCase): def testBasicThreshold(self): - self.assertFalse(self._breach('>0', 0)) - self.assertTrue(self._breach('0', 0)) - self.assertFalse(self._breach('>=2', 0)) - self.assertFalse(self._breach('2', 0)) + self.assertFalse(self._breach('>0', 0, 1)) + self.assertTrue(self._breach('0', 0, 1)) + self.assertFalse(self._breach('>=0.2', 0, 1)) + self.assertFalse(self._breach('0.2', 0, 1)) def testInvalidThreshold(self): self.assertRaises(ValueError, self._breach, '') self.assertRaises(ValueError, self._breach, '>') self.assertRaises(ValueError, self._breach, '%123') + self.assertRaises(ValueError, self._breach, '0.1%') + self.assertRaises(ValueError, self._breach, '1.1') def testPercentThreshold(self): - self.assertRaises(ValueError, self._breach, '>0%', 0) - self.assertTrue(self._breach('>=0.34%', 42, 123)) - self.assertFalse(self._breach('1%', 1, 100)) + self.assertFalse(self._breach('>0', 0, 1)) + self.assertTrue(self._breach('>=0.34', 42, 123)) + self.assertFalse(self._breach('1', 1, 100)) def _breach(self, threshold, *args): return Threshold(threshold).breach(*args) + +if __name__ == '__main__': + unittest.main() diff --git a/modules/elasticsearch/manifests/nagios/check.pp b/modules/elasticsearch/manifests/nagios/check.pp index ce651c7..850d74d 100644 --- a/modules/elasticsearch/manifests/nagios/check.pp +++ b/modules/elasticsearch/manifests/nagios/check.pp @@ -3,9 +3,15 @@ # Make sure your Nagios/Icinga node has included # the elasticsearch::nagios::plugin class. # -class elasticsearch::nagios::check { +# [*threshold*] +# The ratio of inactive shards to check (initializing / relocating / +# unassigned). +# Default: '>=0.1' +class elasticsearch::nagios::check( + $threshold = '>=0.1', +) { monitoring::service { 'elasticsearch shards': - check_command => 'check_elasticsearch_shards', + check_command => "check_elasticsearch_shards_threshold!${threshold}", description => 'ElasticSearch health check for shards', } } diff --git a/modules/nagios_common/files/checkcommands.cfg b/modules/nagios_common/files/checkcommands.cfg index da9af15..fca3c15 100644 --- a/modules/nagios_common/files/checkcommands.cfg +++ b/modules/nagios_common/files/checkcommands.cfg @@ -477,6 +477,11 @@ command_line $USER1$/check_elasticsearch.py --ignore-status --url http://$HOSTADDRESS$:9200 } +define command { + command_name check_elasticsearch_shards_threshold + command_line $USER1$/check_elasticsearch.py --ignore-status --url http://$HOSTADDRESS$:9200 --shards-inactive $ARG1$ +} + # Analytics Cluster Checks define command { diff --git a/modules/role/manifests/logstash/elasticsearch.pp b/modules/role/manifests/logstash/elasticsearch.pp index 26b97a9..fab2b88 100644 --- a/modules/role/manifests/logstash/elasticsearch.pp +++ b/modules/role/manifests/logstash/elasticsearch.pp @@ -5,10 +5,17 @@ # class role::logstash::elasticsearch { include ::standard - include ::elasticsearch::nagios::check include ::elasticsearch::monitor::diamond include ::base::firewall + # the logstash cluster has 3 data nodes, and each shard has 3 replica (each + #shard is present on each node). If one node is lost, 1/3 of the shards + # will be unassigned, with no way to reallocate them on another node, which + # is fine and should not raise an alert. So threshold needs to be > 1/3. + class { '::elasticsearch::nagios::check': + threshold => '>=0.34', + } + if $::standard::has_ganglia { include ::elasticsearch::ganglia } -- To view, visit https://gerrit.wikimedia.org/r/349168 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I352e7230c6bd42f5d1a2fa84f33675e0a6ce8225 Gerrit-PatchSet: 6 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Gehel <guillaume.leder...@wikimedia.org> Gerrit-Reviewer: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: Filippo Giunchedi <fgiunch...@wikimedia.org> Gerrit-Reviewer: Gehel <guillaume.leder...@wikimedia.org> Gerrit-Reviewer: Muehlenhoff <mmuhlenh...@wikimedia.org> Gerrit-Reviewer: Zppix <megadev44s.m...@gmail.com> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits