Repository: bigtop Updated Branches: refs/heads/master 4cfd51332 -> 05f14ffa5
BIGTOP-3047: juju: nagios support for zookeeper Closes #372 Signed-off-by: Kevin W Monroe <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/05f14ffa Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/05f14ffa Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/05f14ffa Branch: refs/heads/master Commit: 05f14ffa56196f2aba92662155df1a076c17d1f2 Parents: 4cfd513 Author: Kevin W Monroe <[email protected]> Authored: Thu Aug 2 13:25:38 2018 -0500 Committer: Kevin W Monroe <[email protected]> Committed: Thu Aug 2 13:25:38 2018 -0500 ---------------------------------------------------------------------- .../charm/zookeeper/layer-zookeeper/config.yaml | 16 + .../layer-zookeeper/files/check_zookeeper.py | 356 +++++++++++++++++++ .../charm/zookeeper/layer-zookeeper/layer.yaml | 2 + .../zookeeper/layer-zookeeper/metadata.yaml | 6 + .../layer-zookeeper/reactive/zookeeper.py | 102 +++++- 5 files changed, 481 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml index df9af76..63f566c 100644 --- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml +++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml @@ -23,3 +23,19 @@ options: snapRetainCount most recent snapshots and the corresponding transaction logs in the dataDir and dataLogDir respectively and deletes the rest. Defaults to 3. Minimum value is 3. + nagios_context: + default: "juju" + type: string + description: | + Used by the nrpe subordinate charms. + A string that will be prepended to instance name to set the host name + in nagios. So for instance the hostname would be something like: + juju-myservice-0 + If you're running multiple environments with the same services in them + this allows you to differentiate between them. + nagios_servicegroups: + default: "" + type: string + description: | + A comma-separated list of nagios servicegroups. + If left empty, the nagios_context will be used as the servicegroup http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py new file mode 100644 index 0000000..923ccef --- /dev/null +++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py @@ -0,0 +1,356 @@ +#! /usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Check Zookeeper Cluster + +Generic monitoring script that could be used with multiple platforms (Ganglia, Nagios, Cacti). + +It requires ZooKeeper 3.4.0 or greater. The script needs the 'mntr' 4letter word +command (patch ZOOKEEPER-744) that was now commited to the trunk. +The script also works with ZooKeeper 3.3.x but in a limited way. + +Taken from https://github.com/andreisavu/zookeeper-monitoring/ + +""" + +import sys +import socket +import logging +import re +import subprocess + +from StringIO import StringIO +from optparse import OptionParser, OptionGroup + +__version__ = (0, 1, 0) + +log = logging.getLogger() +logging.basicConfig(level=logging.ERROR) + +class NagiosHandler(object): + + @classmethod + def register_options(cls, parser): + group = OptionGroup(parser, 'Nagios specific options') + + group.add_option('-w', '--warning', dest='warning') + group.add_option('-c', '--critical', dest='critical') + + parser.add_option_group(group) + + def analyze(self, opts, cluster_stats): + try: + warning = int(opts.warning) + critical = int(opts.critical) + + except (TypeError, ValueError): + print >>sys.stderr, 'Invalid values for "warning" and "critical".' + return 2 + + if opts.key is None: + print >>sys.stderr, 'You should specify a key name.' + return 2 + + warning_state, critical_state, values = [], [], [] + for host, stats in cluster_stats.items(): + if opts.key in stats: + + value = stats[opts.key] + values.append('%s=%s;%s;%s' % (host, value, warning, critical)) + + if warning >= value > critical or warning <= value < critical: + warning_state.append(host) + + elif (warning < critical and critical <= value) or (warning > critical and critical >= value): + critical_state.append(host) + + values = ' '.join(values) + if critical_state: + print 'Critical "%s" %s!|%s' % (opts.key, ', '.join(critical_state), values) + return 2 + + elif warning_state: + print 'Warning "%s" %s!|%s' % (opts.key, ', '.join(warning_state), values) + return 1 + + else: + print 'Ok "%s"!|%s' % (opts.key, values) + return 0 + +class CactiHandler(object): + + @classmethod + def register_options(cls, parser): + group = OptionGroup(parser, 'Cacti specific options') + + group.add_option('-l', '--leader', dest='leader', + action="store_true", help="only query the cluster leader") + + parser.add_option_group(group) + + def analyze(self, opts, cluster_stats): + if opts.key is None: + print >>sys.stderr, 'The key name is mandatory.' + return 1 + + if opts.leader is True: + try: + leader = [x for x in cluster_stats.values() \ + if x.get('zk_server_state', '') == 'leader'][0] + + except IndexError: + print >>sys.stderr, 'No leader found.' + return 3 + + if opts.key in leader: + print leader[opts.key] + return 0 + + else: + print >>sys.stderr, 'Unknown key: "%s"' % opts.key + return 2 + else: + for host, stats in cluster_stats.items(): + if opts.key not in stats: + continue + + host = host.replace(':', '_') + print '%s:%s' % (host, stats[opts.key]), + + +class GangliaHandler(object): + + @classmethod + def register_options(cls, parser): + group = OptionGroup(parser, 'Ganglia specific options') + + group.add_option('-g', '--gmetric', dest='gmetric', + default='/usr/bin/gmetric', help='ganglia gmetric binary '\ + 'location: /usr/bin/gmetric') + + parser.add_option_group(group) + + def call(self, *args, **kwargs): + subprocess.call(*args, **kwargs) + + def analyze(self, opts, cluster_stats): + if len(cluster_stats) != 1: + print >>sys.stderr, 'Only allowed to monitor a single node.' + return 1 + + for host, stats in cluster_stats.items(): + for k, v in stats.items(): + try: + self.call([opts.gmetric, '-n', k, '-v', str(int(v)), '-t', 'uint32']) + except (TypeError, ValueError): + pass + +class ZooKeeperServer(object): + + def __init__(self, host='localhost', port='2181', timeout=1): + self._address = (host, int(port)) + self._timeout = timeout + + def get_stats(self): + """ Get ZooKeeper server stats as a map """ + data = self._send_cmd('mntr') + if data: + return self._parse(data) + else: + data = self._send_cmd('stat') + return self._parse_stat(data) + + def _create_socket(self): + return socket.socket() + + def _send_cmd(self, cmd): + """ Send a 4letter word command to the server """ + s = self._create_socket() + s.settimeout(self._timeout) + + s.connect(self._address) + s.send(cmd) + + data = s.recv(2048) + s.close() + + return data + + def _parse(self, data): + """ Parse the output from the 'mntr' 4letter word command """ + h = StringIO(data) + + result = {} + for line in h.readlines(): + try: + key, value = self._parse_line(line) + result[key] = value + except ValueError: + pass # ignore broken lines + + return result + + def _parse_stat(self, data): + """ Parse the output from the 'stat' 4letter word command """ + h = StringIO(data) + + result = {} + + version = h.readline() + if version: + result['zk_version'] = version[version.index(':')+1:].strip() + + # skip all lines until we find the empty one + while h.readline().strip(): pass + + for line in h.readlines(): + m = re.match('Latency min/avg/max: (\d+)/(\d+)/(\d+)', line) + if m is not None: + result['zk_min_latency'] = int(m.group(1)) + result['zk_avg_latency'] = int(m.group(2)) + result['zk_max_latency'] = int(m.group(3)) + continue + + m = re.match('Received: (\d+)', line) + if m is not None: + result['zk_packets_received'] = int(m.group(1)) + continue + + m = re.match('Sent: (\d+)', line) + if m is not None: + result['zk_packets_sent'] = int(m.group(1)) + continue + + m = re.match('Outstanding: (\d+)', line) + if m is not None: + result['zk_outstanding_requests'] = int(m.group(1)) + continue + + m = re.match('Mode: (.*)', line) + if m is not None: + result['zk_server_state'] = m.group(1) + continue + + m = re.match('Node count: (\d+)', line) + if m is not None: + result['zk_znode_count'] = int(m.group(1)) + continue + + return result + + def _parse_line(self, line): + try: + key, value = map(str.strip, line.split('\t')) + except ValueError: + raise ValueError('Found invalid line: %s' % line) + + if not key: + raise ValueError('The key is mandatory and should not be empty') + + try: + value = int(value) + except (TypeError, ValueError): + pass + + return key, value + +def main(): + opts, args = parse_cli() + + cluster_stats = get_cluster_stats(opts.servers) + if opts.output is None: + dump_stats(cluster_stats) + return 0 + + handler = create_handler(opts.output) + if handler is None: + log.error('undefined handler: %s' % opts.output) + sys.exit(1) + + return handler.analyze(opts, cluster_stats) + +def create_handler(name): + """ Return an instance of a platform specific analyzer """ + try: + return globals()['%sHandler' % name.capitalize()]() + except KeyError: + return None + +def get_all_handlers(): + """ Get a list containing all the platform specific analyzers """ + return [NagiosHandler, CactiHandler, GangliaHandler] + +def dump_stats(cluster_stats): + """ Dump cluster statistics in an user friendly format """ + for server, stats in cluster_stats.items(): + print 'Server:', server + + for key, value in stats.items(): + print "%30s" % key, ' ', value + print + +def get_cluster_stats(servers): + """ Get stats for all the servers in the cluster """ + stats = {} + for host, port in servers: + try: + zk = ZooKeeperServer(host, port) + stats["%s:%s" % (host, port)] = zk.get_stats() + + except socket.error, e: + # ignore because the cluster can still work even + # if some servers fail completely + + # this error should be also visible in a variable + # exposed by the server in the statistics + + logging.info('unable to connect to server '\ + '"%s" on port "%s"' % (host, port)) + + return stats + + +def get_version(): + return '.'.join(map(str, __version__)) + + +def parse_cli(): + parser = OptionParser(usage='./check_zookeeper.py <options>', version=get_version()) + + parser.add_option('-s', '--servers', dest='servers', + help='a list of SERVERS', metavar='SERVERS') + + parser.add_option('-o', '--output', dest='output', + help='output HANDLER: nagios, ganglia, cacti', metavar='HANDLER') + + parser.add_option('-k', '--key', dest='key') + + for handler in get_all_handlers(): + handler.register_options(parser) + + opts, args = parser.parse_args() + + if opts.servers is None: + parser.error('The list of servers is mandatory') + + opts.servers = [s.split(':') for s in opts.servers.split(',')] + + return (opts, args) + + +if __name__ == '__main__': + sys.exit(main()) http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml index 7f6ee76..e52afc8 100644 --- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml +++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml @@ -4,6 +4,8 @@ includes: - 'layer:leadership' - 'interface:zookeeper-quorum' - 'interface:zookeeper' + - 'interface:nrpe-external-master' + - 'interface:local-monitors' options: apache-bigtop-base: ports: http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml index a563775..36dce42 100644 --- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml +++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml @@ -14,6 +14,12 @@ tags: [] provides: zookeeper: interface: zookeeper + nrpe-external-master: + interface: nrpe-external-master + scope: container + local-monitors: + interface: local-monitors + scope: container peers: zkpeer: interface: zookeeper-quorum http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py index 0cf11c2..5af4c5d 100644 --- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py +++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py @@ -19,8 +19,108 @@ from charmhelpers.core import hookenv from charms.layer.apache_bigtop_base import get_package_version from charms.layer.bigtop_zookeeper import Zookeeper from charms.leadership import leader_set, leader_get -from charms.reactive import set_state, when, when_not, is_state +from charms.reactive import ( + hook, + is_state, + remove_state, + set_state, + when, + when_not +) from charms.reactive.helpers import data_changed +import shutil +import os + + +@when('local-monitors.available') +def local_monitors_available(nagios): + setup_nagios(nagios) + + +@when('nrpe-external-master.available') +def nrpe_external_master_available(nagios): + setup_nagios(nagios) + + +def setup_nagios(nagios): + config = hookenv.config() + unit_name = hookenv.local_unit() + checks = [ + { + 'name': 'zk_open_file_descriptor_coun', + 'description': 'ZK_Open_File_Descriptors_Count', + 'warn': 500, + 'crit': 800 + }, + { + 'name': 'zk_ephemerals_count', + 'description': 'ZK_Ephemerals_Count', + 'warn': 10000, + 'crit': 100000 + }, + { + 'name': 'zk_avg_latency', + 'description': 'ZK_Avg_Latency', + 'warn': 500, + 'crit': 1000 + }, + { + 'name': 'zk_max_latency', + 'description': 'ZK_Max_Latency', + 'warn': 1000, + 'crit': 2000 + }, + { + 'name': 'zk_min_latency', + 'description': 'ZK_Min_Latency', + 'warn': 500, + 'crit': 1000 + }, + { + 'name': 'zk_outstanding_requests', + 'description': 'ZK_Outstanding_Requests', + 'warn': 20, + 'crit': 50 + }, + { + 'name': 'zk_watch_count', + 'description': 'ZK_Watch_Count', + 'warn': 100, + 'crit': 500 + }, + ] + check_cmd = ['/usr/local/lib/nagios/plugins/check_zookeeper.py', + '-o', 'nagios', '-s', 'localhost:2181'] + for check in checks: + nagios.add_check(check_cmd + ['--key', check['name'], + '-w', str(check['warn']), + '-c', str(check['crit'])], + name=check['name'], + description=check['description'], + context=config["nagios_context"], + servicegroups=config["nagios_servicegroups"], + unit=unit_name + ) + nagios.updated() + + +@hook('upgrade-charm') +def nrpe_helper_upgrade_charm(): + # Make sure the nrpe handler will get replaced at charm upgrade + remove_state('zookeeper.nrpe_helper.installed') + + +@when('zookeeper.nrpe_helper.registered') +@when_not('zookeeper.nrpe_helper.installed') +def install_nrpe_helper(): + dst_dir = '/usr/local/lib/nagios/plugins/' + if not os.path.exists(dst_dir): + os.makedirs(dst_dir) + src = '{}/files/check_zookeeper.py'.format(hookenv.charm_dir()) + dst = '{}/check_zookeeper.py'.format(dst_dir) + shutil.copy(src, dst) + os.chmod(dst, 0o755) + set_state('zookeeper.nrpe_helper.installed') @when('bigtop.available')
