Hello Giuseppe Lavagetto, jenkins-bot, Filippo Giunchedi, Volans, I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/385484 to review the following change. Change subject: Revert "apache: remove ganglia monitoring" ...................................................................... Revert "apache: remove ganglia monitoring" This reverts commit 6d16aa43a2d14e6a8cd914be962c060756d6d3fc. Change-Id: I41bcb2fde4474ffa0d5ab9658b573e8725507428 --- M hieradata/role/common/mediawiki/appserver.yaml M hieradata/role/common/mediawiki/appserver/api.yaml M hieradata/role/common/mediawiki/appserver/canary_api.yaml M hieradata/role/common/mediawiki/canary_appserver.yaml M hieradata/role/common/mediawiki/imagescaler.yaml M hieradata/role/common/mediawiki/jobrunner.yaml M hieradata/role/common/mediawiki/memcached.yaml M hieradata/role/common/mediawiki/videoscaler.yaml A modules/apache/files/apache_status.py A modules/apache/files/apache_status.pyconf M modules/apache/manifests/monitoring.pp 11 files changed, 573 insertions(+), 8 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/84/385484/1 diff --git a/hieradata/role/common/mediawiki/appserver.yaml b/hieradata/role/common/mediawiki/appserver.yaml index 7d14664..28e1f2f 100644 --- a/hieradata/role/common/mediawiki/appserver.yaml +++ b/hieradata/role/common/mediawiki/appserver.yaml @@ -18,4 +18,3 @@ apache::logrotate::rotate: 12 nutcracker::verbosity: "4" role::mediawiki::webserver::tls: true -standard::has_ganglia: false diff --git a/hieradata/role/common/mediawiki/appserver/api.yaml b/hieradata/role/common/mediawiki/appserver/api.yaml index 7f01396..1195204 100644 --- a/hieradata/role/common/mediawiki/appserver/api.yaml +++ b/hieradata/role/common/mediawiki/appserver/api.yaml @@ -18,4 +18,3 @@ apache::logrotate::rotate: 12 nutcracker::verbosity: "4" role::mediawiki::webserver::tls: true -standard::has_ganglia: false diff --git a/hieradata/role/common/mediawiki/appserver/canary_api.yaml b/hieradata/role/common/mediawiki/appserver/canary_api.yaml index 575a080..99cde6f 100644 --- a/hieradata/role/common/mediawiki/appserver/canary_api.yaml +++ b/hieradata/role/common/mediawiki/appserver/canary_api.yaml @@ -23,4 +23,3 @@ apache::logrotate::rotate: 12 nutcracker::verbosity: "4" role::mediawiki::webserver::tls: true -standard::has_ganglia: false diff --git a/hieradata/role/common/mediawiki/canary_appserver.yaml b/hieradata/role/common/mediawiki/canary_appserver.yaml index 10e7a9e..a33df65 100644 --- a/hieradata/role/common/mediawiki/canary_appserver.yaml +++ b/hieradata/role/common/mediawiki/canary_appserver.yaml @@ -23,4 +23,3 @@ apache::logrotate::rotate: 12 nutcracker::verbosity: "4" role::mediawiki::webserver::tls: true -standard::has_ganglia: false diff --git a/hieradata/role/common/mediawiki/imagescaler.yaml b/hieradata/role/common/mediawiki/imagescaler.yaml index 7a6a277..201ab70 100644 --- a/hieradata/role/common/mediawiki/imagescaler.yaml +++ b/hieradata/role/common/mediawiki/imagescaler.yaml @@ -13,4 +13,3 @@ light_process_count: "10" apache::mpm::mpm: worker role::mediawiki::webserver::tls: true -standard::has_ganglia: false diff --git a/hieradata/role/common/mediawiki/jobrunner.yaml b/hieradata/role/common/mediawiki/jobrunner.yaml index 10b4bf2..aae358f 100644 --- a/hieradata/role/common/mediawiki/jobrunner.yaml +++ b/hieradata/role/common/mediawiki/jobrunner.yaml @@ -20,4 +20,3 @@ role::lvs::realserver::pools: hhvm: lvs_name: jobrunner -standard::has_ganglia: false diff --git a/hieradata/role/common/mediawiki/memcached.yaml b/hieradata/role/common/mediawiki/memcached.yaml index c695d12..70221d7 100644 --- a/hieradata/role/common/mediawiki/memcached.yaml +++ b/hieradata/role/common/mediawiki/memcached.yaml @@ -22,4 +22,3 @@ profile::memcached::extended_options: - 'slab_reassign' profile::memcached::port: '11211' -standard::has_ganglia: false diff --git a/hieradata/role/common/mediawiki/videoscaler.yaml b/hieradata/role/common/mediawiki/videoscaler.yaml index 253052e..5fbe653 100644 --- a/hieradata/role/common/mediawiki/videoscaler.yaml +++ b/hieradata/role/common/mediawiki/videoscaler.yaml @@ -12,4 +12,3 @@ connection_timeout_seconds: 86400 thread_count: 15 max_execution_time: 86400 -standard::has_ganglia: false diff --git a/modules/apache/files/apache_status.py b/modules/apache/files/apache_status.py new file mode 100755 index 0000000..15c0f2b --- /dev/null +++ b/modules/apache/files/apache_status.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import time +import urllib2 +import traceback +import re +import copy + +# global to store state for "total accesses" +METRICS = { + 'time': 0, + 'data': {} +} + +LAST_METRICS = copy.deepcopy(METRICS) +METRICS_CACHE_MAX = 5 + +# Metric prefix +NAME_PREFIX = "ap_" +SSL_NAME_PREFIX = "apssl_" + +SERVER_STATUS_URL = "" + +descriptors = list() +Desc_Skel = {} +Scoreboard = { + NAME_PREFIX + 'waiting': { + 'key': '_', + 'desc': 'Waiting for Connection', + }, + NAME_PREFIX + 'starting': { + 'key': 'S', + 'desc': 'Starting up', + }, + NAME_PREFIX + 'reading_request': { + 'key': 'R', + 'desc': 'Reading Request', + }, + NAME_PREFIX + 'sending_reply': { + 'key': 'W', + 'desc': 'Sending Reply', + }, + NAME_PREFIX + 'keepalive': { + 'key': 'K', + 'desc': 'Keepalive (read)', + }, + NAME_PREFIX + 'dns_lookup': { + 'key': 'D', + 'desc': 'DNS Lookup', + }, + NAME_PREFIX + 'closing': { + 'key': 'C', + 'desc': 'Closing connection', + }, + NAME_PREFIX + 'logging': { + 'key': 'L', + 'desc': 'Logging', + }, + NAME_PREFIX + 'gracefully_fin': { + 'key': 'G', + 'desc': 'Gracefully finishing', + }, + NAME_PREFIX + 'idle': { + 'key': 'I', + 'desc': 'Idle cleanup of worker', + }, + NAME_PREFIX + 'open_slot': { + 'key': '.', + 'desc': 'Open slot with no current process', + }, +} +Scoreboard_bykey = dict([(v["key"], k) for (k, v) in Scoreboard.iteritems()]) + +SSL_REGEX = re.compile( + '^(cache type:) (.*)(<b>)(?P<shared_mem>[0-9]+)(</b> bytes, current sessio' + 'ns: <b>)(?P<current_sessions>[0-9]+)(</b><br>subcaches: <b>)(?P<num_subca' + 'ches>[0-9]+)(</b>, indexes per subcache: <b>)(?P<indexes_per_subcache>[0-' + '9]+)(</b><br>)(.*)(<br>index usage: <b>)(?P<index_usage>[0-9]+)(%</b>, ca' + 'che usage: <b>)(?P<cache_usage>[0-9]+)(%</b><br>total sessions stored sin' + 'ce starting: <b>)(?P<sessions_stored>[0-9]+)(</b><br>total sessions expir' + 'ed since starting: <b>)(?P<sessions_expired>[0-9]+)(</b><br>total \(pre-e' + 'xpiry\) sessions scrolled out of the cache: <b>)(?P<sessions_scrolled_out' + 'of_cache>[0-9]+)(</b><br>total retrieves since starting: <b>)(?P<retrieve' + 's_hit>[0-9]+)(</b> hit, <b>)(?P<retrieves_miss>[0-9]+)(</b> miss<br>total' + ' removes since starting: <b>)(?P<removes_hit>[0-9]+)(</b> hit, <b>)(?P<re' + 'moves_miss>[0-9]+)' +) + +Metric_Map = { + 'Uptime': NAME_PREFIX + "uptime", + 'IdleWorkers': NAME_PREFIX + "idle_workers", + 'BusyWorkers': NAME_PREFIX + "busy_workers", + 'Total kBytes': NAME_PREFIX + "bytes", + 'CPULoad': NAME_PREFIX + "cpuload", + "Total Accesses": NAME_PREFIX + "rps" +} + + +def get_metrics(): + + global METRICS, LAST_METRICS, SERVER_STATUS_URL, COLLECT_SSL + + if (time.time() - METRICS['time']) > METRICS_CACHE_MAX: + + metrics = dict([(k, 0) for k in Scoreboard.keys()]) + + # This is the short server-status. Lacks SSL metrics + try: + req = urllib2.Request(SERVER_STATUS_URL + "?auto") + + # Download the status file + res = urllib2.urlopen(req) + + for line in res: + split_line = line.rstrip().split(": ") + long_metric_name = split_line[0] + if long_metric_name == "Scoreboard": + for sck in split_line[1]: + metrics[Scoreboard_bykey[sck]] += 1 + else: + if long_metric_name in Metric_Map: + metric_name = Metric_Map[long_metric_name] + else: + metric_name = long_metric_name + metrics[metric_name] = split_line[1] + + except urllib2.URLError: + traceback.print_exc() + + # If we are collecting SSL metrics we'll do + if COLLECT_SSL: + + try: + req2 = urllib2.Request(SERVER_STATUS_URL) + + # Download the status file + res = urllib2.urlopen(req2) + + for line in res: + regMatch = SSL_REGEX.match(line) + if regMatch: + linebits = regMatch.groupdict() + for key in linebits: + # print SSL_NAME_PREFIX + key + "=" + linebits[key] + metrics[SSL_NAME_PREFIX + key] = linebits[key] + + except urllib2.URLError: + traceback.print_exc() + + LAST_METRICS = copy.deepcopy(METRICS) + METRICS = { + 'time': time.time(), + 'data': metrics + } + + return [METRICS, LAST_METRICS] + + +def get_value(name): + """Return a value for the requested metric""" + + metrics = get_metrics()[0] + + try: + result = metrics['data'][name] + except StandardError: + result = 0 + + return result + + +def get_delta(name): + """Return change over time for the requested metric""" + + # get metrics + [curr_metrics, last_metrics] = get_metrics() + + # If it's ap_bytes metric multiply result by 1024 + if name == NAME_PREFIX + "bytes": + multiplier = 1024 + else: + multiplier = 1 + + try: + delta = ( + multiplier * ( + float(curr_metrics['data'][name]) - + float(last_metrics['data'][name]) + ) / (curr_metrics['time'] - last_metrics['time']) + ) + if delta < 0: + print name + " is less 0" + delta = 0 + except KeyError: + delta = 0.0 + + return delta + + +def create_desc(prop): + d = Desc_Skel.copy() + for k, v in prop.iteritems(): + d[k] = v + return d + + +def metric_init(params): + global descriptors, Desc_Skel, SERVER_STATUS_URL, COLLECT_SSL + + print '[apache_status] Received the following parameters' + print params + + if "metric_group" not in params: + params["metric_group"] = "apache" + + Desc_Skel = { + 'name': 'XXX', + 'call_back': get_value, + 'time_max': 60, + 'value_type': 'uint', + 'units': 'proc', + 'slope': 'both', + 'format': '%d', + 'description': 'XXX', + 'groups': params["metric_group"], + } + + if "refresh_rate" not in params: + params["refresh_rate"] = 15 + + if "url" not in params: + params["url"] = "http://localhost:7070/server-status" + + if "collect_ssl" not in params: + params["collect_ssl"] = False + + SERVER_STATUS_URL = params["url"] + COLLECT_SSL = params["collect_ssl"] + + # IP:HOSTNAME + if "spoof_host" in params: + Desc_Skel["spoof_host"] = params["spoof_host"] + + descriptors.append(create_desc({ + "name": NAME_PREFIX + "rps", + "value_type": "float", + "units": "req/sec", + "call_back": get_delta, + "format": "%.3f", + "description": "request per second", + })) + + descriptors.append(create_desc({ + "name": NAME_PREFIX + "bytes", + "value_type": "float", + "units": "bytes/sec", + "call_back": get_delta, + "format": "%.3f", + "description": "bytes transferred per second", + })) + + descriptors.append(create_desc({ + "name": NAME_PREFIX + "cpuload", + "value_type": "float", + "units": "pct", + "format": "%.6f", + "call_back": get_value, + "description": "Pct of time CPU utilized", + })) + + descriptors.append(create_desc({ + "name": NAME_PREFIX + "busy_workers", + "value_type": "uint", + "units": "threads", + "format": "%u", + "call_back": get_value, + "description": "Busy threads", + })) + + descriptors.append(create_desc({ + "name": NAME_PREFIX + "idle_workers", + "value_type": "uint", + "units": "threads", + "format": "%u", + "call_back": get_value, + "description": "Idle threads", + })) + + descriptors.append(create_desc({ + "name": NAME_PREFIX + "uptime", + "value_type": "uint", + "units": "seconds", + "format": "%u", + "call_back": get_value, + "description": "Uptime", + })) + + for k, v in Scoreboard.iteritems(): + descriptors.append(create_desc({ + "name": k, + "call_back": get_value, + "description": v["desc"], + })) + + ########################################################################## + # SSL metrics + ########################################################################## + if params['collect_ssl']: + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "shared_mem", + "value_type": "float", + "units": "bytes", + "format": "%.3f", + "call_back": get_value, + "description": "Shared memory", + })) + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "current_sessions", + "value_type": "uint", + "units": "sessions", + "format": "%u", + "call_back": get_value, + "description": "Current sessions", + })) + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "num_subcaches", + "value_type": "uint", + "units": "subcaches", + "format": "%u", + "call_back": get_value, + "description": "Number of subcaches", + })) + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "indexes_per_subcache", + "value_type": "float", + "units": "indexes", + "format": "%.3f", + "call_back": get_value, + "description": "Subcaches", + })) + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "index_usage", + "value_type": "float", + "units": "pct", + "format": "%.3f", + "call_back": get_value, + "description": "Index usage", + })) + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "cache_usage", + "value_type": "float", + "units": "pct", + "format": "%.3f", + "call_back": get_value, + "description": "Cache usage", + })) + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "sessions_stored", + "value_type": "float", + "units": "sessions/sec", + "format": "%.3f", + "call_back": get_delta, + "description": "Sessions stored", + })) + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "sessions_expired", + "value_type": "float", + "units": "sessions/sec", + "format": "%.3f", + "call_back": get_delta, + "description": "Sessions expired", + })) + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "retrieves_hit", + "value_type": "float", + "units": "retrieves/sec", + "format": "%.3f", + "call_back": get_delta, + "description": "Retrieves Hit", + })) + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "retrieves_miss", + "value_type": "float", + "units": "retrieves/sec", + "format": "%.3f", + "call_back": get_delta, + "description": "Retrieves Miss", + })) + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "removes_hit", + "value_type": "float", + "units": "removes/sec", + "format": "%.3f", + "call_back": get_delta, + "description": "Removes Hit", + })) + + descriptors.append(create_desc({ + "name": SSL_NAME_PREFIX + "removes_miss", + "value_type": "float", + "units": "removes/sec", + "format": "%.3f", + "call_back": get_delta, + "description": "Removes Miss", + })) + + return descriptors + + +if __name__ == '__main__': + try: + params = { + 'url': 'http://localhost:7070/server-status', + 'collect_ssl': False + } + metric_init(params) + while True: + for d in descriptors: + v = d['call_back'](d['name']) + if d['name'] == NAME_PREFIX + "rps": + print 'value for %s is %.4f' % (d['name'], v) + else: + print 'value for %s is %s' % (d['name'], v) + time.sleep(15) + except KeyboardInterrupt: + os._exit(1) diff --git a/modules/apache/files/apache_status.pyconf b/modules/apache/files/apache_status.pyconf new file mode 100644 index 0000000..ca5fecd --- /dev/null +++ b/modules/apache/files/apache_status.pyconf @@ -0,0 +1,113 @@ +modules { + module { + name = "apache_status" + language = "python" + param url { + value = "http://127.0.0.1:80/server-status" + } + + # Which metric group should these metrics be put into + param metric_group { + value = "apache" + } + + # Collecting SSL metrics under Apache 2.2 appears to cause a memory leak + # in mod_status. Watch Apache memory utilization if you enable them + param collect_ssl { + value = False + } + + + } +} + +collection_group { + collect_every = 30 + time_threshold = 90 + + metric { + name = "ap_busy_workers" + title = "Busy Threads" + value_threshold = 0 + } + metric { + name = "ap_idle_workers" + title = "Idle Threads" + value_threshold = 0 + } + metric { + name = "ap_logging" + title = "Logging" + value_threshold = 0 + } + metric { + name = "ap_open_slot" + title = "Open slot with no current process" + value_threshold = 0 + } + metric { + name = "ap_reading_request" + title = "Reading Request" + value_threshold = 0 + } + metric { + name = "ap_waiting" + title = "Waiting for Connection" + value_threshold = 0 + } + metric { + name = "ap_sending_reply" + title = "Sending Reply" + value_threshold = 0 + } + metric { + name = "ap_idle" + title = "Idle cleanup of worker" + value_threshold = 0 + } + metric { + name = "ap_dns_lookup" + title = "DNS Lookup" + value_threshold = 0 + } + metric { + name = "ap_closing" + title = "Closing connection" + value_threshold = 0 + } + metric { + name = "ap_starting" + title = "Starting up" + value_threshold = 0 + } + metric { + name = "ap_gracefully_fin" + title = "Gracefully finishing" + value_threshold = 0 + } + metric { + name = "ap_keepalive" + title = "Keepalive (read)" + value_threshold = 0 + } + + metric { + name = "ap_rps" + title = "Requests per second" + value_threshold = 0.0 + } + + metric { + name = "ap_cpuload" + title = "Pct of time CPU utilized" + value_threshold = 0.0 + } + +# Uncomment if you are collecting SSL metrics +# metric { +# name_match = "apssl_(.+)" +# value_threshold = 0.0 +# } + + +} diff --git a/modules/apache/manifests/monitoring.pp b/modules/apache/manifests/monitoring.pp index cfdb447..aebe2f3 100644 --- a/modules/apache/manifests/monitoring.pp +++ b/modules/apache/manifests/monitoring.pp @@ -9,6 +9,27 @@ include ::apache::mod::status include ::standard + if $::standard::has_ganglia { + include ::ganglia + + file { '/usr/lib/ganglia/python_modules/apache_status.py': + source => 'puppet:///modules/apache/apache_status.py', + owner => 'root', + group => 'root', + mode => '0444', + require => Package['ganglia-monitor'], + } + + file { '/etc/ganglia/conf.d/apache_status.pyconf': + source => 'puppet:///modules/apache/apache_status.pyconf', + owner => 'root', + group => 'root', + mode => '0444', + require => File['/usr/lib/ganglia/python_modules/apache_status.py'], + notify => Service['ganglia-monitor'], + } + } + # Use `links -dump http://127.0.0.1/server-status` to generate # an Apache status report. require_package('links') -- To view, visit https://gerrit.wikimedia.org/r/385484 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I41bcb2fde4474ffa0d5ab9658b573e8725507428 Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Dzahn <dz...@wikimedia.org> Gerrit-Reviewer: Filippo Giunchedi <fgiunch...@wikimedia.org> Gerrit-Reviewer: Giuseppe Lavagetto <glavage...@wikimedia.org> Gerrit-Reviewer: Volans <rcocci...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits