Ori.livneh has uploaded a new change for review.
https://gerrit.wikimedia.org/r/74837
Change subject: Report MediaWiki exceptions & fatals to Ganglia
......................................................................
Report MediaWiki exceptions & fatals to Ganglia
This patch re-introduces the Python Ganglia module for MediaWiki exceptions and
fatals which I removed in If035a0ef2 so I could clean up the EventLogging
module. I've modified it to consume the UDP error stream directly rather than
via ZeroMQ forwarding since it was adding an unnecessary layer of complexity to
a piece of code that should be as straightforward as possible. Other than the
slight modification to consume a UDP stream rather than a ZeroMQ stream, the
code is substantially the same as it was when Tim reviewed it.
Change-Id: I874ec534062c2086f2c1ac00bdb9cb4ffbeaf846
---
M manifests/role/logging.pp
M manifests/site.pp
A modules/mediawiki/files/mwerrors.py
A modules/mediawiki/manifests/monitor/errors.pp
A modules/mediawiki/templates/mwerrors.pyconf.erb
5 files changed, 232 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/37/74837/1
diff --git a/manifests/role/logging.pp b/manifests/role/logging.pp
index eee5943..c088be7 100644
--- a/manifests/role/logging.pp
+++ b/manifests/role/logging.pp
@@ -78,6 +78,22 @@
}
}
+# == Class role::logging::mediawiki::errors
+# fluorine's udp2log instance forwards MediaWiki exceptions and fatals
+# to vanadium, as configured in templates/udp2log/filters.mw.erb. This
+# role provisions a metric module that reports error counts to Ganglia.
+# Configures a Ganglia metric module that listens on a UDP port for
+# MediaWiki fatal and exception log messages and reports them to Ganglia.
+#
+class role::logging::mediawiki::errors {
+ system_role { 'role::logging::mediawiki::errors':
+ description => 'Report MediaWiki exceptions and fatals to Ganglia',
+ }
+
+ class { 'mediawiki::monitor::errors':
+ port => 8423,
+ }
+}
# == Class role::logging::relay::webrequest-multicast
# Sets up a multicast relay using socat for
diff --git a/manifests/site.pp b/manifests/site.pp
index 3ee41d0..8ca9ae1 100644
--- a/manifests/site.pp
+++ b/manifests/site.pp
@@ -2800,6 +2800,7 @@
include standard,
role::eventlogging,
role::ipython_notebook,
+ role::logging::mediawiki::errors,
nrpe,
role::solr::ttm
diff --git a/modules/mediawiki/files/mwerrors.py
b/modules/mediawiki/files/mwerrors.py
new file mode 100644
index 0000000..232af82
--- /dev/null
+++ b/modules/mediawiki/files/mwerrors.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+ mwerrors.py
+ ~~~~~~~~~~~
+
+ This is a Gmond metric-gathering module; it listens on a UDP port for
+ MediaWiki fatals and exceptions and reports them to Ganglia.
+
+ To use this module, copy this file to /usr/lib/ganglia/python_modules.
+ Then edit mwerrors.pyconf and ensure the UDP port is correct. Finally,
+ copy mwerrors.pyconf to /etc/ganglia/conf.d and run 'service
+ ganglia-monitor restart'.
+
+ You can test this module by invoking it from the command line with the
+ UDP port number as an argument. When invoked from the command line,
+ this script will emit a count of errors every ten seconds.
+
+ Copyright (C) 2013, Ori Livneh <[email protected]>
+ Licensed under the terms of the GNU General Public License, version 2
+ or later.
+
+"""
+import sys
+reload(sys)
+sys.setdefaultencoding('utf8')
+
+import io
+import socket
+import threading
+import time
+
+
+BLOCK_SIZE = 65536 # Matches Udp2LogConfig::BLOCK_SIZE
+
+patterns = (
+ # Substring to match # Metric # Metric title
+ ('Fatal error: Out of memory', 'oom', 'OOM fatals'),
+ ('Fatal error: Maximum execution', 'timelimit', 'Time limit fatals'),
+ ('Fatal error:', 'fatal', 'Misc fatals'),
+ ('Exception from', 'exception', 'Exceptions'),
+ ('Catchable fatal error', 'catchable', 'Catchable fatals'),
+ ('DatabaseBase->reportQueryError', 'query', 'Query errors'),
+)
+
+
+def count_errors(counter, port):
+ """Count error types in error stream."""
+ sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+ sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ sock.bind(('0.0.0.0', port))
+ with io.open(sock.fileno(), buffering=BLOCK_SIZE, encoding='utf8',
+ errors='replace') as f:
+ for line in f:
+ for pattern, name, description in patterns:
+ if pattern in line:
+ counter[name] += 1
+ break
+
+
+def metric_init(params):
+ """
+ Initialize; part of Gmond interface
+
+ `params` is a dictionary of configuration options, generated by
+ Ganglia out of values specified in the module's .pyconf file. It
+ should contain a 'port' key, specifying the UDP port to listen on.
+
+ param port {
+ value = "8423"
+ }
+
+ """
+ port = int(params['port'])
+ counter = {name: 0 for pattern, name, description in patterns}
+
+ thread = threading.Thread(target=count_errors, args=(counter, port))
+ thread.daemon = True
+ thread.start()
+
+ time.sleep(2)
+
+ return [{
+ 'name': name,
+ 'value_type': 'uint',
+ 'format': '%d',
+ 'units': 'errors',
+ 'slope': 'positive',
+ 'time_max': 15,
+ 'description': description,
+ 'groups': 'mediawiki',
+ 'call_back': counter.get,
+ } for pattern, name, description in patterns]
+
+
+def metric_cleanup():
+ """Teardown; part of Gmond interface"""
+ pass
+
+
+if __name__ == '__main__':
+ # Self-test: report metrics to stdout every 10 seconds.
+ import sys
+
+ if len(sys.argv) != 2:
+ sys.exit('Usage: %s PORT' % __file__)
+
+ params = {'port': sys.argv[1]}
+ metrics = metric_init(params)
+
+ print('Listening on %(port)s...' % params)
+
+ while 1:
+ print('\n{:-^32}'.format(time.asctime()))
+ for metric in metrics:
+ call_back = metric['call_back']
+ name = metric['name']
+ description = metric['description']
+ print('{:.<30}{}'.format(description, call_back(name)))
+ time.sleep(10)
+
+# vim: set et ft=python ts=4 sw=4:
diff --git a/modules/mediawiki/manifests/monitor/errors.pp
b/modules/mediawiki/manifests/monitor/errors.pp
new file mode 100644
index 0000000..658e414
--- /dev/null
+++ b/modules/mediawiki/manifests/monitor/errors.pp
@@ -0,0 +1,39 @@
+# == Class: mediawiki::monitor::errors
+#
+# Configures a metric module that listens on a UDP port for MediaWiki
+# fatal and exception log messages and reports them to Ganglia.
+#
+# === Parameters
+#
+# [*port*]
+# UDP port on which metric module should listen (default: 8423).
+#
+# [*ensure*]
+# If 'present' (the default), provisions the metric module. If
+# 'absent', removes the module source and configuration files.
+#
+# === Examples
+#
+# class { 'mediawiki::monitor::errors':
+# ensure => present,
+# port => 9400,
+# }
+#
+class mediawiki::monitor::errors(
+ $ensure = present,
+ $port = 8423,
+) {
+ # Metric module.
+ file { '/usr/lib/ganglia/python_modules/mwerrors.py':
+ ensure => $ensure,
+ source => 'puppet:///modules/mediawiki/mwerrors.py',
+ before => File['/etc/ganglia/conf.d/mwerrors.pyconf'],
+ }
+
+ # Metric definitions.
+ file { '/etc/ganglia/conf.d/mwerrors.pyconf':
+ ensure => $ensure,
+ content => template('mediawiki/mwerrors.pyconf.erb'),
+ notify => Service['gmond'],
+ }
+}
diff --git a/modules/mediawiki/templates/mwerrors.pyconf.erb
b/modules/mediawiki/templates/mwerrors.pyconf.erb
new file mode 100644
index 0000000..2838c5f
--- /dev/null
+++ b/modules/mediawiki/templates/mwerrors.pyconf.erb
@@ -0,0 +1,54 @@
+# Ganglia metric definitions for MediaWiki exceptions and fatals.
+# This file is managed by Puppet.
+
+modules {
+ module {
+ name = "mwerrors"
+ language = "python"
+ param port {
+ value = "<%= @port %>"
+ }
+ }
+}
+
+
+collection_group {
+ collect_every = 15
+ time_threshold = 30
+
+ metric {
+ name = "oom"
+ title = "OOM fatals"
+ value_threshold = 1
+ }
+
+ metric {
+ name = "timelimit"
+ title = "Time limit fatals"
+ value_threshold = 1
+ }
+
+ metric {
+ name = "fatal"
+ title = "Misc fatals"
+ value_threshold = 1
+ }
+
+ metric {
+ name = "exception"
+ title = "Exceptions"
+ value_threshold = 1
+ }
+
+ metric {
+ name = "catchable"
+ title = "Catchable fatals"
+ value_threshold = 1
+ }
+
+ metric {
+ name = "query"
+ title = "Query errors"
+ value_threshold = 1
+ }
+}
--
To view, visit https://gerrit.wikimedia.org/r/74837
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I874ec534062c2086f2c1ac00bdb9cb4ffbeaf846
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Ori.livneh <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits