Alexandros Kosiaris has uploaded a new change for review. https://gerrit.wikimedia.org/r/310772
Change subject: check_postgres_replication_lag.py: Rewrite parts of it ...................................................................... check_postgres_replication_lag.py: Rewrite parts of it Add the following changes: * Check that indeed the DB is in recovery mode * Check that the master actually reports the slave as active. * Pass the required arguments for the above to work as this requires for the script to connect to master * Flake8 compatible Change-Id: I723bce6af13245785055f5a76d0713097c4e258e --- M modules/postgresql/files/check_postgres_replication_lag.py M tox.ini 2 files changed, 76 insertions(+), 21 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/72/310772/1 diff --git a/modules/postgresql/files/check_postgres_replication_lag.py b/modules/postgresql/files/check_postgres_replication_lag.py index 0b38b29..ab1737b 100644 --- a/modules/postgresql/files/check_postgres_replication_lag.py +++ b/modules/postgresql/files/check_postgres_replication_lag.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # written in 2.6.6 on CentOS 6. All other versions untested. +# WMF edited and maintained by Alexandros Kosiaris. 
Tested on Debian Jessie #Header Info __author__= 'Kirk Hammond' __email__ = 'kirkdhamm...@gmail.com' __version__ = '1.0' __license__ = "GPLv3" -__maintainer__ = "Kirk Hammond" +__maintainer__ = "Alexandros Kosiaris" __status__ = "Production" __credits__ = "Kirk Hammond" @@ -19,7 +20,13 @@ from optparse import OptionParser, OptionGroup import psycopg2 import sys +import socket +#nagios return codes +UNKNOWN = -1 +OK = 0 +WARNING = 1 +CRITICAL = 2 # parse command arguemnts and return options def parse_args(): @@ -28,6 +35,8 @@ parser.version = __version__ parser.add_option("-H", "--host", dest="hostname", default="127.0.0.1", help="Name of the host you are checking") + parser.add_option("-m", "--master", dest="master", + help="Name of the master of the host you are checking") parser.add_option("-O", "--port", dest="port", default="5432", help="Port you will connect to the database with") parser.add_option("-U", "--user", dest="username", default="postgres", @@ -35,39 +44,77 @@ parser.add_option("-P", "--password", dest="password", help="Password the database") parser.add_option("-D", "--database", dest="database", - help="Datbase you are checking") + help="Database you are checking") parser.add_option("-W", "--warn", dest="warn", default="300", - help="Warning alert delay in seconds") + help="Warning alert delay in seconds") parser.add_option("-C", "--crit", dest="crit", default="1800", - help="Critical alert delay in seconds") + help="Critical alert delay in seconds") (options, args) = parser.parse_args() + if not options.master: + parser.error('master not given') + if not options.password: + parser.error('password not given') + if not options.database: + parser.error('database not given') return options -# check delay using options from parse_args -def check_delay(options): +# execute SQL query using options from parse_args +# This function creates and closes connections to clear up after itself. 
This is +# not the most efficient thing to do but for a monitoring app it is probably fine +def execute_query(query, options, on_master=False): username = str(options.username) password = str(options.password) port = str(options.port) - hostname = str(options.hostname) + if on_master: + hostname = str(options.master) + else: + hostname = str(options.hostname) database = str(options.database) conn_string = "host=" + hostname + " dbname=" + database + " user=" + username + " password=" + password conn = psycopg2.connect(conn_string) cursor = conn.cursor() - cursor.execute('SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS log_delay;') - delay = cursor.fetchall() + cursor.execute(query) + result = cursor.fetchall() + cursor.close() + conn.close() + return result + +# check we are in recovery +def check_recovery(options): + query = 'SELECT pg_is_in_recovery();' + recovery = execute_query(query, options) + recovery = recovery.pop() + recovery = recovery[0] + return recovery + + +def check_master_active(options): + query = 'SELECT * from pg_stat_replication;' + slaves = execute_query(query, options, True) + if len(slaves) == 0: + return False + else: + # Note: suboptimal and does not cover all corner cases but ok for now + my_ip = socket.gethostbyname(socket.gethostname()) + for slave in slaves: + # 4th field (counting from 0) is client_addr + if slave[4] == my_ip: + return True + return False + + +# check delay using options from parse_args +def check_delay(options): + query = 'SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS log_delay;' + delay = execute_query(query, options) delay = delay.pop() delay = delay[0] return delay # return results and graphing data to Nagios -def nagios(delay,options): - #nagios return codes - UNKNOWN = -1 - OK = 0 - 
WARNING = 1 - CRITICAL = 2 +def nagios_delay(delay, options): warn = float(options.warn) crit = float(options.crit) #pop delay out of list and get float out of tuple for direct comparison to warn/crit float values @@ -85,19 +132,27 @@ sys.exit(UNKNOWN) - - # main function, controls flow of script def main(): #call parse_arges and return options for script options = parse_args() + # Check first that we are indeed in recovery + is_in_recovery = check_recovery(options) + if not is_in_recovery: + print "CRITICAL: Server is not in recovery" + sys.exit(CRITICAL) + + # Then check that we have an active connection to a master + master_active = check_master_active(options) + if not master_active: + print "CRITICAL: Master reports slave not active" + sys.exit(CRITICAL) + # execute command using options from parse_args delay = check_delay(options) - - #call nagios process - nagios(delay,options) + nagios_delay(delay, options) # call main function diff --git a/tox.ini b/tox.ini index 5cd81db..74b90ce 100644 --- a/tox.ini +++ b/tox.ini @@ -10,7 +10,7 @@ # E402: module level import not at top of file ignore = E123,E133,E226,E241,E242,E402 # Upstream files that don't pass flake8 but should not be locally modified -exclude = modules/letsencrypt/files/acme_tiny.py,modules/varnish/files/varnishapi.py,modules/postgresql/files/check_postgres_replication_lag.py +exclude = modules/letsencrypt/files/acme_tiny.py,modules/varnish/files/varnishapi.py [testenv] deps = -- To view, visit https://gerrit.wikimedia.org/r/310772 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I723bce6af13245785055f5a76d0713097c4e258e Gerrit-PatchSet: 1 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: Alexandros Kosiaris <akosia...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits