Alexandros Kosiaris has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/310772

Change subject: check_postgres_replication_lag.py: Rewrite parts of it
......................................................................

check_postgres_replication_lag.py: Rewrite parts of it

Add the following changes:
 * Check that the DB is indeed in recovery mode
 * Check that the master actually reports the slave as active.
 * Pass the arguments required for the above to work, as this requires
   the script to connect to the master
 * Flake8 compatible

Change-Id: I723bce6af13245785055f5a76d0713097c4e258e
---
M modules/postgresql/files/check_postgres_replication_lag.py
M tox.ini
2 files changed, 76 insertions(+), 21 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/72/310772/1

diff --git a/modules/postgresql/files/check_postgres_replication_lag.py b/modules/postgresql/files/check_postgres_replication_lag.py
index 0b38b29..ab1737b 100644
--- a/modules/postgresql/files/check_postgres_replication_lag.py
+++ b/modules/postgresql/files/check_postgres_replication_lag.py
@@ -1,11 +1,12 @@
 #!/usr/bin/env python
 # written in 2.6.6 on CentOS 6. All other versions untested.
+# WMF edited and maintained by Alexandros Kosiaris. Tested on Debian Jessie
 #Header Info
 __author__= 'Kirk Hammond'
 __email__ = 'kirkdhamm...@gmail.com'
 __version__ = '1.0'
 __license__ = "GPLv3"
-__maintainer__ = "Kirk Hammond"
+__maintainer__ = "Alexandros Kosiaris"
 __status__ = "Production"
 __credits__ = "Kirk Hammond"
 
@@ -19,7 +20,13 @@
 from optparse import OptionParser, OptionGroup
 import psycopg2
 import sys
+import socket
 
+#nagios return codes
+UNKNOWN = -1
+OK = 0
+WARNING = 1
+CRITICAL = 2
 
 # parse command arguemnts and return options
 def parse_args():
@@ -28,6 +35,8 @@
     parser.version = __version__
     parser.add_option("-H", "--host", dest="hostname", default="127.0.0.1",
                       help="Name of the host you are checking")
+    parser.add_option("-m", "--master", dest="master",
+                      help="Name of the master of the host you are checking")
     parser.add_option("-O", "--port", dest="port", default="5432",
                        help="Port you will connect to the database with")
     parser.add_option("-U", "--user", dest="username", default="postgres",
@@ -35,39 +44,77 @@
     parser.add_option("-P", "--password", dest="password",
                        help="Password the database")
     parser.add_option("-D", "--database", dest="database",
-                       help="Datbase you are checking") 
+                       help="Database you are checking")
     parser.add_option("-W", "--warn", dest="warn", default="300",
-                       help="Warning alert delay in seconds") 
+                       help="Warning alert delay in seconds")
     parser.add_option("-C", "--crit", dest="crit", default="1800",
-                       help="Critical alert delay in seconds") 
+                       help="Critical alert delay in seconds")
     (options, args) = parser.parse_args()
+    if not options.master:
+        parser.error('master not given')
+    if not options.password:
+        parser.error('password not given')
+    if not options.database:
+        parser.error('database not given')
     return options
 
 
-# check delay using options from parse_args
-def check_delay(options):
+# execute SQL query using options from parse_args
+# This function creates and closes connections to clear up after itself. This is
+# not the most efficient thing to do but for a monitoring app it is probably fine
+def execute_query(query, options, on_master=False):
     username = str(options.username)
     password = str(options.password)
     port = str(options.port)
-    hostname = str(options.hostname)
+    if on_master:
+        hostname = str(options.master)
+    else:
+        hostname = str(options.hostname)
     database = str(options.database)
     conn_string = "host=" + hostname + " dbname=" + database + " user=" + username + " password=" + password
     conn = psycopg2.connect(conn_string)
     cursor = conn.cursor()
-    cursor.execute('SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS log_delay;')
-    delay = cursor.fetchall()
+    cursor.execute(query)
+    result = cursor.fetchall()
+    cursor.close()
+    conn.close()
+    return result
+
+# check we are in recovery
+def check_recovery(options):
+    query = 'SELECT pg_is_in_recovery();'
+    recovery = execute_query(query, options)
+    recovery = recovery.pop()
+    recovery = recovery[0]
+    return recovery
+
+
+def check_master_active(options):
+    query = 'SELECT * from pg_stat_replication;'
+    slaves = execute_query(query, options, True)
+    if len(slaves) == 0:
+        return False
+    else:
+        # Note: suboptimal and does not cover all corner cases but ok for now
+        my_ip = socket.gethostbyname(socket.gethostname())
+        for slave in slaves:
+            # 4th field (counting from 0) is client_addr
+            if slave[4] == my_ip:
+                return True
+    return False
+
+
+# check delay using options from parse_args
+def check_delay(options):
+    query = 'SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS log_delay;'
+    delay = execute_query(query, options)
     delay = delay.pop()
     delay = delay[0]
     return delay
 
 
 # return results and graphing data to Nagios
-def nagios(delay,options):
-    #nagios return codes
-    UNKNOWN = -1
-    OK = 0
-    WARNING = 1
-    CRITICAL = 2
+def nagios_delay(delay, options):
     warn = float(options.warn)
     crit = float(options.crit)
     #pop delay out of list and get float out of tuple for direct comparison to warn/crit float values
@@ -85,19 +132,27 @@
         sys.exit(UNKNOWN)
 
 
-
-
 # main function, controls flow of script
 def main():
 
     #call parse_arges and return options for script
     options = parse_args()
 
+    # Check first that we are indeed in recovery
+    is_in_recovery = check_recovery(options)
+    if not is_in_recovery:
+        print "CRITICAL: Server is not in recovery"
+        sys.exit(CRITICAL)
+
+    # Then check that we have an active connection to a master
+    master_active = check_master_active(options)
+    if not master_active:
+        print "CRITICAL: Master reports slave not active"
+        sys.exit(CRITICAL)
+
     # execute command using options from parse_args
     delay = check_delay(options)
-
-    #call nagios process
-    nagios(delay,options)
+    nagios_delay(delay, options)
 
 
 # call main function
diff --git a/tox.ini b/tox.ini
index 5cd81db..74b90ce 100644
--- a/tox.ini
+++ b/tox.ini
@@ -10,7 +10,7 @@
 # E402: module level import not at top of file
 ignore = E123,E133,E226,E241,E242,E402
 # Upstream files that don't pass flake8 but should not be locally modified
-exclude = modules/letsencrypt/files/acme_tiny.py,modules/varnish/files/varnishapi.py,modules/postgresql/files/check_postgres_replication_lag.py
+exclude = modules/letsencrypt/files/acme_tiny.py,modules/varnish/files/varnishapi.py
 
 [testenv]
 deps =

-- 
To view, visit https://gerrit.wikimedia.org/r/310772
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I723bce6af13245785055f5a76d0713097c4e258e
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Alexandros Kosiaris <akosia...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to