Alexandros Kosiaris has submitted this change and it was merged.

Change subject: check_postgres_replication_lag.py: Rewrite parts of it
......................................................................


check_postgres_replication_lag.py: Rewrite parts of it

Add the following changes:
 * Check that the DB is indeed in recovery mode
 * Check that the master actually reports the slave as active
 * Pass the required arguments for the above to work, as this requires
   the script to connect to the master
 * Fix the calling classes to pass the correct master argument (an example
   invocation is sketched below)
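
As a rough illustration (the master host name and password below are
hypothetical, not taken from this change), the NRPE command built by the
manifests now looks like:

  /usr/lib/nagios/plugins/check_postgres_replication_lag.py \
    -U replication -P some_password -m maps-master.example.org \
    -D template1 -C 1800 -W 300

That is, the script still connects to the local slave to check
pg_is_in_recovery() and the replay delay, and additionally connects to the
host given with -m to verify that the slave shows up in pg_stat_replication
on the master.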

Change-Id: I723bce6af13245785055f5a76d0713097c4e258e
---
M modules/postgresql/files/check_postgres_replication_lag.py
M modules/role/manifests/maps/slave.pp
M modules/role/manifests/puppetmaster/puppetdb.pp
3 files changed, 78 insertions(+), 22 deletions(-)

Approvals:
  Alexandros Kosiaris: Verified; Looks good to me, approved



diff --git a/modules/postgresql/files/check_postgres_replication_lag.py b/modules/postgresql/files/check_postgres_replication_lag.py
index 0b38b29..ab1737b 100644
--- a/modules/postgresql/files/check_postgres_replication_lag.py
+++ b/modules/postgresql/files/check_postgres_replication_lag.py
@@ -1,11 +1,12 @@
 #!/usr/bin/env python
 # written in 2.6.6 on CentOS 6. All other versions untested.
+# WMF edited and maintained by Alexandros Kosiaris. Tested on Debian Jessie
 #Header Info
 __author__= 'Kirk Hammond'
 __email__ = 'kirkdhamm...@gmail.com'
 __version__ = '1.0'
 __license__ = "GPLv3"
-__maintainer__ = "Kirk Hammond"
+__maintainer__ = "Alexandros Kosiaris"
 __status__ = "Production"
 __credits__ = "Kirk Hammond"
 
@@ -19,7 +20,13 @@
 from optparse import OptionParser, OptionGroup
 import psycopg2
 import sys
+import socket
 
+#nagios return codes
+UNKNOWN = -1
+OK = 0
+WARNING = 1
+CRITICAL = 2
 
 # parse command arguments and return options
 def parse_args():
@@ -28,6 +35,8 @@
     parser.version = __version__
     parser.add_option("-H", "--host", dest="hostname", default="127.0.0.1",
                       help="Name of the host you are checking")
+    parser.add_option("-m", "--master", dest="master",
+                      help="Name of the master of the host you are checking")
     parser.add_option("-O", "--port", dest="port", default="5432",
                        help="Port you will connect to the database with")
     parser.add_option("-U", "--user", dest="username", default="postgres",
@@ -35,39 +44,77 @@
     parser.add_option("-P", "--password", dest="password",
                        help="Password the database")
     parser.add_option("-D", "--database", dest="database",
-                       help="Datbase you are checking") 
+                       help="Database you are checking")
     parser.add_option("-W", "--warn", dest="warn", default="300",
-                       help="Warning alert delay in seconds") 
+                       help="Warning alert delay in seconds")
     parser.add_option("-C", "--crit", dest="crit", default="1800",
-                       help="Critical alert delay in seconds") 
+                       help="Critical alert delay in seconds")
     (options, args) = parser.parse_args()
+    if not options.master:
+        parser.error('master not given')
+    if not options.password:
+        parser.error('password not given')
+    if not options.database:
+        parser.error('database not given')
     return options
 
 
-# check delay using options from parse_args
-def check_delay(options):
+# execute SQL query using options from parse_args
+# This function creates and closes connections to clear up after itself. This is
+# not the most efficient thing to do but for a monitoring app it is probably fine
+def execute_query(query, options, on_master=False):
     username = str(options.username)
     password = str(options.password)
     port = str(options.port)
-    hostname = str(options.hostname)
+    if on_master:
+        hostname = str(options.master)
+    else:
+        hostname = str(options.hostname)
     database = str(options.database)
     conn_string = "host=" + hostname + " dbname=" + database + " user=" + 
username + " password=" + password
     conn = psycopg2.connect(conn_string)
     cursor = conn.cursor()
-    cursor.execute('SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS log_delay;')
-    delay = cursor.fetchall()
+    cursor.execute(query)
+    result = cursor.fetchall()
+    cursor.close()
+    conn.close()
+    return result
+
+# check we are in recovery
+def check_recovery(options):
+    query = 'SELECT pg_is_in_recovery();'
+    recovery = execute_query(query, options)
+    recovery = recovery.pop()
+    recovery = recovery[0]
+    return recovery
+
+
+def check_master_active(options):
+    query = 'SELECT * from pg_stat_replication;'
+    slaves = execute_query(query, options, True)
+    if len(slaves) == 0:
+        return False
+    else:
+        # Note: suboptimal and does not cover all corner cases but ok for now
+        my_ip = socket.gethostbyname(socket.gethostname())
+        for slave in slaves:
+            # 4th field (counting from 0) is client_addr
+            if slave[4] == my_ip:
+                return True
+    return False
+
+
+# check delay using options from parse_args
+def check_delay(options):
+    query = 'SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS log_delay;'
+    delay = execute_query(query, options)
     delay = delay.pop()
     delay = delay[0]
     return delay
 
 
 # return results and graphing data to Nagios
-def nagios(delay,options):
-    #nagios return codes
-    UNKNOWN = -1
-    OK = 0
-    WARNING = 1
-    CRITICAL = 2
+def nagios_delay(delay, options):
     warn = float(options.warn)
     crit = float(options.crit)
     #pop delay out of list and get float out of tuple for direct comparison to warn/crit float values
@@ -85,19 +132,27 @@
         sys.exit(UNKNOWN)
 
 
-
-
 # main function, controls flow of script
 def main():
 
     #call parse_arges and return options for script
     options = parse_args()
 
+    # Check first that we are indeed in recovery
+    is_in_recovery = check_recovery(options)
+    if not is_in_recovery:
+        print "CRITICAL: Server is not in recovery"
+        sys.exit(CRITICAL)
+
+    # Then check that we have an active connection to a master
+    master_active = check_master_active(options)
+    if not master_active:
+        print "CRITICAL: Master reports slave not active"
+        sys.exit(CRITICAL)
+
     # execute command using options from parse_args
     delay = check_delay(options)
-
-    #call nagios process
-    nagios(delay,options)
+    nagios_delay(delay, options)
 
 
 # call main function
diff --git a/modules/role/manifests/maps/slave.pp b/modules/role/manifests/maps/slave.pp
index 2897fe0..bdf6dfa 100644
--- a/modules/role/manifests/maps/slave.pp
+++ b/modules/role/manifests/maps/slave.pp
@@ -7,12 +7,13 @@
         ensure      => 'present',
         description => 'Maps Postgres slave',
     }
+    $master = hiera('postgresql::slave::master_server')
 
     $pg_password = hiera('postgresql::slave::replication_pass')
     $critical = 1800
     $warning = 300
     $command = "/usr/lib/nagios/plugins/check_postgres_replication_lag.py \
--U replication -P ${pg_password} -D template1 -C ${critical} -W ${warning}"
+-U replication -P ${pg_password} -m ${master} -D template1 -C ${critical} -W ${warning}"
     nrpe::monitor_service { 'postgres-rep-lag':
         description  => 'Postgres Replication Lag',
         nrpe_command => $command,
diff --git a/modules/role/manifests/puppetmaster/puppetdb.pp b/modules/role/manifests/puppetmaster/puppetdb.pp
index 9fd04fa..26c4ee3 100644
--- a/modules/role/manifests/puppetmaster/puppetdb.pp
+++ b/modules/role/manifests/puppetmaster/puppetdb.pp
@@ -18,7 +18,7 @@
         $critical = 1800
         $warning = 300
         $command = "/usr/lib/nagios/plugins/check_postgres_replication_lag.py \
-    -U replication -P ${pg_password} -D template1 -C ${critical} -W ${warning}"
+    -U replication -P ${pg_password} -m ${master} -D template1 -C ${critical} -W ${warning}"
         nrpe::monitor_service { 'postgres-rep-lag':
             description  => 'Postgres Replication Lag',
             nrpe_command => $command,

-- 
To view, visit https://gerrit.wikimedia.org/r/310772
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I723bce6af13245785055f5a76d0713097c4e258e
Gerrit-PatchSet: 5
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Alexandros Kosiaris <akosia...@wikimedia.org>
Gerrit-Reviewer: Alexandros Kosiaris <akosia...@wikimedia.org>
Gerrit-Reviewer: Volans <rcocci...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>
