Madhuvishy has submitted this change and it was merged.

Change subject: labstore: Add monitoring for secondary HA cluster health
......................................................................


labstore: Add monitoring for secondary HA cluster health

- Check for drbd node health (connection and disk states)
- Check if node conforms to expected drbd role (primary/secondary)

Bug: T144633
Change-Id: Ic8639ae29973f29cc6f17007b76e926ab35a1143
---
A modules/labstore/files/monitor/check_drbd_role
A modules/labstore/files/monitor/check_drbd_status
A modules/labstore/manifests/monitoring/drbd.pp
M modules/role/manifests/labs/nfs/secondary.pp
4 files changed, 167 insertions(+), 0 deletions(-)

Approvals:
  Madhuvishy: Verified; Looks good to me, approved



diff --git a/modules/labstore/files/monitor/check_drbd_role b/modules/labstore/files/monitor/check_drbd_role
new file mode 100644
index 0000000..abea821
--- /dev/null
+++ b/modules/labstore/files/monitor/check_drbd_role
@@ -0,0 +1,38 @@
+#!/usr/bin/python3
+import argparse
+import subprocess
+import sys
+
+
+def check_role(node, expected_role):
+    """
+    Check if every DRBD resource reports the role pair expected for the node
+    :param node: string, hostname used as prefix of the error message
+    :param expected_role: string, 'primary'; any other value => secondary
+    :returns: boolean
+    """
+    drbd_res_roles = str(
+        subprocess.check_output(['drbdadm', 'role', 'all']), 'utf-8')\
+        .rstrip('\n').split('\n')
+    wanted = ('Primary/Secondary' if expected_role == 'primary'
+              else 'Secondary/Primary')
+    role_ok = all(role == wanted for role in drbd_res_roles)
+
+    if not role_ok:
+        print('{}: Unexpected role match, expected role {}'.format(
+            node, expected_role))
+
+    return role_ok
+
+
+def main():
+    """Exit with status 1 if the DRBD role differs from the expected role"""
+    parser = argparse.ArgumentParser(description='Check DRBD node role')
+    parser.add_argument('node', help='Hostname of node being checked')
+    parser.add_argument('role', help='Expected drbd role, primary|secondary')
+    args = parser.parse_args()
+    if not check_role(args.node, args.role):
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
diff --git a/modules/labstore/files/monitor/check_drbd_status b/modules/labstore/files/monitor/check_drbd_status
new file mode 100644
index 0000000..3d7edb0
--- /dev/null
+++ b/modules/labstore/files/monitor/check_drbd_status
@@ -0,0 +1,87 @@
+#!/usr/bin/python3
+import argparse
+import re
+import subprocess
+import sys
+
+
+def parse_drbd_overview():
+    """
+    Parse the output of running drbd-overview and construct a map of status
+    data per resource, e.g.
+    {
+        resource1: {cstate: .., dstate: .., ..},
+        resource2: {cstate: .., dstate: .., ..}
+    }
+    Blank lines in the output are ignored.
+    :returns: dict
+    """
+
+    # Column names, in order, for the fields after the resource identifier
+    headers = ['cstate', 'role', 'dstate', 'mount-point', 'fstype',
+               'size', 'used', 'avail', 'use%']
+
+    # Read contents of drbd-overview and split by new lines
+    drbd_overview_raw = str(subprocess.check_output(
+        ['drbd-overview']), 'utf-8').rstrip('\n').split('\n')
+
+    # Split each line on runs of whitespace; blank lines are dropped here so
+    # that the x[0] lookup below never hits an empty field list
+    drbd_overview_split = [x.split() for x in drbd_overview_raw
+                           if x.strip()]
+
+    # Make a dict of status data per resource (zip stops at the shorter list)
+    # Resource name is extracted from the first item of each list
+    # e.g test is extracted from 1:test/0
+    resource_status_map = {re.split(':|/', x[0])[1]:
+                           dict(zip(headers, x[1:]))
+                           for x in drbd_overview_split}
+
+    return resource_status_map
+
+
+def check_resource(resource, resource_status):
+    """
+    Compute resource status based on connection state and disk state,
+    and construct and print an appropriate error string.
+    :param resource: string
+    :param resource_status: dict
+    :returns: boolean
+    """
+    # .get() so that a state missing from the parsed line is reported as
+    # unexpected (None) instead of raising KeyError
+    cstate = resource_status.get('cstate')
+    dstate = resource_status.get('dstate')
+
+    errors = []
+    if cstate != 'Connected':
+        errors.append('{}: Unexpected connected state: {}'.format(
+            resource, cstate))
+    if dstate != 'UpToDate/UpToDate':
+        errors.append('{}: Unexpected disk state: {}'.format(
+            resource, dstate))
+    if errors:
+        print(', '.join(errors))
+
+    return not errors
+
+
+def main():
+    """Exit with status 1 unless every requested DRBD resource is healthy"""
+    parser = argparse.ArgumentParser(description='Check DRBD Status')
+    parser.add_argument('resource', help="Name of resource or 'all'")
+    args = parser.parse_args()
+
+    statuses = parse_drbd_overview()
+    if args.resource != 'all':
+        if args.resource not in statuses:
+            print('{}: Resource not found'.format(args.resource))
+            sys.exit(1)
+        statuses = {args.resource: statuses[args.resource]}
+    # List (not generator) so that every failing resource gets reported
+    if not all([check_resource(r, s) for r, s in statuses.items()]):
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/modules/labstore/manifests/monitoring/drbd.pp b/modules/labstore/manifests/monitoring/drbd.pp
new file mode 100644
index 0000000..da2d30e
--- /dev/null
+++ b/modules/labstore/manifests/monitoring/drbd.pp
@@ -0,0 +1,34 @@
+# == Class: labstore::monitoring::drbd
+#
+# Installs icinga checks to make sure resource status on a drbd node is OK,
+# and to check that the nodes conform to the expected drbd roles.
+#
+# [*drbd_role*] role this node is expected to have, primary|secondary
+# [*resource*]  drbd resource to check, or 'all' (the default)
+class labstore::monitoring::drbd($drbd_role, $resource = 'all') {
+    file { '/usr/local/sbin/check_drbd_status':
+        source => 'puppet:///modules/labstore/monitor/check_drbd_status',
+        mode   => '0755',
+        owner  => 'root',
+        group  => 'root',
+    }
+
+    nrpe::monitor_service { 'check-drbd-status':
+        description  => 'Check status of DRBD node',
+        nrpe_command => "/usr/local/sbin/check_drbd_status ${resource}",
+        require      => File['/usr/local/sbin/check_drbd_status'],
+    }
+
+    file { '/usr/local/sbin/check_drbd_role':
+        source => 'puppet:///modules/labstore/monitor/check_drbd_role',
+        mode   => '0755',
+        owner  => 'root',
+        group  => 'root',
+    }
+
+    nrpe::monitor_service { 'check_drbd_role':
+        description  => 'Check role of DRBD node',
+        nrpe_command => "/usr/local/sbin/check_drbd_role ${::hostname} ${drbd_role}",
+        require      => File['/usr/local/sbin/check_drbd_role'],
+    }
+}
diff --git a/modules/role/manifests/labs/nfs/secondary.pp b/modules/role/manifests/labs/nfs/secondary.pp
index 292fea5..cd0b8c5 100644
--- a/modules/role/manifests/labs/nfs/secondary.pp
+++ b/modules/role/manifests/labs/nfs/secondary.pp
@@ -17,6 +17,8 @@
             address   => '10.64.37.26',
             prefixlen => '24',
         }
+        # Define DRBD role for this host, should come from hiera
+        $drbd_role = 'secondary'
     }
 
     if $::hostname == 'labstore1004' {
@@ -25,6 +27,8 @@
             address   => '10.64.37.25',
             prefixlen => '24',
         }
+        # Define DRBD role for this host, should come from hiera
+        $drbd_role = 'primary'
     }
 
     # TODO: hiera this
@@ -56,4 +60,8 @@
         disk         => '/dev/misc/others',
         require      => Interface::Ip['drbd-replication'],
     }
+
+    class { 'labstore::monitoring::drbd':
+        drbd_role => $drbd_role,
+    }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/311723
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ic8639ae29973f29cc6f17007b76e926ab35a1143
Gerrit-PatchSet: 14
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Madhuvishy <mviswanat...@wikimedia.org>
Gerrit-Reviewer: Madhuvishy <mviswanat...@wikimedia.org>
Gerrit-Reviewer: Rush <r...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to