Review: Needs Fixing Some comments / notes inline. One issue with smartctl checks in general is that they won't work well with some types of RAID cards.
I think it'd be good to have the README updated for the smartctl checks, suggest to add a warning about smartctl and RAID controllers there. For future work, could expand this check for Dell megaraid cards. Some howto: https://www.thomas-krenn.com/en/wiki/Smartmontools_with_MegaRAID_Controller This will required the storcli binary, see here: https://discourse.maas.io/t/running-smart-tests-against-megaraid-controllers/185 Diff comments: > diff --git a/src/files/smart/check_smart.py b/src/files/smart/check_smart.py > new file mode 100755 > index 0000000..a931b57 > --- /dev/null > +++ b/src/files/smart/check_smart.py > @@ -0,0 +1,51 @@ > +#!/usr/bin/env python3 > +# -*- coding: us-ascii -*- > + > +import subprocess > +from nagios_plugin3 import CriticalError, UnknownError, try_check > + > + > +def check_smartctl(): > + """Check the status of disk devices using SMART > + > + Use smartctl to collect a list of supported devices, and check the > status of each. > + See the smartctl(8) manpage for a description of the return codes and > what they mean. > + :return: either a string to indicate the devices are OK, or raise an > exception. > + """ > + devices = smartctl_scan() > + errors = [] > + for device in devices: > + command = ['sudo', 'smartctl', '-H', device, '--quietmode=silent'] This won't work with raid cards. E.g. Dell megaraid will return sudo smartctl -H /dev/sda smartctl 6.5 2016-01-24 r4214 [x86_64-linux-4.15.0-45-generic] (local build) Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org Smartctl open device: /dev/sda failed: DELL or MegaRaid controller, please try adding '-d megaraid,N' Could we use '--quietmode=errorsonly' instead and have some addtl error info right away? > + try: > + status_out = subprocess.run(command) > + except subprocess.CalledProcessError: > + # smartctl returned a non-OK state, but it's silent so won't > output any stdout nor stderr for that > + errors.append("{} RC: {}".format(device, status_out.returncode)) ... and include the errorsonly output from ^^, and also print them out? > + if len(errors) > 0: Nit: could get rid of superfluous len() and use if errors: ... > + raise CriticalError("Smartctl found drives not OK, > {}".format(",".join(errors))) > + msg = "OK: All SMART devices report OK" > + print(msg) > + return "check_smartctl returned OK" > + > + > +def smartctl_scan(): > + """Run smartctl --scan to get a list of devices > + > + :return: a list of devices that are capable of smartctl checks > + """ > + command = ['smartctl', '--scan'] > + try: > + raw_devices = subprocess.check_output(command).decode() > + except subprocess.CalledProcessError as e: > + raise UnknownError("smartctl failed to scan with error {}".format(e)) > + raw_devices = raw_devices.strip() > + devices = [dev.split(' ')[0] for dev in raw_devices.split('\n')] Nit, maybe use raw_devices.splitlines() > + return devices > + > + > +def main(): > + try_check(check_smartctl) > + > + > +if __name__ == '__main__': > + main() > diff --git a/src/lib/utils/hwdiscovery.py b/src/lib/utils/hwdiscovery.py > index c144fa4..9da532f 100644 > --- a/src/lib/utils/hwdiscovery.py > +++ b/src/lib/utils/hwdiscovery.py > @@ -33,14 +33,16 @@ def get_tools(): > # LSI3008-IR > tools.add('sas3ircu') > > - # HPE (Gen10 and newer) or HP (older) > hw_vendor = dmidecode('system-manufacturer') > - if hw_vendor == 'HPE': > + if hw_vendor == 'Supermicro': > + tools.add('sas3ircu') This seems to be a different topic than smartctl chks, is that on purpose? Maybe note in commit message > + """ TODO implement HP/HPE monitoring > + # HPE (Gen10 and newer) or HP (older) > + elif hw_vendor == 'HPE': > tools.add('hpe') # ['hponcfg', 'ilorest'] > elif hw_vendor == 'HP': > tools.add('hp') # ['hponcfg', 'hp-health', 'hpssacli'] > - elif hw_vendor == 'Supermicro': > - tools.add('sas3ircu') > + """ > > # SW RAID? > if _supports_mdadm(): > diff --git a/src/lib/utils/tooling.py b/src/lib/utils/tooling.py > index bef4b4f..7d93792 100644 > --- a/src/lib/utils/tooling.py > +++ b/src/lib/utils/tooling.py > @@ -196,7 +210,10 @@ def configure_nrpe_checks(tools): > hookenv.log( > 'Cronjob file [{}] removed'.format(cronjob_file), > hookenv.DEBUG) > - > + if os.path.exists('/usr/bin/sensors'): Also this seems to be a different topic than smartctl chks, is that on purpose? Maybe note in commit message > + nrpe_setup.add_check('check_sensors', "lmsensors status", > '/usr/lib/nagios/plugins/check_sensors') > + nrpe_setup.write() > + count += 1 > return count > 0 > > > diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py > index 29cea0f..1890eb5 100644 > --- a/src/reactive/hw_health.py > +++ b/src/reactive/hw_health.py > @@ -36,16 +37,21 @@ def install(): > status.blocked('Hardware not supported') > else: > resource = hookenv.resource_get('tools') > + resource_needed_tools = [tool for tool in tools if > TOOLING[tool].get('install') == 'resource'] > if resource: > hookenv.log('Installing from resource') > status.waiting('Installing from attached resource') > installed = install_resource(tools, resource) > else: > - hookenv.log( > - 'Missing Juju resource: tools - alternative method is not' > - ' available yet', hookenv.ERROR) > - status.blocked('Missing Juju resource: tools') > - return > + if resource_needed_tools: > + # we know that this tool needs to be provided by a resource > but there's no resource available > + hookenv.log( > + 'Missing Juju resource: tools - alternative method is > not' > + ' available yet', hookenv.ERROR) Wonder if this might be misleading, as in "not available yet but this process will keep searching", poss. just say "alternative method is not available" > + status.blocked('Missing Juju resource: tools') > + return > + else: > + installed = True > # TODO(aluria): install vendor utilities via snaps > # > https://forum.snapcraft.io/t/request-for-classic-confinement-sas2ircu/9023 > # Snap interface for hw tooling is not supported > diff --git a/src/tests/unit/test_check_smart.py > b/src/tests/unit/test_check_smart.py > new file mode 100644 > index 0000000..f3556d1 > --- /dev/null > +++ b/src/tests/unit/test_check_smart.py > @@ -0,0 +1,31 @@ > +import sys > +import unittest > +import unittest.mock as mock > + > +sys.path.append('files/smart') > +import check_smart # noqa E402 > + > + > +class TestCheckSmart(unittest.TestCase): > + > + @mock.patch('subprocess.check_output') > + def test_smartctl_scan(self, mock_smartctl): > + mock_smartctl.return_value = b"""/dev/sda -d scsi # /dev/sda, SCSI > device > +/dev/sdb -d scsi # /dev/sdb, SCSI device > +/dev/sdc -d scsi # /dev/sdc, SCSI device > +/dev/sdd -d scsi # /dev/sdd, SCSI device > +/dev/sde -d scsi # /dev/sde, SCSI device > + > + """ > + actual = check_smart.smartctl_scan() > + expected = ['/dev/sda', '/dev/sdb', '/dev/sdc', '/dev/sdd', > '/dev/sde'] > + self.assertEqual(actual, expected) > + > + @mock.patch('subprocess.run') > + @mock.patch('check_smart.smartctl_scan') > + def test_check_smartctl(self, mock_devices, mock_smartctl): > + mock_smartctl.return_code = 0 > + mock_devices.return_value = ['/dev/sda'] > + actual = check_smart.check_smartctl() > + expected = "check_smartctl returned OK" > + self.assertEqual(actual, expected) Would be great to have a test for a failing drive as well -- https://code.launchpad.net/~xavpaice/hw-health-charm/+git/hw-health-charm/+merge/363834 Your team Nagios Charm developers is subscribed to branch hw-health-charm:master. -- Mailing list: https://launchpad.net/~nagios-charmers Post to : [email protected] Unsubscribe : https://launchpad.net/~nagios-charmers More help : https://help.launchpad.net/ListHelp

