Xav Paice has proposed merging ~xavpaice/hw-health-charm:add_smartctl into hw-health-charm:master.
Requested reviews: Nagios Charm developers (nagios-charmers) For more details, see: https://code.launchpad.net/~xavpaice/hw-health-charm/+git/hw-health-charm/+merge/363833 -- Your team Nagios Charm developers is requested to review the proposed merge of ~xavpaice/hw-health-charm:add_smartctl into hw-health-charm:master.
diff --git a/Makefile b/Makefile index 20a6f9b..8f9d4fb 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,12 @@ help: lint: @echo "Running flake8" +<<<<<<< Makefile cd src && tox -e pep8 +======= + @flake8 --max-line-length=120 ./src +>>>>>>> Makefile test: unittest lint diff --git a/src/actions/actions.py b/src/actions/actions.py index 9e27f0c..a851b2a 100755 --- a/src/actions/actions.py +++ b/src/actions/actions.py @@ -18,7 +18,13 @@ import os import subprocess import sys +<<<<<<< src/actions/actions.py from charmhelpers.core.hookenv import action_set, action_fail, log +======= +from charms.layer import basic # noqa E402 +basic.bootstrap_charm_deps() +from charmhelpers.core.hookenv import action_set, action_fail, log # noqa E402 +>>>>>>> src/actions/actions.py def clear_sel(args): diff --git a/src/config.yaml b/src/config.yaml index 1a5bcfd..9dbf3e8 100644 --- a/src/config.yaml +++ b/src/config.yaml @@ -16,4 +16,8 @@ options: enable_ipmi: type: boolean default: True - description: Enable the use of freeipmi tools to monitor hardware status. \ No newline at end of file + description: Enable the use of freeipmi tools to monitor hardware status. + enable_smart: + type: boolean + default: True + description: Enable SMART monitoring of HDDs. \ No newline at end of file diff --git a/src/files/smart/check_smart.py b/src/files/smart/check_smart.py new file mode 100755 index 0000000..a931b57 --- /dev/null +++ b/src/files/smart/check_smart.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding: us-ascii -*- + +import subprocess +from nagios_plugin3 import CriticalError, UnknownError, try_check + + +def check_smartctl(): + """Check the status of disk devices using SMART + + Use smartctl to collect a list of supported devices, and check the status of each. + See the smartctl(8) manpage for a description of the return codes and what they mean. + :return: either a string to indicate the devices are OK, or raise an exception. + """ + devices = smartctl_scan() + errors = [] + for device in devices: + command = ['sudo', 'smartctl', '-H', device, '--quietmode=silent'] + try: + status_out = subprocess.run(command) + except subprocess.CalledProcessError: + # smartctl returned a non-OK state, but it's silent so won't output any stdout nor stderr for that + errors.append("{} RC: {}".format(device, status_out.returncode)) + if len(errors) > 0: + raise CriticalError("Smartctl found drives not OK, {}".format(",".join(errors))) + msg = "OK: All SMART devices report OK" + print(msg) + return "check_smartctl returned OK" + + +def smartctl_scan(): + """Run smartctl --scan to get a list of devices + + :return: a list of devices that are capable of smartctl checks + """ + command = ['smartctl', '--scan'] + try: + raw_devices = subprocess.check_output(command).decode() + except subprocess.CalledProcessError as e: + raise UnknownError("smartctl failed to scan with error {}".format(e)) + raw_devices = raw_devices.strip() + devices = [dev.split(' ')[0] for dev in raw_devices.split('\n')] + return devices + + +def main(): + try_check(check_smartctl) + + +if __name__ == '__main__': + main() diff --git a/src/files/smart/check_smart_sudoer b/src/files/smart/check_smart_sudoer new file mode 100644 index 0000000..989729a --- /dev/null +++ b/src/files/smart/check_smart_sudoer @@ -0,0 +1,2 @@ +nagios ALL=(root) NOPASSWD: /usr/sbin/smartctl + diff --git a/src/layer.yaml b/src/layer.yaml index 054b43e..a3c8c45 100644 --- a/src/layer.yaml +++ b/src/layer.yaml @@ -12,5 +12,7 @@ options: packages: - "freeipmi" - "libipc-run-perl" + - "smartmontools" + - "lm-sensors" repo: "lp:hw-health-charm" is: "hw-health" diff --git a/src/lib/utils/hwdiscovery.py b/src/lib/utils/hwdiscovery.py index c144fa4..9da532f 100644 --- a/src/lib/utils/hwdiscovery.py +++ b/src/lib/utils/hwdiscovery.py @@ -33,14 +33,16 @@ def get_tools(): # LSI3008-IR tools.add('sas3ircu') - # HPE (Gen10 and newer) or HP (older) hw_vendor = dmidecode('system-manufacturer') - if hw_vendor == 'HPE': + if hw_vendor == 'Supermicro': + tools.add('sas3ircu') + """ TODO implement HP/HPE monitoring + # HPE (Gen10 and newer) or HP (older) + elif hw_vendor == 'HPE': tools.add('hpe') # ['hponcfg', 'ilorest'] elif hw_vendor == 'HP': tools.add('hp') # ['hponcfg', 'hp-health', 'hpssacli'] - elif hw_vendor == 'Supermicro': - tools.add('sas3ircu') + """ # SW RAID? if _supports_mdadm(): @@ -49,6 +51,9 @@ def get_tools(): if hookenv.config('enable_ipmi'): tools.add('ipmi') + if hookenv.config('enable_smart'): + tools.add('smart') + return tools diff --git a/src/lib/utils/tooling.py b/src/lib/utils/tooling.py index bef4b4f..7d93792 100644 --- a/src/lib/utils/tooling.py +++ b/src/lib/utils/tooling.py @@ -13,17 +13,20 @@ TOOLING = { 'megacli': { 'snap': 'megacli', 'filename': 'megaraid/check_megacli.py', - 'cronjob': 'megaraid/check_megacli.sh' + 'cronjob': 'megaraid/check_megacli.sh', + 'install': 'resource' }, 'sas2ircu': { 'snap': 'sas2ircu', 'filename': 'mpt/check_sas2ircu.py', - 'cronjob': 'mpt/check_sas2ircu.sh' + 'cronjob': 'mpt/check_sas2ircu.sh', + 'install': 'resource' }, 'sas3ircu': { 'snap': 'sas3ircu', 'filename': 'mpt/check_sas3ircu.py', - 'cronjob': 'mpt/check_sas3ircu.sh' + 'cronjob': 'mpt/check_sas3ircu.sh', + 'install': 'resource' }, 'mdadm': { 'filename': 'check_mdadm.py', @@ -33,6 +36,10 @@ TOOLING = { 'filename': 'ipmi/check_ipmi_sensor', 'sudoers': 'ipmi/check_ipmi_sensor_sudoer' }, + 'smart': { + 'filename': 'smart/check_smart.py', + 'sudoers': 'smart/check_smart_sudoer' + } } PLUGINS_DIR = '/usr/local/lib/nagios/plugins/' @@ -44,10 +51,17 @@ SUDOERS_DIR = '/etc/sudoers.d' def install_resource(tools, resource): """Install hardware diagnostic tools from a charm resource + Install hardware diagnostic tools from a charm resource. We currently exclude a set of tools which are + installed by packages elsewhere. :param tools: set of tools to include :param resource: resource object :return: boolean, status of installation """ + exclude_tools = set() + for tool in TOOLING.keys(): + if not TOOLING[tool].get('install') == 'resource': + # any tool that isn't marked to be installed by resource needs excluding further down + exclude_tools.add(tool) if not isinstance(tools, set): try: tools = set(tools) @@ -56,8 +70,8 @@ def install_resource(tools, resource): return False if not tools: return False - elif tools & {'mdadm', 'ipmi'}: - tools = tools - {'mdadm', 'ipmi'} + elif tools & exclude_tools: # if the two sets intersect + tools = tools - exclude_tools if not tools: return True @@ -196,7 +210,10 @@ def configure_nrpe_checks(tools): hookenv.log( 'Cronjob file [{}] removed'.format(cronjob_file), hookenv.DEBUG) - + if os.path.exists('/usr/bin/sensors'): + nrpe_setup.add_check('check_sensors', "lmsensors status", '/usr/lib/nagios/plugins/check_sensors') + nrpe_setup.write() + count += 1 return count > 0 @@ -219,6 +236,7 @@ def remove_nrpe_checks(tools): hookenv.log('Check removed from nrpe: {}'.format(tool)) count += 1 + # TODO remove nrpe check for check_sensors if count > 0: nrpe_setup.write() return True diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py index 29cea0f..1890eb5 100644 --- a/src/reactive/hw_health.py +++ b/src/reactive/hw_health.py @@ -14,6 +14,7 @@ from utils.tooling import ( configure_nrpe_checks, install_resource, remove_nrpe_checks, + TOOLING ) @@ -36,16 +37,21 @@ def install(): status.blocked('Hardware not supported') else: resource = hookenv.resource_get('tools') + resource_needed_tools = [tool for tool in tools if TOOLING[tool].get('install') == 'resource'] if resource: hookenv.log('Installing from resource') status.waiting('Installing from attached resource') installed = install_resource(tools, resource) else: - hookenv.log( - 'Missing Juju resource: tools - alternative method is not' - ' available yet', hookenv.ERROR) - status.blocked('Missing Juju resource: tools') - return + if resource_needed_tools: + # we know that this tool needs to be provided by a resource but there's no resource available + hookenv.log( + 'Missing Juju resource: tools - alternative method is not' + ' available yet', hookenv.ERROR) + status.blocked('Missing Juju resource: tools') + return + else: + installed = True # TODO(aluria): install vendor utilities via snaps # https://forum.snapcraft.io/t/request-for-classic-confinement-sas2ircu/9023 # Snap interface for hw tooling is not supported @@ -62,8 +68,8 @@ def install(): set_flag('hw-health.installed') status.waiting('Preparing scripts installation') else: - missing_tools = tools - {'mdadm', 'ipmi'} - status.blocked('Tools not found: {}'.format(', '.join(missing_tools))) + status.blocked('Not all required tools found in tools resource: {}'.format( + ', '.join(resource_needed_tools))) @when('hw-health.installed') diff --git a/src/tests/charms/__init__.py b/src/tests/charms/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/src/tests/charms/__init__.py diff --git a/src/tests/charms/layer/__init__.py b/src/tests/charms/layer/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/src/tests/charms/layer/__init__.py diff --git a/src/tests/charms/layer/basic.py b/src/tests/charms/layer/basic.py new file mode 100644 index 0000000..5434905 --- /dev/null +++ b/src/tests/charms/layer/basic.py @@ -0,0 +1,2 @@ +def bootstrap_charm_deps(): + pass diff --git a/src/tests/unit/test_actions.py b/src/tests/unit/test_actions.py index 3df07a3..bb5a492 100644 --- a/src/tests/unit/test_actions.py +++ b/src/tests/unit/test_actions.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import sys import unittest import unittest.mock as mock -from actions.actions import clear_sel +sys.path.insert(0, '{}/{}'.format(os.getcwd(), 'tests')) +from actions.actions import clear_sel # noqa: E402 class ClearSelTestCase(unittest.TestCase): @@ -27,5 +30,9 @@ class ClearSelTestCase(unittest.TestCase): sel_output = "Unittest system event log output" mock_subprocess.return_value = sel_output mock_check_call.return_value = None +<<<<<<< src/tests/unit/test_actions.py clear_sel('dummy_arg') +======= + clear_sel('unittest') +>>>>>>> src/tests/unit/test_actions.py mock_check_call.assert_called_once_with(['action-set', "cleared_log={}".format(sel_output)]) diff --git a/src/tests/unit/test_check_smart.py b/src/tests/unit/test_check_smart.py new file mode 100644 index 0000000..f3556d1 --- /dev/null +++ b/src/tests/unit/test_check_smart.py @@ -0,0 +1,31 @@ +import sys +import unittest +import unittest.mock as mock + +sys.path.append('files/smart') +import check_smart # noqa E402 + + +class TestCheckSmart(unittest.TestCase): + + @mock.patch('subprocess.check_output') + def test_smartctl_scan(self, mock_smartctl): + mock_smartctl.return_value = b"""/dev/sda -d scsi # /dev/sda, SCSI device +/dev/sdb -d scsi # /dev/sdb, SCSI device +/dev/sdc -d scsi # /dev/sdc, SCSI device +/dev/sdd -d scsi # /dev/sdd, SCSI device +/dev/sde -d scsi # /dev/sde, SCSI device + + """ + actual = check_smart.smartctl_scan() + expected = ['/dev/sda', '/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde'] + self.assertEqual(actual, expected) + + @mock.patch('subprocess.run') + @mock.patch('check_smart.smartctl_scan') + def test_check_smartctl(self, mock_devices, mock_smartctl): + mock_smartctl.return_code = 0 + mock_devices.return_value = ['/dev/sda'] + actual = check_smart.check_smartctl() + expected = "check_smartctl returned OK" + self.assertEqual(actual, expected) diff --git a/src/tests/unit/test_hwdiscovery.py b/src/tests/unit/test_hwdiscovery.py index 7cb516e..5fc3342 100644 --- a/src/tests/unit/test_hwdiscovery.py +++ b/src/tests/unit/test_hwdiscovery.py @@ -1,6 +1,4 @@ import io -import os # noqa: F401 -import subprocess # noqa: F401 import sys import unittest import unittest.mock as mock @@ -26,7 +24,7 @@ class TestHWDiscovery(unittest.TestCase): driver.side_effect = side_effect_glob mock_hookenv.return_value = True real = hwdiscovery.get_tools() - expected = set(['megacli', 'ipmi']) + expected = set(['megacli', 'ipmi', 'smart']) self.assertEqual(real, expected) @mock.patch('hwdiscovery._supports_mdadm') @@ -42,10 +40,10 @@ class TestHWDiscovery(unittest.TestCase): supports_mdadm.return_value = False dmidecode.return_value = 'unittest' - mock_hookenv.return_value = True + mock_hookenv.return_value = False driver.side_effect = side_effect_glob real = hwdiscovery.get_tools() - expected = set(['sas2ircu', 'ipmi']) + expected = set(['sas2ircu']) self.assertEqual(real, expected) @mock.patch('hwdiscovery._supports_mdadm') @@ -68,6 +66,7 @@ class TestHWDiscovery(unittest.TestCase): expected = set(['sas3ircu']) self.assertEqual(real, expected) + """ TODO implement HP @mock.patch('hwdiscovery._supports_mdadm') @mock.patch('hwdiscovery.dmidecode') @mock.patch('glob.glob') @@ -93,6 +92,7 @@ class TestHWDiscovery(unittest.TestCase): real = hwdiscovery.get_tools() expected = set(['hp']) self.assertEqual(real, expected) + """ @mock.patch('hwdiscovery._supports_mdadm') @mock.patch('hwdiscovery.dmidecode') diff --git a/src/tests/unit/test_tooling.py b/src/tests/unit/test_tooling.py index 5236f6e..9d55b83 100644 --- a/src/tests/unit/test_tooling.py +++ b/src/tests/unit/test_tooling.py @@ -131,3 +131,7 @@ class TestTooling(unittest.TestCase): def test_configure_tools_unsupported_tools(self): self.assertEqual( tooling.configure_tools(['unittest1', 'unittest2']), False) + + def test_configure_nrpe_checks(self): + # TODO write this test + pass
-- Mailing list: https://launchpad.net/~nagios-charmers Post to : nagios-charmers@lists.launchpad.net Unsubscribe : https://launchpad.net/~nagios-charmers More help : https://help.launchpad.net/ListHelp