I've been playing with the existing HealthSMART RA in Pacemaker and have discovered a number of fundamental bugs and errors with it that mean it will never have worked for anyone.
I've done a big overhaul of this RA, replacing most of the logic to make it work properly, and extending the component to cover a few more use-cases: - Handles lists of drives - Handles 'devices' behind drives - useful for RAID devices e.g 3ware, megaraid etc - Threshold for 'yellow' is now configurable (relative to upper/lower limits) - Various bugfixes, typos, indentation etc I've attached the whole RA to thsi email, as its changed enough that a diff would have just been unhelpful. I'm fairly sure that the logic behind the smartctl result handling is correct, I'm pretty sure the shell is all valid, but as I've been working with an existing 'template' I can't guarantee that the ocf-related bits are valid, as this is the area I know least about. I've tested it as much as I can, but I'd be grateful if other people could review the code/test the RA and let me know if there are any obvious bugs, errors or 'just plain bad code' in it :) Thanks, Matthew -- The University of Edinburgh is a charitable body, registered in Scotland, with registration number SC005336.
#!/bin/sh
#
#
# HealthSMART OCF RA. Checks the S.M.A.R.T. status of all given
# drives and writes the #health-smart status into the CIB
#
# Copyright (c) 2009 Michael Schwartzkopff, 2010 Matthew Richardson
#
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#######################################################################
#######################################################################
# Initialization:
. ${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs
#
SMARTCTL=/usr/sbin/smartctl
ATTRDUP=/usr/sbin/attrd_updater
#######################################################################
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="HealthSMART" version="0.1">
<version>0.1</version>
<longdesc lang="en">
Systhem health agent that checks the S.M.A.R.T. status of the given drives and
updates the #health-smart attribute.
</longdesc>
<shortdesc lang="en">SMART health status</shortdesc>
<parameters>
<parameter name="state" unique="1">
<longdesc lang="en">
Location to store the resource state in.
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string"
default="${HA_VARRUN}/HealthSMART-{OCF_RESOURCE_INSTANCE}.state" />
</parameter>
<parameter name="drives" unique="1">
<longdesc lang="en">
The drive(s) to check as a SPACE separated list. Enter the full path to the
device, e.g. "/dev/sda".
</longdesc>
<shortdesc lang="en">Drives to check</shortdesc>
<content type="string" default="/dev/sda" />
</parameter>
<parameter name="devices" unique="1">
<longdesc lang="en">
The device type(s) to assume for the drive(s) being tested as a SPACE separated
list.
</longdesc>
<shortdesc lang="en">Device types</shortdesc>
<content type="string" />
</parameter>
<parameter name="temp_lower_limit" unique="1">
<longdesc lang="en">
Lower limit of the temperature in deg C of the drive(s). Below this limit the
status will be red.
</longdesc>
<shortdesc lang="en">Lower limit for the red smart attribute</shortdesc>
content type="string" default="0"/>
</parameter>
<parameter name="temp_upper_limit" unique="1">
<longdesc lang="en">
Upper limit of the temperature if deg C of the drives(s). If the drive reports
a temperature higher than this value the status of #health-smart will be red.
</longdesc>
<shortdesc lang="en">Upper limit for red smart attribute</shortdesc>
<content type="string" default="60"/>
</parameter>
<parameter name="temp_warning" unique="1">
<longdesc lang="en">
Number of deg C below/above the upper/lower temp limits at which point the
status of #health-smart will change to yellow.
</longdesc>
<shortdesc lang="en">Deg C below/above the upper limits for yellow smart
attribute</shortdesc>
<content type="string" default="5"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="10" />
<action name="stop" timeout="10" />
<action name="monitor" timeout="10" interval="10" start-delay="0" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="10" />
</actions>
</resource-agent>
END
}
#######################################################################
check_temperature() {
if [ $1 -lt ${lower_red_limit} ] ; then
ocf_log info "Drive ${DRIVE} ${DEVICE} too cold: ${1} C"
$ATTRDUP -n "#health-smart" -U "red" -d "5s"
return 1
fi
if [ $1 -gt ${upper_red_limit} ] ; then
ocf_log info "Drive ${DRIVE} ${DEVICE} too hot: ${1} C"
$ATTRDUP -n "#health-smart" -U "red" -d "5s"
return 1
fi
if [ $1 -lt ${lower_yellow_limit} ] ; then
ocf_log info "Drive ${DRIVE} ${DEVICE} quite cold: ${1} C"
$ATTRDUP -n "#health-smart" -U "yellow" -d "5s"
return 1
fi
if [ $1 -gt ${upper_yellow_limit} ] ; then
ocf_log info "Drive ${DRIVE} ${DEVICE} quite hot: ${1} C"
$ATTRDUP -n "#health-smart" -U "yellow" -d "5s"
return 1
fi
}
init_smart() {
#Set temperature defaults
if [ -z ${OCF_RESKEY_temp_warning} ]; then
yellow_threshold=5
else
yellow_threshold=${OCF_RESKEY_temp_warning}
fi
if [ -z ${OCF_RESKEY_temp_lower_limit} ] ; then
lower_red_limit=0
else
lower_red_limit=${OCF_RESKEY_temp_lower_limit}
fi
lower_yellow_limit=$((${lower_red_limit}+${yellow_threshold}))
if [ -z ${OCF_RESKEY_temp_upper_limit} ] ; then
upper_red_limit=60
else
upper_red_limit=${OCF_RESKEY_temp_upper_limit}
fi
upper_yellow_limit=$((${upper_red_limit}-${yellow_threshold}))
#Set disk defaults
if [ -z ${OCF_RESKEY_drives} ] ; then
DRIVES="/dev/sda"
else
DRIVES=${OCF_RESKEY_drives}
fi
#Test for presence of smartctl
if [ ! -x $SMARTCTL ] ; then
ocf_log err "${SMARTCTL} not installed."
exit $OCF_ERR_INSTALLED
fi
for DRIVE in $DRIVES; do
if [ "${OCF_RESKEY_devices}" ]; then
for DEVICE in ${OCF_RESKEY_devices}; do
$SMARTCTL -d $DEVICE -i ${DRIVE} | grep -q "SMART support is:
Enabled"
if [ $? -ne "0" ] ; then
ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE}
exit $OCF_ERR_INSTALLED
fi
done
else
$SMARTCTL -i ${DRIVE} | grep -q "SMART support is: Enabled"
if [ $? -ne "0" ] ; then
ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE}
exit $OCF_ERR_INSTALLED
fi
fi
done
}
HealthSMART_usage() {
cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}
Expects to have a fully populated OCF RA-compliant environment set.
END
}
HealthSMART_start() {
HealthSMART_monitor
if [ $? = $OCF_SUCCESS ]; then
return $OCF_SUCCESS
fi
touch ${OCF_RESKEY_state}
}
HealthSMART_stop() {
HealthSMART_monitor
if [ $? = $OCF_SUCCESS ]; then
rm ${OCF_RESKEY_state}
fi
return $OCF_SUCCESS
}
HealthSMART_monitor() {
init_smart
# Monitor _MUST!_ differentiate correctly between running
# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
# That is THREE states, not just yes/no.
if [ -f ${OCF_RESKEY_state} ]; then
# Check overall S.M.A.R.T. status
for DRIVE in $DRIVES; do
if [ "${OCF_RESKEY_devices}" ]; then
for DEVICE in ${OCF_RESKEY_devices}; do
$SMARTCTL -d $DEVICE -H ${DRIVE} | grep -q "SMART
overall-health self-assessment test result: PASSED"
if [ $? -ne "0" ]; then
$ATTRDUP -n "#health-smart" -U "red" -d "5s"
return $OCF_SUCCESS
fi
done
else
$SMARTCTL -H ${DRIVE} | grep -q "SMART overall-health
self-assessment test result: PASSED"
if [ $? -ne "0" ]; then
$ATTRDUP -n "#health-smart" -U "red" -d "5s"
return $OCF_SUCCESS
fi
fi
# Check drive temperature(s)
if [ "${OCF_RESKEY_devices}" ]; then
for DEVICE in ${OCF_RESKEY_devices}; do
check_temperature `$SMARTCTL $DEVICE -A ${DRIVE} | awk
'/^194/ { print $10 }'`
if [ $? != 0 ]; then
return $OCF_SUCCESS
fi
done
else
check_temperature `$SMARTCTL $DEVICE -A ${DRIVE} | awk '/^194/
{ print $10 }'`
if [ $? != 0 ]; then
return $OCF_SUCCESS
fi
fi
done
$ATTRDUP -n "#health-smart" -U "green" -d "5s"
return $OCF_SUCCESS
fi
return $OCF_NOT_RUNNING
}
HealthSMART_validate() {
init_smart
# Is the state directory writable?
state_dir=`dirname "$OCF_RESKEY_state"`
touch "$state_dir/$$"
if [ $? != 0 ]; then
return $OCF_ERR_ARGS
fi
rm "$state_dir/$$"
return $OCF_SUCCESS
}
: ${OCF_RESKEY_CRM_meta_interval=0}
: ${OCF_RESKEY_CRM_meta_globally_unique:="true"}
if [ "x$OCF_RESKEY_state" = "x" ]; then
if [ ${OCF_RESKEY_CRM_meta_globally_unique} = "false" ]; then
state="${HA_VARRUN}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"
# Strip off the trailing clone marker
OCF_RESKEY_state=`echo $state | sed s/:[0-9][0-9]*\.state/.state/`
else
OCF_RESKEY_state="${HA_VARRUN}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"
fi
fi
case $__OCF_ACTION in
meta-data) meta_data
exit $OCF_SUCCESS
;;
start) HealthSMART_start;;
stop) HealthSMART_stop;;
monitor) HealthSMART_monitor;;
validate-all) HealthSMART_validate;;
usage|help) HealthSMART_usage
exit $OCF_SUCCESS
;;
*) HealthSMART_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
signature.asc
Description: OpenPGP digital signature
_______________________________________________ Linux-HA mailing list [email protected] http://lists.linux-ha.org/mailman/listinfo/linux-ha See also: http://linux-ha.org/ReportingProblems
