I've been playing with the existing HealthSMART RA in Pacemaker and have
discovered a number of fundamental bugs and errors in it, which mean it
can never have worked for anyone.

I've done a big overhaul of this RA, replacing most of the logic to make
it work properly, and extending the component to cover a few more use-cases:

- Handles lists of drives
- Handles 'devices' behind drives - useful for RAID devices, e.g. 3ware,
megaraid etc.
- Threshold for 'yellow' is now configurable (relative to upper/lower
limits)
- Various bugfixes, typos, indentation etc

I've attached the whole RA to this email, as it's changed enough that a
diff would have just been unhelpful.

I'm fairly sure that the logic behind the smartctl result handling is
correct, I'm pretty sure the shell is all valid, but as I've been
working with an existing 'template' I can't guarantee that the
ocf-related bits are valid, as this is the area I know least about.

I've tested it as much as I can, but I'd be grateful if other people
could review the code/test the RA and let me know if there are any
obvious bugs, errors or 'just plain bad code' in it :)

Thanks,

Matthew


-- 
The University of Edinburgh is a charitable body, registered in
Scotland, with registration number SC005336.
#!/bin/sh
#
#
# HealthSMART OCF RA. Checks the S.M.A.R.T. status of all given
# drives and writes the #health-smart status into the CIB
#
# Copyright (c) 2009 Michael Schwartzkopff, 2010 Matthew Richardson
#
#                    All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.  Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#######################################################################

#######################################################################
# Initialization:

# Source the OCF shell helper library; the rest of this agent expects it to
# provide ocf_log and the OCF_* return codes. The path is quoted so that an
# OCF_ROOT containing whitespace still resolves to a single file name.
. "${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs"

# External tools used by this agent:
#   SMARTCTL - queries the S.M.A.R.T. data of a drive
#   ATTRDUP  - writes the #health-smart node attribute
SMARTCTL=/usr/sbin/smartctl
ATTRDUP=/usr/sbin/attrd_updater

#######################################################################

# meta_data
#
# Print the resource agent's XML meta-data on stdout. The here-document is
# deliberately unquoted so that ${HA_VARRUN} and ${OCF_RESOURCE_INSTANCE}
# are expanded into the advertised default of the "state" parameter.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="HealthSMART" version="0.1">
<version>0.1</version>

<longdesc lang="en">
System health agent that checks the S.M.A.R.T. status of the given drives and
updates the #health-smart attribute.
</longdesc>
<shortdesc lang="en">SMART health status</shortdesc>

<parameters>
<parameter name="state" unique="1">
<longdesc lang="en">
Location to store the resource state in.
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${HA_VARRUN}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state" />
</parameter>

<parameter name="drives" unique="1">
<longdesc lang="en">
The drive(s) to check as a SPACE separated list. Enter the full path to the
device, e.g. "/dev/sda".
</longdesc>
<shortdesc lang="en">Drives to check</shortdesc>
<content type="string" default="/dev/sda" />
</parameter>

<parameter name="devices" unique="1">
<longdesc lang="en">
The device type(s) to assume for the drive(s) being tested as a SPACE separated
list.
</longdesc>
<shortdesc lang="en">Device types</shortdesc>
<content type="string" />
</parameter>

<parameter name="temp_lower_limit" unique="1">
<longdesc lang="en">
Lower limit of the temperature in deg C of the drive(s). Below this limit the
status will be red.
</longdesc>
<shortdesc lang="en">Lower limit for the red smart attribute</shortdesc>
<content type="string" default="0"/>
</parameter>

<parameter name="temp_upper_limit" unique="1">
<longdesc lang="en">
Upper limit of the temperature in deg C of the drive(s). If the drive reports
a temperature higher than this value the status of #health-smart will be red.
</longdesc>
<shortdesc lang="en">Upper limit for red smart attribute</shortdesc>
<content type="string" default="60"/>
</parameter>

<parameter name="temp_warning" unique="1">
<longdesc lang="en">
Number of deg C below/above the upper/lower temp limits at which point the
status of #health-smart will change to yellow.
</longdesc>
<shortdesc lang="en">Deg C below/above the upper limits for yellow smart attribute</shortdesc>
<content type="string" default="5"/>
</parameter>

</parameters>

<actions>
<action name="start"        timeout="10" />
<action name="stop"         timeout="10" />
<action name="monitor"      timeout="10" interval="10" start-delay="0" />
<action name="meta-data"    timeout="5" />
<action name="validate-all"   timeout="10" />
</actions>
</resource-agent>
END
}

#######################################################################

# check_temperature TEMP
#
# Compare a drive temperature (integer, deg C) against the red and yellow
# limits and, when a limit is crossed, set the #health-smart attribute.
#
# Globals read: lower_red_limit, upper_red_limit, lower_yellow_limit,
#               upper_yellow_limit, ATTRDUP; DRIVE and DEVICE are only used
#               in the log messages.
# Returns: 0 if the temperature is within all limits, or if TEMP is missing
#          or not an integer (nothing can be judged, the attribute is left
#          untouched); 1 if the attribute was set to red or yellow.
check_temperature() {

    # A drive without a parsable reading yields an empty or non-numeric
    # argument. Catch that up front instead of letting every numeric test
    # below print a "[" error and fall through.
    case "${1#-}" in
        ''|*[!0-9]*)
            ocf_log warn "Drive ${DRIVE} ${DEVICE}: no usable temperature reading '${1}'"
            return 0
            ;;
    esac

    if [ "$1" -lt "${lower_red_limit}" ] ; then
        ocf_log info "Drive ${DRIVE} ${DEVICE} too cold: ${1} C"
        $ATTRDUP -n "#health-smart" -U "red" -d "5s"
        return 1
    fi

    if [ "$1" -gt "${upper_red_limit}" ] ; then
        ocf_log info "Drive ${DRIVE} ${DEVICE} too hot: ${1} C"
        $ATTRDUP -n "#health-smart" -U "red" -d "5s"
        return 1
    fi

    if [ "$1" -lt "${lower_yellow_limit}" ] ; then
        ocf_log info "Drive ${DRIVE} ${DEVICE} quite cold: ${1} C"
        $ATTRDUP -n "#health-smart" -U "yellow" -d "5s"
        return 1
    fi

    if [ "$1" -gt "${upper_yellow_limit}" ] ; then
        ocf_log info "Drive ${DRIVE} ${DEVICE} quite hot: ${1} C"
        $ATTRDUP -n "#health-smart" -U "yellow" -d "5s"
        return 1
    fi

    return 0
}


# init_smart
#
# Derive the working configuration from the OCF_RESKEY_* parameters and make
# sure smartctl is usable for every configured drive.
#
# Globals written: yellow_threshold, lower_red_limit, lower_yellow_limit,
#                  upper_red_limit, upper_yellow_limit, DRIVES (and DRIVE /
#                  DEVICE as loop variables).
# Exits the agent with OCF_ERR_INSTALLED when smartctl is not executable or
# when S.M.A.R.T. support is not reported as enabled for a drive.
init_smart() {
    # Temperature defaults: 5 deg C warning band, red below 0 / above 60.
    yellow_threshold=${OCF_RESKEY_temp_warning:-5}
    lower_red_limit=${OCF_RESKEY_temp_lower_limit:-0}
    upper_red_limit=${OCF_RESKEY_temp_upper_limit:-60}
    lower_yellow_limit=$((${lower_red_limit}+${yellow_threshold}))
    upper_yellow_limit=$((${upper_red_limit}-${yellow_threshold}))

    # Disk default. ${var:-default} copes with a SPACE separated list,
    # which an unquoted "[ -z ... ]" test does not.
    DRIVES=${OCF_RESKEY_drives:-/dev/sda}

    # Test for presence of smartctl
    if [ ! -x "$SMARTCTL" ] ; then
        ocf_log err "${SMARTCTL} not installed."
        exit $OCF_ERR_INSTALLED
    fi

    # DRIVES and OCF_RESKEY_devices are intentionally left unquoted in the
    # for-lists: they are SPACE separated lists and must be word-split.
    for DRIVE in $DRIVES; do
        if [ -n "${OCF_RESKEY_devices}" ]; then
            for DEVICE in ${OCF_RESKEY_devices}; do
                # The pattern must stay on a single line: grep treats a
                # newline inside a pattern as a separator between
                # alternative patterns.
                if ! "$SMARTCTL" -d "$DEVICE" -i "$DRIVE" \
                    | grep -q "SMART support is: Enabled"; then
                    ocf_log err "S.M.A.R.T. not enabled for drive ${DRIVE}"
                    exit $OCF_ERR_INSTALLED
                fi
            done
        else
            if ! "$SMARTCTL" -i "$DRIVE" \
                | grep -q "SMART support is: Enabled"; then
                ocf_log err "S.M.A.R.T. not enabled for drive ${DRIVE}"
                exit $OCF_ERR_INSTALLED
            fi
        fi
    done
}

# HealthSMART_usage
#
# Write a short usage summary, naming the actions this agent accepts,
# to stdout.
HealthSMART_usage() {
    printf '%s\n' \
        "usage: $0 {start|stop|monitor|validate-all|meta-data}" \
        "" \
        "Expects to have a fully populated OCF RA-compliant environment set."
}

# HealthSMART_start
#
# Start the resource. "Running" is represented purely by the existence of
# the state file, so starting means creating it.
#
# Returns: OCF_SUCCESS if the resource was already running or the state
#          file was created; OCF_ERR_GENERIC if it could not be created.
HealthSMART_start() {
    HealthSMART_monitor
    if [ $? -eq "$OCF_SUCCESS" ]; then
        # Already running, nothing to do.
        return $OCF_SUCCESS
    fi

    # Quoted so a state path containing whitespace names a single file.
    if ! touch "${OCF_RESKEY_state}"; then
        ocf_log err "Unable to create state file ${OCF_RESKEY_state}"
        return $OCF_ERR_GENERIC
    fi
    return $OCF_SUCCESS
}

# HealthSMART_stop
#
# Stop the resource by removing the state file.
#
# The state file is removed unconditionally rather than after a monitor
# run: the monitor calls init_smart, which exits the whole agent with
# OCF_ERR_INSTALLED when smartctl is unusable, and that would turn a stop
# request into a failed stop. "rm -f" also makes stopping an already
# stopped resource a no-op.
#
# Returns: OCF_SUCCESS always.
HealthSMART_stop() {
    rm -f "${OCF_RESKEY_state}"
    return $OCF_SUCCESS
}

# HealthSMART_monitor
#
# Report whether the resource is running and, if it is, refresh the
# #health-smart attribute from the S.M.A.R.T. data of every configured
# drive (and of every device type behind it, when OCF_RESKEY_devices is
# set).
#
# Attribute values written:
#   red    - a drive did not report an overall-health result of PASSED,
#            or check_temperature found a red temperature
#   yellow - check_temperature found a temperature in the warning band
#   green  - every check passed
#
# Returns: OCF_NOT_RUNNING if the state file does not exist, otherwise
#          OCF_SUCCESS (a bad drive is reported through the attribute,
#          not through the return code).
HealthSMART_monitor() {

    init_smart

    # Monitor _MUST!_ differentiate correctly between running
    # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
    # That is THREE states, not just yes/no.
    if [ ! -f "${OCF_RESKEY_state}" ]; then
        return $OCF_NOT_RUNNING
    fi

    # DRIVES and OCF_RESKEY_devices are SPACE separated lists and are
    # therefore expanded unquoted in the for-lists.
    for DRIVE in $DRIVES; do

        # Check overall S.M.A.R.T. status. The grep pattern has to be a
        # single line; a newline inside it would split it into several
        # alternative patterns.
        if [ -n "${OCF_RESKEY_devices}" ]; then
            for DEVICE in ${OCF_RESKEY_devices}; do
                if ! "$SMARTCTL" -d "$DEVICE" -H "$DRIVE" \
                    | grep -q "SMART overall-health self-assessment test result: PASSED"; then
                    $ATTRDUP -n "#health-smart" -U "red" -d "5s"
                    return $OCF_SUCCESS
                fi
            done
        else
            if ! "$SMARTCTL" -H "$DRIVE" \
                | grep -q "SMART overall-health self-assessment test result: PASSED"; then
                $ATTRDUP -n "#health-smart" -U "red" -d "5s"
                return $OCF_SUCCESS
            fi
        fi

        # Check drive temperature(s): column 10 of the attribute line
        # starting with 194. The command substitution is left unquoted so
        # that a missing reading results in no argument at all.
        if [ -n "${OCF_RESKEY_devices}" ]; then
            for DEVICE in ${OCF_RESKEY_devices}; do
                if ! check_temperature $("$SMARTCTL" -d "$DEVICE" -A "$DRIVE" \
                    | awk '/^194/ { print $10 }'); then
                    # check_temperature has already set red/yellow.
                    return $OCF_SUCCESS
                fi
            done
        else
            # No device type in play; keep a stale value out of the
            # check_temperature log messages.
            DEVICE=""
            if ! check_temperature $("$SMARTCTL" -A "$DRIVE" \
                | awk '/^194/ { print $10 }'); then
                return $OCF_SUCCESS
            fi
        fi
    done

    $ATTRDUP -n "#health-smart" -U "green" -d "5s"
    return $OCF_SUCCESS
}

# HealthSMART_validate
#
# Run init_smart, then verify that the directory holding the state file is
# writable by creating and removing a probe file named after this shell's
# PID.
#
# Returns: OCF_ERR_ARGS if the probe file cannot be created,
#          OCF_SUCCESS otherwise.
HealthSMART_validate() {

    init_smart

    # Is the state directory writable?
    state_dir=$(dirname "$OCF_RESKEY_state")
    if ! touch "$state_dir/$$"; then
        return $OCF_ERR_ARGS
    fi

    rm "$state_dir/$$"
    return $OCF_SUCCESS
}

# Defaults for the meta attributes read below when they are not set in
# the environment.
: "${OCF_RESKEY_CRM_meta_interval=0}"
: "${OCF_RESKEY_CRM_meta_globally_unique:=true}"

# Work out the state file location when the "state" parameter was not given.
if [ -z "$OCF_RESKEY_state" ]; then
    if [ "${OCF_RESKEY_CRM_meta_globally_unique}" = "false" ]; then
        state="${HA_VARRUN}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"

        # Strip off the trailing clone marker (":<number>"). The sed
        # script is quoted so the shell neither glob-expands the bracket
        # expressions nor eats the backslash of the literal dot.
        OCF_RESKEY_state=$(echo "$state" | sed 's/:[0-9][0-9]*\.state/.state/')
    else
        OCF_RESKEY_state="${HA_VARRUN}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"
    fi
fi

# Dispatch on the requested action. meta-data, usage/help and unknown
# actions exit straight from their arm; the remaining actions fall through
# to the common logging and exit code below.
case $__OCF_ACTION in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    start)
        HealthSMART_start
        ;;
    stop)
        HealthSMART_stop
        ;;
    monitor)
        HealthSMART_monitor
        ;;
    validate-all)
        HealthSMART_validate
        ;;
    usage|help)
        HealthSMART_usage
        exit $OCF_SUCCESS
        ;;
    *)
        HealthSMART_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
# Status of the action handler that just ran; log it and hand it back.
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc

Attachment: signature.asc
Description: OpenPGP digital signature

_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems

Reply via email to