#!@BASH_PATH@
#
# ocf:pacemaker:HealthSMART resource agent
#
# Copyright 2009-2019 the Pacemaker project contributors
#
# The version control history for this file may have further details.
#
# This source code is licensed under the GNU General Public License version 2
# (GPLv2) WITHOUT ANY WARRANTY.
#

#
# Checks the S.M.A.R.T. status of all given drives and writes the #health-smart
# status into the CIB
#
#######################################################################

#######################################################################
# Initialization:

# Load the OCF shell function library. The location can be overridden by
# setting OCF_FUNCTIONS before the agent is started.
if [ -z "${OCF_FUNCTIONS}" ]; then
    OCF_FUNCTIONS="${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs"
fi
. "${OCF_FUNCTIONS}"

# The requested action is the first command-line argument unless
# __OCF_ACTION already holds a non-empty value at this point.
__OCF_ACTION=${__OCF_ACTION:-$1}

# External tools used by this agent
SMARTCTL=/usr/sbin/smartctl
ATTRDUP=/usr/sbin/attrd_updater

#######################################################################

# Print the agent's OCF meta-data (XML) on stdout.
#
# The here-document delimiter is unquoted on purpose: the default of the
# "state" parameter is expanded from HA_VARRUN (with one trailing slash
# removed) and OCF_RESOURCE_INSTANCE at the time the function runs.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="HealthSMART" version="0.1">
<version>1.0</version>

<longdesc lang="en">
System health agent that checks the S.M.A.R.T. status of the given drives and
updates the #health-smart attribute.
</longdesc>
<shortdesc lang="en">SMART health status</shortdesc>

<parameters>
<parameter name="state" unique="1">
<longdesc lang="en">
Location to store the resource state in.
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state" />
</parameter>

<parameter name="drives" unique="0">
<longdesc lang="en">
The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
</longdesc>
<shortdesc lang="en">Drives to check</shortdesc>
<content type="string" default="/dev/sda" />
</parameter>

<parameter name="devices" unique="0">
<longdesc lang="en">
The device type(s) to assume for the drive(s) being tested as a SPACE separated list.
</longdesc>
<shortdesc lang="en">Device types</shortdesc>
<content type="string" />
</parameter>

<parameter name="temp_lower_limit" unique="0">
<longdesc lang="en">
Lower limit of the temperature in deg C of the drive(s). Below this limit the status will be red.
</longdesc>
<shortdesc lang="en">Lower limit for the red smart attribute</shortdesc>
<content type="string" default="0"/>
</parameter>

<parameter name="temp_upper_limit" unique="0">
<longdesc lang="en">
Upper limit of the temperature in deg C of the drive(s). If the drive reports
a temperature higher than this value the status of #health-smart will be red.
</longdesc>
<shortdesc lang="en">Upper limit for red smart attribute</shortdesc>
<content type="string" default="60"/>
</parameter>

<parameter name="temp_warning" unique="0">
<longdesc lang="en">
Number of deg C below/above the upper/lower temp limits at which point the status of #health-smart will change to yellow.
</longdesc>
<shortdesc lang="en">Deg C below/above the upper limits for yellow smart attribute</shortdesc>
<content type="string" default="5"/>
</parameter>

</parameters>

<actions>
<action name="start"        timeout="10s" />
<action name="stop"         timeout="10s" />
<action name="monitor"      timeout="10s" interval="10s" start-delay="0s" />
<action name="meta-data"    timeout="5s" />
<action name="validate-all"   timeout="10s" />
</actions>
</resource-agent>
END
}

#######################################################################

# Compare one temperature reading against the configured limits and, when it
# is out of range, set the #health-smart attribute accordingly.
#
# Arguments: $1 - drive temperature in deg C (integer)
# Globals:   lower_red_limit, upper_red_limit, lower_yellow_limit,
#            upper_yellow_limit (read); DRIVE, DEVICE (read, log text only);
#            ATTRDUP (read)
# Returns:   1 after setting #health-smart to "red" or "yellow";
#            0 when the reading is within limits or is not usable
check_temperature() {
    local reading="$1"
    local colour verdict

    # The caller extracts the reading with awk from the line starting with
    # "194"; when no such line exists the argument is empty. An empty or
    # non-integer value would make every numeric test below fail with a
    # shell error, so skip the comparison and leave the attribute untouched.
    case "$reading" in
        ''|-|*[!0-9-]*|?*-*)
            ocf_log debug "Drive ${DRIVE} ${DEVICE} reported no usable temperature: '${reading}'"
            return 0
            ;;
    esac

    # Red limits take precedence over yellow ones.
    if [ "$reading" -lt "${lower_red_limit}" ]; then
        colour="red";    verdict="too cold"
    elif [ "$reading" -gt "${upper_red_limit}" ]; then
        colour="red";    verdict="too hot"
    elif [ "$reading" -lt "${lower_yellow_limit}" ]; then
        colour="yellow"; verdict="quite cold"
    elif [ "$reading" -gt "${upper_yellow_limit}" ]; then
        colour="yellow"; verdict="quite hot"
    else
        return 0
    fi

    ocf_log info "Drive ${DRIVE} ${DEVICE} ${verdict}: ${reading} C"
    "$ATTRDUP" -n "#health-smart" -U "$colour" -d "5s"
    return 1
}


# Resolve the temperature limits and the drive list from the resource
# parameters, then make sure smartctl is usable for every configured drive.
#
# Globals written: yellow_threshold, lower_red_limit, lower_yellow_limit,
#                  upper_red_limit, upper_yellow_limit, DRIVES, DRIVE, DEVICE
# Exits with OCF_ERR_INSTALLED when smartctl is not executable or when a
# drive does not report "SMART support is: Enabled".
init_smart() {
    # Temperature limits; an empty or unset parameter selects the default.
    yellow_threshold=${OCF_RESKEY_temp_warning:-5}

    lower_red_limit=${OCF_RESKEY_temp_lower_limit:-0}
    lower_yellow_limit=$(( ${lower_red_limit} + ${yellow_threshold} ))

    upper_red_limit=${OCF_RESKEY_temp_upper_limit:-60}
    upper_yellow_limit=$(( ${upper_red_limit} - ${yellow_threshold} ))

    # Drives to check
    DRIVES=${OCF_RESKEY_drives:-/dev/sda}

    # smartctl must be present and executable
    if ! [ -x "$SMARTCTL" ]; then
        ocf_log err "${SMARTCTL} not installed."
        exit $OCF_ERR_INSTALLED
    fi

    # Every drive (with every configured device type, if any) must have
    # S.M.A.R.T. enabled.
    for DRIVE in $DRIVES; do
        if [ -z "${OCF_RESKEY_devices}" ]; then
            if ! "$SMARTCTL" -i "${DRIVE}" | grep -q "SMART support is: Enabled"; then
                ocf_log err "S.M.A.R.T. not enabled for drive ${DRIVE}"
                exit $OCF_ERR_INSTALLED
            fi
            continue
        fi

        for DEVICE in ${OCF_RESKEY_devices}; do
            if ! "$SMARTCTL" -d "$DEVICE" -i "${DRIVE}" | grep -q "SMART support is: Enabled"; then
                ocf_log err "S.M.A.R.T. not enabled for drive ${DRIVE}"
                exit $OCF_ERR_INSTALLED
            fi
        done
    done
}

# Print a short usage summary for the agent on stdout.
HealthSMART_usage() {
    printf '%s\n' \
        "usage: $0 {start|stop|monitor|validate-all|meta-data}" \
        "" \
        "Expects to have a fully populated OCF RA-compliant environment set."
}

# Start action: create the state file unless the monitor already reports
# the resource as running.
#
# Returns OCF_SUCCESS when the monitor succeeded, otherwise the exit status
# of touch.
HealthSMART_start() {
    local monitor_rc

    HealthSMART_monitor
    monitor_rc=$?

    # Nothing to do when the resource is already running
    [ "$monitor_rc" -eq $OCF_SUCCESS ] && return $OCF_SUCCESS

    touch "${OCF_RESKEY_state}"
}

# Stop action: remove the state file.
#
# Stopping deliberately does not run the monitor: the monitor calls
# init_smart, which can exit with OCF_ERR_INSTALLED (smartctl missing,
# S.M.A.R.T. disabled) and would turn a harmless stop into a stop failure,
# and it would also update the #health-smart attribute while the resource
# is being shut down. Removing the file with "rm -f" is idempotent, so
# stopping an already stopped resource succeeds.
#
# Returns OCF_SUCCESS once the state file is gone, OCF_ERR_GENERIC when it
# cannot be removed.
HealthSMART_stop() {
    if ! rm -f "${OCF_RESKEY_state}"; then
        ocf_log err "Unable to remove state file ${OCF_RESKEY_state}"
        return $OCF_ERR_GENERIC
    fi
    return $OCF_SUCCESS
}

# Monitor action: check every configured drive and publish the result in the
# #health-smart attribute.
#
# Returns OCF_NOT_RUNNING when the state file does not exist; otherwise
# OCF_SUCCESS, regardless of the health found (the health itself is reported
# through the attribute: "red" when the overall self-assessment is not
# PASSED, whatever check_temperature sets for an out-of-range temperature,
# "green" when every check passed).
HealthSMART_monitor() {

    init_smart

    # Monitor _MUST!_ differentiate correctly between running
    # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
    # That is THREE states, not just yes/no.

    # No state file: cleanly stopped
    if [ ! -f "${OCF_RESKEY_state}" ]; then
        return $OCF_NOT_RUNNING
    fi

    for DRIVE in $DRIVES; do
        if [ -z "${OCF_RESKEY_devices}" ]; then
            # No device types configured: let smartctl pick one

            # Overall S.M.A.R.T. status
            if ! "$SMARTCTL" -H "${DRIVE}" | grep -q "SMART overall-health self-assessment test result: PASSED"; then
                "$ATTRDUP" -n "#health-smart" -U "red" -d "5s"
                return $OCF_SUCCESS
            fi

            # Drive temperature (raw value of attribute 194)
            check_temperature "$("$SMARTCTL" -A "${DRIVE}" | awk '/^194/ { print $10 }')" \
                || return $OCF_SUCCESS
            continue
        fi

        # Overall S.M.A.R.T. status, once per configured device type
        for DEVICE in ${OCF_RESKEY_devices}; do
            if ! "$SMARTCTL" -d "$DEVICE" -H "${DRIVE}" | grep -q "SMART overall-health self-assessment test result: PASSED"; then
                "$ATTRDUP" -n "#health-smart" -U "red" -d "5s"
                return $OCF_SUCCESS
            fi
        done

        # Drive temperature(s), once per configured device type
        for DEVICE in ${OCF_RESKEY_devices}; do
            check_temperature "$("$SMARTCTL" -d "$DEVICE" -A "${DRIVE}" | awk '/^194/ { print $10 }')" \
                || return $OCF_SUCCESS
        done
    done

    # Every drive passed every check
    "$ATTRDUP" -n "#health-smart" -U "green" -d "5s"
    return $OCF_SUCCESS
}

# validate-all action: run the init_smart checks and verify that the
# directory holding the state file is writable.
#
# Returns OCF_SUCCESS when a file can be created in the state directory,
# OCF_ERR_ARGS otherwise.
HealthSMART_validate() {
    local state_dir probe

    init_smart

    # Is the state directory writable?
    # Probe with a uniquely named file from mktemp instead of a fixed
    # "<dir>/<pid>" name, so an existing file that happens to carry that name
    # is never touched or deleted.
    state_dir=$(dirname "$OCF_RESKEY_state")
    if ! probe=$(mktemp "${state_dir}/.HealthSMART-validate.XXXXXX"); then
        ocf_log err "State directory ${state_dir} is not writable"
        return $OCF_ERR_ARGS
    fi
    rm -f "$probe"

    return $OCF_SUCCESS
}

: ${OCF_RESKEY_CRM_meta_interval:=0}
: ${OCF_RESKEY_CRM_meta_globally_unique:="true"}

# Print the default state file path for this resource instance.
#
# Globals:  HA_VARRUN, OCF_RESOURCE_INSTANCE,
#           OCF_RESKEY_CRM_meta_globally_unique (read)
# Outputs:  the path on stdout. When globally_unique is "false", the trailing
#           clone marker (":<number>" directly before ".state") is removed.
HealthSMART_default_state() {
    local state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"

    if [ "${OCF_RESKEY_CRM_meta_globally_unique}" = "false" ]; then
        # Strip off the trailing clone marker. The sed script is quoted so
        # the shell neither drops the backslash nor globs the brackets, and
        # it is anchored so only the marker at the very end is removed.
        printf '%s\n' "$state" | sed 's/:[0-9][0-9]*\.state$/.state/'
    else
        printf '%s\n' "$state"
    fi
}

# An explicitly configured state file always wins over the default
if [ -z "$OCF_RESKEY_state" ]; then
    OCF_RESKEY_state=$(HealthSMART_default_state)
fi

# Dispatch the requested action. Informational actions exit right away;
# resource actions record their status in rc, which is logged at debug
# level and becomes the agent's exit status.
case "$__OCF_ACTION" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    usage|help)
        HealthSMART_usage
        exit $OCF_SUCCESS
        ;;
    start)
        HealthSMART_start
        rc=$?
        ;;
    stop)
        HealthSMART_stop
        rc=$?
        ;;
    monitor)
        HealthSMART_monitor
        rc=$?
        ;;
    validate-all)
        HealthSMART_validate
        rc=$?
        ;;
    *)
        # Anything else is not implemented by this agent
        HealthSMART_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac

ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc

# vim: set filetype=sh expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80: