Blob Blame History Raw
#!/bin/sh
#
# ocf:pacemaker:controld resource agent
#
# Copyright 2008-2019 the Pacemaker project contributors
#
# The version control history for this file may have further details.
#
# This source code is licensed under the GNU General Public License version 2
# (GPLv2) WITHOUT ANY WARRANTY.

#
# Manages the DLM controld process
#

#######################################################################
# Initialization:

# Load the OCF shell function library. OCF_FUNCTIONS may be preset by the
# caller; otherwise it defaults to the copy located under OCF_ROOT.
: "${OCF_FUNCTIONS:=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}"
. "${OCF_FUNCTIONS}"
# The requested action is the first command-line argument unless
# __OCF_ACTION has already been set to a non-empty value.
: "${__OCF_ACTION:=$1}"

#######################################################################

# If a controld agent exists in the heartbeat provider directory, hand the
# requested action to it and exit with whatever status it returns. Nothing
# below this point runs in that case.
HB_CONTROLD_AGENT="$OCF_ROOT/resource.d/heartbeat/controld"
if [ -e "$HB_CONTROLD_AGENT" ]; then
    ocf_log info "Using heartbeat controld agent"
    "$HB_CONTROLD_AGENT" "$1"
    exit $?
fi

# Print this agent's OCF metadata (an XML document) on standard output.
# The here-document delimiter is quoted because the text contains nothing
# for the shell to expand; it is emitted literally.
meta_data() {
    cat <<'END'
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="controld" version="1.1">
<version>1.0</version>

<longdesc lang="en">
This Resource Agent can control the dlm_controld services needed by cluster-aware file systems.
It assumes that dlm_controld is in your default PATH.
In most cases, it should be run as an anonymous clone.
</longdesc>
<shortdesc lang="en">DLM Agent for cluster file systems</shortdesc>

<parameters>

<parameter name="args" unique="1">
<longdesc lang="en">
Any additional options to start the dlm_controld service with
</longdesc>
<shortdesc lang="en">DLM Options</shortdesc>
<content type="string" default="-s 0" />
</parameter>

<parameter name="daemon" unique="1">
<longdesc lang="en">
The daemon to start - supports gfs_controld and dlm_controld
</longdesc>
<shortdesc lang="en">The daemon to start</shortdesc>
<content type="string" default="dlm_controld" />
</parameter>

<parameter name="allow_stonith_disabled">
<longdesc lang="en">
Allow DLM start-up even if STONITH/fencing is disabled in the cluster.

Setting this option to true will cause cluster malfunction and hangs on
fail-over for DLM clients that require fencing (such as GFS2, OCFS2, and
cLVM2).

This option is advanced use only.
</longdesc>
<shortdesc lang="en">Allow start-up even without STONITH/fencing</shortdesc>
<content type="string" default="false" />
</parameter>

</parameters>

<actions>
<action name="start"        timeout="90s" />
<action name="stop"         timeout="100s" />
<action name="monitor"      timeout="20s" interval="10s" depth="0" start-delay="0s" />
<action name="meta-data"    timeout="5s" />
<action name="validate-all"   timeout="30s" />
</actions>
</resource-agent>
END
}

#######################################################################

# Kernel filesystem locations used by the start and monitor logic.
CONFIGFS_DIR=/sys/kernel/config         # where configfs is expected/mounted
DLM_CONFIGFS_DIR=$CONFIGFS_DIR/dlm      # DLM directory inside configfs
DLM_SYSFS_DIR=/sys/kernel/dlm           # listed by check_uncontrolled_locks

# Print a short usage summary (three lines) on standard output.
# $0 is expanded so the message names the script as it was invoked.
controld_usage() {
    printf '%s\n' \
        "usage: $0 {start|stop|monitor|validate-all|meta-data}" \
        "" \
        "Expects to have a fully populated OCF RA-compliant environment set."
}

# Fence the local node if DLM lockspace entries are present.
#
# Lists $DLM_SYSFS_DIR; when the listing succeeds and is non-empty, logs an
# error, asks stonith_admin to reboot the node named by "crm_node -n", and
# exits the script with $OCF_ERR_GENERIC. Otherwise it returns 0.
#
# Globals read:    DLM_SYSFS_DIR, OCF_ERR_GENERIC
# Globals written: CUL_TMP (the directory listing, or the ls error text)
check_uncontrolled_locks()
{
    # A failed listing (e.g. the directory is absent) means nothing to do.
    CUL_TMP=$(ls "$DLM_SYSFS_DIR" 2>&1) || return 0

    # An empty directory means no lockspaces are left behind.
    [ -n "$CUL_TMP" ] || return 0

    ocf_log err "Uncontrolled lockspace exists, system must reboot. Executing suicide fencing"
    stonith_admin --reboot="$(crm_node -n)" --tag controld

    exit $OCF_ERR_GENERIC
}

# Start the controld daemon and wait until it is ready.
#
# Globals read:    CONFIGFS_DIR, DLM_CONFIGFS_DIR, OCF_RESKEY_daemon,
#                  OCF_RESKEY_args, OCF_RESKEY_allow_stonith_disabled
# Globals written: rc, CS_ADDR_LIST
# Returns:
#   OCF_SUCCESS         daemon already running, or started and ready
#   OCF_ERR_INSTALLED   configfs or the DLM configfs directory is unavailable
#   OCF_ERR_CONFIGURED  stonith-enabled is off and allow_stonith_disabled
#                       is not true
#   OCF_NOT_RUNNING     the monitor reports the daemon gone after launch
#   OCF_ERR_GENERIC     any other monitor result
controld_start() {
    # Already running: nothing to do. Anything other than "not running"
    # is reported as a generic error.
    controld_monitor; rc=$?
    if [ $rc -eq $OCF_SUCCESS ]; then
        return $OCF_SUCCESS
    elif [ $rc -ne $OCF_NOT_RUNNING ]; then
        return $OCF_ERR_GENERIC
    fi

    # Ensure configfs is available, loading the module if the directory
    # is missing.
    [ -e "$CONFIGFS_DIR" ] || modprobe configfs
    if [ ! -e "$CONFIGFS_DIR" ]; then
        ocf_log err "$CONFIGFS_DIR not available"
        return $OCF_ERR_INSTALLED
    fi

    # Mount configfs unless it is already listed at $CONFIGFS_DIR.
    if ! mount -t configfs | grep " $CONFIGFS_DIR " >/dev/null 2>&1; then
        mount -t configfs none "$CONFIGFS_DIR"
    fi

    # Ensure DLM is available, loading the module if its configfs
    # directory is missing.
    [ -e "$DLM_CONFIGFS_DIR" ] || modprobe dlm
    if [ ! -e "$DLM_CONFIGFS_DIR" ]; then
        ocf_log err "$DLM_CONFIGFS_DIR not available"
        return $OCF_ERR_INSTALLED
    fi

    # Refuse to start without fencing unless explicitly allowed. The
    # cluster property is only queried when the override is not set.
    if ! ocf_is_true "$OCF_RESKEY_allow_stonith_disabled"; then
        if ! ocf_is_true "$(crm_attribute --type=crm_config --name=stonith-enabled --query --quiet --default=true)"; then
            ocf_log err "The cluster property stonith-enabled may not be deactivated to use the DLM"
            return $OCF_ERR_CONFIGURED
        fi
    fi

    # OCF_RESKEY_args is deliberately unquoted so that it splits into
    # separate command-line options.
    "${OCF_RESKEY_daemon}" $OCF_RESKEY_args

    # Poll once per second. There is no iteration limit in this loop; it
    # ends only through one of the returns below.
    while :; do
        sleep 1

        controld_monitor; rc=$?
        if [ $rc -eq $OCF_NOT_RUNNING ]; then
            return $OCF_NOT_RUNNING
        elif [ $rc -ne $OCF_SUCCESS ]; then
            return $OCF_ERR_GENERIC
        fi

        # The daemon is considered ready once a comms addr_list under the
        # DLM configfs directory can be read and is non-empty.
        if CS_ADDR_LIST="$(cat "${DLM_CONFIGFS_DIR}"/cluster/comms/*/addr_list 2>/dev/null)" \
            && [ -n "$CS_ADDR_LIST" ]; then
            return $OCF_SUCCESS
        fi

        ocf_log debug "Waiting for ${OCF_RESKEY_daemon} to be ready"
    done
}

# Stop the controld daemon and wait for it to go away.
#
# Globals read:    OCF_RESKEY_daemon
# Globals written: rc
# Returns:
#   OCF_SUCCESS      the daemon was not running, or is gone after SIGTERM
#   OCF_ERR_GENERIC  killall could not deliver the signal
#   otherwise        the first monitor result that is neither success nor
#                    "not running"
controld_stop() {
    # Nothing to stop if the monitor already reports "not running".
    controld_monitor; rc=$?
    [ $rc -ne $OCF_NOT_RUNNING ] || return $OCF_SUCCESS

    # Ask the daemon to terminate.
    if ! killall -TERM "${OCF_RESKEY_daemon}"; then
        return $OCF_ERR_GENERIC
    fi

    # Keep polling, one second apart, for as long as the monitor still
    # reports success. The sleep follows every monitor call, including the
    # last one.
    while :; do
        controld_monitor; rc=$?
        sleep 1
        [ $rc -eq $OCF_SUCCESS ] || break
    done

    # "Not running" is the desired outcome of a stop; anything else is
    # handed back unchanged.
    case $rc in
        $OCF_NOT_RUNNING) return $OCF_SUCCESS;;
    esac
    return $rc
}

# Report the state of the controld daemon as an OCF return code.
#
# Globals read:    OCF_RESKEY_daemon
# Globals written: CM_RC, smw
# Returns:
#   OCF_SUCCESS      killall -0 found the daemon and no bad DLM state was seen
#   OCF_NOT_RUNNING  killall -0 exited with status 1
#   OCF_ERR_GENERIC  killall -0 exited with any other status, or the DLM
#                    reports stateful_merge_wait=1, or a lockspace is in
#                    "wait fencing" with no pending fencing action
# May not return: on any non-success result check_uncontrolled_locks is
# called, and it exits the script when lockspaces are left over.
controld_monitor() {
    # The daemon name is quoted so it reaches killall as a single argument,
    # consistent with how it is passed when the daemon is started and stopped.
    killall -0 "${OCF_RESKEY_daemon}" >/dev/null 2>&1 ; CM_RC=$?

    case $CM_RC in
      0) smw=$(dlm_tool status -v | grep "stateful_merge_wait=" | cut -d= -f2)
         # $smw is quoted so an unexpected value (several words, glob
         # characters) is compared as one operand instead of being split.
         if [ -n "$smw" ] && [ "$smw" -eq 1 ]; then
             ocf_log err "DLM status is: stateful_merge_wait"
             CM_RC=$OCF_ERR_GENERIC
         # Without a stateful_merge_wait value, fall back to looking for a
         # lockspace stuck in "wait fencing" while the fencing history shows
         # no pending action.
         elif [ -z "$smw" ] && dlm_tool ls | grep -q "wait fencing" && \
              ! stonith_admin -H '*' --output-as xml | grep -q "extended-status=\"pending\""; then
             ocf_log err "DLM status is: wait fencing"
             CM_RC=$OCF_ERR_GENERIC
         else
             CM_RC=$OCF_SUCCESS
         fi
         ;;
      1) CM_RC=$OCF_NOT_RUNNING;;
      *) CM_RC=$OCF_ERR_GENERIC;;
    esac

    # If the DLM is not running successfully but DLM lockspace bits are
    # left over, this node must fence itself.
    if [ $CM_RC -ne $OCF_SUCCESS ]; then
        check_uncontrolled_locks
    fi

    return $CM_RC
}

# Validate the environment before running an action.
#
# Checks that killall and the configured daemon binary are available (via
# check_binary), exits with $OCF_ERR_CONFIGURED when the resource is marked
# globally unique, and creates /var/run/cluster if that directory is missing.
# Returns $OCF_SUCCESS otherwise.
controld_validate() {
    check_binary killall
    check_binary "${OCF_RESKEY_daemon}"

    # Only these exact spellings count as "globally unique is enabled".
    case "${OCF_RESKEY_CRM_meta_globally_unique}" in
        yes|Yes|true|True|1)
            ocf_log err "$OCF_RESOURCE_INSTANCE must be configured with the globally_unique=false meta attribute"
            exit $OCF_ERR_CONFIGURED
            ;;
    esac

    # The result of mkdir is not checked; validation succeeds either way.
    if [ ! -d /var/run/cluster ]; then
        mkdir /var/run/cluster
    fi

    return $OCF_SUCCESS
}

# Defaults for parameters the cluster did not supply.
: "${OCF_RESKEY_sctp:=false}"
: "${OCF_RESKEY_CRM_meta_globally_unique:=false}"

# Choose the daemon and its options from the resource instance name: names
# containing "gfs" (any case) select gfs_controld, every other name selects
# dlm_controld. The args default uses "=" rather than ":=", so an explicitly
# empty args value is kept, while an empty daemon value is replaced.
case "$OCF_RESOURCE_INSTANCE" in
    *[gG][fF][sS]*)
        : "${OCF_RESKEY_args=-g 0}"
        : "${OCF_RESKEY_daemon:=gfs_controld}"
        ;;
    *)
        : "${OCF_RESKEY_args=-s 0}"
        : "${OCF_RESKEY_daemon:=dlm_controld}"
        ;;
esac

# Dispatch the requested action. meta-data, usage/help and unknown actions
# exit directly; every other action falls out of the case statement and the
# script exits with the status of the last function that ran.
case "$__OCF_ACTION" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    start)
        controld_validate
        controld_start
        ;;
    stop)
        controld_stop
        ;;
    monitor)
        controld_validate
        controld_monitor
        ;;
    validate-all)
        controld_validate
        ;;
    usage|help)
        controld_usage
        exit $OCF_SUCCESS
        ;;
    *)
        controld_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac

exit $?

# vim: set filetype=sh expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80: