summaryrefslogtreecommitdiffstats
path: root/agents/ocf/HealthSMART.in
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 06:53:20 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 06:53:20 +0000
commite5a812082ae033afb1eed82c0f2df3d0f6bdc93f (patch)
treea6716c9275b4b413f6c9194798b34b91affb3cc7 /agents/ocf/HealthSMART.in
parentInitial commit. (diff)
downloadpacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.tar.xz
pacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.zip
Adding upstream version 2.1.6.upstream/2.1.6
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'agents/ocf/HealthSMART.in')
-rwxr-xr-xagents/ocf/HealthSMART.in372
1 files changed, 372 insertions, 0 deletions
diff --git a/agents/ocf/HealthSMART.in b/agents/ocf/HealthSMART.in
new file mode 100755
index 0000000..b6edac2
--- /dev/null
+++ b/agents/ocf/HealthSMART.in
@@ -0,0 +1,372 @@
+#!@BASH_PATH@
+#
+# ocf:pacemaker:HealthSMART resource agent
+#
+# Copyright 2009-2023 the Pacemaker project contributors
+#
+# The version control history for this file may have further details.
+#
+# This source code is licensed under the GNU General Public License version 2
+# (GPLv2) WITHOUT ANY WARRANTY.
+#
+
+#
+# Checks the S.M.A.R.T. status of all given drives and writes the #health-smart
+# status into the CIB
+#
+#######################################################################
+
+#######################################################################
+# Initialization:
+
+# Locate and source the OCF shell function library. OCF_FUNCTIONS may be
+# preset by the caller; otherwise it is derived from OCF_ROOT.
+: ${OCF_FUNCTIONS:="${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs"}
+. "${OCF_FUNCTIONS}"
+# The requested action defaults to the first command-line argument.
+: ${__OCF_ACTION:="$1"}
+
+# Explicitly list all environment variables used, to make static analysis happy
+# (":=" assigns the default only when the variable is unset or empty).
+: ${OCF_RESKEY_CRM_meta_interval:=0}
+: ${OCF_RESKEY_CRM_meta_globally_unique:="true"}
+: ${OCF_RESKEY_temp_warning:=""}
+: ${OCF_RESKEY_temp_lower_limit:=""}
+: ${OCF_RESKEY_temp_upper_limit:=""}
+: ${OCF_RESKEY_drives:="/dev/sda"}
+: ${OCF_RESKEY_devices:=""}
+# An empty state path is replaced by a computed default near the end of
+# this file, just before the action is dispatched.
+: ${OCF_RESKEY_state:=""}
+: ${OCF_RESKEY_smartctl:="/usr/sbin/smartctl"}
+: ${OCF_RESKEY_dampen:="5s"}
+
+# Turn these into arrays so we can iterate them later.
+# The expansions are unquoted, so the values are split on whitespace (and are
+# also subject to pathname expansion).
+DRIVES=(${OCF_RESKEY_drives})
+DEVICES=(${OCF_RESKEY_devices})
+
+#######################################################################
+
+# Print the agent's OCF metadata (XML) on stdout.
+#
+# The here-document is unquoted, so @VERSION@ (substituted at build time),
+# HA_VARRUN and OCF_RESOURCE_INSTANCE are expanded when the text is printed.
+meta_data() {
+    cat <<END
+<?xml version="1.0"?>
+<resource-agent name="HealthSMART" version="@VERSION@">
+<version>1.1</version>
+
+<longdesc lang="en">
+System health agent that checks the S.M.A.R.T. status of the given drives and
+updates the #health-smart attribute.
+</longdesc>
+<shortdesc lang="en">SMART health status</shortdesc>
+
+<parameters>
+<parameter name="state" unique-group="state">
+<longdesc lang="en">
+Location to store the resource state in.
+</longdesc>
+<shortdesc lang="en">State file</shortdesc>
+<content type="string" default="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state" />
+</parameter>
+
+<parameter name="drives" reloadable="1">
+<longdesc lang="en">
+The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
+</longdesc>
+<shortdesc lang="en">Drives to check</shortdesc>
+<content type="string" default="/dev/sda" />
+</parameter>
+
+<parameter name="devices" reloadable="1">
+<longdesc lang="en">
+The device type(s) to assume for the drive(s) being tested as a SPACE separated list.
+</longdesc>
+<shortdesc lang="en">Device types</shortdesc>
+<content type="string" />
+</parameter>
+
+<parameter name="temp_lower_limit" reloadable="1">
+<longdesc lang="en">
+Lower limit of the temperature in deg C of the drive(s). Below this limit the status will be red.
+</longdesc>
+<shortdesc lang="en">Lower limit for the red smart attribute</shortdesc>
+<content type="string" default="0"/>
+</parameter>
+
+<parameter name="temp_upper_limit" reloadable="1">
+<longdesc lang="en">
+Upper limit of the temperature in deg C of the drive(s). If the drive reports
+a temperature higher than this value the status of #health-smart will be red.
+</longdesc>
+<shortdesc lang="en">Upper limit for red smart attribute</shortdesc>
+<content type="string" default="60"/>
+</parameter>
+
+<parameter name="temp_warning" reloadable="1">
+<longdesc lang="en">
+Number of deg C below/above the upper/lower temp limits at which point the status of #health-smart will change to yellow.
+</longdesc>
+<shortdesc lang="en">Deg C below/above the upper limits for yellow smart attribute</shortdesc>
+<content type="string" default="5"/>
+</parameter>
+
+<parameter name="smartctl" reloadable="1">
+<longdesc lang="en">
+The path to the smartctl program, used for querying device health.
+</longdesc>
+<shortdesc lang="en">The path to the smartctl program</shortdesc>
+<content type="string" default="/usr/sbin/smartctl"/>
+</parameter>
+
+<parameter name="dampen" reloadable="1">
+<longdesc lang="en">
+The time to wait (dampening) for further changes to occur
+</longdesc>
+<shortdesc lang="en">Dampening interval</shortdesc>
+<content type="integer" default="5s"/>
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start" timeout="10s" />
+<action name="stop" timeout="10s" />
+<action name="monitor" timeout="10s" interval="10s" start-delay="0s" />
+<action name="meta-data" timeout="5s" />
+<action name="validate-all" timeout="10s" depth="0" />
+<action name="reload-agent" timeout="20s" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+
+check_temperature() {
+
+ if [ $1 -lt ${lower_red_limit} ] ; then
+ ocf_log info "Drive ${DRIVE} ${DEVICE} too cold: ${1} C"
+ attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}"
+ return 1
+ fi
+
+ if [ $1 -gt ${upper_red_limit} ] ; then
+ ocf_log info "Drive ${DRIVE} ${DEVICE} too hot: ${1} C"
+ attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}"
+ return 1
+ fi
+
+ if [ $1 -lt ${lower_yellow_limit} ] ; then
+ ocf_log info "Drive ${DRIVE} ${DEVICE} quite cold: ${1} C"
+ attrd_updater -n "#health-smart" -U "yellow" -d "${OCF_RESKEY_dampen}"
+ return 1
+ fi
+
+ if [ $1 -gt ${upper_yellow_limit} ] ; then
+ ocf_log info "Drive ${DRIVE} ${DEVICE} quite hot: ${1} C"
+ attrd_updater -n "#health-smart" -U "yellow" -d "${OCF_RESKEY_dampen}"
+ return 1
+ fi
+}
+
+# Validate the drive/device-type configuration held in the DRIVES and DEVICES
+# arrays. Exits the agent with OCF_ERR_ARGS on any inconsistency; returns
+# normally otherwise.
+common_checks() {
+    # Each item in $OCF_RESKEY_drives must have a corresponding item in
+    # $OCF_RESKEY_devices with the device type. Alternately,
+    # $OCF_RESKEY_devices can be empty.
+    drives_len=${#DRIVES[@]}
+    devices_len=${#DEVICES[@]}
+
+    if [ "${drives_len}" -ne "${devices_len}" ] && [ "${devices_len}" -gt 0 ]; then
+        ocf_log err "OCF_RESKEY_devices must be empty or the same length as OCF_RESKEY_drives."
+        exit $OCF_ERR_ARGS
+    fi
+
+    # Each item in $OCF_RESKEY_drives must look like a device node.
+    for d in "${DRIVES[@]}"; do
+        if [[ "$d" != /dev/* ]]; then
+            # The entries checked here come from OCF_RESKEY_drives, so name
+            # that parameter in the message.
+            ocf_log err "Drive in OCF_RESKEY_drives does not look like a device node: $d"
+            exit $OCF_ERR_ARGS
+        fi
+    done
+}
+
+
+init_smart() {
+ #Set temperature defaults
+ if [ -z "${OCF_RESKEY_temp_warning}" ]; then
+ yellow_threshold=5
+ else
+ yellow_threshold=${OCF_RESKEY_temp_warning}
+ fi
+
+ if [ -z "${OCF_RESKEY_temp_lower_limit}" ] ; then
+ lower_red_limit=0
+ else
+ lower_red_limit=${OCF_RESKEY_temp_lower_limit}
+ fi
+ lower_yellow_limit=$((${lower_red_limit}+${yellow_threshold}))
+
+ if [ -z "${OCF_RESKEY_temp_upper_limit}" ] ; then
+ upper_red_limit=60
+ else
+ upper_red_limit=${OCF_RESKEY_temp_upper_limit}
+ fi
+ upper_yellow_limit=$((${upper_red_limit}-${yellow_threshold}))
+
+ for ndx in ${!DRIVES[*]}; do
+ DRIVE=${DRIVES[$ndx]}
+
+ if [ -n "${OCF_RESKEY_devices}" ]; then
+ DEVICE=${DEVICES[$ndx]}
+
+ "${OCF_RESKEY_smartctl}" -d "${DEVICE}" -i "${DRIVE}" | grep -q "SMART support is: Enabled"
+ if [ $? -ne 0 ] ; then
+ ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE}
+ exit $OCF_ERR_INSTALLED
+ fi
+ else
+ "${OCF_RESKEY_smartctl}" -i "${DRIVE}" | grep -q "SMART support is: Enabled"
+ if [ $? -ne 0 ] ; then
+ ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE}
+ exit $OCF_ERR_INSTALLED
+ fi
+ fi
+ done
+}
+
+# Print a short usage synopsis for the agent on stdout.
+HealthSMART_usage() {
+    printf '%s\n' \
+        "usage: $0 {start|stop|monitor|validate-all|meta-data|reload-agent}" \
+        "" \
+        "Expects to have a fully populated OCF RA-compliant environment set."
+}
+
+HealthSMART_start() {
+ HealthSMART_monitor
+ if [ $? -eq $OCF_SUCCESS ]; then
+ return $OCF_SUCCESS
+ fi
+ touch "${OCF_RESKEY_state}"
+}
+
+HealthSMART_stop() {
+ attrd_updater -D -n "#health-smart" -d "${OCF_RESKEY_dampen}"
+
+ rm "${OCF_RESKEY_state}"
+
+ if [ $? -eq 0 ]; then
+ return $OCF_SUCCESS
+ else
+ return $OCF_ERR_GENERIC
+ fi
+}
+
+# monitor action: check every configured drive and publish the result in the
+# #health-smart node attribute.
+#
+# Returns OCF_NOT_RUNNING when the state file does not exist. Otherwise
+# returns OCF_SUCCESS after setting #health-smart to:
+#   red    - a drive did not report an overall-health result of PASSED, or its
+#            temperature is beyond a red limit
+#   yellow - a drive temperature is inside the warning band
+#   green  - all drives passed every check
+# Checking stops at the first drive with a problem. common_checks,
+# check_binary and init_smart may exit the agent instead of returning.
+HealthSMART_monitor() {
+    local ndx
+    local temperature
+    local -a type_args
+
+    common_checks
+
+    # Test for presence of the smartctl binary that will actually be invoked
+    # (the configured path, not merely whatever "smartctl" is found in PATH)
+    check_binary "${OCF_RESKEY_smartctl}"
+
+    init_smart
+
+    # Monitor _MUST!_ differentiate correctly between running
+    # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
+    # That is THREE states, not just yes/no.
+
+    if [ ! -f "${OCF_RESKEY_state}" ]; then
+        return $OCF_NOT_RUNNING
+    fi
+
+    for ndx in "${!DRIVES[@]}"; do
+        DRIVE=${DRIVES[$ndx]}
+
+        # Pass "-d <type>" only when device types were configured
+        type_args=()
+        if [ "${#DEVICES[@]}" -gt 0 ]; then
+            DEVICE=${DEVICES[$ndx]}
+            type_args=(-d "${DEVICE}")
+        fi
+
+        # Check overall S.M.A.R.T. status
+        if ! "${OCF_RESKEY_smartctl}" "${type_args[@]}" -H "${DRIVE}" | grep -q "SMART overall-health self-assessment test result: PASSED"; then
+            attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}"
+            return $OCF_SUCCESS
+        fi
+
+        # Check drive temperature(s): the tenth column of the attribute 194
+        # line in the "smartctl -A" output
+        temperature=$("${OCF_RESKEY_smartctl}" "${type_args[@]}" -A "${DRIVE}" | awk '/^194/ { print $10 }')
+        if ! check_temperature "${temperature}"; then
+            # check_temperature has already published red/yellow
+            return $OCF_SUCCESS
+        fi
+    done
+
+    attrd_updater -n "#health-smart" -U "green" -d "${OCF_RESKEY_dampen}"
+    return $OCF_SUCCESS
+}
+
+HealthSMART_validate() {
+ common_checks
+
+ # Host-specific checks
+ if [ "$OCF_CHECK_LEVEL" = "10" ]; then
+ # Test for presence of smartctl
+ check_binary smartctl
+
+ init_smart
+
+ # Is the state directory writable?
+ state_dir=$(dirname "$OCF_RESKEY_state")
+ touch "$state_dir/$$"
+ if [ $? -ne 0 ]; then
+ return $OCF_ERR_ARGS
+ fi
+ rm "$state_dir/$$"
+ fi
+
+ return $OCF_SUCCESS
+}
+
+# reload-agent action: performs no work and returns OCF_SUCCESS.
+# NOTE(review): presumably nothing needs reloading because every invocation
+# of this script reads its parameters afresh from the environment - confirm.
+HealthSMART_reload_agent() {
+ return $OCF_SUCCESS
+}
+
+
+# Compute the default state file when the "state" parameter was not given:
+# <HA_VARRUN>/HealthSMART-<resource instance>.state, with any trailing slash
+# removed from HA_VARRUN.
+if [ -z "$OCF_RESKEY_state" ]; then
+    state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"
+
+    if [ "${OCF_RESKEY_CRM_meta_globally_unique}" = "false" ]; then
+        # Strip off the trailing clone marker (":<number>" before ".state").
+        # The sed script is quoted so the shell neither glob-expands it nor
+        # strips the backslash that makes the dot literal.
+        OCF_RESKEY_state=$(printf '%s\n' "$state" | sed 's/:[0-9][0-9]*\.state/.state/')
+    else
+        OCF_RESKEY_state="$state"
+    fi
+fi
+
+# Dispatch the requested action. meta-data, usage/help and unrecognized
+# actions exit directly inside the case statement; for every other action the
+# handler's return status is captured below, logged, and used as the exit code.
+case "$__OCF_ACTION" in
+ start) HealthSMART_start;;
+ stop) HealthSMART_stop;;
+ monitor) HealthSMART_monitor;;
+ validate-all) HealthSMART_validate;;
+ reload-agent) HealthSMART_reload_agent;;
+ meta-data)
+ meta_data
+ exit $OCF_SUCCESS
+ ;;
+ usage|help)
+ HealthSMART_usage
+ exit $OCF_SUCCESS
+ ;;
+ # Anything else: show the usage text and report the action as unimplemented
+ *) HealthSMART_usage
+ exit $OCF_ERR_UNIMPLEMENTED
+ ;;
+esac
+# Must immediately follow the case statement so that $? is the handler's status
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc
+
+# vim: set filetype=sh expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80: