diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 06:53:20 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 06:53:20 +0000 |
commit | e5a812082ae033afb1eed82c0f2df3d0f6bdc93f (patch) | |
tree | a6716c9275b4b413f6c9194798b34b91affb3cc7 /agents/ocf/HealthSMART.in | |
parent | Initial commit. (diff) | |
download | pacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.tar.xz pacemaker-e5a812082ae033afb1eed82c0f2df3d0f6bdc93f.zip |
Adding upstream version 2.1.6.upstream/2.1.6
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'agents/ocf/HealthSMART.in')
-rwxr-xr-x | agents/ocf/HealthSMART.in | 372 |
1 files changed, 372 insertions, 0 deletions
diff --git a/agents/ocf/HealthSMART.in b/agents/ocf/HealthSMART.in new file mode 100755 index 0000000..b6edac2 --- /dev/null +++ b/agents/ocf/HealthSMART.in @@ -0,0 +1,372 @@ +#!@BASH_PATH@ +# +# ocf:pacemaker:HealthSMART resource agent +# +# Copyright 2009-2023 the Pacemaker project contributors +# +# The version control history for this file may have further details. +# +# This source code is licensed under the GNU General Public License version 2 +# (GPLv2) WITHOUT ANY WARRANTY. +# + +# +# Checks the S.M.A.R.T. status of all given drives and writes the #health-smart +# status into the CIB +# +####################################################################### + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS:="${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs"} +. "${OCF_FUNCTIONS}" +: ${__OCF_ACTION:="$1"} + +# Explicitly list all environment variables used, to make static analysis happy +: ${OCF_RESKEY_CRM_meta_interval:=0} +: ${OCF_RESKEY_CRM_meta_globally_unique:="true"} +: ${OCF_RESKEY_temp_warning:=""} +: ${OCF_RESKEY_temp_lower_limit:=""} +: ${OCF_RESKEY_temp_upper_limit:=""} +: ${OCF_RESKEY_drives:="/dev/sda"} +: ${OCF_RESKEY_devices:=""} +: ${OCF_RESKEY_state:=""} +: ${OCF_RESKEY_smartctl:="/usr/sbin/smartctl"} +: ${OCF_RESKEY_dampen:="5s"} + +# Turn these into arrays so we can iterate them later. +DRIVES=(${OCF_RESKEY_drives}) +DEVICES=(${OCF_RESKEY_devices}) + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<resource-agent name="HealthSMART" version="@VERSION@"> +<version>1.1</version> + +<longdesc lang="en"> +System health agent that checks the S.M.A.R.T. status of the given drives and +updates the #health-smart attribute. +</longdesc> +<shortdesc lang="en">SMART health status</shortdesc> + +<parameters> +<parameter name="state" unique-group="state"> +<longdesc lang="en"> +Location to store the resource state in. +</longdesc> +<shortdesc lang="en">State file</shortdesc> +<content type="string" default="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state" /> +</parameter> + +<parameter name="drives" reloadable="1"> +<longdesc lang="en"> +The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda". +</longdesc> +<shortdesc lang="en">Drives to check</shortdesc> +<content type="string" default="/dev/sda" /> +</parameter> + +<parameter name="devices" reloadable="1"> +<longdesc lang="en"> +The device type(s) to assume for the drive(s) being tested as a SPACE separated list. +</longdesc> +<shortdesc lang="en">Device types</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="temp_lower_limit" reloadable="1"> +<longdesc lang="en"> +Lower limit of the temperature in deg C of the drive(s). Below this limit the status will be red. +</longdesc> +<shortdesc lang="en">Lower limit for the red smart attribute</shortdesc> +<content type="string" default="0"/> +</parameter> + +<parameter name="temp_upper_limit" reloadable="1"> +<longdesc lang="en"> +Upper limit of the temperature if deg C of the drives(s). If the drive reports +a temperature higher than this value the status of #health-smart will be red. +</longdesc> +<shortdesc lang="en">Upper limit for red smart attribute</shortdesc> +<content type="string" default="60"/> +</parameter> + +<parameter name="temp_warning" reloadable="1"> +<longdesc lang="en"> +Number of deg C below/above the upper/lower temp limits at which point the status of #health-smart will change to yellow. +</longdesc> +<shortdesc lang="en">Deg C below/above the upper limits for yellow smart attribute</shortdesc> +<content type="string" default="5"/> +</parameter> + +<parameter name="smartctl" reloadable="1"> +<longdesc lang="en"> +The path to the smartctl program, used for querying device health. +</longdesc> +<shortdesc lang="en">The path to the smartctl program</shortdesc> +<content type="string" default="/usr/sbin/smartctl"/> +</parameter> + +<parameter name="dampen" reloadable="1"> +<longdesc lang="en"> +The time to wait (dampening) for further changes to occur +</longdesc> +<shortdesc lang="en">Dampening interval</shortdesc> +<content type="integer" default="5s"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="10s" /> +<action name="stop" timeout="10s" /> +<action name="monitor" timeout="10s" interval="10s" start-delay="0s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="10s" depth="0" /> +<action name="reload-agent" timeout="20s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +check_temperature() { + + if [ $1 -lt ${lower_red_limit} ] ; then + ocf_log info "Drive ${DRIVE} ${DEVICE} too cold: ${1} C" + attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}" + return 1 + fi + + if [ $1 -gt ${upper_red_limit} ] ; then + ocf_log info "Drive ${DRIVE} ${DEVICE} too hot: ${1} C" + attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}" + return 1 + fi + + if [ $1 -lt ${lower_yellow_limit} ] ; then + ocf_log info "Drive ${DRIVE} ${DEVICE} quite cold: ${1} C" + attrd_updater -n "#health-smart" -U "yellow" -d "${OCF_RESKEY_dampen}" + return 1 + fi + + if [ $1 -gt ${upper_yellow_limit} ] ; then + ocf_log info "Drive ${DRIVE} ${DEVICE} quite hot: ${1} C" + attrd_updater -n "#health-smart" -U "yellow" -d "${OCF_RESKEY_dampen}" + return 1 + fi +} + +common_checks() { + # Each item in $OCF_RESKEY_drives must have a corresponding item in + # $OCF_RESKEY_devices with the device type. Alternately, + # $OCF_RESKEY_devices can be empty. + drives_len=${#DRIVES[@]} + devices_len=${#DEVICES[@]} + + if [ "${drives_len}" -ne "${devices_len}" ] && [ "${devices_len}" -gt 0 ]; then + ocf_log err "OCF_RESKEY_devices must be empty or the same length as OCF_RESKEY_drives." + exit $OCF_ERR_ARGS + fi + + # Each item in $OCF_RESKEY_drives must look like a device node. + for d in "${DRIVES[@]}"; do + if [[ "$d" != /dev/* ]]; then + ocf_log err "Device in OCF_RESKEY_devices does not look like a device node: $d" + exit $OCF_ERR_ARGS + fi + done +} + + +init_smart() { + #Set temperature defaults + if [ -z "${OCF_RESKEY_temp_warning}" ]; then + yellow_threshold=5 + else + yellow_threshold=${OCF_RESKEY_temp_warning} + fi + + if [ -z "${OCF_RESKEY_temp_lower_limit}" ] ; then + lower_red_limit=0 + else + lower_red_limit=${OCF_RESKEY_temp_lower_limit} + fi + lower_yellow_limit=$((${lower_red_limit}+${yellow_threshold})) + + if [ -z "${OCF_RESKEY_temp_upper_limit}" ] ; then + upper_red_limit=60 + else + upper_red_limit=${OCF_RESKEY_temp_upper_limit} + fi + upper_yellow_limit=$((${upper_red_limit}-${yellow_threshold})) + + for ndx in ${!DRIVES[*]}; do + DRIVE=${DRIVES[$ndx]} + + if [ -n "${OCF_RESKEY_devices}" ]; then + DEVICE=${DEVICES[$ndx]} + + "${OCF_RESKEY_smartctl}" -d "${DEVICE}" -i "${DRIVE}" | grep -q "SMART support is: Enabled" + if [ $? -ne 0 ] ; then + ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE} + exit $OCF_ERR_INSTALLED + fi + else + "${OCF_RESKEY_smartctl}" -i "${DRIVE}" | grep -q "SMART support is: Enabled" + if [ $? -ne 0 ] ; then + ocf_log err "S.M.A.R.T. not enabled for drive "${DRIVE} + exit $OCF_ERR_INSTALLED + fi + fi + done +} + +HealthSMART_usage() { + cat <<END +usage: $0 {start|stop|monitor|validate-all|meta-data|reload-agent} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +HealthSMART_start() { + HealthSMART_monitor + if [ $? -eq $OCF_SUCCESS ]; then + return $OCF_SUCCESS + fi + touch "${OCF_RESKEY_state}" +} + +HealthSMART_stop() { + attrd_updater -D -n "#health-smart" -d "${OCF_RESKEY_dampen}" + + rm "${OCF_RESKEY_state}" + + if [ $? -eq 0 ]; then + return $OCF_SUCCESS + else + return $OCF_ERR_GENERIC + fi +} + +HealthSMART_monitor() { + common_checks + + # Test for presence of smartctl + check_binary smartctl + + init_smart + + # Monitor _MUST!_ differentiate correctly between running + # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING). + # That is THREE states, not just yes/no. + + if [ -f "${OCF_RESKEY_state}" ]; then + + for ndx in ${!DRIVES[*]}; do + DRIVE=${DRIVES[$ndx]} + + if [ -n "${OCF_RESKEY_devices}" ]; then + DEVICE=${DEVICES[$ndx]} + + # Check overall S.M.A.R.T. status + "${OCF_RESKEY_smartctl}" -d "${DEVICE}" -H ${DRIVE} | grep -q "SMART overall-health self-assessment test result: PASSED" + if [ $? -ne 0 ]; then + attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}" + return $OCF_SUCCESS + fi + + # Check drive temperature(s) + check_temperature "$("${OCF_RESKEY_smartctl}" -d "${DEVICE}" -A "${DRIVE}" | awk '/^194/ { print $10 }')" + if [ $? -ne 0 ]; then + return $OCF_SUCCESS + fi + else + "${OCF_RESKEY_smartctl}" -H "${DRIVE}" | grep -q "SMART overall-health self-assessment test result: PASSED" + if [ $? -ne 0 ]; then + attrd_updater -n "#health-smart" -U "red" -d "${OCF_RESKEY_dampen}" + return $OCF_SUCCESS + fi + + check_temperature "$("${OCF_RESKEY_smartctl}" -A "${DRIVE}" | awk '/^194/ { print $10 }')" + if [ $? -ne 0 ]; then + return $OCF_SUCCESS + fi + fi + done + + attrd_updater -n "#health-smart" -U "green" -d "${OCF_RESKEY_dampen}" + return $OCF_SUCCESS + fi + + return $OCF_NOT_RUNNING + +} + +HealthSMART_validate() { + common_checks + + # Host-specific checks + if [ "$OCF_CHECK_LEVEL" = "10" ]; then + # Test for presence of smartctl + check_binary smartctl + + init_smart + + # Is the state directory writable? + state_dir=$(dirname "$OCF_RESKEY_state") + touch "$state_dir/$$" + if [ $? -ne 0 ]; then + return $OCF_ERR_ARGS + fi + rm "$state_dir/$$" + fi + + return $OCF_SUCCESS +} + +HealthSMART_reload_agent() { + return $OCF_SUCCESS +} + + +if [ -z "$OCF_RESKEY_state" ]; then + if [ "${OCF_RESKEY_CRM_meta_globally_unique}" = "false" ]; then + state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state" + + # Strip off the trailing clone marker + OCF_RESKEY_state=$(echo $state | sed s/:[0-9][0-9]*\.state/.state/) + else + OCF_RESKEY_state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state" + fi +fi + +case "$__OCF_ACTION" in + start) HealthSMART_start;; + stop) HealthSMART_stop;; + monitor) HealthSMART_monitor;; + validate-all) HealthSMART_validate;; + reload-agent) HealthSMART_reload_agent;; + meta-data) + meta_data + exit $OCF_SUCCESS + ;; + usage|help) + HealthSMART_usage + exit $OCF_SUCCESS + ;; + *) HealthSMART_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc + +# vim: set filetype=sh expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80: |