diff options
Diffstat (limited to '')
-rw-r--r-- | heartbeat/storage-mon.in | 399 |
1 files changed, 399 insertions, 0 deletions
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in new file mode 100644 index 0000000..284dec3 --- /dev/null +++ b/heartbeat/storage-mon.in @@ -0,0 +1,399 @@ +#!@BASH_SHELL@ +# +# Copyright (C) 2021 Red Hat, Inc. All rights reserved. +# +# Authors: Christine Caulfield <ccaulfie@redhat.com> +# Fabio M. Di Nitto <fdinitto@redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +# +# Checks storage I/O status of all given drives and writes the #health-storage +# status into the CIB +# Implementation is heavily based on ocf:pacemaker:HealthSMART +# +# It sends a single block of I/O to a random location on the device and reports any errors returned. +# If the I/O hangs, that will also be returned. (bear in mind that may also hang the C app in some +# instances). +# +# It's worth making a note in the RA description that the smartmon RA is also recommended (this +# does not replace it), and that Pacemaker health checking should be configured.
#
# https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/singlehtml/index.html#tracking-node-health

#######################################################################

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Paths of the external helpers and per-instance files used by this agent.
STORAGEMON=${HA_BIN}/storage_mon
ATTRDUP=${HA_SBIN_DIR}/attrd_updater
PIDFILE=${HA_VARRUN}/storage-mon-${OCF_RESOURCE_INSTANCE}.pid
ATTRNAME="#health-${OCF_RESOURCE_INSTANCE}"

# Parameter defaults (also advertised through the meta-data action).
OCF_RESKEY_CRM_meta_interval_default="0"
OCF_RESKEY_io_timeout_default="10"
OCF_RESKEY_check_interval_default="30"
OCF_RESKEY_inject_errors_default=""
OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state"
OCF_RESKEY_daemonize_default="false"

# Explicitly list all environment variables used, to make static analysis happy
: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}}
: ${OCF_RESKEY_drives:=""}
: ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}}
: ${OCF_RESKEY_check_interval:=${OCF_RESKEY_check_interval_default}}
: ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}}
: ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}}
: ${OCF_RESKEY_daemonize:=${OCF_RESKEY_daemonize_default}}

#######################################################################

# Print the resource agent meta-data XML on stdout.
# The heredoc is unquoted on purpose so the *_default variables are expanded.
# Returns: $OCF_SUCCESS
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="storage-mon" version="1.0">
<version>1.0</version>

<longdesc lang="en">
System health agent that checks the storage I/O status of the given drives and
updates the #health-storage attribute. Usage is highly recommended in combination
with the HealthSMART monitoring agent. The agent currently support a maximum of 25
devices per instance.
</longdesc>
<shortdesc lang="en">storage I/O health status</shortdesc>

<parameters>

<parameter name="state_file" unique="1">
<longdesc lang="en">
Location to store the resource state in.
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${OCF_RESKEY_state_file_default}" />
</parameter>

<parameter name="drives" unique="1" required="1">
<longdesc lang="en">
The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
</longdesc>
<shortdesc lang="en">Drives to check</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="io_timeout" unique="0">
<longdesc lang="en">
Specify disk I/O timeout in seconds. Minimum 1, recommended 10 (default).
</longdesc>
<shortdesc lang="en">Disk I/O timeout</shortdesc>
<content type="integer" default="${OCF_RESKEY_io_timeout_default}" />
</parameter>

<parameter name="check_interval" unique="0">
<longdesc lang="en">
Specify interval between I/O checks in seconds.(Only supported with the daemonize option.)
</longdesc>
<shortdesc lang="en">I/O check interval</shortdesc>
<content type="integer" default="${OCF_RESKEY_check_interval_default}" />
</parameter>

<parameter name="inject_errors" unique="0">
<longdesc lang="en">
Used only for testing! Specify % of I/O errors to simulate drives failures.
</longdesc>
<shortdesc lang="en">Specify % of I/O errors to simulate drives failures</shortdesc>
<content type="integer" default="${OCF_RESKEY_inject_errors_default}" />
</parameter>

<parameter name="daemonize" unique="0">
<longdesc lang="en">
Specifies to start storage-mon as a daemon and check for devices.
</longdesc>
<shortdesc lang="en">start storage-mon with daemon</shortdesc>
<content type="boolean" default="${OCF_RESKEY_daemonize_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="10s" />
<action name="stop" timeout="120s" />
<action name="monitor" timeout="120s" interval="30s" start-delay="0s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="10s" />
</actions>
</resource-agent>
END
    return $OCF_SUCCESS
}

#######################################################################

# Print a short usage message on stdout.
# $1: exit status to hand back to the caller.
storage-mon_usage() {
    cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
    return $1
}

# Succeed only when $1 is an integer that is >= $2 and, when $3 is given, <= $3.
# A non-integer $1 makes "[" fail with an error status; that is deliberately
# treated as "out of range" instead of being ignored, so malformed values are
# rejected rather than slipping through validation.
storage-mon_int_in_range() {
    [ "$1" -ge "$2" ] 2>/dev/null || return 1
    if [ -n "$3" ]; then
        [ "$1" -le "$3" ] 2>/dev/null || return 1
    fi
    return 0
}

# Verify that the helpers are installed and the parameters are sane.
# Does not return on failure: exits with $OCF_ERR_INSTALLED when a helper
# binary or a configured drive is missing, and with $OCF_ERR_CONFIGURED when
# a parameter value is not acceptable.
storage-mon_init() {
    #Test for presence of storage_mon helper
    if [ ! -x "$STORAGEMON" ] ; then
        ocf_log err "${STORAGEMON} not installed."
        exit $OCF_ERR_INSTALLED
    fi

    if [ ! -x "$ATTRDUP" ] ; then
        ocf_log err "${ATTRDUP} not installed."
        exit $OCF_ERR_INSTALLED
    fi

    # Every configured drive must exist; count them while iterating.
    i=0
    for DRIVE in ${OCF_RESKEY_drives}; do
        if [ ! -e "$DRIVE" ] ; then
            ocf_log err "${DRIVE} not found on the system"
            exit $OCF_ERR_INSTALLED
        fi
        i=$((i + 1))
    done

    if [ "$i" -gt "25" ]; then
        ocf_log err "Too many drives ($i) configured for this agent. Max 25."
        exit $OCF_ERR_CONFIGURED
    fi

    if ! storage-mon_int_in_range "${OCF_RESKEY_io_timeout}" 1; then
        ocf_log err "Minimum timeout is 1. Recommended ${OCF_RESKEY_io_timeout_default} (default)."
        exit $OCF_ERR_CONFIGURED
    fi

    if ! storage-mon_int_in_range "${OCF_RESKEY_check_interval}" 1; then
        ocf_log err "Minimum interval to check is 1. default ${OCF_RESKEY_check_interval_default}."
        exit $OCF_ERR_CONFIGURED
    fi

    # inject_errors is optional; it is only checked when it has been set.
    if [ -n "${OCF_RESKEY_inject_errors}" ]; then
        if ! storage-mon_int_in_range "${OCF_RESKEY_inject_errors}" 1 100; then
            ocf_log err "Inject errors % has to be a value between 1 and 100."
            exit $OCF_ERR_CONFIGURED
        fi
    fi
}

# Set the health attribute to the given status through attrd_updater.
# $1: status value to store ("green" or "red").
# A failed update is retried until it succeeds, except for "red", where the
# failure is reported to the caller straight away.
# Returns: $OCF_SUCCESS once the update succeeded, $OCF_ERR_GENERIC when
# updating to "red" failed.
storage-mon_update_attribute() {

    while :
    do
        "$ATTRDUP" -n "${ATTRNAME}" -U "$1" -d "5s"
        rc=$?
        if [ $rc -eq 0 ]; then
            break
        fi

        ocf_log debug "${1} attribute by attrd_updater failed"
        if [ "$1" = "red" ]; then
            # If the attrd_updater fails with the red attribute, return an error to let pacemaker handle the failure immediately.
            return $OCF_ERR_GENERIC
        fi
    done
    return $OCF_SUCCESS
}

# Check the drives and publish the result in the health attribute.
# $1: optional; "pid_check_only" (daemon mode only) stops after checking
#     whether the daemon's pidfile refers to a running process.
# Non-daemon mode: runs the storage_mon helper once against all drives.
# Daemon mode: asks the running daemon for its result via the client mode.
# Returns: $OCF_NOT_RUNNING when the state file (non-daemon) or the daemon
# process is absent, otherwise the result of the pidfile check or of
# storage-mon_update_attribute.
storage-mon_monitor() {
    if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
        storage-mon_init

        # Monitor _MUST!_ differentiate correctly between running
        # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
        # That is THREE states, not just yes/no.

        if [ ! -f "${OCF_RESKEY_state_file}" ]; then
            return $OCF_NOT_RUNNING
        fi

        # generate command line
        cmdline=""
        for DRIVE in ${OCF_RESKEY_drives}; do
            cmdline="$cmdline --device $DRIVE --score 1"
        done
        cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}"
        if [ -n "${OCF_RESKEY_inject_errors}" ]; then
            cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
        fi

        # $cmdline is intentionally unquoted: it is split into arguments.
        if $STORAGEMON $cmdline; then
            status="green"
        else
            status="red"
        fi

        storage-mon_update_attribute $status
        return "$?"
    else
        ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1
        case "$?" in
            0) rc=$OCF_SUCCESS;;
            1|2) rc=$OCF_NOT_RUNNING;;
            *) rc=$OCF_ERR_GENERIC;;
        esac

        if [ $rc -ne $OCF_SUCCESS ]; then
            return "$rc"
        fi
        if [ "$1" = "pid_check_only" ]; then
            return "$rc"
        fi

        # generate client command line
        cmdline=""
        cmdline="$cmdline --client --attrname ${ATTRNAME}"
        while :
        do
            # 0 : Normal.
            # greater than 0 : monitoring error.
            # 255(-1) : communication system error.
            # 254(-2) : Not all checks completed for first device in daemon mode.
            $STORAGEMON $cmdline
            rc=$?
            case "$rc" in
                254|255)
                    # If there is a communication error or the initial check of all devices has not been completed,
                    # it will loop and try to reconnect.
                    # When everything ends with a communication error during monitor, a monitor timeout occurs.
                    ocf_log debug "client monitor error : $rc"
                    ;;
                0)
                    status="green"
                    break
                    ;;
                *)
                    status="red"
                    break
                    ;;
            esac
        done

        storage-mon_update_attribute $status
        return "$?"
    fi
}

# Start the resource.
# Non-daemon mode: creates the state file unless a monitor already succeeds.
# Daemon mode: validates the configuration and launches the storage_mon daemon.
# Returns: $OCF_SUCCESS when already running, $OCF_ERR_GENERIC when the
# daemon could not be launched, otherwise the status of the last command run.
storage-mon_start() {
    if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
        storage-mon_monitor
        if [ $? -eq $OCF_SUCCESS ]; then
            return $OCF_SUCCESS
        fi
        touch "${OCF_RESKEY_state_file}"
    else
        storage-mon_init
        # generate command line
        cmdline=""
        for DRIVE in ${OCF_RESKEY_drives}; do
            cmdline="$cmdline --device $DRIVE --score 1"
        done
        cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME}"
        if [ -n "${OCF_RESKEY_inject_errors}" ]; then
            cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
        fi
        $STORAGEMON $cmdline
        if [ "$?" -ne 0 ]; then
            return $OCF_ERR_GENERIC
        fi
    fi
}

# Stop the resource.
# Non-daemon mode: runs one last monitor and removes the state file.
# Daemon mode: sends SIGTERM to the daemon and waits until its pidfile no
# longer refers to a running process.
# Returns: $OCF_SUCCESS once stopped (or when it was not running),
# $OCF_ERR_GENERIC when the signal could not be delivered, or the monitor's
# error code in daemon mode.
storage-mon_stop() {
    storage-mon_monitor
    rc=$?

    if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
        # Always drop the state file: this function reports success below, so
        # the file must not survive a failed monitor, otherwise the resource
        # would still look started afterwards.
        rm -f "${OCF_RESKEY_state_file}"
    else
        case "$rc" in
            $OCF_SUCCESS)
                ;;
            $OCF_NOT_RUNNING)
                return "$OCF_SUCCESS";;
            *)
                return "$rc";;
        esac

        kill -TERM $(cat "${PIDFILE}")
        if [ "$?" -ne 0 ]; then
            return $OCF_ERR_GENERIC
        fi

        # Poll once per second until the daemon is gone.
        while true; do
            storage-mon_monitor pid_check_only
            rc="$?"
            case "$rc" in
                $OCF_SUCCESS)
                    ;;
                $OCF_NOT_RUNNING)
                    return "$OCF_SUCCESS";;
                *)
                    return "$rc";;
            esac
            sleep 1
        done
    fi
    return $OCF_SUCCESS
}

# Validate the configuration.
# Runs storage-mon_init (which exits on error) and, in non-daemon mode,
# checks that a file can be created in the state file's directory.
# Returns: $OCF_SUCCESS, or $OCF_ERR_CONFIGURED when the directory is not
# writable.
storage-mon_validate() {
    storage-mon_init

    if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
        # Is the state directory writable?
        state_dir=$(dirname "${OCF_RESKEY_state_file}")
        touch "$state_dir/$$"
        if [ $? -ne 0 ]; then
            return $OCF_ERR_CONFIGURED
        fi
        rm "$state_dir/$$"
    fi

    return $OCF_SUCCESS
}

# Dispatch the requested OCF action and exit with its result.
case "$__OCF_ACTION" in
    start) storage-mon_start;;
    stop) storage-mon_stop;;
    monitor) storage-mon_monitor;;
    validate-all) storage-mon_validate;;
    meta-data) meta_data;;
    usage|help) storage-mon_usage $OCF_SUCCESS;;
    *) storage-mon_usage $OCF_ERR_UNIMPLEMENTED;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
# vim: set filetype=sh: