diff options
Diffstat (limited to '')
168 files changed, 76131 insertions, 0 deletions
diff --git a/heartbeat/AoEtarget.in b/heartbeat/AoEtarget.in new file mode 100644 index 0000000..5a14c1e --- /dev/null +++ b/heartbeat/AoEtarget.in @@ -0,0 +1,245 @@ +#!@BASH_SHELL@ +# +# +# AoEtarget OCF RA. +# Manages an ATA-over-Ethernet (AoE) target utilizing the vblade utility. +# +# (c) 2009-2010 Florian Haas, Dejan Muhamedagic, +# and Linux-HA contributors +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +###################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_nic_default="eth0" +OCF_RESKEY_pid_default="${HA_RSCTMP}/AoEtarget-${OCF_RESOURCE_INSTANCE}.pid" +OCF_RESKEY_binary_default="/usr/sbin/vblade" + +: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}} +: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} + +####################################################################### + +meta_data() { + cat <<EOF +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="AoEtarget" version="1.0"> + <version>1.0</version> + <longdesc lang="en"> +This resource agent manages an ATA-over-Ethernet (AoE) target using vblade. +It exports any block device, or file, as an AoE target using the +specified Ethernet device, shelf, and slot number. +</longdesc> + <shortdesc lang="en">Manages ATA-over-Ethernet (AoE) target exports</shortdesc> + <parameters> + <parameter name="device" required="1"> + <longdesc lang="en"> +The local block device (or file) to export as an AoE target. +</longdesc> + <shortdesc lang="en">Device to export</shortdesc> + <content type="string"/> + </parameter> + <parameter name="nic" required="1"> + <longdesc lang="en"> +The local Ethernet interface to use for exporting this AoE target. +</longdesc> + <shortdesc lang="en">Ethernet interface</shortdesc> + <content type="string" default="${OCF_RESKEY_nic_default}"/> + </parameter> + <parameter name="shelf" required="0"> + <longdesc lang="en"> +The AoE shelf number to use when exporting this target. +</longdesc> + <shortdesc lang="en">AoE shelf number</shortdesc> + <content type="integer"/> + </parameter> + <parameter name="slot" required="1"> + <longdesc lang="en"> +The AoE slot number to use when exporting this target. 
+</longdesc> + <shortdesc lang="en">AoE slot number</shortdesc> + <content type="integer"/> + </parameter> + <parameter name="pid" required="0" unique="1"> + <longdesc lang="en"> +The file to record the daemon pid to. +</longdesc> + <shortdesc lang="en">Daemon pid file</shortdesc> + <content type="string" default="${OCF_RESKEY_pid_default}"/> + </parameter> + <parameter name="binary" required="0"> + <longdesc lang="en"> +Location of the vblade binary. +</longdesc> + <shortdesc lang="en">vblade binary</shortdesc> + <content type="string" default="${OCF_RESKEY_binary_default}"/> + </parameter> + </parameters> + <actions> + <action name="start" timeout="15s"/> + <action name="stop" timeout="15s"/> + <action name="monitor" timeout="15s" interval="10s" depth="0"/> + <action name="reload" timeout="15s"/> + <action name="meta-data" timeout="5s"/> + <action name="validate-all" timeout="15s"/> + </actions> +</resource-agent> +EOF +} + +####################################################################### + +AoEtarget_usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +AoEtarget_start() { + AoEtarget_monitor + if [ $? = $OCF_SUCCESS ]; then + return $OCF_SUCCESS + fi + ocf_log info "Exporting device ${OCF_RESKEY_device} on ${OCF_RESKEY_nic} as shelf ${OCF_RESKEY_shelf}, slot ${OCF_RESKEY_slot}" + ${OCF_RESKEY_binary} ${OCF_RESKEY_shelf} ${OCF_RESKEY_slot} \ + ${OCF_RESKEY_nic} ${OCF_RESKEY_device} 2>&1 & + rc=$? + pid=$! + if [ $rc -ne 0 ]; then + return $OCF_ERR_GENERIC + fi + echo $pid > ${OCF_RESKEY_pid} && return $OCF_SUCCESS + return $OCF_ERR_GENERIC +} + +AoEtarget_stop() { + AoEtarget_monitor + if [ $? 
-eq $OCF_SUCCESS ]; then + ocf_log info "Unxporting device ${OCF_RESKEY_device} on ${OCF_RESKEY_nic} as shelf ${OCF_RESKEY_shelf}, slot ${OCF_RESKEY_slot}" + pid=$(cat ${OCF_RESKEY_pid}) + kill -TERM $pid + # loop until we're really stopped, wait for the LRM to time us + # out if not + while AoEtarget_monitor; do + sleep 1 + done + fi + # Clean up pid file + rm -f ${OCF_RESKEY_pid} + return $OCF_SUCCESS +} + +AoEtarget_monitor() { + ocf_pidfile_status ${OCF_RESKEY_pid} >/dev/null 2>&1 + rc=$? + if [ $rc -eq 2 ]; then + # no pid file, must assume we're not running + return $OCF_NOT_RUNNING + elif [ $rc -eq 1 ]; then + # stale pid file, assume something went wrong + return $OCF_ERR_GENERIC + fi + return $OCF_SUCCESS +} + +AoEtarget_validate() { + # Is our binary executable? + if [ ! -x ${OCF_RESKEY_binary} ]; then + ocf_log error "${OCF_RESKEY_binary} not found or not executable" + return $OCF_ERR_INSTALLED + fi + + # Do we have all required variables? + for var in device nic shelf slot pid; do + param="OCF_RESKEY_${var}" + if [ -z "${!param}" ]; then + ocf_log error "Missing resource parameter \"$var\"!" + return $OCF_ERR_CONFIGURED + fi + done + + # Is the pid file directory writable? + pid_dir=`dirname "$OCF_RESKEY_pid"` + touch "$pid_dir/$$" + if [ $? != 0 ]; then + ocf_log error "Cannot create pid file in $pid_dir -- check directory permissions" + return $OCF_ERR_INSTALLED + fi + rm "$pid_dir/$$" + + # Does the device we are trying to export exist? + if [ ! -e ${OCF_RESKEY_device} ]; then + ocf_log error "${OCF_RESKEY_device} does not exist" + return $OCF_ERR_INSTALLED + fi + return $OCF_SUCCESS +} + +case $1 in + meta-data) + meta_data + exit $OCF_SUCCESS + ;; + usage|help) + AoEtarget_usage + exit $OCF_SUCCESS + ;; +esac + +# Everything except usage and meta-data must pass the validate test +AoEtarget_validate || exit $? 
+ +case $__OCF_ACTION in + start) + AoEtarget_start + ;; + stop) + AoEtarget_stop + ;; + status|monitor) + AoEtarget_monitor + ;; + reload) + ocf_log err "Reloading..." + AoEtarget_start + ;; + validate-all) + AoEtarget_validate + ;; + *) + AoEtarget_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc + diff --git a/heartbeat/AudibleAlarm b/heartbeat/AudibleAlarm new file mode 100755 index 0000000..44a3088 --- /dev/null +++ b/heartbeat/AudibleAlarm @@ -0,0 +1,188 @@ +#!/bin/sh +# +# Startup script for the Audible Alarm +# +# author: Kirk Lawson <lklawson@heapy.com> +# Horms <horms@vergenet.net> +# +# description: sets an audible alarm running by beeping at a set interval +# processname: alarm +# config: /etc/AudibleAlarm/AudibleAlarm.conf - not yet implemented +# +# OCF parameters are as below: +# OCF_RESKEY_nodelist +# +# License: GNU General Public License (GPL) + +####################################################################### +# Source function library. +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### +PIDFILE=${HA_VARRUN}/heartbeat-bell +#For testing +#PIDFILE=/tmp/heartbeat-bell + +# What host are we running on? +us=`uname -n` + +usage() { + echo "Usage: $0 {start|stop|restart|status|monitor|meta-data|validate-all}" + echo " The node list is an optional space delimited" + echo " list of hosts that should never sound the alarm." +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="AudibleAlarm" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for AudibleAlarm. It sets an audible alarm running by beeping +at a set interval. 
+</longdesc> +<shortdesc lang="en">Emits audible beeps at a configurable interval</shortdesc> + +<parameters> +<parameter name="nodelist" unique="0"> +<longdesc lang="en"> +The node list that should never sound the alarm. +</longdesc> +<shortdesc lang="en">Node list</shortdesc> +<content type="string" default="" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="10s" /> +<action name="stop" timeout="10s" /> +<action name="restart" timeout="10s" /> +<action name="status" depth="0" timeout="10s" interval="10s" /> +<action name="monitor" depth="0" timeout="10s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +END +} + +audiblealarm_start () { + ocf_log info "$0: Starting" + if [ -f $PIDFILE ]; then + PID=`head -n 1 $PIDFILE` + if [ -n "$PID" ]; then + ocf_log info "$0: Appears to already be running, killing [$PID]" + kill $PID > /dev/null + fi + fi + # Use () to create a subshell to make the redirection be synchronized. + ( while [ 1 ]; do + sleep 1 #Sleep first, incase we bail out + printf "\a" > /dev/console + # Uncomment this line to cause floppy drive light + # to flash (requires fdutils package). + # /usr/bin/floppycontrol --pollstate > /dev/null + # + # To avoid issues when called by lrmd, redirect stdout->stderr. + done & + if echo $! > $PIDFILE; then + : + else + ocf_log info "$0: Could not write to pid file \"$PIDFILE\", bailing" + kill $! + return $OCF_ERR_GENERIC + fi) >&2 + + return $? +} + +audiblealarm_stop () { + ocf_log info "$0: Shutting down" + if [ -f $PIDFILE ]; then + PID=`head -n 1 $PIDFILE` + # ocf_log info "$0: Appears to already be running, killing [$PID]" + # commented by Phost, since the confusion in the log. + + if [ -n "$PID" ]; then + # Donnot remove PIDFILE in case the `kill` fails. + kill $PID > /dev/null && rm -f $PIDFILE + fi + fi + + return $? 
+} + +audiblealarm_restart () { + audiblealarm_stop + audiblealarm_start + + return $? +} + +audiblealarm_status () { + if [ -f $PIDFILE ]; then + PID=`head -n 1 $PIDFILE` + if [ -n "$PID" ]; then + echo running + return $OCF_SUCCESS + fi + fi + + echo stopped + return $OCF_NOT_RUNNING +} + +audiblealarm_validate_all () { + check_binary printf + + echo "Validate OK" + return $OCF_SUCCESS +} +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi + +case "$1" in + meta-data) + meta_data + exit $OCF_SUCCESS + ;; + start) + for arg in $OCF_RESKEY_nodelist + do + if [ "$us" = "$arg" ]; then + # We should not start because we are on a host + # listed in our argument list. + exit $OCF_SUCCESS + fi + done + audiblealarm_start + ;; + stop) + audiblealarm_stop + ;; + restart) + audiblealarm_restart + ;; + status|monitor) + audiblealarm_status + ;; + validate-all) + audiblealarm_validate_all + ;; + usage) + usage + exit $OCF_SUCCESS + ;; + + *) + usage + exit $OCF_ERR_ARGS + ;; +esac + +exit $? diff --git a/heartbeat/CTDB.in b/heartbeat/CTDB.in new file mode 100755 index 0000000..b4af66b --- /dev/null +++ b/heartbeat/CTDB.in @@ -0,0 +1,996 @@ +#!@BASH_SHELL@ +# +# OCF Resource Agent for managing CTDB +# +# Copyright (c) 2009-2010 Novell Inc., Tim Serong +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. 
Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# +# OVERVIEW +# +# When run by itself, CTDB can handle IP failover and includes scripts +# to manage various services (Samba, Winbind, HTTP, etc.). When run as +# a resource in a Pacemaker cluster, this additional functionality +# should not be used; instead one should define separate resources for +# CTDB, Samba, Winbind, IP addresses, etc. +# +# As of 2010-11-17, there is no separate OCF Samba or Winbind RA, so +# it is still possible to configure CTDB so that it manages these +# resources itself. In future, once Samba and Winbind RAs are +# available, this ability will be deprecated and ultimately removed. +# +# This RA intentionally provides no ability to configure CTDB such that +# it manages IP failover, HTTP, NFS, etc. +# +# +# TODO: +# - ctdb_stop doesn't really support multiple independent CTDB instances, +# unless they're running from distinct ctdbd binaries (it uses pkill +# $OCF_RESKEY_ctdbd_binary if "ctdb stop" doesn't work, which it might +# not under heavy load - this will kill all ctdbd instances on the +# system). OTOH, running multiple CTDB instances per node is, well, +# AFAIK, completely crazy. Can't run more than one in a vanilla CTDB +# cluster, with the CTDB init script. So it might be nice to address +# this for complete semantic correctness of the RA, but shouldn't +# actually cause any trouble in real life. +# - As much as possible, get rid of auto config generation +# - Especially smb.conf +# - Verify timeouts are sane +# - Monitor differentiate between error and not running? +# - Do we need to verify globally unique setting? 
+# - Should set CTDB_NODES to ${HA_RSCTMP}/ctdb (generated based on +# current nodes) +# - Look at enabling set_ctdb_variables() if necessary. +# - Probably possible for sysconfig file to not be restored if +# CTDB dies unexpectedly. +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### +# Default parameter values: + +# Some distro's ctdb package stores the persistent db in /var/lib/ctdb, +# others store in /var/ctdb. This attempts to detect the correct default +# directory. +var_prefix="/var/lib/ctdb" +if [ ! -d "$var_prefix" ] && [ -d "/var/ctdb" ]; then + var_prefix="/var/ctdb" +fi + +run_prefix="/run" +if [ ! -d "$run_prefix" ] && [ -d "/var/run" ]; then + run_prefix="/var/run" +fi + +# Parameter defaults + +OCF_RESKEY_ctdb_recovery_lock_default="" +OCF_RESKEY_ctdb_manages_samba_default="no" +OCF_RESKEY_ctdb_manages_winbind_default="no" +OCF_RESKEY_ctdb_service_smb_default="" +OCF_RESKEY_ctdb_service_nmb_default="" +OCF_RESKEY_ctdb_service_winbind_default="" +OCF_RESKEY_ctdb_samba_skip_share_check_default="yes" +OCF_RESKEY_ctdb_monitor_free_memory_default="100" +OCF_RESKEY_ctdb_start_as_disabled_default="no" + +: ${OCF_RESKEY_ctdb_recovery_lock=${OCF_RESKEY_ctdb_recovery_lock_default}} +: ${OCF_RESKEY_ctdb_manages_samba=${OCF_RESKEY_ctdb_manages_samba_default}} +: ${OCF_RESKEY_ctdb_manages_winbind=${OCF_RESKEY_ctdb_manages_winbind_default}} +: ${OCF_RESKEY_ctdb_service_smb=${OCF_RESKEY_ctdb_service_smb_default}} +: ${OCF_RESKEY_ctdb_service_nmb=${OCF_RESKEY_ctdb_service_nmb_default}} +: ${OCF_RESKEY_ctdb_service_winbind=${OCF_RESKEY_ctdb_service_winbind_default}} +: ${OCF_RESKEY_ctdb_samba_skip_share_check=${OCF_RESKEY_ctdb_samba_skip_share_check_default}} +: ${OCF_RESKEY_ctdb_monitor_free_memory=${OCF_RESKEY_ctdb_monitor_free_memory_default}} +: 
${OCF_RESKEY_ctdb_start_as_disabled=${OCF_RESKEY_ctdb_start_as_disabled_default}} + +OCF_RESKEY_ctdb_config_dir_default="/etc/ctdb" +OCF_RESKEY_ctdb_binary_default="/usr/bin/ctdb" +OCF_RESKEY_ctdbd_binary_default="/usr/sbin/ctdbd" +OCF_RESKEY_ctdb_dbdir_default="${var_prefix}" +OCF_RESKEY_ctdb_logfile_default="/var/log/ctdb/log.ctdb" +OCF_RESKEY_ctdb_rundir_default="${run_prefix}/ctdb" +OCF_RESKEY_ctdb_timeout_default="10" + +: ${OCF_RESKEY_ctdb_config_dir=${OCF_RESKEY_ctdb_config_dir_default}} +: ${OCF_RESKEY_ctdb_binary=${OCF_RESKEY_ctdb_binary_default}} +: ${OCF_RESKEY_ctdbd_binary=${OCF_RESKEY_ctdbd_binary_default}} +: ${OCF_RESKEY_ctdb_dbdir=${OCF_RESKEY_ctdb_dbdir_default}} +: ${OCF_RESKEY_ctdb_logfile=${OCF_RESKEY_ctdb_logfile_default}} +: ${OCF_RESKEY_ctdb_rundir=${OCF_RESKEY_ctdb_rundir_default}} +: ${OCF_RESKEY_ctdb_timeout=${OCF_RESKEY_ctdb_timeout_default}} + +OCF_RESKEY_ctdb_socket_default="${OCF_RESKEY_ctdb_rundir}/ctdbd.socket" +OCF_RESKEY_ctdb_debuglevel_default="2" +OCF_RESKEY_ctdb_max_open_files_default="" + +: ${OCF_RESKEY_ctdb_socket=${OCF_RESKEY_ctdb_socket_default}} +: ${OCF_RESKEY_ctdb_debuglevel=${OCF_RESKEY_ctdb_debuglevel_default}} +: ${OCF_RESKEY_ctdb_max_open_files=${OCF_RESKEY_ctdb_max_open_files_default}} + +OCF_RESKEY_smb_conf_default="/etc/samba/smb.conf" +OCF_RESKEY_smb_private_dir_default="" +OCF_RESKEY_smb_passdb_backend_default="tdbsam" +OCF_RESKEY_smb_idmap_backend_default="tdb2" +OCF_RESKEY_smb_fileid_algorithm_default="" + +: ${OCF_RESKEY_smb_conf=${OCF_RESKEY_smb_conf_default}} +: ${OCF_RESKEY_smb_private_dir=${OCF_RESKEY_smb_private_dir_default}} +: ${OCF_RESKEY_smb_passdb_backend=${OCF_RESKEY_smb_passdb_backend_default}} +: ${OCF_RESKEY_smb_idmap_backend=${OCF_RESKEY_smb_idmap_backend_default}} +: ${OCF_RESKEY_smb_fileid_algorithm=${OCF_RESKEY_smb_fileid_algorithm_default}} + +####################################################################### + +ctdb_version() { + $OCF_RESKEY_ctdb_binary version | awk '{print $NF}' | 
sed "s/[-\.]\?[[:alpha:]].*//" +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="CTDB" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This resource agent manages CTDB, allowing one to use Clustered Samba in a +Linux-HA/Pacemaker cluster. You need a shared filesystem (e.g. OCFS2 or GFS2) on +which the CTDB lock will be stored. Create /etc/ctdb/nodes containing a list +of private IP addresses of each node in the cluster, then configure this RA +as a clone. This agent expects the samba and windbind resources +to be managed outside of CTDB's control as a separate set of resources controlled +by the cluster manager. The optional support for enabling CTDB management of these +daemons will be depreciated. + +For more information see http://linux-ha.org/wiki/CTDB_(resource_agent) +</longdesc> +<shortdesc lang="en">CTDB Resource Agent</shortdesc> + +<parameters> + +<parameter name="ctdb_recovery_lock" unique="1" required="1"> +<longdesc lang="en"> +The location of a shared lock file or helper binary, common across all nodes. +See CTDB documentation for details. +</longdesc> +<shortdesc lang="en">CTDB shared lock file</shortdesc> +<content type="string" default="${OCF_RESKEY_ctdb_recovery_lock_default}" /> +</parameter> + +<parameter name="ctdb_manages_samba" unique="0" required="0"> +<longdesc lang="en"> +Should CTDB manage starting/stopping the Samba service for you? +This will be deprecated in future, in favor of configuring a +separate Samba resource. +</longdesc> +<shortdesc lang="en">Should CTDB manage Samba?</shortdesc> +<content type="boolean" default="${OCF_RESKEY_ctdb_manages_samba_default}" /> +</parameter> + +<parameter name="ctdb_manages_winbind" unique="0" required="0"> +<longdesc lang="en"> +Should CTDB manage starting/stopping the Winbind service for you? +This will be deprecated in future, in favor of configuring a +separate Winbind resource. 
+</longdesc> +<shortdesc lang="en">Should CTDB manage Winbind?</shortdesc> +<content type="boolean" default="${OCF_RESKEY_ctdb_manages_winbind_default}" /> +</parameter> + +<parameter name="ctdb_service_smb" unique="0" required="0"> +<longdesc lang="en"> +Name of smb init script. Only necessary if CTDB is managing +Samba directly. Will usually be auto-detected. +</longdesc> +<shortdesc lang="en">Name of smb init script</shortdesc> +<content type="string" default="${OCF_RESKEY_ctdb_service_smb_default}" /> +</parameter> + +<parameter name="ctdb_service_nmb" unique="0" required="0"> +<longdesc lang="en"> +Name of nmb init script. Only necessary if CTDB is managing +Samba directly. Will usually be auto-detected. +</longdesc> +<shortdesc lang="en">Name of nmb init script</shortdesc> +<content type="string" default="${OCF_RESKEY_ctdb_service_nmb_default}" /> +</parameter> + +<parameter name="ctdb_service_winbind" unique="0" required="0"> +<longdesc lang="en"> +Name of winbind init script. Only necessary if CTDB is managing +Winbind directly. Will usually be auto-detected. +</longdesc> +<shortdesc lang="en">Name of winbind init script</shortdesc> +<content type="string" default="${OCF_RESKEY_ctdb_service_winbind_default}" /> +</parameter> + +<parameter name="ctdb_samba_skip_share_check" unique="0" required="0"> +<longdesc lang="en"> +If there are very many shares it may not be feasible to check that all +of them are available during each monitoring interval. In that case +this check can be disabled. +</longdesc> +<shortdesc lang="en">Skip share check during monitor?</shortdesc> +<content type="boolean" default="${OCF_RESKEY_ctdb_samba_skip_share_check_default}" /> +</parameter> + +<parameter name="ctdb_monitor_free_memory" unique="0" required="0"> +<longdesc lang="en"> +If the amount of free memory drops below this value the node will +become unhealthy and ctdb and all managed services will be shutdown. 
+Once this occurs, the administrator needs to find the reason for the +OOM situation, rectify it and restart ctdb with "service ctdb start". +With CTDB 4.4.0 and later this parameter is ignored. +</longdesc> +<shortdesc lang="en">Minimum amount of free memory (MB)</shortdesc> +<content type="integer" default="${OCF_RESKEY_ctdb_monitor_free_memory_default}" /> +</parameter> + +<parameter name="ctdb_start_as_disabled" unique="0" required="0"> +<longdesc lang="en"> +When set to yes, the CTDB node will start in DISABLED mode and not +host any public ip addresses. +</longdesc> +<shortdesc lang="en">Start CTDB disabled?</shortdesc> +<content type="boolean" default="${OCF_RESKEY_ctdb_start_as_disabled_default}" /> +</parameter> + +<parameter name="ctdb_config_dir" unique="0" required="0"> +<longdesc lang="en"> +The directory containing various CTDB configuration files. +The "nodes" and "notify.sh" scripts are expected to be +in this directory. +</longdesc> +<shortdesc lang="en">CTDB config file directory</shortdesc> +<content type="string" default="${OCF_RESKEY_ctdb_config_dir_default}" /> +</parameter> + +<parameter name="ctdb_binary" unique="0" required="0"> +<longdesc lang="en"> +Full path to the CTDB binary. +</longdesc> +<shortdesc lang="en">CTDB binary path</shortdesc> +<content type="string" default="${OCF_RESKEY_ctdb_binary_default}" /> +</parameter> + +<parameter name="ctdbd_binary" unique="0" required="0"> +<longdesc lang="en"> +Full path to the CTDB cluster daemon binary. +</longdesc> +<shortdesc lang="en">CTDB Daemon binary path</shortdesc> +<content type="string" default="${OCF_RESKEY_ctdbd_binary_default}" /> +</parameter> + +<parameter name="ctdb_socket" unique="1" required="0"> +<longdesc lang="en"> +Full path to the domain socket that ctdbd will create, used for +local clients to attach and communicate with the ctdb daemon. +With CTDB 4.9.0 and later the socket path is hardcoded at build +time, so this parameter is ignored. 
+</longdesc> +<shortdesc lang="en">CTDB socket location (ignored with CTDB 4.9+)</shortdesc> +<content type="string" default="${OCF_RESKEY_ctdb_socket_default}" /> +</parameter> + +<parameter name="ctdb_dbdir" unique="1" required="0"> +<longdesc lang="en"> +The directory to put the local CTDB database files in. +Persistent database files will be put in ctdb_dbdir/persistent. +</longdesc> +<shortdesc lang="en">CTDB database directory</shortdesc> +<content type="string" default="${OCF_RESKEY_ctdb_dbdir_default}" /> +</parameter> + +<parameter name="ctdb_logfile" unique="0" required="0"> +<longdesc lang="en"> +Full path to log file. To log to syslog instead, use the +value "syslog". +</longdesc> +<shortdesc lang="en">CTDB log file location</shortdesc> +<content type="string" default="${OCF_RESKEY_ctdb_logfile_default}" /> +</parameter> + +<parameter name="ctdb_rundir" unique="0" required="0"> +<longdesc lang="en"> +Full path to ctdb runtime directory, used for storage of socket +lock state. +</longdesc> +<shortdesc lang="en">CTDB runtime directory location</shortdesc> +<content type="string" default="${OCF_RESKEY_ctdb_rundir_default}" /> +</parameter> + +<parameter name="ctdb_timeout" unique="1" required="0"> +<longdesc lang="en"> +Indicates that ctdb should wait up to TIMEOUT seconds for a response to most commands sent to the CTDB daemon. +</longdesc> +<shortdesc lang="en">CTDB timeout in seconds</shortdesc> +<content type="integer" default="${OCF_RESKEY_ctdb_timeout_default}" /> +</parameter> + +<parameter name="ctdb_debuglevel" unique="0" required="0"> +<longdesc lang="en"> +What debug level to run at (0-10). Higher means more verbose. 
+</longdesc> +<shortdesc lang="en">CTDB debug level</shortdesc> +<content type="integer" default="${OCF_RESKEY_ctdb_debuglevel_default}" /> +</parameter> + +<parameter name="ctdb_max_open_files" required="0"> +<longdesc lang="en"> +Maximum number of open files (for ulimit -n) +</longdesc> +<shortdesc lang="en">Max open files</shortdesc> +<content type="integer" default="${OCF_RESKEY_ctdb_max_open_files_default}" /> +</parameter> + +<parameter name="smb_conf" unique="0" required="0"> +<longdesc lang="en"> +Path to default samba config file. Only necessary if CTDB +is managing Samba. +</longdesc> +<shortdesc lang="en">Path to smb.conf</shortdesc> +<content type="string" default="${OCF_RESKEY_smb_conf_default}" /> +</parameter> + +<parameter name="smb_private_dir" unique="1" required="0"> +<longdesc lang="en"> +The directory for smbd to use for storing such files as +smbpasswd and secrets.tdb. Old versions of CTBD (prior to 1.0.50) +required this to be on shared storage. This parameter should not +be set for current versions of CTDB, and only remains in the RA +for backwards compatibility. +</longdesc> +<shortdesc lang="en">Samba private dir (deprecated)</shortdesc> +<content type="string" default="${OCF_RESKEY_smb_private_dir_default}" /> +</parameter> + +<parameter name="smb_passdb_backend" unique="0" required="0"> +<longdesc lang="en"> +Which backend to use for storing user and possibly group +information. Only necessary if CTDB is managing Samba. +</longdesc> +<shortdesc lang="en">Samba passdb backend</shortdesc> +<content type="string" default="${OCF_RESKEY_smb_passdb_backend_default}" /> +</parameter> + +<parameter name="smb_idmap_backend" unique="0" required="0"> +<longdesc lang="en"> +Which backend to use for SID/uid/gid mapping. Only necessary +if CTDB is managing Samba. 
+</longdesc> +<shortdesc lang="en">Samba idmap backend</shortdesc> +<content type="string" default="${OCF_RESKEY_smb_idmap_backend_default}" /> +</parameter> + +<parameter name="smb_fileid_algorithm" unique="0" required="0"> +<longdesc lang="en"> +Which fileid:algorithm to use with vfs_fileid. The correct +value depends on which clustered filesystem is in use, e.g.: +for OCFS2, this should be set to "fsid". Only necessary if +CTDB is managing Samba. +</longdesc> +<shortdesc lang="en">Samba VFS fileid algorithm</shortdesc> +<content type="string" default="${OCF_RESKEY_smb_fileid_algorithm_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="90s" /> +<action name="stop" timeout="100s" /> +<action name="monitor" timeout="20s" interval="10s" depth="0" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="30s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +# Figure out path to /etc/sysconfig/ctdb (same logic as +# loadconfig() from /etc/ctdb/functions +if [ -f /etc/sysconfig/ctdb ]; then + CTDB_SYSCONFIG=/etc/sysconfig/ctdb +elif [ -f /etc/default/ctdb ]; then + CTDB_SYSCONFIG=/etc/default/ctdb +elif [ -f "$OCF_RESKEY_ctdb_config_dir/ctdb" ]; then + CTDB_SYSCONFIG=$OCF_RESKEY_ctdb_config_dir/ctdb +elif [ -f "$OCF_RESKEY_ctdb_config_dir/ctdbd.conf" ]; then + CTDB_SYSCONFIG=$OCF_RESKEY_ctdb_config_dir/ctdbd.conf +fi + +# Backup paths +CTDB_SYSCONFIG_BACKUP=${CTDB_SYSCONFIG}.ctdb-ra-orig + +invoke_ctdb() { + # CTDB's defaults are: + local timelimit + timelimit=120 + # ...but we override with the timeout for the current op: + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + timelimit=$((OCF_RESKEY_CRM_meta_timeout/1000)) + fi + + local vers=$(ctdb_version) + ocf_version_cmp "$vers" "4.9.0" + + # if version < 4.9.0 specify '--socket' otherwise it's + # a compiled option + if [ "$?" 
-eq "0" ]; then + $OCF_RESKEY_ctdb_binary --socket="$OCF_RESKEY_ctdb_socket" \ + -t ${OCF_RESKEY_ctdb_timeout} -T $timelimit \ + "$@" + else + $OCF_RESKEY_ctdb_binary \ + -t ${OCF_RESKEY_ctdb_timeout} -T $timelimit \ + "$@" + fi +} + +# Enable any event scripts that are explicitly required. +# Any others will ultimately be invoked or not based on how they ship +# with CTDB, but will generally have no effect, because the relevant +# CTDB_MANAGES_* options won't be set in /etc/sysconfig/ctdb. +enable_event_scripts_chmod() { + local event_dir + event_dir=$OCF_RESKEY_ctdb_config_dir/events.d + + chmod u+x "$event_dir/00.ctdb" # core database health check + + if [ -f "${OCF_RESKEY_ctdb_config_dir}/public_addresses" ]; then + chmod u+x "$event_dir/10.interface" + else + chmod a-x "$event_dir/10.interface" + fi + if [ -f "${OCF_RESKEY_ctdb_config_dir}/static-routes" ]; then + chmod u+x "$event_dir/11.routing" + else + chmod a-x "$event_dir/11.routing" + fi + if ocf_is_true "$OCF_RESKEY_ctdb_manages_samba" || \ + ocf_is_true "$OCF_RESKEY_ctdb_manages_winbind"; then + chmod u+x "$event_dir/50.samba" + else + chmod a-x "$event_dir/50.samba" + fi +} + +enable_event_scripts_symlink() { + # event scripts are symlinked once enabled, with the link source in... 
+ mkdir -p "$OCF_RESKEY_ctdb_config_dir/events/legacy" 2>/dev/null + + invoke_ctdb event script enable legacy 00.ctdb + + if [ -f "${OCF_RESKEY_ctdb_config_dir}/public_addresses" ]; then + invoke_ctdb event script enable legacy 10.interface + else + invoke_ctdb event script disable legacy 10.interface + fi + if [ -f "${OCF_RESKEY_ctdb_config_dir}/static-routes" ]; then + invoke_ctdb event script enable legacy 11.routing + else + invoke_ctdb event script disable legacy 11.routing + fi + + if ocf_is_true "$OCF_RESKEY_ctdb_manages_winbind"; then + invoke_ctdb event script enable legacy 49.winbind + else + invoke_ctdb event script disable legacy 49.winbind + fi + + if ocf_is_true "$OCF_RESKEY_ctdb_manages_samba"; then + invoke_ctdb event script enable legacy 50.samba + else + invoke_ctdb event script disable legacy 50.samba + fi +} + +# This function has no effect (currently no way to set CTDB_SET_*) +# but remains here in case we need it in future. +set_ctdb_variables() { + rv=$OCF_SUCCESS + set | grep ^CTDB_SET_ | cut -d_ -f3- | + while read v; do + varname=$(echo "$v" | cut -d= -f1) + value=$(echo "$v" | cut -d= -f2) + invoke_ctdb setvar "$varname" "$value" || rv=$OCF_ERR_GENERIC + done || rv=$OCF_ERR_GENERIC + return $rv +} + + +# Add necessary settings to /etc/samba/smb.conf. In a perfect world, +# we'd be able to generate a new, temporary, smb.conf file somewhere, +# something like: +# include = /etc/samba/smb.conf +# [global] +# clustering = yes +# # ...etc... +# Unfortunately, we can't do this, because there's no way to tell the +# smb init script where the temporary config is, so we just edit +# the default config file. +init_smb_conf() { + # Don't screw around with the config if CTDB isn't managing Samba! 
+ ocf_is_true "$OCF_RESKEY_ctdb_manages_samba" || return 0 + + # replace these things in smb.conf + local repl + repl='# CTDB-RA:|passdb backend|clustering|idmap backend|idmap config[[:space:]]*\*[[:space:]]*:[[:space:]]*backend|private dir|ctdbd socket' + + local private_dir + [ -n "$OCF_RESKEY_smb_private_dir" ] && private_dir="\tprivate dir = $OCF_RESKEY_smb_private_dir\n" + + local vfs_fileid + local do_vfs + do_vfs=0 + if [ -n "$OCF_RESKEY_smb_fileid_algorithm" ]; then + repl="${repl}|fileid:algorithm|fileid:mapping" + vfs_fileid="\tfileid:algorithm = $OCF_RESKEY_smb_fileid_algorithm\n" + if sed -n '/^[[:space:]]*\[global\]/,/^[[:space:]]*\[/p' $OCF_RESKEY_smb_conf | \ + grep -Eq '^[[:space:]]*vfs objects'; then + # vfs objects already specified, will append fileid to existing line + do_vfs=1 + else + vfs_fileid="$vfs_fileid\tvfs objects = fileid\n" + fi + fi + # Preserve permissions of smb.conf + local idmap_config + if grep -Eqs '^[[:space:]]*idmap backend[[:space:]]*=' $OCF_RESKEY_smb_conf; then + idmap_config=old + else + idmap_config=new + fi + cp -a "$OCF_RESKEY_smb_conf" "$OCF_RESKEY_smb_conf.$$" + awk ' + /^[[:space:]]*\[/ { global = 0 } + /^[[:space:]]*\[global\]/ { global = 1 } + { + if(global) { + if ('$do_vfs' && $0 ~ /^[[:space:]]vfs objects/ && $0 !~ /fileid/) { + print $0" fileid" + } else if ($0 !~ /^[[:space:]]*('"$repl"')/) { + print + } + } else { + print + } + }' "$OCF_RESKEY_smb_conf" | sed "/^[[:space:]]*\[global\]/ a\\ +\t# CTDB-RA: Begin auto-generated section (do not change below)\n\ +\tpassdb backend = $OCF_RESKEY_smb_passdb_backend\n\ +\tclustering = yes\n\ +\tctdbd socket = $OCF_RESKEY_ctdb_socket\n$private_dir$vfs_fileid\ +\t# CTDB-RA: End auto-generated section (do not change above)" > "$OCF_RESKEY_smb_conf.$$" + if [ "$idmap_config" = "old" ]; then + sed -i "/^[[:space:]]*clustering = yes/ a\\ +\tidmap backend = $OCF_RESKEY_smb_idmap_backend" $OCF_RESKEY_smb_conf.$$ + else + sed -i "/^[[:space:]]*clustering = yes/ a\\ +\tidmap 
config * : backend = $OCF_RESKEY_smb_idmap_backend" $OCF_RESKEY_smb_conf.$$ + fi + dd conv=notrunc,fsync of="$OCF_RESKEY_smb_conf.$$" if=/dev/null >/dev/null 2>&1 + mv "$OCF_RESKEY_smb_conf.$$" "$OCF_RESKEY_smb_conf" +} + + +# Get rid of that section we added +cleanup_smb_conf() { + ocf_is_true "$OCF_RESKEY_ctdb_manages_samba" || return 0 + + # preserve permissions of smb.conf + cp -a "$OCF_RESKEY_smb_conf" "$OCF_RESKEY_smb_conf.$$" + sed '/# CTDB-RA: Begin/,/# CTDB-RA: End/d' "$OCF_RESKEY_smb_conf" > "$OCF_RESKEY_smb_conf.$$" + mv "$OCF_RESKEY_smb_conf.$$" "$OCF_RESKEY_smb_conf" +} + +append_conf() { + local file_path="$1" + shift + [ -n "$2" ] && echo "$1=$2" >> "$file_path" +} + +generate_ctdb_config() { + local ctdb_config="$OCF_RESKEY_ctdb_config_dir/ctdb.conf" + + # Backup existing config if we're not already using an auto-generated one + grep -qa '# CTDB-RA: Auto-generated' $ctdb_config || cp -p $ctdb_config ${ctdb_config}.ctdb-ra-orig + if [ $? -ne 0 ]; then + ocf_log warn "Unable to backup $ctdb_config to ${ctdb_config}.ctdb-ra-orig" + fi + + local log_option="file:$OCF_RESKEY_ctdb_logfile" + if [ "$OCF_RESKEY_ctdb_logfile" = "syslog" ]; then + log_option="syslog" + fi + + local start_as_disabled="false" + ocf_is_true "$OCF_RESKEY_ctdb_start_as_disabled" && start_as_disabled="true" + + local dbdir_volatile="$OCF_RESKEY_ctdb_dbdir/volatile" + [ -d "$dbdir_volatile" ] || mkdir -p "$dbdir_volatile" 2>/dev/null + local dbdir_persistent="$OCF_RESKEY_ctdb_dbdir/persistent" + [ -d "$dbdir_persistent" ] || mkdir -p "$dbdir_persistent" 2>/dev/null + local dbdir_state="$OCF_RESKEY_ctdb_dbdir/state" + [ -d "$dbdir_state" ] || mkdir -p "$dbdir_state" 2>/dev/null + +cat >$ctdb_config <<EOF +# CTDB-RA: Auto-generated +[logging] + location = $log_option + log level = $OCF_RESKEY_ctdb_debuglevel +[cluster] + recovery lock = $OCF_RESKEY_ctdb_recovery_lock +[database] + volatile database directory = $dbdir_volatile + persistent database directory = $dbdir_persistent + state 
database directory = $dbdir_state +[legacy] + start as disabled = $start_as_disabled +EOF +} + +generate_event_script_options() { + local script_options="$OCF_RESKEY_ctdb_config_dir/script.options" + + # Backup existing config if we're not already using an auto-generated one + grep -qa '# CTDB-RA: Auto-generated' $script_options || cp -p $script_options ${script_options}.ctdb-ra-orig + if [ $? -ne 0 ]; then + ocf_log warn "Unable to backup $script_options to ${script_options}.ctdb-ra-orig" + fi + +cat >$script_options <<EOF +# CTDB-RA: Auto-generated +CTDB_SAMBA_SKIP_SHARE_CHECK=$(ocf_is_true "$OCF_RESKEY_ctdb_samba_skip_share_check" && echo 'yes' || echo 'no') +EOF + + append_conf "$script_options" CTDB_SERVICE_SMB $OCF_RESKEY_ctdb_service_smb + append_conf "$script_options" CTDB_SERVICE_NMB $OCF_RESKEY_ctdb_service_nmb + append_conf "$script_options" CTDB_SERVICE_WINBIND $OCF_RESKEY_ctdb_service_winbind +} + +# Generate a new, minimal CTDB config file that's just enough +# to get CTDB running as configured by the RA parameters. +generate_ctdb_sysconfig() { + # Backup existing sysconfig if we're not already using an auto-generated one + grep -qa '# CTDB-RA: Auto-generated' $CTDB_SYSCONFIG || cp -p $CTDB_SYSCONFIG $CTDB_SYSCONFIG_BACKUP + if [ $? -ne 0 ]; then + ocf_log warn "Unable to backup $CTDB_SYSCONFIG to $CTDB_SYSCONFIG_BACKUP" + fi + + ocf_log info "Generating new $CTDB_SYSCONFIG" + + # Note to maintainers and other random hackers: + # Parameters may need to be set here, for CTDB event + # scripts to pick up, or may need to be passed to ctdbd + # when starting, or both. Be careful. The CTDB source + # tree and manpages are your friends. As a concrete + # example, setting CTDB_START_AS_DISABLED here is + # completely useless, as this is actually a command line + # argument for ctdbd; it's not used anywhere else. 
+ + cat >$CTDB_SYSCONFIG <<EOF +# CTDB-RA: Auto-generated by ${0}, backup is at $CTDB_SYSCONFIG_BACKUP +CTDB_MONITOR_FREE_MEMORY=$OCF_RESKEY_ctdb_monitor_free_memory +CTDB_SAMBA_SKIP_SHARE_CHECK=$(ocf_is_true "$OCF_RESKEY_ctdb_samba_skip_share_check" && echo 'yes' || echo 'no') +CTDB_MANAGES_SAMBA=$(ocf_is_true "$OCF_RESKEY_ctdb_manages_samba" && echo 'yes' || echo 'no') +CTDB_MANAGES_WINBIND=$(ocf_is_true "$OCF_RESKEY_ctdb_manages_winbind" && echo 'yes' || echo 'no') +EOF + append_conf "$CTDB_SYSCONFIG" CTDB_SERVICE_SMB $OCF_RESKEY_ctdb_service_smb + append_conf "$CTDB_SYSCONFIG" CTDB_SERVICE_NMB $OCF_RESKEY_ctdb_service_nmb + append_conf "$CTDB_SYSCONFIG" CTDB_SERVICE_WINBIND $OCF_RESKEY_ctdb_service_winbind +} + + +invoke_ctdbd() { + local vers="$1" + + ocf_move_to_root_cgroup_if_rt_enabled + + ocf_version_cmp "$vers" "4.9.0" + if [ "$?" -ne "0" ]; then + # With 4.9+, all ctdbd binary parameters are provided as + # config settings + $OCF_RESKEY_ctdbd_binary + return + fi + + # Use logfile by default, or syslog if asked for + local log_option + # --logging supported from v4.3.0 and --logfile / --syslog support + # has been removed from newer versions + ocf_version_cmp "$vers" "4.2.14" + if [ "$?" 
-eq "2" ]; then + log_option="--logging=file:$OCF_RESKEY_ctdb_logfile" + if [ "$OCF_RESKEY_ctdb_logfile" = "syslog" ]; then + log_option="--logging=syslog" + fi + else + log_option="--logfile=$OCF_RESKEY_ctdb_logfile" + if [ "$OCF_RESKEY_ctdb_logfile" = "syslog" ]; then + log_option="--syslog" + fi + fi + + # public addresses file (should not be present, but need to set for correctness if it is) + local pub_addr_option + pub_addr_option="" + [ -f "${OCF_RESKEY_ctdb_config_dir}/public_addresses" ] && \ + pub_addr_option="--public-addresses=${OCF_RESKEY_ctdb_config_dir}/public_addresses" + # start as disabled + local start_as_disabled + start_as_disabled="--start-as-disabled" + ocf_is_true "$OCF_RESKEY_ctdb_start_as_disabled" || start_as_disabled="" + + $OCF_RESKEY_ctdbd_binary \ + --reclock="$OCF_RESKEY_ctdb_recovery_lock" \ + --nlist="$OCF_RESKEY_ctdb_config_dir/nodes" \ + --socket="$OCF_RESKEY_ctdb_socket" \ + --dbdir="$OCF_RESKEY_ctdb_dbdir" \ + --dbdir-persistent="$OCF_RESKEY_ctdb_dbdir/persistent" \ + --event-script-dir="$OCF_RESKEY_ctdb_config_dir/events.d" \ + --notification-script="$OCF_RESKEY_ctdb_config_dir/notify.sh" \ + --transport=tcp \ + $start_as_disabled $log_option $pub_addr_option \ + -d "$OCF_RESKEY_ctdb_debuglevel" +} + + +ctdb_usage() { + cat <<END +usage: $0 {start|stop|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + + +ctdb_start() { + # Do nothing if already running + ctdb_monitor && return $OCF_SUCCESS + + # Make sure config is adequate + ctdb_validate + rv=$? + [ $rv -ne 0 ] && return $rv + + # Add necessary configuration to smb.conf + init_smb_conf + if [ $? -ne 0 ]; then + ocf_exit_reason "Failed to update $OCF_RESKEY_smb_conf." + return $OCF_ERR_GENERIC + fi + + local version=$(ctdb_version) + + ocf_version_cmp "$version" "4.9.0" + if [ "$?" 
-eq "0" ]; then + # prior to 4.9, ctdbd parameters are in sysconfig or passed as + # binary arguments + generate_ctdb_sysconfig + + # prior to 4.9, event script enablement without a running + # ctdbd is done by chmoding the scripts directly + enable_event_scripts_chmod + else + # 4.9+ moves all ctdbd parameters to ctdb.conf + generate_ctdb_config + + # 4.9+ event script options are in script.options + generate_event_script_options + + # 4.9+ event scripts can be enabled with ctdb directly, which + # performs a symlink + enable_event_scripts_symlink + fi + + if [ ! -d "$(dirname $OCF_RESKEY_ctdb_logfile)" ]; then + # ensure the logfile's directory exists, otherwise ctdb will fail to start + mkdir -p $(dirname $OCF_RESKEY_ctdb_logfile) + fi + + # ensure ctdb's rundir exists, otherwise it will fail to start + mkdir -p $OCF_RESKEY_ctdb_rundir 2>/dev/null + + # set nofile ulimit for ctdbd process + if [ -n "$OCF_RESKEY_ctdb_max_open_files" ]; then + ulimit -n "$OCF_RESKEY_ctdb_max_open_files" + fi + + # Start her up + invoke_ctdbd "$version" + + if [ $? -ne 0 ]; then + # cleanup smb.conf + cleanup_smb_conf + + ocf_exit_reason "Failed to execute $OCF_RESKEY_ctdbd_binary." + return $OCF_ERR_GENERIC + else + # Wait a bit for CTDB to stabilize + # (until start times out if necessary) + while true; do + # Initial sleep is intentional (ctdb init script + # has sleep after ctdbd start, but before invoking + # ctdb to talk to it) + sleep 1 + status=$(invoke_ctdb status 2>/dev/null) + if [ $? -ne 0 ]; then + # CTDB will be running, kill it before returning + ctdb_stop + ocf_exit_reason "Can't invoke $OCF_RESKEY_ctdb_binary status" + return $OCF_ERR_GENERIC + fi + if ! echo "$status" | grep -qs 'UNHEALTHY (THIS'; then + # Status does not say this node is unhealthy, + # so we're good to go. Do a bit of final + # setup and (hopefully) return success. + set_ctdb_variables + return $? 
+ fi + done + fi + + # ctdbd will (or can) actually still be running at this point, so kill it + ctdb_stop + + ocf_exit_reason "Timeout waiting for CTDB to stabilize" + return $OCF_ERR_GENERIC +} + + +ctdb_stop() { + # Do nothing if already stopped + pkill -0 -f "$OCF_RESKEY_ctdbd_binary" || return $OCF_SUCCESS + + # Tell it to die nicely + invoke_ctdb shutdown >/dev/null 2>&1 + rv=$? + + # No more Mr. Nice Guy + count=0 + while pkill -0 -f "$OCF_RESKEY_ctdbd_binary" ; do + sleep 1 + count=$((count + 1)) + [ $count -gt 10 ] && { + ocf_log info "killing ctdbd " + pkill -9 -f "$OCF_RESKEY_ctdbd_binary" + pkill -9 -f "${OCF_RESKEY_ctdb_config_dir}/events" + } + done + + # Cleanup smb.conf + cleanup_smb_conf + + # It was a clean shutdown, return success + [ $rv -eq $OCF_SUCCESS ] && return $OCF_SUCCESS + + # Unclean shutdown, return success if there's no ctdbds left (we + # killed them forcibly, but at least they're good and dead). + pkill -0 -f "$OCF_RESKEY_ctdbd_binary" || return $OCF_SUCCESS + + # Problem: ctdb shutdown didn't work and neither did some vigorous + # kill -9ing. Only thing to do is report failure. + return $OCF_ERR_GENERIC +} + + +ctdb_monitor() { + local status + # "ctdb status" exits non-zero if CTDB isn't running. + # It can also exit non-zero if there's a timeout (ctdbd blocked, + # stalled, massive load, or otherwise wedged). If it's actually + # not running, STDERR will say "Errno:Connection refused(111)", + # whereas if it's wedged, it'll say various other unpleasant things. + status=$(invoke_ctdb status 2>&1) + if [ $? 
-ne 0 ]; then + if echo "$status" | grep -qs 'Connection refused'; then + return $OCF_NOT_RUNNING + elif echo "$status" | grep -qs 'No such file or directory'; then + return $OCF_NOT_RUNNING + elif echo $status | grep -qs 'connect() failed'; then + return $OCF_NOT_RUNNING + else + ocf_exit_reason "CTDB status call failed: $status" + return $OCF_ERR_GENERIC + fi + fi + if echo "$status" | grep -Eqs '(OK|DISABLED) \(THIS'; then + return $OCF_SUCCESS + fi + + ocf_exit_reason "CTDB status is bad: $status" + return $OCF_ERR_GENERIC +} + + +ctdb_validate() { + # Required binaries + for binary in pkill; do + check_binary $binary + done + + if ocf_is_true "$OCF_RESKEY_ctdb_manages_samba" && [ ! -f "$OCF_RESKEY_smb_conf" ]; then + ocf_exit_reason "Samba config file '$OCF_RESKEY_smb_conf' does not exist." + return $OCF_ERR_INSTALLED + fi + + if [ -f "${OCF_RESKEY_ctdb_config_dir}/public_addresses" ]; then + ocf_log info "CTDB file '${OCF_RESKEY_ctdb_config_dir}/public_addresses' exists - CTDB will try to manage IP failover!" + fi + + if [ ! -f "$OCF_RESKEY_ctdb_config_dir/nodes" ]; then + ocf_exit_reason "$OCF_RESKEY_ctdb_config_dir/nodes does not exist." + return $OCF_ERR_ARGS + fi + + if [ -z "$OCF_RESKEY_ctdb_recovery_lock" ]; then + ocf_exit_reason "ctdb_recovery_lock not specified." + return $OCF_ERR_CONFIGURED + fi + + if [ "${OCF_RESKEY_ctdb_recovery_lock:0:1}" == '!' ]; then + # '!' prefix means recovery lock is handled via a helper binary + binary="${OCF_RESKEY_ctdb_recovery_lock:1}" + binary="${binary%% *}" # trim any parameters + if [ -z "$binary" ]; then + ocf_exit_reason "ctdb_recovery_lock invalid helper" + return $OCF_ERR_CONFIGURED + fi + check_binary "${binary}" + else + lock_dir=$(dirname "$OCF_RESKEY_ctdb_recovery_lock") + touch "$lock_dir/$$" 2>/dev/null + if [ $? != 0 ]; then + ocf_exit_reason "Directory for lock file '$OCF_RESKEY_ctdb_recovery_lock' does not exist, or is not writable." 
+ return $OCF_ERR_ARGS + fi + rm "$lock_dir/$$" + fi + + return $OCF_SUCCESS +} + + +case $__OCF_ACTION in +meta-data) meta_data + exit $OCF_SUCCESS + ;; +start) ctdb_start;; +stop) ctdb_stop;; +monitor) ctdb_monitor;; +validate-all) ctdb_validate;; +usage|help) ctdb_usage + exit $OCF_SUCCESS + ;; +*) ctdb_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/ClusterMon b/heartbeat/ClusterMon new file mode 100755 index 0000000..161e309 --- /dev/null +++ b/heartbeat/ClusterMon @@ -0,0 +1,271 @@ +#!/bin/sh +# +# +# ClusterMon OCF RA. +# Starts crm_mon in background which logs cluster status as +# html to the specified file. +# +# Copyright (c) 2004 SUSE LINUX AG, Lars Marowsky-Bree +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 
#
# OCF instance parameters:
#    OCF_RESKEY_user
#    OCF_RESKEY_pidfile
#    OCF_RESKEY_update
#    OCF_RESKEY_extra_options
#    OCF_RESKEY_htmlfile

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_user_default="root"
OCF_RESKEY_update_default="15000"
OCF_RESKEY_extra_options_default=""
OCF_RESKEY_pidfile_default="${HA_RSCTMP}/ClusterMon_${OCF_RESOURCE_INSTANCE}.pid"
OCF_RESKEY_htmlfile_default="${HA_RSCTMP}/ClusterMon_${OCF_RESOURCE_INSTANCE}.html"

: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
: ${OCF_RESKEY_update=${OCF_RESKEY_update_default}}
: ${OCF_RESKEY_extra_options=${OCF_RESKEY_extra_options_default}}
: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}}
: ${OCF_RESKEY_htmlfile=${OCF_RESKEY_htmlfile_default}}

#######################################################################

# Print the agent's OCF meta-data XML on stdout.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ClusterMon" version="1.0">
<version>1.0</version>

<longdesc lang="en">
This is a ClusterMon Resource Agent.
It outputs current cluster status to the html.
</longdesc>
<shortdesc lang="en">Runs crm_mon in the background, recording the cluster status to an HTML file</shortdesc>

<parameters>

<parameter name="user" unique="0">
<longdesc lang="en">
The user we want to run crm_mon as
</longdesc>
<shortdesc lang="en">The user we want to run crm_mon as</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>

<parameter name="update" unique="0">
<longdesc lang="en">
How frequently should we update the cluster status
</longdesc>
<shortdesc lang="en">Update interval</shortdesc>
<content type="integer" default="${OCF_RESKEY_update_default}" />
</parameter>

<parameter name="extra_options" unique="0">
<longdesc lang="en">
Additional options to pass to crm_mon.  Eg. -n -r
</longdesc>
<shortdesc lang="en">Extra options</shortdesc>
<content type="string" default="${OCF_RESKEY_extra_options_default}" />
</parameter>

<parameter name="pidfile" unique="1">
<longdesc lang="en">
PID file location to ensure only one instance is running
</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="${OCF_RESKEY_pidfile_default}" />
</parameter>

<parameter name="htmlfile" unique="1" required="0">
<longdesc lang="en">
Location to write HTML output to.
</longdesc>
<shortdesc lang="en">HTML output</shortdesc>
<content type="string" default="${OCF_RESKEY_htmlfile_default}" />
</parameter>
</parameters>

<actions>
<action name="start"   timeout="20s" />
<action name="stop"    timeout="20s" />
<action name="monitor" depth="0"  timeout="20s" interval="10s" />
<action name="meta-data"  timeout="5s" />
<action name="validate-all"  timeout="30s" />
</actions>
</resource-agent>
END
}

#######################################################################

ClusterMon_usage() {
    cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Exit the agent, mapping any non-zero status in $1 to OCF_ERR_GENERIC.
ClusterMon_exit() {
    if [ "$1" != 0 ]; then
        exit $OCF_ERR_GENERIC
    else
        exit $OCF_SUCCESS
    fi
}

# Start crm_mon as a daemon (-d) writing HTML to the configured file,
# optionally through "su" when a user is configured.  Never returns.
ClusterMon_start() {
    if [ -n "$OCF_RESKEY_user" ]; then
        su - "$OCF_RESKEY_user" -c "${HA_SBIN_DIR}/crm_mon -p $OCF_RESKEY_pidfile -d -i $OCF_RESKEY_update $OCF_RESKEY_extra_options -h $OCF_RESKEY_htmlfile"
    else
        # extra_options stays unquoted on purpose: it is a list of options
        ${HA_SBIN_DIR}/crm_mon -p "$OCF_RESKEY_pidfile" -d -i "$OCF_RESKEY_update" $OCF_RESKEY_extra_options -h "$OCF_RESKEY_htmlfile"
    fi
    ClusterMon_exit $?
}

# Kill the crm_mon recorded in the pidfile (SIGKILL) and remove the pidfile.
# Always exits with success.
ClusterMon_stop() {
    if [ -f "$OCF_RESKEY_pidfile" ]; then
        pid=`cat "$OCF_RESKEY_pidfile"`
        if [ -n "$pid" ]; then
            kill -s 9 $pid
            rm -f "$OCF_RESKEY_pidfile"
        fi
    fi
    ClusterMon_exit 0
}

# Exit OCF_SUCCESS if the pid from the pidfile accepts signal 0,
# OCF_NOT_RUNNING otherwise.
ClusterMon_monitor() {
    if [ -f "$OCF_RESKEY_pidfile" ]; then
        pid=`cat "$OCF_RESKEY_pidfile"`
        [ "$pid" ] && kill -s 0 $pid &&
            exit $OCF_SUCCESS
    fi
    exit $OCF_NOT_RUNNING
}

# Check the words of extra_options (passed as "$@") against the crm_mon
# options this agent accepts.  Warns about -i/-h/-p, which duplicate agent
# parameters.  Succeeds only if every word was consumed as an option.
CheckOptions() {
    # getopts keeps its position in OPTIND; start from the first argument
    OPTIND=1
    while getopts Vi:nrh:cdp: OPTION
    do
        case $OPTION in
        V|n|r|c|d);;
        i)  ocf_log warn "You should not have specified the -i option, since OCF_RESKEY_update is set already!";;
        h)  ocf_log warn "You should not have specified the -h option, since OCF_RESKEY_htmlfile is set already!";;
        p)  ocf_log warn "You should not have specified the -p option, since OCF_RESKEY_pidfile is set already!";;
        *)  return $OCF_ERR_ARGS;;
        esac
    done

    if [ $? -ne 0 ]; then
        return $OCF_ERR_ARGS
    fi

    # We should have eaten all options at this stage
    shift $(($OPTIND -1))
    [ $# -eq 0 ]
}

# Validate user, pidfile, update interval, extra options and htmlfile.
# Exits with OCF_ERR_ARGS on a hard error, returns OCF_SUCCESS otherwise.
ClusterMon_validate() {
# Existence of the user
    if [ -n "$OCF_RESKEY_user" ]; then
        getent passwd "$OCF_RESKEY_user" >/dev/null
        if [ $? -eq 0 ]; then
            : Yes, user exists. We can further check his permission on crm_mon if necessary
        else
            ocf_log err "The user $OCF_RESKEY_user does not exist!"
            exit $OCF_ERR_ARGS
        fi
    fi

# Pidfile better be an absolute path
    case $OCF_RESKEY_pidfile in
        /*) ;;
        *) ocf_log warn "You should have pidfile($OCF_RESKEY_pidfile) of absolute path!" ;;
    esac

# Check the update interval
    if ocf_is_decimal "$OCF_RESKEY_update" && [ "$OCF_RESKEY_update" -gt 0 ]; then
        :
    else
        ocf_log err "Invalid update interval $OCF_RESKEY_update. It should be positive integer!"
        exit $OCF_ERR_ARGS
    fi

    if CheckOptions $OCF_RESKEY_extra_options; then
        :
    else
        ocf_log err "Invalid options $OCF_RESKEY_extra_options!"
        exit $OCF_ERR_ARGS
    fi

# Htmlfile better be an absolute path
    case $OCF_RESKEY_htmlfile in
        /*) ;;
        *) ocf_log warn "You should have htmlfile($OCF_RESKEY_htmlfile) of absolute path!" ;;
    esac


    echo "Validate OK"
    return $OCF_SUCCESS
}

if [ $# -ne 1 ]; then
    ClusterMon_usage
    exit $OCF_ERR_ARGS
fi

# The update parameter is divided by 1000 before being handed to crm_mon -i.
# Only convert well-formed values: a malformed one is left untouched so that
# validate-all can report what the user actually configured.
if ocf_is_decimal "$OCF_RESKEY_update"; then
    OCF_RESKEY_update=`expr "$OCF_RESKEY_update" / 1000`
fi

case $__OCF_ACTION in
meta-data)      meta_data
                exit $OCF_SUCCESS
                ;;
start)          ClusterMon_start
                ;;
stop)           ClusterMon_stop
                ;;
monitor)        ClusterMon_monitor
                ;;
validate-all)   ClusterMon_validate
                ;;
usage|help)     ClusterMon_usage
                exit $OCF_SUCCESS
                ;;
*)              ClusterMon_usage
                exit $OCF_ERR_UNIMPLEMENTED
                ;;
esac

exit $?
diff --git a/heartbeat/Delay b/heartbeat/Delay new file mode 100755 index 0000000..5aa8f46 --- /dev/null +++ b/heartbeat/Delay @@ -0,0 +1,227 @@ +#!/bin/sh +# +# +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# +# This script is a test resource for introducing delay. +# +# usage: $0 {start|stop|status|monitor|meta-data} +# +# OCF parameters are as below: +# OCF_RESKEY_startdelay +# OCF_RESKEY_stopdelay +# OCF_RESKEY_mondelay +# +# +# OCF_RESKEY_startdelay defaults to 20 (seconds) +# OCF_RESKEY_stopdelay defaults to $OCF_RESKEY_startdelay +# OCF_RESKEY_mondelay defaults to $OCF_RESKEY_startdelay +# +# +# This is really a test resource script.
+# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_startdelay_default="20" +OCF_RESKEY_stopdelay_default="30" +OCF_RESKEY_mondelay_default="30" + +: ${OCF_RESKEY_startdelay=${OCF_RESKEY_startdelay_default}} +: ${OCF_RESKEY_stopdelay=${OCF_RESKEY_stopdelay_default}} +: ${OCF_RESKEY_mondelay=${OCF_RESKEY_mondelay_default}} + +####################################################################### + +usage() { + cat <<-! + usage: $0 {start|stop|status|monitor|meta-data|validate-all} + ! +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="Delay" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This script is a test resource for introducing delay. +</longdesc> +<shortdesc lang="en">Waits for a defined timespan</shortdesc> + +<parameters> + +<parameter name="startdelay" unique="0" required="0"> +<longdesc lang="en"> +How long in seconds to delay on start operation. +</longdesc> +<shortdesc lang="en">Start delay</shortdesc> +<content type="integer" default="${OCF_RESKEY_startdelay_default}" /> +</parameter> + +<parameter name="stopdelay" unique="0" required="0"> +<longdesc lang="en"> +How long in seconds to delay on stop operation. +</longdesc> +<shortdesc lang="en">Stop delay</shortdesc> +<content type="integer" default="${OCF_RESKEY_stopdelay_default}" /> +</parameter> + +<parameter name="mondelay" unique="0" required="0"> +<longdesc lang="en"> +How long in seconds to delay on monitor operation. 
+</longdesc> +<shortdesc lang="en">Monitor delay</shortdesc> +<content type="integer" default="${OCF_RESKEY_mondelay_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="30s" /> +<action name="stop" timeout="40s" /> +<action name="status" depth="0" timeout="40s" interval="10s" /> +<action name="monitor" depth="0" timeout="40s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +END +} + +Delay_stat() { + ha_pseudo_resource Delay_${OCF_RESOURCE_INSTANCE} monitor +} + +Delay_Status() { + if + Delay_stat + then + ocf_log info "Delay is running OK" + return $OCF_SUCCESS + else + ocf_log info "Delay is stopped" + return $OCF_NOT_RUNNING + fi +} + +Delay_Monitor() { + Delay_Validate_All -q + sleep $OCF_RESKEY_mondelay + Delay_Status +} + +Delay_Start() { + if + Delay_stat + then + ocf_log info "Delay already running." + return $OCF_SUCCESS + else + Delay_Validate_All -q + ha_pseudo_resource Delay_${OCF_RESOURCE_INSTANCE} start + rc=$? + sleep $OCF_RESKEY_startdelay + if + [ $rc -ne 0 ] + then + return $OCF_ERR_PERM + fi + return $OCF_SUCCESS + fi +} + +Delay_Stop() { + if + Delay_stat + then + Delay_Validate_All -q + ha_pseudo_resource Delay_${OCF_RESOURCE_INSTANCE} stop + rc=$? + sleep $OCF_RESKEY_stopdelay + if + [ $rc -ne 0 ] + then + return $OCF_ERR_PERM + fi + return $OCF_SUCCESS + else + ocf_log info "Delay already stopped." + return $OCF_SUCCESS + fi +} + +# Check if all the arguments are valid numbers, a string is considered valid if: +# 1. It does not contain any character but digits and period "."; +# 2. The period "." does not occur more than once + +Are_Valid_Numbers() { + for i in "$@"; do + echo $i |grep -v "[^0-9.]" |grep -q -v "[.].*[.]" + if test $? 
-ne 0; then + return $OCF_ERR_ARGS + fi + done + return $OCF_SUCCESS +} + +Delay_Validate_All() { +# Be quiet when specified -q option _and_ validation succeded + getopts "q" option + + if test $option = "q"; then + quiet=yes + else + quiet=no + fi + shift $(($OPTIND -1)) + + if Are_Valid_Numbers $OCF_RESKEY_startdelay $OCF_RESKEY_stopdelay \ + $OCF_RESKEY_mondelay; then + if test $quiet = "no"; then + echo "Validate OK" + fi +# _Return_ on validation success + return $OCF_SUCCESS + else + ocf_exit_reason "Some of the instance parameters are invalid" +# _Exit_ on validation failure + exit $OCF_ERR_ARGS + fi +} + +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi + +case $1 in + meta-data) meta_data + exit $OCF_SUCCESS + ;; + start) Delay_Start + ;; + stop) Delay_Stop + ;; + monitor) Delay_Monitor + ;; + status) Delay_Status + ;; + validate-all) Delay_Validate_All + ;; + usage) usage + exit $OCF_SUCCESS + ;; + *) usage + exit $OCF_ERR_ARGS + ;; +esac +exit $? diff --git a/heartbeat/Dummy b/heartbeat/Dummy new file mode 100755 index 0000000..81a675d --- /dev/null +++ b/heartbeat/Dummy @@ -0,0 +1,186 @@ +#!/bin/sh +# +# +# Dummy OCF RA. Does nothing except track its own state. +# Use it only as a testing tool or example for how to write +# a resource agent. +# +# Copyright (c) 2004 SUSE LINUX AG, Lars Marowsky-Bree +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. 
Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +#

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_state_default="${HA_RSCTMP}/Dummy-${OCF_RESOURCE_INSTANCE}.state"
OCF_RESKEY_fake_default="dummy"

: ${OCF_RESKEY_state=${OCF_RESKEY_state_default}}
: ${OCF_RESKEY_fake=${OCF_RESKEY_fake_default}}

#######################################################################

# Print the agent's OCF meta-data XML on stdout.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="Dummy" version="1.0">
<version>1.0</version>

<longdesc lang="en">
This is a Dummy Resource Agent. It does absolutely nothing except 
keep track of whether its running or not.
Its purpose in life is for testing and to serve as a template for RA writers.

NB: Please pay attention to the timeouts specified in the actions
section below. They should be meaningful for the kind of resource
the agent manages. They should be the minimum advised timeouts,
but they shouldn't/cannot cover _all_ possible resource
instances. So, try to be neither overly generous nor too stingy,
but moderate. The minimum timeouts should never be below 10 seconds.
</longdesc>
<shortdesc lang="en">Example stateless resource agent</shortdesc>

<parameters>
<parameter name="state" unique="1">
<longdesc lang="en">
Location to store the resource state in.
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${OCF_RESKEY_state_default}" />
</parameter>

<parameter name="fake" unique="0">
<longdesc lang="en">
Fake attribute that can be changed to cause a reload
</longdesc>
<shortdesc lang="en">Fake attribute that can be changed to cause a reload</shortdesc>
<content type="string" default="${OCF_RESKEY_fake_default}" />
</parameter>

</parameters>

<actions>
<action name="start"        timeout="20s" />
<action name="stop"         timeout="20s" />
<action name="monitor"      timeout="20s" interval="10s" depth="0" />
<action name="reload"       timeout="20s" />
<action name="migrate_to"   timeout="20s" />
<action name="migrate_from" timeout="20s" />
<action name="meta-data"    timeout="5s" />
<action name="validate-all"   timeout="20s" />
</actions>
</resource-agent>
END
}

#######################################################################

dummy_usage() {
    cat <<END
usage: $0 {start|stop|monitor|reload|migrate_to|migrate_from|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Start: create the state file unless the resource is already running.
# Returns OCF_SUCCESS when already running, else the status of touch.
dummy_start() {
    dummy_monitor
    if [ $? = $OCF_SUCCESS ]; then
        return $OCF_SUCCESS
    fi
    touch "${OCF_RESKEY_state}"
}

# Stop: remove the state file if the resource is running.  Always succeeds.
dummy_stop() {
    dummy_monitor
    if [ $? = $OCF_SUCCESS ]; then
        # -f: no error message if the file vanished since the monitor call
        rm -f "${OCF_RESKEY_state}"
    fi
    return $OCF_SUCCESS
}

dummy_monitor() {
    # Monitor _MUST!_ differentiate correctly between running
    # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
    # That is THREE states, not just yes/no.

    if [ -f "${OCF_RESKEY_state}" ]; then
        return $OCF_SUCCESS
    fi
    # Placeholder for the "failed" state: the condition is literally false,
    # so this branch is never taken.
    if false ; then
        return $OCF_ERR_GENERIC
    fi

    if ! ocf_is_probe && [ "$__OCF_ACTION" = "monitor" ]; then
        # set exit string only when NOT_RUNNING occurs during an actual monitor operation.
        ocf_exit_reason "No process state file found"
    fi
    return $OCF_NOT_RUNNING
}

# Validate: check that the directory holding the state file is writable by
# creating and removing a scratch file named after our PID.
dummy_validate() {

    # Is the state directory writable?
    state_dir=`dirname "$OCF_RESKEY_state"`
    touch "$state_dir/$$"
    if [ $? != 0 ]; then
        ocf_exit_reason "State file \"$OCF_RESKEY_state\" is not writable"
        return $OCF_ERR_ARGS
    fi
    rm "$state_dir/$$"

    return $OCF_SUCCESS
}

case $__OCF_ACTION in
meta-data)      meta_data
                exit $OCF_SUCCESS
                ;;
start)          dummy_start;;
stop)           dummy_stop;;
monitor)        dummy_monitor;;
migrate_to)     ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} to ${OCF_RESKEY_CRM_meta_migrate_target}."
                dummy_stop
                ;;
migrate_from)   ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} from ${OCF_RESKEY_CRM_meta_migrate_source}."
                dummy_start
                ;;
reload)         ocf_log info "Reloading ${OCF_RESOURCE_INSTANCE} ..."
                ;;
validate-all)   dummy_validate;;
usage|help)     dummy_usage
                exit $OCF_SUCCESS
                ;;
*)              dummy_usage
                exit $OCF_ERR_UNIMPLEMENTED
                ;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc

diff --git a/heartbeat/EvmsSCC b/heartbeat/EvmsSCC new file mode 100755 index 0000000..a691138 --- /dev/null +++ b/heartbeat/EvmsSCC @@ -0,0 +1,222 @@ +#!/bin/sh +# +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# +# EvmsSCC +# Description: Runs evms_activate in a heartbeat cluster to activate a +# EVMS shared cluster container in the cluster. +# Original Author: Jo De Baer (jdebaer@novell.com) +# Original Release: 06 Nov 2006 +# +# usage: ./EvmsSCC {start|stop|status|monitor|meta-data} +# +# The goal of this resource agent is to provoke the creation of device file +# in /dev/emvs which correspond to EVMS2 volumes that reside in a EVMS2 shared +# cluster container. As such it should be run as a clone resource in the +# cluster. Logic inside the resource agent will make sure that "evms_activate" +# is run on only one node in the cluster, both at cluster startup time as well +# as when a node joins the cluster. +# +# Typically, resources that need to mount EVMS2 volumes should run after this +# resource agent has finished it's run.
As such those resources should be made +# "dependent" on this resource agent by the cluster administrator. An example +# of resources that should depend on this resource agent are Filesystem resource +# agent that mount OCFS2 volumes that reside on EVMS2 volumes in a shared +# EVMS2 cluster container. +# +# For this resource agent to do it's job correctly, evmsd must be running on +# the node where the agent is started. Usually evmsd is started by the cluster +# software via a respawn statement in /etc/ha.d/ha.cf. If you encounter timing +# issues where evmsd is not yet started but where the cluster already starts +# the EvmsSCC clone, then you should comment out the evmsd respawn statement +# in /etc/ha.d/ha.cf and start evmsd on each node in the cluster via a separate +# clone resource agent. The EvmsSCC resource agent cloneset should then be made +# dependent to this evmsd cloneset. This will guarantee that emvsd is running +# before EvmsSCC is started, on each node in the cluster. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_ignore_deprecation_default="false" + +: ${OCF_RESKEY_ignore_deprecation=${OCF_RESKEY_ignore_deprecation_default}} + +####################################################################### + + +# Utilities used by this script +CUT=cut +EVMSACTIVATE=evms_activate + +usage() { + cat <<-EOT + usage: $0 {start|stop|status|monitor|meta-data} + EOT +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="EvmsSCC" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Deprecation warning: EVMS is no longer actively maintained and should not be used. This agent is deprecated and may be removed from a future release. -- +Resource script for EVMS shared cluster container. 
It runs evms_activate on one node in the cluster. +</longdesc> +<shortdesc lang="en">Manages EVMS Shared Cluster Containers (SCCs) (deprecated)</shortdesc> + +<parameters> +<parameter name="ignore_deprecation"> +<longdesc lang="en"> +If set to true, suppresses the deprecation warning for this agent. +</longdesc> +<shortdesc lang="en">Suppress deprecation warning</shortdesc> +<content type="boolean" default="${OCF_RESKEY_ignore_deprecation_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="60s" /> +<action name="stop" timeout="60s" /> +<action name="notify" timeout="60s" /> +<action name="status" depth="0" timeout="10s" interval="10s" /> +<action name="monitor" depth="0" timeout="10s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +EvmsSCC_status() +{ + # At the moment we don't support monitoring EVMS activations. We just return "not running" to cope with the pre-start monitor call. + return $OCF_NOT_RUNNING +} + +EvmsSCC_notify() +{ + local n_type="$OCF_RESKEY_CRM_meta_notify_type" + local n_op="$OCF_RESKEY_CRM_meta_notify_operation" + local n_active="$OCF_RESKEY_CRM_meta_notify_active_uname" + local n_stop="$OCF_RESKEY_CRM_meta_notify_stop_uname" + local n_start="$OCF_RESKEY_CRM_meta_notify_start_uname" + + case "$n_type" in + pre) + case "$n_op" in + start) ocf_log debug "EvmsSCC: Notify: Starting node(s): $n_start." + EvmsSCC_start_notify_common + ;; + esac + ;; + esac + + + return $OCF_SUCCESS +} + +EvmsSCC_start() +{ + local n_type="$OCF_RESKEY_CRM_meta_notify_type" + local n_op="$OCF_RESKEY_CRM_meta_notify_operation" + local n_active="$OCF_RESKEY_CRM_meta_notify_active_uname" + local n_stop="$OCF_RESKEY_CRM_meta_notify_stop_uname" + local n_start="$OCF_RESKEY_CRM_meta_notify_start_uname" + + ocf_log debug "EvmsSCC: Start: starting node(s): $n_start." 
+ + EvmsSCC_start_notify_common + + return $OCF_SUCCESS +} + +EvmsSCC_stop() +{ + return $OCF_SUCCESS +} + +EvmsSCC_start_notify_common() +{ + local n_myself=${HA_CURHOST:-$(uname -n | tr A-Z a-z)} + ocf_log debug "EvmsSCC: Start_Notify: I am node $n_myself." + + n_active="$n_active $n_start" + case " $n_active " in + *" $n_myself "*) ;; + *) ocf_log err "EvmsSCC: $n_myself (local) not on active list!" + return $OCF_ERR_GENERIC + ;; + esac + + #pick the first node from the starting list + #when the cluster boots this will be one of the many booting nodes + #when a node later joins the cluster, this will be the joining node + local n_first=$(echo $n_start | cut -d ' ' -f 1) + ocf_log debug "EvmsSCC: Start_Notify: First node in starting list is $n_first." + + if [ "$n_myself" = "$n_first" ] ; then + ocf_log debug "EvmsSCC: Start_Notify: I am running ${EVMSACTIVATE}." + while true ; do + if ! ${EVMSACTIVATE} -q 2> /dev/null ; then + SLEEP_TIME=$(($(ocf_maybe_random) % 40)) + ocf_log info "EvmsSCC: Evms call failed - sleeping for $SLEEP_TIME seconds and then trying again." + sleep $SLEEP_TIME + else + break + fi + done + + fi + + return $OCF_SUCCESS +} + +# Check the arguments passed to this script +if + [ $# -ne 1 ] +then + usage + exit $OCF_ERR_ARGS +fi + +OP=$1 + +case $OP in + meta-data) meta_data + exit $OCF_SUCCESS + ;; + usage) usage + exit $OCF_SUCCESS + ;; +esac + +# Be obnoxious, log deprecation warning on every invocation (unless +# suppressed by resource configuration). +ocf_deprecated + +check_binary $CUT +check_binary $EVMSACTIVATE + +case $OP in + start) EvmsSCC_start + ;; + notify) EvmsSCC_notify + ;; + stop) EvmsSCC_stop + ;; + status|monitor) EvmsSCC_status + ;; + *) usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +exit $? diff --git a/heartbeat/Evmsd b/heartbeat/Evmsd new file mode 100755 index 0000000..6e30eae --- /dev/null +++ b/heartbeat/Evmsd @@ -0,0 +1,161 @@ +#!/bin/sh +# +# Evmsd OCF RA. 
+#
+# Copyright (c) 2004 SUSE LINUX AG, Jo De Baer
+# All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like. Any license provided herein, whether implied or
+# otherwise, applies only to this software file. Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# Parameter defaults
+
+OCF_RESKEY_ignore_deprecation_default="false"
+
+: ${OCF_RESKEY_ignore_deprecation=${OCF_RESKEY_ignore_deprecation_default}}
+
+#######################################################################
+
+# meta_data: print this agent's resource-agent XML description on stdout.
+# The heredoc is unquoted so the ignore_deprecation default is interpolated.
+meta_data() {
+    cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="Evmsd" version="1.0">
+<version>1.0</version>
+
+<longdesc lang="en">
+Deprecation warning: EVMS is no longer actively maintained and should not be used. This agent is deprecated and may be removed from a future release. --
+This is a Evmsd Resource Agent.
+</longdesc>
+<shortdesc lang="en">Controls clustered EVMS volume management
+(deprecated)</shortdesc>
+
+<parameters>
+<parameter name="ignore_deprecation">
+<longdesc lang="en">
+If set to true, suppresses the deprecation warning for this agent.
+</longdesc>
+<shortdesc lang="en">Suppress deprecation warning</shortdesc>
+<content type="boolean" default="${OCF_RESKEY_ignore_deprecation_default}" />
+</parameter>
+</parameters>
+
+<actions>
+<action name="start" timeout="20s" />
+<action name="stop" timeout="20s" />
+<action name="monitor" timeout="20s" interval="10s" depth="0" />
+<action name="meta-data" timeout="5s" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+
+# evmsd_usage: print the list of supported actions on stdout.
+evmsd_usage() {
+    cat <<END
+usage: $0 {start|stop|monitor|meta-data}
+
+Expects to have a fully populated OCF RA-compliant environment set.
+END
+}
+
+# evmsd_start: launch /sbin/evmsd in the background unless a matching
+# process already exists, then wait until monitor sees it.
+# Returns OCF_SUCCESS once the daemon is seen, OCF_ERR_INSTALLED when
+# the binary is missing. There is no internal timeout.
+evmsd_start() {
+    if evmsd_monitor ; then
+        # already running
+        return $OCF_SUCCESS
+    fi
+
+    # Without this check a missing binary would make the wait loop below
+    # spin until the operation timeout instead of failing immediately.
+    if [ ! -x /sbin/evmsd ]; then
+        ocf_exit_reason "/sbin/evmsd is missing or not executable"
+        return $OCF_ERR_INSTALLED
+    fi
+
+    nohup /sbin/evmsd &
+    # Spin waiting for the server to come up.
+    # Let the CRM/LRM time us out if required
+    until evmsd_monitor ; do
+        sleep 1
+    done
+    # One extra second after the process shows up, kept from the original;
+    # presumably lets the daemon finish initialising -- TODO confirm.
+    sleep 1
+    return $OCF_SUCCESS
+}
+
+# evmsd_stop: terminate every evmsd process. SIGTERM first, then SIGKILL
+# one second later for whatever is still alive, then wait until monitor
+# no longer finds a process. Returns OCF_SUCCESS; no internal timeout.
+evmsd_stop() {
+    local pids
+
+    pids=$(pgrep evmsd)
+    if [ -z "$pids" ] ; then
+        # not running
+        return $OCF_SUCCESS
+    fi
+
+    # $pids is intentionally unquoted: pgrep prints one PID per line and
+    # each must reach kill as a separate argument.
+    /bin/kill -15 $pids
+    sleep 1
+
+    # Escalate only for survivors; signalling PIDs that already exited
+    # just produces errors.
+    pids=$(pgrep evmsd)
+    if [ -n "$pids" ] ; then
+        /bin/kill -9 $pids
+    fi
+
+    # Spin waiting for the server to go down.
+    # Let the CRM/LRM time us out if required
+    while evmsd_monitor ; do
+        sleep 1
+    done
+    return $OCF_SUCCESS
+}
+
+# evmsd_monitor: OCF_SUCCESS if "pgrep evmsd" finds at least one process,
+# OCF_NOT_RUNNING otherwise.
+evmsd_monitor() {
+    local pids
+
+    pids=$(pgrep evmsd)
+    # Quoted: with several PIDs an unquoted test gets too many arguments.
+    if [ -z "$pids" ] ; then
+        return $OCF_NOT_RUNNING
+    fi
+    return $OCF_SUCCESS
+}
+
+# meta-data must work without triggering the deprecation warning.
+if [ "$__OCF_ACTION" = "meta-data" ]; then
+    meta_data
+    exit $OCF_SUCCESS
+fi
+
+# Be obnoxious, log deprecation warning on every invocation (unless
+# suppressed by resource configuration).
+ocf_deprecated
+
+case $__OCF_ACTION in
+start)          evmsd_start;;
+stop)           evmsd_stop;;
+monitor)        evmsd_monitor;;
+usage|help)     evmsd_usage
+                exit $OCF_SUCCESS
+                ;;
+*)              evmsd_usage
+                exit $OCF_ERR_UNIMPLEMENTED
+                ;;
+esac
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc
+
diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
new file mode 100755
index 0000000..0665628
--- /dev/null
+++ b/heartbeat/Filesystem
@@ -0,0 +1,1128 @@
+#!/bin/sh
+#
+# Support: users@clusterlabs.org
+# License: GNU General Public License (GPL)
+#
+# Filesystem
+# Description: Manages a Filesystem on a shared storage medium.
+# Original Author: Eric Z. Ayers (eric.ayers@compgen.com)
+# Original Release: 25 Oct 2000
+#
+# usage: ./Filesystem {start|stop|status|monitor|validate-all|meta-data}
+#
+# OCF parameters are as below:
+# OCF_RESKEY_device
+# OCF_RESKEY_directory
+# OCF_RESKEY_fstype
+# OCF_RESKEY_options
+# OCF_RESKEY_statusfile_prefix
+# OCF_RESKEY_run_fsck
+# OCF_RESKEY_fast_stop
+# OCF_RESKEY_force_clones
+#
+#OCF_RESKEY_device : name of block device for the filesystem. e.g. /dev/sda1, /dev/md0
+# Or a -U or -L option for mount, or an NFS mount specification
+#OCF_RESKEY_directory : the mount point for the filesystem
+#OCF_RESKEY_fstype : optional name of the filesystem type. e.g. ext2
+#OCF_RESKEY_options : options to be given to the mount command via -o
+#OCF_RESKEY_statusfile_prefix : the prefix used for a status file for monitoring
+#OCF_RESKEY_run_fsck : fsck execution mode: auto(default)/force/no
+#OCF_RESKEY_fast_stop : fast stop: yes(default)/no
+#OCF_RESKEY_force_clones : allow running the resource as clone. e.g. local xfs mounts
+# for each brick in a glusterfs setup
+#
+#
+# This assumes you want to manage a filesystem on a shared (SCSI) bus,
+# on a replicated device (such as DRBD), or a network filesystem (such
+# as NFS or Samba).
+#
+# Do not put this filesystem in /etc/fstab. This script manages all of
+# that for you.
+# +# NOTE: If 2 or more nodes mount the same file system read-write, and +# that file system is not designed for that specific purpose +# (such as GFS or OCFS2), and is not a network file system like +# NFS or Samba, then the filesystem is going to become +# corrupted. +# +# As a result, you should use this together with the stonith +# option and redundant, independent communications paths. +# +# If you don't do this, don't blame us when you scramble your +# disk. + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +DFLT_STATUSDIR=".Filesystem_status/" + +# Parameter defaults + +OCF_RESKEY_device_default="" +OCF_RESKEY_directory_default="" +OCF_RESKEY_fstype_default="" +OCF_RESKEY_options_default="" +OCF_RESKEY_statusfile_prefix_default="${DFLT_STATUSDIR}" +OCF_RESKEY_run_fsck_default="auto" +OCF_RESKEY_fast_stop_default="no" +OCF_RESKEY_force_clones_default="false" +OCF_RESKEY_force_unmount_default="true" +OCF_RESKEY_term_signals_default="TERM" +OCF_RESKEY_kill_signals_default="KILL" +OCF_RESKEY_signal_delay_default="1" + +# RHEL specific defaults +if is_redhat_based; then + get_os_ver + ocf_version_cmp "$VER" "9.0" 2>/dev/null + + case "$?" 
in + # RHEL >= 9 + 1|2) + OCF_RESKEY_force_unmount_default="safe";; + # RHEL < 9 and fallback if ocf_version_cmp() fails + *) + OCF_RESKEY_fast_stop_default="yes";; + esac +fi + + +: ${OCF_RESKEY_device=${OCF_RESKEY_device_default}} +: ${OCF_RESKEY_directory=${OCF_RESKEY_directory_default}} +: ${OCF_RESKEY_fstype=${OCF_RESKEY_fstype_default}} +: ${OCF_RESKEY_options=${OCF_RESKEY_options_default}} +: ${OCF_RESKEY_statusfile_prefix=${OCF_RESKEY_statusfile_prefix_default}} +: ${OCF_RESKEY_run_fsck=${OCF_RESKEY_run_fsck_default}} +if [ -z "${OCF_RESKEY_fast_stop}" ]; then + case "$OCF_RESKEY_fstype" in + gfs2) + OCF_RESKEY_fast_stop="no";; + *) + OCF_RESKEY_fast_stop=${OCF_RESKEY_fast_stop_default};; + esac +fi +: ${OCF_RESKEY_force_clones=${OCF_RESKEY_force_clones_default}} +: ${OCF_RESKEY_force_unmount=${OCF_RESKEY_force_unmount_default}} +: ${OCF_RESKEY_term_signals=${OCF_RESKEY_term_signals_default}} +: ${OCF_RESKEY_kill_signals=${OCF_RESKEY_kill_signals_default}} +: ${OCF_RESKEY_signal_delay=${OCF_RESKEY_signal_delay_default}} + +# Variables used by multiple methods +HOSTOS=$(uname) +TAB=' ' + +# The status file is going to an extra directory, by default +# +prefix=${OCF_RESKEY_statusfile_prefix} +: ${prefix:=$DFLT_STATUSDIR} +suffix="${OCF_RESOURCE_INSTANCE}" +[ "$OCF_RESKEY_CRM_meta_clone" ] && + suffix="${suffix}_$OCF_RESKEY_CRM_meta_clone" +suffix="${suffix}_$(uname -n)" +STATUSFILE="${OCF_RESKEY_directory}/$prefix$suffix" + +####################################################################### + +usage() { + cat <<-EOT + usage: $0 {start|stop|status|monitor|validate-all|meta-data} + EOT +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="Filesystem" version="1.1"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for Filesystem. It manages a Filesystem on a +shared storage medium. 
+ +The standard monitor operation of depth 0 (also known as probe) +checks if the filesystem is mounted. If you want deeper tests, +set OCF_CHECK_LEVEL to one of the following values: + +10: read first 16 blocks of the device (raw read) + +This doesn't exercise the filesystem at all, but the device on +which the filesystem lives. This is noop for non-block devices +such as NFS, SMBFS, or bind mounts. + +20: test if a status file can be written and read + +The status file must be writable by root. This is not always the +case with an NFS mount, as NFS exports usually have the +"root_squash" option set. In such a setup, you must either use +read-only monitoring (depth=10), export with "no_root_squash" on +your NFS server, or grant world write permissions on the +directory where the status file is to be placed. +</longdesc> +<shortdesc lang="en">Manages filesystem mounts</shortdesc> + +<parameters> +<parameter name="device" required="1"> +<longdesc lang="en"> +The name of block device for the filesystem, or -U, -L options for mount, or NFS mount specification. + +NOTE: On Linux /dev/disk/by-{uuid,label}/ are preferred to -U/-L. +</longdesc> +<shortdesc lang="en">block device</shortdesc> +<content type="string" default="${OCF_RESKEY_device_default}" /> +</parameter> + +<parameter name="directory" required="1"> +<longdesc lang="en"> +The mount point for the filesystem. +</longdesc> +<shortdesc lang="en">mount point</shortdesc> +<content type="string" default="${OCF_RESKEY_directory_default}" /> +</parameter> + +<parameter name="fstype" required="1"> +<longdesc lang="en"> +The type of filesystem to be mounted. +</longdesc> +<shortdesc lang="en">filesystem type</shortdesc> +<content type="string" default="${OCF_RESKEY_fstype_default}" /> +</parameter> + +<parameter name="options"> +<longdesc lang="en"> +Any extra options to be given as -o options to mount. + +For bind mounts, add "bind" here and set fstype to "none". 
+We will do the right thing for options such as "bind,ro". +</longdesc> +<shortdesc lang="en">options</shortdesc> +<content type="string" default="${OCF_RESKEY_options_default}" /> +</parameter> + +<parameter name="statusfile_prefix"> +<longdesc lang="en"> +The prefix to be used for a status file for resource monitoring +with depth 20. If you don't specify this parameter, all status +files will be created in a separate directory. +</longdesc> +<shortdesc lang="en">status file prefix</shortdesc> +<content type="string" default="${OCF_RESKEY_statusfile_prefix_default}" /> +</parameter> + +<parameter name="run_fsck"> +<longdesc lang="en"> +Specify how to decide whether to run fsck or not. + +"auto" : decide to run fsck depending on the fstype(default) +"force" : always run fsck regardless of the fstype +"no" : do not run fsck ever. +</longdesc> +<shortdesc lang="en">run_fsck</shortdesc> +<content type="string" default="${OCF_RESKEY_run_fsck_default}" /> +</parameter> + +<parameter name="fast_stop"> +<longdesc lang="en"> +Normally, we expect no users of the filesystem and the stop +operation to finish quickly. If you cannot control the filesystem +users easily and want to prevent the stop action from failing, +then set this parameter to "no" and add an appropriate timeout +for the stop operation. + +This defaults to "no" for GFS2 filesystems. +</longdesc> +<shortdesc lang="en">fast stop</shortdesc> +<content type="boolean" default="${OCF_RESKEY_fast_stop_default}" /> +</parameter> + +<parameter name="force_clones"> +<longdesc lang="en"> +The use of a clone setup for local filesystems is forbidden +by default. For special setups like glusterfs, cloning a mount +of a local device with a filesystem like ext4 or xfs independently +on several nodes is a valid use case. + +Only set this to "true" if you know what you are doing! 
+</longdesc> +<shortdesc lang="en">allow running as a clone, regardless of filesystem type</shortdesc> +<content type="boolean" default="${OCF_RESKEY_force_clones_default}" /> +</parameter> + +<parameter name="force_unmount"> +<longdesc lang="en"> +This option allows specifying how to handle processes that are +currently accessing the mount directory. + +"true" : Kill processes accessing mount point +"safe" : Kill processes accessing mount point using methods that + avoid functions that could potentially block during process + detection +"false" : Do not kill any processes. + +The 'safe' option uses shell logic to walk the /procs/ directory +for pids using the mount point while the default option uses the +fuser cli tool. fuser is known to perform operations that can potentially +block if unresponsive nfs mounts are in use on the system. +</longdesc> +<shortdesc lang="en">Kill processes before unmount</shortdesc> +<content type="string" default="${OCF_RESKEY_force_unmount_default}" /> +</parameter> + +<parameter name="term_signals"> +<longdesc lang="en"> +Signals (names or numbers, whitespace separated) to send processes during graceful termination phase in stop-action. +</longdesc> +<shortdesc lang="en">Signals (names or numbers, whitespace separated) to send processes during graceful termination phase in stop-action</shortdesc> +<content type="string" default="${OCF_RESKEY_term_signals_default}" /> +</parameter> + +<parameter name="kill_signals"> +<longdesc lang="en"> +Signals (names or numbers, whitespace separated) to send processes during forceful killing phase in stop-action. +</longdesc> +<shortdesc lang="en">Signals (names or numbers, whitespace separated) to send processes during forceful killing phase in stop-action</shortdesc> +<content type="string" default="${OCF_RESKEY_kill_signals_default}" /> +</parameter> + +<parameter name="signal_delay"> +<longdesc lang="en"> +How many seconds to wait after sending term/kill signals to processes in stop-action. 
+</longdesc> +<shortdesc lang="en">How many seconds to wait after sending term/kill signals to processes in stop-action</shortdesc> +<content type="string" default="${OCF_RESKEY_signal_delay_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="60s" /> +<action name="stop" timeout="60s" /> +<action name="monitor" depth="0" timeout="40s" interval="20s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +# +# Make sure the kernel does the right thing with the FS buffers +# This function should be called after unmounting and before mounting +# It may not be necessary in 2.4 and later kernels, but it shouldn't hurt +# anything either... +# +# It's really a bug that you have to do this at all... +# +flushbufs() { + if have_binary $BLOCKDEV ; then + if [ "$blockdevice" = "yes" ] ; then + $BLOCKDEV --flushbufs $1 + return $? + fi + fi + return 0 +} + +# Take advantage of /etc/mtab if present, use portable mount command +# otherwise. Normalize format to "dev mountpoint fstype". +is_bind_mount() { + echo "$options" | grep -w bind >/dev/null 2>&1 +} + +list_mounts() { + local inpf="" + local mount_list="" + local check_list="x" + + if [ -e "/proc/mounts" ] && ! is_bind_mount; then + inpf=/proc/mounts + elif [ -f "/etc/mtab" -a -r "/etc/mtab" ]; then + inpf=/etc/mtab + fi + + # Make sure that the mount list has not been changed while reading. + while [ "$mount_list" != "$check_list" ]; do + check_list="$mount_list" + if [ "$inpf" ]; then + # <device> <mountpoint> <fstype> ... + # Spaces in device or mountpoint are octal \040 in $inpf + # Convert literal spaces (field separators) to tabs + mount_list=$(cut -d' ' -f1,2,3 < $inpf | tr ' ' "$TAB") + else + # <device> on <mountpoint> type <fstype> ... 
+ # Use tabs as field separators + match_string='\(.*\) on \(.*\) type \([^[:space:]]\+\) .*' + replace_string="\\1${TAB}\\2${TAB}\\3" + mount_list=$($MOUNT | sed "s/$match_string/$replace_string/g") + fi + done + + # Convert octal \040 to space characters + printf "$mount_list" +} + +determine_blockdevice() { + if [ $blockdevice = "yes" ]; then + return + fi + + # Get the current real device name, if possible. + # (specified devname could be -L or -U...) + case "$FSTYPE" in + nfs4|nfs|efs|smbfs|cifs|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|none|lustre) + : ;; + *) + match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}" + DEVICE=$(list_mounts | grep "$match_string" | cut -d"$TAB" -f1) + if [ -b "$DEVICE" ]; then + blockdevice=yes + fi + ;; + esac +} + +# Lists all filesystems potentially mounted under a given path, +# excluding the path itself. +list_submounts() { + list_mounts | grep "${TAB}${1}/" | cut -d"$TAB" -f2 | sort -r +} + +# Lists all bind mounts of a given file system, +# excluding the path itself. +list_bindmounts() { + if is_bind_mount; then + # skip bind mount + # we should not umount the original file system via a bind mount + return + fi + + match_string="${TAB}${1}${TAB}" + if list_mounts | grep "$match_string" >/dev/null 2>&1; then + mount_disk=$(list_mounts | grep "$match_string" | cut -d"$TAB" -f1) + else + return + fi + + if [ -b "$mount_disk" ]; then + list_mounts | grep "$mount_disk" | grep -v "$match_string" | cut -d"$TAB" -f2 | sort -r + fi +} + +# kernels < 2.6.26 can't handle bind remounts +bind_kernel_check() { + echo "$options" | grep -w ro >/dev/null 2>&1 || + return + uname -r | awk -F. ' + $1==2 && $2==6 { + sub("[^0-9].*","",$3); + if ($3<26) + exit(1); + }' + [ $? 
-ne 0 ] && + ocf_log warn "kernel $(uname -r) cannot handle read only bind mounts" +} + +bind_root_mount_check() { + if [ "$(df -P "$1" | awk 'END{print $6}')" = "/" ]; then + return 1 + else + return 0 + fi +} + +bind_mount() { + if is_bind_mount && [ "$options" != "-o bind" ] + then + bind_kernel_check + bind_opts=$(echo "$options" | sed 's/bind/remount/') + $MOUNT $bind_opts "$MOUNTPOINT" + else + true # make sure to return OK + fi +} + +is_option() { + echo "$OCF_RESKEY_options" | grep -w "$1" >/dev/null 2>&1 +} + +is_fsck_needed() { + case $OCF_RESKEY_run_fsck in + force) true;; + no) false;; + ""|auto) + case "$FSTYPE" in + ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|efs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs) + false;; + *) + true;; + esac;; + *) + ocf_log warn "Invalid parameter value for fsck: '$OCF_RESKEY_run_fsck'; setting to 'auto'" + OCF_RESKEY_run_fsck="auto" + is_fsck_needed;; + esac +} + +fstype_supported() +{ + local support="$FSTYPE" + local rc + + if [ "X${HOSTOS}" = "XOpenBSD" ];then + # skip checking /proc/filesystems for obsd + return $OCF_SUCCESS + fi + + if [ -z "$FSTYPE" -o "$FSTYPE" = none ]; then + : No FSTYPE specified, rely on the system has the right file-system support already + return $OCF_SUCCESS + fi + + # support fuse-filesystems (e.g. GlusterFS) and Amazon Elastic File + # System (EFS) + case "$FSTYPE" in + fuse.*|glusterfs|rozofs) support="fuse";; + efs) check_binary "mount.efs"; support="nfs4";; + esac + + if [ "$support" != "$FSTYPE" ]; then + ocf_log info "Checking support for $FSTYPE as \"$support\"" + fi + + grep -w "$support"'$' /proc/filesystems >/dev/null + if [ $? -eq 0 ]; then + # found the fs type + return $OCF_SUCCESS + fi + + # if here, we should attempt to load the module and then + # check the if the filesystem support exists again. + $MODPROBE $support >/dev/null + if [ $? 
-ne 0 ]; then + ocf_exit_reason "Couldn't find filesystem $support in /proc/filesystems and failed to load kernel module" + return $OCF_ERR_INSTALLED + fi + + # It is possible for the module to load and not be complete initialized + # before we check /proc/filesystems again. Give this a few trys before + # giving up entirely. + for try in $(seq 5); do + grep -w "$support"'$' /proc/filesystems >/dev/null + if [ $? -eq 0 ] ; then + # yes. found the filesystem after doing the modprobe + return $OCF_SUCCESS + fi + ocf_log debug "Unable to find support for $support in /proc/filesystems after modprobe, trying again" + sleep 1 + done + + ocf_exit_reason "Couldn't find filesystem $support in /proc/filesystems" + return $OCF_ERR_INSTALLED +} + + +# +# In the case a fresh filesystem is just created from another +# node on the shared storage, and is not visible yet. Then try +# partprobe to refresh /dev/disk/by-{label,uuid}/* up to date. +# +# DEVICE can be /dev/xxx, -U, -L +# +trigger_udev_rules_if_needed() +{ + local refresh_flag="no" + local tmp + local timeout + + if [ $blockdevice = "yes" ]; then + tmp="$DEVICE" + if [ "$DEVICE" != "/dev/null" -a ! -b "$DEVICE" ] ; then + refresh_flag="yes" + fi + else + tmp="$(echo $DEVICE|awk '{$1=""; print substr($0,2)}')" + case "$DEVICE" in + -U*|--uuid*) + tmp="/dev/disk/by-uuid/$tmp" + ;; + -L*|--label*) + tmp="/dev/disk/by-label/$tmp" + ;; + *) + # bind mount? + return ;; + esac + [ ! -b "$tmp" ] && refresh_flag="yes" + fi + + [ "$refresh_flag" = "no" ] && return + + have_binary partprobe && partprobe >/dev/null 2>&1 + timeout=${OCF_RESKEY_CRM_meta_timeout:="60000"} + timeout=$((timeout/1000)) + have_binary udevadm && udevadm settle -t $timeout --exit-if-exists=$tmp + + return $? 
+} + +# +# START: Start up the filesystem +# +Filesystem_start() +{ + # Check if there are any mounts mounted under the mountpoint + match_string="${TAB}${CANONICALIZED_MOUNTPOINT}" + if list_mounts | grep -E "$match_string/\w+" >/dev/null 2>&1; then + ocf_log err "There is one or more mounts mounted under $MOUNTPOINT." + return $OCF_ERR_CONFIGURED + fi + + # See if the device is already mounted. + if Filesystem_status >/dev/null 2>&1 ; then + ocf_log info "Filesystem $MOUNTPOINT is already mounted." + return $OCF_SUCCESS + fi + + fstype_supported || exit $OCF_ERR_INSTALLED + + # Check the filesystem & auto repair. + # NOTE: Some filesystem types don't need this step... Please modify + # accordingly + + trigger_udev_rules_if_needed + + if [ $blockdevice = "yes" ]; then + if [ "$DEVICE" != "/dev/null" -a ! -b "$DEVICE" ] ; then + ocf_exit_reason "Couldn't find device [$DEVICE]. Expected /dev/??? to exist" + exit $OCF_ERR_INSTALLED + fi + + if is_fsck_needed; then + ocf_log info "Starting filesystem check on $DEVICE" + if [ -z "$FSTYPE" ]; then + $FSCK -p "$DEVICE" + else + $FSCK -t "$FSTYPE" -p "$DEVICE" + fi + + # NOTE: if any errors at all are detected, it returns non-zero + # if the error is >= 4 then there is a big problem + if [ $? -ge 4 ]; then + ocf_exit_reason "Couldn't successfully fsck filesystem for $DEVICE" + return $OCF_ERR_GENERIC + fi + fi + fi + + [ -d "$MOUNTPOINT" ] || + ocf_run mkdir -p "$MOUNTPOINT" + if [ ! -d "$MOUNTPOINT" ] ; then + ocf_exit_reason "Couldn't find directory [$MOUNTPOINT] to use as a mount point" + exit $OCF_ERR_INSTALLED + fi + + flushbufs "$DEVICE" + # Mount the filesystem. + case "$FSTYPE" in + none) $MOUNT $options $device_opt "$DEVICE" "$MOUNTPOINT" && + bind_mount + ;; + "") $MOUNT $options $device_opt "$DEVICE" "$MOUNTPOINT" ;; + *) $MOUNT -t "$FSTYPE" $options $device_opt "$DEVICE" "$MOUNTPOINT" ;; + esac + + if [ $? 
-ne 0 ]; then + ocf_exit_reason "Couldn't mount device [$DEVICE] as $MOUNTPOINT" + return $OCF_ERR_GENERIC + fi + return $OCF_SUCCESS +} +# end of Filesystem_start + +get_pids() +{ + local dir=$1 + local procs + local mmap_procs + + if is_bind_mount && ocf_is_true "$FORCE_UNMOUNT" && ! bind_root_mount_check "$DEVICE"; then + ocf_log debug "Change force_umount from '$FORCE_UNMOUNT' to 'safe'" + FORCE_UNMOUNT=safe + fi + + if ocf_is_true "$FORCE_UNMOUNT"; then + if [ "X${HOSTOS}" = "XOpenBSD" ];then + fstat | grep $dir | awk '{print $3}' + else + $FUSER -m $dir 2>/dev/null + fi + elif [ "$FORCE_UNMOUNT" = "safe" ]; then + procs=$(find /proc/[0-9]*/ -type l -lname "${dir}/*" -or -lname "${dir}" 2>/dev/null | awk -F/ '{print $3}') + mmap_procs=$(grep " ${dir}/" /proc/[0-9]*/maps | awk -F/ '{print $3}') + printf "${procs}\n${mmap_procs}" | sort | uniq + fi +} + +signal_processes() { + local dir=$1 + local sig=$2 + local pids pid + # fuser returns a non-zero return code if none of the + # specified files is accessed or in case of a fatal + # error. + pids=$(get_pids "$dir") + if [ -z "$pids" ]; then + ocf_log info "No processes on $dir were signalled. force_unmount is set to '$FORCE_UNMOUNT'" + return + fi + for pid in $pids; do + ocf_log info "sending signal $sig to: $(ps -f $pid | tail -1)" + kill -s $sig $pid + done +} +try_umount() { + local SUB="$1" + $UMOUNT $umount_force "$SUB" + list_mounts | grep "${TAB}${SUB}${TAB}" >/dev/null 2>&1 || { + ocf_log info "unmounted $SUB successfully" + return $OCF_SUCCESS + } + return $OCF_ERR_GENERIC +} +timeout_child() { + local pid="$1" timeout="$2" killer ret + + # start job in the background that will KILL the given process after timeout expires + sleep $timeout && kill -s KILL $pid & + killer=$! + + # block until the child process either exits on its own or gets killed by the above killer pipeline + wait $pid + ret=$? 
+ + # ret would be 127 + child exit code if the timeout expired + [ $ret -lt 128 ] && kill -s KILL $killer + return $ret +} +fs_stop_loop() { + local SUB="$1" signals="$2" sig + while true; do + for sig in $signals; do + signal_processes "$SUB" $sig + done + sleep $OCF_RESKEY_signal_delay + try_umount "$SUB" && return $OCF_SUCCESS + done +} +fs_stop() { + local SUB="$1" timeout=$2 grace_time ret + grace_time=$((timeout/2)) + + # try gracefully terminating processes for up to half of the configured timeout + fs_stop_loop "$SUB" "$OCF_RESKEY_term_signals" & + timeout_child $! $grace_time + ret=$? + [ $ret -eq $OCF_SUCCESS ] && return $ret + + # try killing them for the rest of the timeout + fs_stop_loop "$SUB" "$OCF_RESKEY_kill_signals" & + timeout_child $! $grace_time + ret=$? + [ $ret -eq $OCF_SUCCESS ] && return $ret + + # timeout expired + ocf_exit_reason "Couldn't unmount $SUB within given timeout" + return $OCF_ERR_GENERIC +} + +# +# STOP: Unmount the filesystem +# +Filesystem_stop() +{ + # See if the device is currently mounted + Filesystem_status >/dev/null 2>&1 + if [ $? -eq $OCF_NOT_RUNNING ]; then + # Already unmounted, wonderful. + rc=$OCF_SUCCESS + else + # Wipe the status file, but continue with a warning if + # removal fails -- the file system might be read only + if [ $OCF_CHECK_LEVEL -eq 20 ]; then + rm -f "${STATUSFILE}" + if [ $? -ne 0 ]; then + ocf_log warn "Failed to remove status file ${STATUSFILE}." + fi + fi + + # Determine the real blockdevice this is mounted on (if + # possible) prior to unmounting. + determine_blockdevice + + # For networked filesystems, there's merit in trying -f: + case "$FSTYPE" in + nfs4|nfs|efs|cifs|smbfs) umount_force="-f" ;; + esac + + # Umount all sub-filesystems mounted under $MOUNTPOINT/ too. 
+ local timeout + while read SUB; do + ocf_log info "Trying to unmount $SUB" + if ocf_is_true "$FAST_STOP"; then + timeout=6 + else + timeout=${OCF_RESKEY_CRM_meta_timeout:="20000"} + timeout=$((timeout/1000)) + fi + fs_stop "$SUB" $timeout + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Couldn't unmount $SUB, giving up!" + fi + done <<-EOF + $(list_submounts "$CANONICALIZED_MOUNTPOINT"; \ + list_bindmounts "$CANONICALIZED_MOUNTPOINT"; \ + echo $CANONICALIZED_MOUNTPOINT) + EOF + fi + + flushbufs "$DEVICE" + + return $rc +} +# end of Filesystem_stop + +# +# STATUS: is the filesystem mounted or not? +# +Filesystem_status() +{ + match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}" + if list_mounts | grep "$match_string" >/dev/null 2>&1; then + rc=$OCF_SUCCESS + msg="$MOUNTPOINT is mounted (running)" + else + rc=$OCF_NOT_RUNNING + msg="$MOUNTPOINT is unmounted (stopped)" + fi + + # Special case "monitor" to check whether the UUID cached and + # on-disk still match? + case "$OP" in + status) ocf_log info "$msg";; + esac + + return $rc +} +# end of Filesystem_status + + +# Note: the read/write tests below will stall in case the +# underlying block device (or in the case of a NAS mount, the +# NAS server) has gone away. In that case, if I/O does not +# return to normal in time, the operation hits its timeout +# and it is up to the CRM to initiate appropriate recovery +# actions (such as fencing the node). +# +# MONITOR 10: read the device +# +Filesystem_monitor_10() +{ + if [ "$blockdevice" = "no" ] ; then + ocf_log warn "$DEVICE is not a block device, monitor 10 is noop" + return $OCF_SUCCESS + fi + dd_opts="iflag=direct bs=4k count=1" + err_output=$(dd if="$DEVICE" $dd_opts 2>&1 >/dev/null) + if [ $? 
-ne 0 ]; then + ocf_exit_reason "Failed to read device $DEVICE" + ocf_log err "dd said: $err_output" + return $OCF_ERR_GENERIC + fi + return $OCF_SUCCESS +} +# +# MONITOR 20: write and read a status file +# +Filesystem_monitor_20() +{ + if [ "$blockdevice" = "no" ] ; then + # O_DIRECT not supported on cifs/smbfs + dd_opts="oflag=sync bs=4k conv=fsync,sync" + else + # Writing to the device in O_DIRECT mode is imperative + # to bypass caches. + dd_opts="oflag=direct,sync bs=4k conv=fsync,sync" + fi + status_dir=$(dirname "$STATUSFILE") + [ -d "$status_dir" ] || mkdir -p "$status_dir" + err_output=$(echo "${OCF_RESOURCE_INSTANCE}" | dd of="${STATUSFILE}" $dd_opts 2>&1) + if [ $? -ne 0 ]; then + ocf_exit_reason "Failed to write status file ${STATUSFILE}" + ocf_log err "dd said: $err_output" + return $OCF_ERR_GENERIC + fi + test -f "${STATUSFILE}" + if [ $? -ne 0 ]; then + ocf_exit_reason "Cannot stat the status file ${STATUSFILE}" + return $OCF_ERR_GENERIC + fi + cat "${STATUSFILE}" > /dev/null + if [ $? -ne 0 ]; then + ocf_exit_reason "Cannot read the status file ${STATUSFILE}" + return $OCF_ERR_GENERIC + fi + return $OCF_SUCCESS +} +Filesystem_monitor() +{ + Filesystem_status + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + return $rc + fi + + if [ $rc -eq $OCF_SUCCESS -a $OCF_CHECK_LEVEL -gt 0 ]; then + case "$OCF_CHECK_LEVEL" in + 10) Filesystem_monitor_10; rc=$?;; + 20) Filesystem_monitor_20; rc=$?;; + *) + ocf_exit_reason "unsupported monitor level $OCF_CHECK_LEVEL" + rc=$OCF_ERR_CONFIGURED + ;; + esac + fi + return $rc +} +# end of Filesystem_monitor + + +# +# VALIDATE_ALL: Are the instance parameters valid? +# FIXME!! The only part that's useful is the return code. +# This code always returns $OCF_SUCCESS (!) +# FIXME!! Needs some tuning to match fstype_supported() (e.g., for +# fuse). Can we just call fstype_supported() with a flag like +# "no_modprobe" instead? 
#
Filesystem_validate_all()
{
    # Advisory validation of the instance parameters.
    #
    # Reads:   FSTYPE, OCF_CHECK_LEVEL
    # Returns: always $OCF_SUCCESS; a missing filesystem driver is only
    #          logged at "info" level (check_binary may still bail out
    #          on its own if dd is missing).

    # Check if the $FSTYPE is workable
    # NOTE: Without inserting the $FSTYPE module, this step may be imprecise
    # TODO: This is Linux specific crap.
    if [ ! -z "$FSTYPE" -a "$FSTYPE" != none ]; then
        cut -f2 /proc/filesystems | grep "^${FSTYPE}$" >/dev/null 2>&1
        if [ $? -ne 0 ]; then
            modpath=/lib/modules/$(uname -r)
            moddep=$modpath/modules.dep
            # Do we have $FSTYPE in modules.dep?
            cut -d' ' -f1 "$moddep" \
                | grep "^${modpath}.*${FSTYPE}\.k\?o:$" >/dev/null 2>&1
            if [ $? -ne 0 ]; then
                ocf_log info "It seems we do not have $FSTYPE support"
            fi
        fi
    fi

    # If we are supposed to do monitoring with status files, then
    # we need a utility to write in O_DIRECT mode.
    if [ $OCF_CHECK_LEVEL -gt 0 ]; then
        check_binary dd
        # Note: really old coreutils version do not support
        # the "oflag" option for dd. We don't check for that
        # here. In case dd does not support oflag, monitor is
        # bound to fail, with dd spewing an error message to
        # the logs. On such systems, we must do without status
        # file monitoring.
    fi

    #TODO: How to check the $options ?
    return $OCF_SUCCESS
}

#
# set the blockdevice variable to "no" or "yes"
#
# Reads:  FSTYPE, DEVICE, OP (and the mount options via is_option)
# Writes: blockdevice; for --uuid/--label/-U/-L style devices it also
#         splits DEVICE into device_opt (the option) and DEVICE (its value)
#
set_blockdevice_var() {
    blockdevice=no

    # these are definitely not block devices
    case "$FSTYPE" in
    nfs4|nfs|efs|smbfs|cifs|none|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|lustre) return;;
    esac

    # Use the exit status of is_option directly (as the "ro" check further
    # down does) instead of executing whatever the substitution expands to.
    if is_option "loop"; then
        return
    fi

    case "$DEVICE" in
    --uuid=*|--uuid\ *|--label=*|--label\ *)
        device_opt=$(echo $DEVICE | sed "s/\([[:blank:]]\|=\).*//")
        DEVICE=$(echo $DEVICE | sed -E "s/$device_opt([[:blank:]]*|=)//")
        ;;
    -U*|-L*) # short versions of --uuid/--label
        device_opt=$(echo $DEVICE | cut -c1-2)
        DEVICE=$(echo $DEVICE | sed "s/$device_opt[[:blank:]]*//")
        ;;
    /dev/null) # Special case for BSC
        blockdevice=yes
        ;;
    *)
        if [ ! -b "$DEVICE" -a ! -d "$DEVICE" -a "X$OP" != Xstart ] ; then
            ocf_log warn "Couldn't find device [$DEVICE]. Expected /dev/??? to exist"
        fi
        # Anything that is not a directory is treated as a block device.
        if [ ! -d "$DEVICE" ]; then
            blockdevice=yes
        fi
        ;;
    esac
}

# Check the arguments passed to this script
if [ $# -ne 1 ]; then
    usage
    exit $OCF_ERR_ARGS
fi

# Check the OCF_RESKEY_ environment variables...
FORCE_UNMOUNT="yes"
if [ -n "${OCF_RESKEY_force_unmount}" ]; then
    FORCE_UNMOUNT=$OCF_RESKEY_force_unmount
fi

DEVICE="$OCF_RESKEY_device"
FSTYPE=$OCF_RESKEY_fstype
if [ ! -z "$OCF_RESKEY_options" ]; then
    options="-o $OCF_RESKEY_options"
fi
FAST_STOP=${OCF_RESKEY_fast_stop:="yes"}

OP=$1

# These operations do not require instance parameters
case $OP in
    meta-data)  meta_data
                exit $OCF_SUCCESS
                ;;
    usage)      usage
                exit $OCF_SUCCESS
                ;;
esac

if [ x = x"$DEVICE" ]; then
    ocf_exit_reason "Please set OCF_RESKEY_device to the device to be managed"
    exit $OCF_ERR_CONFIGURED
fi

set_blockdevice_var

# Normalize instance parameters:

# It is possible that OCF_RESKEY_directory has one or even multiple trailing "/".
# But the output of `mount` and /proc/mounts do not.
if [ -z "$OCF_RESKEY_directory" ]; then
    # Quoted so that an empty value cannot turn the test into a syntax error.
    if [ "X$OP" = "Xstart" -o "$blockdevice" = "no" ]; then
        ocf_exit_reason "Please specify the directory"
        exit $OCF_ERR_CONFIGURED
    fi
else
    MOUNTPOINT="$(echo "$OCF_RESKEY_directory" | sed 's/\/*$//')"
    : ${MOUNTPOINT:=/}
    if [ -e "$MOUNTPOINT" ] ; then
        CANONICALIZED_MOUNTPOINT="$(readlink -f "$MOUNTPOINT")"
        if [ $? -ne 0 ]; then
            ocf_exit_reason "Could not canonicalize $MOUNTPOINT because readlink failed"
            exit $OCF_ERR_GENERIC
        fi
    else
        CANONICALIZED_MOUNTPOINT="$MOUNTPOINT"
    fi
    # At this stage, $MOUNTPOINT does not contain trailing "/" unless it is "/"
    # TODO: / mounted via Filesystem sounds dangerous. On stop, we'll
    # kill the whole system. Is that a good idea?
fi

# Check to make sure the utilities are found
if [ "X${HOSTOS}" != "XOpenBSD" ];then
check_binary $MODPROBE
check_binary $FUSER
fi
check_binary $FSCK
check_binary $MOUNT
check_binary $UMOUNT

if [ "$OP" != "monitor" ]; then
    ocf_log info "Running $OP for $DEVICE on $MOUNTPOINT"
fi

# Operations that need no clone-safety check are dispatched right away.
case $OP in
    status)     Filesystem_status
                exit $?
                ;;
    monitor)    Filesystem_monitor
                exit $?
                ;;
    validate-all)   Filesystem_validate_all
                exit $?
                ;;
    stop)       Filesystem_stop
                exit $?
                ;;
esac

# CLUSTERSAFE: 0 = refuse to run as a clone, 1 = fine as a clone,
# 2 = allowed as a clone but with a warning (see the check below).
CLUSTERSAFE=0
is_option "ro" &&
    CLUSTERSAFE=2

case "$FSTYPE" in
nfs4|nfs|efs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs|cvfs|lustre)
    CLUSTERSAFE=1 # this is kind of safe too
    systemd_drop_in "99-Filesystem-remote" "After" "remote-fs.target"
    ;;
# add here CLUSTERSAFE=0 for all filesystems which are not
# cluster aware and which, even if when mounted read-only,
# could still modify parts of it such as journal/metadata
ext4|ext4dev|ext3|reiserfs|reiser4|xfs|jfs)
    if ocf_is_true "$OCF_RESKEY_force_clones"; then
        CLUSTERSAFE=2
        systemd_drop_in "99-Filesystem-remote" "After" "remote-fs.target"
    else
        CLUSTERSAFE=0 # these are not allowed
    fi
    ;;
esac

if ocf_is_clone; then
    case $CLUSTERSAFE in
    0)
        ocf_exit_reason "DANGER! $FSTYPE on $DEVICE is NOT cluster-aware!"
        ocf_log err "DO NOT RUN IT AS A CLONE!"
        ocf_log err "Politely refusing to proceed to avoid data corruption."
        exit $OCF_ERR_CONFIGURED
        ;;
    2)
        ocf_log warn "$FSTYPE on $DEVICE is NOT cluster-aware!"
        if ocf_is_true "$OCF_RESKEY_force_clones"; then
            ocf_log warn "But we'll let it run because we trust _YOU_ verified it's safe to do so."
        else
            ocf_log warn "But we'll let it run because it is mounted read-only."
            ocf_log warn "Please make sure that it's meta data is read-only too!"
        fi
        ;;
    esac
fi

# Only "start" is left at this point; its status becomes the exit code.
case $OP in
    start)  Filesystem_start
            ;;
    *)      usage
            exit $OCF_ERR_UNIMPLEMENTED
            ;;
    esac
exit $?
+ + diff --git a/heartbeat/ICP b/heartbeat/ICP new file mode 100755 index 0000000..0bf37de --- /dev/null +++ b/heartbeat/ICP @@ -0,0 +1,304 @@ +#!/bin/sh +# +# +# ICP +# +# Description: Manages an ICP Vortex clustered host drive as an HA resource +# +# +# Author: Lars Marowsky-Bree <lmb@suse.de> +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# Copyright: (C) 2002 SuSE Linux AG +# +# +# An example usage in /etc/ha.d/haresources: +# node1 10.0.0.170 LinuxSCSI::0:0 ICP::c0h1::/dev/sdb1 LVM::myvolname +# +# Notice that you will need to get the utility "icpclucon" from the ICP +# support to use this. +# +# See usage() function below for more details... +# +# OCF parameters are as below: +# OCF_RESKEY_driveid +# OCF_RESKEY_device + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_driveid_default="" +OCF_RESKEY_device_default="" + +: ${OCF_RESKEY_driveid=${OCF_RESKEY_driveid_default}} +: ${OCF_RESKEY_device=${OCF_RESKEY_device_default}} + +####################################################################### + +# +ICPCLUCON=/usr/sbin/icpclucon +# + +usage() { + methods=`ICP_methods | grep -v methods` + methods=`echo $methods | tr ' ' '|'` + cat <<-! + usage: $0 ($methods) + + $0 manages an ICP Vortex clustered host drive. + + The 'start' operation reserves the given host drive. + The 'stop' operation releses the given host drive. + The 'status' operation reports whether the host drive is reserved. + The 'monitor' operation reports whether the host drive is reserved. + The 'validate-all' operation reports whether OCF instance parameters are valid. + The 'methods' operation reports on the methods $0 supports + + ! 
+} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="ICP" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for ICP. It Manages an ICP Vortex clustered host drive as an +HA resource. +</longdesc> +<shortdesc lang="en">Manages an ICP Vortex clustered host drive</shortdesc> + +<parameters> +<parameter name="driveid" unique="0" required="1"> +<longdesc lang="en"> +The ICP cluster drive ID. +</longdesc> +<shortdesc lang="en">ICP cluster drive ID</shortdesc> +<content type="string" default="${OCF_RESKEY_driveid_default}" /> +</parameter> + +<parameter name="device" unique="0" required="1"> +<longdesc lang="en"> +The device name. +</longdesc> +<shortdesc lang="en">device</shortdesc> +<content type="string" default="${OCF_RESKEY_device_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="status" depth="0" timeout="20s" interval="10s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +# +# methods: What methods/operations do we support? +# +ICP_methods() { + cat <<-! + start + stop + status + monitor + methods + validate-all + meta-data + usage + ! +} + +ICP_status() { + local icp_out + + icp_out=$($ICPCLUCON -v -status $1) + if [ $? -ne 0 ]; then + ocf_log "err" "Hostdrive not reserved by us." + return $OCF_ERR_GENERIC + fi + + if expr match "$icp_out" \ + '.*Drive is reserved by this host.*' >/dev/null 2>&1 ; then + ocf_log "info" "Volume $1 is reserved by us." + return $OCF_SUCCESS + elif expr match "$icp_out" \ + '.*Drive is not reserved by any host.*' >/dev/null 2>&1 ; then + ocf_log "err" "Volume $1 not reserved by any host." + return $OCF_NOT_RUNNING + else + ocf_log "err" "Unknown output from icpclucon. 
Assuming we do not have a reservation:" + ocf_log "err" "$icp_out" + return $OCF_NOT_RUNNING + fi +} + +ICP_report_status() { + if ICP_status $1 ; then + echo "$1: running" + return $OCF_SUCCESS + else + echo "$1: not running" + return $OCF_NOT_RUNNING + fi +} + + +# +# Monitor the host drive - does it really seem to be working? +# +# +ICP_monitor() { + + if + ICP_status $1 + then + return $? + else + ocf_log "err" "ICP host drive $1 is offline" + return $OCF_NOT_RUNNING + fi + +} + +Clear_bufs() { + $BLOCKDEV --flushbufs $1 +} + +# +# Enable ICP host drive +# +ICP_start() { + + ocf_log "info" "Activating host drive $1" + ocf_run $ICPCLUCON -v -reserve $1 + if [ $? -ne 0 ]; then + ocf_log "info" "Forcing reservation of $1" + ocf_run $ICPCLUCON -v -force $1 || return $OCF_ERR_GENERIC + fi + + if + ICP_status $1 + then + : OK + # A reservation isn't as prompt as it should be + sleep 3 + return $OCF_SUCCESS + else + ocf_log "err" "ICP: $1 was not reserved correctly" + return $OCF_ERR_GENERIC + fi +} + +# +# Release the ICP host drive +# +ICP_stop() { + + ocf_log "info" "Releasing ICP host drive $1" + ocf_run $ICPCLUCON -v -release $1 || return $OCF_ERR_GENERIC + + ocf_log "info" "Verifying reservation" + if ICP_status $1 ; then + ocf_log "err" "ICP: $1 was not released correctly" + return $OCF_ERR_GENERIC + fi + return $OCF_SUCCESS +} + +ICP_validate_all() { + check_binary $BLOCKDEV + check_binary $ICPCLUCON + $ICPCLUCON -v -status $driveid >/dev/null 2>&1 + if [ $? -ne 0 ]; then + ocf_log err "Invalid driveid $driveid" + exit $OCF_ERR_ARGS + fi + + if [ ! -b $device ]; then + ocf_log err "Device $device is not a block device" + exit $OCF_ERR_ARGS + fi + +# Do not know how to check the association of $device with $driveid. + + return $OCF_SUCCESS +} + +# +# 'main' starts here... 
+# + +if + ( [ $# -ne 1 ] ) +then + usage + exit $OCF_ERR_ARGS +fi + +# These operations do not require OCF instance parameters to be set +case "$1" in + + meta-data) meta_data + exit $OCF_SUCCESS;; + + methods) ICP_methods + exit $OCF_SUCCESS;; + + usage) usage + exit $OCF_SUCCESS;; + + *) ;; +esac + +if + [ -z "$OCF_RESKEY_driveid" ] +then + ocf_log err "Please specify OCF_RESKEY_driveid" + exit $OCF_ERR_ARGS +fi + +if [ -z "$OCF_RESKEY_device" ]; then + ocf_log err "Please specify OCF_RESKEY_device" + exit $OCF_ERR_ARGS +fi + +driveid=$OCF_RESKEY_driveid +device=$OCF_RESKEY_device + +# What kind of method was invoked? +case "$1" in + + start) ICP_validate_all + ICP_start $driveid + Clear_bufs $device + exit $?;; + + stop) ICP_stop $driveid + Clear_bufs $device + exit $?;; + + status) ICP_report_status $driveid + exit $?;; + + monitor) ICP_monitor $driveid + exit $?;; + + validate-all) ICP_validate_all + exit $?;; + + *) usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac diff --git a/heartbeat/IPaddr b/heartbeat/IPaddr new file mode 100755 index 0000000..9b0ea81 --- /dev/null +++ b/heartbeat/IPaddr @@ -0,0 +1,912 @@ +#!/bin/sh +# +# License: GNU General Public License (GPL) +# Support: users@clusterlabs.org +# +# This script manages IP alias IP addresses +# +# It can add an IP alias, or remove one. +# +# usage: $0 {start|stop|status|monitor|validate-all|meta-data} +# +# The "start" arg adds an IP alias. +# +# Surprisingly, the "stop" arg removes one. :-) +# +# OCF parameters are as below +# OCF_RESKEY_ip +# OCF_RESKEY_broadcast +# OCF_RESKEY_nic +# OCF_RESKEY_cidr_netmask +# OCF_RESKEY_lvs_support ( e.g. true, on, 1 ) +# OCF_RESKEY_ARP_INTERVAL_MS +# OCF_RESKEY_ARP_REPEAT +# OCF_RESKEY_ARP_BACKGROUND (e.g. yes ) +# OCF_RESKEY_ARP_NETMASK +# OCF_RESKEY_local_start_script +# OCF_RESKEY_local_stop_script +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_ip_default="" +OCF_RESKEY_nic_default="eth0" +OCF_RESKEY_cidr_netmask_default="" +OCF_RESKEY_broadcast_default="" +OCF_RESKEY_iflabel_default="" +OCF_RESKEY_lvs_support_default="false" +OCF_RESKEY_local_stop_script_default="" +OCF_RESKEY_local_start_script_default="" +OCF_RESKEY_ARP_INTERVAL_MS_default="500" +OCF_RESKEY_ARP_REPEAT_default="10" +OCF_RESKEY_ARP_BACKGROUND_default="yes" +OCF_RESKEY_ARP_NETMASK_default="ffffffffffff" + +: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}} +: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}} +: ${OCF_RESKEY_cidr_netmask=${OCF_RESKEY_cidr_netmask_default}} +: ${OCF_RESKEY_broadcast=${OCF_RESKEY_broadcast_default}} +: ${OCF_RESKEY_iflabel=${OCF_RESKEY_iflabel_default}} +: ${OCF_RESKEY_lvs_support=${OCF_RESKEY_lvs_support_default}} +: ${OCF_RESKEY_local_stop_script=${OCF_RESKEY_local_stop_script_default}} +: ${OCF_RESKEY_local_start_script=${OCF_RESKEY_local_start_script_default}} +: ${OCF_RESKEY_ARP_INTERVAL_MS=${OCF_RESKEY_ARP_INTERVAL_MS_default}} +: ${OCF_RESKEY_ARP_REPEAT=${OCF_RESKEY_ARP_REPEAT_default}} +: ${OCF_RESKEY_ARP_BACKGROUND=${OCF_RESKEY_ARP_BACKGROUND_default}} +: ${OCF_RESKEY_ARP_NETMASK=${OCF_RESKEY_ARP_NETMASK_default}} + +SENDARP=$HA_BIN/send_arp +FINDIF=$HA_BIN/findif +VLDIR=$HA_RSCTMP +SENDARPPIDDIR=$HA_RSCTMP +SENDARPPIDFILE="$SENDARPPIDDIR/send_arp-$OCF_RESKEY_ip" +USAGE="usage: $0 {start|stop|status|monitor|validate-all|meta-data}"; + +####################################################################### + +SYSTYPE="`uname -s`" +case "$SYSTYPE" in + SunOS) + # `uname -r` = 5.9 -> SYSVERSION = 9 + SYSVERSION="`uname -r | cut -d. 
-f 2`" + ;; + Darwin) + # Treat Darwin the same as the other BSD variants (matched as *BSD) + SYSTYPE="${SYSTYPE}BSD" + ;; + *) + ;; +esac + + + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="IPaddr" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> +This script manages IP alias IP addresses +It can add an IP alias, or remove one. +</longdesc> +<shortdesc lang="en">Manages virtual IPv4 addresses (portable version)</shortdesc> + +<parameters> +<parameter name="ip" unique="1" required="1"> +<longdesc lang="en"> +The IPv4 address to be configured in dotted quad notation, for example +"192.168.1.1". +</longdesc> +<shortdesc lang="en">IPv4 address</shortdesc> +<content type="string" default="${OCF_RESKEY_ip_default}" /> +</parameter> +<parameter name="nic" unique="0"> +<longdesc lang="en"> +The base network interface on which the IP address will be brought +online. + +If left empty, the script will try and determine this from the +routing table. + +Do NOT specify an alias interface in the form eth0:1 or anything here; +rather, specify the base interface only. + +Prerequisite: + +There must be at least one static IP address, which is not managed by +the cluster, assigned to the network interface. + +If you can not assign any static IP address on the interface, +modify this kernel parameter: +sysctl -w net.ipv4.conf.all.promote_secondaries=1 +(or per device) + +</longdesc> +<shortdesc lang="en">Network interface</shortdesc> +<content type="string" default="${OCF_RESKEY_nic_default}"/> +</parameter> + +<parameter name="cidr_netmask"> +<longdesc lang="en"> +The netmask for the interface in CIDR format. (ie, 24), or in +dotted quad notation 255.255.255.0). + +If unspecified, the script will also try to determine this from the +routing table. 
+</longdesc> +<shortdesc lang="en">Netmask</shortdesc> +<content type="string" default="${OCF_RESKEY_cidr_netmask_default}"/> +</parameter> + +<parameter name="broadcast"> +<longdesc lang="en"> +Broadcast address associated with the IP. If left empty, the script will +determine this from the netmask. +</longdesc> +<shortdesc lang="en">Broadcast address</shortdesc> +<content type="string" default="${OCF_RESKEY_broadcast_default}"/> +</parameter> + +<parameter name="iflabel"> +<longdesc lang="en"> +You can specify an additional label for your IP address here. +</longdesc> +<shortdesc lang="en">Interface label</shortdesc> +<content type="string" default="${OCF_RESKEY_iflabel_default}"/> +</parameter> + +<parameter name="lvs_support"> +<longdesc lang="en"> +Enable support for LVS Direct Routing configurations. In case a IP +address is stopped, only move it to the loopback device to allow the +local node to continue to service requests, but no longer advertise it +on the network. +</longdesc> +<shortdesc lang="en">Enable support for LVS DR</shortdesc> +<content type="boolean" default="${OCF_RESKEY_lvs_support_default}"/> +</parameter> + +<parameter name="local_stop_script"> +<longdesc lang="en"> +Script called when the IP is released +</longdesc> +<shortdesc lang="en">Script called when the IP is released</shortdesc> +<content type="string" default="${OCF_RESKEY_local_stop_script_default}"/> +</parameter> + +<parameter name="local_start_script"> +<longdesc lang="en"> +Script called when the IP is added +</longdesc> +<shortdesc lang="en">Script called when the IP is added</shortdesc> +<content type="string" default="${OCF_RESKEY_local_start_script_default}"/> +</parameter> + +<parameter name="ARP_INTERVAL_MS"> +<longdesc lang="en"> +milliseconds between ARPs +</longdesc> +<shortdesc lang="en">milliseconds between gratuitous ARPs</shortdesc> +<content type="integer" default="${OCF_RESKEY_ARP_INTERVAL_MS_default}"/> +</parameter> + +<parameter name="ARP_REPEAT"> +<longdesc 
lang="en"> +How many gratuitous ARPs to send out when bringing up a new address +</longdesc> +<shortdesc lang="en">repeat count</shortdesc> +<content type="integer" default="${OCF_RESKEY_ARP_REPEAT_default}"/> +</parameter> + +<parameter name="ARP_BACKGROUND"> +<longdesc lang="en"> +run in background (no longer any reason to do this) +</longdesc> +<shortdesc lang="en">run in background</shortdesc> +<content type="boolean" default="${OCF_RESKEY_ARP_BACKGROUND_default}"/> +</parameter> + +<parameter name="ARP_NETMASK"> +<longdesc lang="en"> +netmask for ARP - in nonstandard hexadecimal format. +</longdesc> +<shortdesc lang="en">netmask for ARP</shortdesc> +<content type="string" default="${OCF_RESKEY_ARP_NETMASK_default}"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="5s" /> +<action name="validate-all" timeout="20s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END + exit $OCF_SUCCESS +} + +# The 'ping' command takes highly OS-dependent arguments, so this +# function creates a suitable argument list for the host OS's 'ping'. +# We use a subset of its functionality: +# 1. single packet +# 2. reasonable timeout (say 1 second) +# +# arguments: +# $1: IP address to ping +# result string: +# arguments for ping command +# +# If more flexibility is needed, they could be specified in the environment +# to this function, to adjust the resulting 'ping' arguments. +# David Lee <t.d.lee@durham.ac.uk> May 2007 +pingargs() { + _baseip=$1 + _timeout=1 # seconds + _pktcount=1 + _systype="`uname -s`" + case $_systype in + Linux) + # Default is perpetual ping: need "-c $_pktcount". + # -c count -t timetolive -q(uiet) -n(umeric) -W timeout + _pingargs="-c $_pktcount -q -n $_baseip" + ;; + SunOS) + # Default is immediate (or timeout) return. 
+ _pingargs="$_baseip $_timeout" + ;; + *) + _pingargs="-c $_pktcount $_baseip" + ;; + esac + + echo "$_pingargs" +} + +# On Linux systems the (hidden) loopback interface may +# conflict with the requested IP address. If so, this +# unoriginal code will remove the offending loopback address +# and save it in VLDIR so it can be added back in later +# when the IPaddr is released. +# +lvs_remove_conflicting_loopback() { + ipaddr="$1" + ifname="$2" + + ocf_log info "Removing conflicting loopback $ifname." + if + echo $ifname > "$VLDIR/$ipaddr" + then + : Saved loopback information in $VLDIR/$ipaddr + else + ocf_log err "Could not save conflicting loopback $ifname." \ + "it will not be restored." + fi + + if [ ! -z "${OCF_RESKEY_local_stop_script}" ]; then + if [ -x "${OCF_RESKEY_local_stop_script}" ]; then + ${OCF_RESKEY_local_stop_script} $* + fi + fi + + delete_interface "$ifname" "$ipaddr" + + # Forcibly remove the route (if it exists) to the loopback. + delete_route "$ipaddr" +} + +# +# On Linux systems the (hidden) loopback interface may +# need to be restored if it has been taken down previously +# by lvs_remove_conflicting_loopback() +# +lvs_restore_loopback() { + ipaddr="$1" + + if [ ! -s "$VLDIR/$ipaddr" ]; then + return + fi + + ifname=`cat "$VLDIR/$ipaddr"` + ocf_log info "Restoring loopback IP Address $ipaddr on $ifname." + + CMD="OCF_RESKEY_cidr_netmask=32 OCF_RESKEY_ip=$1 OCF_RESKEY_nic=$ifname $FINDIF" + if + NICINFO=`eval $CMD` + NICINFO=`echo $NICINFO | tr " " " " | tr -s " "` + then + netmask_text=`echo "$NICINFO" | cut -f3 -d " "` + broadcast=`echo "$NICINFO" | cut -f5 -d " "` + else + echo "ERROR: $CMD failed (rc=$rc)" + exit $OCF_ERR_GENERIC + fi + + add_interface "$ipaddr" "$ifname" "$ifname" $netmask_text $broadcast + rm -f "$VLDIR/$ipaddr" +} + +# +# Find out which alias serves the given IP address +# The argument is an IP address, and its output +# is an aliased interface name (e.g., "eth0:0"). 
#
find_interface_solaris() {
    # $1: IP address to look for.
    # Prints the name of the aliased interface carrying it (trailing ":"
    # stripped) and returns $OCF_SUCCESS, or returns $OCF_ERR_GENERIC
    # when no aliased interface matches.
    ipaddr="$1"

    # The awk filter puts an empty line in front of every interface header
    # (a line containing ": ") except the first, so each interface becomes
    # a blank-line terminated record for the reader below.
    $IFCONFIG $IFCONFIG_A_OPT | $AWK '{if ($0 ~ /.*: / && NR > 1) {print "\n"$0} else {print}}' |
    {
        while read ifname linkstuff
        do
            : ifname = $ifname
            read inet addr junk
            : inet = $inet addr = $addr
            # Skip the remainder of this record.
            while
                read line && [ "X$line" != "X" ]
            do
                : Nothing
            done

            # Only names containing a ":" are considered.
            case $ifname in
                *:*)    ;;
                *)      continue;;
            esac

            # This doesn't look right for a box with multiple NICs.
            # It looks like it always selects the first interface on
            # a machine. Yet, we appear to use the results for this case too...
            ifname=`echo "$ifname" | sed s'%:$%%'`

            case $addr in
                addr:$ipaddr)   echo $ifname; return $OCF_SUCCESS;;
                $ipaddr)        echo $ifname; return $OCF_SUCCESS;;
            esac
        done
        return $OCF_ERR_GENERIC
    }
    # When the group above ran in a subshell, its "return" only ended that
    # subshell; hand its status on instead of failing unconditionally.
    return $?
}

find_interface_bsd() {
    # $1: IP address to look for (falls back to the caller's $ipaddr when
    #     no argument is given, which is what this function used to read).
    # Prints the name of the interface whose "inet" line carries that
    # address; prints nothing when there is none.
    $IFCONFIG $IFCONFIG_A_OPT | awk -v ip_addr="${1:-$ipaddr}" '
        # Remember the most recent interface header that is flagged UP.
        # The unit number may have more than one digit (em10, vlan100).
        /UP,/ && $0 ~ /^[a-z]+[0-9]+:/ {
            if_name=$1; sub(":$","",if_name);
        }
        $1 == "inet" && $2 == ip_addr {
            print if_name
            exit(0)
        }'

}

#
# Find out which alias serves the given IP address
# The argument is an IP address, and its output
# is an aliased interface name (e.g., "eth0:0").
#
find_interface_generic() {
    # Returns $OCF_SUCCESS when a matching interface was printed,
    # $OCF_ERR_GENERIC otherwise.
    ipaddr="$1"
    $IFCONFIG $IFCONFIG_A_OPT |
    {
        while read ifname linkstuff
        do
            : Read gave us ifname = $ifname

            read inet addr junk
            : Read gave us inet = $inet addr = $addr

            # Skip the remainder of this blank-line terminated record.
            while
                read line && [ "X$line" != "X" ]
            do
                : Nothing
            done

            # Only names containing a ":" are considered; a trailing ":"
            # is dropped from the reported name.
            case $ifname in
                *:*)    ifname=`echo $ifname | sed 's/:$//'`;;
                *)      continue;;
            esac

            : "comparing $ipaddr to $addr (from ifconfig)"
            case $addr in
                addr:$ipaddr)   echo $ifname; return $OCF_SUCCESS;;
                $ipaddr)        echo $ifname; return $OCF_SUCCESS;;
            esac
        done
        return $OCF_ERR_GENERIC
    }
    # See find_interface_solaris: propagate the status of the reader.
    return $?
}

#
#   Find out which alias serves the given IP address
#   The argument is an IP address, and its output
#   is an aliased interface name (e.g., "eth0:0").
#
# find_interface IPADDR
#
#   Dispatches on $SYSTYPE to the platform specific lookup and prints
#   whatever interface name that lookup printed (possibly nothing).
#   Always returns $OCF_SUCCESS; callers test the printed name instead.
#
find_interface() {
  ipaddr="$1"
  case "$SYSTYPE" in
    SunOS)
      NIC=`find_interface_solaris "$ipaddr"`;;
    *BSD)
      NIC=`find_interface_bsd "$ipaddr"`;;
    *)
      NIC=`find_interface_generic "$ipaddr"`;;
  esac

  echo $NIC
  return $OCF_SUCCESS;
}

#
# Find an unused interface/alias name for us to use for new IP alias
# The argument is the base interface name (e.g., "eth0"), and the output
# is an aliased interface name (e.g., "eth0:0", "dc0", "le0:0").
#
# Alias numbers are reserved through hard links named
# "$VLDIR/IPaddr-<nic>:<n>": the first "ln" that succeeds for a number not
# listed by ifconfig wins that number.
#
find_free_interface() {
  NIC="$1"

  if [ "X$NIC" = "X" ]; then
    ocf_log err "No free interface found for $OCF_RESKEY_ip"
    return $OCF_ERR_GENERIC;
  fi

  NICBASE="$VLDIR/IPaddr-$NIC"
  touch "$NICBASE"

  case "$SYSTYPE" in
    *BSD)
      # The base interface name itself is handed back on *BSD.
      echo $NIC;
      return $OCF_SUCCESS;;
    SunOS)
      j=1
      IFLIST=`$IFCONFIG $IFCONFIG_A_OPT | \
        grep "^$NIC:[0-9]" | sed 's%: .*%%'`;;
    *)
      j=0
      IFLIST=`$IFCONFIG $IFCONFIG_A_OPT | \
        grep "^$NIC:[0-9]" | sed 's% .*%%'`
      # Number of existing lock links; used as the starting alias number
      # unless a link with exactly that number already exists.
      TRYADRCNT=`ls "${NICBASE}:"* 2>/dev/null | wc -w | tr -d ' '`
      if [ -f "${NICBASE}:${TRYADRCNT}" ]; then
        : OK
      else
        j="${TRYADRCNT}"
      fi
      ;;
  esac

  # Normalise to a single space separated list with sentinel blanks so the
  # case pattern below can match whole names.
  IFLIST=" `echo $IFLIST` "
  while
    [ $j -lt 512 ]
  do
    case $IFLIST in
      *" "$NIC:$j" "*)
        ;;
      *)
        NICLINK="$NICBASE:$j"
        if
          ln "$NICBASE" "$NICLINK" 2>/dev/null
        then
          echo "$NIC:$j"
          return $OCF_SUCCESS
        fi
        ;;
    esac
    j=`expr $j + 1`
  done
  return $OCF_ERR_GENERIC
}

#
# delete_route IPADDR
#
#   Removes the host route for IPADDR with $ROUTE and returns its exit
#   status. Nothing is done (status 0) on SunOS.
#
delete_route () {
  ipaddr="$1"

  case "$SYSTYPE" in
    SunOS) return 0;;
    *BSD)  CMD="$ROUTE -n delete -host $ipaddr";;
    *)     CMD="$ROUTE -n del -host $ipaddr";;
  esac

  $CMD

  return $?
}

#
# delete_interface IFNAME IPADDR
#
#   Takes the alias IFNAME down with the $IFCONFIG syntax matching
#   $SYSTYPE, logs the command, and returns its exit status.
#
delete_interface () {
  ifname="$1"
  ipaddr="$2"

  case "$SYSTYPE" in
    SunOS)
      if [ "$SYSVERSION" -ge 8 ] ; then
        CMD="$IFCONFIG $ifname unplumb"
      else
        CMD="$IFCONFIG $ifname 0 down"
      fi;;
    Darwin*)
      CMD="$IFCONFIG $ifname $ipaddr delete";;
    *BSD)
      CMD="$IFCONFIG $ifname inet $ipaddr delete";;
    *)
      CMD="$IFCONFIG $ifname down";;
  esac

  ocf_log info "$CMD"
  $CMD

  return $?
}


#
# add_interface IPADDR IFACE_BASE IFACE NETMASK BROADCAST
#
#   Configures IPADDR on IFACE with the $IFCONFIG syntax matching $SYSTYPE
#   and returns the exit status of that command. Exits the script with
#   $OCF_ERR_ARGS unless exactly five arguments are given. IFACE_BASE is
#   stored but not used by the commands built here.
#
add_interface () {
  ipaddr="$1"
  iface_base="$2"
  iface="$3"
  netmask="$4"
  broadcast="$5"

  if [ $# != 5 ]; then
    ocf_log err "Insufficient arguments to add_interface: $*"
    exit $OCF_ERR_ARGS
  fi

  case "$SYSTYPE" in
    SunOS)
      if [ "$SYSVERSION" -ge 8 ] ; then
        $IFCONFIG $iface plumb
        rc=$?
        if [ $rc -ne 0 ] ; then
          echo "ERROR: '$IFCONFIG $iface plumb' failed."
          return $rc
        fi
      fi
      # At Solaris 10, this single-command version sometimes broke.
      # Almost certainly an S10 bug.
      #   CMD="$IFCONFIG $iface inet $ipaddr $text up"
      # So hack the following workaround:
      CMD="$IFCONFIG $iface inet $ipaddr"
      CMD="$CMD && $IFCONFIG $iface netmask $netmask"
      CMD="$CMD && $IFCONFIG $iface up"
      ;;

    *BSD)
      # netmask is always set to 255.255.255.255 for an alias
      CMD="$IFCONFIG $iface inet $ipaddr netmask 255.255.255.255 alias";;
    *)
      CMD="$IFCONFIG $iface $ipaddr netmask $netmask broadcast $broadcast";;
  esac

  # Use "eval $CMD" (not "$CMD"): it might be a chain of two or more commands.
  ocf_log info "eval $CMD"
  eval $CMD
  rc=$?
  if [ $rc != 0 ]; then
    echo "ERROR: eval $CMD failed (rc=$rc)"
  fi

  return $rc
}

#
# Remove the IP alias for the requested IP address...
#
# Kills the process(es) recorded in the send_arp pid file, removes route
# and alias, and drops the alias lock file. Returns $OCF_SUCCESS when the
# address is not configured (or, with lvs_support, only on loopback), and
# $OCF_ERR_GENERIC when removing the alias failed.
#
ip_stop() {
  SENDARPPIDFILE="$SENDARPPIDDIR/send_arp-$OCF_RESKEY_ip"
  NIC=`find_interface $OCF_RESKEY_ip`

  if [ -f "$SENDARPPIDFILE" ]; then
    cat "$SENDARPPIDFILE" | xargs kill
    rm -f "$SENDARPPIDFILE"
  fi

  if [ -z "$NIC" ]; then
    : Requested interface not in use
    return $OCF_SUCCESS
  fi

  if [ "${OCF_RESKEY_lvs_support}" = 1 ]; then
    case $NIC in
      lo*)
        : Requested interface is on loopback
        return $OCF_SUCCESS;;
    esac
  fi

  # The status of delete_route is not checked; only the interface
  # removal decides the result.
  delete_route "$OCF_RESKEY_ip"
  delete_interface "$NIC" "$OCF_RESKEY_ip"
  rc=$?

  if [ "${OCF_RESKEY_lvs_support}" = 1 ]; then
    lvs_restore_loopback "$OCF_RESKEY_ip"
  fi

  # remove lock file...
  rm -f "$VLDIR/IPaddr-$NIC"

  if [ $rc != 0 ]; then
    ocf_log warn "IP Address $OCF_RESKEY_ip NOT released: rc=$rc"
    return $OCF_ERR_GENERIC
  fi
  return $OCF_SUCCESS
}


#
# Add an IP alias for the requested IP address...
#
# It could be that we already have taken it, in which case it should
# do nothing.
#
# Returns the result of a final ip_status_internal call once the alias was
# added, or an error code when no alias name could be reserved or the
# address could not be configured.
#

ip_start() {
  #
  # Do we already service this IP address?
  #
  ip_status_internal
  if [ $? = $OCF_SUCCESS ]; then
    # Nothing to do, the IP is already active
    return $OCF_SUCCESS;
  fi

  NIC_unique=`find_free_interface $OCF_RESKEY_nic`
  if [ -n "$NIC_unique" ]; then
    : OK got interface [$NIC_unique] for $OCF_RESKEY_ip
  else
    return $OCF_ERR_GENERIC
  fi

  # This logic is mostly to support LVS (If I understand it correctly)
  if [ "${OCF_RESKEY_lvs_support}" = 1 ]; then
    NIC_current=`find_interface $OCF_RESKEY_ip`
    case $NIC_unique in
      lo*)
        if [ x"$NIC_unique" = x"$NIC_current" ]; then
          # Its already "running" and not moving, nothing to do.
          ocf_log err "Could not find a non-loopback device to move $OCF_RESKEY_ip to"
          return $OCF_ERR_GENERIC
        fi;;
      *) lvs_remove_conflicting_loopback "$OCF_RESKEY_ip" "$NIC_current";;
    esac
  fi

  if [ ! -z "${OCF_RESKEY_local_start_script}" ]; then
    if [ -x "${OCF_RESKEY_local_start_script}" ]; then
      ${OCF_RESKEY_local_start_script} $*
    fi
  fi

  add_interface "$OCF_RESKEY_ip" "$OCF_RESKEY_nic" "$NIC_unique" \
    "$OCF_RESKEY_cidr_netmask" "$OCF_RESKEY_broadcast"
  rc=$?
  if [ $rc != 0 ]; then
    ocf_log err "Could not add $OCF_RESKEY_ip to $OCF_RESKEY_nic: rc=$rc"
    return $rc
  fi

  # The address is active, now notify others about it using sendarp

  if [ "$SYSTYPE" = "DarwinBSD" -a "$NIC_unique" = "lo0" ]; then
    # Darwin can't send ARPs on loopback devices
    SENDARP="x$SENDARP" # Prevent the binary from being found
  fi

  # Quoted on purpose: with an empty $SENDARP an unquoted "[ -x ]" is a
  # one-argument test and therefore true.
  if [ -x "$SENDARP" ]; then
    TARGET_INTERFACE=`echo $NIC_unique | sed 's%:.*%%'`
    SENDARPPIDFILE="$SENDARPPIDDIR/send_arp-$OCF_RESKEY_ip"

    ARGS="-i $OCF_RESKEY_ARP_INTERVAL_MS -r $OCF_RESKEY_ARP_REPEAT"
    ARGS="$ARGS -p $SENDARPPIDFILE $TARGET_INTERFACE $OCF_RESKEY_ip"
    ARGS="$ARGS auto $OCF_RESKEY_ip $OCF_RESKEY_ARP_NETMASK"

    ocf_log debug "Sending Gratuitous Arp for $OCF_RESKEY_ip on $NIC_unique [$TARGET_INTERFACE]"
    case $OCF_RESKEY_ARP_BACKGROUND in
      yes) ($SENDARP $ARGS || ocf_log err "Could not send gratuitous arps. rc=$?" & ) >&2 ;;
      *) $SENDARP $ARGS || ocf_log err "Could not send gratuitous arps. rc=$?";;
    esac
  fi

  ip_status_internal
  return $?
}

#
# ip_status_internal
#
#   Returns $OCF_NOT_RUNNING when no interface serves $OCF_RESKEY_ip (or,
#   with lvs_support, when it only lives on a lo* interface),
#   $OCF_ERR_GENERIC when it is served by a different base interface than
#   the configured $OCF_RESKEY_nic, and $OCF_SUCCESS otherwise.
#
ip_status_internal() {
  NIC=`find_interface "$OCF_RESKEY_ip"`

  if [ "x$NIC" = x ]; then
    return $OCF_NOT_RUNNING

  elif [ "${OCF_RESKEY_lvs_support}" = "1" ]; then
    case $NIC in
      lo*) return $OCF_NOT_RUNNING;;
      *)   return $OCF_SUCCESS;;
    esac
  else
    if [ "x$OCF_RESKEY_nic" != x ]; then
      # Compare base names only (text before the first colon).
      simple_OCF_NIC=`echo $OCF_RESKEY_nic | awk -F: '{print $1}'`
      simple_NIC=`echo $NIC | awk -F: '{print $1}'`
      if [ "$simple_OCF_NIC" != "$simple_NIC" ]; then
        ocf_log err "$OCF_RESKEY_ip is running an interface ($simple_NIC) instead of the configured one ($simple_OCF_NIC)"
        return $OCF_ERR_GENERIC
      fi
    fi
    return $OCF_SUCCESS
  fi
}

#
# ip_status
#
#   Prints "running", "stopped" or "unknown" according to the result of
#   ip_status_internal and returns that result unchanged.
#
ip_status() {
  ip_status_internal
  rc=$?
  if [ $rc = $OCF_SUCCESS ]; then
    echo "running"
  elif [ $rc = $OCF_NOT_RUNNING ]; then
    echo "stopped"
  else
    echo "unknown"
  fi
  return $rc;
}

#
# Determine if this IP address is really being served, or not.
+# Note that we must distinguish if *we're* serving it locally... +# +ip_monitor() { + ip_status_internal + rc=$? + + if [ $OCF_CHECK_LEVEL = 0 -o $rc != 0 ]; then + return $rc + fi + + ocf_log info "Checking IP stack" + + PINGARGS="`pingargs $OCF_RESKEY_ip`" + for j in 1 2 3 4 5 6 7 8 9 10; do + MSG=`$PING $PINGARGS 2>&1` + if [ $? = 0 ]; then + return $OCF_SUCCESS + fi + done + + ocf_log err "$MSG" + return $OCF_ERR_GENERIC +} + +is_positive_integer() { + ocf_is_decimal $1 && [ $1 -ge 1 ] + if [ $? = 0 ]; then + return 1 + fi + return 0 +} + +ip_validate_all() { + check_binary $AWK + check_binary $IFCONFIG + check_binary $ROUTE + check_binary $PING + + if is_positive_integer $OCF_RESKEY_ARP_INTERVAL_MS + then + ocf_log err "Invalid parameter value: ARP_INTERVAL_MS [$OCF_RESKEY_ARP_INTERVAL_MS]" + return $OCF_ERR_ARGS + fi + + if is_positive_integer $OCF_RESKEY_ARP_REPEAT + then + ocf_log err "Invalid parameter value: ARP_REPEAT [$OCF_RESKEY_ARP_REPEAT]" + return $OCF_ERR_ARGS + fi + + if [ "$SYSTYPE" = "Linux" -o "$SYSTYPE" = "SunOS" ]; then + : + else + if [ "${OCF_RESKEY_lvs_support}" = "1" ]; then + ocf_log err "$SYSTYPE does not support LVS" + return $OCF_ERR_GENERIC + fi + fi + + case $OCF_RESKEY_ip in + "") ocf_log err "Required parameter OCF_RESKEY_ip is missing" + return $OCF_ERR_CONFIGURED;; + [0-9]*.[0-9]*.[0-9]*.*[0-9]) : OK;; + *) ocf_log err "Parameter OCF_RESKEY_ip [$OCF_RESKEY_ip] not an IP address" + return $OCF_ERR_CONFIGURED;; + esac + + # Unconditionally do this? + case $OCF_RESKEY_nic in + *:*) + OCF_RESKEY_nic=`echo $OCF_RESKEY_nic | sed 's/:.*//'` + ;; + esac + + NICINFO=`$FINDIF` + rc=$? + + if [ $rc != 0 ]; then + ocf_log err "$FINDIF failed [rc=$rc]." 
+ return $OCF_ERR_GENERIC + fi + + tmp=`echo "$NICINFO" | cut -f1` + if + [ "x$OCF_RESKEY_nic" = "x" ] + then + ocf_log info "Using calculated nic for ${OCF_RESKEY_ip}: $tmp" + OCF_RESKEY_nic=$tmp + elif + [ x$tmp != x${OCF_RESKEY_nic} ] + then + ocf_log err "Invalid parameter value: nic [$OCF_RESKEY_nic] Calculated nic: [$tmp]" + return $OCF_ERR_ARGS + fi + + tmp=`echo "$NICINFO" | cut -f2 | cut -d ' ' -f2` + if + [ "x$OCF_RESKEY_cidr_netmask" != "x$tmp" ] + then + ocf_log info "Using calculated netmask for ${OCF_RESKEY_ip}: $tmp" + fi + + # Always use the calculated version becuase it might have been specified + # using CIDR notation which not every system accepts + OCF_RESKEY_netmask=$tmp + OCF_RESKEY_cidr_netmask=$tmp; export OCF_RESKEY_cidr_netmask + + tmp=`echo "$NICINFO" | cut -f3 | cut -d ' ' -f2` + if + [ "x$OCF_RESKEY_broadcast" = "x" ] + then + ocf_log debug "Using calculated broadcast for ${OCF_RESKEY_ip}: $tmp" + OCF_RESKEY_broadcast=$tmp + + elif [ x$tmp != x${OCF_RESKEY_broadcast} ]; then + ocf_log err "Invalid parameter value: broadcast [$OCF_RESKEY_broadcast] Calculated broadcast: [$tmp]" + return $OCF_ERR_ARGS + fi + + return $OCF_SUCCESS +} + +usage() { + echo $USAGE >&2 + return $1 +} + +if [ $# -ne 1 ]; then + usage $OCF_ERR_ARGS +fi + +# Normalize the value of lvs_support +if [ "${OCF_RESKEY_lvs_support}" = "true" \ + -o "${OCF_RESKEY_lvs_support}" = "on" \ + -o "${OCF_RESKEY_lvs_support}" = "yes" \ + -o "${OCF_RESKEY_lvs_support}" = "1" ]; then + OCF_RESKEY_lvs_support=1 +else + OCF_RESKEY_lvs_support=0 +fi + +# Note: We had a version out there for a while which used +# netmask instead of cidr_netmask. So, don't remove this aliasing code! +if + [ ! 
-z "$OCF_RESKEY_netmask" -a -z "$OCF_RESKEY_cidr_netmask" ] +then + OCF_RESKEY_cidr_netmask=$OCF_RESKEY_netmask + export OCF_RESKEY_cidr_netmask +fi + +case $1 in + meta-data) meta_data;; + start) ip_validate_all && ip_start;; + stop) ip_stop;; + status) ip_status;; + monitor) ip_monitor;; + validate-all) ip_validate_all;; + usage) usage $OCF_SUCCESS;; + *) usage $OCF_ERR_UNIMPLEMENTED;; +esac + +exit $? diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2 new file mode 100755 index 0000000..97a7431 --- /dev/null +++ b/heartbeat/IPaddr2 @@ -0,0 +1,1357 @@ +#!/bin/sh +# +# $Id: IPaddr2.in,v 1.24 2006/08/09 13:01:54 lars Exp $ +# +# OCF Resource Agent compliant IPaddr2 script. +# +# Based on work by Tuomo Soini, ported to the OCF RA API by Lars +# Marowsky-Brée. Implements Cluster Alias IP functionality too. +# +# Cluster Alias IP cleanup, fixes and testing by Michael Schwartzkopff +# +# +# Copyright (c) 2003 Tuomo Soini +# Copyright (c) 2004-2006 SUSE LINUX AG, Lars Marowsky-Brée +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# + + +# TODO: +# - There ought to be an ocf_run_cmd function which does all logging, +# timeout handling etc for us +# - Make this the standard IP address agent on Linux; the other +# platforms simply should ignore the additional parameters OR can use +# the legacy heartbeat resource script... +# - Check LVS <-> clusterip incompatibilities. +# +# OCF parameters are as below +# OCF_RESKEY_ip +# OCF_RESKEY_broadcast +# OCF_RESKEY_nic +# OCF_RESKEY_cidr_netmask +# OCF_RESKEY_iflabel +# OCF_RESKEY_mac +# OCF_RESKEY_clusterip_hash +# OCF_RESKEY_arp_interval +# OCF_RESKEY_arp_count +# OCF_RESKEY_arp_bg +# OCF_RESKEY_preferred_lft +# +# OCF_RESKEY_CRM_meta_clone +# OCF_RESKEY_CRM_meta_clone_max + + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +. 
${OCF_FUNCTIONS_DIR}/findif.sh + +# Defaults +OCF_RESKEY_ip_default="" +OCF_RESKEY_cidr_netmask_default="" +OCF_RESKEY_broadcast_default="" +OCF_RESKEY_iflabel_default="" +OCF_RESKEY_cidr_netmask_default="" +OCF_RESKEY_lvs_support_default=false +OCF_RESKEY_lvs_ipv6_addrlabel_default=false +OCF_RESKEY_lvs_ipv6_addrlabel_value_default=99 +OCF_RESKEY_clusterip_hash_default="sourceip-sourceport" +OCF_RESKEY_mac_default="" +OCF_RESKEY_unique_clone_address_default=false +OCF_RESKEY_arp_interval_default=200 +OCF_RESKEY_arp_count_default=5 +OCF_RESKEY_arp_count_refresh_default=0 +OCF_RESKEY_arp_bg_default="" +OCF_RESKEY_arp_sender_default="" +OCF_RESKEY_send_arp_opts_default="" +OCF_RESKEY_flush_routes_default="false" +OCF_RESKEY_run_arping_default=false +OCF_RESKEY_nodad_default=false +OCF_RESKEY_noprefixroute_default="false" +OCF_RESKEY_preferred_lft_default="forever" +OCF_RESKEY_network_namespace_default="" + +: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}} +: ${OCF_RESKEY_cidr_netmask=${OCF_RESKEY_cidr_netmask_default}} +: ${OCF_RESKEY_broadcast=${OCF_RESKEY_broadcast_default}} +: ${OCF_RESKEY_iflabel=${OCF_RESKEY_iflabel_default}} +: ${OCF_RESKEY_lvs_support=${OCF_RESKEY_lvs_support_default}} +: ${OCF_RESKEY_lvs_ipv6_addrlabel=${OCF_RESKEY_lvs_ipv6_addrlabel_default}} +: ${OCF_RESKEY_lvs_ipv6_addrlabel_value=${OCF_RESKEY_lvs_ipv6_addrlabel_value_default}} +: ${OCF_RESKEY_clusterip_hash=${OCF_RESKEY_clusterip_hash_default}} +: ${OCF_RESKEY_mac=${OCF_RESKEY_mac_default}} +: ${OCF_RESKEY_unique_clone_address=${OCF_RESKEY_unique_clone_address_default}} +: ${OCF_RESKEY_arp_interval=${OCF_RESKEY_arp_interval_default}} +: ${OCF_RESKEY_arp_count=${OCF_RESKEY_arp_count_default}} +: ${OCF_RESKEY_arp_count_refresh=${OCF_RESKEY_arp_count_refresh_default}} +: ${OCF_RESKEY_arp_bg=${OCF_RESKEY_arp_bg_default}} +: ${OCF_RESKEY_arp_sender=${OCF_RESKEY_arp_sender_default}} +: ${OCF_RESKEY_send_arp_opts=${OCF_RESKEY_send_arp_opts_default}} +: 
${OCF_RESKEY_flush_routes=${OCF_RESKEY_flush_routes_default}} +: ${OCF_RESKEY_run_arping=${OCF_RESKEY_run_arping_default}} +: ${OCF_RESKEY_nodad=${OCF_RESKEY_nodad_default}} +: ${OCF_RESKEY_noprefixroute=${OCF_RESKEY_noprefixroute_default}} +: ${OCF_RESKEY_preferred_lft=${OCF_RESKEY_preferred_lft_default}} +: ${OCF_RESKEY_network_namespace=${OCF_RESKEY_network_namespace_default}} + +####################################################################### + +SENDARP=$HA_BIN/send_arp +SENDUA=$HA_BIN/send_ua +FINDIF=findif +VLDIR=$HA_RSCTMP +SENDARPPIDDIR=$HA_RSCTMP +CIP_lockfile=$HA_RSCTMP/IPaddr2-CIP-${OCF_RESKEY_ip} + +IPADDR2_CIP_IPTABLES=$IPTABLES + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="IPaddr2" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This Linux-specific resource manages IP alias IP addresses. +It can add an IP alias, or remove one. +In addition, it can implement Cluster Alias IP functionality +if invoked as a clone resource. + +If used as a clone, "shared address with a trivial, stateless +(autonomous) load-balancing/mutual exclusion on ingress" mode gets +applied (as opposed to "assume resource uniqueness" mode otherwise). +For that, Linux firewall (kernel and userspace) is assumed, and since +recent distributions are ambivalent in plain "iptables" command to +particular back-end resolution, "iptables-legacy" (when present) gets +prioritized so as to avoid incompatibilities (note that respective +ipt_CLUSTERIP firewall extension in use here is, at the same time, +marked deprecated, yet said "legacy" layer can make it workable, +literally, to this day) with "netfilter" one (as in "iptables-nft"). +In that case, you should explicitly set clone-node-max >= 2, +and/or clone-max < number of nodes. In case of node failure, +clone instances need to be re-allocated on surviving nodes. 
+This would not be possible if there is already an instance +on those nodes, and clone-node-max=1 (which is the default). + +When the specified IP address gets assigned to a respective interface, the +resource agent sends unsolicited ARP (Address Resolution Protocol, IPv4) or NA +(Neighbor Advertisement, IPv6) packets to inform neighboring machines about the +change. This functionality is controlled for both IPv4 and IPv6 by shared +'arp_*' parameters. +</longdesc> + +<shortdesc lang="en">Manages virtual IPv4 and IPv6 addresses (Linux specific version)</shortdesc> + +<parameters> +<parameter name="ip" unique="1" required="1"> +<longdesc lang="en"> +The IPv4 (dotted quad notation) or IPv6 address (colon hexadecimal notation) +example IPv4 "192.168.1.1". +example IPv6 "2001:db8:DC28:0:0:FC57:D4C8:1FFF". +</longdesc> +<shortdesc lang="en">IPv4 or IPv6 address</shortdesc> +<content type="string" default="${OCF_RESKEY_ip_default}" /> +</parameter> +<parameter name="nic" unique="0"> +<longdesc lang="en"> +The base network interface on which the IP address will be brought +online. +If left empty, the script will try and determine this from the +routing table. + +Do NOT specify an alias interface in the form eth0:1 or anything here; +rather, specify the base interface only. +If you want a label, see the iflabel parameter. + +Prerequisite: + +There must be at least one static IP address, which is not managed by +the cluster, assigned to the network interface. +If you can not assign any static IP address on the interface, +modify this kernel parameter: + +sysctl -w net.ipv4.conf.all.promote_secondaries=1 # (or per device) +</longdesc> +<shortdesc lang="en">Network interface</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="cidr_netmask"> +<longdesc lang="en"> +The netmask for the interface in CIDR format +(e.g., 24 and not 255.255.255.0) + +If unspecified, the script will also try to determine this from the +routing table. 
+</longdesc> +<shortdesc lang="en">CIDR netmask</shortdesc> +<content type="string" default="${OCF_RESKEY_cidr_netmask_default}"/> +</parameter> + +<parameter name="broadcast"> +<longdesc lang="en"> +Broadcast address associated with the IP. It is possible to use the +special symbols '+' and '-' instead of the broadcast address. In this +case, the broadcast address is derived by setting/resetting the host +bits of the interface prefix. +</longdesc> +<shortdesc lang="en">Broadcast address</shortdesc> +<content type="string" default="${OCF_RESKEY_broadcast_default}"/> +</parameter> + +<parameter name="iflabel"> +<longdesc lang="en"> +You can specify an additional label for your IP address here. +This label is appended to your interface name. + +The kernel allows alphanumeric labels up to a maximum length of 15 +characters including the interface name and colon (e.g. eth0:foobar1234) + +A label can be specified in nic parameter but it is deprecated. +If a label is specified in nic name, this parameter has no effect. +</longdesc> +<shortdesc lang="en">Interface label</shortdesc> +<content type="string" default="${OCF_RESKEY_iflabel_default}"/> +</parameter> + +<parameter name="lvs_support"> +<longdesc lang="en"> +Enable support for LVS Direct Routing configurations. In case a IP +address is stopped, only move it to the loopback device to allow the +local node to continue to service requests, but no longer advertise it +on the network. + +Notes for IPv6: +It is not necessary to enable this option on IPv6. +Instead, enable 'lvs_ipv6_addrlabel' option for LVS-DR usage on IPv6. +</longdesc> +<shortdesc lang="en">Enable support for LVS DR</shortdesc> +<content type="boolean" default="${OCF_RESKEY_lvs_support_default}"/> +</parameter> + +<parameter name="lvs_ipv6_addrlabel"> +<longdesc lang="en"> +Enable adding IPv6 address label so IPv6 traffic originating from +the address's interface does not use this address as the source. 
+This is necessary for LVS-DR health checks to realservers to work. Without it, +the most recently added IPv6 address (probably the address added by IPaddr2) +will be used as the source address for IPv6 traffic from that interface and +since that address exists on loopback on the realservers, the realserver +response to pings/connections will never leave its loopback. +See RFC3484 for the detail of the source address selection. + +See also 'lvs_ipv6_addrlabel_value' parameter. +</longdesc> +<shortdesc lang="en">Enable adding IPv6 address label.</shortdesc> +<content type="boolean" default="${OCF_RESKEY_lvs_ipv6_addrlabel_default}"/> +</parameter> + +<parameter name="lvs_ipv6_addrlabel_value"> +<longdesc lang="en"> +Specify IPv6 address label value used when 'lvs_ipv6_addrlabel' is enabled. +The value should be an unused label in the policy table +which is shown by 'ip addrlabel list' command. +You would rarely need to change this parameter. +</longdesc> +<shortdesc lang="en">IPv6 address label value.</shortdesc> +<content type="integer" default="${OCF_RESKEY_lvs_ipv6_addrlabel_value_default}"/> +</parameter> + +<parameter name="mac"> +<longdesc lang="en"> +Set the interface MAC address explicitly. Currently only used in case of +the Cluster IP Alias. Leave empty to chose automatically. + +</longdesc> +<shortdesc lang="en">Cluster IP MAC address</shortdesc> +<content type="string" default="${OCF_RESKEY_mac_default}"/> +</parameter> + +<parameter name="clusterip_hash"> +<longdesc lang="en"> +Specify the hashing algorithm used for the Cluster IP functionality. 
+ +</longdesc> +<shortdesc lang="en">Cluster IP hashing function</shortdesc> +<content type="string" default="${OCF_RESKEY_clusterip_hash_default}"/> +</parameter> + +<parameter name="unique_clone_address"> +<longdesc lang="en"> +If true, add the clone ID to the supplied value of IP to create +a unique address to manage +</longdesc> +<shortdesc lang="en">Create a unique address for cloned instances</shortdesc> +<content type="boolean" default="${OCF_RESKEY_unique_clone_address_default}"/> +</parameter> + +<parameter name="arp_interval"> +<longdesc lang="en"> +Specify the interval between unsolicited ARP (IPv4) or NA (IPv6) packets in +milliseconds. + +This parameter is deprecated and used for the backward compatibility only. +It is effective only for the send_arp binary which is built with libnet, +and send_ua for IPv6. It has no effect for other arp_sender. +</longdesc> +<shortdesc lang="en">ARP/NA packet interval in ms (deprecated)</shortdesc> +<content type="integer" default="${OCF_RESKEY_arp_interval_default}"/> +</parameter> + +<parameter name="arp_count"> +<longdesc lang="en"> +Number of unsolicited ARP (IPv4) or NA (IPv6) packets to send at resource +initialization. +</longdesc> +<shortdesc lang="en">ARP/NA packet count sent during initialization</shortdesc> +<content type="integer" default="${OCF_RESKEY_arp_count_default}"/> +</parameter> + +<parameter name="arp_count_refresh"> +<longdesc lang="en"> +For IPv4, number of unsolicited ARP packets to send during resource monitoring. +Doing so helps mitigate issues of stuck ARP caches resulting from split-brain +situations. +</longdesc> +<shortdesc lang="en">ARP packet count sent during monitoring</shortdesc> +<content type="integer" default="${OCF_RESKEY_arp_count_refresh_default}"/> +</parameter> + +<parameter name="arp_bg"> +<longdesc lang="en"> +Whether or not to send the ARP (IPv4) or NA (IPv6) packets in the background. +The default is true for IPv4 and false for IPv6. 
+</longdesc> +<shortdesc lang="en">ARP/NA from background</shortdesc> +<content type="string" default="${OCF_RESKEY_arp_bg_default}"/> +</parameter> + +<parameter name="arp_sender"> +<longdesc lang="en"> +For IPv4, the program to send ARP packets with on start. Available options are: + - send_arp: default + - ipoibarping: default for infiniband interfaces if ipoibarping is available + - iputils_arping: use arping in iputils package + - libnet_arping: use another variant of arping based on libnet +</longdesc> +<shortdesc lang="en">ARP sender</shortdesc> +<content type="string" default="${OCF_RESKEY_arp_sender_default}"/> +</parameter> + +<parameter name="send_arp_opts"> +<longdesc lang="en"> +For IPv4, extra options to pass to the arp_sender program. +Available options are vary depending on which arp_sender is used. + +A typical use case is specifying '-A' for iputils_arping to use +ARP REPLY instead of ARP REQUEST as Gratuitous ARPs. +</longdesc> +<shortdesc lang="en">Options for ARP sender</shortdesc> +<content type="string" default="${OCF_RESKEY_send_arp_opts_default}"/> +</parameter> + +<parameter name="flush_routes"> +<longdesc lang="en"> +Flush the routing table on stop. This is for +applications which use the cluster IP address +and which run on the same physical host that the +IP address lives on. The Linux kernel may force that +application to take a shortcut to the local loopback +interface, instead of the interface the address +is really bound to. Under those circumstances, an +application may, somewhat unexpectedly, continue +to use connections for some time even after the +IP address is deconfigured. Set this parameter in +order to immediately disable said shortcut when the +IP address goes away. 
+</longdesc> +<shortdesc lang="en">Flush kernel routing table on stop</shortdesc> +<content type="boolean" default="${OCF_RESKEY_flush_routes_default}"/> +</parameter> + +<parameter name="run_arping"> +<longdesc lang="en"> +For IPv4, whether or not to run arping for collision detection check. +</longdesc> +<shortdesc lang="en">Run arping for IPv4 collision detection check</shortdesc> +<content type="string" default="${OCF_RESKEY_run_arping_default}"/> +</parameter> + +<parameter name="nodad"> +<longdesc lang="en"> +For IPv6, do not perform Duplicate Address Detection when adding the address. +</longdesc> +<shortdesc lang="en">Use nodad flag</shortdesc> +<content type="string" default="${OCF_RESKEY_nodad_default}"/> +</parameter> + +<parameter name="noprefixroute"> +<longdesc lang="en"> +Use noprefixroute flag (see 'man ip-address'). +</longdesc> +<shortdesc lang="en">Use noprefixroute flag</shortdesc> +<content type="string" default="${OCF_RESKEY_noprefixroute_default}"/> +</parameter> + +<parameter name="preferred_lft"> +<longdesc lang="en"> +For IPv6, set the preferred lifetime of the IP address. +This can be used to ensure that the created IP address will not +be used as a source address for routing. +Expects a value as specified in section 5.5.4 of RFC 4862. +</longdesc> +<shortdesc lang="en">IPv6 preferred lifetime</shortdesc> +<content type="string" default="${OCF_RESKEY_preferred_lft_default}"/> +</parameter> + +<parameter name="network_namespace"> +<longdesc lang="en"> +Specifies the network namespace to operate within. +The namespace must already exist, and the interface to be used must be within +the namespace. 
+</longdesc> +<shortdesc lang="en">Network namespace to use</shortdesc> +<content type="string" default="${OCF_RESKEY_network_namespace_default}"/> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="status" depth="0" timeout="20s" interval="10s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="20s" /> +</actions> +</resource-agent> +END + + exit $OCF_SUCCESS +} + +ip_init() { + local rc + + if [ X`uname -s` != "XLinux" ]; then + ocf_exit_reason "IPaddr2 only supported Linux." + exit $OCF_ERR_INSTALLED + fi + + if [ X"$OCF_RESKEY_ip" = "X" ] && [ "$__OCF_ACTION" != "stop" ]; then + ocf_exit_reason "IP address (the ip parameter) is mandatory" + exit $OCF_ERR_CONFIGURED + fi + + if + case $__OCF_ACTION in + start|stop) ocf_is_root;; + *) true;; + esac + then + : YAY! + else + ocf_exit_reason "You must be root for $__OCF_ACTION operation." + exit $OCF_ERR_PERM + fi + + BASEIP="$OCF_RESKEY_ip" + BRDCAST="$OCF_RESKEY_broadcast" + NIC="$OCF_RESKEY_nic" + # Note: We had a version out there for a while which used + # netmask instead of cidr_netmask. Don't remove this aliasing code! + if + [ ! 
-z "$OCF_RESKEY_netmask" -a -z "$OCF_RESKEY_cidr_netmask" ] + then + OCF_RESKEY_cidr_netmask=$OCF_RESKEY_netmask + export OCF_RESKEY_cidr_netmask + fi + NETMASK="$OCF_RESKEY_cidr_netmask" + IFLABEL="$OCF_RESKEY_iflabel" + IF_MAC="$OCF_RESKEY_mac" + + IP_INC_GLOBAL=${OCF_RESKEY_CRM_meta_clone_max:-1} + IP_INC_NO=`expr ${OCF_RESKEY_CRM_meta_clone:-0} + 1` + + if ocf_is_true ${OCF_RESKEY_lvs_support} && [ $IP_INC_GLOBAL -gt 1 ]; then + ocf_exit_reason "LVS and load sharing do not go together well" + exit $OCF_ERR_CONFIGURED + fi + + if ocf_is_decimal "$IP_INC_GLOBAL" && [ $IP_INC_GLOBAL -gt 0 ]; then + : + else + ocf_exit_reason "Invalid meta-attribute clone_max [$IP_INC_GLOBAL], should be positive integer" + exit $OCF_ERR_CONFIGURED + fi + + echo $OCF_RESKEY_ip | grep -qs ":" + if [ $? -ne 0 ];then + FAMILY=inet + if ocf_is_true $OCF_RESKEY_lvs_ipv6_addrlabel ;then + ocf_exit_reason "IPv4 does not support lvs_ipv6_addrlabel" + exit $OCF_ERR_CONFIGURED + fi + if [ -z "$OCF_RESKEY_arp_bg" ]; then + OCF_RESKEY_arp_bg=true + fi + else + FAMILY=inet6 + # address sanitization defined in RFC5952 + SANITIZED_IP=$($IP2UTIL route get $OCF_RESKEY_ip 2> /dev/null | awk '$1~/:/ {print $1} $2~/:/ {print $2}') + if [ -n "$SANITIZED_IP" ]; then + OCF_RESKEY_ip="$SANITIZED_IP" + fi + + if ocf_is_true $OCF_RESKEY_lvs_support ;then + ocf_exit_reason "The IPv6 does not support lvs_support" + exit $OCF_ERR_CONFIGURED + fi + if ocf_is_true $OCF_RESKEY_lvs_ipv6_addrlabel ;then + if ocf_is_decimal "$OCF_RESKEY_lvs_ipv6_addrlabel_value" && [ $OCF_RESKEY_lvs_ipv6_addrlabel_value -ge 0 ]; then + : + else + ocf_exit_reason "Invalid lvs_ipv6_addrlabel_value [$OCF_RESKEY_lvs_ipv6_addrlabel_value], should be positive integer" + exit $OCF_ERR_CONFIGURED + fi + fi + if [ -z "$OCF_RESKEY_arp_bg" ]; then + OCF_RESKEY_arp_bg=false + fi + fi + + # support nic:iflabel format in nic parameter + case $NIC in + *:*) + IFLABEL=`echo $NIC | sed 's/[^:]*://'` + NIC=`echo $NIC | sed 's/:.*//'` + # only the base 
name should be passed to findif + OCF_RESKEY_nic=$NIC + ;; + esac + + # $FINDIF takes its parameters from the environment + # + NICINFO=`$FINDIF` + rc=$? + if + [ $rc -eq 0 ] + then + NICINFO=`echo "$NICINFO" | sed -e 's/netmask\ //;s/broadcast\ //'` + NIC=`echo "$NICINFO" | cut -d" " -f1` + NETMASK=`echo "$NICINFO" | cut -d" " -f2` + BRDCAST=`echo "$NICINFO" | cut -d" " -f3` + else + # findif couldn't find the interface + if ocf_is_probe; then + ocf_log info "[$FINDIF] failed" + exit $OCF_NOT_RUNNING + elif [ "$__OCF_ACTION" = stop ]; then + ocf_log warn "[$FINDIF] failed" + exit $OCF_SUCCESS + else + ocf_exit_reason "[$FINDIF] failed" + exit $rc + fi + fi + + SENDARPPIDFILE="$SENDARPPIDDIR/send_arp-$OCF_RESKEY_ip" + + if [ -n "$IFLABEL" ]; then + IFLABEL=${NIC}:${IFLABEL} + if [ ${#IFLABEL} -gt 15 ]; then + ocf_exit_reason "Interface label [$IFLABEL] exceeds maximum character limit of 15" + exit $OCF_ERR_CONFIGURED + fi + fi + + if [ "$IP_INC_GLOBAL" -gt 1 ] && ! ocf_is_true "$OCF_RESKEY_unique_clone_address"; then + IP_CIP="yes" + IP_CIP_HASH="${OCF_RESKEY_clusterip_hash}" + if [ -z "$IF_MAC" ]; then + # Choose a MAC + # 1. Concatenate some input together + # 2. This doesn't need to be a cryptographically + # secure hash. + # 3. Drop everything after the first 6 octets (12 chars) + # 4. Delimit the octets with ':' + # 5. Make sure the first octet is odd, + # so the result is a multicast MAC + IF_MAC=`echo $OCF_RESKEY_ip $NETMASK $BRDCAST | \ + md5sum | \ + sed -e 's#\(............\).*#\1#' \ + -e 's#..#&:#g; s#:$##' \ + -e 's#^\(.\)[02468aAcCeE]#\11#'` + fi + IP_CIP_FILE="/proc/net/ipt_CLUSTERIP/$OCF_RESKEY_ip" + fi +} + +# +# Find out which interfaces serve the given IP address and netmask. +# The arguments are an IP address and a netmask. +# Its output are interface names devided by spaces (e.g., "eth0 eth1"). 
+# +find_interface() { + local ipaddr="$1" + local netmask="$2" + + # + # List interfaces but exclude FreeS/WAN ipsecN virtual interfaces + # + local iface="`$IP2UTIL -o -f $FAMILY addr show \ + | grep "\ $ipaddr/$netmask" \ + | cut -d ' ' -f2 \ + | grep -v '^ipsec[0-9][0-9]*$'`" + + echo "$iface" + return 0 +} + +# +# Delete an interface +# +delete_interface () { + ipaddr="$1" + iface="$2" + netmask="$3" + + CMD="$IP2UTIL -f $FAMILY addr delete $ipaddr/$netmask dev $iface" + + ocf_run $CMD || return $OCF_ERR_GENERIC + + if ocf_is_true $OCF_RESKEY_flush_routes; then + ocf_run $IP2UTIL route flush cache + fi + + if [ "$FAMILY" = "inet6" ] && ocf_is_true $OCF_RESKEY_lvs_ipv6_addrlabel ;then + delete_ipv6_addrlabel $ipaddr + fi + + return $OCF_SUCCESS +} + +# +# Add an interface +# +add_interface () { + local cmd msg extra_opts ipaddr netmask broadcast iface label + + ipaddr="$1" + netmask="$2" + broadcast="$3" + iface="$4" + label="$5" + + if [ "$FAMILY" = "inet" ] && ocf_is_true $OCF_RESKEY_run_arping && + check_binary arping; then + arping -q -c 2 -w 3 -D -I $iface $ipaddr + if [ $? = 1 ]; then + ocf_log err "IPv4 address collision $ipaddr [DAD]" + return $OCF_ERR_GENERIC + fi + fi + + if [ "$FAMILY" = "inet6" ] && ocf_is_true $OCF_RESKEY_lvs_ipv6_addrlabel ;then + add_ipv6_addrlabel $ipaddr + fi + + cmd="$IP2UTIL -f $FAMILY addr add $ipaddr/$netmask dev $iface" + msg="Adding $FAMILY address $ipaddr/$netmask to device $iface" + if [ "$broadcast" != "none" ]; then + cmd="$IP2UTIL -f $FAMILY addr add $ipaddr/$netmask brd $broadcast dev $iface" + msg="Adding $FAMILY address $ipaddr/$netmask with broadcast address $broadcast to device $iface" + fi + + extra_opts="" + if [ "$FAMILY" = "inet6" ] && ocf_is_true "${OCF_RESKEY_nodad}"; then + extra_opts="$extra_opts nodad" + fi + + if ocf_is_true "${OCF_RESKEY_noprefixroute}"; then + extra_opts="$extra_opts noprefixroute" + fi + + if [ ! 
-z "$label" ]; then + extra_opts="$extra_opts label $label" + fi + if [ "$FAMILY" = "inet6" ] ;then + extra_opts="$extra_opts preferred_lft $OCF_RESKEY_preferred_lft" + fi + if [ -n "$extra_opts" ]; then + cmd="$cmd$extra_opts" + msg="$msg (with$extra_opts)" + fi + + ocf_log info "$msg" + ocf_run $cmd || return $OCF_ERR_GENERIC + + msg="Bringing device $iface up" + cmd="$IP2UTIL link set $iface up" + ocf_log info "$msg" + ocf_run $cmd || return $OCF_ERR_GENERIC + + return $OCF_SUCCESS +} + +# +# Delete a route +# +delete_route () { + prefix="$1" + iface="$2" + + CMD="$IP2UTIL route delete $prefix dev $iface" + + ocf_log info "$CMD" + $CMD + + return $? +} + +# On Linux systems the (hidden) loopback interface may +# conflict with the requested IP address. If so, this +# unoriginal code will remove the offending loopback address +# and save it in VLDIR so it can be added back in later +# when the IPaddr is released. +# +# TODO: This is very ugly and should be controlled by an additional +# instance parameter. Or even: multi-state, with the IP only being +# "active" on the master!? +# +remove_conflicting_loopback() { + ipaddr="$1" + netmask="$2" + broadcast="$3" + ifname="$4" + + ocf_log info "Removing conflicting loopback $ifname." + if + echo "$ipaddr $netmask $broadcast $ifname" > "$VLDIR/$ipaddr" + then + : Saved loopback information in $VLDIR/$ipaddr + else + ocf_log err "Could not save conflicting loopback $ifname." \ + "it will not be restored." + fi + delete_interface "$ipaddr" "$ifname" "$netmask" + # Forcibly remove the route (if it exists) to the loopback. + delete_route "$ipaddr" "$ifname" +} + +# +# On Linux systems the (hidden) loopback interface may +# need to be restored if it has been taken down previously +# by remove_conflicting_loopback() +# +restore_loopback() { + ipaddr="$1" + + if [ -s "$VLDIR/$ipaddr" ]; then + ifinfo=`cat "$VLDIR/$ipaddr"` + ocf_log info "Restoring loopback IP Address " \ + "$ifinfo." 
+ add_interface $ifinfo + rm -f "$VLDIR/$ipaddr" + fi +} + +add_ipv6_addrlabel() { + local cmd ipaddr value + ipaddr="$1" + value="$OCF_RESKEY_lvs_ipv6_addrlabel_value" + + cmd="$IP2UTIL addrlabel add prefix $ipaddr label $value" + ocf_log info "Adding IPv6 address label prefix $ipaddr label $value" + ocf_run $cmd || ocf_log warn "$cmd failed." +} + +delete_ipv6_addrlabel() { + local cmd ipaddr value + ipaddr="$1" + value="$OCF_RESKEY_lvs_ipv6_addrlabel_value" + + cmd="$IP2UTIL addrlabel del prefix $ipaddr label $value" + ocf_run $cmd # an error can be ignored +} + +is_infiniband() { + $IP2UTIL link show $NIC | grep link/infiniband >/dev/null +} + +log_arp_sender() { + local cmdline + local output + local rc + cmdline="$@" + + output=$($cmdline 2>&1) + rc=$? + if [ $rc -ne 0 ] && \ + [ "$ARP_SENDER" != "libnet_arping" ] ; then + # libnet_arping always return an error as no answers + ocf_log err "Could not send gratuitous arps: rc=$rc" + fi + ocf_log $LOGLEVEL "$output" +} + +# wrapper function to manage PID file to run arping in background +run_with_pidfile() { + local cmdline + local pid + local rc + + cmdline="$@" + + $cmdline & + pid=$! + echo "$pid" > $SENDARPPIDFILE + wait $pid + rc=$? 
+ rm -f $SENDARPPIDFILE + return $rc +} + +build_arp_sender_cmd() { + case "$ARP_SENDER" in + send_arp) + if [ "x$IP_CIP" = "xyes" ] ; then + if [ x = "x$IF_MAC" ] ; then + MY_MAC=auto + else + # send_arp.linux should return without doing anything in this case + MY_MAC=`echo ${IF_MAC} | sed -e 's/://g'` + fi + else + MY_MAC=auto + fi + + ARGS="$OCF_RESKEY_send_arp_opts -i $OCF_RESKEY_arp_interval -r $ARP_COUNT -p $SENDARPPIDFILE $NIC $OCF_RESKEY_ip $MY_MAC not_used not_used" + ARP_SENDER_CMD="$SENDARP $ARGS" + ;; + iputils_arping) + ARGS="$OCF_RESKEY_send_arp_opts -U -c $ARP_COUNT -I $NIC $OCF_RESKEY_ip" + ARP_SENDER_CMD="run_with_pidfile arping $ARGS" + ;; + libnet_arping) + ARGS="$OCF_RESKEY_send_arp_opts -U -c $ARP_COUNT -i $NIC -S $OCF_RESKEY_ip $OCF_RESKEY_ip" + ARP_SENDER_CMD="run_with_pidfile arping $ARGS" + ;; + ipoibarping) + ARGS="-q -c $ARP_COUNT -U -I $NIC $OCF_RESKEY_ip" + ARP_SENDER_CMD="ipoibarping $ARGS" + ;; + *) + # should not occur + ocf_exit_reason "unrecognized arp_sender value: $ARP_SENDER" + exit $OCF_ERR_GENERIC + ;; + esac +} + +# +# Send Unsolicited ARPs to update neighbor's ARP cache +# +run_arp_sender() { + if [ "x$1" = "xrefresh" ] ; then + ARP_COUNT=$OCF_RESKEY_arp_count_refresh + LOGLEVEL=debug + else + ARP_COUNT=$OCF_RESKEY_arp_count + LOGLEVEL=info + fi + if [ $ARP_COUNT -eq 0 ] ; then + return + fi + + # do not need to send Gratuitous ARPs in the Cluster IP configuration + # except send_arp.libnet binary to retain the old behavior + if [ "x$IP_CIP" = "xyes" ] && \ + [ "x$ARP_SENDER" != "xsend_arp" ] ; then + ocf_log info "Gratuitous ARPs are not sent in the Cluster IP configuration" + return + fi + + # prepare arguments for each arp sender program + # $ARP_SENDER_CMD should be set + build_arp_sender_cmd + + ocf_log $LOGLEVEL "$ARP_SENDER_CMD" + + if ocf_is_true $OCF_RESKEY_arp_bg; then + log_arp_sender $ARP_SENDER_CMD & + else + log_arp_sender $ARP_SENDER_CMD + fi +} + +log_send_ua() { + local cmdline + local output + local rc + + 
cmdline="$@" + output=$($cmdline 2>&1) + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "Could not send ICMPv6 Unsolicited Neighbor Advertisements: rc=$rc" + fi + ocf_log info "$output" + return $rc +} + +# +# Run send_ua to note send ICMPv6 Unsolicited Neighbor Advertisements. +# +run_send_ua() { + local i + + # Duplicate Address Detection [DAD] + # Kernel will flag the IP as 'tentative' until it ensured that + # there is no duplicates. + # If there is, it will flag it as 'dadfailed' + for i in $(seq 1 10); do + ipstatus=$($IP2UTIL -o -f $FAMILY addr show dev $NIC to $OCF_RESKEY_ip/$NETMASK) + case "$ipstatus" in + *dadfailed*) + ocf_log err "IPv6 address collision $OCF_RESKEY_ip [DAD]" + $IP2UTIL -f $FAMILY addr del dev $NIC $OCF_RESKEY_ip/$NETMASK + if [ $? -ne 0 ]; then + ocf_log err "Could not delete IPv6 address" + fi + return $OCF_ERR_GENERIC + ;; + *tentative*) + if [ $i -eq 10 ]; then + ocf_log warn "IPv6 address : DAD is still in tentative" + fi + ;; + *) + break + ;; + esac + sleep 1 + done + # Now the address should be usable + + ARGS="-i $OCF_RESKEY_arp_interval -c $OCF_RESKEY_arp_count $OCF_RESKEY_ip $NETMASK $NIC" + ocf_log info "$SENDUA $ARGS" + if ocf_is_true $OCF_RESKEY_arp_bg; then + log_send_ua $SENDUA $ARGS & + else + log_send_ua $SENDUA $ARGS + fi +} + +# Do we already serve this IP address on the given $NIC? 
+# +# returns: +# ok = served (for CIP: + hash bucket) +# partial = served and no hash bucket (CIP only) +# partial2 = served and no CIP iptables rule +# no = nothing +# +ip_served() { + if [ -z "$NIC" ]; then # no nic found or specified + echo "no" + return 0 + fi + + cur_nic="`find_interface $OCF_RESKEY_ip $NETMASK`" + + if [ -z "$cur_nic" ]; then + echo "no" + return 0 + fi + + if [ -z "$IP_CIP" ]; then + for i in $cur_nic; do + # only mark as served when on the same interfaces as $NIC + [ "$i" = "$NIC" ] || continue + echo "ok" + return 0 + done + # There used to be logic here to pretend "not served", + # if ${OCF_RESKEY_lvs_support} was enabled, and the IP was + # found active on "lo*" only. With lvs_support on, you should + # have NIC != lo, so thats already filtered + # by the continue above. + + echo "no" + return 0 + fi + + # Special handling for the CIP: + if [ ! -e $IP_CIP_FILE ]; then + echo "partial2" + return 0 + fi + if egrep -q "(^|,)${IP_INC_NO}(,|$)" $IP_CIP_FILE ; then + echo "ok" + return 0 + else + echo "partial" + return 0 + fi + + exit $OCF_ERR_GENERIC +} + +####################################################################### + +ip_usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +ip_start() { + if [ -z "$NIC" ]; then + ocf_exit_reason "No nic found or specified" + exit $OCF_ERR_CONFIGURED + fi + + if [ -n "$IP_CIP" ]; then + # Cluster IPs need special processing when the first bucket + # is added to the node... take a lock to make sure only one + # process executes that code + ocf_take_lock $CIP_lockfile + ocf_release_lock_on_exit $CIP_lockfile + fi + + # + # Do we already service this IP address on $NIC? 
+ # + local ip_status=`ip_served` + + if [ "$ip_status" = "ok" ]; then + exit $OCF_SUCCESS + fi + + if [ -n "$IP_CIP" ] && ([ $ip_status = "no" ] || [ $ip_status = "partial2" ]); then + $MODPROBE ip_conntrack + $IPADDR2_CIP_IPTABLES -I INPUT -d $OCF_RESKEY_ip -i $NIC -j CLUSTERIP \ + --new \ + --clustermac $IF_MAC \ + --total-nodes $IP_INC_GLOBAL \ + --local-node $IP_INC_NO \ + --hashmode $IP_CIP_HASH + if [ $? -ne 0 ]; then + ocf_exit_reason "iptables failed" + exit $OCF_ERR_GENERIC + fi + fi + + if [ -n "$IP_CIP" ] && [ $ip_status = "partial" ]; then + echo "+$IP_INC_NO" >$IP_CIP_FILE + fi + + if [ "$ip_status" = "no" ]; then + if ocf_is_true ${OCF_RESKEY_lvs_support}; then + for i in `find_interface $OCF_RESKEY_ip 32`; do + case $i in + lo*) + remove_conflicting_loopback $OCF_RESKEY_ip 32 255.255.255.255 lo + ;; + esac + done + fi + + add_interface $OCF_RESKEY_ip $NETMASK ${BRDCAST:-none} $NIC $IFLABEL + rc=$? + + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Failed to add $OCF_RESKEY_ip" + exit $rc + fi + fi + + case $NIC in + lo*) + : no need to run send_arp on loopback + ;; + *) + if [ $FAMILY = "inet" ];then + run_arp_sender + else + if [ -x $SENDUA ]; then + run_send_ua + if [ $? -ne 0 ]; then + ocf_exit_reason "run_send_ua failed." + exit $OCF_ERR_GENERIC + fi + fi + fi + ;; + esac + exit $OCF_SUCCESS +} + +ip_stop() { + local ip_del_if="yes" + if [ -n "$IP_CIP" ]; then + # Cluster IPs need special processing when the last bucket + # is removed from the node... take a lock to make sure only one + # process executes that code + ocf_take_lock $CIP_lockfile + ocf_release_lock_on_exit $CIP_lockfile + fi + + if [ -f "$SENDARPPIDFILE" ] ; then + kill `cat "$SENDARPPIDFILE"` + if [ $? 
-ne 0 ]; then + ocf_log warn "Could not kill previously running send_arp for $OCF_RESKEY_ip" + else + ocf_log info "killed previously running send_arp for $OCF_RESKEY_ip" + fi + rm -f "$SENDARPPIDFILE" + fi + local ip_status=`ip_served` + ocf_log info "IP status = $ip_status, IP_CIP=$IP_CIP" + + if [ $ip_status = "no" ]; then + : Requested interface not in use + exit $OCF_SUCCESS + fi + + if [ -n "$IP_CIP" ] && [ $ip_status != "partial2" ]; then + if [ $ip_status = "partial" ]; then + exit $OCF_SUCCESS + fi + echo "-$IP_INC_NO" >$IP_CIP_FILE + if [ "x$(cat $IP_CIP_FILE)" = "x" ]; then + ocf_log info $OCF_RESKEY_ip, $IP_CIP_HASH + i=1 + while [ $i -le $IP_INC_GLOBAL ]; do + ocf_log info $i + $IPADDR2_CIP_IPTABLES -D INPUT -d $OCF_RESKEY_ip -i $NIC -j CLUSTERIP \ + --new \ + --clustermac $IF_MAC \ + --total-nodes $IP_INC_GLOBAL \ + --local-node $i \ + --hashmode $IP_CIP_HASH + i=`expr $i + 1` + done + else + ip_del_if="no" + fi + fi + + if [ "$ip_del_if" = "yes" ]; then + delete_interface $OCF_RESKEY_ip $NIC $NETMASK + if [ $? -ne 0 ]; then + ocf_exit_reason "Unable to remove IP [${OCF_RESKEY_ip} from interface [ $NIC ]" + exit $OCF_ERR_GENERIC + fi + + if ocf_is_true ${OCF_RESKEY_lvs_support}; then + restore_loopback "$OCF_RESKEY_ip" + fi + fi + + exit $OCF_SUCCESS +} + +ip_monitor() { + # TODO: Implement more elaborate monitoring like checking for + # interface health maybe via a daemon like FailSafe etc... + + local ip_status=`ip_served` + case $ip_status in + ok) + run_arp_sender refresh + return $OCF_SUCCESS + ;; + partial|no|partial2) + exit $OCF_NOT_RUNNING + ;; + *) + # Errors on this interface? 
+ return $OCF_ERR_GENERIC + ;; + esac +} + +# make sure that we have something to send ARPs with +set_send_arp_program() { + ARP_SENDER=send_arp + if [ -n "$OCF_RESKEY_arp_sender" ]; then + case "$OCF_RESKEY_arp_sender" in + send_arp) + check_binary $SENDARP + ;; + iputils_arping) + check_binary arping + ;; + libnet_arping) + check_binary arping + ;; + ipoibarping) + check_binary ipoibarping + ;; + *) + ocf_exit_reason "unrecognized arp_sender value: $OCF_RESKEY_arp_sender" + exit $OCF_ERR_CONFIGURED + ;; + esac + ARP_SENDER="$OCF_RESKEY_arp_sender" + else + if is_infiniband; then + ARP_SENDER=ipoibarping + if ! have_binary ipoibarping; then + [ "$__OCF_ACTION" = start ] && + ocf_log warn "using send_arp for infiniband because ipoibarping is not available (set arp_sender to \"send_arp\" to suppress this message)" + check_binary $SENDARP + ARP_SENDER=send_arp + fi + fi + fi +} + +ip_validate() { + check_binary $IP2UTIL + IP_CIP= + + if [ -n "$OCF_RESKEY_network_namespace" ]; then + OCF_RESKEY_network_namespace= exec $IP2UTIL netns exec "$OCF_RESKEY_network_namespace" "$0" "$__OCF_ACTION" + fi + + ip_init + + set_send_arp_program + + if [ -n "$IP_CIP" ]; then + if have_binary "$IPTABLES_LEGACY"; then + IPADDR2_CIP_IPTABLES="$IPTABLES_LEGACY" + fi + check_binary "$IPADDR2_CIP_IPTABLES" + check_binary $MODPROBE + fi + +# $BASEIP, $NETMASK, $NIC , $IP_INC_GLOBAL, and $BRDCAST have been checked within ip_init, +# do not bother here. + + if ocf_is_true "$OCF_RESKEY_unique_clone_address" && + ! 
ocf_is_true "$OCF_RESKEY_CRM_meta_globally_unique"; then + ocf_exit_reason "unique_clone_address makes sense only with meta globally_unique set" + exit $OCF_ERR_CONFIGURED + fi + + if ocf_is_decimal "$OCF_RESKEY_arp_interval" && [ $OCF_RESKEY_arp_interval -gt 0 ]; then + : + else + ocf_exit_reason "Invalid OCF_RESKEY_arp_interval [$OCF_RESKEY_arp_interval]" + exit $OCF_ERR_CONFIGURED + fi + + if ocf_is_decimal "$OCF_RESKEY_arp_count" && [ $OCF_RESKEY_arp_count -gt 0 ]; then + : + else + ocf_exit_reason "Invalid OCF_RESKEY_arp_count [$OCF_RESKEY_arp_count]" + exit $OCF_ERR_CONFIGURED + fi + + if [ -z "$OCF_RESKEY_preferred_lft" ]; then + ocf_exit_reason "Empty value is invalid for OCF_RESKEY_preferred_lft" + exit $OCF_ERR_CONFIGURED + fi + + if [ -n "$IP_CIP" ]; then + + local valid=1 + + case $IP_CIP_HASH in + sourceip|sourceip-sourceport|sourceip-sourceport-destport) + ;; + *) + ocf_exit_reason "Invalid OCF_RESKEY_clusterip_hash [$IP_CIP_HASH]" + exit $OCF_ERR_CONFIGURED + ;; + esac + + if ocf_is_true ${OCF_RESKEY_lvs_support}; then + ocf_exit_reason "LVS and load sharing not advised to try" + exit $OCF_ERR_CONFIGURED + fi + + case $IF_MAC in + [0-9a-zA-Z][13579bBdDfF][!0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z][!0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z][!0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z][!0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z][!0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z]) + ;; + *) + valid=0 + ;; + esac + + if [ $valid -eq 0 ]; then + ocf_exit_reason "Invalid IF_MAC [$IF_MAC]" + exit $OCF_ERR_CONFIGURED + fi + + fi +} + +if ocf_is_true "$OCF_RESKEY_unique_clone_address"; then + prefix=`echo $OCF_RESKEY_ip | awk -F. '{print $1"."$2"."$3}'` + suffix=`echo $OCF_RESKEY_ip | awk -F. 
'{print $4}'` + suffix=`expr ${OCF_RESKEY_CRM_meta_clone:-0} + $suffix` + OCF_RESKEY_ip="$prefix.$suffix" +fi + +case $__OCF_ACTION in +meta-data) meta_data + ;; +usage|help) ip_usage + exit $OCF_SUCCESS + ;; +esac + +ip_validate + +case $__OCF_ACTION in +start) ip_start + ;; +stop) ip_stop + ;; +status) ip_status=`ip_served` + if [ $ip_status = "ok" ]; then + echo "running" + exit $OCF_SUCCESS + else + echo "stopped" + exit $OCF_NOT_RUNNING + fi + ;; +monitor) ip_monitor + ;; +validate-all) ;; +*) ip_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +# vi:sw=4:ts=8: diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr new file mode 100755 index 0000000..c732ce8 --- /dev/null +++ b/heartbeat/IPsrcaddr @@ -0,0 +1,631 @@ +#!/bin/sh +# +# Description: IPsrcaddr - Preferred source(/dest) address modification +# +# Author: John Sutton <john@scl.co.uk> +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# Copyright: SCL Internet +# +# Based on the IPaddr script. +# +# This script manages the preferred source address associated with +# packets which originate on the localhost and are routed through the +# matching route. By default, i.e. without the use of this script or +# similar, these packets will carry the IP of the primary i.e. the +# non-aliased interface. This can be a nuisance if you need to ensure +# that such packets carry the same IP irrespective of which host in +# a redundant cluster they actually originate from. +# +# It can add a preferred source address, or remove one. +# +# usage: IPsrcaddr {start|stop|status|monitor|validate-all|meta-data} +# +# The "start" arg adds a preferred source address. +# +# Surprisingly, the "stop" arg removes it. :-) +# +# NOTES: +# +# 1) There must be one and not more than 1 matching route! Mainly because +# I can't see why you should have more than one. 
And if there is more +# than one, we would have to box clever to find out which one is to be +# modified, or we would have to pass its identity as an argument. +# +# 2) The script depends on Alexey Kuznetsov's ip utility from the +# iproute aka iproute2 package. +# +# 3) No checking is done to see if the passed in IP address can +# reasonably be associated with the interface on which the default +# route exists. So unless you want to deliberately spoof your source IP, +# check it! Normally, I would expect that your haresources looks +# something like: +# +# nodename ip1 ip2 ... ipN IPsrcaddr::ipX +# +# where ipX is one of the ip1 to ipN. +# +# OCF parameters are as below: +# OCF_RESKEY_ipaddress + +####################################################################### +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +. ${OCF_FUNCTIONS_DIR}/findif.sh + +# Defaults +OCF_RESKEY_ipaddress_default="" +OCF_RESKEY_cidr_netmask_default="" +OCF_RESKEY_destination_default="0.0.0.0/0" +OCF_RESKEY_proto_default="" +OCF_RESKEY_metric_default="" +OCF_RESKEY_table_default="" + +: ${OCF_RESKEY_ipaddress=${OCF_RESKEY_ipaddress_default}} +: ${OCF_RESKEY_cidr_netmask=${OCF_RESKEY_cidr_netmask_default}} +: ${OCF_RESKEY_destination=${OCF_RESKEY_destination_default}} +: ${OCF_RESKEY_proto=${OCF_RESKEY_proto_default}} +: ${OCF_RESKEY_metric=${OCF_RESKEY_metric_default}} +: ${OCF_RESKEY_table=${OCF_RESKEY_table_default}} +####################################################################### + +[ -z "$OCF_RESKEY_proto" ] && PROTO="" || PROTO="proto $OCF_RESKEY_proto" +[ -z "$OCF_RESKEY_table" ] && TABLE="" || TABLE="table $OCF_RESKEY_table" + +USAGE="usage: $0 {start|stop|status|monitor|validate-all|meta-data}"; + + CMDSHOW="$IP2UTIL route show $TABLE to exact $OCF_RESKEY_destination" +CMDCHANGE="$IP2UTIL route change to " + +if [ "$OCF_RESKEY_destination" != "0.0.0.0/0" ]; then + CMDSHOW="$CMDSHOW src $OCF_RESKEY_ipaddress" +fi 
+ +if [ "$OCF_RESKEY_table" = "local" ]; then + TABLE="$TABLE local" +fi + +SYSTYPE="`uname -s`" + +usage() { + echo $USAGE >&2 +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="IPsrcaddr" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for IPsrcaddr. It manages the preferred source address +modification. + +Note: DHCP should not be enabled for the interface serving the preferred +source address. Enabling DHCP may result in unexpected behavior, such as +the automatic addition of duplicate or conflicting routes. This may +cause the IPsrcaddr resource to fail, or it may produce undesired +behavior while the resource continues to run. +</longdesc> +<shortdesc lang="en">Manages the preferred source address for outgoing IP packets</shortdesc> + +<parameters> +<parameter name="ipaddress" unique="0" required="1"> +<longdesc lang="en"> +The IP address. +</longdesc> +<shortdesc lang="en">IP address</shortdesc> +<content type="string" default="${OCF_RESKEY_ipaddress_default}" /> +</parameter> + +<parameter name="cidr_netmask"> +<longdesc lang="en"> +The netmask for the interface in CIDR format. (ie, 24), or in +dotted quad notation 255.255.255.0). +</longdesc> +<shortdesc lang="en">Netmask</shortdesc> +<content type="string" default="${OCF_RESKEY_cidr_netmask_default}"/> +</parameter> + +<parameter name="destination"> +<longdesc lang="en"> +The destination IP/subnet for the route (default: $OCF_RESKEY_destination_default) +</longdesc> +<shortdesc lang="en">Destination IP/subnet</shortdesc> +<content type="string" default="${OCF_RESKEY_destination_default}" /> +</parameter> + +<parameter name="proto"> +<longdesc lang="en"> +Proto to match when finding network. E.g. "kernel". +</longdesc> +<shortdesc lang="en">Proto</shortdesc> +<content type="string" default="${OCF_RESKEY_proto_default}" /> +</parameter> + +<parameter name="metric"> +<longdesc lang="en"> +Metric. 
Only needed if incorrect metric value is used. +</longdesc> +<shortdesc lang="en">Metric</shortdesc> +<content type="string" default="${OCF_RESKEY_metric_default}" /> +</parameter> + +<parameter name="table"> +<longdesc lang="en"> +Table to modify and use for interface lookup. E.g. "local". + +The table has to have a route matching the "destination" parameter. + +This can be used for policy based routing. See man ip-rule(8). +</longdesc> +<shortdesc lang="en">Table</shortdesc> +<content type="string" default="${OCF_RESKEY_table_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +errorexit() { + ocf_exit_reason "$*" + exit $OCF_ERR_GENERIC +} + +# +# We can distinguish 3 cases: no preferred source address, a +# preferred source address exists which matches that specified, and one +# exists but doesn't match that specified. srca_read() returns 1,0,2 +# respectively. +# +# The output of route show is something along the lines of: +# +# default via X.X.X.X dev eth1 src Y.Y.Y.Y +# +# where the src clause "src Y.Y.Y.Y" may or may not be present + +WS="[[:blank:]]" +OCTET="[0-9]\{1,3\}" +IPADDR="\($OCTET\.\)\{3\}$OCTET" +SRCCLAUSE="src$WS$WS*\($IPADDR\)" +MATCHROUTE="\(.*${WS}\)\($SRCCLAUSE\)\($WS.*\|$\)" +METRICCLAUSE=".*\(metric$WS[^ ]\+\)" +PROTOCLAUSE=".*\(proto$WS[^ ]\+\).*" +FINDIF=findif + +# findif needs that to be set +export OCF_RESKEY_ip=$OCF_RESKEY_ipaddress + +srca_read() { + # Capture matching route - doublequotes prevent word splitting... + ROUTE="`$CMDSHOW dev $INTERFACE 2> /dev/null`" || errorexit "command '$CMDSHOW' failed" + + # ... 
so we can make sure there is only 1 matching route + [ 1 -eq `echo "$ROUTE" | wc -l` ] || \ + errorexit "more than 1 matching route exists" + + # But there might still be no matching route + [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] && [ -z "$ROUTE" ] && \ + ! ocf_is_probe && [ "$__OCF_ACTION" != stop ] && errorexit "no matching route exists" + + # Sed out the source ip address if it exists + SRCIP=`echo $ROUTE | sed -n "s/$MATCHROUTE/\3/p"` + + # and what remains after stripping out the source ip address clause + ROUTE_WO_SRC=`echo $ROUTE | sed "s/$MATCHROUTE/\1\5/"` + + # using "src <ip>" only returns output if there's a match + if [ "$OCF_RESKEY_destination" != "0.0.0.0/0" ]; then + [ -z "$ROUTE" ] && return 1 || return 0 + fi + + [ -z "$SRCIP" ] && return 1 + [ $SRCIP = $1 ] && return 0 + [ "$__OCF_ACTION" = "monitor" ] || [ "$__OCF_ACTION" = "status" ] && [ "${ROUTE%% *}" = "default" ] && return 1 + return 2 +} + +# +# Add (or change if it already exists) the preferred source address +# The exit code should conform to LSB exit codes. +# + +srca_start() { + srca_read $1 + + rc=$? + if [ $rc = 0 ]; then + rc=$OCF_SUCCESS + ocf_log info "The ip route has been already set.($NETWORK, $INTERFACE, $ROUTE_WO_SRC)" + else + $IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC || \ + errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC' failed" + + if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then + $CMDCHANGE $ROUTE_WO_SRC src $1 || \ + errorexit "command '$CMDCHANGE $ROUTE_WO_SRC src $1' failed" + fi + rc=$? + fi + + return $rc +} + +# +# Remove (if it exists) the preferred source address. +# If one exists but it's not the same as the one specified, that's +# an error. Maybe that's the wrong behaviour because if this fails +# then when IPaddr releases the associated interface (if there is one) +# your matching route will also get dropped ;-( +# The exit code should conform to LSB exit codes. 
#

# srca_stop <ipaddress>
#
# Replace the preferred source address on the matching route with the
# primary IPv4 address of $INTERFACE.
#   - srca_read status 1 (no preferred source address): logs and exits
#     with $OCF_SUCCESS.
#   - srca_read status 2 (a different source address is set): errorexit.
# Reads the globals TABLE, NETWORK, INTERFACE, METRIC, CMDCHANGE,
# ROUTE_WO_SRC and OCF_RESKEY_destination; sets SCOPE, PRIMARY_IP and OPTS.
srca_stop() {
	srca_read $1
	rc=$?

	if [ $rc = 1 ]; then
		# We do not have a preferred source address for now
		ocf_log info "No preferred source address defined, nothing to stop"
		exit $OCF_SUCCESS
	fi

	[ $rc = 2 ] && errorexit "The address you specified to stop does not match the preferred source address"

	# Routes in the main table (or when no table is given) are restored
	# with link scope, routes in any other table with host scope.
	if [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ]; then
		SCOPE="link"
	else
		SCOPE="host"
	fi

	# Field 4 of "ip -o addr show" is "address/prefix"; keep the address.
	PRIMARY_IP="$($IP2UTIL -4 -o addr show dev $INTERFACE primary | awk '{split($4,a,"/");print a[1]}')"
	OPTS="proto kernel scope $SCOPE src $PRIMARY_IP"

	$IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC || \
		errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC' failed"

	if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then
		$CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP || \
			errorexit "command '$CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP' failed"
	fi

	return $?
}

# srca_status <ipaddress>
#
# Translate the srca_read status into a message on stdout and an OCF
# return code: 0 -> $OCF_SUCCESS, 1 -> $OCF_NOT_RUNNING, 2 -> $OCF_ERR_GENERIC.
srca_status() {
	srca_read $1

	case $? in
		0)	echo "OK"
			return $OCF_SUCCESS;;

		1)	echo "No preferred source address defined"
			return $OCF_NOT_RUNNING;;

		2)	echo "Preferred source address has incorrect value"
			return $OCF_ERR_GENERIC;;
	esac
}

# A not reliable IP address checking function, which only picks up those _obvious_ violations...
#
# It accepts IPv4 address in dotted quad notation, for example "192.168.1.1"
#
# 100% confidence whenever it reports "negative",
# but may get false "positive" answer.
#
# Returns 0 when $1 consists of four dot-separated decimal fields, each
# no larger than 255, and non-zero otherwise.  A first field of 127 is
# reported through ocf_exit_reason and terminates the agent with
# $OCF_ERR_CONFIGURED.  Sets the global "ip".
#
CheckIP() {
	ip="$1"
	case $ip in
	  *[!0-9.]*)	# contains something other than digits and dots
		return 1;;
	  .*|*.)	# begins or ends with ".", which is invalid
		return 1;;
	  *..*)		# consecutive ".", which is invalid
		return 1;;
	  *.*.*.*.*)	# four (or more) dots, which is too many
		return 1;;
	  *.*.*.*)	# exactly three dots: candidate, fields are checked below
		;;
	  *)		# fewer than three dots
		return 1;;
	esac

	local IFS=.
	local octet
	set -- $ip

	for octet in "$1" "$2" "$3" "$4"; do
		# Earlier versions accepted any numeric field here; a field
		# above 255 can never be part of an IPv4 address.  A failing
		# comparison (e.g. a value too large for the shell) is treated
		# as "out of range" as well.
		[ "$octet" -le 255 ] 2>/dev/null || return 1
	done

	if [ "$1" -eq 127 ]; then
		ocf_exit_reason "IP address [$ip] is a loopback address, thus can not be preferred source address"
		exit $OCF_ERR_CONFIGURED
	fi

	return 0
}

#
#	Find out which interface or alias serves the given IP address
#	The argument is an IP address, and its output
#	is an (aliased) interface name (e.g., "eth0" and "eth0:0").
#
#	The address that is actually compared is the global $BASEIP; the
#	positional argument is not read.
#
find_interface_solaris() {

  # The awk filter puts an empty line in front of every interface header
  # (a line containing ": ") except the first, so that the inner loop
  # below can skip the remainder of one interface record.
  $IFCONFIG $IFCONFIG_A_OPT | $AWK '{if ($0 ~ /.*: / && NR > 1) {print "\n"$0} else {print}}' |
  while read ifname linkstuff
  do
    : ifname = $ifname
    read inet addr junk
    : inet = $inet addr = $addr
    while
      read line && [ "X$line" != "X" ]
    do
      : Nothing
    done

    #	This doesn't look right for a box with multiple NICs.
    #	It looks like it always selects the first interface on
    #	a machine.  Yet, we appear to use the results for this case too...
    ifname=`echo "$ifname" | sed s'%:*$%%'`

    # NOTE(review): in shells that run the last pipeline stage in a
    # subshell these "return"s only end the loop, and the function then
    # still returns $OCF_ERR_GENERIC below -- callers should rely on the
    # printed name, not on the status.  Confirm before changing.
    case $addr in
      addr:$BASEIP)	echo $ifname; return $OCF_SUCCESS;;
      $BASEIP)	echo $ifname; return $OCF_SUCCESS;;
    esac
  done
  return $OCF_ERR_GENERIC
}


#
#	Find out which interface or alias serves the given IP address
#	The argument is an IP address, and its output
#	is an (aliased) interface name (e.g., "eth0" and "eth0:0").
#
#	The address that is actually compared is the global $BASEIP.
#	Returns $OCF_SUCCESS and prints the interface name(s) when found,
#	$OCF_ERR_GENERIC (and no output) otherwise.
#
find_interface_generic() {

	local iface

	# Compare the address part of field 4 ("address/prefix") exactly.
	# A plain grep for " $BASEIP" would also hit longer addresses that
	# merely start with $BASEIP (10.0.0.1 vs 10.0.0.10) as well as
	# broadcast or peer fields.  FreeS/WAN ipsecN interfaces are ignored.
	iface=`$IP2UTIL -o -f inet addr show \
		| awk -v ip="$BASEIP" '{ split($4, a, "/"); if (a[1] == ip) print $2 }' \
		| grep -v '^ipsec[0-9][0-9]*$'`

	if [ -z "$iface" ]; then
		return $OCF_ERR_GENERIC
	else
		echo $iface
		return $OCF_SUCCESS
	fi
}


#
#	Find out which interface or alias serves the given IP address
#	The argument is an IP address, and its output
#	is an (aliased) interface name (e.g., "eth0" and "eth0:0").
#
# Dispatches on $SYSTYPE to the Solaris or the generic lookup, stores the
# result in the global IF and prints it.  Always returns $OCF_SUCCESS; an
# empty output means that no interface was found.
find_interface() {
	case "$SYSTYPE" in
		SunOS)
			IF=`find_interface_solaris $BASEIP`
		;;
		*)
			IF=`find_interface_generic $BASEIP`
		;;
	esac

	echo $IF
	return $OCF_SUCCESS;
}


# ip_status <ipaddress>
#
# Check whether this host currently serves the given address.
# Returns $OCF_SUCCESS when it does and $OCF_NOT_RUNNING when it does not.
# On Linux/SunOS an address found on a loopback interface ("lo*") ends the
# agent with $OCF_ERR_CONFIGURED.  Sets the globals BASEIP and IF, and
# rewrites SYSTYPE from "Darwin" to "DarwinBSD".
ip_status() {

	BASEIP="$1"
	case "$SYSTYPE" in
		Darwin)
			# Treat Darwin the same as the other BSD variants (matched as *BSD)
			SYSTYPE="${SYSTYPE}BSD"
			;;
		*)
			;;
	esac


	case "$SYSTYPE" in
		*BSD)
			$IFCONFIG $IFCONFIG_A_OPT | grep "inet.*[: ]$BASEIP " >/dev/null 2>&1
			if [ $? = 0 ]; then
				return $OCF_SUCCESS
			else
				return $OCF_NOT_RUNNING
			fi;;

		Linux|SunOS)
			IF=`find_interface "$BASEIP"`
			if [ -z "$IF" ]; then
				return $OCF_NOT_RUNNING
			fi

			case $IF in
				lo*)
					ocf_exit_reason "IP address [$BASEIP] is served by loopback, thus can not be preferred source address"
					exit $OCF_ERR_CONFIGURED
					;;
				*)return $OCF_SUCCESS;;
			esac
			;;

		*)
			# No lookup is done for other systems; this only
			# reflects whatever IF already holds.
			if [ -z "$IF" ]; then
				return $OCF_NOT_RUNNING
			else
				return $OCF_SUCCESS
			fi;;
	esac
}


# srca_validate_all
#
# Validate the instance parameters.  Returns an OCF code instead of
# exiting, so that the caller can decide what a failure means for the
# current action:
#   $OCF_ERR_CONFIGURED  ipaddress empty, destination not in IP/prefix
#                        form, or ipaddress rejected by CheckIP
#   $OCF_ERR_INSTALLED   OCF_CHECK_LEVEL is 10 and the address is not
#                        served by this host
#   $OCF_SUCCESS         otherwise (always on non-Linux systems and for
#                        probes once the basic checks passed)
# Reads the global "ipaddress" set by the main script.
srca_validate_all() {

	if [ -z "$OCF_RESKEY_ipaddress" ]; then
		#  usage
		ocf_exit_reason "Please set OCF_RESKEY_ipaddress to the preferred source IP address!"
		return $OCF_ERR_CONFIGURED
	fi

	if ! echo "$OCF_RESKEY_destination" | grep -q "/"; then
		# Tell the administrator why the configuration is rejected.
		ocf_exit_reason "Invalid destination [$OCF_RESKEY_destination]: expected IP/prefix notation (e.g. 0.0.0.0/0)"
		return $OCF_ERR_CONFIGURED
	fi


	if ! [ "x$SYSTYPE" = "xLinux" ]; then
		# checks after this point are only relevant for linux.
		return $OCF_SUCCESS
	fi

	check_binary $AWK
	# NOTE(review): SYSTYPE is always "Linux" at this point because of
	# the early return above, so this branch never matches.
	case "$SYSTYPE" in
		*BSD|SunOS)
			check_binary $IFCONFIG
			;;
	esac

#	The IP address should be in good shape
	if CheckIP "$ipaddress"; then
	  :
	else
	  ocf_exit_reason "Invalid IP address [$ipaddress]"
	  return $OCF_ERR_CONFIGURED
	fi

	if ocf_is_probe; then
	  return $OCF_SUCCESS
	fi

#	We should serve this IP address of course
#	(OCF_CHECK_LEVEL may be unset for validate-all; default it so that
#	the numeric comparison does not raise a shell error.)
	if [ "${OCF_CHECK_LEVEL:-0}" -eq 10 ]; then
	  if ip_status "$ipaddress"; then
	    :
	  else
	    ocf_exit_reason "We are not serving [$ipaddress], hence can not make it a preferred source address"
	    return $OCF_ERR_INSTALLED
	  fi
	fi
	return $OCF_SUCCESS
}

# ---- main script --------------------------------------------------------

# Exactly one argument (the action) is expected.
if
  ( [ $# -ne 1 ] )
then
  usage
  exit $OCF_ERR_ARGS
fi

# These operations do not require the OCF instance parameters to be set
case $1 in
  meta-data)	meta_data
		exit $OCF_SUCCESS
		;;
  usage)	usage
		exit $OCF_SUCCESS
		;;
  *)
		;;
esac

ipaddress="$OCF_RESKEY_ipaddress"

# Every action except validate-all also verifies that the address is served.
[ "$__OCF_ACTION" != "validate-all" ] && OCF_CHECK_LEVEL=10
srca_validate_all
rc=$?
if [ $rc -ne $OCF_SUCCESS ]; then
	case $1 in
		# if we can't validate the configuration during a stop, that
		# means the resources isn't configured correctly. There's no way
		# to actually stop the resource in this situation because there's
		# no way it could have even started. Return success here
		# to indicate that the resource is not running, otherwise the
		# stop action will fail causing the node to be fenced just because
		# of a mis configuration.
		stop)	exit $OCF_SUCCESS;;
		*)	exit $rc;;
	esac
fi

findif_out=`$FINDIF`
rc=$?
+[ $rc -ne 0 ] && { + ocf_exit_reason "[$FINDIF] failed" + exit $rc +} + +INTERFACE=`echo $findif_out | awk '{print $1}'` +LISTROUTE=`$IP2UTIL route list dev $INTERFACE scope link $PROTO match $ipaddress` +[ -z "$PROTO" ] && PROTO=`echo $LISTROUTE | sed -n "s/$PROTOCLAUSE/\1/p"` +if [ -n "$OCF_RESKEY_metric" ]; then + METRIC="metric $OCF_RESKEY_metric" +elif [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ]; then + METRIC=`echo $LISTROUTE | sed -n "s/$METRICCLAUSE/\1/p"` +else + METRIC="" +fi +if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then + NETWORK=`echo $LISTROUTE | grep -m 1 -o '^[^ ]*'` + + if [ -z "$NETWORK" ]; then + err_str="command '$IP2UTIL route list dev $INTERFACE scope link $PROTO" + err_str="$err_str match $ipaddress' failed to find a matching route" + + if [ "$__OCF_ACTION" = "start" ]; then + ocf_exit_reason "$err_str" + exit $OCF_ERR_ARGS + elif ! ocf_is_probe; then + ocf_log warn "$err_str" + else + ocf_log debug "$err_str" + fi + fi +else + NETWORK="$OCF_RESKEY_destination" +fi + +case $1 in + start) srca_start $ipaddress + ;; + stop) srca_stop $ipaddress + ;; + status) srca_status $ipaddress + ;; + monitor) srca_status $ipaddress + ;; + validate-all) srca_validate_all + ;; + *) usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + +exit $? + +# +# Version 0.3 2002/11/04 17:00:00 John Sutton <john@scl.co.uk> +# Name changed from IPsrcroute to IPsrcaddr and now reports errors +# using ha_log rather than on stderr. +# +# Version 0.2 2002/11/02 17:00:00 John Sutton <john@scl.co.uk> +# Changed status output to "OK" to satisfy ResourceManager's +# we_own_resource() function. +# +# Version 0.1 2002/11/01 17:00:00 John Sutton <john@scl.co.uk> +# First effort but does the job? +# diff --git a/heartbeat/IPv6addr.c b/heartbeat/IPv6addr.c new file mode 100644 index 0000000..2e9e126 --- /dev/null +++ b/heartbeat/IPv6addr.c @@ -0,0 +1,899 @@ + +/* + * This program manages IPv6 address with OCF Resource Agent standard. 
+ * + * Author: Huang Zhen <zhenh@cn.ibm.com> + * Copyright (c) 2004 International Business Machines + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +/* + * It can add an IPv6 address, or remove one. + * + * Usage: IPv6addr {start|stop|status|monitor|meta-data} + * + * The "start" arg adds an IPv6 address. + * The "stop" arg removes one. + * The "status" arg shows whether the IPv6 address exists + * The "monitor" arg shows whether the IPv6 address can be pinged (ICMPv6 ECHO) + * The "meta_data" arg shows the meta data(XML) + */ + +/* + * ipv6-address: + * + * currently the following forms are legal: + * address + * address/prefix + * + * E.g. + * 3ffe:ffff:0:f101::3 + * 3ffe:ffff:0:f101::3/64 + * + * It should be passed by environment variant: + * OCF_RESKEY_ipv6addr=3ffe:ffff:0:f101::3 + * OCF_RESKEY_cidr_netmask=64 + * OCF_RESKEY_nic=eth0 + * + */ + +/* + * start: + * 1.IPv6addr will choice a proper interface for the new address. + * 2.Then assign the new address to the interface. + * 3.Wait until the new address is available (reply ICMPv6 ECHO packet) + * 4.Send out the unsolicited advertisements. 
+ * + * return 0(OCF_SUCCESS) for success + * return 1(OCF_ERR_GENERIC) for failure + * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) + * + * + * stop: + * remove the address from the inferface. + * + * return 0(OCF_SUCCESS) for success + * return 1(OCF_ERR_GENERIC) for failure + * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) + * + * status: + * return the status of the address. only check whether it exists. + * + * return 0(OCF_SUCCESS) for existing + * return 1(OCF_NOT_RUNNING) for not existing + * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) + * + * + * monitor: + * ping the address by ICMPv6 ECHO request. + * + * return 0(OCF_SUCCESS) for response correctly. + * return 1(OCF_NOT_RUNNING) for no response. + * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) + */ + +#include <config.h> +#include <IPv6addr.h> + +#include <stdio.h> +#include <stdlib.h> +#include <malloc.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/icmp6.h> +#include <arpa/inet.h> /* for inet_pton */ +#include <net/if.h> /* for if_nametoindex */ +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <libgen.h> +#include <syslog.h> +#include <signal.h> +#include <errno.h> +#include <poll.h> +#include <clplumbing/cl_log.h> + + +#define PIDFILE_BASE HA_RSCTMPDIR "/IPv6addr-" + +/* +0 No error, action succeeded completely +1 generic or unspecified error (current practice) + The "monitor" operation shall return this for a crashed, hung or + otherwise non-functional resource. +2 invalid or excess argument(s) + Likely error code for validate-all, if the instance parameters + do not validate. Any other action is free to also return this + exit status code for this case. 
+3 unimplemented feature (for example, "reload") +4 user had insufficient privilege +5 program is not installed +6 program is not configured +7 program is not running +8 resource is running in "master" mode and fully operational +9 resource is in "master" mode but in a failed state +*/ +#define OCF_SUCCESS 0 +#define OCF_ERR_GENERIC 1 +#define OCF_ERR_ARGS 2 +#define OCF_ERR_UNIMPLEMENTED 3 +#define OCF_ERR_PERM 4 +#define OCF_ERR_INSTALLED 5 +#define OCF_ERR_CONFIGURED 6 +#define OCF_NOT_RUNNING 7 + +const char* APP_NAME = "IPv6addr"; + +const char* START_CMD = "start"; +const char* STOP_CMD = "stop"; +const char* STATUS_CMD = "status"; +const char* MONITOR_CMD = "monitor"; +const char* ADVT_CMD = "advt"; +const char* RECOVER_CMD = "recover"; +const char* RELOAD_CMD = "reload"; +const char* META_DATA_CMD = "meta-data"; +const char* VALIDATE_CMD = "validate-all"; + +const int QUERY_COUNT = 5; + +struct in6_ifreq { + struct in6_addr ifr6_addr; + uint32_t ifr6_prefixlen; + unsigned int ifr6_ifindex; +}; + +static int start_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); +static int stop_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); +static int status_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); +static int monitor_addr6(struct in6_addr* addr6, int prefix_len); +static int advt_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); +static int meta_data_addr6(void); + + +static void usage(const char* self); +int write_pid_file(const char *pid_file); +int create_pid_directory(const char *pid_file); +static void byebye(int nsig); + +static char* scan_if(struct in6_addr* addr_target, int* plen_target, + int use_mask, char* prov_ifname); +static char* find_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname); +static char* get_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname); +static int assign_addr6(struct in6_addr* addr6, int prefix_len, char* if_name); +static 
int unassign_addr6(struct in6_addr* addr6, int prefix_len, char* if_name); +int is_addr6_available(struct in6_addr* addr6); + +int +main(int argc, char* argv[]) +{ + char pid_file[256]; + char* ipv6addr; + char* cidr_netmask; + int ret; + char* cp; + char* prov_ifname = NULL; + int prefix_len = -1; + struct in6_addr addr6; + struct sigaction act; + + /* Check the count of parameters first */ + if (argc < 2) { + usage(argv[0]); + return OCF_ERR_ARGS; + } + + /* set termination signal */ + memset(&act, 0, sizeof(struct sigaction)); + act.sa_flags &= ~SA_RESTART; /* redundant - to stress syscalls should fail */ + act.sa_handler = byebye; + if ((sigemptyset(&act.sa_mask) < 0) || (sigaction(SIGTERM, &act, NULL) < 0)) { + cl_log(LOG_ERR, "Could not set handler for signal: %s", strerror(errno)); + return OCF_ERR_GENERIC; + } + + /* open system log */ + cl_log_set_entity(APP_NAME); + cl_log_set_facility(LOG_DAEMON); + + /* the meta-data dont need any parameter */ + if (0 == strncmp(META_DATA_CMD, argv[1], strlen(META_DATA_CMD))) { + ret = meta_data_addr6(); + return OCF_SUCCESS; + } + + /* check the OCF_RESKEY_ipv6addr parameter, should be an IPv6 address */ + ipv6addr = getenv("OCF_RESKEY_ipv6addr"); + + if (ipv6addr == NULL) { + cl_log(LOG_ERR, "Please set OCF_RESKEY_ipv6addr to the IPv6 address you want to manage."); + usage(argv[0]); + return OCF_ERR_ARGS; + } + + /* legacy option */ + if ((cp = strchr(ipv6addr, '/'))) { + prefix_len = atol(cp + 1); + if ((prefix_len < 0) || (prefix_len > 128)) { + cl_log(LOG_ERR, "Invalid prefix_len [%s], should be an integer in [0, 128]", cp+1); + usage(argv[0]); + return OCF_ERR_ARGS; + } + *cp=0; + } + + /* get provided netmask (optional) */ + cidr_netmask = getenv("OCF_RESKEY_cidr_netmask"); + + if (cidr_netmask != NULL) { + if ((atol(cidr_netmask) < 0) || (atol(cidr_netmask) > 128)) { + cl_log(LOG_ERR, "Invalid prefix_len [%s], " + "should be an integer in [0, 128]", cidr_netmask); + usage(argv[0]); + return OCF_ERR_ARGS; + } + 
if (prefix_len != -1 && prefix_len != atol(cidr_netmask)) { + cl_log(LOG_DEBUG, "prefix_len(%d) is overwritted by cidr_netmask(%s)", + prefix_len, cidr_netmask); + } + prefix_len = atol(cidr_netmask); + + } else if (prefix_len == -1) { + prefix_len = 0; + } + + /* get provided interface name (optional) */ + prov_ifname = getenv("OCF_RESKEY_nic"); + + if (inet_pton(AF_INET6, ipv6addr, &addr6) <= 0) { + cl_log(LOG_ERR, "Invalid IPv6 address [%s]", ipv6addr); + usage(argv[0]); + return OCF_ERR_ARGS; + } + + /* Check whether this system supports IPv6 */ + if (access(IF_INET6, R_OK)) { + cl_log(LOG_ERR, "No support for INET6 on this system."); + return OCF_ERR_GENERIC; + } + + /* create the pid file so we can make sure that only one IPv6addr + * for this address is running + */ + if (snprintf(pid_file, sizeof(pid_file), "%s%s", PIDFILE_BASE, ipv6addr) + >= (int)sizeof(pid_file)) { + cl_log(LOG_ERR, "Pid file truncated"); + return OCF_ERR_GENERIC; + } + + if (write_pid_file(pid_file) < 0) { + return OCF_ERR_GENERIC; + } + + + /* switch the command */ + if (0 == strncmp(START_CMD,argv[1], strlen(START_CMD))) { + ret = start_addr6(&addr6, prefix_len, prov_ifname); + }else if (0 == strncmp(STOP_CMD,argv[1], strlen(STOP_CMD))) { + ret = stop_addr6(&addr6, prefix_len, prov_ifname); + }else if (0 == strncmp(STATUS_CMD,argv[1], strlen(STATUS_CMD))) { + ret = status_addr6(&addr6, prefix_len, prov_ifname); + }else if (0 ==strncmp(MONITOR_CMD,argv[1], strlen(MONITOR_CMD))) { + ret = monitor_addr6(&addr6, prefix_len); + }else if (0 ==strncmp(RELOAD_CMD,argv[1], strlen(RELOAD_CMD))) { + ret = OCF_ERR_UNIMPLEMENTED; + }else if (0 ==strncmp(RECOVER_CMD,argv[1], strlen(RECOVER_CMD))) { + ret = OCF_ERR_UNIMPLEMENTED; + }else if (0 ==strncmp(VALIDATE_CMD,argv[1], strlen(VALIDATE_CMD))) { + /* ipv6addr has been validated by inet_pton, hence a valid IPv6 address */ + ret = OCF_SUCCESS; + }else if (0 ==strncmp(ADVT_CMD,argv[1], strlen(MONITOR_CMD))) { + ret = advt_addr6(&addr6, prefix_len, 
prov_ifname); + }else{ + usage(argv[0]); + ret = OCF_ERR_ARGS; + } + + /* release the pid file */ + unlink(pid_file); + + return ret; +} +int +start_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) +{ + int i; + char* if_name; + if(OCF_SUCCESS == status_addr6(addr6,prefix_len,prov_ifname)) { + return OCF_SUCCESS; + } + + /* we need to find a proper device to assign the address */ + if_name = find_if(addr6, &prefix_len, prov_ifname); + if (NULL == if_name) { + cl_log(LOG_ERR, "no valid mechanisms"); + return OCF_ERR_GENERIC; + } + + /* Assign the address */ + if (0 != assign_addr6(addr6, prefix_len, if_name)) { + cl_log(LOG_ERR, "failed to assign the address to %s", if_name); + return OCF_ERR_GENERIC; + } + + /* Check whether the address available */ + for (i = 0; i < QUERY_COUNT; i++) { + if (0 == is_addr6_available(addr6)) { + break; + } + sleep(1); + } + if (i == QUERY_COUNT) { + cl_log(LOG_ERR, "failed to ping the address"); + return OCF_ERR_GENERIC; + } + + /* Send unsolicited advertisement packet to neighbor */ + for (i = 0; i < UA_REPEAT_COUNT; i++) { + send_ua(addr6, if_name); + sleep(1); + } + return OCF_SUCCESS; +} + +int +advt_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) +{ + /* First, we need to find a proper device to assign the address */ + char* if_name = get_if(addr6, &prefix_len, prov_ifname); + int i; + if (NULL == if_name) { + cl_log(LOG_ERR, "no valid mechanisms"); + return OCF_ERR_GENERIC; + } + /* Send unsolicited advertisement packet to neighbor */ + for (i = 0; i < UA_REPEAT_COUNT; i++) { + send_ua(addr6, if_name); + sleep(1); + } + return OCF_SUCCESS; +} + +int +stop_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) +{ + char* if_name; + if(OCF_NOT_RUNNING == status_addr6(addr6,prefix_len,prov_ifname)) { + return OCF_SUCCESS; + } + + if_name = get_if(addr6, &prefix_len, prov_ifname); + + if (NULL == if_name) { + cl_log(LOG_ERR, "no valid mechanisms."); + /* I think this should be a success 
exit according to LSB. */ + return OCF_ERR_GENERIC; + } + + /* Unassign the address */ + if (0 != unassign_addr6(addr6, prefix_len, if_name)) { + cl_log(LOG_ERR, "failed to assign the address to %s", if_name); + return OCF_ERR_GENERIC; + } + + return OCF_SUCCESS; +} + +int +status_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) +{ + char* if_name = get_if(addr6, &prefix_len, prov_ifname); + if (NULL == if_name) { + return OCF_NOT_RUNNING; + } + return OCF_SUCCESS; +} + +int +monitor_addr6(struct in6_addr* addr6, int prefix_len) +{ + if(0 == is_addr6_available(addr6)) { + return OCF_SUCCESS; + } + return OCF_NOT_RUNNING; +} + +/* find the network interface associated with an address */ +char* +scan_if(struct in6_addr* addr_target, int* plen_target, int use_mask, char* prov_ifname) +{ + FILE *f; + static char devname[21]=""; + struct in6_addr addr; + struct in6_addr mask; + unsigned int plen, scope, dad_status, if_idx; + unsigned int addr6p[4]; + + /* open /proc/net/if_inet6 file */ + if ((f = fopen(IF_INET6, "r")) == NULL) { + return NULL; + } + + /* Loop for each entry */ + while (1) { + int i; + int n; + int s; + gboolean same = TRUE; + + i = fscanf(f, "%08x%08x%08x%08x %x %02x %02x %02x %20s\n", + &addr6p[0], &addr6p[1], &addr6p[2], &addr6p[3], + &if_idx, &plen, &scope, &dad_status, devname); + if (i == EOF) { + break; + } + else if (i != 9) { + cl_log(LOG_INFO, "Error parsing %s, " + "perhaps the format has changed\n", IF_INET6); + break; + } + + /* Consider link-local addresses (scope == 0x20) only when + * the inerface name is provided, and global addresses + * (scope == 0). Skip everything else. + */ + if (scope != 0) { + if (scope != 0x20 || prov_ifname == 0 + || *prov_ifname == 0) + continue; + } + + /* If specified prefix, only same prefix entry + * would be considered. 
+ */ + if (*plen_target!=0 && plen != *plen_target) { + continue; + } + + /* If interface name provided, only same devname entry + * would be considered + */ + if (prov_ifname!=0 && *prov_ifname!=0) + { + if (strcmp(devname, prov_ifname)) + continue; + } + + for (i = 0; i< 4; i++) { + addr.s6_addr32[i] = htonl(addr6p[i]); + } + + /* Make the mask based on prefix length */ + memset(mask.s6_addr, 0xff, 16); + if (use_mask && plen < 128) { + n = plen / 32; + memset(mask.s6_addr32 + n + 1, 0, (3 - n) * 4); + s = 32 - plen % 32; + if (s == 32) + mask.s6_addr32[n] = 0x0; + else + mask.s6_addr32[n] = 0xffffffff << s; + mask.s6_addr32[n] = htonl(mask.s6_addr32[n]); + } + + /* compare addr and addr_target */ + same = TRUE; + for (i = 0; i < 4; i++) { + if ((addr.s6_addr32[i]&mask.s6_addr32[i]) != + (addr_target->s6_addr32[i]&mask.s6_addr32[i])) { + same = FALSE; + break; + } + } + + /* We found it! */ + if (same) { + fclose(f); + *plen_target = plen; + return devname; + } + } + fclose(f); + return NULL; +} +/* find a proper network interface to assign the address */ +char* +find_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname) +{ + char *best_ifname = scan_if(addr_target, plen_target, 1, prov_ifname); + + /* use the provided ifname and prefix if the address did not match */ + if (best_ifname == NULL && + prov_ifname != 0 && *prov_ifname != 0 && *plen_target != 0) { + cl_log(LOG_INFO, "Could not find a proper interface by the ipv6addr. 
Using the specified nic:'%s' and cidr_netmask:'%d'", prov_ifname, *plen_target); + return prov_ifname; + } + return best_ifname; +} +/* get the device name and the plen_target of a special address */ +char* +get_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname) +{ + return scan_if(addr_target, plen_target, 0, prov_ifname); +} +int +assign_addr6(struct in6_addr* addr6, int prefix_len, char* if_name) +{ + struct in6_ifreq ifr6; + + /* Get socket first */ + int fd; + struct ifreq ifr; + + fd = socket(AF_INET6, SOCK_DGRAM, 0); + if (fd < 0) { + return 1; + } + + /* Query the index of the if */ + strcpy(ifr.ifr_name, if_name); + if (ioctl(fd, SIOGIFINDEX, &ifr) < 0) { + return -1; + } + + /* Assign the address to the if */ + ifr6.ifr6_addr = *addr6; + ifr6.ifr6_ifindex = ifr.ifr_ifindex; + ifr6.ifr6_prefixlen = prefix_len; + if (ioctl(fd, SIOCSIFADDR, &ifr6) < 0) { + return -1; + } + close (fd); + return 0; +} +int +unassign_addr6(struct in6_addr* addr6, int prefix_len, char* if_name) +{ + int fd; + struct ifreq ifr; + struct in6_ifreq ifr6; + + /* Get socket first */ + fd = socket(AF_INET6, SOCK_DGRAM, 0); + if (fd < 0) { + return 1; + } + + /* Query the index of the if */ + strcpy(ifr.ifr_name, if_name); + if (ioctl(fd, SIOGIFINDEX, &ifr) < 0) { + return -1; + } + + /* Unassign the address to the if */ + ifr6.ifr6_addr = *addr6; + ifr6.ifr6_ifindex = ifr.ifr_ifindex; + ifr6.ifr6_prefixlen = prefix_len; + if (ioctl(fd, SIOCDIFADDR, &ifr6) < 0) { + return -1; + } + + close (fd); + return 0; +} + +#define MINPACKSIZE 64 +int +is_addr6_available(struct in6_addr* addr6) +{ + struct sockaddr_in6 addr; + struct icmp6_hdr icmph; + u_char outpack[MINPACKSIZE]; + int icmp_sock; + int ret; + struct iovec iov; + u_char packet[MINPACKSIZE]; + struct msghdr msg; + int i; + struct pollfd pfd; + + if ((icmp_sock = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6)) == -1) { + return -1; + } + + memset(&icmph, 0, sizeof(icmph)); + icmph.icmp6_type = ICMP6_ECHO_REQUEST; + 
icmph.icmp6_code = 0; + icmph.icmp6_cksum = 0; + icmph.icmp6_seq = htons(0); + icmph.icmp6_id = 0; + + memset(&outpack, 0, sizeof(outpack)); + memcpy(&outpack, &icmph, sizeof(icmph)); + + memset(&addr, 0, sizeof(struct sockaddr_in6)); + addr.sin6_family = AF_INET6; + addr.sin6_port = htons(IPPROTO_ICMPV6); + memcpy(&addr.sin6_addr,addr6,sizeof(struct in6_addr)); + + /* Only the first 8 bytes of outpack are meaningful... */ + ret = sendto(icmp_sock, (char *)outpack, sizeof(outpack), 0, + (struct sockaddr *) &addr, + sizeof(struct sockaddr_in6)); + if (0 >= ret) { + return -1; + } + + iov.iov_base = (char *)packet; + iov.iov_len = sizeof(packet); + + msg.msg_name = &addr; + msg.msg_namelen = sizeof(addr); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + + for (i = 0; i < 3; i++) { + pfd.fd = icmp_sock; + pfd.events = POLLIN; + pfd.revents = 0; + ret = poll(&pfd, 1, 10); + + if (ret < 1) + continue; + + ret = recvmsg(icmp_sock, &msg, MSG_DONTWAIT); + if (ret > 0) + return 0; + if (ret == 0) + break; + + if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) + break; + } + + return -1; +} + +static void usage(const char* self) +{ + printf("usage: %s {start|stop|status|monitor|validate-all|meta-data}\n",self); + return; +} + +/* Following code is copied from send_arp.c, linux-HA project. 
*/ +void +byebye(int nsig) +{ + (void)nsig; + /* Avoid an "error exit" log message if we're killed */ + exit(0); +} + +int +create_pid_directory(const char *pid_file) +{ + int status; + int return_status = -1; + struct stat stat_buf; + char* dir; + + dir = strdup(pid_file); + if (!dir) { + cl_log(LOG_INFO, "Memory allocation failure: %s", + strerror(errno)); + return -1; + } + + dirname(dir); + + status = stat(dir, &stat_buf); + + if (status < 0 && errno != ENOENT && errno != ENOTDIR) { + cl_log(LOG_INFO, "Could not stat pid-file directory " + "[%s]: %s", dir, strerror(errno)); + goto err; + } + + if (!status) { + if (S_ISDIR(stat_buf.st_mode)) { + goto out; + } + cl_log(LOG_INFO, "Pid-File directory exists but is " + "not a directory [%s]", dir); + goto err; + } + + if (mkdir(dir, S_IRUSR|S_IWUSR|S_IXUSR | S_IRGRP|S_IXGRP) < 0) { + cl_log(LOG_INFO, "Could not create pid-file directory " + "[%s]: %s", dir, strerror(errno)); + goto err; + } + +out: + return_status = 0; +err: + free(dir); + return return_status; +} + +int +write_pid_file(const char *pid_file) +{ + + int pidfilefd; + char pidbuf[11]; + unsigned long pid; + ssize_t bytes; + + if (*pid_file != '/') { + cl_log(LOG_INFO, "Invalid pid-file name, must begin with a " + "'/' [%s]\n", pid_file); + return -1; + } + + if (create_pid_directory(pid_file) < 0) { + return -1; + } + + while (1) { + pidfilefd = open(pid_file, O_CREAT|O_EXCL|O_RDWR, + S_IRUSR|S_IWUSR); + if (pidfilefd < 0) { + if (errno != EEXIST) { /* Old PID file */ + cl_log(LOG_INFO, "Could not open pid-file " + "[%s]: %s", pid_file, + strerror(errno)); + return -1; + } + } + else { + break; + } + + pidfilefd = open(pid_file, O_RDONLY, S_IRUSR|S_IWUSR); + if (pidfilefd < 0) { + cl_log(LOG_INFO, "Could not open pid-file " + "[%s]: %s", pid_file, + strerror(errno)); + return -1; + } + + while (1) { + bytes = read(pidfilefd, pidbuf, sizeof(pidbuf)-1); + if (bytes < 0) { + if (errno == EINTR) { + continue; + } + cl_log(LOG_INFO, "Could not read pid-file 
" + "[%s]: %s", pid_file, + strerror(errno)); + return -1; + } + pidbuf[bytes] = '\0'; + break; + } + + if(unlink(pid_file) < 0) { + cl_log(LOG_INFO, "Could not delete pid-file " + "[%s]: %s", pid_file, + strerror(errno)); + return -1; + } + + if (!bytes) { + cl_log(LOG_INFO, "Invalid pid in pid-file " + "[%s]: %s", pid_file, + strerror(errno)); + return -1; + } + + close(pidfilefd); + + pid = strtoul(pidbuf, NULL, 10); + if (pid == ULONG_MAX && errno == ERANGE) { + cl_log(LOG_INFO, "Invalid pid in pid-file " + "[%s]: %s", pid_file, + strerror(errno)); + return -1; + } + + if (kill(pid, SIGKILL) < 0 && errno != ESRCH) { + cl_log(LOG_INFO, "Error killing old process [%lu] " + "from pid-file [%s]: %s", pid, + pid_file, strerror(errno)); + return -1; + } + + cl_log(LOG_INFO, "Killed old send_ua process [%lu]", pid); + } + + if (snprintf(pidbuf, sizeof(pidbuf), "%u" + , getpid()) >= (int)sizeof(pidbuf)) { + cl_log(LOG_INFO, "Pid too long for buffer [%u]", getpid()); + return -1; + } + + while (1) { + bytes = write(pidfilefd, pidbuf, strlen(pidbuf)); + if (bytes != strlen(pidbuf)) { + if (bytes < 0 && errno == EINTR) { + continue; + } + cl_log(LOG_INFO, "Could not write pid-file " + "[%s]: %s", pid_file, + strerror(errno)); + return -1; + } + break; + } + + close(pidfilefd); + + return 0; +} +static int +meta_data_addr6(void) +{ + const char* meta_data= + "<?xml version=\"1.0\"?>\n" + "<!DOCTYPE resource-agent SYSTEM \"ra-api-1.dtd\">\n" + "<resource-agent name=\"IPv6addr\" version=\"1.0\">\n" + " <version>1.0</version>\n" + " <longdesc lang=\"en\">\n" + " This script manages IPv6 alias IPv6 addresses,It can add an IP6\n" + " alias, or remove one.\n" + " </longdesc>\n" + " <shortdesc lang=\"en\">Manages IPv6 aliases</shortdesc>\n" + " <parameters>\n" + " <parameter name=\"ipv6addr\" unique=\"0\" required=\"1\">\n" + " <longdesc lang=\"en\">\n" + " The IPv6 address this RA will manage \n" + " </longdesc>\n" + " <shortdesc lang=\"en\">IPv6 address</shortdesc>\n" + " 
<content type=\"string\" default=\"\" />\n" + " </parameter>\n" + " <parameter name=\"cidr_netmask\" unique=\"0\">\n" + " <longdesc lang=\"en\">\n" + " The netmask for the interface in CIDR format. (ie, 24).\n" + " The value of this parameter overwrites the value of _prefix_\n" + " of ipv6addr parameter.\n" + " </longdesc>\n" + " <shortdesc lang=\"en\">Netmask</shortdesc>\n" + " <content type=\"string\" default=\"\" />\n" + " </parameter>\n" + " <parameter name=\"nic\" unique=\"0\">\n" + " <longdesc lang=\"en\">\n" + " The base network interface on which the IPv6 address will\n" + " be brought online.\n" + " </longdesc>\n" + " <shortdesc lang=\"en\">Network interface</shortdesc>\n" + " <content type=\"string\" default=\"\" />\n" + " </parameter>\n" + " </parameters>\n" + " <actions>\n" + " <action name=\"start\" timeout=\"15s\" />\n" + " <action name=\"stop\" timeout=\"15s\" />\n" + " <action name=\"status\" timeout=\"15s\" interval=\"15s\" />\n" + " <action name=\"monitor\" timeout=\"15s\" interval=\"15s\" />\n" + " <action name=\"validate-all\" timeout=\"5s\" />\n" + " <action name=\"meta-data\" timeout=\"5s\" />\n" + " </actions>\n" + "</resource-agent>\n"; + printf("%s\n",meta_data); + return OCF_SUCCESS; +} diff --git a/heartbeat/IPv6addr_utils.c b/heartbeat/IPv6addr_utils.c new file mode 100644 index 0000000..7672b70 --- /dev/null +++ b/heartbeat/IPv6addr_utils.c @@ -0,0 +1,147 @@ + +/* + * This program manages IPv6 address with OCF Resource Agent standard. + * + * Author: Huang Zhen <zhenh@cn.ibm.com> + * Copyright (c) 2004 International Business Machines + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#include <IPv6addr.h> + +#include <stdio.h> +#include <stdlib.h> +#include <malloc.h> +#include <unistd.h> +#include <sys/socket.h> +#include <arpa/inet.h> /* for inet_pton */ +#include <net/if.h> /* for if_nametoindex */ +#include <sys/ioctl.h> +#include <fcntl.h> +#include <signal.h> +#include <errno.h> + +/* Send an unsolicited advertisement packet + * Please refer to rfc4861 / rfc3542 + */ +int +send_ua(struct in6_addr* src_ip, char* if_name) +{ + int status = -1; + int fd; + + int ifindex; + int hop; + struct ifreq ifr; + u_int8_t *payload = NULL; + int payload_size; + struct nd_neighbor_advert *na; + struct nd_opt_hdr *opt; + struct sockaddr_in6 src_sin6; + struct sockaddr_in6 dst_sin6; + + if ((fd = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6)) == -1) { + printf("ERROR: socket(IPPROTO_ICMPV6) failed: %s", + strerror(errno)); + return status; + } + /* set the outgoing interface */ + ifindex = if_nametoindex(if_name); + if (setsockopt(fd, IPPROTO_IPV6, IPV6_MULTICAST_IF, + &ifindex, sizeof(ifindex)) < 0) { + printf("ERROR: setsockopt(IPV6_MULTICAST_IF) failed: %s", + strerror(errno)); + goto err; + } + /* set the hop limit */ + hop = 255; /* 255 is required. 
see rfc4861 7.1.2 */ + if (setsockopt(fd, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, + &hop, sizeof(hop)) < 0) { + printf("ERROR: setsockopt(IPV6_MULTICAST_HOPS) failed: %s", + strerror(errno)); + goto err; + } + + /* set the source address */ + memset(&src_sin6, 0, sizeof(src_sin6)); + src_sin6.sin6_family = AF_INET6; + src_sin6.sin6_addr = *src_ip; + src_sin6.sin6_port = 0; + if (IN6_IS_ADDR_LINKLOCAL(&src_sin6.sin6_addr) || + IN6_IS_ADDR_MC_LINKLOCAL(&src_sin6.sin6_addr)) { + src_sin6.sin6_scope_id = ifindex; + } + + if (bind(fd, (struct sockaddr *)&src_sin6, sizeof(src_sin6)) < 0) { + printf("ERROR: bind() failed: %s", strerror(errno)); + goto err; + } + + + /* get the hardware address */ + memset(&ifr, 0, sizeof(ifr)); + strncpy(ifr.ifr_name, if_name, sizeof(ifr.ifr_name) - 1); + if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { + printf("ERROR: ioctl(SIOCGIFHWADDR) failed: %s", strerror(errno)); + goto err; + } + + /* build a neighbor advertisement message */ + payload_size = sizeof(struct nd_neighbor_advert) + + sizeof(struct nd_opt_hdr) + HWADDR_LEN; + payload = memalign(sysconf(_SC_PAGESIZE), payload_size); + if (!payload) { + printf("ERROR: malloc for payload failed"); + goto err; + } + memset(payload, 0, payload_size); + + /* Ugly typecast from ia64 hell! 
*/ + na = (struct nd_neighbor_advert *)((void *)payload); + na->nd_na_type = ND_NEIGHBOR_ADVERT; + na->nd_na_code = 0; + na->nd_na_cksum = 0; /* calculated by kernel */ + na->nd_na_flags_reserved = ND_NA_FLAG_OVERRIDE; + na->nd_na_target = *src_ip; + + /* options field; set the target link-layer address */ + opt = (struct nd_opt_hdr *)(payload + sizeof(struct nd_neighbor_advert)); + opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; + opt->nd_opt_len = 1; /* The length of the option in units of 8 octets */ + memcpy(payload + sizeof(struct nd_neighbor_advert) + + sizeof(struct nd_opt_hdr), + &ifr.ifr_hwaddr.sa_data, HWADDR_LEN); + + /* sending an unsolicited neighbor advertisement to all */ + memset(&dst_sin6, 0, sizeof(dst_sin6)); + dst_sin6.sin6_family = AF_INET6; + inet_pton(AF_INET6, BCAST_ADDR, &dst_sin6.sin6_addr); /* should not fail */ + + if (sendto(fd, payload, payload_size, 0, + (struct sockaddr *)&dst_sin6, sizeof(dst_sin6)) + != payload_size) { + printf("ERROR: sendto(%s) failed: %s", + if_name, strerror(errno)); + goto err; + } + + status = 0; + +err: + close(fd); + free(payload); + return status; +} diff --git a/heartbeat/LVM b/heartbeat/LVM new file mode 100755 index 0000000..b587bd8 --- /dev/null +++ b/heartbeat/LVM @@ -0,0 +1,470 @@ +#!/bin/sh +# +# +# LVM +# +# Description: Manages an LVM volume as an HA resource +# +# +# Author: Alan Robertson +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# Copyright: (C) 2002 - 2005 International Business Machines, Inc. +# +# This code significantly inspired by the LVM resource +# in FailSafe by Lars Marowsky-Bree +# +# +# An example usage in /etc/ha.d/haresources: +# node1 10.0.0.170 ServeRAID::1::1 LVM::myvolname +# +# See usage() function below for more details... +# +# OCF parameters are as below: +# OCF_RESKEY_volgrpname +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_volgrpname_default="" +OCF_RESKEY_exclusive_default="false" +OCF_RESKEY_tag_default="pacemaker" +OCF_RESKEY_partial_activation_default="false" + +: ${OCF_RESKEY_volgrpname=${OCF_RESKEY_volgrpname_default}} +: ${OCF_RESKEY_exclusive=${OCF_RESKEY_exclusive_default}} +: ${OCF_RESKEY_tag=${OCF_RESKEY_tag_default}} +: ${OCF_RESKEY_partial_activation=${OCF_RESKEY_partial_activation_default}} + +####################################################################### + + +usage() { + methods=`LVM_methods` + methods=`echo $methods | tr ' ' '|'` + cat <<EOF + usage: $0 $methods + + $0 manages an Linux Volume Manager volume (LVM) as an HA resource + + The 'start' operation brings the given volume online + The 'stop' operation takes the given volume offline + The 'status' operation reports whether the volume is available + The 'monitor' operation reports whether the volume seems present + The 'validate-all' operation checks whether the OCF parameters are valid + The 'meta-data' operation show meta data + The 'methods' operation reports on the methods $0 supports + +EOF +} + +meta_data() { + cat <<EOF +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="LVM" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for LVM. It manages an Linux Volume Manager volume (LVM) +as an HA resource. +</longdesc> +<shortdesc lang="en">Controls the availability of an LVM Volume Group</shortdesc> + +<parameters> +<parameter name="volgrpname" unique="1" required="1"> +<longdesc lang="en"> +The name of volume group. +</longdesc> +<shortdesc lang="en">Volume group name</shortdesc> +<content type="string" default="${OCF_RESKEY_volgrpname_default}" /> +</parameter> +<parameter name="exclusive" unique="0" required="0"> +<longdesc lang="en"> +If set, the volume group will be activated exclusively. This option works one of +two ways. 
If the volume group has the cluster attribute set, then the volume group +will be activated exclusively using clvmd across the cluster. If the cluster attribute +is not set, the volume group will be activated exclusively using a tag and the volume_list +filter. When the tag option is in use, the volume_list in lvm.con must be initialized. This +can be as simple as setting 'volume_list = []' depending on your setup. +</longdesc> +<shortdesc lang="en">Exclusive activation</shortdesc> +<content type="boolean" default="${OCF_RESKEY_exclusive_default}" /> +</parameter> + +<parameter name="tag" unique="0" required="0"> +<longdesc lang="en"> +If "exclusive" is set on a non clustered volume group, this overrides the tag to be used. +</longdesc> +<shortdesc lang="en">Exclusive activation tag</shortdesc> +<content type="string" default="${OCF_RESKEY_tag_default}" /> +</parameter> + +<parameter name="partial_activation" unique="0" required="0"> +<longdesc lang="en"> +If set, the volume group will be activated partially even with some +physical volumes missing. It helps to set to true when using mirrored +logical volumes. +</longdesc> +<shortdesc lang="en">Activate VG partially when missing PVs</shortdesc> +<content type="string" default="${OCF_RESKEY_partial_activation_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="30s" /> +<action name="stop" timeout="30s" /> +<action name="status" timeout="30s" /> +<action name="monitor" depth="0" timeout="30s" interval="10s" /> +<action name="methods" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +EOF +} + +# +# methods: What methods/operations do we support? 
#
# LVM_methods: print the supported operations, one per line, to stdout.
#
LVM_methods() {
    cat <<EOF
    start
    stop
    status
    monitor
    methods
    validate-all
    meta-data
    usage
EOF
}

##
#
# plain = normal (non-exclusive) local activation
# tag = tagged-exclusive activation
# clvm = clvm-exclusive activation
#
# the mode specific implementation is in lvm-$mode.sh
##

#
# set_lvm_mode: choose the activation mode and source its implementation.
#
# With exclusive activation requested, the sixth character of the VG
# attribute string decides: 'c' selects "clvm", anything else "tag".
# Without exclusive activation the mode is "plain".
#
set_lvm_mode() {
    local mode

    if ocf_is_true "$OCF_RESKEY_exclusive"; then
        case $(vgs -o attr --noheadings $OCF_RESKEY_volgrpname | tr -d ' ') in
        ?????c*)
            mode="clvm" ;;
        *)
            mode="tag" ;;
        esac
    else
        mode="plain"
    fi

    . ${OCF_FUNCTIONS_DIR}/lvm-${mode}.sh
}

#
# Return LVM status (silently)
#
# $1: volume group name
# $2: when non-empty (the "status" operation), the result is also
#     reported on stdout
#
# Returns $OCF_NOT_RUNNING when /dev/$1 is missing or empty, otherwise
# the return code of the mode-specific lvm_status.
#
LVM_status() {
    local rc=1
    loglevel="debug"

    # Set the log level of the error message
    if [ "X${2}" = "X" ]; then
        loglevel="err"
        if ocf_is_probe; then
            loglevel="warn"
        else
            # Quoted so that an empty OP_METHOD cannot turn the test into
            # a syntax error.
            if [ "${OP_METHOD}" = "stop" ]; then
                loglevel="info"
            fi
        fi
    fi

    if [ -d "/dev/$1" ]; then
        test "`cd "/dev/$1" && ls`" != ""
        rc=$?
        if [ $rc -ne 0 ]; then
            ocf_exit_reason "VG $1 with no logical volumes is not supported by this RA!"
        fi
    fi

    if [ $rc -ne 0 ]; then
        ocf_log $loglevel "LVM Volume $1 is not available (stopped)"
        rc=$OCF_NOT_RUNNING
    else
        lvm_status
        rc=$?
    fi

    if [ "X${2}" = "X" ]; then
        # status call return
        return $rc
    fi

    # Report on LVM volume status to stdout...
    if [ $rc -eq 0 ]; then
        echo "Volume $1 is available (running)"
    else
        echo "Volume $1 is not available (stopped)"
    fi
    return $rc
}

#
# Enable LVM volume
#
# $1: volume group name
#
# Returns $OCF_SUCCESS when LVM_status confirms the VG after activation,
# $OCF_ERR_GENERIC otherwise. Exits with the status of lvm_pre_activate
# when that hook fails.
#
LVM_start() {
    local vg=$1

    # systemd drop-in to stop process before storage services during
    # shutdown/reboot
    if systemd_is_running ; then
        systemd_drop_in "99-LVM" "After" "blk-availability.service"
    fi

    # TODO: This MUST run vgimport as well
    ocf_log info "Activating volume group $vg"
    if [ "$LVM_MAJOR" -eq "1" ]; then
        ocf_run vgscan $vg
    else
        ocf_run vgscan
    fi

    lvm_pre_activate || exit
    ocf_run vgchange $vgchange_activate_options $vg
    lvm_post_activate $?

    if LVM_status $vg; then
        : OK Volume $vg activated just fine!
        return $OCF_SUCCESS
    else
        ocf_exit_reason "LVM: $vg did not activate correctly"
        return $OCF_ERR_GENERIC
    fi
}

#
# Disable the LVM volume
#
# $1: volume group name
#
# Returns $OCF_SUCCESS immediately when the VG is unknown to vgs.
# Otherwise deactivation is attempted up to 10 times, one second apart,
# and the final result code is handed to lvm_post_deactivate, whose
# return code becomes the function's.
#
LVM_stop() {
    local res=$OCF_ERR_GENERIC
    local vg=$1

    if ! vgs $vg > /dev/null 2>&1; then
        ocf_log info "Volume group $vg not found"
        return $OCF_SUCCESS
    fi

    ocf_log info "Deactivating volume group $vg"

    lvm_pre_deactivate || exit

    for i in $(seq 10)
    do
        ocf_run vgchange $vgchange_deactivate_options $vg
        res=$?
        # vgchange may report success while the VG is still active.
        if LVM_status $vg; then
            ocf_exit_reason "LVM: $vg did not stop correctly"
            res=1
        fi

        if [ $res -eq 0 ]; then
            break
        fi

        res=$OCF_ERR_GENERIC
        ocf_log warn "$vg still Active"
        ocf_log info "Retry deactivating volume group $vg"
        sleep 1
        which udevadm > /dev/null 2>&1 && udevadm settle --timeout=5
    done

    lvm_post_deactivate $res
}

#
# Check whether the OCF instance parameters are valid
#
# Exits with $OCF_ERR_GENERIC when the VG is inconsistent or missing, and
# with $OCF_ERR_CONFIGURED for a cloned resource with exclusive
# activation. Otherwise returns the result of the mode-specific
# lvm_validate_all.
#
LVM_validate_all() {
    check_binary $AWK

    ##
    # lvmetad is a daemon that caches lvm metadata to improve the
    # performance of LVM commands. This daemon should never be used when
    # volume groups exist that are being managed by the cluster. The lvmetad
    # daemon introduces a response lag, where certain LVM commands look like
    # they have completed (like vg activation) when in fact the command
    # is still in progress by the lvmetad. This can cause reliability issues
    # when managing volume groups in the cluster. For Example, if you have a
    # volume group that is a dependency for another application, it is possible
    # the cluster will think the volume group is activated and attempt to start
    # the application before volume group is really accessible... lvmetad is bad.
    ##
    lvm dumpconfig global/use_lvmetad | grep 'use_lvmetad.*=.*1' > /dev/null 2>&1
    if [ $? -eq 0 ]; then
        # for now warn users that lvmetad is enabled and that they should disable it. In the
        # future we may want to consider refusing to start, or killing the lvmetad daemon.
        ocf_log warn "Disable lvmetad in lvm.conf. lvmetad should never be enabled in a clustered environment. Set use_lvmetad=0 and kill the lvmetad process"
    fi

    ##
    # Off-the-shelf tests...
    ##
    VGOUT=`vgck ${VOLUME} 2>&1`
    if [ $? -ne 0 ]; then
        # Inconsistency might be due to missing physical volumes, which doesn't
        # automatically mean we should fail. If partial_activation=true then
        # we should let start try to handle it, or if no PVs are listed as
        # "unknown device" then another node may have marked a device missing
        # where we have access to all of them and can start without issue.
        if vgs -o pv_attr --noheadings $OCF_RESKEY_volgrpname 2>/dev/null | grep 'm' > /dev/null 2>&1; then
            case $(vgs -o attr --noheadings $OCF_RESKEY_volgrpname | tr -d ' ') in
            ???p??*)
                if ! ocf_is_true "$OCF_RESKEY_partial_activation" ; then
                    # We are missing devices and cannot activate partially
                    ocf_exit_reason "Volume group [$VOLUME] has devices missing. Consider partial_activation=true to attempt to activate partially"
                    exit $OCF_ERR_GENERIC
                else
                    # We are missing devices but are allowed to activate partially.
                    # Assume that caused the vgck failure and carry on
                    ocf_log warn "Volume group inconsistency detected with missing device(s) and partial_activation enabled. Proceeding with requested action."
                fi
                ;;
            esac
            # else the vg is partial but all devices are accounted for, so another
            # node must have marked the device missing. Proceed.
        else
            # vgck failure was for something other than missing devices
            ocf_exit_reason "Volume group [$VOLUME] does not exist or contains error! ${VGOUT}"
            exit $OCF_ERR_GENERIC
        fi
    fi

    ##
    # Does the Volume Group exist?
    ##
    if [ "$LVM_MAJOR" = "1" ]; then
        VGOUT=`vgdisplay ${VOLUME} 2>&1`
    else
        VGOUT=`vgdisplay -v ${VOLUME} 2>&1`
    fi
    # $? here is the exit status of whichever vgdisplay ran above.
    if [ $? -ne 0 ]; then
        ocf_exit_reason "Volume group [$VOLUME] does not exist or contains error! ${VGOUT}"
        exit $OCF_ERR_GENERIC
    fi

    # Look for cache segments only in the managed VG; scanning every VG on
    # the host raised this warning because of unrelated volume groups.
    if lvs --noheadings -o segtype "$OCF_RESKEY_volgrpname" | grep -q "cache"; then
        if ! lvs --noheadings -o cache_mode "$OCF_RESKEY_volgrpname" | grep -q "writethrough"; then
            ocf_log warn "LVM CACHE IS NOT IN WRITETHROUGH MODE. THIS IS NOT A SUPPORTED CONFIGURATION."
        fi
    fi

    if ocf_is_clone && ocf_is_true "$OCF_RESKEY_exclusive"; then
        ocf_exit_reason "cloned lvm resources can not be activated exclusively"
        exit $OCF_ERR_CONFIGURED
    fi

    lvm_validate_all
}

#
# 'main' starts here...
#

if
    [ $# -ne 1 ]
then
    usage
    exit $OCF_ERR_ARGS
fi

# Operations that need neither a volume group nor the LVM tools.
case $1 in
    meta-data)  meta_data
                exit $OCF_SUCCESS;;

    methods)    LVM_methods
                exit $?;;

    usage)      usage
                exit $OCF_SUCCESS;;
    *)          ;;
esac

if
    [ -z "$OCF_RESKEY_volgrpname" ]
then
    ocf_exit_reason "You must identify the volume group name!"
    exit $OCF_ERR_CONFIGURED
fi

# Get the LVM version number, for this to work we assume(thanks to panjiam):
#
# LVM1 outputs like this
#
# # vgchange --version
# vgchange: Logical Volume Manager 1.0.3
# Heinz Mauelshagen, Sistina Software 19/02/2002 (IOP 10)
#
# LVM2 and higher versions output in this format
#
# # vgchange --version
# LVM version: 2.00.15 (2004-04-19)
# Library version: 1.00.09-ioctl (2004-03-31)
# Driver version: 4.1.0

LVM_VERSION=`vgchange --version 2>&1 | \
    $AWK '/Logical Volume Manager/ {print $5"\n"; exit; }
          /LVM version:/ {printf $3"\n"; exit;}'`
rc=$?

if
    ( [ $rc -ne 0 ] || [ -z "$LVM_VERSION" ] )
then
    ocf_exit_reason "LVM: $1 could not determine LVM version. Try 'vgchange --version' manually and modify $0 ?"
    exit $OCF_ERR_INSTALLED
fi
LVM_MAJOR="${LVM_VERSION%%.*}"

VOLUME=$OCF_RESKEY_volgrpname
OP_METHOD=$1

set_lvm_mode
lvm_init
if ocf_is_true "$OCF_RESKEY_partial_activation" ; then
    vgchange_activate_options="${vgchange_activate_options} --partial"
fi

# What kind of method was invoked?
case "$1" in

    start)
        LVM_validate_all
        LVM_start $VOLUME
        exit $?;;

    stop)   LVM_stop $VOLUME
        exit $?;;

    status) LVM_status $VOLUME $1
        exit $?;;

    monitor) LVM_status $VOLUME
        exit $?;;

    validate-all) LVM_validate_all
        ;;

    *)  usage
        exit $OCF_ERR_UNIMPLEMENTED;;
esac
diff --git a/heartbeat/LVM-activate b/heartbeat/LVM-activate
new file mode 100755
index 0000000..f6f24a3
--- /dev/null
+++ b/heartbeat/LVM-activate
@@ -0,0 +1,997 @@
#!/bin/sh
#
#
# Copyright (c) 2017 SUSE LINUX, Eric Ren
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
# LVM-activate OCF Resource Agent:
#
# Logical volume manager (LVM) provides new features for cluster environment:
# lvmlockd and system ID, which aims to replace clvmd and tagged-exclusive
# activation. Accordingly, we have created a new resource agent named "lvmlockd"
# to manage lvmlockd daemon. In addition, this new resource agent "LVM-activate"
# is created to take care of LVM activation/deactivation work. This agent supports
# the new features: lvmlockd and system ID, and also supports the old features:
# clvmd and lvm tag.
#
# Thanks David Teigland! He is the author of these LVM features, giving valuable
# idea/feedback about this resource agent.
############################################################################

# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_vgname_default=""
OCF_RESKEY_lvname_default=""
OCF_RESKEY_vg_access_mode_default=""
OCF_RESKEY_activation_mode_default="exclusive"
OCF_RESKEY_tag_default="pacemaker"
OCF_RESKEY_partial_activation_default="false"
OCF_RESKEY_degraded_activation_default="false"
OCF_RESKEY_majority_pvs_default="false"

# Apply the defaults only where the cluster manager did not set a value.
: ${OCF_RESKEY_vgname=${OCF_RESKEY_vgname_default}}
: ${OCF_RESKEY_lvname=${OCF_RESKEY_lvname_default}}
: ${OCF_RESKEY_vg_access_mode=${OCF_RESKEY_vg_access_mode_default}}
: ${OCF_RESKEY_activation_mode=${OCF_RESKEY_activation_mode_default}}
: ${OCF_RESKEY_tag=${OCF_RESKEY_tag_default}}
: ${OCF_RESKEY_partial_activation=${OCF_RESKEY_partial_activation_default}}
: ${OCF_RESKEY_degraded_activation=${OCF_RESKEY_degraded_activation_default}}
: ${OCF_RESKEY_majority_pvs=${OCF_RESKEY_majority_pvs_default}}

# If LV is given, only activate this named LV; otherwise, activate all
# LVs in the named VG.
VG=${OCF_RESKEY_vgname}
LV=${OCF_RESKEY_lvname}

# How LVM controls access to the VG:
#
# 0: place-holder for any incorrect cases; To be safe, we enforce the VG
# must use any of the following protection methods in cluster environment.
# 1: vg is shared - lvmlockd (new)
# 2: vg is clustered - clvmd (old)
# 3: vg has system_id (new)
# 4: vg has tagging (old)
VG_access_mode=${OCF_RESKEY_vg_access_mode}
VG_access_mode_num=0

# Activate LV(s) with "shared" lock for cluster fs
# or "exclusive" lock for local fs
LV_activation_mode=${OCF_RESKEY_activation_mode}

# For system ID feature
SYSTEM_ID=""

# For tagging activation mode
OUR_TAG=${OCF_RESKEY_tag}

#######################################################################

#
# meta_data: print the OCF resource-agent metadata (XML) to stdout.
# Parameter defaults are interpolated from the OCF_RESKEY_*_default
# variables defined above.
#
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">


<resource-agent name="LVM-activate" version="1.0">
<version>1.0</version>

<longdesc lang="en">
This agent manages LVM activation/deactivation work for a given volume group.

It supports the following modes, controlled by the vg_access_mode parameter:

* lvmlockd
* system_id
* clvmd
* tagging

Notes:

1. There are two possible configuration combinations: lvmlockd+LVM-activate and
clvm+LVM-activate. However, it is not possible to use both at the same time!

2. Put all "lvmlockd"/"clvmd" volume groups into auto management by the agent
if using the cluster to manage at least one of them. If you manage some manually,
the stop action of the lvmlockd agent may fail and the node may get fenced,
because some DLM lockspaces might be in use and cannot be closed automatically.

3. The autoactivation property of volume group will be disabled when vg_access_mode
is set to system_id.

Option: OCF_CHECK_LEVEL

The standard monitor operation of depth 0 checks if the VG or LV is valid.
If you want deeper tests, set OCF_CHECK_LEVEL to 10:

 10: read first 1 byte of the underlying device (raw read)

If there are many underlying devs in VG, it will only read one of the devs.
This is not perfect solution for detecting underlying devices livable.
e.g. iscsi SAN IO timeout will return EIO, and it makes monitor failed.

</longdesc>
<shortdesc lang="en">This agent activates/deactivates logical volumes.</shortdesc>

<parameters>
<parameter name="vgname" unique="0" required="1">
<longdesc lang="en">
The volume group name.
</longdesc>
<shortdesc lang="en">The volume group name</shortdesc>
<content type="string" default="${OCF_RESKEY_vgname_default}" />
</parameter>

<parameter name="lvname" unique="0" required="0">
<longdesc lang="en">
If set, only the specified LV will be activated.
</longdesc>
<shortdesc lang="en">Only activate the given LV</shortdesc>
<content type="string" default="${OCF_RESKEY_lvname_default}" />
</parameter>

<parameter name="vg_access_mode" unique="0" required="1">
<longdesc lang="en">
This option decides which solution will be used to protect the volume group in
cluster environment. Optional solutions are: lvmlockd, clvmd, system_id and
tagging.
</longdesc>
<shortdesc lang="en">The VG access mode</shortdesc>
<content type="string" default="${OCF_RESKEY_vg_access_mode_default}" />
</parameter>

<parameter name="activation_mode" unique="0" required="0">
<longdesc lang="en">
The activation mode decides the visibility of logical volumes in the cluster. There
are two different modes: "shared" for cluster filesystem and "exclusive" for local
filesystem. With "shared", an LV can be activated concurrently from multiple nodes.
With "exclusive", an LV can be activated by one node at a time.

This option only has effect on "lvmlockd"/"clvmd" vg_access_mode. For "system_id"
and "tagging", they always mean exclusive activation.
</longdesc>
<shortdesc lang="en">Logical volume activation mode</shortdesc>
<content type="string" default="${OCF_RESKEY_activation_mode_default}" />
</parameter>

<parameter name="tag" unique="0" required="0">
<longdesc lang="en">
The tag used for tagging activation mode.
</longdesc>
<shortdesc lang="en">The tag used for tagging activation mode</shortdesc>
<content type="string" default="${OCF_RESKEY_tag_default}" />
</parameter>

<parameter name="partial_activation" unique="0" required="0">
<longdesc lang="en">
If set, the volume group will be activated partially even with some
physical volumes missing. It helps to set to true when using mirrored
logical volumes.
</longdesc>
<shortdesc lang="en">Activate VG partially when missing PVs</shortdesc>
<content type="string" default="${OCF_RESKEY_partial_activation_default}" />
</parameter>

<parameter name="degraded_activation" unique="0" required="0">
<longdesc lang="en">
Activate RAID LVs using the "degraded" activation mode. This allows RAID
LVs to be activated with missing PVs if all data can be provided with
RAID redundancy. The RAID level determines the number of PVs that are
required for degraded activation to succeed. If fewer PVs are available,
then degraded activation will fail. Also enable majority_pvs.
</longdesc>
<shortdesc lang="en">Activate RAID LVs in degraded mode when missing PVs</shortdesc>
<content type="string" default="${OCF_RESKEY_degraded_activation_default}" />
</parameter>

<parameter name="majority_pvs" unique="0" required="0">
<longdesc lang="en">
If set, the VG system ID can be reassigned to a new host if a majority
of PVs in the VG are present. Otherwise, VG failover with system ID
will fail when the VG is missing PVs. Also enable degraded_activation
when RAID LVs are used.
</longdesc>
<shortdesc lang="en">Allow changing the system ID of a VG with a majority of PVs</shortdesc>
<content type="string" default="${OCF_RESKEY_majority_pvs_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="90s" />
<action name="stop" timeout="90s" />
<action name="monitor" timeout="90s" interval="30s" depth="0" />
<action name="meta-data" timeout="10s" />
<action name="validate-all" timeout="20s" />
</actions>
</resource-agent>
END
}

#######################################################################

# usage: print the one-line command synopsis to stdout.
usage() {
    cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}
END
}

# VG access modes:
# 0: unsafe to activate LV without proper protection in cluster
# 1: vg is shared - lvmlockd (new)
# 2: vg is clustered - clvmd (old)
# 3: vg has system_id (new)
# 4: vg has tagging (old)

#
# get_VG_access_mode_num: derive the access mode from the VG metadata.
#
# Reads ${VG}; returns the mode number (0-4, see the table above) as the
# function's return code. Exports the LVM2_VG_* variables reported by
# vgs and sets SYSTEM_ID when the VG carries a system ID.
#
get_VG_access_mode_num()
{
    # Use -o reporting fields to get multiple bits of info from a single command
    kvs=$(vgs --foreign --nolocking --noheadings --nameprefixes \
        --rows --config report/compact_output=0 \
        -o name,clustered,lock_type,systemid,tags ${VG} 2>/dev/null | tr -d \')

    # A bare "export" with no operands prints every exported variable, so
    # only export when vgs actually produced NAME=value pairs. The
    # expansion stays unquoted on purpose: each pair must be its own word.
    if [ -n "${kvs}" ]; then
        export ${kvs}
    fi
    vg_locktype=${LVM2_VG_LOCK_TYPE}
    vg_clustered=${LVM2_VG_CLUSTERED}
    vg_systemid=${LVM2_VG_SYSTEMID}
    vg_tags=${LVM2_VG_TAGS}

    # We know this VG is using lvmlockd if the lock type is dlm.
    if [ "$vg_locktype" = "dlm" ]; then
        access_mode=1
    elif [ "$vg_clustered" = "clustered" ]; then
        access_mode=2
    elif [ -n "$vg_systemid" ]; then
        SYSTEM_ID=$(lvm systemid 2>/dev/null | cut -d':' -f2 | tr -d '[:blank:]')
        access_mode=3
    elif [ -n "$vg_tags" ]; then
        # TODO:
        # We don't have reliable way to test if tagging activation is used.
        access_mode=4
    else
        access_mode=0
    fi

    return $access_mode
}

# TODO: All tagging activation code is almost copied from LVM RA!!!
# But, the old LVM RA just uses the ordinary tags, not the "hosttag" feature
# which may be a better method for active-inactive cluster scenario.
#
# We have two choice:
# 1. Continue to use the LVM way, which may work well on old system.
# 2. Change to use the real hosttag feature, but it looks very same
# to systemID.
# Anyway, we can easily change this if anyone requests with good reasons.

# does this vg have our tag
#
# Returns 0 when the VG's tag string equals OUR_TAG, 1 when the VG has no
# tag at all, 2 when some other tag (or tag list) is set.
check_tags()
{
    owner=$(vgs -o tags --noheadings ${VG} | tr -d '[:blank:]')

    if [ -z "$owner" ]; then
        # No-one owns this VG yet
        return 1
    fi

    if [ "$OUR_TAG" = "$owner" ]; then
        # yep, this is ours
        return 0
    fi

    # some other tag is set on this vg
    return 2
}

# Remove every tag from ${VG}.
#
# Returns $OCF_SUCCESS; exits with $OCF_ERR_GENERIC when tags are still
# present afterwards.
strip_tags()
{
    # Query the VG this agent manages. OCF_RESKEY_volgrpname belongs to
    # the LVM agent and is not a parameter of LVM-activate, so using it
    # here listed the tags of every VG on the host.
    for tag in $(vgs --noheadings -o tags ${VG} | sed s/","/" "/g); do
        ocf_log info "Stripping tag, $tag"

        # LVM version 2.02.98 allows changing tags if PARTIAL
        vgchange --deltag "$tag" ${VG}
    done

    if [ -n "$(vgs -o tags --noheadings ${VG} | tr -d '[:blank:]')" ]; then
        ocf_exit_reason "Failed to remove ownership tags from ${VG}"
        exit $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Make OUR_TAG the tag on ${VG}.
#
# Returns $OCF_SUCCESS when the tag is already ours or was added,
# $OCF_ERR_GENERIC when stripping foreign tags or adding ours fails.
set_tags()
{
    # Run check_tags and dispatch on its return code. Matching the bare
    # word "check_tags" never called the function and always fell through
    # to the default branch.
    check_tags
    case $? in
    0)
        # we already own it.
        return $OCF_SUCCESS
        ;;
    2)
        # other tags are set, strip them before setting
        if ! strip_tags; then
            return $OCF_ERR_GENERIC
        fi
        ;;
    *)
        : ;;
    esac

    if ! vgchange --addtag $OUR_TAG ${VG} ; then
        ocf_exit_reason "Failed to add ownership tag to ${VG}"
        return $OCF_ERR_GENERIC
    fi

    ocf_log info "New tag \"${OUR_TAG}\" added to ${VG}"
    return $OCF_SUCCESS
}

# Parameters:
# 1st: config item name
# 2nd: expected config item value
#
# Returns $OCF_SUCCESS on a match; exits with $OCF_ERR_ARGS otherwise.
config_verify()
{
    name=$1
    expect=$2

    real=$(lvmconfig "$name" | cut -d'=' -f2)
    if [ "$real" != "$expect" ]; then
        ocf_exit_reason "config item $name: expect=$expect but real=$real"
        exit $OCF_ERR_ARGS
    fi

    return $OCF_SUCCESS
}

# Verify the lvmlockd setup: configuration values and running daemons.
#
# Returns $OCF_SUCCESS; exits with $OCF_NOT_RUNNING on a probe when
# lvmlockd is not running, with $OCF_ERR_GENERIC when lvmlockd is down or
# clvmd is up.
lvmlockd_check()
{
    config_verify "global/use_lvmlockd" "1"

    # locking_type was removed from config in v2.03
    ocf_version_cmp "$(lvmconfig --version | awk '/LVM ver/ {sub(/\(.*/, "", $3); print $3}')" "2.03"
    if [ "$?" -eq 0 ]; then
        config_verify "global/locking_type" "1"
    fi

    # We recommend to activate one LV at a time so that this specific volume
    # binds to a proper filesystem to protect the data
    # TODO:
    # Will this warn message be too noisy?
    if [ -z "$LV" ]; then
        ocf_log warn "You are recommended to activate one LV at a time or use exclusive activation mode."
    fi

    # Good: lvmlockd is running, and clvmd is not running
    if ! pgrep lvmlockd >/dev/null 2>&1 ; then
        if ocf_is_probe; then
            ocf_log info "initial probe: lvmlockd is not running yet."
            exit $OCF_NOT_RUNNING
        fi

        ocf_exit_reason "lvmlockd daemon is not running!"
        exit $OCF_ERR_GENERIC
    fi

    if pgrep clvmd >/dev/null 2>&1 ; then
        ocf_exit_reason "clvmd daemon is running unexpectedly."
        exit $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Verify the clvmd setup: configuration values and running daemons.
#
# Returns $OCF_SUCCESS; exits with $OCF_ERR_GENERIC when clvmd is down or
# lvmetad/lvmlockd is up.
clvmd_check()
{
    config_verify "global/use_lvmetad" "0"
    config_verify "global/use_lvmlockd" "0"
    config_verify "global/locking_type" "3"

    # TODO:
    # David asked a question: does lvchange -aey works well enough with clvmd?
    #
    # Corey said: I think it does work well enough. We do a fair amount of exclusive
    # activation clvm testing, and my experience is you'll get the LV activated on
    # the node you ran the command on. But, I think the specific scenario and issue
    # that surprised us all was when the LV was *already* exclusively active on say
    # nodeA, and nodeB then attempts to also exclusively activate it as well. Instead
    # of failing, the activation succeeds even though nodeB activation didn't occur.
    # This is documented in the following bug:
    # https://bugzilla.redhat.com/show_bug.cgi?id=1191724#c8
    # Technically, you're not guaranteed to have it activated on the node you run
    # the cmd on, but again, that's not been my experience.
    #
    # Eric: Put the interesting discussion here so that we can be more careful on this.

    # Good: clvmd is running, and lvmlockd is not running
    if ! pgrep clvmd >/dev/null 2>&1 ; then
        ocf_exit_reason "clvmd daemon is not running!"
        exit $OCF_ERR_GENERIC
    fi

    if pgrep lvmetad >/dev/null 2>&1 ; then
        ocf_exit_reason "Please stop lvmetad daemon when clvmd is running."
        exit $OCF_ERR_GENERIC
    fi

    if pgrep lvmlockd >/dev/null 2>&1 ; then
        ocf_exit_reason "lvmlockd daemon is running unexpectedly."
        exit $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Verify the system ID setup.
#
# Returns $OCF_SUCCESS; exits with $OCF_ERR_ARGS when system_id_source is
# empty or "none", or when SYSTEM_ID is empty.
systemid_check()
{
    # system_id_source is set in lvm.conf
    source=$(lvmconfig 'global/system_id_source' 2>/dev/null | cut -d"=" -f2)
    if [ "$source" = "" ] || [ "$source" = "none" ]; then
        ocf_exit_reason "system_id_source in lvm.conf is not set correctly!"
        exit $OCF_ERR_ARGS
    fi

    if [ -z "${SYSTEM_ID}" ]; then
        ocf_exit_reason "local/system_id is not set!"
        exit $OCF_ERR_ARGS
    fi

    return $OCF_SUCCESS
}

# Verify tags setup
#
# Returns $OCF_SUCCESS; exits with $OCF_ERR_ARGS when volume_list is not
# initialized or already names our tag or the VG.
tagging_check()
{
    # The volume_list must be initialized to something in order to
    # guarantee our tag will be filtered on startup
    if ! lvm dumpconfig activation/volume_list; then
        ocf_log err "LVM: Improper setup detected"
        ocf_exit_reason "The volume_list filter must be initialized in lvm.conf for exclusive activation without clvmd"
        exit $OCF_ERR_ARGS
    fi

    # Our tag must _NOT_ be in the volume_list. This agent
    # overrides the volume_list during activation using the
    # special tag reserved for cluster activation
    if lvm dumpconfig activation/volume_list | grep -e "\"@${OUR_TAG}\"" -e "\"${VG}\""; then
        ocf_log err "LVM: Improper setup detected"
        ocf_exit_reason "The volume_list in lvm.conf must not contain the cluster tag, \"${OUR_TAG}\", or volume group, ${VG}"
        exit $OCF_ERR_ARGS
    fi

    return $OCF_SUCCESS
}

# Check the mandatory parameters and translate vg_access_mode into
# VG_access_mode_num.
#
# Exits with $OCF_ERR_CONFIGURED when the VG name is empty or the
# activation mode is neither "shared" nor "exclusive".
read_parameters()
{
    if [ -z "$VG" ]
    then
        ocf_exit_reason "You must identify the volume group name!"
        exit $OCF_ERR_CONFIGURED
    fi

    if [ "$LV_activation_mode" != "shared" ] && [ "$LV_activation_mode" != "exclusive" ]
    then
        ocf_exit_reason "Invalid value for activation_mode: $LV_activation_mode"
        exit $OCF_ERR_CONFIGURED
    fi

    # Convert VG_access_mode from string to index
    case ${VG_access_mode} in
    lvmlockd)
        VG_access_mode_num=1
        ;;
    clvmd)
        VG_access_mode_num=2
        ;;
    system_id)
        VG_access_mode_num=3
        ;;
    tagging)
        VG_access_mode_num=4
        ;;
    *)
        # dont exit with error-code here or nodes will get fenced on
        # e.g. "pcs resource create"
        ocf_exit_reason "You specified an invalid value for vg_access_mode: $VG_access_mode"
        ;;
    esac
}

# Validate parameters, required binaries, the VG and the access-mode
# specific setup.
#
# Returns $OCF_SUCCESS; every failure path exits the agent (see the
# individual checks for the exit codes).
lvm_validate() {
    read_parameters

    check_binary pgrep
    # Every LVM command is just symlink to lvm binary
    check_binary lvm
    check_binary dmsetup

    # This is necessary when using system ID to update lvm hints,
    # or in older versions of lvm, this is necessary to update the
    # lvmetad cache.
    pvscan --cache

    if ! vgs --foreign ${VG} >/dev/null 2>&1 ; then
        # stop action exits successfully if the VG cannot be accessed...
        if [ "$__OCF_ACTION" = "stop" ]; then
            ocf_log warn "VG [${VG}] cannot be accessed, stop action exits successfully."
            exit $OCF_SUCCESS
        fi

        if ocf_is_probe; then
            ocf_log info "initial probe: VG [${VG}] is not found on any block device yet."
            exit $OCF_NOT_RUNNING
        fi

        # Could be a transient error (e.g., iSCSI connection
        # issue) so use OCF_ERR_GENERIC
        ocf_exit_reason "Volume group[${VG}] doesn't exist, or not visible on this node!"
        exit $OCF_ERR_GENERIC
    fi

    vg_missing_pv_count=$(vgs -o missing_pv_count --noheadings ${VG} 2>/dev/null)

    # Left unquoted: the vgs output is padded with blanks.
    if [ $vg_missing_pv_count -gt 0 ]; then
        ocf_log warn "Volume Group ${VG} is missing $vg_missing_pv_count PVs."

        # Setting new system ID will succeed if over half of PVs remain.
        # Don't try to calculate here if a majority is present,
        # but leave this up to the vgchange command to determine.
        if ocf_is_true "$OCF_RESKEY_majority_pvs" ; then
            ocf_log warn "Attempting fail over with missing PVs (majority.)"

        # Setting new system ID will fail, and behavior is undefined for
        # other access modes.
        elif ocf_is_true "$OCF_RESKEY_partial_activation" ; then
            ocf_log warn "Attempting fail over with missing PVs (partial.)"

        else
            ocf_exit_reason "Volume group [$VG] has devices missing. Consider majority_pvs=true"
            exit $OCF_ERR_GENERIC
        fi
    fi

    # Get the access mode from VG metadata and check if it matches the input
    # value. Skip to check "tagging" mode because there's no reliable way to
    # automatically check if "tagging" mode is being used.
    get_VG_access_mode_num
    mode=$?
    if [ $VG_access_mode_num -ne 4 ] && [ $mode -ne $VG_access_mode_num ]; then
        ocf_exit_reason "The specified vg_access_mode doesn't match the lock_type on VG metadata!"
        exit $OCF_ERR_CONFIGURED
    fi

    # Nothing to do if the VG has no logical volume
    lv_count=$(vgs --foreign -o lv_count --noheadings ${VG} 2>/dev/null)
    if [ $lv_count -lt 1 ]; then
        ocf_exit_reason "Volume group [$VG] doesn't contain any logical volume!"
        exit $OCF_ERR_CONFIGURED
    fi

    # Check if the given $LV is in the $VG
    if [ -n "$LV" ]; then
        output=$(lvs --foreign --noheadings ${VG}/${LV} 2>&1)
        if [ $? -ne 0 ]; then
            ocf_log err "lvs: ${output}"
            ocf_exit_reason "LV ($LV) is not in the given VG ($VG)."
            exit $OCF_ERR_CONFIGURED
        fi
    fi

    # VG_access_mode_num specific checking goes here
    case ${VG_access_mode_num} in
    1)
        lvmlockd_check
        ;;
    2)
        clvmd_check
        ;;
    3)
        systemid_check
        ;;
    4)
        tagging_check
        ;;
    *)
        ocf_exit_reason "Incorrect VG access mode detected!"
        exit $OCF_ERR_CONFIGURED
    esac

    # $? is the return code of the check selected above.
    if [ $? -ne $OCF_SUCCESS ]; then
        ocf_exit_reason "Improper configuration issue is detected!"
        exit $OCF_ERR_CONFIGURED
    fi

    return $OCF_SUCCESS
}

# To activate LV(s) with different "activation mode" parameters
#
# $1: options passed to lvchange (e.g. "-aey", "-an")
#
# Returns $OCF_SUCCESS when lvchange succeeds, $OCF_ERR_GENERIC otherwise.
do_activate() {
    do_activate_opt=$1

    if ocf_is_true "$OCF_RESKEY_degraded_activation" ; then
        # This will allow a RAID LV to be activated if sufficient
        # devices are available to allow the LV to be usable
        do_activate_opt="${do_activate_opt} --activationmode degraded"

    elif ocf_is_true "$OCF_RESKEY_partial_activation" ; then
        # This will allow a mirror LV to be activated if any
        # devices are missing, but the activated LV may not be
        # usable, so it is not recommended. Also, other LV
        # types without data redundancy will be activated
        # when partial is set.
        # RAID LVs and degraded_activation should be used instead.
        do_activate_opt="${do_activate_opt} --partial"
    fi

    # Only activate the specific LV if it's given
    if [ -n "$LV" ]; then
        ocf_run lvchange $do_activate_opt ${VG}/${LV}
        if [ $? -ne $OCF_SUCCESS ]; then
            return $OCF_ERR_GENERIC
        fi
    else
        ocf_run lvchange $do_activate_opt ${VG}
        if [ $? -ne $OCF_SUCCESS ]; then
            return $OCF_ERR_GENERIC
        fi
    fi

    return $OCF_SUCCESS
}

# Start the VG lockspace, then activate with a shared or exclusive lock.
# Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
lvmlockd_activate() {
    if [ "$LV_activation_mode" = "shared" ]; then
        activate_opt="-asy"
    else
        activate_opt="-aey"
    fi

    # lvmlockd requires shared VGs to be started before they're used
    ocf_run vgchange --lockstart ${VG}
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ]; then
        ocf_exit_reason "Failed to start shared VG(s), exit code: $rc"
        return $OCF_ERR_GENERIC
    fi

    do_activate "$activate_opt"
    if [ $? -ne $OCF_SUCCESS ]; then
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# clvmd must be running to activate clustered VG
# Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
clvmd_activate() {
    if [ "$LV_activation_mode" = "shared" ]; then
        activate_opt="-asy"
    else
        activate_opt="-aey"
    fi

    do_activate "$activate_opt"
    if [ $? -ne $OCF_SUCCESS ]; then
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Take over the VG by writing our system ID, disable autoactivation when
# vgchange supports it, then activate.
# Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
systemid_activate() {
    majority_opt=""
    set_autoactivation=0
    cur_systemid=$(vgs --foreign --noheadings -o systemid ${VG} | tr -d '[:blank:]')

    # Only pass --majoritypvs when this vgchange knows the option.
    if ocf_is_true "$OCF_RESKEY_majority_pvs" ; then
        vgchange --help | grep '\--majoritypvs' >/dev/null 2>&1 && majority_opt="--majoritypvs"
    fi

    # Put our system ID on the VG
    vgchange -y $majority_opt --config "local/extra_system_ids=[\"${cur_systemid}\"]" \
        --systemid ${SYSTEM_ID} ${VG}
    vgchange --help | grep '\--setautoactivation' >/dev/null 2>&1 && set_autoactivation=1

    if [ $set_autoactivation -ne 0 ]; then
        if vgs -o autoactivation ${VG} | grep enabled >/dev/null 2>&1 ; then
            ocf_log info "disable the autoactivation property of ${VG}"
            ocf_run vgchange --setautoactivation n ${VG}
        fi
    fi

    do_activate "-ay"
    if [ $? -ne $OCF_SUCCESS ]; then
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Tag the VG as ours, then activate with a volume_list restricted to our
# tag. Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
tagging_activate() {
    if ! set_tags ; then
        ocf_exit_reason "Failed to set tags on ${VG}."
        return $OCF_ERR_GENERIC
    fi

    do_activate "-ay --config activation{volume_list=[\"@${OUR_TAG}\"]}"
    if [ $? -ne $OCF_SUCCESS ]; then
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Deactivate, then stop the VG lockspace unless some LV is still active.
# Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
lvmlockd_deactivate() {
    do_activate "-an"
    if [ $? -ne $OCF_SUCCESS ]; then
        return $OCF_ERR_GENERIC
    fi

    OUT=$(lvs --noheadings -S lv_active=active ${VG} 2>/dev/null)
    [ -n "$OUT" ] && return $OCF_SUCCESS

    # Close the lockspace of this VG if there is no active LV
    ocf_run vgchange --lockstop ${VG}
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ]; then
        ocf_exit_reason "Failed to close the shared VG lockspace, exit code: $rc"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Deactivate a clustered VG. Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
clvmd_deactivate() {
    do_activate "-an"
    if [ $? -ne $OCF_SUCCESS ]; then
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Deactivate a system ID VG. Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
systemid_deactivate() {
    do_activate "-an"
    if [ $? -ne $OCF_SUCCESS ]; then
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Deactivate a tagged VG and drop its tags.
# Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
tagging_deactivate() {
    do_activate "-an --config activation{volume_list=[\"@${OUR_TAG}\"]}"
    if [ $? -ne $OCF_SUCCESS ]; then
        return $OCF_ERR_GENERIC
    fi

    if ! strip_tags ; then
        ocf_exit_reason "Failed to remove tags on ${VG}."
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# TODO:
# How can we accurately check if LVs in the given VG are all active?
#
# David:
# If we wanted to check that all LVs in the VG are active, then we would
# probably need to use the lvs/lv_live_table command here since dmsetup
# won't know about inactive LVs that should be active.
#
# Eric:
# But, lvs/lv_live_table command doesn't work well now. I tried the following
# method:
#
# lv_count=$(vgs --foreign -o lv_count --noheadings ${VG} 2>/dev/null | tr -d '[:blank:]')
# dm_count=$(dmsetup --noheadings info -c -S "vg_name=${VG}" 2>/dev/null | grep -c "${VG}-")
# test $lv_count -eq $dm_count
#
# It works, but we cannot afford to use LVM command in lvm_status.
LVM command is expensive +# because it may potentially scan all disks on the system, update the metadata even using +# lvs/vgs when the metadata is somehow inconsistent. +# +# So, we have to make the compromise that the VG is assumed active if any LV of the VG is active. +# +# Paul: +# VGS + LVS with "-" in their name get mangled with double dashes in dmsetup. +# Switching to wc and just counting lines while depending on the vgname + lvname filter +# in dmsetup gets around the issue with dmsetup reporting correctly but grep failing. +# +# Logic for both test cases and dmsetup calls changed so they match too. +# +# This is AllBad but there isn't a better way that I'm aware of yet. +# +# NOTE(review): the note above mentions wc, but the code below counts with +# "grep -c -v '^No devices found'". +# +# Returns OCF_NOT_RUNNING (probe) or OCF_ERR_GENERIC (otherwise) when dmsetup reports +# no matching device; at OCF_CHECK_LEVEL 10 additionally reads one byte from the +# first entry under /dev/${VG}. +lvm_status() { + if [ -n "${LV}" ]; then + # dmsetup ls? It cannot accept device name. It's + # too heavy to list all DM devices. + dm_count=$(dmsetup info --noheadings --noflush -c -S "vg_name=${VG} && lv_name=${LV}" | grep -c -v '^No devices found') + else + dm_count=$(dmsetup info --noheadings --noflush -c -S "vg_name=${VG}" | grep -c -v '^No devices found') + fi + + if [ $dm_count -eq 0 ]; then + if ocf_is_probe ;then + return $OCF_NOT_RUNNING + else + return $OCF_ERR_GENERIC + fi + fi + + case "$OCF_CHECK_LEVEL" in + 0) + # no deeper check at level 0; the dm_count test above already passed + ;; + 10) + # if there are many lv in vg dir, pick the first name + dm_name="/dev/${VG}/$(ls -1 /dev/${VG} | head -n 1)" + + # read 1 byte to check the dev is alive + dd if=${dm_name} of=/dev/null bs=1 count=1 >/dev/null \ + 2>&1 + if [ $? -ne 0 ]; then + return $OCF_ERR_GENERIC + fi + return $OCF_SUCCESS + ;; + *) + ocf_exit_reason "unsupported monitor level $OCF_CHECK_LEVEL" + return $OCF_ERR_CONFIGURED + ;; + esac +} + +lvm_start() { + if systemd_is_running ; then + # Create drop-in to deactivate VG before stopping + # storage services during shutdown/reboot. + systemctl show resource-agents-deps.target \ + --property=After | cut -d'=' -f2 | \ + grep -qE "(^|\s)blk-availability.service(\s|$)" + + if [ "$?"
-ne 0 ]; then + systemd_drop_in "99-LVM-activate" "After" \ + "blk-availability.service" + fi + + # If blk-availability isn't started, the "After=" + # directive has no effect. + if ! systemctl is-active blk-availability.service ; then + systemctl start blk-availability.service + fi + fi + + # Resolve the display name before the first log message that uses it + # (it used to be assigned only after the "already active" message). + [ -z "${LV}" ] && vol=${VG} || vol=${VG}/${LV} + + if lvm_status ; then + ocf_log info "${vol}: is already active." + return $OCF_SUCCESS + fi + + ocf_log info "Activating ${vol}" + + case ${VG_access_mode_num} in + 1) + lvmlockd_activate + ;; + 2) + clvmd_activate + ;; + 3) + systemid_activate + ;; + 4) + tagging_activate + ;; + *) + ocf_exit_reason "VG [${VG}] is not properly configured in cluster. It's unsafe!" + exit $OCF_ERR_CONFIGURED + ;; + esac + + rc=$? + if lvm_status ; then + ocf_log info "${vol}: activated successfully." + return $OCF_SUCCESS + else + ocf_exit_reason "${vol}: failed to activate." + # The volume is not active, so never return success here even if + # the activation helper itself exited 0. + [ $rc -eq $OCF_SUCCESS ] && rc=$OCF_ERR_GENERIC + return $rc + fi +} + +# Deactivate LVM volume(s) +# Returns OCF_SUCCESS when the volume is (or already was) inactive, +# OCF_ERR_GENERIC when it is still active after the deactivation attempt. +lvm_stop() { + [ -z "${LV}" ] && vol=${VG} || vol=${VG}/${LV} + + if ! lvm_status ; then + ocf_log info "${vol}: has already been deactivated." + return $OCF_SUCCESS + fi + + ocf_log info "Deactivating ${vol}" + + case ${VG_access_mode_num} in + 1) + lvmlockd_deactivate + ;; + 2) + clvmd_deactivate + ;; + 3) + systemid_deactivate + ;; + 4) + tagging_deactivate + ;; + *) + # NOTE(review): exits with success for an unknown access mode, + # presumably so that stop does not fail - confirm this is intended. + ocf_exit_reason "VG [${VG}] is not properly configured in cluster. It's unsafe!" + exit $OCF_SUCCESS + ;; + esac + + if ! lvm_status ; then + ocf_log info "${vol}: deactivated successfully." + return $OCF_SUCCESS + else + ocf_exit_reason "${vol}: failed to deactivate." + return $OCF_ERR_GENERIC + fi +} + +# +# MAIN +# + +case $__OCF_ACTION in +start) + lvm_validate + lvm_start + ;; +stop) + read_parameters + lvm_stop + ;; +monitor) + lvm_status + ;; +validate-all) + lvm_validate + ;; +meta-data) + meta_data + ;; +usage|help) + usage + ;; +*) + usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$?
+ +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/LinuxSCSI b/heartbeat/LinuxSCSI new file mode 100755 index 0000000..e9038cd --- /dev/null +++ b/heartbeat/LinuxSCSI @@ -0,0 +1,322 @@ +#!/bin/sh +# +# +# LinuxSCSI +# +# Description: Enables/Disables SCSI devices to protect them from being +# used by mistake +# +# +# Author: Alan Robertson +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# Copyright: (C) 2002 - 2005 IBM +# +# CAVEATS: See the usage message for some important warnings +# +# usage: ./LinuxSCSI (start|stop|status|monitor|meta-data|validate-all|methods) +# +# OCF parameters are as below: +# OCF_RESKEY_scsi +# +# An example usage in /etc/ha.d/haresources: +# node1 10.0.0.170 LinuxSCSI:0:0:11 +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_scsi_default="" +OCF_RESKEY_ignore_deprecation_default="false" + +: ${OCF_RESKEY_scsi=${OCF_RESKEY_scsi_default}} +: ${OCF_RESKEY_ignore_deprecation=${OCF_RESKEY_ignore_deprecation_default}} + +####################################################################### + +zeropat="[ 0]0" + +PROCSCSI=/proc/scsi/scsi + +usage() { + cat <<EOF + usage: $0 (start|stop|status|monitor|meta-data|validate-all|methods) + + $0 manages the availability of a SCSI device from the point + of view of the linux kernel. It make Linux believe the + device has gone away, and it can make it come back again. + + The purpose of this resource script is to keep admins from + accidentally messing with a shared disk that is managed by the + HA subsystem and is currently owned by the other side. + + To get maximum benefit from this feature, you should (manually) + disable the resources on boot, and let your HA software enable + them when it wants to acquire the disk. 
+ + The kernel code says this is potentially dangerous. DO NOT USE + IT ON AN ACTIVE DEVICE. If the device is inactive, this script + will make it stay inactive, when given "off". If you inactivate + the wrong device, you may have to reboot your machine, and your + data may take a hit. + + On the other hand, at least one RAID controller requires the + use of this technique for it to work correctly in a failover + environment - so it is believed that it is more stable in this + usage than the comments in the code imply. + + Here are the warnings from the kernel source about the "stop" + operation as of 2.4.10: + + ------------------------------ + Consider this feature pre-BETA. + CAUTION: This is not for hotplugging your peripherals. As + SCSI was not designed for this, you could damage your + hardware and thoroughly confuse the SCSI subsystem. + + Similar warnings apply to the "start" operation... + + Consider this feature BETA. + CAUTION: This is not for hotplugging your peripherals. + As SCSI was not designed for this you could damage your + hardware ! + However perhaps it is legal to switch on an already connected + device. It is perhaps not guaranteed this device doesn't corrupt + an ongoing data transfer. + ------------------------- + + So, Caveat Emptor, and test this feature thoroughly on + your kernel and your configuration with real load on the SCSI + bus before using it in production! + + Another potential trouble spot... + The order in which you bring up LinuxSCSI resources determines which + SCSI device they show up as on Linux. If you have two SCSI devices + in different resource groups they will be brought up asyncronously + resulting in indeterminate device name assignments. This usually + happens in an active-active configuration. + + To solve this you probably should use LVM or EVMS to manage these + volumes. LVM and EVMS solve this problem for you by labels they + keep in the volumes. 
If you don't use a reasonable volume manager, + then you'll have to mount by UUID. + +EOF +} + +meta_data() { + cat <<EOF +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="LinuxSCSI" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Deprecation warning: This agent makes use of Linux SCSI hot-plug +functionality which has been superseded by SCSI reservations. It is +deprecated and may be removed from a future release. See the +scsi2reservation and sfex agents for alternatives. -- +This is a resource agent for LinuxSCSI. It manages the availability of a +SCSI device from the point of view of the linux kernel. It make Linux +believe the device has gone away, and it can make it come back again. +</longdesc> +<shortdesc lang="en">Enables and disables SCSI devices through the +kernel SCSI hot-plug subsystem (deprecated)</shortdesc> + +<parameters> +<parameter name="scsi" unique="0" required="1"> +<longdesc lang="en"> +The SCSI instance to be managed. +</longdesc> +<shortdesc lang="en">SCSI instance</shortdesc> +<content type="string" default="${OCF_RESKEY_scsi_default}" /> +</parameter> + +<parameter name="ignore_deprecation"> +<longdesc lang="en"> +If set to true, suppresses the deprecation warning for this agent. 
+</longdesc> +<shortdesc lang="en">Suppress deprecation warning</shortdesc> +<content type="boolean" default="${OCF_RESKEY_ignore_deprecation_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="methods" timeout="5s" /> +<action name="status" depth="0" timeout="20s" interval="10s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +EOF +} + +scsi_methods() { + cat <<EOF + start + stop + status + monitor + validate-all + methods +EOF +} + + +parseinst() { + lun=0 + case "$1" in + + [0-9]*:[0-9]*:[0-9]*);; + + [0-9]*:[0-9]*:[0-9]*:[0-9]*) + lun=`echo "$1" | cut -d: -f4`;; + + *) #host=error + #channel=error + #target=error + #lun=error + ocf_exit_reason "Invalid SCSI instance $1" + exit $OCF_ERR_ARGS + esac + host=`echo "$1" | cut -d: -f1` + channel=`echo "$1" | cut -d: -f2` + target=`echo "$1" | cut -d: -f3` +} + +# +# start: Enable the given SCSI device in the kernel +# +scsi_start() { + parseinst "$1" +# [ $target = error ] && exit 1 +# echo "scsi-add-single-device $host $channel $target $lun" >>$PROCSCSI + echo "scsi add-single-device $host $channel $target $lun" >>$PROCSCSI + if + scsi_status "$1" + then + return $OCF_SUCCESS + else + ocf_exit_reason "SCSI device $1 not active!" + return $OCF_ERR_GENERIC + fi +} + + +# +# stop: Disable the given SCSI device in the kernel +# +scsi_stop() { + parseinst "$1" +# [ $target = error ] && exit 1 + echo "scsi remove-single-device $host $channel $target $lun" >>$PROCSCSI + if + scsi_status "$1" + then + ocf_exit_resaon "SCSI device $1 still active!" + return $OCF_ERR_GENERIC + else + return $OCF_SUCCESS + fi +} + + +# +# status: is the given device now available? 
+# +scsi_status() { + parseinst "$1" +# [ $target = error ] && exit 1 + [ $channel -eq 0 ] && channel=$zeropat + [ $target -eq 0 ] && target=$zeropat + [ $lun -eq 0 ] && lun=$zeropat + greppat="Host: *scsi$host *Channel: *$channel *Id: *$target *Lun: *$lun" + grep -i "$greppat" $PROCSCSI >/dev/null + if [ $? -eq 0 ]; then + return $OCF_SUCCESS + else + return $OCF_NOT_RUNNING + fi +} + +# +# validate_all: Check the OCF instance parameters +# +scsi_validate_all() { + parseinst $instance + return $OCF_SUCCESS +} + +if + ( [ $# -ne 1 ] ) +then + ocf_exit_reason "Parameter number error." + usage + exit $OCF_ERR_GENERIC +fi + +#if +# [ -z "$OCF_RESKEY_scsi" ] && [ "X$1" = "Xmethods" ] +#then +# scsi_methods +# exit #? +#fi +case $1 in + methods) scsi_methods + exit $OCF_SUCCESS + ;; + meta-data) meta_data + exit $OCF_SUCCESS + ;; + usage) usage + exit $OCF_SUCCESS + ;; + *) ;; +esac + +# Be obnoxious, log deprecation warning on every invocation (unless +# suppressed by resource configuration). +ocf_deprecated + +if + [ -z "$OCF_RESKEY_scsi" ] +then + ocf_exit_reason "You have to set a valid scsi id at least!" +# usage + exit $OCF_ERR_GENERIC +fi + +instance=$OCF_RESKEY_scsi + +case $1 in + start) scsi_start $instance + ;; + stop) scsi_stop $instance + ;; + status|monitor) + if + scsi_status $instance + then + ocf_log info "SCSI device $instance is running" + return $OCF_SUCCESS + else + ocf_log info "SCSI device $instance is stopped" + exit $OCF_NOT_RUNNING + fi + ;; + validate-all) scsi_validate_all + ;; + *) usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +exit $? diff --git a/heartbeat/MailTo b/heartbeat/MailTo new file mode 100755 index 0000000..56940ba --- /dev/null +++ b/heartbeat/MailTo @@ -0,0 +1,199 @@ +#!/bin/sh +# +# Resource script for MailTo +# +# Author: Alan Robertson <alanr@unix.sh> +# +# Description: sends email to a sysadmin whenever a takeover occurs. +# +# Note: This command requires an argument, unlike normal init scripts. 
+# +# This can be given in the haresources file as: +# +# You can also give a mail subject line or even multiple addresses +# MailTo::alanr@unix.sh::BigImportantWebServer +# MailTo::alanr@unix.sh,spoppi@gmx.de::BigImportantWebServer +# +# This will then be put into the message subject and body. +# +# OCF parameters are as below: +# OCF_RESKEY_email +# OCF_RESKEY_subject +# +# License: GNU General Public License (GPL) +# +# Copyright: (C) 2005 International Business Machines + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_email_default="" +OCF_RESKEY_subject_default="Resource Group" + +: ${OCF_RESKEY_email=${OCF_RESKEY_email_default}} +: ${OCF_RESKEY_subject=${OCF_RESKEY_subject_default}} + +####################################################################### + +ARGS="$0 $*" + +us=`uname -n` + +usage() { + echo "Usage: $0 {start|stop|status|monitor|meta-data|validate-all}" +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="MailTo" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This is a resource agent for MailTo. It sends email to a sysadmin whenever +a takeover occurs. +</longdesc> +<shortdesc lang="en">Notifies recipients by email in the event of resource takeover</shortdesc> + +<parameters> +<parameter name="email" unique="0" required="1"> +<longdesc lang="en"> +The email address of sysadmin. +</longdesc> +<shortdesc lang="en">Email address</shortdesc> +<content type="string" default="${OCF_RESKEY_email_default}" /> +</parameter> + +<parameter name="subject" unique="0"> +<longdesc lang="en"> +The subject of the email. 
+</longdesc> +<shortdesc lang="en">Subject</shortdesc> +<content type="string" default="${OCF_RESKEY_subject_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="10s" /> +<action name="stop" timeout="10s" /> +<action name="status" depth="0" timeout="10s" interval="10s" /> +<action name="monitor" depth="0" timeout="10s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +END +} + +MailProgram() { + $MAILCMD -s "$1" "$email" <<EOF + $Subject + + Command line was: + $ARGS +EOF + return $? +} + +SubjectLine() { + case $1 in + ??*) echo "$@";; + *) echo "${OCF_RESKEY_subject_default}";; + esac +} + + +MailToStart() { + + Subject="`SubjectLine $subject` Takeover in progress at `date` on $us" + + MailProgram "$Subject" $1 + ha_pseudo_resource MailTo_${OCF_RESOURCE_INSTANCE} start +} + +MailToStop () { + Subject="`SubjectLine $subject` Migrating resource away at `date` from $us" + + MailProgram "$Subject" $1 + ha_pseudo_resource MailTo_${OCF_RESOURCE_INSTANCE} stop +} + +MailToStatus () { +# ocf_log warn "Don't stat/monitor me! MailTo is a pseudo resource agent, so the status reported may be incorrect" + + if ha_pseudo_resource MailTo_${OCF_RESOURCE_INSTANCE} monitor + then + echo "running" + return $OCF_SUCCESS + else + echo "stopped" + return $OCF_NOT_RUNNING + fi +} + +MailToValidateAll () { + if [ -z "$MAILCMD" ]; then + ocf_exit_reason "MAILCMD not set: complain to the packager" + exit $OCF_ERR_INSTALLED + fi + check_binary "$MAILCMD" + + return $OCF_SUCCESS +} + +# +# See how we were called. +# +# The order in which heartbeat provides arguments to resource +# scripts is broken. It should be fixed. +# + +if + ( [ $# -ne 1 ] ) +then + usage + exit $OCF_ERR_GENERIC +fi + +case $1 in + meta-data) meta_data + exit $OCF_SUCCESS + ;; + status|monitor) MailToStatus + exit $? 
+ ;; + usage) usage + exit $OCF_SUCCESS + ;; + *) ;; +esac + +if + [ -z "$OCF_RESKEY_email" ] +then + ocf_exit_reason "At least 1 Email address has to be given!" + exit $OCF_ERR_CONFIGURED +fi + +email=$OCF_RESKEY_email +subject=$OCF_RESKEY_subject + +MailToValidateAll + +case $1 in + start) MailToStart + ;; + stop) MailToStop + ;; + validate-all) ;; + *) usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +exit $? diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am new file mode 100644 index 0000000..6bd18f2 --- /dev/null +++ b/heartbeat/Makefile.am @@ -0,0 +1,249 @@ +# Makefile.am for OCF RAs +# +# Author: Sun Jing Dong +# Copyright (C) 2004 IBM +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+# +MAINTAINERCLEANFILES = Makefile.in + +EXTRA_DIST = $(ocf_SCRIPTS) $(ocfcommon_DATA) \ + $(common_DATA) $(hb_DATA) $(dtd_DATA) \ + README README.galera + +AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/linux-ha + +halibdir = $(libexecdir)/heartbeat + +ocfdir = $(OCF_RA_DIR_PREFIX)/heartbeat + +dtddir = $(datadir)/$(PACKAGE_NAME) +dtd_DATA = ra-api-1.dtd metadata.rng + +ocf_PROGRAMS = + +if USE_IPV6ADDR_AGENT +ocf_PROGRAMS += IPv6addr +endif + +halib_PROGRAMS = + +if IPV6ADDR_COMPATIBLE +halib_PROGRAMS += send_ua +endif + +IPv6addr_SOURCES = IPv6addr.c IPv6addr_utils.c +IPv6addr_LDADD = -lplumb $(LIBNETLIBS) + +send_ua_SOURCES = send_ua.c IPv6addr_utils.c +send_ua_LDADD = $(LIBNETLIBS) + +ocf_SCRIPTS = AoEtarget \ + AudibleAlarm \ + ClusterMon \ + CTDB \ + Delay \ + Dummy \ + EvmsSCC \ + Evmsd \ + Filesystem \ + ICP \ + IPaddr \ + IPaddr2 \ + IPsrcaddr \ + LVM \ + LinuxSCSI \ + lvmlockd \ + LVM-activate \ + MailTo \ + ManageRAID \ + ManageVE \ + NodeUtilization \ + Pure-FTPd \ + Raid1 \ + Route \ + SAPDatabase \ + SAPInstance \ + SendArp \ + ServeRAID \ + SphinxSearchDaemon \ + Squid \ + Stateful \ + SysInfo \ + VIPArip \ + VirtualDomain \ + WAS \ + WAS6 \ + WinPopup \ + Xen \ + Xinetd \ + ZFS \ + aliyun-vpc-move-ip \ + anything \ + apache \ + asterisk \ + aws-vpc-move-ip \ + aws-vpc-route53 \ + awseip \ + awsvip \ + azure-lb \ + clvm \ + conntrackd \ + corosync-qnetd \ + crypt \ + db2 \ + dhcpd \ + dnsupdate \ + dummypy \ + docker \ + docker-compose \ + dovecot \ + eDir88 \ + ethmonitor \ + exportfs \ + fio \ + galera \ + garbd \ + gcp-ilb \ + gcp-vpc-move-ip \ + iSCSILogicalUnit \ + iSCSITarget \ + ids \ + iface-bridge \ + iface-macvlan \ + iface-vlan \ + ipsec \ + iscsi \ + jboss \ + jira \ + kamailio \ + lxc \ + lxd-info \ + machine-info \ + mariadb \ + mdraid \ + minio \ + mysql \ + mysql-proxy \ + nagios \ + named \ + nfsnotify \ + nfsserver \ + nginx \ + nvmet-subsystem \ + nvmet-namespace \ + nvmet-port \ + ocivip \ + openstack-cinder-volume \ + 
openstack-floating-ip \ + openstack-info \ + openstack-virtual-ip \ + oraasm \ + oracle \ + oralsnr \ + ovsmonitor \ + pgagent \ + pgsql \ + pingd \ + podman \ + portblock \ + postfix \ + pound \ + proftpd \ + rabbitmq-cluster \ + rabbitmq-server-ha \ + redis \ + rkt \ + rsyncd \ + rsyslog \ + scsi2reservation \ + sfex \ + sg_persist \ + mpathpersist \ + slapd \ + smb-share \ + storage-mon \ + sybaseASE \ + symlink \ + syslog-ng \ + tomcat \ + varnish \ + vdo-vol \ + vmware \ + vsftpd \ + zabbixserver + +if BUILD_AZURE_EVENTS +ocf_SCRIPTS += azure-events +endif + +if BUILD_AZURE_EVENTS_AZ +ocf_SCRIPTS += azure-events-az +endif + +if BUILD_GCP_PD_MOVE +ocf_SCRIPTS += gcp-pd-move +endif + +if BUILD_GCP_VPC_MOVE_ROUTE +ocf_SCRIPTS += gcp-vpc-move-route +endif + +if BUILD_GCP_VPC_MOVE_VIP +ocf_SCRIPTS += gcp-vpc-move-vip +endif + +ocfcommondir = $(OCF_LIB_DIR_PREFIX)/heartbeat +ocfcommon_DATA = ocf-shellfuncs \ + ocf-binaries \ + ocf-directories \ + ocf-returncodes \ + ocf-rarun \ + ocf-distro \ + apache-conf.sh \ + http-mon.sh \ + sapdb-nosha.sh \ + sapdb.sh \ + lvm-clvm.sh \ + lvm-plain.sh \ + lvm-tag.sh \ + openstack-common.sh \ + ora-common.sh \ + mysql-common.sh \ + nfsserver-redhat.sh \ + findif.sh \ + ocf.py + +# Legacy locations +hbdir = $(sysconfdir)/ha.d +hb_DATA = shellfuncs + +check: $(ocf_SCRIPTS:=.check) + +%.check: % + OCF_ROOT=$(abs_srcdir) OCF_FUNCTIONS_DIR=$(abs_srcdir) ./$< meta-data | xmllint --path $(abs_srcdir) --noout --relaxng $(abs_srcdir)/metadata.rng - + +do_spellcheck = printf '[%s]\n' "$(agent)"; \ + OCF_ROOT=$(abs_srcdir) OCF_FUNCTIONS_DIR=$(abs_srcdir) \ + ./$(agent) meta-data 2>/dev/null \ + | xsltproc $(top_srcdir)/make/extract_text.xsl - \ + | aspell pipe list -d en_US --ignore-case \ + --home-dir=$(top_srcdir)/make -p spellcheck-ignore \ + | sed -n 's|^&\([^:]*\):.*|\1|p'; +spellcheck: + @$(foreach agent,$(ocf_SCRIPTS), $(do_spellcheck)) + +clean-local: + rm -rf __pycache__ *.pyc diff --git a/heartbeat/ManageRAID.in 
b/heartbeat/ManageRAID.in new file mode 100644 index 0000000..bf5c745 --- /dev/null +++ b/heartbeat/ManageRAID.in @@ -0,0 +1,391 @@ +#!@BASH_SHELL@ +# +# Name ManageRAID +# Author Matthias Dahl, m.dahl@designassembly.de +# License GPL version 2 +# +# (c) 2006 The Design Assembly GmbH. +# +# +# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +# +# This resource agent is most likely function complete but not error free. Please +# consider it BETA quality for the moment until it has proven itself stable... +# +# USE AT YOUR OWN RISK. +# +# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING +# +# +# partly based on/inspired by original Heartbeat2 OCF resource agents +# +# Description +# +# Manages starting, mounting, unmounting, stopping and monitoring of RAID devices +# which are preconfigured in /etc/conf.d/HB-ManageRAID. +# +# +# Created 11. Sep 2006 +# Updated 18. Sep 2006 +# +# rev. 1.00.2 +# +# Changelog +# +# 18/Sep/06 1.00.1 more cleanup +# 12/Sep/06 1.00.1 add more functionality +# add sanity check for config parameters +# general cleanup all over the place +# 11/Sep/06 1.00.0 it's alive... muahaha... ALIVE... :-) +# +# +# TODO +# +# - check if at least one disk out of PREFIX_LOCALDISKS is still active +# in RAID otherwise consider RAID broken and stop it. +# +# The reason behind this: consider a RAID-1 which contains iSCSI devices +# shared over Ethernet which get dynamically added/removed to/from the RAID. +# Once all local disks have failed and only those iSCSI disks remain, the RAID +# should really stop to prevent bad performance and possible data loss. +# + +### +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_raidname_default="" + +: ${OCF_RESKEY_raidname=${OCF_RESKEY_raidname_default}} + +### + +# required utilities + +# required files/devices +RAID_MDSTAT=/proc/mdstat + +# +# check_file() +# +check_file () +{ + if [[ ! -e $1 ]]; then + ocf_log err "setup problem: file $1 does not exist." + exit $OCF_ERR_GENERIC + fi +} + +# +# usage() +# +usage() +{ + cat <<-EOT + usage: $0 {start|stop|status|monitor|validate-all|usage|meta-data} + EOT +} + +# +# meta_data() +# +meta_data() +{ + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="ManageRAID" version="1.00.2"> + <version>1.0</version> + + <longdesc lang="en"> + Manages starting, stopping and monitoring of RAID devices which + are preconfigured in /etc/conf.d/HB-ManageRAID. + </longdesc> + + <shortdesc lang="en">Manages RAID devices</shortdesc> + + <parameters> + <parameter name="raidname" unique="0" required="1"> + <longdesc lang="en"> + Name (case sensitive) of RAID to manage. (preconfigured in /etc/conf.d/HB-ManageRAID) + </longdesc> + <shortdesc lang="en">RAID name</shortdesc> + <content type="string" default="${OCF_RESKEY_raidname_default}" /> + </parameter> + </parameters> + + <actions> + <action name="start" timeout="75s" /> + <action name="stop" timeout="75s" /> + <action name="status" depth="0" timeout="10s" interval="10s" /> + <action name="monitor" depth="0" timeout="10s" interval="10s" /> + <action name="validate-all" timeout="5s" /> + <action name="meta-data" timeout="5s" /> + </actions> +</resource-agent> +END +} + +# +# start_raid() +# +start_raid() +{ + declare -i retcode + + status_raid + retcode=$? + if [[ $retcode == $OCF_SUCCESS ]]; then + return $OCF_SUCCESS + elif [[ $retcode != $OCF_NOT_RUNNING ]]; then + return $retcode + fi + + for ldev in "${RAID_LOCALDISKS[@]}"; do + if [[ ! -b $ldev ]]; then + ocf_log err "$ldev is not a (local) block device." 
+ return $OCF_ERR_ARGS + fi + done + + $MDADM -A $RAID_DEVPATH -a yes -u ${!RAID_UUID} "${RAID_LOCALDISKS[@]}" &> /dev/null + if [[ $? != 0 ]]; then + ocf_log err "starting ${!RAID_DEV} with ${RAID_LOCALDISKS[*]} failed." + return $OCF_ERR_GENERIC + fi + + $MOUNT -o ${!RAID_MOUNTOPTIONS} $RAID_DEVPATH ${!RAID_MOUNTPOINT} &> /dev/null + if [[ $? != 0 ]]; then + $MDADM -S $RAID_DEVPATH &> /dev/null + + if [[ $? != 0 ]]; then + ocf_log err "mounting ${!RAID_DEV} to ${!RAID_MOUNTPOINT} failed as well as stopping the RAID itself." + else + ocf_log err "mounting ${!RAID_DEV} to ${!RAID_MOUNTPOINT} failed. RAID stopped again." + fi + + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +# +# stop_raid() +# +stop_raid() +{ + status_raid + if [[ $? == $OCF_NOT_RUNNING ]]; then + return $OCF_SUCCESS + fi + + $UMOUNT ${!RAID_MOUNTPOINT} &> /dev/null + if [[ $? != 0 ]]; then + ocf_log err "unmounting ${!RAID_MOUNTPOINT} failed. not stopping ${!RAID_DEV}!" + return $OCF_ERR_GENERIC + fi + + $MDADM -S $RAID_DEVPATH &> /dev/null + if [[ $? != 0 ]]; then + ocf_log err "stopping RAID ${!RAID_DEV} failed." + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +# +# status_raid() +# +status_raid() +{ + declare -i retcode_raidcheck + declare -i retcode_uuidcheck + + $CAT $RAID_MDSTAT | $GREP -e "${!RAID_DEV}[\ ]*:[\ ]*active" &> /dev/null + if [ $? -ne 0 ]; then + return $OCF_NOT_RUNNING + fi + + if [ ! -e $RAID_DEVPATH ]; then + return $OCF_ERR_GENERIC + fi + + $MDADM --detail -t $RAID_DEVPATH &> /dev/null + retcode_raidcheck=$? + $MDADM --detail -t $RAID_DEVPATH | $GREP -qEe "^[\ ]*UUID[\ ]*:[\ ]*${!RAID_UUID}" &> /dev/null + retcode_uuidcheck=$? + + if [ $retcode_raidcheck -gt 3 ]; then + ocf_log err "mdadm returned error code $retcode_raidcheck while checking ${!RAID_DEV}." + return $OCF_ERR_GENERIC + elif [ $retcode_raidcheck -eq 3 ]; then + ocf_log err "${!RAID_DEV} has failed." 
+ return $OCF_ERR_GENERIC + elif [ $retcode_raidcheck -lt 3 ] && [ $retcode_uuidcheck != 0 ]; then + ocf_log err "active RAID ${!RAID_DEV} and configured UUID (${!RAID_UUID}) do not match." + return $OCF_ERR_GENERIC + fi + + $MOUNT | $GREP -e "$RAID_DEVPATH on ${!RAID_MOUNTPOINT}" &> /dev/null + if [[ $? != 0 ]]; then + ocf_log err "${!RAID_DEV} seems to be no longer mounted at ${!RAID_MOUNTPOINT}" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +# +# validate_all_raid() +# +# Returns OCF_SUCCESS when status_raid reports either running or +# cleanly stopped; any other status_raid result is passed through. +# +validate_all_raid() +{ + # + # since all parameters are checked every time ManageRAID is + # invoked, there is not much more to check... + # + # status_raid should cover the rest. + # + declare -i retcode + + # status_raid, not status_ve: the latter belongs to the ManageVE agent + # and is not defined in this script. + status_raid + retcode=$? + + if [[ $retcode != $OCF_SUCCESS && $retcode != $OCF_NOT_RUNNING ]]; then + return $retcode + fi + + return $OCF_SUCCESS +} + +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi + +case "$1" in + meta-data) + meta_data + exit $OCF_SUCCESS + ;; + usage) + usage + exit $OCF_SUCCESS + ;; + *) + ;; +esac + +## required configuration +# +[ -f /etc/conf.d/HB-ManageRAID ] || { + ocf_log err "/etc/conf.d/HB-ManageRAID missing" + exit $OCF_ERR_INSTALLED +} +. /etc/conf.d/HB-ManageRAID +# +## + +# +# check relevant environment variables for sanity and security +# + +declare -i retcode_test +declare -i retcode_grep + +$TEST -z "$OCF_RESKEY_raidname" +retcode_test=$? +echo "$OCF_RESKEY_raidname" | $GREP -qEe "^[[:alnum:]\_]+$" +retcode_grep=$? + +if [[ $retcode_test != 1 || $retcode_grep != 0 ]]; then + ocf_log err "OCF_RESKEY_raidname not set or invalid." + exit $OCF_ERR_ARGS +fi + +# RAID_UUID holds the NAME of the config variable; ${!RAID_UUID} is its value +RAID_UUID=${OCF_RESKEY_raidname}_UUID + +echo ${!RAID_UUID} | $GREP -qEe "^[[:alnum:]]{8}:[[:alnum:]]{8}:[[:alnum:]]{8}:[[:alnum:]]{8}$" +if [[ $? != 0 ]]; then + ocf_log err "${OCF_RESKEY_raidname}_UUID is invalid." + exit $OCF_ERR_ARGS +fi + +RAID_DEV=${OCF_RESKEY_raidname}_DEV + +echo ${!RAID_DEV} | $GREP -qEe "^md[0-9]+$" +if [[ $?
!= 0 ]]; then + ocf_log err "${OCF_RESKEY_raidname}_DEV is invalid." + exit $OCF_ERR_ARGS +fi + +RAID_DEVPATH=/dev/${!RAID_DEV/md/md\/} +RAID_MOUNTPOINT=${OCF_RESKEY_raidname}_MOUNTPOINT + +echo ${!RAID_MOUNTPOINT} | $GREP -qEe "^[[:alnum:]\/\_\"\ ]+$" +if [[ $? != 0 ]]; then + ocf_log err "${OCF_RESKEY_raidname}_MOUNTPOINT is invalid." + exit $OCF_ERR_ARGS +fi + +RAID_MOUNTOPTIONS=${OCF_RESKEY_raidname}_MOUNTOPTIONS + +echo ${!RAID_MOUNTOPTIONS} | $GREP -qEe "^[[:alpha:]\,]+$" +if [[ $? != 0 ]]; then + ocf_log err "${OCF_RESKEY_raidname}_MOUNTOPTIONS is invalid." + exit $OCF_ERR_ARGS +fi + +RAID_LOCALDISKS=${OCF_RESKEY_raidname}_LOCALDISKS[@] +RAID_LOCALDISKS=( "${!RAID_LOCALDISKS}" ) + +if [ ${#RAID_LOCALDISKS[@]} -lt 1 ]; then + ocf_log err "you have to specify at least one local disk." + exit $OCF_ERR_ARGS +fi + +# +# check that all relevant utilities are available +# +check_binary $MDADM +check_binary $MOUNT +check_binary $UMOUNT +check_binary $GREP +check_binary $CAT +check_binary $TEST +check_binary echo + + +# +# check that all relevant devices are available +# +check_file $RAID_MDSTAT + +# +# finally... let's see what we are ordered to do :-) +# +case "$1" in + start) + start_raid + ;; + stop) + stop_raid + ;; + status|monitor) + status_raid + ;; + validate-all) + validate_all_raid + ;; + *) + usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + +exit $? + diff --git a/heartbeat/ManageVE.in b/heartbeat/ManageVE.in new file mode 100644 index 0000000..f07ca5b --- /dev/null +++ b/heartbeat/ManageVE.in @@ -0,0 +1,320 @@ +#!@BASH_SHELL@ +# +# ManageVE OCF RA. Manages OpenVZ Virtual Environments (VEs) +# +# (c) 2006-2010 Matthias Dahl, Florian Haas, +# and Linux-HA contributors +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. 
+# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# +# This OCF compliant resource agent manages OpenVZ VEs and thus requires +# a proper OpenVZ installation including a recent vzctl util. +# +# rev. 1.00.4 +# +# Changelog +# +# 21/Oct/10 1.00.4 implement migrate_from/migrate_to +# 12/Sep/06 1.00.3 more cleanup +# 12/Sep/06 1.00.2 fixed some logic in start_ve +# general cleanup all over the place +# 11/Sep/06 1.00.1 fixed some typos +# 07/Sep/06 1.00.0 it's alive... muahaha... ALIVE... :-) +# + +### +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_veid_default=""

: ${OCF_RESKEY_veid=${OCF_RESKEY_veid_default}}

###

# required utilities
VZCTL=/usr/sbin/vzctl

#
# usage()
#
# Prints the list of supported actions on stdout.
#
usage()
{
    cat <<EOF
usage: $0 {start|stop|status|monitor|migrate_from|migrate_to|validate-all|usage|meta-data}
EOF
}

#
# meta_data()
#
# Prints the OCF resource agent meta-data (XML) on stdout.
#
meta_data()
{
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ManageVE" version="1.00.4">
  <version>1.0</version>

  <longdesc lang="en">
    This OCF compliant resource agent manages OpenVZ VEs and thus requires
    a proper OpenVZ installation including a recent vzctl util.
  </longdesc>

  <shortdesc lang="en">Manages an OpenVZ Virtual Environment (VE)</shortdesc>

  <parameters>
    <parameter name="veid" unique="0" required="1">
      <longdesc lang="en">
        OpenVZ ID of virtual environment (see output of vzlist -a for all assigned IDs)
      </longdesc>
      <shortdesc lang="en">OpenVZ ID of VE</shortdesc>
      <content type="integer" default="${OCF_RESKEY_veid_default}" />
    </parameter>
  </parameters>

  <actions>
    <action name="start" timeout="75s" />
    <action name="stop" timeout="75s" />
    <action name="status" depth="0" timeout="10s" interval="10s" />
    <action name="monitor" depth="0" timeout="10s" interval="10s" />
    <action name="migrate_to" timeout="75s" />
    <action name="migrate_from" timeout="75s" />
    <action name="validate-all" timeout="5s" />
    <action name="meta-data" timeout="5s" />
  </actions>
</resource-agent>
END
}

#
# start_ve()
#
# Starts a VE, or simply logs a message if the VE is already running.
# Exits with OCF_ERR_GENERIC if "vzctl start" fails.
#
start_ve()
{
    if status_ve; then
        ocf_log info "VE $VEID already running."
        return $OCF_SUCCESS
    fi

    ocf_run $VZCTL start $VEID || exit $OCF_ERR_GENERIC

    return $OCF_SUCCESS
}

#
# stop_ve()
#
# Stops a VE, or simply logs a message if the VE is already stopped.
#
# ATTENTION: The following code relies on vzctl's exit codes, especially:
#
#   0 : success
#
# In case any of those exit codes change, this function will need fixing.
#
stop_ve()
{
    status_ve
    if [ $? -eq $OCF_NOT_RUNNING ]; then
        ocf_log info "VE $VEID already stopped."
        return $OCF_SUCCESS
    fi

    ocf_run $VZCTL stop $VEID || exit $OCF_ERR_GENERIC

    return $OCF_SUCCESS
}

#
# migrate_to_ve()
#
# In the process of a resource migration, checkpoints the VE. For this
# to work, vzctl must obviously create the dump file in a place which
# the migration target has access to (an NFS mount, a DRBD device,
# etc.).
#
migrate_to_ve()
{
    if ! status_ve; then
        ocf_log err "VE $VEID is not running, aborting"
        exit $OCF_ERR_GENERIC
    fi
    ocf_run $VZCTL chkpnt $VEID || exit $OCF_ERR_GENERIC
    return $OCF_SUCCESS
}

#
# migrate_from_ve()
#
# In the process of a resource migration, restores the VE. For this to
# work, vzctl must obviously have access to the dump file which was
# created on the migration source (on an NFS mount, a DRBD device,
# etc.).
#
migrate_from_ve()
{
    ocf_run $VZCTL restore $VEID || exit $OCF_ERR_GENERIC
    return $OCF_SUCCESS
}

#
# status_ve()
#
# Returns OCF_SUCCESS if the VE is running and OCF_NOT_RUNNING if it is
# down or does not exist. Exits (does not return) with OCF_ERR_GENERIC
# if vzctl fails or its output cannot be interpreted.
#
# ATTENTION: The following code relies on vzctl's status output. The third
# column is interpreted as the existence flag ("exist") and the fifth
# column as the VE status (either "running" or "down").
#
# In case the output format should change, this function will need fixing.
#
status_ve()
{
    local vzstatus veexists vestatus
    local -i retcode

    # Run vzctl exactly once and take the exit code from vzctl itself;
    # reading $? after a "vzctl | awk" pipeline would only ever report
    # awk's exit status and hide vzctl failures.
    vzstatus=$($VZCTL status $VEID 2>/dev/null)
    retcode=$?

    if [[ $retcode != 0 ]]; then
        # log error only if expected to find running
        if [ "$__OCF_ACTION" = "monitor" ] && ! ocf_is_probe; then
            ocf_log err "vzctl status $VEID returned: $retcode"
        fi
        exit $OCF_ERR_GENERIC
    fi

    veexists=$(echo "$vzstatus" | $AWK '{print $3}')
    vestatus=$(echo "$vzstatus" | $AWK '{print $5}')

    if [[ $veexists != "exist" ]]; then
        ocf_log err "vzctl status $VEID returned: $VEID does not exist."
        return $OCF_NOT_RUNNING
    fi

    case "$vestatus" in
        running)
            return $OCF_SUCCESS
            ;;
        down)
            return $OCF_NOT_RUNNING
            ;;
        *)
            ocf_log err "vzctl status $VEID, wrong output format. (5th column: $vestatus)"
            exit $OCF_ERR_GENERIC
            ;;
    esac
}

#
# validate_all_ve()
#
# Checks that VEID can be queried through status_ve. Both "running" and
# "not running" count as a valid configuration; any other status_ve
# result is passed on to the caller.
#
validate_all_ve()
{
    local -i retcode

    # VEID should be a valid VE. status_ve may call "exit", so run it in
    # an explicit subshell to be able to inspect its exit code here.
    ( status_ve )
    retcode=$?

    if [[ $retcode != $OCF_SUCCESS && $retcode != $OCF_NOT_RUNNING ]]; then
        return $retcode
    fi

    return $OCF_SUCCESS
}


if [[ $# != 1 ]]; then
    usage
    exit $OCF_ERR_ARGS
fi

case "$1" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    usage)
        usage
        exit $OCF_SUCCESS
        ;;
    *)
        ;;
esac

#
# check relevant environment variables for sanity and security
#

# The VE ID must be a non-empty string of decimal digits (an empty value
# fails the match as well, because "+" requires at least one digit).
if [[ ! "$OCF_RESKEY_veid" =~ ^[[:digit:]]+$ ]]; then
    ocf_log err "OCF_RESKEY_veid not set or not a number."
    exit $OCF_ERR_ARGS
fi

# Force base 10 so that an ID written with leading zeros (e.g. "0101")
# is not parsed as an octal number by the integer assignment.
declare -i VEID=$((10#$OCF_RESKEY_veid))

#
# check that all relevant utilities are available
#
check_binary $VZCTL
check_binary $AWK

#
# finally... let's see what we are ordered to do :-)
#
case "$1" in
    start)
        start_ve
        ;;
    stop)
        stop_ve
        ;;
    status|monitor)
        status_ve
        ;;
    migrate_to)
        migrate_to_ve
        ;;
    migrate_from)
        migrate_from_ve
        ;;
    validate-all)
        validate_all_ve
        ;;
    *)
        usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac

exit $?
+ diff --git a/heartbeat/NodeUtilization b/heartbeat/NodeUtilization new file mode 100755 index 0000000..f98ab13 --- /dev/null +++ b/heartbeat/NodeUtilization @@ -0,0 +1,237 @@ +#!/bin/sh +# +# +# NodeUtilization OCF Resource Agent +# +# Copyright (c) 2011 SUSE LINUX, John Shi +# Copyright (c) 2016 SUSE LINUX, Kristoffer Gronlund +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_pidfile_default="$HA_VARRUN/NodeUtilization-${OCF_RESOURCE_INSTANCE}"
OCF_RESKEY_dynamic_default="true"
OCF_RESKEY_utilization_cpu_default="true"
OCF_RESKEY_utilization_cpu_reservation_default="1"
OCF_RESKEY_utilization_host_memory_default="true"
OCF_RESKEY_utilization_host_memory_reservation_default="512"
OCF_RESKEY_utilization_hv_memory_default="true"
OCF_RESKEY_utilization_hv_memory_reservation_default="512"

: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}}
: ${OCF_RESKEY_dynamic=${OCF_RESKEY_dynamic_default}}
: ${OCF_RESKEY_utilization_cpu=${OCF_RESKEY_utilization_cpu_default}}
: ${OCF_RESKEY_utilization_cpu_reservation=${OCF_RESKEY_utilization_cpu_reservation_default}}
: ${OCF_RESKEY_utilization_host_memory=${OCF_RESKEY_utilization_host_memory_default}}
: ${OCF_RESKEY_utilization_host_memory_reservation=${OCF_RESKEY_utilization_host_memory_reservation_default}}
: ${OCF_RESKEY_utilization_hv_memory=${OCF_RESKEY_utilization_hv_memory_default}}
: ${OCF_RESKEY_utilization_hv_memory_reservation=${OCF_RESKEY_utilization_hv_memory_reservation_default}}

#######################################################################

# Print the OCF resource agent meta-data (XML) on stdout.
NodeUtilization_meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="NodeUtilization" version="1.0">
<version>1.0</version>

<longdesc lang="en">
The Node Utilization agent detects system parameters like available CPU, host
memory and hypervisor memory availability, and adds them into the CIB for each
node using crm_attribute. Run the agent as a clone resource to have it populate
these parameters on each node.
Note: Setting hv_memory only works with Xen at the moment, using the xl or xm
command line tools.
</longdesc>
<shortdesc lang="en">Node Utilization</shortdesc>

<parameters>
<parameter name="dynamic" unique="0" required="0">
<longdesc lang="en">
If set, parameters will be updated if there are differences between the HA
parameters and the system values when running the monitor action.
If not set, the parameters will be set once when the resource instance starts.
</longdesc>
<shortdesc lang="en">Dynamically update parameters in monitor</shortdesc>
<content type="boolean" default="${OCF_RESKEY_dynamic_default}" />
</parameter>

<parameter name="utilization_cpu" unique="0" required="0">
<longdesc lang="en">Enable setting node CPU utilization limit.</longdesc>
<shortdesc lang="en">Set node CPU utilization limit.</shortdesc>
<content type="boolean" default="${OCF_RESKEY_utilization_cpu_default}" />
</parameter>

<parameter name="utilization_cpu_reservation" unique="0" required="0">
<longdesc lang="en">Subtract this value when setting the CPU utilization parameter.</longdesc>
<shortdesc lang="en">CPU reservation.</shortdesc>
<content type="integer" default="${OCF_RESKEY_utilization_cpu_reservation_default}" />
</parameter>

<parameter name="utilization_host_memory" unique="0" required="0">
<longdesc lang="en">Enable setting available host memory.</longdesc>
<shortdesc lang="en">Set available host memory.</shortdesc>
<content type="boolean" default="${OCF_RESKEY_utilization_host_memory_default}" />
</parameter>

<parameter name="utilization_host_memory_reservation" unique="0" required="0">
<longdesc lang="en">Subtract this value when setting host memory utilization, in MB.</longdesc>
<shortdesc lang="en">Host memory reservation, in MB.</shortdesc>
<content type="integer" default="${OCF_RESKEY_utilization_host_memory_reservation_default}" />
</parameter>

<parameter name="utilization_hv_memory" unique="0" required="0">
<longdesc lang="en">Enable setting available hypervisor memory.</longdesc>
<shortdesc lang="en">Set available hypervisor memory.</shortdesc>
<content type="boolean" default="${OCF_RESKEY_utilization_hv_memory_default}" />
</parameter>

<parameter name="utilization_hv_memory_reservation" unique="0" required="0">
<longdesc lang="en">Subtract this value when setting hypervisor memory utilization, in MB.</longdesc>
<shortdesc lang="en">Hypervisor memory reservation, in MB.</shortdesc>
<content type="integer" default="${OCF_RESKEY_utilization_hv_memory_reservation_default}" />
</parameter>
</parameters>

<actions>
<action name="start"   timeout="90s" />
<action name="stop"    timeout="100s" />
<action name="monitor" timeout="20s" interval="60s"/>
<action name="meta-data"  timeout="5s" />
<action name="validate-all"  timeout="30s" />
</actions>
</resource-agent>
END
}

# Print the total_memory value reported by "xl info" / "xm info" as an
# integer, or 0 when neither Xen tool is available.
Host_Total_Memory() {
	local xentool

	xentool=$(which xl 2> /dev/null || which xm 2> /dev/null)

	if [ -x "$xentool" ]; then
		"$xentool" info | awk '/total_memory/{printf("%d\n",$3);exit(0)}'
	else
		ocf_log debug "Can only set hv_memory for Xen hypervisor"
		echo "0"
	fi
}

# Make sure node utilization attribute $1 of node $3 holds value $2.
# Negative values are clamped to 0, and crm_attribute is only asked to
# write when the stored value differs. Returns 1 if the update fails.
set_utilization_attr() {
	local attr="$1"
	local wanted="$2"
	local node="$3"
	local current

	# a reservation larger than the detected resource must not yield a
	# negative capacity
	[ "$wanted" -lt 0 ] && wanted=0

	current=$(crm_attribute --quiet -t nodes --node "$node" -z -n "$attr" 2>/dev/null)

	if [ "$wanted" != "$current" ]; then
		if ! crm_attribute -t nodes --node "$node" -z -n "$attr" -v "$wanted"; then
			ocf_log err "Failed to set the $attr utilization attribute for $node using crm_attribute."
			return 1
		fi
	fi
	return 0
}

# Detect CPU count, host memory and hypervisor memory (each one only if
# enabled through its OCF_RESKEY_utilization_* switch), subtract the
# configured reservation and store the result as a utilization attribute
# of the local node. Returns 1 as soon as one attribute cannot be set.
set_utilization() {
	local host_name sys_cpu sys_mem hv_mem

	host_name="$(ocf_local_nodename)"

	if ocf_is_true "$OCF_RESKEY_utilization_cpu"; then
		sys_cpu=$(( $(grep -c processor /proc/cpuinfo) - $OCF_RESKEY_utilization_cpu_reservation ))
		set_utilization_attr cpu "$sys_cpu" "$host_name" || return 1
	fi

	if ocf_is_true "$OCF_RESKEY_utilization_host_memory"; then
		# MemTotal is divided by 1024 here; the reservation is documented in MB
		sys_mem=$(( $(awk '/MemTotal/{printf("%d\n",$2/1024);exit(0)}' /proc/meminfo) - $OCF_RESKEY_utilization_host_memory_reservation ))
		set_utilization_attr host_memory "$sys_mem" "$host_name" || return 1
	fi

	if ocf_is_true "$OCF_RESKEY_utilization_hv_memory"; then
		hv_mem=$(( $(Host_Total_Memory) - OCF_RESKEY_utilization_hv_memory_reservation ))
		set_utilization_attr hv_memory "$hv_mem" "$host_name" || return 1
	fi

	return 0
}

# Print the supported actions on stdout.
NodeUtilization_usage() {
	cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Mark the pseudo resource as started; when "dynamic" is off the
# attributes are written once, here.
NodeUtilization_start() {
	ha_pseudo_resource $statefile start
	if ! ocf_is_true "$OCF_RESKEY_dynamic"; then
		if ! set_utilization; then
			exit $OCF_ERR_GENERIC
		fi
	fi
	exit $OCF_SUCCESS
}

# Mark the pseudo resource as stopped.
NodeUtilization_stop() {
	ha_pseudo_resource $statefile stop
	exit $OCF_SUCCESS
}

# Report the pseudo resource state; while it is running and "dynamic" is
# on, refresh the attributes on every monitor.
NodeUtilization_monitor() {
	local rc
	ha_pseudo_resource $statefile monitor
	rc=$?

	case $rc in
		$OCF_SUCCESS)
			if ocf_is_true "$OCF_RESKEY_dynamic"; then
				if ! set_utilization; then
					exit $OCF_ERR_GENERIC
				fi
			fi
			;;
		*) exit $rc;;
	esac
}

# Nothing to validate.
NodeUtilization_validate() {
	exit $OCF_SUCCESS
}

statefile=$OCF_RESOURCE_TYPE.$(echo $OCF_RESOURCE_INSTANCE | sed -e 's/^.*://')

OCF_REQUIRED_PARAMS=""
OCF_REQUIRED_BINARIES=""
# "$@" hands the arguments over unchanged; $* would re-split them
ocf_rarun "$@"
diff --git a/heartbeat/Pure-FTPd b/heartbeat/Pure-FTPd new file mode 100755 index 0000000..1499ddd --- /dev/null +++ b/heartbeat/Pure-FTPd @@ -0,0 +1,260 @@ +#!/bin/sh +# +# Resource script for Pure-FTPd +# +# Description: Manages Pure-FTPd as an OCF resource in +# an Active-Passive High Availability setup.
#
# Author:	Rajat Upadhyaya <urajat@novell.com> : Pure-FTPd script
# Author:	Raoul Bhatia <r.bhatia@ipax.at> : Minor Cleanup. Added Debian GNU/Linux Support
# License:	GNU General Public License (GPL)
#
#
#	usage: $0 {start|stop|status|monitor|validate-all|meta-data}
#
#	The "start" arg starts Pure-FTPd.
#
#	The "stop" arg stops it.
#
# OCF parameters:
#  OCF_RESKEY_script
#  OCF_RESKEY_conffile
#  OCF_RESKEY_daemon_type
#  OCF_RESKEY_pidfile
#
##########################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_script_default="/sbin/pure-config.pl"
OCF_RESKEY_conffile_default="/etc/pure-ftpd/pure-ftpd.conf"
OCF_RESKEY_daemon_type_default=""
OCF_RESKEY_pidfile_default="${HA_RSCTMP}/pure-ftpd-${OCF_RESOURCE_INSTANCE}.pid"

: ${OCF_RESKEY_script=${OCF_RESKEY_script_default}}
: ${OCF_RESKEY_conffile=${OCF_RESKEY_conffile_default}}
: ${OCF_RESKEY_daemon_type=${OCF_RESKEY_daemon_type_default}}
: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}}

script_basename=`basename "$OCF_RESKEY_script"`

USAGE="Usage: $0 {start|stop|status|monitor|validate-all|meta-data}";

##########################################################################

# Print the usage line on stderr.
usage() {
	echo "$USAGE" >&2
}

# Print the OCF resource agent meta-data (XML) on stdout and exit.
meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="Pure-FTPd" version="1.0">
<version>1.0</version>
<longdesc lang="en">
This script manages Pure-FTPd in an Active-Passive setup
</longdesc>
<shortdesc lang="en">Manages a Pure-FTPd FTP server instance</shortdesc>

<parameters>

<parameter name="script" unique="1" required="0">
<longdesc lang="en">
The full path to the Pure-FTPd startup script.
For example, "/sbin/pure-config.pl"
</longdesc>
<shortdesc lang="en">Script name with full path</shortdesc>
<content type="string" default="${OCF_RESKEY_script_default}" />
</parameter>

<parameter name="conffile" unique="1" required="0">
<longdesc lang="en">
The Pure-FTPd configuration file name with full path.
For example, "/etc/pure-ftpd/pure-ftpd.conf"
</longdesc>
<shortdesc lang="en">Configuration file name with full path</shortdesc>
<content type="string" default="${OCF_RESKEY_conffile_default}" />
</parameter>

<parameter name="daemon_type" unique="1" required="0">
<longdesc lang="en">
The Pure-FTPd daemon to be called by pure-ftpd-wrapper.
Valid options are "" for pure-ftpd, "mysql" for pure-ftpd-mysql,
"postgresql" for pure-ftpd-postgresql and "ldap" for pure-ftpd-ldap
</longdesc>
<shortdesc lang="en">Pure-FTPd daemon type</shortdesc>
<content type="string" default="${OCF_RESKEY_daemon_type_default}" />
</parameter>

<parameter name="pidfile" unique="0">
<longdesc lang="en">PID file</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="${OCF_RESKEY_pidfile_default}" />
</parameter>

</parameters>

<actions>
<action name="start"   timeout="20s" />
<action name="stop"    timeout="20s" />
<action name="monitor" depth="0"  timeout="20s" interval="60s" />
<action name="validate-all"  timeout="20s" />
<action name="meta-data"  timeout="5s" />
</actions>
</resource-agent>
END
	exit $OCF_SUCCESS
}

# Succeed if a signal could be delivered to PID $1 (signal 0 only probes).
isRunning()
{
	kill -s 0 "$1" > /dev/null 2>&1
}

# Succeed if the PID recorded in the pidfile belongs to a live process
# whose "ps" line mentions pure-ftpd; fail otherwise.
PureFTPd_status()
{
	local pid

	if [ -f "$OCF_RESKEY_pidfile" ]
	then
	# Pure-FTPd is probably running
		pid=`head -n 1 "$OCF_RESKEY_pidfile"`
		if [ -n "$pid" ] ; then
			isRunning "$pid" && ps -p "$pid" | grep -q pure-ftpd
			return $?
		fi
	fi

	# Pure-FTPd is not running
	return 1
}

# Start Pure-FTPd through the configured start script. Exits with an OCF
# return code; never returns to the caller.
PureFTPd_start()
{
	local pid_dir
	local rc

	#
	#	make a few checks and start Pure-FTPd
	#
	if ocf_is_root ; then : ; else
		ocf_log err "You must be root."
		exit $OCF_ERR_PERM
	fi

	# if Pure-FTPd is running return success

	if PureFTPd_status ; then
		exit $OCF_SUCCESS
	fi

	# check that the Pure-FTPd script exists and can be executed
	if [ ! -x "$OCF_RESKEY_script" ]; then
		ocf_log err "Pure-FTPd script '$OCF_RESKEY_script' does not exist or cannot be executed"
		exit $OCF_ERR_GENERIC
	fi

	# make sure that the pid directory exists
	pid_dir=`dirname "$OCF_RESKEY_pidfile"`
	if [ ! -d "$pid_dir" ] ; then
		ocf_log info "Creating PID directory '$pid_dir'."
		mkdir -p "$pid_dir"
	fi

	# test for pure-ftpd-wrapper (e.g. Debian GNU/Linux Systems)
	if [ "$script_basename" = "pure-ftpd-wrapper" ]; then
		# pure-ftpd-wrapper expects STANDALONE_OR_INETD to be set to standalone
		# (daemon_type is left unquoted on purpose: an empty value must
		# not be passed as an empty argument)
		STANDALONE_OR_INETD=standalone "$OCF_RESKEY_script" $OCF_RESKEY_daemon_type
		rc=$?
	else
		# check that the Pure-FTPd config file exist
		if [ ! -f "$OCF_RESKEY_conffile" ]; then
			ocf_log err "Pure_FTPd config file '$OCF_RESKEY_conffile' does not exist"
			exit $OCF_ERR_GENERIC
		fi

		"$OCF_RESKEY_script" "$OCF_RESKEY_conffile" -g "$OCF_RESKEY_pidfile"
		rc=$?
	fi

	# rc was saved right after the start command; reading $? after the
	# test below would only report the test's own (zero) status
	if [ $rc -ne 0 ]; then
		ocf_log err "Pure-FTPd returned error $rc"
		exit $OCF_ERR_GENERIC
	fi

	exit $OCF_SUCCESS
}


# Stop Pure-FTPd and wait until it is really gone. Exits with
# OCF_SUCCESS; never returns to the caller.
PureFTPd_stop()
{
	local pid

	if PureFTPd_status ; then
		pid=`head -n 1 "$OCF_RESKEY_pidfile"`
		if [ -n "$pid" ] ; then
			kill "$pid"
			# A stop action may only report success once the
			# resource is actually down. The cluster's stop
			# timeout bounds this loop.
			while PureFTPd_status ; do
				sleep 1
			done
		fi
	fi

	exit $OCF_SUCCESS
}

# Return OCF_SUCCESS if Pure-FTPd is running, OCF_NOT_RUNNING otherwise.
PureFTPd_monitor()
{
	if PureFTPd_status ; then
		return $OCF_SUCCESS
	fi

	return $OCF_NOT_RUNNING
}

# Nothing to validate.
PureFTPd_validate_all()
{
	return $OCF_SUCCESS
}

#
# Main
#

if [ $# -ne 1 ]
then
	usage
	exit $OCF_ERR_ARGS
fi

case $1 in
	start)	PureFTPd_start
	;;

	stop)	PureFTPd_stop
	;;

	status)	if PureFTPd_status
		then
			ocf_log info "Pure-FTPd is running"
			exit $OCF_SUCCESS
		else
			ocf_log info "Pure-FTPd is stopped"
			exit $OCF_NOT_RUNNING
		fi
	;;

	monitor)	PureFTPd_monitor
		exit $?
	;;

	validate-all)	PureFTPd_validate_all
		exit $?
	;;

	meta-data)	meta_data
	;;

	usage)	usage
		exit $OCF_SUCCESS
	;;

	*)	usage
		exit $OCF_ERR_UNIMPLEMENTED
	;;
esac
diff --git a/heartbeat/README b/heartbeat/README new file mode 100644 index 0000000..6042956 --- /dev/null +++ b/heartbeat/README @@ -0,0 +1,46 @@ +The OCF RA shared code directory + +If an RA is too big to be comfortably maintained, split it into +several source files. Obviously, if two or more RAs share some +code, move that code out to a file which can be shared. + +These files will be installed in $OCF_ROOT/lib/heartbeat with +permissions 644. + +Naming practice + +Use names such as <RA>.sh or <RA>-check.sh or anything-else.sh +where "anything-else" should be related to both the RA and the +code it contains. By adding extension (.sh) it is going to be +easier to notice that these files are not complete resource +agents. + +For instance, oracle and oralsnr RA can both use code in +ora-common.sh. + +Of course, if the RA is implemented in another programming +language, use the appropriate extension. + +RA tracing + +RA tracing may be turned on by setting OCF_TRACE_RA. The trace +output will be saved to OCF_TRACE_FILE, if set. If not, +then the trace would be saved to the OCF_RESKEY_trace_dir.
+If it's also not defined, the log will be saved by default to + + $HA_VARLIB/trace_ra/<type>/<id>.<action>.<timestamp> + +e.g. $HA_VARLIB/trace_ra/oracle/db.start.2012-11-27.08:37:08 + +HA_VARLIB is typically set to /var/lib/heartbeat. + +OCF_TRACE_FILE can be set to a path or file descriptor: + +- FD (small integer [3-9]) in that case it is up to the callers + to capture output; the FD _must_ be open for writing + +- absolute path + +NB: FD 9 may be used for tracing with bash >= v4 in case +OCF_TRACE_FILE is set to a path. + diff --git a/heartbeat/README.galera b/heartbeat/README.galera new file mode 100644 index 0000000..dd45618 --- /dev/null +++ b/heartbeat/README.galera @@ -0,0 +1,148 @@ +Notes regarding the Galera resource agent +--- + +In the resource agent, the action of bootstrapping a Galera cluster is +implemented into a series of small steps, by using: + + * Two CIB attributes `last-committed` and `bootstrap` to elect a + bootstrap node that will restart the cluster. + + * One CIB attribute `sync-needed` that will identify that joining + nodes are in the process of synchronizing their local database + via SST. + + * A Master/Slave pacemaker resource which helps splitting the boot + into steps, up to a point where a galera node is available. + + * the recurring monitor action to coordinate switch from one + state to another. + +How boot works +==== + +There are two things to know to understand how the resource agent +restarts a Galera cluster. + +### Bootstrap the cluster with the right node + +When synced, the nodes of a galera cluster have in common a last seqno, +which identifies the last transaction considered successful by a +majority of nodes in the cluster (think quorum). + +To restart a cluster, the resource agent must ensure that it will +bootstrap the cluster from a node which is up-to-date, i.e. which has +the highest seqno of all nodes.
+ +As a result, if the resource agent cannot retrieve the seqno on all +nodes, it won't be able to safely identify a bootstrap node, and +will simply refuse to start the galera cluster. + +### synchronizing nodes can be a long operation + +Starting a bootstrap node is relatively fast, so it's performed +during the "promote" operation, which is a one-off, time-bounded +operation. + +Subsequent nodes will need to synchronize via SST, which consists +of "pushing" an entire Galera DB from one node to another. + +There is no perfect time-out, as time spent during synchronization +depends on the size of the DB. Thus, joiner nodes are started during +the "monitor" operation, which is a recurring operation that can +better track the progress of the SST. + + +State flow +==== + +General idea for starting Galera: + + * Before starting the Galera cluster each node needs to go in Slave + state so that the agent records its last seqno into the CIB. + __ This uses attribute last-committed __ + + * When all nodes have entered Slave state, the agent can safely determine the + last seqno and elect a bootstrap node (`detect_first_master()`). + __ This uses attribute bootstrap __ + + * The agent then sets the score of the elected bootstrap node to + Master so that pacemaker promotes it and starts the first Galera + server. + + * Once the first Master is running, the agent can start joiner + nodes during the "monitor" operation, and start monitoring + their SST sync. + __ This uses attribute sync-needed __ + + * Only when SST is over on joiner nodes, the agent promotes them + to Master. At this point, the entire Galera cluster is up. + + +Attribute usage and liveness +==== + +Here is how attributes are created on a per-node basis. If you +modify the resource agent make sure those properties still hold. + +### last-committed + +It is just a temporary hint for the resource agent to help +elect a bootstrap node.
Once the bootstrap attribute is set on one +of the nodes, we can get rid of last-committed. + + - Used : during Slave state to compare seqno + - Created: before entering Slave state: + . at startup in `galera_start()` + . or when a Galera node is stopped in `galera_demote()` + - Deleted: just before node starts in `galera_start_local_node()`; + cleaned-up during `galera_demote()` and `galera_stop()` + +We delete last-committed before starting Galera, to avoid race +conditions that could arise due to discrepancies between the CIB and +Galera. + +### bootstrap + +Attribute set on the node that is elected to bootstrap Galera. + +- Used : during promotion in `galera_start_local_node()` +- Created: at startup once all nodes have `last-committed`; + or during monitor if all nodes have failed +- Deleted: in `galera_start_local_node()`, just after the bootstrap + node started and is ready; + cleaned-up during `galera_demote()` and `galera_stop()` + +There cannot be more than one bootstrap node at any time, otherwise +the Galera cluster would stop replicating properly. + +### sync-needed + +While this attribute is set on a node, the Galera node is in JOIN +state, i.e. SST is in progress and the node cannot serve queries. + +The resource agent relies on the underlying SST method to monitor +the progress of the SST. For instance, with `wsrep_sst_rsync`, +timeout would be reported by rsync, the Galera node would go in +Non-primary state, which would make `galera_monitor()` fail. + +- Used : during recurring slave monitor in `check_sync_status()` +- Created: in `galera_start_local_node()`, just after the joiner + node started and entered the Galera cluster +- Deleted: during recurring slave monitor in `check_sync_status()` + as soon as the Galera code reports to be SYNC-ed. + +### no-grastate + +If a galera node was unexpectedly killed in a middle of a replication, +InnoDB can retain the equivalent of a XA transaction in prepared state +in its redo log. 
If so, mysqld cannot recover state (nor last seqno) +automatically, and special recovery heuristic has to be used to +unblock the node. + +This transient attribute is used to keep track of forced recoveries to +prevent bootstrapping a cluster from a recovered node when possible. + +- Used : during `detect_first_master()` to elect the bootstrap node +- Created: in `detect_last_commit()` if the node has a pending XA + transaction to recover in the redo log +- Deleted: when a node is promoted to Master. diff --git a/heartbeat/README.mariadb.md b/heartbeat/README.mariadb.md new file mode 100644 index 0000000..da35a03 --- /dev/null +++ b/heartbeat/README.mariadb.md @@ -0,0 +1,156 @@ +Setting up the MariaDB resource agent +===================================== + +This resource agent requires corosync version >= 2 and mariadb version > 10.2 . + +Before embarking on this quest one should read the MariaDB pages on replication +and global transaction IDs, GTID. This will greatly help in understanding what +is going on and why. + +Replication: https://mariadb.com/kb/en/mariadb/setting-up-replication/ +GTID: https://mariadb.com/kb/en/mariadb/gtid/ +semi-sync: https://mariadb.com/kb/en/mariadb/semisynchronous-replication/ + +Some reading on failures under enhanced semi-sync can be found here: +https://jira.mariadb.org/browse/MDEV-162 + +Part 1: MariaDB Setup +--------------------- + +It is best to initialize your MariaDB and do a failover before trying to use +Pacemaker to manage MariaDB. This will both verify the MariaDB configuration +and help you understand what is going on. + +###Configuration Files + +In your MariaDB config file for the server on node 1, place the following +entry (replacing my_database and other names as needed): +``` +[mariadb] +log-bin +server_id=1 +log-basename=master +binlog_do_db=my_database +``` + +Then for each other node create the same entry, but increment the server_id. 
+ +###Replication User + +Now create the replication user (be sure to change the password!): +``` +GRANT ALL PRIVILEGES ON *.* TO 'slave_user'@'%' IDENTIFIED BY 'password'; +GRANT ALL PRIVILEGES ON *.* TO 'slave_user'@'localhost' IDENTIFIED BY 'password'; +``` + +The second entry may not be necessary, but simplifies other steps. Change +user name and password as needed. + + +###Initialize from a database backup + +Initialize all nodes from an existing backup, or create a backup from the +first node if needed: + +On the current database: +``` +mysqldump -u root --master-data --databases my_database1 my_database2 > backup.sql +``` + +At the top of this file is a commented out line: +SET GLOBAL gtid_slave_pos='XXXX...' + +Uncomment this line. + +On all new nodes: +``` +mysql -u root < backup.sql +``` + +###Initialize replication + +Choose a node as master, in this example node1. + +On all slaves, execute: +``` +RESET MASTER; + +CHANGE MASTER TO master_host="node1", master_port=3306, \ + master_user="slave_user", master_password="password", \ + master_use_gtid=current_pos; + +SET GLOBAL rpl_semi_sync_master_enabled='ON', rpl_semi_sync_slave_enabled='ON'; + +START SLAVE; + +SHOW SLAVE STATUS\G +``` + +In an ideal world this will show that replication is now fully working. + +Once replication is working, verify the configuration by doing some updates +and verifying that they are replicated. + +Now try changing the master. On each slave perform: +``` +STOP SLAVE; +``` + +Choose a new master, node2 in our example. On all slave nodes execute: +``` +CHANGE MASTER TO master_host="node2", master_port=3306, \ + master_user="slave_user", master_password="password", \ + master_use_gtid=current_pos; + +START SLAVE; +``` + +And again, check that replication is working and changes are synchronized. + + +Part 2: Pacemaker Setup +----------------------- + +This is pretty straightforward. Example is using pcs.
+ +``` +# Dump the cib +pcs cluster cib mariadb_cfg + +# Create the mariadb_server resource +pcs -f mariadb_cfg resource create mariadb_server mariadb \ + binary="/usr/sbin/mysqld" \ + replication_user="slave_user" \ + replication_passwd="password" \ + node_list="node1 node2 node3" \ + op start timeout=120 interval=0 \ + op stop timeout=120 interval=0 \ + op promote timeout=120 interval=0 \ + op demote timeout=120 interval=0 \ + op monitor role=Master timeout=30 interval=10 \ + op monitor role=Slave timeout=30 interval=20 \ + op notify timeout="60s" interval="0s" + +# Create the master slave resource +pcs -f mariadb_cfg resource master msMariadb mariadb_server \ + master-max=1 master-node-max=1 clone-max=3 clone-node-max=1 notify=true + +# Avoid running this on some nodes, only if needed +pcs -f mariadb_cfg constraint location msMariadb avoids \ + node4=INFINITY node5=INFINITY + +# Push the cib +pcs cluster cib-push mariadb_cfg +``` + +You should now have a running MariaDB cluster: +``` +pcs status + +... + Master/Slave Set: msMariadb [mariadb_server] + Masters: [ node1 ] + Slaves: [ node2 node3 ] +... +``` + diff --git a/heartbeat/Raid1 b/heartbeat/Raid1 new file mode 100755 index 0000000..924d94c --- /dev/null +++ b/heartbeat/Raid1 @@ -0,0 +1,586 @@ +#!/bin/sh +# +# +# License: GNU General Public License (GPL) +# Support: users@clusterlabs.org +# +# Raid1 +# Description: Manages a Linux software RAID device on a shared storage medium. +# Original Author: Eric Z. Ayers (eric.ayers@compgen.com) +# Original Release: 25 Oct 2000 +# RAID patches: http://people.redhat.com/mingo/raid-patches/ +# Word to the Wise: http://lwn.net/2000/0810/a/raid-faq.php3 +# Sympathetic Ear: mailto:linux-raid@vger.kernel.org +# +# usage: $0 {start|stop|status|monitor|validate-all|usage|meta-data} +# +# +# EXAMPLE config file /etc/raidtab.md0 +# This file must exist on both machines! 
+# +# raiddev /dev/md0 +# raid-level 1 +# nr-raid-disks 2 +# chunk-size 64k +# persistent-superblock 1 +# #nr-spare-disks 0 +# device /dev/sda1 +# raid-disk 0 +# device /dev/sdb1 +# raid-disk 1 +# +# EXAMPLE config file /etc/mdadm.conf (for more info:man mdadm.conf) +# +# DEVICE /dev/sdb1 /dev/sdc1 +# ARRAY /dev/md0 UUID=4a865b55:ba27ef8d:29cd5701:6fb42799 +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_raidconf_default="" +OCF_RESKEY_raiddev_default="" +OCF_RESKEY_homehost_default="" +OCF_RESKEY_force_stop_default="true" +OCF_RESKEY_udev_default="true" +OCF_RESKEY_force_clones_default="false" + +: ${OCF_RESKEY_raidconf=${OCF_RESKEY_raidconf_default}} +: ${OCF_RESKEY_raiddev=${OCF_RESKEY_raiddev_default}} +: ${OCF_RESKEY_homehost=${OCF_RESKEY_homehost_default}} +: ${OCF_RESKEY_force_stop=${OCF_RESKEY_force_stop_default}} +: ${OCF_RESKEY_udev=${OCF_RESKEY_udev_default}} +: ${OCF_RESKEY_force_clones=${OCF_RESKEY_force_clones_default}} + +####################################################################### + +usage() { + cat <<-EOT + usage: $0 {start|stop|status|monitor|validate-all|usage|meta-data} + EOT +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="Raid1" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This resource agent manages Linux software RAID (MD) devices on +a shared storage medium. It uses mdadm(8) to start, stop, and +monitor the MD devices. Raidtools are supported, but deprecated. +See https://raid.wiki.kernel.org/index.php/Linux_Raid for more +information. +</longdesc> +<shortdesc lang="en">Manages Linux software RAID (MD) devices on shared storage</shortdesc> + +<parameters> +<parameter name="raidconf" unique="0" required="1"> +<longdesc lang="en"> +The RAID configuration file, e.g. 
/etc/mdadm.conf. +</longdesc> +<shortdesc lang="en">RAID config file</shortdesc> +<content type="string" default="${OCF_RESKEY_raidconf_default}" /> +</parameter> + +<parameter name="raiddev" unique="0" required="1"> +<longdesc lang="en"> +One or more block devices to use, space separated. Alternatively, +set to "auto" to manage all devices specified in raidconf. +</longdesc> +<shortdesc lang="en">block device</shortdesc> +<content type="string" default="${OCF_RESKEY_raiddev_default}" /> +</parameter> + +<parameter name="homehost" unique="0" required="0"> +<longdesc lang="en"> +The value for the homehost directive; this is an mdadm feature to +protect RAIDs against being activated by accident. It is recommended to +create RAIDs managed by the cluster with "homehost" set to a special +value, so they are not accidentally auto-assembled by nodes not +supposed to own them. +</longdesc> +<shortdesc lang="en">Homehost for mdadm</shortdesc> +<content type="string" default="${OCF_RESKEY_homehost_default}" /> +</parameter> + +<parameter name="force_stop" unique="0" required="0"> +<longdesc lang="en"> +If processes or kernel threads are using the array, it cannot be +stopped. We will try to stop processes, first by sending TERM and +then, if that doesn't help in $PROC_CLEANUP_TIME seconds, using KILL. +The lsof(8) program is required to get the list of array users. +Of course, the kernel threads cannot be stopped this way. +If the processes are critical for data integrity, then set this +parameter to false. Note that in that case the stop operation +will fail and the node will be fenced. +</longdesc> +<shortdesc lang="en">force stop processes using the array</shortdesc> +<content type="boolean" default="${OCF_RESKEY_force_stop_default}" /> +</parameter> + +<parameter name="udev" unique="0" required="0"> +<longdesc lang="en"> +Wait until udevd creates a device in the start operation. On a +normally loaded host this should happen quickly, but you may be +unlucky. 
If you are not using udev set this to "no". +</longdesc> +<shortdesc lang="en">udev</shortdesc> +<content type="boolean" default="${OCF_RESKEY_udev_default}" /> +</parameter> + +<parameter name="force_clones"> +<longdesc lang="en"> +Activating the same md RAID array on multiple nodes at the same time +will result in data corruption and thus is forbidden by default. + +A safe example could be an array that is only named identically across +all nodes, but is in fact distinct. + +Only set this to "true" if you know what you are doing! +</longdesc> +<shortdesc lang="en">force ability to run as a clone</shortdesc> +<content type="boolean" default="${OCF_RESKEY_force_clones_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="status" depth="0" timeout="20s" interval="10s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +udev_settle() { + if ocf_is_true $WAIT_FOR_UDEV; then + udevadm settle $* + fi +} +list_conf_arrays() { + test -f $RAIDCONF || { + ocf_exit_reason "$RAIDCONF gone missing!" + exit $OCF_ERR_GENERIC + } + grep ^ARRAY $RAIDCONF | awk '{print $2}' +} +forall() { + local func=$1 + local checkall=$2 + local mddev rc=0 + for mddev in $RAIDDEVS; do + $func $mddev + rc=$(($rc | $?)) + [ "$checkall" = all ] && continue + [ $rc -ne 0 ] && return $rc + done + return $rc +} +are_arrays_stopped() { + local rc mddev + for mddev in $RAIDDEVS; do + raid1_monitor_one $mddev + rc=$? + [ $rc -ne $OCF_NOT_RUNNING ] && break + done + test $rc -eq $OCF_NOT_RUNNING +} + +md_assemble() { + local mddev=$1 + $MDADM --assemble $mddev --config=$RAIDCONF $MDADM_HOMEHOST + udev_settle --exit-if-exists=$mddev +} +# +# START: Start up the RAID device +# +raid1_start() { + local rc + raid1_monitor + rc=$? 
+ if [ $rc -eq $OCF_SUCCESS ]; then + # md already online, nothing to do. + return $OCF_SUCCESS + fi + if [ $rc -ne $OCF_NOT_RUNNING ]; then + # If the array is in a broken state, this agent doesn't + # know how to repair that. + ocf_exit_reason "$RAIDDEVS in a broken state; cannot start (rc=$rc)" + return $OCF_ERR_GENERIC + fi + + if [ $HAVE_RAIDTOOLS = "true" ]; then + # Run raidstart to start up the RAID array + $RAIDSTART --configfile $RAIDCONF $MDDEV + else + forall md_assemble all + fi + + raid1_monitor + if [ $? -eq $OCF_SUCCESS ]; then + return $OCF_SUCCESS + else + ocf_exit_reason "Couldn't start RAID for $RAIDDEVS" + return $OCF_ERR_GENERIC + fi +} + +# +# STOP: stop the RAID device +# +mark_readonly() { + local mddev=$1 + local rc + ocf_log info "Attempting to mark array $mddev readonly" + $MDADM --readonly $mddev --config=$RAIDCONF + rc=$? + if [ $rc -ne 0 ]; then + ocf_exit_reason "Failed to set $mddev readonly (rc=$rc)" + fi + return $rc +} +mknod_raid1_stop() { + # first create a block device file, then try to stop the + # array + local rc n tmp_block_file + n=`echo $1 | sed 's/[^0-9]*//'` + if ! ocf_is_decimal "$n"; then + ocf_log warn "could not get the minor device number from $1" + return 1 + fi + tmp_block_file="$HA_RSCTMP/${OCF_RESOURCE_INSTANCE}-`basename $1`" + rm -f $tmp_block_file + ocf_log info "block device file $1 missing, creating one in order to stop the array" + mknod $tmp_block_file b 9 $n + $MDADM --stop $tmp_block_file --config=$RAIDCONF + rc=$? 
+ rm -f $tmp_block_file + return $rc +} +raid1_stop_one() { + ocf_log info "Stopping array $1" + if [ -b "$1" ]; then + $MDADM --stop $1 --config=$RAIDCONF && + return + else + # newer mdadm releases can stop arrays when given the + # basename; try that first + $MDADM --stop `basename $1` --config=$RAIDCONF && + return + # otherwise create a block device file + mknod_raid1_stop $1 + fi +} +get_users_pids() { + local mddev=$1 + local outp l + ocf_log debug "running lsof to list $mddev users..." + outp=`lsof $mddev | tail -n +2` + echo "$outp" | awk '{print $2}' | sort -u + echo "$outp" | while read l; do + ocf_log warn "$l" + done +} +stop_raid_users() { + local pids + pids=`forall get_users_pids all | sort -u` + if [ -z "$pids" ]; then + ocf_log warn "lsof reported no users holding arrays" + return 2 + else + ocf_stop_processes TERM $PROC_CLEANUP_TIME $pids + fi +} +stop_arrays() { + if [ $HAVE_RAIDTOOLS = "true" ]; then + $RAIDSTOP --configfile $RAIDCONF $MDDEV + else + forall raid1_stop_one all + fi +} +showusers() { + local disk + for disk; do + if have_binary lsof; then + ocf_log info "running lsof to list $disk users..." + ocf_run -warn lsof $disk + fi + if [ -d /sys/block/$disk/holders ]; then + ocf_log info "ls -l /sys/block/$disk/holders" + ocf_run -warn ls -l /sys/block/$disk/holders + fi + done +} +raid1_stop() { + local rc + # See if the MD device is already cleanly stopped: + if are_arrays_stopped; then + return $OCF_SUCCESS + fi + + # Turn off raid + if ! stop_arrays; then + if ocf_is_true $FORCESTOP; then + if have_binary lsof; then + stop_raid_users + case $? in + 2) false;; + *) stop_arrays;; + esac + else + ocf_log warn "install lsof(8) to list users holding the disk" + false + fi + else + false + fi + fi + rc=$? 
+ + if [ $rc -ne 0 ]; then + ocf_log warn "Couldn't stop RAID for $RAIDDEVS (rc=$rc)" + showusers $RAIDDEVS + if [ $HAVE_RAIDTOOLS != "true" ]; then + forall mark_readonly all + fi + return $OCF_ERR_GENERIC + fi + + if are_arrays_stopped; then + return $OCF_SUCCESS + fi + + ocf_exit_reason "RAID $RAIDDEVS still active after stop command!" + return $OCF_ERR_GENERIC +} + +# +# monitor: a less noisy status +# +raid1_monitor_one() { + local mddev=$1 + local md= + local rc + local TRY_READD=0 + local pbsize + # check if the md device exists first + # but not if we are in the stop operation + # device existence is important only for the running arrays + if [ "$__OCF_ACTION" != "stop" ]; then + if [ -h "$mddev" ]; then + md=$(ls $mddev -l | awk -F'/' '{print $NF}') + elif [ -b "$mddev" ]; then + md=$(echo $mddev | sed 's,/dev/,,') + else + ocf_log info "$mddev is not a block device" + return $OCF_NOT_RUNNING + fi + fi + if ! grep -e "^$md[ \t:]" /proc/mdstat >/dev/null ; then + ocf_log info "$md not found in /proc/mdstat" + return $OCF_NOT_RUNNING + fi + if [ $HAVE_RAIDTOOLS != "true" ]; then + $MDADM --detail --test $mddev >/dev/null 2>&1 ; rc=$? + case $rc in + 0) ;; + 1) ocf_log warn "$mddev has at least one failed device." + TRY_READD=1 + ;; + 2) ocf_exit_reason "$mddev has failed." + return $OCF_ERR_GENERIC + ;; + 4) + if [ "$__OCF_ACTION" = "stop" ] ; then + # There may be a transient invalid device after + # we stop MD due to uevent processing, the + # original device is stopped though. + return $OCF_NOT_RUNNING + else + ocf_exit_reason "mdadm failed on $mddev." + return $OCF_ERR_GENERIC + fi + ;; + *) ocf_exit_reason "mdadm returned an unknown result ($rc)." 
+ return $OCF_ERR_GENERIC + ;; + esac + fi + if [ "$__OCF_ACTION" = "monitor" -a "$OCF_RESKEY_CRM_meta_interval" != 0 \ + -a $TRY_READD -eq 1 -a $OCF_CHECK_LEVEL -gt 0 ]; then + ocf_log info "Attempting recovery sequence to re-add devices on $mddev:" + $MDADM $mddev --fail detached + $MDADM $mddev --remove failed + $MDADM $mddev --re-add missing + # TODO: At this stage, there's nothing to actually do + # here. Either this worked or it did not. + fi + + pbsize=`(blockdev --getpbsz $mddev || stat -c "%o" $mddev) 2>/dev/null` + if [ -z "$pbsize" ]; then + ocf_log warn "both blockdev and stat could not get the block size (will use 4k)" + pbsize=4096 # try with 4k + fi + if ! dd if=$mddev count=1 bs=$pbsize of=/dev/null \ + iflag=direct >/dev/null 2>&1 ; then + ocf_exit_reason "$mddev: I/O error on read" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +raid1_monitor() { + forall raid1_monitor_one +} + +# +# STATUS: is the raid device online or offline? +# +raid1_status() { + # See if the MD device is online + local rc + raid1_monitor + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + echo "stopped" + else + echo "running" + fi + return $rc +} + +raid1_validate_all() { + return $OCF_SUCCESS +} + +PROC_CLEANUP_TIME=3 + +if + ( [ $# -ne 1 ] ) +then + usage + exit $OCF_ERR_ARGS +fi + +case "$1" in + meta-data) + meta_data + exit $OCF_SUCCESS + ;; + usage) + usage + exit $OCF_SUCCESS + ;; + *) + ;; +esac + +RAIDCONF="$OCF_RESKEY_raidconf" +MDDEV="$OCF_RESKEY_raiddev" +FORCESTOP="${OCF_RESKEY_force_stop}" +WAIT_FOR_UDEV="${OCF_RESKEY_udev}" + +if [ -z "$RAIDCONF" ] ; then + ocf_exit_reason "Please set OCF_RESKEY_raidconf!" + exit $OCF_ERR_CONFIGURED +fi + +if [ ! -r "$RAIDCONF" ] ; then + ocf_exit_reason "Configuration file [$RAIDCONF] does not exist, or can not be opened!" + exit $OCF_ERR_INSTALLED +fi + +if [ -z "$MDDEV" ] ; then + ocf_exit_reason "Please set OCF_RESKEY_raiddev to the Raid device you want to control!" 
+ exit $OCF_ERR_CONFIGURED +fi + +if ocf_is_clone && ! ocf_is_true "$OCF_RESKEY_force_clones"; then + ocf_exit_reason "md RAID arrays are NOT safe to run as a clone!" + ocf_log err "Please read the comment on the force_clones parameter." + exit $OCF_ERR_CONFIGURED +fi + +if ocf_is_true $WAIT_FOR_UDEV && ! have_binary udevadm; then + if [ "$__OCF_ACTION" = "start" ]; then + ocf_log warn "either install udevadm or set udev to false" + ocf_log info "setting udev to false!" + fi + WAIT_FOR_UDEV=0 +fi + +if ! ocf_is_true $WAIT_FOR_UDEV; then + export MDADM_NO_UDEV=1 +fi + +if ocf_is_true $FORCESTOP && ! have_binary lsof; then + ocf_log warn "Please install lsof(8), we may need it when stopping Raid device! Now continuing anyway ..." +fi + +HAVE_RAIDTOOLS=false +if have_binary $MDADM >/dev/null 2>&1 ; then + if [ -n "$OCF_RESKEY_homehost" ]; then + MDADM_HOMEHOST="--homehost=${OCF_RESKEY_homehost}" + else + MDADM_HOMEHOST="" + fi +else + check_binary $RAIDSTART + HAVE_RAIDTOOLS=true +fi + +if [ $HAVE_RAIDTOOLS = true ]; then + if [ "$MDDEV" = "auto" ]; then + ocf_exit_reason "autoconf supported only with mdadm!" + exit $OCF_ERR_INSTALLED + elif [ `echo $MDDEV|wc -w` -gt 1 ]; then + ocf_exit_reason "multiple devices supported only with mdadm!" + exit $OCF_ERR_INSTALLED + fi +fi + +if [ "$MDDEV" = "auto" ]; then + RAIDDEVS=`list_conf_arrays` +else + RAIDDEVS="$MDDEV" +fi + +# At this stage, +# [ $HAVE_RAIDTOOLS = false ] <=> we have $MDADM, +# otherwise we have raidtools (raidstart and raidstop) + +# Look for how we are called +case "$1" in + start) + raid1_start + ;; + stop) + raid1_stop + ;; + status) + raid1_status + ;; + monitor) + raid1_monitor + ;; + validate-all) + raid1_validate_all + ;; + *) + usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + +exit $? diff --git a/heartbeat/Route b/heartbeat/Route new file mode 100755 index 0000000..7db41d0 --- /dev/null +++ b/heartbeat/Route @@ -0,0 +1,348 @@ +#!/bin/sh +# +# Route OCF RA. Enables and disables network routes. 
+# +# (c) 2008-2010 Florian Haas, Dejan Muhamedagic, +# and Linux-HA contributors +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Default values +OCF_RESKEY_device_default="" +OCF_RESKEY_gateway_default="" +OCF_RESKEY_source_default="" +OCF_RESKEY_table_default="" +OCF_RESKEY_family_default="detect" + +: ${OCF_RESKEY_device=${OCF_RESKEY_device_default}} +: ${OCF_RESKEY_gateway=${OCF_RESKEY_gateway_default}} +: ${OCF_RESKEY_source=${OCF_RESKEY_source_default}} +: ${OCF_RESKEY_table=${OCF_RESKEY_table_default}} +: ${OCF_RESKEY_family=${OCF_RESKEY_family_default}} + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="Route" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Enables and disables network routes. + +Supports host and net routes, routes via a gateway address, +and routes using specific source addresses. + +This resource agent is useful if a node's routing table +needs to be manipulated based on node role assignment. + +Consider the following example use case: + + - One cluster node serves as an IPsec tunnel endpoint. + + - All other nodes use the IPsec tunnel to reach hosts + in a specific remote network. + +Then, here is how you would implement this scheme making use +of the Route resource agent: + + - Configure an ipsec LSB resource. + + - Configure a cloned Route OCF resource. + + - Create an order constraint to ensure + that ipsec is started before Route. + + - Create a colocation constraint between the + ipsec and Route resources, to make sure no instance + of your cloned Route resource is started on the + tunnel endpoint itself. +</longdesc> +<shortdesc lang="en">Manages network routes</shortdesc> + +<parameters> + +<parameter name="destination" unique="1" required="1"> +<longdesc lang="en"> +The destination network (or host) to be configured for the route. +Specify the netmask suffix in CIDR notation (e.g. "/24"). +If no suffix is given, a host route will be created. 
+Specify "0.0.0.0/0" or "default" if you want this resource to set +the system default route. +</longdesc> +<shortdesc lang="en">Destination network</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="device" unique="0"> +<longdesc lang="en"> +The outgoing network device to use for this route. +</longdesc> +<shortdesc lang="en">Outgoing network device</shortdesc> +<content type="string" default="${OCF_RESKEY_device_default}" /> +</parameter> + +<parameter name="gateway" unique="0"> +<longdesc lang="en"> +The gateway IP address to use for this route. +</longdesc> +<shortdesc lang="en">Gateway IP address</shortdesc> +<content type="string" default="${OCF_RESKEY_gateway_default}" /> +</parameter> + +<parameter name="source" unique="0"> +<longdesc lang="en"> +The source IP address to be configured for the route. +</longdesc> +<shortdesc lang="en">Source IP address</shortdesc> +<content type="string" default="${OCF_RESKEY_source_default}" /> +</parameter> + +<parameter name="table" unique="0"> +<longdesc lang="en"> +The routing table to be configured for the route. +</longdesc> +<shortdesc lang="en">Routing table</shortdesc> +<content type="string" default="${OCF_RESKEY_table_default}" /> +</parameter> + +<parameter name="family" unique="0"> +<longdesc lang="en"> +The address family to be used for the route +ip4 IP version 4 +ip6 IP version 6 +detect Detect from 'destination' address. 
+</longdesc> +<shortdesc lang="en">Address Family</shortdesc> +<content type="string" default="${OCF_RESKEY_family_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" timeout="20s" interval="10s" + depth="0"/> +<action name="reload" timeout="20s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="20s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +create_route_spec() { + # Creates a route specification for use by "ip route (add|del|show)" + route_spec="to ${OCF_RESKEY_destination}" + if [ -n "${OCF_RESKEY_device}" ]; then + route_spec="${route_spec} dev ${OCF_RESKEY_device}" + fi + if [ -n "${OCF_RESKEY_gateway}" ]; then + route_spec="${route_spec} via ${OCF_RESKEY_gateway}" + fi + if [ -n "${OCF_RESKEY_source}" ]; then + route_spec="${route_spec} src ${OCF_RESKEY_source}" + fi + if [ -n "${OCF_RESKEY_table}" ]; then + route_spec="${route_spec} table ${OCF_RESKEY_table}" + fi + echo "$route_spec" +} + +route_usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +route_start() { + route_validate || exit $? + + route_status + status=$? + if [ $status -eq $OCF_SUCCESS ]; then + ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : already started." + return $OCF_SUCCESS + fi + route_spec="$(create_route_spec)" + if ip route add $route_spec; then + ocf_log info "${OCF_RESOURCE_INSTANCE} Added network route: $route_spec" + return $OCF_SUCCESS + else + ocf_exit_reason "${OCF_RESOURCE_INSTANCE} Failed to add network route: $route_spec" + fi + return $OCF_ERR_GENERIC +} + +route_stop() { + route_status + status=$? 
+ case $status in + $OCF_SUCCESS) + route_spec="$(create_route_spec)" + if ip route del $route_spec; then + ocf_log info "${OCF_RESOURCE_INSTANCE} Removed network route: $route_spec" + return $OCF_SUCCESS + else + ocf_exit_reason "${OCF_RESOURCE_INSTANCE} Failed to remove network route: $route_spec" + fi + ;; + $OCF_NOT_RUNNING) + ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : already stopped." + return $OCF_SUCCESS + ;; + esac + return $OCF_ERR_GENERIC +} + +route_status() { + show_output="$(ip $addr_family route show $(create_route_spec) 2>/dev/null)" + if [ $? -eq 0 ]; then + if [ -n "$show_output" ]; then + # "ip route show" returned zero, and produced output on + # stdout. That is what we expect. + return $OCF_SUCCESS + else + # "ip route show" returned zero, but produced no + # output on stdout. Assume the route was cleanly + # unconfigured. + return $OCF_NOT_RUNNING + fi + else + # "ip route show" returned an error code. Assume something + # went wrong. + if ocf_is_probe; then + return $OCF_NOT_RUNNING + else + return $OCF_ERR_GENERIC + fi + fi +} + +route_validate() { + # If we're running as a clone, are the clone meta attrs OK? + if [ "${OCF_RESKEY_CRM_meta_clone}" ]; then + if [ "${OCF_RESKEY_CRM_meta_clone_node_max}" != 1 ]; then + ocf_exit_reason "Misconfigured clone parameters. Must set meta attribute \"clone_node_max\" to 1, got ${OCF_RESKEY_CRM_meta_clone_node_max}." + return $OCF_ERR_CONFIGURED + fi + fi + # Did we get a destination? + if [ -z "${OCF_RESKEY_destination}" ]; then + ocf_exit_reason "Missing required parameter \"destination\"." + return $OCF_ERR_CONFIGURED + fi + # Did we get either a device or a gateway address? + if [ -z "${OCF_RESKEY_device}" -a -z "${OCF_RESKEY_gateway}" ]; then + ocf_exit_reason "Must specify either \"device\", or \"gateway\", or both." + return $OCF_ERR_CONFIGURED + fi + # If a device has been configured, is it available on this system? + if [ -n "${OCF_RESKEY_device}" ]; then + if ! 
ip link show ${OCF_RESKEY_device} >/dev/null 2>&1; then + ocf_exit_reason "Network device ${OCF_RESKEY_device} appears not to be available on this system." + # OCF_ERR_ARGS prevents the resource from running anywhere at all, + # maybe another node has the interface? + # OCF_ERR_INSTALLED just prevents starting on this particular node. + return $OCF_ERR_INSTALLED + fi + fi + + # The following tests must return $OCF_ERR_INSTALLED, but only if + # the resource is actually running (i.e., not during probes) + if ! ocf_is_probe; then + # If a source address has been configured, is it available on + # this system? + if [ -n "${OCF_RESKEY_source}" ]; then + if ! ip address show | grep -w ${OCF_RESKEY_source} >/dev/null 2>&1; then + ocf_exit_reason "Source address ${OCF_RESKEY_source} appears not to be available on this system." + # same reason as with _device: + return $OCF_ERR_INSTALLED + fi + fi + # If a gateway address has been configured, is it reachable? + if [ -n "${OCF_RESKEY_gateway}" ]; then + if ! ip route get ${OCF_RESKEY_gateway} >/dev/null 2>&1; then + ocf_exit_reason "Gateway address ${OCF_RESKEY_gateway} is unreachable." + # same reason as with _device: + return $OCF_ERR_INSTALLED + fi + fi + fi + return $OCF_SUCCESS +} + +# These two actions must always succeed +case $__OCF_ACTION in +meta-data) meta_data + # OCF variables are not set when querying meta-data + exit 0 + ;; +usage|help) route_usage + exit $OCF_SUCCESS + ;; +esac + +# Don't do anything if the necessary utilities aren't present +for binary in ip grep; do + check_binary $binary +done + +case $OCF_RESKEY_family in + ip4) addr_family="-4" ;; + ip6) addr_family="-6" ;; + detect) + case $OCF_RESKEY_destination in + *:*) addr_family="-6" ;; + *.*) addr_family="-4" ;; + *) ocf_exit_reason "Address family detection requires a numeric destination address." ;; + esac ;; + *) ocf_exit_reason "Address family '${OCF_RESKEY_family}' not recognized." 
;; +esac + +case $__OCF_ACTION in +start) route_start;; +stop) route_stop;; +status|monitor) route_status;; +reload) ocf_log info "Reloading..." + route_start + ;; +validate-all) route_validate;; +*) route_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION returned $rc" +exit $rc diff --git a/heartbeat/SAPDatabase b/heartbeat/SAPDatabase new file mode 100755 index 0000000..563a6f3 --- /dev/null +++ b/heartbeat/SAPDatabase @@ -0,0 +1,401 @@ +#!/bin/sh +# +# SAPDatabase +# +# Description: Manages any type of SAP supported database instance +# as a High-Availability OCF compliant resource. +# +# Author: Alexander Krauth, October 2006 +# Support: linux@sap.com +# License: GNU General Public License (GPL) +# Copyright: (c) 2006, 2007, 2010, 2012 Alexander Krauth +# +# An example usage: +# See usage() function below for more details... +# +# OCF instance parameters: +# OCF_RESKEY_SID +# OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) +# OCF_RESKEY_DBTYPE (mandatory, one of the following values: ORA,ADA,DB6,SYB,HDB) +# OCF_RESKEY_DBINSTANCE (optional, Database instance name, if not equal to SID) +# OCF_RESKEY_DBOSUSER (optional, the Linux user that owns the database processes on operating system level) +# OCF_RESKEY_STRICT_MONITORING (optional, activate application level monitoring - with Oracle a failover will occur in case of an archiver stuck) +# OCF_RESKEY_AUTOMATIC_RECOVER (optional, automatic startup recovery, default is false) +# OCF_RESKEY_MONITOR_SERVICES (optional, default is to monitor all database services) +# OCF_RESKEY_PRE_START_USEREXIT (optional, lists a script which can be executed before the resource is started) +# OCF_RESKEY_POST_START_USEREXIT (optional, lists a script which can be executed after the resource is started) +# OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped) +# 
OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped) +# Deprecated parameters: +# OCF_RESKEY_NETSERVICENAME +# OCF_RESKEY_DBJ2EE_ONLY +# OCF_RESKEY_JAVA_HOME +# OCF_RESKEY_DIR_BOOTSTRAP +# OCF_RESKEY_DIR_SECSTORE +# OCF_RESKEY_DB_JARS +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_SID_default="" +OCF_RESKEY_DIR_EXECUTABLE_default="/usr/sap/hostctrl/exe" +OCF_RESKEY_DBTYPE_default="" +OCF_RESKEY_DBINSTANCE_default="" +OCF_RESKEY_DBOSUSER_default="" +OCF_RESKEY_NETSERVICENAME_default="" +OCF_RESKEY_DBJ2EE_ONLY_default="" +OCF_RESKEY_JAVA_HOME_default="" +OCF_RESKEY_STRICT_MONITORING_default="false" +OCF_RESKEY_AUTOMATIC_RECOVER_default="false" +OCF_RESKEY_MONITOR_SERVICES_default="" +OCF_RESKEY_DIR_BOOTSTRAP_default="" +OCF_RESKEY_DIR_SECSTORE_default="" +OCF_RESKEY_DB_JARS_default="" +OCF_RESKEY_PRE_START_USEREXIT_default="" +OCF_RESKEY_POST_START_USEREXIT_default="" +OCF_RESKEY_PRE_STOP_USEREXIT_default="" +OCF_RESKEY_POST_STOP_USEREXIT_default="" + +: ${OCF_RESKEY_SID=${OCF_RESKEY_SID_default}} +: ${OCF_RESKEY_DIR_EXECUTABLE=${OCF_RESKEY_DIR_EXECUTABLE_default}} +: ${OCF_RESKEY_DBTYPE=${OCF_RESKEY_DBTYPE_default}} +: ${OCF_RESKEY_DBINSTANCE=${OCF_RESKEY_DBINSTANCE_default}} +: ${OCF_RESKEY_DBOSUSER=${OCF_RESKEY_DBOSUSER_default}} +: ${OCF_RESKEY_NETSERVICENAME=${OCF_RESKEY_NETSERVICENAME_default}} +: ${OCF_RESKEY_DBJ2EE_ONLY=${OCF_RESKEY_DBJ2EE_ONLY_default}} +: ${OCF_RESKEY_JAVA_HOME=${OCF_RESKEY_JAVA_HOME_default}} +: ${OCF_RESKEY_STRICT_MONITORING=${OCF_RESKEY_STRICT_MONITORING_default}} +: ${OCF_RESKEY_AUTOMATIC_RECOVER=${OCF_RESKEY_AUTOMATIC_RECOVER_default}} +: ${OCF_RESKEY_MONITOR_SERVICES=${OCF_RESKEY_MONITOR_SERVICES_default}} +: ${OCF_RESKEY_DIR_BOOTSTRAP=${OCF_RESKEY_DIR_BOOTSTRAP_default}} +: 
${OCF_RESKEY_DIR_SECSTORE=${OCF_RESKEY_DIR_SECSTORE_default}} +: ${OCF_RESKEY_DB_JARS=${OCF_RESKEY_DB_JARS_default}} +: ${OCF_RESKEY_PRE_START_USEREXIT=${OCF_RESKEY_PRE_START_USEREXIT_default}} +: ${OCF_RESKEY_POST_START_USEREXIT=${OCF_RESKEY_POST_START_USEREXIT_default}} +: ${OCF_RESKEY_PRE_STOP_USEREXIT=${OCF_RESKEY_PRE_STOP_USEREXIT_default}} +: ${OCF_RESKEY_POST_STOP_USEREXIT=${OCF_RESKEY_POST_STOP_USEREXIT_default}} + +####################################################################### + +SH=/bin/sh + +usage() { + methods=`sapdatabase_methods` + methods=`echo $methods | tr ' ' '|'` + cat <<-EOF + usage: $0 ($methods) + + $0 manages a SAP database of any type as an HA resource. + Currently Oracle, MaxDB, DB/2 UDB, Sybase ASE and SAP HANA Database are supported. + ABAP databases as well as JAVA only databases are supported. + + The 'start' operation starts the instance. + The 'stop' operation stops the instance. + The 'status' operation reports whether the instance is running + The 'monitor' operation reports whether the instance seems to be working + The 'recover' operation tries to recover the instance after a crash (instance will be stopped first!) + The 'validate-all' operation reports whether the parameters are valid + The 'methods' operation reports on the methods $0 supports + + EOF +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="SAPDatabase" version="2.14"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for SAP databases. It manages a SAP database of any type as an HA resource. + +The purpose of the resource agent is to start, stop and monitor the database instance of a SAP system. Together with the RDBMS system it will also control the related network service for the database. Like the Oracle Listener and the xserver of MaxDB. +The resource agent expects a standard SAP installation of the database and therefore needs less parameters to configure. 
+The resource agent supports the following databases: +- Oracle 10.2, 11.2 and 12 +- DB/2 UDB for Windows and Unix 9.x +- SAP-DB / MaxDB 7.x +- Sybase ASE 15.7 +- SAP HANA Database since 1.00 - with SAP note 1625203 (http://sdn.sap.com) + +In fact this resource agent does not run any database commands directly. It uses the SAP standard process SAPHostAgent to control the database. +The SAPHostAgent must be installed on each cluster node locally. It will not work, if you try to run the SAPHostAgent also as a HA resource. +Please follow SAP note 1031096 for the installation of SAPHostAgent. +The required minimum version of SAPHostAgent is: +Release: 7.20 +Patch Number: 90 +or compile time after: Dec 17 2011 + +To exemplify the usage, for a HANA database with SID "TST" and instance number "10", the resource configuration using crmsh syntax looks like: + +primitive rsc_SAPDatabase_TST_HDB10 ocf:heartbeat:SAPDatabase \\ + params DBTYPE="HDB" SID="TST" \\ + op start interval="0" timeout="3600" \\ + op monitor interval="120" timeout="700" \\ + op stop interval="0" timeout="600" + +Make sure to tune the operations timeout values accordingly with your chosen Database and available infrastructure. + +Note that the same configuration can be achieved using any other CLI tool for cluster configuration available, like pcs or cibadmin. +</longdesc> +<shortdesc lang="en">Manages a SAP database instance as an HA resource.</shortdesc> +<parameters> + <parameter name="SID" unique="1" required="1"> + <longdesc lang="en">The unique database system identifier. e.g. P01</longdesc> + <shortdesc lang="en">Database system ID</shortdesc> + <content type="string" default="${OCF_RESKEY_SID_default}" /> + </parameter> + <parameter name="DIR_EXECUTABLE" unique="0" required="0"> + <longdesc lang="en">The full qualified path where to find saphostexec and saphostctrl. +Usually you can leave this empty. Then the default: ${OCF_RESKEY_DIR_EXECUTABLE_default} is used. 
+ </longdesc> + <shortdesc lang="en">path of saphostexec and saphostctrl</shortdesc> + <content type="string" default="${OCF_RESKEY_DIR_EXECUTABLE_default}" /> + </parameter> + <parameter name="DBTYPE" unique="0" required="1"> + <longdesc lang="en">The name of the database vendor you use. Set either: ADA, DB6, ORA, SYB, HDB</longdesc> + <shortdesc lang="en">database vendor</shortdesc> + <content type="string" default="${OCF_RESKEY_DBTYPE_default}" /> + </parameter> + <parameter name="DBINSTANCE" unique="1" required="0"> + <longdesc lang="en">Must be used for special database implementations, when database instance name is not equal to the SID (e.g. Oracle DataGuard)</longdesc> + <shortdesc lang="en">Database instance name, if not equal to SID</shortdesc> + <content type="string" default="${OCF_RESKEY_DBINSTANCE_default}" /> + </parameter> + <parameter name="DBOSUSER" unique="1" required="0"> + <longdesc lang="en">The parameter can be set, if the database processes on operating system level are not executed with the default user of the used database type. Defaults: ADA=taken from /etc/opt/sdb, DB6=db2SID, ORA=oraSID and oracle, SYB=sybSID, HDB=SIDadm</longdesc> + <shortdesc lang="en">the Linux user that owns the database processes on operating system level</shortdesc> + <content type="string" default="${OCF_RESKEY_DBOSUSER_default}" /> + </parameter> + <parameter name="NETSERVICENAME" unique="0" required="0"> + <longdesc lang="en">Deprecated - do not use anymore. This parameter will be deleted in one of the next releases.</longdesc> + <shortdesc lang="en">deprecated - do not use anymore</shortdesc> + <content type="string" default="${OCF_RESKEY_NETSERVICENAME_default}" /> + </parameter> + <parameter name="DBJ2EE_ONLY" unique="0" required="0"> + <longdesc lang="en">Deprecated - do not use anymore. 
This parameter will be deleted in one of the next releases.</longdesc> + <shortdesc lang="en">deprecated - do not use anymore</shortdesc> + <content type="boolean" default="${OCF_RESKEY_DBJ2EE_ONLY_default}"/> + </parameter> + <parameter name="JAVA_HOME" unique="0" required="0"> + <longdesc lang="en">Deprecated - do not use anymore. This parameter will be deleted in one of the next releases.</longdesc> + <shortdesc lang="en">deprecated - do not use anymore</shortdesc> + <content type="string" default="${OCF_RESKEY_JAVA_HOME_default}"/> + </parameter> + <parameter name="STRICT_MONITORING" unique="0" required="0"> + <longdesc lang="en">This controls how the resource agent monitors the database. If set to true, it will use 'saphostctrl -function GetDatabaseStatus' to test the database state. If set to false, only operating system processes are monitored.</longdesc> + <shortdesc lang="en">Activates application level monitoring</shortdesc> + <content type="boolean" default="${OCF_RESKEY_STRICT_MONITORING_default}"/> + </parameter> + <parameter name="AUTOMATIC_RECOVER" unique="0" required="0"> + <longdesc lang="en">If you set this to true, 'saphostctrl -function StartDatabase' will always be called with the '-force' option.</longdesc> + <shortdesc lang="en">Enable or disable automatic startup recovery</shortdesc> + <content type="boolean" default="${OCF_RESKEY_AUTOMATIC_RECOVER_default}"/> + </parameter> + <parameter name="MONITOR_SERVICES" unique="0" required="0"> + <longdesc lang="en">Defines which services are monitored by the SAPDatabase resource agent. Service names must correspond with the output of the 'saphostctrl -function GetDatabaseStatus' command. +The default MONITOR_SERVICES value is derived from the database type DBTYPE. 
For reference: + +- DBTYPE "ORA" sets MONITOR_SERVICES="Instance|Database|Listener"; +- DBTYPE "HDB" sets MONITOR_SERVICES="hdbindexserver|hdbnameserver"; +- DBTYPE "ADA" sets MONITOR_SERVICES="Database"; +- DBTYPE "DB6" sets MONITOR_SERVICES="{SID}|{db2sid}"; +- DBTYPE "SYB" sets MONITOR_SERVICES="Server". + +This parameter should be set ONLY if is needed to monitor different services than the ones listed above. +</longdesc> + <shortdesc lang="en">Database services to monitor</shortdesc> + <content type="string" default="${OCF_RESKEY_MONITOR_SERVICES_default}"/> + </parameter> + <parameter name="DIR_BOOTSTRAP" unique="0" required="0"> + <longdesc lang="en">Deprecated - do not use anymore. This parameter will be deleted in one of the next releases.</longdesc> + <shortdesc lang="en">deprecated - do not use anymore</shortdesc> + <content type="string" default="${OCF_RESKEY_DIR_BOOTSTRAP_default}" /> + </parameter> + <parameter name="DIR_SECSTORE" unique="0" required="0"> + <longdesc lang="en">Deprecated - do not use anymore. This parameter will be deleted in one of the next releases.</longdesc> + <shortdesc lang="en">deprecated - do not use anymore</shortdesc> + <content type="string" default="${OCF_RESKEY_DIR_SECSTORE_default}" /> + </parameter> + <parameter name="DB_JARS" unique="0" required="0"> + <longdesc lang="en">Deprecated - do not use anymore. 
This parameter will be deleted in one of the next releases.</longdesc> + <shortdesc lang="en">deprecated - do not use anymore</shortdesc> + <content type="string" default="${OCF_RESKEY_DB_JARS_default}" /> + </parameter> + <parameter name="PRE_START_USEREXIT" unique="0" required="0"> + <longdesc lang="en">The full qualified path where to find a script or program which should be executed before this resource gets started.</longdesc> + <shortdesc lang="en">path to a pre-start script</shortdesc> + <content type="string" default="${OCF_RESKEY_PRE_START_USEREXIT_default}" /> + </parameter> + <parameter name="POST_START_USEREXIT" unique="0" required="0"> + <longdesc lang="en">The full qualified path where to find a script or program which should be executed after this resource got started.</longdesc> + <shortdesc lang="en">path to a post-start script</shortdesc> + <content type="string" default="${OCF_RESKEY_POST_START_USEREXIT_default}" /> + </parameter> + <parameter name="PRE_STOP_USEREXIT" unique="0" required="0"> + <longdesc lang="en">The full qualified path where to find a script or program which should be executed before this resource gets stopped.</longdesc> + <shortdesc lang="en">path to a pre-stop script</shortdesc> + <content type="string" default="${OCF_RESKEY_PRE_STOP_USEREXIT_default}" /> + </parameter> + <parameter name="POST_STOP_USEREXIT" unique="0" required="0"> + <longdesc lang="en">The full qualified path where to find a script or program which should be executed after this resource got stopped.</longdesc> + <shortdesc lang="en">path to a post-stop script</shortdesc> + <content type="string" default="${OCF_RESKEY_POST_STOP_USEREXIT_default}" /> + </parameter> +</parameters> + +<actions> +<action name="start" timeout="1800s" /> +<action name="stop" timeout="1800s" /> +<action name="status" timeout="60s" /> +<action name="monitor" depth="0" timeout="60s" interval="120s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" 
timeout="5s" /> +<action name="methods" timeout="5s" /> +</actions> +</resource-agent> +END +} + + +# +# methods: What methods/operations do we support? +# +sapdatabase_methods() { + cat <<-EOF + start + stop + status + monitor + recover + validate-all + methods + meta-data + usage + EOF +} + + +# +# sapuserexit : Many SAP customers need some additional processes/tools to run their SAP systems. +# These specialties do not allow a totally generic SAP cluster resource agent. +# Someone should write a resource agent for each additional process you need, if it +# is required to monitor that process within the cluster manager. To enable +# you to extend this resource agent without developing a new one, this user exit +# was introduced. +# $1 = name of the user exit attribute (used for logging only), $2 = path of the script to run. +# Always returns OCF_SUCCESS; the return code of the script itself is only logged. +# +sapuserexit() { + NAME="$1" + VALUE="$2" + + if [ -n "$VALUE" ] + then + if have_binary "$VALUE" + then + ocf_log info "Calling userexit ${NAME} with customer script file ${VALUE}" + "$VALUE" >/dev/null 2>&1 + ocf_log info "Exiting userexit ${NAME} with customer script file ${VALUE}, returncode: $?" + else + ocf_log warn "Attribute ${NAME} is set to ${VALUE}, but this file is not executable" + fi + fi + return $OCF_SUCCESS +} + + +# +# saphostctrl_installed : Set SAPHOSTCTRL, SAPHOSTEXEC, SAPHOSTSRV and SAPHOSTOSCOL to paths below +# OCF_RESKEY_DIR_EXECUTABLE. Succeeds only if have_binary accepts both +# saphostctrl and saphostexec. The paths are quoted so that a directory +# name containing whitespace is passed as a single argument. +# +saphostctrl_installed() { + SAPHOSTCTRL="${OCF_RESKEY_DIR_EXECUTABLE}/saphostctrl" + SAPHOSTEXEC="${OCF_RESKEY_DIR_EXECUTABLE}/saphostexec" + SAPHOSTSRV="${OCF_RESKEY_DIR_EXECUTABLE}/sapstartsrv" + SAPHOSTOSCOL="${OCF_RESKEY_DIR_EXECUTABLE}/saposcol" + + have_binary "$SAPHOSTCTRL" && have_binary "$SAPHOSTEXEC" +} + + +# +# 'main' starts here... +# + +if + ( [ $# -ne 1 ] ) +then + usage + exit $OCF_ERR_ARGS +fi + +# These operations don't require OCF instance parameters to be set +case "$1" in + meta-data) meta_data + exit $OCF_SUCCESS;; + + usage) usage + exit $OCF_SUCCESS;; + + methods) sapdatabase_methods + exit $?;; + + *);; +esac + +if ! 
ocf_is_root +then + ocf_log err "$0 must be run as root" + exit $OCF_ERR_PERM +fi + +# mandatory parameter check +if [ -z "$OCF_RESKEY_SID" ]; then + ocf_log err "Please set OCF_RESKEY_SID to the SAP system id!" + exit $OCF_ERR_ARGS +fi +SID=`echo "$OCF_RESKEY_SID"` + +if [ -z "$OCF_RESKEY_DBTYPE" ]; then + ocf_log err "Please set OCF_RESKEY_DBTYPE to the database vendor specific tag (ADA,DB6,ORA,SYB,HDB)!" + exit $OCF_ERR_ARGS +fi +DBTYPE=`echo "$OCF_RESKEY_DBTYPE" | tr '[:lower:]' '[:upper:]'` + + +# source functions and initialize global variables +if saphostctrl_installed; then + . ${OCF_FUNCTIONS_DIR}/sapdb.sh +else + if [ -n "${OCF_RESKEY_DBOSUSER}" ]; then + ocf_exit_reason "Usage of parameter OCF_RESKEY_DBOSUSER is not possible without having SAP Host-Agent installed" + exit $OCF_ERR_ARGS + fi + . ${OCF_FUNCTIONS_DIR}/sapdb-nosha.sh +fi +sapdatabase_init + + +# we always want to fall to the faster status method in case of a probe by the cluster +ACTION=$1 +if ocf_is_probe +then + ACTION=status +fi + +# What kind of method was invoked? +case "$ACTION" in + + start|stop|status|recover) sapdatabase_$ACTION + exit $?;; + monitor) sapdatabase_monitor $OCF_RESKEY_STRICT_MONITORING + exit $?;; + validate-all) sapdatabase_validate + exit $?;; + *) sapdatabase_methods + exit $OCF_ERR_UNIMPLEMENTED;; +esac diff --git a/heartbeat/SAPInstance b/heartbeat/SAPInstance new file mode 100755 index 0000000..26fd541 --- /dev/null +++ b/heartbeat/SAPInstance @@ -0,0 +1,1076 @@ +#!/bin/sh +# +# SAPInstance +# +# Description: Manages a single SAP Instance as a High-Availability +# resource. One SAP Instance is defined by one +# SAP Instance-Profile. start/stop handles all services +# of the START-Profile, status and monitor care only +# about essential services. 
+# +# Author: Alexander Krauth, June 2006 +# Support: linux@sap.com +# License: GNU General Public License (GPL) +# Copyright: (c) 2006-2008 Alexander Krauth +# +# An example usage: +# See usage() function below for more details... +# +# OCF instance parameters: +# OCF_RESKEY_InstanceName +# OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) +# OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default) +# OCF_RESKEY_START_PROFILE (optional, well known directories will be searched by default) +# OCF_RESKEY_START_WAITTIME (optional, to solve timing problems during J2EE-Addin start) +# OCF_RESKEY_AUTOMATIC_RECOVER (optional, automatic startup recovery using cleanipc, default is false) +# OCF_RESKEY_MONITOR_SERVICES (optional, default is to monitor critical services only) +# OCF_RESKEY_SHUTDOWN_METHOD (optional, defaults to NORMAL, KILL: terminate the SAP instance with OS commands - faster, at your own risk) +# OCF_RESKEY_ERS_InstanceName (optional, InstanceName of the ERS instance in a Promotable configuration) +# OCF_RESKEY_ERS_START_PROFILE (optional, START_PROFILE of the ERS instance in a Promotable configuration) +# OCF_RESKEY_PRE_START_USEREXIT (optional, lists a script which can be executed before the resource is started) +# OCF_RESKEY_POST_START_USEREXIT (optional, lists a script which can be executed after the resource is started) +# OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped) +# OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped) +# OCF_RESKEY_IS_ERS (needed for ENQ/REPL NW 740) +# OCF_RESKEY_MINIMAL_PROBE (optional but needed for simple mount structure architecure) +# +# TODO: - Option to shutdown sapstartsrv for non-active instances -> that means: do probes only with OS tools (sapinstance_status) +# - Option for better standalone enqueue server monitoring, using ensmon 
(test enque-deque) +# - Option for cleanup abandoned enqueue replication tables +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_InstanceName_default="" +OCF_RESKEY_DIR_EXECUTABLE_default="" +OCF_RESKEY_DIR_PROFILE_default="" +OCF_RESKEY_START_PROFILE_default="" +OCF_RESKEY_START_WAITTIME_default="3600" +OCF_RESKEY_AUTOMATIC_RECOVER_default="false" +OCF_RESKEY_MONITOR_SERVICES_default="disp+work|msg_server|enserver|enrepserver|jcontrol|jstart|enq_server|enq_replicator" +OCF_RESKEY_SHUTDOWN_METHOD_default="normal" +OCF_RESKEY_ERS_InstanceName_default="" +OCF_RESKEY_ERS_START_PROFILE_default="" +OCF_RESKEY_PRE_START_USEREXIT_default="" +OCF_RESKEY_POST_START_USEREXIT_default="" +OCF_RESKEY_PRE_STOP_USEREXIT_default="" +OCF_RESKEY_POST_STOP_USEREXIT_default="" +OCF_RESKEY_IS_ERS_default="false" +OCF_RESKEY_MINIMAL_PROBE_default="false" + +: ${OCF_RESKEY_InstanceName=${OCF_RESKEY_InstanceName_default}} +: ${OCF_RESKEY_DIR_EXECUTABLE=${OCF_RESKEY_DIR_EXECUTABLE_default}} +: ${OCF_RESKEY_DIR_PROFILE=${OCF_RESKEY_DIR_PROFILE_default}} +: ${OCF_RESKEY_START_PROFILE=${OCF_RESKEY_START_PROFILE_default}} +: ${OCF_RESKEY_START_WAITTIME=${OCF_RESKEY_START_WAITTIME_default}} +: ${OCF_RESKEY_AUTOMATIC_RECOVER=${OCF_RESKEY_AUTOMATIC_RECOVER_default}} +: ${OCF_RESKEY_MONITOR_SERVICES=${OCF_RESKEY_MONITOR_SERVICES_default}} +: ${OCF_RESKEY_SHUTDOWN_METHOD=${OCF_RESKEY_SHUTDOWN_METHOD_default}} +: ${OCF_RESKEY_ERS_InstanceName=${OCF_RESKEY_ERS_InstanceName_default}} +: ${OCF_RESKEY_ERS_START_PROFILE=${OCF_RESKEY_ERS_START_PROFILE_default}} +: ${OCF_RESKEY_PRE_START_USEREXIT=${OCF_RESKEY_PRE_START_USEREXIT_default}} +: ${OCF_RESKEY_POST_START_USEREXIT=${OCF_RESKEY_POST_START_USEREXIT_default}} +: ${OCF_RESKEY_PRE_STOP_USEREXIT=${OCF_RESKEY_PRE_STOP_USEREXIT_default}} +: 
${OCF_RESKEY_POST_STOP_USEREXIT=${OCF_RESKEY_POST_STOP_USEREXIT_default}} +: ${OCF_RESKEY_IS_ERS=${OCF_RESKEY_IS_ERS_default}} +: ${OCF_RESKEY_MINIMAL_PROBE=${OCF_RESKEY_MINIMAL_PROBE_default}} + +####################################################################### + +SH=/bin/sh + +sapinstance_usage() { + methods=`sapinstance_methods` + methods=`echo $methods | tr ' ' '|'` + cat <<-EOF + usage: $0 ($methods) + + $0 manages a SAP Instance as an HA resource. + + The 'start' operation starts the instance or the ERS instance in a Promotable configuration + The 'stop' operation stops the instance + The 'status' operation reports whether the instance is running + The 'monitor' operation reports whether the instance seems to be working + The 'promote' operation starts the primary instance in a Promotable configuration + The 'demote' operation stops the primary instance and starts the ERS instance + The 'reload' operation allows changed parameters (non-unique only) without restarting the service + The 'notify' operation always returns SUCCESS + The 'validate-all' operation reports whether the parameters are valid + The 'methods' operation reports on the methods $0 supports + + EOF +} + +sapinstance_meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="SAPInstance" version="2.14"> +<version>1.0</version> + +<longdesc lang="en"> +Usually a SAP system consists of one database and at least one or more SAP instances (sometimes called application servers). One SAP Instance is defined by having exactly one instance profile. The instance profiles can usually be found in the directory /sapmnt/SID/profile. Each instance must be configured as it's own resource in the cluster configuration. 
+The resource agent supports the following SAP versions: +- SAP WebAS ABAP Release 6.20 - 7.40 +- SAP WebAS Java Release 6.40 - 7.40 +- SAP WebAS ABAP + Java Add-In Release 6.20 - 7.40 (Java is not monitored by the cluster in that case) +When using a SAP Kernel 6.40 please check and implement the actions from the section "Manual postprocessing" from SAP note 995116 (http://sdn.sap.com). +Other versions may also work with this agent, but have not been verified. + +All operations of the SAPInstance resource agent are done by using the startup framework called SAP Management Console or sapstartsrv that was introduced with SAP kernel release 6.40. Find more information about the SAP Management Console in SAP note 1014480. Using this framework defines a clear interface for the Heartbeat cluster, how it sees the SAP system. The options for monitoring the SAP system are also much better than other methods like just watching the ps command for running processes or doing some pings to the application. sapstartsrv uses SOAP messages to request the status of running SAP processes. Therefore it can actually ask a process itself what it's status is, independent from other problems that might exist at the same time. + +sapstartsrv knows 4 status colours: +- GREEN = everything is fine +- YELLOW = something is wrong, but the service is still working +- RED = the service does not work +- GRAY = the service has not been started + +The SAPInstance resource agent will interpret GREEN and YELLOW as OK. That means that minor problems will not be reported to the Heartbeat cluster. This prevents the cluster from doing an unwanted failover. +The statuses RED and GRAY are reported as NOT_RUNNING to the cluster. Depending on the status the cluster expects from the resource, it will do a restart, failover or just nothing. 
+</longdesc> +<shortdesc lang="en">Manages a SAP instance as an HA resource.</shortdesc> +<parameters> + <parameter name="InstanceName" unique="1" required="1"> + <longdesc lang="en">The full qualified SAP instance name. e.g. P01_DVEBMGS00_sapp01ci. Usually this is the name of the SAP instance profile.</longdesc> + <shortdesc lang="en">Instance name: SID_INSTANCE_VIR-HOSTNAME</shortdesc> + <content type="string" default="${OCF_RESKEY_InstanceName_default}" /> + </parameter> + <parameter name="DIR_EXECUTABLE" unique="0" required="0"> + <longdesc lang="en">The full qualified path where to find sapstartsrv and sapcontrol. Specify this parameter, if you have changed the SAP kernel directory location after the default SAP installation.</longdesc> + <shortdesc lang="en">Path of sapstartsrv and sapcontrol</shortdesc> + <content type="string" default="${OCF_RESKEY_DIR_EXECUTABLE_default}" /> + </parameter> + <parameter name="DIR_PROFILE" unique="0" required="0"> + <longdesc lang="en">The full qualified path where to find the SAP START profile. Specify this parameter, if you have changed the SAP profile directory location after the default SAP installation.</longdesc> + <shortdesc lang="en">Path of start profile</shortdesc> + <content type="string" default="${OCF_RESKEY_DIR_PROFILE_default}" /> + </parameter> + <parameter name="START_PROFILE" unique="1" required="0"> + <longdesc lang="en">The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than.</longdesc> + <shortdesc lang="en">Start profile name</shortdesc> + <content type="string" default="${OCF_RESKEY_START_PROFILE_default}" /> + </parameter> + <parameter name="START_WAITTIME" unique="0" required="0"> + <longdesc lang="en">After that time in seconds a monitor operation is executed by the resource agent. 
Does the monitor return SUCCESS, the start ishandled as SUCCESS. This is useful to resolve timing problems with e.g. the J2EE-Addin instance.Usually the resource agent waits until all services are started and the SAP Management Console reports a GREEN status. A double stack installation (ABAP + Java AddIn) consists of an ABAP dispatcher and a JAVA instance. Normally the start of the JAVA instance takes much longer than the start of the ABAP instance. For a JAVA Instance you may need to configure a much higher timeout for the start operation of the resource in Heartbeat. The disadvantage here is, that the discovery of a failed start by the cluster takes longer. Somebody might say: For me it is important, that the ABAP instance is up and running. A failure of the JAVA instance shall not cause a failover of the SAP instance. +Actually the SAP MC reports a YELLOW status, if the JAVA instance of a double stack system fails. From the resource agent point of view YELLOW means:everything is OK. Setting START_WAITTIME to a lower value determines the resource agent to check the status of the instance during a start operation after that time. As it would wait normally for a GREEN status, now it reports SUCCESS to the cluster in case of a YELLOW status already after the specified time. + +That is only useful for double stack systems. + </longdesc> + <shortdesc lang="en">Check the successful start after that time (do not wait for J2EE-Addin)</shortdesc> + <content type="string" default="${OCF_RESKEY_START_WAITTIME_default}" /> + </parameter> + <parameter name="AUTOMATIC_RECOVER" unique="0" required="0"> + <longdesc lang="en">The SAPInstance resource agent tries to recover a failed start attempt automatically one time. This is done by killing running instance processes, removing the kill.sap file and executing cleanipc. Sometimes a crashed SAP instance leaves some processes and/or shared memory segments behind. 
Setting this option to true will try to remove those leftovers during a start operation. That is to reduce manual work for the administrator.</longdesc> + <shortdesc lang="en">Enable or disable automatic startup recovery</shortdesc> + <content type="boolean" default="${OCF_RESKEY_AUTOMATIC_RECOVER_default}"/> + </parameter> + <parameter name="MONITOR_SERVICES" unique="0" required="0"> + <longdesc lang="en">Within a SAP instance there can be several services. Usually you will find the defined services in the START profile of the related instance (Attention: with SAP Release 7.10 the START profile content was moved to the instance profile). Not all of those services are worth to monitor by the cluster. For example you properly do not like to failover your SAP instance, if the central syslog collector daemon fails. +Those services are monitored within the SAPInstance resource agent: + +- disp+work +- msg_server +- enserver (ENSA1) +- enq_server (ENSA2) +- enrepserver (ENSA1) +- enq_replicator (ENSA2) +- jcontrol +- jstart + +Some other services could be monitored as well. They have to be +given with the parameter MONITOR_SERVICES, e.g.: + + - sapwebdisp + - TREXDaemon.x + +That names match the strings used in the output of the command 'sapcontrol -nr [Instance-Nr] -function GetProcessList'. +The default should fit most cases where you want to manage a SAP Instance from the cluster. You may change this with this parameter, if you like to monitor more/less or other services that sapstartsrv supports. +You may specify multiple services separated by a | (pipe) sign in this parameter: disp+work|msg_server|enserver + </longdesc> + <shortdesc lang="en">Services to monitor</shortdesc> + <content type="string" default="${OCF_RESKEY_MONITOR_SERVICES_default}"/> + </parameter> + <parameter name="SHUTDOWN_METHOD" unique="0" required="0"> + <longdesc lang="en">Usually a SAP Instance is stopped by the command 'sapcontrol -nr InstanceNr -function Stop'. 
SHUTDOWN_METHOD=KILL means to kill the SAP Instance using OS commands. SAP processes of the instance are terminated with 'kill -9', shared memory is deleted with 'cleanipc' and the 'kill.sap' file will be deleted. That method is much faster than the graceful stop, but the instance does not have the chance to say goodbye to other SAPinstances in the same system. USE AT YOUR OWN RISK !!</longdesc> + <shortdesc lang="en">Shutdown graceful or kill a SAP instance by terminating the processes. (normal|KILL)</shortdesc> + <content type="string" default="${OCF_RESKEY_SHUTDOWN_METHOD_default}"/> + </parameter> + <parameter name="ERS_InstanceName" unique="1" required="0"> + <longdesc lang="en">Only used in a Promotable resource configuration: +The full qualified SAP enqueue replication instance name. e.g. P01_ERS02_sapp01ers. Usually this is the name of the SAP instance profile. +The enqueue replication instance must be installed, before you want to configure a promotable cluster resource. + +The promotable configuration in the cluster must use this properties: +clone_max = 2 +clone_node_max = 1 +master_node_max = 1 +master_max = 1 + </longdesc> + <shortdesc lang="en">Enqueue replication instance name: SID_INSTANCE_VIR-HOSTNAME</shortdesc> + <content type="string" default="${OCF_RESKEY_ERS_InstanceName_default}"/> + </parameter> + <parameter name="ERS_START_PROFILE" unique="1" required="0"> + <longdesc lang="en">Only used in a Promotable resource configuration: +The parameter ERS_InstanceName must also be set in this configuration. +The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than. 
+ </longdesc> + <shortdesc lang="en">Enqueue replication start profile name</shortdesc> + <content type="string" default="${OCF_RESKEY_ERS_START_PROFILE_default}"/> + </parameter> + <parameter name="PRE_START_USEREXIT" unique="0" required="0"> + <longdesc lang="en">The full qualified path where to find a script or program which should be executed before this resource gets started.</longdesc> + <shortdesc lang="en">Path to a pre-start script</shortdesc> + <content type="string" default="${OCF_RESKEY_PRE_START_USEREXIT_default}" /> + </parameter> + <parameter name="POST_START_USEREXIT" unique="0" required="0"> + <longdesc lang="en">The full qualified path where to find a script or program which should be executed after this resource got started.</longdesc> + <shortdesc lang="en">Path to a post-start script</shortdesc> + <content type="string" default="${OCF_RESKEY_POST_START_USEREXIT_default}" /> + </parameter> + <parameter name="PRE_STOP_USEREXIT" unique="0" required="0"> + <longdesc lang="en">The full qualified path where to find a script or program which should be executed before this resource gets stopped.</longdesc> + <shortdesc lang="en">Path to a pre-stop script</shortdesc> + <content type="string" default="${OCF_RESKEY_PRE_STOP_USEREXIT_default}" /> + </parameter> + <parameter name="POST_STOP_USEREXIT" unique="0" required="0"> + <longdesc lang="en">The full qualified path where to find a script or program which should be executed after this resource got stopped.</longdesc> + <shortdesc lang="en">Path to a post-stop script</shortdesc> + <content type="string" default="${OCF_RESKEY_POST_STOP_USEREXIT_default}" /> + </parameter> + <parameter name="IS_ERS" unique="0" required="0"> + <longdesc lang="en">Only used for ASCS/ERS SAP Netweaver installations without implementing a promotable resource to + allow the ASCS to 'find' the ERS running on another cluster node after a resource failure. 
This parameter should be set + to true 'only' for the ERS instance for implementations following the SAP NetWeaver 7.40 HA certification (NW-HA-CLU-740). This includes also + systems for NetWeaver less than 7.40, if you like to implement the NW-HA-CLU-740 scenario. + </longdesc> + <shortdesc lang="en">Mark SAPInstance as ERS instance</shortdesc> + <content type="boolean" default="${OCF_RESKEY_IS_ERS_default}" /> + </parameter> + <parameter name="MINIMAL_PROBE" unique="0" required="0"> + <longdesc lang="en">Setting MINIMAL_PROBE=true forces the resource agent to do only minimal check during a probe. This is needed for special + file system setups. The MINIMAL_PROBE=true is only supported, if requested either by your vendor's support or if described in an architecture document + from your HA vendor. + </longdesc> + <shortdesc lang="en">Switch probe action from full to minimal check</shortdesc> + <content type="boolean" default="${OCF_RESKEY_MINIMAL_PROBE_default}" /> + </parameter> +</parameters> + +<actions> +<action name="start" timeout="180s" /> +<action name="stop" timeout="240s" /> +<action name="status" timeout="60s" /> +<action name="monitor" depth="0" timeout="60s" interval="120s" /> +<action name="monitor" depth="0" timeout="60s" interval="121s" role="Unpromoted" /> +<action name="monitor" depth="0" timeout="60s" interval="119s" role="Promoted" /> +<action name="promote" timeout="320s" /> +<action name="demote" timeout="320s" /> +<action name="reload" timeout="320s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +<action name="methods" timeout="5s" /> +</actions> +</resource-agent> +END +} + + +# +# methods: What methods/operations do we support? 
+# +sapinstance_methods() { + cat <<-EOF + start + stop + status + monitor + promote + demote + reload + notify + validate-all + methods + meta-data + usage + EOF +} + + + +# +# is_clone : find out if we are configured to run in a Master/Slave configuration +# +is_clone() { + if [ -n "$OCF_RESKEY_CRM_meta_clone_max" ] \ + && [ "$OCF_RESKEY_CRM_meta_clone_max" -gt 0 ] + then + if [ "$OCF_RESKEY_CRM_meta_clone_max" -ne 2 ] || \ + [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] || \ + [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] || \ + [ "$OCF_RESKEY_CRM_meta_master_max" -ne 1 ] + then + ocf_log err "Clone options misconfigured. (expect: clone_max=2,clone_node_max=1,master_node_max=1,master_max=1)" + exit $OCF_ERR_CONFIGURED + fi + + if [ -z "$OCF_RESKEY_ERS_InstanceName" ] + then + ocf_log err "In a Master/Slave configuration the ERS_InstanceName parameter is mandatory." + exit $OCF_ERR_ARGS + fi + else + return 0 + fi + return 1 +} + + +# +# abnormal_end : essential things are missing, but in the natur of a SAP installation - which can be very different +# from customer to customer - we cannot handle this always as an error +# This would be the case, if the software is installed on shared disks and not visible +# to all cluster nodes at all times. +# +abnormal_end() { + local err_msg=$1 + + ocf_is_probe && { + sapinstance_status + exit $? 
+ } + + ocf_log err $err_msg + if [ "$ACTION" = "stop" ] + then + cleanup_instance + exit $OCF_SUCCESS + fi + + exit $OCF_ERR_CONFIGURED +} + +# +# sapinstance_init : Define global variables with default values, if optional parameters are not set +# +# +sapinstance_init() { + + local myInstanceName="$1" + + SID=`echo "$myInstanceName" | cut -d_ -f1` + InstanceName=`echo "$myInstanceName" | cut -d_ -f2` + InstanceNr=`echo "$InstanceName" | sed 's/.*\([0-9][0-9]\)$/\1/'` + SAPVIRHOST=`echo "$myInstanceName" | cut -d_ -f3` + + # make sure that we don't care the content of variable from previous run of sapinstance_init + DIR_EXECUTABLE="" + SYSTEMCTL="systemctl" + # optional OCF parameters, we try to guess which directories are correct + if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] + then + if have_binary /usr/sap/$SID/$InstanceName/exe/sapstartsrv && have_binary /usr/sap/$SID/$InstanceName/exe/sapcontrol + then + DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" + SAPSTARTSRV="/usr/sap/$SID/$InstanceName/exe/sapstartsrv" + SAPCONTROL="/usr/sap/$SID/$InstanceName/exe/sapcontrol" + elif have_binary /usr/sap/$SID/SYS/exe/run/sapstartsrv && have_binary /usr/sap/$SID/SYS/exe/run/sapcontrol + then + DIR_EXECUTABLE="/usr/sap/$SID/SYS/exe/run" + SAPSTARTSRV="/usr/sap/$SID/SYS/exe/run/sapstartsrv" + SAPCONTROL="/usr/sap/$SID/SYS/exe/run/sapcontrol" + fi + else + if have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" && have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" + then + DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE" + SAPSTARTSRV="$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" + SAPCONTROL="$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" + fi + fi + + sidadm="`echo $SID | tr '[:upper:]' '[:lower:]'`adm" + + [ -z "$DIR_EXECUTABLE" ] && abnormal_end "Cannot find sapstartsrv and sapcontrol executable, please set DIR_EXECUTABLE parameter!" 
+ + if [ -z "$OCF_RESKEY_DIR_PROFILE" ] + then + DIR_PROFILE="/usr/sap/$SID/SYS/profile" + else + DIR_PROFILE="$OCF_RESKEY_DIR_PROFILE" + fi + + if [ "$myInstanceName" != "$OCF_RESKEY_InstanceName" ] + then + currentSTART_PROFILE=$OCF_RESKEY_ERS_START_PROFILE + else + currentSTART_PROFILE=$OCF_RESKEY_START_PROFILE + fi + + if [ -z "$OCF_RESKEY_IS_ERS" ]; then + is_ers="no" + else + is_ers="$OCF_RESKEY_IS_ERS" + fi + + if [ -z "$currentSTART_PROFILE" ] + then + if [ ! -r "$DIR_PROFILE/START_${InstanceName}_${SAPVIRHOST}" -a -r "$DIR_PROFILE/${SID}_${InstanceName}_${SAPVIRHOST}" ]; then + SAPSTARTPROFILE="$DIR_PROFILE/${SID}_${InstanceName}_${SAPVIRHOST}" + else + SAPSTARTPROFILE="$DIR_PROFILE/START_${InstanceName}_${SAPVIRHOST}" + fi + else + SAPSTARTPROFILE="$currentSTART_PROFILE" + fi + + if [ -z "$OCF_RESKEY_START_WAITTIME" ] + then + export OCF_RESKEY_START_WAITTIME="${OCF_RESKEY_START_WAITTIME_default}" + fi + + if [ -z "$OCF_RESKEY_MONITOR_SERVICES" ] + then + export OCF_RESKEY_MONITOR_SERVICES="${OCF_RESKEY_MONITOR_SERVICES_default}" + fi + + # as root user we need the library path to the SAP kernel to be able to call sapcontrol + if [ `echo $LD_LIBRARY_PATH | grep -c "^$DIR_EXECUTABLE\>"` -eq 0 ]; then + LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH + export LD_LIBRARY_PATH + fi + + return $OCF_SUCCESS +} + +# +# check_systemd_integration : Check, if SAP instance is controlled by systemd unit file SAP<SID>_<InstanceNr>.service +# rc == 0 : sap instance is controlled by the unit file (file at least exists) +# rc == 1 : sap instance is NOT controlled by the unit file (file does not exist) +# +check_systemd_integration() { + local systemd_unit_name="SAP${SID}_${InstanceNr}" + local rc=1 + + if which "$SYSTEMCTL" 1>/dev/null 2>/dev/null; then + if $SYSTEMCTL list-unit-files | \ + awk '$1 == service { found=1 } END { if (! 
found) {exit 1}}' service="${systemd_unit_name}.service"; then + rc=0 + else + rc=1 + fi + fi + return "$rc" +} + +# +# check_sapstartsrv : Before using sapcontrol we make sure that the sapstartsrv is running for the correct instance. +# We cannot use sapinit and the /usr/sap/sapservices file in case of an enquerep instance, +# because then we have two instances with the same instance number. +# +check_sapstartsrv() { + local restart=0 + local runninginst="" + local chkrc=$OCF_SUCCESS + local output="" + + # check for sapstartsrv/systemd integration + + if check_systemd_integration; then + # do it the systemd way + local systemd_unit_name="SAP${SID}_${InstanceNr}" + + if $SYSTEMCTL status "$systemd_unit_name" 1>/dev/null 2>/dev/null; then + ocf_log info "systemd service $systemd_unit_name is active" + else + ocf_log warn "systemd service $systemd_unit_name is not active, it will be started using systemd" + $SYSTEMCTL start "$systemd_unit_name" 1>/dev/null 2>/dev/null + # use start, because restart does also stop sap instance + fi + + return 0 + else # otherwise continue with old code... + if [ ! -S /tmp/.sapstream5${InstanceNr}13 ]; then + ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName (no UDS), it will be started now" + restart=1 + else + output=`$SAPCONTROL -nr $InstanceNr -function ParameterValue INSTANCE_NAME -format script` + if [ $? -eq 0 ] + then + runninginst=`echo "$output" | grep '^0 : ' | cut -d' ' -f3` + if [ "$runninginst" != "$InstanceName" ] + then + ocf_log warn "sapstartsrv is running for instance $runninginst, that service will be killed" + restart=1 + else + output=`$SAPCONTROL -nr $InstanceNr -function AccessCheck Start` + if [ $? 
-ne 0 ]; then + ocf_log warn "FAILED : sapcontrol -nr $InstanceNr -function AccessCheck Start (`ls -ld1 /tmp/.sapstream5${InstanceNr}13`)" + ocf_log warn "sapstartsrv will be restarted to try to solve this situation, otherwise please check sapstsartsrv setup (SAP Note 927637)" + restart=1 + fi + fi + else + ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName, it will be started now" + restart=1 + fi + fi + + if [ -z "$runninginst" ]; then runninginst=$InstanceName; fi + + if [ $restart -eq 1 ] + then + if [ -d /usr/sap/$SID/SYS/profile/ ] + then + DIR_PROFILE="/usr/sap/$SID/SYS/profile" + else + abnormal_end "Expected /usr/sap/$SID/SYS/profile/ to be a directory, please set DIR_PROFILE parameter!" + fi + + [ ! -r $SAPSTARTPROFILE ] && abnormal_end "Expected $SAPSTARTPROFILE to be the instance START profile, please set START_PROFILE parameter!" + + pkill -9 -f "sapstartsrv.*$runninginst" + + # removing the unix domain socket files as they might have wrong permissions + # or ownership - they will be recreated by sapstartsrv during next start + rm -f /tmp/.sapstream5${InstanceNr}13 + rm -f /tmp/.sapstream5${InstanceNr}14 + + $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm + + # now make sure the daemon has been started and is able to respond + local srvrc=1 + while [ $srvrc -eq 1 -a `pgrep -f "sapstartsrv.*$runninginst" | wc -l` -gt 0 ] + do + sleep 1 + $SAPCONTROL -nr $InstanceNr -function GetProcessList > /dev/null 2>&1 + srvrc=$? + done + + if [ $srvrc -ne 1 ] + then + ocf_log info "sapstartsrv for instance $SID-$InstanceName was restarted !" + chkrc=$OCF_SUCCESS + else + ocf_log error "sapstartsrv for instance $SID-$InstanceName could not be started!" + chkrc=$OCF_ERR_GENERIC + ocf_is_probe && chkrc=$OCF_NOT_RUNNING + fi + fi + + return $chkrc + fi +} + + +# +# sapuserexit : Many SAP customers need some additional processes/tools to run their SAP systems. +# This specialties do not allow a totally generic SAP cluster resource agent. 
+# Someone should write a resource agent for each additional process you need, if it +# is required to monitor that process within the cluster manager. To enable +# you to extent this resource agent without developing a new one, this user exit +# was introduced. +# +sapuserexit() { + local NAME="$1" + local VALUE="$2" + + if [ -n "$VALUE" ] + then + if have_binary "$VALUE" + then + ocf_log info "Calling userexit ${NAME} with customer script file ${VALUE}" + "$VALUE" >/dev/null 2>&1 + ocf_log info "Exiting userexit ${NAME} with customer script file ${VALUE}, returncode: $?" + else + ocf_log warn "Attribute ${NAME} is set to ${VALUE}, but this file is not executable" + fi + fi + return 0 +} + + +# +# cleanup_instance : remove resources (processes and shared memory) from a crashed instance) +# +cleanup_instance() { + pkill -9 -f -U $sidadm $InstanceName + ocf_log info "Terminated instance using 'pkill -9 -f -U $sidadm $InstanceName'" + + # it is necessary to call cleanipc as user sidadm if the system has 'vmcj/enable = ON' set - otherwise SHM-segments in /dev/shm/SAP_ES2* cannot be removed + su - $sidadm -c "cleanipc $InstanceNr remove" + ocf_log info "Tried to remove shared memory resources using 'cleanipc $InstanceNr remove' as user $sidadm" + + ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/kill.sap + ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/shutdown.sap + ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgcpid + ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgspid + + return 0 +} + +# +# sapinstance_start : Start the SAP instance +# +sapinstance_start() { + + sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT" + + local rc=$OCF_NOT_RUNNING + local output="" + local loopcount=0 + + while [ $loopcount -lt 2 ] + do + loopcount=$(($loopcount + 1)) + + check_sapstartsrv + rc=$? + if [ $rc -eq $OCF_SUCCESS ]; then + output=`$SAPCONTROL -nr $InstanceNr -function Start` + rc=$? 
+ ocf_log info "Starting SAP Instance $SID-$InstanceName: $output" + fi + + if [ $rc -ne 0 ] + then + ocf_log err "SAP Instance $SID-$InstanceName start failed." + return $OCF_ERR_GENERIC + fi + + local startrc=1 + while [ $startrc -gt 0 ] + do + local waittime_start=`date +%s` + output=`$SAPCONTROL -nr $InstanceNr -function WaitforStarted $OCF_RESKEY_START_WAITTIME 10` + startrc=$? + local waittime_stop=`date +%s` + + if [ $startrc -ne 0 ] + then + if [ $(($waittime_stop - $waittime_start)) -ge $OCF_RESKEY_START_WAITTIME ] + then + sapinstance_monitor NOLOG + if [ $? -eq $OCF_SUCCESS ] + then + output="START_WAITTIME ($OCF_RESKEY_START_WAITTIME) has elapsed, but instance monitor returned SUCCESS. Instance considered running." + startrc=0; loopcount=2 + fi + else + if [ $loopcount -eq 1 ] && ocf_is_true $OCF_RESKEY_AUTOMATIC_RECOVER + then + ocf_log warn "SAP Instance $SID-$InstanceName start failed: $output" + ocf_log warn "Try to recover $SID-$InstanceName" + cleanup_instance + else + loopcount=2 + fi + startrc=-1 + fi + else + loopcount=2 + fi + done + done + + if [ $startrc -eq 0 ] + then + ocf_log info "SAP Instance $SID-$InstanceName started: $output" + rc=$OCF_SUCCESS + sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" + if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 1 -l reboot; fi + else + ocf_log err "SAP Instance $SID-$InstanceName start failed: $output" + rc=$OCF_NOT_RUNNING + if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 0 -l reboot; fi + fi + + return $rc +} + + +# +# sapinstance_recover: Try startup of failed instance by cleaning up resources +# +sapinstance_recover() { + cleanup_instance + sapinstance_start + return $? 
+} + + +# +# sapinstance_stop: Stop the SAP instance +# +sapinstance_stop() { + local output="" + local rc + + sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" + + if [ "$OCF_RESKEY_SHUTDOWN_METHOD" = "KILL" ] + then + ocf_log info "Stopping SAP Instance $SID-$InstanceName with shutdown method KILL!" + cleanup_instance + return $OCF_SUCCESS + fi + + check_sapstartsrv + rc=$? + if [ $rc -eq $OCF_SUCCESS ]; then + output=`$SAPCONTROL -nr $InstanceNr -function Stop` + rc=$? + ocf_log info "Stopping SAP Instance $SID-$InstanceName: $output" + fi + + if [ $rc -eq 0 ] + then + output=`$SAPCONTROL -nr $InstanceNr -function WaitforStopped 3600 1` + if [ $? -eq 0 ] + then + ocf_log info "SAP Instance $SID-$InstanceName stopped: $output" + rc=$OCF_SUCCESS + else + ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" + rc=$OCF_ERR_GENERIC + fi + else + ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" + rc=$OCF_ERR_GENERIC + fi + + sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" + if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 0 -l reboot; fi + + return $rc +} + + +# +# sapinstance_monitor: Can the given SAP instance do anything useful? +# +sapinstance_monitor() { + local MONLOG=$1 + local rc + + if ocf_is_probe && ocf_is_true "$OCF_RESKEY_MINIMAL_PROBE"; then + # code for minimal probe: # grep for sapstartsrv and maybe also for sapstart + # TODO: Do we need to improve this minimal test? + if pgrep -f -l "sapstartsrv .*pf=.*${SID}_${InstanceName}_${SAPVIRHOST}"; then + rc="$OCF_SUCCESS" + elif pgrep -f -l "sapstart .*pf=.*${SID}_${InstanceName}_${SAPVIRHOST}"; then + rc="$OCF_SUCCESS" + else + rc="$OCF_NOT_RUNNING" + fi + else + # standard probe and monitoring code + check_sapstartsrv + rc=$? 
+ fi + + if [ $rc -eq $OCF_SUCCESS ] + then + local count=0 + local SERVNO + local output + + output=`$SAPCONTROL -nr $InstanceNr -function GetProcessList -format script` + + # we have to parse the output, because the returncode doesn't tell anything about the instance status + for SERVNO in `echo "$output" | grep '^[0-9] ' | cut -d' ' -f1 | sort -u` + do + local COLOR=`echo "$output" | grep "^$SERVNO dispstatus: " | cut -d' ' -f3` + local SERVICE=`echo "$output" | grep "^$SERVNO name: " | cut -d' ' -f3` + local STATE=0 + local SEARCH + + case $COLOR in + GREEN|YELLOW) STATE=$OCF_SUCCESS;; + *) STATE=$OCF_NOT_RUNNING;; + esac + + SEARCH=`echo "$OCF_RESKEY_MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'` + if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ] + then + if [ $STATE -eq $OCF_NOT_RUNNING ] + then + [ "$MONLOG" != "NOLOG" ] && ocf_log err "SAP instance service $SERVICE is not running with status $COLOR !" + rc=$STATE + fi + count=1 + fi + done + + if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ] + then + if ocf_is_probe + then + rc=$OCF_NOT_RUNNING + else + [ "$MONLOG" != "NOLOG" ] && ocf_log err "The SAP instance does not run any services which this RA could monitor!" + rc=$OCF_ERR_GENERIC + fi + fi + fi + + return $rc +} + + +# +# sapinstance_status: Lightweight check of SAP instance only with OS tools +# +sapinstance_status() { + local pid + local pids + + [ ! -f "/usr/sap/$SID/$InstanceName/work/kill.sap" ] && return $OCF_NOT_RUNNING + pids=$(awk '$3 ~ "^[0-9]+$" { print $3 }' /usr/sap/$SID/$InstanceName/work/kill.sap) + for pid in $pids + do + [ `pgrep -f -U $sidadm $InstanceName | grep -c $pid` -gt 0 ] && return $OCF_SUCCESS + done + return $OCF_NOT_RUNNING +} + + +# +# sapinstance_validate: Check the semantics of the input parameters +# +sapinstance_validate() { + local rc=$OCF_SUCCESS + if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] + then + ocf_log err "Parsing instance profile name: '$SID' is not a valid system ID!" 
+ rc=$OCF_ERR_ARGS + fi + + if [ `echo "$InstanceName" | grep -c '^[A-Z].*[0-9][0-9]$'` -ne 1 ] + then + ocf_log err "Parsing instance profile name: '$InstanceName' is not a valid instance name!" + rc=$OCF_ERR_ARGS + fi + + if [ `echo "$InstanceNr" | grep -c '^[0-9][0-9]$'` -ne 1 ] + then + ocf_log err "Parsing instance profile name: '$InstanceNr' is not a valid instance number!" + rc=$OCF_ERR_ARGS + fi + + if [ `echo "$SAPVIRHOST" | grep -c '^[A-Za-z][A-Za-z0-9_-]*$'` -ne 1 ] + then + ocf_log err "Parsing instance profile name: '$SAPVIRHOST' is not a valid hostname!" + rc=$OCF_ERR_ARGS + fi + + return $rc +} + + +# +# sapinstance_start_clone +# +sapinstance_start_clone() { + sapinstance_init $OCF_RESKEY_ERS_InstanceName + ${HA_SBIN_DIR}/crm_master -v 50 -l reboot + sapinstance_start + return $? +} + + +# +# sapinstance_stop_clone +# +sapinstance_stop_clone() { + sapinstance_init $OCF_RESKEY_ERS_InstanceName + ${HA_SBIN_DIR}/crm_master -v 0 -l reboot + sapinstance_stop + return $? +} + + +# +# sapinstance_monitor_clone +# +sapinstance_monitor_clone() { + # first check with the status function (OS tools) if there could be something like a SAP instance running + # as we do not know here, if we are in master or slave state we do not want to start our monitoring + # agents (sapstartsrv) on the wrong host + local rc + + sapinstance_init $OCF_RESKEY_InstanceName + if sapinstance_status; then + if sapinstance_monitor; then + ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot + return $OCF_RUNNING_MASTER + fi + # by nature of the SAP enqueue server we have to make sure + # that we do a failover to the slave (enqueue replication server) + # in case the enqueue process has failed. We signal this to the + # cluster by setting our master preference to a lower value than the slave. + ${HA_SBIN_DIR}/crm_master -v 10 -l reboot + return $OCF_FAILED_MASTER + fi + + sapinstance_init $OCF_RESKEY_ERS_InstanceName + sapinstance_status && sapinstance_monitor + rc=$? 
+ if [ $rc -eq $OCF_SUCCESS ]; then + ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot + fi + return $rc +} + + +# +# sapinstance_promote_clone: In a Master/Slave configuration get Master by starting the SCS instance and stopping the ERS instance +# The order is important here to behave correct from the application levels view +# +sapinstance_promote_clone() { + local rc + + sapinstance_init $OCF_RESKEY_InstanceName + ocf_log info "Promoting $SID-$InstanceName to running Master." + sapinstance_start + rc=$? + + if [ $rc -eq $OCF_SUCCESS ]; then + sapinstance_init $OCF_RESKEY_ERS_InstanceName + sapinstance_stop + rc=$? + fi + + return $rc +} + + +# +# sapinstance_demote_clone: In a Master/Slave configuration get Slave by stopping the SCS instance and starting the ERS instance +# +sapinstance_demote_clone() { + local rc + + sapinstance_init $OCF_RESKEY_InstanceName + ocf_log info "Demoting $SID-$InstanceName to a slave." + sapinstance_stop + rc=$? + + if [ $rc -eq $OCF_SUCCESS ]; then + sapinstance_init $OCF_RESKEY_ERS_InstanceName + sapinstance_start + rc=$? + fi + + return $rc +} + + +# +# sapinstance_notify: Handle master scoring - to make sure a slave gets the next master +# +sapinstance_notify() { + local n_type="$OCF_RESKEY_CRM_meta_notify_type" + local n_op="$OCF_RESKEY_CRM_meta_notify_operation" + + if [ "${n_type}_${n_op}" = "post_promote" ]; then + # After promotion of one master in the cluster, we make sure that all clones reset their master + # value back to 100. This is because a failed monitor on a master might have degree one clone + # instance to score 10. + ${HA_SBIN_DIR}/crm_master -v 100 -l reboot + elif [ "${n_type}_${n_op}" = "pre_demote" ]; then + # if we are a slave and a demote event is announced, make sure we are highest on the list to become master + # that is, when a slave resource was started after the promote event of an already running master (e.g. 
node of slave was down) + # We also have to make sure to overrule the globally set resource_stickiness or any fail-count factors => INFINITY + local n_uname="$OCF_RESKEY_CRM_meta_notify_demote_uname" + if [ ${n_uname} != ${NODENAME} ]; then + ${HA_SBIN_DIR}/crm_master -v INFINITY -l reboot + fi + fi +} + + +# +# 'main' starts here... +# + +## GLOBALS +SID="" +sidadm="" +InstanceName="" +InstanceNr="" +SAPVIRHOST="" +DIR_EXECUTABLE="" +SAPSTARTSRV="" +SAPCONTROL="" +DIR_PROFILE="" +SAPSTARTPROFILE="" +CLONE=0 +NODENAME=$(ocf_local_nodename) + + +if + ( [ $# -ne 1 ] ) +then + sapinstance_usage + exit $OCF_ERR_ARGS +fi + +ACTION=$1 +if [ "$ACTION" = "status" ]; then + ACTION=monitor +fi + +# These operations don't require OCF instance parameters to be set +case "$ACTION" in + usage|methods) sapinstance_$ACTION + exit $OCF_SUCCESS;; + meta-data) sapinstance_meta_data + exit $OCF_SUCCESS;; + notify) sapinstance_notify + exit $OCF_SUCCESS;; + *);; +esac + +if ! ocf_is_root +then + ocf_log err "$0 must be run as root" + exit $OCF_ERR_PERM +fi + +# parameter check +if [ -z "$OCF_RESKEY_InstanceName" ] +then + ocf_log err "Please set OCF_RESKEY_InstanceName to the name to the SAP instance profile!" + exit $OCF_ERR_ARGS +fi + +is_clone; CLONE=$? +if [ ${CLONE} -eq 1 ] +then + CLACT=_clone +else + if [ "$ACTION" = "promote" -o "$ACTION" = "demote" ] + then + ocf_log err "$ACTION called in a non master/slave environment" + exit $OCF_ERR_ARGS + fi + sapinstance_init $OCF_RESKEY_InstanceName +fi + +# What kind of method was invoked? 
+case "$ACTION" in + start|stop|monitor|promote|demote) sapinstance_$ACTION$CLACT + exit $?;; + validate-all) sapinstance_validate + exit $?;; + reload ) + ocf_log info "reloading SAPInstance parameters" + exit $OCF_SUCCESS;; + *) sapinstance_methods + exit $OCF_ERR_UNIMPLEMENTED;; +esac diff --git a/heartbeat/SendArp b/heartbeat/SendArp new file mode 100755 index 0000000..5af7bec --- /dev/null +++ b/heartbeat/SendArp @@ -0,0 +1,277 @@ +#!/bin/sh +# +# +# Copyright (c) 2006, Huang Zhen <zhen.huang@gmail.com> +# Converting original heartbeat RA to OCF RA. +# +# Copyright (C) 2004 Horms <horms@verge.net.au> +# +# Based on IPaddr2: Copyright (C) 2003 Tuomo Soini <tis@foobar.fi> +# +# License: GNU General Public License (GPL) +# Support: users@clusterlabs.org +# +# This script send out gratuitous Arp for an IP address +# +# It can be used _instead_ of the IPaddr2 or IPaddr resource +# to send gratuitous arp for an IP address on a given interface, +# without adding the address to that interface. I.e. if for +# some reason you want to send gratuitous arp for addresses +# managed by IPaddr2 or IPaddr on an additional interface. +# +# OCF parameters are as below: +# OCF_RESKEY_ip +# OCF_RESKEY_nic +# +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. 
Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_ip_default="" +OCF_RESKEY_nic_default="" +OCF_RESKEY_background_default="true" + +: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}} +: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}} +: ${OCF_RESKEY_background=${OCF_RESKEY_background_default}} + +SENDARP=$HA_BIN/send_arp +SENDARPPIDDIR=${HA_RSCTMP} + +BASEIP="$OCF_RESKEY_ip" +INTERFACE="$OCF_RESKEY_nic" +RESIDUAL="" +SENDARPPIDFILE="$SENDARPPIDDIR/send_arp-$BASEIP" +BACKGROUND=${OCF_RESKEY_background} + +# Set default values + + : ${ARP_INTERVAL_MS=200} # milliseconds between ARPs + : ${ARP_REPEAT=5} # repeat count + : ${ARP_BACKGROUND=$BACKGROUND} # no to run in foreground + : ${ARP_NETMASK=ffffffffffff} # netmask for ARP + +####################################################################### + +sendarp_meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="SendArp" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This RA can be used _instead_ of the IPaddr2 or IPaddr RA to +send gratuitous ARP for an IP address on a given interface, +without adding the address to that interface. For example, +if for some reason you wanted to send gratuitous ARP for +addresses managed by IPaddr2 or IPaddr on an additional +interface. 
+</longdesc> +<shortdesc lang="en">Broadcasts unsolicited ARP announcements</shortdesc> + +<parameters> +<parameter name="ip" unique="0" required="1"> +<longdesc lang="en"> +The IP address for sending ARP packet. +</longdesc> +<shortdesc lang="en">IP address</shortdesc> +<content type="string" default="${OCF_RESKEY_ip_default}" /> +</parameter> + +<parameter name="nic" unique="0" required="1"> +<longdesc lang="en"> +The NIC for sending ARP packet. +</longdesc> +<shortdesc lang="en">NIC</shortdesc> +<content type="string" default="${OCF_RESKEY_nic_default}" /> +</parameter> + +<parameter name="background" unique="0" required="0"> +<longdesc lang="en"> +Send ARPs in background. Set to false if you want to test if +sending ARPs succeeded. +</longdesc> +<shortdesc lang="en">Send ARPs in background</shortdesc> +<content type="boolean" default="${OCF_RESKEY_background_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="20s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +sendarp_usage() { + cat <<END +usage: $0 {start|stop|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +# +# Send gratuitous arp +# +sendarp_start() { + local rc + + sendarp_validate + if [ $? = $OCF_ERR_CONFIGURED ]; then + return $OCF_ERR_CONFIGURED + fi + + sendarp_monitor + if [ $? = $OCF_SUCCESS ]; then + return $OCF_SUCCESS + fi + + [ -r ${HA_CONFDIR}/arp_config ] && . ${HA_CONFDIR}/arp_config + if [ -r "${HA_CONFDIR}/arp_config:${TARGET_INTERFACE}" ]; then + . 
"${HA_CONFDIR}/arp_config:${TARGET_INTERFACE}" + fi + + + ARGS="-i $ARP_INTERVAL_MS -r $ARP_REPEAT -p $SENDARPPIDFILE $INTERFACE $BASEIP auto $BASEIP $ARP_NETMASK" + ocf_log debug "$SENDARP $ARGS" + + rc=$OCF_SUCCESS + if ocf_is_true $ARP_BACKGROUND; then + # not possible to check the status without wait! we can + # just log the outcome + # and wait-ing would be equal to not running in + # background + ($SENDARP $ARGS || + ocf_exit_reason "Could not send gratuitous arps") & + else + $SENDARP $ARGS || { + ocf_exit_reason "Could not send gratuitous arps" + rc=$OCF_ERR_GENERIC + } + fi + + if [ $rc -eq $OCF_SUCCESS ]; then + ha_pseudo_resource SendArp_${OCF_RESOURCE_INSTANCE} start + fi + return $rc +} + +# +# Stop sending gratuitous arp +# +sendarp_stop() { + sendarp_monitor + if [ $? -eq $OCF_NOT_RUNNING ]; then + return $OCF_SUCCESS + fi + + rc=$OCF_SUCCESS + + if + [ -f "$SENDARPPIDFILE" ] + then + kill `cat "$SENDARPPIDFILE"` + rc=$? + case $rc in + 0) + ocf_log info "killed previously running send_arp for $BASEIP" + rm -f "$SENDARPPIDFILE" + rc=$OCF_SUCCESS + ;; + *) + ocf_log warn "Could not kill previously running send_arp for $BASEIP" + rc=$OCF_ERR_GENERIC + ;; + esac + fi + + case $rc in + $OCF_SUCCESS) + ocf_log info "SendArp for $BASEIP/$INTERFACE released" + ha_pseudo_resource SendArp_${OCF_RESOURCE_INSTANCE} stop + ;; + *) + ocf_log warn "SendArp for $BASEIP/$INTERFACE NOT released" + ;; + esac + return $rc +} +# +sendarp_monitor() { + if [ -f "$SENDARPPIDFILE" ]; then + return $OCF_SUCCESS + fi + ha_pseudo_resource SendArp_${OCF_RESOURCE_INSTANCE} monitor +} + +sendarp_validate() { + if [ -z "$INTERFACE" -o -z "$BASEIP" -o -n "$RESIDUAL" ] + then + return $OCF_ERR_CONFIGURED + fi + return $OCF_SUCCESS +} + +case $__OCF_ACTION in +meta-data) sendarp_meta_data + exit $OCF_SUCCESS + ;; +start) sendarp_start + ;; +stop) sendarp_stop + ;; +monitor) sendarp_monitor + ;; +status) sendarp_monitor + if [ $? 
= $OCF_SUCCESS ]; then + echo "running" + exit $OCF_SUCCESS; + else + echo "stopped" + exit $OCF_NOT_RUNNING; + fi + ;; +validate-all) sendarp_validate + ;; +usage|help) sendarp_usage + exit $OCF_SUCCESS + ;; +*) sendarp_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + diff --git a/heartbeat/ServeRAID b/heartbeat/ServeRAID new file mode 100755 index 0000000..6d6596f --- /dev/null +++ b/heartbeat/ServeRAID @@ -0,0 +1,427 @@ +#!/bin/sh +# +# +# ServeRAID +# +# Description: Enables/Disables shared ServeRAID merge groups +# +# Author: Alan Robertson, Renzo Alejandro Granados +# +# Support: users@clusterlabs.org +# +# License: GNU General Public License (GPL) +# +# Copyright: (C) 2002-2005 International Business Machines +# (C) 2002 Renzo Alejandro Granados +# +# usage: ./ServeRAID (start|stop|status|monitor|validate-all|meta-data) +# +# OCF parameters are as below: +# OCF_RESKEY_serveraid +# (Adapter number of the ServeRAID adapter) +# OCF_RESKEY_mergegroup +# (MergeGroup # of the logical drive under consideration) +# +# The ServeRAID clustering model is a bit odd, and its terminology needs +# a little explanation +# +# Logical Volume - a particular SCSI id {target id and LUN} on +# a particular controller. +# +# Merge Group - when active on one side or the other of the ServeRAID +# configuration it corresponds with a logical drive. +# Merge group numbers are permanently assigned to a particular +# chunk of storage. Shared merge groups are in the +# range of 1 to 8, and are largely arbitrary. +# Unshared merge groups start at 200. +# We can only deal with shared merge groups. When a merge +# group is activated on one of the controllers, it becomes +# a logical volume on that system. NOTE: The order in +# which the Merge Groups are activated determines which +# SCSI Ids they become. This makes for extra headaches +# for this script to deal with. 
It also means that if +# you have more than one shared ServeRAID merge group on +# a particular controller, that the SCSI IDs will not +# be constant. This requires mounting by uuid or label. +# +# One of the ServerRAID controllers has to be configured with +# SCSI initiator ID 6, and the other with SCSI id 7. +# +# At this time, the ServeRAID clustering solution only works with +# RAID 1 setups. It does NOT support RAID 5. This is a firmware +# bug in the ServeRAID where it doesn't fail over correctly +# if the RAID5 array is in a critical state... +# +# Note that this script requires ServeRAID software version 6.10 or +# later. This software is now available from IBM. +# +# An example usage in /etc/ha.d/haresources: +# node1 10.0.0.170 ServeRAID::1::1 +# + +# Older ServeRAID utility returns 1 when it succeeds (weird) +# BUT - the newly released version is more normal... + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_serveraid_default="" +OCF_RESKEY_mergegroup_default="" + +: ${OCF_RESKEY_serveraid=${OCF_RESKEY_serveraid_default}} +: ${OCF_RESKEY_mergegroup=${OCF_RESKEY_mergegroup_default}} + +####################################################################### + +srsuccess=0 +SCSI="scsi " + +usage() { + cat <<-EOF + usage: $0 (start|stop|status|monitor|validate-all|meta-data) + + You have to set the following environment virables before running $0 : + OCF_RESKEY_serveraid + (Adapter number of the ServeRAID adapter) + OCF_RESKEY_mergegroup + (MergeGroup # of the logical drive under consideration) + + ServeRAID adapters are numbered starting from 1. + + The shared merge group number is a number between 1 and 8 inclusive. + It indicates to the controller which logical disk to fail over. 
+ + node1 10.0.0.170 ServeRAID::1::1 + + PREREQUISITES: + You must configure your ServeRAID adapters for clustering for this + to work. + + To do this, you must use the bootable "ServeRAID Support CD" and right + click your controller and pick "configure for clustering". The Linux + version of the ServeRAID manager does not have the "configure for + clustering" option. + + You will need at least version 6.10 (~July 2003 release) of the ipssend + command for this script to work. + + EOF +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="ServeRAID" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for ServeRAID. It enables/disables shared ServeRAID merge groups. +</longdesc> +<shortdesc lang="en">Enables and disables shared ServeRAID merge groups</shortdesc> + +<parameters> +<parameter name="serveraid" unique="0" required="1"> +<longdesc lang="en"> +The adapter number of the ServeRAID adapter. +</longdesc> +<shortdesc lang="en">serveraid</shortdesc> +<content type="integer" default="${OCF_RESKEY_serveraid_default}" /> +</parameter> + +<parameter name="mergegroup" unique="0" required="1"> +<longdesc lang="en"> +The logical drive under consideration. +</longdesc> +<shortdesc lang="en">mergegroup</shortdesc> +<content type="integer" default="${OCF_RESKEY_mergegroup_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="40s" /> +<action name="stop" timeout="40s" /> +<action name="status" depth="0" timeout="20s" interval="10s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +<action name="methods" timeout="5s" /> +</actions> +</resource-agent> +END +} + +ServeRAID_methods() { + cat <<-! + start + stop + status + validate-all + methods + usage + meta-data + ! 
+} + +ServeRAIDSCSI="/proc/scsi/ips" + + +IPS=ipssend +proc_scsi=/proc/scsi/scsi + + +parseinst() { + sr_adapter=error + sr_mergegroup=error + hostid=error + sr_logicaldrivenumber=error + if + [ $# -ne 2 ] + then + ocf_log err "Invalid ServeRAID instance: $*" + exit $OCF_ERR_ARGS + fi + PerlScript='next unless /^Host/; $_ .= <>.<>; print "$1 " if /SERVERAID/ and /Proces/ and /scsi(\d+)/' + # Get the list of host ids of the ServeRAID host adapters + hostlist=`$PERL -ne "${PerlScript}" <$proc_scsi` + # Figure the host id of the desired ServeRAID adapter + hostid=`echo $hostlist | cut -d' ' -f$1` + if + [ ! -f "$ServeRAIDSCSI/$hostid" ] + then + ocf_log err "No such ServeRAID adapter: $1" + exit $OCF_ERR_ARGS + fi + + case $2 in + [1-8]);; + *) ocf_log err "Invalid Shared Merge Group Number: $2" + exit $OCF_ERR_ARGS;; + esac + sr_adapter=$1 + sr_mergegroup=$2 + CheckRaidLevel + return $? +} + +SRLogicalDriveConfig() { + $IPS getconfig $sr_adapter ld +} + +MergeGroupToSCSI_ID() { + + PerlScript="while (<>) { + /logical drive number *([0-9]+)/i && (\$ld=\$1); + /part of merge group *: *$sr_mergegroup *\$/i && print \$ld - 1, \"\n\"; + }" + + ID=`SRLogicalDriveConfig | $PERL -e "$PerlScript"` + case $ID in + [0-9]*) echo "$ID"; return 0;; + *) return 1;; + esac +} + +MergeGroupRaidLevel() { + + PerlScript="while (<>) { + /RAID level *: *([0-9]+[A-Za-z]*)/i && (\$ld=\$1); + /part of merge group *: *$sr_mergegroup *\$/i && print \$ld, \"\n\"; + }" + + Level=`SRLogicalDriveConfig | $PERL -e "$PerlScript"` + case $Level in + ?*) echo "$Level"; return 0;; + *) return 1;; + esac +} + +CheckRaidLevel() { + RAIDlevel=`MergeGroupRaidLevel` + case $RAIDlevel in + *5*) + ocf_log err "ServeRAID device $sr_adapter $sr_mergegroup is RAID level $RAIDlevel" + ocf_log err "This level of ServeRAID RAID is not supported for failover by the firmware." 
+ exit $OCF_ERR_GENERIC;; + esac + return $OCF_SUCCESS +} + + + + +ReleaseSCSI() { + targetid=`MergeGroupToSCSI_ID` + echo "${SCSI}remove-single-device $hostid 0 $targetid 0" > $proc_scsi +} + +AddSCSI() { + targetid=`MergeGroupToSCSI_ID` + echo "${SCSI}add-single-device $hostid 0 $targetid 0" > $proc_scsi +} + +# +# start: Enable the given ServeRAID device +# +ServeRAID_start() { + if + ServeRAID_status $serveraid $mergegroup + then + ocf_log debug "ServeRAID merge group $serveraid $mergegroup is running." + return $OCF_SUCCESS + else + if + # + # Normally we do a MERGE PARTNER, but if we still own the drive for + # some reason, then we'll need to do a MERGE OWN instead... + # + out=`$IPS MERGE $sr_adapter $sr_mergegroup PARTNER 2>&1` + if + [ $? -eq $srsuccess ] + then + ocf_log info "$out" + else + ocf_run $IPS MERGE $sr_adapter $sr_mergegroup OWN + fi + then + : OK All is well! + targetid=`MergeGroupToSCSI_ID` + sr_logicaldrivenumber=`expr $targetid + 1` + #run $IPS SYNCH $sr_adapter $sr_logicaldrivenumber & + # This version of the SYNCH command requires the 6.10 or later + # ServeRAID support CD. + # To avoid issues when called by lrmd, redirect stdout->stderr. + # Use () to create a subshell to make the redirection be synchronized. + ( ocf_run $IPS SYNCH $sr_adapter $sr_mergegroup & ) >&2 + AddSCSI + else + return $OCF_ERR_GENERIC + fi + fi + if + ServeRAID_status "$@" + then + return $OCF_SUCCESS + else + ocf_log err "ServeRAID device $1 not active!" + exit $OCF_ERR_GENERIC + fi +} + + +# +# stop: Disable the given ServeRAID device +# +ServeRAID_stop() { + parseinst "$@" + ReleaseSCSI + if + ocf_run $IPS UNMERGE $sr_adapter $sr_mergegroup + then + : UNMERGE $sr_adapter $sr_mergegroup worked + fi + if + ServeRAID_status "$@" + then + ocf_log err "ServeRAID device $* is still active!" + return $OCF_ERR_GENERIC + else + return $OCF_SUCCESS + fi +} + + +# +# status: is the given device now available? 
+# +ServeRAID_status() { + parseinst "$@" + # + # The output we're looking for + # Part of merge group : 2 + # + SRLogicalDriveConfig \ + | grep -i "part of merge group[ ]*: *$sr_mergegroup *\$" >/dev/null +} + +# +# validate_all: are the OCF instance parameters valid? +# +ServeRAID_validate_all() { + check_binary $PERL + +# parseinst() will do all the work... + parseinst "$@" + return $? +} + +if + ( [ $# -ne 1 ] ) +then + usage + exit $OCF_ERR_ARGS +fi + +# These operations don't require OCF instance parameters to be set +case "$1" in + meta-data) + meta_data + exit $OCF_SUCCESS;; +# +# methods: What methods do we support? +# + methods) + ServeRAID_methods + exit $?;; + usage) + usage + exit $OCF_SUCCESS;; + *) + ;; +esac + +if + ( [ -z "$OCF_RESKEY_serveraid" ] || [ -z "$OCF_RESKEY_mergegroup" ] ) +then + ocf_log err "You have to set the OCF_RESKEY_serveraid and OCF_RESKEY_mergegroup\n + enviroment virables before running $0 !" +# usage + exit $OCF_ERR_GENERIC +fi + +: Right Number of arguments.. +serveraid=$OCF_RESKEY_serveraid +mergegroup=$OCF_RESKEY_mergegroup + +# Look for the start, stop, status, or methods calls... +case "$1" in + stop) + ServeRAID_stop $serveraid $mergegroup + exit $?;; + start) + ServeRAID_start $serveraid $mergegroup + exit $?;; + status|monitor) + if + ServeRAID_status $serveraid $mergegroup + then + ocf_log debug "ServeRAID merge group $serveraid $mergegroup is running." + exit $OCF_SUCCESS + else + ocf_log debug "ServeRAID merge group $serveraid $mergegroup is stopped." + exit $OCF_NOT_RUNNING + fi + exit $?;; + validate-all) + ServeRAID_validate_all $serveraid $mergegroup + exit $?;; + *) + usage + exit $OCF_ERR_UNIMPLEMENTED;; + +esac diff --git a/heartbeat/SphinxSearchDaemon b/heartbeat/SphinxSearchDaemon new file mode 100755 index 0000000..d4e9e85 --- /dev/null +++ b/heartbeat/SphinxSearchDaemon @@ -0,0 +1,230 @@ +#!/bin/sh +# +# +# Searchd OCF RA. 
# Manages the Sphinx search daemon
#
# Copyright (c) 2007 Christian Rishoj (christian@rishoj.net)
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_config_default="/etc/sphinx/sphinx.conf"
OCF_RESKEY_searchd_default="/usr/local/bin/searchd"
OCF_RESKEY_search_default="/usr/local/bin/search"
OCF_RESKEY_testQuery_default="Heartbeat_Monitor_Query_Match_string"

: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
: ${OCF_RESKEY_searchd=${OCF_RESKEY_searchd_default}}
: ${OCF_RESKEY_search=${OCF_RESKEY_search_default}}
: ${OCF_RESKEY_testQuery=${OCF_RESKEY_testQuery_default}}

#######################################################################

# meta_data: print the OCF resource-agent metadata (XML) on stdout.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="SphinxSearchDaemon" version="1.0">
<version>1.0</version>

<longdesc lang="en">
This is a searchd Resource Agent. It manages the Sphinx Search Daemon.
</longdesc>
<shortdesc lang="en">Manages the Sphinx search daemon.</shortdesc>

<parameters>

<parameter name="config" required="0" unique="1">
<longdesc lang="en">
searchd configuration file
</longdesc>
<shortdesc lang="en">Configuration file</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>

<parameter name="searchd" required="0" unique="0">
<longdesc lang="en">
searchd binary
</longdesc>
<shortdesc lang="en">searchd binary</shortdesc>
<content type="string" default="${OCF_RESKEY_searchd_default}" />
</parameter>

<parameter name="search" required="0" unique="0">
<longdesc lang="en">
Search binary for functional testing in the monitor action.
</longdesc>
<shortdesc lang="en">search binary</shortdesc>
<content type="string" default="${OCF_RESKEY_search_default}" />
</parameter>

<parameter name="testQuery" required="0" unique="0">
<longdesc lang="en">
Test query for functional testing in the monitor action.
The query does not need to match any documents in the index.
The purpose is merely to test whether the search daemon
is able to query its indices and respond properly.
</longdesc>
<shortdesc lang="en">test query</shortdesc>
<content type="string" default="${OCF_RESKEY_testQuery_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
<action name="monitor" timeout="20s" interval="10s" depth="0" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="20s" />
</actions>
</resource-agent>
END
}

#######################################################################

# searchd_usage: print a one-line synopsis of the supported actions.
searchd_usage() {
    cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# searchd_start: launch searchd unless it is already running, then confirm
# with a test query.
# Returns OCF_SUCCESS when the test query succeeds, OCF_ERR_GENERIC when
# validation, the launch or the test query fails.
searchd_start() {
    if ! searchd_validate ; then
        return $OCF_ERR_GENERIC
    fi
    if ! searchd_status ; then
        # Paths are quoted so that a config or binary path containing
        # whitespace is passed through as a single argument.
        "$OCF_RESKEY_searchd" --config "$OCF_RESKEY_config"
        if [ $? != 0 ]; then
            return $OCF_ERR_GENERIC
        fi
    fi
    if searchd_check ; then
        return $OCF_SUCCESS
    else
        return $OCF_ERR_GENERIC
    fi
}

# searchd_stop: ask a running searchd to stop via "searchd --stop".
# Returns OCF_SUCCESS when the daemon is not running or the stop command
# succeeds, OCF_ERR_GENERIC otherwise.
searchd_stop() {
    if ! searchd_validate ; then
        return $OCF_ERR_GENERIC
    fi
    if searchd_status ; then
        "$OCF_RESKEY_searchd" --config "$OCF_RESKEY_config" --stop
        if [ $? = 0 ]; then
            return $OCF_SUCCESS
        else
            return $OCF_ERR_GENERIC
        fi
    else
        return $OCF_SUCCESS
    fi
}

# isRunning <pid>: true when the pid can be signalled and ps reports exactly
# one matching line containing "searchd".
# Both stdout and stderr of kill are discarded so that probing a dead pid
# does not pollute the agent's output.
isRunning()
{
    kill -s 0 "$1" > /dev/null 2>&1 && [ `ps -p "$1" | grep -c searchd` -eq 1 ]
}

# searchd_status: true when the pid recorded in the pid_file named by the
# searchd configuration belongs to a running searchd.
searchd_status() {
    # Take the value of the "pid_file = ..." setting, ignoring lines that
    # start with '#'.
    pidfile=`grep -v "^#" "$OCF_RESKEY_config" | grep -w pid_file | awk -F "[ \t]*=[ \t]*" '{ print $2 }'`
    if [ -f "$pidfile" ] ; then
        PID=`head -n 1 "$pidfile"`
        if [ -n "$PID" ] && isRunning "$PID" ; then
            return 0
        fi
    fi
    false
}

# searchd_check: functional test; runs the configured test query through the
# search binary and returns its exit status.
searchd_check() {
    "$OCF_RESKEY_search" --config "$OCF_RESKEY_config" --noinfo "$OCF_RESKEY_testQuery" > /dev/null
}

# searchd_monitor: OCF_NOT_RUNNING when the setup is invalid or the daemon
# is down, OCF_SUCCESS when it is up and answers the test query,
# OCF_ERR_GENERIC when it is up but the test query fails.
searchd_monitor() {
    if ! searchd_validate ; then
        return $OCF_NOT_RUNNING
    fi
    if searchd_status ; then
        if searchd_check ; then
            return $OCF_SUCCESS
        else
            return $OCF_ERR_GENERIC
        fi
    else
        return $OCF_NOT_RUNNING
    fi
}

# searchd_validate: check that both binaries are executable and the
# configuration file exists.
# Returns OCF_SUCCESS, or OCF_ERR_ARGS after logging the first problem found.
searchd_validate() {
    if [ ! -x "$OCF_RESKEY_search" ]; then
        ocf_log err "search binary '$OCF_RESKEY_search' does not exist or cannot be executed"
        return $OCF_ERR_ARGS
    fi

    if [ ! -x "$OCF_RESKEY_searchd" ]; then
        ocf_log err "searchd binary '$OCF_RESKEY_searchd' does not exist or cannot be executed"
        return $OCF_ERR_ARGS
    fi

    if [ ! -f "$OCF_RESKEY_config" ]; then
        ocf_log err "config file '$OCF_RESKEY_config' does not exist"
        return $OCF_ERR_ARGS
    fi

    return $OCF_SUCCESS
}

# Dispatch on the requested action; the action's return code becomes the
# exit code of the script.
case $__OCF_ACTION in
meta-data)      meta_data
                exit $OCF_SUCCESS
                ;;
start)          searchd_start;;
stop)           searchd_stop;;
monitor)        searchd_monitor;;
validate-all)   searchd_validate;;
usage|help)     searchd_usage
                exit $OCF_SUCCESS
                ;;
*)              searchd_usage
                exit $OCF_ERR_UNIMPLEMENTED
                ;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc

# diff --git a/heartbeat/Squid.in b/heartbeat/Squid.in
# new file mode 100644
# index 0000000..e574ad0
# --- /dev/null
# +++ b/heartbeat/Squid.in
# @@ -0,0 +1,472 @@
#!@BASH_SHELL@
#
# Description: Manages a Squid Server provided by NTT OSSC as an
# OCF High-Availability resource under Heartbeat/LinuxHA control
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
#
# Copyright (c) 2008 NIPPON TELEGRAPH AND TELEPHONE CORPORATION
#
#######################################################################
# OCF parameters:
#   OCF_RESKEY_squid_exe    : Executable file
#   OCF_RESKEY_squid_conf   : Configuration file
#   OCF_RESKEY_squid_opts   : Start options
#   OCF_RESKEY_squid_pidfile: Process id file
#   OCF_RESKEY_squid_port   : Port number
#   OCF_RESKEY_debug_mode   : Debug mode
#   OCF_RESKEY_debug_log    : Debug log file
#   OCF_RESKEY_squid_stop_timeout:
#                             Number of seconds to await to confirm a
#                             normal stop method
#
#   OCF_RESKEY_squid_exe, OCF_RESKEY_squid_conf, OCF_RESKEY_squid_pidfile
#   and OCF_RESKEY_squid_port must be specified. Each of the rests
#   has its default value or refers OCF_RESKEY_squid_conf to make
#   its value when no explicit value is given.
###############################################################################

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_squid_exe_default=""
OCF_RESKEY_squid_conf_default=""
OCF_RESKEY_squid_opts_default=""
OCF_RESKEY_squid_pidfile_default=""
OCF_RESKEY_squid_port_default=""
OCF_RESKEY_squid_stop_timeout_default="10"
OCF_RESKEY_debug_mode_default=""
OCF_RESKEY_debug_log_default=""

: ${OCF_RESKEY_squid_exe=${OCF_RESKEY_squid_exe_default}}
: ${OCF_RESKEY_squid_conf=${OCF_RESKEY_squid_conf_default}}
# squid_opts had a declared default that was never applied.
: ${OCF_RESKEY_squid_opts=${OCF_RESKEY_squid_opts_default}}
: ${OCF_RESKEY_squid_pidfile=${OCF_RESKEY_squid_pidfile_default}}
: ${OCF_RESKEY_squid_port=${OCF_RESKEY_squid_port_default}}
: ${OCF_RESKEY_squid_stop_timeout=${OCF_RESKEY_squid_stop_timeout_default}}
: ${OCF_RESKEY_debug_mode=${OCF_RESKEY_debug_mode_default}}
: ${OCF_RESKEY_debug_log=${OCF_RESKEY_debug_log_default}}

# usage: print the action synopsis; always returns OCF_ERR_ARGS.
usage()
{
	cat <<-!
usage: $0 action

action:
        start       : start a new squid instance

        stop        : stop the running squid instance

        status      : return the status of squid, run or down

        monitor     : return TRUE if the squid appears to be working.

        meta-data   : show meta data message

        validate-all: validate the instance parameters
!
	return $OCF_ERR_ARGS
}

# metadata_squid: print the OCF resource-agent metadata (XML) on stdout.
metadata_squid()
{
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="Squid" version="1.0">
<version>1.0</version>

<longdesc lang="en">
The resource agent of Squid.
This manages a Squid instance as an HA resource.
</longdesc>
<shortdesc lang="en">Manages a Squid proxy server instance</shortdesc>

<parameters>

<parameter name="squid_exe" required="1" unique="0">
<longdesc lang="en">
This is a required parameter. This parameter specifies squid's
executable file.
</longdesc>
<shortdesc lang="en">Executable file</shortdesc>
<content type="string" default="${OCF_RESKEY_squid_exe_default}"/>
</parameter>

<parameter name="squid_conf" required="1" unique="1">
<longdesc lang="en">
This is a required parameter. This parameter specifies a configuration file
for a squid instance managed by this RA.
</longdesc>
<shortdesc lang="en">Configuration file</shortdesc>
<content type="string" default="${OCF_RESKEY_squid_conf_default}"/>
</parameter>

<parameter name="squid_opts" required="0" unique="0">
<longdesc lang="en">
This is an optional parameter. This parameter specifies the start options.
</longdesc>
<shortdesc lang="en">Start options</shortdesc>
<content type="string" default="${OCF_RESKEY_squid_opts_default}"/>
</parameter>

<parameter name="squid_pidfile" required="0" unique="1">
<longdesc lang="en">Deprecated - do not use anymore</longdesc>
<shortdesc lang="en">deprecated - do not use anymore</shortdesc>
<content type="string" default="${OCF_RESKEY_squid_pidfile_default}"/>
</parameter>

<parameter name="squid_port" required="1" unique="1">
<longdesc lang="en">
This is a required parameter. This parameter specifies a port number
for a squid instance managed by this RA. If multiple ports are used,
you must specify only one of them.
</longdesc>
<shortdesc lang="en">Port number</shortdesc>
<content type="integer" default="${OCF_RESKEY_squid_port_default}"/>
</parameter>

<parameter name="squid_stop_timeout" unique="0">
<longdesc lang="en">
On stop, a squid shutdown is invoked first. If the resource
doesn't stop within this timeout, we resort to stopping
processes by sending signals and finally KILLing them.
</longdesc>
<shortdesc lang="en">how long to wait for squid shutdown to stop the
instance before resorting to kill</shortdesc>
<content type="integer" default="${OCF_RESKEY_squid_stop_timeout_default}"/>
</parameter>

<parameter name="debug_mode" unique="0">
<longdesc lang="en">
This is an optional parameter.
This RA runs in debug mode when this parameter includes 'x' or 'v'.
If 'x' is included, both of STDOUT and STDERR redirect to the logfile
specified by "debug_log", and then the builtin shell option 'x' is turned on.
It is similar about 'v'.
</longdesc>
<shortdesc lang="en">Debug mode</shortdesc>
<content type="string" default="${OCF_RESKEY_debug_mode_default}"/>
</parameter>

<parameter name="debug_log" unique="0">
<longdesc lang="en">
This is an optional parameter.
This parameter specifies a destination file for debug logs
and works only if this RA run in debug mode. Refer to "debug_mode"
about debug mode. If no value is given but is required, it's constructed
according to the following rules: "/var/log/" as a directory part,
the basename of the configuration file given by "squid_conf"
as a basename part, ".log" as a suffix.
</longdesc>
<shortdesc lang="en">A destination of the debug log</shortdesc>
<content type="string" default="${OCF_RESKEY_debug_log_default}"/>
</parameter>

</parameters>

<actions>
<action name="start" timeout="60s" />
<action name="stop" timeout="120s" />
<action name="status" timeout="60s" />
<action name="monitor" depth="0" timeout="30s" interval="10s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="5s"/>
</actions>
</resource-agent>
END

	return $OCF_SUCCESS
}

# get_pids: fill the global array SQUID_PIDS with three independent views of
# the running instance:
#   [0] pid(s) whose command line matches $PROCESS_PATTERN
#   [1] child of [0], kept only when its /proc/<pid>/exe is $SQUID_EXE
#   [2] pid listening on TCP port $SQUID_PORT (netstat, else ss)
get_pids()
{
	SQUID_PIDS=( )

	# Seek by pattern
	SQUID_PIDS[0]=$(pgrep -f "$PROCESS_PATTERN")

	# Seek by child process
	if [[ -n "${SQUID_PIDS[0]}" ]]; then
		SQUID_PIDS[1]=$(pgrep -P ${SQUID_PIDS[0]})
	fi

	if [[ -n "${SQUID_PIDS[1]}" ]]; then
		typeset exe
		exe=$(ls -l "/proc/${SQUID_PIDS[1]}/exe")
		if [[ $? = 0 ]]; then
			# keep only the link target printed after "-> "
			exe=${exe##*-> }
			if ! [[ "$exe" = $SQUID_EXE ]]; then
				SQUID_PIDS[1]=""
			fi
		else
			SQUID_PIDS[1]=""
		fi
	fi

	# Seek by port
	if have_binary netstat; then
		SQUID_PIDS[2]=$(
			netstat -apn |
			awk '/tcp.*:'$SQUID_PORT' .*LISTEN/ && $7~/^[1-9]/ {
				sub("\\/.*", "", $7); print $7; exit}')
	else
		SQUID_PIDS[2]=$(
			ss -apn |
			awk '/tcp.*LISTEN.*:'$SQUID_PORT'/ {
				sub(".*pid=", "", $7); sub(",fd=.*", "", $7); print $7 }')
	fi
}

# are_all_pids_found: true when all three SQUID_PIDS entries are non-empty.
are_all_pids_found()
{
	[[ -n "${SQUID_PIDS[0]}" ]] &&
	[[ -n "${SQUID_PIDS[1]}" ]] &&
	[[ -n "${SQUID_PIDS[2]}" ]]
}

# are_pids_sane: OCF_SUCCESS when the child pid equals the listening pid,
# otherwise sets an exit reason and returns OCF_ERR_GENERIC.
are_pids_sane()
{
	if [[ "${SQUID_PIDS[1]}" = "${SQUID_PIDS[2]}" ]]; then
		return $OCF_SUCCESS
	else
		ocf_exit_reason "$SQUID_NAME:Pid unmatch"
		return $OCF_ERR_GENERIC
	fi
}

# is_squid_dead: true when neither a matching process nor a listener exists.
is_squid_dead()
{
	[[ -z "${SQUID_PIDS[0]}" ]] &&
	[[ -z "${SQUID_PIDS[2]}" ]]
}

# monitor_squid: OCF_SUCCESS when all pids are found, OCF_NOT_RUNNING when
# squid is gone, OCF_ERR_GENERIC when the three views stay inconsistent for
# more than SQUID_CONFIRM_TRIALCOUNT one-second retries.
monitor_squid()
{
	typeset trialcount=0

	while true; do
		get_pids

		if are_all_pids_found; then
			# NOTE(review): the result of are_pids_sane is ignored here,
			# so a pid mismatch only records an exit reason and the
			# monitor still reports success. start_squid does honour it.
			# Confirm whether this leniency is intended.
			are_pids_sane
			return $OCF_SUCCESS
		fi

		if is_squid_dead; then
			return $OCF_NOT_RUNNING
		fi

		ocf_log info "$SQUID_NAME:Inconsistent processes:" \
			"${SQUID_PIDS[0]},${SQUID_PIDS[1]},${SQUID_PIDS[2]}"
		(( trialcount = trialcount + 1 ))
		if (( trialcount > SQUID_CONFIRM_TRIALCOUNT )); then
			ocf_exit_reason "$SQUID_NAME:Inconsistency of processes remains unsolved"
			return $OCF_ERR_GENERIC
		fi
		sleep 1
	done
}

# start_squid: start squid unless monitor_squid reports anything other than
# OCF_NOT_RUNNING, then wait until all pids are found and consistent.
# There is no internal time limit on the wait loop.
start_squid()
{
	typeset status

	monitor_squid
	status=$?

	if [[ $status != $OCF_NOT_RUNNING ]]; then
		return $status
	fi

	# $SQUID_OPTS is expanded unquoted on purpose: it holds a list of
	# options that must reach squid as separate arguments, and an empty
	# value must contribute no argument at all.
	ocf_run $SQUID_EXE -f "$SQUID_CONF" $SQUID_OPTS
	status=$?
	if [[ $status != $OCF_SUCCESS ]]; then
		return $OCF_ERR_GENERIC
	fi

	while true; do
		get_pids
		if are_all_pids_found && are_pids_sane; then
			return $OCF_SUCCESS
		fi
		ocf_log info "$SQUID_NAME:Waiting for squid to be invoked"
		sleep 1
	done

	return $OCF_ERR_GENERIC
}

# stop_squid: request a normal shutdown and wait up to SQUID_STOP_TIMEOUT
# seconds; after that (or when the shutdown request fails) send SIGKILL
# until no process and no listener remain.
stop_squid()
{
	typeset lapse_sec

	if ocf_run $SQUID_EXE -f $SQUID_CONF -k shutdown; then
		lapse_sec=0
		while true; do
			get_pids
			if is_squid_dead; then
				return $OCF_SUCCESS
			fi
			(( lapse_sec = lapse_sec + 1 ))
			if (( lapse_sec > SQUID_STOP_TIMEOUT )); then
				break
			fi
			sleep 1
			ocf_log info "$SQUID_NAME:$FUNCNAME:$LINENO: " \
				"stop NORM $lapse_sec/$SQUID_STOP_TIMEOUT"
		done
	fi

	while true; do
		get_pids
		# Check on freshly collected pids before signalling, so that kill
		# is never invoked without a pid and a successful kill is noticed
		# on the next pass instead of one pass late.
		if is_squid_dead; then
			return $OCF_SUCCESS
		fi
		ocf_log info "$SQUID_NAME:$FUNCNAME:$LINENO: " \
			"try to stop by SIGKILL:${SQUID_PIDS[0]} ${SQUID_PIDS[2]}"
		kill -KILL ${SQUID_PIDS[0]} ${SQUID_PIDS[2]}
		sleep 1
	done

	return $OCF_ERR_GENERIC
}

# status_squid: placeholder; unconditionally reports OCF_SUCCESS.
status_squid()
{
	return $OCF_SUCCESS
}


# validate_all_squid: logs the call and reports OCF_SUCCESS; the real checks
# are the top-level parameter tests below.
validate_all_squid()
{
	ocf_log info "validate_all_squid[$SQUID_NAME]"
	return $OCF_SUCCESS
}

: "=== Debug ${0##*/} $1 ==="

if [[ "$1" = "meta-data" ]]; then
	metadata_squid
	exit $?
fi

SQUID_CONF="${OCF_RESKEY_squid_conf}"
if [[ -z "$SQUID_CONF" ]]; then
	ocf_exit_reason "SQUID_CONF is not defined"
	exit $OCF_ERR_CONFIGURED
fi

# Instance name: basename of the configuration file without its extension.
SQUID_NAME="${SQUID_CONF##*/}"
SQUID_NAME="${SQUID_NAME%.*}"

# ":-" rather than "-": the parameter is always set (possibly to the empty
# string) by the defaults section above, so the fallback has to apply to an
# empty value as well.
DEBUG_LOG="${OCF_RESKEY_debug_log:-/var/log/squid_${SQUID_NAME}_debug}.log"

DEBUG_MODE=""
case $OCF_RESKEY_debug_mode in
	*x*) DEBUG_MODE="${DEBUG_MODE}x";;
esac
case $OCF_RESKEY_debug_mode in
	*v*) DEBUG_MODE="${DEBUG_MODE}v";;
esac

if [ -n "$DEBUG_MODE" ]; then
	PS4='\d \t \h '"${1-unknown} "
	export PS4
	exec 1>>$DEBUG_LOG 2>&1
	set -$DEBUG_MODE
fi

SQUID_EXE="${OCF_RESKEY_squid_exe}"
if [[ -z "$SQUID_EXE" ]]; then
	ocf_exit_reason "SQUID_EXE is not defined"
	exit $OCF_ERR_CONFIGURED
fi
if [[ ! -x "$SQUID_EXE" ]]; then
	ocf_exit_reason "$SQUID_EXE is not found"
	exit $OCF_ERR_CONFIGURED
fi

SQUID_PORT="${OCF_RESKEY_squid_port}"
if [[ -z "$SQUID_PORT" ]]; then
	ocf_exit_reason "SQUID_PORT is not defined"
	exit $OCF_ERR_CONFIGURED
fi

SQUID_OPTS="${OCF_RESKEY_squid_opts}"

SQUID_PIDS=( )

# ":-" so that an empty value cannot reach the arithmetic comparisons.
SQUID_CONFIRM_TRIALCOUNT="${OCF_RESKEY_squid_confirm_trialcount:-3}"

SQUID_STOP_TIMEOUT="${OCF_RESKEY_squid_stop_timeout:-10}"
SQUID_SUSPEND_TRIALCOUNT="${OCF_RESKEY_squid_suspend_trialcount:-10}"

PROCESS_PATTERN="$SQUID_EXE -f $SQUID_CONF"

COMMAND=$1

case "$COMMAND" in
	start)
		ocf_log debug "[$SQUID_NAME] Enter squid start"
		start_squid
		func_status=$?
		ocf_log debug "[$SQUID_NAME] Leave squid start $func_status"
		exit $func_status
		;;
	stop)
		ocf_log debug "[$SQUID_NAME] Enter squid stop"
		stop_squid
		func_status=$?
		ocf_log debug "[$SQUID_NAME] Leave squid stop $func_status"
		exit $func_status
		;;
	status)
		status_squid
		exit $?
		;;
	monitor)
		#ocf_log debug "[$SQUID_NAME] Enter squid monitor"
		monitor_squid
		func_status=$?
		#ocf_log debug "[$SQUID_NAME] Leave squid monitor $func_status"
		exit $func_status
		;;
	validate-all)
		validate_all_squid
		exit $?
		;;
	*)
		usage
		;;
esac

# vim: set sw=4 ts=4 :

# diff --git a/heartbeat/Stateful b/heartbeat/Stateful
# new file mode 100755
# index 0000000..72dd550
# --- /dev/null
# +++ b/heartbeat/Stateful
# @@ -0,0 +1,192 @@
#!/bin/sh
#
#
# Example of a stateful OCF Resource Agent.
#
# Copyright (c) 2006 Andrew Beekhof
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_state_default="${HA_RSCTMP}/Stateful-${OCF_RESOURCE_INSTANCE}.state"

: ${OCF_RESKEY_state=${OCF_RESKEY_state_default}}

#######################################################################

# meta_data: print the OCF resource-agent metadata (XML) and exit with
# OCF_SUCCESS.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="Stateful" version="1.0">
<version>1.0</version>

<longdesc lang="en">
This is an example resource agent that implements two states
</longdesc>
<shortdesc lang="en">Example stateful resource agent</shortdesc>

<parameters>

<parameter name="state" unique="1">
<longdesc lang="en">
Location to store the resource state in
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${OCF_RESKEY_state_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
<action name="promote" timeout="20s" />
<action name="demote" timeout="20s" />
<action name="monitor" depth="0" timeout="20s" interval="10s"/>
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="20s" />
</actions>
</resource-agent>
END
    exit $OCF_SUCCESS
}

#######################################################################

# stateful_usage <exit-code>: print the synopsis and exit with the given code.
stateful_usage() {
    cat <<END
usage: $0 {start|stop|promote|demote|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
    exit $1
}

# stateful_update <state>: record the new state in the state file.
# The path is quoted so that a state file location containing whitespace or
# glob characters is handled as one file.
stateful_update() {
    echo $1 > "${OCF_RESKEY_state}"
}

# stateful_check_state [<target>]
# Returns OCF_SUCCESS when the state file contains exactly <target>, or when
# no state file exists and <target> is empty; OCF_ERR_GENERIC otherwise.
stateful_check_state() {
    target=$1
    if [ -f "${OCF_RESKEY_state}" ]; then
        state=`cat "${OCF_RESKEY_state}"`
        if [ "x$target" = "x$state" ]; then
            return $OCF_SUCCESS
        fi

    else
        if [ "x$target" = "x" ]; then
            return $OCF_SUCCESS
        fi
    fi

    return $OCF_ERR_GENERIC
}

# stateful_start: enter the "slave" state and set the promotion score to 5.
stateful_start() {
    stateful_check_state master
    if [ $? = 0 ]; then
        # CRM Error - Should never happen
        return $OCF_RUNNING_MASTER
    fi
    stateful_update slave
    ocf_promotion_score -v 5
    return $OCF_SUCCESS
}

# stateful_demote: go back to "slave"; OCF_NOT_RUNNING when there is no
# state file at all.
stateful_demote() {
    stateful_check_state
    if [ $? = 0 ]; then
        # CRM Error - Should never happen
        return $OCF_NOT_RUNNING
    fi
    stateful_update slave
    ocf_promotion_score -v 5
    return $OCF_SUCCESS
}

# stateful_promote: switch to "master" and raise the promotion score to 10;
# OCF_NOT_RUNNING when there is no state file at all.
stateful_promote() {
    stateful_check_state
    if [ $? = 0 ]; then
        return $OCF_NOT_RUNNING
    fi
    stateful_update master
    ocf_promotion_score -v 10
    return $OCF_SUCCESS
}

# stateful_stop: drop the promotion score and remove the state file.
# Refuses (OCF_RUNNING_MASTER) while the recorded state is still "master".
stateful_stop() {
    ocf_promotion_score -D
    stateful_check_state master
    if [ $? = 0 ]; then
        # CRM Error - Should never happen
        return $OCF_RUNNING_MASTER
    fi
    if [ -f "${OCF_RESKEY_state}" ]; then
        rm "${OCF_RESKEY_state}"
    fi
    return $OCF_SUCCESS
}

# stateful_monitor: map the recorded state to an OCF return code:
# "master" -> OCF_RUNNING_MASTER, "slave" -> OCF_SUCCESS,
# anything else in an existing file -> OCF_ERR_GENERIC,
# no file -> OCF_NOT_RUNNING.
stateful_monitor() {
    stateful_check_state "master"
    if [ $? = 0 ]; then
        return $OCF_RUNNING_MASTER
    fi

    stateful_check_state "slave"
    if [ $? = 0 ]; then
        return $OCF_SUCCESS
    fi

    if [ -f "${OCF_RESKEY_state}" ]; then
        echo "File '${OCF_RESKEY_state}' exists but contains unexpected contents"
        cat "${OCF_RESKEY_state}"
        return $OCF_ERR_GENERIC
    fi
    return $OCF_NOT_RUNNING
}

# stateful_validate: nothing to check; exits with OCF_SUCCESS.
stateful_validate() {
    exit $OCF_SUCCESS
}

# Dispatch on the requested action; the action's return code becomes the
# exit code of the script.
case $__OCF_ACTION in
meta-data)      meta_data;;
start)          stateful_start;;
promote)        stateful_promote;;
demote)         stateful_demote;;
stop)           stateful_stop;;
monitor)        stateful_monitor;;
validate-all)   stateful_validate;;
usage|help)     stateful_usage $OCF_SUCCESS;;
*)              stateful_usage $OCF_ERR_UNIMPLEMENTED;;
esac

exit $?

# diff --git a/heartbeat/SysInfo.in b/heartbeat/SysInfo.in
# new file mode 100644
# index 0000000..c57b7b6
# --- /dev/null
# +++ b/heartbeat/SysInfo.in
# @@ -0,0 +1,372 @@
#!@BASH_SHELL@
#
#
# SysInfo OCF Resource Agent
# It records (in the CIB) various attributes of a node
#
# Copyright (c) 2004 SUSE LINUX AG, Lars Marowsky-Bree
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_pidfile_default="$HA_RSCTMP/SysInfo-${OCF_RESOURCE_INSTANCE}"
OCF_RESKEY_delay_default="0s"
OCF_RESKEY_clone_default="0"

: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}}
: ${OCF_RESKEY_delay=${OCF_RESKEY_delay_default}}
: ${OCF_RESKEY_clone=${OCF_RESKEY_clone_default}}

#######################################################################

# Print the resource-agent metadata (XML) on stdout.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="SysInfo" version="1.0">
<version>1.0</version>

<longdesc lang="en">
This is a SysInfo Resource Agent.
It records (in the CIB) various attributes of a node
Sample Linux output:
   arch: i686
   os: Linux-2.4.26-gentoo-r14
   free_swap: 1999
   cpu_info: Intel(R) Celeron(R) CPU 2.40GHz
   cpu_speed: 4771.02
   cpu_cores: 1
   cpu_load: 0.00
   ram_total: 513
   ram_free: 117
   root_free: 2.4

Sample Darwin output:
   arch: i386
   os: Darwin-8.6.2
   cpu_info: Intel Core Duo
   cpu_speed: 2.16
   cpu_cores: 2
   cpu_load: 0.18
   ram_total: 2016
   ram_free: 787
   root_free: 13

Units:
   free_swap: Mb
   ram_*: Mb
   root_free: Gb
   cpu_speed (Linux): bogomips
   cpu_speed (Darwin): Ghz

</longdesc>
<shortdesc lang="en">Records various node attributes in the CIB</shortdesc>

<parameters>

<parameter name="pidfile" unique="0">
<longdesc lang="en">PID file</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="${OCF_RESKEY_pidfile_default}" />
</parameter>

<parameter name="delay" unique="0">
<longdesc lang="en">Interval to allow values to stabilize</longdesc>
<shortdesc lang="en">Dampening Delay</shortdesc>
<content type="string" default="${OCF_RESKEY_delay_default}" />
</parameter>

</parameters>
<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
<action name="monitor" timeout="20s" interval="60s"/>
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="20s" />
</actions>
</resource-agent>
END
}

#######################################################################

# Print one "name:<TAB>value" line and hand the same pair to attrd_updater.
#   $1     attribute name
#   $2...  attribute value (remaining words are joined with spaces)
# ${OCF_RESKEY_delay} is expanded unquoted on purpose: the dispatch code at
# the bottom of the script rewrites it to "-d <delay>", i.e. two words.
UpdateStat() {
    name=$1; shift
    value="$*"
    echo -e "$name:\t$value"
    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -S status -n $name -v "$value"
}

# Collect architecture, OS, CPU, load, memory and root-filesystem figures
# and publish each of them through UpdateStat.
SysInfoStats() {
    local mem mem_used mem_total cpu_type cpu_speed cpu_cores
    local loads load15 disk

    UpdateStat arch "`uname -m`"
    UpdateStat os "`uname -s`-`uname -r`"

    case `uname -s` in
        "Darwin")
            mem=`top -l 1 | grep Mem: | awk '{print $10}'`
            mem_used=`top -l 1 | grep Mem: | awk '{print $8}'`
            mem=`SysInfo_mem_units $mem`
            mem_used=`SysInfo_mem_units $mem_used`
            mem_total=`expr $mem_used + $mem`
            cpu_type=`system_profiler SPHardwareDataType | grep "CPU Type:"`
            cpu_type=${cpu_type/*: /}
            cpu_speed=`system_profiler SPHardwareDataType | grep "CPU Speed:" | awk '{print $3}'`
            cpu_cores=`system_profiler SPHardwareDataType | grep "Number Of"`
            cpu_cores=${cpu_cores/*: /}
            ;;
        "Linux")
            if [ -f /proc/cpuinfo ]; then
                cpu_type=`grep "model name" /proc/cpuinfo | head -n 1`
                cpu_type=${cpu_type/*: /}
                cpu_speed=`grep "bogomips" /proc/cpuinfo | head -n 1`
                cpu_speed=${cpu_speed/*: /}
                cpu_cores=`grep "^processor" /proc/cpuinfo | wc -l`
            fi

            if [ -f /proc/meminfo ]; then
                # meminfo results are in kB.  The field names are anchored
                # so that related lines such as "Inactive(anon):" cannot add
                # extra words to the captured value.
                mem=`awk '/^SwapFree:/ {print $2"k"; exit}' /proc/meminfo`
                if [ -n "$mem" ]; then
                    UpdateStat free_swap `SysInfo_mem_units $mem`
                fi
                mem=`awk '/^Inactive:/ {print $2"k"; exit}' /proc/meminfo`
                mem_total=`awk '/^MemTotal:/ {print $2"k"; exit}' /proc/meminfo`
            else
                mem=`top -n 1 | grep Mem: | awk '{print $7}'`
            fi
            ;;
        *)
    esac

    if [ -n "$cpu_type" ]; then
        UpdateStat cpu_info "$cpu_type"
    fi

    if [ -n "$cpu_speed" ]; then
        UpdateStat cpu_speed "$cpu_speed"
    fi

    if [ -n "$cpu_cores" ]; then
        UpdateStat cpu_cores "$cpu_cores"
    fi

    # The position of the load figures in the uptime output depends on how
    # long the machine has been up, so a fixed field number is unreliable
    # (and can carry a trailing comma).  The last field is always the
    # 15-minute load average.
    loads=`uptime`
    load15=`echo ${loads} | awk '{print $NF}'`
    UpdateStat cpu_load $load15

    if [ -n "$mem" ]; then
        # Massage the memory values
        if [ -n "$mem_total" ]; then
            # mem_total stays empty when /proc/meminfo is unavailable;
            # do not publish an empty attribute in that case.
            UpdateStat ram_total `SysInfo_mem_units $mem_total`
        fi
        UpdateStat ram_free `SysInfo_mem_units $mem`
    fi

    # Portability notes:
    #   o df:   -h flag not available on Solaris 8.  (OK on 9, 10, ...) #FIXME#
    #   o tail: explicit "-n" not available in Solaris; instead simplify
    #     'tail -n <c>' to the equivalent 'tail -<c>'.
    disk=`df -h / | tail -1 | awk '{print $4}'`
    if [ -n "$disk" ]; then
        UpdateStat root_free `SysInfo_hdd_units $disk`
    fi
}

# Convert a memory size with a unit suffix into whole megabytes and print
# it, rounded up to a multiple of 50.
#   $1  size such as "2G", "1.5G", "512M", "123456k", "123456kB", "12Mb"
# Prints nothing when $1 is empty.  A value without a recognised suffix is
# passed to the rounding step unchanged.
SysInfo_mem_units() {
    local mem=$1
    local memlen memlen_alt mem_before mem_after mem_round

    if [ -z "$mem" ]; then
        return
    fi

    memlen=$(( ${#mem} - 1 ))
    memlen_alt=$(( ${#mem} - 2 ))
    if [ "${mem:$memlen:1}" = "G" ]; then
        mem="${mem:0:$memlen}"
        mem_before=${mem/.*/}
        mem_after=""
        if [ "$mem" != "${mem/./}" ]; then
            mem_after=${mem/*./}
        fi
        # Scale the whole part even when there is no fraction ("2G" is
        # 2048 Mb).  The 10# prefix forces base 10 so that digit strings
        # with a leading zero ("08") are not parsed as octal.
        mem=$(( 10#${mem_before:-0} * 1024 ))
        case ${#mem_after} in
            0)  ;;
            1)  mem=$(( mem + 100 * 10#$mem_after )) ;;
            2)  mem=$(( mem + 10 * 10#$mem_after )) ;;
            *)  # three digits or more: only the first three are used
                mem_after=${mem_after:0:3}
                mem=$(( mem + 10#$mem_after ))
                ;;
        esac
    elif [ "${mem:$memlen:1}" = "M" ]; then
        mem=${mem/.*/}
        mem="${mem:0:$memlen}"
    elif [ "${mem:$memlen:1}" = "k" ]; then
        mem="${mem:0:$memlen}"
        mem=${mem/.*/}
        mem=$(( 10#${mem:-0} / 1024 ))
    elif [ "${mem:$memlen_alt:2}" = "kB" ]; then
        mem="${mem:0:$memlen_alt}"
        mem=${mem/.*/}
        mem=$(( 10#${mem:-0} / 1024 ))
    elif [ "${mem:$memlen_alt:2}" = "Mb" ]; then
        mem="${mem:0:$memlen_alt}"
        mem=${mem/.*/}
    elif [ "${mem:$memlen_alt:2}" = "MB" ]; then
        mem="${mem:0:$memlen_alt}"
        mem=${mem/.*/}
    fi

    # Round to the next multiple of 50, based on the last two digits.
    # Values whose last two digits are zero (including a plain "0", e.g.
    # when there is no free swap) are left untouched.
    memlen=$(( ${#mem} - 2 ))
    mem_round="${mem:$memlen:2}"
    if [ -n "$mem_round" ]; then
        mem_round=$(( 10#$mem_round ))
        if [ "$mem_round" -eq 0 ]; then
            :
        elif [ "$mem_round" -lt 50 ]; then
            mem=$(( mem + 50 - mem_round ))
        else
            mem=$(( mem + 100 - mem_round ))
        fi
    fi
    echo $mem
}

# Convert a disk size with a unit suffix (as printed by "df -h") into
# gigabytes and print it.
#   $1  size such as "2.4G", "1.5T", "800M", "123456k"
# "G" values are printed as given (fraction included); "T" values are
# multiplied by 1024; smaller units are integer-divided down to gigabytes.
# Prints nothing when $1 is empty.
SysInfo_hdd_units() {
    local disk=$1
    local disklen disklen_alt

    if [ -z "$disk" ]; then
        return
    fi

    disklen=$(( ${#disk} - 1 ))
    disklen_alt=$(( ${#disk} - 2 ))
    if [ "${disk:$disklen:1}" = "G" ]; then
        disk="${disk:0:$disklen}"
    elif [ "${disk:$disklen:1}" = "T" ]; then
        disk="${disk:0:$disklen}"
        # awk copes with a fractional value such as "1.5"
        disk=`echo "$disk" | awk '{printf "%d", $1 * 1024}'`
    elif [ "${disk:$disklen:1}" = "M" ]; then
        disk="${disk:0:$disklen}"
        disk=${disk/.*/}
        disk=$(( 10#${disk:-0} / 1024 ))
    elif [ "${disk:$disklen:1}" = "k" ]; then
        disk="${disk:0:$disklen}"
        disk=${disk/.*/}
        disk=$(( 10#${disk:-0} / 1048576 ))
    elif [ "${disk:$disklen_alt:2}" = "kB" ]; then
        disk="${disk:0:$disklen_alt}"
        disk=${disk/.*/}
        disk=$(( 10#${disk:-0} / 1048576 ))
    elif [ "${disk:$disklen_alt:2}" = "Mb" ]; then
        disk="${disk:0:$disklen_alt}"
        disk=${disk/.*/}
        disk=$(( 10#${disk:-0} / 1024 ))
    elif [ "${disk:$disklen_alt:2}" = "MB" ]; then
        disk="${disk:0:$disklen_alt}"
        disk=${disk/.*/}
        disk=$(( 10#${disk:-0} / 1024 ))
    fi
    echo $disk
}

# Print the command-line usage on stdout.
SysInfo_usage() {
    cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# start: record the clone number in the state file, publish the statistics
# once, and exit with $OCF_SUCCESS.
SysInfo_start() {
    echo $OCF_RESKEY_clone > "$OCF_RESKEY_pidfile"
    SysInfoStats
    exit $OCF_SUCCESS
}

# stop: remove the state file and exit with $OCF_SUCCESS.
# "rm -f" keeps a stop on an already stopped resource quiet.
SysInfo_stop() {
    rm -f "$OCF_RESKEY_pidfile"
    exit $OCF_SUCCESS
}

# monitor: the resource counts as running when the state file holds this
# instance's clone number, or holds any clone number while
# OCF_RESKEY_CRM_meta_globally_unique is true/True/yes/Yes.  When running,
# the statistics are refreshed.  Exits with $OCF_SUCCESS or
# $OCF_NOT_RUNNING.
SysInfo_monitor() {
    local clone=""

    if [ -f "$OCF_RESKEY_pidfile" ]; then
        clone=`cat "$OCF_RESKEY_pidfile"`
    fi

    if [ -z "$clone" ]; then
        # missing or empty state file
        rm -f "$OCF_RESKEY_pidfile"
        exit $OCF_NOT_RUNNING
    fi

    if [ "$clone" = "$OCF_RESKEY_clone" ]; then
        SysInfoStats
        exit $OCF_SUCCESS
    fi

    case "$OCF_RESKEY_CRM_meta_globally_unique" in
        true|True|yes|Yes)
            SysInfoStats
            exit $OCF_SUCCESS
            ;;
    esac
    exit $OCF_NOT_RUNNING
}

# validate-all: nothing is checked; always returns $OCF_SUCCESS.
SysInfo_validate() {
    return $OCF_SUCCESS
}

if [ $# -ne 1 ]; then
    SysInfo_usage
    exit $OCF_ERR_ARGS
fi

# Turn the delay into the option pair that UpdateStat passes to
# attrd_updater.
if [ -n "${OCF_RESKEY_delay}" ]; then
    OCF_RESKEY_delay="-d ${OCF_RESKEY_delay}"
fi

case $__OCF_ACTION in
meta-data)      meta_data
                exit $OCF_SUCCESS
                ;;
start)          SysInfo_start
                ;;
stop)           SysInfo_stop
                ;;
monitor)        SysInfo_monitor
                ;;
validate-all)   SysInfo_validate
                ;;
usage|help)     SysInfo_usage
                exit $OCF_SUCCESS
                ;;
*)              SysInfo_usage
                exit $OCF_ERR_UNIMPLEMENTED
                ;;
esac

exit $?
diff --git a/heartbeat/VIPArip b/heartbeat/VIPArip new file mode 100755 index 0000000..688237d --- /dev/null +++ b/heartbeat/VIPArip @@ -0,0 +1,314 @@
#!/bin/sh
#
# License: GNU General Public License (GPL)
# Support: users@clusterlabs.org
# Author: Huang Zhen <zhenhltc@cn.ibm.com>
# Copyright (c) 2006 International Business Machines
#
# Virtual IP Address by RIP2 protocol.
# This script manages IP alias in different subnet with quagga/ripd.
# It can add an IP alias, or remove one.
#
# The quagga package should be installed to run this RA
#
# usage: $0 {start|stop|status|monitor|validate-all|meta-data}
#
# The "start" arg adds an IP alias.
# Surprisingly, the "stop" arg removes one.
:-)
#
# OCF parameters are as below
# OCF_RESKEY_ip The IP address in different subnet
# OCF_RESKEY_nic The nic for broadcast the route information
#
#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

RIPDCONF=$HA_RSCTMP/VIPArip-ripd.conf
ZEBRA=/usr/sbin/zebra
RIPD=/usr/sbin/ripd
USAGE="usage: $0 {start|stop|status|monitor|validate-all|meta-data}";

# Parameter defaults

OCF_RESKEY_ip_default=""
OCF_RESKEY_nic_default="eth0"
OCF_RESKEY_zebra_binary_default="${ZEBRA}"
OCF_RESKEY_ripd_binary_default="${RIPD}"

: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}}
: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}}
: ${OCF_RESKEY_zebra_binary=${OCF_RESKEY_zebra_binary_default}}
: ${OCF_RESKEY_ripd_binary=${OCF_RESKEY_ripd_binary_default}}

#######################################################################

# Print the resource-agent metadata (XML) on stdout and exit with
# $OCF_SUCCESS.
meta_data() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="VIPArip" version="1.0">
<version>1.0</version>

<longdesc lang="en">
Virtual IP Address by RIP2 protocol.
This script manages IP alias in different subnet with quagga/ripd.
It can add an IP alias, or remove one.
</longdesc>
<shortdesc lang="en">Manages a virtual IP address through RIP2</shortdesc>

<parameters>

<parameter name="ip" unique="1" required="1">
<longdesc lang="en">
The IPv4 address in different subnet, for example "192.168.1.1".
</longdesc>
<shortdesc lang="en">The IP address in different subnet</shortdesc>
<content type="string" default="${OCF_RESKEY_ip_default}" />
</parameter>

<parameter name="nic" unique="0">
<longdesc lang="en">
The nic for broadcast the route information.
The ripd uses this nic to broadcast the route information to others
</longdesc>
<shortdesc lang="en">The nic for broadcast the route information</shortdesc>
<content type="string" default="${OCF_RESKEY_nic_default}"/>
</parameter>

<parameter name="zebra_binary" unique="0">
<longdesc lang="en">
Absolute path to the zebra binary.
</longdesc>
<shortdesc lang="en">zebra binary</shortdesc>
<content type="string" default="${OCF_RESKEY_zebra_binary_default}"/>
</parameter>

<parameter name="ripd_binary" unique="0">
<longdesc lang="en">
Absolute path to the ripd binary.
</longdesc>
<shortdesc lang="en">ripd binary</shortdesc>
<content type="string" default="${OCF_RESKEY_ripd_binary_default}"/>
</parameter>

</parameters>

<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
<action name="monitor" depth="0" timeout="20s" interval="5s" />
<action name="validate-all" timeout="20s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
exit $OCF_SUCCESS
}

# Print the usage line on stderr.
usage() {
    echo $USAGE >&2
}

# Apply a sed script to $RIPDCONF via a temporary file.
#   $1  sed script
# The configuration is only overwritten when sed succeeded, so a failing
# sed cannot truncate the live file; the temporary file is always removed.
ripdconf_edit() {
    sed "$1" $RIPDCONF > $RIPDCONF.tmp &&
        cp $RIPDCONF.tmp $RIPDCONF
    rm -f $RIPDCONF.tmp
}

# Write a fresh ripd configuration to $RIPDCONF.
#   $1  IP address to permit (written as $1/32)
#   $2  network interface
#   $3  metric for "redistribute connected"
# The "!nic_tag", "!metric_tag" and "!ip_tag" lines are the markers that
# add_nic and add_ip search for when they extend the file.
new_config_file() {
    echo new_config_file $1 $2 $3
    cat >$RIPDCONF <<END
hostname ripd
password zebra
debug rip events
debug rip packet
debug rip zebra
log file /var/log/quagga/quagga.log
router rip
!nic_tag
 no passive-interface $2
 network $2
 distribute-list private out $2
 distribute-list private in $2
!metric_tag
 redistribute connected metric $3
!ip_tag
access-list private permit $1/32
access-list private deny any
END
}

# Exit with $OCF_ERR_CONFIGURED unless the "ip" parameter is set.
check_params() {
    if [ x"$OCF_RESKEY_ip" = x ]
    then
        ocf_log err "ip is a required parameter"
        exit $OCF_ERR_CONFIGURED
    fi
}

# Replace the metric of the "redistribute connected" line in $RIPDCONF.
#   $1  new metric
set_metric() {
    echo set_metric $1
    ripdconf_edit "s/redistribute connected metric .*/redistribute connected metric $1/g"
}

# Add a "permit <ip>/32" access-list entry after the ip_tag marker.
#   $1  IP address
add_ip() {
    echo add_ip $1
    ripdconf_edit "s/ip_tag/ip_tag\naccess-list private permit $1\/32/g"
}

# Remove the access-list entry of one IP address from $RIPDCONF.  When no
# permit entry is left, quagga is stopped and the file removed; otherwise
# ripd is told to reload.
#   $1  IP address
del_ip() {
    echo del_ip $1
    # Escape the dots so that they match literally and include the "/32"
    # suffix, so that removing e.g. 10.0.0.1 does not also remove the
    # entry for 10.0.0.10.
    _ip_re=$(echo "$1" | sed 's/\./\\./g')
    ripdconf_edit "/access-list private permit $_ip_re\/32/d"
    if $GREP "access-list private permit" $RIPDCONF >/dev/null
    then
        echo some other IP is running
        reload_config
    else
        stop_quagga
        echo remove $RIPDCONF
        rm -f $RIPDCONF
    fi
}

# Add the interface statements for a nic after the nic_tag marker, unless
# a "network <nic>" line is already present.
#   $1  network interface
add_nic() {
    echo add_nic $1
    if $GREP "network $1" $RIPDCONF >/dev/null
    then
        echo the nic is already in the config file
    else
        ripdconf_edit "s/nic_tag/nic_tag\n no passive-interface $1\n network $1\n distribute-list private out $1\n distribute-list private in $1/g"
    fi
}

# Print the current configuration and send SIGHUP to ripd.
reload_config() {
    echo reload_config
    echo $RIPDCONF:
    cat $RIPDCONF
    echo killall -SIGHUP ripd
    killall -SIGHUP ripd
}

# Print the current configuration and start zebra and ripd as daemons.
start_quagga() {
    echo start_quagga
    echo $RIPDCONF:
    cat $RIPDCONF
    echo $ZEBRA -d
    $ZEBRA -d
    echo $RIPD -d -f $RIPDCONF
    $RIPD -d -f $RIPDCONF
}

# Print the current configuration and send SIGTERM to ripd and zebra.
stop_quagga() {
    echo stop_quagga
    echo $RIPDCONF:
    cat $RIPDCONF
    echo killall -SIGTERM ripd
    killall -SIGTERM ripd
    echo killall -SIGTERM zebra
    killall -SIGTERM zebra
}

# start: add the address to lo and announce it through ripd.
# Exits with $OCF_SUCCESS when already running; otherwise returns
# $OCF_SUCCESS after the configuration has been (re)loaded.
start_rip_ip() {
    echo start_rip_ip
    check_params

    if [ x"$OCF_RESKEY_nic" = x ]
    then
        echo OCF_RESKEY_nic is null, set to ${OCF_RESKEY_nic_default}
        OCF_RESKEY_nic="${OCF_RESKEY_nic_default}"
    fi

    status_rip_ip
    case $? in
    $OCF_SUCCESS)
        ocf_log info "already running"
        exit $OCF_SUCCESS
        ;;
    $OCF_NOT_RUNNING)
        ;;
    *)
        ocf_log info "state undefined, stopping first"
        stop_rip_ip
        ;;
    esac

    $IP2UTIL addr add $OCF_RESKEY_ip/32 dev lo
    if [ -f "$RIPDCONF" ]
    then
        # there is a config file, add new data(IP,nic,metric)
        # to the existing config file.
        add_ip $OCF_RESKEY_ip
        add_nic $OCF_RESKEY_nic
        set_metric 1
        reload_config
    else
        new_config_file $OCF_RESKEY_ip $OCF_RESKEY_nic 1
        start_quagga
    fi
    # Both paths start out with metric 1 and switch to metric 3 after
    # three seconds.
    echo sleep 3
    sleep 3
    set_metric 3
    reload_config
    return $OCF_SUCCESS
}

# stop: remove the address from lo and from the ripd configuration.
# Exits with $OCF_SUCCESS when the resource is not running.
stop_rip_ip() {
    echo stop_rip_ip
    check_params
    status_rip_ip
    if [ $? = $OCF_NOT_RUNNING ]
    then
        exit $OCF_SUCCESS
    fi
    # Use the same /32 prefix the address was added with.
    $IP2UTIL addr del $OCF_RESKEY_ip/32 dev lo
    echo sleep 2
    sleep 2
    del_ip $OCF_RESKEY_ip
    return $OCF_SUCCESS
}

# status/monitor.
# Returns $OCF_SUCCESS     when the address is configured, listed in
#                          $RIPDCONF and a ripd process exists;
#         $OCF_ERR_GENERIC when the address is configured but one of the
#                          other two conditions fails;
#         $OCF_NOT_RUNNING when the address is not configured.
# Fixed-string matches including the "inet " prefix and the "/" after the
# address keep 10.0.0.1 from matching 10.0.0.10 or 110.0.0.1.
status_rip_ip() {
    check_params
    if $IP2UTIL addr | $GREP -F "inet $OCF_RESKEY_ip/" >/dev/null
    then
        if $GREP -F "permit $OCF_RESKEY_ip/32" $RIPDCONF >/dev/null
        then
            if pidof ripd >/dev/null
            then
                return $OCF_SUCCESS
            fi
        fi
        return $OCF_ERR_GENERIC
    fi
    return $OCF_NOT_RUNNING
}

if
    [ $# -ne 1 ]
then
    usage
    exit $OCF_ERR_ARGS
fi

[ x != x"$OCF_RESKEY_zebra_binary" ] &&
    ZEBRA=$OCF_RESKEY_zebra_binary
[ x != x"$OCF_RESKEY_ripd_binary" ] &&
    RIPD=$OCF_RESKEY_ripd_binary

case $1 in
    start)          start_rip_ip;;
    stop)           stop_rip_ip;;
    status)         status_rip_ip;;
    monitor)        status_rip_ip;;
    validate-all)   check_binary $IP2UTIL
                    exit $OCF_SUCCESS;;
    meta-data)      meta_data;;
    usage)          usage; exit $OCF_SUCCESS;;
    *)              usage
                    exit $OCF_ERR_UNIMPLEMENTED
                    ;;
esac
diff --git a/heartbeat/VirtualDomain b/heartbeat/VirtualDomain new file mode 100755 index 0000000..3905695 --- /dev/null +++ b/heartbeat/VirtualDomain @@ -0,0 +1,1158 @@
#!/bin/sh
#
# Support: users@clusterlabs.org
# License: GNU General Public License (GPL)
#
# Resource Agent for domains managed by the libvirt API.
# Requires a running libvirt daemon (libvirtd).
+# +# (c) 2008-2010 Florian Haas, Dejan Muhamedagic, +# and Linux-HA contributors +# +# usage: $0 {start|stop|status|monitor|migrate_to|migrate_from|meta-data|validate-all} +# +####################################################################### +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_config_default="" +OCF_RESKEY_migration_transport_default="" +OCF_RESKEY_migration_downtime_default=0 +OCF_RESKEY_migration_speed_default=0 +OCF_RESKEY_migration_network_suffix_default="" +OCF_RESKEY_force_stop_default=0 +OCF_RESKEY_monitor_scripts_default="" +OCF_RESKEY_autoset_utilization_cpu_default="true" +OCF_RESKEY_autoset_utilization_host_memory_default="true" +OCF_RESKEY_autoset_utilization_hv_memory_default="true" +OCF_RESKEY_unset_utilization_cpu_default="false" +OCF_RESKEY_unset_utilization_host_memory_default="false" +OCF_RESKEY_unset_utilization_hv_memory_default="false" +OCF_RESKEY_migrateport_default=$(( 49152 + $(ocf_maybe_random) % 64 )) +OCF_RESKEY_CRM_meta_timeout_default=90000 +OCF_RESKEY_save_config_on_stop_default=false +OCF_RESKEY_sync_config_on_stop_default=false +OCF_RESKEY_snapshot_default="" +OCF_RESKEY_backingfile_default="" +OCF_RESKEY_stateless_default="false" +OCF_RESKEY_copyindirs_default="" +OCF_RESKEY_shutdown_mode_default="" +OCF_RESKEY_start_resources_default="false" + +: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} +: ${OCF_RESKEY_migration_transport=${OCF_RESKEY_migration_transport_default}} +: ${OCF_RESKEY_migration_downtime=${OCF_RESKEY_migration_downtime_default}} +: ${OCF_RESKEY_migration_speed=${OCF_RESKEY_migration_speed_default}} +: ${OCF_RESKEY_migration_network_suffix=${OCF_RESKEY_migration_network_suffix_default}} +: ${OCF_RESKEY_force_stop=${OCF_RESKEY_force_stop_default}} +: ${OCF_RESKEY_monitor_scripts=${OCF_RESKEY_monitor_scripts_default}} +: ${OCF_RESKEY_autoset_utilization_cpu=${OCF_RESKEY_autoset_utilization_cpu_default}} +: 
${OCF_RESKEY_autoset_utilization_host_memory=${OCF_RESKEY_autoset_utilization_host_memory_default}} +: ${OCF_RESKEY_autoset_utilization_hv_memory=${OCF_RESKEY_autoset_utilization_hv_memory_default}} +: ${OCF_RESKEY_unset_utilization_cpu=${OCF_RESKEY_unset_utilization_cpu_default}} +: ${OCF_RESKEY_unset_utilization_host_memory=${OCF_RESKEY_unset_utilization_host_memory_default}} +: ${OCF_RESKEY_unset_utilization_hv_memory=${OCF_RESKEY_unset_utilization_hv_memory_default}} +: ${OCF_RESKEY_migrateport=${OCF_RESKEY_migrateport_default}} +: ${OCF_RESKEY_CRM_meta_timeout=${OCF_RESKEY_CRM_meta_timeout_default}} +: ${OCF_RESKEY_save_config_on_stop=${OCF_RESKEY_save_config_on_stop_default}} +: ${OCF_RESKEY_sync_config_on_stop=${OCF_RESKEY_sync_config_on_stop_default}} +: ${OCF_RESKEY_snapshot=${OCF_RESKEY_snapshot_default}} +: ${OCF_RESKEY_backingfile=${OCF_RESKEY_backingfile_default}} +: ${OCF_RESKEY_stateless=${OCF_RESKEY_stateless_default}} +: ${OCF_RESKEY_copyindirs=${OCF_RESKEY_copyindirs_default}} +: ${OCF_RESKEY_shutdown_mode=${OCF_RESKEY_shutdown_mode_default}} +: ${OCF_RESKEY_start_resources=${OCF_RESKEY_start_resources_default}} + +if ocf_is_true ${OCF_RESKEY_sync_config_on_stop}; then + OCF_RESKEY_save_config_on_stop="true" +fi +####################################################################### + +## I'd very much suggest to make this RA use bash, +## and then use magic $SECONDS. +## But for now: +NOW=$(date +%s) + +usage() { + echo "usage: $0 {start|stop|status|monitor|migrate_to|migrate_from|meta-data|validate-all}" +} + +VirtualDomain_meta_data() { + cat <<EOF +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="VirtualDomain" version="1.1"> +<version>1.0</version> + +<longdesc lang="en"> +Resource agent for a virtual domain (a.k.a. domU, virtual machine, +virtual environment etc., depending on context) managed by libvirtd. 
+</longdesc> +<shortdesc lang="en">Manages virtual domains through the libvirt virtualization framework</shortdesc> + +<parameters> + +<parameter name="config" unique="1" required="1"> +<longdesc lang="en"> +Absolute path to the libvirt configuration file, +for this virtual domain. +</longdesc> +<shortdesc lang="en">Virtual domain configuration file</shortdesc> +<content type="string" default="${OCF_RESKEY_config_default}" /> +</parameter> + +<parameter name="hypervisor" unique="0" required="0"> +<longdesc lang="en"> +Hypervisor URI to connect to. See the libvirt documentation for +details on supported URI formats. The default is system dependent. +Determine the system's default uri by running 'virsh --quiet uri'. +</longdesc> +<shortdesc lang="en">Hypervisor URI</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="force_stop" unique="0" required="0"> +<longdesc lang="en"> +Always forcefully shut down ("destroy") the domain on stop. The default +behavior is to resort to a forceful shutdown only after a graceful +shutdown attempt has failed. You should only set this to true if +your virtual domain (or your virtualization backend) does not support +graceful shutdown. +</longdesc> +<shortdesc lang="en">Always force shutdown on stop</shortdesc> +<content type="boolean" default="${OCF_RESKEY_force_stop_default}" /> +</parameter> + +<parameter name="migration_transport" unique="0" required="0"> +<longdesc lang="en"> +Transport used to connect to the remote hypervisor while +migrating. Please refer to the libvirt documentation for details on +transports available. If this parameter is omitted, the resource will +use libvirt's default transport to connect to the remote hypervisor. 
+</longdesc> +<shortdesc lang="en">Remote hypervisor transport</shortdesc> +<content type="string" default="${OCF_RESKEY_migration_transport_default}" /> +</parameter> + +<parameter name="migration_user" unique="0" required="0"> +<longdesc lang="en"> +The username will be used in the remote libvirt remoteuri/migrateuri. No user will be +given (which means root) in the username if omitted + +If remoteuri is set, migration_user will be ignored. +</longdesc> +<shortdesc lang="en">Remote username for the remoteuri</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="migration_downtime" unique="0" required="0"> +<longdesc lang="en"> +Define max downtime during live migration in milliseconds +</longdesc> +<shortdesc lang="en">Live migration downtime</shortdesc> +<content type="integer" default="${OCF_RESKEY_migration_downtime_default}" /> +</parameter> + +<parameter name="migration_speed" unique="0" required="0"> +<longdesc lang="en"> +Define live migration speed per resource in MiB/s +</longdesc> +<shortdesc lang="en">Live migration speed</shortdesc> +<content type="integer" default="${OCF_RESKEY_migration_speed_default}" /> +</parameter> + +<parameter name="migration_network_suffix" unique="0" required="0"> +<longdesc lang="en"> +Use a dedicated migration network. The migration URI is composed by +adding this parameters value to the end of the node name. If the node +name happens to be an FQDN (as opposed to an unqualified host name), +insert the suffix immediately prior to the first period (.) in the FQDN. +At the moment Qemu/KVM and Xen migration via a dedicated network is supported. + +Note: Be sure this composed host name is locally resolvable and the +associated IP is reachable through the favored network. This suffix will +be added to the remoteuri and migrateuri parameters. + +See also the migrate_options parameter below. 
+</longdesc> +<shortdesc lang="en">Migration network host name suffix</shortdesc> +<content type="string" default="${OCF_RESKEY_migration_network_suffix_default}" /> +</parameter> + +<parameter name="migrateuri" unique="0" required="0"> +<longdesc lang="en"> +You can also specify here if the calculated migrate URI is unsuitable for your +environment. + +If migrateuri is set then migration_network_suffix, migrateport and +--migrateuri in migrate_options are effectively ignored. Use "%n" as the +placeholder for the target node name. + +Please refer to the libvirt documentation for details on guest +migration. +</longdesc> +<shortdesc lang="en">Custom migrateuri for migration state transfer</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="migrate_options" unique="0" required="0"> +<longdesc lang="en"> +Extra virsh options for the guest live migration. You can also specify +here --migrateuri if the calculated migrate URI is unsuitable for your +environment. If --migrateuri is set then migration_network_suffix +and migrateport are effectively ignored. Use "%n" as the placeholder +for the target node name. + +Please refer to the libvirt documentation for details on guest +migration. +</longdesc> +<shortdesc lang="en">live migrate options</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="monitor_scripts" unique="0" required="0"> +<longdesc lang="en"> +To additionally monitor services within the virtual domain, add this +parameter with a list of scripts to monitor. + +Note: when monitor scripts are used, the start and migrate_from operations +will complete only when all monitor scripts have completed successfully. +Be sure to set the timeout of these operations to accommodate this delay. 
+</longdesc> +<shortdesc lang="en">space-separated list of monitor scripts</shortdesc> +<content type="string" default="${OCF_RESKEY_monitor_scripts_default}" /> +</parameter> + +<parameter name="autoset_utilization_cpu" unique="0" required="0"> +<longdesc lang="en"> +If set true, the agent will detect the number of domainU's vCPUs from virsh, and put it +into the CPU utilization of the resource when the monitor is executed. +</longdesc> +<shortdesc lang="en">Enable auto-setting the CPU utilization of the resource</shortdesc> +<content type="boolean" default="${OCF_RESKEY_autoset_utilization_cpu_default}" /> +</parameter> + +<parameter name="autoset_utilization_host_memory" unique="0" required="0"> +<longdesc lang="en"> +If set true, the agent will detect the number of *Max memory* from virsh, and put it +into the host_memory utilization of the resource when the monitor is executed. +</longdesc> +<shortdesc lang="en">Enable auto-setting the host_memory utilization of the resource</shortdesc> +<content type="boolean" default="${OCF_RESKEY_autoset_utilization_host_memory_default}" /> +</parameter> + +<parameter name="autoset_utilization_hv_memory" unique="0" required="0"> +<longdesc lang="en"> +If set true, the agent will detect the number of *Max memory* from virsh, and put it +into the hv_memory utilization of the resource when the monitor is executed. +</longdesc> +<shortdesc lang="en">Enable auto-setting the hv_memory utilization of the resource</shortdesc> +<content type="boolean" default="${OCF_RESKEY_autoset_utilization_hv_memory_default}" /> +</parameter> + +<parameter name="unset_utilization_cpu" unique="0" required="0"> +<longdesc lang="en"> +If set true then the agent will remove the cpu utilization resource when the monitor +is executed. 
+</longdesc> +<shortdesc lang="en">Enable auto-removing the CPU utilization of the resource</shortdesc> +<content type="boolean" default="${OCF_RESKEY_unset_utilization_cpu_default}" /> +</parameter> + +<parameter name="unset_utilization_host_memory" unique="0" required="0"> +<longdesc lang="en"> +If set true then the agent will remove the host_memory utilization resource when the monitor +is executed. +</longdesc> +<shortdesc lang="en">Enable auto-removing the host_memory utilization of the resource</shortdesc> +<content type="boolean" default="${OCF_RESKEY_unset_utilization_host_memory_default}" /> +</parameter> + +<parameter name="unset_utilization_hv_memory" unique="0" required="0"> +<longdesc lang="en"> +If set true then the agent will remove the hv_memory utilization resource when the monitor +is executed. +</longdesc> +<shortdesc lang="en">Enable auto-removing the hv_memory utilization of the resource</shortdesc> +<content type="boolean" default="${OCF_RESKEY_unset_utilization_hv_memory_default}" /> +</parameter> + +<parameter name="migrateport" unique="0" required="0"> +<longdesc lang="en"> +This port will be used in the qemu migrateuri. If unset, the port will be a random highport. +</longdesc> +<shortdesc lang="en">Port for migrateuri</shortdesc> +<content type="integer" /> +</parameter> + +<parameter name="remoteuri" unique="0" required="0"> +<longdesc lang="en"> +Use this URI as virsh connection URI to commuicate with a remote hypervisor. + +If remoteuri is set then migration_user and migration_network_suffix are +effectively ignored. Use "%n" as the placeholder for the target node name. + +Please refer to the libvirt documentation for details on guest +migration. +</longdesc> +<shortdesc lang="en">Custom remoteuri to communicate with a remote hypervisor</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="save_config_on_stop" unique="0" required="0"> +<longdesc lang="en"> +Changes to a running VM's config are normally lost on stop. 
+This parameter instructs the RA to save the configuration back to the xml file provided in the "config" parameter. +</longdesc> +<shortdesc lang="en">Save running VM's config back to its config file</shortdesc> +<content type="boolean" /> +</parameter> + +<parameter name="sync_config_on_stop" unique="0" required="0"> +<longdesc lang="en"> +Setting this automatically enables save_config_on_stop. +When enabled this parameter instructs the RA to +call csync2 -x to synchronize the file to all nodes. +csync2 must be properly set up for this to work. +</longdesc> +<shortdesc lang="en">Save running VM's config back to its config file</shortdesc> +<content type="boolean" /> +</parameter> + +<parameter name="snapshot"> +<longdesc lang="en"> +Path to the snapshot directory where the virtual machine image will be stored. When this +parameter is set, the virtual machine's RAM state will be saved to a file in the snapshot +directory when stopped. If on start a state file is present for the domain, the domain +will be restored to the same state it was in right before it stopped last. This option +is incompatible with the 'force_stop' option. +</longdesc> +<shortdesc lang="en"> +Restore state on start/stop +</shortdesc> +<content type="string" default="${OCF_RESKEY_snapshot_default}"/> +</parameter> + +<parameter name="backingfile" unique="0" required="0"> +<longdesc lang="en"> +When the VM is used in Copy-On-Write mode, this is the backing file to use (with its full path). +The VMs image will be created based on this backing file. +This backing file will never be changed during the life of the VM. 
+</longdesc> +<shortdesc lang="en">If the VM is wanted to work with Copy-On-Write mode, this is the backing file to use (with its full path)</shortdesc> +<content type="string" default="${OCF_RESKEY_backingfile_default}" /> +</parameter> + +<parameter name="stateless" unique="0" required="0"> +<longdesc lang="en"> +If set to true and backingfile is defined, the start of the VM will systematically create a new qcow2 based on +the backing file, therefore the VM will always be stateless. If set to false, the start of the VM will use the +COW (<vmname>.qcow2) file if it exists, otherwise the first start will create a new qcow2 based on the backing +file given as backingfile. +</longdesc> +<shortdesc lang="en">If set to true, the (<vmname>.qcow2) file will be re-created at each start, based on the backing file (if defined)</shortdesc> +<content type="boolean" default="${OCF_RESKEY_stateless_default}" /> +</parameter> + +<parameter name="copyindirs" unique="0" required="0"> +<longdesc lang="en"> +List of directories for the virt-copy-in before booting the VM. Used only in stateless mode. +</longdesc> +<shortdesc lang="en">List of directories for the virt-copy-in before booting the VM stateless mode.</shortdesc> +<content type="string" default="${OCF_RESKEY_copyindirs_default}" /> +</parameter> + +<parameter name="shutdown_mode"> +<longdesc lang="en"> +virsh shutdown method to use. Please verify that it is supported by your virsh toolsed with 'virsh help shutdown' +When this parameter is set --mode shutdown_mode is passed as an additional argument to the 'virsh shutdown' command. +One can use this option in case default acpi method does not work. Verify that this mode is supported +by your VM. By default --mode is not passed. 
+</longdesc> +<shortdesc lang="en"> +Instruct virsh to use specific shutdown mode +</shortdesc> +<content type="string" default="${OCF_RESKEY_shutdown_mode_default}"/> +</parameter> + +<parameter name="start_resources"> +<longdesc lang="en"> +Start the virtual storage pools and networks used by the virtual machine before starting it or before live migrating it. +</longdesc> +<shortdesc lang="en"> +Ensure the needed virtual storage pools and networks are started +</shortdesc> +<content type="boolean" default="${OCF_RESKEY_start_resources_default}"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="90s" /> +<action name="stop" timeout="90s" /> +<action name="status" depth="0" timeout="30s" interval="10s" /> +<action name="monitor" depth="0" timeout="30s" interval="10s" /> +<action name="migrate_from" timeout="60s" /> +<action name="migrate_to" timeout="120s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +EOF +} + +set_util_attr() { + local attr=$1 val=$2 + local cval outp + + cval=$(crm_resource -Q -r $OCF_RESOURCE_INSTANCE -z -g $attr 2>/dev/null) + if [ $? -ne 0 ] && [ -z "$cval" ]; then + crm_resource -Q -r $OCF_RESOURCE_INSTANCE -z -g $attr 2>&1 | grep -e "not connected" > /dev/null 2>&1 + if [ $? 
-eq 0 ]; then + ocf_log debug "Unable to set utilization attribute, cib is not available" + return + fi + fi + + if [ "$cval" != "$val" ]; then + outp=$(crm_resource -r $OCF_RESOURCE_INSTANCE -z -p $attr -v $val 2>&1) || + ocf_log warn "crm_resource failed to set utilization attribute $attr: $outp" + fi +} + +unset_util_attr() { + local attr=$1 + local cval outp + + outp=$(crm_resource --resource=$OCF_RESOURCE_INSTANCE --utilization --delete-parameter=$attr 2>&1) || + ocf_log warn "crm_resource failed to unset utilization attribute $attr: $outp" +} + +update_utilization() { + local dom_cpu dom_mem + + if ocf_is_true "$OCF_RESKEY_autoset_utilization_cpu"; then + dom_cpu=$(LANG=C virsh $VIRSH_OPTIONS dominfo ${DOMAIN_NAME} 2>/dev/null | awk '/CPU\(s\)/{print $2}') + test -n "$dom_cpu" && set_util_attr cpu $dom_cpu + elif ocf_is_true "$OCF_RESKEY_unset_utilization_cpu"; then + unset_util_attr cpu + fi + + if ocf_is_true "$OCF_RESKEY_autoset_utilization_host_memory"; then + dom_mem=$(LANG=C virsh $VIRSH_OPTIONS dominfo ${DOMAIN_NAME} 2>/dev/null | awk '/Max memory/{printf("%d", $3/1024)}') + test -n "$dom_mem" && set_util_attr host_memory "$dom_mem" + elif ocf_is_true "$OCF_RESKEY_unset_utilization_host_memory"; then + unset_util_attr host_memory + fi + + if ocf_is_true "$OCF_RESKEY_autoset_utilization_hv_memory"; then + dom_mem=$(LANG=C virsh $VIRSH_OPTIONS dominfo ${DOMAIN_NAME} 2>/dev/null | awk '/Max memory/{printf("%d", $3/1024)}') + test -n "$dom_mem" && set_util_attr hv_memory "$dom_mem" + elif ocf_is_true "$OCF_RESKEY_unset_utilization_hv_memory"; then + unset_util_attr hv_memory + fi +} + +get_emulator() +{ + local emulator="" + + emulator=$(virsh $VIRSH_OPTIONS dumpxml $DOMAIN_NAME 2>/dev/null | sed -n -e 's/^.*<emulator>\(.*\)<\/emulator>.*$/\1/p') + if [ -z "$emulator" ] && [ -e "$EMULATOR_STATE" ]; then + emulator=$(cat $EMULATOR_STATE) + fi + if [ -z "$emulator" ]; then + emulator=$(cat ${OCF_RESKEY_config} | sed -n -e 
# Determine the domain state without asking libvirt, by looking for the
# emulator process of the domain (or by querying the Xen tools when no
# emulator is known).
# Returns $OCF_SUCCESS when a matching process/domain is found,
# $OCF_NOT_RUNNING when the check ran but found nothing, and
# $OCF_ERR_GENERIC when no check could be performed at all.
pid_status()
{
    local result=$OCF_ERR_GENERIC
    local emu=$(get_emulator)
    # An emulator is not required, so the "unable to determine" message is
    # only logged at debug level, except during probes.
    local level="debug"

    if ocf_is_probe; then
        level="notice"
    fi

    case "$emu" in
        qemu-kvm|qemu-dm|qemu-system-*)
            if ps awx | grep -E "[q]emu-(kvm|dm|system).*-name ($DOMAIN_NAME|[^ ]*guest=$DOMAIN_NAME(,[^ ]*)?) " > /dev/null 2>&1; then
                result=$OCF_SUCCESS
            else
                result=$OCF_NOT_RUNNING
            fi
            ;;
        libvirt_lxc)
            if ps awx | grep -E "[l]ibvirt_lxc.*-name ($DOMAIN_NAME|[^ ]*guest=$DOMAIN_NAME(,[^ ]*)?) " > /dev/null 2>&1; then
                result=$OCF_SUCCESS
            else
                result=$OCF_NOT_RUNNING
            fi
            ;;
        # Further emulators can be added above this catch-all.
        *)
            # Xen PV domains have no emulator set; fall back to the
            # Xen command line tools if one of them is installed.
            if have_binary xl; then
                if xl list $DOMAIN_NAME >/dev/null 2>&1; then
                    result=$OCF_SUCCESS
                else
                    result=$OCF_NOT_RUNNING
                fi
            elif have_binary xen-list; then
                if xen-list $DOMAIN_NAME 2>/dev/null | grep -qs "State.*[-r][-b][-p]--" 2>/dev/null; then
                    result=$OCF_SUCCESS
                else
                    result=$OCF_NOT_RUNNING
                fi
            else
                ocf_log $level "Unable to determine emulator for $DOMAIN_NAME"
            fi
            ;;
    esac

    case "$result" in
        "$OCF_SUCCESS")
            ocf_log debug "Virtual domain $DOMAIN_NAME is currently running."
            ;;
        "$OCF_NOT_RUNNING")
            ocf_log debug "Virtual domain $DOMAIN_NAME is currently not running."
            ;;
    esac

    return $result
}
# virsh undefine removes configuration files if they are in
# directories which are managed by libvirt. Such directories
# also include subdirectories of /etc (for instance
# /etc/libvirt/*), which may be surprising. VirtualDomain didn't
# include the undefine call before, hence this wasn't an issue
# before.
#
# There seems to be no way to find out which directories are
# managed by libvirt.
#
# Make sure $DOMAIN_NAME is not defined in libvirt. If it is, undefine it
# while keeping a copy of $OCF_RESKEY_config, and put the copy back when
# the undefine removed the configuration file.
verify_undefined() {
    local tmpf

    # Compare whole lines literally (-F -x): with a word match (-w) a domain
    # called "vm-1" would also match a defined "vm-1-test", because "-" and
    # "." count as word boundaries.
    if virsh --connect="${OCF_RESKEY_hypervisor}" list --all --name 2>/dev/null | grep -Fxqs -- "$DOMAIN_NAME"
    then
        tmpf=$(mktemp -t vmcfgsave.XXXXXX)
        if [ ! -r "$tmpf" ]; then
            ocf_log warn "unable to create temp file, disk full?"
            # we must undefine the domain even without a backup copy
            virsh $VIRSH_OPTIONS undefine "$DOMAIN_NAME" > /dev/null 2>&1
        else
            cp -p "$OCF_RESKEY_config" "$tmpf"
            virsh $VIRSH_OPTIONS undefine "$DOMAIN_NAME" > /dev/null 2>&1
            # restore the configuration file if undefine removed it
            [ -f "$OCF_RESKEY_config" ] || cp -f "$tmpf" "$OCF_RESKEY_config"
            rm -f "$tmpf"
        fi
    fi
}
# Start the virtual domain.
# Order of operations: return early if the domain already runs; install the
# systemd drop-ins; restore from a snapshot state file if one exists;
# otherwise make sure the domain is undefined, optionally start the storage
# pools/networks it needs, then either "create" it directly from the config
# file or (when a backing file is configured) prepare a qcow2 overlay,
# define the domain and start it. Finally wait until monitor succeeds.
# Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
VirtualDomain_start() {
    local snapshotimage
    local overlay
    local image_created=0
    local rc

    if VirtualDomain_status; then
        ocf_log info "Virtual domain $DOMAIN_NAME already running."
        return $OCF_SUCCESS
    fi

    # systemd drop-in to stop domain before libvirtd terminates services
    # during shutdown/reboot
    if systemd_is_running ; then
        systemd_drop_in "99-VirtualDomain-libvirt" "After" "libvirtd.service"
        systemd_drop_in "99-VirtualDomain-machines" "Wants" "virt-guest-shutdown.target"
        systemctl start virt-guest-shutdown.target
    fi

    snapshotimage="$OCF_RESKEY_snapshot/${DOMAIN_NAME}.state"
    if [ -n "$OCF_RESKEY_snapshot" ] && [ -f "$snapshotimage" ]; then
        # Use the configured connection, like every other virsh call that
        # acts on this domain.
        if virsh $VIRSH_OPTIONS restore "$snapshotimage"; then
            rm -f "$snapshotimage"
            return $OCF_SUCCESS
        fi
        ocf_exit_reason "Failed to restore ${DOMAIN_NAME} from state file in ${OCF_RESKEY_snapshot} directory."
        return $OCF_ERR_GENERIC
    fi

    # Make sure domain is undefined before creating.
    # The 'create' command guarantees that the domain will be
    # undefined on shutdown, but requires the domain to be undefined.
    # If a user defines the domain outside of this agent, we have to
    # ensure that the domain is restored to an 'undefined' state before
    # creating.
    verify_undefined

    if ocf_is_true "${OCF_RESKEY_start_resources}"; then
        start_resources "${OCF_RESKEY_hypervisor}"
        rc=$?
        if [ $rc -eq $OCF_ERR_GENERIC ]; then
            return $rc
        fi
    fi

    if [ -z "${OCF_RESKEY_backingfile}" ]; then
        if ! virsh $VIRSH_OPTIONS create "${OCF_RESKEY_config}"; then
            ocf_exit_reason "Failed to start virtual domain ${DOMAIN_NAME}."
            return $OCF_ERR_GENERIC
        fi
    else
        # NOTE(review): "%%.*" cuts at the FIRST dot of the whole path, so a
        # dot in a directory name changes the overlay location. Left as is
        # because existing deployments depend on the current file name.
        overlay="${OCF_RESKEY_config%%.*}.qcow2"

        if ocf_is_true "${OCF_RESKEY_stateless}" || [ ! -s "$overlay" ]; then
            # Create the Stateless image
            if ! qemu-img create -f qcow2 -b "${OCF_RESKEY_backingfile}" "$overlay"; then
                ocf_exit_reason "Failed qemu-img create ${DOMAIN_NAME} with backing file ${OCF_RESKEY_backingfile}."
                return $OCF_ERR_GENERIC
            fi
            image_created=1
        fi

        if ! virsh $VIRSH_OPTIONS define "${OCF_RESKEY_config}"; then
            ocf_exit_reason "Failed to define virtual domain ${DOMAIN_NAME}."
            return $OCF_ERR_GENERIC
        fi

        # Files are only injected into a freshly created overlay image.
        if [ $image_created -eq 1 ] && [ -n "${OCF_RESKEY_copyindirs}" ]; then
            # Inject copyindirs directories and files; the list is
            # intentionally left unquoted so it splits into several paths.
            if ! virt-copy-in -d "${DOMAIN_NAME}" ${OCF_RESKEY_copyindirs} /; then
                ocf_exit_reason "Failed on virt-copy-in command ${DOMAIN_NAME}."
                return $OCF_ERR_GENERIC
            fi
        fi

        if ! virsh $VIRSH_OPTIONS start "${DOMAIN_NAME}"; then
            ocf_exit_reason "Failed to start virtual domain ${DOMAIN_NAME}."
            return $OCF_ERR_GENERIC
        fi
    fi

    # Wait for the domain to pass monitoring; the cluster manager's
    # operation timeout bounds this loop.
    while ! VirtualDomain_monitor; do
        sleep 1
    done

    return $OCF_SUCCESS
}
# Dump the inactive XML of $DOMAIN_NAME and, when it is non-empty, differs
# from $OCF_RESKEY_config and validates, overwrite the configuration file
# with it. Optionally triggers sync_config afterwards. All problems are
# logged as warnings; the function never fails the calling action.
save_config(){
    CFGTMP=$(mktemp -t vmcfgsave.XXXXXX)
    # Without a usable temp file every later test would operate on an empty
    # file name ("[ -s ]" with no operand is even true), so stop here.
    if [ -z "${CFGTMP}" ] || [ ! -w "${CFGTMP}" ]; then
        ocf_log warn "Unable to create a temporary file for the $DOMAIN_NAME config dump. Skipping config update."
        return 0
    fi

    virsh $VIRSH_OPTIONS dumpxml --inactive --security-info "${DOMAIN_NAME}" > "${CFGTMP}"
    if [ -s "${CFGTMP}" ]; then
        if ! cmp -s "${CFGTMP}" "${OCF_RESKEY_config}"; then
            if virt-xml-validate "${CFGTMP}" domain 2>/dev/null ; then
                ocf_log info "Saving domain $DOMAIN_NAME to ${OCF_RESKEY_config}. Please make sure it's present on all nodes or sync_config_on_stop is on."
                # Write through the existing file instead of moving the temp
                # file over it, which keeps the target's ownership and mode.
                if cat "${CFGTMP}" > "${OCF_RESKEY_config}" ; then
                    ocf_log info "Saved $DOMAIN_NAME domain's configuration to ${OCF_RESKEY_config}."
                    if ocf_is_true "$OCF_RESKEY_sync_config_on_stop"; then
                        sync_config
                    fi
                else
                    ocf_log warn "Moving ${CFGTMP} to ${OCF_RESKEY_config} failed."
                fi
            else
                ocf_log warn "Domain $DOMAIN_NAME config failed to validate after dump. Skipping config update."
            fi
        fi
    else
        ocf_log warn "Domain $DOMAIN_NAME config has 0 size. Skipping config update."
    fi
    rm -f "${CFGTMP}"
}
# Print the migration URI to use when migrating over a dedicated migration
# network, i.e. when OCF_RESKEY_migration_network_suffix is set. Prints
# nothing when no suffix is configured or the hypervisor type is neither
# qemu nor xen (a warning is logged in the latter case).
mk_migrateuri() {
    local node
    local mig_host
    local driver

    node="$OCF_RESKEY_CRM_meta_migrate_target"

    # A typical migration URI via a special migration network looks
    # like "tcp://bar-mig:49152". The port would be randomly chosen
    # by libvirt from the range 49152-49215 if omitted, at least since
    # version 0.7.4 ...
    [ -n "${OCF_RESKEY_migration_network_suffix}" ] || return 0

    # Hypervisor type is the URI scheme up to the first "+" or ":".
    driver="${OCF_RESKEY_hypervisor%%[+:]*}"
    # The hostname might be a FQDN: the suffix goes after the first label.
    mig_host=$(echo ${node} | sed -e "s,^\([^.]\+\),\1${OCF_RESKEY_migration_network_suffix},")

    if [ "$driver" = "qemu" ]; then
        # Quite ancient libvirt versions need a migration port and a URI
        # without the "//". Newer versions can handle this "bad" URI too.
        echo "tcp:${mig_host}:${OCF_RESKEY_migrateport}"
    elif [ "$driver" = "xen" ]; then
        echo "${mig_host}"
    else
        ocf_log warn "$DOMAIN_NAME: Migration via dedicated network currently not supported for ${driver}."
    fi
}
:-) + remoteuri=$(echo ${OCF_RESKEY_hypervisor} | sed -e "s,\(.*\)://[^/:]*\(:\?[0-9]*\)/\(.*\),\1${transport_suffix}://${target_node}\2/\3,") + fi + + # User defined migrateuri or do we make one? + migrate_opts="$OCF_RESKEY_migrate_options" + + # migration_uri is directly set + if [ -n "${OCF_RESKEY_migrateuri}" ]; then + migrateuri=`echo "${OCF_RESKEY_migrateuri}" | + sed "s/%n/$target_node/g"` + + # extract migrationuri from options + elif echo "$migrate_opts" | fgrep -qs -- "--migrateuri="; then + migrateuri=`echo "$migrate_opts" | + sed "s/.*--migrateuri=\([^ ]*\).*/\1/;s/%n/$target_node/g"` + + # auto generate + else + migrateuri=`mk_migrateuri` + fi + + # remove --migrateuri from migration_opts + migrate_opts=`echo "$migrate_opts" | + sed "s/\(.*\)--migrateuri=[^ ]*\(.*\)/\1\2/"` + + + # save config if needed + if ocf_is_true "$OCF_RESKEY_save_config_on_stop"; then + save_config + fi + + if ocf_is_true "${OCF_RESKEY_start_resources}"; then + start_resources $remoteuri + rc=$? + if [ $rc -eq $OCF_ERR_GENERIC ]; then + return $rc + fi + fi + + # Live migration speed limit + if [ ${OCF_RESKEY_migration_speed} -ne 0 ]; then + ocf_log info "$DOMAIN_NAME: Setting live migration speed limit for $DOMAIN_NAME (using: virsh ${VIRSH_OPTIONS} migrate-setspeed $DOMAIN_NAME ${OCF_RESKEY_migration_speed})." + virsh ${VIRSH_OPTIONS} migrate-setspeed $DOMAIN_NAME ${OCF_RESKEY_migration_speed} + fi + + # OK, we know where to connect to. Now do the actual migration. + ocf_log info "$DOMAIN_NAME: Starting live migration to ${target_node} (using: virsh ${VIRSH_OPTIONS} migrate --live $migrate_opts $DOMAIN_NAME $remoteuri $migrateuri)." 
# Target-side half of a live migration: install the systemd drop-ins, wait
# until the incoming domain passes monitoring, log the result and optionally
# save the domain configuration. Always returns $OCF_SUCCESS; the wait loop
# has no limit of its own.
VirtualDomain_migrate_from() {
    # systemd drop-in to stop domain before libvirtd terminates services
    # during shutdown/reboot
    if systemd_is_running ; then
        systemd_drop_in "99-VirtualDomain-libvirt" "After" "libvirtd.service"
        systemd_drop_in "99-VirtualDomain-machines" "Wants" "virt-guest-shutdown.target"
        systemctl start virt-guest-shutdown.target
    fi

    # Poll once per second until the domain is reported as running.
    until VirtualDomain_monitor; do
        sleep 1
    done
    ocf_log info "$DOMAIN_NAME: live migration from ${OCF_RESKEY_CRM_meta_migrate_source} succeeded."

    # save config if needed
    if ocf_is_true "$OCF_RESKEY_save_config_on_stop"; then
        save_config
    fi

    return $OCF_SUCCESS
}
# Validate the resource configuration.
# Returns $OCF_ERR_CONFIGURED for contradictory or malformed parameters,
# $OCF_ERR_INSTALLED when the config file is unreadable or no domain name
# could be derived from it, and $OCF_SUCCESS otherwise.
# All expansions are quoted so that empty values and paths containing
# whitespace are tested as a single operand.
VirtualDomain_validate_all() {
    if ocf_is_true "$OCF_RESKEY_force_stop" && [ -n "$OCF_RESKEY_snapshot" ]; then
        ocf_exit_reason "The 'force_stop' and 'snapshot' options can not be used together."
        return $OCF_ERR_CONFIGURED
    fi

    # check if we can read the config file (otherwise we're unable to
    # deduce $DOMAIN_NAME from it, see below)
    if [ ! -r "$OCF_RESKEY_config" ]; then
        if ocf_is_probe; then
            ocf_log info "Configuration file $OCF_RESKEY_config not readable during probe."
        elif [ "$__OCF_ACTION" = "stop" ]; then
            ocf_log info "Configuration file $OCF_RESKEY_config not readable, resource considered stopped."
        else
            ocf_exit_reason "Configuration file $OCF_RESKEY_config does not exist or not readable."
        fi
        return $OCF_ERR_INSTALLED
    fi

    if [ -z "$DOMAIN_NAME" ]; then
        ocf_exit_reason "Unable to determine domain name."
        return $OCF_ERR_INSTALLED
    fi

    # Check if csync2 is available when config tells us we might need it.
    if ocf_is_true "$OCF_RESKEY_sync_config_on_stop"; then
        check_binary csync2
    fi

    # Check if migration_speed is a decimal value
    if ! ocf_is_decimal "${OCF_RESKEY_migration_speed}"; then
        ocf_exit_reason "migration_speed has to be a decimal value"
        return $OCF_ERR_CONFIGURED
    fi

    # Check if migration_downtime is a decimal value
    if ! ocf_is_decimal "${OCF_RESKEY_migration_downtime}"; then
        ocf_exit_reason "migration_downtime has to be a decimal value"
        return $OCF_ERR_CONFIGURED
    fi

    if ocf_is_true "${OCF_RESKEY_stateless}" && [ -z "${OCF_RESKEY_backingfile}" ]; then
        ocf_exit_reason "Stateless functionality can't be achieved without a backing file."
        return $OCF_ERR_CONFIGURED
    fi

    return $OCF_SUCCESS
}
+# +# OCF parameters are as below: +# OCF_RESKEY_config +# (WAS-configuration file, used for the single server edition of WAS) +# OCF_RESKEY_port +# (WAS-<snoop>-port-number, used for the advanced edition of WAS) + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + +WASDIR=/opt/WebSphere/AppServer +if + [ ! -d $WASDIR ] +then + WASDIR=/usr/WebSphere/AppServer +fi +STARTTIME=300 # 5 minutes +DEFAULT_WASPORTS="9080" +# +# +WASBIN=$WASDIR/bin +DEFAULT=$WASDIR/config/server-cfg.xml + +# +# Print usage message +# +usage() { + methods=`WAS_methods | grep -v methods` + methods=`echo $methods | tr ' ' '|'` + cat <<-END + usage: $0 ($methods) + + For the single server edition of WAS, you have to set the following + enviroment virable: + OCF_RESKEY_config + (WAS-configuration file) + + For the advanced edition of WAS, you have to set the following + enviroment virable: + OCF_RESKEY_port + (WAS-<snoop>-port-number) + + $0 manages a Websphere Application Server (WAS) as an HA resource + + The 'start' operation starts WAS. + The 'stop' operation stops WAS. + The 'status' operation reports whether WAS is running + The 'monitor' operation reports whether the WAS seems to be working + (httpd also needs to be working for this case) + The 'validate-all' operation reports whether the OCF instance parameter (OCF_RESKEY_config or OCF_RESKEY_port) is valid + The 'methods' operation reports on the methods $0 supports + + This is known to work with the Single Server edition of Websphere, + and is believed to work with the Advanced edition too. + Since the Advanced Edition has no configuration file (it's in a the + database) you need to give a port number instead of a + configuration file for this config parameter. 
+ + The default configuration file for the single server edition is: + $DEFAULT + + The default snoop-port for the advanced edition is: $DEFAULT_WASPORTS + + The start and stop operations must be run as root. + + The status operation will report a pid of "-" for the + WAS root process using unless it is run as root. + + If you don't have xmllint on your system, parsing of WAS + configuration files is very primitive. + In this case, the port specification we need from the XML + config file has to be on the same line as the + first part of the <transports/> tag. + + We run servlet/snoop on the first transport port listed in + the config file for the "monitor" operation. + + END +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="WAS" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for WAS. It manages a Websphere Application Server (WAS) as +an HA resource. +</longdesc> +<shortdesc lang="en">Manages a WebSphere Application Server instance</shortdesc> + +<parameters> +<parameter name="config" unique="0" required="0"> +<longdesc lang="en"> +The WAS-configuration file. +</longdesc> +<shortdesc lang="en">configration file</shortdesc> +<content type="string" default="$DEFAULT" /> +</parameter> + +<parameter name="port" unique="0"> +<longdesc lang="en"> +The WAS-(snoop)-port-number. 
#
#	Print each XML file given as argument, reformatted into a more or
#	less canonical layout when xmllint is available; without xmllint the
#	files are simply printed with cat.
#	The command to use is determined once and cached in $XMLcat.
#
xmlcat() {
    if [ -z "$XMLcat" ]; then
        XMLcat=$(which xmllint 2>/dev/null)
        if [ -z "${XMLcat}" ] || [ ! -x "${XMLcat}" ]; then
            XMLcat=cat
        else
            XMLcat="$XMLcat --recover --format"
        fi
    fi

    # $XMLcat stays unquoted on purpose: it may hold a command plus options.
    for j in "$@"; do
        ${XMLcat} "$j"
    done
}
#
#	Return information on the processname/id for the WAS ports
#
#	Arguments: the port numbers to look for.
#	Output: one "pid/program" entry (the last netstat column) per
#	listening TCP socket bound to one of the given ports.
#
#	The port is matched as ":<port> " so that e.g. 9080 does not also
#	match a socket bound to 19080. Without any port argument nothing is
#	printed; an empty alternation would match every listening socket.
#
WASPortInfo() {
    pat=""
    # $* is left unquoted so a single "9080 9443" argument still splits.
    for j in $*
    do
        case $pat in
            "") pat="$j";;
            *)  pat="$pat|$j";;
        esac
    done

    [ -n "$pat" ] || return 0

    netstat -ltnp 2>/dev/null | grep -E -i ":($pat) .*LISTEN" | sed 's%.*LISTEN *%%'
}
#
#	Report on WAS status via the log.
#
#	$1 is the configuration file or port specification. Returns
#	$OCF_NOT_RUNNING when none of the WAS ports is open, otherwise
#	$OCF_SUCCESS (logging whether all expected ports are open).
#
WAS_report_status() {
    WASPorts=$(GetWASPorts $1)
    # The outer echo strips the padding some wc implementations emit.
    PortCount=$(echo $WASPorts | wc -w)
    PortCount=$(echo $PortCount)
    PortsInUse=$(CheckWASPortsInUse $WASPorts)

    if [ "$PortsInUse" = 0 ]; then
        ocf_log debug "WAS: server $1 is stopped."
        return $OCF_NOT_RUNNING
    fi

    pids=$(WASPIDs $WASPorts)
    if [ $PortsInUse -ge $PortCount ]; then
        ocf_log debug "WAS: server $1 is running (pid" $pids "et al)."
    else
        ocf_log debug "WAS: server $1 is running (pid $pids et al) but not listening on all ports."
    fi
    return $OCF_SUCCESS
}
#
#	Wait for WAS to actually start up.
#
#	$1 is the maximum number of one-second retries; the remaining
#	arguments are handed to WAS_status. Returns $OCF_SUCCESS as soon as
#	WAS_status succeeds, otherwise the result of one final WAS_status
#	call made after the retries are used up.
#
#	It seems to take between 30 and 60 seconds for it to
#	start up on a trivial WAS instance.
#
WAS_wait_4_start() {
    max=$1
    shift
    retries=0
    while [ $retries -lt $max ]; do
        WAS_status "$@" && return $OCF_SUCCESS
        sleep 1
        retries=$((retries + 1))
    done
    WAS_status "$@"
}
+ if + [ -x $WASBIN/stopServer.sh ] + then + ocf_run $WASBIN/stopServer.sh -configFile $1 + else + WASPorts=`GetWASPorts $1` + kill `WASPIDs $WASPorts` + fi + if + WAS_status $1 + then + ocf_log "err" "WAS: $1 did not stop correctly" + #false + return $OCF_ERR_GENERIC + else + #true + return $OCF_SUCCESS + fi +} + +# +# Check if the port is valid +# +CheckPort() { + ocf_is_decimal "$1" && [ $1 -gt 0 ] +} + +WAS_validate_all() { + if [ -x $WASBIN/startServer.sh ]; then + # $arg should be config file + if [ ! -f "$arg" ]; then + ocf_log err "Configuration file [$arg] does not exist" + exit $OCF_ERR_ARGS + fi + + # $arg should specify a valid port number at the very least + local WASPorts=`GetWASPorts $arg` + if [ -z "$WASPorts" ]; then + ocf_log err "No port number specified in configuration file [$arg]" + exit $OCF_ERR_CONFIGURED + fi + + local port + local have_valid_port=false + for port in $WASPorts; do + if CheckPort $port; then + have_valid_port=true + break + fi + done + if [ "false" = "$have_valid_port" ]; then + ocf_log err "No valid port number specified in configuration file [$arg]" + exit $OCF_ERR_CONFIGURED + fi + + elif [ -x $WASBIN/startupServer.sh ]; then + # $arg should be port number + if CheckPort "$arg"; then + ocf_log err "Port number is required but [$arg] is not valid port number" + exit $OCF_ERR_ARGS + fi + else + # Do not know hot to validate_all + ocf_log warn "Do not know how to validate-all, assuming validation OK" + return $OCF_SUCCESS + fi +} +# +# 'main' starts here... +# + +if + ( [ $# -ne 1 ] ) +then + usage + exit $OCF_ERR_ARGS +fi + +# +# Supply default configuration parameter(s) +# + +if + ( [ -z $OCF_RESKEY_config ] && [ -z $OCF_RESKEY_port ] ) +then + if + [ -f $DEFAULT ] + then + arg=$DEFAULT + else + arg=$DEFAULT_WASPORTS + fi +elif + [ ! -z $OCF_RESKEY_config ] +then + arg=$OCF_RESKEY_config +else + arg=$OCF_RESKEY_port +fi + +if + [ ! -f $arg ] +then + case $arg in + [0-9]*) ;; # ignore port numbers... 
+ *) ocf_log "err" "WAS configuration file $arg does not exist!" + usage + exit $OCF_ERR_ARGS;; + esac +fi + + +# What kind of method was invoked? +case "$1" in + + meta-data) meta_data + exit $OCF_SUCCESS;; + + start) WAS_start $arg + exit $?;; + + stop) WAS_stop $arg + exit $?;; + + status) WAS_report_status $arg + exit $?;; + + monitor) WAS_monitor $arg + exit $?;; + + validate-all) WAS_validate_all $arg + exit $?;; + + methods) WAS_methods + exit $?;; + + usage) usage + exit $OCF_SUCCESS;; + + *) usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac diff --git a/heartbeat/WAS6 b/heartbeat/WAS6 new file mode 100755 index 0000000..9e18cd6 --- /dev/null +++ b/heartbeat/WAS6 @@ -0,0 +1,546 @@ +#!/bin/sh +# WAS6 +# +# Description: Manages a Websphere Application Server as an HA resource +# +# +# Author: Ru Xiang Min +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# Copyright: (C) 2006 International Business Machines China, Ltd., Inc. +# +# +# An example usage in /etc/ha.d/haresources: +# node1 10.0.0.170 WAS::/opt/IBM/WebSphere/AppServer/profiles/default/config/cells/Node01Cell/nodes/Node01/serverindex.xml +# +# See usage() function below for more details... +# +# OCF parameters are as below: +# OCF_RESKEY_profile +# (WAS profile name, used for the single server edition of WAS6) + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + +WAS_DIR=/opt/IBM/WebSphere/AppServer +if + [ ! 
-d $WAS_DIR ] +then + WAS_DIR=/usr/IBM/WebSphere/AppServer +fi +STARTTIME=300 # 5 minutes +DEFAULT_WASPORTS="9080" +# +# +WAS_BIN=$WAS_DIR/bin +DEFAULT=default +# +# Print usage message +# +usage() { + methods=`WAS_methods | grep -v methods` + methods=`echo $methods | tr ' ' '|'` + cat <<-END + usage: $0 ($methods) + + For the single server edition of WAS6, you have to set the following + enviroment virable: + OCF_RESKEY_profile + (WAS profile name) + + + $0 manages a Websphere Application Server 6(WAS6) as an HA resource + + The 'start' operation starts WAS6. + The 'stop' operation stops WAS6. + The 'status' operation reports whether WAS6 is running + The 'monitor' operation reports whether the WAS6 seems to be working + (httpd also needs to be working for this case) + The 'validate-all' operation reports whether the OCF instance parameter (OCF_RESKEY_profileName ) is valid + The 'methods' operation reports on the methods $0 supports + + This is known to work with the Single Server edition of Websphere. + + The default profile name for the single server edition is: + $DEFAULT + + The start and stop operations must be run as root. + + The status operation will report a pid of "-" for the + WAS root process using unless it is run as root. + + If you don't have xmllint on your system, parsing of WAS + configuration files is very primitive. + + We run servlet/snoop on the seventh transport port listed in + the config file for the "monitor" operation. + + END +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="WAS6" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for WAS6. It manages a Websphere Application Server (WAS6) as +an HA resource. +</longdesc> +<shortdesc lang="en">Manages a WebSphere Application Server 6 instance</shortdesc> + +<parameters> +<parameter name="profile" unique="0" required="0"> +<longdesc lang="en"> +The WAS profile name. 
+</longdesc> +<shortdesc lang="en">profile name</shortdesc> +<content type="string" default="$DEFAULT" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="300s" /> +<action name="stop" timeout="300s" /> +<action name="status" depth="0" timeout="30s" interval="10s" /> +<action name="monitor" depth="0" timeout="30s" interval="10s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +<action name="methods" timeout="5s" /> +</actions> +</resource-agent> +END +} + +# +# Reformat the XML document in a sort of canonical form +# if we can. If we don't have xmllint, we just cat it out +# and hope for the best ;-) +# +xmlcat() { + if + [ "X$XMLcat" = X ] + then + XMLcat=`which xmllint 2>/dev/null` + if + [ "X${XMLcat}" = X -o ! -x "${XMLcat}" ] + then + XMLcat=cat + else + XMLcat="$XMLcat --recover --format" + fi + fi + for j in "$@" + do + ${XMLcat} "$j" + done +} + +# +#This is a bit skanky, but it works anyway... +# +# It's not really skanky if we can find xmllint on the system, because it +# reformats tags so they are all on one line, which is all we we need... +# +# +# Get the numbers of the ports WAS should be listening on... +# +# If we don't have xmllint around, then the applicationserver and the +# port= specification have to be on the same line in the XML config file. +# +GetWASPorts() { + case $1 in + [0-9]*) echo "$1" | tr ',' '\012';; + *) + xmlcat ${WAS_DIR}/profiles/${WAS_PROFILE_NAME}/config/cells/${WAS_CELL}/nodes/${WAS_NODE}/serverindex.xml | + grep port= | + sed -e 's%.*port= *"* *%%' \ + -e 's%[^0-9][^0-9]*.*$%%' + # Delete up to port=, throw away optional quote and optional + # white space. + # Throw away everything after the first non-digit. + # This should leave us the port number all by itself... + esac +} + +# +# We assume that the seventh port listed in the serverindex.xml +# is the one we should run servlet/snoop on. 
#
# Print the port probed by the 'monitor' action: the seventh line of
# GetWASPorts output.  Prints nothing when fewer than seven ports are listed.
#
GetWASSnoopPort() {
  GetWASPorts "$@" | sed -n '7p'
}

#
# Return information on the processname/id for the WAS ports
#
# Arguments: the port numbers to look for.
# Output:    whatever netstat prints after the LISTEN column ("pid/java" is
#            the expected form), one line per matching listening socket.
#
WASPortInfo() {
  pat=""
  for j in $*
  do
    case $pat in
      "") pat="$j";;
      *)  pat="$pat|$j";;
    esac
  done
  # The leading ':' ties the match to the port field of the local address,
  # so that a configured port 80 does not also match a listener on 9080.
  netstat -ltnp 2>/dev/null | grep -E -i ":($pat) .*LISTEN" | sed 's%.*LISTEN *%%'
}

#
# Return the number of WAS ports which are open
#
CheckWASPortsInUse() {
  count=`WASPortInfo "$@" | wc -l`
  # Unquoted on purpose: drops the padding some wc implementations add.
  echo $count
}

#
# Return the pid(s) of the processes that have WAS ports open
#
WASPIDs() {
  WASPortInfo "$@" | sort -u | cut -f1 -d/
}

#
# The version of ps that returns all processes and their (long) args
# It's only used by WAS_procs, which isn't used for anything ;-)
#
ps_long() {
  ps axww
}


#
# The total set of WAS processes (single server only)
#
WAS_procs() {
  ps_long | grep -i "config=$1" | grep -i java | cut -d' ' -f1
}



#
# methods: What methods/operations do we support?
#
# 'monitor' is only advertised when the wget binary is available, because
# WAS_monitor depends on it.
#
WAS_methods() {
  # printf emits one method per line without depending on tab-indented
  # here-document text.
  printf '%s\n' start stop status methods validate-all meta-data usage
  if
    have_binary $WGET
  then
    echo " monitor"
  fi
}

#
# Return WAS status (silently)
#
# Succeeds when at least one of the ports reported by GetWASPorts for $1
# has a listener, fails otherwise.
#
WAS_status() {
  WASPorts=`GetWASPorts $1`
  PortsInUse=`CheckWASPortsInUse $WASPorts`
  case $PortsInUse in
    0) false;;
    *) true;;
  esac
}

#
# Report on WAS status to stdout...
#
# Returns $OCF_NOT_RUNNING when no port is in use, $OCF_SUCCESS otherwise
# (also when only some of the ports are being listened on).
#
WAS_report_status() {
  WASPorts=`GetWASPorts $1`
  PortCount=`echo $WASPorts | wc -w`
  # Re-echo unquoted to strip any padding wc put around the number.
  PortCount=`echo $PortCount`
  PortsInUse=`CheckWASPortsInUse $WASPorts`
  case $PortsInUse in
    0) ocf_log debug "WAS: server $1 is stopped."; return $OCF_NOT_RUNNING;;
    *)
       pids=`WASPIDs $WASPorts`
       if
         [ $PortsInUse -ge $PortCount ]
       then
         ocf_log debug "WAS: server $1 is running (pid" $pids "et al)."
       else
         ocf_log debug "WAS: server $1 is running (pid $pids et al) but not listening on all ports."
       fi
       return $OCF_SUCCESS;;
  esac
}

#
# Monitor WAS - does it really seem to be working?
#
# For this we invoke the snoop applet via wget.
#
# This is actually faster than WAS_status above...
#
# Returns 0 when wget succeeds and the fetched page echoes a Wget
# user-agent, $OCF_ERR_GENERIC otherwise.
#
WAS_monitor() {
  # Remove the temporary file when the shell exits.
  trap '[ -z "$tmpfile" ] || rmtempfile "$tmpfile"' 0
  tmpfile=`maketempfile` || exit 1
  SnoopPort=`GetWASSnoopPort $1`
  if
    [ -z "$SnoopPort" ]
  then
    # Without this guard the URL would degrade to http://localhost:/snoop.
    ocf_log "err" "WAS: $1: cannot determine the snoop port"
    return $OCF_ERR_GENERIC
  fi
  output=`$WGET -nv -O"$tmpfile" "http://localhost:$SnoopPort/snoop" 2>&1`
  rc=$?
  if
    [ $rc -eq 0 ]
  then
    if
      grep -i 'user-agent.*Wget' "$tmpfile" >/dev/null
    then
      : OK
    else
      ocf_log "err" "WAS: $1: no user-agent from snoop application"
      rc=$OCF_ERR_GENERIC
    fi
  else
    ocf_log "err" "WAS: $1: wget failure: $output"
    rc=$OCF_ERR_GENERIC
  fi
  return $rc
}

#
# Start WAS instance
#
# $1 is the profile name.  Returns $OCF_SUCCESS once WAS_status reports the
# server as up, $OCF_ERR_INSTALLED when startServer.sh cannot be executed,
# and $OCF_ERR_GENERIC when the server does not come up within $STARTTIME
# polling rounds.
#
WAS_start() {
# Launch Arguments:
#       -nowait
#       -quiet
#       -logfile <filename>
#       -replacelog
#       -trace
#       -script [<script filename >] [-background]
#       -timeout <seconds>
#       -statusport <portnumber>
#       -profileName <profile>
#       -help
  if
    [ ! -x $WAS_BIN/startServer.sh ]
  then
    # Fail fast instead of running an empty command and then polling for
    # the whole start timeout.
    ocf_log "err" "WAS: $WAS_BIN/startServer.sh is missing or not executable"
    return $OCF_ERR_INSTALLED
  fi

  # The exit status of the launcher is deliberately not used to decide the
  # outcome (see WAS_stop: the WAS scripts don't return good return codes);
  # the listening ports are what counts.
  ocf_run $WAS_BIN/startServer.sh server1 -profileName $1

  if
    WAS_wait_4_start $STARTTIME "$@"
  then
    return $OCF_SUCCESS
  fi
  ocf_log "err" "WAS server $1 did not start correctly"
  return $OCF_ERR_GENERIC
}

#
# Wait for WAS to actually start up.
#
# It seems to take between 30 and 60 seconds for it to
# start up on a trivial WAS instance.
#
# $1 is the maximum number of one-second polling rounds, the remaining
# arguments are handed to WAS_status.  The result is that of WAS_status.
#
WAS_wait_4_start() {
  max=$1
  retries=0
  shift
  while
    [ $retries -lt $max ]
  do
    if
      WAS_status "$@"
    then
      return $OCF_SUCCESS
    else
      sleep 1
    fi
    retries=`expr $retries + 1`
  done
  # One last look after the final sleep.
  WAS_status "$@"
}


#
# Shut down WAS
#
# $1 is the profile name.  Returns $OCF_SUCCESS when WAS_status no longer
# sees a listener afterwards, $OCF_ERR_GENERIC otherwise.
#
WAS_stop() {
  # They don't return good return codes...
  # And, they seem to allow anyone to stop WAS (!)
  if
    [ -x $WAS_BIN/stopServer.sh ]
  then
    ocf_run $WAS_BIN/stopServer.sh server1 -profileName $1
  else
    WASPorts=`GetWASPorts $1`
    pids=`WASPIDs $WASPorts`
    # No listener found means there is nothing to signal; calling kill
    # without arguments would only print a usage error.
    [ -z "$pids" ] || kill $pids
  fi
  if
    WAS_status $1
  then
    ocf_log "err" "WAS: $1 did not stop correctly"
    #false
    return $OCF_ERR_GENERIC
  else
    #true
    return $OCF_SUCCESS
  fi
}

#
# Check if the port is valid
#
# Succeeds for a decimal number in the TCP port range 1..65535.
#
CheckPort() {
  ocf_is_decimal "$1" && [ "$1" -gt 0 ] && [ "$1" -le 65535 ]
}

#
# Validate the configuration.
#
# Reads the global $arg set by the main section (profile name, or a port
# number for installations that only ship startupServer.sh).  Exits with
# $OCF_ERR_ARGS or $OCF_ERR_CONFIGURED on an invalid configuration and
# returns $OCF_SUCCESS otherwise.
#
WAS_validate_all() {
  if [ -x $WAS_BIN/startServer.sh ]; then
    # $arg should be profile name
    if [ ! -f ${WAS_DIR}/profiles/${arg}/config/cells/${WAS_CELL}/nodes/${WAS_NODE}/serverindex.xml ]; then
      ocf_log err "profile [$arg] does not exist"
      exit $OCF_ERR_ARGS
    fi

    # $arg should specify a valid port number at the very least
    local WASPorts=`GetWASPorts $arg`
    if [ -z "$WASPorts" ]; then
      ocf_log err "No port number specified in configuration file of profile [$arg]"
      exit $OCF_ERR_CONFIGURED
    fi

    local port
    local have_valid_port=false
    for port in $WASPorts; do
      if CheckPort $port; then
        have_valid_port=true
        break
      fi
    done
    if [ "false" = "$have_valid_port" ]; then
      ocf_log err "No valid port number specified in configuration file of profile [$arg]"
      exit $OCF_ERR_CONFIGURED
    fi

  elif [ -x $WAS_BIN/startupServer.sh ]; then
    # $arg should be port number; complain when it is NOT a valid one.
    if ! CheckPort "$arg"; then
      ocf_log err "Port number is required but [$arg] is not valid port number"
      exit $OCF_ERR_ARGS
    fi
  else
    # Do not know how to validate_all
    ocf_log warn "Do not know how to validate-all, assuming validation OK"
    return $OCF_SUCCESS
  fi
  return $OCF_SUCCESS
}
#
# 'main' starts here...
+# + +if + ( [ $# -ne 1 ] ) +then + usage + exit $OCF_ERR_ARGS +fi + +# These operations don't require OCF instance parameters to be set +case "$1" in + + meta-data) meta_data + exit $OCF_SUCCESS;; + + usage) usage + exit $OCF_SUCCESS;; + + methods) WAS_methods + exit $?;; + *);; +esac + + +# +# Supply default configuration parameter(s) +# + +if + [ -z $OCF_RESKEY_profile ] +then + arg=$DEFAULT +else + arg=$OCF_RESKEY_profile +fi + +if + [ ! -d ${WAS_DIR}/profiles/$arg ] +then + ocf_log "err" "WAS profile $arg does not exist!" + usage + exit $OCF_ERR_ARGS +fi + +WAS_PROFILE_NAME=$arg +if [ "${WAS_PROFILE_NAME:=}" != "" ]; then + WAS_PROFILE_FSDB_SCRIPT=${WAS_DIR}/properties/fsdb/${WAS_PROFILE_NAME}.sh +fi + +if [ "${WAS_PROFILE_FSDB_SCRIPT:=}" != "" ] && [ -f ${WAS_PROFILE_FSDB_SCRIPT} ]; then + . ${WAS_PROFILE_FSDB_SCRIPT} +fi + +if [ "${WAS_USER_SCRIPT:=}" != "" ]; then + . ${WAS_USER_SCRIPT} +fi + +# What kind of method was invoked? +case "$1" in + + start) WAS_start $arg + exit $?;; + + stop) WAS_stop $arg + exit $?;; + + status) WAS_report_status $arg + exit $?;; + + monitor) WAS_monitor $arg + exit $?;; + + validate-all) WAS_validate_all $arg + exit $?;; + + *) usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac diff --git a/heartbeat/WinPopup b/heartbeat/WinPopup new file mode 100755 index 0000000..b48f3b9 --- /dev/null +++ b/heartbeat/WinPopup @@ -0,0 +1,237 @@ +#!/bin/sh +# +# Resource script for sending WinPopups using smbclient +# derived from Alan Robertson's MailTo script +# +# Author: Sandro Poppi <spoppi@gmx.de> +# +# Description: sends WinPopups to a sysadmin's workstation +# whenever a takeover occurs. 
#
# OCF parameters are as below:
#	OCF_RESKEY_hostfile
#
#	where "hostfile" is a file containing the IPs/Workstation names
#	one per line to be sent WinPopups
#
# License:  GNU General Public License (GPL)

# Marker file: present while the pseudo resource counts as "started".
WINPOPUPFILE=${HA_VARRUN}/WinPopup
#######################################################################
# Initialization:

# Source function library.
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_hostfile_default="hosts"

: ${OCF_RESKEY_hostfile=${OCF_RESKEY_hostfile_default}}

#######################################################################

us=`uname -n`

# Print the one-line usage message.
usage() {
  echo "Usage: $0 {start|stop|status|monitor|validate-all|meta-data}"
}

# Print the OCF meta-data XML describing this agent.
meta_data() {
  cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="WinPopup" version="1.0">
<version>1.0</version>

<longdesc lang="en">
Resource script for WinPopup. It sends WinPopups message to a 
sysadmin's workstation whenever a takeover occurs.
</longdesc>
<shortdesc lang="en">Sends an SMB notification message to selected hosts</shortdesc>

<parameters>
<parameter name="hostfile" unique="0" required="1">
<longdesc lang="en">
The file containing the hosts to send WinPopup messages to.
</longdesc>
<shortdesc lang="en">Host file</shortdesc>
<content type="string" default="${OCF_RESKEY_hostfile_default}" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="30s" />
<action name="stop" timeout="30s" />
<action name="status" depth="0" timeout="10s" interval="10s" />
<action name="monitor" depth="0" timeout="10s" interval="10s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}

#
# Send one message to every host listed in $hostfile.
#
# $1 is the subject, any further arguments are appended to the message.
# Returns $OCF_ERR_ARGS when the host file is missing or empty; otherwise
# the status of the loop, i.e. of the last smbclient invocation.
#
sendWinPopup() {
  # if workstation file exists and is not zero
  if [ -s "$hostfile" ] ; then
    subject=$1
    shift

    for i in `cat "$hostfile"` ; do
      echo "$subject $*" | smbclient -M "$i" >/dev/null 2>&1
    done
  else
    ocf_log err "Workstation file $hostfile missing or corrupt!"
    return $OCF_ERR_ARGS
  fi

  return $?
}

# Print $1 when it is at least two characters long, "Resource Group" otherwise.
SubjectLine() {
  case $1 in
    ??*) echo $1;;
    *)   echo "Resource Group";;
  esac
}


#
# start: announce the takeover and create the marker file.
# Returns the status of touch, or the failure status of sendWinPopup.
#
WinPopupStart() {

  Subject="`SubjectLine $2` Takeover in progress on $us"

  if sendWinPopup "$Subject" $1; then
    touch $WINPOPUPFILE
    return $?
  else
    # $? still holds the failure status of sendWinPopup here.
    return $?
  fi

}

#
# stop: announce the fail-back and remove the marker file.
# Returns the status of rm, or the failure status of sendWinPopup.
#
WinPopupStop () {
  Subject="`SubjectLine $2` Reestablishing original master connection in progress on $us"

  if sendWinPopup "$Subject" $1; then
    rm -f $WINPOPUPFILE
    return $?
  else
    # $? still holds the failure status of sendWinPopup here.
    return $?
  fi

}

#
# status/monitor: report "running" exactly when the marker file exists.
#
WinPopupStatus () {
  ocf_log warn "Don't stat/monitor me! WinPopup is a pseudo resource agent, so the status reported may be incorrect"
  if [ -f $WINPOPUPFILE ]; then
    echo "running"
    return $OCF_SUCCESS
  else
    echo "stopped"
    return $OCF_NOT_RUNNING
  fi
}

# A not reliable IP address checking function, which only picks up those _obvious_ violations...
#
# It accepts IPv4 address in dotted quad notation, for example "192.168.1.1"
#
# 100% confidence whenever it reports "negative",
# but may get false "positive" answer.
#
# Returns 0 for a candidate address, 1 otherwise.
#
CheckIP() {
  ip="$1"
  case $ip in
    *[!0-9.]*)  #got invalid char
      return 1;;
    .*|*.)      #begin or end by ".", which is invalid
      return 1;;
    *..*)       #consecutive ".", which is invalid
      return 1;;
    *.*.*.*.*)  #four decimal dots, which is too many
      return 1;;
    *.*.*.*)    #exactly three decimal dots, candidate, evaluate each field
      local IFS=.
      set -- $ip
      # Every field must fit into one octet.  An 'if' without an 'else'
      # exits with 0 when its condition fails, so the test result is
      # evaluated directly; any failure (including a number too large for
      # the shell to compare) rejects the address.
      if [ "$1" -le 255 ] && [ "$2" -le 255 ] && [ "$3" -le 255 ] && [ "$4" -le 255 ]
      then
        return 0
      fi
      return 1;;
    *)          #less than three decimal dots
      return 1;;
  esac
}

#
# validate-all: the host file must exist, be non-empty and name at least
# one usable host.  Returns $OCF_SUCCESS, $OCF_ERR_ARGS (no host file) or
# $OCF_ERR_CONFIGURED (no valid host in it).
#
WinPopupValidateAll () {
  if [ ! -s "$hostfile" ] ; then
    ocf_log err "Workstation file $hostfile missing or corrupt!"
    return $OCF_ERR_ARGS
  fi

  # What kind of hostfiles are valid?
  # We stick to the definition that, a hostfile is valid if and only if it
  # contains at least one valid host to send WinPopup message to.

  for host in `cat "$hostfile"`; do
    # Only trust the lookup when nmblookup exists: otherwise the shell's
    # "command not found" output would not match the failure pattern and
    # every entry would pass as a NetBIOS name.
    if have_binary nmblookup; then
      nmblookup "$host" 2>&1 | grep -q "failed to find name $host\>"
      if [ $? -ne 0 ]; then
        return $OCF_SUCCESS
      fi
    fi
    # $host is not a netbios name, an IP address maybe?
    if CheckIP "$host"; then
      return $OCF_SUCCESS
    fi
  done

  ocf_log err "Workstation file $hostfile contains no valid host!"
  return $OCF_ERR_CONFIGURED
}

if
  ( [ $# -ne 1 ] )
then
  usage
  exit $OCF_ERR_ARGS
fi

# See how the environment variables were set.
hostfile=${OCF_RESKEY_hostfile}

case "$1" in
  meta-data)
    meta_data
    exit $OCF_SUCCESS
    ;;
  start)
    WinPopupStart
    ;;
  stop)
    WinPopupStop
    ;;

  # Not quite sure what to do with this one...
  status|monitor)
    WinPopupStatus
    ;;
  validate-all)
    WinPopupValidateAll
    ;;
  usage)
    usage
    exit $OCF_SUCCESS
    ;;
  *)
    usage
    exit $OCF_ERR_UNIMPLEMENTED
    ;;
esac

# Actions that did not exit above finish with the status of their handler.
exit $?
diff --git a/heartbeat/Xen b/heartbeat/Xen new file mode 100755 index 0000000..1ef20d7 --- /dev/null +++ b/heartbeat/Xen @@ -0,0 +1,653 @@ +#!/bin/sh +# +# +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# +# Resource Agent for the Xen Hypervisor. +# Manages Xen virtual machine instances by +# mapping cluster resource start and stop, +# to Xen create and shutdown, respectively. +# +# usage: $0 {start|stop|status|monitor|meta-data} +# +# OCF parameters are as below: +# OCF_RESKEY_xmfile +# Absolute path to the Xen control file, +# for this virtual machine. +# OCF_RESKEY_allow_mem_management +# Change memory usage on start/stop/migration +# of virtual machine +# OCF_RESKEY_reserved_Dom0_memory +# minimum memory reserved for domain 0 +# OCF_RESKEY_monitor_scripts +# scripts to monitor services within the +# virtual domain + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_xmfile_default="/etc/xen/vm/MyDomU" +OCF_RESKEY_name_default="" +OCF_RESKEY_shutdown_timeout_default="" +OCF_RESKEY_shutdown_acpi_default="0" +OCF_RESKEY_allow_mem_management_default="0" +OCF_RESKEY_node_ip_attribute_default="" +OCF_RESKEY_reserved_Dom0_memory_default="512" +OCF_RESKEY_autoset_utilization_cpu_default="false" +OCF_RESKEY_autoset_utilization_hv_memory_default="false" +OCF_RESKEY_monitor_scripts_default="" + +: ${OCF_RESKEY_xmfile=${OCF_RESKEY_xmfile_default}} +: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}} +: ${OCF_RESKEY_shutdown_timeout=${OCF_RESKEY_shutdown_timeout_default}} +: ${OCF_RESKEY_shutdown_acpi=${OCF_RESKEY_shutdown_acpi_default}} +: ${OCF_RESKEY_allow_mem_management=${OCF_RESKEY_allow_mem_management_default}} +: ${OCF_RESKEY_node_ip_attribute=${OCF_RESKEY_node_ip_attribute_default}} +: ${OCF_RESKEY_reserved_Dom0_memory=${OCF_RESKEY_reserved_Dom0_memory_default}} +: ${OCF_RESKEY_autoset_utilization_cpu=${OCF_RESKEY_autoset_utilization_cpu_default}} +: ${OCF_RESKEY_autoset_utilization_hv_memory=${OCF_RESKEY_autoset_utilization_hv_memory_default}} +: ${OCF_RESKEY_monitor_scripts=${OCF_RESKEY_monitor_scripts_default}} + +####################################################################### + +usage() { + cat <<-END + usage: $0 {start|stop|status|monitor|meta-data|validate-all} +END +} + + +# prefer xl +xentool=$(which xl 2> /dev/null || which xm) + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="Xen" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource Agent for the Xen Hypervisor. +Manages Xen virtual machine instances by mapping cluster resource +start and stop, to Xen create and shutdown, respectively. + +A note on names + +We will try to extract the name from the config file (the xmfile +attribute). If you use a simple assignment statement, then you +should be fine. 
Otherwise, if there's some python acrobacy +involved such as dynamically assigning names depending on other +variables, and we will try to detect this, then please set the +name attribute. You should also do that if there is any chance of +a pathological situation where a config file might be missing, +for example if it resides on a shared storage. If all fails, we +finally fall back to the instance id to preserve backward +compatibility. + +Para-virtualized guests can also be migrated by enabling the +meta_attribute allow-migrate. + +</longdesc> +<shortdesc lang="en">Manages Xen unprivileged domains (DomUs)</shortdesc> + +<parameters> + +<parameter name="xmfile" unique="0" required="1"> +<longdesc lang="en"> +Absolute path to the Xen control file, +for this virtual machine. +</longdesc> +<shortdesc lang="en">Xen control file</shortdesc> +<content type="string" default="${OCF_RESKEY_xmfile_default}" /> +</parameter> +<parameter name="name" unique="0" required="0"> +<longdesc lang="en"> +Name of the virtual machine. +</longdesc> +<shortdesc lang="en">Xen DomU name</shortdesc> +<content type="string" default="${OCF_RESKEY_name_default}" /> +</parameter> +<parameter name="shutdown_timeout"> +<longdesc lang="en"> +The Xen agent will first try an orderly shutdown using xl shutdown. +Should this not succeed within this timeout, the agent will escalate to +xl destroy, forcibly killing the node. + +If this is not set, it will default to two-third of the stop action +timeout. + +Setting this value to 0 forces an immediate destroy. + +</longdesc> +<shortdesc lang="en">Shutdown escalation timeout</shortdesc> +<content type="string" default="${OCF_RESKEY_shutdown_timeout_default}" /> +</parameter> +<parameter name="shutdown_acpi" unique="0" required="0"> +<longdesc lang="en"> +Handle shutdown by simulating an ACPI power button event. +Enable this to allow graceful shutdown for HVM domains +without installed PV drivers. 
+</longdesc> +<shortdesc lang="en">Simulate power button event on shutdown</shortdesc> +<content type="boolean" default="${OCF_RESKEY_shutdown_acpi_default}" /> +</parameter> +<parameter name="allow_mem_management" unique="0" required="0"> +<longdesc lang="en"> +This parameter enables dynamic adjustment of memory for start +and stop actions used for Dom0 and the DomUs. The default is +to not adjust memory dynamically. +</longdesc> +<shortdesc lang="en">Use dynamic memory management</shortdesc> +<content type="boolean" default="${OCF_RESKEY_allow_mem_management_default}" /> +</parameter> + +<parameter name="node_ip_attribute"> +<longdesc lang="en"> +In case of a live migration, the system will default to using the IP +address associated with the hostname via DNS or /etc/hosts. + +This parameter allows you to specify a node attribute that will be +queried instead for the target node, overriding the IP address. This +allows you to use a dedicated network for live migration traffic to a +specific node. + +Warning: make very sure the IP address does point to the right node. Or +else the live migration will end up somewhere else, greatly confusing +the cluster and causing havoc. +</longdesc> +<shortdesc lang="en">Node attribute containing target IP address</shortdesc> +<content type="string" default="${OCF_RESKEY_node_ip_attribute_default}" /> +</parameter> + +<parameter name="reserved_Dom0_memory" unique="0" required="0"> +<longdesc lang="en"> +In case memory management is used, this parameter +defines the minimum amount of memory to be reserved +for the dom0. The default minimum memory is 512MB. 
+</longdesc> +<shortdesc lang="en">Minimum Dom0 memory</shortdesc> +<content type="string" default="${OCF_RESKEY_reserved_Dom0_memory_default}" /> +</parameter> + +<parameter name="autoset_utilization_cpu" unique="0" required="0"> +<longdesc lang="en"> +If set true, the agent will detect the number of domain's vCPUs from Xen, and put it +into the CPU utilization of the resource when the monitor is executed. +Before enabling make sure node utilization is also set (using NodeUtilization +agent or manually) or the resource might not be able to start anywhere. +</longdesc> +<shortdesc lang="en">Enable auto-setting the CPU utilization of the resource</shortdesc> +<content type="boolean" default="${OCF_RESKEY_autoset_utilization_cpu_default}" /> +</parameter> + +<parameter name="autoset_utilization_hv_memory" unique="0" required="0"> +<longdesc lang="en"> +If set true, the agent will detect the number of memory from Xen, and put it +into the hv_memory utilization of the resource when the monitor is executed. +Before enabling make sure node utilization is also set (using NodeUtilization +agent or manually) or the resource might not be able to start anywhere. +</longdesc> +<shortdesc lang="en">Enable auto-setting the hv_memory utilization of the resource</shortdesc> +<content type="boolean" default="${OCF_RESKEY_autoset_utilization_hv_memory_default}" /> +</parameter> + +<parameter name="monitor_scripts" unique="0" required="0"> +<longdesc lang="en"> +To additionally monitor services within the unprivileged domain, +add this parameter with a list of scripts to monitor. 
+</longdesc> +<shortdesc lang="en">list of space separated monitor scripts</shortdesc> +<content type="string" default="${OCF_RESKEY_monitor_scripts_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="60s" /> +<action name="stop" timeout="40s" /> +<action name="migrate_from" timeout="120s" /> +<action name="migrate_to" timeout="120s" /> +<action name="monitor" depth="0" timeout="30s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +END +} + +Xen_Status() { + if expr "x$xentool" : "x.*xl" >/dev/null; then + $xentool list $1 >/dev/null 2>&1 + if [ $? -ne 0 ]; then + return $OCF_NOT_RUNNING + else + return $OCF_SUCCESS + fi + fi + if have_binary xen-list; then + xen-list $1 2>/dev/null | grep -qs "State.*[-r][-b][-p]--" 2>/dev/null + if [ $? -ne 0 ]; then + return $OCF_NOT_RUNNING + else + return $OCF_SUCCESS + fi + fi + STATUS=`$xentool list --long $1 2>/dev/null | grep status 2>/dev/null` + if [ "X${STATUS}" != "X" ]; then + # we have Xen 3.0.4 or higher + STATUS_NOSPACES=`echo "$STATUS" | awk '{ print $1,$2}'` + if [ "$STATUS_NOSPACES" = "(status 2)" -o "$STATUS_NOSPACES" = "(status 1)" ]; then + return $OCF_SUCCESS + else + return $OCF_NOT_RUNNING + fi + else + # we have Xen 3.0.3 or lower + STATUS=`$xentool list --long $1 2>/dev/null | grep state 2>/dev/null` + echo "${STATUS}" | grep -qs "[-r][-b][-p]---" + if [ $? -ne 0 ]; then + return $OCF_NOT_RUNNING + else + return $OCF_SUCCESS + fi + fi +} + +# If the guest is rebooting, it may completely disappear from the +# list of defined guests, thus xl/xen-list would return with not +# running; apparently, this period lasts only for a second or +# two +# If a status returns not running, then test status +# again for 5 times (perhaps it'll show up) +Xen_Status_with_Retry() { + local rc cnt=5 + + Xen_Status $1 + rc=$? 
+ while [ $rc -eq $OCF_NOT_RUNNING -a $cnt -gt 0 ]; do + case "$__OCF_ACTION" in + stop) + ocf_log debug "domain $1 reported as not running, waiting $cnt seconds ..." + ;; + monitor) + ocf_log warn "domain $1 reported as not running, but it is expected to be running! Retrying for $cnt seconds ..." + ;; + *) : not reachable + ;; + esac + sleep 1 + Xen_Status $1 + rc=$? + cnt=$((cnt-1)) + done + return $rc +} + +set_util_attr() { + local attr=$1 val=$2 + local cval outp + + cval=$(crm_resource -Q -r $OCF_RESOURCE_INSTANCE -z -g $attr 2>/dev/null) + if [ $? -ne 0 ] && [ -z "$cval" ]; then + if crm_resource -Q -r $OCF_RESOURCE_INSTANCE -z -g $attr 2>&1 | grep -q "not connected"; then + ocf_log debug "Unable to get utilization attribute $attr: cib is not available" + return + fi + fi + + if [ "$cval" != "$val" ]; then + outp=$(crm_resource -r $OCF_RESOURCE_INSTANCE -z -p $attr -v $val 2>&1) || \ + ocf_log warn "Unable to set utilization attribute $attr: $outp" + fi +} + +Xen_Update_Utilization() { + local dom_status dom_cpu dom_mem + + dom_status=$($xentool list ${DOMAIN_NAME} | awk 'NR==2 {print $4, $3}') + + if ocf_is_true "$OCF_RESKEY_autoset_utilization_cpu"; then + dom_cpu=${dom_status% *} + test -n "$dom_cpu" && set_util_attr cpu $dom_cpu + fi + + if ocf_is_true "$OCF_RESKEY_autoset_utilization_hv_memory"; then + dom_mem=${dom_status#* } + test -n "$dom_mem" && set_util_attr hv_memory "$dom_mem" + fi +} + +Xen_Adjust_Memory() { + if ocf_is_true "${OCF_RESKEY_allow_mem_management}"; then + CNTNEW=$1 + RUNNING=`Xen_List_running` + RUNCNT=`Xen_Count_running` + MAXMEM=`Xen_Total_Memory` + if [ ${RUNCNT} -eq 0 -a ${CNTNEW} -eq 0 ]; then + RUNCNT=1 + fi + #NEWMEM=`echo "(${MAXMEM}-${OCF_RESKEY_reserved_Dom0_memory})/(${RUNCNT}+${CNTNEW})"|bc` + NEWMEM=$(( (${MAXMEM} - ${OCF_RESKEY_reserved_Dom0_memory}) / (${RUNCNT} + ${CNTNEW} ) )) + # do not rely on ballooning add dom0_mem=512 instead to force memory for dom0 + #$xentool mem-set Domain-0 
${OCF_RESKEY_reserved_Dom0_memory} + for DOM in ${RUNNING}; do + $xentool mem-set ${DOM} ${NEWMEM} + done + ocf_log info "Adjusted memory to: $NEWMEM, for the following $RUNCNT domains: $RUNNING" + fi +} + +Xen_List_all() { + $xentool list | grep -v -e "Name" -e "Domain-0" | awk '{print $1}' +} +Xen_List_running() { + ALL_DOMS=`Xen_List_all` + for DOM in ${ALL_DOMS}; do + if Xen_Status $DOM; then + echo "${DOM} " + fi + done +} +Xen_Count_running() { + Xen_List_running | wc -w +} + +Xen_Monitor() { + if ocf_is_probe; then + Xen_Status ${DOMAIN_NAME} + else + Xen_Status_with_Retry ${DOMAIN_NAME} + fi + if [ $? -eq ${OCF_NOT_RUNNING} ]; then + ocf_is_probe || + ocf_log err "Xen domain $DOMAIN_NAME stopped" + return ${OCF_NOT_RUNNING} + fi + if ocf_is_true "$OCF_RESKEY_autoset_utilization_cpu" || \ + ocf_is_true "$OCF_RESKEY_autoset_utilization_hv_memory" + then + Xen_Update_Utilization + fi + if [ "X${OCF_RESKEY_monitor_scripts}" = "X" ]; then + return ${OCF_SUCCESS} + fi + for SCRIPT in ${OCF_RESKEY_monitor_scripts}; do + $SCRIPT + if [ $? -ne 0 ]; then + return ${OCF_ERR_GENERIC} + fi + done + return ${OCF_SUCCESS} +} + +Xen_Total_Memory() { + $xentool info | grep "^total_memory" | awk '{print $3}' +} + +Xen_Start() { + if Xen_Status ${DOMAIN_NAME}; then + ocf_log info "Xen domain $DOMAIN_NAME already running." + return $OCF_SUCCESS + fi + + if [ ! -f "${OCF_RESKEY_xmfile}" ]; then + ocf_log err "Config file ${OCF_RESKEY_xmfile} for $DOMAIN_NAME does not exist." 
+ return $OCF_ERR_INSTALLED + fi + + if ocf_is_true "${OCF_RESKEY_allow_mem_management}"; then + Xen_Adjust_Memory 1 + ocf_log info "New memory for virtual domains: ${NEWMEM}" + sed -i -e "/^memory=/ s/^memory=.*/memory=${NEWMEM}/" ${OCF_RESKEY_xmfile} + $xentool mem-set ${DOMAIN_NAME} ${NEWMEM} + fi + + # the latest xl management tool is squeamish about some + # characters in a name (the vm name is xen-f): + # /etc/xen/vm/xen-f:15: config parsing error near `xen': + # syntax error, unexpected IDENT, expecting STRING or NUMBER + # or '[' + # /etc/xen/vm/xen-f:15: config parsing error near `-f': lexical error + # + # the older xm management tool cannot digest quotes (see + # https://developerbugs.linuxfoundation.org/show_bug.cgi?id=2671) + # + # hence the following + if expr "x$xentool" : "x.*xl" >/dev/null; then + $xentool create ${OCF_RESKEY_xmfile} name=\"$DOMAIN_NAME\" + else + $xentool create ${OCF_RESKEY_xmfile} name="$DOMAIN_NAME" + fi + rc=$? + + if [ $rc -ne 0 ]; then + return $OCF_ERR_GENERIC + else + if ocf_is_true "${OCF_RESKEY_allow_mem_management}"; then + $xentool mem-set ${DOMAIN_NAME} ${NEWMEM} + fi + fi + while sleep 1; do + Xen_Monitor && return $OCF_SUCCESS + done +} + +xen_domain_stop() { + local dom=$1 + local timeout + + if [ -n "$OCF_RESKEY_shutdown_timeout" ]; then + timeout=$OCF_RESKEY_shutdown_timeout + elif [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + # Allow 2/3 of the action timeout for the orderly shutdown + # (The origin unit is ms, hence the conversion) + timeout=$((OCF_RESKEY_CRM_meta_timeout/1500)) + else + timeout=60 + fi + + if [ "$timeout" -gt 0 ]; then + ocf_log info "Xen domain $dom will be stopped (timeout: ${timeout}s)" + if ocf_is_true "${OCF_RESKEY_shutdown_acpi}"; then + $xentool trigger $dom power + else + $xentool shutdown $dom + fi + + while Xen_Status $dom && [ "$timeout" -gt 0 ]; do + ocf_log debug "$dom still not stopped. Waiting..." 
+ timeout=$((timeout-1)) + sleep 1 + done + fi + + if [ "$timeout" -eq 0 ]; then + while Xen_Status $dom; do + ocf_log warn "Xen domain $dom will be destroyed!" + $xenkill $dom + sleep 1 + done + # Note: This does not give up. stop isn't allowed to to fail. + # If $xentool destroy fails, stop will eventually timeout. + # This is the correct behaviour. + fi + + ocf_log info "Xen domain $dom stopped." +} + +Xen_Stop() { + local vm + if Xen_Status_with_Retry ${DOMAIN_NAME}; then + vm=${DOMAIN_NAME} + elif Xen_Status migrating-${DOMAIN_NAME}; then + ocf_log info "Xen domain $DOMAIN_NAME is migrating" + vm="migrating-${DOMAIN_NAME}" + else + ocf_log info "Xen domain $DOMAIN_NAME already stopped." + fi + + if [ "$vm" ]; then + xen_domain_stop $vm + else + # It is supposed to be gone, but there have been situations where + # $xentool list / xen-list showed it as stopped but it was still + # instantiated. Nuke it once more to make sure: + $xenkill ${DOMAIN_NAME} + fi + + Xen_Adjust_Memory 0 + return $OCF_SUCCESS +} + +Xen_Migrate_To() { + target_node="$OCF_RESKEY_CRM_meta_migrate_target" + target_attr="$OCF_RESKEY_node_ip_attribute" + target_addr="$target_node" + + if Xen_Status ${DOMAIN_NAME}; then + ocf_log info "$DOMAIN_NAME: Starting $xentool migrate to $target_node" + + if [ -n "$target_attr" ]; then + nodevalue=`crm_attribute --type nodes --node $target_node -n $target_attr -G -q` + if [ -n "${nodevalue}" -a "${nodevalue}" != "(null)" ]; then + target_addr="$nodevalue" + ocf_log info "$DOMAIN_NAME: $target_node is using address $target_addr" + fi + fi + + if expr "x$xentool" : "x.*xm" >/dev/null; then + $xentool migrate --live $DOMAIN_NAME $target_addr + else + $xentool migrate $DOMAIN_NAME $target_addr + fi + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "$DOMAIN_NAME: $xentool migrate to $target_node failed: $rc" + return $OCF_ERR_GENERIC + else + Xen_Adjust_Memory 0 + ocf_log info "$DOMAIN_NAME: $xentool migrate to $target_node succeeded." 
+ return $OCF_SUCCESS + fi + else + ocf_log err "$DOMAIN_NAME: migrate_to: Not active locally!" + return $OCF_ERR_GENERIC + fi +} + +Xen_Migrate_From() { + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + # Allow 2/3 of the action timeout for status to stabilize + # (The origin unit is ms, hence the conversion) + timeout=$((OCF_RESKEY_CRM_meta_timeout/1500)) + else + timeout=10 # should be plenty + fi + + while ! Xen_Status ${DOMAIN_NAME} && [ $timeout -gt 0 ]; do + ocf_log debug "$DOMAIN_NAME: Not yet active locally, waiting (timeout: ${timeout}s)" + timeout=$((timeout-1)) + sleep 1 + done + + if Xen_Status ${DOMAIN_NAME}; then + Xen_Adjust_Memory 0 + ocf_log info "$DOMAIN_NAME: Active locally, migration successful" + return $OCF_SUCCESS + else + ocf_log err "$DOMAIN_NAME: Not active locally, migration failed!" + return $OCF_ERR_GENERIC + fi +} + +Xen_Validate_All() { + return $OCF_SUCCESS +} + +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi + +case $1 in + meta-data) + meta_data + exit $OCF_SUCCESS + ;; + usage) + usage + exit $OCF_SUCCESS + ;; +esac + +# the name business: +# +# 1. use the name attribute, or +# 2. find the name in the config file (if it exists) and use that +# unless it contains funny characters such as '%' or space, or +# 3. 
use the OCF_RESOURCE_INSTANCE + +if [ x"${OCF_RESKEY_name}" != x ]; then + DOMAIN_NAME="${OCF_RESKEY_name}" +else + if [ -f "${OCF_RESKEY_xmfile}" ]; then + DOMAIN_NAME=`awk '$1~/^name(=|$)/{print}' ${OCF_RESKEY_xmfile} | sed 's/.*=[[:space:]]*//' | tr -d "[\"']"` + if echo "$DOMAIN_NAME" | grep -qs '[%[:space:]]'; then + DOMAIN_NAME="" + fi + fi + DOMAIN_NAME=${DOMAIN_NAME:-${OCF_RESOURCE_INSTANCE}} +fi + +for binary in sed awk; do + check_binary $binary +done + +if have_binary xen-destroy ; then + xenkill="xen-destroy" +else + xenkill="$xentool destroy" +fi + +if [ -n "$OCF_RESKEY_shutdown_timeout" ]; then + ocf_is_decimal "$OCF_RESKEY_shutdown_timeout" || { + ocf_log err "shutdown_timeout must be a number" + exit $OCF_ERR_CONFIGURED + } +fi + +case $1 in + start) + Xen_Start + ;; + stop) + Xen_Stop + ;; + migrate_to) + Xen_Migrate_To + ;; + migrate_from) + Xen_Migrate_From + ;; + monitor) + Xen_Monitor + ;; + status) + Xen_Status ${DOMAIN_NAME} + ;; + validate-all) + Xen_Validate_All + ;; + *) + usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +exit $? diff --git a/heartbeat/Xinetd b/heartbeat/Xinetd new file mode 100755 index 0000000..778e18c --- /dev/null +++ b/heartbeat/Xinetd @@ -0,0 +1,256 @@ +#!/bin/sh +# +# Startup/shutdown script for services managed by xinetd. +# +# Copyright (C) 2003 Charlie Brooks +# Copyright (C) 2011 Ulrich Windl +# +# WARNING: Tested ONLY on SLES11 SP1 at this time. +# +# Author: Charlie Brooks <ha@HBCS.Org> +# Description: given parameters of a service name and start|stop|status, +# will enable, disable or report on a specified xinetd service +# Config: all services must have a descriptor file in /etc/xinetd.d +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# +# OCF parameters are as below: +# OCF_RESKEY_service + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# Parameter defaults
+
+OCF_RESKEY_service_default=""
+
+: ${OCF_RESKEY_service=${OCF_RESKEY_service_default}}
+
+# Name of the managed service and the xinetd descriptor file which
+# carries its "disable = yes|no" setting.
+service=$OCF_RESKEY_service
+SVCDEF=/etc/xinetd.d/$service
+
+#######################################################################
+
+# Print the OCF meta-data (XML) of this resource agent on stdout.
+meta_data() {
+	cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="Xinetd" version="1.0">
+<version>1.0</version>
+
+<longdesc lang="en">
+Resource script for Xinetd. It starts/stops services managed
+by xinetd by enabling or disabling them in the configuration file.
+
+The xinetd daemon itself must be running: we are not going to start or
+stop it ourselves.
+
+All services should have a line saying either "disable=yes" or "disable=no".
+The script just changes those settings before reloading xinetd.
+
+Important: in case the services managed by the cluster are the
+only ones enabled, you should specify the -stayalive option for
+xinetd or it will exit on Heartbeat stop. Alternatively, you may
+enable some internal service such as echo.
+</longdesc>
+<shortdesc lang="en">Manages a service of Xinetd</shortdesc>
+
+<parameters>
+<parameter name="service" unique="0" required="1">
+<longdesc lang="en">
+The name of the service managed by xinetd.
+</longdesc>
+<shortdesc lang="en">service name</shortdesc>
+<content type="string" default="${OCF_RESKEY_service_default}" />
+</parameter>
+</parameters>
+
+<actions>
+<action name="start" timeout="20s" />
+<action name="stop" timeout="20s" />
+<action name="restart" timeout="20s" />
+<action name="status" depth="0" timeout="10s" interval="10s" />
+<action name="monitor" depth="0" timeout="10s" interval="10s" />
+<action name="validate-all" timeout="5s" />
+<action name="meta-data" timeout="5s" />
+</actions>
+</resource-agent>
+END
+}
+
+# Print the PID of every process whose command name is exactly
+# "xinetd" (nothing is printed if there is no such process).
+get_xinetd_pid() {
+	ps -e -o pid,comm | $AWK '$2 == "xinetd" { print $1 }'
+}
+
+# force xinetd to reload the service descriptions
+# Exits with OCF_ERR_GENERIC if xinetd is not running or cannot be
+# signalled.
+hup_inetd () {
+	# don't rely on the pid file, but lookup xinetd in the list of
+	# processes
+	local pid
+	pid=`get_xinetd_pid`
+	if [ "$pid" ]; then
+		# $pid is intentionally unquoted: there may be several PIDs
+		if kill -s HUP $pid; then
+			ocf_log info "asked xinetd to reload by sending SIGHUP to process $pid!"
+		else
+			ocf_exit_reason "could not send SIGHUP to process $pid!"
+			exit $OCF_ERR_GENERIC
+		fi
+	else
+		ocf_exit_reason "xinetd process not found!"
+		exit $OCF_ERR_GENERIC
+	fi
+}
+
+# check "disable = X", printing X
+#   $1 - service descriptor file
+# The setting may be indented and padded with spaces or tabs, hence
+# the [[:space:]] classes. The result goes straight to stdout, which
+# avoids the word splitting an unquoted "local var=$(...)" is subject
+# to in some shells.
+check_service()
+{
+	ocf_log "info" "checking \"disable\" in $1"
+	sed -nre 's/^[[:space:]]*disable[[:space:]]*=[[:space:]]*([^[:space:]]+)[#[:space:]]*/\1/p' "$1"
+}
+
+# change "disable = X" to desired value
+#   $1 - new value (yes or no)
+#   $2 - service descriptor file
+# Returns 0 if the file could be edited, 1 otherwise.
+change_service()
+{
+	ocf_log "info" "setting \"disable = $1\" in $2"
+	if ! sed -i -re 's/^([[:space:]]*disable[[:space:]]*=[[:space:]]*)([^[:space:]]+)([#[:space:]]*)/\1'"$1"'\3/' "$2"
+	then
+		ocf_log "err" "could not edit $2"
+		return 1
+	fi
+	return 0
+}
+
+# Report the state of the service according to its descriptor file.
+# Prints running, stopped or unknown and returns the matching OCF
+# code; a missing "disable" setting counts as "disable = no".
+xup_status () {
+	local disabled
+	disabled="$(check_service "$SVCDEF")"
+	if [ "${disabled:=no}" = no ]; then
+		echo running
+		return $OCF_SUCCESS
+	elif [ "$disabled" = yes ]; then
+		echo stopped
+		return $OCF_NOT_RUNNING
+	else
+		echo unknown
+		return $OCF_ERR_CONFIGURED
+	fi
+}
+
+# Enable the service in its descriptor file and ask xinetd to reload.
+# Exits right away with OCF_SUCCESS if the service is already enabled.
+xup_start () {
+	if [ "running" = "`xup_status`" ]; then
+		ocf_log info "service $service already started"
+		exit $OCF_SUCCESS
+	fi
+	ocf_log "info" "enabling in $SVCDEF"
+	if change_service "no" "$SVCDEF"; then
+		hup_inetd
+	fi
+}
+
+# Disable the service in its descriptor file and ask xinetd to reload.
+# Exits right away with OCF_SUCCESS if the service is already disabled.
+xup_stop () {
+	if [ "stopped" = "`xup_status`" ]; then
+		ocf_log info "service $service already stopped"
+		exit $OCF_SUCCESS
+	fi
+	ocf_log "info" "disabling in $SVCDEF"
+	if change_service "yes" "$SVCDEF"; then
+		hup_inetd
+	fi
+}
+
+# Print the list of supported actions.
+xup_usage () {
+	echo "Usage: $0 {start|stop|restart|status|monitor|validate-all|meta-data}"
+	return 0
+}
+
+# Check that the descriptor file of the service exists.
+xup_validate_all () {
+	if [ ! -f "$SVCDEF" ]; then
+		ocf_exit_reason "service $service missing $SVCDEF"
+		return $OCF_ERR_INSTALLED
+	fi
+	return $OCF_SUCCESS
+}
+
+if [ $# -ne 1 ]; then
+	xup_usage
+	exit $OCF_ERR_ARGS
+fi
+
+# These operations do not require OCF instance parameters to be set
+case "$1" in
+	meta-data)
+		meta_data
+		exit $OCF_SUCCESS
+		;;
+	usage)
+		xup_usage
+		exit $OCF_SUCCESS
+		;;
+esac
+
+if [ -z "$OCF_RESKEY_service" ]; then
+	ocf_exit_reason "please define \"service\" parameter"
+	if [ "$1" = "start" ]; then
+		exit $OCF_ERR_CONFIGURED
+	else
+		exit $OCF_NOT_RUNNING
+	fi
+fi
+
+# Is xinetd running at all
+if [ -z "`get_xinetd_pid`" ]; then
+	case "$1" in
+	stop)	exit $OCF_SUCCESS;;
+	start)
+		ocf_exit_reason "xinetd not running, we manage just xinetd services, not the daemon itself"
+		exit $OCF_ERR_INSTALLED
+		;;
+	status|monitor)
+		if ocf_is_probe; then
+			exit $OCF_NOT_RUNNING
+		else
+			ocf_exit_reason "xinetd stopped"
+			exit $OCF_ERR_GENERIC
+		fi
+		;;
+	esac
+fi
+
+# Make sure the OCF_RESKEY_service is a valid xinetd service name
+if [ ! -f "$SVCDEF" ]; then
+	ocf_exit_reason "service definition $SVCDEF not found!"
+	if [ "$1" = "start" ]; then
+		exit $OCF_ERR_INSTALLED
+	else
+		exit $OCF_NOT_RUNNING
+	fi
+fi
+
+# See how we were called.
+case "$1" in
+	start)
+		xup_start
+		;;
+	stop)
+		xup_stop
+		;;
+	restart)
+		# do not try to start if the stop failed; the failure of
+		# the stop is then reported as the result of the restart
+		$0 stop && $0 start
+		;;
+	status)
+		xup_status
+		;;
+	monitor)
+		xup_status > /dev/null
+		;;
+	validate-all)
+		xup_validate_all
+		;;
+	*)
+		xup_usage
+		exit $OCF_ERR_UNIMPLEMENTED
+esac
+exit $?
diff --git a/heartbeat/ZFS b/heartbeat/ZFS
new file mode 100755
index 0000000..560c1b5
--- /dev/null
+++ b/heartbeat/ZFS
@@ -0,0 +1,212 @@
+#!/bin/sh
+#
+# License: GNU General Public License (GPL)
+# Support: zfs@lists.illumos.org
+# Written by: Saso Kiselkov
+#
+# This script manages ZFS pools
+# It can import a ZFS pool or export it
+#
+# usage: $0 {start|stop|status|monitor|validate-all|meta-data}
+#
+# The "start" arg imports a ZFS pool.
+# The "stop" arg exports it.
+#
+# OCF parameters are as follows
+# OCF_RESKEY_pool - the pool to import/export
+#
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# Defaults
+OCF_RESKEY_pool_default=""
+OCF_RESKEY_importargs_default=""
+OCF_RESKEY_importforce_default=true
+
+: ${OCF_RESKEY_pool=${OCF_RESKEY_pool_default}}
+: ${OCF_RESKEY_importargs=${OCF_RESKEY_importargs_default}}
+: ${OCF_RESKEY_importforce=${OCF_RESKEY_importforce_default}}
+
+USAGE="usage: $0 {start|stop|status|monitor|validate-all|meta-data}";
+
+#######################################################################
+
+# Print the OCF meta-data (XML) of this resource agent on stdout and
+# exit with OCF_SUCCESS.
+meta_data() {
+	cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="ZFS" version="1.0">
+<version>1.0</version>
+<longdesc lang="en">
+This script manages ZFS pools
+It can import a ZFS pool or export it
+</longdesc>
+<shortdesc lang="en">Manages ZFS pools</shortdesc>
+
+<parameters>
+<parameter name="pool" unique="1" required="1">
+<longdesc lang="en">
+The name of the ZFS pool to manage, e.g. "tank".
+</longdesc>
+<shortdesc lang="en">ZFS pool name</shortdesc>
+<content type="string" default="${OCF_RESKEY_pool_default}" />
+</parameter>
+<parameter name="importargs" unique="0" required="0">
+<longdesc lang="en">
+Arguments to zpool import, e.g. "-d /dev/disk/by-id".
+</longdesc>
+<shortdesc lang="en">Import arguments</shortdesc>
+<content type="string" default="${OCF_RESKEY_importargs_default}" />
+</parameter>
+<parameter name="importforce" unique="0" required="0">
+<longdesc lang="en">
+zpool import is given the -f option.
+</longdesc>
+<shortdesc lang="en">Import is forced</shortdesc>
+<content type="boolean" default="${OCF_RESKEY_importforce_default}" />
+</parameter>
+</parameters>
+
+<actions>
+<action name="start" timeout="60s" />
+<action name="stop" timeout="60s" />
+<action name="monitor" depth="0" timeout="30s" interval="5s" />
+<action name="validate-all" timeout="30s" />
+<action name="meta-data" timeout="5s" />
+</actions>
+</resource-agent>
+END
+	exit $OCF_SUCCESS
+}
+
+# Returns 0 if the pool is currently imported, non-zero otherwise.
+zpool_is_imported () {
+	local rc
+
+	# Check if ZFS kstats exists
+	if [ -d /proc/spl/kstat/zfs/ ] ; then
+		# Check the existence of kstats for the pool. If the stats exists, the pool was imported.
+		[ -d /proc/spl/kstat/zfs/"${OCF_RESKEY_pool}" ]
+		rc=$?
+	else
+		# If ZFS kstats do not exists, fallback to the standard check.
+		# The exit status is what matters here: capturing the
+		# (discarded) output would leave rc empty.
+		zpool list -H "$OCF_RESKEY_pool" > /dev/null
+		rc=$?
+	fi
+	return $rc
+}
+
+# Forcibly imports a ZFS pool, mounting all of its auto-mounted filesystems
+# (as configured in the 'mountpoint' and 'canmount' properties)
+# If the pool is already imported, no operation is taken.
+zpool_import () {
+	if ! zpool_is_imported; then
+		ocf_log debug "${OCF_RESKEY_pool}:starting import"
+
+		# The meanings of the options to import are as follows:
+		#   -f : import even if the pool is marked as imported to another
+		#        system - the system may have failed and not exported it
+		#        cleanly.
+		#   -o cachefile=none : the import should be temporary, so do not
+		#        cache it persistently (across machine reboots). We want
+		#        the CRM to explicitly control imports of this pool.
+		if ocf_is_true "${OCF_RESKEY_importforce}"; then
+			FORCE=-f
+		else
+			FORCE=""
+		fi
+		# $FORCE and $OCF_RESKEY_importargs are intentionally unquoted:
+		# they may be empty or hold several arguments
+		if zpool import $FORCE $OCF_RESKEY_importargs -o cachefile=none "$OCF_RESKEY_pool" ; then
+			ocf_log debug "${OCF_RESKEY_pool}:import successful"
+			return $OCF_SUCCESS
+		else
+			ocf_log debug "${OCF_RESKEY_pool}:import failed"
+			return $OCF_ERR_GENERIC
+		fi
+	fi
+}
+
+# Forcibly exports a ZFS pool, unmounting all of its filesystems in the process
+# If the pool is not imported, no operation is taken.
+zpool_export () {
+	if zpool_is_imported; then
+		ocf_log debug "${OCF_RESKEY_pool}:starting export"
+
+		# -f : force the export, even if we have mounted filesystems
+		# Please note that this may fail with a "busy" error if there are
+		# other kernel subsystems accessing the pool (e.g. SCSI targets).
+		# Always make sure the pool export is last in your failover logic.
+		if zpool export -f "$OCF_RESKEY_pool" ; then
+			ocf_log debug "${OCF_RESKEY_pool}:export successful"
+			return $OCF_SUCCESS
+		else
+			ocf_log debug "${OCF_RESKEY_pool}:export failed"
+			return $OCF_ERR_GENERIC
+		fi
+	fi
+}
+
+# Monitors the health of a ZFS pool resource. Please note that this only
+# checks whether the pool is imported and functional, not whether it has
+# any degraded devices (use monitoring systems such as Zabbix for that).
+zpool_monitor () {
+	# If the pool is not imported, then we can't monitor its health
+	if ! zpool_is_imported; then
+		return $OCF_NOT_RUNNING
+	fi
+
+	# Check the pool status
+	# Since version 0.7.10 status can be obtained without locks
+	# https://github.com/zfsonlinux/zfs/pull/7563
+	if [ -f "/proc/spl/kstat/zfs/$OCF_RESKEY_pool/state" ] ; then
+		HEALTH=$(cat "/proc/spl/kstat/zfs/$OCF_RESKEY_pool/state")
+	else
+		HEALTH=$(zpool list -H -o health "$OCF_RESKEY_pool")
+	fi
+
+	case "$HEALTH" in
+		ONLINE|DEGRADED)	return $OCF_SUCCESS;;
+		FAULTED)		return $OCF_NOT_RUNNING;;
+		*)			return $OCF_ERR_GENERIC;;
+	esac
+}
+
+# Validates whether we can import a given ZFS pool
+zpool_validate () {
+	# Check that the 'zpool' command is known
+	if ! which zpool > /dev/null; then
+		return $OCF_ERR_INSTALLED
+	fi
+
+	# Without a pool name there is nothing to manage. This has to be
+	# checked explicitly: with an empty name the kstat directory test
+	# in zpool_is_imported matches the kstat directory itself.
+	if [ -z "$OCF_RESKEY_pool" ]; then
+		ocf_exit_reason "please define \"pool\" parameter"
+		return $OCF_ERR_CONFIGURED
+	fi
+
+	# If the pool is imported, then it is obviously valid
+	if zpool_is_imported; then
+		return $OCF_SUCCESS
+	fi
+
+	# Check that the pool can be imported
+	if zpool import $OCF_RESKEY_importargs | grep 'pool:' | grep "\\<$OCF_RESKEY_pool\\>" > /dev/null;
+	then
+		return $OCF_SUCCESS
+	else
+		return $OCF_ERR_CONFIGURED
+	fi
+}
+
+# Print the usage message on stderr and return the given code.
+#   $1 - return code
+usage () {
+	echo "$USAGE" >&2
+	return $1
+}
+
+if [ $# -ne 1 ]; then
+	usage $OCF_ERR_ARGS
+	# usage only returns, so leave explicitly; otherwise the dispatch
+	# below would report OCF_ERR_UNIMPLEMENTED instead
+	exit $?
+fi
+
+case $1 in
+	meta-data)	meta_data;;
+	start)		zpool_import;;
+	stop)		zpool_export;;
+	status|monitor)	zpool_monitor;;
+	validate-all)	zpool_validate;;
+	usage)		usage $OCF_SUCCESS;;
+	*)		usage $OCF_ERR_UNIMPLEMENTED;;
+esac
+
+exit $?
diff --git a/heartbeat/aliyun-vpc-move-ip b/heartbeat/aliyun-vpc-move-ip
new file mode 100755
index 0000000..1a3a1a0
--- /dev/null
+++ b/heartbeat/aliyun-vpc-move-ip
@@ -0,0 +1,378 @@
+#!/bin/sh
+#
+# OCF resource agent to move an IP address within a VPC in the Aliyun
+# Based on code of Markus Guertler (GitHub AWS-VPC-move-IP)
+# Based on code of Adam Gandelman (GitHub ec2-resource-agents/elasticip)
+#
+
+#######################################################################
+# Initialization:
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# Parameter defaults
+
+OCF_RESKEY_address_default=""
+OCF_RESKEY_routing_table_default=""
+OCF_RESKEY_interface_default="eth0"
+OCF_RESKEY_profile_default="default"
+OCF_RESKEY_endpoint_default="vpc.aliyuncs.com"
+OCF_RESKEY_aliyuncli_default="detect"
+
+
+: ${OCF_RESKEY_address=${OCF_RESKEY_address_default}}
+: ${OCF_RESKEY_routing_table=${OCF_RESKEY_routing_table_default}}
+: ${OCF_RESKEY_interface=${OCF_RESKEY_interface_default}}
+: ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}}
+: ${OCF_RESKEY_endpoint=${OCF_RESKEY_endpoint_default}}
+: ${OCF_RESKEY_aliyuncli=${OCF_RESKEY_aliyuncli_default}}
+
+#######################################################################
+
+# aliyun cli doesnt work without HOME parameter
+export HOME="/root"
+
+USAGE="usage: $0 {start|stop|status|meta-data}";
+
+# "detect" means: use whichever of the two CLI flavours is in PATH
+if [ "${OCF_RESKEY_aliyuncli}" = "detect" ]; then
+	OCF_RESKEY_aliyuncli="$(which aliyuncli 2> /dev/null || which aliyun 2> /dev/null)"
+fi
+
+# The two CLI flavours print route tables in different formats, so the
+# output option, the field separator (IFS_) and the awk program
+# (EXECUTING) which extracts the instance ID depend on the binary.
+if [ "${OCF_RESKEY_aliyuncli##*/}" = 'aliyuncli' ]; then
+	OUTPUT="text"
+	EXECUTING='{ print $3 }'
+	IFS_=" "
+	ENDPOINT=""
+elif [ "${OCF_RESKEY_aliyuncli##*/}" = 'aliyun' ]; then
+	OUTPUT="table cols=InstanceId,DestinationCidrBlock rows=RouteTables.RouteTable[].RouteEntrys.RouteEntry[]"
+	EXECUTING='{ gsub (" ", "", $0); print $1 }'
+	IFS_="|"
+	ENDPOINT="--endpoint $OCF_RESKEY_endpoint"
+fi
+###############################################################################
+
+
+###############################################################################
+#
+# Functions
+#
+###############################################################################
+
+# Create a route for the address pointing to this instance
+# (ECS_INSTANCE_ID). Returns the exit code of the CLI.
+request_create_route_entry() {
+	cmd="${OCF_RESKEY_aliyuncli} vpc CreateRouteEntry --RouteTableId $OCF_RESKEY_routing_table --DestinationCidrBlock ${OCF_RESKEY_address}/32 --NextHopId $ECS_INSTANCE_ID --NextHopType Instance ${ENDPOINT}"
+	ocf_log debug "executing command: $cmd"
+	res=$($cmd 2>&1)
+	rc=$?
+	if [ $rc -eq 0 ]
+	then
+		ocf_log debug "result: $res; rc: $rc"
+	else
+		ocf_log err "result: $res; cmd: $cmd; rc: $rc"
+	fi
+	return $rc
+}
+
+# Delete the route for the address which points to the instance found
+# by the last route table lookup (ROUTE_TO_INSTANCE). Returns the exit
+# code of the CLI.
+request_delete_route_entry() {
+	cmd="${OCF_RESKEY_aliyuncli} vpc DeleteRouteEntry --RouteTableId $OCF_RESKEY_routing_table --DestinationCidrBlock ${OCF_RESKEY_address}/32 --NextHopId $ROUTE_TO_INSTANCE ${ENDPOINT}"
+	ocf_log debug "executing command: $cmd"
+	res=$($cmd)
+	rc=$?
+	if [ $rc -eq 0 ]
+	then
+		ocf_log debug "result: $res; rc: $rc"
+	else
+		ocf_log err "result: $res; cmd: $cmd; rc: $rc"
+	fi
+	return $rc
+}
+
+# Look up the routing table and store the ID of the instance the
+# address is routed to in ROUTE_TO_INSTANCE (empty if there is no route
+# for the address). If the CLI call fails, the error is logged and
+# ROUTE_TO_INSTANCE keeps its previous value.
+request_describe_route_tables() {
+	cmd="${OCF_RESKEY_aliyuncli} vpc DescribeRouteTables --RouteTableId $OCF_RESKEY_routing_table --output ${OUTPUT} ${ENDPOINT}"
+	ocf_log debug "executing command: $cmd"
+	res=$($cmd)
+	rc=$?
+	if [ $rc -eq 0 ]
+	then
+		ROUTE_TO_INSTANCE=$(echo "$res" |grep "\s${OCF_RESKEY_address}/" | awk -F "${IFS_}" "${EXECUTING}")
+		ocf_log debug "ROUTE_TO_INSTANCE: $ROUTE_TO_INSTANCE"
+	else
+		ocf_log err "result: $res; cmd: $cmd; rc: $rc"
+	fi
+}
+
+# Point the route for the address to this instance (replacing a route
+# to another instance, if any) and add the address to the local
+# interface.
+ip_get_and_configure() {
+	ocf_log debug "function: ip_get_and_configure"
+
+	request_describe_route_tables
+	if [ "$ECS_INSTANCE_ID" != "$ROUTE_TO_INSTANCE" ]; then
+		if [ -n "$ROUTE_TO_INSTANCE" ]; then
+			ip_drop
+		fi
+		# retry until the route is created; the action timeout
+		# is what eventually ends this loop
+		request_create_route_entry
+		rc=$?
+		while [ $rc -ne 0 ]; do
+			sleep 1
+			request_create_route_entry
+			rc=$?
+		done
+		wait_for_started
+	fi
+
+
+	# Reconfigure the local ip address
+	ip addr add "${OCF_RESKEY_address}/32" dev $OCF_RESKEY_interface
+	rc=$?
+	if [ $rc -ne 0 ]; then
+		ocf_log err "command failed, rc: $rc"
+		return $OCF_ERR_GENERIC
+	fi
+
+	ocf_log debug "IP added"
+
+	return $OCF_SUCCESS
+}
+
+# Remove the address from the local interface and delete its route.
+ip_drop() {
+	ocf_log debug "function: ip_drop"
+	cmd="ip addr delete ${OCF_RESKEY_address}/32 dev $OCF_RESKEY_interface"
+	ocf_log debug "executing command: $cmd"
+	res=$($cmd)
+	rc=$?
+	# exit code 2 is tolerated here, it is not treated as a failure
+	if [ $rc -ne 0 ] && [ $rc -ne 2 ]; then
+		ocf_log err "command failed, rc: $rc; cmd: $cmd; result: $res"
+		return $OCF_ERR_GENERIC
+	fi
+	request_delete_route_entry
+	rc=$?
+	if [ $rc -ne 0 ]; then
+		ocf_log err "command failed, rc: $rc"
+		return $OCF_ERR_GENERIC
+	fi
+	wait_for_deleted
+
+	ocf_log debug "IP dropped"
+
+	return $OCF_SUCCESS
+}
+
+# Poll the routing table until the address is routed to this instance.
+wait_for_started() {
+	request_describe_route_tables
+	while [ "$ECS_INSTANCE_ID" != "$ROUTE_TO_INSTANCE" ]; do
+		sleep 3
+		request_describe_route_tables
+	done
+}
+
+# Poll the routing table until there is no route for the address.
+wait_for_deleted() {
+	request_describe_route_tables
+	while [ ! -z "$ROUTE_TO_INSTANCE" ]; do
+		sleep 1
+		request_describe_route_tables
+	done
+}
+
+# Print the OCF meta-data (XML) of this resource agent on stdout.
+ecs_ip_metadata() {
+	cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="aliyun-vpc-move-ip" version="2.0">
+<version>1.0</version>
+<longdesc lang="en">
+Resource Agent to move IP addresses within a VPC of the Aliyun Webservices ECS
+by changing an entry in an specific routing table
+</longdesc>
+<shortdesc lang="en">Move IP within a VPC of the Aliyun ECS</shortdesc>
+
+<parameters>
+<parameter name="aliyuncli" required="0">
+<longdesc lang="en">
+Path to command line tools for Aliyun
+</longdesc>
+<shortdesc lang="en">Path to Aliyun CLI tools</shortdesc>
+<content type="string" default="${OCF_RESKEY_aliyuncli_default}" />
+</parameter>
+
+<parameter name="address" required="1">
+<longdesc lang="en">
+VPC private IP address
+</longdesc>
+<shortdesc lang="en">vpc ip</shortdesc>
+<content type="string" default="${OCF_RESKEY_address_default}" />
+</parameter>
+
+<parameter name="routing_table" required="1">
+<longdesc lang="en">
+Name of the routing table, where the route for the IP address should be changed, i.e. vtb-...
+</longdesc>
+<shortdesc lang="en">routing table name</shortdesc>
+<content type="string" default="${OCF_RESKEY_routing_table_default}" />
+</parameter>
+
+<parameter name="interface" required="1">
+<longdesc lang="en">
+Name of the network interface, i.e. eth0
+</longdesc>
+<shortdesc lang="en">network interface name</shortdesc>
+<content type="string" default="${OCF_RESKEY_interface_default}" />
+</parameter>
+
+<parameter name="endpoint" required="0">
+<longdesc lang="en">
+An endpoint is the service entry of an Alibaba Cloud service, i.e. vpc.cn-beijing.aliyuncs.com
+</longdesc>
+<shortdesc lang="en">service endpoint</shortdesc>
+<content type="string" default="${OCF_RESKEY_endpoint_default}" />
+</parameter>
+
+<parameter name="profile" required="0">
+<longdesc lang="en">
+Valid Aliyun CLI profile name (see 'aliyun cli configure').
+See https://www.alibabacloud.com/help/zh/product/29991.htm for more information about aliyun cli.
+</longdesc>
+<shortdesc lang="en">profile name</shortdesc>
+<content type="string" default="${OCF_RESKEY_profile_default}" />
+</parameter>
+</parameters>
+
+<actions>
+<action name="start" timeout="180s" />
+<action name="stop" timeout="180s" />
+<action name="monitor" depth="0" timeout="30s" interval="30s" />
+<action name="validate-all" timeout="5s" />
+<action name="meta-data" timeout="5s" />
+</actions>
+</resource-agent>
+END
+}
+
+# Check the CLI binary, the required parameters and the instance ID.
+# Exits on a missing binary or parameter; returns OCF_ERR_GENERIC if
+# the instance ID is empty. ECS_INSTANCE_ID must be set before this is
+# called.
+ecs_ip_validate() {
+	ocf_log debug "function: validate"
+
+	if [ -z "${OCF_RESKEY_aliyuncli}" ]; then
+		ocf_exit_reason "unable to detect aliyuncli binary"
+		exit $OCF_ERR_INSTALLED
+	fi
+
+	# IP address
+	if [ -z "$OCF_RESKEY_address" ]; then
+		ocf_log err "IP address parameter (address) not set!"
+		exit $OCF_ERR_CONFIGURED
+	fi
+
+	# Network Interface
+	if [ -z "$OCF_RESKEY_interface" ]; then
+		ocf_log err "Network interface parameter (interface) not set!"
+		exit $OCF_ERR_CONFIGURED
+	fi
+
+	# Routing Table
+	if [ -z "$OCF_RESKEY_routing_table" ]; then
+		ocf_log err "Routing table parameter (routing_table) not set!"
+		exit $OCF_ERR_CONFIGURED
+	fi
+
+	if [ -z "${ECS_INSTANCE_ID}" ]; then
+		ocf_exit_reason "Instance ID not found. Is this a ECS instance?"
+		return $OCF_ERR_GENERIC
+	fi
+
+	return $OCF_SUCCESS
+}
+
+# Start action: route the address to this instance and configure it
+# locally, unless the monitor reports that this is already the case.
+ecs_ip_start() {
+	ocf_log info "ECS: Moving IP address $OCF_RESKEY_address to this host by adjusting routing table $OCF_RESKEY_routing_table"
+
+	ecs_ip_monitor
+	if [ $? = $OCF_SUCCESS ]; then
+		ocf_log info "ECS: $OCF_RESKEY_address already started"
+		return $OCF_SUCCESS
+	fi
+
+	ocf_log info "ECS: Adjusting routing table and locally configuring IP address"
+	ip_get_and_configure
+	rc=$?
+	if [ $rc -ne 0 ]; then
+		ocf_log err "Received $rc from 'aliyun cli'"
+		return $OCF_ERR_GENERIC
+	fi
+
+	ecs_ip_monitor
+	rc=$?
+	if [ $rc -ne $OCF_SUCCESS ]; then
+		ocf_log err "IP address couldn't be configured on this host (IP: $OCF_RESKEY_address, Interface: $OCF_RESKEY_interface)"
+		return $rc
+	fi
+
+	return $OCF_SUCCESS
+}
+
+# Stop action: remove the local address and the route, unless the
+# monitor reports that the address is already down.
+ecs_ip_stop() {
+	ocf_log info "ECS: Bringing down IP address $OCF_RESKEY_address"
+
+	ecs_ip_monitor
+	if [ $? = $OCF_NOT_RUNNING ]; then
+		ocf_log info "ECS: Address $OCF_RESKEY_address already down"
+		return $OCF_SUCCESS
+	fi
+
+	ip_drop
+	if [ $? -ne $OCF_SUCCESS ]; then
+		ocf_log err "ECS: Couldn't drop IP address $OCF_RESKEY_address on interface $OCF_RESKEY_interface."
+		return $OCF_ERR_GENERIC
+	fi
+
+	ecs_ip_monitor
+	if [ $? = $OCF_NOT_RUNNING ]; then
+		ocf_log info "ECS: Successfully brought down $OCF_RESKEY_address"
+		return $OCF_SUCCESS
+	fi
+
+	ocf_log err "ECS: Couldn't bring down IP address $OCF_RESKEY_address on interface $OCF_RESKEY_interface."
+	return $OCF_ERR_GENERIC
+}
+
+# Monitor action: the resource is running if the address is routed to
+# this instance and answers a local ping.
+ecs_ip_monitor() {
+	ocf_log debug "function: ecsip_monitor: check routing table"
+	request_describe_route_tables
+
+	if [ "$ECS_INSTANCE_ID" != "$ROUTE_TO_INSTANCE" ]; then
+		ocf_log debug "not routed to this instance ($ECS_INSTANCE_ID) but to instance $ROUTE_TO_INSTANCE"
+		return $OCF_NOT_RUNNING
+	fi
+
+	cmd="ping -W 1 -c 1 $OCF_RESKEY_address"
+	ocf_log debug "executing command: $cmd"
+	$cmd > /dev/null
+	if [ $? -ne 0 ]; then
+		ocf_log debug "IP $OCF_RESKEY_address not locally reachable via ping on this system"
+		return $OCF_NOT_RUNNING
+	fi
+	ocf_log debug "routed in VPC and locally reachable"
+	return $OCF_SUCCESS
+}
+
+
+###############################################################################
+#
+# MAIN
+#
+###############################################################################
+
+# meta-data must work everywhere, so handle it before the instance ID
+# is looked up
+case $__OCF_ACTION in
+	meta-data) ecs_ip_metadata
+		exit $OCF_SUCCESS;;
+esac
+
+ECS_INSTANCE_ID="$(curl -s http://100.100.100.200/latest/meta-data/instance-id)"
+
+case $__OCF_ACTION in
+	start)
+		# do not touch the routing table with an invalid setup
+		ecs_ip_validate || exit $?
+		ecs_ip_start;;
+	stop)
+		ecs_ip_stop;;
+	monitor)
+		ecs_ip_monitor;;
+	validate-all)
+		# validation checks the instance ID, hence it is handled
+		# here, after the lookup above
+		ecs_ip_validate;;
+	*)	exit $OCF_ERR_UNIMPLEMENTED;;
+esac
diff --git a/heartbeat/anything b/heartbeat/anything
new file mode 100755
index 0000000..137a612
--- /dev/null
+++ b/heartbeat/anything
@@ -0,0 +1,344 @@
+#!/bin/sh
+#
+# OCF Resource Agent compliant resource script.
+#
+# Copyright (c) 2009 IN-telegence GmbH & Co. KG, Dominik Klein
+# All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. + +# OCF instance parameters +# OCF_RESKEY_binfile +# OCF_RESKEY_cmdline_options +# OCF_RESKEY_workdir +# OCF_RESKEY_pidfile +# OCF_RESKEY_logfile +# OCF_RESKEY_errlogfile +# OCF_RESKEY_user +# OCF_RESKEY_monitor_hook +# OCF_RESKEY_stop_timeout +# +# This RA starts $binfile with $cmdline_options as $user in $workdir and writes a $pidfile from that. +# If you want it to, it logs: +# - stdout to $logfile, stderr to $errlogfile or +# - stdout and stderr to $logfile +# - or both will be captured by lrmd if these options are omitted. +# Monitoring is done through $pidfile or your custom $monitor_hook script. +# The RA expects the program to keep running "daemon-like" and +# not just quit and exit. So this is NOT (yet - feel free to +# enhance) a way to just run a single one-shot command which just +# does something and then exits. + +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_binfile_default="" +OCF_RESKEY_workdir_default="" +OCF_RESKEY_pidfile_default="${HA_VARRUN}/anything_${OCF_RESOURCE_INSTANCE}.pid" +OCF_RESKEY_logfile_default="/dev/null" +OCF_RESKEY_user_default="root" +OCF_RESKEY_stop_timeout_default="" + +: ${OCF_RESKEY_binfile=${OCF_RESKEY_binfile_default}} +: ${OCF_RESKEY_workdir=${OCF_RESKEY_workdir_default}} +: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}} +: ${OCF_RESKEY_logfile=${OCF_RESKEY_logfile_default}} +: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +: ${OCF_RESKEY_stop_timeout=${OCF_RESKEY_stop_timeout_default}} + +getpid() { + grep -o '[0-9]*' $1 +} + +anything_status() { + if test -f "$pidfile" + then + if pid=`getpid $pidfile` && [ "$pid" ] && kill -s 0 $pid + then + return $OCF_SUCCESS + else + # pidfile w/o process means the process died + return $OCF_ERR_GENERIC + fi + else + return $OCF_NOT_RUNNING + fi +} + +anything_start() { + if ! anything_status + then + #Make sure that PID Directory exists and is writable by proper user + piddir=`dirname $pidfile` + if ! su -s /bin/sh - $user -c "test -w $piddir"; then + #PID Directory is not writeable by user + ocf_log warn "Directory $piddir is not writable by $user, attempting to fix." + ocf_log info "Creating directory $piddir" + mkdir -p $piddir + ocf_log info "Changing permissions for $piddir for user $user" + chown $user: $piddir + else + ocf_log debug "Directory $piddir exists, and is writeable by $user. All fine" + fi + if [ -n "$logfile" -a -n "$errlogfile" ] + then + # We have logfile and errlogfile, so redirect STDOUT und STDERR to different files + cmd="su - $user -c \"cd $workdir; nohup $binfile $cmdline_options >> $logfile 2>> $errlogfile & \"'echo \$!' " + else + # We only have logfile so redirect STDOUT and STDERR to the same file + cmd="su - $user -c \"cd $workdir; nohup $binfile $cmdline_options >> $logfile 2>&1 & \"'echo \$!' 
" + fi + ocf_log debug "Starting $process: $cmd" + # Execute the command as created above + eval $cmd | tail -n 1 > $pidfile + if anything_status + then + ocf_log debug "$process: $cmd started successfully, calling monitor" + anything_monitor + myres=$? + return $myres + else + ocf_log err "$process: $cmd could not be started" + return $OCF_ERR_GENERIC + fi + else + # If already running, consider start successful + ocf_log debug "$process: $cmd is already running" + return $OCF_SUCCESS + fi +} + +anything_stop() { + local rc=$OCF_SUCCESS + + if [ -n "$OCF_RESKEY_stop_timeout" ] + then + stop_timeout=$OCF_RESKEY_stop_timeout + elif [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + # Allow 2/3 of the action timeout for the orderly shutdown + # (The origin unit is ms, hence the conversion) + stop_timeout=$((OCF_RESKEY_CRM_meta_timeout/1500)) + else + stop_timeout=10 + fi + if anything_status + then + pid=`getpid $pidfile` + kill $pid + i=0 + while [ $i -lt $stop_timeout ] + do + if ! anything_status + then + rm -f $pidfile + return $OCF_SUCCESS + fi + sleep 1 + i=$((i+1)) + done + ocf_log warn "Stop with SIGTERM failed/timed out, now sending SIGKILL." + kill -s 9 $pid + while : + do + if ! anything_status + then + ocf_log warn "SIGKILL did the job." + rc=$OCF_SUCCESS + break + fi + ocf_log info "The job still hasn't stopped yet. Waiting..." + sleep 1 + done + fi + rm -f $pidfile + return $rc +} + +anything_monitor() { + anything_status + ret=$? + if [ $ret -eq $OCF_SUCCESS ] + then + if [ -n "$OCF_RESKEY_monitor_hook" ]; then + eval "$OCF_RESKEY_monitor_hook" + if [ $? 
-ne $OCF_SUCCESS ]; then + return ${OCF_ERR_GENERIC} + fi + return $OCF_SUCCESS + else + true + fi + else + return $ret + fi +} + +# FIXME: Attributes special meaning to the resource id +process="$OCF_RESOURCE_INSTANCE" +binfile="$OCF_RESKEY_binfile" +cmdline_options="$OCF_RESKEY_cmdline_options" +workdir="$OCF_RESKEY_workdir" +pidfile="$OCF_RESKEY_pidfile" +[ -z "$pidfile" ] && pidfile=${HA_VARRUN}/anything_${process}.pid +logfile="${OCF_RESKEY_logfile:-/dev/null}" +errlogfile="$OCF_RESKEY_errlogfile" +user="$OCF_RESKEY_user" +[ -z "$user" ] && user=root + +# Validate the configured user, binfile, log file directories and workdir. +# Creates missing log file directories as a side effect. Exits with +# $OCF_ERR_INSTALLED on the first failed check, returns $OCF_SUCCESS otherwise. +anything_validate() { + # Check the account first: the binfile test below runs through su and + # would otherwise blame the binary when it is the user that is missing. + if ! getent passwd $user >/dev/null 2>&1 + then + ocf_log err "user $user does not exist." + exit $OCF_ERR_INSTALLED + fi + # binfile is quoted inside the su command so that an empty value is + # tested as an empty path instead of becoming a bare "test -x", which + # is always true. + if ! su - $user -c "test -x \"$binfile\"" + then + ocf_log err "binfile $binfile does not exist or is not executable by $user." + exit $OCF_ERR_INSTALLED + fi + for logfilename in "$logfile" "$errlogfile" + do + if [ -n "$logfilename" ]; then + mkdir -p `dirname $logfilename` || { + ocf_log err "cannot create $(dirname $logfilename)" + exit $OCF_ERR_INSTALLED + } + fi + done + [ "x$workdir" != x -a ! -d "$workdir" ] && { + ocf_log err "working directory $workdir doesn't exist" + exit $OCF_ERR_INSTALLED + } + return $OCF_SUCCESS +} + +anything_meta() { +cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="anything" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> +This is a generic OCF RA to manage almost anything. +</longdesc> +<shortdesc lang="en">Manages an arbitrary service</shortdesc> + +<parameters> +<parameter name="binfile" required="1" unique="0"> +<longdesc lang="en"> +The full name of the binary to be executed. This is expected to keep running with the same pid and not just do something and exit. 
+</longdesc> +<shortdesc lang="en">Full path name of the binary to be executed</shortdesc> +<content type="string" default="${OCF_RESKEY_binfile_default}"/> +</parameter> +<parameter name="cmdline_options" required="0"> +<longdesc lang="en"> +Command line options to pass to the binary +</longdesc> +<shortdesc lang="en">Command line options</shortdesc> +<content type="string" /> +</parameter> +<parameter name="workdir" required="0" unique="0"> +<longdesc lang="en"> +The path from where the binfile will be executed. +</longdesc> +<shortdesc lang="en">Full path name of the work directory</shortdesc> +<content type="string" default="${OCF_RESKEY_workdir_default}"/> +</parameter> +<parameter name="pidfile"> +<longdesc lang="en"> +File to read/write the PID from/to. +</longdesc> +<shortdesc lang="en">File to read/write the PID from/to</shortdesc> +<content type="string" default="${OCF_RESKEY_pidfile_default}"/> +</parameter> +<parameter name="logfile" required="0"> +<longdesc lang="en"> +File to write STDOUT to +</longdesc> +<shortdesc lang="en">File to write STDOUT to</shortdesc> +<content type="string" default="${OCF_RESKEY_logfile_default}" /> +</parameter> +<parameter name="errlogfile" required="0"> +<longdesc lang="en"> +File to write STDERR to +</longdesc> +<shortdesc lang="en">File to write STDERR to</shortdesc> +<content type="string" /> +</parameter> +<parameter name="user" required="0"> +<longdesc lang="en"> +User to run the command as +</longdesc> +<shortdesc lang="en">User to run the command as</shortdesc> +<content type="string" default="${OCF_RESKEY_user_default}"/> +</parameter> +<parameter name="monitor_hook"> +<longdesc lang="en"> +Command to run in monitor operation +</longdesc> +<shortdesc lang="en">Command to run in monitor operation</shortdesc> +<content type="string"/> +</parameter> +<parameter name="stop_timeout"> +<longdesc lang="en"> +In the stop operation: Seconds to wait for kill -TERM to succeed +before sending kill -SIGKILL. 
Defaults to 2/3 of the stop operation timeout. +</longdesc> +<shortdesc lang="en">Seconds to wait after having sent SIGTERM before sending SIGKILL in stop operation</shortdesc> +<content type="string" default="${OCF_RESKEY_stop_timeout_default}"/> +</parameter> +</parameters> +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +END +exit 0 +} + +case "$1" in + meta-data|metadata|meta_data) + anything_meta + ;; + start) + anything_start + ;; + stop) + anything_stop + ;; + monitor) + anything_monitor + ;; + validate-all) + anything_validate + ;; + *) + ocf_log err "$0 was called with unsupported arguments: $*" + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac diff --git a/heartbeat/apache b/heartbeat/apache new file mode 100755 index 0000000..448225e --- /dev/null +++ b/heartbeat/apache @@ -0,0 +1,744 @@ +#!/bin/sh +# +# High-Availability Apache/IBMhttp control script +# +# apache (aka IBMhttpd) +# +# Description: starts/stops apache web servers. +# +# Author: Alan Robertson +# Sun Jiang Dong +# +# Support: users@clusterlabs.org +# +# License: GNU General Public License (GPL) +# +# Copyright: (C) 2002-2005 International Business Machines +# +# +# An example usage in /etc/ha.d/haresources: +# node1 10.0.0.170 apache::/opt/IBMHTTPServer/conf/httpd.conf +# node1 10.0.0.170 IBMhttpd +# +# Our parsing of the Apache config files is very rudimentary. +# It'll work with lots of different configurations - but not every +# possible configuration. 
+# +# Patches are being accepted ;-) +# +# OCF parameters: +# OCF_RESKEY_configfile +# OCF_RESKEY_httpd +# OCF_RESKEY_port +# OCF_RESKEY_statusurl +# OCF_RESKEY_options +# OCF_RESKEY_testregex +# OCF_RESKEY_client +# OCF_RESKEY_testurl +# OCF_RESKEY_testregex10 +# OCF_RESKEY_testconffile +# OCF_RESKEY_testname +# OCF_RESKEY_envfiles + + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +. ${OCF_FUNCTIONS_DIR}/apache-conf.sh +. ${OCF_FUNCTIONS_DIR}/http-mon.sh +HA_VARRUNDIR=${HA_VARRUN} + +# Parameter defaults + +OCF_RESKEY_httpd_default="/usr/sbin/httpd" +OCF_RESKEY_envfiles_default="/etc/apache2/envvars" +OCF_RESKEY_use_ipv6_default="false" + +####################################################################### +# +# Configuration options - usually you don't need to change these +# +####################################################################### +# +IBMHTTPD=/opt/IBMHTTPServer/bin/httpd +HTTPDLIST="/sbin/httpd2 /usr/sbin/httpd2 /usr/sbin/apache2 /sbin/httpd /usr/sbin/httpd /usr/sbin/apache $IBMHTTPD" +MPM=/usr/share/apache2/find_mpm +if [ -x $MPM ]; then + HTTPDLIST="$HTTPDLIST `$MPM 2>/dev/null`" +fi + +LOCALHOST="http://localhost" +HTTPDOPTS="-DSTATUS" +DEFAULT_IBMCONFIG=/opt/IBMHTTPServer/conf/httpd.conf +DEFAULT_SUSECONFIG="/etc/apache2/httpd.conf" +DEFAULT_RHELCONFIG="/etc/httpd/conf/httpd.conf" +DEFAULT_DEBIANCONFIG="/etc/apache2/apache2.conf" +# +# You can also set +# HTTPD +# PORT +# STATUSURL +# CONFIGFILE +# in this section if what we're doing doesn't work for you... +# +# End of Configuration options +####################################################################### + +CMD=`basename $0` + +# The config-file-pathname is the pathname to the configuration +# file for this web server. Various appropriate defaults are +# assumed if no config file is specified. 
If this command is +# invoked as *IBM*, then the default config file name is +# $DEFAULT_IBMCONFIG, otherwise the default config file +# will be $DEFAULT_SUSECONFIG, $DEFAULT_DEBIANCONFIG or $DEFAULT_RHELCONFIG +# depending on which is detected. +usage() { +cat <<-END +usage: $0 action + +action: + start start the web server + + stop stop the web server + + status return the status of web server, run or down + + monitor return TRUE if the web server appears to be working. + For this to be supported you must configure mod_status + and give it a server-status URL. You have to have + installed either curl or wget for this to work. + + meta-data show meta data message + + validate-all validate the instance parameters +END +} + +# +# print the PID stored in $PidFile; fail if that file does not exist +# +get_pid() { + # Quoted on purpose: with an empty $PidFile a bare "[ -f ]" is true + # and cat would then block reading stdin. + if [ -f "$PidFile" ]; then + cat "$PidFile" + else + false + fi +} +# +# return TRUE if a process with given PID is running +# +ProcessRunning() { + local pid=$1 + + # Use /proc if it looks like it's here... + if [ -d /proc -a -d /proc/1 ]; then + [ -d /proc/$pid ] + else + # This assumes we're running as root... + kill -s 0 "$pid" >/dev/null 2>&1 + fi +} +silent_status() { + local pid + local rc=$OCF_ERR_GENERIC + local retries=0 + + # Set a retry when apache's Graceful restart is applied and the pid file can not be acquired. + if [ "$__OCF_ACTION" = "monitor" ] && ! ocf_is_probe; then + retries=5 + fi + + while true; do + pid=`get_pid` + if [ -n "$pid" ]; then + ProcessRunning $pid + rc=$? 
+ break + fi + + : No pid file + if [ $retries -le 0 ]; then + break + fi + + sleep 1 + retries=`expr $retries - 1` + done + + return $rc +} + +# May be useful to add other distros in future +validate_default_config() { + if is_suse_based; then + validate_default_suse_config + elif is_debian_based; then + validate_default_debian_config + else + return 0 + fi +} + +# When using the default /etc/apache2/httpd.conf on SUSE, the file +# /etc/apache2/sysconfig.d/include.conf is required to be present, +# but this is only generated if you run the apache init script +# (with contents derived from /etc/sysconfig/apache2). So, here, +# if we're using the default system config file and it requires +# that include, we run "/etc/init.d/apache2 configtest" to ensure +# the relevant config is generated and valid. We're also taking +# this opportunity to enable mod_status if it's not present. +validate_default_suse_config() { + if [ "$CONFIGFILE" = "$DEFAULT_SUSECONFIG" ] && \ + grep -Eq '^Include[[:space:]]+/etc/apache2/sysconfig.d/include.conf' "$CONFIGFILE" + then + [ -x "/usr/sbin/a2enmod" ] && ocf_run -q /usr/sbin/a2enmod status + # init script style, for crusty old SUSE + if [ -x "/etc/init.d/apache2" ]; then + ocf_run -q /etc/init.d/apache2 configtest || return 1 + # systemd style, for shiny new SUSE + elif [ -x "/usr/sbin/start_apache2" ]; then + ocf_run -q /usr/sbin/start_apache2 -t || return 1 + fi + fi + + # mod_status: some existing users might don't want to use mod_status, check if present in configuration + if [ "$CONFIGFILE" = "$DEFAULT_SUSECONFIG" ] && \ + grep -Eq '^Include[[:space:]]+/etc/apache2/mod_status.conf' "$CONFIGFILE" + then + # load module only if module exists + apache_mod_status="/usr/lib64/apache2-prefork/mod_status.so" + if [ -e $apache_mod_status ]; then + LOAD_STATUS_MODULE="LoadModule status_module $apache_mod_status" + fi + fi + return 0 +} + +# Debian's Default configuration uses a lock directory /var/lock/apache2 +# which is only generated 
using the lsb init script issues configtest. To +# ensure these default directories are present it's useful to run a configtest +# prior to the resource startup which will create the needed directories +# +# To support multiple apache instances the debian scripts and configs +# obey apache2/envvars. (copy /etc/apache2 -> /etc/apache2-instance) +# adjust (SUFFIX) envvars and set OCF_RESKEY_envfiles +validate_default_debian_config() { + if find /etc/apache2* -name apache2.conf | grep -q "$CONFIGFILE" + then + export APACHE_CONFDIR=$(dirname $CONFIGFILE) + [ -x "/usr/sbin/a2enmod" ] && ocf_run -q /usr/sbin/a2enmod status + ocf_run -q /usr/sbin/apache2ctl configtest || return 1 + fi + return 0 +} + +apache_start() { + if + silent_status + then + ocf_log info "$CMD already running (pid `get_pid`)" + return $OCF_SUCCESS + fi + + validate_default_config || return $OCF_ERR_CONFIGURED + + if [ -z $PIDFILE_DIRECTIVE ]; then + if [ -z "$LOAD_STATUS_MODULE" ]; then + ocf_run $HTTPD $HTTPDOPTS $OPTIONS -f $CONFIGFILE + else + ocf_run $HTTPD -C "$LOAD_STATUS_MODULE" $HTTPDOPTS $OPTIONS -f $CONFIGFILE + fi + else + if [ -z "$LOAD_STATUS_MODULE" ]; then + ocf_run $HTTPD $HTTPDOPTS $OPTIONS -f $CONFIGFILE -c "PidFile $PidFile" + else + ocf_run $HTTPD $HTTPDOPTS -C "$LOAD_STATUS_MODULE" $OPTIONS -f $CONFIGFILE -c "PidFile $PidFile" + fi + fi + + tries=0 + while : # wait until the user set timeout + do + apache_monitor + ec=$? 
+ if [ $ec -eq $OCF_NOT_RUNNING ] + then + tries=`expr $tries + 1` + ocf_log info "waiting for apache $CONFIGFILE to come up" + sleep 1 + else + break + fi + done + + if [ $ec -ne 0 ] && silent_status; then + apache_stop + fi + return $ec +} + +# Signal every process whose command line matches "$HTTPD.*$CONFIGFILE", +# escalating SIGTERM -> SIGHUP -> SIGKILL and stopping as soon as no +# matching process is left. +signal_children() +{ + for sig in SIGTERM SIGHUP SIGKILL ; do + # The pattern is quoted so the shell hands it to pgrep/pkill as one + # regex instead of subjecting it to word splitting and pathname expansion. + if pgrep -f "$HTTPD.*$CONFIGFILE" >/dev/null ; then + pkill -$sig -f "$HTTPD.*$CONFIGFILE" >/dev/null + ocf_log info "signal $sig sent to apache children" + sleep 1 + else + break + fi + done +} + +graceful_stop() +{ + local tries=10 + local pid=$1 + + # Try graceful stop for half timeout period if timeout period is present + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + tries=$((($OCF_RESKEY_CRM_meta_timeout/1000) / 2)) + fi + + ocf_log info "Attempting graceful stop of apache PID $pid" + kill -WINCH $pid >/dev/null + while + ProcessRunning $pid && + [ $tries -gt 0 ] + do + sleep 1 + tries=`expr $tries - 1` + done + + if [ $tries -eq 0 ]; then + # graceful stop didn't work, process still up. + return 1 + fi + + return 0 +} + +kill_stop() +{ + local tries=0 + local pid=$1 + + ocf_log info "Killing apache PID $pid" + if ProcessRunning $pid; then + kill $pid >/dev/null + while + [ $tries -lt 10 ] + do + if ProcessRunning $pid; then + tries=`expr $tries + 1` + sleep 1 + else + break + fi + done + fi +} + +apache_stop() { + local ret=$OCF_SUCCESS + local pid + + if ! silent_status; then + ocf_log info "$CMD is not running." + signal_children + return $ret + fi + + pid=`get_pid` + graceful_stop $pid + if [ $? -ne 0 ]; then + kill_stop $pid + fi + + signal_children + + if ProcessRunning $pid; then + ocf_exit_reason "$CMD still running ($pid). Killing pid failed." + ret=$OCF_ERR_GENERIC + fi + + if [ $ret -eq 0 ]; then + ocf_log info "$CMD stopped." 
+ fi + + return $ret +} + +apache_monitor_10() { + if [ -f "$TESTCONFFILE" ] && [ -r "$TESTCONFFILE" ]; then + readtestconf < $TESTCONFFILE + else + test_url="$TESTURL" + test_regex="$TESTREGEX10" + fi + + whattorun=`gethttpclient` + fixtesturl + is_testconf_sane || + return $OCF_ERR_CONFIGURED + + if $whattorun "$test_url" | grep -Ei "$test_regex" > /dev/null + then + return $OCF_SUCCESS + else + if ! ocf_is_probe; then + ocf_exit_reason "Failed to access httpd status page." + fi + return $OCF_ERR_GENERIC + fi +} + +# If the user has not provided any basic monitoring +# information, allow the agent to verify the server is +# healthy and capable of processing requests by requesting +# the http header of website's index +attempt_index_monitor_request() { + local indexpage="" + + if [ -n "$OCF_RESKEY_testregex" ]; then + return 1; + fi + if [ -n "$OCF_RESKEY_testregex10" ]; then + return 1; + fi + if [ -n "$OCF_RESKEY_testurl" ]; then + return 1; + fi + if [ -n "$OCF_RESKEY_statusurl" ]; then + return 1; + fi + if [ -n "$OCF_RESKEY_testconffile" ]; then + return 1; + fi + + indexpage=$(buildlocalurl) + + request_url_header $indexpage + if [ $? -ne 0 ]; then + return $OCF_ERR_GENERIC + fi + ocf_log debug "Successfully retrieved http header at $indexpage" + return 0 +} + +apache_monitor_basic() { + if ${ourhttpclient}_func "$STATUSURL" | grep -Ei "$TESTREGEX" > /dev/null + then + return $OCF_SUCCESS + fi + + attempt_index_monitor_request + if [ $? -eq 0 ]; then + return $OCF_SUCCESS + fi + + if ! ocf_is_probe; then + ocf_exit_reason "Failed to access httpd status page." + fi + return $OCF_ERR_GENERIC +} +apache_monitor() { + silent_status + if [ $? 
-ne 0 ]; then + ocf_log info "$CMD not running" + return $OCF_NOT_RUNNING + fi + + ourhttpclient=`findhttpclient` # we'll need one + if [ -z "$ourhttpclient" ]; then + ocf_exit_reason "could not find a http client; make sure that either wget or curl is available" + return $OCF_ERR_INSTALLED + fi + + case `ocf_check_level 10` in + 0) apache_monitor_basic;; + 10) apache_monitor_10;; + esac +} + +detect_default_config() +{ + if [ -f $DEFAULT_SUSECONFIG ]; then + echo $DEFAULT_SUSECONFIG + elif [ -f $DEFAULT_DEBIANCONFIG ]; then + echo $DEFAULT_DEBIANCONFIG + else + echo $DEFAULT_RHELCONFIG + fi +} + + +apache_meta_data(){ + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="apache" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This is the resource agent for the Apache Web server. +This resource agent operates both version 1.x and version 2.x Apache +servers. + +The start operation ends with a loop in which monitor is +repeatedly called to make sure that the server started and that +it is operational. Hence, if the monitor operation does not +succeed within the start operation timeout, the apache resource +will end with an error status. + +The monitor operation by default loads the server status page +which depends on the mod_status module and the corresponding +configuration file (usually /etc/apache2/mod_status.conf). +Make sure that the server status page works and that the access +is allowed *only* from localhost (address 127.0.0.1). +See the statusurl and testregex attributes for more details. + +See also http://httpd.apache.org/ +</longdesc> +<shortdesc lang="en">Manages an Apache Web server instance</shortdesc> + +<parameters> +<parameter name="configfile" required="0" unique="1"> +<longdesc lang="en"> +The full pathname of the Apache configuration file. +This file is parsed to provide defaults for various other +resource agent parameters. 
+</longdesc> +<shortdesc lang="en">configuration file path</shortdesc> +<content type="string" default="$(detect_default_config)" /> +</parameter> + +<parameter name="httpd"> +<longdesc lang="en"> +The full pathname of the httpd binary (optional). +</longdesc> +<shortdesc lang="en">httpd binary path</shortdesc> +<content type="string" default="${OCF_RESKEY_httpd_default}" /> +</parameter> + +<parameter name="port" > +<longdesc lang="en"> +A port number that we can probe for status information +using the statusurl. +This will default to the port number found in the +configuration file, or 80, if none can be found +in the configuration file. + +</longdesc> +<shortdesc lang="en">httpd port</shortdesc> +<content type="integer" /> +</parameter> + +<parameter name="statusurl"> +<longdesc lang="en"> +The URL to monitor (the apache server status page by default). +If left unspecified, it will be inferred from +the apache configuration file. + +If you set this, make sure that it succeeds *only* from the +localhost (127.0.0.1). Otherwise, it may happen that the cluster +complains about the resource being active on multiple nodes. +</longdesc> +<shortdesc lang="en">url name</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="testregex"> +<longdesc lang="en"> +Regular expression to match in the output of statusurl. +Case insensitive. +</longdesc> +<shortdesc lang="en">monitor regular expression</shortdesc> +<content type="string" default="exists, but impossible to show in a human readable format (try grep testregex)"/> +</parameter> + +<parameter name="client"> +<longdesc lang="en"> +Client to use to query to Apache. If not specified, the RA will +try to find one on the system. Currently, wget and curl are +supported. For example, you can set this parameter to "curl" if +you prefer that to wget. 
+</longdesc> +<shortdesc lang="en">http client</shortdesc> +<content type="string" default=""/> +</parameter> + +<parameter name="testurl"> +<longdesc lang="en"> +URL to test. If it does not start with "http", then it's +considered to be relative to the Listen address. +</longdesc> +<shortdesc lang="en">test url</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="testregex10"> +<longdesc lang="en"> +Regular expression to match in the output of testurl. +Case insensitive. +</longdesc> +<shortdesc lang="en">extended monitor regular expression</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="testconffile"> +<longdesc lang="en"> +A file which contains test configuration. Could be useful if +you have to check more than one web application or in case sensitive +info should be passed as arguments (passwords). Furthermore, +using a config file is the only way to specify certain +parameters. + +Please see README.webapps for examples and file description. +</longdesc> +<shortdesc lang="en">test configuration file</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="testname"> +<longdesc lang="en"> +Name of the test within the test configuration file. +</longdesc> +<shortdesc lang="en">test name</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="options"> +<longdesc lang="en"> +Extra options to apply when starting apache. See man httpd(8). +</longdesc> +<shortdesc lang="en">command line options</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="envfiles"> +<longdesc lang="en"> +Files (one or more) which contain extra environment variables. +If you want to prevent script from reading the default file, set +this parameter to empty string. 
+</longdesc> +<shortdesc lang="en">environment settings files</shortdesc> +<content type="string" default="${OCF_RESKEY_envfiles_default}"/> +</parameter> + +<parameter name="use_ipv6"> +<longdesc lang="en"> +We will try to detect if the URL (for monitor) is IPv6, but if +that doesn't work set this to true to enforce IPv6. +</longdesc> +<shortdesc lang="en">use ipv6 with http clients</shortdesc> +<content type="boolean" default="${OCF_RESKEY_use_ipv6_default}"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="40s" /> +<action name="stop" timeout="60s" /> +<action name="status" timeout="30s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +END + return $OCF_SUCCESS +} + +# Check the httpd binary, the configuration file and the extended-monitor +# settings (testconffile/testurl), then make sure the PidFile directory exists. +# Returns an $OCF_ERR_* code on the first failed check. +apache_validate_all() { + if [ -z "$HTTPD" ]; then + ocf_exit_reason "apache httpd program not found" + return $OCF_ERR_INSTALLED + fi + if [ ! -x "$HTTPD" ]; then + ocf_exit_reason "HTTPD $HTTPD not found or is not an executable!" + return $OCF_ERR_INSTALLED + fi + # Quoted so that an empty $CONFIGFILE is reported as missing instead of + # collapsing the test to "[ ! -f ]", which is false and skips the check. + if [ ! -f "$CONFIGFILE" ]; then + ocf_exit_reason "Configuration file $CONFIGFILE not found!" + return $OCF_ERR_INSTALLED + fi + + # validate testconffile/testurl before apache_monitor_10() + if [ -n "$TESTCONFFILE" ]; then + if [ ! -f "$TESTCONFFILE" ] || [ ! -r "$TESTCONFFILE" ]; then + ocf_exit_reason "Configuration file $TESTCONFFILE not found, or not readable." + return $OCF_ERR_INSTALLED + fi + else + if [ -n "$TESTURL" ]; then + # remove leading or trailing spaces/tabs; the URL is passed as a + # printf argument, not as the format, so "%" escapes in it stay literal + local temp=$(printf '%s' "$TESTURL" | sed -e 's/^[ \t]*//g' -e 's/[ \t]*$//g') + + if [ -z "$temp" ]; then + ocf_exit_reason "testurl: \"$TESTURL\" seems to be an empty string?" + return $OCF_ERR_CONFIGURED + fi + fi + + # FIXME: validate TESTREGEX10 will be needed if empty regex is not allowed. 
+ fi + + ocf_mkstatedir root 755 `dirname $PidFile` || return $OCF_ERR_INSTALLED + return $OCF_SUCCESS +} + +find_httpd_prog() { + case $0 in + *IBM*) + HTTPD=$IBMHTTPD + DefaultConfig=$DEFAULT_IBMCONFIG;; + *) + HTTPD= + for h in $HTTPDLIST + do + if [ -f $h -a -x $h ]; then + HTTPD=$h + break + fi + done + + # Let the user know that the $HTTPD used is not the one (s)he specified via $OCF_RESKEY_httpd + if [ "X$OCF_RESKEY_httpd" != X -a "X$HTTPD" != X ]; then + ocf_log info "Using $HTTPD as HTTPD" + fi + DefaultConfig=$(detect_default_config) + ;; + esac +} + +apache_getconfig() { + # these variables are global + HTTPD="$OCF_RESKEY_httpd" + PORT="$OCF_RESKEY_port" + STATUSURL="$OCF_RESKEY_statusurl" + CONFIGFILE="$OCF_RESKEY_configfile" + OPTIONS="$OCF_RESKEY_options" + CLIENT=${OCF_RESKEY_client} + TESTREGEX=${OCF_RESKEY_testregex:-'</ *html *>'} + TESTURL="$OCF_RESKEY_testurl" + TESTREGEX10=${OCF_RESKEY_testregex10} + TESTCONFFILE="$OCF_RESKEY_testconffile" + TESTNAME="$OCF_RESKEY_testname" + : ${OCF_RESKEY_envfiles=${OCF_RESKEY_envfiles_default}} + source_envfiles $OCF_RESKEY_envfiles + + if [ "X$HTTPD" = X -o ! -f "$HTTPD" -o ! -x "$HTTPD" ]; then + find_httpd_prog + fi + + CONFIGFILE=${CONFIGFILE:-$DefaultConfig} + if [ -n "$HTTPD" ]; then + httpd_basename=`basename $HTTPD` + case $httpd_basename in + *-*) httpd_basename=`echo "$httpd_basename" | sed -e 's%\-.*%%'`;; + esac + fi + GetParams $CONFIGFILE +} + +OCF_REQUIRED_PARAMS="" +OCF_REQUIRED_BINARIES="" +ocf_rarun $* diff --git a/heartbeat/apache-conf.sh b/heartbeat/apache-conf.sh new file mode 100644 index 0000000..6aaaf76 --- /dev/null +++ b/heartbeat/apache-conf.sh @@ -0,0 +1,196 @@ +# +# Common apache code +# (sourced by apache) +# +# Author: Alan Robertson +# Sun Jiang Dong +# +# Support: users@clusterlabs.org +# +# License: GNU General Public License (GPL) +# +# Copyright: (C) 2002-2005 International Business Machines +# + +source_envfiles() { + for f; do + [ -f "$f" -a -r "$f" ] && + . 
"$f" + done +} + +apachecat() { + awk ' + function procline() { + split($0,a); + if( a[1]~/^[Ii]nclude$/ ) { + includedir=a[2]; + gsub("\"","",includedir); + procinclude(includedir); + } else { + if( a[1]=="ServerRoot" ) { + rootdir=a[2]; + gsub("\"","",rootdir); + } + print; + } + } + function printfile(infile, a) { + while( (getline<infile) > 0 ) { + procline(); + } + close(infile); + } + function allfiles(dir, cmd,f) { + cmd="find -L "dir" -type f"; + while( ( cmd | getline f ) > 0 ) { + printfile(f); + } + close(cmd); + } + function listfiles(pattern, cmd,f) { + cmd="ls "pattern" 2>/dev/null"; + while( ( cmd | getline f ) > 0 ) { + printfile(f); + } + close(cmd); + } + function procinclude(spec) { + if( rootdir!="" && spec!~/^\// ) { + spec=rootdir"/"spec; + } + if( isdir(spec) ) { + allfiles(spec); # read all files in a directory (and subdirs) + } else { + listfiles(spec); # there could be jokers + } + } + function isdir(s) { + return !system("test -d \""s"\""); + } + { procline(); } + ' $1 | + sed 's/#.*//;s/[[:blank:]]*$//;s/^[[:blank:]]*//' | + grep -v '^$' +} + +# +# set parameters (as shell vars) from our apache config file +# +get_apache_params() { + configfile=$1 + shift 1 + vars=$(echo "$@" | sed 's/ /,/g') + + eval ` + apachecat $configfile | awk -v vars="$vars" ' + BEGIN{ + split(vars,v,","); + for( i in v ) + vl[i]=tolower(v[i]); + } + { + for( i in v ) + if( tolower($1)==vl[i] ) { + print v[i]"="$2 + delete vl[i] + break + } + } + '` +} + +# +# Return the location(s) that are handled by the given handler +# +FindLocationForHandler() { + PerlScript='while (<>) { + /<Location "?([^ >"]+)/i && ($loc=$1); + '"/SetHandler +$2"'/i && print "$loc\n"; + }' + apachecat $1 | perl -e "$PerlScript" +} + +# +# Check if the port is valid +# +CheckPort() { + ocf_is_decimal "$1" && [ $1 -gt 0 ] +} + +buildlocalurl() { + [ "x$Listen" != "x" ] && + echo "http://${Listen}" || + echo "${LOCALHOST}:${PORT}" +} +# the test url may need a local prefix (as specified in 
the
# apache Listen directive)
#
# fixtesturl: leave $test_url alone when it already starts with "http",
# otherwise prepend the local URL produced by buildlocalurl.
# Reads and writes the global $test_url.
fixtesturl() {
	echo $test_url | grep -qs "^http" && return
	test_url="`buildlocalurl`$test_url"
}
#
# Get all the parameters we need from the Apache config file
#
# Arguments:
#   $1  path of the Apache configuration file
# Sets the globals ConfigFile, PidFile, PORT, Listen and, when they are
# not already set, STATUSURL / StatusURL; may set PIDFILE_DIRECTIVE.
# Returns $OCF_ERR_INSTALLED when the file is missing or no PidFile could
# be determined, $OCF_SUCCESS otherwise.
#
GetParams() {
	ConfigFile=$1
	# The path is quoted on purpose: with an empty or unset value the
	# unquoted form collapses to "[ ! -f ]", which evaluates to false,
	# and the function would carry on without any configuration file.
	if [ ! -f "$ConfigFile" ]; then
		return $OCF_ERR_INSTALLED
	fi
	get_apache_params $ConfigFile ServerRoot PidFile Port Listen
	case $PidFile in
		/*)	;;
		[[:alnum:]]*)	PidFile=$ServerRoot/$PidFile;;
		*)
			# If the PidFile is not set in the config, set
			# a default location.
			PidFile=$HA_VARRUNDIR/${httpd_basename}-${OCF_RESOURCE_INSTANCE}.pid
			# Force the daemon to use this location by using
			# the -c option, which adds the PidFile directive
			# as if it was in the configuration file to begin with.
			PIDFILE_DIRECTIVE="true"
			;;
	esac

	# First valid candidate wins: an already set $PORT, then the Port
	# directive from the config file, then 80.
	for p in "$PORT" "$Port" 80; do
		if CheckPort "$p"; then
			PORT="$p"
			break
		fi
	done

	echo $Listen | grep ':' >/dev/null ||	# Listen could be just port spec
		Listen="localhost:$Listen"

	#
	# It's difficult to figure out whether the server supports
	# the status operation.
	# (we start our server with -DSTATUS - just in case :-))
	#
	# Typically (but not necessarily) the status URL is /server-status
	#
	# For us to think status will work, we have to have the following things:
	#
	# - The server-status handler has to be mapped to some URL somewhere
	#
	# We assume that:
	#
	# - the "main" web server at $PORT will also support it if we can find it
	#	somewhere in the file
	# - it will be supported at the same URL as the one we find in the file
	#
	# If this doesn't work for you, then set the statusurl attribute.
	#
	if
		[ "X$STATUSURL" = "X" ]
	then
		StatusURL=`FindLocationForHandler $1 server-status | tail -1`
		STATUSURL="`buildlocalurl`$StatusURL"
	fi

	if ! test "$PidFile"; then
		return $OCF_ERR_INSTALLED
	else
		return $OCF_SUCCESS
	fi
}
diff --git a/heartbeat/asterisk b/heartbeat/asterisk
new file mode 100755
index 0000000..5c90935
--- /dev/null
+++ b/heartbeat/asterisk
@@ -0,0 +1,497 @@
#!/bin/sh
#
#
# Asterisk
#
# Description:	Manages an Asterisk PBX as an HA resource
#
# Authors:	Martin Gerhard Loschwitz
# 		Florian Haas
#
# Support:	users@clusterlabs.org
# License:	GNU General Public License (GPL)
#
# (c) 2011 hastexo Professional Services GmbH
#
# This resource agent is loosely derived from the MySQL resource
# agent, which itself is made available to the public under the
# following copyright:
#
# (c) 2002-2005 International Business Machines, Inc.
#     2005-2010 Linux-HA contributors
#
# See usage() function below for more details ...
#
# OCF instance parameters:
#   OCF_RESKEY_binary
#   OCF_RESKEY_canary_binary
#   OCF_RESKEY_config
#   OCF_RESKEY_user
#   OCF_RESKEY_group
#   OCF_RESKEY_additional_parameters
#   OCF_RESKEY_realtime
#   OCF_RESKEY_maxfiles
#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

#######################################################################

# Fill in some defaults if no values are specified
# (OpenBSD uses an underscore-prefixed account and group name)
HOSTOS=`uname`
if [ "X${HOSTOS}" = "XOpenBSD" ]; then
	OCF_RESKEY_user_default="_asterisk"
	OCF_RESKEY_group_default="_asterisk"
else
	OCF_RESKEY_user_default="asterisk"
	OCF_RESKEY_group_default="asterisk"
fi
OCF_RESKEY_binary_default="asterisk"
OCF_RESKEY_canary_binary_default="astcanary"
OCF_RESKEY_config_default="/etc/asterisk/asterisk.conf"
OCF_RESKEY_additional_parameters_default="-g -vvv"
OCF_RESKEY_realtime_default="false"
OCF_RESKEY_maxfiles_default="8192"

: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
: ${OCF_RESKEY_canary_binary=${OCF_RESKEY_canary_binary_default}}
: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
: ${OCF_RESKEY_group=${OCF_RESKEY_group_default}}
: ${OCF_RESKEY_additional_parameters=${OCF_RESKEY_additional_parameters_default}}
: ${OCF_RESKEY_realtime=${OCF_RESKEY_realtime_default}}
: ${OCF_RESKEY_maxfiles=${OCF_RESKEY_maxfiles_default}}

#######################################################################

# Print a short help text on stdout.
usage() {
	cat <<UEND
	usage: $0 (start|stop|validate-all|meta-data|status|monitor)

	$0 manages an Asterisk PBX as an HA resource.

	The 'start' operation starts the Asterisk PBX.
	The 'stop' operation stops the Asterisk PBX.
	The 'validate-all' operation reports whether the parameters are valid
	The 'meta-data' operation reports this RA's meta-data information
	The 'status' operation reports whether the Asterisk PBX is running
	The 'monitor' operation reports whether the Asterisk PBX seems to be working

UEND
}

# Print the resource agent meta-data (XML) on stdout.
# The here-document is unquoted, so the ${..._default} references are
# expanded to the defaults computed above.
meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="asterisk" version="1.0">
<version>1.0</version>

<longdesc lang="en">
Resource agent for the Asterisk PBX.
May manage an Asterisk PBX telephony system or a clone set that
forms an Asterisk distributed device setup.
</longdesc>
<shortdesc lang="en">Manages an Asterisk PBX</shortdesc>
<parameters>

<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the Asterisk PBX server binary
</longdesc>
<shortdesc lang="en">Asterisk PBX server binary</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>

<parameter name="canary_binary" unique="0" required="0">
<longdesc lang="en">
Location of the Asterisk PBX Canary server binary
</longdesc>
<shortdesc lang="en">Asterisk PBX Canary server binary</shortdesc>
<content type="string" default="${OCF_RESKEY_canary_binary_default}" />
</parameter>

<parameter name="config" unique="0" required="0">
<longdesc lang="en">
The Asterisk PBX configuration file
</longdesc>
<shortdesc lang="en">Asterisk PBX config</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>

<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User running Asterisk PBX daemon
</longdesc>
<shortdesc lang="en">Asterisk PBX user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>

<parameter name="group" unique="0" required="0">
<longdesc lang="en">
Group running Asterisk PBX daemon (for logfile and directory permissions)
</longdesc>
<shortdesc lang="en">Asterisk PBX group</shortdesc>
<content type="string" default="${OCF_RESKEY_group_default}" />
</parameter>

<parameter name="additional_parameters" unique="0" required="0">
<longdesc lang="en">
Additional parameters which are passed to the Asterisk PBX on
startup (e.g. -L &lt;load&gt; or -M &lt;value&gt;).
</longdesc>
<shortdesc lang="en">Additional parameters to pass to the Asterisk PBX</shortdesc>
<content type="string" default="${OCF_RESKEY_additional_parameters_default}" />
</parameter>

<parameter name="realtime" unique="0" required="0">
<longdesc lang="en">
Determines whether the Asterisk PBX daemon will be run with
realtime priority or not.
</longdesc>
<shortdesc lang="en">Asterisk PBX realtime priority</shortdesc>
<content type="boolean" default="${OCF_RESKEY_realtime_default}" />
</parameter>

<parameter name="maxfiles" unique="0" required="0">
<longdesc lang="en">
Determines how many files the Asterisk PBX is allowed to open at
a time. Helps to fix the 'Too many open files' error message.
</longdesc>
<shortdesc lang="en">Asterisk PBX allowed MAXFILES</shortdesc>
<content type="integer" default="${OCF_RESKEY_maxfiles_default}" />
</parameter>

<parameter name="monitor_sipuri" unique="0" required="0">
<longdesc lang="en">
A SIP URI to check when monitoring. During monitor, the agent will
attempt to do a SIP OPTIONS request against this URI.
Requires the sipsak utility to be present and executable.
If unset, the agent does no SIP URI monitoring.
</longdesc>
<shortdesc lang="en">SIP URI to check when monitoring</shortdesc>
<content type="string" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
<action name="status" timeout="20s" />
<action name="monitor" timeout="30s" interval="20s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}

#######################################################################
# Convenience functions

# Run one Asterisk CLI command ($1) against the control socket in
# $ASTRUNDIR and return the status reported by ocf_run.
asterisk_rx() {
	# if $HOME is set, asterisk -rx writes a .asterisk_history there
	# (the subshell keeps the unset from leaking into the caller)
	(
		unset HOME
		ocf_run $OCF_RESKEY_binary -r -s $ASTRUNDIR/asterisk.ctl -x "$1"
	)
}

#######################################################################
# Functions invoked by resource manager actions

# Check that the required binaries, the configuration file and the
# configured user and group exist.
# Returns $OCF_ERR_INSTALLED on a missing config (outside of probes),
# user or group; otherwise the status of "true".
asterisk_validate() {
	local rc

	check_binary $OCF_RESKEY_binary
	check_binary pgrep

	# sipsak is only needed when SIP URI monitoring is configured
	if [ -n "$OCF_RESKEY_monitor_sipuri" ]; then
		check_binary sipsak
	fi

	# A config file on shared storage that is not available
	# during probes is OK.
	if [ ! -f $OCF_RESKEY_config ]; then
		if ! ocf_is_probe; then
			ocf_log err "Config $OCF_RESKEY_config doesn't exist"
			return $OCF_ERR_INSTALLED
		fi
		ocf_log warn "Config $OCF_RESKEY_config not available during a probe"
	fi

	getent passwd $OCF_RESKEY_user >/dev/null 2>&1
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "User $OCF_RESKEY_user doesn't exist"
		return $OCF_ERR_INSTALLED
	fi

	getent group $OCF_RESKEY_group >/dev/null 2>&1
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "Group $OCF_RESKEY_group doesn't exist"
		return $OCF_ERR_INSTALLED
	fi

	true
}

# Report whether the Asterisk process recorded in the PID file is alive.
# Returns:
#   $OCF_NOT_RUNNING  no PID file, or the recorded PID cannot be signalled
#                     (the stale PID file is removed in that case)
#   $OCF_ERR_GENERIC  realtime mode is enabled but no astcanary is found
#   $OCF_SUCCESS      otherwise
asterisk_status() {
	local pid
	local rc

	if [ ! -f $ASTRUNDIR/asterisk.pid ]; then
		ocf_log info "Asterisk PBX is not running"
		return $OCF_NOT_RUNNING
	fi

	pid=`cat $ASTRUNDIR/asterisk.pid`
	# signal 0 only tests whether the process can be signalled
	ocf_run kill -s 0 $pid
	rc=$?

	if [ $rc -ne 0 ]; then
		ocf_log info "Asterisk PBX not running: removing old PID file"
		rm -f $ASTRUNDIR/asterisk.pid
		return $OCF_NOT_RUNNING
	fi

	if ocf_is_true "$OCF_RESKEY_realtime"; then
		astcanary_pid=`pgrep -d " " -f "astcanary $ASTRUNDIR/alt.asterisk.canary.tweet.tweet.tweet"`
		if [ ! "$astcanary_pid" ]; then
			ocf_log err "Asterisk PBX is running but astcanary is not although it should"
			return $OCF_ERR_GENERIC
		fi
	fi

	# Explicit on every healthy path; the realtime path used to rely on
	# the status of the enclosing "if" happening to be zero.
	return $OCF_SUCCESS
}

# Monitor action: process check, CLI connection check and, when
# monitor_sipuri is set, a sipsak request against that URI.
# While called from the start action, connection failures are reported
# as $OCF_NOT_RUNNING so the caller keeps polling.
asterisk_monitor() {
	local rc

	asterisk_status
	rc=$?

	# If status returned an error, return that immediately
	if [ $rc -ne $OCF_SUCCESS ]; then
		return $rc
	fi

	# Check whether connecting to asterisk is possible
	asterisk_rx 'core show channels count'
	rc=$?

	if [ $rc -ne 0 ]; then
		if [ "$__OCF_ACTION" = "start" ]; then
			ocf_log info "Asterisk PBX not running yet"
			return $OCF_NOT_RUNNING;
		else
			ocf_log err "Failed to connect to the Asterisk PBX"
			return $OCF_ERR_GENERIC;
		fi
	fi

	# Optionally check the monitor URI with sipsak
	# The return values:
	# 0 means that a 200 was received.
	# 1 means something else then 1xx or 2xx was received.
	# 2 will be returned on local errors like non resolvable names
	#   or wrong options combination.
	# 3 will be returned on remote errors like socket errors
	#   (e.g. icmp error), redirects without a contact header or
	#   simply no answer (timeout).
	#   This can also happen if sipsak is run too early after asterisk
	#   start.

	#To avoid the case where the sipsak check runs before the sip starts at the start action
	SIPCHECK="sipsak -s $OCF_RESKEY_monitor_sipuri"
	if [ -n "$OCF_RESKEY_monitor_sipuri" ]; then
		ocf_run $SIPCHECK
		rc=$?
		if [ "$__OCF_ACTION" = "start" ]; then
			# No local limit: this loop only ends when sipsak succeeds
			while [ $rc -ne 0 ]; do
				ocf_log info "Starting ast, waiting for SIP ok"
				sleep 1
				ocf_run $SIPCHECK
				rc=$?
			done
		else
			case "$rc" in
				1|2) return $OCF_ERR_GENERIC;;
				3) return $OCF_NOT_RUNNING;;
			esac
		fi
	fi

	ocf_log debug "Asterisk PBX monitor succeeded"
	return $OCF_SUCCESS
}

# Start action: clean up stale astcanary processes, prepare the run and
# log directories, launch Asterisk and poll asterisk_monitor until it
# reports success. Exits the agent on unrecoverable errors.
asterisk_start() {
	local asterisk_extra_params
	local dir
	local rc

	asterisk_status
	rc=$?
	if [ $rc -eq $OCF_SUCCESS ]; then
		ocf_log info "Asterisk PBX already running"
		return $OCF_SUCCESS
	fi

	# If Asterisk is not already running, make sure there is no
	# old astcanary instance when the new asterisk starts. To
	# achieve this, kill old astcanary instances belonging to
	# this $ASTRUNDIR.

	# Find out PIDs of running astcanaries
	astcanary_pid=`pgrep -d " " -f "astcanary $ASTRUNDIR/alt.asterisk.canary.tweet.tweet.tweet"`

	# If there are astcanaries running that belong to $ASTRUNDIR,
	# kill them.
	if [ "$astcanary_pid" ]; then
		# One kill per PID: passing the whole list on every iteration
		# re-signals processes that are already gone and logs failures.
		for i in $astcanary_pid; do ocf_run kill -s KILL $i; done
	fi

	for dir in $ASTRUNDIR $ASTLOGDIR $ASTLOGDIR/cdr-csv $ASTLOGDIR/cdr-custom; do
		if [ ! -d "$dir" ]; then
			ocf_run install -d -o $OCF_RESKEY_user -g $OCF_RESKEY_group $dir \
				|| exit $OCF_ERR_GENERIC
		fi
		# Regardless of whether we just created the directory or it
		# already existed, check whether it is writable by the configured
		# user
		if ! su -s /bin/sh - $OCF_RESKEY_user -c "test -w $dir"; then
			ocf_log warn "Directory $dir is not writable by $OCF_RESKEY_user, attempting chown"
			ocf_run chown $OCF_RESKEY_user:$OCF_RESKEY_group $dir \
				|| exit $OCF_ERR_PERM
		fi
	done

	# set MAXFILES
	ulimit -n $OCF_RESKEY_maxfiles

	# Determine whether Asterisk PBX is supposed to run in Realtime mode
	# or not and make asterisk daemonize automatically
	if ocf_is_true "$OCF_RESKEY_realtime"; then
		asterisk_extra_params="-F -p"
	else
		asterisk_extra_params="-F"
	fi

	ocf_run ${OCF_RESKEY_binary} -G $OCF_RESKEY_group -U $OCF_RESKEY_user \
		-C $OCF_RESKEY_config \
		$OCF_RESKEY_additional_parameters \
		$asterisk_extra_params
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "Asterisk PBX start command failed: $rc"
		exit $OCF_ERR_GENERIC
	fi

	# Spin waiting for the server to come up.
	# Let the CRM/LRM time us out if required
	while true; do
		asterisk_monitor
		rc=$?
		[ $rc -eq $OCF_SUCCESS ] && break
		if [ $rc -ne $OCF_NOT_RUNNING ]; then
			ocf_log err "Asterisk PBX start failed"
			exit $OCF_ERR_GENERIC
		fi
		sleep 2
	done

	ocf_log info "Asterisk PBX started"
	return $OCF_SUCCESS
}

# Stop action: send SIGTERM, wait up to the operation timeout minus five
# seconds (15 seconds when no timeout is supplied), fall back to SIGKILL,
# and finally remove astcanary processes in realtime mode.
asterisk_stop() {
	local pid
	local astcanary_pid
	local rc

	asterisk_status
	rc=$?
	if [ $rc -eq $OCF_NOT_RUNNING ]; then
		ocf_log info "Asterisk PBX already stopped"
		return $OCF_SUCCESS
	fi

	pid=`cat $ASTRUNDIR/asterisk.pid`
	ocf_run kill -s TERM $pid
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "Asterisk PBX couldn't be stopped"
		exit $OCF_ERR_GENERIC
	fi

	# stop waiting
	shutdown_timeout=15
	if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
		# the divisor of 1000 treats the meta timeout as milliseconds
		shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5))
	fi
	count=0
	while [ $count -lt $shutdown_timeout ]; do
		asterisk_status
		rc=$?
		if [ $rc -eq $OCF_NOT_RUNNING ]; then
			break
		fi
		count=`expr $count + 1`
		sleep 1
		ocf_log debug "Asterisk PBX still hasn't stopped yet. Waiting ..."
	done

	asterisk_status
	rc=$?
	if [ $rc -ne $OCF_NOT_RUNNING ]; then
		# SIGTERM didn't help either, try SIGKILL
		ocf_log info "Asterisk PBX failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL ..."
		ocf_run kill -s KILL $pid
	fi

	# After killing asterisk, stop astcanary
	if ocf_is_true "$OCF_RESKEY_realtime"; then
		astcanary_pid=`pgrep -d " " -f "astcanary $ASTRUNDIR/alt.asterisk.canary.tweet.tweet.tweet"`
		if [ "$astcanary_pid" ]; then
			# One kill per PID, see asterisk_start
			for i in $astcanary_pid; do ocf_run kill -s KILL $i; done
		fi
	fi

	ocf_log info "Asterisk PBX stopped"
	return $OCF_SUCCESS
}

#######################################################################

# meta-data and help are answered before validation so that they work
# even when the configuration is incomplete.
case "$1" in
  meta-data)	meta_data
		exit $OCF_SUCCESS;;
  usage|help)	usage
		exit $OCF_SUCCESS;;
esac


# Anything except meta-data and help must pass validation
asterisk_validate || exit $?

# Now that validate has passed and we can be sure to be able to read
# the config file, set convenience variables
ASTRUNDIR=`grep astrundir $OCF_RESKEY_config | awk '/^astrundir/ {print $3}'`
ASTLOGDIR=`grep astlogdir $OCF_RESKEY_config | awk '/^astlogdir/ {print $3}'`

# What kind of method was invoked?
case "$1" in
  start)	asterisk_start;;
  stop)		asterisk_stop;;
  status)	asterisk_status;;
  monitor)	asterisk_monitor;;
  validate-all)	;;
  *)		usage
		exit $OCF_ERR_UNIMPLEMENTED;;
esac
diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip
new file mode 100755
index 0000000..dee0403
--- /dev/null
+++ b/heartbeat/aws-vpc-move-ip
@@ -0,0 +1,495 @@
#!/bin/sh
#
#
# OCF resource agent to move an IP address within a VPC in the AWS
#
# Copyright (c) 2017 Markus Guertler (SUSE)
# Based on code of Adam Gandelman (GitHub ec2-resource-agents/elasticip)
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.  Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#


#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Defaults
#
# Every parameter has a *_default value which is applied only when the
# corresponding OCF_RESKEY_* variable is unset.

# AWS CLI binary and the CLI profile / region it is called with
OCF_RESKEY_awscli_default="/usr/bin/aws"
: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}}

OCF_RESKEY_profile_default="default"
: ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}}

OCF_RESKEY_region_default=""
: ${OCF_RESKEY_region=${OCF_RESKEY_region_default}}

# IP address to move ("address" is the deprecated spelling of "ip")
OCF_RESKEY_ip_default=""
: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}}

OCF_RESKEY_address_default=""
: ${OCF_RESKEY_address=${OCF_RESKEY_address_default}}

# Route table(s) to update and the optional role used for that
OCF_RESKEY_routing_table_default=""
: ${OCF_RESKEY_routing_table=${OCF_RESKEY_routing_table_default}}

OCF_RESKEY_routing_table_role_default=""
: ${OCF_RESKEY_routing_table_role=${OCF_RESKEY_routing_table_role_default}}

# Local network interface and optional address label
OCF_RESKEY_interface_default="eth0"
: ${OCF_RESKEY_interface=${OCF_RESKEY_interface_default}}

OCF_RESKEY_iflabel_default=""
: ${OCF_RESKEY_iflabel=${OCF_RESKEY_iflabel_default}}

# Monitoring through the AWS API and the route attribute to compare
OCF_RESKEY_monapi_default="false"
: ${OCF_RESKEY_monapi=${OCF_RESKEY_monapi_default}}

OCF_RESKEY_lookup_type_default="InstanceId"
: ${OCF_RESKEY_lookup_type=${OCF_RESKEY_lookup_type_default}}

# Only pass --region to the CLI when a region was configured
if [ -n "$OCF_RESKEY_region" ]; then
	region_opt="--region $OCF_RESKEY_region"
fi
#######################################################################


# The list matches the actions dispatched at the bottom of this script.
USAGE="usage: $0 {start|stop|monitor|validate-all|meta-data}";
###############################################################################


###############################################################################
#
# Functions
#
###############################################################################


# Print the resource agent meta-data (XML) on stdout.
# The here-document is unquoted, so the ${..._default} references are
# expanded.
metadata() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="aws-vpc-move-ip" version="2.0">
<version>1.0</version>
<longdesc lang="en">
Resource Agent to move IP addresses within a VPC of the Amazon Webservices EC2
by changing an entry in an specific routing table
</longdesc>
<shortdesc lang="en">Move IP within a VPC of the AWS EC2</shortdesc>

<parameters>
<parameter name="awscli">
<longdesc lang="en">
Path to command line tools for AWS
</longdesc>
<shortdesc lang="en">Path to AWS CLI tools</shortdesc>
<content type="string" default="${OCF_RESKEY_awscli_default}" />
</parameter>

<parameter name="profile">
<longdesc lang="en">
Valid AWS CLI profile name (see ~/.aws/config and 'aws configure')
</longdesc>
<shortdesc lang="en">profile name</shortdesc>
<content type="string" default="${OCF_RESKEY_profile_default}" />
</parameter>

<parameter name="region">
<longdesc lang="en">
Valid AWS region name (e.g., 'us-west-2')
</longdesc>
<shortdesc lang="en">region name</shortdesc>
<content type="string" default="${OCF_RESKEY_region_default}" />
</parameter>

<parameter name="ip" required="1">
<longdesc lang="en">
VPC private IP address
</longdesc>
<shortdesc lang="en">VPC private IP</shortdesc>
<content type="string" default="${OCF_RESKEY_ip_default}" />
</parameter>

<parameter name="address">
<longdesc lang="en">
Deprecated IP address param. Use the ip param instead.
</longdesc>
<shortdesc lang="en">Deprecated VPC private IP Address</shortdesc>
<content type="string" default="${OCF_RESKEY_address_default}" />
</parameter>

<parameter name="routing_table" required="1">
<longdesc lang="en">
Name of the routing table(s), where the route for the IP address should be changed. If declaring multiple routing tables they should be separated by comma. Example: rtb-XXXXXXXX,rtb-YYYYYYYYY
</longdesc>
<shortdesc lang="en">routing table name(s)</shortdesc>
<content type="string" default="${OCF_RESKEY_routing_table_default}" />
</parameter>

<parameter name="routing_table_role" required="0">
<longdesc lang="en">
Role to use to query/update the route table
</longdesc>
<shortdesc lang="en">route table query/update role</shortdesc>
<content type="string" default="${OCF_RESKEY_routing_table_role_default}" />
</parameter>

<parameter name="interface" required="1">
<longdesc lang="en">
Name of the network interface, i.e. eth0
</longdesc>
<shortdesc lang="en">network interface name</shortdesc>
<content type="string" default="${OCF_RESKEY_interface_default}" />
</parameter>

<parameter name="iflabel">
<longdesc lang="en">
You can specify an additional label for your IP address here.
This label is appended to your interface name.

The kernel allows alphanumeric labels up to a maximum length of 15
characters including the interface name and colon (e.g. eth0:foobar1234)
</longdesc>
<shortdesc lang="en">Interface label</shortdesc>
<content type="string" default="${OCF_RESKEY_iflabel_default}"/>
</parameter>

<parameter name="monapi">
<longdesc lang="en">
Enable enhanced monitoring using AWS API calls to check route table entry
</longdesc>
<shortdesc lang="en">Enhanced Monitoring</shortdesc>
<content type="boolean" default="${OCF_RESKEY_monapi_default}" />
</parameter>

<parameter name="lookup_type" required="0">
<longdesc lang="en">
Name of resource type to lookup in route table.
"InstanceId" : EC2 instance ID. (default)
"NetworkInterfaceId" : ENI ID. (useful in shared VPC setups).
</longdesc>
<shortdesc lang="en">lookup type for route table resource</shortdesc>
<content type="string" default="${OCF_RESKEY_lookup_type_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="180s" />
<action name="stop" timeout="180s" />
<action name="monitor" depth="0" timeout="30s" interval="60s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}


# Run a command with temporary credentials of an assumed role.
# Arguments:
#   $1  command line to run (word-split by the shell when executed)
#   $2  ARN of the role to assume
# Prints the command's output on stdout and returns the command's exit
# status, so that callers capturing the output can still detect failure.
execute_cmd_as_role(){
	cmd=$1
	role=$2
	output="$($OCF_RESKEY_awscli sts assume-role --role-arn $role --role-session-name AWSCLI-RouteTableUpdate --profile $OCF_RESKEY_profile $region_opt --output=text)"
	# Pick the credential fields out of the text-format reply
	export AWS_ACCESS_KEY_ID="$(echo $output | awk -F" " '$4=="CREDENTIALS" {print $5}')"
	export AWS_SECRET_ACCESS_KEY="$(echo $output | awk -F" " '$4=="CREDENTIALS" {print $7}')"
	export AWS_SESSION_TOKEN="$(echo $output | awk -F" " '$4=="CREDENTIALS" {print $8}')"

	#Execute command
	ocf_log debug "Assumed Role ${role}"
	ocf_log debug "$($OCF_RESKEY_awscli sts get-caller-identity)"
	ocf_log debug "executing command: $cmd"
	response="$($cmd)"
	# Remember the status now; unset and echo below would overwrite $?
	cmd_rc=$?
	unset output AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN
	echo $response
	return $cmd_rc
}

# Fall back to the deprecated "address" parameter when "ip" is empty.
ec2ip_set_address_param_compat(){
	# Include backward compatibility for the deprecated address parameter
	if [ -z "$OCF_RESKEY_ip" ] && [ -n "$OCF_RESKEY_address" ]; then
		OCF_RESKEY_ip="$OCF_RESKEY_address"
	fi
}

# Check binaries and parameters and look up the EC2 instance ID.
# Sets the globals TOKEN and EC2_INSTANCE_ID.
# Returns $OCF_ERR_CONFIGURED / $OCF_ERR_GENERIC on failure and exits
# with $OCF_ERR_CONFIGURED when the interface label is too long.
ec2ip_validate() {
	for cmd in $OCF_RESKEY_awscli ip curl; do
		check_binary "$cmd"
	done

	if [ -z "$OCF_RESKEY_profile" ]; then
		ocf_exit_reason "profile parameter not set"
		return $OCF_ERR_CONFIGURED
	fi

	if [ -n "$OCF_RESKEY_iflabel" ]; then
		# The length check covers interface name, colon and label
		label=${OCF_RESKEY_interface}:${OCF_RESKEY_iflabel}
		if [ ${#label} -gt 15 ]; then
			ocf_exit_reason "Interface label [$label] exceeds maximum character limit of 15"
			exit $OCF_ERR_CONFIGURED
		fi
	fi

	# Request a metadata token first, then use it to read the instance ID
	TOKEN=$(curl -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
	EC2_INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id -H "X-aws-ec2-metadata-token: $TOKEN")

	if [ -z "${EC2_INSTANCE_ID}" ]; then
		ocf_exit_reason "Instance ID not found. Is this a EC2 instance?"
		return $OCF_ERR_GENERIC
	fi

	return $OCF_SUCCESS
}

# Monitor action.
# With monapi enabled, during start and during probes, every configured
# route table is queried and must route the IP to this instance (or
# network interface, depending on lookup_type). Afterwards the address
# has to be present on a running local interface.
# Returns $OCF_SUCCESS or $OCF_NOT_RUNNING.
ec2ip_monitor() {
	MON_RES=""
	if [ "${OCF_RESKEY_lookup_type}" = "NetworkInterfaceId" ]; then
		EC2_ID="$(ec2ip_get_instance_eni)"
		RESOURCE_TYPE="interface"
	else
		EC2_ID="$EC2_INSTANCE_ID"
		RESOURCE_TYPE="instance"
	fi

	if ocf_is_true ${OCF_RESKEY_monapi} || [ "$__OCF_ACTION" = "start" ] || ocf_is_probe; then
		for rtb in $(echo $OCF_RESKEY_routing_table | sed -e 's/,/ /g'); do
			ocf_log info "monitor: check routing table (API call) - $rtb"
			if [ -z "${OCF_RESKEY_routing_table_role}" ]; then
				cmd="$OCF_RESKEY_awscli --profile $OCF_RESKEY_profile $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].$OCF_RESKEY_lookup_type"
				ocf_log debug "executing command: $cmd"
				ROUTE_TO_INSTANCE="$($cmd)"
			else
				cmd="$OCF_RESKEY_awscli $region_opt --output text ec2 describe-route-tables --route-table-ids $rtb --query RouteTables[*].Routes[?DestinationCidrBlock=='$OCF_RESKEY_ip/32'].$OCF_RESKEY_lookup_type"
				ROUTE_TO_INSTANCE="$(execute_cmd_as_role "$cmd" $OCF_RESKEY_routing_table_role)"
			fi
			ocf_log debug "Overlay IP is currently routed to ${ROUTE_TO_INSTANCE}"
			if [ -z "$ROUTE_TO_INSTANCE" ]; then
				ROUTE_TO_INSTANCE="<unknown>"
			fi

			if [ "$EC2_ID" != "$ROUTE_TO_INSTANCE" ]; then
				ocf_log warn "not routed to this $RESOURCE_TYPE ($EC2_ID) but to $RESOURCE_TYPE $ROUTE_TO_INSTANCE on $rtb"
				# collect every table that points elsewhere
				MON_RES="$MON_RES $rtb"
			fi
			sleep 1
		done

		if [ ! -z "$MON_RES" ]; then
			return $OCF_NOT_RUNNING
		fi

	else
		ocf_log debug "monitor: Enhanced Monitoring disabled - omitting API call"
	fi

	cmd="ip addr show to $OCF_RESKEY_ip up"
	ocf_log debug "executing command: $cmd"
	RESULT=$($cmd | grep "$OCF_RESKEY_ip")
	if [ -z "$RESULT" ]; then
		# A missing address is only an error for a regular monitor run
		if [ "$__OCF_ACTION" = "monitor" ] && ! ocf_is_probe; then
			level="error"
		else
			level="info"
		fi

		ocf_log "$level" "IP $OCF_RESKEY_ip not assigned to running interface"
		return $OCF_NOT_RUNNING
	fi

	ocf_log debug "route in VPC and address assigned"
	return $OCF_SUCCESS
}


# Remove the address from the local interface and delete leftover routes.
# Returns $OCF_ERR_GENERIC when "ip addr delete" fails, $OCF_SUCCESS
# otherwise.
ec2ip_drop() {
	cmd="ip addr delete ${OCF_RESKEY_ip}/32 dev $OCF_RESKEY_interface"
	ocf_log debug "executing command: $cmd"
	output=$($cmd 2>&1)
	rc=$?

	if [ "$rc" -gt 0 ]; then
		if [ "$__OCF_ACTION" = "start" ]; then
			# expected to fail during start
			level="debug"
		else
			level="warn"
		fi

		ocf_log "$level" "command failed, rc $rc"
		ocf_log "$level" "output/error: $output"
		return $OCF_ERR_GENERIC
	else
		ocf_log debug "output/error: $output"
	fi

	# delete remaining route-entries if any
	ip route show to exact ${OCF_RESKEY_ip}/32 dev $OCF_RESKEY_interface | xargs -r ip route delete
	ip route show table local to exact ${OCF_RESKEY_ip}/32 dev $OCF_RESKEY_interface | xargs -r ip route delete

	return $OCF_SUCCESS
}

# Print the ENI ID that belongs to the configured interface.
# The MAC address is read from sysfs when available, otherwise from
# "ip link"; the ENI ID is then requested from the metadata service.
# Returns $OCF_ERR_GENERIC when one of the lookups fails.
ec2ip_get_instance_eni() {
	MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address"
	if [ -f $MAC_FILE ]; then
		cmd="cat ${MAC_FILE}"
	else
		cmd="ip -br link show dev ${OCF_RESKEY_interface} | tr -s ' ' | cut -d' ' -f3"
	fi
	ocf_log debug "executing command: $cmd"
	# eval is needed because the fallback command contains a pipeline
	MAC_ADDR="$(eval $cmd)"
	rc=$?
	if [ $rc != 0 ]; then
		ocf_log warn "command failed, rc: $rc"
		return $OCF_ERR_GENERIC
	fi
	ocf_log debug "MAC address associated with interface ${OCF_RESKEY_interface}: ${MAC_ADDR}"

	cmd="curl -s http://169.254.169.254/latest/meta-data/network/interfaces/macs/${MAC_ADDR}/interface-id -H \"X-aws-ec2-metadata-token: $TOKEN\""
	ocf_log debug "executing command: $cmd"
	# eval is needed because the header argument carries embedded quotes
	EC2_NETWORK_INTERFACE_ID="$(eval $cmd)"
	rc=$?
	if [ $rc != 0 ]; then
		ocf_log warn "command failed, rc: $rc"
		return $OCF_ERR_GENERIC
	fi
	ocf_log debug "network interface id associated MAC address ${MAC_ADDR}: ${EC2_NETWORK_INTERFACE_ID}"
	echo $EC2_NETWORK_INTERFACE_ID
}

# Point the route for the IP in every configured route table at this
# node's network interface, then (re)assign the address locally.
# Returns $OCF_ERR_GENERIC as soon as one step fails.
ec2ip_get_and_configure() {
	EC2_NETWORK_INTERFACE_ID="$(ec2ip_get_instance_eni)"
	for rtb in $(echo $OCF_RESKEY_routing_table | sed -e 's/,/ /g'); do
		if [ -z "${OCF_RESKEY_routing_table_role}" ]; then
			cmd="$OCF_RESKEY_awscli --profile $OCF_RESKEY_profile $region_opt --output text ec2 replace-route --route-table-id $rtb --destination-cidr-block ${OCF_RESKEY_ip}/32 --network-interface-id $EC2_NETWORK_INTERFACE_ID"
			ocf_log debug "executing command: $cmd"
			$cmd
		else
			cmd="$OCF_RESKEY_awscli $region_opt --output text ec2 replace-route --route-table-id $rtb --destination-cidr-block ${OCF_RESKEY_ip}/32 --network-interface-id $EC2_NETWORK_INTERFACE_ID"
			update_response="$(execute_cmd_as_role "$cmd" $OCF_RESKEY_routing_table_role)"
		fi
		# Status of whichever branch ran above; on the role path this is
		# the status handed back by execute_cmd_as_role.
		rc=$?
		if [ "$rc" != 0 ]; then
			ocf_log warn "command failed, rc: $rc"
			return $OCF_ERR_GENERIC
		fi
		sleep 1
	done

	# Reconfigure the local ip address
	ec2ip_drop

	extra_opts=""
	if [ -n "$OCF_RESKEY_iflabel" ]; then
		extra_opts="$extra_opts label $OCF_RESKEY_interface:$OCF_RESKEY_iflabel"
	fi

	cmd="ip addr add ${OCF_RESKEY_ip}/32 dev $OCF_RESKEY_interface $extra_opts"
	ocf_log debug "executing command: $cmd"
	$cmd
	rc=$?
	if [ $rc != 0 ]; then
		ocf_log warn "command failed, rc: $rc"
		return $OCF_ERR_GENERIC
	fi

	return $OCF_SUCCESS
}

# Stop action: drop the local address and verify with the monitor that
# it is gone. An address that is already down counts as success.
ec2ip_stop() {
	ocf_log info "EC2: Bringing down IP address $OCF_RESKEY_ip"

	ec2ip_monitor
	if [ $? = $OCF_NOT_RUNNING ]; then
		ocf_log info "EC2: Address $OCF_RESKEY_ip already down"
		return $OCF_SUCCESS
	fi

	ec2ip_drop
	if [ $? != $OCF_SUCCESS ]; then
		return $OCF_ERR_GENERIC
	fi

	ec2ip_monitor
	if [ $? != $OCF_NOT_RUNNING ]; then
		ocf_log error "EC2: Couldn't bring down IP address $OCF_RESKEY_ip on interface $OCF_RESKEY_interface."
		return $OCF_ERR_GENERIC
	fi

	ocf_log info "EC2: Successfully brought down $OCF_RESKEY_ip"
	return $OCF_SUCCESS
}

# Start action: nothing to do when the monitor already succeeds;
# otherwise update the route tables, configure the address and verify
# the result with the monitor.
ec2ip_start() {
	ocf_log info "EC2: Moving IP address $OCF_RESKEY_ip to this host by adjusting routing table $OCF_RESKEY_routing_table"

	ec2ip_monitor
	if [ $? = $OCF_SUCCESS ]; then
		ocf_log info "EC2: $OCF_RESKEY_ip already started"
		return $OCF_SUCCESS
	fi

	ocf_log info "EC2: Adjusting routing table and locally configuring IP address"
	ec2ip_get_and_configure
	rc=$?
	if [ $rc != $OCF_SUCCESS ]; then
		ocf_log error "Received $rc from 'aws'"
		return $OCF_ERR_GENERIC
	fi

	ec2ip_monitor
	if [ $? != $OCF_SUCCESS ]; then
		ocf_log error "EC2: IP address couldn't be configured on this host (IP: $OCF_RESKEY_ip, Interface: $OCF_RESKEY_interface)"
		return $OCF_ERR_GENERIC
	fi

	return $OCF_SUCCESS
}

###############################################################################
#
# MAIN
#
###############################################################################

# meta-data and help need neither root nor a valid configuration
case $__OCF_ACTION in
	meta-data)
		metadata
		exit $OCF_SUCCESS
		;;
	usage|help)
		echo $USAGE
		exit $OCF_SUCCESS
		;;
esac

if ! ocf_is_root; then
	ocf_log err "You must be root for $__OCF_ACTION operation."
	exit $OCF_ERR_PERM
fi

ec2ip_set_address_param_compat

# The result is only reported for validate-all; the other actions
# continue regardless of it, as before.
ec2ip_validate
validate_rc=$?

case $__OCF_ACTION in
	start)
		ec2ip_start;;
	stop)
		ec2ip_stop;;
	monitor)
		ec2ip_monitor;;
	validate-all)
		exit $validate_rc;;
	*)
		echo $USAGE
		exit $OCF_ERR_UNIMPLEMENTED
		;;
esac
diff --git a/heartbeat/aws-vpc-route53.in b/heartbeat/aws-vpc-route53.in
new file mode 100644
index 0000000..22cbb35
--- /dev/null
+++ b/heartbeat/aws-vpc-route53.in
@@ -0,0 +1,449 @@
#!@BASH_SHELL@
#
#   Copyright 2017 Amazon.com, Inc. and its affiliates. All Rights Reserved.
#   Licensed under the MIT License.
#
#  Copyright 2017 Amazon.com, Inc. and its affiliates

# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do
# so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
+
+#
+#
+#
+# OCF resource agent that points a Route53 A record at the active cluster node
+# Written by Stefan Schneider (AWS), Martin Tegmeier (AWS)
+# Based on code of Markus Guertler (SUSE)
+#
+#
+# start/monitor: look up the record and UPSERT it when it does not match
+#                this node's address (see r53_monitor, _update_record)
+# stop:          no-op, the record is deliberately left in place (see r53_stop)
+#
+# Mar. 15, 2017, vers 1.0.2
+
+
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# Defaults
+OCF_RESKEY_awscli_default="/usr/bin/aws"
+OCF_RESKEY_profile_default="default"
+OCF_RESKEY_hostedzoneid_default=""
+OCF_RESKEY_fullname_default=""
+OCF_RESKEY_ip_default="local"
+OCF_RESKEY_ttl_default=10
+
+: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}}
+: ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}}
+: ${OCF_RESKEY_hostedzoneid:=${OCF_RESKEY_hostedzoneid_default}}
+: ${OCF_RESKEY_fullname:=${OCF_RESKEY_fullname_default}}
+: ${OCF_RESKEY_ip:=${OCF_RESKEY_ip_default}}
+: ${OCF_RESKEY_ttl:=${OCF_RESKEY_ttl_default}}
+#######################################################################
+
+
+AWS_PROFILE_OPT="--profile $OCF_RESKEY_profile --cli-connect-timeout 10"
+#######################################################################
+
+
+usage() {
+	cat <<-EOT
+	usage: $0 {start|stop|status|monitor|validate-all|meta-data}
+	EOT
+}
+
+metadata() {
+cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="aws-vpc-route53" version="1.0">
+<version>1.0</version>
+<longdesc lang="en">
+Update Route53 record of Amazon Webservices EC2 by updating an entry in a
+hosted zone ID table.
+
+AWS instances will require policies which allow them to update Route53 ARecords:
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1471878724000",
+            "Effect": "Allow",
+            "Action": [
+                "route53:ChangeResourceRecordSets",
+                "route53:GetChange",
+                "route53:ListResourceRecordSets"
+            ],
+            "Resource": [
+                "*"
+            ]
+        }
+    ]
+}
+
+Example Cluster Configuration:
+
+Use a configuration in "crm configure edit" which looks as follows. Replace
+hostedzoneid, fullname and profile with the appropriate values:
+
+primitive res_route53 ocf:heartbeat:aws-vpc-route53 \\
+    params hostedzoneid=EX4MPL3EX4MPL3 fullname=service.cloud.example.corp. profile=cluster \\
+    op start interval=0 timeout=180 \\
+    op stop interval=0 timeout=180 \\
+    op monitor interval=300 timeout=180 \\
+    meta target-role=Started
+</longdesc>
+<shortdesc lang="en">Update Route53 VPC record for AWS EC2</shortdesc>
+
+<parameters>
+<parameter name="awscli">
+<longdesc lang="en">
+Path to command line tools for AWS
+</longdesc>
+<shortdesc lang="en">Path to AWS CLI tools</shortdesc>
+<content type="string" default="${OCF_RESKEY_awscli_default}" />
+</parameter>
+
+<parameter name="profile">
+<longdesc lang="en">
+The name of the AWS CLI profile of the root account. This
+profile will have to use the "text" format for CLI output.
+The file /root/.aws/config should have an entry which looks
+like:
+
+  [profile cluster]
+	region = us-east-1
+	output = text
+
+"cluster" is the name which has to be used in the cluster
+configuration. The region has to be the current one. The
+output has to be "text".
+</longdesc>
+<shortdesc lang="en">AWS Profile Name</shortdesc>
+<content type="string" default="${OCF_RESKEY_profile_default}" />
+</parameter>
+
+<parameter name="hostedzoneid" required="1">
+<longdesc lang="en">
+Hosted zone ID of Route 53. This is the table of
+the Route 53 record.
+</longdesc>
+<shortdesc lang="en">AWS hosted zone ID</shortdesc>
+<content type="string" default="${OCF_RESKEY_hostedzoneid_default}" />
+</parameter>
+
+<parameter name="fullname" required="1">
+<longdesc lang="en">
+The full name of the service which will host the IP address.
+Example: service.cloud.example.corp.
+Note: The trailing dot is important to Route53!
+</longdesc>
+<shortdesc lang="en">Full service name</shortdesc>
+<content type="string" default="${OCF_RESKEY_fullname_default}" />
+</parameter>
+
+<parameter name="ip" required="0">
+<longdesc lang="en">
+IP (local (default), public or secondary private IP address (e.g. 10.0.0.1).
+
+A secondary private IP can be setup with the awsvip agent.
+</longdesc>
+<shortdesc lang="en">Type of IP or secondary private IP address (local, public or e.g. 10.0.0.1)</shortdesc>
+<content type="string" default="${OCF_RESKEY_ip_default}" />
+</parameter>
+
+<parameter name="ttl" required="0">
+<longdesc lang="en">
+Time to live for Route53 ARECORD
+</longdesc>
+<shortdesc lang="en">ARECORD TTL</shortdesc>
+<content type="string" default="${OCF_RESKEY_ttl_default}" />
+</parameter>
+</parameters>
+
+<actions>
+<action name="start" timeout="180s" />
+<action name="stop" timeout="180s" />
+<action name="monitor" depth="0" timeout="180s" interval="300s" />
+<action name="validate-all" timeout="5s" />
+<action name="meta-data" timeout="5s" />
+</actions>
+</resource-agent>
+END
+}
+
+r53_validate() {
+	ocf_log debug "function: validate"
+
+	# Check for required binaries
+	ocf_log debug "Checking for required binaries"
+	for command in curl dig; do
+		check_binary "$command"
+	done
+
+	# Full name
+	[[ -z "$OCF_RESKEY_fullname" ]] && ocf_log error "Full name parameter not set $OCF_RESKEY_fullname!" && exit $OCF_ERR_CONFIGURED
+
+	# Hosted Zone ID
+	[[ -z "$OCF_RESKEY_hostedzoneid" ]] && ocf_log error "Hosted Zone ID parameter not set $OCF_RESKEY_hostedzoneid!" && exit $OCF_ERR_CONFIGURED
+
+	# Type of IP/secondary IP address
+	case $OCF_RESKEY_ip in
+		local|public|*.*.*.*)
+			;;
+		*)
+			ocf_exit_reason "Invalid value for ip: ${OCF_RESKEY_ip}"
+			exit $OCF_ERR_CONFIGURED
+	esac
+
+	# profile
+	[[ -z "$OCF_RESKEY_profile" ]] && ocf_log error "AWS CLI profile not set $OCF_RESKEY_profile!" && exit $OCF_ERR_CONFIGURED
+
+	# TTL
+	[[ -z "$OCF_RESKEY_ttl" ]] && ocf_log error "TTL not set $OCF_RESKEY_ttl!" && exit $OCF_ERR_CONFIGURED
+
+	ocf_log debug "Testing aws command"
+	$OCF_RESKEY_awscli --version 2>&1
+	if [ "$?" -gt 0 ]; then
+		ocf_log error "Error while executing aws command as user root! Please check if AWS CLI tools (Python flavor) are properly installed and configured." && exit $OCF_ERR_INSTALLED
+	fi
+	ocf_log debug "ok"
+
+	return $OCF_SUCCESS
+}
+
+r53_start() {
+	#
+	# Start agent and config DNS in Route53
+	#
+	ocf_log info "Starting Route53 DNS update...."
+	_get_ip
+	r53_monitor
+	if [ $? != $OCF_SUCCESS ]; then
+		ocf_log info "Could not start agent - check configurations"
+		return $OCF_ERR_GENERIC
+	fi
+	return $OCF_SUCCESS
+}
+
+r53_stop() {
+	#
+	# Stop operation doesn't perform any API call or try to remove the DNS record
+	# this mostly because this is not necessarily mandatory or desired
+	# the start and monitor functions will take care of changing the DNS record
+	# if the agent starts in a different cluster node
+	#
+	ocf_log info "Bringing down Route53 agent. (Will NOT remove Route53 DNS record)"
+	return $OCF_SUCCESS
+}
+
+r53_monitor() {
+	#
+	# For every start action the agent will call Route53 API to check for DNS record
+	# otherwise it will try to get results directly by querying the DNS using "dig".
+	# Due to complexity in some DNS architectures "dig" can fail, and if this happens
+	# the monitor will fallback to the Route53 API call.
+	#
+	# A failed API call during a recurring monitor is not reported as a failure
+	# ($OCF_SUCCESS is returned); during start or probe it is $OCF_ERR_GENERIC
+	#
+	# In case of the monitor operation detects a wrong or non-existent Route53 DNS entry
+	# it will try to fix the existing one, or create it again
+	#
+	#
+	ARECORD=""
+	IPREGEX="^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$"
+	r53_validate
+	ocf_log debug "Checking Route53 record sets"
+	#
+	_get_ip
+	#
+	if [ "$__OCF_ACTION" = "start" ] || ocf_is_probe ; then
+		#
+		cmd="$OCF_RESKEY_awscli $AWS_PROFILE_OPT route53 list-resource-record-sets --hosted-zone-id $OCF_RESKEY_hostedzoneid --query ResourceRecordSets[?Name=='$OCF_RESKEY_fullname']"
+		ocf_log info "Route53 Agent Starting or probing - executing monitoring API call: $cmd"
+		CLIRES="$($cmd 2>&1)"
+		rc=$?
+		ocf_log debug "awscli returned code: $rc"
+		if [ $rc -ne 0 ]; then
+			CLIRES=$(echo $CLIRES | grep -v '^$')
+			ocf_log warn "Route53 API returned an error: $CLIRES"
+			ocf_log warn "Skipping cluster action due to API call error"
+			return $OCF_ERR_GENERIC
+		fi
+		ARECORD=$(echo $CLIRES | grep RESOURCERECORDS | awk '{ print $5 }')
+		#
+		if ocf_is_probe; then
+			#
+			# Prevent R53 record change during probe
+			#
+			if [[ $ARECORD =~ $IPREGEX ]] && [ "$ARECORD" != "$IPADDRESS" ]; then
+				ocf_log debug "Route53 DNS record $ARECORD found at probing, disregarding"
+				return $OCF_NOT_RUNNING
+			fi
+		fi
+	else
+		# The stderr redirect must be applied by the shell at execution time;
+		# inside the command string it would be handed to dig as an argument.
+		cmd="dig +retries=3 +time=5 +short $OCF_RESKEY_fullname"
+		ocf_log info "executing monitoring command : $cmd"
+		ARECORD="$($cmd 2>/dev/null)"
+		rc=$?
+		ocf_log debug "dig return code: $rc"
+		#
+		if [[ ! $ARECORD =~ $IPREGEX ]] || [ $rc -ne 0 ]; then
+			ocf_log info "Fallback to Route53 API query due to DNS resolution failure"
+			cmd="$OCF_RESKEY_awscli $AWS_PROFILE_OPT route53 list-resource-record-sets --hosted-zone-id $OCF_RESKEY_hostedzoneid --query ResourceRecordSets[?Name=='$OCF_RESKEY_fullname']"
+			ocf_log debug "executing monitoring API call: $cmd"
+			CLIRES="$($cmd 2>&1)"
+			rc=$?
+			ocf_log debug "awscli return code: $rc"
+			if [ $rc -ne 0 ]; then
+				CLIRES=$(echo $CLIRES | grep -v '^$')
+				ocf_log warn "Route53 API returned an error: $CLIRES"
+				ocf_log warn "Monitor skipping cluster action due to API call error"
+				return $OCF_SUCCESS
+			fi
+			ARECORD=$(echo $CLIRES | grep RESOURCERECORDS | awk '{ print $5 }')
+		fi
+		#
+	fi
+	ocf_log info "Route53 DNS record pointing $OCF_RESKEY_fullname to IP address $ARECORD"
+	#
+	if [ "$ARECORD" == "$IPADDRESS" ]; then
+		ocf_log info "Route53 DNS record $ARECORD found"
+		return $OCF_SUCCESS
+	elif [[ $ARECORD =~ $IPREGEX ]] && [ "$ARECORD" != "$IPADDRESS" ]; then
+		ocf_log info "Route53 DNS record points to a different host, setting DNS record on Route53 to this host"
+		_update_record "UPSERT" "$IPADDRESS"
+		return $OCF_SUCCESS
+	else
+		ocf_log info "No Route53 DNS record found, setting DNS record on Route53 to this host"
+		_update_record "UPSERT" "$IPADDRESS"
+		return $OCF_SUCCESS
+	fi
+
+	return $OCF_SUCCESS
+}
+
+_get_ip() {
+	case $OCF_RESKEY_ip in
+		local|public)
+			TOKEN=$(curl -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
+			IPADDRESS=$(curl -s http://169.254.169.254/latest/meta-data/${OCF_RESKEY_ip}-ipv4 -H "X-aws-ec2-metadata-token: $TOKEN");;
+		*.*.*.*)
+			IPADDRESS="${OCF_RESKEY_ip}";;
+	esac
+}
+
+_update_record() {
+	#
+	# This function is the one that will actually execute Route53's API call
+	# and configure the DNS record using the correct API calls and parameters
+	#
+	# It writes the API payload to a temporary JSON file obtained from maketempfile
+	#
+	# An API failure returns $OCF_ERR_GENERIC; r53_monitor does not check it
+	#
+	update_action="$1"
+	IPADDRESS="$2"
+	ocf_log info "Updating Route53 $OCF_RESKEY_hostedzoneid with $IPADDRESS for $OCF_RESKEY_fullname"
+	ROUTE53RECORD="$(maketempfile)"
+	if [ $? -ne 0 ] || [ -z "$ROUTE53RECORD" ]; then
+		ocf_exit_reason "Failed to create temporary file for record update"
+		exit $OCF_ERR_GENERIC
+	fi
+	cat >>"$ROUTE53RECORD" <<-EOF
+	{
+		"Comment": "Update record to reflect new IP address for a system ",
+		"Changes": [
+			{
+				"Action": "$update_action",
+				"ResourceRecordSet": {
+					"Name": "$OCF_RESKEY_fullname",
+					"Type": "A",
+					"TTL": $OCF_RESKEY_ttl,
+					"ResourceRecords": [
+						{
+							"Value": "$IPADDRESS"
+						}
+					]
+				}
+			}
+		]
+	}
+	EOF
+	cmd="$OCF_RESKEY_awscli $AWS_PROFILE_OPT route53 change-resource-record-sets --hosted-zone-id $OCF_RESKEY_hostedzoneid --change-batch file://$ROUTE53RECORD "
+	ocf_log debug "Executing command: $cmd"
+	CLIRES="$($cmd 2>&1)"
+	rc=$?
+	rmtempfile "$ROUTE53RECORD"	# remove the payload on the error path too
+	ocf_log debug "awscli returned code: $rc"
+	if [ $rc -ne 0 ]; then
+		CLIRES=$(echo $CLIRES | grep -v '^$')
+		ocf_log warn "Route53 API returned an error: $CLIRES"
+		ocf_log warn "Skipping cluster action due to API call error"
+		return $OCF_ERR_GENERIC
+	fi
+	CHANGEID=$(echo $CLIRES | awk '{ print $12 }')
+	ocf_log debug "Change id: $CHANGEID"
+	CHANGEID=$(echo $CHANGEID | cut -d'/' -f 3 | cut -d'"' -f 1 )
+	ocf_log debug "Change id: $CHANGEID"
+	STATUS="PENDING"
+	MYSECONDS=20
+	while [ "$STATUS" = 'PENDING' ]; do
+		sleep $MYSECONDS
+		STATUS="$($OCF_RESKEY_awscli $AWS_PROFILE_OPT route53 get-change --id $CHANGEID | grep CHANGEINFO | awk -F'\t' '{ print $4 }' |cut -d'"' -f 2 )"
+		ocf_log debug "Waited for $MYSECONDS seconds and checked execution of Route 53 update status: $STATUS "
+	done
+}
+
+###############################################################################
+
+case $__OCF_ACTION in
+	usage|help)
+		usage
+		exit $OCF_SUCCESS
+		;;
+	meta-data)
+		metadata
+		exit $OCF_SUCCESS
+		;;
+	start)
+		r53_validate || exit $?
+		r53_start
+		;;
+	stop)
+		r53_stop
+		;;
+	monitor)
+		r53_monitor
+		;;
+	validate-all)
+		r53_validate
+		;;
+	*)
+		usage
+		exit $OCF_ERR_UNIMPLEMENTED
+		;;
+esac
+
+exit $?
diff --git a/heartbeat/awseip b/heartbeat/awseip
new file mode 100755
index 0000000..dc48460
--- /dev/null
+++ b/heartbeat/awseip
@@ -0,0 +1,287 @@
+#!/bin/sh
+#
+#
+# Manage Elastic IP with Pacemaker
+#
+#
+# Copyright 2016-2018 guessi <guessi@gmail.com>
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+#
+# Prerequisites:
+#
+# - preconfigured AWS CLI running environment (AccessKey, SecretAccessKey, etc.)
+# - a reserved secondary private IP address for EC2 instances high availability
+# - IAM user role with the following permissions:
+#   * DescribeInstances
+#   * AssociateAddress
+#   * DescribeAddresses
+#   * DisassociateAddress
+#
+
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+.
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+#######################################################################
+
+#
+# Defaults
+#
+OCF_RESKEY_awscli_default="/usr/bin/aws"
+OCF_RESKEY_profile_default="default"
+OCF_RESKEY_api_delay_default="3"
+
+: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}}
+: ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}}
+: ${OCF_RESKEY_api_delay=${OCF_RESKEY_api_delay_default}}
+
+meta_data() {
+    cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="awseip" version="1.0">
+<version>1.0</version>
+
+<longdesc lang="en">
+Resource Agent for Amazon AWS Elastic IP Addresses.
+
+It manages AWS Elastic IP Addresses with awscli.
+
+Credentials needs to be setup by running "aws configure".
+
+See https://aws.amazon.com/cli/ for more information about awscli.
+</longdesc>
+<shortdesc lang="en">Amazon AWS Elastic IP Address Resource Agent</shortdesc>
+
+<parameters>
+
+<parameter name="awscli" unique="0">
+<longdesc lang="en">
+command line tools for aws services
+</longdesc>
+<shortdesc lang="en">aws cli tools</shortdesc>
+<content type="string" default="${OCF_RESKEY_awscli_default}" />
+</parameter>
+
+<parameter name="profile">
+<longdesc lang="en">
+Valid AWS CLI profile name (see ~/.aws/config and 'aws configure')
+</longdesc>
+<shortdesc lang="en">profile name</shortdesc>
+<content type="string" default="${OCF_RESKEY_profile_default}" />
+</parameter>
+
+<parameter name="elastic_ip" unique="1" required="1">
+<longdesc lang="en">
+reserved elastic ip for ec2 instance
+</longdesc>
+<shortdesc lang="en">reserved elastic ip for ec2 instance</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="allocation_id" unique="1" required="1">
+<longdesc lang="en">
+reserved allocation id for ec2 instance
+</longdesc>
+<shortdesc lang="en">reserved allocation id for ec2 instance</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="private_ip_address" unique="1" required="0">
+<longdesc lang="en">
+predefined private ip address for ec2 instance
+</longdesc>
+<shortdesc lang="en">predefined private ip address for ec2 instance</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="api_delay" unique="0">
+<longdesc lang="en">
+a short delay between API calls, to avoid sending API too quick
+</longdesc>
+<shortdesc lang="en">a short delay between API calls</shortdesc>
+<content type="integer" default="${OCF_RESKEY_api_delay_default}" />
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start"        timeout="30s" />
+<action name="stop"         timeout="30s" />
+<action name="monitor"      timeout="30s" interval="20s" depth="0" />
+<action name="migrate_to"   timeout="30s" />
+<action name="migrate_from" timeout="30s" />
+<action name="meta-data"    timeout="5s" />
+<action name="validate"     timeout="10s" />
+<action name="validate-all" timeout="10s" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+
+awseip_usage() {
+    cat <<END
+usage: $0 {start|stop|monitor|migrate_to|migrate_from|validate|validate-all|meta-data}
+
+Expects to have a fully populated OCF RA-compliant environment set.
+END
+}
+
+awseip_start() {
+    awseip_monitor && return $OCF_SUCCESS
+
+    if [ -n "${PRIVATE_IP_ADDRESS}" ]; then
+        NETWORK_INTERFACES_MACS=$(curl -s http://169.254.169.254/latest/meta-data/network/interfaces/macs/ -H "X-aws-ec2-metadata-token: $TOKEN")
+        for MAC in ${NETWORK_INTERFACES_MACS}; do
+            curl -s http://169.254.169.254/latest/meta-data/network/interfaces/macs/${MAC}/local-ipv4s -H "X-aws-ec2-metadata-token: $TOKEN" |
+                grep -qxF -- "${PRIVATE_IP_ADDRESS}"
+            if [ $? -eq 0 ]; then
+                NETWORK_ID=$(curl -s http://169.254.169.254/latest/meta-data/network/interfaces/macs/${MAC}/interface-id -H "X-aws-ec2-metadata-token: $TOKEN")
+            fi
+        done
+        $AWSCLI --profile $OCF_RESKEY_profile ec2 associate-address \
+            --network-interface-id ${NETWORK_ID} \
+            --allocation-id ${ALLOCATION_ID} \
+            --private-ip-address ${PRIVATE_IP_ADDRESS}
+        RET=$?
+    else
+        $AWSCLI --profile $OCF_RESKEY_profile ec2 associate-address \
+            --instance-id ${INSTANCE_ID} \
+            --allocation-id ${ALLOCATION_ID}
+        RET=$?
+    fi
+
+    # delay to avoid sending request too fast
+    sleep ${OCF_RESKEY_api_delay}
+
+    if [ $RET -ne 0 ]; then
+        return $OCF_NOT_RUNNING
+    fi
+
+    ocf_log info "elastic_ip has been successfully brought up (${ELASTIC_IP})"
+    return $OCF_SUCCESS
+}
+
+awseip_stop() {
+    awseip_monitor || return $OCF_SUCCESS
+
+    ASSOCIATION_ID=$($AWSCLI --profile $OCF_RESKEY_profile --output json ec2 describe-addresses \
+                         --allocation-id ${ALLOCATION_ID} | grep -m 1 "AssociationId" | awk -F'"' '{print$4}')
+
+    if [ -z "${ASSOCIATION_ID}" ]; then
+        ocf_log info "ASSOCIATION_ID not found while stopping AWS Elastic IP"
+        return $OCF_NOT_RUNNING
+    fi
+
+    $AWSCLI --profile ${OCF_RESKEY_profile} \
+        ec2 disassociate-address \
+        --association-id ${ASSOCIATION_ID}
+    RET=$?
+
+    # delay to avoid sending request too fast
+    sleep ${OCF_RESKEY_api_delay}
+
+    if [ $RET -ne 0 ]; then
+        return $OCF_NOT_RUNNING
+    fi
+
+    ocf_log info "elastic_ip has been successfully brought down (${ELASTIC_IP})"
+    return $OCF_SUCCESS
+}
+
+awseip_monitor() {
+    $AWSCLI --profile $OCF_RESKEY_profile ec2 describe-instances --instance-id "${INSTANCE_ID}" | grep -qwF -- "${ELASTIC_IP}"
+    RET=$?
+
+    if [ $RET -ne 0 ]; then
+        return $OCF_NOT_RUNNING
+    fi
+    return $OCF_SUCCESS
+}
+
+awseip_validate() {
+    check_binary ${AWSCLI}
+
+    if [ -z "$OCF_RESKEY_profile" ]; then
+        ocf_exit_reason "profile parameter not set"
+        return $OCF_ERR_CONFIGURED
+    fi
+
+    if [ -z "${INSTANCE_ID}" ]; then
+        ocf_exit_reason "instance_id not found. Is this a EC2 instance?"
+        return $OCF_ERR_GENERIC
+    fi
+
+    return $OCF_SUCCESS
+}
+
+case $__OCF_ACTION in
+    meta-data)
+        meta_data
+        exit $OCF_SUCCESS
+        ;;
+esac
+
+AWSCLI="${OCF_RESKEY_awscli}"
+ELASTIC_IP="${OCF_RESKEY_elastic_ip}"
+ALLOCATION_ID="${OCF_RESKEY_allocation_id}"
+PRIVATE_IP_ADDRESS="${OCF_RESKEY_private_ip_address}"
+TOKEN=$(curl -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
+INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id -H "X-aws-ec2-metadata-token: $TOKEN")
+
+case $__OCF_ACTION in
+    start)
+        awseip_validate || exit $?
+        awseip_start
+        ;;
+    stop)
+        awseip_stop
+        ;;
+    monitor)
+        awseip_monitor
+        ;;
+    migrate_to)
+        ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} to ${OCF_RESKEY_CRM_meta_migrate_target}."
+        awseip_stop
+        ;;
+    migrate_from)
+        ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} from ${OCF_RESKEY_CRM_meta_migrate_source}."
+        awseip_start
+        ;;
+    reload)
+        ocf_log info "Reloading ${OCF_RESOURCE_INSTANCE} ..."
+        ;;
+    validate|validate-all)
+        awseip_validate
+        ;;
+    usage|help)
+        awseip_usage
+        exit $OCF_SUCCESS
+        ;;
+    *)
+        awseip_usage
+        exit $OCF_ERR_UNIMPLEMENTED
+        ;;
+esac
+
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc
diff --git a/heartbeat/awsvip b/heartbeat/awsvip
new file mode 100755
index 0000000..037278e
--- /dev/null
+++ b/heartbeat/awsvip
@@ -0,0 +1,251 @@
+#!/bin/sh
+#
+#
+# Manage Secondary Private IP with Pacemaker
+#
+#
+# Copyright 2016-2018 guessi <guessi@gmail.com>
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+
+#
+# Prerequisites:
+#
+# - preconfigured AWS CLI running environment (AccessKey, SecretAccessKey, etc.)
+# - a reserved secondary private IP address for EC2 instances high availability
+# - IAM user role with the following permissions:
+#   * DescribeInstances
+#   * AssignPrivateIpAddresses
+#   * UnassignPrivateIpAddresses
+#
+
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+#######################################################################
+
+#
+# Defaults
+#
+OCF_RESKEY_awscli_default="/usr/bin/aws"
+OCF_RESKEY_profile_default="default"
+OCF_RESKEY_api_delay_default="3"
+
+: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}}
+: ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}}
+: ${OCF_RESKEY_api_delay=${OCF_RESKEY_api_delay_default}}
+
+meta_data() {
+    cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="awsvip" version="1.0">
+<version>1.0</version>
+
+<longdesc lang="en">
+Resource Agent for Amazon AWS Secondary Private IP Addresses.
+
+It manages AWS Secondary Private IP Addresses with awscli.
+
+Credentials needs to be setup by running "aws configure".
+
+See https://aws.amazon.com/cli/ for more information about awscli.
+</longdesc>
+<shortdesc lang="en">Amazon AWS Secondary Private IP Address Resource Agent</shortdesc>
+
+<parameters>
+
+<parameter name="awscli" unique="0">
+<longdesc lang="en">
+command line tools for aws services
+</longdesc>
+<shortdesc lang="en">aws cli tools</shortdesc>
+<content type="string" default="${OCF_RESKEY_awscli_default}" />
+</parameter>
+
+<parameter name="profile">
+<longdesc lang="en">
+Valid AWS CLI profile name (see ~/.aws/config and 'aws configure')
+</longdesc>
+<shortdesc lang="en">profile name</shortdesc>
+<content type="string" default="${OCF_RESKEY_profile_default}" />
+</parameter>
+
+<parameter name="secondary_private_ip" unique="1" required="1">
+<longdesc lang="en">
+reserved secondary private ip for ec2 instance
+</longdesc>
+<shortdesc lang="en">reserved secondary private ip for ec2 instance</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="api_delay" unique="0">
+<longdesc lang="en">
+a short delay between API calls, to avoid sending API too quick
+</longdesc>
+<shortdesc lang="en">a short delay between API calls</shortdesc>
+<content type="integer" default="${OCF_RESKEY_api_delay_default}" />
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start"        timeout="30s" />
+<action name="stop"         timeout="30s" />
+<action name="monitor"      timeout="30s" interval="20s" depth="0" />
+<action name="migrate_to"   timeout="30s" />
+<action name="migrate_from" timeout="30s" />
+<action name="meta-data"    timeout="5s" />
+<action name="validate"     timeout="10s" />
+<action name="validate-all" timeout="10s" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+
+awsvip_usage() {
+    cat <<END
+usage: $0 {start|stop|monitor|migrate_to|migrate_from|validate|validate-all|meta-data}
+
+Expects to have a fully populated OCF RA-compliant environment set.
+END
+}
+
+awsvip_start() {
+    awsvip_monitor && return $OCF_SUCCESS
+
+    $AWSCLI --profile $OCF_RESKEY_profile ec2 assign-private-ip-addresses \
+        --network-interface-id ${NETWORK_ID} \
+        --private-ip-addresses ${SECONDARY_PRIVATE_IP} \
+        --allow-reassignment
+    RET=$?
+
+    # delay to avoid sending request too fast
+    sleep ${OCF_RESKEY_api_delay}
+
+    if [ $RET -ne 0 ]; then
+        return $OCF_NOT_RUNNING
+    fi
+
+    ocf_log info "secondary_private_ip has been successfully brought up (${SECONDARY_PRIVATE_IP})"
+    return $OCF_SUCCESS
+}
+
+awsvip_stop() {
+    awsvip_monitor || return $OCF_SUCCESS
+
+    $AWSCLI --profile $OCF_RESKEY_profile ec2 unassign-private-ip-addresses \
+        --network-interface-id ${NETWORK_ID} \
+        --private-ip-addresses ${SECONDARY_PRIVATE_IP}
+    RET=$?
+
+    # delay to avoid sending request too fast
+    sleep ${OCF_RESKEY_api_delay}
+
+    if [ $RET -ne 0 ]; then
+        return $OCF_NOT_RUNNING
+    fi
+
+    ocf_log info "secondary_private_ip has been successfully brought down (${SECONDARY_PRIVATE_IP})"
+    return $OCF_SUCCESS
+}
+
+awsvip_monitor() {
+    $AWSCLI --profile ${OCF_RESKEY_profile} ec2 describe-instances \
+        --instance-id "${INSTANCE_ID}" \
+        --query 'Reservations[].Instances[].NetworkInterfaces[].PrivateIpAddresses[].PrivateIpAddress[]' \
+        --output text | \
+        grep -qE "(^|\s)${SECONDARY_PRIVATE_IP}(\s|$)"
+    RET=$?
+
+    if [ $RET -ne 0 ]; then
+        return $OCF_NOT_RUNNING
+    fi
+    return $OCF_SUCCESS
+}
+
+awsvip_validate() {
+    check_binary ${AWSCLI}
+
+    if [ -z "$OCF_RESKEY_profile" ]; then
+        ocf_exit_reason "profile parameter not set"
+        return $OCF_ERR_CONFIGURED
+    fi
+
+    if [ -z "${INSTANCE_ID}" ]; then
+        ocf_exit_reason "instance_id not found. Is this a EC2 instance?"
+        return $OCF_ERR_GENERIC
+    fi
+
+    return $OCF_SUCCESS
+}
+
+case $__OCF_ACTION in
+    meta-data)
+        meta_data
+        exit $OCF_SUCCESS
+        ;;
+esac
+
+AWSCLI="${OCF_RESKEY_awscli}"
+SECONDARY_PRIVATE_IP="${OCF_RESKEY_secondary_private_ip}"
+TOKEN=$(curl -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
+INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id -H "X-aws-ec2-metadata-token: $TOKEN")
+MAC_ADDRESS=$(curl -s http://169.254.169.254/latest/meta-data/mac -H "X-aws-ec2-metadata-token: $TOKEN")
+NETWORK_ID=$(curl -s http://169.254.169.254/latest/meta-data/network/interfaces/macs/${MAC_ADDRESS}/interface-id -H "X-aws-ec2-metadata-token: $TOKEN")
+
+case $__OCF_ACTION in
+    start)
+        awsvip_validate || exit $?
+        awsvip_start
+        ;;
+    stop)
+        awsvip_stop
+        ;;
+    monitor)
+        awsvip_monitor
+        ;;
+    migrate_to)
+        ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} to ${OCF_RESKEY_CRM_meta_migrate_target}."
+        awsvip_stop
+        ;;
+    migrate_from)
+        ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} from ${OCF_RESKEY_CRM_meta_migrate_source}."
+        awsvip_start
+        ;;
+    reload)
+        ocf_log info "Reloading ${OCF_RESOURCE_INSTANCE} ..."
+        ;;
+    validate|validate-all)
+        awsvip_validate
+        ;;
+    usage|help)
+        awsvip_usage
+        exit $OCF_SUCCESS
+        ;;
+    *)
+        awsvip_usage
+        exit $OCF_ERR_UNIMPLEMENTED
+        ;;
+esac
+
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc
diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in
new file mode 100644
index 0000000..46d4d1f
--- /dev/null
+++ b/heartbeat/azure-events-az.in
@@ -0,0 +1,772 @@
+#!@PYTHON@ -tt
+#
+# Resource agent for monitoring Azure Scheduled Events
+#
+# License: GNU General Public License (GPL)
+# (c) 2018 Tobias Niekamp, Microsoft Corp.
+# and Linux-HA contributors
+
+import os
+import sys
+import time
+import subprocess
+import json
+try:
+	import urllib2
+	from urllib2 import URLError
+except ImportError:
+	import urllib.request as urllib2
+	from urllib.error import URLError
+import socket
+from collections import defaultdict
+
+OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT"))
+sys.path.append(OCF_FUNCTIONS_DIR)
+import ocf
+
+##############################################################################
+
+
+VERSION = "0.10"
+USER_AGENT = "Pacemaker-ResourceAgent/%s %s" % (VERSION, ocf.distro())
+
+attr_globalPullState = "azure-events-az_globalPullState"
+attr_lastDocVersion = "azure-events-az_lastDocVersion"
+attr_curNodeState = "azure-events-az_curNodeState"
+attr_pendingEventIDs = "azure-events-az_pendingEventIDs"
+attr_healthstate = "#health-azure"
+
+default_loglevel = ocf.logging.INFO
+default_relevantEventTypes = set(["Reboot", "Redeploy"])
+
+global_pullMaxAttempts = 3
+global_pullDelaySecs = 1
+
+##############################################################################
+
+class attrDict(defaultdict):
+	"""
+	A wrapper for accessing dict keys like an attribute
+	"""
+	def __init__(self, data):
+		super(attrDict, self).__init__(attrDict)
+		for d in data.keys():
+			self.__setattr__(d, data[d])
+
+	def __getattr__(self, key):
+		try:
+			return self[key]
+		except KeyError:
+			raise AttributeError(key)
+
+	def __setattr__(self, key, value):
+		self[key] = value
+
+##############################################################################
+
+class azHelper:
+	"""
+	Helper class for Azure's metadata API (including Scheduled Events)
+	"""
+	metadata_host = "http://169.254.169.254/metadata"
+	instance_api = "instance"
+	events_api = "scheduledevents"
+	api_version = "2019-08-01"
+
+	@staticmethod
+	def _sendMetadataRequest(endpoint, postData=None):
+		"""
+		Send a request to Azure's Azure Metadata Service API
+		"""
+		url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, azHelper.api_version)
+		data = ""
+		ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s" % (endpoint, postData))
+		ocf.logger.debug("_sendMetadataRequest: url = %s" % url)
+
+		if postData and type(postData) != bytes:
+			postData = postData.encode()
+
+		req = urllib2.Request(url, postData)
+		req.add_header("Metadata", "true")
+		req.add_header("User-Agent", USER_AGENT)
+		try:
+			resp = urllib2.urlopen(req)
+		except URLError as e:
+			if hasattr(e, 'code'):
+				ocf.logger.warning("The server couldn\'t fulfill the request. Error code: %s" % e.code)
+				clusterHelper.setAttr(attr_globalPullState, "IDLE")
+			elif hasattr(e, 'reason'):
+				ocf.logger.warning("Failed to reach the server: %s" % e.reason)
+				clusterHelper.setAttr(attr_globalPullState, "IDLE")
+		else:
+			data = resp.read()
+			ocf.logger.debug("_sendMetadataRequest: response = %s" % data)
+
+		if data:
+			data = json.loads(data)
+
+		ocf.logger.debug("_sendMetadataRequest: finished")
+		return data
+
+	@staticmethod
+	def getInstanceInfo():
+		"""
+		Fetch details about the current VM from Azure's Azure Metadata Service API
+		"""
+		ocf.logger.debug("getInstanceInfo: begin")
+
+		jsondata = azHelper._sendMetadataRequest(azHelper.instance_api)
+		ocf.logger.debug("getInstanceInfo: json = %s" % jsondata)
+
+		if jsondata:
+			ocf.logger.debug("getInstanceInfo: finished, returning {}".format(jsondata["compute"]))
+			return attrDict(jsondata["compute"])
+		else:
+			ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info")
+			sys.exit(ocf.OCF_ERR_GENERIC)
+
+	@staticmethod
+	def pullScheduledEvents():
+		"""
+		Retrieve all currently scheduled events via Azure Metadata Service API
+		"""
+		ocf.logger.debug("pullScheduledEvents: begin")
+
+		jsondata = azHelper._sendMetadataRequest(azHelper.events_api)
+		ocf.logger.debug("pullScheduledEvents: json = %s" % jsondata)
+
+		ocf.logger.debug("pullScheduledEvents: finished")
+		return attrDict(jsondata)
+
+	@staticmethod
+	def forceEvents(eventIDs):
+		"""
+		Force a set of events to start immediately
+		"""
+		ocf.logger.debug("forceEvents: begin")
+
+		events = []
+		for e in eventIDs:
+			events.append({
+				"EventId": e,
+			})
+		postData = {
+			"StartRequests" : events
+		}
+		ocf.logger.info("forceEvents: postData = %s" % postData)
+		resp = azHelper._sendMetadataRequest(azHelper.events_api, postData=json.dumps(postData))
+
+		ocf.logger.debug("forceEvents: finished")
+		return
+
+##############################################################################
+
+class clusterHelper:
+	"""
+	Helper functions for Pacemaker control via crm
+	"""
+	@staticmethod
+	def _getLocation(node):
+		"""
+		Helper function to retrieve local/global attributes
+		"""
+		if node:
+			return ["--node", node]
+		else:
+			return ["--type", "crm_config"]
+
+	@staticmethod
+	def _exec(command, *args):
+		"""
+		Helper function to execute a UNIX command; returns None when it fails
+		"""
+		args = list(args)
+		ocf.logger.debug("_exec: begin; command = %s, args = %s" % (command, str(args)))
+
+		def flatten(*n):
+			return (str(e) for a in n
+				for e in (flatten(*a) if isinstance(a, (tuple, list)) else (str(a),)))
+		command = list(flatten([command] + args))
+		ocf.logger.debug("_exec: cmd = %s" % " ".join(command))
+		try:
+			ret = subprocess.check_output(command)
+			if type(ret) != str:
+				ret = ret.decode()
+			ocf.logger.debug("_exec: return = %s" % ret)
+			return ret.rstrip()
+		except Exception as err:
+			ocf.logger.exception(err)
+			return None
+
+	@staticmethod
+	def setAttr(key, value, node=None):
+		"""
+		Set the value of a specific global/local attribute in the Pacemaker cluster
+		"""
+		ocf.logger.debug("setAttr: begin; key = %s, value = %s, node = %s" % (key, value, node))
+
+		if value:
+			ret = clusterHelper._exec("crm_attribute",
+				"--name", key,
+				"--update", value,
+				clusterHelper._getLocation(node))
+		else:
+			ret = clusterHelper._exec("crm_attribute",
+				"--name", key,
+				"--delete",
+				clusterHelper._getLocation(node))
+
+		ocf.logger.debug("setAttr: finished")
+		return ret is not None and len(ret) == 0
+
+	@staticmethod
+	def getAttr(key, node=None):
+		"""
+		Retrieve a global/local attribute from the Pacemaker cluster
+		"""
+		ocf.logger.debug("getAttr: begin; key = %s, node = %s" % (key, node))
+
+		val = clusterHelper._exec("crm_attribute",
+			"--name", key,
+			"--query", "--quiet",
+			"--default", "",
+			clusterHelper._getLocation(node))
+		ocf.logger.debug("getAttr: finished")
+		if not val:
+			return None
+		return val if not val.isdigit() else int(val)
+
+	@staticmethod
+	def getAllNodes():
+		"""
+		Get a list of hostnames for all nodes in the Pacemaker cluster
+		"""
+		ocf.logger.debug("getAllNodes: begin")
+
+		nodes = []
+		nodeList = clusterHelper._exec("crm_node", "--list") or ""
+		fields = [n.split() for n in nodeList.splitlines()]
+		nodes.extend(f[1] for f in fields if len(f) > 1)
+		ocf.logger.debug("getAllNodes: finished; return %s" % str(nodes))
+
+		return nodes
+
+	@staticmethod
+	def getHostNameFromAzName(azName):
+		"""
+		Helper function to get the actual host name from an Azure node name
+		"""
+		return clusterHelper.getAttr("hostName_%s" % azName)
+
+	@staticmethod
+	def removeHoldFromNodes():
+		"""
+		Remove the ON_HOLD state from all nodes in the Pacemaker cluster
+		"""
+		ocf.logger.debug("removeHoldFromNodes: begin")
+
+		for n in clusterHelper.getAllNodes():
+			if clusterHelper.getAttr(attr_curNodeState, node=n) == "ON_HOLD":
+				clusterHelper.setAttr(attr_curNodeState, "AVAILABLE", node=n)
+				ocf.logger.info("removeHoldFromNodes: removed ON_HOLD from node %s" % n)
+
+		ocf.logger.debug("removeHoldFromNodes: finished")
+		return False
+
+	@staticmethod
+	def otherNodesAvailable(exceptNode):
+		"""
+		Check if there are any nodes (except a given node) in the Pacemaker cluster that have state AVAILABLE
+		"""
+		ocf.logger.debug("otherNodesAvailable: begin; exceptNode = %s" % exceptNode)
+
+		for n in clusterHelper.getAllNodes():
+			state = clusterHelper.getAttr(attr_curNodeState, node=n)
+			state = stringToNodeState(state) if state else AVAILABLE
+			if state
== AVAILABLE and n != exceptNode.hostName: + ocf.logger.info("otherNodesAvailable: at least %s is available" % n) + ocf.logger.debug("otherNodesAvailable: finished") + return True + ocf.logger.info("otherNodesAvailable: no other nodes are available") + ocf.logger.debug("otherNodesAvailable: finished") + + return False + + @staticmethod + def transitionSummary(): + """ + Get the current Pacemaker transition summary (used to check if all resources are stopped when putting a node standby) + """ + # <tniek> Is a global crm_simulate "too much"? Or would it be sufficient it there are no planned transitions for a particular node? + # # crm_simulate -LS + # Transition Summary: + # * Promote rsc_SAPHana_HN1_HDB03:0 (Slave -> Master hsr3-db1) + # * Stop rsc_SAPHana_HN1_HDB03:1 (hsr3-db0) + # * Move rsc_ip_HN1_HDB03 (Started hsr3-db0 -> hsr3-db1) + # * Start rsc_nc_HN1_HDB03 (hsr3-db1) + # # Excepted result when there are no pending actions: + # Transition Summary: + ocf.logger.debug("transitionSummary: begin") + + summary = clusterHelper._exec("crm_simulate", "-LS") + if not summary: + ocf.logger.warning("transitionSummary: could not load transition summary") + return "" + if summary.find("Transition Summary:") < 0: + ocf.logger.debug("transitionSummary: no transactions: %s" % summary) + return "" + j=summary.find('Transition Summary:') + len('Transition Summary:') + l=summary.lower().find('executing cluster transition:') + ret = list(filter(str.strip, summary[j:l].split("\n"))) + + ocf.logger.debug("transitionSummary: finished; return = %s" % str(ret)) + return ret + + @staticmethod + def listOperationsOnNode(node): + """ + Get a list of all current operations for a given node (used to check if any resources are pending) + """ + # hsr3-db1:/home/tniek # crm_resource --list-operations -N hsr3-db0 + # rsc_azure-events-az (ocf::heartbeat:azure-events-az): Started: rsc_azure-events-az_start_0 (node=hsr3-db0, call=91, rc=0, last-rc-change=Fri Jun 8 22:37:46 2018, exec=115ms): 
complete + # rsc_azure-events-az (ocf::heartbeat:azure-events-az): Started: rsc_azure-events-az_monitor_10000 (node=hsr3-db0, call=93, rc=0, last-rc-change=Fri Jun 8 22:37:47 2018, exec=197ms): complete + # rsc_SAPHana_HN1_HDB03 (ocf::suse:SAPHana): Master: rsc_SAPHana_HN1_HDB03_start_0 (node=hsr3-db0, call=-1, rc=193, last-rc-change=Fri Jun 8 22:37:46 2018, exec=0ms): pending + # rsc_SAPHanaTopology_HN1_HDB03 (ocf::suse:SAPHanaTopology): Started: rsc_SAPHanaTopology_HN1_HDB03_start_0 (node=hsr3-db0, call=90, rc=0, last-rc-change=Fri Jun 8 22:37:46 2018, exec=3214ms): complete + ocf.logger.debug("listOperationsOnNode: begin; node = %s" % node) + + resources = clusterHelper._exec("crm_resource", "--list-operations", "-N", node) + if len(resources) == 0: + ret = [] + else: + ret = resources.split("\n") + + ocf.logger.debug("listOperationsOnNode: finished; return = %s" % str(ret)) + return ret + + @staticmethod + def noPendingResourcesOnNode(node): + """ + Check that there are no pending resources on a given node + """ + ocf.logger.debug("noPendingResourcesOnNode: begin; node = %s" % node) + + for r in clusterHelper.listOperationsOnNode(node): + ocf.logger.debug("noPendingResourcesOnNode: * %s" % r) + resource = r.split()[-1] + if resource == "pending": + ocf.logger.info("noPendingResourcesOnNode: found resource %s that is still pending" % resource) + ocf.logger.debug("noPendingResourcesOnNode: finished; return = False") + return False + ocf.logger.info("noPendingResourcesOnNode: no pending resources on node %s" % node) + ocf.logger.debug("noPendingResourcesOnNode: finished; return = True") + + return True + + @staticmethod + def allResourcesStoppedOnNode(node): + """ + Check that all resources on a given node are stopped + """ + ocf.logger.debug("allResourcesStoppedOnNode: begin; node = %s" % node) + + if clusterHelper.noPendingResourcesOnNode(node): + if len(clusterHelper.transitionSummary()) == 0: + ocf.logger.info("allResourcesStoppedOnNode: no pending resources 
on node %s and empty transition summary" % node) + ocf.logger.debug("allResourcesStoppedOnNode: finished; return = True") + return True + ocf.logger.info("allResourcesStoppedOnNode: transition summary is not empty") + ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False") + return False + + ocf.logger.info("allResourcesStoppedOnNode: still pending resources on node %s" % node) + ocf.logger.debug("allResourcesStoppedOnNode: finished; return = False") + return False + +############################################################################## + +AVAILABLE = 0 # Node is online and ready to handle events +STOPPING = 1 # Standby has been triggered, but some resources are still running +IN_EVENT = 2 # All resources are stopped, and event has been initiated via Azure Metadata Service +ON_HOLD = 3 # Node has a pending event that cannot be started there are no other nodes available + +def stringToNodeState(name): + if type(name) == int: return name + if name == "STOPPING": return STOPPING + if name == "IN_EVENT": return IN_EVENT + if name == "ON_HOLD": return ON_HOLD + return AVAILABLE + +def nodeStateToString(state): + if state == STOPPING: return "STOPPING" + if state == IN_EVENT: return "IN_EVENT" + if state == ON_HOLD: return "ON_HOLD" + return "AVAILABLE" + +############################################################################## + +class Node: + """ + Core class implementing logic for a cluster node + """ + def __init__(self, ra): + self.raOwner = ra + self.azInfo = azHelper.getInstanceInfo() + self.azName = self.azInfo.name + self.hostName = socket.gethostname() + self.setAttr("azName", self.azName) + clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName) + + def getAttr(self, key): + """ + Get a local attribute + """ + return clusterHelper.getAttr(key, node=self.hostName) + + def setAttr(self, key, value): + """ + Set a local attribute + """ + return clusterHelper.setAttr(key, value, node=self.hostName) + + def 
selfOrOtherNode(self, node): + """ + Helper function to distinguish self/other node + """ + return node if node else self.hostName + + def setState(self, state, node=None): + """ + Set the state for a given node (or self) + """ + node = self.selfOrOtherNode(node) + ocf.logger.debug("setState: begin; node = %s, state = %s" % (node, nodeStateToString(state))) + + clusterHelper.setAttr(attr_curNodeState, nodeStateToString(state), node=node) + + ocf.logger.debug("setState: finished") + + def getState(self, node=None): + """ + Get the state for a given node (or self) + """ + node = self.selfOrOtherNode(node) + ocf.logger.debug("getState: begin; node = %s" % node) + + state = clusterHelper.getAttr(attr_curNodeState, node=node) + ocf.logger.debug("getState: state = %s" % state) + ocf.logger.debug("getState: finished") + if not state: + return AVAILABLE + return stringToNodeState(state) + + def setEventIDs(self, eventIDs, node=None): + """ + Set pending EventIDs for a given node (or self) + """ + node = self.selfOrOtherNode(node) + ocf.logger.debug("setEventIDs: begin; node = %s, eventIDs = %s" % (node, str(eventIDs))) + + if eventIDs: + eventIDStr = ",".join(eventIDs) + else: + eventIDStr = None + clusterHelper.setAttr(attr_pendingEventIDs, eventIDStr, node=node) + + ocf.logger.debug("setEventIDs: finished") + return + + def getEventIDs(self, node=None): + """ + Get pending EventIDs for a given node (or self) + """ + node = self.selfOrOtherNode(node) + ocf.logger.debug("getEventIDs: begin; node = %s" % node) + + eventIDStr = clusterHelper.getAttr(attr_pendingEventIDs, node=node) + if eventIDStr: + eventIDs = eventIDStr.split(",") + else: + eventIDs = None + + ocf.logger.debug("getEventIDs: finished; eventIDs = %s" % str(eventIDs)) + return eventIDs + + def updateNodeStateAndEvents(self, state, eventIDs, node=None): + """ + Set the state and pending EventIDs for a given node (or self) + """ + ocf.logger.debug("updateNodeStateAndEvents: begin; node = %s, state = %s, 
eventIDs = %s" % (node, nodeStateToString(state), str(eventIDs))) + + self.setState(state, node=node) + self.setEventIDs(eventIDs, node=node) + + ocf.logger.debug("updateNodeStateAndEvents: finished") + return state + + def putNodeStandby(self, node=None): + """ + Put self to standby + """ + node = self.selfOrOtherNode(node) + ocf.logger.debug("putNodeStandby: begin; node = %s" % node) + + clusterHelper._exec("crm_attribute", + "--node", node, + "--name", attr_healthstate, + "--update", "-1000000", + "--lifetime=forever") + + ocf.logger.debug("putNodeStandby: finished") + + def isNodeInStandby(self, node=None): + """ + check if node is in standby + """ + node = self.selfOrOtherNode(node) + ocf.logger.debug("isNodeInStandby: begin; node = %s" % node) + isInStandy = False + + healthAttributeStr = clusterHelper.getAttr(attr_healthstate, node) + if healthAttributeStr is not None: + try: + healthAttribute = int(healthAttributeStr) + isInStandy = healthAttribute < 0 + except ValueError: + # Handle the exception + ocf.logger.warn("Health attribute %s on node %s cannot be converted to an integer value" % (healthAttributeStr, node)) + + ocf.logger.debug("isNodeInStandby: finished - result %s" % isInStandy) + return isInStandy + + def putNodeOnline(self, node=None): + """ + Put self back online + """ + node = self.selfOrOtherNode(node) + ocf.logger.debug("putNodeOnline: begin; node = %s" % node) + + clusterHelper._exec("crm_attribute", + "--node", node, + "--name", "#health-azure", + "--update", "0", + "--lifetime=forever") + + ocf.logger.debug("putNodeOnline: finished") + + def separateEvents(self, events): + """ + Split own/other nodes' events + """ + ocf.logger.debug("separateEvents: begin; events = %s" % str(events)) + + localEvents = [] + remoteEvents = [] + for e in events: + e = attrDict(e) + if e.EventType not in self.raOwner.relevantEventTypes: + continue + if self.azName in e.Resources: + localEvents.append(e) + else: + remoteEvents.append(e) + 
ocf.logger.debug("separateEvents: finished; localEvents = %s, remoteEvents = %s" % (str(localEvents), str(remoteEvents))) + return (localEvents, remoteEvents) + +############################################################################## + +class raAzEvents: + """ + Main class for resource agent + """ + def __init__(self, relevantEventTypes): + self.node = Node(self) + self.relevantEventTypes = relevantEventTypes + + def monitor(self): + ocf.logger.debug("monitor: begin") + + events = azHelper.pullScheduledEvents() + + # get current document version + curDocVersion = events.DocumentIncarnation + lastDocVersion = self.node.getAttr(attr_lastDocVersion) + ocf.logger.debug("monitor: lastDocVersion = %s; curDocVersion = %s" % (lastDocVersion, curDocVersion)) + + # split events local/remote + (localEvents, remoteEvents) = self.node.separateEvents(events.Events) + + # ensure local events are only executing once + if curDocVersion == lastDocVersion: + ocf.logger.info("monitor: already handled curDocVersion, skip") + return ocf.OCF_SUCCESS + + localAzEventIDs = set() + for e in localEvents: + localAzEventIDs.add(e.EventId) + + curState = self.node.getState() + clusterEventIDs = self.node.getEventIDs() + + ocf.logger.debug("monitor: curDocVersion has not been handled yet") + + if clusterEventIDs: + # there are pending events set, so our state must be STOPPING or IN_EVENT + i = 0; touchedEventIDs = False + while i < len(clusterEventIDs): + # clean up pending events that are already finished according to AZ + if clusterEventIDs[i] not in localAzEventIDs: + ocf.logger.info("monitor: remove finished local clusterEvent %s" % (clusterEventIDs[i])) + clusterEventIDs.pop(i) + touchedEventIDs = True + else: + i += 1 + if len(clusterEventIDs) > 0: + # there are still pending events (either because we're still stopping, or because the event is still in place) + # either way, we need to wait + if touchedEventIDs: + ocf.logger.info("monitor: added new local clusterEvent %s" % 
str(clusterEventIDs)) + self.node.setEventIDs(clusterEventIDs) + else: + ocf.logger.info("monitor: no local clusterEvents were updated") + else: + # there are no more pending events left after cleanup + if clusterHelper.noPendingResourcesOnNode(self.node.hostName): + # and no pending resources on the node -> set it back online + ocf.logger.info("monitor: all local events finished -> clean up, put node online and AVAILABLE") + curState = self.node.updateNodeStateAndEvents(AVAILABLE, None) + self.node.putNodeOnline() + clusterHelper.removeHoldFromNodes() + # If Azure Scheduled Events are not used for 24 hours (e.g. because the cluster was asleep), it will be disabled for a VM. + # When the cluster wakes up and starts using it again, the DocumentIncarnation is reset. + # We need to remove it during cleanup, otherwise azure-events-az will not process the event after wakeup + self.node.setAttr(attr_lastDocVersion, None) + else: + ocf.logger.info("monitor: all local events finished, but some resources have not completed startup yet -> wait") + else: + if curState == AVAILABLE: + if len(localAzEventIDs) > 0: + if clusterHelper.otherNodesAvailable(self.node): + ocf.logger.info("monitor: can handle local events %s -> set state STOPPING" % (str(localAzEventIDs))) + curState = self.node.updateNodeStateAndEvents(STOPPING, localAzEventIDs) + else: + ocf.logger.info("monitor: cannot handle azEvents %s (only node available) -> set state ON_HOLD" % str(localAzEventIDs)) + self.node.setState(ON_HOLD) + else: + ocf.logger.debug("monitor: no local azEvents to handle") + + if curState == STOPPING: + eventIDsForNode = {} + if clusterHelper.noPendingResourcesOnNode(self.node.hostName): + if not self.node.isNodeInStandby(): + ocf.logger.info("monitor: all local resources are started properly -> put node standby and exit") + self.node.putNodeStandby() + return ocf.OCF_SUCCESS + + for e in localEvents: + ocf.logger.info("monitor: handling remote event %s (%s; nodes = %s)" % (e.EventId, 
e.EventType, str(e.Resources))) + # before we can force an event to start, we need to ensure all nodes involved have stopped their resources + if e.EventStatus == "Scheduled": + allNodesStopped = True + for azName in e.Resources: + hostName = clusterHelper.getHostNameFromAzName(azName) + state = self.node.getState(node=hostName) + if state == STOPPING: + # the only way we can continue is when node state is STOPPING, but all resources have been stopped + if not clusterHelper.allResourcesStoppedOnNode(hostName): + ocf.logger.info("monitor: (at least) node %s has still resources running -> wait" % hostName) + allNodesStopped = False + break + elif state in (AVAILABLE, IN_EVENT, ON_HOLD): + ocf.logger.info("monitor: node %s is still %s -> remote event needs to be picked up locally" % (hostName, nodeStateToString(state))) + allNodesStopped = False + break + if allNodesStopped: + ocf.logger.info("monitor: nodes %s are stopped -> add remote event %s to force list" % (str(e.Resources), e.EventId)) + for n in e.Resources: + hostName = clusterHelper.getHostNameFromAzName(n) + if hostName in eventIDsForNode: + eventIDsForNode[hostName].append(e.EventId) + else: + eventIDsForNode[hostName] = [e.EventId] + elif e.EventStatus == "Started": + ocf.logger.info("monitor: remote event already started") + + # force the start of all events whose nodes are ready (i.e. 
have no more resources running) + if len(eventIDsForNode.keys()) > 0: + eventIDsToForce = set([item for sublist in eventIDsForNode.values() for item in sublist]) + ocf.logger.info("monitor: set nodes %s to IN_EVENT; force remote events %s" % (str(eventIDsForNode.keys()), str(eventIDsToForce))) + for node, eventId in eventIDsForNode.items(): + self.node.updateNodeStateAndEvents(IN_EVENT, eventId, node=node) + azHelper.forceEvents(eventIDsToForce) + self.node.setAttr(attr_lastDocVersion, curDocVersion) + else: + ocf.logger.info("monitor: some local resources are not clean yet -> wait") + + ocf.logger.debug("monitor: finished") + return ocf.OCF_SUCCESS + +############################################################################## + +def setLoglevel(verbose): + # set up writing into syslog + loglevel = default_loglevel + if verbose: + opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1)) + urllib2.install_opener(opener) + loglevel = ocf.logging.DEBUG + ocf.log.setLevel(loglevel) + +description = ( + "Microsoft Azure Scheduled Events monitoring agent", + """This resource agent implements a monitor for scheduled +(maintenance) events for a Microsoft Azure VM. + +If any relevant events are found, it moves all Pacemaker resources +away from the affected node to allow for a graceful shutdown. 
+ + Deployment: + crm configure primitive rsc_azure-events-az ocf:heartbeat:azure-events-az \ + op monitor interval=10s + crm configure clone cln_azure-events-az rsc_azure-events-az + +For further information on Microsoft Azure Scheduled Events, please +refer to the following documentation: +https://docs.microsoft.com/en-us/azure/virtual-machines/linux/scheduled-events +""") + +def monitor_action(eventTypes): + relevantEventTypes = set(eventTypes.split(",") if eventTypes else []) + ra = raAzEvents(relevantEventTypes) + return ra.monitor() + +def validate_action(eventTypes): + if eventTypes: + for event in eventTypes.split(","): + if event not in ("Freeze", "Reboot", "Redeploy"): + ocf.ocf_exit_reason("Event type not one of Freeze, Reboot, Redeploy: " + eventTypes) + return ocf.OCF_ERR_CONFIGURED + return ocf.OCF_SUCCESS + +def main(): + agent = ocf.Agent("azure-events-az", shortdesc=description[0], longdesc=description[1]) + agent.add_parameter( + "eventTypes", + shortdesc="List of resources to be considered", + longdesc="A comma-separated list of event types that will be handled by this resource agent. 
(Possible values: Freeze,Reboot,Redeploy)", + content_type="string", + default="Reboot,Redeploy") + agent.add_parameter( + "verbose", + shortdesc="Enable verbose agent logging", + longdesc="Set to true to enable verbose logging", + content_type="boolean", + default="false") + agent.add_action("start", timeout=10, handler=lambda: ocf.OCF_SUCCESS) + agent.add_action("stop", timeout=10, handler=lambda: ocf.OCF_SUCCESS) + agent.add_action("validate-all", timeout=20, handler=validate_action) + agent.add_action("monitor", timeout=240, interval=10, handler=monitor_action) + setLoglevel(ocf.is_true(ocf.get_parameter("verbose", "false"))) + agent.run() + +if __name__ == '__main__': + main() diff --git a/heartbeat/azure-events.in b/heartbeat/azure-events.in new file mode 100644 index 0000000..90acaba --- /dev/null +++ b/heartbeat/azure-events.in @@ -0,0 +1,847 @@ +#!@PYTHON@ -tt +# +# Resource agent for monitoring Azure Scheduled Events +# +# License: GNU General Public License (GPL) +# (c) 2018 Tobias Niekamp, Microsoft Corp. 
+# and Linux-HA contributors + +import os +import sys +import time +import subprocess +import json +try: + import urllib2 + from urllib2 import URLError +except ImportError: + import urllib.request as urllib2 + from urllib.error import URLError +import socket +from collections import defaultdict + +OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT")) +sys.path.append(OCF_FUNCTIONS_DIR) +import ocf + +############################################################################## + + +VERSION = "0.10" +USER_AGENT = "Pacemaker-ResourceAgent/%s %s" % (VERSION, ocf.distro()) + +attr_globalPullState = "azure-events_globalPullState" +attr_lastDocVersion = "azure-events_lastDocVersion" +attr_curNodeState = "azure-events_curNodeState" +attr_pendingEventIDs = "azure-events_pendingEventIDs" + +default_loglevel = ocf.logging.INFO +default_relevantEventTypes = set(["Reboot", "Redeploy"]) + +global_pullMaxAttempts = 3 +global_pullDelaySecs = 1 + +############################################################################## + +class attrDict(defaultdict): + """ + A wrapper for accessing dict keys like an attribute + """ + def __init__(self, data): + super(attrDict, self).__init__(attrDict) + for d in data.keys(): + self.__setattr__(d, data[d]) + + def __getattr__(self, key): + try: + return self[key] + except KeyError: + raise AttributeError(key) + + def __setattr__(self, key, value): + self[key] = value + +############################################################################## + +class azHelper: + """ + Helper class for Azure's metadata API (including Scheduled Events) + """ + metadata_host = "http://169.254.169.254/metadata" + instance_api = "instance" + events_api = "scheduledevents" + api_version = "2019-08-01" + + @staticmethod + def _sendMetadataRequest(endpoint, postData=None): + """ + Send a request to Azure's Azure Metadata Service API + """ + url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, 
azHelper.api_version) + data = "" + ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s" % (endpoint, postData)) + ocf.logger.debug("_sendMetadataRequest: url = %s" % url) + + if postData and type(postData) != bytes: + postData = postData.encode() + + req = urllib2.Request(url, postData) + req.add_header("Metadata", "true") + req.add_header("User-Agent", USER_AGENT) + try: + resp = urllib2.urlopen(req) + except URLError as e: + if hasattr(e, 'reason'): + ocf.logger.warning("Failed to reach the server: %s" % e.reason) + clusterHelper.setAttr(attr_globalPullState, "IDLE") + elif hasattr(e, 'code'): + ocf.logger.warning("The server couldn\'t fulfill the request. Error code: %s" % e.code) + clusterHelper.setAttr(attr_globalPullState, "IDLE") + else: + data = resp.read() + ocf.logger.debug("_sendMetadataRequest: response = %s" % data) + + if data: + data = json.loads(data) + + ocf.logger.debug("_sendMetadataRequest: finished") + return data + + @staticmethod + def getInstanceInfo(): + """ + Fetch details about the current VM from Azure's Azure Metadata Service API + """ + ocf.logger.debug("getInstanceInfo: begin") + + jsondata = azHelper._sendMetadataRequest(azHelper.instance_api) + ocf.logger.debug("getInstanceInfo: json = %s" % jsondata) + + if jsondata: + ocf.logger.debug("getInstanceInfo: finished, returning {}".format(jsondata["compute"])) + return attrDict(jsondata["compute"]) + else: + ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info") + sys.exit(ocf.OCF_ERR_GENERIC) + + @staticmethod + def pullScheduledEvents(): + """ + Retrieve all currently scheduled events via Azure Metadata Service API + """ + ocf.logger.debug("pullScheduledEvents: begin") + + jsondata = azHelper._sendMetadataRequest(azHelper.events_api) + ocf.logger.debug("pullScheduledEvents: json = %s" % jsondata) + + ocf.logger.debug("pullScheduledEvents: finished") + return attrDict(jsondata) + + @staticmethod + def forceEvents(eventIDs): + """ + Force a set of 
events to start immediately + """ + ocf.logger.debug("forceEvents: begin") + + events = [] + for e in eventIDs: + events.append({ + "EventId": e, + }) + postData = { + "StartRequests" : events + } + ocf.logger.info("forceEvents: postData = %s" % postData) + resp = azHelper._sendMetadataRequest(azHelper.events_api, postData=json.dumps(postData)) + + ocf.logger.debug("forceEvents: finished") + return + +############################################################################## + +class clusterHelper: + """ + Helper functions for Pacemaker control via crm + """ + @staticmethod + def _getLocation(node): + """ + Helper function to retrieve local/global attributes + """ + if node: + return ["--node", node] + else: + return ["--type", "crm_config"] + + @staticmethod + def _exec(command, *args): + """ + Helper function to execute a UNIX command + """ + args = list(args) + ocf.logger.debug("_exec: begin; command = %s, args = %s" % (command, str(args))) + + def flatten(*n): + return (str(e) for a in n + for e in (flatten(*a) if isinstance(a, (tuple, list)) else (str(a),))) + command = list(flatten([command] + args)) + ocf.logger.debug("_exec: cmd = %s" % " ".join(command)) + try: + ret = subprocess.check_output(command) + if type(ret) != str: + ret = ret.decode() + ocf.logger.debug("_exec: return = %s" % ret) + return ret.rstrip() + except Exception as err: + ocf.logger.exception(err) + return None + + @staticmethod + def setAttr(key, value, node=None): + """ + Set the value of a specific global/local attribute in the Pacemaker cluster + """ + ocf.logger.debug("setAttr: begin; key = %s, value = %s, node = %s" % (key, value, node)) + + if value: + ret = clusterHelper._exec("crm_attribute", + "--name", key, + "--update", value, + clusterHelper._getLocation(node)) + else: + ret = clusterHelper._exec("crm_attribute", + "--name", key, + "--delete", + clusterHelper._getLocation(node)) + + ocf.logger.debug("setAttr: finished") + return len(ret) == 0 + + @staticmethod + def 
getAttr(key, node=None): + """ + Retrieve a global/local attribute from the Pacemaker cluster + """ + ocf.logger.debug("getAttr: begin; key = %s, node = %s" % (key, node)) + + val = clusterHelper._exec("crm_attribute", + "--name", key, + "--query", "--quiet", + "--default", "", + clusterHelper._getLocation(node)) + ocf.logger.debug("getAttr: finished") + if not val: + return None + return val if not val.isdigit() else int(val) + + @staticmethod + def getAllNodes(): + """ + Get a list of hostnames for all nodes in the Pacemaker cluster + """ + ocf.logger.debug("getAllNodes: begin") + + nodes = [] + nodeList = clusterHelper._exec("crm_node", "--list") + for n in nodeList.split("\n"): + nodes.append(n.split()[1]) + ocf.logger.debug("getAllNodes: finished; return %s" % str(nodes)) + + return nodes + + @staticmethod + def getHostNameFromAzName(azName): + """ + Helper function to get the actual host name from an Azure node name + """ + return clusterHelper.getAttr("hostName_%s" % azName) + + @staticmethod + def removeHoldFromNodes(): + """ + Remove the ON_HOLD state from all nodes in the Pacemaker cluster + """ + ocf.logger.debug("removeHoldFromNodes: begin") + + for n in clusterHelper.getAllNodes(): + if clusterHelper.getAttr(attr_curNodeState, node=n) == "ON_HOLD": + clusterHelper.setAttr(attr_curNodeState, "AVAILABLE", node=n) + ocf.logger.info("removeHoldFromNodes: removed ON_HOLD from node %s" % n) + + ocf.logger.debug("removeHoldFromNodes: finished") + return False + + @staticmethod + def otherNodesAvailable(exceptNode): + """ + Check if there are any nodes (except a given node) in the Pacemaker cluster that have state AVAILABLE + """ + ocf.logger.debug("otherNodesAvailable: begin; exceptNode = %s" % exceptNode) + + for n in clusterHelper.getAllNodes(): + state = clusterHelper.getAttr(attr_curNodeState, node=n) + state = stringToNodeState(state) if state else AVAILABLE + if state == AVAILABLE and n != exceptNode.hostName: + ocf.logger.info("otherNodesAvailable: at 
least %s is available" % n) + ocf.logger.debug("otherNodesAvailable: finished") + return True + ocf.logger.info("otherNodesAvailable: no other nodes are available") + ocf.logger.debug("otherNodesAvailable: finished") + + return False + + @staticmethod + def transitionSummary(): + """ + Get the current Pacemaker transition summary (used to check if all resources are stopped when putting a node standby) + """ + # <tniek> Is a global crm_simulate "too much"? Or would it be sufficient it there are no planned transitions for a particular node? + # # crm_simulate -LS + # Transition Summary: + # * Promote rsc_SAPHana_HN1_HDB03:0 (Slave -> Master hsr3-db1) + # * Stop rsc_SAPHana_HN1_HDB03:1 (hsr3-db0) + # * Move rsc_ip_HN1_HDB03 (Started hsr3-db0 -> hsr3-db1) + # * Start rsc_nc_HN1_HDB03 (hsr3-db1) + # # Excepted result when there are no pending actions: + # Transition Summary: + ocf.logger.debug("transitionSummary: begin") + + summary = clusterHelper._exec("crm_simulate", "-LS") + if not summary: + ocf.logger.warning("transitionSummary: could not load transition summary") + return "" + if summary.find("Transition Summary:") < 0: + ocf.logger.debug("transitionSummary: no transactions: %s" % summary) + return "" + j=summary.find('Transition Summary:') + len('Transition Summary:') + l=summary.lower().find('executing cluster transition:') + ret = list(filter(str.strip, summary[j:l].split("\n"))) + + ocf.logger.debug("transitionSummary: finished; return = %s" % str(ret)) + return ret + + @staticmethod + def listOperationsOnNode(node): + """ + Get a list of all current operations for a given node (used to check if any resources are pending) + """ + # hsr3-db1:/home/tniek # crm_resource --list-operations -N hsr3-db0 + # rsc_azure-events (ocf::heartbeat:azure-events): Started: rsc_azure-events_start_0 (node=hsr3-db0, call=91, rc=0, last-rc-change=Fri Jun 8 22:37:46 2018, exec=115ms): complete + # rsc_azure-events (ocf::heartbeat:azure-events): Started: 
@staticmethod
def noPendingResourcesOnNode(node):
    """
    Check that there are no pending resources on a given node.

    Each entry returned by clusterHelper.listOperationsOnNode(node) is one
    line of operation output; an operation counts as pending when the last
    whitespace-separated token of its line is "pending".

    Args:
        node: host name whose operations are inspected.

    Returns:
        False as soon as one pending operation is found, True otherwise.
    """
    ocf.logger.debug("noPendingResourcesOnNode: begin; node = %s" % node)

    for r in clusterHelper.listOperationsOnNode(node):
        ocf.logger.debug("noPendingResourcesOnNode: * %s" % r)
        fields = r.split()
        if not fields:
            # Blank lines carry no status token; indexing [-1] would raise.
            continue
        if fields[-1] == "pending":
            # Report the resource (first token of the line), not the status
            # token, which is always the literal "pending" here.
            ocf.logger.info("noPendingResourcesOnNode: found resource %s that is still pending" % fields[0])
            ocf.logger.debug("noPendingResourcesOnNode: finished; return = False")
            return False
    ocf.logger.info("noPendingResourcesOnNode: no pending resources on node %s" % node)
    ocf.logger.debug("noPendingResourcesOnNode: finished; return = True")

    return True
AVAILABLE = 0   # Node is online and ready to handle events
STOPPING = 1    # Standby has been triggered, but some resources are still running
IN_EVENT = 2    # All resources are stopped, and event has been initiated via Azure Metadata Service
ON_HOLD = 3     # Node has a pending event that cannot be started because there are no other nodes available

def stringToNodeState(name):
    """
    Convert a stored node-state value into its numeric state constant.

    Args:
        name: either an integer (returned unchanged) or a state name.

    Returns:
        STOPPING, IN_EVENT or ON_HOLD for the matching name; AVAILABLE for
        anything else (including None and unknown names).
    """
    # isinstance() also accepts int subclasses, which an exact type
    # comparison would wrongly send through the string matching below.
    if isinstance(name, int):
        return name
    # Tolerate stray whitespace/newlines around a textual attribute value.
    if hasattr(name, "strip"):
        name = name.strip()
    if name == "STOPPING":
        return STOPPING
    if name == "IN_EVENT":
        return IN_EVENT
    if name == "ON_HOLD":
        return ON_HOLD
    return AVAILABLE

def nodeStateToString(state):
    """
    Convert a numeric state constant into its name.

    Returns "AVAILABLE" for AVAILABLE and for any unrecognised value.
    """
    if state == STOPPING:
        return "STOPPING"
    if state == IN_EVENT:
        return "IN_EVENT"
    if state == ON_HOLD:
        return "ON_HOLD"
    return "AVAILABLE"
self/other node + """ + return node if node else self.hostName + + def setState(self, state, node=None): + """ + Set the state for a given node (or self) + """ + node = self.selfOrOtherNode(node) + ocf.logger.debug("setState: begin; node = %s, state = %s" % (node, nodeStateToString(state))) + + clusterHelper.setAttr(attr_curNodeState, nodeStateToString(state), node=node) + + ocf.logger.debug("setState: finished") + + def getState(self, node=None): + """ + Get the state for a given node (or self) + """ + node = self.selfOrOtherNode(node) + ocf.logger.debug("getState: begin; node = %s" % node) + + state = clusterHelper.getAttr(attr_curNodeState, node=node) + ocf.logger.debug("getState: state = %s" % state) + ocf.logger.debug("getState: finished") + if not state: + return AVAILABLE + return stringToNodeState(state) + + def setEventIDs(self, eventIDs, node=None): + """ + Set pending EventIDs for a given node (or self) + """ + node = self.selfOrOtherNode(node) + ocf.logger.debug("setEventIDs: begin; node = %s, eventIDs = %s" % (node, str(eventIDs))) + + if eventIDs: + eventIDStr = ",".join(eventIDs) + else: + eventIDStr = None + clusterHelper.setAttr(attr_pendingEventIDs, eventIDStr, node=node) + + ocf.logger.debug("setEventIDs: finished") + return + + def getEventIDs(self, node=None): + """ + Get pending EventIDs for a given node (or self) + """ + node = self.selfOrOtherNode(node) + ocf.logger.debug("getEventIDs: begin; node = %s" % node) + + eventIDStr = clusterHelper.getAttr(attr_pendingEventIDs, node=node) + if eventIDStr: + eventIDs = eventIDStr.split(",") + else: + eventIDs = None + + ocf.logger.debug("getEventIDs: finished; eventIDs = %s" % str(eventIDs)) + return eventIDs + + def updateNodeStateAndEvents(self, state, eventIDs, node=None): + """ + Set the state and pending EventIDs for a given node (or self) + """ + ocf.logger.debug("updateNodeStateAndEvents: begin; node = %s, state = %s, eventIDs = %s" % (node, nodeStateToString(state), str(eventIDs))) + + 
def removeOrphanedEvents(self, azEvents):
    """
    Remove remote events that are already finished.

    For every other cluster node whose state is STOPPING or IN_EVENT, compare
    the event IDs stored for that node with the IDs present in azEvents; if
    none of the stored IDs is still listed, the node is put back online.

    Args:
        azEvents: iterable of event objects exposing an ``EventId`` attribute.
    """
    ocf.logger.debug("removeOrphanedEvents: begin; azEvents = %s" % str(azEvents))

    azEventIDs = set()
    for e in azEvents:
        azEventIDs.add(e.EventId)
    # for all nodes except self ...
    for n in clusterHelper.getAllNodes():
        if n == self.hostName:
            continue
        curState = self.getState(node=n)
        # ... that still show in an event or shutting down resources ...
        if curState not in (STOPPING, IN_EVENT):
            continue
        ocf.logger.info("removeOrphanedEvents: node %s has state %s" % (n, curState))
        # getEventIDs() yields None when nothing is stored for the node;
        # normalise to an empty list so the loop below cannot raise TypeError.
        clusterEventIDs = self.getEventIDs(node=n) or []
        stillActive = False
        # ... but don't have any more events running according to Azure, ...
        for p in clusterEventIDs:
            if p in azEventIDs:
                ocf.logger.info("removeOrphanedEvents: (at least) event %s on node %s has not yet finished" % (str(p), n))
                stillActive = True
                break
        if not stillActive:
            # ... put them back online.
            ocf.logger.info("removeOrphanedEvents: clusterEvents %s on node %s are not in azEvents %s -> bring node back online" % (str(clusterEventIDs), n, str(azEventIDs)))
            self.putNodeOnline(node=n)

    ocf.logger.debug("removeOrphanedEvents: finished")
elif state in (AVAILABLE, IN_EVENT, ON_HOLD): + ocf.logger.info("handleRemoteEvents: node %s is still %s -> remote event needs to be picked up locally" % (hostName, nodeStateToString(state))) + allNodesStopped = False + break + if allNodesStopped: + ocf.logger.info("handleRemoteEvents: nodes %s are stopped -> add remote event %s to force list" % (str(e.Resources), e.EventId)) + for n in e.Resources: + hostName = clusterHelper.getHostNameFromAzName(n) + if hostName in eventIDsForNode: + eventIDsForNode[hostName].append(e.EventId) + else: + eventIDsForNode[hostName] = [e.EventId] + elif e.EventStatus == "Started": + ocf.logger.info("handleRemoteEvents: remote event already started") + + # force the start of all events whose nodes are ready (i.e. have no more resources running) + if len(eventIDsForNode.keys()) > 0: + eventIDsToForce = set([item for sublist in eventIDsForNode.values() for item in sublist]) + ocf.logger.info("handleRemoteEvents: set nodes %s to IN_EVENT; force remote events %s" % (str(eventIDsForNode.keys()), str(eventIDsToForce))) + for node, eventId in eventIDsForNode.items(): + self.updateNodeStateAndEvents(IN_EVENT, eventId, node=node) + azHelper.forceEvents(eventIDsToForce) + + ocf.logger.debug("handleRemoteEvents: finished") + + def handleLocalEvents(self, azEvents): + """ + Handle a list of own events (as provided by Azure Metadata Service) + """ + ocf.logger.debug("handleLocalEvents: begin; hostName = %s, azEvents = %s" % (self.hostName, str(azEvents))) + + azEventIDs = set() + for e in azEvents: + azEventIDs.add(e.EventId) + + curState = self.getState() + clusterEventIDs = self.getEventIDs() + mayUpdateDocVersion = False + ocf.logger.info("handleLocalEvents: current state = %s; pending local clusterEvents = %s" % (nodeStateToString(curState), str(clusterEventIDs))) + + # check if there are currently/still events set for the node + if clusterEventIDs: + # there are pending events set, so our state must be STOPPING or IN_EVENT + i = 0; 
touchedEventIDs = False + while i < len(clusterEventIDs): + # clean up pending events that are already finished according to AZ + if clusterEventIDs[i] not in azEventIDs: + ocf.logger.info("handleLocalEvents: remove finished local clusterEvent %s" % (clusterEventIDs[i])) + clusterEventIDs.pop(i) + touchedEventIDs = True + else: + i += 1 + if len(clusterEventIDs) > 0: + # there are still pending events (either because we're still stopping, or because the event is still in place) + # either way, we need to wait + if touchedEventIDs: + ocf.logger.info("handleLocalEvents: added new local clusterEvent %s" % str(clusterEventIDs)) + self.setEventIDs(clusterEventIDs) + else: + ocf.logger.info("handleLocalEvents: no local clusterEvents were updated") + else: + # there are no more pending events left after cleanup + if clusterHelper.noPendingResourcesOnNode(self.hostName): + # and no pending resources on the node -> set it back online + ocf.logger.info("handleLocalEvents: all local events finished -> clean up, put node online and AVAILABLE") + curState = self.updateNodeStateAndEvents(AVAILABLE, None) + self.putNodeOnline() + clusterHelper.removeHoldFromNodes() + # repeat handleLocalEvents() since we changed status to AVAILABLE + else: + ocf.logger.info("handleLocalEvents: all local events finished, but some resources have not completed startup yet -> wait") + else: + # there are no pending events set for us (yet) + if curState == AVAILABLE: + if len(azEventIDs) > 0: + if clusterHelper.otherNodesAvailable(self): + ocf.logger.info("handleLocalEvents: can handle local events %s -> set state STOPPING" % (str(azEventIDs))) + # this will also set mayUpdateDocVersion = True + curState = self.updateNodeStateAndEvents(STOPPING, azEventIDs) + else: + ocf.logger.info("handleLocalEvents: cannot handle azEvents %s (only node available) -> set state ON_HOLD" % str(azEventIDs)) + self.setState(ON_HOLD) + else: + ocf.logger.debug("handleLocalEvents: no local azEvents to handle") + if 
def monitor(self):
    """
    Pull the scheduled events once and process them for this node.

    While the global pull-state attribute reads "PULLING", the call waits
    global_pullDelaySecs between checks and gives up (still reporting
    success) after global_pullMaxAttempts checks. Otherwise it marks the
    pull state, fetches the events, splits them into local/remote, handles
    local events if the document version has not been handled yet, and then
    cleans up and handles remote events.

    Returns:
        ocf.OCF_SUCCESS
    """
    ocf.logger.debug("monitor: begin")

    pullFailedAttemps = 0
    while True:
        # check if another node is pulling at the same time;
        # this should only be a concern for the first pull, as setting up Scheduled Events may take up to 2 minutes.
        if clusterHelper.getAttr(attr_globalPullState) == "PULLING":
            pullFailedAttemps += 1
            # ">=" so that a limit of 0 (or less) cannot turn this into an endless wait
            if pullFailedAttemps >= global_pullMaxAttempts:
                ocf.logger.warning("monitor: exceeded maximum number of attempts (%d) to pull events" % global_pullMaxAttempts)
                ocf.logger.debug("monitor: finished")
                return ocf.OCF_SUCCESS
            ocf.logger.info("monitor: another node is pulling; retry in %d seconds" % global_pullDelaySecs)
            time.sleep(global_pullDelaySecs)
            continue

        # we can pull safely from Azure Metadata Service
        clusterHelper.setAttr(attr_globalPullState, "PULLING")
        try:
            events = azHelper.pullScheduledEvents()
        finally:
            # Always release the marker: if the pull raises, leaving "PULLING"
            # behind would make every node wait out its retry budget.
            clusterHelper.setAttr(attr_globalPullState, "IDLE")

        # get current document version
        curDocVersion = events.DocumentIncarnation
        lastDocVersion = self.node.getAttr(attr_lastDocVersion)
        ocf.logger.debug("monitor: lastDocVersion = %s; curDocVersion = %s" % (lastDocVersion, curDocVersion))

        # split events local/remote
        (localEvents, remoteEvents) = self.node.separateEvents(events.Events)

        # ensure local events are only executing once
        if curDocVersion != lastDocVersion:
            ocf.logger.debug("monitor: curDocVersion has not been handled yet")
            # handleLocalEvents() returns True if mayUpdateDocVersion is True;
            # this is only the case if we can ensure there are no pending events
            if self.node.handleLocalEvents(localEvents):
                ocf.logger.info("monitor: handleLocalEvents completed successfully -> update curDocVersion")
                self.node.setAttr(attr_lastDocVersion, curDocVersion)
            else:
                ocf.logger.debug("monitor: handleLocalEvents still waiting -> keep curDocVersion")
        else:
            ocf.logger.info("monitor: already handled curDocVersion, skip")

        # remove orphaned remote events and then handle the remaining remote events
        self.node.removeOrphanedEvents(remoteEvents)
        self.node.handleRemoteEvents(remoteEvents)
        break

    ocf.logger.debug("monitor: finished")
    return ocf.OCF_SUCCESS
def monitor_action(eventTypes):
    """
    Run one monitor cycle for the given comma-separated event types.

    Surrounding whitespace of each entry is ignored, matching what
    validate_action() accepts.
    """
    if eventTypes:
        relevantEventTypes = set(t.strip() for t in eventTypes.split(","))
    else:
        relevantEventTypes = set()
    ra = raAzEvents(relevantEventTypes)
    return ra.monitor()

def validate_action(eventTypes):
    """
    Validate the eventTypes parameter.

    Args:
        eventTypes: comma-separated list of event types, or an empty/None
            value (which is accepted).

    Returns:
        ocf.OCF_SUCCESS when every entry is one of Freeze, Reboot, Redeploy;
        ocf.OCF_ERR_CONFIGURED (after recording an exit reason) otherwise.
    """
    if eventTypes:
        for event in eventTypes.split(","):
            # "Reboot, Redeploy" is a natural way to write the list; strip the
            # padding instead of rejecting an otherwise valid configuration.
            if event.strip() not in ("Freeze", "Reboot", "Redeploy"):
                ocf.ocf_exit_reason("Event type not one of Freeze, Reboot, Redeploy: " + eventTypes)
                return ocf.OCF_ERR_CONFIGURED
    return ocf.OCF_SUCCESS
(Possible values: Freeze,Reboot,Redeploy)", + content_type="string", + default="Reboot,Redeploy") + agent.add_parameter( + "verbose", + shortdesc="Enable verbose agent logging", + longdesc="Set to true to enable verbose logging", + content_type="boolean", + default="false") + agent.add_action("start", timeout=10, handler=lambda: ocf.OCF_SUCCESS) + agent.add_action("stop", timeout=10, handler=lambda: ocf.OCF_SUCCESS) + agent.add_action("validate-all", timeout=20, handler=validate_action) + agent.add_action("monitor", timeout=240, interval=10, handler=monitor_action) + setLoglevel(ocf.is_true(ocf.get_parameter("verbose", "false"))) + agent.run() + +if __name__ == '__main__': + main() diff --git a/heartbeat/azure-lb b/heartbeat/azure-lb new file mode 100755 index 0000000..7f585bf --- /dev/null +++ b/heartbeat/azure-lb @@ -0,0 +1,229 @@ +#!/bin/sh +# + +# License: GNU General Public License (GPL) +# (c) 2017 O. Albrigtsen +# and Linux-HA contributors +# +# ----------------------------------------------------------------------------- +# O C F R E S O U R C E S C R I P T S P E C I F I C A T I O N +# ----------------------------------------------------------------------------- +# +# NAME +# azure-lb : OCF resource agent script for Azure Load Balancer +# +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +if is_suse_based; then + OCF_RESKEY_nc_default="/usr/bin/socat" +else + OCF_RESKEY_nc_default="/usr/bin/nc" +fi + +OCF_RESKEY_port_default="61000" + +: ${OCF_RESKEY_nc=${OCF_RESKEY_nc_default}} +: ${OCF_RESKEY_port=${OCF_RESKEY_port_default}} + +process="$OCF_RESOURCE_INSTANCE" +pidfile="/var/run/$OCF_RESOURCE_INSTANCE.pid" + + +lb_usage() { + cat <<END + usage: $0 (start|stop|validate-all|meta-data|help|usage|monitor) + $0 manages service that answers Azure Load Balancer health probe requests as a OCF HA resource. + The 'start' operation starts the instance. + The 'stop' operation stops the instance. 
+ The 'monitor' operation reports whether the instance seems to be working + The 'validate-all' operation reports whether the parameters are valid +END +} + +lb_metadata() { +cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="azure-lb" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> +Resource agent to answer Azure Load Balancer health probe requests +</longdesc> +<shortdesc lang="en">Answers Azure Load Balancer health probe requests</shortdesc> + +<parameters> + +<parameter name="nc"> +<longdesc lang="en"> +The full path of the used binary. This can be nc or socat path. +The default is /usr/bin/nc and /usr/bin/socat for SUSE distributions. +</longdesc> +<shortdesc lang="en">Full path of the used binary (nc or socat are allowed)</shortdesc> +<content type="string" default="${OCF_RESKEY_nc_default}"/> +</parameter> + +<parameter name="port"> +<longdesc lang="en"> +Port to listen to. +</longdesc> +<shortdesc lang="en">Listen to port</shortdesc> +<content type="string" default="${OCF_RESKEY_port_default}"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +END +exit 0 +} + +getpid() { + grep -o '[0-9]*' $1 +} + +lb_monitor() { + if test -f "$pidfile"; then + [ "$__OCF_ACTION" = "stop" ] && level="debug" || level="err" + + if pid=$(getpid "$pidfile") && [ -n "$pid" ]; then + output=$(kill -s 0 "$pid" 2>&1) + mon_rc=$? 
+ + [ -n "$output" ] && ocf_log "$level" "$output" + [ "$mon_rc" -eq 0 ] && return $OCF_SUCCESS + fi + + # pidfile w/o process means the process died + return $OCF_ERR_GENERIC + else + return $OCF_NOT_RUNNING + fi +} + +lb_start() { + cmd="$OCF_RESKEY_nc -l -k $OCF_RESKEY_port" + if [ $( basename $OCF_RESKEY_nc ) = 'socat' ]; then + #socat has different parameters + cmd="$OCF_RESKEY_nc -U TCP-LISTEN:$OCF_RESKEY_port,backlog=10,fork,reuseaddr /dev/null" + fi + if ! lb_monitor; then + ocf_log debug "Starting $process: $cmd" + # Execute the command as created above + $cmd >/dev/null 2>&1 & + echo $! > $pidfile + if lb_monitor; then + ocf_log debug "$process: $cmd started successfully, calling monitor" + lb_monitor + return $? + else + ocf_log err "$process: $cmd could not be started" + return $OCF_ERR_GENERIC + fi + else + # If already running, consider start successful + ocf_log debug "$process: $cmd is already running" + return $OCF_SUCCESS + fi +} + +lb_stop() { + stop_rc=$OCF_SUCCESS + + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + # Allow 2/3 of the action timeout for the orderly shutdown + # (The origin unit is ms, hence the conversion) + stop_timeout=$((OCF_RESKEY_CRM_meta_timeout/1500)) + else + stop_timeout=10 + fi + + if lb_monitor; then + pid=`getpid $pidfile` + kill $pid + + i=0 + while [ $i -lt $stop_timeout ]; do + if ! lb_monitor; then + rm -f $pidfile + return $OCF_SUCCESS + fi + sleep 1 + i=$((i+1)) + done + + ocf_log warn "Stop with SIGTERM failed/timed out, now sending SIGKILL." + kill -s 9 $pid + while :; do + if ! lb_monitor; then + ocf_log warn "SIGKILL did the job." + stop_rc=$OCF_SUCCESS + break + fi + ocf_log info "The job still hasn't stopped yet. Waiting..." + sleep 1 + done + fi + rm -f $pidfile + return $stop_rc +} + +lb_validate() { + check_binary "$OCF_RESKEY_nc" + + if ! 
###############################################################################
#
# MAIN
#
###############################################################################

# Informational actions are answered before the root check below.
case $__OCF_ACTION in
    meta-data)
        lb_metadata
        exit $OCF_SUCCESS
        ;;
    usage|help)
        lb_usage
        exit $OCF_SUCCESS
        ;;
esac

if ! ocf_is_root; then
    ocf_log err "You must be root for $__OCF_ACTION operation."
    exit $OCF_ERR_PERM
fi

# Dispatch the requested action; the handler's status becomes the exit code.
case $__OCF_ACTION in
    start)
        lb_validate
        lb_start;;
    stop)
        lb_stop;;
    monitor)
        lb_monitor;;
    validate-all)
        lb_validate;;
    *)
        # No USAGE variable is ever set in this agent, so "echo $USAGE"
        # printed an empty line; show the real usage text instead.
        lb_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac

exit $?
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +. ${OCF_FUNCTIONS_DIR}/ocf-directories + +# Parameter defaults + +OCF_RESKEY_with_cmirrord_default="false" +OCF_RESKEY_daemon_options_default="-d0" +OCF_RESKEY_activate_vgs_default="true" +OCF_RESKEY_exclusive_default="false" + +: ${OCF_RESKEY_with_cmirrord=${OCF_RESKEY_with_cmirrord_default}} +: ${OCF_RESKEY_daemon_options=${OCF_RESKEY_daemon_options_default}} +: ${OCF_RESKEY_activate_vgs=${OCF_RESKEY_activate_vgs_default}} +: ${OCF_RESKEY_exclusive=${OCF_RESKEY_exclusive_default}} + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="clvm" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This agent manages the clvmd daemon. +</longdesc> +<shortdesc lang="en">clvmd</shortdesc> + +<parameters> +<parameter name="with_cmirrord" unique="0" required="0"> +<longdesc lang="en"> +Start with cmirrord (cluster mirror log daemon). +</longdesc> +<shortdesc lang="en">activate cmirrord</shortdesc> +<content type="boolean" default="${OCF_RESKEY_with_cmirrord_default}" /> +</parameter> + +<parameter name="daemon_options" unique="0"> +<longdesc lang="en"> +Options to clvmd. Refer to clvmd.8 for detailed descriptions. +</longdesc> +<shortdesc lang="en">Daemon Options</shortdesc> +<content type="string" default="${OCF_RESKEY_daemon_options_default}"/> +</parameter> + +<parameter name="activate_vgs" unique="0"> +<longdesc lang="en"> +Whether or not to activate all cluster volume groups after starting +the clvmd or not. 
Note that clustered volume groups will always be +deactivated before the clvmd stops regardless of what this option +is set to. +</longdesc> +<shortdesc lang="en">Activate volume groups</shortdesc> +<content type="boolean" default="${OCF_RESKEY_activate_vgs_default}"/> +</parameter> + +<parameter name="exclusive" unique="0" required="0"> +<longdesc lang="en"> +If set, only exclusive volume groups will be monitored. +</longdesc> +<shortdesc lang="en">Only monitor exclusive volume groups</shortdesc> +<content type="boolean" default="${OCF_RESKEY_exclusive_default}" /> +</parameter> + + +</parameters> + +<actions> +<action name="start" timeout="90s" /> +<action name="stop" timeout="90s" /> +<action name="monitor" timeout="90s" interval="30s" depth="0" /> +<action name="meta-data" timeout="10s" /> +<action name="validate-all" timeout="20s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +sbindir=$HA_SBIN_DIR +if [ -z $sbindir ]; then + sbindir=/usr/sbin +fi +DAEMON="clvmd" +CMIRROR="cmirrord" +DAEMON_PATH="${sbindir}/clvmd" +CMIRROR_PATH="${sbindir}/cmirrord" +LVMCONF="${sbindir}/lvmconf" +LOCK_FILE="/var/lock/subsys/$DAEMON" + +# attempt to detect where the vg tools are located +# for some reason this isn't consistent with sbindir +# in some distros. +vgtoolsdir=$(dirname $(which vgchange 2> /dev/null) 2> /dev/null) +if [ -z "$vgtoolsdir" ]; then + vgtoolsdir="$sbindir" +fi + +LVM_VGCHANGE=${vgtoolsdir}/vgchange +LVM_VGDISPLAY=${vgtoolsdir}/vgdisplay +LVM_VGSCAN=${vgtoolsdir}/vgscan + +# Leaving this in for legacy. We do not want to advertize +# the abilty to set options in the systconfig exists, we want +# to expand the OCF style options as necessary instead. +[ -f /etc/sysconfig/cluster ] && . /etc/sysconfig/cluster +[ -f /etc/sysconfig/$DAEMON ] && . 
/etc/sysconfig/$DAEMON + +CLVMD_TIMEOUT="90" +if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + CLVMD_TIMEOUT=$(($OCF_RESKEY_CRM_meta_timeout/1000)) +fi + +clvmd_usage() +{ + cat <<END +usage: $0 {start|stop|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +clvmd_validate() +{ + # check_binary will exit with OCF_ERR_INSTALLED + # when binary is missing + check_binary "pgrep" + check_binary $DAEMON_PATH + if ocf_is_true $OCF_RESKEY_with_cmirrord; then + check_binary $CMIRROR_PATH + fi + + if [ "$__OCF_ACTION" != "monitor" ]; then + check_binary "killall" + check_binary $LVM_VGCHANGE + check_binary $LVM_VGDISPLAY + check_binary $LVM_VGSCAN + fi + + # Future validation checks here. + return $OCF_SUCCESS +} + +check_process() +{ + local binary=$1 + local pidfile="${HA_RSCTMP}/${binary}-${OCF_RESOURCE_INSTANCE}.pid" + local pid + + ocf_log debug "Checking status for ${binary}." + if [ -e "$pidfile" ]; then + cat /proc/$(cat $pidfile)/cmdline 2>/dev/null | grep -a "${binary}" > /dev/null 2>&1 + if [ $? -eq 0 ];then + # shortcut without requiring pgrep to search through all procs + return $OCF_SUCCESS + fi + fi + + pid=$(pgrep ${binary}) + case $? in + 0) + ocf_log info "PID file (pid:${pid} at $pidfile) created for ${binary}." + echo "$pid" > $pidfile + return $OCF_SUCCESS;; + 1) + rm -f "$pidfile" > /dev/null 2>&1 + ocf_log info "$binary is not running" + return $OCF_NOT_RUNNING;; + *) + rm -f "$pidfile" > /dev/null 2>&1 + ocf_exit_reason "Error encountered detecting pid status of $binary" + return $OCF_ERR_GENERIC;; + esac +} + +clvmd_status() +{ + local rc + local mirror_rc + clvmd_validate + if [ $? -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Unable to monitor, Environment validation failed." + return $? + fi + + check_process $DAEMON + rc=$? + mirror_rc=$rc + + if ocf_is_true $OCF_RESKEY_with_cmirrord; then + check_process $CMIRROR + mirror_rc=$? 
+ fi + + # If these ever don't match, return error to force recovery + if [ $mirror_rc -ne $rc ]; then + return $OCF_ERR_GENERIC + fi + + return $rc +} + +# NOTE: replace this with vgs, once display filter per attr is implemented. +clustered_vgs() { + if ! ocf_is_true "$OCF_RESKEY_exclusive"; then + ${LVM_VGDISPLAY} 2>/dev/null | awk 'BEGIN {RS="VG Name"} {if (/Clustered/) print $1;}' + else + for vg in $(vgs --select "clustered=yes" -o name --noheadings); do + lvs --select lv_active=~'local.*exclusive' -o vg_name --noheadings $vg 2> /dev/null | awk '!seen[$1]++ {print $1}' + done + fi +} + +wait_for_process() +{ + local binary=$1 + local timeout=$2 + local count=0 + + ocf_log info "Waiting for $binary to exit" + while [ $count -le $timeout ]; do + check_process $binary + if [ $? -eq $OCF_NOT_RUNNING ]; then + ocf_log info "$binary terminated" + return $OCF_SUCCESS + fi + sleep 1 + count=$((count+1)) + done + + return $OCF_ERR_GENERIC +} + +time_left() +{ + local end=$1 + local default=$2 + local now=$SECONDS + local result=0 + + result=$(( $end - $now )) + if [ $result -lt $default ]; then + return $default + fi + return $result +} + +clvmd_stop() +{ + local LVM_VGS + local rc=$OCF_SUCCESS + local end=$(( $SECONDS + $CLVMD_TIMEOUT )) + + clvmd_status + if [ $? -eq $OCF_NOT_RUNNING ]; then + return $OCF_SUCCESS + fi + + check_process $DAEMON + if [ $? -ne $OCF_NOT_RUNNING ]; then + LVM_VGS="$(clustered_vgs)" + + if [ -n "$LVM_VGS" ]; then + ocf_log info "Deactivating clustered VG(s):" + ocf_run ${LVM_VGCHANGE} -anl $LVM_VGS + if [ $? -ne 0 ]; then + ocf_exit_reason "Failed to deactivate volume groups, cluster vglist = $LVM_VGS" + return $OCF_ERR_GENERIC + fi + fi + + ocf_log info "Signaling $DAEMON to exit" + killall -TERM $DAEMON + if [ $? != 0 ]; then + ocf_exit_reason "Failed to signal -TERM to $DAEMON" + return $OCF_ERR_GENERIC + fi + + wait_for_process $DAEMON $CLVMD_TIMEOUT + rc=$? 
+ if [ $rc -ne $OCF_SUCCESS ]; then + ocf_exit_reason "$DAEMON failed to exit" + return $rc + fi + + rm -f $LOCK_FILE + fi + + check_process $CMIRROR + if [ $? -ne $OCF_NOT_RUNNING ] && ocf_is_true $OCF_RESKEY_with_cmirrord; then + local timeout + ocf_log info "Signaling $CMIRROR to exit" + killall -INT $CMIRROR + + time_left $end 10; timeout=$? + wait_for_process $CMIRROR $timeout + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + killall -KILL $CMIRROR + # time_left reports through its return status and prints nothing, + # so pass the captured $timeout rather than a command substitution. + time_left $end 10; timeout=$? + wait_for_process $CMIRROR $timeout + rc=$? + fi + fi + + return $rc +} + +# Launch $1 with options $2 unless check_process already reports it +# running; exits with OCF_ERR_GENERIC when the launch fails. +start_process() +{ + local binary_path=$1 + local opts=$2 + local rc + + check_process "$(basename $binary_path)" + if [ $? -ne $OCF_SUCCESS ]; then + ocf_log info "Starting $binary_path: " + ocf_run $binary_path $opts + rc=$? + if [ $rc -ne 0 ]; then + ocf_exit_reason "Failed to launch $binary_path, exit code $rc" + exit $OCF_ERR_GENERIC + fi + fi + + return $OCF_SUCCESS +} + +clvmd_activate_all() +{ + + if ! ocf_is_true "$OCF_RESKEY_activate_vgs"; then + ocf_log info "skipping vg activation, activate_vgs is set to $OCF_RESKEY_activate_vgs" + return $OCF_SUCCESS + fi + # Activate all volume groups by leaving the + # "volume group name" parameter empty + ocf_run ${LVM_VGCHANGE} -aay + if [ $? -ne 0 ]; then + ocf_log info "Failed to activate VG(s):" + clvmd_stop + return $OCF_ERR_GENERIC + fi + return $OCF_SUCCESS +} + +clvmd_start() +{ + local rc=0 + local CLVMDOPTS="-T${CLVMD_TIMEOUT} $OCF_RESKEY_daemon_options" + + # Save the validation status first: $? read after ocf_exit_reason + # would be that call's own status, not the result of clvmd_validate. + clvmd_validate + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Unable to start, Environment validation failed." + return $rc + fi + + # systemd drop-in to stop process before storage services during + # shutdown/reboot + if systemd_is_running ; then + systemd_drop_in "99-clvmd" "After" "blk-availability.service" + fi + + clvmd_status + if [ $?
-eq $OCF_SUCCESS ]; then + ocf_log debug "$DAEMON already started" + clvmd_activate_all + return $?; + fi + + # autoset locking type to clustered when lvmconf tool is available + if [ -x "$LVMCONF" ]; then + $LVMCONF --enable-cluster > /dev/null 2>&1 + fi + + # if either of these fail, script will exit OCF_ERR_GENERIC + if ocf_is_true $OCF_RESKEY_with_cmirrord; then + start_process $CMIRROR_PATH + fi + start_process $DAEMON_PATH "$CLVMDOPTS" + + # Refresh local cache. + # + # It's possible that new PVs were added to this, or other VGs + # while this node was down. So we run vgscan here to avoid + # any potential "Missing UUID" messages with subsequent + # LVM commands. + + # The following step would be better and more informative to the user: + # 'action "Refreshing VG(s) local cache:" ${LVM_VGSCAN}' + # but it could show warnings such as: + # 'clvmd not running on node x-y-z Unable to obtain global lock.' + # and the action would be shown as FAILED when in reality it didn't. + # Ideally vgscan should have a startup mode that would not print + # unnecessary warnings. + + ${LVM_VGSCAN} > /dev/null 2>&1 + touch $LOCK_FILE + + clvmd_activate_all + + clvmd_status + return $? +} + +case $__OCF_ACTION in + meta-data) meta_data + exit $OCF_SUCCESS;; + + start) clvmd_start;; + + stop) clvmd_stop;; + + monitor) clvmd_status;; + + validate-all) clvmd_validate;; + + usage|help) clvmd_usage;; + + *) clvmd_usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac + +rc=$? 
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc + diff --git a/heartbeat/conntrackd.in b/heartbeat/conntrackd.in new file mode 100644 index 0000000..1c2ee95 --- /dev/null +++ b/heartbeat/conntrackd.in @@ -0,0 +1,335 @@ +#!@BASH_SHELL@ +# +# +# An OCF RA for conntrackd +# http://conntrack-tools.netfilter.org/ +# +# Copyright (c) 2011 Dominik Klein +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + +OCF_RESKEY_binary_default=conntrackd +OCF_RESKEY_config_default=/etc/conntrackd/conntrackd.conf + +# For users of versions prior to 1.2: +# Map renamed parameter "conntrackd" to "binary" if in use +: ${OCF_RESKEY_binary=${OCF_RESKEY_conntrackd-${OCF_RESKEY_binary_default}}} +: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="conntrackd" version="1.2"> +<version>1.0</version> + +<longdesc lang="en"> +Promotable OCF Resource Agent for conntrackd +</longdesc> + +<shortdesc lang="en">This resource agent manages conntrackd</shortdesc> + +<parameters> +<parameter name="binary"> +<longdesc lang="en">Name of the conntrackd executable. +If conntrackd is installed and available in the default PATH, it is sufficient to configure the name of the binary +For example "my-conntrackd-binary-version-0.9.14" +If conntrackd is installed somewhere else, you may also give a full path +For example "/packages/conntrackd-0.9.14/sbin/conntrackd" +</longdesc> +<shortdesc lang="en">Name of the conntrackd executable</shortdesc> +<content type="string" default="$OCF_RESKEY_binary_default"/> +</parameter> + +<parameter name="config"> +<longdesc lang="en">Full path to the conntrackd.conf file. 
+For example "/packages/conntrackd-0.9.14/etc/conntrackd/conntrackd.conf"</longdesc> +<shortdesc lang="en">Path to conntrackd.conf</shortdesc> +<content type="string" default="$OCF_RESKEY_config_default"/> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="30s" /> +<action name="promote" timeout="30s" /> +<action name="demote" timeout="30s" /> +<action name="notify" timeout="30s" /> +<action name="stop" timeout="30s" /> +<action name="monitor" timeout="20s" interval="20s" role="Unpromoted" /> +<action name="monitor" timeout="20s" interval="10s" role="Promoted" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="30s" /> +</actions> +</resource-agent> +END +} + +meta_expect_eq() +{ + local what=$1 whatvar=OCF_RESKEY_CRM_meta_${1//-/_} expect=$2 + local val=${!whatvar} + if [[ -n $val ]]; then + # [, not [[, or it won't work ;) + [ $val = $expect ] && return + fi + ocf_exit_reason "meta parameter misconfigured, expected $what $op $expect, but found ${val:-unset}." + exit $OCF_ERR_CONFIGURED +} + +conntrackd_is_master() { + # You can't query conntrackd whether it is master or slave. It can be both at the same time. + # This RA creates a statefile during promote and enforces master-max=1 and clone-node-max=1 + ha_pseudo_resource $statefile monitor +} + +conntrackd_set_master_score() { + ${HA_SBIN_DIR}/crm_master -Q -l reboot -v $1 +} + +conntrackd_monitor() { + rc=$OCF_NOT_RUNNING + # It does not write a PID file, so check the socket exists after + # extracting its path from the configuration file + local conntrack_socket=$(awk '/^[ \t]*UNIX[ \t]*{/,/^[ \t]*}/ { if ($1 == "Path") { print $2 } }' $OCF_RESKEY_config) + [ -S "$conntrack_socket" ] && rc=$OCF_SUCCESS + if [ "$rc" -eq "$OCF_SUCCESS" ]; then + # conntrackd is running + # now see if it acceppts queries + if ! 
$OCF_RESKEY_binary -C $OCF_RESKEY_config -s > /dev/null 2>&1; then + rc=$OCF_ERR_GENERIC + ocf_exit_reason "conntrackd is running but not responding to queries" + fi + if conntrackd_is_master; then + rc=$OCF_RUNNING_MASTER + # Restore master setting on probes + if [ $OCF_RESKEY_CRM_meta_interval -eq 0 ]; then + conntrackd_set_master_score $master_score + fi + else + # Restore master setting on probes + if [ $OCF_RESKEY_CRM_meta_interval -eq 0 ]; then + conntrackd_set_master_score $slave_score + fi + fi + fi + return $rc +} + +conntrackd_start() { + rc=$OCF_ERR_GENERIC + + # Keep trying to start the resource; + # wait for the CRM to time us out if this fails + while :; do + conntrackd_monitor + status=$? + case "$status" in + $OCF_SUCCESS) + conntrackd_set_master_score $slave_score + # -n = request resync from the others + if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -n; then + ocf_exit_reason "$OCF_RESKEY_binary -C $OCF_RESKEY_config -n failed during start." + rc=$OCF_ERR_GENERIC + else + rc=$OCF_SUCCESS + fi + break + ;; + $OCF_NOT_RUNNING) + ocf_log info "Starting conntrackd" + $OCF_RESKEY_binary -C $OCF_RESKEY_config -d + ;; + $OCF_RUNNING_MASTER) + ocf_log warn "conntrackd already in master mode, demoting." + ha_pseudo_resource $statefile stop + ;; + $OCF_ERR_GENERIC) + ocf_exit_reason "conntrackd start failed" + rc=$OCF_ERR_GENERIC + break + ;; + esac + done + return $rc +} + +conntrackd_stop() { + rc=$OCF_ERR_GENERIC + + # Keep trying to bring down the resource; + # wait for the CRM to time us out if this fails + while :; do + conntrackd_monitor + status=$? + case "$status" in + $OCF_SUCCESS|$OCF_ERR_GENERIC) + ocf_log info "Stopping conntrackd" + $OCF_RESKEY_binary -C $OCF_RESKEY_config -k + ;; + $OCF_NOT_RUNNING) + rc=$OCF_SUCCESS + break + ;; + $OCF_RUNNING_MASTER) + ocf_log warn "conntrackd still master" + ;; + esac + done + return $rc + +} + +conntrackd_validate_all() { + check_binary "$OCF_RESKEY_binary" + if ! 
[ -e "$OCF_RESKEY_config" ]; then + ocf_exit_reason "Config FILE $OCF_RESKEY_config does not exist" + return $OCF_ERR_INSTALLED + fi + meta_expect_eq master-node-max 1 + meta_expect_eq master-max 1 + meta_expect_eq clone-node-max 1 + + return $OCF_SUCCESS +} + +conntrackd_promote() { + rc=$OCF_SUCCESS + if ! conntrackd_is_master; then + # -c = Commit the external cache to the kernel + # -f = Flush internal and external cache + # -R = resync with the kernel table + # -B = send a bulk update on the line + for parm in c f R B; do + if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm; then + ocf_exit_reason "$OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm failed during promote." + rc=$OCF_ERR_GENERIC + break + fi + done + ha_pseudo_resource $statefile start + conntrackd_set_master_score $master_score + fi + return $rc +} + +conntrackd_demote() { + rc=$OCF_SUCCESS + if conntrackd_is_master; then + # -t = shorten kernel timers to remove zombies + # -n = request a resync from the others + for parm in t n; do + if ! $OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm; then + ocf_exit_reason "$OCF_RESKEY_binary -C $OCF_RESKEY_config -$parm failed during demote." 
+ rc=$OCF_ERR_GENERIC + break + fi + done + ha_pseudo_resource $statefile stop + conntrackd_set_master_score $slave_score + fi + return $rc +} + +conntrackd_notify() { + hostname=$(hostname) + # OCF_RESKEY_CRM_meta_notify_master_uname is a whitespace separated list of master hostnames + for master in $OCF_RESKEY_CRM_meta_notify_master_uname; do + # if we are the master and an instance was just started on another node: + # send a bulk update to allow failback + if [ "$hostname" = "$master" -a "$OCF_RESKEY_CRM_meta_notify_type" = "post" -a "$OCF_RESKEY_CRM_meta_notify_operation" = "start" -a "$OCF_RESKEY_CRM_meta_notify_start_uname" != "$hostname" ]; then + ocf_log info "Sending bulk update in post start to peers to allow failback" + $OCF_RESKEY_binary -C $OCF_RESKEY_config -B + fi + done + for tobepromoted in $OCF_RESKEY_CRM_meta_notify_promote_uname; do + # if there is a promote action to be executed on another node: + # send a bulk update to allow failback + if [ "$hostname" != "$tobepromoted" -a "$OCF_RESKEY_CRM_meta_notify_type" = "pre" -a "$OCF_RESKEY_CRM_meta_notify_operation" = "promote" ]; then + ocf_log info "Sending bulk update in pre promote to peers to allow failback" + $OCF_RESKEY_binary -C $OCF_RESKEY_config -B + fi + done +} + +conntrackd_usage() { + cat <<EOF +usage: $0 {start|stop|promote|demote|monitor|validate-all|meta-data} +Expects to have a fully populated OCF RA-compliant environment set. 
+EOF +} + +statefile=conntrackd.${OCF_RESOURCE_INSTANCE//:[0-9]*}.master + +master_score=1000 +slave_score=100 + +if [ $# -ne 1 ]; then + conntrackd_usage + exit $OCF_ERR_ARGS +fi + +case $__OCF_ACTION in +meta-data) + meta_data + exit $OCF_SUCCESS + ;; +usage) + conntrackd_usage + exit $OCF_SUCCESS +esac + +# Everything except usage and meta-data must pass the validate test +conntrackd_validate_all || exit + +case $__OCF_ACTION in +start) + conntrackd_start + ;; +stop) + conntrackd_stop + ;; +promote) + conntrackd_promote + ;; +demote) + conntrackd_demote + ;; +status|monitor) + conntrackd_monitor + ;; +notify) + conntrackd_notify + ;; +validate-all) + ;; +*) + conntrackd_usage + exit $OCF_ERR_UNIMPLEMENTED +esac +# exit code is the exit code (return code) of the last command (shell function) diff --git a/heartbeat/corosync-qnetd b/heartbeat/corosync-qnetd new file mode 100755 index 0000000..6b97777 --- /dev/null +++ b/heartbeat/corosync-qnetd @@ -0,0 +1,353 @@ +#!/bin/sh +# +# Copyright (C) 2022 Red Hat, Inc. All rights reserved. +# +# Authors: Jan Friesse <jfriesse@redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +# Initialization: +: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}" +. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs" + +# Use runuser if available for SELinux. +if [ -x "/sbin/runuser" ]; then + SU="runuser" +else + SU="su" +fi + +# Attempt to detect a default binary +OCF_RESKEY_binary_default=$(which corosync-qnetd 2> /dev/null) +if [ "${OCF_RESKEY_binary_default}" = "" ]; then + OCF_RESKEY_binary_default="/usr/bin/corosync-qnetd" +fi + +# Defaults +OCF_RESKEY_qnetd_opts_default="" +OCF_RESKEY_qnetd_tool_binary_default="/usr/bin/corosync-qnetd-tool" +OCF_RESKEY_ip_default="" +OCF_RESKEY_port_default="" +OCF_RESKEY_nss_db_dir_default="" +OCF_RESKEY_pid_default="/var/run/corosync-qnetd/corosync-qnetd-${OCF_RESOURCE_INSTANCE}.pid" +OCF_RESKEY_ipc_sock_default="/var/run/corosync-qnetd/corosync-qnetd-${OCF_RESOURCE_INSTANCE}.sock" +OCF_RESKEY_user_default="coroqnetd" +OCF_RESKEY_group_default="${OCF_RESKEY_user_default}" + +: "${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}" +: "${OCF_RESKEY_qnetd_opts=${OCF_RESKEY_qnetd_opts_default}}" +: "${OCF_RESKEY_qnetd_tool_binary=${OCF_RESKEY_qnetd_tool_binary_default}}" +: "${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}}" +: "${OCF_RESKEY_port=${OCF_RESKEY_port_default}}" +: "${OCF_RESKEY_nss_db_dir=${OCF_RESKEY_nss_db_dir_default}}" +: "${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}" +: "${OCF_RESKEY_ipc_sock=${OCF_RESKEY_ipc_sock_default}}" +: "${OCF_RESKEY_user=${OCF_RESKEY_user_default}}" +: "${OCF_RESKEY_group=${OCF_RESKEY_group_default}}" + +corosync_qnetd_usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. 
+END +} + +corosync_qnetd_meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="corosync-qnetd" version="1.0"> +<version>1.0</version> + +<longdesc lang="en">OCF Resource script for corosync-qnetd. It manages a corosync-qnetd +instance as a HA resource. It is required to copy nss db directory (usually /etc/corosync/qnetd/nssdb) +across all nodes (only once - after database is initialized).</longdesc> +<shortdesc lang="en">Corosync QNet daemon resource agent</shortdesc> + +<parameters> + +<parameter name="binary"> + <longdesc lang="en">Location of the corosync-qnetd binary</longdesc> + <shortdesc lang="en">corosync-qnetd binary</shortdesc> + <content type="string" default="${OCF_RESKEY_binary_default}" /> +</parameter> + +<parameter name="qnetd_opts"> + <longdesc lang="en"> + Additional options for corosync-qnetd binary. "-4" for example. + </longdesc> + <shortdesc lang="en">corosync-qnetd extra options</shortdesc> + <content type="string" default="${OCF_RESKEY_qnetd_opts_default}" /> +</parameter> + +<parameter name="qnetd_tool_binary"> + <longdesc lang="en"> + The absolute path to the corosync-qnetd-tool for monitoring with OCF_CHECK_LEVEL greater zero. + </longdesc> + <shortdesc lang="en">The absolute path to the corosync-qnetd-tool binary</shortdesc> + <content type="string" default="${OCF_RESKEY_qnetd_tool_binary_default}" /> +</parameter> + +<parameter name="ip"> + <longdesc lang="en"> + IP address to listen on. By default the daemon listens on all addresses (wildcard). + </longdesc> + <shortdesc lang="en">IP address to listen on</shortdesc> + <content type="string" default="${OCF_RESKEY_ip_default}" /> +</parameter> + +<parameter name="port"> + <longdesc lang="en"> + TCP port to listen on. Default port is 5403. 
+ </longdesc> + <shortdesc lang="en">TCP port to listen on</shortdesc> + <content type="string" default="${OCF_RESKEY_port_default}" /> +</parameter> + +<parameter name="nss_db_dir"> + <longdesc lang="en"> + Location of the corosync-qnetd nss db directory (empty for default - usually /etc/corosync/qnetd/nssdb) + </longdesc> + <shortdesc lang="en">corosync-qnetd nss db directory</shortdesc> + <content type="string" default="${OCF_RESKEY_nss_db_dir_default}" /> +</parameter> + +<parameter name="pid"> + <longdesc lang="en"> + Location of the corosync-qnetd pid/lock + </longdesc> + <shortdesc lang="en">corosync-qnetd pid file</shortdesc> + <content type="string" default="${OCF_RESKEY_pid_default}" /> +</parameter> + +<parameter name="ipc_sock"> + <longdesc lang="en"> + Location of the corosync-qnetd ipc socket + </longdesc> + <shortdesc lang="en">corosync-qnetd ipc socket file</shortdesc> + <content type="string" default="${OCF_RESKEY_ipc_sock_default}" /> +</parameter> + +<parameter name="user"> + <longdesc lang="en">User running corosync-qnetd</longdesc> + <shortdesc lang="en">corosync-qnetd user</shortdesc> + <content type="string" default="${OCF_RESKEY_user_default}" /> +</parameter> + +<parameter name="group"> + <longdesc lang="en">Group running corosync-qnetd</longdesc> + <shortdesc lang="en">corosync-qnetd group</shortdesc> + <content type="string" default="${OCF_RESKEY_group_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="status" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" start-delay="10s" /> +<action name="validate-all" timeout="20s" /> +<action name="meta-data" timeout="20s" /> +</actions> +</resource-agent> +END +} + +corosync_qnetd_status() { + ocf_pidfile_status "${OCF_RESKEY_pid}" > /dev/null 2>&1 + case "$?" 
in + 0) + rc="$OCF_SUCCESS" + ;; + 1|2) + rc="$OCF_NOT_RUNNING" + ;; + *) + rc="$OCF_ERR_GENERIC" + ;; + esac + + return "$rc" +} + +corosync_qnetd_start() { + corosync_qnetd_validate_all + rc="$?" + + if [ "$rc" -ne 0 ]; then + return "$rc" + fi + + # if resource is already running,no need to continue code after this. + if corosync_qnetd_status; then + ocf_log info "corosync-qnetd is already running" + return "${OCF_SUCCESS}" + fi + + pid_dir=$(dirname "${OCF_RESKEY_pid}") + sock_dir=$(dirname "${OCF_RESKEY_ipc_sock}") + + for d in "$pid_dir" "$sock_dir";do + if [ ! -d "$d" ];then + mkdir -p "$d" + chmod 0770 "$d" + chown "${OCF_RESKEY_user}:${OCF_RESKEY_group}" "$d" + fi + done + + params="-S \"local_socket_file=${OCF_RESKEY_ipc_sock}\" -S \"lock_file=${OCF_RESKEY_pid}\"" + + if [ -n "${OCF_RESKEY_nss_db_dir}" ];then + params="$params -S \"nss_db_dir=${OCF_RESKEY_nss_db_dir}\"" + fi + + if [ -n "${OCF_RESKEY_ip}" ];then + params="$params -l \"${OCF_RESKEY_ip}\"" + fi + + if [ -n "${OCF_RESKEY_port}" ];then + params="$params -p \"${OCF_RESKEY_port}\"" + fi + + params="$params ${OCF_RESKEY_qnetd_opts}" + + ocf_run "$SU" -s "/bin/sh" "${OCF_RESKEY_user}" -c "${OCF_RESKEY_binary} $params" + + while :; do + corosync_qnetd_monitor "debug" + rc="$?" + + if [ "$rc" -eq "${OCF_SUCCESS}" ]; then + break + fi + sleep 1 + + ocf_log debug "corosync-qnetd still hasn't started yet. Waiting..." + done + + ocf_log info "corosync-qnetd started" + return "${OCF_SUCCESS}" +} + +corosync_qnetd_stop() { + corosync_qnetd_status + + if [ "$?" -ne "$OCF_SUCCESS" ]; then + # Currently not running. Nothing to do. + ocf_log info "corosync-qnetd is already stopped" + + return "$OCF_SUCCESS" + fi + + pid=$(cat "${OCF_RESKEY_pid}") + kill "$pid" + + # Wait for process to stop + while corosync_qnetd_monitor "debug"; do + sleep 1 + done + + ocf_log info "corosync-qnetd stopped" + return "$OCF_SUCCESS" +} + +corosync_qnetd_monitor() { + loglevel=${1:-err} + + corosync_qnetd_status + rc="$?" 
+ + if [ "$rc" -ne "$OCF_SUCCESS" ];then + return "$rc" + fi + + out=$("${OCF_RESKEY_qnetd_tool_binary}" -s -p "${OCF_RESKEY_ipc_sock}" 2>&1 >/dev/null) + rc="$?" + + if [ "$rc" != 0 ];then + ocf_log "$loglevel" "$out" + fi + + case "$rc" in + "0") rc="$OCF_SUCCESS" ;; + "3") rc="$OCF_NOT_RUNNING" ;; + *) rc="$OCF_ERR_GENERIC" ;; + esac + + return "$rc" +} + +corosync_qnetd_validate_all() { + check_binary "${OCF_RESKEY_binary}" + + check_binary "${OCF_RESKEY_qnetd_tool_binary}" +} + + +# **************************** MAIN SCRIPT ************************************ + +# Make sure meta-data and usage always succeed +case "$__OCF_ACTION" in + meta-data) + corosync_qnetd_meta_data + exit "$OCF_SUCCESS" + ;; + usage|help) + corosync_qnetd_usage + exit "$OCF_SUCCESS" + ;; +esac + +# This OCF agent script need to be run as root user. +if ! ocf_is_root; then + echo "$0 agent script need to be run as root user." + ocf_log debug "$0 agent script need to be run as root user." + exit "$OCF_ERR_GENERIC" +fi + +# Translate each action into the appropriate function call +case "$__OCF_ACTION" in + start) + corosync_qnetd_start + ;; + stop) + corosync_qnetd_stop + ;; + status) + corosync_qnetd_status + ;; + monitor) + corosync_qnetd_monitor + ;; + validate-all) + corosync_qnetd_validate_all + ;; + *) + corosync_qnetd_usage + exit "$OCF_ERR_UNIMPLEMENTED" + ;; +esac + +rc="$?" +exit "$rc" +# End of this script diff --git a/heartbeat/crypt b/heartbeat/crypt new file mode 100755 index 0000000..56db379 --- /dev/null +++ b/heartbeat/crypt @@ -0,0 +1,342 @@ +#!/bin/sh +# +# crypt/LUKS OCF RA. Manages cryptsetup devices. +# +# Copyright (c) 2020 Red Hat GmbH, Heinz Mauelshagen +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. 
+# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults +OCF_RESKEY_encrypted_dev_default="" +OCF_RESKEY_crypt_dev_default="" +OCF_RESKEY_key_file_default="" +OCF_RESKEY_crypt_type_default="" +OCF_RESKEY_force_stop_default="false" + +: ${OCF_RESKEY_encrypted_dev=${OCF_RESKEY_encrypted_dev_default}} +: ${OCF_RESKEY_crypt_dev=${OCF_RESKEY_crypt_dev_default}} +: ${OCF_RESKEY_key_file=${OCF_RESKEY_key_file_default}} +: ${OCF_RESKEY_crypt_type=${OCF_RESKEY_crypt_type_default}} +: ${OCF_RESKEY_force_stop=${OCF_RESKEY_force_stop_default}} + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="crypt" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This is a LUKS/crypt Resource Agent managing encrypted devices via cryptsetup(8). +The agent imposes limitations on device types supported: luks, luks[1..N]. 
+</longdesc> +<shortdesc lang="en">LUKS/crypt resource agent</shortdesc> + +<parameters> + +<parameter name="encrypted_dev" unique="1" required="1"> +<longdesc lang="en"> +Encrypted backing device, which should be defined by UUID, +36 characters including '-'s as reported by blkid(8). + +Although it can be defined as a block device path (e.g. /dev/sdh), +the UUID should be preferred over the block device path to allow for the +unique discovery of the crypt backing device given the volatile nature of +/dev entries (e.g. /dev/sdh on one node may be /dev/sdg on another). + +Only define as block device path if you know what you are doing. +</longdesc> +<shortdesc lang="en">Encrypted device</shortdesc> +<content type="string" default="${OCF_RESKEY_encrypted_dev_default}" /> +</parameter> + +<parameter name="crypt_dev" unique="1" required="1"> +<longdesc lang="en"> +Encrypted device name, no path. I.e. the one given in "cryptsetup open name ...". +The resulting block device path is /dev/mapper/name. +</longdesc> +<shortdesc lang="en">Encrypted device</shortdesc> +<content type="string" default="${OCF_RESKEY_crypt_dev_default}" /> +</parameter> + +<parameter name="key_file" unique="0" required="1"> +<longdesc lang="en"> +Key file path containing the encryption passphrase +(aka key; see cryptsetup(8)). For LUKS, the passphrase as of the key_file +parameter is used to decrypt a randomly selected key when the device was created. +</longdesc> +<shortdesc lang="en">Key file</shortdesc> +<content type="string" default="${OCF_RESKEY_key_file_default}" /> +</parameter> + +<parameter name="crypt_type" unique="0" required="1"> +<longdesc lang="en"> +Encryption (device) type (e.g. "luks" or "luks2"). + +This parameter affirms the encryption format as of the crypt metadata +thus allowing for safety measures when starting the encrypted resource. 
+</longdesc> +<shortdesc lang="en">Encryption type</shortdesc> +<content type="string" default="${OCF_RESKEY_crypt_type_default}" /> +</parameter> + +<parameter name="force_stop" unique="0" required="0"> +<longdesc lang="en"> +If processes or kernel threads are using the crypt device, it cannot +be stopped. We will try to stop processes, first by sending TERM and +then, if that doesn't help in $PROC_CLEANUP_TIME seconds, using KILL. +The lsof(8) program is required to get the list of array users. +Of course, the kernel threads cannot be stopped this way. +If the processes are critical for data integrity, then set this +parameter to false. Note that in that case the stop operation +will fail and the node will be fenced. +</longdesc> +<shortdesc lang="en">force stop processes using the crpyt device</shortdesc> +<content type="boolean" default="${OCF_RESKEY_force_stop_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" timeout="20s" interval="10s" depth="0" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="10s" /> +</actions> +</resource-agent> +END +} + +# Disable cryptsetup auto-recovery if cloned. +disable_locks="" +ocf_is_clone && disable_locks="--disable-locks" + +crypt_usage() { + cat <<END +usage: $0 {start|stop|monitor|usage|meta-data|validate-all} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +encrypted_dev="${OCF_RESKEY_encrypted_dev}" +crypt_dev="${OCF_RESKEY_crypt_dev}" +crypt_dev_path="/dev/mapper/$crypt_dev" +key_file="${OCF_RESKEY_key_file}" +crypt_type="${OCF_RESKEY_crypt_type}" +force_stop="${OCF_RESKEY_force_stop}" + +crypt_validate_all() { + if ! 
have_binary cryptsetup; then + ocf_exit_reason "Please install cryptsetup(8)" + return $OCF_ERR_INSTALLED + fi + if [ -z "$encrypted_dev" ]; then + ocf_exit_reason "Undefined OCF_RESKEY_encrypted_dev" + return $OCF_ERR_CONFIGURED + fi + if [ -n "$encrypted_dev" ]; then + case "$encrypted_dev" in + *-*-*-*) if [ `echo "$encrypted_dev" | wc -c` -ne 37 ]; then + ocf_exit_reason "Bogus encrypted device UUID \"$encrypted_dev\"" + return $OCF_ERR_ARGS + fi + encrypted_dev=/dev/disk/by-uuid/"$encrypted_dev";; + *) case "$encrypted_dev" in + /dev/*) ;; + *) ocf_exit_reason "Bogus encrypted device path" + return $OCF_ERR_ARGS;; + esac + esac + fi + + # return early for probes where device might not be available yet + # e.g. LVM exclusive volumes + if ocf_is_probe; then + return $OCF_SUCCESS + fi + + if [ ! -b "$encrypted_dev" ] && [ ! -L "$encrypted_dev" ]; then + ocf_exit_reason "Encrypted device $encrypted_dev not accessible" + return $OCF_ERR_ARGS + fi + echo "$crypt_dev" | grep "/" >/dev/null + if [ $? -eq 0 ] && [ -z "$crypt_dev" ]; then + ocf_exit_reason "Crypt device \"$crypt_dev\" name has to at least 1 character long and without path" + return $OCF_ERR_ARGS + fi + if [ ! -r "$key_file" ]; then + ocf_exit_reason "Hash key file $key_file not accessible" + return $OCF_ERR_ARGS + fi + if ocf_is_true "$force_stop" && ! have_binary lsof; then + ocf_exit_reason "Force stop requested, please install lsof(8)" + return $OCF_ERR_INSTALLED + fi + cryptsetup isLuks $encrypted_dev 2>/dev/null + if [ $? -ne 0 ]; then + ocf_exit_reason "$encrypted_dev is not a Luks formatted device" + return $OCF_ERR_CONFIGURED + fi + + return $OCF_SUCCESS +} + +get_users_pids() { + ocf_log debug "running lsof to list \"$crypt_dev\" users..." 
+ ocf_run -warn 'lsof $crypt_dev_path | tail -n +2 | awk "{print $2}" | sort -u' +} + +stop_crypt_users() { + local pids=`get_users_pids` + + if [ -z "$pids" ]; then + ocf_log warn "lsof reported no users holding arrays" + return 2 + fi + + ocf_stop_processes TERM $PROC_CLEANUP_TIME $pids +} + +show_users() { + local dm_dev + + ocf_log info "running lsof to list \"$crypt_dev\" users..." + ocf_run -warn lsof $crypt_dev_path + + dm_dev=$(basename $(realpath $crypt_dev_path)) + if [ -d /sys/block/$dm_dev/holders ]; then + ocf_log debug "ls -l /sys/block/$dm_dev/holders" + ocf_run -warn ls -l /sys/block/$dm_dev/holders + fi +} + +crypt_stop_one() { + cryptsetup close $crypt_dev $disable_locks +} + +####################################################################### +# +# Action: START an encrypted resource +# +crypt_start() { + local rc + + cryptsetup open $encrypted_dev $crypt_dev --type $crypt_type $disable_locks --key-file=$key_file + rc=$? + if [ $rc -eq 0 ];then + crypt_monitor + rc=$? + else + rc=$OCF_ERR_GERNERIC + fi + [ $rc -ne $OCF_SUCCESS ] && ocf_exit_reason "Failed to start encrypted device \"$crypt_dev\"" + + return $rc +} + +# +# Action: STOP an encrypted resource +# +crypt_stop() { + local rc + + crypt_monitor + rc=$? + if [ $rc -ne $OCF_NOT_RUNNING ]; then + crypt_stop_one + crypt_monitor + rc=$? + fi + if [ $rc -ne $OCF_NOT_RUNNING ] && ocf_is_true $force_stop; then + stop_crypt_users + case $? in + 2) rc=$OCF_SUCCESS;; + *) crypt_stop_one + crypt_monitor + rc=$?;; + esac + fi + if [ $rc -ne $OCF_NOT_RUNNING ]; then + ocf_log warn "Couldn't stop crypt device \"$crypt_dev\" (rc=$rc)" + show_users + ocf_exit_reason "Failed to stop crypt device \"$crypt_dev\"!" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +# +# Action: MONITOR an encrypted resource +# +crypt_monitor() { + cryptsetup status $crypt_dev $disable_locks >/dev/null 2>&1 + if [ $? 
-eq 0 ]; then + if [ -b "$encrypted_dev" ] || [ -L $crypt_dev_path ]; then + return $OCF_SUCCESS + fi + return $OCF_ERR_GENERIC + fi + + [ "$__OCF_ACTION" = "monitor" ] && ! ocf_is_probe && ocf_exit_reason "Crypt resource not running" + return $OCF_NOT_RUNNING +} + +# Check for stange argument count. +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi + +case "$__OCF_ACTION" in +meta-data) meta_data + exit $OCF_SUCCESS;; +usage|help) crypt_usage + exit $OCF_SUCCESS;; +esac + +# XME: remove once pacemaker is fixed and calls this action +crypt_validate_all +rc=$? +[ $rc -ne $OCF_SUCCESS ] && exit $rc + +case "$__OCF_ACTION" in +start) crypt_start; rc=$?;; +stop) crypt_stop; rc=$?;; +monitor) crypt_monitor; rc=$?;; +validate-all) rc=$OCF_SUCCESS;; # crypt_validate_all would have errored out above already. +*) crypt_usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac + +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/db2 b/heartbeat/db2 new file mode 100755 index 0000000..95447ab --- /dev/null +++ b/heartbeat/db2 @@ -0,0 +1,919 @@ +#!/bin/sh +# +# db2 +# +# Resource agent that manages a DB2 LUW database in Standard role +# or HADR configuration in promotable configuration. +# Multi partition is supported as well. +# +# Copyright (c) 2011 Holger Teutsch <holger.teutsch@web.de> +# +# This agent incoporates code of a previous release created by +# Alan Robertson and the community. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. 
Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_instance_default="" +OCF_RESKEY_admin_default="" +OCF_RESKEY_dbpartitionnum_default="0" + +: ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}} +: ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}} +: ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}} + +####################################################################### + + +db2_usage() { + echo "db2 start|stop|monitor|promote|demote|notify|validate-all|meta-data" +} + +db2_meta_data() { +cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="db2" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> +Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles in promotable configuration. Multiple partitions are supported. + +Standard mode: + +An instance including all or selected databases is made highly available. +Configure each partition as a separate primitive resource. + +HADR mode: + +A single database in HADR configuration is made highly available by automating takeover operations. +Configure a promotable resource with notifications enabled and an +additional monitoring operation with role "Promoted". + +In case of HADR be very deliberate in specifying intervals/timeouts. 
The detection of a failure including promote must complete within HADR_PEER_WINDOW. + +In addition to honoring requirements for crash recovery etc. for your specific database use the following relations as guidance: + +"monitor interval" < HADR_PEER_WINDOW - (appr 30 sec) + +"promote timeout" < HADR_PEER_WINDOW + (appr 20 sec) + +For further information and examples consult http://www.linux-ha.org/wiki/db2_(resource_agent) +</longdesc> +<shortdesc lang="en">Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles as promotable configuration. Multiple partitions are supported.</shortdesc> + +<parameters> +<parameter name="instance" unique="1" required="1"> +<longdesc lang="en"> +The instance of the database(s). +</longdesc> +<shortdesc lang="en">instance</shortdesc> +<content type="string" default="${OCF_RESKEY_instance_default}" /> +</parameter> +<parameter name="dblist" unique="0" required="0"> +<longdesc lang="en"> +List of databases to be managed, e.g "db1 db2". +Defaults to all databases in the instance. Specify one db for HADR mode. +</longdesc> +<shortdesc lang="en">List of databases to be managed</shortdesc> +<content type="string"/> +</parameter> +<parameter name="admin" unique="0" required="0"> +<longdesc lang="en"> +DEPRECATED: The admin user of the instance. +</longdesc> +<shortdesc lang="en">DEPRECATED: admin</shortdesc> +<content type="string" default="${OCF_RESKEY_admin_default}" /> +</parameter> +<parameter name="dbpartitionnum" unique="0" required="0"> +<longdesc lang="en"> +The number of the partition (DBPARTITIONNUM) to be managed. 
+</longdesc> +<shortdesc lang="en">database partition number (DBPARTITIONNUM)</shortdesc> +<content type="string" default="${OCF_RESKEY_dbpartitionnum_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="120s"/> +<action name="stop" timeout="120s"/> +<action name="promote" timeout="120s"/> +<action name="demote" timeout="120s"/> +<action name="notify" timeout="10s"/> +<action name="monitor" depth="0" timeout="60s" interval="20s"/> +<action name="monitor" depth="0" timeout="60s" role="Promoted" interval="22s"/> +<action name="validate-all" timeout="5s"/> +<action name="meta-data" timeout="5s"/> +</actions> +</resource-agent> +END +} + +# +# validate +# .. and set global variables +# +# exit on error +# +db2_validate() { + local db2home db2sql db2instance + + # db2 uses korn shell + check_binary "ksh" + + # check required instance vars + if [ -z "$OCF_RESKEY_instance" ] + then + ocf_log err "DB2 required parameter instance is not set!" + return $OCF_ERR_CONFIGURED + fi + + instance=$OCF_RESKEY_instance + if [ -n "$OCF_RESKEY_admin" ] + then + ocf_log warn "DB2 deprecated parameter admin is set, using $OCF_RESKEY_admin as instance." + instance=$OCF_RESKEY_admin + fi + + db2node=${OCF_RESKEY_dbpartitionnum:-0} + + db2home=$(sh -c "echo ~$instance") + db2sql=$db2home/sqllib + db2profile=$db2sql/db2profile + db2bin=$db2sql/bin + + STATE_FILE=${HA_RSCTMP}/db2-${OCF_RESOURCE_INSTANCE}.state + + # Let's make sure a few important things are there... + if ! 
[ -d "$db2sql" -a -d "$db2bin" -a -f "$db2profile" -a \ + -x "$db2profile" -a -x "$db2bin/db2" ] + then + ocf_is_probe && exit $OCF_NOT_RUNNING + ocf_log err "DB2 required directories and/or files not found" + exit $OCF_ERR_INSTALLED + fi + + db2instance=$(runasdb2 'echo $DB2INSTANCE') + if [ "$db2instance" != "$instance" ] + then + ocf_is_probe && exit $OCF_NOT_RUNNING + ocf_log err "DB2 parameter instance \"$instance\" != DB2INSTANCE \"$db2instance\"" + exit $OCF_ERR_CONFIGURED + fi + + # enough checking for stop to succeed + [ $__OCF_ACTION = stop ] && return $OCF_SUCCESS + + dblist=$OCF_RESKEY_dblist + if [ -n "$dblist" ] + then + # support , as separator as well + dblist=$(echo "$dblist" | sed -e 's/[,]/ /g') + else + if ! dblist=$(db2_dblist) + then + ocf_log err "DB2 $instance($db2node): cannot retrieve db directory" + exit $OCF_ERR_INSTALLED + fi + fi + + # check requirements for the HADR case + if ocf_is_ms + then + set -- $dblist + if [ $# != 1 ] + then + ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have exactly one name in dblist" + exit $OCF_ERR_CONFIGURED + fi + + if [ $db2node != 0 ] + then + ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have dbpartitionnum=0" + exit $OCF_ERR_CONFIGURED + fi + fi + + return $OCF_SUCCESS +} + +master_score() +{ + if ! have_binary "crm_master"; then + return + fi + + crm_master $* +} + +# +# Run the given command as db2 instance user +# +runasdb2() { + su $instance -c ". $db2profile; $*" +} + +# +# Run a command as the DB2 admin, and log the output +# +logasdb2() { + local output rc + + output=$(runasdb2 $*) + rc=$? 
+ if [ $rc -eq 0 ] + then + ocf_log info "$output" + else + ocf_log err "$output" + fi + return $rc +} + + +# +# maintain the fal (first active log) attribute +# db2_fal_attrib DB {set val|get} +# +db2_fal_attrib() { + local db=$1 + local attr val rc id node member me + + attr=db2hadr_${instance}_${db}_fal + + case "$2" in + set) + me=$(ocf_local_nodename) + + # loop over all member nodes and set attribute + crm_node -l | + while read id node member + do + [ "$member" = member -a "$node" != "$me" ] || continue + crm_attribute -l forever --node=$node -n $attr -v "$3" + rc=$? + ocf_log info "DB2 instance $instance($db2node/$db: setting attrib for FAL to $FIRST_ACTIVE_LOG @ $node" + [ $rc != 0 ] && break + done + ;; + + get) + crm_attribute -l forever -n $attr -G --quiet 2>&1 + rc=$? + if ! ocf_is_true "$OCF_RESKEY_CRM_meta_notify" && [ $rc != 0 ] + then + ocf_log warn "DB2 instance $instance($db2node/$db: can't retrieve attribute $attr, are you sure notifications are enabled ?" + fi + ;; + + *) + exit $OCF_ERR_CONFIGURED + esac + + return $rc +} + +# +# unfortunately a first connect after a crash may need several minutes +# for some internal cleanup stuff in DB2. +# We run a connect in background so other connects (i.e. monitoring!) may proceed. +# +db2_run_connect() { + local db=$1 + + logasdb2 "db2 connect to $db; db2 terminate" +} + +# +# get some data from the database config +# sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW +# +db2_get_cfg() { + local db=$1 + + local output hadr_vars + + output=$(runasdb2 db2 get db cfg for $db) + [ $? 
!= 0 ] && return $OCF_ERR_GENERIC + + hadr_vars=$(echo "$output" | + awk '/HADR database role/ {printf "HADR_ROLE='%s'; ", $NF;} + /HADR_TIMEOUT/ {printf "HADR_TIMEOUT='%s'; ", $NF;} + /First active log file/ {printf "FIRST_ACTIVE_LOG='%s'\n", $NF;} + /HADR_PEER_WINDOW/ {printf "HADR_PEER_WINDOW='%s'\n", $NF;}') + + # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW + HADR_ROLE=$(echo "$output" | awk '/HADR database role/ {print $NF;}') + HADR_TIMEOUT=$(echo "$output" | awk '/HADR_TIMEOUT/ {print $NF;}') + FIRST_ACTIVE_LOG=$(echo "$output" | awk '/First active log file/ {print $NF;}') + HADR_PEER_WINDOW=$(echo "$output" | awk '/HADR_PEER_WINDOW/ {print $NF;}') + + # HADR_PEER_WINDOW comes with V9 and is checked later + if [ -z "$HADR_ROLE" -o -z "$HADR_TIMEOUT" ] + then + ocf_log error "DB2 cfg values invalid for $instance($db2node)/$db: $hadr_vars" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +# +# return the list of databases in the instance +# +db2_dblist() { + local output + + output=$(runasdb2 db2 list database directory) || return $OCF_ERR_GENERIC + + echo "$output" | grep -i 'Database name.*=' | sed 's%.*= *%%' +} + +# +# Delayed check of the compatibility of DB2 instance and pacemaker +# config. +# Logically this belongs to validate but certain parameters can only +# be retrieved once the instance is started. +# +db2_check_config_compatibility() { + local db=$1 + local is_ms + + ocf_is_ms + is_ms=$? + + case "$HADR_ROLE/$is_ms" in + STANDARD/0) + ocf_log err "DB2 database $instance/$db is not in a HADR configuration but I am a M/S resource" + exit $OCF_ERR_INSTALLED + ;; + + STANDARD/1) + # OK + ;; + + */0) + if [ -z "$HADR_PEER_WINDOW" ] + then + ocf_log err "DB2 database $instance: release to old, need HADR_PEER_WINDOW (>=V9)" + exit $OCF_ERR_INSTALLED + fi + ;; + + */1) + ocf_log err "DB2 database $instance/$db is in a HADR configuration but I must be a M/S resource" + esac + +} + +# +# Start instance and DB. 
+# Standard mode is through "db2 activate" in order to start in previous
+# mode (Standby/Primary).
+# If the database is a primary AND we can determine that the running master
+# has a higher "first active log" we conclude that we come up after a crash
+# and the previous Standby is now Primary.
+# The db is then started as Standby.
+#
+# Other cases: danger of split brain, log error and do nothing.
+#
+# Returns $OCF_SUCCESS or $OCF_ERR_GENERIC (or the status of db2_get_cfg).
+#
+db2_start() {
+    local output start_cmd db
+    local start_opts="dbpartitionnum $db2node"
+
+    # If we detect that db partitions are not in use, and no
+    # partition is explicitly specified, activate without
+    # partition information. This allows db2 instances without
+    # partition support to be managed.
+    if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -e "$db2sql/db2nodes.cfg" ]; then
+        start_opts=""
+    fi
+
+    if output=$(runasdb2 db2start $start_opts)
+    then
+        ocf_log info "DB2 instance $instance($db2node) started: $output"
+    else
+        case $output in
+            *SQL1026N*)
+            ocf_log info "DB2 instance $instance($db2node) already running: $output"
+            ;;
+
+            *)
+            ocf_log err "$output"
+            return $OCF_ERR_GENERIC
+        esac
+    fi
+
+    if ! db2_instance_status
+    then
+        ocf_log err "DB2 instance $instance($db2node) is not active!"
+        return $OCF_ERR_GENERIC
+    fi
+
+    [ $db2node = 0 ] || return $OCF_SUCCESS
+    # activate DB only on node 0
+
+    for db in $dblist
+    do
+        # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG
+        db2_get_cfg $db || return $?
+
+        # Better late than never: can only check this when the instance is already up
+        db2_check_config_compatibility $db
+
+        start_cmd="db2 activate db $db"
+
+        if [ $HADR_ROLE = PRIMARY ]
+        then
+            local master_fal
+
+            # communicate our FAL to other nodes that might start concurrently
+            db2_fal_attrib $db set $FIRST_ACTIVE_LOG
+
+            # ignore false positive:
+            # error: Can't use > in [ ]. Escape it or use [[..]]. [SC2073]
+            # see https://github.com/koalaman/shellcheck/issues/691
+            # shellcheck disable=SC2073
+            if master_fal=$(db2_fal_attrib $db get) && [ "$master_fal" '>' $FIRST_ACTIVE_LOG ]
+            then
+                ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary"
+                start_cmd="db2 start hadr on db $db as standby"
+                HADR_ROLE=STANDBY
+            fi
+        fi
+
+        if output=$(runasdb2 $start_cmd)
+        then
+            ocf_log info "DB2 database $instance($db2node)/$db started/activated"
+            # the first connect may be slow after a crash, so it runs in the background
+            [ $HADR_ROLE != STANDBY ] && db2_run_connect $db &
+        else
+            case $output in
+                SQL1490W*|SQL1494W*|SQL1497W*|SQL1777N*)
+                ocf_log info "DB2 database $instance($db2node)/$db already activated: $output"
+                ;;
+
+                SQL1768N*"Reason code = \"7\""*)
+                ocf_log err "DB2 database $instance($db2node)/$db is a Primary and the Standby is down"
+                ocf_log err "Possible split brain ! Manual intervention required."
+                ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\""
+                ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\""
+
+                # might be the Standby is not yet there
+                # might be a timing problem because "First active log" is delayed
+                # on the next start attempt we might succeed when FAL was advanced
+                # might be manual intervention is required
+                # ... so let pacemaker give it another try and we will succeed then
+                return $OCF_ERR_GENERIC
+                ;;
+
+                *)
+                ocf_log err "DB2 database $instance($db2node)/$db didn't start: $output"
+                return $OCF_ERR_GENERIC
+            esac
+        fi
+    done
+
+    # come here with success
+    # Even if we are a db2 Primary pacemaker requires start to end up in slave mode
+    echo SLAVE > $STATE_FILE
+    return $OCF_SUCCESS
+}
+
+#
+# helper function to be spawned
+# so we can detect a hang of the db2stop command
+#
+# Returns $OCF_SUCCESS when db2stop succeeds or reports SQL1032N,
+# $OCF_ERR_GENERIC otherwise.
+#
+db2_stop_bg() {
+    local rc output
+    local stop_opts="dbpartitionnum $db2node"
+
+    rc=$OCF_SUCCESS
+
+    # same partition detection as in db2_start
+    if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -e "$db2sql/db2nodes.cfg" ]; then
+        stop_opts=""
+    fi
+
+    if output=$(runasdb2 db2stop force $stop_opts)
+    then
+        ocf_log info "DB2 instance $instance($db2node) stopped: $output"
+    else
+        case $output in
+            *SQL1032N*)
+            #SQL1032N No start database manager command was issued
+            ocf_log info "$output"
+            ;;
+
+            *)
+            ocf_log err "DB2 instance $instance($db2node) stop failed: $output"
+            rc=$OCF_ERR_GENERIC
+        esac
+    fi
+
+    return $rc
+}
+
+#
+# Stop the given db2 database instance
+#
+# Runs db2stop in the background for at most 4/5 of the operation
+# timeout; if that fails or processes remain, the instance is killed
+# and the function waits until no instance process is left.
+# Always returns $OCF_SUCCESS (it loops until the instance is dead).
+#
+db2_stop() {
+    local stop_timeout grace_timeout stop_bg_pid i must_kill
+
+    # remove master score
+    master_score -D -l reboot
+
+    # be very early here in order to avoid stale data
+    rm -f $STATE_FILE
+
+    db2_instance_status
+    if [ $? -eq $OCF_NOT_RUNNING ]; then
+        ocf_log info "DB2 instance $instance already stopped"
+        return $OCF_SUCCESS
+    fi
+
+    stop_timeout=${OCF_RESKEY_CRM_meta_timeout:-20000}
+
+    # grace_time is 4/5 (unit is ms)
+    # i.e. ms / 1250 = seconds * 0.8
+    grace_timeout=$((stop_timeout/1250))
+
+    # start db2stop in background as this may hang
+    db2_stop_bg &
+    stop_bg_pid=$!
+
+    # wait for grace_timeout
+    i=0
+    while [ $i -lt $grace_timeout ]
+    do
+        kill -0 $stop_bg_pid 2>/dev/null || break;
+        sleep 1
+        i=$((i+1))
+    done
+
+    # collect exit status but don't hang
+    # NOTE(review): stoprc is not declared local and therefore becomes a global
+    if kill -0 $stop_bg_pid 2>/dev/null
+    then
+        stoprc=1
+        kill -9 $stop_bg_pid 2>/dev/null
+    else
+        wait $stop_bg_pid
+        stoprc=$?
+    fi
+
+    must_kill=0
+
+    if [ $stoprc -ne 0 ]
+    then
+        ocf_log warn "DB2 instance $instance($db2node): db2stop failed, using db2nkill"
+        must_kill=1
+    elif ! db2_instance_dead
+    then
+        ocf_log warn "DB2 instance $instance($db2node): db2stop indicated success but there a still processes, using db2nkill"
+        must_kill=1
+    fi
+
+    if [ $must_kill -eq 1 ]
+    then
+        # db2nkill kills *all* partitions on the node
+        if [ -x $db2bin/db2nkill ]
+        then
+            logasdb2 $db2bin/db2nkill $db2node
+        elif [ -x $db2bin/db2_kill ]
+        then
+            logasdb2 $db2bin/db2_kill
+        fi
+
+        # loop forever (or lrmd kills us due to timeout) until the
+        # instance is dead
+        while ! db2_instance_dead
+        do
+            ocf_log info "DB2 instance $instance($db2node): waiting for processes to exit"
+            sleep 1
+        done
+
+        ocf_log info "DB2 instance $instance($db2node) is now dead"
+    fi
+
+    return $OCF_SUCCESS
+}
+
+#
+# check whether `enough´ processes for a healthy instance are up
+#
+# Counts the db2* entries printed by db2nps for this partition:
+# >= 4 -> $OCF_SUCCESS, 1..3 -> $OCF_ERR_GENERIC, 0 -> $OCF_NOT_RUNNING
+#
+db2_instance_status() {
+    local pscount
+
+    pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep ' db2[^ ]' | wc -l)
+    if [ $pscount -ge 4 ]; then
+        return $OCF_SUCCESS;
+    elif [ $pscount -ge 1 ]; then
+        return $OCF_ERR_GENERIC
+    fi
+    return $OCF_NOT_RUNNING
+}
+
+#
+# is the given db2 instance dead?
+# (true when db2nps lists no db2* process for this partition)
+#
+db2_instance_dead() {
+    local pscount
+
+    pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep ' db2[^ ]' | wc -l)
+    test $pscount -eq 0
+}
+
+#
+# return the status of the db as "Role/Status"
+# e.g. Primary/Peer, Standby/RemoteCatchupPending
+#
+# If not in HADR configuration return "Standard/Standalone"
+#
+# Prints "Down/Off" and returns 1 when db2pd fails.
+#
+db2_hadr_status() {
+    local db=$1
+    local output
+
+    output=$(runasdb2 db2pd -hadr -db $db)
+    if [ $? != 0 ]
+    then
+        echo "Down/Off"
+        return 1
+    fi
+
+    # Two db2pd layouts are handled: "HADR_xxx = value" lines and a
+    # "Role State" table header followed by a value line.
+    echo "$output" |
+    awk '/^\s+HADR_(ROLE|STATE) =/ {printf $3"/"}
+        /^\s+HADR_CONNECT_STATUS =/ {print $3; exit; }
+        /^HADR is not active/ {print "Standard/Standalone"; exit; }
+        /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }'
+}
+
+#
+# Monitor the db
+# And as side effect set crm_master / FAL attribute
+#
+# Exits with the instance status when the instance is not healthy.
+# Returns $OCF_SUCCESS, $OCF_RUNNING_MASTER (state file contains
+# MASTER) or $OCF_ERR_GENERIC.
+#
+db2_monitor() {
+    local CMD output hadr db
+    local rc
+
+    db2_instance_status
+    rc=$?
+    if [ $rc -ne $OCF_SUCCESS ]; then
+        # instance is dead remove master score
+        master_score -D -l reboot
+        exit $rc
+    fi
+
+    [ $db2node = 0 ] || return 0
+    # monitoring only for partition 0
+
+    for db in $dblist
+    do
+        hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC
+        ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr"
+
+        # set master preference accordingly
+        case "$hadr" in
+            PRIMARY/*|Primary/*|Standard/*)
+            # perform a basic health check
+            CMD="if db2 connect to $db;
+            then
+                db2 select \* from sysibm.sysversions ; rc=\$?;
+                db2 terminate;
+            else
+                rc=\$?;
+            fi;
+            exit \$rc"
+
+            if ! output=$(runasdb2 $CMD)
+            then
+                case "$output" in
+                    SQL1776N*)
+                    # can't connect/select on standby, may be spurious during takeover
+                    ;;
+
+                    *)
+                    ocf_log err "DB2 database $instance($db2node)/$db is not working"
+                    ocf_log err "DB2 message: $output"
+
+                    # dead primary, remove master score
+                    master_score -D -l reboot
+                    return $OCF_ERR_GENERIC
+                esac
+            fi
+
+            ocf_log debug "DB2 database $instance($db2node)/$db appears to be working"
+            ocf_is_ms && master_score -v 10000 -l reboot
+            ;;
+
+            STANDBY/*PEER/*|Standby/*Peer)
+            master_score -v 8000 -l reboot
+            ;;
+
+            STANDBY/*|Standby/*)
+            ocf_log warn "DB2 database $instance($db2node)/$db in status $hadr can never be promoted"
+            master_score -D -l reboot
+            ;;
+
+            *)
+            return $OCF_ERR_GENERIC
+        esac
+    done
+
+    # everything OK, return if running as slave
+    grep MASTER $STATE_FILE >/dev/null 2>&1 || return $OCF_SUCCESS
+
+    return $OCF_RUNNING_MASTER
+}
+
+#
+# Promote db to Primary
+#
+# Returns $OCF_SUCCESS after a successful takeover (or when already
+# primary), $OCF_ERR_GENERIC otherwise.
+#
+db2_promote() {
+    # validate ensured that dblist contains only one entry
+    local db=$dblist
+    local i hadr output force
+
+    # we run this twice as after a crash of the other node
+    # within HADR_TIMEOUT the status may be still reported as Peer
+    # although a connection no longer exists
+
+    for i in 1 2
+    do
+        hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC
+        ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be promoted"
+
+        case "$hadr" in
+            Standard/Standalone)
+            # this case only to keep ocf-tester happy
+            return $OCF_SUCCESS
+            ;;
+
+            PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED|Primary/Peer)
+            # nothing to do, only update pacemaker's view
+            echo MASTER > $STATE_FILE
+            return $OCF_SUCCESS
+            ;;
+
+            STANDBY/PEER/CONNECTED|Standby/Peer)
+            # must take over
+            ;;
+
+            STANDBY/*PEER/DISCONNECTED|Standby/DisconnectedPeer)
+            # must take over by force peer window only
+            force="by force peer window only"
+            ;;
+
+            # must take over by force
+            STANDBY/REMOTE_CATCHUP_PENDING/DISCONNECTED)
+            force="by force"
+            ;;
+
+            *)
+            return $OCF_ERR_GENERIC
+        esac
+
+        if output=$(runasdb2 db2 takeover hadr on db $db $force)
+        then
+            # update pacemaker's view
+            echo MASTER > $STATE_FILE
+
+            # turn the log so we rapidly get a new FAL
+            logasdb2 "db2 archive log for db $db"
+            return $OCF_SUCCESS
+        fi
+
+        case "$output" in
+            SQL1770N*"Reason code = \"7\""*)
+            # expected, HADR_TIMEOUT is now expired
+            # go for the second try
+            continue
+            ;;
+
+            *)
+            ocf_log err "DB2 database $instance($db2node)/$db promote failed: $output"
+            return $OCF_ERR_GENERIC
+        esac
+    done
+
+    return $OCF_ERR_GENERIC
+}
+
+#
+# Demote db to standby
+#
+# Only pacemaker's view (the state file) is changed; the function then
+# returns the result of db2_monitor.
+#
+db2_demote() {
+    # validate ensured that dblist contains only one entry
+    local db=$dblist
+    local hadr
+
+    # house keeping, set pacemaker's view to slave
+    echo SLAVE > $STATE_FILE
+
+    hadr=$(db2_hadr_status $dblist) || return $OCF_ERR_GENERIC
+    ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be demoted"
+
+    db2_monitor
+    return $?
+}
+
+#
+# handle pre start notification
+# We record our first active log on the other nodes.
+# If two primaries come up after a crash they can safely determine who is
+# the outdated one. 
+# +db2_notify() { + local node + + # only interested in pre-start + [ $OCF_RESKEY_CRM_meta_notify_type = pre \ + -a $OCF_RESKEY_CRM_meta_notify_operation = start ] || return $OCF_SUCESS + + # gets FIRST_ACTIVE_LOG + db2_get_cfg $dblist || return $? + + db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC + exit $OCF_SUCCESS +} + +######## +# Main # +######## +case "$__OCF_ACTION" in + meta-data) + db2_meta_data + exit $OCF_SUCCESS + ;; + + usage) + db2_usage + exit $OCF_SUCCESS + ;; + + start) + db2_validate + db2_start || exit $? + db2_monitor + exit $? + ;; + + stop) + db2_validate + db2_stop + exit $? + ;; + + promote) + db2_validate + db2_promote + exit $? + ;; + + demote) + db2_validate + db2_demote + exit $? + ;; + + notify) + db2_validate + db2_notify + exit $? + ;; + + monitor) + db2_validate + db2_monitor + exit $? + ;; + + validate-all) + db2_validate + exit $? + ;; + + *) + db2_usage + exit $OCF_ERR_UNIMPLEMENTED +esac diff --git a/heartbeat/dhcpd b/heartbeat/dhcpd new file mode 100755 index 0000000..4df4923 --- /dev/null +++ b/heartbeat/dhcpd @@ -0,0 +1,558 @@ +#!/bin/sh +# +# Resource Agent for managing dhcpd resources. +# +# License: GNU General Public License (GPL) +# (c) 2011-2012 Chris Bowlby, +# +# A fair amount of this script has been pulled from the official 0dhcpd +# init script. Those portions have been integrated into this script to +# ensure consistent behavior between the resource agent and the +# original script. The copyrights and original authors are credited +# as follows: +# +# Copyright (c) 1996, 1997, 1998 S.u.S.E. GmbH +# Copyright (c) 1998, 1999, 2000, 2001 SuSE GmbH +# Copyright (c) 2002, 2003 SuSE Linux AG +# Copyright (c) 2004-2008 SUSE LINUX Products GmbH, Nuernberg, Germany. 
+# +# Author(s) : Rolf Haberrecker <rolf@suse.de>, 1997-1999 +# Peter Poeml <poeml@suse.de>, 2000-2006 +# Marius Tomaschewski <mt@suse.de>, 2006-2010 +# +# and Linux-HA contributors + +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_binary_default="dhcpd" +OCF_RESKEY_pid_default="/var/run/dhcpd.pid" +OCF_RESKEY_user_default=dhcpd +OCF_RESKEY_group_default=nogroup +OCF_RESKEY_config_default="" +OCF_RESKEY_chrooted_default="true" +OCF_RESKEY_chrooted_path_default="/var/lib/dhcp" +OCF_RESKEY_leases_default="/db/dhcpd.leases" +OCF_RESKEY_interface_default="" +OCF_RESKEY_includes_default="" + +# On some systems, the chrooted default is slightly different. +# Lets do our best to support both by default. +if [ ! -d "$OCF_RESKEY_chrooted_path_default" ]; then + if [ -d "/var/lib/dhcpd" ]; then + OCF_RESKEY_chrooted_path_default="/var/lib/dhcpd" + fi +fi + +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} +: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +: ${OCF_RESKEY_group=${OCF_RESKEY_group_default}} +: ${OCF_RESKEY_chrooted=${OCF_RESKEY_chrooted_default}} +: ${OCF_RESKEY_chrooted_path=${OCF_RESKEY_chrooted_path_default}} +: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} +: ${OCF_RESKEY_leases=${OCF_RESKEY_leases_default}} +: ${OCF_RESKEY_interface=${OCF_RESKEY_interface_default}} +: ${OCF_RESKEY_includes=${OCF_RESKEY_includes_default}} + +# To enable support for different versions of dhcp, we need +# to know what version we are being run against. +DHCP_VERSION_MAJOR=`$OCF_RESKEY_binary --version 2>&1 | awk -F- '{print $3}' | awk -F. '{print $1}' | sed s/^[a-zA-Z]//g` + +# These files are always copied by default to ensure the chroot environment works. 
+DEFAULT_FILE_LIST="/etc/gai.conf /etc/nsswitch.conf /etc/resolv.conf /etc/host.conf /etc/hosts /etc/localtime /dev/urandom" + +usage() { + cat <<EOF + usage: $0 start|stop|monitor|meta-data|validate-all + + $0 manages the dhcp (dhcpd) server as an HA resource. + + The 'start' operation starts the dhcpd server. + The 'stop' operation stops the dhcpd server. + The 'restart' operation restarts the dhcpd server. + The 'monitor' operation reports whether the dhcpd service is running. + The 'validate-all' operation reports whether the parameters are valid. +EOF + return $OCF_SUCCESS +} + +dhcpd_meta_data() { + cat <<EOF +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="dhcpd" version="0.1"> + <version>1.0</version> + <longdesc lang="en"> +Manage an ISC DHCP server service in a chroot environment. + </longdesc> + <shortdesc lang="en">Chrooted ISC DHCP server resource agent.</shortdesc> + <parameters> + <parameter name="config" unique="1" required="1"> + <longdesc lang="en"> + The absolute path to the DHCP server configuration file. + </longdesc> + <shortdesc lang="en">Configuration file</shortdesc> + <content type="string" default="$OCF_RESKEY_config_default"/> + </parameter> + <parameter name="chrooted" unique="1" required="0"> + <longdesc lang="en"> + Configure the dhcpd service to run in a chrooted or non-chrooted + mode. + </longdesc> + <shortdesc lang="en">Enable chroot mode</shortdesc> + <content type="boolean" default="$OCF_RESKEY_chrooted_default"/> + </parameter> + <parameter name="chrooted_path" unique="1" required="0"> + <longdesc lang="en"> + The absolute path of the chrooted DHCP environment. + </longdesc> + <shortdesc lang="en">The chrooted path</shortdesc> + <content type="string" default="$OCF_RESKEY_chrooted_path_default"/> + </parameter> + <parameter name="binary" unique="0" required="0"> + <longdesc lang="en"> + The binary for the DHCP server process. 
An absolute path + definition is not required, but can be used to override + environment path. + </longdesc> + <shortdesc lang="en">dhcpd binary</shortdesc> + <content type="string" default="$OCF_RESKEY_binary_default"/> + </parameter> + <parameter name="user" unique="0" required="0"> + <longdesc lang="en"> + The system user the DHCP server process will run as when + it is chrooted. + </longdesc> + <shortdesc lang="en">dhcpd owner</shortdesc> + <content type="string" default="$OCF_RESKEY_user_default"/> + </parameter> + <parameter name="group" unique="0" required="0"> + <longdesc lang="en"> + The system group the DHCP server process will run as when + it is chrooted. + </longdesc> + <shortdesc lang="en">dhcpd group owner</shortdesc> + <content type="string" default="$OCF_RESKEY_group_default"/> + </parameter> + <parameter name="interface" unique="0" required="0"> + <longdesc lang="en"> + The network interface(s) the DHCP server process will + bind to. A blank value will bind the process to all + interfaces. + </longdesc> + <shortdesc lang="en">Network Interface</shortdesc> + <content type="string" default="$OCF_RESKEY_interface_default"/> + </parameter> + <parameter name="includes" unique="0" required="0"> + <longdesc lang="en"> + This parameter provides a means to copy include files + into the chrooted environment. If a dhcpd.conf file + contains a line similar to this: + + include "/etc/named.keys"; + + Then an admin also has to tell the dhcpd RA that this + file should be pulled into the chrooted environment. This + is a space delimited list. + </longdesc> + <shortdesc lang="en">Include files</shortdesc> + <content type="string" default="$OCF_RESKEY_includes_default"/> + </parameter> + <parameter name="leases" unique="0" required="0"> + <longdesc lang="en"> + The leases database file, relative to chrooted_path. 
+ </longdesc> + <shortdesc lang="en">Leases file</shortdesc> + <content type="string" default="$OCF_RESKEY_leases_default"/> + </parameter> + <parameter name="pid" unique="0" required="0"> + <longdesc lang="en"> + The path and filename of the PID file. It is relative + to chrooted_path. + </longdesc> + <shortdesc lang="en">PID file</shortdesc> + <content type="string" default="$OCF_RESKEY_pid_default"/> + </parameter> + </parameters> + <actions> + <action name="start" timeout="20s" /> + <action name="stop" timeout="20s" /> + <action name="restart" timeout="20s" /> + <action name="monitor" timeout="20s" interval="10s" depth="0" /> + <action name="meta-data" timeout="5s" /> + <action name="validate-all" timeout="20s" /> + </actions> +</resource-agent> +EOF +} + +# Validate most critical parameters +dhcpd_validate_all() { + check_binary $OCF_RESKEY_binary + + + if ! ocf_is_probe; then + # Test for the appropriate configuration files depending on if + # chroot mode is enabled. + if ocf_is_true $OCF_RESKEY_chrooted ; then + if ! test -e "$OCF_RESKEY_chrooted_path"; then + ocf_exit_reason "Path $OCF_RESKEY_chrooted_path does not exist." + return $OCF_ERR_INSTALLED + fi + + if test -n "$OCF_RESKEY_chrooted_path/$OCF_RESKEY_config" -a ! -r "$OCF_RESKEY_chrooted_path/$OCF_RESKEY_config"; then + ocf_exit_reason "Configuration file $OCF_RESKEY_chrooted_path/$OCF_RESKEY_config doesn't exist" + return $OCF_ERR_INSTALLED + fi + else + if test -n "$OCF_RESKEY_config" -a ! -r "$OCF_RESKEY_config"; then + ocf_exit_reason "Configuration file $OCF_RESKEY_config doesn't exist" + return $OCF_ERR_INSTALLED + fi + fi + + fi + + if ! getent passwd $OCF_RESKEY_user >/dev/null 2>&1; then + ocf_exit_reason "User $OCF_RESKEY_user doesn't exist" + return $OCF_ERR_INSTALLED + fi + + return $OCF_SUCCESS +} + +# dhcpd_monitor. Send a request to dhcpd and check response. 
+dhcpd_monitor() { + # Assume chrooted mode is being used, but if not update the PIDF + # variable to point to the non-chrooted PID file. + PIDF="$OCF_RESKEY_chrooted_path/$OCF_RESKEY_pid" + + if ! ocf_is_true $OCF_RESKEY_chrooted ; then + PIDF=`dirname $OCF_RESKEY_pid`/dhcpd/`basename $OCF_RESKEY_pid` + fi + + ocf_pidfile_status $PIDF >/dev/null 2>&1 || return $OCF_NOT_RUNNING + + return $OCF_SUCCESS +} + +# Initialize Chroot +dhcpd_initialize_chroot() { + # If we are running the initialization for the first time, we need to make + # the new chrooted folder, in case we are not using the same default. + if ! [ -d $OCF_RESKEY_chrooted_path ] ; then + ocf_log info "Initializing $OCF_RESKEY_chrooted_path for use." + fi + + # Make sure all sub-paths are created if something went wrong during + # a partial run. + for i in db dev etc lib64 var/run; do + mkdir -p $OCF_RESKEY_chrooted_path/$i + done + + # If we are running version 4 of the dhcp server, we need to mount a proc partition. + if [ $DHCP_VERSION_MAJOR -ge 4 ] ; then + mkdir -p $OCF_RESKEY_chrooted_path/proc + + if ! [ -e $OCF_RESKEY_chrooted_path/proc/net/dev ] ; then + mount -t proc -o ro proc $OCF_RESKEY_chrooted_path/proc > /dev/null 2>&1 + fi + fi + + # If the folder to store the PID file does not exist, make it. + if ! [ -d "$OCF_RESKEY_chrooted_path`dirname $OCF_RESKEY_pid`" ] ; then + mkdir -p "$OCF_RESKEY_chrooted_path`dirname $OCF_RESKEY_pid`" + fi + + # Ensure all permissions are in place if the folder was re-created. + chown -R $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_chrooted_path/`dirname $OCF_RESKEY_leases` + chown -R $OCF_RESKEY_user:$OCF_RESKEY_group "$OCF_RESKEY_chrooted_path/`dirname $OCF_RESKEY_pid`" + + ## If there is no conf file, we can't initialize the chrooted + ## environment, return with "program not configured" + if ! [ -f $OCF_RESKEY_config ] ; then + ocf_exit_reason "dhcpd has not been configured." 
+ return $OCF_ERR_CONFIGURED + fi + + # If the leases file does not exist, create it, as this is a fresh install. + if [ ! -e $OCF_RESKEY_chrooted_path/$OCF_RESKEY_leases ]; then + touch $OCF_RESKEY_chrooted_path/$OCF_RESKEY_leases + fi + + # Remove the random device. + test -e "$OCF_RESKEY_chrooted_path/dev/urandom" && + rm -f $OCF_RESKEY_chrooted_path/dev/urandom + + # Test for the existance of the defined include files, and append + # them to the list of files to be copied. + for i in $OCF_RESKEY_includes ; do + if [ -e $i ] ; then + DEFAULT_FILE_LIST="$DEFAULT_FILE_LIST $i" + else + ocf_exit_reason "include file $i does not exist" + return $OCF_ERR_INSTALLED + fi + done + + # Ensure all "modified" non-chrooted configuration files are copied into the chrooted environment. + for i in $OCF_RESKEY_config $DEFAULT_FILE_LIST; do + # First, lets make sure the directory exists within the chrooted environment. + if test -d "$i" ; then + mkdir -p $OCF_RESKEY_chrooted_path/$i + elif test -e "$i" ; then + mkdir -p "`dirname $OCF_RESKEY_chrooted_path/$i`" + fi + + # Next, we copy the configuration file into place. + cp -aL "$i" "$OCF_RESKEY_chrooted_path/${i%/*}/" > /dev/null 2>&1 || + { ocf_exit_reason "could not copy $i to chroot jail"; return $OCF_ERR_GENERIC; } + done + + libdir=$(basename $(echo ${OCF_RESKEY_chrooted_path}/lib*)) + if test -x /usr/bin/ldd ; then + get_ldd_deps() + { + ldd_wl="/$libdir/lib" + ldd_bl="/$libdir/libc\." 
+ /usr/bin/ldd "$1" | while read a b c d ; do + [ -n "$c" ] || continue + echo "$c" | grep -q "$ldd_wl" || continue + echo "$c" | grep -q "$ldd_bl" && continue + echo $c + done + } + else + get_ldd_deps() { :; } + fi + cplibs=`for i in /$libdir/libresolv.so.* /$libdir/libnss_*.so.* /$libdir/libpthread.so.0 /$libdir/libdl.so.2 + do + if [ -s "$i" ] ; then + echo "$i" + get_ldd_deps "$i" + fi + done | sort -u` + for i in $cplibs ; do + if [ -s "$i" ]; then + cp -aL "$i" "${OCF_RESKEY_chrooted_path}/$libdir/" || + { ocf_exit_reason "could not copy $i to chroot jail"; return $OCF_ERR_GENERIC; } + fi + done + + return $OCF_SUCCESS +} + +# Initialize a non-chroot environment +dhcpd_initialize() { + ## If there is no conf file, we can't start a dhcp service. + if ! [ -f $OCF_RESKEY_config ] ; then + ocf_exit_reason "dhcpd has not been configured." + return $OCF_ERR_CONFIGURED + fi + + # As with the standard DHCP init script, we can still use the + # chrooted default path for storing the leases file. This behavior + # is consistent with the existing /etc/init.d/dhcpd script. + if ! [ -d $OCF_RESKEY_chrooted_path ] ; then + ocf_log info "Initializing $OCF_RESKEY_chrooted_path for use." + fi + + # If the leases file does not exist, create it, as this is a fresh install. + if [ ! -e $OCF_RESKEY_chrooted_path/$OCF_RESKEY_leases ]; then + touch $OCF_RESKEY_chrooted_path/$OCF_RESKEY_leases + fi + + # if the PID storage path does not exist, make it, and setup the permissions. + + # NOTE: This part of the script has a potential security flaw, in that if someone + # puts in /var/run as the path, it will change ownership to the dhcpd user + # and group. However, all that would do is allow that user to view the contents + # of the files, which they can do now anyway. If this becomes an issue, I can work + # in some changes. 
+ + # We need to append "dhcpd" to the path for the PID file storage folder, because + # if /var/run is used, that folders permissions can not be changed, otherwise it affects + # more then just one application. + if ! [ -d `dirname $OCF_RESKEY_pid`/dhcpd ] ; then + mkdir -p `dirname $OCF_RESKEY_pid`/dhcpd + + if [ -n "$OCF_RESKEY_user" -a "x$OCF_RESKEY_user" != "xroot" ] ; then + chown $OCF_RESKEY_user `dirname $OCF_RESKEY_pid`/dhcpd + fi + + if [ -n "$OCF_RESKEY_group" -a "x$OCF_RESKEY_group" != "xwheel" ] ; then + chgrp $OCF_RESKEY_group `dirname $OCF_RESKEY_pid`/dhcpd + fi + fi + + return $OCF_SUCCESS +} + +# Start +dhcpd_start() { + # Lets make sure we are not already running. + if dhcpd_monitor; then + ocf_log info "dhcpd already running" + return $OCF_SUCCESS + fi + + # Only initialize the chrooted path(s) if chroot mode is enabled. + if ocf_is_true $OCF_RESKEY_chrooted ; then + dhcpd_initialize_chroot || + { ocf_exit_reason "Could not fully initialize the chroot environment." ; return $OCF_ERR_INSTALLED; } + else + dhcpd_initialize || + { ocf_exit_reason "Could not fully initialize the runtime environment." ; return $OCF_ERR_INSTALLED; } + fi + + dhcpd_validate_all || exit + + # Define an empty string variable, to ensure it exists when needed. + DHCPD_ARGS="" + + # To ensure consistent behavior with the standard DHCPD init script, + # use the chrooted default path for storing a leases file, when not in + # a chrooted enviroment. + if ocf_is_true $OCF_RESKEY_chrooted ; then + DHCPD_ARGS="$DHCPD_ARGS -chroot $OCF_RESKEY_chrooted_path -lf $OCF_RESKEY_leases" + else + DHCPD_ARGS="$DHCPD_ARGS -lf $OCF_RESKEY_chrooted_path/$OCF_RESKEY_leases" + fi + + if [ -n "$OCF_RESKEY_user" ]; then + DHCPD_ARGS="$DHCPD_ARGS -user $OCF_RESKEY_user" + fi + + if [ -n "$OCF_RESKEY_group" ]; then + DHCPD_ARGS="$DHCPD_ARGS -group $OCF_RESKEY_group" + fi + + # If there is a pid file containing a pid, the machine might have crashed. 
pid files in + # /var/run are always cleaned up at boot time, but this is not the case for the pid file in + # the chroot jail. Therefore, an old pid file may exist. This is only a problem if it + # incidentally contains the pid of a running process. If this process is not a 'dhcpd', + # we remove the pid. (dhcpd itself only checks whether the pid is alive or not.) + + PIDF="$OCF_RESKEY_chrooted_path/$OCF_RESKEY_pid" + + if ocf_is_true $OCF_RESKEY_chrooted ; then + ocf_log info "Starting dhcpd [chroot] service." + DHCPD_ARGS="$DHCPD_ARGS -pf $OCF_RESKEY_pid" + else + ocf_log info "Starting dhcpd [non-chroot] service." + PIDF=`dirname $OCF_RESKEY_pid`/dhcpd/`basename $OCF_RESKEY_pid` + DHCPD_ARGS="$DHCPD_ARGS -pf $PIDF" + fi + + test -e "$PIDF" && rm -f $PIDF + + ocf_run $OCF_RESKEY_binary -cf $OCF_RESKEY_config $DHCPD_ARGS $OCF_RESKEY_interface || + return $OCF_ERR_INSTALLED + + while ! dhcpd_monitor; do + sleep .1 + ocf_log info "waiting for dhcpd to start" + return $OCF_SUCCESS + done + + if ocf_is_true $OCF_RESKEY_chrooted ; then + ocf_log info "dhcpd [chrooted] has started." + else + ocf_log info "dhcpd [non-chrooted] has started." + fi + + return $OCF_SUCCESS +} + +# Stop +dhcpd_stop () { + local timeout + local timewait + local rc + + dhcpd_monitor + rc=$? + + case "$rc" in + "$OCF_SUCCESS") + # Currently running, and is expected behaviour. + ;; + "$OCF_NOT_RUNNING") + # Currently not running, therefore nothing to do. + ocf_log info "dhcpd already stopped" + return $OCF_SUCCESS + ;; + esac + + PIDF="$OCF_RESKEY_chrooted_path/$OCF_RESKEY_pid" + + if ! ocf_is_true $OCF_RESKEY_chrooted ; then + PIDF=`dirname $OCF_RESKEY_pid`/dhcpd/`basename $OCF_RESKEY_pid` + fi + + kill `cat $PIDF` + + # Allow 2/3 of the action timeout for the orderly shutdown + # (The origin unit is ms, hence the conversion) + timewait=$((OCF_RESKEY_CRM_meta_timeout/1500)) + + sleep 0.1; timeout=0 # Sleep here for .1 sec to let dhcpd finish. 
+ while dhcpd_monitor ; do + if [ $timeout -ge $timewait ]; then + break + else + sleep 1 + timeout=`expr $timeout + 1` + fi + done + + #If still up + if dhcpd_monitor 2>&1; then + ocf_log notice "dhcpd is still up! Trying kill -s KILL" + + kill -s SIGKILL `cat $PIDF` + fi + + # If we are running a dhcp server v4 or higher, unmount the proc partition. + if [ $DHCP_VERSION_MAJOR -ge 4 ] ; then + # We only want to unmount proc in a chrooted environment, else we could + # cause other issues. + if ocf_is_true $OCF_RESKEY_chrooted ; then + umount $OCF_RESKEY_chrooted_path/proc > /dev/null 2>&1 + fi + fi + + rm -f $PIDF + + ocf_log info "dhcpd stopped" + return $OCF_SUCCESS +} + +# Make sure meta-data and usage always succeed +case $__OCF_ACTION in +meta-data) dhcpd_meta_data + exit $OCF_SUCCESS + ;; +validate-all) dhcpd_validate_all + exit $OCF_SUCCESS + ;; +usage|help) dhcpd_usage + exit $OCF_SUCCESS + ;; +esac + +# Translate each action into the appropriate function call +case $__OCF_ACTION in +start) dhcpd_start;; +stop) dhcpd_stop;; +restart) dhcpd_stop + dhcpd_start + ;; +monitor) dhcpd_monitor;; +*) dhcpd_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac diff --git a/heartbeat/dnsupdate.in b/heartbeat/dnsupdate.in new file mode 100755 index 0000000..b54822c --- /dev/null +++ b/heartbeat/dnsupdate.in @@ -0,0 +1,381 @@ +#!@BASH_SHELL@ +# +# +# Support: users@clusterlabs.org +# License: GNU General Public License v2 +# +# Copyright (c) 2014 SUSE Linux Products GmbH, Lars Marowsky-Brée +# All Rights Reserved. +# +####################################################################### +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_hostname_default="" +OCF_RESKEY_type_default="A" +OCF_RESKEY_ip_default="" +OCF_RESKEY_cname_default="" +OCF_RESKEY_ttl_default="300" +OCF_RESKEY_keyfile_default="" +OCF_RESKEY_server_default="" +OCF_RESKEY_serverport_default="53" +OCF_RESKEY_nsupdate_opts_default="" +OCF_RESKEY_unregister_on_stop_default="false" + +: ${OCF_RESKEY_hostname=${OCF_RESKEY_hostname_default}} +: ${OCF_RESKEY_cname=${OCF_RESKEY_cname_default}} +: ${OCF_RESKEY_type=${OCF_RESKEY_type_default}} +: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}} +: ${OCF_RESKEY_ttl=${OCF_RESKEY_ttl_default}} +: ${OCF_RESKEY_keyfile=${OCF_RESKEY_keyfile_default}} +: ${OCF_RESKEY_server=${OCF_RESKEY_server_default}} +: ${OCF_RESKEY_serverport=${OCF_RESKEY_serverport_default}} +: ${OCF_RESKEY_nsupdate_opts=${OCF_RESKEY_nsupdate_opts_default}} +: ${OCF_RESKEY_unregister_on_stop=${OCF_RESKEY_unregister_on_stop_default}} + +####################################################################### + +# TODO: +# - Should multiple A records be supported? + +usage() { + cat <<-! + usage: $0 {start|stop|status|monitor|meta-data|validate-all} + ! +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="dnsupdate" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This resource agent manages IP take-over via dynamic DNS updates. +</longdesc> +<shortdesc lang="en">IP take-over via dynamic DNS update</shortdesc> + +<parameters> + +<parameter name="hostname" unique="1" required="1"> +<longdesc lang="en"> +Either the hostname whose IP address will need to be updated (in case of type=A) +or alias whose hostname will need to be updated (in case of type=CNAME). 
+</longdesc> +<shortdesc lang="en">Hostname to update</shortdesc> +<content type="string" default="${OCF_RESKEY_hostname_default}" /> +</parameter> + +<parameter name="type" unique="0" required="0"> +<longdesc lang="en"> +The type of DNS record that need to be updated (A or CNAME). +</longdesc> +<shortdesc lang="en">Type of DNS record</shortdesc> +<content type="string" default="${OCF_RESKEY_type_default}" /> +</parameter> + +<parameter name="ip" unique="0" required="0"> +<longdesc lang="en"> +IP address to set. +</longdesc> +<shortdesc lang="en">IP address to set</shortdesc> +<content type="string" default="${OCF_RESKEY_ip_default}" /> +</parameter> + +<parameter name="cname" unique="0" required="0"> +<longdesc lang="en"> +The CNAME whose hostname address will need to be updated. +</longdesc> +<shortdesc lang="en">CNAME to update</shortdesc> +<content type="string" default="${OCF_RESKEY_cname_default}" /> +</parameter> + +<parameter name="ttl" unique="0" required="0"> +<longdesc lang="en"> +Time to live, in seconds, for the DNS record. This +affects how soon DNS updates propagate. It should be +a reasonable compromise between update speed and DNS +server load. + +If using booth, the ticket timeout is a good start. +</longdesc> +<shortdesc lang="en">TTL for the DNS record</shortdesc> +<content type="integer" default="${OCF_RESKEY_ttl_default}" /> +</parameter> + +<parameter name="keyfile" unique="0" required="0"> +<longdesc lang="en"> +The file containing the shared secret needed to update +the DNS record. Please see the nsupdate man page for +the exact syntax. +</longdesc> +<shortdesc lang="en">nsupdate key file</shortdesc> +<content type="string" default="${OCF_RESKEY_keyfile_default}" /> +</parameter> + +<parameter name="server" unique="0" required="0"> +<longdesc lang="en"> +Which DNS server to send these updates for. When no +server is provided, this defaults to the promoted server +for the correct zone. 
+</longdesc> +<shortdesc lang="en">DNS server to contact</shortdesc> +<content type="string" default="${OCF_RESKEY_server_default}" /> +</parameter> + +<parameter name="serverport" unique="0" required="0"> +<longdesc lang="en"> +Port number on the DNS server. + +Note: due to a limitation in the nsupdate command, this option will only +take effect if you also specify the DNS server! +</longdesc> +<shortdesc lang="en">Port number on the DNS server</shortdesc> +<content type="integer" default="${OCF_RESKEY_serverport_default}" /> +</parameter> + +<parameter name="nsupdate_opts" unique="0" required="0"> +<longdesc lang="en"> +Additional options to be passed to nsupdate. +</longdesc> +<shortdesc lang="en">Additional nsupdate options</shortdesc> +<content type="string" default="${OCF_RESKEY_nsupdate_opts_default}" /> +</parameter> + +<parameter name="unregister_on_stop" unique="0" required="0"> +<longdesc lang="en"> +Whether or not to actively remove records on stop. This is not needed +for normal operation, since the site taking over the IP address will +delete all previous records. +</longdesc> +<shortdesc lang="en">Remove A record on stop</shortdesc> +<content type="boolean" default="${OCF_RESKEY_unregister_on_stop_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="30s" /> +<action name="stop" timeout="30s" /> +<action name="status" depth="0" timeout="30s" interval="10s" /> +<action name="monitor" depth="0" timeout="30s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +END +} + +dnsupdate_status() { + case $type in + A) + # The resource is considered active if the current IP + # address is returned as the only response. + local record=$(dig ${dig_opts} ${hostname}. A +short 2>/dev/null) + if [ "$record" = "$ip" ]; then + return $OCF_SUCCESS + fi + return $OCF_NOT_RUNNING + ;; + CNAME) + local record=$(dig ${dig_opts} ${cname}. 
CNAME +short 2>/dev/null) + if [ "$record" = "${hostname}." ]; then + return $OCF_SUCCESS + fi + return $OCF_NOT_RUNNING + ;; + esac + +} + +dnsupdate_monitor() { + if ocf_is_probe ; then + # + return $OCF_NOT_RUNNING + fi + dnsupdate_status +} + +dnsupdate_start() { + case $type in + A) + if dnsupdate_status ; then + ocf_log info "$hostname already resolves to $ip" + return $OCF_SUCCESS + fi + + ocf_log info "Updating DNS records for $hostname" + + ( + if [ -n "$dns_server" ]; then + echo "server ${dns_server} ${dns_serverport}" + fi + echo "update delete $hostname A" + echo "update add $hostname ${OCF_RESKEY_ttl} A $ip" + echo "send" + ) | nsupdate ${nsupdate_opts} + ;; + + CNAME) + if dnsupdate_status ; then + ocf_log info "$cname already is an alias to $hostname" + return $OCF_SUCCESS + fi + + ocf_log info "Updating DNS records for $cname" + + ( + if [ -n "$dns_server" ]; then + echo "server ${dns_server} ${dns_serverport}" + fi + echo "update delete $cname CNAME" + echo "update add $cname ${OCF_RESKEY_ttl} CNAME $hostname" + echo "send" + ) | nsupdate ${nsupdate_opts} + ;; + + esac + + + dnsupdate_monitor + + return $? +} + +dnsupdate_stop() { + case $type in + A) + if ocf_is_true "${OCF_RESKEY_unregister_on_stop}" && dnsupdate_status ; then + ocf_log info "Unregistering $hostname with $ip from DNS server" + ( + if [ -n "$dns_server" ]; then + echo "server ${dns_server} ${dns_serverport}" + fi + echo "update delete $hostname A $ip" + echo "send" + ) | nsupdate ${nsupdate_opts} + + dnsupdate_monitor + if [ $? -ne $OCF_NOT_RUNNING ]; then + ocf_log warn "Unregistering failed!" 
+ fi + fi + return $OCF_SUCCESS + ;; + CNAME) + if ocf_is_true "${OCF_RESKEY_unregister_on_stop}" && dnsupdate_status ; then + ocf_log info "Unregistering $cname with $hostname from DNS server" + ( + if [ -n "$dns_server" ]; then + echo "server ${dns_server} ${dns_serverport}" + fi + echo "update delete $cname CNAME" + echo "send" + ) | nsupdate ${nsupdate_opts} + + dnsupdate_monitor + if [ $? -ne $OCF_NOT_RUNNING ]; then + ocf_log warn "Unregistering failed!" + fi + fi + return $OCF_SUCCESS + ;; + esac +} + +dnsupdate_validate() { + hostname=${OCF_RESKEY_hostname} + ip=${OCF_RESKEY_ip} + #added support for CNAME + type=${OCF_RESKEY_type} + cname=${OCF_RESKEY_cname} + # + dig_opts="" + dns_server=${OCF_RESKEY_server} + : ${OCF_RESKEY_serverport:="53"} + dns_serverport=${OCF_RESKEY_serverport} + : ${OCF_RESKEY_ttl:="300"} + nsupdate_opts=${OCF_RESKEY_nsupdate_opts} + if [ -z "$nsupdate_opts" -a -n "$OCF_RESKEY_opts" ]; then + nsupdate_opts=${OCF_RESKEY_opts} + ocf_log warn "opts was never an advertised parameter, please use nsupdate_opts" + fi + + if [ -z "$hostname" ]; then + ocf_log err "No hostname specified." + exit $OCF_ERR_CONFIGURED + fi + if [ -z "$ip" ] && [ "$type" = "A" ]; then + ocf_log err "No IP specified." + exit $OCF_ERR_CONFIGURED + fi + #added support for CNAME + if [ -z "$type" ]; then + ocf_log err "No TYPE specified." + exit $OCF_ERR_CONFIGURED + fi + # + if ! ocf_is_decimal $OCF_RESKEY_ttl ; then + ocf_log err "ttl $OCF_RESKEY_ttl is not valid" + exit $OCF_ERR_CONFIGURED + fi + + if ! ocf_is_decimal $dns_serverport ; then + ocf_log err "serverport $dns_serverport is not valid" + exit $OCF_ERR_CONFIGURED + fi + dig_opts+=" -p ${dns_serverport}" + + if [ -n "$dns_server" ]; then + dig_opts+=" @${dns_server}" + fi + + if [ -n "$OCF_RESKEY_keyfile" ]; then + if [ ! 
-f ${OCF_RESKEY_keyfile} ]; then + ocf_log err "keyfile $OCF_RESKEY_keyfile does not exist" + exit $OCF_ERR_CONFIGURED + fi + nsupdate_opts+=" -k $OCF_RESKEY_keyfile" + fi +} + +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi + +case $1 in +meta-data) meta_data + exit $OCF_SUCCESS + ;; +usage) usage + exit $OCF_SUCCESS + ;; +esac + +check_binary dig +check_binary nsupdate + +dnsupdate_validate + +case $1 in +start) dnsupdate_start + ;; +stop) dnsupdate_stop + ;; +monitor) dnsupdate_monitor + ;; +status) dnsupdate_status + ;; +validate-all) # We've already run this + exit $OCF_SUCCESS + ;; +*) usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +exit $? + diff --git a/heartbeat/docker b/heartbeat/docker new file mode 100755 index 0000000..2adcade --- /dev/null +++ b/heartbeat/docker @@ -0,0 +1,605 @@ +#!/bin/sh +# +# The docker HA resource agent creates and launches a docker container +# based off a supplied docker image. Containers managed by this agent +# are both created and removed upon the agent's start and stop actions. +# +# Copyright (c) 2014 David Vossel <davidvossel@gmail.com> +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_reuse_default="0" + +: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}} + +####################################################################### + +meta_data() +{ + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="docker" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +The docker HA resource agent creates and launches a docker container +based off a supplied docker image. Containers managed by this agent +are both created and removed upon the agent's start and stop actions. +</longdesc> +<shortdesc lang="en">Docker container resource agent.</shortdesc> + +<parameters> +<parameter name="image" required="1" unique="0"> +<longdesc lang="en"> +The docker image to base this container off of. +</longdesc> +<shortdesc lang="en">docker image</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="name" required="0" unique="0"> +<longdesc lang="en"> +The name to give the created container. By default this will +be that resource's instance name. +</longdesc> +<shortdesc lang="en">docker container name</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="allow_pull" unique="0"> +<longdesc lang="en"> +Allow the image to be pulled from the configured docker registry when +the image does not exist locally. NOTE, this can drastically increase +the time required to start the container if the image repository is +pulled over the network. 
+</longdesc> +<shortdesc lang="en">Allow pulling non-local images</shortdesc> +<content type="boolean"/> +</parameter> + +<parameter name="run_opts" required="0" unique="0"> +<longdesc lang="en"> +Add options to be appended to the 'docker run' command which is used +when creating the container during the start action. This option allows +users to do things such as setting a custom entry point and injecting +environment variables into the newly created container. Note the '-d' +option is supplied regardless of this value to force containers to run +in the background. + +NOTE: Do not explicitly specify the --name argument in the run_opts. This +agent will set --name using either the resource's instance or the name +provided in the 'name' argument of this agent. + +</longdesc> +<shortdesc lang="en">run options</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="run_cmd" required="0" unique="0"> +<longdesc lang="en"> +Specify a command to launch within the container once +it has initialized. +</longdesc> +<shortdesc lang="en">run command</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="mount_points" required="0" unique="0"> +<longdesc lang="en"> +A comma separated list of directories that the container is expecting to use. +The agent will ensure they exist by running 'mkdir -p' +</longdesc> +<shortdesc lang="en">Required mount points</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="monitor_cmd" required="0" unique="0"> +<longdesc lang="en"> +Specify the full path of a command to launch within the container to check +the health of the container. This command must return 0 to indicate that +the container is healthy. A non-zero return code will indicate that the +container has failed and should be recovered. + +If 'docker exec' is supported, it is used to execute the command. If not, +nsenter is used. 
+ +Note: Using this method for monitoring processes inside a container +is not recommended, as containerd tries to track processes running +inside the container and does not deal well with many short-lived +processes being spawned. Ensure that your container monitors its +own processes and terminates on fatal error rather than invoking +a command from the outside. +</longdesc> +<shortdesc lang="en">monitor command</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="force_kill" required="0" unique="0"> +<longdesc lang="en"> +Kill a container immediately rather than waiting for it to gracefully +shutdown +</longdesc> +<shortdesc lang="en">force kill</shortdesc> +<content type="boolean"/> +</parameter> + +<parameter name="reuse" required="0" unique="0"> +<longdesc lang="en"> +Allow the container to be reused once it is stopped. By default, +containers get removed once they are stopped. Enable this option +to have the particular one persist when this happens. +</longdesc> +<shortdesc lang="en">reuse container</shortdesc> +<content type="boolean" default="${OCF_RESKEY_reuse_default}"/> +</parameter> + +<parameter name="query_docker_health" required="0" unique="0"> +<longdesc lang="en"> +Query the builtin healthcheck of docker (v1.12+) to determine health of the +container. If left empty or set to false it will not be used. + +The healthcheck itself has to be configured within docker, e.g. via +HEALTHCHECK in Dockerfile. This option just queries in what condition +docker considers the container to be and lets ocf do its thing accordingly. + +Note that the time a container is in "starting" state counts against the +monitor timeout. + +This is an additional check besides the standard check for the container +to be running, and the optional monitor_cmd check. It doesn't disable or +override them, so all of them (if used) have to come back healthy for the +container to be considered healthy. 
+</longdesc> +<shortdesc lang="en">use healthcheck</shortdesc> +<content type="boolean"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="90s" /> +<action name="stop" timeout="90s" /> +<action name="monitor" timeout="30s" interval="30s" depth="0" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="30s" /> +</actions> +</resource-agent> +END +} + +####################################################################### +REQUIRE_IMAGE_PULL=0 + +docker_usage() +{ + cat <<END +usage: $0 {start|stop|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + + +monitor_cmd_exec() +{ + local rc=$OCF_SUCCESS + local out + + if [ -z "$OCF_RESKEY_monitor_cmd" ]; then + return $rc + fi + + if docker exec --help >/dev/null 2>&1; then + out=$(docker exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1) + rc=$? + else + out=$(echo "$OCF_RESKEY_monitor_cmd" | nsenter --target $(docker inspect --type=container --format {{.State.Pid}} ${CONTAINER}) --mount --uts --ipc --net --pid 2>&1) + rc=$? + fi + + if [ $rc -eq 127 ]; then + ocf_log err "monitor cmd failed (rc=$rc), output: $out" + ocf_exit_reason "monitor_cmd, ${OCF_RESKEY_monitor_cmd} , not found within container." + # there is no recovering from this, exit immediately + exit $OCF_ERR_ARGS + elif [ $rc -ne 0 ]; then + ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out" + rc=$OCF_ERR_GENERIC + else + ocf_log debug "monitor cmd passed: exit code = $rc" + fi + + return $rc +} + +container_exists() +{ + local err + + err=$(docker inspect --type=container $CONTAINER 2>&1 >/dev/null) + + if [ $? 
# Delete the (stopped) container unless 'reuse' is enabled or the container
# does not exist.  The exit status is that of "docker rm" when it runs,
# otherwise 0.
remove_container()
{
    # with reuse enabled the container must survive a stop
    ocf_is_true "$OCF_RESKEY_reuse" && return 0

    # nothing to delete when the container is already gone
    container_exists || return 0

    ocf_log notice "Cleaning up inactive container, ${CONTAINER}."
    ocf_run docker rm $CONTAINER
}

# Cheap liveness check based on the docker client and "docker inspect".
#
# Returns OCF_ERR_INSTALLED when no docker executable is found,
# OCF_ERR_GENERIC when "docker version" exits with 1, OCF_SUCCESS when the
# container reports State.Running as true, and OCF_NOT_RUNNING otherwise.
# Leaves VERSION_OUT and version_ret set as globals.
docker_simple_status()
{
    local running

    if [ ! -x "$(command -v docker)" ]; then
        ocf_exit_reason "docker is not installed on this host"
        return $OCF_ERR_INSTALLED
    fi

    # the daemon has to answer before the container can be inspected
    VERSION_OUT=$(docker version)
    version_ret=$?
    if [ $version_ret -eq 1 ]; then
        ocf_exit_reason "Docker service is in error state while checking for ${CONTAINER}, based on image, ${OCF_RESKEY_image}: ${VERSION_OUT}"
        return $OCF_ERR_GENERIC
    fi

    container_exists || return $OCF_NOT_RUNNING

    # a failing inspect means the container vanished in the meantime
    running=$(docker inspect --type=container --format {{.State.Running}} $CONTAINER 2>/dev/null) \
        || return $OCF_NOT_RUNNING

    # container exists; report success only if it is actually running
    ocf_is_true "$running" && return $OCF_SUCCESS

    return $OCF_NOT_RUNNING
}

# Evaluate the docker HEALTHCHECK state when query_docker_health is enabled.
#
# Polls once per second while the reported health is "starting"; the loop
# has no own limit and relies on the operation timeout of the caller.
# Returns OCF_SUCCESS for "healthy", OCF_NOT_RUNNING for any other state or
# a missing container, and 0 when the feature is disabled.
docker_health_status()
{
    local health

    ocf_is_true "$OCF_RESKEY_query_docker_health" || return 0

    container_exists || return $OCF_NOT_RUNNING

    while true; do
        # an inspect failure means the container is no longer there
        health=$(docker inspect --type=container --format {{.State.Health.Status}} $CONTAINER 2>/dev/null) \
            || return $OCF_NOT_RUNNING
        [ "$health" = "starting" ] || break
        sleep 1
    done

    if [ "$health" = "healthy" ]; then
        return $OCF_SUCCESS
    fi

    return $OCF_NOT_RUNNING
}



# Full monitor: simple status, then health status, then monitor_cmd.
# The first failing stage decides the return code.
docker_monitor()
{
    docker_simple_status || return $?
    docker_health_status || return $?
    monitor_cmd_exec
}

# Create every directory named in the comma separated mount_points list.
# IFS is switched to "," for the loop and restored afterwards.
docker_create_mounts() {
    oldIFS="$IFS"
    IFS=","
    for directory in $OCF_RESKEY_mount_points
    do
        mkdir -p "$directory"
    done
    IFS="$oldIFS"
}
# Stop the container and, unless 'reuse' is set, remove it.
#
# Returns OCF_SUCCESS when the container is stopped (or was not running),
# OCF_ERR_GENERIC when the status check, the stop/kill or the removal fails.
docker_stop()
{
    local timeout=60
    local ret

    docker_simple_status
    ret=$?
    if [ $ret -eq $OCF_NOT_RUNNING ]; then
        # already down; only the left-over container needs cleaning up
        remove_container
        return $OCF_SUCCESS
    elif [ $ret -eq $OCF_ERR_GENERIC ]; then
        return $OCF_ERR_GENERIC
    fi

    # Derive the grace period from the operation timeout (milliseconds),
    # keeping 10 seconds in reserve and never going below 10 seconds.
    if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
        timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000) -10 ))
        if [ $timeout -lt 10 ]; then
            timeout=10
        fi
    fi

    if ocf_is_true "$OCF_RESKEY_force_kill"; then
        ocf_run docker kill $CONTAINER
    else
        ocf_log debug "waiting $timeout second[s] before killing container"
        ocf_run docker stop -t=$timeout $CONTAINER
    fi

    if [ $? -ne 0 ]; then
        ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
        return $OCF_ERR_GENERIC
    fi

    remove_container
    if [ $? -ne 0 ]; then
        ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Check whether the configured image is listed by "docker images".
#
# The image parameter is split into an optional server/repository prefix,
# the image name and the tag (default "latest").  Returns 0 when the image
# is present locally, or when it is missing but allow_pull is enabled (in
# which case REQUIRE_IMAGE_PULL is set to 1); returns 1 otherwise.
image_exists()
{
    # if no tag was specified, use default "latest"
    local COLON_FOUND=0
    local SLASH_FOUND=0
    local SERVER_NAME=""
    local IMAGE_NAME="${OCF_RESKEY_image}"
    local IMAGE_TAG="latest"

    # number of '/' characters in the image specification
    SLASH_FOUND="$(echo "${OCF_RESKEY_image}" | grep -o '/' | grep -c .)"

    if [ ${SLASH_FOUND} -ge 1 ]; then
        # everything before the last '/' is the server/repository part
        SERVER_NAME="$(echo ${IMAGE_NAME} | cut -d / -f 1-${SLASH_FOUND})"
        IMAGE_NAME="$(echo ${IMAGE_NAME} | awk -F'/' '{print $NF}')"
    fi

    COLON_FOUND="$(echo "${IMAGE_NAME}" | grep -o ':' | grep -c .)"
    if [ ${COLON_FOUND} -ge 1 ]; then
        # the text after the last ':' is the tag
        IMAGE_TAG="$(echo ${IMAGE_NAME} | awk -F':' '{print $NF}')"
        IMAGE_NAME="$(echo ${IMAGE_NAME} | cut -d : -f 1-${COLON_FOUND})"
    fi

    # IMAGE_NAME might be following formats:
    # - image
    # - repository:port/image
    # - docker.io/image (some distro will display "docker.io/" as prefix)
    #
    # "grep -E" replaces the obsolete egrep, and the dot of "docker.io" is
    # escaped so that it only matches a literal dot.
    # NOTE(review): server, image and tag are still interpolated into the
    # pattern unescaped, so regex metacharacters in them (e.g. '.') keep
    # their special meaning.
    docker images | awk '{print $1 ":" $2}' | grep -E -q -s "^(docker\.io/|${SERVER_NAME}/)?${IMAGE_NAME}:${IMAGE_TAG}\$"
    if [ $? -eq 0 ]; then
        # image found
        return 0
    fi

    if ocf_is_true "$OCF_RESKEY_allow_pull"; then
        REQUIRE_IMAGE_PULL=1
        ocf_log notice "Image (${OCF_RESKEY_image}) does not exist locally but will be pulled during start"
        return 0
    fi
    # image not found.
    return 1
}

# Validate the environment and the configuration of the resource.
#
# Exits with OCF_ERR_CONFIGURED when the image parameter is missing or the
# image cannot be found/pulled; check_binary handles missing executables.
# Returns OCF_SUCCESS otherwise.
docker_validate()
{
    check_binary docker
    if [ -z "$OCF_RESKEY_image" ]; then
        ocf_exit_reason "'image' option is required"
        exit $OCF_ERR_CONFIGURED
    fi

    if [ -n "$OCF_RESKEY_monitor_cmd" ]; then
        # Without "docker exec" the monitor command is run through nsenter
        # (see monitor_cmd_exec), so nsenter must be installed.  The former
        # test "[ ! $? ]" was a string test that could never be true.
        if ! docker exec --help >/dev/null 2>&1; then
            ocf_log info "checking for nsenter, which is required when 'monitor_cmd' is specified"
            check_binary nsenter
        fi
    fi

    image_exists
    if [ $? -ne 0 ]; then
        ocf_exit_reason "base image, ${OCF_RESKEY_image}, could not be found."
        exit $OCF_ERR_CONFIGURED
    fi

    return $OCF_SUCCESS
}

# TODO :
# When a user starts plural clones in a node in globally-unique, a user cannot appoint plural name parameters.
# When a user appoints reuse, the resource agent cannot connect plural clones with a container.

# Derive the container name.  For globally unique clones an explicit name
# is only acceptable when at most one instance runs per node; without an
# explicit name the instance name is used, with ':' replaced by '-'.
if ocf_is_true "$OCF_RESKEY_CRM_meta_globally_unique"; then
    if [ -n "$OCF_RESKEY_name" ]; then
        if [ -n "$OCF_RESKEY_CRM_meta_clone_node_max" ] && [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ]
        then
            ocf_exit_reason "Cannot make plural clones from the same name parameter."
            exit $OCF_ERR_CONFIGURED
        fi
        if [ -n "$OCF_RESKEY_CRM_meta_master_node_max" ] && [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ]
        then
            ocf_exit_reason "Cannot make plural master from the same name parameter."
            exit $OCF_ERR_CONFIGURED
        fi
    fi
    : ${OCF_RESKEY_name=$(echo ${OCF_RESOURCE_INSTANCE} | tr ':' '-')}
else
    : ${OCF_RESKEY_name=${OCF_RESOURCE_INSTANCE}}
fi

if [ -n "$OCF_RESKEY_container" ]; then
    # we'll keep the container attribute around for a bit in order not to break
    # any existing deployments. The 'name' attribute is preferred now though.
    CONTAINER=$OCF_RESKEY_container
    ocf_log warn "The 'container' attribute is deprecated"
else
    CONTAINER=$OCF_RESKEY_name
fi

# Dispatch the requested action; meta-data and usage exit directly, every
# other action falls through so that its return code is logged below.
case $__OCF_ACTION in
meta-data)
    meta_data
    exit $OCF_SUCCESS
    ;;
start)
    docker_validate
    docker_start
    ;;
stop)
    docker_stop
    ;;
monitor)
    docker_monitor
    ;;
validate-all)
    docker_validate
    ;;
usage|help)
    docker_usage
    exit $OCF_SUCCESS
    ;;
*)
    docker_usage
    exit $OCF_ERR_UNIMPLEMENTED
    ;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
A YAML file (docker-compose.yml) and an optional Dockerfile +# must be provided in a working directory. +# 2. It is suggested to test run the docker-compose and verify +# on all cluster nodes before enabling this agent. +# +# docker-compose OCF script's Author: Kenny Chen <netman@study-area.org> +# License: GNU General Public License (GPL) +# +# usage: $0 {start|stop|status|monitor|validate-all|meta-data} +# +# The "start" arg starts docker service. +# The "stop" arg stops it. +# +# OCF parameters: +# OCF_RESKEY_binpath +# OCF_RESKEY_dirpath +# OCF_RESKEY_ymlfile +# +########################################################################## +# Initialization: + +: ${OCF_ROOT:=/usr/lib/ocf} +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_binpath_default=/usr/bin/docker-compose +OCF_RESKEY_ymlfile_default=docker-compose.yml +: ${OCF_RESKEY_binpath=${OCF_RESKEY_binpath_default}} +: ${OCF_RESKEY_ymlfile=${OCF_RESKEY_ymlfile_default}} + +USAGE="Usage: $0 {start|stop|status|monitor|validate-all|meta-data}" + +########################################################################## + +usage() +{ + echo $USAGE >&2 +} + +meta_data() +{ +cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="docker-compose" version="1.0.3"> +<version>1.0</version> +<longdesc lang="en"> +Manages docker services using docker-compose as an OCF resource in an High Availability setup. +It relies on a well-tested docker compose YAML file which distributed on an identical location on all cluster nodes. + +Caveat: 1. A YAML file (docker-compose.yml) and an optional Dockerfile + must be provided in a working directory. + 2. It is suggested to test run the docker-compose and verify on all cluster nodes + before enabling this agent. 
# Use the configured docker-compose binary when it is readable and
# executable, otherwise fall back to the default path.
if [ -r "$OCF_RESKEY_binpath" -a -x "$OCF_RESKEY_binpath" ]; then
    COMMAND="$OCF_RESKEY_binpath"
else
    COMMAND="$OCF_RESKEY_binpath_default"
fi

DIR="$OCF_RESKEY_dirpath"
# Container name prefix: last path component of DIR, lower-cased and
# stripped of everything but a-z and 0-9.
PRE="$(echo ${DIR##*/} | tr A-Z a-z | sed 's/[^a-z0-9]//g')"
YML="$OCF_RESKEY_ymlfile"

# Kill and remove every container whose name matches <PRE>_*_<number>.
#
# Sets RTV=false when a removal fails and returns OCF_ERR_GENERIC in that
# case; sets RUN=false when all matching containers are gone.
docker_kill()
{
    # start from a clean flag so an inherited or stale value cannot turn a
    # successful run into a failure
    RTV=true

    for i in $(docker ps --all | awk -e '$NF ~ /\<'"${PRE}"'_.*_[0-9]+\>/ {print $1}'); do
        docker kill $i >/dev/null 2>&1
        docker rm $i >/dev/null 2>&1 || RTV=false
    done
    if [ "$RTV" = false ]; then
        ocf_log err "failed to kill docker"
        return $OCF_ERR_GENERIC
    else
        RUN=false
    fi
}

# Report the state of the compose project.
#
# Returns OCF_SUCCESS when every container of the project is running,
# OCF_NOT_RUNNING when none is running (or none exists), and
# OCF_ERR_GENERIC when only some are running or stray containers could not
# be removed.
docker_compose_status()
{
    # reset the per-call counters and flag; they are plain globals and were
    # previously never initialised
    PSNU=0
    UPNU=0
    RUN=true

    # use docker-compose ps if YML found, otherwise try docker ps and kill containers
    if [ -r "$DIR/$YML" ]; then
        # only query compose when the working directory could be entered
        DKPS=$(cd "$DIR" && "$COMMAND" -f "$YML" ps -q)

        # get number of all containers
        [ -n "$DKPS" ] && PSNU=$(echo "$DKPS" | wc -l)
        # get number of running containers
        for UUID in $DKPS; do
            UP=$(docker inspect --format='{{.State.Running}}' "$UUID")
            [ "$UP" = "true" ] && UPNU=$((UPNU+1))
        done

        if [ "$PSNU" -ne 0 ]; then
            if [ "$UPNU" -eq 0 ]; then
                ocf_log info "docker service is running but not in up state."
                return $OCF_NOT_RUNNING
            elif [ "$PSNU" -eq "$UPNU" ]; then
                ocf_log info "docker service is up and running"
                return $OCF_SUCCESS
            else
                ocf_log err "docker service is running with partial up state"
                return $OCF_ERR_GENERIC
            fi
        else
            RUN=false
        fi
    else
        STAT_MSG=$(docker ps --all | awk -e '$NF ~ /\<'"$PRE"'_.*_[0-9]+\>/ {print $1}')
        if [ -z "$STAT_MSG" ]; then
            RUN=false
        else
            ocf_log log "docker service is running without docker-compose, try to kill..."
            docker_kill
        fi
    fi

    if [ "$RUN" = false ]; then
        ocf_log info "docker service is not running"
        return $OCF_NOT_RUNNING
    fi

    # Only reached when docker_kill could not remove the stray containers.
    # The original fell off the end with the status of a failed test (1);
    # the same code is now returned explicitly.
    return $OCF_ERR_GENERIC
}

# Start the compose project.  This function always exits the agent:
# OCF_SUCCESS when the service is already up or "up -d" succeeds,
# OCF_ERR_GENERIC otherwise.
docker_compose_start()
{
    docker_compose_validate_all
    docker_compose_status >/dev/null 2>&1
    retVal=$?
    # return success if docker service is running
    [ $retVal -eq $OCF_SUCCESS ] && exit $OCF_SUCCESS

    # never run compose from an unexpected working directory
    cd "$DIR" || {
        ocf_log err "Error. cannot change to directory $DIR."
        exit $OCF_ERR_GENERIC
    }
    "$COMMAND" -f "$YML" up -d || {
        ocf_log err "Error. docker-compose returned error $?."
        exit $OCF_ERR_GENERIC
    }

    ocf_log info "docker service started."
    exit $OCF_SUCCESS
}
# Monitor action: identical to the status check.
docker_compose_monitor()
{
    docker_compose_status
}

# Verify that the compose binary, the working directory and the YAML file
# are present.  Exits with OCF_ERR_ARGS on the first problem found and
# returns OCF_SUCCESS otherwise.
docker_compose_validate_all()
{
    if ! check_binary "$OCF_RESKEY_binpath"; then
        ocf_log err "missing binary $OCF_RESKEY_binpath."
        exit $OCF_ERR_ARGS
    fi
    if [ ! -e "$OCF_RESKEY_dirpath" ]; then
        ocf_log err "directory $OCF_RESKEY_dirpath is not found."
        exit $OCF_ERR_ARGS
    elif [ ! -d "$OCF_RESKEY_dirpath" ]; then
        ocf_log err "path $OCF_RESKEY_dirpath is not a directory."
        exit $OCF_ERR_ARGS
    fi
    if [ ! -e "$OCF_RESKEY_dirpath/$OCF_RESKEY_ymlfile" ]; then
        ocf_log err "yaml file $OCF_RESKEY_dirpath/$OCF_RESKEY_ymlfile is not found."
        exit $OCF_ERR_ARGS
    fi

    return $OCF_SUCCESS
}


#
# Main
#

# exactly one argument, the action name, is expected
if [ $# -ne 1 ]; then
    usage
    exit $OCF_ERR_ARGS
fi

case $1 in
    start)
        docker_compose_start
        ;;

    stop)
        docker_compose_stop
        ;;

    status)
        docker_compose_status
        ;;

    monitor)
        docker_compose_monitor
        ;;

    validate-all)
        docker_compose_validate_all
        ;;

    meta-data)
        meta_data
        ;;

    usage)
        usage
        exit $OCF_SUCCESS
        ;;

    *)
        usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_binary_default="/usr/sbin/dovecot" +OCF_RESKEY_config_file_default="" +OCF_RESKEY_pid_file_default="/var/run/dovecot/master.pid" +OCF_RESKEY_parameters_default="" + +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_config_file=${OCF_RESKEY_config_file_default}} +: ${OCF_RESKEY_pid_file=${OCF_RESKEY_pid_file_default}} +: ${OCF_RESKEY_parameters=${OCF_RESKEY_parameters_default}} +USAGE="Usage: $0 {start|stop|reload|monitor|validate-all|meta-data}"; + +########################################################################## + +usage() { + echo $USAGE >&2 +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="dovecot" version="0.1"> +<version>1.0</version> +<longdesc lang="en"> +This script manages Dovecot as an OCF resource in a high-availability setup. +</longdesc> +<shortdesc lang="en">Manages a highly available Dovecot IMAP/POP3 server instance</shortdesc> + +<parameters> + +<parameter name="binary" unique="0" required="0"> +<longdesc lang="en"> +Full path to the Dovecot binary. +For example, "/usr/sbin/dovecot". +</longdesc> +<shortdesc lang="en">Full path to Dovecot binary</shortdesc> +<content type="string" default="${OCF_RESKEY_binary_default}" /> +</parameter> + +<parameter name="config_file" unique="1" required="0"> +<longdesc lang="en"> +Full path to a Dovecot configuration file. +For example, "/etc/dovecot/dovecot.conf". +</longdesc> +<shortdesc lang="en">Full path to configuration file</shortdesc> +<content type="string" default="${OCF_RESKEY_config_file_default}" /> +</parameter> + +<parameter name="pid_file" unique="1" required="0"> +<longdesc lang="en"> +Full path to a Dovecot PID file. +For example, "/var/run/dovecot/master.pid". 
# Tell whether Dovecot is running.
#
# $1 - log level used when the status command reports a failure
#      (default: err)
#
# When the binary supports "process status" its exit code is returned and
# stored in the global 'ret'.  Otherwise the PID file is consulted: the
# process must accept signal 0 and show up as "dovecot" in ps.  Returns
# non-zero when Dovecot is not running.
dovecot_running() {
    local loglevel
    loglevel=${1:-err}

    # run `dovecot process status` if available
    if ocf_is_true "$status_support"; then
        # OPTION_CONFIG_FILE is the variable the main section assigns; the
        # formerly used OPTION_CONFIG_DIR is never set anywhere.
        $binary $OPTION_CONFIG_FILE process status 2>&1
        ret=$?
        if [ $ret -ne 0 ]; then
            ocf_log $loglevel "Dovecot status: " $ret
        fi
        return $ret
    fi

    # manually check Dovecot's pid

    PIDFILE=$OCF_RESKEY_pid_file
    if [ -f "$PIDFILE" ]; then
        PID=$(head -n 1 "$PIDFILE")
        kill -s 0 $PID >/dev/null 2>&1 && [ $(ps -p $PID | grep dovecot | wc -l) -eq 1 ]
        return $?
    fi

    # Dovecot is not running
    false
}

# Start Dovecot and wait until dovecot_running succeeds.
#
# Returns OCF_SUCCESS when Dovecot is (already) running and
# OCF_ERR_GENERIC when the binary reports an error.  The wait loop has no
# limit of its own; it relies on the operation timeout of the caller.
dovecot_start()
{
    # if Dovecot is running return success
    if dovecot_running info; then
        ocf_log info "Dovecot already running."
        return $OCF_SUCCESS
    fi

    # start Dovecot
    $binary $OPTIONS >/dev/null 2>&1
    ret=$?

    if [ $ret -ne 0 ]; then
        ocf_exit_reason "Dovecot returned error: $ret"
        return $OCF_ERR_GENERIC
    fi

    # grant some time for startup/forking the sub processes
    # and loop initial monitoring until success or timeout
    while true; do
        sleep 1
        # break if dovecot is up and running; log failure otherwise
        dovecot_running info && break
        ocf_log info "Dovecot failed initial monitor action: " $ret
    done

    ocf_log info "Dovecot started."
    return $OCF_SUCCESS
}


# Stop Dovecot, re-checking up to five times at one second intervals.
#
# Returns OCF_SUCCESS when Dovecot is (already) stopped and
# OCF_ERR_GENERIC when the stop command fails or Dovecot keeps running.
dovecot_stop()
{
    # if Dovecot is not running return success
    if ! dovecot_running info; then
        ocf_log info "Dovecot already stopped."
        return $OCF_SUCCESS
    fi

    # stop Dovecot
    $binary $OPTIONS stop >/dev/null 2>&1
    ret=$?

    if [ $ret -ne 0 ]; then
        ocf_exit_reason "Dovecot returned an error while stopping: $ret"
        return $OCF_ERR_GENERIC
    fi

    # grant some time for shutdown and recheck 5 times
    for i in 1 2 3 4 5; do
        if dovecot_running info; then
            sleep 1
        else
            break
        fi
    done

    # dovecot stop did not succeed
    if dovecot_running; then
        ocf_exit_reason "Dovecot failed to stop."
        return $OCF_ERR_GENERIC
    fi

    ocf_log info "Dovecot stopped."
    return $OCF_SUCCESS
}

# Ask a running Dovecot to reload; does nothing when it is not running.
# The exit status is that of the reload command when it is issued.
dovecot_reload()
{
    if dovecot_running; then
        ocf_log info "Reloading Dovecot."
        $binary $OPTIONS reload
    fi
}

# Monitor action: OCF_SUCCESS when running, OCF_NOT_RUNNING otherwise.
dovecot_monitor()
{
    local status_loglevel="err"

    # Set loglevel to info during probe
    if ocf_is_probe; then
        status_loglevel="info"
    fi

    if dovecot_running $status_loglevel; then
        return $OCF_SUCCESS
    fi

    return $OCF_NOT_RUNNING
}
#
# Main
#

# exactly one argument, the action name, is expected
if [ $# -ne 1 ]; then
    usage
    exit $OCF_ERR_ARGS
fi

binary=$OCF_RESKEY_binary
config_file=$OCF_RESKEY_config_file
parameters=$OCF_RESKEY_parameters


# handle the actions that need neither options nor validation
case $1 in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;

    usage|help)
        usage
        exit $OCF_SUCCESS
        ;;
esac

# build Dovecot options string *outside* to access from each method
OPTIONS=''
OPTION_CONFIG_FILE=''

# Pass the configuration file to Dovecot when one is configured.  The guard
# used to test $config_dir, a variable that is never set, so "-c" was never
# added to the options.
if [ "x$config_file" != "x" ]; then
    # remove all trailing slashes
    config_file=$(echo $config_file | sed 's/\/*$//')

    # set OPTIONS if config_file is still set
    # save OPTION_CONFIG_FILE separately
    if [ "x$config_file" != "x" ]; then
        OPTION_CONFIG_FILE="-c $config_file"
        OPTIONS=$OPTION_CONFIG_FILE
    fi
fi

# add all additional parameters to options string
if [ "x$parameters" != "x" ]; then
    OPTIONS="$OPTIONS $parameters"
fi

# check Dovecot status support: the help output has to mention
# "process ... status"
status_support=false
$binary help 2> /dev/null | grep -q -e "process.*status"
ret=$?

if [ $ret -eq 0 ]; then
    status_support=true
fi


dovecot_validate_all
ret=$?

# a stop must not fail because of a broken configuration
if [ $ret -ne $OCF_SUCCESS ]; then
    case $1 in
    stop)
        exit $OCF_SUCCESS
        ;;
    *)
        exit $ret
        ;;
    esac
fi

case $1 in
    monitor)
        dovecot_monitor
        exit $?
        ;;

    start)
        dovecot_start
        exit $?
        ;;

    stop)
        dovecot_stop
        exit $?
        ;;

    reload)
        dovecot_reload
        exit $?
        ;;

    validate-all)
        exit $OCF_SUCCESS
        ;;

    *)
        usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
+# +# Copyright (c) 2020 SUSE LLC, Pablo Bravo (pablo.bravo@suse.com) +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### + +import os +import sys + +OCF_FUNCTIONS_DIR = os.environ.get( + "OCF_FUNCTIONS_DIR", + "%s/lib/heartbeat" % os.environ.get("OCF_ROOT")) +sys.path.append(OCF_FUNCTIONS_DIR) + +import ocf # noqa: E402 + +PARAM_HA_RSCTMP = ocf.get_parameter("HA_RSCTMP") +PARAM_INSTANCE = ocf.get_parameter("OCF_RESOURCE_INSTANCE") +OCF_RESKEY_state_default = '{}/dummypy-{}.state'.format(PARAM_HA_RSCTMP, PARAM_INSTANCE) +OCF_RESKEY_fake_default = "dummy" + +LONG_DESC = '''This is a Dummy Resource Agent. It does absolutely nothing +except keep track of whether its running or not. +Its purpose in life is for testing and to serve as a template for RA writers. + +NB: Please pay attention to the timeouts specified in the actions +section below. They should be meaningful for the kind of resource +the agent manages. 
SHORT_DESC = 'Example stateless resource agent'


def touch(file_name):
    """Create file_name if it is missing and set its timestamps to now."""
    with open(file_name, 'a'):
        os.utime(file_name, None)


def dummy_start(state, fake):
    """Start action: create the state file unless the monitor already passes.

    Returns ocf.OCF_SUCCESS on both paths (the original fell off the end
    after creating the file).
    """
    rc = dummy_monitor(state, fake)
    if rc == ocf.OCF_SUCCESS:
        return ocf.OCF_SUCCESS
    touch(state)
    return ocf.OCF_SUCCESS


def dummy_stop(state, fake):
    """Stop action: remove the state file if the monitor finds it.

    Always returns ocf.OCF_SUCCESS.
    """
    rc = dummy_monitor(state, fake)
    if rc == ocf.OCF_SUCCESS:
        os.remove(state)
    return ocf.OCF_SUCCESS


def dummy_monitor(state, fake):
    """Monitor action: the resource counts as running while `state` exists.

    Returns ocf.OCF_SUCCESS when the state file exists, ocf.OCF_NOT_RUNNING
    when it does not, and ocf.OCF_ERR_GENERIC when the check itself raises.
    """
    try:
        if os.path.isfile(state):
            return ocf.OCF_SUCCESS

        # only a regular (non-probe) monitor records an exit reason
        if (not ocf.is_probe()) and (ocf.OCF_ACTION == "monitor"):
            ocf.ocf_exit_reason("No process state file found")

        return ocf.OCF_NOT_RUNNING
    except:  # noqa: E722
        return ocf.OCF_ERR_GENERIC


def dummy_validate(state, fake):
    """Check that a file can be created in the directory of `state`.

    A scratch file named after the resource instance is created there and
    removed again.  Returns ocf.OCF_SUCCESS, or ocf.OCF_ERR_ARGS when the
    scratch file cannot be created.
    """
    state_dir = os.path.dirname(state)
    file_test = '{}/{}'.format(state_dir if state_dir != "" else ".", PARAM_INSTANCE)

    try:
        touch(file_test)
    except:  # noqa: E722
        ocf.ocf_exit_reason('State file "{}" is not writable'.format(state))
        return ocf.OCF_ERR_ARGS

    os.remove(file_test)
    return ocf.OCF_SUCCESS


def dummy_migrate_to(state, fake):
    """Migrate-to action: log the target and stop the local instance."""
    INSTANCE = ocf.OCF_RESOURCE_INSTANCE
    TARGET = ocf.get_parameter("CRM_meta_migrate_target")
    ocf.logger.info('Migrating {} to {}.'.format(INSTANCE, TARGET))
    return dummy_stop(state, fake)


def dummy_migrate_from(state, fake):
    """Migrate-from action: log the source and start the local instance."""
    INSTANCE = ocf.OCF_RESOURCE_INSTANCE
    SOURCE = ocf.get_parameter("CRM_meta_migrate_source")
    ocf.logger.info('Migrating {} from {}.'.format(INSTANCE, SOURCE))
    return dummy_start(state, fake)


def dummy_reload(state, fake):
    """Reload action: only logs, there is nothing to reload."""
    ocf.logger.info('Reloading {} ...'.format(ocf.OCF_RESOURCE_INSTANCE))
    return ocf.OCF_SUCCESS


def dummy_validate_all(state, fake):
    """Validate-all action: hand back the verdict of dummy_validate.

    The result used to be discarded, so a failed validation was not
    reported to the caller.
    """
    return dummy_validate(state, fake)


def main():
    """Declare parameters and actions of the agent and run it."""
    dummy_agent = ocf.Agent('dummypy', SHORT_DESC, LONG_DESC)

    # NOTE(review): the module computes OCF_RESKEY_state_default and
    # OCF_RESKEY_fake_default, but the defaults below are looked up through
    # ocf.get_parameter() instead -- confirm this is intended.
    dummy_agent.add_parameter(
        name="state",
        shortdesc="State file",
        longdesc="Location to store the resource state in.",
        content_type="string",
        unique=True,
        default=ocf.get_parameter("state_default")
    )

    dummy_agent.add_parameter(
        name="fake",
        shortdesc="Fake attribute that can be changed to cause a reload",
        longdesc="Fake attribute that can be changed to cause a reload",
        content_type="string",
        unique=False,
        default=ocf.get_parameter("fake_default")
    )

    dummy_agent.add_action(name="start", timeout=20, handler=dummy_start)
    dummy_agent.add_action(name="stop", timeout=20, handler=dummy_stop)
    dummy_agent.add_action(name="monitor", timeout=20, handler=dummy_monitor, interval=10, depth=0)  # noqa: E501
    dummy_agent.add_action(name="reload", timeout=20, handler=dummy_reload)
    dummy_agent.add_action(name="migrate_to", timeout=20, handler=dummy_migrate_to)  # noqa: E501
    dummy_agent.add_action(name="migrate_from", timeout=20, handler=dummy_migrate_from)  # noqa: E501
    dummy_agent.add_action(name="validate-all", timeout=20, handler=dummy_validate_all)  # noqa: E501

    dummy_agent.run()


if __name__ == "__main__":
    main()
+# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# +# OCF parameters: +# OCF_RESKEY_eDir_config_file - full filename to instance configuration file +# OCF_RESKEY_eDir_monitor_ldap - Should we monitor LDAP (0/1 - 1 is true) +# OCF_RESKEY_eDir_monitor_idm - Should we monitor IDM (0/1 - 1 is true) +# OCF_RESKEY_eDir_jvm_initial_heap - Value of the DHOST_INITIAL_HEAP java env var +# OCF_RESKEY_eDir_jvm_max_heap - Value of the DHOST_MAX_HEAP java env var +# OCF_RESKEY_eDir_jvm_options - Value of the DHOST_OPTIONS java env var +############################################################################### + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +test -f /opt/novell/eDirectory/bin/ndspath && + . 
/opt/novell/eDirectory/bin/ndspath 2>/dev/null >/dev/null

# Parameter defaults

OCF_RESKEY_eDir_config_file_default="/etc/opt/novell/eDirectory/conf/nds.conf"
OCF_RESKEY_eDir_monitor_ldap_default="0"
OCF_RESKEY_eDir_monitor_idm_default="0"
OCF_RESKEY_eDir_jvm_initial_heap_default=""
OCF_RESKEY_eDir_jvm_max_heap_default=""
OCF_RESKEY_eDir_jvm_options_default=""

: ${OCF_RESKEY_eDir_config_file=${OCF_RESKEY_eDir_config_file_default}}
: ${OCF_RESKEY_eDir_monitor_ldap=${OCF_RESKEY_eDir_monitor_ldap_default}}
: ${OCF_RESKEY_eDir_monitor_idm=${OCF_RESKEY_eDir_monitor_idm_default}}
: ${OCF_RESKEY_eDir_jvm_initial_heap=${OCF_RESKEY_eDir_jvm_initial_heap_default}}
: ${OCF_RESKEY_eDir_jvm_max_heap=${OCF_RESKEY_eDir_jvm_max_heap_default}}
: ${OCF_RESKEY_eDir_jvm_options=${OCF_RESKEY_eDir_jvm_options_default}}

#######################################################################

#
# usage: Print a short description of the supported actions to stdout.
#

usage() {
    ME=$(basename "$0")
    cat <<-EOFA

usage: $ME start|stop|status|monitor|validate-all

$ME manages an eDirectory instance as an HA resource.

The 'start' operation starts the instance.
The 'stop' operation stops the instance.
The 'status' operation reports if the instance is running.
The 'monitor' operation reports if the instance is running, and runs additional checks.
The 'validate-all' operation checks the validity of the arguments (environment variables).
EOFA
}

#
# eDir_meta_data: Print the OCF resource agent meta-data (XML) to stdout.
# The default="" attributes are filled from the *_default variables above.
#

eDir_meta_data() {
cat <<-EOFB
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="eDir88" version="1.0">
<version>1.0</version>

<longdesc lang="en">
Resource script for managing an eDirectory instance. Manages a single instance
of eDirectory as an HA resource. The "multiple instances" feature or
eDirectory has been added in version 8.8. This script will not work for any
version of eDirectory prior to 8.8. This RA can be used to load multiple
eDirectory instances on the same host.

It is very strongly recommended to put eDir configuration files (as per the
eDir_config_file parameter) on local storage on each node. This is necessary for
this RA to be able to handle situations where the shared storage has become
unavailable. If the eDir configuration file is not available, this RA will fail,
and heartbeat will be unable to manage the resource. Side effects include
STONITH actions, unmanageable resources, etc...

Setting a high action timeout value is _very_ _strongly_ recommended. eDir
with IDM can take in excess of 10 minutes to start. If heartbeat times out
before eDir has had a chance to start properly, mayhem _WILL ENSUE_.

The LDAP module seems to be one of the very last to start. So this script will
take even longer to start on installations with IDM and LDAP if the monitoring
of IDM and/or LDAP is enabled, as the start command will wait for IDM and LDAP
to be available.
</longdesc>
<shortdesc lang="en">Manages a Novell eDirectory directory server</shortdesc>
<parameters>
<parameter name="eDir_config_file" unique="1" required="0">
<longdesc lang="en">
Path to configuration file for eDirectory instance.
</longdesc>
<shortdesc lang="en">eDir config file</shortdesc>
<content type="string" default="${OCF_RESKEY_eDir_config_file_default}" />
</parameter>
<parameter name="eDir_monitor_ldap" required="0">
<longdesc lang="en">
Should we monitor if LDAP is running for the eDirectory instance?
</longdesc>
<shortdesc lang="en">eDir monitor ldap</shortdesc>
<content type="boolean" default="${OCF_RESKEY_eDir_monitor_ldap_default}" />
</parameter>
<parameter name="eDir_monitor_idm" required="0">
<longdesc lang="en">
Should we monitor if IDM is running for the eDirectory instance?
</longdesc>
<shortdesc lang="en">eDir monitor IDM</shortdesc>
<content type="boolean" default="${OCF_RESKEY_eDir_monitor_idm_default}" />
</parameter>
<parameter name="eDir_jvm_initial_heap" required="0">
<longdesc lang="en">
Value for the DHOST_INITIAL_HEAP java environment variable. If unset, java defaults will be used.
</longdesc>
<shortdesc lang="en">DHOST_INITIAL_HEAP value</shortdesc>
<content type="integer" default="${OCF_RESKEY_eDir_jvm_initial_heap_default}" />
</parameter>
<parameter name="eDir_jvm_max_heap" required="0">
<longdesc lang="en">
Value for the DHOST_MAX_HEAP java environment variable. If unset, java defaults will be used.
</longdesc>
<shortdesc lang="en">DHOST_MAX_HEAP value</shortdesc>
<content type="integer" default="${OCF_RESKEY_eDir_jvm_max_heap_default}" />
</parameter>
<parameter name="eDir_jvm_options" required="0">
<longdesc lang="en">
Value for the DHOST_OPTIONS java environment variable. If unset, original values will be used.
</longdesc>
<shortdesc lang="en">DHOST_OPTIONS value</shortdesc>
<content type="string" default="${OCF_RESKEY_eDir_jvm_options_default}" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="600s" />
<action name="stop" timeout="600s" />
<action name="monitor" timeout="60s" interval="30s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="5s" />
</actions>
</resource-agent>
EOFB
return $OCF_SUCCESS
}

#
# eDir_start: Start eDirectory instance
#
# Returns OCF_SUCCESS if the instance is already up or once eDir_monitor
# succeeds, OCF_ERR_GENERIC if ndsmanage refuses the start command.
# The wait loop has no upper bound of its own; it relies on the cluster's
# start timeout to interrupt it.
#

eDir_start() {
    if eDir_status ; then
        ocf_log info "eDirectory is already running ($NDSCONF)."
        return $OCF_SUCCESS
    fi

    # Export the optional JVM tuning variables before starting ndsd.
    # NOTE(review): the meta-data text names these DHOST_INITIAL_HEAP /
    # DHOST_MAX_HEAP / DHOST_OPTIONS, while the code exports DHOST_JVM_*;
    # confirm which spelling eDirectory actually reads.
    if [ -n "$OCF_RESKEY_eDir_jvm_initial_heap" ]; then
        DHOST_JVM_INITIAL_HEAP=$OCF_RESKEY_eDir_jvm_initial_heap
        export DHOST_JVM_INITIAL_HEAP
    fi
    if [ -n "$OCF_RESKEY_eDir_jvm_max_heap" ]; then
        DHOST_JVM_MAX_HEAP=$OCF_RESKEY_eDir_jvm_max_heap
        export DHOST_JVM_MAX_HEAP
    fi
    if [ -n "$OCF_RESKEY_eDir_jvm_options" ]; then
        DHOST_JVM_OPTIONS=$OCF_RESKEY_eDir_jvm_options
        export DHOST_JVM_OPTIONS
    fi

    # Start eDirectory instance
    if "$NDSMANAGE" start --config-file "$NDSCONF" > /dev/null 2>&1; then
        ocf_log info "eDir start command sent for $NDSCONF."
    else
        # Use the agent's logger like every other error path in this file.
        ocf_log err "Can't start eDirectory for $NDSCONF."
        return $OCF_ERR_GENERIC
    fi

    CNT=0
    while ! eDir_monitor ; do
        # Apparently, LDAP will only start after all other services
        # Startup time can be in excess of 10 minutes.
        # Leave a very long heartbeat timeout on the start action
        # We're relying on heartbeat to bail us out...
        CNT=$((CNT + 1))
        ocf_log info "eDirectory start waiting for ${CNT}th retry for $NDSCONF."
        sleep 10
    done

    ocf_log info "eDirectory start verified for $NDSCONF."

    return $OCF_SUCCESS
}

#
# eDir_stop: Stop eDirectory instance
# This action is written in such a way that even when run
# on a node where things are broken (no binaries, no config
# etc...) it will try to stop any running ndsd processes
# and report success if none are running.
#

eDir_stop() {
    if ! eDir_status ; then
        return $OCF_SUCCESS
    fi

    "$NDSMANAGE" stop --config-file "$NDSCONF" >/dev/null 2>&1
    if eDir_status ; then
        # eDir failed to stop.
        ocf_log err "eDirectory instance failed to stop for $NDSCONF"
        return $OCF_ERR_GENERIC
    else
        ocf_log info "eDirectory stop verified for $NDSCONF."
        return $OCF_SUCCESS
    fi
}

#
# eDir_status: is eDirectory instance up ?
#
# Returns 0 when an ndsd listening socket is found for every interface
# listed in the config file, 1 when no socket and no live ndsd process is
# found. Every inconsistent state (unreadable config, partial match,
# process without socket) terminates the whole script with
# OCF_ERR_GENERIC via "exit", not "return".
#

eDir_status() {
    if [ ! -r "$NDSCONF" ] ; then
        ocf_log err "Config file missing ($NDSCONF)."
        exit $OCF_ERR_GENERIC
    fi

    # Find how many ndsd processes have open listening sockets
    # with the IP of this eDir instance
    IFACE=$(grep -i "n4u.server.interfaces" "$NDSCONF" | cut -f2 -d= | tr '@' ':')
    if [ -z "$IFACE" ] ; then
        ocf_log err "Cannot retrieve interfaces from $NDSCONF. eDirectory may not be correctly configured."
        exit $OCF_ERR_GENERIC
    fi

    # In case of multiple IP's split into an array
    # and check all of them
    IFS=', ' read -r -a IFACE2 <<< "$IFACE"
    ocf_log debug "Found ${#IFACE2[@]} interfaces from $NDSCONF."

    # counter starts at the number of interfaces and is decremented for
    # each interface that has exactly one matching ndsd socket.
    counter=${#IFACE2[@]}

    for IFACE in "${IFACE2[@]}"
    do
        ocf_log debug "Checking ndsd instance for $IFACE"
        NDSD_SOCKS=$(netstat -ntlp | grep -ce "$IFACE.*ndsd")

        if [ "$NDSD_SOCKS" -eq 1 ] ; then
            counter=$((counter - 1))
            ocf_log debug "Found ndsd instance for $IFACE"
        elif [ "$NDSD_SOCKS" -gt 1 ] ; then
            ocf_log err "More than 1 ndsd listening socket matched. Likely misconfiguration of eDirectory."
            exit $OCF_ERR_GENERIC
        fi
    done

    if [ "$counter" -eq 0 ] ; then
        # Correct ndsd instance is definitely running
        ocf_log debug "All ndsd instances found."
        return 0
    elif [ "$counter" -lt "${#IFACE2[@]}" ]; then
        ocf_log err "Only some ndsd listening sockets matched, something is very wrong."
        exit $OCF_ERR_GENERIC
    fi

    # No listening socket. Make sure we don't have the process running...
    PIDDIR=$(grep -i "n4u.server.vardir" "$NDSCONF" | cut -f2 -d=)
    if [ -z "$PIDDIR" ] ; then
        ocf_log err "Cannot get vardir from nds config ($NDSCONF). Probable eDir configuration error."
        exit $OCF_ERR_GENERIC
    fi
    NDSD_PID=$(cat "$PIDDIR/ndsd.pid" 2>/dev/null)
    if [ -z "$NDSD_PID" ] ; then
        # PID file unavailable or empty.
        # This will happen if the PIDDIR is not available
        # on this node at this time.
        return 1
    fi

    RC=$(ps -p "$NDSD_PID" | grep -c ndsd)
    if [ "$RC" -gt 0 ] ; then
        # process found but no listening socket. ndsd likely not operational
        ocf_log err "ndsd process found, but no listening socket. Something's gone wrong ($NDSCONF)"
        exit $OCF_ERR_GENERIC
    fi

    ocf_log debug "ndsd instance is not running, but no other error detected."
    return 1
}


#
# eDir_monitor: Do more in-depth checks to ensure that eDirectory is fully functional
# LDAP and IDM checks are only done if requested.
#
# Returns OCF_NOT_RUNNING when eDir_status reports the instance down,
# OCF_ERR_GENERIC when ndsstat, the IDM check or the LDAP check fails,
# and OCF_SUCCESS otherwise.
#

eDir_monitor() {
    if ! eDir_status ; then
        ocf_log info "eDirectory instance is down ($NDSCONF)"
        return $OCF_NOT_RUNNING
    fi

    # We know the right ndsd is running locally, check health
    if ! "$NDSSTAT" --config-file "$NDSCONF" >/dev/null 2>&1 ; then
        return $OCF_ERR_GENERIC
    fi

    # Monitor IDM first, as it will start before LDAP
    if [ "$MONITOR_IDM" -eq 1 ]; then
        RET=$("$NDSTRACE" --config-file "$NDSCONF" -c modules | grep -E -i '^vrdim.*Running' | awk '{print $1}')
        if [ "$RET" != "vrdim" ]; then
            ocf_log err "eDirectory IDM engine isn't running ($NDSCONF)."
            return $OCF_ERR_GENERIC
        fi
    fi
    if [ "$MONITOR_LDAP" -eq 1 ] ; then
        if ! "$NDSNLDAP" -c --config-file "$NDSCONF" >/dev/null 2>&1 ; then
            ocf_log err "eDirectory LDAP server isn't running ($NDSCONF)."
            return $OCF_ERR_GENERIC
        fi
    fi

    ocf_log debug "eDirectory monitor success ($NDSCONF)"
    return $OCF_SUCCESS
}

#
# eDir_validate: Validate environment
#
# Normalises MONITOR_LDAP and MONITOR_IDM to 0/1 as a side effect.
# Returns OCF_SUCCESS, or the code of the last failed check
# (OCF_ERR_GENERIC when not root, OCF_ERR_ARGS for bad parameters).
#

eDir_validate() {

    declare rc=$OCF_SUCCESS

    # Script must be run as root
    if ! ocf_is_root ; then
        ocf_log err "$0 must be run as root"
        rc=$OCF_ERR_GENERIC
    fi

    # ndsmanage must be available and runnable
    check_binary "$NDSMANAGE"

    # ndsstat must be available and runnable
    check_binary "$NDSSTAT"

    # Config file must be readable
    if [ ! -r "$NDSCONF" ] ; then
        ocf_log err "eDirectory configuration file [$NDSCONF] is not readable"
        rc=$OCF_ERR_ARGS
    fi

    # monitor_ldap must be unambiguously resolvable to a truth value.
    # The tr sets are quoted so the shell cannot glob-expand the brackets
    # against single-letter file names in the current directory.
    MONITOR_LDAP=$(echo "$MONITOR_LDAP" | tr 'A-Z' 'a-z')
    case "$MONITOR_LDAP" in
        yes|true|1)
            MONITOR_LDAP=1;;
        no|false|0)
            MONITOR_LDAP=0;;
        *)
            ocf_log err "Configuration parameter eDir_monitor_ldap has invalid value [$MONITOR_LDAP]"
            rc=$OCF_ERR_ARGS;;
    esac

    # monitor_idm must be unambiguously resolvable to a truth value
    MONITOR_IDM=$(echo "$MONITOR_IDM" | tr 'A-Z' 'a-z')
    case "$MONITOR_IDM" in
        yes|true|1)
            MONITOR_IDM=1;;
        no|false|0)
            MONITOR_IDM=0;;
        *)
            ocf_log err "Configuration parameter eDir_monitor_idm has invalid value [$MONITOR_IDM]"
            rc=$OCF_ERR_ARGS;;
    esac

    # eDir_jvm_initial_heap must be blank or numeric
    if [ -n "$OCF_RESKEY_eDir_jvm_initial_heap" ] ; then
        if ! ocf_is_decimal "$OCF_RESKEY_eDir_jvm_initial_heap" ; then
            ocf_log err "Configuration parameter eDir_jvm_initial_heap has invalid" \
                "value [$OCF_RESKEY_eDir_jvm_initial_heap]"
            rc=$OCF_ERR_ARGS
        fi
    fi

    # eDir_jvm_max_heap must be blank or numeric
    if [ -n "$OCF_RESKEY_eDir_jvm_max_heap" ] ; then
        if ! ocf_is_decimal "$OCF_RESKEY_eDir_jvm_max_heap" ; then
            ocf_log err "Configuration parameter eDir_jvm_max_heap has invalid" \
                "value [$OCF_RESKEY_eDir_jvm_max_heap]"
            rc=$OCF_ERR_ARGS
        fi
    fi
    if [ $rc -ne $OCF_SUCCESS ] ; then
        ocf_log err "Invalid environment"
    fi
    return $rc
}

#
# Start of main logic
#

ocf_log debug "$0 started with arguments \"$*\""

NDSBASE=/opt/novell/eDirectory
NDSNLDAP=$NDSBASE/sbin/nldap
NDSMANAGE=$NDSBASE/bin/ndsmanage
NDSSTAT=$NDSBASE/bin/ndsstat
NDSTRACE=$NDSBASE/bin/ndstrace
NDSCONF=${OCF_RESKEY_eDir_config_file:-/etc/opt/novell/eDirectory/conf/nds.conf}
MONITOR_LDAP=${OCF_RESKEY_eDir_monitor_ldap:-0}
MONITOR_IDM=${OCF_RESKEY_eDir_monitor_idm:-0}


# What kind of method was invoked?
# First pass: actions that are answered immediately. start, stop and
# monitor fall through to the validation step below.
case "$1" in
    validate-all)
        eDir_validate
        exit $?
        ;;
    meta-data)
        eDir_meta_data
        exit $OCF_SUCCESS
        ;;
    status)
        if eDir_status ; then
            ocf_log info "eDirectory instance is up ($NDSCONF)"
            exit $OCF_SUCCESS
        fi
        ocf_log info "eDirectory instance is down ($NDSCONF)"
        exit $OCF_NOT_RUNNING
        ;;
    start|stop|monitor)
        : skip
        ;;
    usage)
        usage
        exit $OCF_SUCCESS
        ;;
    *)
        ocf_log err "Invalid argument [$1]"
        usage
        exit $OCF_ERR_ARGS
        ;;
esac

# From now on we must have a valid environment to continue.
# stop goes in the list above as it should ideally be able to
# clean up after a start that failed due to bad args

eDir_validate
RC=$?
[ $RC -eq $OCF_SUCCESS ] || exit $RC

# Second pass: run the requested action; its status becomes the exit code.
case "$1" in
    start)   eDir_start ;;
    stop)    eDir_stop ;;
    monitor) eDir_monitor ;;
esac

exit $?
diff --git a/heartbeat/ethmonitor b/heartbeat/ethmonitor
new file mode 100755
index 0000000..f9c9ef4
--- /dev/null
+++ b/heartbeat/ethmonitor
@@ -0,0 +1,580 @@
#!/bin/sh
#
# OCF Resource Agent compliant script.
# Monitor the vitality of a local network interface.
#
# Based on the work by Robert Euhus and Lars Marowsky-Bree.
#
# Transferred from Ipaddr2 into ethmonitor by Alexander Krauth
#
# Copyright (c) 2011 Robert Euhus, Alexander Krauth, Lars Marowsky-Brée
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file.
# Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
# OCF parameters are as below
#
# OCF_RESKEY_interface
# OCF_RESKEY_multiplier
# OCF_RESKEY_name
# OCF_RESKEY_repeat_count
# OCF_RESKEY_repeat_interval
# OCF_RESKEY_pktcnt_timeout
# OCF_RESKEY_arping_count
# OCF_RESKEY_arping_timeout
# OCF_RESKEY_arping_cache_entries
# OCF_RESKEY_link_status_only
#
# TODO: Check against IPv6
#
#######################################################################
# Initialization:

# Load the shared OCF shell helper library.
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_interface_default=""
OCF_RESKEY_name_default=""
OCF_RESKEY_multiplier_default="1"
OCF_RESKEY_repeat_count_default="5"
OCF_RESKEY_repeat_interval_default="10"
OCF_RESKEY_pktcnt_timeout_default="5"
OCF_RESKEY_arping_count_default="1"
OCF_RESKEY_arping_timeout_default="1"
OCF_RESKEY_arping_cache_entries_default="5"
OCF_RESKEY_link_status_only_default="false"

# Apply a default only when the parameter is unset; an explicitly empty
# value supplied by the caller is kept ("=" rather than ":=").
: ${OCF_RESKEY_interface=${OCF_RESKEY_interface_default}}
: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}}
: ${OCF_RESKEY_multiplier=${OCF_RESKEY_multiplier_default}}
: ${OCF_RESKEY_repeat_count=${OCF_RESKEY_repeat_count_default}}
: ${OCF_RESKEY_repeat_interval=${OCF_RESKEY_repeat_interval_default}}
: ${OCF_RESKEY_pktcnt_timeout=${OCF_RESKEY_pktcnt_timeout_default}}
: ${OCF_RESKEY_arping_count=${OCF_RESKEY_arping_count_default}}
: ${OCF_RESKEY_arping_timeout=${OCF_RESKEY_arping_timeout_default}}
: ${OCF_RESKEY_arping_cache_entries=${OCF_RESKEY_arping_cache_entries_default}}
: ${OCF_RESKEY_link_status_only=${OCF_RESKEY_link_status_only_default}}

#######################################################################

#
# Print the OCF meta-data (XML) to stdout and exit with OCF_SUCCESS.
# The heredoc is unquoted so the *_default variables expand; a literal
# backslash or dollar sign in the text therefore has to be escaped.
#
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ethmonitor" version="1.2">
<version>1.0</version>

<longdesc lang="en">
Monitor the vitality of a local network interface.

You may set up this RA as a clone resource to monitor the network interfaces on different nodes, with the same interface name.
This is not related to the IP address or the network on which a interface is configured.
You may use this RA to move resources away from a node, which has a faulty interface or prevent moving resources to such a node.
This gives you independent control of the resources, without involving cluster intercommunication. But it requires your nodes to have more than one network interface.

The resource configuration requires a monitor operation, because the monitor does the main part of the work.
In addition to the resource configuration, you need to configure some location constraints, based on a CIB attribute value.
The name of the attribute value is configured in the 'name' option of this RA.

Example constraint configuration using crmsh
location loc_connected_node my_resource_grp \\
        rule \$id="rule_loc_connected_node" -INF: ethmonitor eq 0

Example constraint configuration using pcs. Only allow 'my_resource' to run on nodes where eth0 ethernet device is available.
pcs constraint location my_resource rule score=-INFINITY ethmonitor-eth0 ne 1

The ethmonitor works in 3 different modes to test the interface vitality.
1. call ip to see if the link status is up (if link is down -> error)
2. call ip and watch the RX counter (if packages come around in a certain time -> success)
3. call arping to check whether any of the IPs found in the local ARP cache answers an ARP REQUEST (one answer -> success)
4. return error
</longdesc>
<shortdesc lang="en">Monitors network interfaces</shortdesc>

<parameters>
<parameter name="interface" unique="1" required="1">
<longdesc lang="en">
The name of the network interface which should be monitored (e.g. eth0).
</longdesc>
<shortdesc lang="en">Network interface name</shortdesc>
<content type="string" default="${OCF_RESKEY_interface_default}"/>
</parameter>

<parameter name="name" unique="1">
<longdesc lang="en">
The name of the CIB attribute to set. This is the name to be used in the constraints. Defaults to "ethmonitor-'interface_name'".
</longdesc>
<shortdesc lang="en">Attribute name</shortdesc>
<content type="string" default="${OCF_RESKEY_name_default}"/>
</parameter>

<parameter name="multiplier" unique="0" >
<longdesc lang="en">
Multiplier for the value of the CIB attriobute specified in parameter name.
</longdesc>
<shortdesc lang="en">Multiplier for result variable</shortdesc>
<content type="integer" default="${OCF_RESKEY_multiplier_default}"/>
</parameter>

<parameter name="repeat_count">
<longdesc lang="en">
Specify how often the interface will be monitored, before the status is set to failed. You need to set the timeout of the monitoring operation to at least repeat_count * repeat_interval
</longdesc>
<shortdesc lang="en">Monitor repeat count</shortdesc>
<content type="integer" default="${OCF_RESKEY_repeat_count_default}"/>
</parameter>

<parameter name="repeat_interval">
<longdesc lang="en">
Specify how long to wait in seconds between the repeat_counts.
</longdesc>
<shortdesc lang="en">Monitor repeat interval in seconds</shortdesc>
<content type="integer" default="${OCF_RESKEY_repeat_interval_default}"/>
</parameter>

<parameter name="pktcnt_timeout">
<longdesc lang="en">
Timeout for the RX packet counter. Stop listening for packet counter changes after the given number of seconds.
</longdesc>
<shortdesc lang="en">packet counter timeout</shortdesc>
<content type="integer" default="${OCF_RESKEY_pktcnt_timeout_default}"/>
</parameter>

<parameter name="arping_count">
<longdesc lang="en">
Number of ARP REQUEST packets to send for every IP.
Usually one ARP REQUEST (arping) is send
</longdesc>
<shortdesc lang="en">Number of arpings per IP</shortdesc>
<content type="integer" default="${OCF_RESKEY_arping_count_default}"/>
</parameter>

<parameter name="arping_timeout">
<longdesc lang="en">
Time in seconds to wait for ARP REQUESTs (all packets of arping_count).
This is to limit the time for arp requests, to be able to send requests to more than one node, without running in the monitor operation timeout.
</longdesc>
<shortdesc lang="en">Timeout for arpings per IP</shortdesc>
<content type="integer" default="${OCF_RESKEY_arping_timeout_default}"/>
</parameter>

<parameter name="arping_cache_entries">
<longdesc lang="en">
Maximum number of IPs from ARP cache list to check for ARP REQUEST (arping) answers. Newest entries are tried first.
</longdesc>
<shortdesc lang="en">Number of ARP cache entries to try</shortdesc>
<content type="integer" default="${OCF_RESKEY_arping_cache_entries_default}"/>
</parameter>

<parameter name="infiniband_device">
<longdesc lang="en">
For interfaces that are infiniband devices.
</longdesc>
<shortdesc lang="en">infiniband device</shortdesc>
<content type="string" />
</parameter>

<parameter name="infiniband_port">
<longdesc lang="en">
For infiniband devices, this is the port to monitor.
</longdesc>
<shortdesc lang="en">infiniband port</shortdesc>
<content type="integer" />
</parameter>

<parameter name="link_status_only">
<longdesc lang="en">
Only report success based on link status. Do not perform RX counter or arping related connectivity tests.
</longdesc>
<shortdesc lang="en">link status check only</shortdesc>
<content type="boolean" default="${OCF_RESKEY_link_status_only_default}" />
</parameter>

</parameters>
<actions>
<action name="start" timeout="60s" />
<action name="stop" timeout="20s" />
<action name="status" depth="0" timeout="60s" interval="10s" />
<action name="monitor" depth="0" timeout="60s" interval="10s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="20s" />
</actions>
</resource-agent>
END

    exit $OCF_SUCCESS
}

#
# Return true, if the interface exists
#   $1 - interface name to look for
#
is_interface() {
    #
    # List interfaces but exclude FreeS/WAN ipsecN virtual interfaces
    #
    local iface=`$IP2UTIL -o -f link addr show | grep -e " $1[:@]" \
        | cut -d ' ' -f2 | tr -d ':' | cut -d '@' -f1 | sort -u | grep -v '^ipsec[0-9][0-9]*$'`
    [ "$iface" != "" ]
}

#
# Return true if the configured infiniband device (optionally device:port)
# is reported ACTIVE by ibstatus, or Active by opainfo for hfi devices.
# A device name matching neither pattern leaves the case statement without
# running a command, so the function returns 0 in that situation.
#
infiniband_status()
{
    local device="$OCF_RESKEY_infiniband_device"

    if [ -n "$OCF_RESKEY_infiniband_port" ]; then
        device="${OCF_RESKEY_infiniband_device}:${OCF_RESKEY_infiniband_port}"
    fi

    case "${OCF_RESKEY_infiniband_device}" in
        *ib*|*mlx*) ibstatus ${device} | grep -q ACTIVE ;;
        *hfi*) opainfo | grep -q Active ;;
    esac
}

#
# Validate the resource parameters and derive the globals used by the
# other actions: NIC, ATTRNAME, REP_COUNT and REP_INTERVAL_S.
# Exits with OCF_ERR_CONFIGURED on any invalid parameter; returns
# OCF_SUCCESS otherwise. A missing interface is fatal only for
# validate-all, because a bond may be temporarily absent at runtime.
#
if_init() {
    if [ X"$OCF_RESKEY_interface" = "X" ]; then
        ocf_exit_reason "Interface name (the interface parameter) is mandatory"
        exit $OCF_ERR_CONFIGURED
    fi

    NIC="$OCF_RESKEY_interface"

    if is_interface "$NIC"
    then
        case "$NIC" in
            *:*) ocf_exit_reason "Do not specify a virtual interface : $OCF_RESKEY_interface"
                 exit $OCF_ERR_CONFIGURED;;
            *)   ;;
        esac
    else
        case $__OCF_ACTION in
            validate-all)
                ocf_exit_reason "Interface $NIC does not exist"
                exit $OCF_ERR_CONFIGURED;;
            monitor)
                ocf_log debug "Interface $NIC does not exist"
                ;;
            *)
                ## It might be a bond interface which is temporarily not available, therefore we want to continue here
                ocf_log warn "Interface $NIC does not exist"
                ;;
        esac
    fi

    if ! ocf_is_decimal "$OCF_RESKEY_multiplier"; then
        ocf_exit_reason "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]"
        exit $OCF_ERR_CONFIGURED
    fi

    ATTRNAME=${OCF_RESKEY_name:-"ethmonitor-$NIC"}

    REP_COUNT=${OCF_RESKEY_repeat_count:-5}
    # Two separate tests joined by ||: the value must be a decimal AND at
    # least 1. (Passing "-o [ ... ]" as arguments to ocf_is_decimal never
    # evaluated the range check.)
    if ! ocf_is_decimal "$REP_COUNT" || [ "$REP_COUNT" -lt 1 ]; then
        ocf_exit_reason "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]"
        exit $OCF_ERR_CONFIGURED
    fi
    REP_INTERVAL_S=${OCF_RESKEY_repeat_interval:-10}
    if ! ocf_is_decimal "$REP_INTERVAL_S"; then
        ocf_exit_reason "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]"
        exit $OCF_ERR_CONFIGURED
    fi
    if ! ocf_is_decimal "$OCF_RESKEY_pktcnt_timeout"; then
        ocf_exit_reason "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]"
        exit $OCF_ERR_CONFIGURED
    fi
    if ! ocf_is_decimal "$OCF_RESKEY_arping_count"; then
        ocf_exit_reason "Invalid OCF_RESKEY_arping_count [$OCF_RESKEY_arping_count]"
        exit $OCF_ERR_CONFIGURED
    fi
    if ! ocf_is_decimal "$OCF_RESKEY_arping_timeout"; then
        # Report the parameter that actually failed validation.
        ocf_exit_reason "Invalid OCF_RESKEY_arping_timeout [$OCF_RESKEY_arping_timeout]"
        exit $OCF_ERR_CONFIGURED
    fi
    if ! ocf_is_decimal "$OCF_RESKEY_arping_cache_entries"; then
        ocf_exit_reason "Invalid OCF_RESKEY_arping_cache_entries [$OCF_RESKEY_arping_cache_entries]"
        exit $OCF_ERR_CONFIGURED
    fi

    if [ -n "$OCF_RESKEY_infiniband_device" ]; then
        #ibstatus or opainfo is required if an infiniband_device is provided
        case "${OCF_RESKEY_infiniband_device}" in
            *ib*|*mlx*) check_binary ibstatus ;;
            *hfi*) check_binary opainfo ;;
        esac
    fi
    return $OCF_SUCCESS
}

# get the link status on $NIC
# asks ip about running (up) interfaces, returns the number of matching interface names that are up
get_link_status () {
    $IP2UTIL -o link show up dev "$NIC" | grep -v 'NO-CARRIER' | grep -c "$NIC"
}

# returns the number of received rx packets on $NIC
get_rx_packets () {
    ocf_log debug "$IP2UTIL -o -s link show dev $NIC"
    $IP2UTIL -o -s link show dev "$NIC" \
        | sed 's/.* RX: [^0-9]*[0-9]* *\([0-9]*\) .*/\1/'
    # the first number after RX: is the # of bytes ,
    # the second is the # of packets received
}

# watch for packet counter changes for max. OCF_RESKEY_pktcnt_timeout seconds
# returns immediately with return code 0 if any packets were received
# otherwise 1 is returned
watch_pkt_counter () {
    local RX_PACKETS_NEW
    local RX_PACKETS_OLD
    RX_PACKETS_OLD="`get_rx_packets`"
    # poll ten times per second for pktcnt_timeout seconds
    for n in `seq $(( $OCF_RESKEY_pktcnt_timeout * 10 ))`; do
        sleep 0.1
        RX_PACKETS_NEW="`get_rx_packets`"
        ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD    RX_PACKETS_NEW: $RX_PACKETS_NEW"
        if [ "$RX_PACKETS_OLD" -ne "$RX_PACKETS_NEW" ]; then
            ocf_log debug "we received some packets."
            return 0
        fi
    done
    return 1
}

# returns list of cached ARP entries for $NIC
# sorted by age ("last confirmed")
# max.
OCF_RESKEY_arping_cache_entries entries +get_arp_list () { + $IP2UTIL -s neighbour show dev $NIC \ + | sort -t/ -k2,2n | cut -d' ' -f1 \ + | head -n $OCF_RESKEY_arping_cache_entries + # the "used" entries in `ip -s neighbour show` are: + # "last used"/"last confirmed"/"last updated" +} + +# arping the IP given as argument $1 on $NIC +# until OCF_RESKEY_arping_count answers are received +do_arping () { + # TODO: add the source IP + # TODO: check for diffenrent arping versions out there + arping -q -c $OCF_RESKEY_arping_count -w $OCF_RESKEY_arping_timeout -I $NIC $1 + # return with the exit code of the arping command + return $? +} + +# +# Check the interface depending on the level given as parameter: $OCF_RESKEY_check_level +# +# 09: check for nonempty ARP cache +# 10: watch for packet counter changes +# +# 19: check arping_ip_list +# 20: check arping ARP cache entries +# +# 30: watch for packet counter changes in promiscios mode +# +# If unsuccessfull in levels 18 and above, +# the tests for higher check levels are run. +# +if_check () { + local arp_list + # always check link status first + link_status="`get_link_status`" + ocf_log debug "link_status: $link_status (1=up, 0=down)" + + if [ $link_status -eq 0 ]; then + ocf_log notice "link_status: DOWN" + return $OCF_NOT_RUNNING + fi + + # if this is an infiniband device, try ibstatus script + if [ -n "$OCF_RESKEY_infiniband_device" ]; then + if infiniband_status; then + return $OCF_SUCCESS + fi + ocf_log info "Infiniband device $OCF_RESKEY_infiniband_device is not available, check ibstatus for more information" + return $OCF_NOT_RUNNING + fi + + # if using link_status_only, skip RX count and arping related tests + if ocf_is_true "$OCF_RESKEY_link_status_only"; then + return $OCF_SUCCESS + fi + + # watch for packet counter changes + ocf_log debug "watch for packet counter changes" + watch_pkt_counter + if [ $? 
-eq 0 ]; then + return $OCF_SUCCESS + else + ocf_log debug "No packets received during packet watch timeout" + fi + + # check arping ARP cache entries + ocf_log debug "check arping ARP cache entries" + arp_list=`get_arp_list` + for ip in `echo $arp_list`; do + do_arping $ip && return $OCF_SUCCESS + done + + # if we get here, the ethernet device is considered not running. + # provide some logging information + if [ -z "$arp_list" ]; then + ocf_log info "No ARP cache entries found to arping" + fi + + # watch for packet counter changes in promiscios mode +# ocf_log debug "watch for packet counter changes in promiscios mode" + # be sure switch off promiscios mode in any case + # TODO: check first, wether promisc is already on and leave it untouched. +# trap "$IP2UTIL link set dev $NIC promisc off; exit" INT TERM EXIT +# $IP2UTIL link set dev $NIC promisc on +# watch_pkt_counter && return $OCF_SUCCESS +# $IP2UTIL link set dev $NIC promisc off +# trap - INT TERM EXIT + + # looks like it's not working (for whatever reason) + return $OCF_NOT_RUNNING +} + +####################################################################### + +if_usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +set_cib_value() { + local score=`expr $1 \* $OCF_RESKEY_multiplier` + attrd_updater -n $ATTRNAME -v $score + local rc=$? + case $rc in + 0) ocf_log debug "attrd_updater: Updated $ATTRNAME = $score" ;; + *) ocf_log warn "attrd_updater: Could not update $ATTRNAME = $score: rc=$rc";; + esac + return $rc +} + +if_monitor() { + ha_pseudo_resource $OCF_RESOURCE_INSTANCE monitor + local pseudo_status=$? 
+ if [ $pseudo_status -ne $OCF_SUCCESS ]; then + exit $pseudo_status + fi + + local mon_rc=$OCF_NOT_RUNNING + local attr_rc=$OCF_NOT_RUNNING + local runs=0 + local start_time + local end_time + local sleep_time + while [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ] + do + start_time=`date +%s%N` + if_check + mon_rc=$? + REP_COUNT=$(( $REP_COUNT - 1 )) + if [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]; then + ocf_log warn "Monitoring of $OCF_RESOURCE_INSTANCE failed, $REP_COUNT retries left." + end_time=`date +%s%N` + sleep_time=`echo "scale=9; ( $start_time + ( $REP_INTERVAL_S * 1000000000 ) - $end_time ) / 1000000000" | bc -q 2> /dev/null` + sleep $sleep_time 2> /dev/null + runs=$(($runs + 1)) + fi + + if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then + ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error" + fi + done + + ocf_log debug "Monitoring return code: $mon_rc" + if [ $mon_rc -eq $OCF_SUCCESS ]; then + set_cib_value 1 + attr_rc=$? + else + ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed." + set_cib_value 0 + attr_rc=$? + fi + + ## The resource should not fail, if the interface is down. It should fail, if the update of the CIB variable has errors. + ## To react on the interface failure you must use constraints based on the CIB variable value, not on the resource itself. + exit $attr_rc +} + +if_stop() +{ + attrd_updater -D -n $ATTRNAME + ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop +} + +if_start() +{ + local rc + ha_pseudo_resource $OCF_RESOURCE_INSTANCE start + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Failure to create ethmonitor state file" + return $rc + fi + + # perform the first monitor during the start operation + if_monitor + return $? 
+} + + +if_validate() { + check_binary $IP2UTIL + check_binary arping + check_binary bc + if_init +} + +case $__OCF_ACTION in +meta-data) meta_data + ;; +usage|help) if_usage + exit $OCF_SUCCESS + ;; +esac + +if_validate + +case $__OCF_ACTION in +start) if_start + exit $? + ;; +stop) if_stop + exit $? + ;; +monitor|status) if_monitor + exit $? + ;; +validate-all) exit $? + ;; +*) if_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac diff --git a/heartbeat/exportfs b/heartbeat/exportfs new file mode 100755 index 0000000..435a196 --- /dev/null +++ b/heartbeat/exportfs @@ -0,0 +1,492 @@ +#!/bin/sh +# exportfs +# +# Description: Manages nfs exported file system. +# +# (c) 2010 Ben Timby, Florian Haas, Dejan Muhamedagic, +# and Linux-HA contributors +# +# License: GNU General Public License v2 (GPLv2) and later + +####################################################################### +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_unlock_on_stop_default=1 +OCF_RESKEY_wait_for_leasetime_on_stop_default=0 +OCF_RESKEY_rmtab_backup_default=".rmtab" + +: ${OCF_RESKEY_unlock_on_stop=${OCF_RESKEY_unlock_on_stop_default}} +: ${OCF_RESKEY_wait_for_leasetime_on_stop=${OCF_RESKEY_wait_for_leasetime_on_stop_default}} +: ${OCF_RESKEY_rmtab_backup=${OCF_RESKEY_rmtab_backup_default}} +####################################################################### + +exportfs_meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="exportfs" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Exportfs uses the exportfs command to add/remove nfs exports. +It does NOT manage the nfs server daemon. +It depends on Linux specific NFS implementation details, +so is considered not portable to other platforms yet. 
+</longdesc> + +<shortdesc lang="en"> +Manages NFS exports +</shortdesc> + +<parameters> + +<parameter name="clientspec" unique="0" required="1"> +<longdesc lang="en"> +The client specification allowing remote machines to mount the directory +(or directories) over NFS. + +Note: it follows the format defined in "man exportfs". For example, in +the use case to export the directory(-ies) for multiple subnets, please +do config a dedicated primitive for each subnet CIDR ip address, +and do not attempt to use multiple CIDR ip addresses in a space +separated list, like in /etc/exports. +</longdesc> +<shortdesc lang="en"> +Client ACL. +</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="options" unique="0" required="0"> +<longdesc lang="en"> +The options to pass to exportfs for the exported directory +or directories. +</longdesc> +<shortdesc lang="en"> +Export options. +</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="directory" unique="0" required="1"> +<longdesc lang="en"> +The directory or directories to be exported using NFS. Multiple +directories are separated by white space. +</longdesc> +<shortdesc lang="en"> +The directory or directories to export. +</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="fsid" unique="0" required="0"> +<longdesc lang="en"> +The fsid option to pass to exportfs. This can be a unique positive +integer, a UUID (assuredly sans comma characters), or the special string +"root" which is functionally identical to numeric fsid of 0. +If multiple directories are being exported, then they are +assigned ids sequentially starting with this fsid (fsid, fsid+1, +fsid+2, ...). Obviously, in that case the fsid must be an +integer. +0 (root) identifies the export as the root of an NFSv4 +pseudofilesystem -- avoid this setting unless you understand its +special status. +This value will override any fsid provided via the options parameter. 
+</longdesc> +<shortdesc lang="en"> +Unique fsid within cluster or starting fsid for multiple exports. +</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="unlock_on_stop"> +<longdesc lang="en"> +Relinquish NFS locks associated with this filesystem when the resource +stops. Enabling this parameter is highly recommended unless the path exported +by this ${__SCRIPT_NAME} resource is also exported by a different resource. + +Note: Unlocking is only possible on Linux systems where +/proc/fs/nfsd/unlock_filesystem exists and is writable. If your system does +not fulfill this requirement (on account of having an nonrecent kernel, +for example), you may set this parameter to 0 to silence the associated +warning. +</longdesc> +<shortdesc lang="en"> +Unlock filesystem on stop? +</shortdesc> +<content type="boolean" default="${OCF_RESKEY_unlock_on_stop_default}" /> +</parameter> + +<parameter name="wait_for_leasetime_on_stop"> +<longdesc lang="en"> +When stopping (unexporting), wait out the NFSv4 lease time. +Only after all leases have expired does the NFS kernel server +relinquish all server-side handles on the exported filesystem. +If this ${__SCRIPT_NAME} resource manages an export that resides +on a mount point designed to fail over along with the NFS export +itself, then enabling this parameter will ensure such failover +is working properly. Note that when this parameter is set, your +stop timeout MUST accommodate for the wait period. This parameter +is safe to disable if none of your NFS clients are using NFS +version 4 or later. +</longdesc> +<shortdesc lang="en"> +Ride out the NFSv4 lease time on resource stop? +</shortdesc> +<content type="boolean" default="${OCF_RESKEY_wait_for_leasetime_on_stop_default}" /> +</parameter> + +<parameter name="rmtab_backup"> +<longdesc lang="en"> +Back up those entries from the NFS rmtab that apply to the exported +directory, to the specified backup file. 
The filename is interpreted +as relative to the exported directory. This backup is required if +clients are connecting to the export via NFSv3 over TCP. Note that a +configured monitor operation is required for this functionality. + +To disable rmtab backups, set this parameter to the special +string "none". +</longdesc> +<shortdesc lang="en"> +Location of the rmtab backup, relative to directory. +</shortdesc> +<content type="string" default="${OCF_RESKEY_rmtab_backup_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="40s" /> +<action name="stop" timeout="120s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="30s" /> +</actions> +</resource-agent> +END + +return $OCF_SUCCESS +} + +exportfs_methods() { + cat <<-EOF + start + stop + status + monitor + validate-all + methods + meta-data + usage + EOF +} + +reset_fsid() { + CURRENT_FSID=$OCF_RESKEY_fsid + [ -z "$CURRENT_FSID" ] && CURRENT_FSID=`echo "$OCF_RESKEY_options" | sed -n 's/.*fsid=\([^,]*\).*/\1/p'` + echo $CURRENT_FSID +} +bump_fsid() { + CURRENT_FSID=$((CURRENT_FSID+1)) +} +get_fsid() { + echo $CURRENT_FSID +} + +# run a function on all directories +forall() { + local func=$1 + shift 1 + local fast_exit="" + local dir rc=0 + if [ "$2" = fast_exit ]; then + fast_exit=1 + shift 1 + fi + reset_fsid + for dir in $OCF_RESKEY_directory; do + $func $dir "$@" + rc=$(($rc | $?)) + [ $NUMDIRS -gt 1 ] && bump_fsid + [ "$fast_exit" ] && continue + [ $rc -ne 0 ] && return $rc + done + return $rc +} + +backup_rmtab() { + local dir=$1 + local rmtab_backup + rmtab_backup="$dir/${OCF_RESKEY_rmtab_backup}" + if [ -r /var/lib/nfs/rmtab ]; then + grep ":$dir:" /var/lib/nfs/rmtab > ${rmtab_backup} + fi +} + +restore_rmtab() { + local dir=$1 + local rmtab_backup + rmtab_backup="$dir/${OCF_RESKEY_rmtab_backup}" + if [ -r ${rmtab_backup} ]; then + local tmpf=`mktemp` + sort -u ${rmtab_backup} 
/var/lib/nfs/rmtab > $tmpf && + install -o root -m 644 $tmpf /var/lib/nfs/rmtab + rm -f $tmpf + ocf_log debug "Restored `wc -l ${rmtab_backup}` rmtab entries from ${rmtab_backup}." + else + ocf_log warn "rmtab backup ${rmtab_backup} not found or not readable." + fi +} + +exportfs_usage() { + cat <<END + usage: $0 {start|stop|monitor|status|validate-all|meta-data} +END +} + +format_exports() { + # exportfs output wraps lines for long export directory names. + # We unwrap here with sed. + # We then do a literal match on the full line (grep -x -F) + exportfs | + sed -e '$! N; s/\n[[:space:]]\+/ /; t; s/[[:space:]]\+\([^[:space:]]\+\)\(\n\|$\)/ \1\2/g; P;D;' +} +is_exported() { + local dir=$1 + local spec=$2 + local rc + format_exports | grep -q -x -F "$dir $spec" + rc=$? + if [ $rc -ne 0 -a "$spec" = "*" ]; then + # on some platforms, exportfs may print + # "<world>" instead of "*" + format_exports | grep -q -x -F "$dir <world>" + rc=$? + fi + # log something only for monitors + if [ $rc -ne 0 -a "$__OCF_ACTION" = "monitor" ]; then + local sev="info" + ocf_is_probe || sev="err" + ocf_log $sev "$dir not exported to $spec (stopped)." + fi + return $rc +} + +exportfs_monitor () +{ + local spec + + if ! ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" monitor; then + return $OCF_NOT_RUNNING + fi + + # IPv6 addresses and networks are encased in brackets that need + # to be removed + case "$OCF_RESKEY_clientspec" in + *:*:*) + spec="$(echo "$OCF_RESKEY_clientspec" | tr -d '[]')" + ;; + *) + spec="$OCF_RESKEY_clientspec" + ;; + esac + + if forall is_exported "$spec"; then + if [ ${OCF_RESKEY_rmtab_backup} != "none" ]; then + forall backup_rmtab + fi + return $OCF_SUCCESS + else + return $OCF_NOT_RUNNING + fi +} + +testdir() { + if [ ! -d $1 ]; then + mkdir -p "$1" + if [ $? 
-ne 0 ]; then + ocf_exit_reason "Unable to create directory $1" + return 1 + fi + fi + return 0 +} +export_one() { + local dir=$1 + local opts sep + sep="" + if [ -n "$OCF_RESKEY_options" ]; then + opts="$OCF_RESKEY_options" + sep="," + fi + if echo "$opts" | grep fsid >/dev/null; then + #replace fsid in options list + opts=`echo "$opts" | sed "s,fsid=[^,]*,fsid=$(get_fsid),g"` + elif [ -n "$OCF_RESKEY_fsid" ]; then + #tack the fsid option onto our options list. + opts="${opts}${sep}fsid=$(get_fsid)" + fi + opts="-o $opts" + + # if any of directories fails to export we can exit + # immediately + ocf_run exportfs -v $opts "${OCF_RESKEY_clientspec}:$dir" + if [ $? -ne 0 ]; then + ocf_exit_reason "exportfs failed - exportfs -v $opts ${OCF_RESKEY_clientspec}:$dir" + exit $OCF_ERR_GENERIC + fi + + ocf_log info "directory $dir exported" + return $OCF_SUCCESS +} +exportfs_start () +{ + if ! forall testdir; then + return $OCF_ERR_INSTALLED + fi + + if exportfs_monitor; then + ocf_log debug "already exported" + return $OCF_SUCCESS + fi + ocf_log info "Exporting file system(s) ..." 
+ + ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start + forall export_one + + # Restore the rmtab to ensure smooth NFS-over-TCP failover + if [ ${OCF_RESKEY_rmtab_backup} != "none" ]; then + forall restore_rmtab + fi +} + +unlock_fs() { + local dir=$1 + local unlockfile + unlockfile=/proc/fs/nfsd/unlock_filesystem + if [ -w ${unlockfile} ]; then + echo "$dir" > ${unlockfile} + ocf_log info "Unlocked NFS export $dir" + else + ocf_log warn "Unable to unlock NFS export $dir, ${unlockfile} not found or not writable" + fi +} +wait_for_leasetime() { + local leasetimefile + local sleeptime + leasetimefile=/proc/fs/nfsd/nfsv4leasetime + if [ -r ${leasetimefile} ]; then + sleeptime=$((`cat ${leasetimefile}`+2)) + ocf_log info "Sleeping ${sleeptime} seconds to accommodate for NFSv4 lease expiry" + sleep ${sleeptime}s + else + ocf_log warn "Unable to read NFSv4 lease time from ${leasetimefile}, file not found or not readable" + fi +} +cleanup_export_cache() { + # see if the cache is blocking unexport + local contentfile=/proc/net/rpc/nfsd.export/content + local fsid_re + local i=1 + fsid_re="fsid=(echo `forall get_fsid`|sed 's/ /|/g')," + while :; do + grep -E -q "$fsid_re" $contentfile || + break + ocf_log info "Cleanup export cache ... (try $i)" + ocf_run exportfs -f + sleep 0.5 + i=$((i + 1)) + done +} +unexport_one() { + local dir=$1 + ocf_run exportfs -v -u ${OCF_RESKEY_clientspec}:$dir +} +exportfs_stop () +{ + local rc + + exportfs_monitor + if [ $? -eq $OCF_NOT_RUNNING ]; then + ocf_log debug "not exported" + return $OCF_SUCCESS + fi + + ocf_log info "Un-exporting file system ..." + + # Backup the rmtab to ensure smooth NFS-over-TCP failover + if [ ${OCF_RESKEY_rmtab_backup} != "none" ]; then + forall backup_rmtab + fi + + forall unexport_one + rc=$? 
+ + if ocf_is_true ${OCF_RESKEY_unlock_on_stop}; then + forall unlock_fs + fi + + if ocf_is_true ${OCF_RESKEY_wait_for_leasetime_on_stop}; then + wait_for_leasetime + fi + + if [ $rc -eq 0 ]; then + cleanup_export_cache + ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop + + ocf_log info "Un-exported file system(s)" + return $OCF_SUCCESS + else + ocf_exit_reason "Failed to un-export file system(s)" + return $OCF_ERR_GENERIC + fi +} + +exportfs_validate_all () +{ + if echo "$OCF_RESKEY_fsid" | grep -q -F ','; then + ocf_exit_reason "$OCF_RESKEY_fsid cannot contain a comma" + return $OCF_ERR_CONFIGURED + fi + if [ $NUMDIRS -gt 1 ] && [ -n "$(reset_fsid)" ] && + ! ocf_is_decimal "$(reset_fsid)"; then + ocf_exit_reason "use integer fsid when exporting multiple directories" + return $OCF_ERR_CONFIGURED + fi +} + +for dir in $OCF_RESKEY_directory; do + # strip off trailing '/' from directory + dir=$(echo $dir | sed 's/\/*$//') + : ${dir:=/} + if [ -e "$dir" ] ; then + canonicalized_dir=$(readlink -f "$dir") + if [ $? 
-ne 0 ]; then + if [ "$__OCF_ACTION" != "stop" ]; then + ocf_exit_reason "Could not canonicalize $dir because readlink failed" + exit $OCF_ERR_GENERIC + fi + fi + else + case "$__OCF_ACTION" in + stop|monitor|validate-all) + canonicalized_dir="$dir" + ocf_log debug "$dir does not exist" + ;; + *) + ocf_exit_reason "$dir does not exist" + exit $OCF_ERR_CONFIGURED + ;; + esac + fi + directories="$directories$canonicalized_dir " +done + +OCF_RESKEY_directory="${directories%% }" + +NUMDIRS=`echo "$OCF_RESKEY_directory" | wc -w` +OCF_REQUIRED_PARAMS="directory clientspec" +OCF_REQUIRED_BINARIES="exportfs" +ocf_rarun $* diff --git a/heartbeat/findif.sh b/heartbeat/findif.sh new file mode 100644 index 0000000..5f1c19e --- /dev/null +++ b/heartbeat/findif.sh @@ -0,0 +1,260 @@ +#!/bin/sh +ipcheck_ipv4() { + local r1_to_255="([1-9][0-9]?|1[0-9][0-9]|2[0-4][0-9]|25[0-5])" + local r0_to_255="([0-9][0-9]?|1[0-9][0-9]|2[0-4][0-9]|25[0-5])" + local r_ipv4="^$r1_to_255\.$r0_to_255\.$r0_to_255\.$r0_to_255$" + echo "$1" | grep -q -Ee "$r_ipv4" +} +ipcheck_ipv6() { + ! echo "$1" | grep -qs "[^0-9:a-fA-F]" +} +ifcheck() { + local ifname="$1" + $IP2UTIL link show dev $ifname 2>&1 +} +prefixcheck() { + local prefix=$1 + local prefix_length=${#prefix} + local prefix_check=$2 + + if [ $prefix_length -gt 3 -o $prefix_length -eq 0 ] ; then + return 1 + fi + echo "$prefix" | grep -qs "[^0-9]" + if [ $? = 0 ] ; then + return 1 + fi + if [ $prefix -lt 1 -o $prefix -gt $prefix_check ] ; then + return 1 + fi + return 0 +} +getnetworkinfo() +{ + local line netinfo + ip -o -f inet route list match $OCF_RESKEY_ip scope host | (while read line; + do + netinfo=`echo $line | awk '{print $2}'` + case $netinfo in + */*) + set -- $line + break + ;; + esac + done + echo $line) +} + +# previous versions of the IPaddr2 resource agent used to accept a netmask +# in dotted quad notation (and convert to cidr notation implicitly; possibly +# with warnings nobody ever noticed) +# We can do so here as well. 
+maybe_convert_dotted_quad_to_cidr() +{ + # does this even look like a dotted quad notation? + case $netmask in + # invalid if it contains other than digits and dots + # invalid if it contains adjacent dots, + # or starts or ends with a dot + # or more than three dots + # or more than three digits in a row + *[!0-9.]* | *..* | *.*.*.*.* | .* | *. | *[0-9][0-9][0-9][0-9]* ) + return ;; + + # do we have three dots? + # component range check on <= 255 is done below + *.*.*.*) : ;; + + *) return ;; + esac + + local IFS=. + set -- $netmask + [ $# = 4 ] || return; + + local b m=0 mask; + for b ; do + [ $b -le 255 ] || return; + m=$(( (m << 8) + b )); + done; + case $m in + # for i in `seq 32 -1 0`; do printf "%10u) netmask=$i ;;\n" $(( ((1 << i)-1) << (32 - i) )); done + 4294967295) mask=32 ;; + 4294967294) mask=31 ;; + 4294967292) mask=30 ;; + 4294967288) mask=29 ;; + 4294967280) mask=28 ;; + 4294967264) mask=27 ;; + 4294967232) mask=26 ;; + 4294967168) mask=25 ;; + 4294967040) mask=24 ;; + 4294966784) mask=23 ;; + 4294966272) mask=22 ;; + 4294965248) mask=21 ;; + 4294963200) mask=20 ;; + 4294959104) mask=19 ;; + 4294950912) mask=18 ;; + 4294934528) mask=17 ;; + 4294901760) mask=16 ;; + 4294836224) mask=15 ;; + 4294705152) mask=14 ;; + 4294443008) mask=13 ;; + 4293918720) mask=12 ;; + 4292870144) mask=11 ;; + 4290772992) mask=10 ;; + 4286578688) mask=9 ;; + 4278190080) mask=8 ;; + 4261412864) mask=7 ;; + 4227858432) mask=6 ;; + 4160749568) mask=5 ;; + 4026531840) mask=4 ;; + 3758096384) mask=3 ;; + 3221225472) mask=2 ;; + 2147483648) mask=1 ;; + 0) mask=0 ;; + *) ocf_log err "Bogus netmask: $netmask" ; return ;; + esac + ocf_log warn "Please convert dotted quad netmask $netmask to CIDR notation $mask!" 
+ netmask=$mask +} + +findif_check_params() +{ + local family="$1" + local match="$OCF_RESKEY_ip" + local nic="$OCF_RESKEY_nic" + # netmask NOT local, see maybe_convert_dotted_quad_to_cidr + netmask="$OCF_RESKEY_cidr_netmask" + local brdcast="$OCF_RESKEY_broadcast" + local errmsg + + maybe_convert_dotted_quad_to_cidr + + # Do a sanity check only on start and validate-all + # to avoid returning OCF_ERR_CONFIGURED from the monitor operation. + case $__OCF_ACTION in + start|validate-all) true;; + *) return $OCF_SUCCESS;; + esac + + if [ -n "$nic" ] ; then + errmsg=`ifcheck $nic` + if [ $? -ne 0 ] ; then + ocf_log err "Invalid interface name [$nic]: $errmsg" + return $OCF_ERR_CONFIGURED + fi + fi + + if [ "$family" = "inet6" ] ; then + ipcheck_ipv6 $match + if [ $? = 1 ] ; then + ocf_log err "IP address [$match] not valid." + return $OCF_ERR_CONFIGURED + fi + if [ -z "$nic" ] ; then + echo $match | grep -qis '^fe80::' + if [ $? = 0 ] ; then + ocf_log err "'nic' parameter is mandatory for a link local address [$match]." + return $OCF_ERR_CONFIGURED + fi + fi + if [ -n "$netmask" ] ; then + prefixcheck $netmask 128 + if [ $? = 1 ] ; then + ocf_log err "Invalid netmask specification [$netmask]." + return $OCF_ERR_CONFIGURED + fi + fi + else + # family = inet + ipcheck_ipv4 $match + if [ $? = 1 ] ; then + ocf_log err "IP address [$match] not valid." + return $OCF_ERR_CONFIGURED + fi + if [ -n "$netmask" ] ; then + prefixcheck $netmask 32 + if [ $? = 1 ] ; then + ocf_log err "Invalid netmask specification [$netmask]." + return $OCF_ERR_CONFIGURED + fi + fi + if [ -n "$brdcast" ] ; then + ipcheck_ipv4 $brdcast + if [ $? = 1 ] ; then + if [ "$brdcast" != "+" -a "$brdcast" != "-" ]; then + ocf_log err "Invalid broadcast address [$brdcast]." 
+ return $OCF_ERR_CONFIGURED + fi + fi + fi + fi + return $OCF_SUCCESS +} + +findif() +{ + local match="$OCF_RESKEY_ip" + local family + local scope + local nic="$OCF_RESKEY_nic" + local netmask="$OCF_RESKEY_cidr_netmask" + local brdcast="$OCF_RESKEY_broadcast" + + echo $match | grep -qs ":" + if [ $? = 0 ] ; then + family="inet6" + else + family="inet" + scope="scope link" + fi + findif_check_params $family || return $? + + if [ -n "$netmask" ]; then + match=$match/$netmask + fi + if [ -n "$nic" ] ; then + # NIC supports more than two. + set -- $(ip -o -f $family route list match $match $scope | grep "dev $nic " | awk 'BEGIN{best=0} /\// { mask=$1; sub(".*/", "", mask); if( int(mask)>=best ) { best=int(mask); best_ln=$0; } } END{print best_ln}') + else + set -- $(ip -o -f $family route list match $match $scope | awk 'BEGIN{best=0} /\// { mask=$1; sub(".*/", "", mask); if( int(mask)>=best ) { best=int(mask); best_ln=$0; } } END{print best_ln}') + fi + if [ $# = 0 ] ; then + case $OCF_RESKEY_ip in + 127.*) + set -- `getnetworkinfo` + shift;; + esac + fi + if [ -z "$nic" -o -z "$netmask" ] ; then + if [ $# = 0 ] ; then + ocf_log err "Unable to find nic or netmask." + return $OCF_ERR_GENERIC + fi + case $1 in + */*) : OK ;; + # "ip route" doesnt show netmask for IPv6 /128 + *:*:*) : OK ;; + *) + ocf_log err "Unable to find cidr_netmask." + return $OCF_ERR_GENERIC ;; + esac + fi + [ -z "$nic" ] && nic=$3 + [ -z "$netmask" ] && netmask=${1#*/} + if [ $family = "inet" ] ; then + if [ -z "$brdcast" ] ; then + if [ -n "$7" ] ; then + set -- `ip -o -f $family addr show | grep $7` + [ "$5" = brd ] && brdcast=$6 + fi + fi + else + if [ -z "$OCF_RESKEY_nic" ] && [ -z "$OCF_RESKEY_cidr_netmask" ] && [ "$netmask" != "${1#*/}" ] ; then + ocf_log err "Unable to find nic, or netmask mismatch." 
+ return $OCF_ERR_GENERIC + fi + fi + echo "$nic netmask $netmask broadcast $brdcast" + return $OCF_SUCCESS +} diff --git a/heartbeat/fio.in b/heartbeat/fio.in new file mode 100644 index 0000000..4a31251 --- /dev/null +++ b/heartbeat/fio.in @@ -0,0 +1,178 @@ +#!@BASH_SHELL@ +# +# fio RA +# +# Copyright (c) 2010 SUSE Linux Products GmbH, Lars Marowsky-Brée +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_args_default="" + +: ${OCF_RESKEY_args=${OCF_RESKEY_args_default}} + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="fio" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +fio is a generic I/O load generator. 
This RA allows start/stop of fio +instances to simulate load on a cluster without configuring complex +services. +</longdesc> +<shortdesc lang="en">fio IO load generator</shortdesc> + +<parameters> +<parameter name="args"> +<longdesc lang="en"> +Arguments to the fio client. Minimally, this should be a (list of) job +descriptions to run. +</longdesc> +<shortdesc lang="en">fio arguments</shortdesc> +<content type="string" default="${OCF_RESKEY_args_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="60s" /> +<action name="stop" timeout="60s" /> +<action name="monitor" timeout="60s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="20s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +fio_usage() { + cat <<END +usage: $0 {start|stop|monitor|validate-all|meta-data} +END +} + +fio_start() { + fio_monitor ; rc=$? + if [ $rc = $OCF_SUCCESS ]; then + ocf_log info "fio already running." + exit $OCF_SUCCESS + fi + if [ $rc != $OCF_NOT_RUNNING ]; then + ocf_log info "fio apparently dead; cleaning up before restart" + fio_stop + fi + fio $OCF_RESKEY_args >/dev/null 2>&1 </dev/null & + fio_pid=`jobs -p` + echo $fio_pid >${fio_state_file} + ocf_log info "fio started as pid=$fio_pid" + exit $OCF_SUCCESS +} + +fio_stop() { + for sig in SIGINT SIGTERM SIGKILL ; do + fio_monitor ; rc=$? + case $rc in + $OCF_NOT_RUNNING) + ocf_log info "fio already stopped." + exit $OCF_SUCCESS + ;; + $OCF_ERR_GENERIC) + rm $fio_state_file + ocf_log info "fio stopped and cleaned up." + exit $OCF_SUCCESS + ;; + $OCF_SUCCESS) + if [ -n "$fio_pid" ]; then + ocf_log info "Sending $sig to fio (pid=$fio_pid)" + kill -$sig $fio_pid + sleep 3 + continue + fi + ocf_log err "Internal logic failure in fio RA." + ;; + *) ocf_log err "Internal logic failure in fio RA." + ;; + esac + done + ocf_log err "fio did not stop! Perhaps hung on IO?" 
+ exit $OCF_ERR_GENERIC +} + +fio_monitor() { + fio_state_file="${HA_RSCTMP}/fio-${OCF_RESOURCE_INSTANCE}.state" + if [ ! -e $fio_state_file ]; then + return $OCF_NOT_RUNNING + fi + fio_pid=`cat $fio_state_file` + + if [ -z "$fio_pid" ]; then + ocf_log err "State file found, but empty. Assuming stopped." + return $OCF_NOT_RUNNING + fi + + ps=`ps h -o comm $fio_pid 2>&1` + if [ "$ps" != "fio" ]; then + fio_pid="" + return $OCF_ERR_GENERIC + fi + return $OCF_SUCCESS +} + +fio_validate() { + return $OCF_SUCCESS +} + +case $__OCF_ACTION in + meta-data) meta_data + exit $OCF_SUCCESS + ;; + validate-all) fio_validate;; + usage|help) fio_usage + exit $OCF_SUCCESS + ;; +esac + +ocf_is_probe || check_binary fio + +case $__OCF_ACTION in + start) fio_start;; + stop) fio_stop;; + monitor) fio_monitor;; + *) fio_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + diff --git a/heartbeat/galera.in b/heartbeat/galera.in new file mode 100755 index 0000000..6aed3e4 --- /dev/null +++ b/heartbeat/galera.in @@ -0,0 +1,1097 @@ +#!@BASH_SHELL@ +# +# Copyright (c) 2014 David Vossel <davidvossel@gmail.com> +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +## +# README. +# +# This agent only supports being configured as a multistate Promoted +# resource. +# +# Unpromoted vs Promoted role: +# +# During the 'Unpromoted' role, galera instances are in read-only mode and +# will not attempt to connect to the cluster. This role exists only as +# a means to determine which galera instance is the most up-to-date. The +# most up-to-date node will be used to bootstrap a galera cluster that +# has no current members. +# +# The galera instances will only begin to be promoted to the Promoted role +# once all the nodes in the 'wsrep_cluster_address' connection address +# have entered read-only mode. At that point the node containing the +# database that is most current will be promoted to Promoted. Once the first +# Promoted instance bootstraps the galera cluster, the other nodes will be +# promoted to Promoted as well. +# +# Example: Create a galera cluster using nodes rhel7-auto1 rhel7-auto2 rhel7-auto3 +# +# pcs resource create db galera enable_creation=true \ +# wsrep_cluster_address="gcomm://rhel7-auto1,rhel7-auto2,rhel7-auto3" meta promoted-max=3 --promoted +# +# By setting the 'enable_creation' option, the database will be automatically +# generated at startup. The meta attribute 'promoted-max=3' means that all 3 +# nodes listed in the wsrep_cluster_address list will be allowed to connect +# to the galera cluster and perform replication. +# +# NOTE: If you have more nodes in the pacemaker cluster than you wish +# to have in the galera cluster, make sure to use location constraints to prevent +# pacemaker from attempting to place a galera instance on a node that is +# not in the 'wsrep_cluster_address' list. 
+# +## + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +if [ "$__OCF_ACTION" != "meta-data" ]; then + . ${OCF_FUNCTIONS_DIR}/mysql-common.sh + NODENAME=$(ocf_attribute_target) +fi + +# It is common for some galera instances to store +# check user that can be used to query status +# in this file +if [ -f "/etc/sysconfig/clustercheck" ]; then + . /etc/sysconfig/clustercheck +elif [ -f "/etc/default/clustercheck" ]; then + . /etc/default/clustercheck +fi + +# Parameter defaults + +OCF_RESKEY_wsrep_cluster_address_default="" +OCF_RESKEY_cluster_host_map_default="" +OCF_RESKEY_check_user_default="" +OCF_RESKEY_check_passwd_default="" +OCF_RESKEY_two_node_mode_default="false" + +: ${OCF_RESKEY_wsrep_cluster_address=${OCF_RESKEY_wsrep_cluster_address_default}} +: ${OCF_RESKEY_cluster_host_map=${OCF_RESKEY_cluster_host_map_default}} +: ${OCF_RESKEY_check_user=${OCF_RESKEY_check_user_default}} +: ${OCF_RESKEY_check_passwd=${OCF_RESKEY_check_passwd_default}} +: ${OCF_RESKEY_two_node_mode=${OCF_RESKEY_two_node_mode_default}} + +####################################################################### +# Defaults: + +OCF_RESKEY_check_passwd_use_empty_default=0 + +: ${OCF_RESKEY_check_passwd_use_empty=${OCF_RESKEY_check_passwd_use_empty_default}} + +####################################################################### + +usage() { + cat <<UEND +usage: $0 (start|stop|validate-all|meta-data|monitor|promote|demote) + +$0 manages a galera Database as an HA resource. + +The 'start' operation starts the database. +The 'stop' operation stops the database. 
+The 'status' operation reports whether the database is running +The 'monitor' operation reports whether the database seems to be working +The 'promote' operation makes this mysql server run as promoted +The 'demote' operation makes this mysql server run as unpromoted +The 'validate-all' operation reports whether the parameters are valid + +UEND +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="galera" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for managing galera database. +</longdesc> +<shortdesc lang="en">Manages a galera instance</shortdesc> +<parameters> + +<parameter name="binary" unique="0" required="0"> +<longdesc lang="en"> +Location of the MySQL server binary +</longdesc> +<shortdesc lang="en">MySQL server binary</shortdesc> +<content type="string" default="${OCF_RESKEY_binary_default}" /> +</parameter> + +<parameter name="client_binary" unique="0" required="0"> +<longdesc lang="en"> +Location of the MySQL client binary +</longdesc> +<shortdesc lang="en">MySQL client binary</shortdesc> +<content type="string" default="${OCF_RESKEY_client_binary_default}" /> +</parameter> + +<parameter name="config" unique="0" required="0"> +<longdesc lang="en"> +Configuration file +</longdesc> +<shortdesc lang="en">MySQL config</shortdesc> +<content type="string" default="${OCF_RESKEY_config_default}" /> +</parameter> + +<parameter name="datadir" unique="0" required="0"> +<longdesc lang="en"> +Directory containing databases +</longdesc> +<shortdesc lang="en">MySQL datadir</shortdesc> +<content type="string" default="${OCF_RESKEY_datadir_default}" /> +</parameter> + +<parameter name="user" unique="0" required="0"> +<longdesc lang="en"> +User running MySQL daemon +</longdesc> +<shortdesc lang="en">MySQL user</shortdesc> +<content type="string" default="${OCF_RESKEY_user_default}" /> +</parameter> + +<parameter name="group" unique="0" required="0"> +<longdesc 
lang="en"> +Group running MySQL daemon (for logfile and directory permissions) +</longdesc> +<shortdesc lang="en">MySQL group</shortdesc> +<content type="string" default="${OCF_RESKEY_group_default}"/> +</parameter> + +<parameter name="log" unique="0" required="0"> +<longdesc lang="en"> +The logfile to be used for mysqld. +</longdesc> +<shortdesc lang="en">MySQL log file</shortdesc> +<content type="string" default="${OCF_RESKEY_log_default}"/> +</parameter> + +<parameter name="pid" unique="0" required="0"> +<longdesc lang="en"> +The pidfile to be used for mysqld. +</longdesc> +<shortdesc lang="en">MySQL pid file</shortdesc> +<content type="string" default="${OCF_RESKEY_pid_default}"/> +</parameter> + +<parameter name="socket" unique="0" required="0"> +<longdesc lang="en"> +The socket to be used for mysqld. +</longdesc> +<shortdesc lang="en">MySQL socket</shortdesc> +<content type="string" default="${OCF_RESKEY_socket_default}"/> +</parameter> + +<parameter name="enable_creation" unique="0" required="0"> +<longdesc lang="en"> +If the MySQL database does not exist, it will be created +</longdesc> +<shortdesc lang="en">Create the database if it does not exist</shortdesc> +<content type="boolean" default="${OCF_RESKEY_enable_creation_default}"/> +</parameter> + +<parameter name="additional_parameters" unique="0" required="0"> +<longdesc lang="en"> +Additional parameters which are passed to the mysqld on startup. +(e.g. --skip-external-locking or --skip-grant-tables) +</longdesc> +<shortdesc lang="en">Additional parameters to pass to mysqld</shortdesc> +<content type="string" default="${OCF_RESKEY_additional_parameters_default}"/> +</parameter> + + +<parameter name="wsrep_cluster_address" unique="0" required="1"> +<longdesc lang="en"> +The galera cluster address. This takes the form of: +gcomm://node,node,node + +Only nodes present in this node list will be allowed to start a galera instance. 
+The galera node names listed in this address are expected to match valid +pacemaker node names. If both names need to differ, you must provide a +mapping in option cluster_host_map. +</longdesc> +<shortdesc lang="en">Galera cluster address</shortdesc> +<content type="string" default="${OCF_RESKEY_wsrep_cluster_address_default}"/> +</parameter> + +<parameter name="cluster_host_map" unique="0" required="0"> +<longdesc lang="en"> +A mapping of pacemaker node names to galera node names. + +To be used when both pacemaker and galera names need to differ, +(e.g. when galera names map to IP from a specific network interface) +This takes the form of: +pcmk1:node.1.galera;pcmk2:node.2.galera;pcmk3:node.3.galera + +where the galera resource started on node pcmk1 would be named +node.1.galera in the wsrep_cluster_address +</longdesc> +<shortdesc lang="en">Pacemaker to Galera name mapping</shortdesc> +<content type="string" default="${OCF_RESKEY_cluster_host_map_default}"/> +</parameter> + +<parameter name="check_user" unique="0" required="0"> +<longdesc lang="en"> +Cluster check user. +</longdesc> +<shortdesc lang="en">MySQL test user</shortdesc> +<content type="string" default="${OCF_RESKEY_check_user_default}" /> +</parameter> + +<parameter name="check_passwd" unique="0" required="0"> +<longdesc lang="en"> +Cluster check user password. Empty passwords are ignored unless +the parameter "check_passwd_use_empty" is set to 1. +</longdesc> +<shortdesc lang="en">check password</shortdesc> +<content type="string" default="${OCF_RESKEY_check_passwd_default}" /> +</parameter> + +<parameter name="check_passwd_use_empty" unique="0" required="0"> +<longdesc lang="en"> +Use an empty "check_passwd" password. If this parameter is set to 1, +"check_passwd" will be ignored and an empty password is used +when calling the "mysql" client binary. 
+</longdesc> +<shortdesc lang="en">check password use empty</shortdesc> +<content type="boolean" default="${OCF_RESKEY_check_passwd_use_empty_default}"/> +</parameter> + +<parameter name="two_node_mode" unique="0" required="0"> +<longdesc lang="en"> +If running in a 2-node pacemaker cluster, rely on pacemaker quorum +to allow automatic recovery even when the other node is unreachable. +Use it with caution! (and fencing) +</longdesc> +<shortdesc lang="en">Special recovery when running on a 2-node cluster</shortdesc> +<content type="boolean" default="${OCF_RESKEY_two_node_mode_default}"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="120s" /> +<action name="stop" timeout="120s" /> +<action name="status" timeout="60s" /> +<action name="monitor" depth="0" timeout="30s" interval="20s" /> +<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" /> +<action name="monitor" role="Unpromoted" depth="0" timeout="30s" interval="30s" /> +<action name="promote" timeout="300s" /> +<action name="demote" timeout="120s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +get_option_variable() +{ + local key=$1 + + $MYSQL $MYSQL_OPTIONS_CHECK -e "SHOW VARIABLES like '$key';" | tail -1 +} + +get_status_variable() +{ + local key=$1 + + $MYSQL $MYSQL_OPTIONS_CHECK -e "show status like '$key';" | tail -1 +} + +set_bootstrap_node() +{ + local node=$(ocf_attribute_target $1) + + ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-bootstrap" -v "true" +} + +clear_bootstrap_node() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-bootstrap" -D +} + +is_bootstrap() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-bootstrap" --quiet 2>/dev/null + +} + +set_no_grastate() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name 
"${INSTANCE_ATTR_NAME}-no-grastate" -v "true" +} + +clear_no_grastate() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" -D +} + +is_no_grastate() +{ + local node=$(ocf_attribute_target $1) + ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" --quiet 2>/dev/null +} + +clear_last_commit() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" -D +} + +set_last_commit() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" -v $1 +} + +get_last_commit() +{ + local node=$(ocf_attribute_target $1) + + if [ -z "$node" ]; then + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" --quiet 2>/dev/null + else + ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" --quiet 2>/dev/null + fi +} + +clear_safe_to_bootstrap() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" -D +} + +set_safe_to_bootstrap() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" -v $1 +} + +get_safe_to_bootstrap() +{ + local node=$(ocf_attribute_target $1) + + if [ -z "$node" ]; then + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" --quiet 2>/dev/null + else + ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" --quiet 2>/dev/null + fi +} + +wait_for_sync() +{ + local state=$(get_status_variable "wsrep_local_state") + + ocf_log info "Waiting for database to sync with the cluster. " + while [ "$state" != "4" ]; do + sleep 1 + state=$(get_status_variable "wsrep_local_state") + done + ocf_log info "Database synced." 
+} + +is_primary() +{ + cluster_status=$(get_status_variable "wsrep_cluster_status") + if [ "$cluster_status" = "Primary" ]; then + return 0 + fi + + if [ -z "$cluster_status" ]; then + ocf_exit_reason "Unable to retrieve wsrep_cluster_status, verify check_user '$OCF_RESKEY_check_user' has permissions to view status" + else + ocf_log info "Galera instance wsrep_cluster_status=${cluster_status}" + fi + return 1 +} + +is_readonly() +{ + local res=$(get_option_variable "read_only") + + if ! ocf_is_true "$res"; then + return 1 + fi + + cluster_status=$(get_status_variable "wsrep_cluster_status") + if ! [ "$cluster_status" = "Disconnected" ]; then + return 1 + fi + + return 0 +} + +is_two_node_mode_active() +{ + # crm_node or corosync-quorumtool cannot access various corosync + # flags when running inside a bundle, so only count the cluster + # members + ocf_is_true "$OCF_RESKEY_two_node_mode" && crm_mon_no_validation -1X | xmllint --xpath "count(//nodes/node[@type='member'])" - | grep -q -w 2 +} + +is_last_node_in_quorate_partition() +{ + # when a network split occurs in a 2-node cluster, pacemaker + # fences the other node and try to retain quorum. So until + # the fencing is resolved (and the status of the peer node + # is clean), we shouldn't consider ourself quorate. + local partition_members=$(${HA_SBIN_DIR}/crm_node -p | wc -w) + local quorate=$(${HA_SBIN_DIR}/crm_node -q) + local clean_members=$(crm_mon_no_validation -1X | xmllint --xpath 'count(//nodes/node[@type="member" and @unclean="false"])' -) + + [ "$partition_members" = 1 ] && [ "$quorate" = 1 ] && [ "$clean_members" = 2 ] +} + +master_exists() +{ + if [ "$__OCF_ACTION" = "demote" ]; then + # We don't want to detect master instances during demote. + # 1. we could be detecting ourselves as being master, which is no longer the case. + # 2. we could be detecting other master instances that are in the process of shutting down. 
+ # by not detecting other master instances in "demote" we are deferring this check + # to the next recurring monitor operation which will be much more accurate + return 1 + fi + # determine if a master instance is already up and is healthy + ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.1.0" + res=$? + if [ -z "$OCF_RESKEY_crm_feature_set" ] || [ $res -eq 2 ]; then + XMLOPT="--output-as=xml" + ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.2.0" + if [ $? -eq 1 ]; then + crm_mon_no_validation -1 $XMLOPT >/dev/null 2>&1 + if [ $? -ne 0 ]; then + XMLOPT="--as-xml" + fi + fi + else + XMLOPT="--as-xml" + fi + crm_mon_no_validation -1 $XMLOPT | grep -q -i -E "resource.*id=\"${INSTANCE_ATTR_NAME}\".*role=\"(Promoted|Master)\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" + return $? +} + +clear_master_score() +{ + local node=$(ocf_attribute_target $1) + if [ -z "$node" ]; then + ocf_promotion_score -D + else + ocf_promotion_score -D -N $node + fi +} + +set_master_score() +{ + local node=$(ocf_attribute_target $1) + + if [ -z "$node" ]; then + ocf_promotion_score -v 100 + else + ocf_promotion_score -N $node -v 100 + fi +} + +promote_everyone() +{ + + for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do + local pcmk_node=$(galera_to_pcmk_name $node) + if [ -z "$pcmk_node" ]; then + ocf_log err "Could not determine pacemaker node from galera name <${node}>." 
+ return + else + node=$pcmk_node + fi + + set_master_score $node + done +} + +greater_than_equal_long() +{ + # there are values we need to compare in this script + # that are too large for shell -gt to process + echo | awk -v n1="$1" -v n2="$2" '{if (n1>=n2) printf ("true"); else printf ("false");}' | grep -q "true" +} + +galera_to_pcmk_name() +{ + local galera=$1 + if [ -z "$OCF_RESKEY_cluster_host_map" ]; then + echo $galera + else + echo "$OCF_RESKEY_cluster_host_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' '$2=="'"$galera"'" {print $1;exit}' + fi +} + +pcmk_to_galera_name() +{ + local pcmk=$1 + if [ -z "$OCF_RESKEY_cluster_host_map" ]; then + echo $pcmk + else + echo "$OCF_RESKEY_cluster_host_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' '$1=="'"$pcmk"'" {print $2;exit}' + fi +} + + +detect_first_master() +{ + local best_commit=0 + local last_commit=0 + local missing_nodes=0 + local nodes="" + local nodes_recovered="" + local all_nodes + local best_node_gcomm + local best_node + local safe_to_bootstrap + + all_nodes=$(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' ') + best_node_gcomm=$(echo "$all_nodes" | sed 's/^.* \(.*\)$/\1/') + best_node=$(galera_to_pcmk_name $best_node_gcomm) + if [ -z "$best_node" ]; then + ocf_log err "Could not determine initial best node from galera name <${best_node_gcomm}>." + return + fi + + # avoid selecting a recovered node as bootstrap if possible + for node in $all_nodes; do + local pcmk_node=$(galera_to_pcmk_name $node) + if [ -z "$pcmk_node" ]; then + ocf_log err "Could not determine pacemaker node from galera name <${node}>." 
+ return + else + node=$pcmk_node + fi + + if is_no_grastate $node; then + nodes_recovered="$nodes_recovered $node" + else + nodes="$nodes $node" + fi + done + + for node in $nodes_recovered $nodes; do + # On clean shutdown, galera sets the last stopped node as 'safe to bootstrap', + # so use this hint when we can + safe_to_bootstrap=$(get_safe_to_bootstrap $node) + + # Special case for 2-node clusters: during a network split, rely on + # pacemaker's quorum to check whether we can restart galera + if [ "$safe_to_bootstrap" != "1" ] && [ "$node" = "$NODENAME" ] && is_two_node_mode_active; then + is_last_node_in_quorate_partition + if [ $? -eq 0 ]; then + ocf_log warn "Survived a split in a 2-node cluster, considering ourselves safe to bootstrap" + safe_to_bootstrap=1 + fi + fi + + if [ "$safe_to_bootstrap" = "1" ]; then + # Galera marked the node as safe to bootstrap during shutdown. Let's just + # pick it as our bootstrap node. + ocf_log info "Node <${node}> is marked as safe to bootstrap." + best_node=$node + + # We don't need to wait for the other nodes to report state in this case + missing_nodes=0 + break + fi + + last_commit=$(get_last_commit $node) + + if [ -z "$last_commit" ]; then + ocf_log info "Waiting on node <${node}> to report database status before Master instances can start." + missing_nodes=1 + continue + fi + + # this means -1, or that no commit has occurred yet. + if [ "$last_commit" = "18446744073709551615" ]; then + last_commit="0" + fi + + greater_than_equal_long "$last_commit" "$best_commit" + if [ $?
-eq 0 ]; then + best_node=$(ocf_attribute_target $node) + best_commit=$last_commit + fi + + done + + if [ $missing_nodes -eq 1 ]; then + return + fi + + ocf_log info "Promoting $best_node to be our bootstrap node" + set_bootstrap_node $best_node + set_master_score $best_node +} + +detect_safe_to_bootstrap() +{ + local safe_to_bootstrap="" + local uuid="" + local seqno="" + + if [ -f ${OCF_RESKEY_datadir}/grastate.dat ]; then + ocf_log info "attempting to read safe_to_bootstrap flag from ${OCF_RESKEY_datadir}/grastate.dat" + safe_to_bootstrap=$(sed -n 's/^safe_to_bootstrap:\s*\(.*\)$/\1/p' < ${OCF_RESKEY_datadir}/grastate.dat) + uuid=$(sed -n 's/^uuid:\s*\(.*\)$/\1/p' < ${OCF_RESKEY_datadir}/grastate.dat) + seqno=$(sed -n 's/^seqno:\s*\(.*\)$/\1/p' < ${OCF_RESKEY_datadir}/grastate.dat) + fi + + if [ -z "$uuid" ] || \ + [ "$uuid" = "00000000-0000-0000-0000-000000000000" ]; then + clear_safe_to_bootstrap + return + fi + if [ "$safe_to_bootstrap" = "1" ]; then + if [ -z "$seqno" ] || [ "$seqno" = "-1" ]; then + clear_safe_to_bootstrap + return + fi + fi + + if [ "$safe_to_bootstrap" = "1" ] || [ "$safe_to_bootstrap" = "0" ]; then + set_safe_to_bootstrap $safe_to_bootstrap + else + clear_safe_to_bootstrap + fi +} + +detect_last_commit() +{ + local last_commit + local recover_args="--defaults-file=$OCF_RESKEY_config \ + --pid-file=$OCF_RESKEY_pid \ + --socket=$OCF_RESKEY_socket \ + --datadir=$OCF_RESKEY_datadir" + local recovery_file_regex='s/.*WSREP\:.*position\s*recovery.*--log_error='\''\([^'\'']*\)'\''.*/\1/p' + local recovered_position_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p' + + # codership/galera#354 + # Some ungraceful shutdowns can leave an empty gvwstate.dat on + # disk. This will prevent galera to join the cluster if it is + # configured to attempt PC recovery. Removing that file makes the + # node fall back to the normal, unoptimized joining process. + if [ -f ${OCF_RESKEY_datadir}/gvwstate.dat ] && \ + [ ! 
-s ${OCF_RESKEY_datadir}/gvwstate.dat ]; then + ocf_log warn "empty ${OCF_RESKEY_datadir}/gvwstate.dat detected, removing it to prevent PC recovery failure at next restart" + rm -f ${OCF_RESKEY_datadir}/gvwstate.dat + fi + + ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat" + last_commit="$(cat ${OCF_RESKEY_datadir}/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')" + if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then + local tmp=$(mktemp) + chown $OCF_RESKEY_user:$OCF_RESKEY_group $tmp + + # if we pass here because grastate.dat doesn't exist, + # try not to bootstrap from this node if possible + if [ ! -f ${OCF_RESKEY_datadir}/grastate.dat ]; then + set_no_grastate + fi + + ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" + + $SU - $OCF_RESKEY_user -s /bin/sh -c \ + "${OCF_RESKEY_binary} $recover_args --wsrep-recover --log-error=$tmp 2>/dev/null" + + last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)" + if [ -z "$last_commit" ]; then + # Galera uses InnoDB's 2pc transactions internally. If + # server was stopped in the middle of a replication, the + # recovery may find a "prepared" XA transaction in the + # redo log, and mysql won't recover automatically + + local recovery_file="$(cat $tmp | sed -n $recovery_file_regex)" + if [ -e $recovery_file ]; then + cat $recovery_file | grep -q -E '\[ERROR\]\s+Found\s+[0-9]+\s+prepared\s+transactions!' 2>/dev/null + if [ $? -eq 0 ]; then + # we can only rollback the transaction, but that's OK + # since the DB will get resynchronized anyway + ocf_log warn "local node <${NODENAME}> was not shutdown properly. 
Rollback stuck transaction with --tc-heuristic-recover" + $SU - $OCF_RESKEY_user -s /bin/sh -c \ + "${OCF_RESKEY_binary} $recover_args --wsrep-recover \ + --tc-heuristic-recover=rollback --log-error=$tmp 2>/dev/null" + + last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)" + if [ ! -z "$last_commit" ]; then + ocf_log warn "State recovered. force SST at next restart for full resynchronization" + rm -f ${OCF_RESKEY_datadir}/grastate.dat + # try not to bootstrap from this node if possible + set_no_grastate + fi + fi + fi + fi + rm -f $tmp + fi + + if [ ! -z "$last_commit" ]; then + ocf_log info "Last commit version found: $last_commit" + set_last_commit $last_commit + return $OCF_SUCCESS + else + ocf_exit_reason "Unable to detect last known write sequence number" + clear_last_commit + return $OCF_ERR_GENERIC + fi +} + +# For galera, promote is really start +galera_promote() +{ + local rc + local extra_opts + local bootstrap + local safe_to_bootstrap + master_exists + if [ $? -eq 0 ]; then + # join without bootstrapping + extra_opts="--wsrep-cluster-address=${OCF_RESKEY_wsrep_cluster_address}" + else + bootstrap=$(is_bootstrap) + + if ocf_is_true $bootstrap; then + # The best node for bootstrapping wasn't cleanly shutdown. Allow + # bootstrapping anyways + if [ "$(get_safe_to_bootstrap)" = "0" ]; then + sed -ie 's/^\(safe_to_bootstrap:\) 0/\1 1/' ${OCF_RESKEY_datadir}/grastate.dat + ocf_log info "safe_to_bootstrap in ${OCF_RESKEY_datadir}/grastate.dat set to 1 on node ${NODENAME}" + fi + ocf_log info "Node <${NODENAME}> is bootstrapping the cluster" + extra_opts="--wsrep-cluster-address=gcomm://" + else + # We are being promoted without having the bootstrap + # attribute in the CIB, which means we are supposed to + # join a cluster; however if we end up here, there is no + # Master remaining right now, which means there is no + # cluster to join anymore. 
So force a demotion, and and + # let the RA decide later which node should be the next + # bootstrap node. + ocf_log warn "There is no running cluster to join, demoting ourself" + clear_master_score + return $OCF_SUCCESS + fi + fi + + galera_monitor + if [ $? -eq $OCF_RUNNING_MASTER ]; then + if ocf_is_true $bootstrap; then + promote_everyone + clear_bootstrap_node + ocf_log info "boostrap node already up, promoting the rest of the galera instances." + fi + clear_safe_to_bootstrap + clear_last_commit + return $OCF_SUCCESS + fi + + # last commit/safe_to_bootstrap flag are no longer relevant once promoted + clear_last_commit + clear_safe_to_bootstrap + + mysql_common_prepare_dirs + mysql_common_start "$extra_opts" + rc=$? + if [ $rc != $OCF_SUCCESS ]; then + return $rc + fi + + galera_monitor + rc=$? + if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then + ocf_exit_reason "Failed initial monitor action" + return $rc + fi + + is_readonly + if [ $? -eq 0 ]; then + ocf_exit_reason "Failure. Master instance started in read-only mode, check configuration." + return $OCF_ERR_GENERIC + fi + + is_primary + if [ $? -ne 0 ]; then + ocf_exit_reason "Failure. Master instance started, but is not in Primary mode." + return $OCF_ERR_GENERIC + fi + + if ocf_is_true $bootstrap; then + promote_everyone + clear_bootstrap_node + # clear attribute no-grastate. if last shutdown was + # not clean, we cannot be extra-cautious by requesting a SST + # since this is the bootstrap node + clear_no_grastate + ocf_log info "Bootstrap complete, promoting the rest of the galera instances." + else + # if this is not the bootstrap node, make sure this instance + # syncs with the rest of the cluster before promotion returns. + wait_for_sync + # sync is done, clear info about last startup + clear_no_grastate + fi + + ocf_log info "Galera started" + return $OCF_SUCCESS +} + +galera_demote() +{ + mysql_common_stop + rc=$? 
+ if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then + ocf_exit_reason "Failed to stop Master galera instance during demotion" + return $rc + fi + + # if this node was previously a bootstrap node, that is no longer the case. + clear_bootstrap_node + clear_last_commit + clear_no_grastate + clear_safe_to_bootstrap + + # Clear master score here rather than letting pacemaker do so once + # demote finishes. This way a promote cannot take place right + # after this demote even if pacemaker is requested to do so. It + # will first have to run a start/monitor op, to reprobe the state + # of the other galera nodes and act accordingly. + clear_master_score + + # record last commit for next promotion + detect_safe_to_bootstrap + detect_last_commit + rc=$? + return $rc +} + +galera_start() +{ + local rc + local galera_node + + galera_node=$(pcmk_to_galera_name $NODENAME) + if [ -z "$galera_node" ]; then + ocf_exit_reason "Could not determine galera name from pacemaker node <${NODENAME}>." + return $OCF_ERR_CONFIGURED + fi + + # the local galera name must appear in the configured cluster address + echo "$OCF_RESKEY_wsrep_cluster_address" | grep -q -F "$galera_node" + if [ $? -ne 0 ]; then + ocf_exit_reason "local node <${NODENAME}> (galera node <${galera_node}>) must be a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}> to start this galera instance" + return $OCF_ERR_CONFIGURED + fi + + galera_monitor + if [ $? -eq $OCF_RUNNING_MASTER ]; then + ocf_exit_reason "master galera instance started outside of the cluster's control" + return $OCF_ERR_GENERIC + fi + + mysql_common_prepare_dirs + + detect_safe_to_bootstrap + detect_last_commit + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + return $rc + fi + + master_exists + if [ $? -eq 0 ]; then + ocf_log info "Master instances are already up, setting master score so this instance will join galera cluster."
+ set_master_score $NODENAME + else + clear_master_score + detect_first_master + fi + + return $OCF_SUCCESS +} + +galera_monitor() +{ + local rc + local galera_node + local last_commit + local status_loglevel="err" + + # Set loglevel to info during probe + if ocf_is_probe; then + status_loglevel="info" + fi + + mysql_common_status $status_loglevel + rc=$? + + if [ $rc -eq $OCF_NOT_RUNNING ]; then + # query our own attribute; $node is not defined in this function, so do not pass it + last_commit=$(get_last_commit) + if [ -n "$last_commit" ]; then + # if last commit is set, this instance is considered started in slave mode + rc=$OCF_SUCCESS + master_exists + if [ $? -ne 0 ]; then + detect_first_master + else + # a master instance exists and is healthy, promote this + # local read only instance + # so it can join the master galera cluster. + set_master_score + fi + fi + return $rc + elif [ $rc -ne $OCF_SUCCESS ]; then + return $rc + fi + + # if we make it here, mysql is running. Check cluster status now. + galera_node=$(pcmk_to_galera_name $NODENAME) + if [ -z "$galera_node" ]; then + ocf_exit_reason "Could not determine galera name from pacemaker node <${NODENAME}>." + return $OCF_ERR_CONFIGURED + fi + + echo "$OCF_RESKEY_wsrep_cluster_address" | grep -q -F "$galera_node" + if [ $? -ne 0 ]; then + ocf_exit_reason "local node <${NODENAME}> (galera node <${galera_node}>) is started, but is not a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>" + return $OCF_ERR_GENERIC + fi + + is_primary + if [ $? -eq 0 ]; then + + if ocf_is_probe; then + # restore master score during probe + # if we detect this is a master instance + set_master_score + fi + rc=$OCF_RUNNING_MASTER + else + ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state."
+ rc=$OCF_ERR_GENERIC + fi + + return $rc +} + +galera_stop() +{ + local rc + # make sure the process is stopped; keep its exit status ($?, not the unset $1) + mysql_common_stop + rc=$? + + clear_safe_to_bootstrap + clear_last_commit + clear_master_score + clear_bootstrap_node + clear_no_grastate + return $rc +} + +galera_validate() +{ + if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then + if ! ocf_is_ms; then + ocf_exit_reason "Galera must be configured as a multistate Master/Slave resource." + return $OCF_ERR_CONFIGURED + fi + fi + + if [ -z "$OCF_RESKEY_wsrep_cluster_address" ]; then + ocf_exit_reason "Galera must be configured with a wsrep_cluster_address value." + return $OCF_ERR_CONFIGURED + fi + + mysql_common_validate +} + +case "$1" in + meta-data) meta_data + exit $OCF_SUCCESS;; + usage|help) usage + exit $OCF_SUCCESS;; +esac + +[ "$__OCF_ACTION" = "start" ] && OCF_CHECK_LEVEL=10 +galera_validate +rc=$? +LSB_STATUS_STOPPED=3 +if [ $rc -ne 0 ]; then + case "$1" in + stop) exit $OCF_SUCCESS;; + monitor) exit $OCF_NOT_RUNNING;; + status) exit $LSB_STATUS_STOPPED;; + *) exit $rc;; + esac +fi + +if [ -z "${OCF_RESKEY_check_passwd}" ]; then + # This value is automatically sourced from /etc/sysconfig/checkcluster if available + OCF_RESKEY_check_passwd=${MYSQL_PASSWORD} +fi +if [ -z "${OCF_RESKEY_check_user}" ]; then + # This value is automatically sourced from /etc/sysconfig/checkcluster if available + OCF_RESKEY_check_user=${MYSQL_USERNAME} +fi +: ${OCF_RESKEY_check_user="root"} + +MYSQL_OPTIONS_CHECK="-nNE --user=${OCF_RESKEY_check_user}" + +if ocf_is_true "${OCF_RESKEY_check_passwd_use_empty}"; then + MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK --password=" +elif [ -n "${OCF_RESKEY_check_passwd}" ]; then + MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK --password=${OCF_RESKEY_check_passwd}" +fi + +# This value is automatically sourced from /etc/sysconfig/checkcluster if available +if [ -n "${MYSQL_HOST}" ]; then + MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK -h ${MYSQL_HOST}" +fi + +# This value is automatically sourced from
/etc/sysconfig/checkcluster if available +if [ -n "${MYSQL_PORT}" ]; then + MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK -P ${MYSQL_PORT}" +fi + + + +# What kind of method was invoked? +case "$1" in + start) galera_start;; + stop) galera_stop;; + status) mysql_common_status err;; + monitor) galera_monitor;; + promote) galera_promote;; + demote) galera_demote;; + validate-all) exit $OCF_SUCCESS;; + + *) usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac + +# vi:sw=4:ts=4:et: diff --git a/heartbeat/garbd b/heartbeat/garbd new file mode 100755 index 0000000..24a6e69 --- /dev/null +++ b/heartbeat/garbd @@ -0,0 +1,436 @@ +#!/bin/sh +# +# Copyright (c) 2015 Damien Ciabrini <dciabrin@redhat.com> +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +## +# README. +# +# Resource agent for garbd, the Galera arbitrator +# +# You can use this agent if you run an even number of galera nodes, +# and you want an additional node to avoid split-brain situations. 
+# +# garbd requires that a Galera cluster is running, so make sure to +# add a proper ordering constraint to the cluster, e.g.: +# +# pcs constraint order galera-master then garbd +# +# If you add garbd to the cluster while Galera is not running, you +# might want to disable it before setting up ordering constraint, e.g.: +# +# pcs resource create garbd garbd \ +# wsrep_cluster_address=gcomm://node1:4567,node2:4567 \ +# meta target-role=stopped +# +# Use location constraints to avoid running galera and garbd on +# the same node, e.g.: +# +# pcs constraint colocation add garbd with galera-master -INFINITY +# pcs constraint location garbd prefers node3=INFINITY +# +## + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### +# Set default paramenter values + +OCF_RESKEY_binary_default="/usr/sbin/garbd" +OCF_RESKEY_log_default="/var/log/garbd.log" +OCF_RESKEY_pid_default="/var/run/garbd.pid" +OCF_RESKEY_user_default="mysql" +if [ "X${HOSTOS}" = "XOpenBSD" ];then + OCF_RESKEY_group_default="_mysql" +else + OCF_RESKEY_group_default="mysql" +fi +OCF_RESKEY_options_default="" +OCF_RESKEY_wsrep_cluster_address_default="" +OCF_RESKEY_wsrep_cluster_name_default="" + +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_log=${OCF_RESKEY_log_default}} +: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} +: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +: ${OCF_RESKEY_group=${OCF_RESKEY_group_default}} +: ${OCF_RESKEY_options=${OCF_RESKEY_options_default}} +: ${OCF_RESKEY_wsrep_cluster_address=${OCF_RESKEY_wsrep_cluster_address_default}} +: ${OCF_RESKEY_wsrep_cluster_name=${OCF_RESKEY_wsrep_cluster_name_default}} + +usage() { + cat <<UEND +usage: $0 (start|stop|validate-all|meta-data|status|monitor) + +$0 manages a Galera arbitrator. 
+ +The 'start' operation starts the arbitrator. +The 'stop' operation stops the arbitrator. +The 'status' operation reports whether the arbitrator is running +The 'monitor' operation reports whether the arbitrator seems to be working +The 'validate-all' operation reports whether the parameters are valid + +UEND +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="garbd" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for managing Galera arbitrator. +</longdesc> +<shortdesc lang="en">Manages a galera arbitrator instance</shortdesc> +<parameters> + +<parameter name="binary" unique="0" required="0"> +<longdesc lang="en"> +Location of the Galera arbitrator binary +</longdesc> +<shortdesc lang="en">garbd server binary</shortdesc> +<content type="string" default="${OCF_RESKEY_binary_default}" /> +</parameter> + +<parameter name="user" unique="0" required="0"> +<longdesc lang="en"> +User running the garbd process +</longdesc> +<shortdesc lang="en">garbd user</shortdesc> +<content type="string" default="${OCF_RESKEY_user_default}" /> +</parameter> + +<parameter name="group" unique="0" required="0"> +<longdesc lang="en"> +Group running garbd (for logfile permissions) +</longdesc> +<shortdesc lang="en">garbd group</shortdesc> +<content type="string" default="${OCF_RESKEY_group_default}"/> +</parameter> + +<parameter name="log" unique="0" required="0"> +<longdesc lang="en"> +The logfile to be used for garbd. +</longdesc> +<shortdesc lang="en">Galera arbitrator log file</shortdesc> +<content type="string" default="${OCF_RESKEY_log_default}"/> +</parameter> + +<parameter name="pid" unique="0" required="0"> +<longdesc lang="en"> +The pidfile to be used for garbd. 
+</longdesc> +<shortdesc lang="en">Galera arbitrator pidfile</shortdesc> +<content type="string" default="${OCF_RESKEY_pid_default}"/> +</parameter> + +<parameter name="options" unique="0" required="0"> +<longdesc lang="en"> +Additional parameters which are passed to garbd on startup. +</longdesc> +<shortdesc lang="en">Additional parameters to pass to garbd</shortdesc> +<content type="string" default="${OCF_RESKEY_options_default}"/> +</parameter> + +<parameter name="wsrep_cluster_address" unique="0" required="1"> +<longdesc lang="en"> +The galera cluster address. This takes the form of: +gcomm://node:port,node:port,node:port + +Unlike Galera servers, port is mandatory for garbd. +</longdesc> +<shortdesc lang="en">Galera cluster address</shortdesc> +<content type="string" default="${OCF_RESKEY_wsrep_cluster_address_default}"/> +</parameter> + +<parameter name="wsrep_cluster_name" unique="0" required="1"> +<longdesc lang="en"> +The group name of the Galera cluster to connect to. +</longdesc> +<shortdesc lang="en">Galera cluster name</shortdesc> +<content type="string" default="${OCF_RESKEY_wsrep_cluster_name_default}"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="20s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + + +garbd_start() +{ + local rc + local pid + local start_wait + local garbd_params + + garbd_status info + rc=$? 
+ if [ $rc -eq $OCF_SUCCESS ]; then + ocf_exit_reason "garbd started outside of the cluster's control" + return $OCF_ERR_GENERIC; + fi + + touch $OCF_RESKEY_log + chown $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_log + chmod 0640 $OCF_RESKEY_log + [ -x /sbin/restorecon ] && /sbin/restorecon $OCF_RESKEY_log + + garbd_params="--address=${OCF_RESKEY_wsrep_cluster_address} \ + --group ${OCF_RESKEY_wsrep_cluster_name} \ + --log ${OCF_RESKEY_log}" + + if [ ! -z "${OCF_RESKEY_options}" ]; then + garbd_params="${garbd_params} --options=${OCF_RESKEY_options}" + fi + + # garbd has no parameter to run as a specific user, + # so we need to start it by our own means + pid=$(su - -s /bin/sh $OCF_RESKEY_user -c "${OCF_RESKEY_binary} ${garbd_params} >/dev/null 2>&1 & echo \$!") + + # garbd doesn't create a pidfile either, so we create our own + echo $pid > $OCF_RESKEY_pid + # save the status right away: inside the branch below, $? would be the result of the test + rc=$? + if [ $rc -ne 0 ]; then + ocf_exit_reason "Cannot create pidfile for garbd at $OCF_RESKEY_pid (rc=$rc), please check your installation" + return $OCF_ERR_GENERIC + fi + + # Spin waiting for garbd to connect to the cluster. + # Let the CRM/LRM time us out if required. + start_wait=1 + while [ $start_wait -eq 1 ]; do + garbd_monitor info + rc=$? + if [ $rc -eq $OCF_NOT_RUNNING ]; then + ocf_exit_reason "garbd failed to start (pid=$pid), check logs in ${OCF_RESKEY_log}" + return $OCF_ERR_GENERIC + elif [ $rc -eq $OCF_SUCCESS ]; then + start_wait=0 + fi + sleep 2 + done + + ocf_log info "garbd connected to cluster \"${OCF_RESKEY_wsrep_cluster_name}\"" + return $OCF_SUCCESS +} + +garbd_status() +{ + local loglevel=$1 + local rc + ocf_pidfile_status $OCF_RESKEY_pid + rc=$?
+ + if [ $rc -eq 0 ]; then + return $OCF_SUCCESS + elif [ $rc -eq 2 ]; then + return $OCF_NOT_RUNNING + else + # clean up if pidfile is stale + if [ $rc -eq 1 ]; then + ocf_log $loglevel "garbd not running: removing old PID file" + rm -f $OCF_RESKEY_pid + fi + return $OCF_ERR_GENERIC + fi +} + +_port_by_pid() +{ + local pid + pid="$1" + if have_binary "netstat"; then + netstat -tnp 2>/dev/null | grep -s -q "ESTABLISHED.*${pid}/" + else + ss -Htnp 2>/dev/null | grep -s -q "^ESTAB.*pid=${pid}" + fi +} + +garbd_monitor() +{ + local rc + local pid + local loglevel=$1 + + # Set loglevel to info during probe + if ocf_is_probe; then + loglevel="info" + fi + + garbd_status $loglevel + rc=$? + + # probe just wants to know if garbd is running or not + if ocf_is_probe && [ $rc -ne $OCF_SUCCESS ]; then + rc=$OCF_NOT_RUNNING + fi + + # Consider garbd is working if it's connected to at least + # one node in the galera cluster. + # Note: a Galera node in Non-Primary state will be + # stopped by the galera RA. So we can assume that + # garbd will always be connected to the right partition + if [ $rc -eq $OCF_SUCCESS ]; then + pid=`cat $OCF_RESKEY_pid 2> /dev/null ` + _port_by_pid $pid + if [ $? -ne 0 ]; then + ocf_log $loglevel "garbd disconnected from cluster \"${OCF_RESKEY_wsrep_cluster_name}\"" + rc=$OCF_ERR_GENERIC + fi + fi + + return $rc +} + +garbd_stop() +{ + local rc + local pid + + if [ ! -f $OCF_RESKEY_pid ]; then + ocf_log info "garbd is not running" + return $OCF_SUCCESS + fi + + pid=`cat $OCF_RESKEY_pid 2> /dev/null ` + + ocf_log info "stopping garbd" + + # make sure the process is stopped + ocf_stop_processes TERM 10 $pid + rc=$? + + if [ $rc -ne 0 ]; then + return $OCF_ERR_GENERIC + else + rm -f $OCF_RESKEY_pid + ocf_log info "garbd stopped" + return $OCF_SUCCESS + fi +} + +garbd_validate() +{ + if ! have_binary "$OCF_RESKEY_binary"; then + ocf_exit_reason "Setup problem: couldn't find command: $OCF_RESKEY_binary" + return $OCF_ERR_INSTALLED; + fi + + if ! 
have_binary "netstat"; then + if ! have_binary "ss"; then + ocf_exit_reason "Setup problem: couldn't find command: netstat or ss" + return $OCF_ERR_INSTALLED; + fi + fi + + if [ -z "$OCF_RESKEY_wsrep_cluster_address" ]; then + ocf_exit_reason "garbd must be configured with a wsrep_cluster_address value." + return $OCF_ERR_CONFIGURED + fi + + # unlike galera RA, ports must be set in cluster address for garbd + # https://github.com/codership/galera/issues/98 + for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do + echo $node | grep -s -q ':[1-9][0-9]*$' + if [ $? -ne 0 ]; then + ocf_exit_reason "wsrep_cluster_address must specify ports (gcomm://node1:port,node2:port)." + return $OCF_ERR_CONFIGURED + fi + done + + # Ensure that the encryption method is set if garbd is configured + # to use SSL. + echo $OCF_RESKEY_options | grep -s -q -i -E '\bsocket.ssl_(key|cert)=' + if [ $? -eq 0 ]; then + echo $OCF_RESKEY_options | grep -s -q -i -E '\bsocket.ssl_cipher=' + if [ $? -ne 0 ]; then + ocf_exit_reason "option socket.ssl_cipher must be set if SSL is enabled." + return $OCF_ERR_CONFIGURED + fi + fi + + if [ -z "$OCF_RESKEY_wsrep_cluster_name" ]; then + ocf_exit_reason "garbd must be configured with a wsrep_cluster_name value." + return $OCF_ERR_CONFIGURED + fi + + if ! getent passwd $OCF_RESKEY_user >/dev/null 2>&1; then + ocf_exit_reason "User $OCF_RESKEY_user doesn't exist" + return $OCF_ERR_INSTALLED + fi + + if ! getent group $OCF_RESKEY_group >/dev/null 2>&1; then + ocf_exit_reason "Group $OCF_RESKEY_group doesn't exist" + return $OCF_ERR_INSTALLED + fi + + return $OCF_SUCCESS +} + +case "$1" in + meta-data) meta_data + exit $OCF_SUCCESS;; + usage|help) usage + exit $OCF_SUCCESS;; +esac + +garbd_validate +rc=$? 
+ +# trap configuration errors early, but don't block stop in such cases +LSB_STATUS_STOPPED=3 +if [ $rc -ne 0 ]; then + case "$1" in + stop) exit $OCF_SUCCESS;; + status) exit $LSB_STATUS_STOPPED;; + *) exit $rc;; + esac +fi + +# What kind of method was invoked? +case "$1" in + start) garbd_start;; + stop) garbd_stop;; + status) garbd_status err;; + monitor) garbd_monitor err;; + promote) garbd_promote;; + demote) garbd_demote;; + validate-all) exit $OCF_SUCCESS;; + + *) usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac diff --git a/heartbeat/gcp-ilb b/heartbeat/gcp-ilb new file mode 100755 index 0000000..f84f373 --- /dev/null +++ b/heartbeat/gcp-ilb @@ -0,0 +1,344 @@ +#!/bin/sh +# --------------------------------------------------------------------- +# # Copyright 2021 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Authors: Fatima Silveira, Lucia Subatin +# --------------------------------------------------------------------- +# Description: Wrapper to respond to probe requests from health +# check agents in Google Cloud Platform. Nothing is specific to Google +# Cloud. +# --------------------------------------------------------------------- + + +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_cat_default="socat" +OCF_RESKEY_port_default="60000" +OCF_RESKEY_log_enable_default="false" +OCF_RESKEY_log_cmd_default="gcloud" +OCF_RESKEY_log_params_default="logging write GCPILB" +OCF_RESKEY_log_end_params_default="" + + +if type "socat" > /dev/null 2>&1; then + OCF_RESKEY_cat_default="socat" +else + OCF_RESKEY_cat_default="nc" +fi + + +: ${OCF_RESKEY_cat=${OCF_RESKEY_cat_default}} +: ${OCF_RESKEY_port=${OCF_RESKEY_port_default}} +: ${OCF_RESKEY_log_enable=${OCF_RESKEY_log_enable_default}} +: ${OCF_RESKEY_log_cmd=${OCF_RESKEY_log_cmd_default}} +: ${OCF_RESKEY_log_params=${OCF_RESKEY_log_params_default}} +: ${OCF_RESKEY_log_end_params=${OCF_RESKEY_log_end_params_default}} + + +process="$OCF_RESOURCE_INSTANCE" +pidfile="/var/run/$OCF_RESOURCE_INSTANCE.pid" + + +#Validate command for logging +if ocf_is_true "$OCF_RESKEY_log_enable"; then + if type $OCF_RESKEY_log_cmd > /dev/null 2>&1; then + logging_cmd="$OCF_RESKEY_log_cmd $OCF_RESKEY_log_params" + ocf_log debug "Logging command is: \'$logging_cmd\' " + else + OCF_RESKEY_log_enable="false" + ocf_log err "\'$logging_cmd\' is invalid. External logging disabled." + + fi +fi + + +####################################################################### +ilb_metadata() { +cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="gcp-ilb" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> + Resource Agent that wraps /usr/bin/nc or /usr/bin/socat to reply to health checks in Google Cloud. + See https://cloud.google.com/load-balancing/docs/health-checks for more information. +</longdesc> + +<shortdesc lang="en">Replies to health checks from Google Cloud</shortdesc> + +<parameters> + <parameter name="port"> + <longdesc lang="en"> + Listening port for health-check probes. 
Default: ${OCF_RESKEY_port_default} + </longdesc> + <shortdesc lang="en">Listening port (def ${OCF_RESKEY_port_default}) </shortdesc> + <content type="string" default="${OCF_RESKEY_port_default}" /> + </parameter> + + <parameter name="cat"> + <longdesc lang="en"> + Location of netcat (default: /usr/bin/nc ) or socat (default: /usr/bin/socat ). If present, is used /usr/bin/socat. + The recommended binary is socat, present in the following minimum versions if the package resource-agents: + - SLES 12 SP4/SP5: resource-agents-4.3.018.a7fb5035-3.30.1. + - SLES 15/15 SP1: resource-agents-4.3.0184.6ee15eb2-4.13.1. + </longdesc> + <shortdesc lang="en">Path to nc / socat </shortdesc> + <content type="string" default="${OCF_RESKEY_cat_default}" /> + </parameter> + <parameter name="log_enable"> + <longdesc lang="en"> + Logging with an external application is enabled (accepts "true" or "false"). The defaults are configured to call "gcloud logging" (see: https://cloud.google.com/sdk/gcloud/reference/logging). + </longdesc> + <shortdesc lang="en">Log enabled </shortdesc> + <content type="string" default="${OCF_RESKEY_log_enable_default}" /> + </parameter> + <parameter name="log_cmd"> + <longdesc lang="en"> + External logging command. The defaults are configured to call "gcloud logging" (see: https://cloud.google.com/sdk/gcloud/reference/logging). + This parameter should only have the binary that can be validated (e.g., gcloud). 
The rest of the command is formed with the additional parameters + and the message being logged as follows: + - log_cmd + log_params + "The message being logged" + log_end_params + + Using the gcloud command for Stackdriver logging, the parameters would be: + - log_cmd = gcloud + - log_params = logging write GCPILB + - "This is a message being sent by the app" + - log_end_params = (nothing in this case, this is reserved for use with other tools) + Which would result in this valid command (where GCPILB is the name of the log): + gcloud logging write GCPILB "This is a message being sent by the app" + + NOTE: Only the binary is validated for existence and no further checks are performed. The assumption is that only administrators with root access can configure this tool. + </longdesc> + <shortdesc lang="en">External log command </shortdesc> + <content type="string" default="${OCF_RESKEY_log_cmd_default}" /> + </parameter> + <parameter name="log_params"> + <longdesc lang="en"> + Additional input for the logging application. See explanation for log_cmd + </longdesc> + <shortdesc lang="en">Additional input 1 </shortdesc> + <content type="string" default="${OCF_RESKEY_log_params_default}" /> + </parameter> + <parameter name="log_end_params"> + <longdesc lang="en"> + Additional input for the logging application. Placed after the message being logged. 
+ </longdesc> + <shortdesc lang="en">Additional input 1 </shortdesc> + <content type="string" default="${OCF_RESKEY_log_end_params_default}" /> + </parameter> + +</parameters> + +<actions> + <action name="start" timeout="10s" /> + <action name="stop" timeout="30s" /> + <action name="monitor" depth="0" timeout="30s" interval="30s" /> + <action name="validate-all" timeout="5s" /> + <action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +exit 0 +} + +####################################################################### + +log() { + lvl=$1 + msg=$2 + ocf_log $lvl "$0 - $process - $pid: $msg" + if ocf_is_true "${OCF_RESKEY_log_enable}" ; then + ( ${OCF_RESKEY_log_cmd} ${OCF_RESKEY_log_params} "L $lvl: $msg" ${OCF_RESKEY_log_end_params} ) + fi +} + +getpid() { + if ! [ -f "$pidfile" ] ; then + return + fi + + cat $pidfile +} + +ilb_usage() { + cat <<END +usage: $0 {start|stop|monitor|status|meta-data|validate-all} + +Agent wrapping socat or nc to reply to health probes. +END +} + + + + +ilb_monitor() { + + pid=`getpid` + log debug "pid is $pid" + + if [ -z "$pid" ] ; then + return $OCF_NOT_RUNNING + fi + + if [ -n "$pid" ] && kill -s 0 $pid ; then + log debug "Process is currently running" + return $OCF_SUCCESS + else + log warn "The process is not running but has a pidfile. Removing file" + rm -f $pidfile + return $OCF_NOT_RUNNING + fi + +} + +ilb_start() { + + if ilb_monitor; then + log debug "Process is already running" + return $OCF_SUCCESS + fi + + cmd="$OCF_RESKEY_cat -U TCP-LISTEN:$OCF_RESKEY_port,backlog=10,fork,reuseaddr /dev/null" + + if [ $( basename $OCF_RESKEY_cat ) = 'nc' ]; then + cmd="$OCF_RESKEY_cat -l -k $OCF_RESKEY_port" + fi + + log debug "Starting with \'$cmd\'" + ( ${cmd} ) & pid="$!" + disown + + if [ -n "$pid" ] ; then + log debug "$pid is started" + echo "$pid" > $pidfile + return $OCF_SUCCESS + else + log err "\'$cmd\' could not be started" + return $OCF_ERR_GENERIC + fi + +} + +ilb_stop() { + + if ! 
ilb_monitor; then + rm -f $pidfile + return $OCF_SUCCESS + fi + + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + # Allow 2/3 of the action timeout for the orderly shutdown + # (The origin unit is ms, hence the conversion) + stop_timeout=$((OCF_RESKEY_CRM_meta_timeout/1500)) + else + stop_timeout=10 + fi + + + pid=`getpid` + kill $pid + + i=0 + while [ $i -lt $stop_timeout ]; do + if ! ilb_monitor; then + rm -f $pidfile + return $OCF_SUCCESS + fi + sleep 1 + i=$((i+1)) + done + + log warn "Stop with SIGTERM failed/timed out, now sending SIGKILL." + + i=0 + while [ $i -lt $stop_timeout ]; do + + kill -s 9 $pid + + if ! ilb_monitor; then + log warn "SIGKILL did the job." + rm -f $pidfile + return $OCF_SUCCESS + fi + log info "The job still hasn't stopped yet. Re-trying SIGKILL..." + sleep 2 + i=$((i+2)) + done + + log err "The cat has more than 9 lives and could not be terminated." + return $OCF_ERR_GENERIC + +} + +ilb_validate() { + check_binary "$OCF_RESKEY_cat" + + ocf_is_true "$OCF_RESKEY_log_enable" && check_binary "$OCF_RESKEY_log_cmd" + + if ! ocf_is_decimal "$OCF_RESKEY_port"; then + ocf_exit_reason "$OCF_RESKEY_port is not a valid port" + exit $OCF_ERR_CONFIGURED + fi + + return $OCF_SUCCESS +} + +############################################################################### +# +# MAIN +# +############################################################################### + +case $__OCF_ACTION in + meta-data) + ilb_metadata + exit $OCF_SUCCESS + ;; + usage|help) + ilb_usage + exit $OCF_SUCCESS + ;; +esac + +if ! ocf_is_root; then + log err "You must be root for $__OCF_ACTION operation." + exit $OCF_ERR_PERM +fi + +case $__OCF_ACTION in + start) + ilb_validate + ilb_start + ;; + stop) + ilb_stop + ;; + monitor) + ilb_monitor + ;; + validate-all) + ilb_validate + ;; + *) + ilb_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + +rc=$? 
+log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION returned $rc" + +exit $rc diff --git a/heartbeat/gcp-pd-move.in b/heartbeat/gcp-pd-move.in new file mode 100644 index 0000000..1e81119 --- /dev/null +++ b/heartbeat/gcp-pd-move.in @@ -0,0 +1,382 @@ +#!@PYTHON@ -tt +# - *- coding: utf- 8 - *- +# +# --------------------------------------------------------------------- +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# --------------------------------------------------------------------- +# Description: Google Cloud Platform - Disk attach +# --------------------------------------------------------------------- + +import json +import logging +import os +import re +import sys +import time + +OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT")) +sys.path.append(OCF_FUNCTIONS_DIR) + +import ocf +from ocf import logger + +try: + import googleapiclient.discovery +except ImportError: + pass + +if sys.version_info >= (3, 0): + # Python 3 imports. + import urllib.parse as urlparse + import urllib.request as urlrequest +else: + # Python 2 imports. 
+ import urllib as urlparse + import urllib2 as urlrequest + + +CONN = None +PROJECT = None +ZONE = None +REGION = None +LIST_DISK_ATTACHED_INSTANCES = None +INSTANCE_NAME = None + +PARAMETERS = { + 'disk_name': '', + 'disk_scope': 'detect', + 'disk_csek_file': '', + 'mode': "READ_WRITE", + 'device_name': '', + 'stackdriver_logging': 'no', +} + +MANDATORY_PARAMETERS = ['disk_name', 'disk_scope'] + +METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/' +METADATA_HEADERS = {'Metadata-Flavor': 'Google'} +METADATA = '''<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="gcp-pd-move" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> +Resource Agent that can attach or detach a regional/zonal disk on current GCP +instance. +Requirements : +- Disk has to be properly created as regional/zonal in order to be used +correctly. +</longdesc> +<shortdesc lang="en">Attach/Detach a persistent disk on current GCP instance</shortdesc> +<parameters> +<parameter name="disk_name" unique="1" required="1"> +<longdesc lang="en">The name of the GCP disk.</longdesc> +<shortdesc lang="en">Disk name</shortdesc> +<content type="string" default="{}" /> +</parameter> +<parameter name="disk_scope"> +<longdesc lang="en">Disk scope</longdesc> +<shortdesc lang="en">Network name</shortdesc> +<content type="string" default="{}" /> +</parameter> +<parameter name="disk_csek_file"> +<longdesc lang="en">Path to a Customer-Supplied Encryption Key (CSEK) key file</longdesc> +<shortdesc lang="en">Customer-Supplied Encryption Key file</shortdesc> +<content type="string" default="{}" /> +</parameter> +<parameter name="mode"> +<longdesc lang="en">Attachment mode (READ_WRITE, READ_ONLY)</longdesc> +<shortdesc lang="en">Attachment mode</shortdesc> +<content type="string" default="{}" /> +</parameter> +<parameter name="device_name"> +<longdesc lang="en">An optional name that indicates the disk name the guest operating system will see.</longdesc> 
+<shortdesc lang="en">Optional device name</shortdesc> +<content type="boolean" default="{}" /> +</parameter> +<parameter name="stackdriver_logging"> +<longdesc lang="en">Use stackdriver_logging output to global resource (yes, true, enabled)</longdesc> +<shortdesc lang="en">Use stackdriver_logging</shortdesc> +<content type="string" default="{}" /> +</parameter> +</parameters> +<actions> +<action name="start" timeout="300s" /> +<action name="stop" timeout="15s" /> +<action name="monitor" timeout="15s" interval="10s" depth="0" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent>'''.format(PARAMETERS['disk_name'], PARAMETERS['disk_scope'], + PARAMETERS['disk_csek_file'], PARAMETERS['mode'], PARAMETERS['device_name'], + PARAMETERS['stackdriver_logging']) + + +def get_metadata(metadata_key, params=None, timeout=None): + """Performs a GET request with the metadata headers. + + Args: + metadata_key: string, the metadata to perform a GET request on. + params: dictionary, the query parameters in the GET request. + timeout: int, timeout in seconds for metadata requests. + + Returns: + HTTP response from the GET request. + + Raises: + urlerror.HTTPError: raises when the GET request fails. 
+ """ + timeout = timeout or 60 + metadata_url = os.path.join(METADATA_SERVER, metadata_key) + params = urlparse.urlencode(params or {}) + url = '%s?%s' % (metadata_url, params) + request = urlrequest.Request(url, headers=METADATA_HEADERS) + request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({})) + return request_opener.open(request, timeout=timeout * 1.1).read().decode("utf-8") + + +def populate_vars(): + global CONN + global INSTANCE_NAME + global PROJECT + global ZONE + global REGION + global LIST_DISK_ATTACHED_INSTANCES + + # Populate global vars + try: + CONN = googleapiclient.discovery.build('compute', 'v1') + except Exception as e: + logger.error('Couldn\'t connect with google api: ' + str(e)) + sys.exit(ocf.OCF_ERR_GENERIC) + + for param in PARAMETERS: + value = os.environ.get('OCF_RESKEY_%s' % param, PARAMETERS[param]) + if not value and param in MANDATORY_PARAMETERS: + logger.error('Missing %s mandatory parameter' % param) + sys.exit(ocf.OCF_ERR_CONFIGURED) + elif value: + PARAMETERS[param] = value + + try: + INSTANCE_NAME = get_metadata('instance/name') + except Exception as e: + logger.error( + 'Couldn\'t get instance name, is this running inside GCE?: ' + str(e)) + sys.exit(ocf.OCF_ERR_GENERIC) + + PROJECT = get_metadata('project/project-id') + if PARAMETERS['disk_scope'] in ['detect', 'regional']: + ZONE = get_metadata('instance/zone').split('/')[-1] + REGION = ZONE[:-2] + else: + ZONE = PARAMETERS['disk_scope'] + LIST_DISK_ATTACHED_INSTANCES = get_disk_attached_instances( + PARAMETERS['disk_name']) + + +def configure_logs(): + # Prepare logging + global logger + logging.getLogger('googleapiclient').setLevel(logging.WARN) + logging_env = os.environ.get('OCF_RESKEY_stackdriver_logging') + if logging_env: + logging_env = logging_env.lower() + if any(x in logging_env for x in ['yes', 'true', 'enabled']): + try: + import google.cloud.logging.handlers + client = google.cloud.logging.Client() + handler = 
google.cloud.logging.handlers.CloudLoggingHandler( + client, name=INSTANCE_NAME) + handler.setLevel(logging.INFO) + formatter = logging.Formatter('gcp:alias "%(message)s"') + handler.setFormatter(formatter) + ocf.log.addHandler(handler) + logger = logging.LoggerAdapter( + ocf.log, {'OCF_RESOURCE_INSTANCE': ocf.OCF_RESOURCE_INSTANCE}) + except ImportError: + logger.error('Couldn\'t import google.cloud.logging, ' + 'disabling Stackdriver-logging support') + + +def wait_for_operation(operation): + while True: + result = CONN.zoneOperations().get( + project=PROJECT, + zone=ZONE, + operation=operation['name']).execute() + + if result['status'] == 'DONE': + if 'error' in result: + raise Exception(result['error']) + return + time.sleep(1) + + +def get_disk_attached_instances(disk): + def get_users_list(): + fl = 'name="%s"' % disk + request = CONN.disks().aggregatedList(project=PROJECT, filter=fl) + while request is not None: + response = request.execute() + locations = response.get('items', {}) + for location in locations.values(): + for d in location.get('disks', []): + if d['name'] == disk: + return d.get('users', []) + request = CONN.instances().aggregatedList_next( + previous_request=request, previous_response=response) + raise Exception("Unable to find disk %s" % disk) + + def get_only_instance_name(user): + return re.sub('.*/instances/', '', user) + + return map(get_only_instance_name, get_users_list()) + + +def is_disk_attached(instance): + return instance in LIST_DISK_ATTACHED_INSTANCES + + +def detach_disk(instance, disk_name): + # Python API misses disk-scope argument. 
+ + # Detaching a disk is only possible by using deviceName, which is retrieved + # as a disk parameter when listing the instance information + request = CONN.instances().get( + project=PROJECT, zone=ZONE, instance=instance) + response = request.execute() + + device_name = None + for disk in response['disks']: + if disk_name == re.sub('.*disks/',"",disk['source']): + device_name = disk['deviceName'] + break + + if not device_name: + logger.error("Didn't find %(d)s deviceName attached to %(i)s" % { + 'd': disk_name, + 'i': instance, + }) + return + + request = CONN.instances().detachDisk( + project=PROJECT, zone=ZONE, instance=instance, deviceName=device_name) + wait_for_operation(request.execute()) + + +def attach_disk(instance, disk_name): + location = 'zones/%s' % ZONE + if PARAMETERS['disk_scope'] == 'regional': + location = 'regions/%s' % REGION + + prefix = 'https://www.googleapis.com/compute/v1' + body = { + 'source': '%(prefix)s/projects/%(project)s/%(location)s/disks/%(disk)s' % { + 'prefix': prefix, + 'project': PROJECT, + 'location': location, + 'disk': disk_name, + }, + } + + # Customer-Supplied Encryption Key (CSEK) + if PARAMETERS['disk_csek_file']: + with open(PARAMETERS['disk_csek_file']) as csek_file: + body['diskEncryptionKey'] = { + 'rawKey': csek_file.read(), + } + + if PARAMETERS['device_name']: + body['deviceName'] = PARAMETERS['device_name'] + + if PARAMETERS['mode']: + body['mode'] = PARAMETERS['mode'] + + force_attach = None + if PARAMETERS['disk_scope'] == 'regional': + # Python API misses disk-scope argument. + force_attach = True + else: + # If this disk is attached to some instance, detach it first. 
+ for other_instance in LIST_DISK_ATTACHED_INSTANCES: + logger.info("Detaching disk %(disk_name)s from other instance %(i)s" % { + 'disk_name': PARAMETERS['disk_name'], + 'i': other_instance, + }) + detach_disk(other_instance, PARAMETERS['disk_name']) + + request = CONN.instances().attachDisk( + project=PROJECT, zone=ZONE, instance=instance, body=body, + forceAttach=force_attach) + wait_for_operation(request.execute()) + + +def fetch_data(): + configure_logs() + populate_vars() + + +def gcp_pd_move_start(): + fetch_data() + if not is_disk_attached(INSTANCE_NAME): + logger.info("Attaching disk %(disk_name)s to %(instance)s" % { + 'disk_name': PARAMETERS['disk_name'], + 'instance': INSTANCE_NAME, + }) + attach_disk(INSTANCE_NAME, PARAMETERS['disk_name']) + + +def gcp_pd_move_stop(): + fetch_data() + if is_disk_attached(INSTANCE_NAME): + logger.info("Detaching disk %(disk_name)s to %(instance)s" % { + 'disk_name': PARAMETERS['disk_name'], + 'instance': INSTANCE_NAME, + }) + detach_disk(INSTANCE_NAME, PARAMETERS['disk_name']) + + +def gcp_pd_move_status(): + fetch_data() + if is_disk_attached(INSTANCE_NAME): + logger.debug("Disk %(disk_name)s is correctly attached to %(instance)s" % { + 'disk_name': PARAMETERS['disk_name'], + 'instance': INSTANCE_NAME, + }) + else: + sys.exit(ocf.OCF_NOT_RUNNING) + + +def main(): + if len(sys.argv) < 2: + logger.error('Missing argument') + return + + command = sys.argv[1] + if 'meta-data' in command: + print(METADATA) + return + + if command in 'start': + gcp_pd_move_start() + elif command in 'stop': + gcp_pd_move_stop() + elif command in ('monitor', 'status'): + gcp_pd_move_status() + else: + configure_logs() + logger.error('no such function %s' % str(command)) + + +if __name__ == "__main__": + main() diff --git a/heartbeat/gcp-vpc-move-ip.in b/heartbeat/gcp-vpc-move-ip.in new file mode 100755 index 0000000..2e63b2b --- /dev/null +++ b/heartbeat/gcp-vpc-move-ip.in @@ -0,0 +1,374 @@ +#!@BASH_SHELL@ +# +# +# OCF resource agent to move 
an IP address within a VPC in GCP +# +# License: GNU General Public License (GPL) +# Copyright (c) 2018 Hervé Werner (MFG Labs) +# Based on code from Markus Guertler (aws-vpc-move-ip) +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_gcloud_default="/usr/bin/gcloud" +OCF_RESKEY_configuration_default="default" +OCF_RESKEY_vpc_network_default="default" +OCF_RESKEY_interface_default="eth0" +OCF_RESKEY_route_name_default="ra-${__SCRIPT_NAME}" + +: ${OCF_RESKEY_gcloud=${OCF_RESKEY_gcloud_default}} +: ${OCF_RESKEY_configuration=${OCF_RESKEY_configuration_default}} +: ${OCF_RESKEY_vpc_network=${OCF_RESKEY_vpc_network_default}} +: ${OCF_RESKEY_interface=${OCF_RESKEY_interface_default}} +: ${OCF_RESKEY_route_name=${OCF_RESKEY_route_name_default}} + +gcp_api_url_prefix="https://www.googleapis.com/compute/v1" +gcloud="${OCF_RESKEY_gcloud} --quiet --configuration=${OCF_RESKEY_configuration}" + +####################################################################### + +USAGE="usage: $0 {start|stop|monitor|status|meta-data|validate-all}"; +############################################################################### + + +############################################################################### +# +# Functions +# +############################################################################### + + +metadata() { +cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="gcp-vpc-move-ip" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> +Resource Agent that can move a floating IP addresse within a GCP VPC by changing an +entry in the routing table. This agent also configures the floating IP locally +on the instance OS. 
+Requirements : +- IP forwarding must be enabled on all instances in order to be able to +terminate the route +- The floating IP address must be chosen so that it is outside all existing +subnets in the VPC network +- IAM permissions +(see https://cloud.google.com/compute/docs/access/iam-permissions) : +1) compute.routes.delete, compute.routes.get and compute.routes.update on the +route +2) compute.networks.updatePolicy on the network (to add a new route) +3) compute.networks.get on the network (to check the VPC network existence) +4) compute.routes.list on the project (to check conflicting routes) +</longdesc> +<shortdesc lang="en">Move IP within a GCP VPC</shortdesc> + +<parameters> +<parameter name="gcloud"> +<longdesc lang="en"> +Path to command line tools for GCP +</longdesc> +<shortdesc lang="en">Path to the gcloud tool</shortdesc> +<content type="string" default="${OCF_RESKEY_gcloud_default}" /> +</parameter> + +<parameter name="configuration"> +<longdesc lang="en"> +Named configuration for gcloud +</longdesc> +<shortdesc lang="en">Named gcloud configuration</shortdesc> +<content type="string" default="${OCF_RESKEY_configuration_default}" /> +</parameter> + +<parameter name="ip" unique="1" required="1"> +<longdesc lang="en"> +Floating IP address. 
Note that this IP must be chosen outside of all existing +subnet ranges +</longdesc> +<shortdesc lang="en">Floating IP</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="vpc_network" required="1"> +<longdesc lang="en"> +Name of the VPC network +</longdesc> +<shortdesc lang="en">VPC network</shortdesc> +<content type="string" default="${OCF_RESKEY_vpc_network_default}" /> +</parameter> + +<parameter name="interface"> +<longdesc lang="en"> +Name of the network interface +</longdesc> +<shortdesc lang="en">Network interface name</shortdesc> +<content type="string" default="${OCF_RESKEY_interface_default}" /> +</parameter> + +<parameter name="route_name" unique="1"> +<longdesc lang="en"> +Route name +</longdesc> +<shortdesc lang="en">Route name</shortdesc> +<content type="string" default="${OCF_RESKEY_route_name_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="180s" /> +<action name="stop" timeout="180s" /> +<action name="monitor" depth="0" timeout="30s" interval="60s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +validate() { + if ! 
ocf_is_root; then + ocf_exit_reason "You must run this agent as root" + exit $OCF_ERR_PERM + fi + + for cmd in ${OCF_RESKEY_gcloud} ip curl; do + check_binary "$cmd" + done + + if [ -z "${OCF_RESKEY_ip}" ]; then + ocf_exit_reason "Missing mandatory parameter" + exit $OCF_ERR_CONFIGURED + fi + + GCE_INSTANCE_NAME=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/name") + GCE_INSTANCE_ZONE=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/zone" | awk -F '/' '{ print $NF }') + GCE_INSTANCE_PROJECT=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/project/project-id") + + if [ -z "${GCE_INSTANCE_NAME}" -o -z "${GCE_INSTANCE_ZONE}" -o -z "${GCE_INSTANCE_PROJECT}" ]; then + ocf_exit_reason "Instance information not found. Is this a GCE instance ?" + exit $OCF_ERR_GENERIC + fi + + if ! ${OCF_RESKEY_gcloud} config configurations describe ${OCF_RESKEY_configuration} &>/dev/null; then + ocf_exit_reason "Gcloud configuration not found" + exit $OCF_ERR_CONFIGURED + fi + + if ! ip link show ${OCF_RESKEY_interface} &> /dev/null; then + ocf_exit_reason "Network interface not found" + exit $OCF_ERR_CONFIGURED + fi + + return $OCF_SUCCESS +} + +check_conflicting_routes() { + cmd="${gcloud} compute routes list \ + --filter='destRange:${OCF_RESKEY_ip} AND \ + network=(${gcp_api_url_prefix}/projects/${GCE_INSTANCE_PROJECT}/global/networks/${OCF_RESKEY_vpc_network}) AND \ + NOT name=${OCF_RESKEY_route_name}' \ + --format='value[terminator=\" \"](name)'" + ocf_log debug "Executing command: $(echo $cmd)" + route_list=$(eval ${cmd}) + if [ $? 
-ne 0 ]; then + exit $OCF_ERR_GENERIC + fi + if [ -n "${route_list}" ]; then + ocf_exit_reason "Conflicting unnmanaged routes for destination ${OCF_RESKEY_ip}/32 in VPC ${OCF_RESKEY_vpc_network} found : ${route_list}" + exit $OCF_ERR_CONFIGURED + fi + return $OCF_SUCCESS +} + +route_monitor() { + ocf_log info "GCP route monitor: checking route table" + + # Ensure that there is no route that we are not aware of that is also handling our IP + check_conflicting_routes + + cmd="${gcloud} compute routes describe ${OCF_RESKEY_route_name} --format='get(nextHopInstance)'" + ocf_log debug "Executing command: $cmd" + # Also redirect stderr as we parse the output to use an appropriate exit code + routed_to_instance=$(eval $cmd 2>&1) + if [ $? -ne 0 ]; then + if echo $routed_to_instance | grep -qi "Insufficient Permission" ; then + ocf_exit_reason "Insufficient permissions to get route information" + exit $OCF_ERR_PERM + elif echo $routed_to_instance | grep -qi "Could not fetch resource"; then + ocf_log debug "The route ${OCF_RESKEY_route_name} doesn't exist" + return $OCF_NOT_RUNNING + else + ocf_exit_reason "Error : ${routed_to_instance}" + exit $OCF_ERR_GENERIC + fi + fi + if [ -z "${routed_to_instance}" ]; then + routed_to_instance="<unknown>" + fi + + if [ "${routed_to_instance}" != "${gcp_api_url_prefix}/projects/${GCE_INSTANCE_PROJECT}/zones/${GCE_INSTANCE_ZONE}/instances/${GCE_INSTANCE_NAME}" ]; then + ocf_log warn "The floating IP ${OCF_RESKEY_ip} is not routed to this instance (${GCE_INSTANCE_NAME}) but to instance ${routed_to_instance##*/}" + return $OCF_NOT_RUNNING + fi + + ocf_log debug "The floating IP ${OCF_RESKEY_ip} is correctly routed to this instance (${GCE_INSTANCE_NAME})" + return $OCF_SUCCESS +} + +ip_monitor() { + ocf_log info "IP monitor: checking local network configuration" + + cmd="ip address show dev ${OCF_RESKEY_interface} to ${OCF_RESKEY_ip}/32" + ocf_log debug "Executing command: $cmd" + if [ -z "$($cmd)" ]; then + ocf_log warn "The floating IP 
${OCF_RESKEY_ip} is not locally configured on this instance (${GCE_INSTANCE_NAME})" + return $OCF_NOT_RUNNING + fi + + ocf_log debug "The floating IP ${OCF_RESKEY_ip} is correctly configured on this instance (${GCE_INSTANCE_NAME})" + return $OCF_SUCCESS +} + +ip_release() { + cmd="ip address delete ${OCF_RESKEY_ip}/32 dev ${OCF_RESKEY_interface}" + ocf_log debug "Executing command: $cmd" + ocf_run $cmd || return $OCF_ERR_GENERIC + return $OCF_SUCCESS +} + +route_release() { + cmd="${gcloud} compute routes delete ${OCF_RESKEY_route_name}" + ocf_log debug "Executing command: $cmd" + ocf_run $cmd || return $OCF_ERR_GENERIC + return $OCF_SUCCESS +} + +ip_and_route_start() { + ocf_log info "Bringing up the floating IP ${OCF_RESKEY_ip}" + + # Add a new entry in the routing table + # If the route entry exists and is pointing to another instance, take it over + + # Ensure that there is no route that we are not aware of that is also handling our IP + check_conflicting_routes + + # There is no replace API, We need to first delete the existing route if any + if ${gcloud} compute routes describe ${OCF_RESKEY_route_name} &>/dev/null; then + route_release + fi + + cmd="${gcloud} compute routes create ${OCF_RESKEY_route_name} \ + --network=${OCF_RESKEY_vpc_network} --destination-range=${OCF_RESKEY_ip}/32 \ + --next-hop-instance-zone=${GCE_INSTANCE_ZONE} --next-hop-instance=${GCE_INSTANCE_NAME}" + ocf_log debug "Executing command: $(echo $cmd)" + ocf_run $cmd + + if [ $? -ne $OCF_SUCCESS ]; then + if ! ${gcloud} compute networks describe ${OCF_RESKEY_vpc_network} &>/dev/null; then + ocf_exit_reason "VPC network not found" + exit $OCF_ERR_CONFIGURED + else + return $OCF_ERR_GENERIC + fi + fi + + # Configure the IP address locally + # We need to release the IP first + ip_monitor &>/dev/null + if [ $? 
-eq $OCF_SUCCESS ]; then + ip_release + fi + + cmd="ip address add ${OCF_RESKEY_ip}/32 dev ${OCF_RESKEY_interface}" + ocf_log debug "Executing command: $cmd" + ocf_run $cmd || return $OCF_ERR_GENERIC + + cmd="ip link set ${OCF_RESKEY_interface} up" + ocf_log debug "Executing command: $cmd" + ocf_run $cmd || return $OCF_ERR_GENERIC + + ocf_log info "Successfully brought up the floating IP ${OCF_RESKEY_ip}" + return $OCF_SUCCESS +} + +ip_and_route_stop() { + ocf_log info "Bringing down the floating IP ${OCF_RESKEY_ip}" + + # Delete the route entry + # If the route entry exists and is pointing to another instance, don't touch it + route_monitor &>/dev/null + if [ $? -eq $OCF_NOT_RUNNING ]; then + ocf_log info "The floating IP ${OCF_RESKEY_ip} is already not routed to this instance (${GCE_INSTANCE_NAME})" + else + route_release + fi + + # Delete the local IP address + ip_monitor &>/dev/null + if [ $? -eq $OCF_NOT_RUNNING ]; then + ocf_log info "The floating IP ${OCF_RESKEY_ip} is already down" + else + ip_release + fi + + ocf_log info "Successfully brought down the floating IP ${OCF_RESKEY_ip}" + return $OCF_SUCCESS +} + + +############################################################################### +# +# MAIN +# +############################################################################### + +ocf_log warn "gcp-vpc-move-ip is deprecated, prefer to use gcp-vpc-move-route instead" + +case $__OCF_ACTION in + meta-data) metadata + exit $OCF_SUCCESS + ;; + usage|help) echo $USAGE + exit $OCF_SUCCESS + ;; +esac + +validate || exit $? + +case $__OCF_ACTION in + start) ip_and_route_start;; + stop) ip_and_route_stop;; + monitor|status) route_monitor || exit $? + ip_monitor || exit $? 
+ ;; + validate-all) ;; + *) echo $USAGE + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac diff --git a/heartbeat/gcp-vpc-move-route.in b/heartbeat/gcp-vpc-move-route.in new file mode 100644 index 0000000..3f543fe --- /dev/null +++ b/heartbeat/gcp-vpc-move-route.in @@ -0,0 +1,490 @@ +#!@PYTHON@ -tt +# - *- coding: utf- 8 - *- +# +# +# OCF resource agent to move an IP address within a VPC in GCP +# +# License: GNU General Public License (GPL) +# Copyright (c) 2018 Hervé Werner (MFG Labs) +# Copyright 2018 Google Inc. +# Based on code from Markus Guertler (aws-vpc-move-ip) +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+# + + +####################################################################### + +import atexit +import logging +import os +import sys +import time + +OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT")) +sys.path.append(OCF_FUNCTIONS_DIR) + +from ocf import * + +try: + import googleapiclient.discovery + import pyroute2 + try: + from google.oauth2.service_account import Credentials as ServiceAccountCredentials + except ImportError: + from oauth2client.service_account import ServiceAccountCredentials +except ImportError: + pass + +if sys.version_info >= (3, 0): + # Python 3 imports. + import urllib.parse as urlparse + import urllib.request as urlrequest +else: + # Python 2 imports. + import urllib as urlparse + import urllib2 as urlrequest + + +GCP_API_URL_PREFIX = 'https://www.googleapis.com/compute/v1' +METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/' +METADATA_HEADERS = {'Metadata-Flavor': 'Google'} +METADATA = \ +'''<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="gcp-vpc-move-route" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> +Resource Agent that can move a floating IP addresse within a GCP VPC by changing an +entry in the routing table. This agent also configures the floating IP locally +on the instance OS. 
+Requirements : +- IP forwarding must be enabled on all instances in order to be able to +terminate the route +- The floating IP address must be chosen so that it is outside all existing +subnets in the VPC network +- IAM permissions +(see https://cloud.google.com/compute/docs/access/iam-permissions) : +1) compute.routes.delete, compute.routes.get and compute.routes.update on the +route +2) compute.networks.updatePolicy on the network (to add a new route) +3) compute.networks.get on the network (to check the VPC network existence) +4) compute.routes.list on the project (to check conflicting routes) +</longdesc> +<shortdesc lang="en">Move IP within a GCP VPC</shortdesc> + +<parameters> + +<parameter name="ip" unique="1" required="1"> +<longdesc lang="en"> +Floating IP address. Note that this IP must be chosen outside of all existing +subnet ranges +</longdesc> +<shortdesc lang="en">Floating IP</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="vpc_network" required="0"> +<longdesc lang="en"> +Name of the VPC network +</longdesc> +<shortdesc lang="en">VPC network</shortdesc> +<content type="string" default="default" /> +</parameter> + +<parameter name="project"> +<longdesc lang="en"> +Project ID of the instance. It can be useful to set this attribute if +the instance is in a shared service project. Otherwise, the agent should +be able to determine the project ID automatically. 
+</longdesc> +<shortdesc lang="en">Project ID</shortdesc> +<content type="string" default="default" /> +</parameter> + +<parameter name="interface"> +<longdesc lang="en"> +Name of the network interface +</longdesc> +<shortdesc lang="en">Network interface name</shortdesc> +<content type="string" default="eth0" /> +</parameter> + +<parameter name="route_name" unique="1"> +<longdesc lang="en"> +Route name +</longdesc> +<shortdesc lang="en">Route name</shortdesc> +<content type="string" default="ra-%s" /> +</parameter> + +<parameter name="serviceaccount"> +<longdesc lang="en">Path to Service account JSON file</longdesc> +<shortdesc lang="en">Service account JSONfile</shortdesc> +<content type="string" default="" /> +</parameter> + +<parameter name="stackdriver_logging" unique="0" required="0"> +<longdesc lang="en">If enabled (set to true), IP failover logs will be posted to stackdriver logging</longdesc> +<shortdesc lang="en">Stackdriver-logging support</shortdesc> +<content type="boolean" default="" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="180s" /> +<action name="stop" timeout="180s" /> +<action name="monitor" depth="0" timeout="30s" interval="60s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +''' % os.path.basename(sys.argv[0]) + + +class Context(object): + __slots__ = 'conn', 'iface_idx', 'instance', 'instance_url', 'interface', \ + 'ip', 'iproute', 'project', 'route_name', 'vpc_network', \ + 'vpc_network_url', 'zone' + + +def wait_for_operation(ctx, response): + """Blocks until operation completes. 
+ Code from GitHub's GoogleCloudPlatform/python-docs-samples + + Args: + response: dict, a request's response + """ + def _OperationGetter(response): + operation = response[u'name'] + if response.get(u'zone'): + return ctx.conn.zoneOperations().get( + project=ctx.project, zone=ctx.zone, operation=operation) + else: + return ctx.conn.globalOperations().get( + project=ctx.project, operation=operation) + + while True: + result = _OperationGetter(response).execute() + + if result['status'] == 'DONE': + if 'error' in result: + raise Exception(result['error']) + return result + + time.sleep(1) + + +def get_metadata(metadata_key, params=None, timeout=None): + """Performs a GET request with the metadata headers. + + Args: + metadata_key: string, the metadata to perform a GET request on. + params: dictionary, the query parameters in the GET request. + timeout: int, timeout in seconds for metadata requests. + + Returns: + HTTP response from the GET request. + + Raises: + urlerror.HTTPError: raises when the GET request fails. 
+ """ + timeout = timeout or 60 + metadata_url = os.path.join(METADATA_SERVER, metadata_key) + params = urlparse.urlencode(params or {}) + url = '%s?%s' % (metadata_url, params) + request = urlrequest.Request(url, headers=METADATA_HEADERS) + request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({})) + return request_opener.open(request, timeout=timeout * 1.1).read().decode("utf-8") + + +def validate(ctx): + if os.geteuid() != 0: + logger.error('You must run this agent as root') + sys.exit(OCF_ERR_PERM) + + try: + serviceaccount = os.environ.get("OCF_RESKEY_serviceaccount") + if not serviceaccount: + try: + from googleapiclient import _auth + credentials = _auth.default_credentials(); + except: + credentials = GoogleCredentials.get_application_default() + logging.debug("using application default credentials") + else: + scope = ['https://www.googleapis.com/auth/cloud-platform'] + logging.debug("using credentials from service account") + try: + credentials = ServiceAccountCredentials.from_service_account_file(filename=serviceaccount, scopes=scope) + except AttributeError: + credentials = ServiceAccountCredentials.from_json_keyfile_name(serviceaccount, scope) + except Exception as e: + logging.error(str(e)) + sys.exit(OCF_ERR_GENERIC) + ctx.conn = googleapiclient.discovery.build('compute', 'v1', credentials=credentials, cache_discovery=False) + except Exception as e: + logger.error('Couldn\'t connect with google api: ' + str(e)) + sys.exit(OCF_ERR_GENERIC) + + ctx.ip = os.environ.get('OCF_RESKEY_ip') + if not ctx.ip: + logger.error('Missing ip parameter') + sys.exit(OCF_ERR_CONFIGURED) + + try: + ctx.instance = get_metadata('instance/name') + ctx.zone = get_metadata('instance/zone').split('/')[-1] + ctx.project = os.environ.get( + 'OCF_RESKEY_project', get_metadata('project/project-id')) + except Exception as e: + logger.error( + 'Instance information not found. 
Is this a GCE instance ?: %s', str(e)) + sys.exit(OCF_ERR_GENERIC) + + ctx.instance_url = '%s/projects/%s/zones/%s/instances/%s' % ( + GCP_API_URL_PREFIX, ctx.project, ctx.zone, ctx.instance) + ctx.vpc_network = os.environ.get('OCF_RESKEY_vpc_network', 'default') + ctx.vpc_network_url = '%s/projects/%s/global/networks/%s' % ( + GCP_API_URL_PREFIX, ctx.project, ctx.vpc_network) + ctx.interface = os.environ.get('OCF_RESKEY_interface', 'eth0') + ctx.route_name = os.environ.get( + 'OCF_RESKEY_route_name', 'ra-%s' % os.path.basename(sys.argv[0])) + ctx.iproute = pyroute2.IPRoute() + atexit.register(ctx.iproute.close) + idxs = ctx.iproute.link_lookup(ifname=ctx.interface) + if not idxs: + logger.error('Network interface not found') + sys.exit(OCF_ERR_GENERIC) + ctx.iface_idx = idxs[0] + + +def check_conflicting_routes(ctx): + fl = '(destRange = "%s*") AND (network = "%s") AND (name != "%s")' % ( + ctx.ip, ctx.vpc_network_url, ctx.route_name) + try: + request = ctx.conn.routes().list(project=ctx.project, filter=fl) + response = request.execute() + except googleapiclient.errors.HttpError as e: + if e.resp.status == 404: + logger.error('VPC network not found') + if 'stop' in sys.argv[1]: + sys.exit(OCF_SUCCESS) + else: + sys.exit(OCF_ERR_CONFIGURED) + else: + raise + + route_list = response.get('items', None) + if route_list: + logger.error( + 'Conflicting unnmanaged routes for destination %s/32 in VPC %s found : %s', + ctx.ip, ctx.vpc_network, str(route_list)) + sys.exit(OCF_ERR_CONFIGURED) + + +def route_release(ctx): + request = ctx.conn.routes().delete(project=ctx.project, route=ctx.route_name) + wait_for_operation(ctx, request.execute()) + + +def ip_monitor(ctx): + logger.info('IP monitor: checking local network configuration') + + def address_filter(addr): + for attr in addr['attrs']: + if attr[0] == 'IFA_LOCAL': + if attr[1] == ctx.ip: + return True + else: + return False + + route = ctx.iproute.get_addr( + index=ctx.iface_idx, match=address_filter) + if not route: + 
logger.warning( + 'The floating IP %s is not locally configured on this instance (%s)', + ctx.ip, ctx.instance) + return OCF_NOT_RUNNING + + logger.debug( + 'The floating IP %s is correctly configured on this instance (%s)', + ctx.ip, ctx.instance) + return OCF_SUCCESS + + +def ip_release(ctx): + ctx.iproute.addr('del', index=ctx.iface_idx, address=ctx.ip, mask=32) + + +def ip_and_route_start(ctx): + logger.info('Bringing up the floating IP %s', ctx.ip) + + # Add a new entry in the routing table + # If the route entry exists and is pointing to another instance, take it over + + # Ensure that there is no route that we are not aware of that is also handling our IP + check_conflicting_routes(ctx) + + # There is no replace API, We need to first delete the existing route if any + try: + request = ctx.conn.routes().get(project=ctx.project, route=ctx.route_name) + request.execute() + # TODO: check specific exception for 404 + except googleapiclient.errors.HttpError as e: + if e.resp.status != 404: + raise + else: + route_release(ctx) + + route_body = { + 'name': ctx.route_name, + 'network': ctx.vpc_network_url, + 'destRange': '%s/32' % ctx.ip, + 'nextHopInstance': ctx.instance_url, + } + try: + request = ctx.conn.routes().insert(project=ctx.project, body=route_body) + wait_for_operation(ctx, request.execute()) + except googleapiclient.errors.HttpError: + try: + request = ctx.conn.networks().get( + project=ctx.project, network=ctx.vpc_network) + request.execute() + except googleapiclient.errors.HttpError as e: + if e.resp.status == 404: + logger.error('VPC network not found') + sys.exit(OCF_ERR_CONFIGURED) + else: + raise + else: + raise + + # Configure the IP address locally + # We need to release the IP first + if ip_monitor(ctx) == OCF_SUCCESS: + ip_release(ctx) + + ctx.iproute.addr('add', index=ctx.iface_idx, address=ctx.ip, mask=32) + ctx.iproute.link('set', index=ctx.iface_idx, state='up') + logger.info('Successfully brought up the floating IP %s', ctx.ip) + + +def 
route_monitor(ctx): + logger.info('GCP route monitor: checking route table') + + # Ensure that there is no route that we are not aware of that is also handling our IP + check_conflicting_routes(ctx) + + try: + request = ctx.conn.routes().get(project=ctx.project, route=ctx.route_name) + response = request.execute() + except googleapiclient.errors.HttpError as e: + if e.resp.status == 404: + return OCF_NOT_RUNNING + elif 'Insufficient Permission' in e.content.decode('utf-8', 'replace'):  # HttpError.content is bytes; a str-in-bytes test raises TypeError on Python 3 + return OCF_ERR_PERM + else: + raise + + routed_to_instance = response.get('nextHopInstance', '<unknown>') + instance_url = '%s/projects/%s/zones/%s/instances/%s' % ( + GCP_API_URL_PREFIX, ctx.project, ctx.zone, ctx.instance) + if routed_to_instance != instance_url: + logger.warning( + 'The floating IP %s is not routed to this instance (%s) but to instance %s', + ctx.ip, ctx.instance, routed_to_instance.split('/')[-1]) + return OCF_NOT_RUNNING + + logger.debug( + 'The floating IP %s is correctly routed to this instance (%s)', + ctx.ip, ctx.instance) + return OCF_SUCCESS + + +def ip_and_route_stop(ctx): + logger.info('Bringing down the floating IP %s', ctx.ip) + + # Delete the route entry + # If the route entry exists and is pointing to another instance, don't touch it + if route_monitor(ctx) == OCF_NOT_RUNNING: + logger.info( + 'The floating IP %s is already not routed to this instance (%s)', + ctx.ip, ctx.instance) + else: + route_release(ctx) + + if ip_monitor(ctx) == OCF_NOT_RUNNING: + logger.info('The floating IP %s is already down', ctx.ip) + else: + ip_release(ctx) + + +def configure_logs(ctx): + # Prepare logging + global logger + logging.getLogger('googleapiclient').setLevel(logging.WARN) + logging_env = os.environ.get('OCF_RESKEY_stackdriver_logging') + if logging_env: + logging_env = logging_env.lower() + if any(x in logging_env for x in ['yes', 'true', 'enabled']): + try: + import google.cloud.logging.handlers + client = google.cloud.logging.Client() + handler = 
google.cloud.logging.handlers.CloudLoggingHandler( + client, name=ctx.instance) + handler.setLevel(logging.INFO) + formatter = logging.Formatter('gcp:route "%(message)s"') + handler.setFormatter(formatter) + log.addHandler(handler) + logger = logging.LoggerAdapter(log, {'OCF_RESOURCE_INSTANCE': OCF_RESOURCE_INSTANCE}) + except ImportError: + logger.error('Couldn\'t import google.cloud.logging, ' + 'disabling Stackdriver-logging support') + + +def main(): + if 'meta-data' in sys.argv[1]: + print(METADATA) + return + + ctx = Context() + + validate(ctx) + if 'validate-all' in sys.argv[1]: + return + + configure_logs(ctx) + if 'start' in sys.argv[1]: + ip_and_route_start(ctx) + elif 'stop' in sys.argv[1]: + ip_and_route_stop(ctx) + elif 'status' in sys.argv[1] or 'monitor' in sys.argv[1]: + sys.exit(ip_monitor(ctx)) + else: + usage = 'usage: %s {start|stop|monitor|status|meta-data|validate-all}' % \ + os.path.basename(sys.argv[0]) + logger.error(usage) + sys.exit(OCF_ERR_UNIMPLEMENTED) + + +if __name__ == "__main__": + main() diff --git a/heartbeat/gcp-vpc-move-vip.in b/heartbeat/gcp-vpc-move-vip.in new file mode 100755 index 0000000..331226b --- /dev/null +++ b/heartbeat/gcp-vpc-move-vip.in @@ -0,0 +1,466 @@ +#!@PYTHON@ -tt +# --------------------------------------------------------------------- +# Copyright 2016 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# --------------------------------------------------------------------- +# Description: Google Cloud Platform - Floating IP Address (Alias) +# --------------------------------------------------------------------- + +import json +import logging +import os +import sys +import time + +OCF_FUNCTIONS_DIR = os.environ.get("OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" + % os.environ.get("OCF_ROOT")) +sys.path.append(OCF_FUNCTIONS_DIR) + +from ocf import * + +try: + import googleapiclient.discovery + try: + from google.oauth2.service_account import Credentials as ServiceAccountCredentials + except ImportError: + from oauth2client.service_account import ServiceAccountCredentials +except ImportError: + pass + +if sys.version_info >= (3, 0): + # Python 3 imports. + import urllib.parse as urlparse + import urllib.request as urlrequest +else: + # Python 2 imports. + import urllib as urlparse + import urllib2 as urlrequest + + +# Constants for alias add/remove modes +ADD = 0 +REMOVE = 1 + +CONN = None +THIS_VM = None +ALIAS = None +MAX_RETRIES = 3 +RETRY_BACKOFF_SECS = 1 +METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/' +METADATA_HEADERS = {'Metadata-Flavor': 'Google'} +METADATA = \ +'''<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="gcp-vpc-move-vip" version="1.0"> + <version>1.0</version> + <longdesc lang="en">Floating IP Address or Range on Google Cloud Platform - Using Alias IP address functionality to attach a secondary IP range to a running instance</longdesc> + <shortdesc lang="en">Floating IP Address or Range on Google Cloud Platform</shortdesc> + <parameters> + <parameter name="alias_ip" unique="1" required="1"> + <longdesc lang="en">IP range to be added including CIDR netmask (e.g., 192.168.0.1/32)</longdesc> + <shortdesc lang="en">IP range to be added including CIDR netmask (e.g., 192.168.0.1/32)</shortdesc> + <content type="string" default="" /> + </parameter> + <parameter name="alias_range_name" 
unique="0" required="0"> + <longdesc lang="en">Subnet name for the Alias IP</longdesc> + <shortdesc lang="en">Subnet name for the Alias IP</shortdesc> + <content type="string" default="" /> + </parameter> + <parameter name="hostlist" unique="0" required="0"> + <longdesc lang="en">List of hosts in the cluster, separated by spaces</longdesc> + <shortdesc lang="en">Host list</shortdesc> + <content type="string" default="" /> + </parameter> + <parameter name="project" unique="0" required="0"> + <longdesc lang="en"> + Project ID of the instance. It can be useful to set this + attribute if the instance is in a shared service project. + Otherwise, the agent should be able to determine the project ID + automatically. + </longdesc> + <shortdesc lang="en">Project ID</shortdesc> + <content type="string" default="default" /> + </parameter> + <parameter name="serviceaccount"> + <longdesc lang="en">Path to Service account JSON file</longdesc> + <shortdesc lang="en">Service account JSONfile</shortdesc> + <content type="string" default="" /> + </parameter> + <parameter name="stackdriver_logging" unique="0" required="0"> + <longdesc lang="en">If enabled (set to true), IP failover logs will be posted to stackdriver logging</longdesc> + <shortdesc lang="en">Stackdriver-logging support</shortdesc> + <content type="boolean" default="" /> + </parameter> + </parameters> + <actions> + <action name="start" timeout="300s" /> + <action name="stop" timeout="15s" /> + <action name="monitor" timeout="15s" interval="60s" depth="0" /> + <action name="meta-data" timeout="15s" /> + <action name="validate-all" timeout="15s" /> + </actions> +</resource-agent>''' + + +def get_metadata(metadata_key, params=None, timeout=None): + """Performs a GET request with the metadata headers. + + Args: + metadata_key: string, the metadata to perform a GET request on. + params: dictionary, the query parameters in the GET request. + timeout: int, timeout in seconds for metadata requests. 
+ + Returns: + HTTP response from the GET request. + """ + for i in range(MAX_RETRIES): + try: + timeout = timeout or 60 + metadata_url = os.path.join(METADATA_SERVER, metadata_key) + query = urlparse.urlencode(params or {})  # keep params untouched so a retry re-encodes the original dict, not an already-encoded string + url = '%s?%s' % (metadata_url, query) + request = urlrequest.Request(url, headers=METADATA_HEADERS) + request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({})) + return request_opener.open( + request, timeout=timeout * 1.1).read().decode("utf-8") + except Exception as e: + logger.error('Couldn\'t get instance name, is this running inside GCE?: ' + + str(e)) + time.sleep(RETRY_BACKOFF_SECS * (i + 1)) + + # If the retries are exhausted we exit with a generic error. + sys.exit(OCF_ERR_GENERIC) + + +def create_api_connection(): + for i in range(MAX_RETRIES): + try: + serviceaccount = os.environ.get("OCF_RESKEY_serviceaccount") + if not serviceaccount: + try: + from googleapiclient import _auth + credentials = _auth.default_credentials(); + except: + credentials = GoogleCredentials.get_application_default() + logging.debug("using application default credentials") + else: + scope = ['https://www.googleapis.com/auth/cloud-platform'] + logging.debug("using credentials from service account") + try: + credentials = ServiceAccountCredentials.from_service_account_file(filename=serviceaccount, scopes=scope) + except AttributeError: + credentials = ServiceAccountCredentials.from_json_keyfile_name(serviceaccount, scope) + except Exception as e: + logging.error(str(e)) + sys.exit(OCF_ERR_GENERIC) + return googleapiclient.discovery.build('compute', 'v1', + credentials=credentials, + cache_discovery=False) + except Exception as e: + logger.error('Couldn\'t connect with google api: ' + str(e)) + time.sleep(RETRY_BACKOFF_SECS * (i + 1)) + + # If the retries are exhausted we exit with a generic error. 
+ sys.exit(OCF_ERR_GENERIC) + + +def get_instance(project, zone, instance): + request = CONN.instances().get( + project=project, zone=zone, instance=instance) + return request.execute() + + +def get_network_ifaces(project, zone, instance): + return get_instance(project, zone, instance)['networkInterfaces'] + + +def wait_for_operation(project, zone, operation): + while True: + result = CONN.zoneOperations().get( + project=project, + zone=zone, + operation=operation['name']).execute() + + if result['status'] == 'DONE': + if 'error' in result: + raise Exception(result['error']) + return + time.sleep(1) + + +def set_aliases(project, zone, instance, aliases, fingerprint): + """Sets the alias IP ranges for an instance. + + Args: + project: string, the project in which the instance resides. + zone: string, the zone in which the instance resides. + instance: string, the name of the instance. + aliases: list, the list of dictionaries containing alias IP ranges + to be added to or removed from the instance. + fingerprint: string, the fingerprint of the network interface. + """ + body = { + 'aliasIpRanges': aliases, + 'fingerprint': fingerprint + } + + request = CONN.instances().updateNetworkInterface( + instance=instance, networkInterface='nic0', project=project, zone=zone, + body=body) + operation = request.execute() + wait_for_operation(project, zone, operation) + + +def add_rm_alias(mode, project, zone, instance, alias, alias_range_name=None): + """Adds or removes an alias IP range for a GCE instance. + + Args: + mode: int, a constant (ADD (0) or REMOVE (1)) indicating the + operation type. + project: string, the project in which the instance resides. + zone: string, the zone in which the instance resides. + instance: string, the name of the instance. + alias: string, the alias IP range to be added to or removed from + the instance. + alias_range_name: string, the subnet name for the alias IP range. 
+ + Returns: + True if the existing list of alias IP ranges was modified, or False + otherwise. + """ + ifaces = get_network_ifaces(project, zone, instance) + fingerprint = ifaces[0]['fingerprint'] + + try: + old_aliases = ifaces[0]['aliasIpRanges'] + except KeyError: + old_aliases = [] + + new_aliases = [a for a in old_aliases if a['ipCidrRange'] != alias] + + if alias: + if mode == ADD: + obj = {'ipCidrRange': alias} + if alias_range_name: + obj['subnetworkRangeName'] = alias_range_name + new_aliases.append(obj) + elif mode == REMOVE: + pass # already removed during new_aliases build + else: + raise ValueError('Invalid value for mode: {}'.format(mode)) + + if (sorted(new_aliases, key=lambda item: item.get('ipCidrRange')) + != sorted(old_aliases, key=lambda item: item.get('ipCidrRange'))): + set_aliases(project, zone, instance, new_aliases, fingerprint) + return True + else: + return False + + +def add_alias(project, zone, instance, alias, alias_range_name=None): + return add_rm_alias(ADD, project, zone, instance, alias, alias_range_name) + + +def remove_alias(project, zone, instance, alias): + return add_rm_alias(REMOVE, project, zone, instance, alias) + + +def get_aliases(project, zone, instance): + ifaces = get_network_ifaces(project, zone, instance) + try: + aliases = ifaces[0]['aliasIpRanges'] + return [a['ipCidrRange'] for a in aliases] + except KeyError: + return [] + + +def get_localhost_aliases(): + net_iface = get_metadata('instance/network-interfaces', {'recursive': True}) + net_iface = json.loads(net_iface) + try: + return net_iface[0]['ipAliases'] + except (KeyError, IndexError): + return [] + + +def get_zone(project, instance): + fl = 'name="%s"' % instance + request = CONN.instances().aggregatedList(project=project, filter=fl) + while request is not None: + response = request.execute() + zones = response.get('items', {}) + for zone in zones.values(): + for inst in zone.get('instances', []): + if inst['name'] == instance: + return 
inst['zone'].split("/")[-1] + request = CONN.instances().aggregatedList_next( + previous_request=request, previous_response=response) + raise Exception("Unable to find instance %s" % (instance)) + + +def get_instances_list(project, exclude): + hostlist = [] + request = CONN.instances().aggregatedList(project=project) + while request is not None: + try: + response = request.execute() + zones = response.get('items', {}) + except googleapiclient.errors.HttpError as e: + if e.resp.status == 404: + logger.debug('get_instances_list(): no instances found') + return '' + + for zone in zones.values(): + for inst in zone.get('instances', []): + if inst['name'] != exclude: + hostlist.append(inst['name']) + request = CONN.instances().aggregatedList_next( + previous_request=request, previous_response=response) + return hostlist + + +def gcp_alias_start(alias): + my_aliases = get_localhost_aliases() + my_zone = get_metadata('instance/zone').split('/')[-1] + project = os.environ.get( + 'OCF_RESKEY_project', get_metadata('project/project-id')) + + if alias in my_aliases: + # TODO: Do we need to check alias_range_name? + logger.info( + '%s already has %s attached. No action required' % (THIS_VM, alias)) + sys.exit(OCF_SUCCESS) + + # If the alias is currently attached to another host, detach it. 
+ hostlist = os.environ.get('OCF_RESKEY_hostlist', '') + if hostlist: + hostlist = hostlist.replace(THIS_VM, '').split() + else: + hostlist = get_instances_list(project, THIS_VM) + for host in hostlist: + host_zone = get_zone(project, host) + host_aliases = get_aliases(project, host_zone, host) + if alias in host_aliases: + logger.info( + '%s is attached to %s - Removing %s from %s' % + (alias, host, alias, host)) + remove_alias(project, host_zone, host, alias) + break + + # Add alias IP range to localhost + try: + add_alias( + project, my_zone, THIS_VM, alias, + os.environ.get('OCF_RESKEY_alias_range_name')) + except googleapiclient.errors.HttpError as e: + if e.resp.status == 404: + sys.exit(OCF_ERR_CONFIGURED) + + # Verify that the IP range has been added + my_aliases = get_localhost_aliases() + if alias in my_aliases: + logger.info('Finished adding %s to %s' % (alias, THIS_VM)) + else: + if my_aliases: + logger.error( + 'Failed to add alias IP range %s. %s has alias IP ranges attached but' + + ' they don\'t include %s' % (alias, THIS_VM, alias)) + else: + logger.error( + 'Failed to add IP range %s. %s has no alias IP ranges attached' + % (alias, THIS_VM)) + sys.exit(OCF_ERR_GENERIC) + + +def gcp_alias_stop(alias): + my_aliases = get_localhost_aliases() + my_zone = get_metadata('instance/zone').split('/')[-1] + project = os.environ.get( + 'OCF_RESKEY_project', get_metadata('project/project-id')) + + if alias in my_aliases: + logger.info('Removing %s from %s' % (alias, THIS_VM)) + remove_alias(project, my_zone, THIS_VM, alias) + else: + logger.info( + '%s is not attached to %s. 
No action required' + % (alias, THIS_VM)) + + +def gcp_alias_status(alias): + my_aliases = get_localhost_aliases() + if alias in my_aliases: + logger.info('%s has the correct IP range attached' % THIS_VM) + else: + sys.exit(OCF_NOT_RUNNING) + + +def validate(): + global ALIAS + global THIS_VM + global CONN + + CONN = create_api_connection() + THIS_VM = get_metadata('instance/name') + ALIAS = os.environ.get('OCF_RESKEY_alias_ip') + if not ALIAS: + logger.error('Missing alias_ip parameter') + sys.exit(OCF_ERR_CONFIGURED) + + +def configure_logs(): + # Prepare logging + global logger + logging.getLogger('googleapiclient').setLevel(logging.WARN) + logging_env = os.environ.get('OCF_RESKEY_stackdriver_logging') + if logging_env: + logging_env = logging_env.lower() + if any(x in logging_env for x in ['yes', 'true', 'enabled']): + try: + import google.cloud.logging.handlers + client = google.cloud.logging.Client() + handler = google.cloud.logging.handlers.CloudLoggingHandler( + client, name=THIS_VM) + handler.setLevel(logging.INFO) + formatter = logging.Formatter('gcp:alias "%(message)s"') + handler.setFormatter(formatter) + log.addHandler(handler) + logger = logging.LoggerAdapter(log, {'OCF_RESOURCE_INSTANCE': + OCF_RESOURCE_INSTANCE}) + except ImportError: + logger.error('Couldn\'t import google.cloud.logging, ' + 'disabling Stackdriver-logging support') + + +def main(): + if 'meta-data' in sys.argv[1]: + print(METADATA) + return + + validate() + if 'validate-all' in sys.argv[1]: + return + + configure_logs() + if 'start' in sys.argv[1]: + gcp_alias_start(ALIAS) + elif 'stop' in sys.argv[1]: + gcp_alias_stop(ALIAS) + elif 'status' in sys.argv[1] or 'monitor' in sys.argv[1]: + gcp_alias_status(ALIAS) + else: + logger.error('no such function %s' % str(sys.argv[1])) + + +if __name__ == "__main__": + main() diff --git a/heartbeat/http-mon.sh b/heartbeat/http-mon.sh new file mode 100644 index 0000000..ce13ccd --- /dev/null +++ b/heartbeat/http-mon.sh @@ -0,0 +1,140 @@ +# +# 
General http monitor code
# (sourced by apache and httpmon)
#
# Author: Alan Robertson
#         Sun Jiang Dong
#
# Support: users@clusterlabs.org
#
# License: GNU General Public License (GPL)
#
# Copyright: (C) 2002-2005 International Business Machines
#

# default options for http clients
# NB: We _always_ test a local resource, so it should be
# safe to connect from the local interface.
bind_address="127.0.0.1"
curl_ipv6_opts=""
if ocf_is_true "$OCF_RESKEY_use_ipv6" || echo "$STATUSURL" | grep -qs "::"; then
    bind_address="::1"
    curl_ipv6_opts="-g"
fi
WGETOPTS="-O- -q -L --no-proxy --bind-address=$bind_address"
CURLOPTS="-o - -Ss -L --interface lo $curl_ipv6_opts"

#
# Check that the server answers a header-only request for URL $1.
# Uses curl when available, otherwise wget.
# Returns 0 if the server responded (for wget a 4xx answer also counts),
# non-zero otherwise, $OCF_ERR_GENERIC if neither client is installed.
#
request_url_header() {
    which curl >/dev/null 2>&1
    if [ $? -eq 0 ]; then
        curl -IL --connect-timeout 5 --interface lo $curl_ipv6_opts "$1" > /dev/null 2>&1
        return $?
    fi

    which wget >/dev/null 2>&1
    if [ $? -eq 0 ]; then
        # Declare and assign separately: "local header=$(...)" makes $?
        # the status of "local" (always 0), which hid every wget failure.
        local header
        header=$(wget --server-response --spider --timeout=5 --tries=2 "$1" 2>&1)
        if [ $? -eq 0 ]; then
            return $OCF_SUCCESS
        fi

        # a 4xx error is still a server response.
        echo "$header" | grep "HTTP/1\.[01] 4[0-9][0-9] " > /dev/null 2>&1
        return $?
    fi
    return $OCF_ERR_GENERIC
}

#
# run the http client
#
# curl_func URL: fetch URL with curl; credentials, if any, are passed
# through a config read from stdin (-K -) rather than on the command line.
curl_func() {
    cl_opts="$CURLOPTS $test_httpclient_opts"
    if [ x != "x$test_user" ]; then
        echo "-u $test_user:$test_password" |
            curl -K - $cl_opts "$1"
    else
        curl $cl_opts "$1"
    fi
}
# wget_func URL: fetch URL with wget, adding credentials when test_user is set.
wget_func() {
    auth=""
    cl_opts="$WGETOPTS $test_httpclient_opts"
    [ x != "x$test_user" ] &&
        auth="--http-user=$test_user --http-passwd=$test_password"
    wget $auth $cl_opts "$1"
}
#
# rely on whatever the user provided
userdefined() {
    $test_httpclient $test_httpclient_opts "$1"
}

#
# find a good http client
#
# Prints the client name; returns 1 if none is found.
findhttpclient() {
    # prefer wget (for historical reasons)
    if [ "x$CLIENT" != x ] && which "$CLIENT" >/dev/null 2>&1; then
        echo "$CLIENT"
    elif which wget >/dev/null 2>&1; then
        echo "wget"
    elif which curl >/dev/null 2>&1; then
        echo "curl"
    else
        return 1
    fi
}
# Print the name of the function to call for the configured client;
# falls back to $ourhttpclient when the test definition names none.
gethttpclient() {
    [ -z "$test_httpclient" ] &&
        test_httpclient=$ourhttpclient
    case "$test_httpclient" in
        curl|wget) echo ${test_httpclient}_func;; #these are supported
        *) echo userdefined;;
    esac
}

# test configuration good?
# Returns 1 (and logs) when the regex or url is empty, or when only one of
# user/password is given.
is_testconf_sane() {
    if [ "x$test_regex" = x -o "x$test_url" = x ]; then
        ocf_log err "test regular expression or test url empty"
        return 1
    fi
    if [ "x$test_user$test_password" != x -a \( "x$test_user" = x -o "x$test_password" = x \) ]; then
        ocf_log err "bad user authentication for extended test"
        return 1
    fi
    return 0
}
#
# read the test definition from the config
#
# Reads "key value" lines from stdin and fills the test_* variables from
# the definition named $1 (or from the first definition if $1 is empty).
readtestconf() {
    test_name="$1" # we look for this one or the first one if empty
    lcnt=0
    readdef=""
    test_url="" test_regex=""
    test_user="" test_password=""
    test_httpclient="" test_httpclient_opts=""

    while read key value; do
        lcnt=$((lcnt+1))
        if [ "$readdef" ]; then
            case "$key" in
                "url") test_url="$value" ;;
                "user") test_user="$value" ;;
                "password") test_password="$value" ;;
                "client") test_httpclient="$value" ;;
                "client_opts") test_httpclient_opts="$value" ;;
                "match") test_regex="$value" ;;
                "end") break ;;
                "#"*|"") ;;
                *) ocf_log err "$lcnt: $key: unknown keyword"; return 1 ;;
            esac
        else
            [ "$key" = "test" ] &&
                [ -z "$test_name" -o "$test_name" = "$value" ] &&
                readdef=1
        fi
    done
}
diff --git a/heartbeat/iSCSILogicalUnit.in b/heartbeat/iSCSILogicalUnit.in new file mode 100644 index 0000000..efcb3a6 --- /dev/null +++ b/heartbeat/iSCSILogicalUnit.in @@ -0,0 +1,830 @@ +#!@BASH_SHELL@ +# +# +# iSCSILogicalUnit OCF RA. Exports and manages iSCSI Logical Units. +# +# (c) 2013 LINBIT, Lars Ellenberg +# (c) 2009-2010 Florian Haas, Dejan Muhamedagic, +# and Linux-HA contributors +# +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +# Set a default implementation based on software installed +if have_binary ietadm; then + OCF_RESKEY_implementation_default="iet" +elif have_binary tgtadm; then + OCF_RESKEY_implementation_default="tgt" +elif have_binary lio_node; then + OCF_RESKEY_implementation_default="lio" +elif have_binary targetcli; then + OCF_RESKEY_implementation_default="lio-t" +elif have_binary scstadmin; then + OCF_RESKEY_implementation_default="scst" +fi +: ${OCF_RESKEY_implementation=${OCF_RESKEY_implementation_default}} + +# Use a default SCSI ID and SCSI SN that is unique across the cluster, +# and persistent in the event of resource migration. +# SCSI IDs are limited to 24 bytes, but only 16 bytes are known to be +# supported by all iSCSI implementations this RA cares about. Thus, +# for a default, use the first 16 characters of +# $OCF_RESOURCE_INSTANCE. 
+OCF_RESKEY_scsi_id_default="${OCF_RESOURCE_INSTANCE:0:16}" +: ${OCF_RESKEY_scsi_id=${OCF_RESKEY_scsi_id_default}} +# To have a reasonably unique default SCSI SN, use the first 8 bytes +# of an MD5 hash of of $OCF_RESOURCE_INSTANCE +sn=`echo -n "${OCF_RESOURCE_INSTANCE}" | md5sum | sed -e 's/ .*//'` +OCF_RESKEY_scsi_sn_default=${sn:0:8} +: ${OCF_RESKEY_scsi_sn=${OCF_RESKEY_scsi_sn_default}} +OCF_RESKEY_allowed_initiators_default="" +: ${OCF_RESKEY_allowed_initiators=${OCF_RESKEY_allowed_initiators_default}} +# set 0 as a default value for lio iblock device number +OCF_RESKEY_lio_iblock_default=0 +OCF_RESKEY_lio_iblock=${OCF_RESKEY_lio_iblock:-$OCF_RESKEY_lio_iblock_default} +# Set LIO-T backend default as 'block' +OCF_RESKEY_liot_bstype_default="block" +: ${OCF_RESKEY_liot_bstype=${OCF_RESKEY_liot_bstype_default}} + +## tgt specifics +# tgt has "backing store type" and "backing store open flags", +# as well as device-type. +# +# suggestions how to make this generic accross all supported implementations? +# how should they be named, how should they be mapped to implementation specifics? +# # Conversation: Due to the phase out of most implementations other than lio-t +# # I have decided to use specific implementation of tgt_bstype like key for +# # lio-t. +# +# OCF_RESKEY_tgt_bstype +# OCF_RESKEY_tgt_bsoflags +# OCF_RESKEY_tgt_bsopts +# OCF_RESKEY_tgt_device_type + +# targetcli: iSCSITarget and iSCSILogicalUnit must use the same lockfile +TARGETLOCKFILE=${HA_RSCTMP}/targetcli.lock +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="iSCSILogicalUnit" version="0.9"> +<version>1.0</version> + +<longdesc lang="en"> +Manages iSCSI Logical Unit. An iSCSI Logical unit is a subdivision of +an SCSI Target, exported via a daemon that speaks the iSCSI protocol. 
+</longdesc> +<shortdesc lang="en">Manages iSCSI Logical Units (LUs)</shortdesc> + +<parameters> +<parameter name="implementation" required="0" unique="0"> +<longdesc lang="en"> +The iSCSI target daemon implementation. Must be one of "iet", "tgt", +"lio", "lio-t", or "scst". If unspecified, an implementation is selected based on the +availability of management utilities, with "iet" being tried first, +then "tgt", then "lio", then "lio-t", then "scst". +</longdesc> +<shortdesc lang="en">iSCSI target daemon implementation</shortdesc> +<content type="string" default="${OCF_RESKEY_implementation_default}"/> +</parameter> + +<parameter name="target_iqn" required="1" unique="0"> +<longdesc lang="en"> +The iSCSI Qualified Name (IQN) that this Logical Unit belongs to. +</longdesc> +<shortdesc lang="en">iSCSI target IQN</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="lun" required="1" unique="0"> +<longdesc lang="en"> +The Logical Unit number (LUN) exposed to initiators. +</longdesc> +<shortdesc lang="en">Logical Unit number (LUN)</shortdesc> +<content type="integer" /> +</parameter> + +<parameter name="path" required="1" unique="0"> +<longdesc lang="en"> +The path to the block device exposed. Some implementations allow this +to be a regular file, too. +</longdesc> +<shortdesc lang="en">Block device (or file) path</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="scsi_id" required="0" unique="1"> +<longdesc lang="en"> +The SCSI ID to be configured for this Logical Unit. The default +is the resource name, truncated to 24 bytes. +</longdesc> +<shortdesc lang="en">SCSI ID</shortdesc> +<content type="string" default="${OCF_RESKEY_scsi_id_default}"/> +</parameter> + +<parameter name="scsi_sn" required="0" unique="1"> +<longdesc lang="en"> +The SCSI serial number to be configured for this Logical Unit. +The default is a hash of the resource name, truncated to 8 bytes, +meaning 26 hex characters. 
+If you are using XenServer with multipath as iSCSI client, you +MUST make sure this value is set, or else XenServer multipath will +not be able to access the LUN +</longdesc> +<shortdesc lang="en">SCSI serial number</shortdesc> +<content type="string" default="${OCF_RESKEY_scsi_sn_default}"/> +</parameter> + +<parameter name="emulate_tpu" required="0" unique="0"> +<longdesc lang="en"> +The SCSI UNMAP command to be configured for this Logical Unit. +Setting this integer to 1 will enable TPU IOCTL emulation. +</longdesc> +<shortdesc lang="en">SCSI UNMAP (for TRIM / DISCARD)</shortdesc> +<content type="integer" /> +</parameter> + +<parameter name="emulate_3pc" required="0" unique="0"> +<longdesc lang="en"> +The SCSI EXTENDED COPY command to be configured for this Logical Unit. +Setting this integer to 1 will enable 3PC IOCTL emulation. +</longdesc> +<shortdesc lang="en">SCSI extended write</shortdesc> +<content type="integer" /> +</parameter> + +<parameter name="emulate_caw" required="0" unique="0"> +<longdesc lang="en"> +The SCSI Compare and Write command to be configured for this Logical Unit. +Setting this integer to 1 will enable CAW IOCTL emulation. +</longdesc> +<shortdesc lang="en">SCSI compare and write</shortdesc> +<content type="integer" /> +</parameter> + +<parameter name="vendor_id" required="0" unique="0"> +<longdesc lang="en"> +The SCSI vendor ID to be configured for this Logical Unit. +</longdesc> +<shortdesc lang="en">SCSI vendor ID</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="product_id" required="0" unique="0"> +<longdesc lang="en"> +The SCSI product ID to be configured for this Logical Unit. +</longdesc> +<shortdesc lang="en">SCSI product ID</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="tgt_bstype" required="0" unique="0"> +<longdesc lang="en"> +TGT specific backing store type. If you want to use aio, +make sure your tgtadm is built against libaio. +See tgtadm(8). 
+</longdesc> +<shortdesc lang="en">TGT backing store type</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="tgt_bsoflags" required="0" unique="0"> +<longdesc lang="en"> +TGT specific backing store open flags (direct|sync). +See tgtadm(8). +</longdesc> +<shortdesc lang="en">TGT backing store open flags</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="tgt_bsopts" required="0" unique="0"> +<longdesc lang="en"> +TGT specific backing store options. +See tgtadm(8). +</longdesc> +<shortdesc lang="en">TGT backing store options</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="tgt_device_type" required="0" unique="0"> +<longdesc lang="en"> +TGT specific device type. +See tgtadm(8). +</longdesc> +<shortdesc lang="en">TGT device type</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="additional_parameters" required="0" unique="0"> +<longdesc lang="en"> +Additional LU parameters. A space-separated list of "name=value" pairs +which will be passed through to the iSCSI daemon's management +interface. The supported parameters are implementation +dependent. Neither the name nor the value may contain whitespace. +</longdesc> +<shortdesc lang="en">List of iSCSI LU parameters</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="allowed_initiators" required="0" unique="0"> +<longdesc lang="en"> +Allowed initiators. A space-separated list of initiators allowed to +connect to this lun. Initiators may be listed in any syntax +the target implementation allows. If this parameter is empty or +not set, access to this lun will not be allowed from any initiator, +if target is not in demo mode. + +This parameter is only necessary when using LIO. 
+</longdesc> +<shortdesc lang="en">List of iSCSI initiators allowed to connect +to this lun.</shortdesc> +<content type="string" default="${OCF_RESKEY_allowed_initiators_default}"/> +</parameter> + +<parameter name="lio_iblock" required="0" unique="0"> +<longdesc lang="en"> +LIO iblock device name, a number starting from 0. + +Using distinct values here avoids a warning in LIO "LEGACY: SHARED HBA"; +and it is necessary when using multiple LUNs started at the same time +(eg. on node failover) to prevent a race condition in tcm_core on mkdir() +in /sys/kernel/config/target/core/. +</longdesc> +<shortdesc lang="en">LIO iblock device number</shortdesc> +<content type="integer" default="${OCF_RESKEY_lio_iblock_default}"/> +</parameter> + +<parameter name="liot_bstype" required="0" unique="0"> +<longdesc lang="en"> +LIO-T specific backing store type. If you want to use aio, +set this to 'block'. If you want to use async IO, set this to 'fileio'. +Async I/O works also with block devices, however - you need to understand +the consequences. See targetcli(8). If using file backend, you need to create this file in +advance. +If you want to use SCSI Passthrough, set this to 'pscsi'. +Do not use PSCSI unless you know exactly how it will be used. 
+</longdesc> +<shortdesc lang="en">LIO-T backing store type</shortdesc> +<content type="string" default="${OCF_RESKEY_liot_bstype_default}"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="10s" /> +<action name="stop" timeout="10s" /> +<action name="status" timeout="10s" interval="10s" depth="0" /> +<action name="monitor" timeout="10s" interval="10s" depth="0" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="10s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +iSCSILogicalUnit_usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +iSCSILogicalUnit_start() { + iSCSILogicalUnit_monitor + if [ $? = $OCF_SUCCESS ]; then + return $OCF_SUCCESS + fi + + local params + + case $OCF_RESKEY_implementation in + iet) + params="Path=${OCF_RESKEY_path}" + # use blockio if path points to a block device, fileio + # otherwise. 
+ if [ -b "${OCF_RESKEY_path}" ]; then + params="${params} Type=blockio" + else + params="${params} Type=fileio" + fi + # in IET, we have to set LU parameters on creation + if [ -n "${OCF_RESKEY_scsi_id}" ]; then + params="${params} ScsiId=${OCF_RESKEY_scsi_id}" + fi + if [ -n "${OCF_RESKEY_scsi_sn}" ]; then + params="${params} ScsiSN=${OCF_RESKEY_scsi_sn}" + fi + params="${params} ${OCF_RESKEY_additional_parameters}" + ocf_run ietadm --op new \ + --tid=${TID} \ + --lun=${OCF_RESKEY_lun} \ + --params ${params// /,} || exit $OCF_ERR_GENERIC + ;; + tgt) + # tgt requires that we create the LU first, then set LU + # parameters + params="" + local var + local envar + for var in scsi_id scsi_sn vendor_id product_id; do + envar="OCF_RESKEY_${var}" + if [ -n "${!envar}" ]; then + params="${params} ${var}=${!envar}" + fi + done + params="${params} ${OCF_RESKEY_additional_parameters}" + + # cleanup: tgt (as of tgtadm version 1.0.24) does not like an explicit "bsoflags=direct" + # when used with "bstype=aio" (which always uses O_DIRECT) + [[ $OCF_RESKEY_tgt_bstype/$OCF_RESKEY_tgt_bsoflags = "aio/direct" ]] && OCF_RESKEY_tgt_bsoflags="" + + tgt_args="" + [[ $OCF_RESKEY_tgt_bstype ]] && tgt_args="$tgt_args --bstype=$OCF_RESKEY_tgt_bstype" + [[ $OCF_RESKEY_tgt_bsoflags ]] && tgt_args="$tgt_args --bsoflags=$OCF_RESKEY_tgt_bsoflags" + [[ $OCF_RESKEY_tgt_bsopts ]] && tgt_args="$tgt_args --bsopts=$OCF_RESKEY_tgt_bsopts" + [[ $OCF_RESKEY_tgt_device_type ]] && tgt_args="$tgt_args --device-type=$OCF_RESKEY_tgt_device_type" + + ocf_run tgtadm --lld iscsi --op new --mode logicalunit \ + --tid=${TID} \ + --lun=${OCF_RESKEY_lun} \ + $tgt_args \ + --backing-store ${OCF_RESKEY_path} || exit $OCF_ERR_GENERIC + if [ -z "$params" ]; then + return $OCF_SUCCESS + else + ocf_run tgtadm --lld iscsi --op update --mode logicalunit \ + --tid=${TID} \ + --lun=${OCF_RESKEY_lun} \ + --params ${params// /,} || exit $OCF_ERR_GENERIC + fi + ;; + lio) + # For lio, we first have to create a target device, 
then + # add it to the Target Portal Group as an LU. + + block_configfs_path="/sys/kernel/config/target/core/iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE}/udev_path" + if [ ! -e "${block_configfs_path}" ]; then + ocf_run tcm_node --createdev=iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE} \ + ${OCF_RESKEY_path} || exit $OCF_ERR_GENERIC + elif [ -e "$block_configfs_path" ] && [ $(cat "$block_configfs_path") != "${OCF_RESKEY_path}" ]; then + ocf_exit_reason "Existing iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE} has incorrect path: $(cat "$block_configfs_path") != ${OCF_RESKEY_path}" + exit $OCF_ERR_GENERIC + else + ocf_log info "iscsi iblock already exists: ${block_configfs_path}" + fi + + if [ -n "${OCF_RESKEY_scsi_sn}" ]; then + ocf_run tcm_node --setunitserial=iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE} \ + ${OCF_RESKEY_scsi_sn} || exit $OCF_ERR_GENERIC + fi + + lun_configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/lun/lun_${OCF_RESKEY_lun}/${OCF_RESOURCE_INSTANCE}/udev_path" + if [ ! -e "${lun_configfs_path}" ]; then + ocf_run lio_node --addlun=${OCF_RESKEY_target_iqn} 1 ${OCF_RESKEY_lun} \ + ${OCF_RESOURCE_INSTANCE} iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE} || exit $OCF_ERR_GENERIC + else + ocf_log info "iscsi lun already exists: ${lun_configfs_path}" + fi + + if [ -n "${OCF_RESKEY_allowed_initiators}" ]; then + for initiator in ${OCF_RESKEY_allowed_initiators}; do + acl_configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/acls/${initiator}/lun_${OCF_RESKEY_lun}" + if [ ! 
-e "${acl_configfs_path}" ]; then + ocf_run lio_node --addlunacl=${OCF_RESKEY_target_iqn} 1 \ + ${initiator} ${OCF_RESKEY_lun} ${OCF_RESKEY_lun} || exit $OCF_ERR_GENERIC + else + ocf_log info "iscsi acl already exists: ${acl_configfs_path}" + fi + done + fi + ;; + lio-t) + ocf_take_lock $TARGETLOCKFILE + ocf_release_lock_on_exit $TARGETLOCKFILE + iblock_attrib_path="/sys/kernel/config/target/core/iblock_*/${OCF_RESOURCE_INSTANCE}/attrib" + # For lio, we first have to create a target device, then + # add it to the Target Portal Group as an LU. + # Handle differently 'block', 'fileio' and 'pscsi' + if [ "${OCF_RESKEY_liot_bstype}" = "block" ] + then + ocf_run targetcli /backstores/${OCF_RESKEY_liot_bstype} create name=${OCF_RESOURCE_INSTANCE} dev=${OCF_RESKEY_path} $(test -n "$OCF_RESKEY_scsi_sn" && echo "wwn=${OCF_RESKEY_scsi_sn}") || exit $OCF_ERR_GENERIC + elif [ "${OCF_RESKEY_liot_bstype}" = "fileio" ] + then + ocf_run targetcli /backstores/${OCF_RESKEY_liot_bstype} create ${OCF_RESOURCE_INSTANCE} ${OCF_RESKEY_path} $(test -n "$OCF_RESKEY_scsi_sn" && echo "wwn=${OCF_RESKEY_scsi_sn}") || exit $OCF_ERR_GENERIC + elif [ "${OCF_RESKEY_liot_bstype}" = "pscsi" ] + then + # pscsi don't use custom wwn because it's SCSI passthrough so scsi_generic device will report it's own wwn + # so lets ignore provided serial in $OCF_RESKEY_scsi_sn + ocf_run targetcli /backstores/${OCF_RESKEY_liot_bstype} create ${OCF_RESOURCE_INSTANCE} ${OCF_RESKEY_path} || exit $OCF_ERR_GENERIC + fi + if [ -n "${OCF_RESKEY_scsi_sn}" ]; then + echo ${OCF_RESKEY_scsi_sn} > /sys/kernel/config/target/core/iblock_*/${OCF_RESOURCE_INSTANCE}/wwn/vpd_unit_serial + fi + if [ -n "${OCF_RESKEY_product_id}" ]; then + echo "${OCF_RESKEY_product_id}" > /sys/kernel/config/target/core/iblock_*/${OCF_RESOURCE_INSTANCE}/wwn/product_id + fi + + ocf_run targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/luns create /backstores/${OCF_RESKEY_liot_bstype}/${OCF_RESOURCE_INSTANCE} ${OCF_RESKEY_lun} || exit $OCF_ERR_GENERIC + + 
if $(ip a | grep -q inet6); then + # Solving the 0.0.0.0 conversion to IPv6 when using specific portal addresses + if $(ocf_run -q targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/portals | grep -q 0.0.0.0) + then + ocf_run -q targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/portals delete 0.0.0.0 3260 + ocf_run -q targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/portals create ::0 + fi + fi + + if [ -n "${OCF_RESKEY_allowed_initiators}" ]; then + for initiator in ${OCF_RESKEY_allowed_initiators}; do + if [ -d "/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/acls" ] ;then + if ! [ -d "/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/acls/${initiator}" ];then + ocf_run targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/acls create ${initiator} add_mapped_luns=False || exit $OCF_ERR_GENERIC + ocf_run targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/acls/${initiator} create ${OCF_RESKEY_lun} ${OCF_RESKEY_lun} || exit $OCF_ERR_GENERIC + fi + fi + done + fi + + if [ -n "${OCF_RESKEY_emulate_tpu}" ]; then + echo ${OCF_RESKEY_emulate_tpu} > ${iblock_attrib_path}/emulate_tpu || exit $OCF_ERR_GENERIC + fi + if [ -n "${OCF_RESKEY_emulate_3pc}" ]; then + echo ${OCF_RESKEY_emulate_3pc} > ${iblock_attrib_path}/emulate_3pc || exit $OCF_ERR_GENERIC + fi + if [ -n "${OCF_RESKEY_emulate_caw}" ]; then + echo ${OCF_RESKEY_emulate_caw} > ${iblock_attrib_path}/emulate_caw || exit $OCF_ERR_GENERIC + fi + ;; + scst) + ocf_run scstadmin -open_dev "${OCF_RESOURCE_INSTANCE}" -handler vdisk_blockio -attributes "filename=${OCF_RESKEY_path},nv_cache=0,write_through=1" + if [ -n "${OCF_RESKEY_scsi_sn}" ]; then + ocf_run scstadmin -set_dev_attr "${OCF_RESOURCE_INSTANCE}" -attributes "usn=${OCF_RESKEY_scsi_sn}" -force -noprompt + fi + if [ -n "${OCF_RESKEY_vendor_id}" ]; then + ocf_run scstadmin -set_dev_attr "${OCF_RESOURCE_INSTANCE}" -attributes "t10_vend_id=${OCF_RESKEY_vendor_id}" -force -noprompt + fi + if [ -n "${OCF_RESKEY_product_id}" ]; then + ocf_run scstadmin 
-set_dev_attr "${OCF_RESOURCE_INSTANCE}" -attributes "t10_dev_id=${OCF_RESKEY_product_id}" -force -noprompt + fi + if [ -d "/sys/kernel/scst_tgt/targets/iscsi/${OCF_RESKEY_target_iqn}/ini_groups/allowed/" ]; then + # if an initiator group exists for the target, add the new LUN to it. + ocf_run scstadmin -add_lun ${OCF_RESKEY_lun} -driver iscsi -target "${OCF_RESKEY_target_iqn}" -device "${OCF_RESOURCE_INSTANCE}" -group allowed -force -noprompt + fi + ocf_run scstadmin -add_lun ${OCF_RESKEY_lun} -driver iscsi -target "${OCF_RESKEY_target_iqn}" -device "${OCF_RESOURCE_INSTANCE}" $group_arg -force -noprompt + ;; + esac + + # Force the monitor operation to pass before start is considered a success. + iSCSILogicalUnit_monitor +} + +iSCSILogicalUnit_stop() { + iSCSILogicalUnit_monitor + if [ $? -eq $OCF_NOT_RUNNING ]; then + return $OCF_SUCCESS + fi + + case $OCF_RESKEY_implementation in + + iet) + # IET allows us to remove LUs while they are in use + ocf_run ietadm --op delete \ + --tid=${TID} \ + --lun=${OCF_RESKEY_lun} || exit $OCF_ERR_GENERIC + ;; + tgt) + # tgt will fail to remove an LU while it is in use, + # but at the same time does not allow us to + # selectively shut down a connection that is using a + # specific LU. Thus, we need to loop here until tgtd + # decides that the LU is no longer in use, or we get + # timed out by the LRM. + while ! 
ocf_run -warn tgtadm --lld iscsi --op delete --mode logicalunit \ + --tid ${TID} \ + --lun=${OCF_RESKEY_lun}; do + sleep 1 + done + ;; + lio) + + acls_configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/acls" + for initiatorpath in ${acls_configfs_path}/*; do + initiator=$(basename "${initiatorpath}") + if [ -e "${initiatorpath}/lun_${OCF_RESKEY_lun}" ]; then + ocf_log info "deleting acl at ${initiatorpath}/lun_${OCF_RESKEY_lun}" + ocf_run lio_node --dellunacl=${OCF_RESKEY_target_iqn} 1 \ + ${initiator} ${OCF_RESKEY_lun} || exit $OCF_ERR_GENERIC + fi + done + lun_configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/lun/lun_${OCF_RESKEY_lun}/" + if [ -e "${lun_configfs_path}" ]; then + ocf_run lio_node --dellun=${OCF_RESKEY_target_iqn} 1 ${OCF_RESKEY_lun} || exit $OCF_ERR_GENERIC + fi + block_configfs_path="/sys/kernel/config/target/core/iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE}/udev_path" + if [ -e "${block_configfs_path}" ]; then + ocf_run tcm_node --freedev=iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE} || exit $OCF_ERR_GENERIC + fi + ;; + lio-t) + ocf_take_lock $TARGETLOCKFILE + ocf_release_lock_on_exit $TARGETLOCKFILE + # "targetcli delete" will fail if the LUN is already + # gone. Log a warning and still push ahead. 
# Report whether this logical unit is currently exported.
#
# Reads:   OCF_RESKEY_implementation, OCF_RESKEY_target_iqn, OCF_RESKEY_lun,
#          OCF_RESKEY_path, OCF_RESKEY_tgt_bstype, OCF_RESKEY_lio_iblock,
#          OCF_RESOURCE_INSTANCE
# Sets:    TID (global, iet and tgt only), configfs_path and
#          block_configfs_path (global, lio and lio-t only)
# Returns: $OCF_SUCCESS      the LUN is exported with the configured path
#          $OCF_NOT_RUNNING  the LUN (or its target) is not configured
#          $OCF_ERR_GENERIC  lio/lio-t: a backstore exists without an
#                            active LUN
iSCSILogicalUnit_monitor() {
	local exported_path
	local scst_dev

	if [ x"${OCF_RESKEY_tgt_bstype}" != x"rbd" ]; then
		# If our backing device (or file) doesn't even exist, we're
		# not running. Quoted so that a path containing whitespace
		# is tested as one word instead of breaking the test.
		[ -e "${OCF_RESKEY_path}" ] || return $OCF_NOT_RUNNING
	fi

	case $OCF_RESKEY_implementation in
	iet)
		# Figure out and set the target ID
		TID=`sed -ne "s/tid:\([[:digit:]]\+\) name:${OCF_RESKEY_target_iqn}$/\1/p" < /proc/net/iet/volume`
		if [ -z "${TID}" ]; then
			# Our target is not configured, thus we're not
			# running.
			return $OCF_NOT_RUNNING
		fi
		# FIXME: this looks for a matching LUN and path, but does
		# not actually test for the correct target ID.
		grep -E -q "[[:space:]]+lun:${OCF_RESKEY_lun}.*path:${OCF_RESKEY_path}$" /proc/net/iet/volume && return $OCF_SUCCESS
		;;
	tgt)
		# Figure out and set the target ID
		TID=`tgtadm --lld iscsi --op show --mode target \
			| sed -ne "s/^Target \([[:digit:]]\+\): ${OCF_RESKEY_target_iqn}$/\1/p"`
		if [ -z "$TID" ]; then
			# Our target is not configured, thus we're not
			# running.
			return $OCF_NOT_RUNNING
		fi
		# This only looks for the backing store, but does not test
		# for the correct target ID and LUN.
		tgtadm --lld iscsi --op show --mode target \
			| grep -E -q "[[:space:]]+Backing store.*: ${OCF_RESKEY_path}$" && return $OCF_SUCCESS
		;;
	lio)
		configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/lun/lun_${OCF_RESKEY_lun}/${OCF_RESOURCE_INSTANCE}/udev_path"
		if [ -e "${configfs_path}" ]; then
			exported_path=`cat "${configfs_path}"`
			[ "${exported_path}" = "${OCF_RESKEY_path}" ] && return $OCF_SUCCESS
		fi

		# if we aren't activated, is a block device still left over?
		block_configfs_path="/sys/kernel/config/target/core/iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE}/udev_path"
		if [ -e "${block_configfs_path}" ]; then
			ocf_log warn "existing block without an active lun: ${block_configfs_path}"
			return $OCF_ERR_GENERIC
		fi
		;;
	lio-t)
		# The paths below contain a "*" that the shell must expand,
		# so the variables are deliberately left unquoted when used.
		configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/lun/lun_${OCF_RESKEY_lun}/*/udev_path"
		if [ -e ${configfs_path} ]; then
			exported_path=`cat ${configfs_path}`
			[ "${exported_path}" = "${OCF_RESKEY_path}" ] && return $OCF_SUCCESS
		fi

		# if we aren't activated, is a block device still left over?
		block_configfs_path="/sys/kernel/config/target/core/iblock_*/${OCF_RESOURCE_INSTANCE}/udev_path"
		if [ -e ${block_configfs_path} ]; then
			ocf_log warn "existing block without an active lun: ${block_configfs_path}"
			return $OCF_ERR_GENERIC
		fi
		;;
	scst)
		scst_dev="/sys/kernel/scst_tgt/devices/${OCF_RESOURCE_INSTANCE}"
		[ -d "${scst_dev}" ] || return $OCF_NOT_RUNNING
		[ "$(cat "${scst_dev}/active")" -eq 1 ] || return $OCF_NOT_RUNNING
		[ "$(head -n1 "${scst_dev}/filename")" = "${OCF_RESKEY_path}" ] || return $OCF_NOT_RUNNING
		[ -d "/sys/kernel/scst_tgt/targets/iscsi/${OCF_RESKEY_target_iqn}/luns/${OCF_RESKEY_lun}" ] && return $OCF_SUCCESS
		;;
	esac

	return $OCF_NOT_RUNNING
}
+ exit $OCF_ERR_CONFIGURED + ;; + esac + ;; + tgt) + # tgt reserves LUN 0 for its own purposes + [ $OCF_RESKEY_lun -ge 1 ] + case $? in + 0) + # OK + ;; + 1) + ocf_log err "Invalid LUN $OCF_RESKEY_lun (must be greater than 0)." + exit $OCF_ERR_CONFIGURED + ;; + *) + ocf_log err "Invalid LUN $OCF_RESKEY_lun (must be an integer)." + exit $OCF_ERR_CONFIGURED + ;; + esac + ;; + esac + + # Do we have any configuration parameters that the current + # implementation does not support? + local unsupported_params + local var + local envar + case $OCF_RESKEY_implementation in + iet) + # IET does not support setting the vendor and product ID + # (it always uses "IET" and "VIRTUAL-DISK") + unsupported_params="vendor_id product_id allowed_initiators lio_iblock tgt_bstype tgt_bsoflags tgt_bsopts tgt_device_type emulate_tpu emulate_3pc emulate_caw liot_bstype" + ;; + tgt) + unsupported_params="allowed_initiators lio_iblock emulate_tpu emulate_3pc emulate_caw liot_bstype" + ;; + lio) + unsupported_params="scsi_id vendor_id product_id tgt_bstype tgt_bsoflags tgt_bsopts tgt_device_type emulate_tpu emulate_3pc emulate_caw liot_bstype" + ;; + lio-t) + unsupported_params="scsi_id vendor_id tgt_bstype tgt_bsoflags tgt_bsopts tgt_device_type lio_iblock" + ;; + scst) + unsupported_params="scsi_id emulate_tpu emulate_3pc emulate_caw" + ;; + esac + + for var in ${unsupported_params}; do + envar=OCF_RESKEY_${var} + defvar=OCF_RESKEY_${var}_default + if [ -n "${!envar}" ]; then + if [[ "${!envar}" != "${!defvar}" ]]; then + case "$__OCF_ACTION" in + start|validate-all) + ocf_log warn "Configuration parameter \"${var}\"" \ + "is not supported by the iSCSI implementation" \ + "and will be ignored." ;; + esac + fi + fi + done + + if ! ocf_is_probe; then + # Do we have all required binaries? 
+ case $OCF_RESKEY_implementation in + iet) + check_binary ietadm + ;; + tgt) + check_binary tgtadm + ;; + lio) + check_binary tcm_node + check_binary lio_node + ;; + lio-t) + check_binary targetcli + ;; + scst) + check_binary scstadmin + ;; + esac + + # Is the required kernel functionality available? + case $OCF_RESKEY_implementation in + iet) + [ -d /proc/net/iet ] + if [ $? -ne 0 ]; then + ocf_log err "/proc/net/iet does not exist or is not a directory -- check if required modules are loaded." + exit $OCF_ERR_INSTALLED + fi + ;; + tgt) + # tgt is userland only + ;; + scst) + if [ ! -d /sys/kernel/scst_tgt ]; then + ocf_log err "/sys/kernel/scst_tgt does not exist or is not a directory -- check if required modules are loaded." + exit $OCF_ERR_INSTALLED + fi + ;; + esac + fi + + return $OCF_SUCCESS +} + +case $1 in +meta-data) + meta_data + exit $OCF_SUCCESS + ;; +usage|help) + iSCSILogicalUnit_usage + exit $OCF_SUCCESS + ;; +esac + +# Everything except usage and meta-data must pass the validate test +iSCSILogicalUnit_validate + +case $__OCF_ACTION in +start) iSCSILogicalUnit_start;; +stop) iSCSILogicalUnit_stop;; +monitor|status) iSCSILogicalUnit_monitor;; +reload) ocf_log err "Reloading..." + iSCSILogicalUnit_start + ;; +validate-all) ;; +*) iSCSILogicalUnit_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/iSCSITarget.in b/heartbeat/iSCSITarget.in new file mode 100644 index 0000000..2a9ddf0 --- /dev/null +++ b/heartbeat/iSCSITarget.in @@ -0,0 +1,766 @@ +#!@BASH_SHELL@ +# +# +# iSCSITarget OCF RA. Exports and manages iSCSI targets. +# +# (c) 2009-2010 Florian Haas, Dejan Muhamedagic, +# and Linux-HA contributors +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. 
+# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +# Set a default implementation based on software installed +if have_binary ietadm; then + OCF_RESKEY_implementation_default="iet" +elif have_binary tgtadm; then + OCF_RESKEY_implementation_default="tgt" +elif have_binary lio_node; then + OCF_RESKEY_implementation_default="lio" +elif have_binary targetcli; then + OCF_RESKEY_implementation_default="lio-t" +elif have_binary scstadmin; then + OCF_RESKEY_implementation_default="scst" +fi +: ${OCF_RESKEY_implementation=${OCF_RESKEY_implementation_default}} + +# Listen on 0.0.0.0:3260 by default +OCF_RESKEY_portals_default="0.0.0.0:3260" +: ${OCF_RESKEY_portals=${OCF_RESKEY_portals_default}} + +OCF_RESKEY_allowed_initiators_default="" +: ${OCF_RESKEY_allowed_initiators=${OCF_RESKEY_allowed_initiators_default}} + +# Lockfile, used for selecting a target ID +LOCKFILE=${HA_RSCTMP}/iSCSITarget-${OCF_RESKEY_implementation}.lock + +# targetcli: iSCSITarget and iSCSILogicalUnit must use the same lockfile 
TARGETLOCKFILE=${HA_RSCTMP}/targetcli.lock

# Timeout for waiting for initiators to log out (only used in scst)
INIT_LOGOUT_TIMEOUT=20
#######################################################################

# Print the OCF resource agent meta-data (XML) on stdout.
# The heredoc is unquoted on purpose: the *_default variables are
# expanded into the parameter descriptions.
meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="iSCSITarget" version="0.9">
<version>1.0</version>

<longdesc lang="en">
Manages iSCSI targets. An iSCSI target is a collection of SCSI Logical
Units (LUs) exported via a daemon that speaks the iSCSI protocol.
</longdesc>
<shortdesc lang="en">iSCSI target export agent</shortdesc>

<parameters>
<parameter name="implementation" required="0" unique="0">
<longdesc lang="en">
The iSCSI target daemon implementation. Must be one of "iet", "tgt",
"lio", "lio-t", or "scst". If unspecified, an implementation is selected based on the
availability of management utilities, with "iet" being tried first,
then "tgt", then "lio", then "lio-t", then "scst".
</longdesc>
<shortdesc lang="en">Specifies the iSCSI target implementation
("iet", "tgt", "lio", "lio-t", or "scst").</shortdesc>
<content type="string" default="${OCF_RESKEY_implementation_default}"/>
</parameter>

<parameter name="iqn" required="1" unique="1">
<longdesc lang="en">
The target iSCSI Qualified Name (IQN). Should follow the conventional
"iqn.yyyy-mm.&lt;reversed domain name&gt;[:identifier]" syntax.
</longdesc>
<shortdesc lang="en">iSCSI target IQN</shortdesc>
<content type="string" />
</parameter>

<parameter name="tid" required="0" unique="1">
<longdesc lang="en">
The iSCSI target ID. Required for tgt.
</longdesc>
<shortdesc lang="en">iSCSI target ID</shortdesc>
<content type="integer" />
</parameter>

<parameter name="portals" required="0" unique="0">
<longdesc lang="en">
iSCSI network portal addresses. Not supported by all
implementations. If unset, the default is to create one portal that
listens on ${OCF_RESKEY_portals_default}.
</longdesc>
<shortdesc lang="en">iSCSI portal addresses</shortdesc>
<content type="string" default="${OCF_RESKEY_portals_default}"/>
</parameter>

<parameter name="iser_portals" required="0" unique="0">
<longdesc lang="en">
iSCSI iSER network portal addresses. Not supported by all
implementations.
</longdesc>
<shortdesc lang="en">iSCSI iSER enabled portal addresses</shortdesc>
<content type="string"/>
</parameter>

<parameter name="allowed_initiators" required="0" unique="0">
<longdesc lang="en">
Allowed initiators. A space-separated list of initiators allowed to
connect to this target. Initiators may be listed in any syntax
the target implementation allows. If this parameter is empty or
not set, access to this target will be allowed from any initiator.
</longdesc>
<shortdesc lang="en">List of iSCSI initiators allowed to connect
to this target</shortdesc>
<content type="string" default="${OCF_RESKEY_allowed_initiators_default}"/>
</parameter>

<parameter name="incoming_username" required="0" unique="1">
<longdesc lang="en">
A username used for incoming initiator authentication. If unspecified,
allowed initiators will be able to log in without authentication.
This is a unique parameter, as it is not allowed to re-use a single
username across multiple target instances.
</longdesc>
<shortdesc lang="en">Incoming account username</shortdesc>
<content type="string"/>
</parameter>

<parameter name="incoming_password" required="0" unique="0">
<longdesc lang="en">
A password used for incoming initiator authentication.
</longdesc>
<shortdesc lang="en">Incoming account password</shortdesc>
<content type="string"/>
</parameter>

<parameter name="additional_parameters" required="0" unique="0">
<longdesc lang="en">
Additional target parameters. A space-separated list of "name=value"
pairs which will be passed through to the iSCSI daemon's management
interface. The supported parameters are implementation
dependent. Neither the name nor the value may contain whitespace.
</longdesc>
<shortdesc lang="en">List of iSCSI target parameters</shortdesc>
<content type="string" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="10s" />
<action name="stop" timeout="10s" />
<action name="status" timeout="10s" interval="10s" depth="0" />
<action name="monitor" timeout="10s" interval="10s" depth="0" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="10s" />
</actions>
</resource-agent>
END
}

#######################################################################

# Print a short usage message on stdout.
iSCSITarget_usage() {
	cat <<END
usage: $0 {start|stop|status|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Apply OCF_RESKEY_additional_parameters to an LIO target by writing each
# value to the matching configfs "param" entry of TPG 1. Parameters
# without a configfs entry are logged and ignored. Exits with
# $OCF_ERR_GENERIC if a write fails.
_iSCSITarget_set_configfs_params() {
	local param
	local name
	local value
	local configfs_path

	for param in ${OCF_RESKEY_additional_parameters}; do
		# Split at the FIRST "=" so that values may contain "=".
		name=${param%%=*}
		value=${param#*=}
		configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn}/tpgt_1/param/${name}"
		if [ -e "${configfs_path}" ]; then
			echo "${value}" > "${configfs_path}" || exit $OCF_ERR_GENERIC
		else
			ocf_log warn "Unsupported iSCSI target parameter ${name}: will be ignored."
		fi
	done
}

# Create and enable the iSCSI target for the configured implementation.
#
# Returns $OCF_SUCCESS right away if the target is already running;
# otherwise returns the result of iSCSITarget_monitor after setup.
# Most failing management commands terminate the agent with
# $OCF_ERR_GENERIC. The scst branch does not check individual commands
# and relies on the final monitor call.
iSCSITarget_start() {
	iSCSITarget_monitor
	if [ $? = $OCF_SUCCESS ]; then
		return $OCF_SUCCESS
	fi

	local param
	local name
	local value
	local initiator
	local portal
	local iser_portal
	local portal_ip
	local configfs_path
	local sep_portal

	case $OCF_RESKEY_implementation in
	iet)
		local lasttid
		local tid
		if [ "${OCF_RESKEY_tid}" ]; then
			tid="${OCF_RESKEY_tid}"
		else
			# Figure out the last used target ID, add 1 to get the new
			# target ID.
			ocf_take_lock $LOCKFILE
			ocf_release_lock_on_exit $LOCKFILE
			lasttid=`sed -ne "s/tid:\([[:digit:]]\+\) name:.*/\1/p" < /proc/net/iet/volume | sort -n | tail -n1`
			[ -z "${lasttid}" ] && lasttid=0
			tid=$((++lasttid))
		fi

		# Create the target.
		ocf_run ietadm --op new \
			--tid=${tid} \
			--params Name=${OCF_RESKEY_iqn} || exit $OCF_ERR_GENERIC

		# Set additional parameters.
		for param in ${OCF_RESKEY_additional_parameters}; do
			name=${param%%=*}
			value=${param#*=}
			ocf_run ietadm --op update \
				--tid=${tid} \
				--params ${name}=${value} || exit $OCF_ERR_GENERIC
		done

		# Legacy versions of IET allow targets by default, current
		# versions deny. To be safe we manage both the .allow and
		# .deny files.
		if [ -n "${OCF_RESKEY_allowed_initiators}" ]; then
			echo "${OCF_RESKEY_iqn} ALL" >> /etc/initiators.deny
			echo "${OCF_RESKEY_iqn} ${OCF_RESKEY_allowed_initiators// /,}" >> /etc/initiators.allow
		else
			echo "${OCF_RESKEY_iqn} ALL" >> /etc/initiators.allow
		fi
		# In iet, adding a new user and assigning it to a target
		# is one operation.
		if [ -n "${OCF_RESKEY_incoming_username}" ]; then
			ocf_run ietadm --op new --user \
				--tid=${tid} \
				--params=IncomingUser=${OCF_RESKEY_incoming_username},Password=${OCF_RESKEY_incoming_password} \
				|| exit $OCF_ERR_GENERIC
		fi
		;;
	tgt)
		local tid
		tid="${OCF_RESKEY_tid}"
		# Create the target.
		ocf_run tgtadm --lld iscsi --op new --mode target \
			--tid=${tid} \
			--targetname ${OCF_RESKEY_iqn} || exit $OCF_ERR_GENERIC

		# Set parameters.
		for param in ${OCF_RESKEY_additional_parameters}; do
			name=${param%%=*}
			value=${param#*=}
			ocf_run tgtadm --lld iscsi --op update --mode target \
				--tid=${tid} \
				--name=${name} --value=${value} || exit $OCF_ERR_GENERIC
		done

		# For tgt, we always have to add access per initiator;
		# access to targets is denied by default. If
		# "allowed_initiators" is unset OR EMPTY, we must use the
		# special keyword ALL. The ":-" form is required because the
		# variable is always set (possibly to "") by the defaults at
		# the top of the agent, so a plain "=" expansion would never
		# fall back to ALL.
		for initiator in ${OCF_RESKEY_allowed_initiators:-ALL}; do
			ocf_run tgtadm --lld iscsi --op bind --mode target \
				--tid=${tid} \
				--initiator-address=${initiator} || exit $OCF_ERR_GENERIC
		done

		# In tgt, we must first create a user account, then assign
		# it to a target using the "bind" operation.
		if [ -n "${OCF_RESKEY_incoming_username}" ]; then
			ocf_run tgtadm --lld iscsi --mode account --op new \
				--user=${OCF_RESKEY_incoming_username} \
				--password=${OCF_RESKEY_incoming_password} || exit $OCF_ERR_GENERIC
			ocf_run tgtadm --lld iscsi --mode account --op bind \
				--tid=${tid} \
				--user=${OCF_RESKEY_incoming_username} || exit $OCF_ERR_GENERIC
		fi
		;;
	lio)
		# lio distinguishes between targets and target portal
		# groups (TPGs). We will always create one TPG, with the
		# number 1. In lio, creating a network portal
		# automatically creates the corresponding target if it
		# doesn't already exist.
		for portal in ${OCF_RESKEY_portals}; do
			ocf_run lio_node --addnp ${OCF_RESKEY_iqn} 1 \
				${portal} || exit $OCF_ERR_GENERIC
		done

		# in lio, we can set target parameters by manipulating
		# the appropriate configfs entries
		_iSCSITarget_set_configfs_params

		# lio does per-initiator filtering by default. To disable
		# this, we need to switch the target to "permissive mode".
		if [ -n "${OCF_RESKEY_allowed_initiators}" ]; then
			for initiator in ${OCF_RESKEY_allowed_initiators}; do
				ocf_run lio_node --addnodeacl ${OCF_RESKEY_iqn} 1 \
					${initiator} || exit $OCF_ERR_GENERIC
			done
		else
			ocf_run lio_node --permissive ${OCF_RESKEY_iqn} 1 || exit $OCF_ERR_GENERIC
			# permissive mode enables read-only access by default,
			# so we need to change that to RW to be in line with
			# the other implementations.
			echo 0 > "/sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn}/tpgt_1/attrib/demo_mode_write_protect"
			if [ `cat /sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn}/tpgt_1/attrib/demo_mode_write_protect` -ne 0 ]; then
				ocf_log err "Failed to disable write protection for target ${OCF_RESKEY_iqn}."
				exit $OCF_ERR_GENERIC
			fi
		fi

		# TODO: add CHAP authentication support when it gets added
		# back into LIO
		ocf_run lio_node --disableauth ${OCF_RESKEY_iqn} 1 || exit $OCF_ERR_GENERIC
		# Finally, we need to enable the target to allow
		# initiators to connect
		ocf_run lio_node --enabletpg=${OCF_RESKEY_iqn} 1 || exit $OCF_ERR_GENERIC
		;;
	lio-t)
		# lio distinguishes between targets and target portal
		# groups (TPGs). We will always create one TPG, with the
		# number 1. In lio, creating a network portal
		# automatically creates the corresponding target if it
		# doesn't already exist.
		ocf_take_lock $TARGETLOCKFILE
		ocf_release_lock_on_exit $TARGETLOCKFILE
		ocf_run targetcli /iscsi set global auto_add_default_portal=false || exit $OCF_ERR_GENERIC
		if ! [ -d /sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn} ] ; then
			ocf_run targetcli /iscsi create ${OCF_RESKEY_iqn} || exit $OCF_ERR_GENERIC
		fi
		for portal in ${OCF_RESKEY_portals}; do
			if [ "$portal" != "${OCF_RESKEY_portals_default}" ] ; then
				# Split "address:port" into its two fields.
				IFS=':' read -a sep_portal <<< "$portal"
				ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/portals create "${sep_portal[0]}" "${sep_portal[1]}" || exit $OCF_ERR_GENERIC
			fi
		done
		# in lio, we can set target parameters by manipulating
		# the appropriate configfs entries
		_iSCSITarget_set_configfs_params

		# allow iSER enabled portal
		for iser_portal in ${OCF_RESKEY_iser_portals}; do
			# Left unquoted below: the "*" must be expanded by the shell.
			configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn}/tpgt_1/np/${iser_portal}\:*/iser"
			if [ -f ${configfs_path} ]; then
				echo "1" > ${configfs_path} || exit $OCF_ERR_GENERIC
			else
				ocf_log warn "Unable to set iSER on: $iser_portal"
			fi
		done

		# lio does per-initiator filtering by default. To disable
		# this, we need to switch the target to "permissive mode".
		if [ -n "${OCF_RESKEY_allowed_initiators}" ]; then
			# enable authentication for tpg1 if incoming_username
			# is defined
			if [ -n "${OCF_RESKEY_incoming_username}" ]; then
				ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/ set attribute authentication=1 || exit $OCF_ERR_GENERIC
				ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/ set attribute generate_node_acls=0 || exit $OCF_ERR_GENERIC
			fi
			for initiator in ${OCF_RESKEY_allowed_initiators}; do
				ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/acls create ${initiator} || exit $OCF_ERR_GENERIC
				# enable chap if incoming_username is defined
				if [ -n "${OCF_RESKEY_incoming_username}" ]; then
					ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/acls/${initiator}/ set auth userid=${OCF_RESKEY_incoming_username} || exit $OCF_ERR_GENERIC
					ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/acls/${initiator}/ set auth password=${OCF_RESKEY_incoming_password} || exit $OCF_ERR_GENERIC
				fi
			done
		else
			if [ -n "${OCF_RESKEY_incoming_username}" ]; then
				ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/ set attribute authentication=1 demo_mode_write_protect=0 generate_node_acls=1 cache_dynamic_acls=1 || exit $OCF_ERR_GENERIC
				ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/ set auth userid=${OCF_RESKEY_incoming_username} || exit $OCF_ERR_GENERIC
				ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/ set auth password=${OCF_RESKEY_incoming_password} || exit $OCF_ERR_GENERIC
			else
				ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/ set attribute authentication=0 demo_mode_write_protect=0 generate_node_acls=1 cache_dynamic_acls=1 || exit $OCF_ERR_GENERIC
			fi
		fi
		;;
	scst)
		ocf_run scstadmin -add_target ${OCF_RESKEY_iqn} -driver iscsi

		for portal in ${OCF_RESKEY_portals}; do
			# scst only wants the IP address for some reason, so strip the port
			portal_ip="${portal%%:*}"
			ocf_run scstadmin -add_tgt_attr ${OCF_RESKEY_iqn} -driver iscsi -attributes "allowed_portal=${portal_ip}"
		done
		ocf_run scstadmin -add_group allowed -driver iscsi -target ${OCF_RESKEY_iqn}
		if [ -n "${OCF_RESKEY_allowed_initiators}" ]; then
			for initiator in ${OCF_RESKEY_allowed_initiators}; do
				ocf_run scstadmin -add_init ${initiator} -group allowed -driver iscsi -target ${OCF_RESKEY_iqn}
			done
		fi

		if [ "${OCF_RESKEY_incoming_username}" != "" ]; then
			ocf_run scstadmin -add_tgt_attr ${OCF_RESKEY_iqn} -driver iscsi -attributes "IncomingUser ${OCF_RESKEY_incoming_username} ${OCF_RESKEY_incoming_password}"
		fi

		ocf_run scstadmin -enable_target ${OCF_RESKEY_iqn} -driver iscsi
		echo 1 > /sys/kernel/scst_tgt/targets/iscsi/enabled
		;;
	esac

	iSCSITarget_monitor
}
+ set -- $(sed -ne '/^tid:'${tid}' /,/^tid/ { + /^[[:space:]]*sid:\([0-9]\+\)/ { + s/^[[:space:]]*sid:\([0-9]*\).*/--sid=\1/; h; + }; + /^[[:space:]]*cid:\([0-9]\+\)/ { + s/^[[:space:]]*cid:\([0-9]*\).*/--cid=\1/; G; p; + }; + }' < /proc/net/iet/session) + while [[ -n $2 ]]; do + # $2 $1 looks like "--sid=X --cid=Y" + ocf_run ietadm --op delete \ + --tid=${tid} $2 $1 + shift 2 + done + # In iet, unassigning a user from a target and + # deleting the user account is one operation. + if [ -n "${OCF_RESKEY_incoming_username}" ]; then + ocf_run ietadm --op delete --user \ + --tid=${tid} \ + --params=IncomingUser=${OCF_RESKEY_incoming_username} \ + || exit $OCF_ERR_GENERIC + fi + # Loop on delete. Keep trying until we time out, if + # necessary. + while true; do + if ietadm --op delete --tid=${tid}; then + ocf_log debug "Removed target ${OCF_RESKEY_iqn}." + break + else + ocf_log warn "Failed to remove target ${OCF_RESKEY_iqn}, retrying." + sleep 1 + fi + done + # Avoid stale /etc/initiators.{allow,deny} entries + # for this target + if [ -e /etc/initiators.deny ]; then + ocf_run sed -e "/^${OCF_RESKEY_iqn}[[:space:]]/d" \ + -i /etc/initiators.deny + fi + if [ -e /etc/initiators.allow ]; then + ocf_run sed -e "/^${OCF_RESKEY_iqn}[[:space:]]/d" \ + -i /etc/initiators.allow + fi + ;; + tgt) + tid="${OCF_RESKEY_tid}" + # Close existing connections. There is no other way to + # do this in tgt than to parse the output of "tgtadm --op + # show". 
+ set -- $(tgtadm --lld iscsi --op show --mode target \ + | sed -ne '/^Target '${tid}':/,/^Target/ { + /^[[:space:]]*I_T nexus: \([0-9]\+\)/ { + s/^.*: \([0-9]*\).*/--sid=\1/; h; + }; + /^[[:space:]]*Connection: \([0-9]\+\)/ { + s/^.*: \([0-9]*\).*/--cid=\1/; G; p; + }; + /^[[:space:]]*LUN information:/ q; + }') + while [[ -n $2 ]]; do + # $2 $1 looks like "--sid=X --cid=Y" + ocf_run tgtadm --lld iscsi --op delete --mode connection \ + --tid=${tid} $2 $1 + shift 2 + done + # In tgt, we must first unbind the user account from + # the target, then remove the account itself. + if [ -n "${OCF_RESKEY_incoming_username}" ]; then + ocf_run tgtadm --lld iscsi --mode account --op unbind \ + --tid=${tid} \ + --user=${OCF_RESKEY_incoming_username} || exit $OCF_ERR_GENERIC + ocf_run tgtadm --lld iscsi --mode account --op delete \ + --user=${OCF_RESKEY_incoming_username} || exit $OCF_ERR_GENERIC + fi + # Loop on delete. Keep trying until we time out, if + # necessary. + while true; do + if tgtadm --lld iscsi --op delete --mode target --tid=${tid}; then + ocf_log debug "Removed target ${OCF_RESKEY_iqn}." + break + else + ocf_log warn "Failed to remove target ${OCF_RESKEY_iqn}, retrying." + sleep 1 + fi + done + # In tgt, we don't have to worry about our ACL + # entries. They are automatically removed upon target + # deletion. + ;; + lio) + # In lio, removing a target automatically removes all + # associated TPGs, network portals, and LUNs. 
# Report whether the configured target exists and is enabled.
#
# Returns $OCF_SUCCESS if the target identified by OCF_RESKEY_iqn is
# present (and, for lio/lio-t/scst, enabled), $OCF_NOT_RUNNING otherwise.
iSCSITarget_monitor() {
	case $OCF_RESKEY_implementation in
	iet)
		# Anchor the IQN at end of line so that a target whose IQN
		# merely starts with ours is not mistaken for this one.
		grep -Eq "tid:[0-9]+ name:${OCF_RESKEY_iqn}$" /proc/net/iet/volume && return $OCF_SUCCESS
		;;
	tgt)
		tgtadm --lld iscsi --op show --mode target \
			| grep -Eq "^Target [0-9]+: ${OCF_RESKEY_iqn}$" && return $OCF_SUCCESS
		;;
	lio | lio-t)
		# if we have no configfs entry for the target, it's
		# definitely stopped
		[ -d /sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn} ] || return $OCF_NOT_RUNNING
		# if the target is there, but its TPG is not enabled, then
		# we also consider it stopped
		[ `cat /sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn}/tpgt_1/enable` -eq 1 ] || return $OCF_NOT_RUNNING
		return $OCF_SUCCESS
		;;
	scst)
		[ -d /sys/kernel/scst_tgt/targets/iscsi/${OCF_RESKEY_iqn} ] || return $OCF_NOT_RUNNING
		[ $(cat /sys/kernel/scst_tgt/targets/iscsi/${OCF_RESKEY_iqn}/enabled) -eq 1 ] || return $OCF_NOT_RUNNING
		return $OCF_SUCCESS
		;;
	esac

	return $OCF_NOT_RUNNING
}

# Validate the resource configuration and the local environment.
#
# Exits with $OCF_ERR_CONFIGURED for a missing required parameter or an
# unsupported implementation, and with $OCF_ERR_INSTALLED when no
# implementation could be determined or (outside of probes) required
# binaries / kernel interfaces are missing. Returns $OCF_SUCCESS otherwise.
iSCSITarget_validate() {
	# Do we have all required variables?
	local required_vars
	local param
	case $OCF_RESKEY_implementation in
	tgt)
		required_vars="iqn tid"
		;;
	iet|lio|lio-t|scst)
		# The meta-data declares "iqn" as required for every
		# implementation, so enforce it for all supported ones.
		required_vars="iqn"
		;;
	esac

	for var in ${required_vars}; do
		param="OCF_RESKEY_${var}"
		if [ -z "${!param}" ]; then
			ocf_exit_reason "Missing resource parameter \"$var\"!"
			exit $OCF_ERR_CONFIGURED
		fi
	done

	# Is the configured implementation supported?
	case "$OCF_RESKEY_implementation" in
	"iet"|"tgt"|"lio"|"lio-t"|"scst")
		;;
	"")
		# The user didn't specify an implementation, and we were
		# unable to determine one from installed binaries (in
		# other words: no binaries for any supported
		# implementation could be found)
		ocf_exit_reason "Undefined iSCSI target implementation"
		exit $OCF_ERR_INSTALLED
		;;
	*)
		ocf_exit_reason "Unsupported iSCSI target implementation \"$OCF_RESKEY_implementation\"!"
		exit $OCF_ERR_CONFIGURED
		;;
	esac

	# Do we have any configuration parameters that the current
	# implementation does not support?
	local unsupported_params
	local var
	local envar
	local defvar
	case $OCF_RESKEY_implementation in
	iet|tgt)
		# IET and tgt do not support binding a target portal to a
		# specific IP address.
		unsupported_params="portals"
		;;
	lio|lio-t)
		unsupported_params="tid"
		;;
	scst)
		unsupported_params="tid iser_portals"
		;;
	esac

	for var in ${unsupported_params}; do
		envar=OCF_RESKEY_${var}
		defvar=OCF_RESKEY_${var}_default
		# Only warn when the parameter was set to something other
		# than its default value.
		if [ -n "${!envar}" ]; then
			if [[ "${!envar}" != "${!defvar}" ]];then
				case "$__OCF_ACTION" in
				start|validate-all)
					ocf_log warn "Configuration parameter \"${var}\"" \
						"is not supported by the iSCSI implementation" \
						"and will be ignored." ;;
				esac
			fi
		fi
	done

	if ! ocf_is_probe; then
		# Do we have all required binaries?
		case $OCF_RESKEY_implementation in
		iet)
			check_binary ietadm
			;;
		tgt)
			check_binary tgtadm
			;;
		lio)
			check_binary tcm_node
			check_binary lio_node
			;;
		lio-t)
			check_binary targetcli
			;;
		scst)
			check_binary scstadmin
			;;
		esac

		# Is the required kernel functionality available?
		case $OCF_RESKEY_implementation in
		iet)
			[ -d /proc/net/iet ]
			if [ $? -ne 0 ]; then
				ocf_log err "/proc/net/iet does not exist or is not a directory -- check if required modules are loaded."
				exit $OCF_ERR_INSTALLED
			fi
			;;
		tgt)
			# tgt is userland only
			;;
		lio)
			# lio needs configfs to be mounted
			if ! grep -Eq "^.*/sys/kernel/config[[:space:]]+configfs" /proc/mounts; then
				ocf_log err "configfs not mounted at /sys/kernel/config -- check if required modules are loaded."
				exit $OCF_ERR_INSTALLED
			fi
			# check for configfs entries created by target_core_mod
			if [ ! -d /sys/kernel/config/target ]; then
				ocf_log err "/sys/kernel/config/target does not exist or is not a directory -- check if required modules are loaded."
				exit $OCF_ERR_INSTALLED
			fi
			;;
		lio-t)
			#targetcli loads the needed kernel modules
			;;
		scst)
			if [ ! -d /sys/kernel/scst_tgt ]; then
				ocf_log err "/sys/kernel/scst_tgt does not exist or is not a directory -- check if required modules are loaded."
				exit $OCF_ERR_INSTALLED
			fi
			;;
		esac
	fi

	return $OCF_SUCCESS
}


# meta-data and usage must work without a valid configuration, so they
# are handled before validation.
case $1 in
	meta-data)
		meta_data
		exit $OCF_SUCCESS
		;;
	usage|help)
		iSCSITarget_usage
		exit $OCF_SUCCESS
		;;
esac

# Everything except usage and meta-data must pass the validate test
iSCSITarget_validate

case $__OCF_ACTION in
start)		iSCSITarget_start;;
stop)		iSCSITarget_stop;;
monitor|status)	iSCSITarget_monitor;;
reload)		ocf_log err "Reloading..."
		iSCSITarget_start
		;;
validate-all)	;;
*)		iSCSITarget_usage
		exit $OCF_ERR_UNIMPLEMENTED
		;;
esac
# Capture the status of the action that just ran; it is logged and used
# as the exit code by the statements that follow.
rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/ids b/heartbeat/ids new file mode 100755 index 0000000..0d9e1e1 --- /dev/null +++ b/heartbeat/ids @@ -0,0 +1,751 @@ +#!/bin/sh +# +# +# ids +# +# Description: +# +# OCF resource agent that manages an +# IBM Informix Dynamic Server (IDS) instance +# as an High-Availability resource. +#### +# +# Author: Lars D. Forseth, <lars.forseth@de.ibm.com> or <lars@forseth.de> +# Created: May 25th 2007 +# Last Modified: July 30th 2007 +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL), Version 2 or later +# Copyright: (c) 2002 - 2007 International Business Machines, Inc. +# +# This code is inspired by the db2 OCF resource agent +# written by Alan Robertson, <alanr@unix.sh> +#### +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+#### +# +# Example usage as it would appear in /etc/ha.d/haresources: +# node1 192.168.0.1 ids::/informix::ids1::onconfig.ids1 +# +# +# --> Note that passing dbname and sqltestquery in heartbeat version 1 style is not supported! +# +# See usage() function below for more details... +#### +# +# OCF instance parameters: +# OCF_RESKEY_informixdir +# OCF_RESKEY_informixserver +# OCF_RESKEY_onconfig +# OCF_RESKEY_dbname +# OCF_RESKEY_sqltestquery +#### + + +# +# Include general OCF functions and variables (such as OCF return codes). +# +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_informixdir_default="" +OCF_RESKEY_informixserver_default="" +OCF_RESKEY_onconfig_default="" +OCF_RESKEY_dbname_default="sysmaster" +OCF_RESKEY_sqltestquery_default="SELECT COUNT(*) FROM systables;" + +: ${OCF_RESKEY_informixdir=${OCF_RESKEY_informixdir_default}} +: ${OCF_RESKEY_informixserver=${OCF_RESKEY_informixserver_default}} +: ${OCF_RESKEY_onconfig=${OCF_RESKEY_onconfig_default}} +: ${OCF_RESKEY_dbname=${OCF_RESKEY_dbname_default}} +: ${OCF_RESKEY_sqltestquery=${OCF_RESKEY_sqltestquery_default}} + +# +# Function that displays the usage of this script. +# +ids_usage() { + methods=`ids_methods` + methods=`echo $methods | tr ' ' '|'` + + echo " + usage: $0 ($methods) + + $0 manages an IBM Informix Dynamic Server (IDS) instance as an High-Availability resource. + + The 'start' operation starts the database. + The 'stop' operation stops the database. 
+ The 'status' operation reports whether the database is running + The 'monitor' operation reports whether the database seems to be working + The 'validate-all' operation reports whether the parameters are valid + The 'methods' operation lists the methods $0 supports + The 'usage' operation displays this text + The 'meta-data' operation returns the meta-data (in XML) of this resource script + " +} + + +# +# Function that displays the possible methods this script supports. +# +ids_methods() { + echo " + start + stop + status + monitor + validate-all + methods + usage + meta-data + " +} + + +# +# Function that displays the meta-data of this OCF resource agent. +# +ids_meta_data() { + cat <<-! +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="ids" version="1.0"> +<version>1.0</version> + + +<longdesc lang="en"> +OCF resource agent to manage an IBM Informix Dynamic Server (IDS) instance as an High-Availability resource. +</longdesc> +<shortdesc lang="en">Manages an Informix Dynamic Server (IDS) instance</shortdesc> + + +<parameters> + +<parameter name="informixdir" required="0"> +<longdesc lang="en"> +The value the environment variable INFORMIXDIR has after a typical installation of IDS. +Or in other words: the path (without trailing '/') where IDS was installed to. +If this parameter is unspecified the script will try to get the value from the shell environment. +</longdesc> +<shortdesc lang="en"> +INFORMIXDIR environment variable +</shortdesc> +<content type="string" default="${OCF_RESKEY_informixdir_default}" /> +</parameter> + +<parameter name="informixserver" required="0"> +<longdesc lang="en"> +The value the environment variable INFORMIXSERVER has after a typical installation of IDS. +Or in other words: the name of the IDS server instance to manage. +If this parameter is unspecified the script will try to get the value from the shell environment. 
+</longdesc> +<shortdesc lang="en"> +INFORMIXSERVER environment variable +</shortdesc> +<content type="string" default="${OCF_RESKEY_informixserver_default}" /> +</parameter> + +<parameter name="onconfig" required="0"> +<longdesc lang="en"> +The value the environment variable ONCONFIG has after a typical installation of IDS. +Or in other words: the name of the configuration file for the IDS instance specified in INFORMIXSERVER. +The specified configuration file will be searched at '$INFORMIXDIR/etc/$ONCONFIG'. +If this parameter is unspecified the script will try to get the value from the shell environment. +</longdesc> +<shortdesc lang="en"> +ONCONFIG environment variable +</shortdesc> +<content type="string" default="${OCF_RESKEY_onconfig_default}" /> +</parameter> + +<parameter name="dbname" required="0"> +<longdesc lang="en"> +This parameter defines which database to use in order to monitor the IDS instance. +If this parameter is unspecified the script will use the 'sysmaster' database as a default. +</longdesc> +<shortdesc lang="en"> +database to use for monitoring, defaults to 'sysmaster' +</shortdesc> +<content type="string" default="${OCF_RESKEY_dbname_default}" /> +</parameter> + +<parameter name="sqltestquery" required="0"> +<longdesc lang="en"> +SQL test query to run on the database specified by the parameter 'dbname' +in order to monitor the IDS instance and determine if it's functional or not. +If this parameter is unspecified the script will use 'SELECT COUNT(*) FROM systables;' as a default. 
+</longdesc> +<shortdesc lang="en"> +SQL test query to use for monitoring, defaults to 'SELECT COUNT(*) FROM systables;' +</shortdesc> +<content type="string" default="${OCF_RESKEY_sqltestquery_default}" /> +</parameter> + +</parameters> + + +<actions> +<action name="start" timeout="120s" /> +<action name="stop" timeout="120s" /> +<action name="status" timeout="60s" /> +<action name="monitor" depth="0" timeout="30s" interval="10s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +<action name="methods" timeout="5s" /> +<action name="usage" timeout="5s" /> +</actions> + + +</resource-agent> +! +} + + +# +# Function that either forwards log messages to the ocf_log function +# provided by heartbeat or simply prints them to standard out via echo. +# This is determined by setting the variable "idslogger" to "echo" or "ocf". +# The default for "idslogger" is "ocf". +# +ids_log() { + + # Where should the passed log messages be passed to, + # to the standard output via the echo command ("echo") + # or to the ocf_log function provided by heartbeat ("ocf") ? + # Default is "ocf". + idslogger="ocf" + + # When the variable "idsdebug" is not set to "true" + # this function (ids_log) will not print any info message + # that has been forwarded to it! + # This is done in order to spare if-statements within the + # other functions in this script and to centralize the decision + # whether to have a chatty resource script or not... ;) + # Nevertheless, error messages will always be printed! + idsdebug=false + + # Only continue if the two expected parameters + # are not empty and "idsdebug" is set to "true" + # or the message is of type "error". + if [ $# -eq 2 -a -n "$1" -a -n "$2" ]; then + if [ "$idsdebug" = "true" -o "$1" = "error" ]; then + case $idslogger in + # Print messages to stdout via echo command. + echo) + echo "`date +'%b %d %H:%M:%S'`: [$1] $2";; + # Pass messages to ocf_log function. 
#
# Function that validates the configuration and prepares the environment
# needed by the IDS instance.
# If the three parameters informixdir, informixserver and onconfig have all
# been passed, they are exported as INFORMIXDIR, INFORMIXSERVER and ONCONFIG;
# otherwise the values already present in the parent's shell environment are
# checked instead.
# The variables that are checked and set (only the capitalized ones are set) are:
# - INFORMIXDIR
# - INFORMIXSERVER
# - ONCONFIG
# - PATH
# - LD_LIBRARY_PATH
# - dbname
# - sqltestquery
#
# Returns OCF_SUCCESS when the configuration is usable, OCF_ERR_ARGS when
# INFORMIXDIR, INFORMIXSERVER or ONCONFIG is invalid and OCF_ERR_PERM when one
# of the required IDS commands is missing or not executable. The first error
# found is the one reported; later checks are skipped so they can neither
# overwrite it nor log misleading follow-up errors.
#
ids_validate() {

    ids_log info "called ids_validate"
    rc=$OCF_SUCCESS

    # All three OCF parameters were passed - export them to the shell environment.
    # If any of them is missing, the values inherited from the environment are kept.
    if [ -n "$OCF_RESKEY_informixdir" ] && [ -n "$OCF_RESKEY_informixserver" ] && [ -n "$OCF_RESKEY_onconfig" ]; then
        ids_log info "ids_validate: passed vars not empty"

        INFORMIXDIR=$OCF_RESKEY_informixdir
        export INFORMIXDIR

        INFORMIXSERVER=$OCF_RESKEY_informixserver
        export INFORMIXSERVER

        ONCONFIG=$OCF_RESKEY_onconfig
        export ONCONFIG
    fi

    # INFORMIXDIR has to be non-empty and an existing directory.
    if [ -n "$INFORMIXDIR" ] && [ -d "$INFORMIXDIR" ]; then
        ids_log info "ids_validate: INFORMIXDIR is valid: $INFORMIXDIR"
    else
        ids_log error "ids_validate: INFORMIXDIR is invalid: $INFORMIXDIR"
        rc=$OCF_ERR_ARGS
    fi

    # INFORMIXSERVER has to be non-empty.
    if [ $rc -eq $OCF_SUCCESS ]; then
        if [ -n "$INFORMIXSERVER" ]; then
            ids_log info "ids_validate: INFORMIXSERVER is valid: $INFORMIXSERVER"
        else
            ids_log error "ids_validate: INFORMIXSERVER is invalid: $INFORMIXSERVER"
            rc=$OCF_ERR_ARGS
        fi
    fi

    # ONCONFIG has to name a non-empty file in $INFORMIXDIR/etc/.
    # When ONCONFIG is not set at all, fall back to "onconfig" and then to
    # "onconfig.std" if one of those files exists and is non-empty.
    if [ $rc -eq $OCF_SUCCESS ]; then
        if [ -z "$ONCONFIG" ]; then
            if [ -s "$INFORMIXDIR/etc/onconfig" ]; then
                ONCONFIG="onconfig"
                export ONCONFIG
            elif [ -s "$INFORMIXDIR/etc/onconfig.std" ]; then
                ONCONFIG="onconfig.std"
                export ONCONFIG
            fi
        fi

        if [ -n "$ONCONFIG" ] && [ -s "$INFORMIXDIR/etc/$ONCONFIG" ]; then
            ids_log info "ids_validate: ONCONFIG is a non-empty file in: \$INFORMIXDIR/etc/\$ONCONFIG where ONCONFIG=$ONCONFIG"
        else
            ids_log error "ids_validate: ONCONFIG is invalid, searched for it in: \$INFORMIXDIR/etc/\$ONCONFIG where ONCONFIG=$ONCONFIG"
            rc=$OCF_ERR_ARGS
        fi
    fi

    # The commands oninit, onstat, onmode and dbaccess have to exist in
    # INFORMIXDIR/bin/ and be executable.
    if [ $rc -eq $OCF_SUCCESS ]; then
        if [ -x "$INFORMIXDIR/bin/oninit" ] && [ -x "$INFORMIXDIR/bin/onstat" ] && \
           [ -x "$INFORMIXDIR/bin/onmode" ] && [ -x "$INFORMIXDIR/bin/dbaccess" ]; then
            ids_log info "ids_validate: oninit, onstat, onmode and dbaccess exist and are executable in: \$INFORMIXDIR/bin/"
        else
            ids_log error "ids_validate: oninit, onstat, onmode or dbaccess don't exist or they are not executable in: \$INFORMIXDIR/bin/"
            rc=$OCF_ERR_PERM
        fi
    fi

    # Extend PATH and LD_LIBRARY_PATH as needed for the IDS instance to run properly
    # BUT: only do this if it hasn't been done before! Otherwise PATH and LD_LIBRARY_PATH
    # will keep on growing every time the IDS resource agent script is called.
    # The test is a literal substring match, so characters in INFORMIXDIR that
    # are special in regular expressions cannot distort it.
    if [ $rc -eq $OCF_SUCCESS ]; then
        case "$PATH" in
            *"$INFORMIXDIR"*)
                ids_log info "ids_validate: INFORMIXDIR already in PATH, where PATH=$PATH"
                ;;
            *)
                PATH="${INFORMIXDIR}/bin:${PATH}"
                export PATH
                ids_log info "ids_validate: PATH did not contain INFORMIXDIR, added \$INFORMIXDIR/bin"
                ;;
        esac

        case "$LD_LIBRARY_PATH" in
            *"$INFORMIXDIR"*)
                ids_log info "ids_validate: INFORMIXDIR already in LD_LIBRARY_PATH, where LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
                ;;
            *)
                # Prepend to the existing value (if any) instead of replacing it.
                LD_LIBRARY_PATH="${INFORMIXDIR}/lib:${INFORMIXDIR}/lib/esql${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
                export LD_LIBRARY_PATH
                ids_log info "ids_validate: LD_LIBRARY_PATH did not contain INFORMIXDIR, added \$INFORMIXDIR/lib and \$INFORMIXDIR/lib/esql, added them"
                ;;
        esac
    fi

    # An empty dbname is not an error: assign the default.
    # This must not touch rc, otherwise an earlier failure would be masked.
    if [ -n "$OCF_RESKEY_dbname" ]; then
        ids_log info "ids_validate: dbname is valid: $OCF_RESKEY_dbname"
    else
        ids_log info "ids_validate: dbname is invalid: $OCF_RESKEY_dbname"
        ids_log info "ids_validate: using '${OCF_RESKEY_dbname_default}' as default..."
        OCF_RESKEY_dbname="${OCF_RESKEY_dbname_default}"
        export OCF_RESKEY_dbname
    fi

    # An empty sqltestquery is not an error either: assign the default.
    if [ -n "$OCF_RESKEY_sqltestquery" ]; then
        ids_log info "ids_validate: sqltestquery is valid: $OCF_RESKEY_sqltestquery"
    else
        ids_log info "ids_validate: sqltestquery is invalid: $OCF_RESKEY_sqltestquery"
        ids_log info "ids_validate: using '${OCF_RESKEY_sqltestquery_default}' as default..."
        OCF_RESKEY_sqltestquery="${OCF_RESKEY_sqltestquery_default}"
        export OCF_RESKEY_sqltestquery
    fi

    # Return exit status code (the first validation error, or success).
    return $rc
}
#
# Function that starts the IDS instance and reports any error that
# may occur while starting.
#
# Returns OCF_SUCCESS when the instance is (or already was) running,
# OCF_ERR_GENERIC when the instance is in an undefined state or 'oninit'
# failed, and OCF_ERR_UNIMPLEMENTED when ids_status returned an unexpected code.
#
ids_start() {

    ids_log info "called ids_start"

    # Get current status of IDS instance.
    ids_status
    stat=$?

    case $stat in

        # IDS instance already running - exit with success.
        $OCF_SUCCESS)
            ids_log info "ids_start: IDS instance already running: $stat"
            rc=$OCF_SUCCESS;;

        # IDS instance in undefined state - exit with error.
        $OCF_ERR_GENERIC)
            ids_log error "ids_start: IDS instance in undefined state: $stat"
            ids_debug
            rc=$OCF_ERR_GENERIC;;

        # IDS instance not running - try to start it.
        $OCF_NOT_RUNNING)
            ids_log info "ids_start: executing 'oninit' now..."
            oninit
            stat=$?
            ids_log info "ids_start: done executing 'oninit': $stat"

            # The oninit command terminated successfully - check new state of IDS instance.
            if [ $stat -eq 0 ]; then
                # Initialize stat with failure exit status code.
                stat=$OCF_ERR_GENERIC
                # Loop that waits until IDS is completely online.
                # There is deliberately no upper bound here: if IDS takes too
                # long or hangs, the start timeout of the cluster manager
                # cancels the operation and thereby terminates the loop.
                while [ $stat -ne $OCF_SUCCESS ]; do
                    ids_status
                    stat=$?
                    # Pause between polls so waiting does not spin the CPU
                    # with back-to-back 'onstat' invocations.
                    if [ $stat -ne $OCF_SUCCESS ]; then
                        sleep 1
                    fi
                done
                # IDS is running now - success.
                ids_log info "ids_start: IDS instance successfully started: $stat"
                rc=$OCF_SUCCESS
            # The oninit command terminated with an error - starting the IDS resource failed!
            else
                ids_log error "ids_start: starting IDS instance failed: $stat"
                ids_debug
                rc=$OCF_ERR_GENERIC
            fi
            ;;

        # Unexpected state - return OCF_ERR_UNIMPLEMENTED error.
        *)
            ids_log error "ids_start: unexpected state returned from ids_status: $stat"
            ids_debug
            rc=$OCF_ERR_UNIMPLEMENTED;;

    esac

    # Return exit status code.
    return $rc
}
+ $OCF_NOT_RUNNING) + ids_log info "ids_stop: IDS instance is not running: $stat" + rc=$OCF_SUCCESS;; + + # IDS instance is in an undefined state - exit with error. + $OCF_ERR_GENERIC) + ids_log error "ids_stop: IDS instance in undefined state: $stat" + ids_debug + rc=$OCF_ERR_GENERIC;; + + # IDS instance is running - try to stop it. + $OCF_SUCCESS) + ids_log info "ids_stop: running 'onmode -kuy' now..." + onmode -kuy + stat=$? + ids_log info "ids_stop: done running 'onmode -kuy' now: $stat" + + # The onmode command terminated successfully - check new state of the IDS instance. + if [ $stat -eq 0 ]; then + ids_status + stat=$? + # New state is: not running - success. + if [ $stat -eq $OCF_NOT_RUNNING ]; then + ids_log info "ids_stop: IDS instance successfully stopped: $stat" + rc=$OCF_SUCCESS + # New state is: running or even undefined - failure! + else + ids_log error "ids_stop: stopping IDS instance failed: $stat" + ids_debug + rc=$OCF_ERR_GENERIC + fi + + # The onmode command terminated with an error - stopping the IDS resource failed! + else + ids_log error "ids_stop: stopping IDS instance (by executing 'onmode -kuy') failed: $stat" + ids_debug + rc=$OCF_ERR_GENERIC + fi + ;; + + # Unexpected state - return OCF_ERR_UNIMPLEMENTED error. + *) + ids_log error "ids_stop: unexpected state returned from ids_status: $stat" + ids_debug + rc=$OCF_ERR_UNIMPLEMENTED;; + + esac + + # Return exit status code indicating whether IDS was successfully stopped or not. + return $rc +} + + +# +# Function that determines the current status/state of the IDS instance, +# meaning whether it is running (the case when output of "onstat -" contains "On-Line"), +# not running (the case when output of "onstat -" contains "shared memory not initialized") +# or in an undefined state (the case output of "onstat -" contains "Quiescent", "Single-User", or other). +# If the IDS instance is declared running the exit status code will indicate succes, otherwise failure of course. 
#
# Function that monitors the current status _and_ functionality of the IDS instance.
# First the state of the instance is determined. If it is running, the sql test query
# is executed on the database. If the sql test query executes successfully, the
# instance's status is rechecked and if it is still running, the function returns
# an exit status code indicating success. If any of these steps fails, an error
# code is returned.
#
# Returns OCF_SUCCESS, OCF_NOT_RUNNING, OCF_ERR_GENERIC, or
# OCF_ERR_UNIMPLEMENTED when ids_status returned an unexpected code.
#
ids_monitor() {

    ids_log info "called ids_monitor"

    ids_status
    stat=$?

    case $stat in

        # IDS instance is not running - monitoring failed.
        $OCF_NOT_RUNNING)
            ids_log info "ids_monitor: IDS instance is not running: $stat"
            rc=$OCF_NOT_RUNNING;;

        # IDS instance in an undefined state - exit with error.
        $OCF_ERR_GENERIC)
            ids_log error "ids_monitor: IDS instance in undefined state: $stat"
            ids_debug
            rc=$OCF_ERR_GENERIC;;

        # IDS instance is running - try to execute the sql test query and recheck state.
        $OCF_SUCCESS)
            ids_log info "ids_monitor: IDS instance is running (before executing sql test query)"
            ids_log info "ids_monitor: running sql test query now..."
            # The query has to be passed on verbatim: unquoted, a '*' in it
            # (e.g. "SELECT * FROM t") would be expanded by the shell to the
            # file names in the current working directory. printf is used so
            # backslashes in the query are not interpreted either.
            printf '%s\n' "$OCF_RESKEY_sqltestquery" | dbaccess "$OCF_RESKEY_dbname" - > /dev/null 2>&1
            stat=$?
            ids_log info "ids_monitor: done running sql test query now: $stat"

            # The sql test query terminated successfully - check the new state of the IDS instance.
            if [ $stat -eq 0 ]; then
                ids_status
                stat=$?
                # New state is: running - success.
                if [ $stat -eq $OCF_SUCCESS ]; then
                    ids_log info "ids_monitor: successfully ran sql test query on IDS instance: $stat"
                    rc=$OCF_SUCCESS
                # New state is: not running or even undefined - failure!
                else
                    ids_log error "ids_monitor: running sql test query on IDS instance failed: $stat"
                    ids_debug
                    rc=$OCF_ERR_GENERIC
                fi

            # The sql test query terminated with an error - exit with error!
            else
                ids_log error "ids_monitor: running sql test query on IDS instance failed: $stat"
                ids_debug
                rc=$OCF_ERR_GENERIC
            fi
            ;;

        # Unexpected state - return OCF_ERR_UNIMPLEMENTED error!
        *)
            ids_log error "ids_monitor: unexpected state returned from ids_status: $stat"
            ids_debug
            rc=$OCF_ERR_UNIMPLEMENTED;;

    esac

    # Return exit status code indicating whether IDS is running and functional or not.
    return $rc
}
+ if [ $valid -ne $OCF_SUCCESS ]; then + ids_log error "main section: terminating script due to invalid configuration" + ids_debug + exit $OCF_ERR_ARGS + fi +fi + +# Configuration valid or method equals to "validate-all" - react depending on called method. +case "$1" in + + start) + ids_start + exit $?;; + + stop) + ids_stop + exit $?;; + + status) + ids_status + exit $?;; + + monitor) + ids_monitor + exit $?;; + + validate-all) + ids_validate + exit $?;; + + methods) + ids_methods + exit $?;; + + *) + ids_log error "mainsection: no or invalid command supplied: $1" + exit $OCF_ERR_UNIMPLEMENTED;; + +esac +############################################################################### diff --git a/heartbeat/iface-bridge b/heartbeat/iface-bridge new file mode 100755 index 0000000..a4e50ad --- /dev/null +++ b/heartbeat/iface-bridge @@ -0,0 +1,843 @@ +#!/bin/sh +# +# OCF Resource Agent compliant iface-bridge script. +# +# Implements network Bridge interface management +# +# Copyright (C) 2013 Red Hat, Inc. All rights reserved. +# Author: Fabio M. Di Nitto <fdinitto@redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# + +# TODO: +# * Eventually improve bridge_check to verify all runtime +# parameters. Is it really necessary? +# * consider add support for advanced multicast timers tuning +# sethashel <bridge> <int> set hash elasticity default 4 +# sethashmax <bridge> <int> set hash max default 512 +# setmclmc <bridge> <int> set multicast last member count default 2, ? +# setmcsqc <bridge> <int> set multicast startup query count default 2, ? +# setmclmi <bridge> <time> set multicast last member interval default HZ +# setmcmi <bridge> <time> set multicast membership interval default 260 * HZ +# setmcqpi <bridge> <time> set multicast querier interval default 255 * HZ +# setmcqi <bridge> <time> set multicast query interval detault 125 * HZ +# setmcqri <bridge> <time> set multicast query response interval default 10 * HZ +# setmcqri <bridge> <time> set multicast startup query interval default 125 * hZ / 4 +# +# +# OCF parameters are as below +# OCF_RESKEY_bridge_name +# OCF_RESKEY_bridge_slaves +# OCF_RESKEY_bridge_ageing +# OCF_RESKEY_port_hairpin +# OCF_RESKEY_stp +# OCF_RESKEY_stp_bridgeprio +# OCF_RESKEY_stp_fd +# OCF_RESKEY_stp_maxage +# OCF_RESKEY_stp_hello +# OCF_RESKEY_stp_pathcost +# OCF_RESKEY_stp_portprio +# OCF_RESKEY_multicast_router +# OCF_RESKEY_multicast_snooping +# OCF_RESKEY_multicast_querier +# OCF_RESKEY_multicast_port_router +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_stp_default=false +OCF_RESKEY_stp_fd_default=0 +OCF_RESKEY_multicast_router_default=1 +OCF_RESKEY_multicast_snooping_default=1 +OCF_RESKEY_multicast_querier_default=0 + +: ${OCF_RESKEY_stp=${OCF_RESKEY_stp_default}} +: ${OCF_RESKEY_stp_fd=${OCF_RESKEY_stp_fd_default}} +: ${OCF_RESKEY_multicast_router=${OCF_RESKEY_multicast_router_default}} +: ${OCF_RESKEY_multicast_snooping=${OCF_RESKEY_multicast_snooping_default}} +: ${OCF_RESKEY_multicast_querier=${OCF_RESKEY_multicast_querier_default}} + +# binaries +: ${BRCTL:=brctl} + +####################################################################### + +bridge_usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +bridge_meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="iface-bridge" version="1.0"> + <version>1.0</version> + + <longdesc lang="en"> + This resource manages Bridge network interfaces. + It can add, remove, configure bridges and spanning-tree. + </longdesc> + + <shortdesc lang="en"> + Manages Bridge network interfaces. + </shortdesc> + + <parameters> + <parameter name="bridge_name" unique="1" required="1"> + <longdesc lang="en"> + Define the name of the bridge (max 15 charaters). + </longdesc> + <shortdesc lang="en"> + Name of the bridge + </shortdesc> + <content type="string"/> + </parameter> + + <parameter name="bridge_slaves" unique="1"> + <longdesc lang="en"> + Define the list of interfaces, space separated, to add to the bridge. + The list can be empty. + </longdesc> + <shortdesc lang="en"> + Network interface + </shortdesc> + <content type="string"/> + </parameter> + + <parameter name="bridge_ageing" unique="0"> + <longdesc lang="en"> + Set the ethernet (MAC) address ageing time in seconds. + </longdesc> + <shortdesc lang="en"> + MAC ageing in seconds. 
+ </shortdesc> + <content type="integer"/> + </parameter> + + <parameter name="port_hairpin" unique="0"> + <longdesc lang="en"> + Set hairpin forwarding mode. + A list of ports that should have hairpin enabled + can be specified using the following + Example: eth0 eth1 + </longdesc> + <shortdesc lang="en"> + Set hairpin forwarding mode. + </shortdesc> + <content type="string"/> + </parameter> + + <parameter name="stp" unique="0"> + <longdesc lang="en"> + Enable or disable Spanning Tree Protocol on the bridge. + </longdesc> + <shortdesc lang="en"> + Spanning Tree Protocol + </shortdesc> + <content type="boolean" default="${OCF_RESKEY_stp_default}"/> + </parameter> + + <parameter name="stp_bridgeprio" unique="0"> + <longdesc lang="en"> + Set the bridge's priority to defined value. The priority value is a + number between 0 and 65535), and has no dimension. Lower priority values are + preferred. The bridge with the lowest priority will be elected as root bridge. + </longdesc> + <shortdesc lang="en"> + Set the bridge's priority. + </shortdesc> + <content type="integer"/> + </parameter> + + <parameter name="stp_fd" unique="0"> + <longdesc lang="en"> + Set the bridge forward delay (in seconds). + </longdesc> + <shortdesc lang="en"> + Set the bridge forward delay. + </shortdesc> + <content type="integer" default="${OCF_RESKEY_stp_fd_default}"/> + </parameter> + + <parameter name="stp_maxage" unique="0"> + <longdesc lang="en"> + Set the bridge maximum message age (in seconds). + </longdesc> + <shortdesc lang="en"> + Set the bridge maximum message age. + </shortdesc> + <content type="integer"/> + </parameter> + + <parameter name="stp_hello" unique="0"> + <longdesc lang="en"> + Set the bridge hello time (in seconds). + </longdesc> + <shortdesc lang="en"> + Set the bridge hello time. + </shortdesc> + <content type="integer"/> + </parameter> + + <parameter name="stp_pathcost" unique="0"> + <longdesc lang="en"> + Set the port cost. This is a dimensionless metric. 
+ A list of port/cost can be specified using the following + format: unpromoted cost unpromoted cost. + Example: eth0 100 eth1 1000 + </longdesc> + <shortdesc lang="en"> + Set the port cost. + </shortdesc> + <content type="string"/> + </parameter> + + <parameter name="stp_portprio" unique="0"> + <longdesc lang="en"> + Set the port priority. This is a number between 0 and 63. + $BRCTL man page reports a value between 0 and 255, but + tests show a limit of 63 on a live system. + This metric is used in the designated port and root port + selection algorithms. + A list of port/priority can be specified using the following + format: unpromoted cost unpromoted cost. + Example: eth0 10 eth1 60 + </longdesc> + <shortdesc lang="en"> + Set the port priority. + </shortdesc> + <content type="string"/> + </parameter> + + <parameter name="multicast_router" unique="0"> + <longdesc lang="en"> + Enable or disable multicast routing on the bridge. + </longdesc> + <shortdesc lang="en"> + Enable or disable multicast routing. + </shortdesc> + <content type="boolean" default="${OCF_RESKEY_multicast_router_default}"/> + </parameter> + + <parameter name="multicast_snooping" unique="0"> + <longdesc lang="en"> + Enable or disable multicast snooping on the bridge. + </longdesc> + <shortdesc lang="en"> + Enable or disable multicast snooping. + </shortdesc> + <content type="boolean" default="${OCF_RESKEY_multicast_snooping_default}"/> + </parameter> + + <parameter name="multicast_port_router" unique="0"> + <longdesc lang="en"> + Enable or disable a port from the multicast router. + Kernel enables all port by default. + A list of port can be specified using the following + format: unpromoted 0|1 unpromoted 0|1. + Example: eth0 1 eth1 0 + </longdesc> + <shortdesc lang="en"> + Enable or disable a port from the multicast router. 
+ </shortdesc> + <content type="string"/> + </parameter> + </parameters> + + <actions> + <action name="start" timeout="30s" /> + <action name="stop" timeout="20s" /> + <action name="status" timeout="20s" depth="0" interval="10s" /> + <action name="monitor" timeout="20s" depth="0" interval="10s" /> + <action name="meta-data" timeout="5s" /> + <action name="validate-all" timeout="20s" /> + </actions> +</resource-agent> +END +} + +# commodity function +# split_string eth0 100 eth1 1000 eth2 100 +# eth0 100 +# eth1 1000 +# eth2 100 + +split_string() { + while [ -n "$1" ]; do + echo $1 $2 + shift && shift + done +} + +# check if the interface is admin up/down + +iface_is_up() { + if ! $IP2UTIL -o link show $1 | \ + sed -e 's#.*<##g' -e 's#>.*##' -e 's#LOWER_UP##g' | \ + grep -q UP; then + return 1 + fi + return 0 +} + +# check if the slaves have link layer up/down +# see kernel network documentation on meaning of LOWER_UP flag +# for more in depth explanation on how it works +# NOTE: this check is not reliable in virt environment +# since interfaces are always LOWER_UP. There is no way +# from the guest to know if the host has disconnected somehow + +iface_lower_is_up() { + if ! $IP2UTIL -o link show $1 | \ + grep -q LOWER_UP; then + return 1 + fi + return 0 +} + +# wrapup function to check if any interface defined in +# option lists is a slave + +iface_is_slave() { + for slave in $OCF_RESKEY_bridge_slaves; do + if [ "$1" = "$slave" ]; then + return 0 + fi + done + return 1 +} + +bridge_validate() { + check_binary $BRCTL + check_binary $IP2UTIL + + if [ -z "$OCF_RESKEY_bridge_name" ]; then + ocf_log err "Invalid OCF_RESKEY_bridge_name: value cannot be empty" + return 1 + fi + + # the echo .. is the equivalent of strlen in bash + # + # /usr/include/linux/if.h:#define IFNAMSIZ 16 + # needs to include 0 byte end string + + if [ "${#OCF_RESKEY_bridge_name}" -gt 15 ]; then + ocf_log err "Invalid OCF_RESKEY_bridge_name: name is too long" + return 1 + fi + + if [ ! 
-d "/sys/class/net" ]; then + ocf_log err "Unable to find sysfs network class in /sys" + return 1 + fi + + for slave in $OCF_RESKEY_bridge_slaves; do + if [ ! -e "/sys/class/net/$slave" ]; then + ocf_log err "Invalid OCF_RESKEY_bridge_slaves: $slave does not exists" + return 1 + fi + done + + # check if declared harpin ports are slaves + for hairpin in $OCF_RESKEY_port_hairpin; do + if ! iface_is_slave $hairpin; then + ocf_log err "Invalid OCF_RESKEY_port_hairpin: $hairpin is not listed in OCF_RESKEY_bridge_slaves" + return 1 + fi + done + + if [ -n "$OCF_RESKEY_bridge_ageing" ]; then + if ! ocf_is_decimal "$OCF_RESKEY_bridge_ageing"; then + ocf_log err "Invalid OCF_RESKEY_bridge_ageing: must be a decimal value (0 or greater)" + return 1 + fi + fi + + # OCF_RESKEY_stp_fd needs special handling as it can be 0 or greater + # but only when configured before OCF_RESKEY_stp. + # if enabled after OCF_RESKEY_stp with stp=true the value range is + # different. It is not clear from the man page or brctl documentation + # what the range is ahead of time. + if [ -n "$OCF_RESKEY_stp_fd" ]; then + if ! ocf_is_decimal "$OCF_RESKEY_stp_fd" || \ + [ "$OCF_RESKEY_stp_fd" -lt 0 ]; then + ocf_log err "Invalid OCF_RESKEY_stp_fd: must be a decimal value (0 or greater)" + return 1 + fi + fi + + if ocf_is_true "$OCF_RESKEY_stp"; then + + if [ -n "$OCF_RESKEY_stp_bridgeprio" ]; then + if ! ocf_is_decimal "$OCF_RESKEY_stp_bridgeprio" || \ + [ "$OCF_RESKEY_stp_bridgeprio" -gt 65535 ]; then + ocf_log err "Invalid OCF_RESKEY_stp_bridgeprio: must be a decimal value between 0 and 65535 included" + return 1 + fi + fi + + if [ -n "$OCF_RESKEY_stp_hello" ]; then + if ! ocf_is_decimal "$OCF_RESKEY_stp_hello"; then + ocf_log err "Invalid OCF_RESKEY_stp_hello: must be a decimal value (0 or greater)" + return 1 + fi + fi + + if [ -n "$OCF_RESKEY_stp_maxage" ]; then + if ! 
ocf_is_decimal "$OCF_RESKEY_stp_maxage"; then + ocf_log err "Invalid OCF_RESKEY_stp_maxage: must be a decimal value (0 or greater)" + return 1 + fi + fi + + if [ -n "$OCF_RESKEY_stp_pathcost" ]; then + split_string $OCF_RESKEY_stp_pathcost | { while read iface cost; do + if ! iface_is_slave $iface; then + ocf_log err "Invalid OCF_RESKEY_stp_pathcost: $iface is not listed in OCF_RESKEY_bridge_slaves" + return 1 + fi + + if ! ocf_is_decimal $cost; then + ocf_log err "Invalid OCF_RESKEY_stp_pathcost: cost must be a decimal value (0 or great)" + return 1 + fi + done + } + fi + + if [ -n "$OCF_RESKEY_stp_portprio" ]; then + split_string $OCF_RESKEY_stp_portprio | { while read iface prio; do + if ! iface_is_slave $iface; then + ocf_log err "Invalid OCF_RESKEY_stp_portprio: $iface is not listed in OCF_RESKEY_bridge_slaves" + return 1 + fi + + if ! ocf_is_decimal $prio || \ + [ "$prio" -gt "63" ]; then + ocf_log err "Invalid OCF_RESKEY_stp_portprio: priority must be a decimal value between 0 and 63 included" + return 1 + fi + done + } + fi + + fi + + if [ -n "$OCF_RESKEY_multicast_port_router" ]; then + split_string $OCF_RESKEY_multicast_port_router | { while read iface mcport; do + if ! iface_is_slave $iface; then + ocf_log err "Invalid OCF_RESKEY_multicast_port_router: $iface is not listed in OCF_RESKEY_bridge_slaves" + return 1 + fi + + if ! ocf_is_decimal $mcport || \ + [ "$mcport" -gt "1" ]; then + ocf_log err "Invalid OCF_RESKEY_multicast_port_router: valuer must be 0 (disabled) or 1 (enabled)" + return 1 + fi + done + } + fi + + return 0 +} + +bridge_check() { + if [ -e "/sys/class/net/$OCF_RESKEY_bridge_name" ]; then + if [ ! -e "$HA_RSCTMP/iface-bridge.$OCF_RESKEY_bridge_name" ]; then + return $OCF_ERR_GENERIC + fi + else + if [ -e "$HA_RSCTMP/iface-bridge.$OCF_RESKEY_bridge_name" ]; then + error="$(rm -f "$HA_RSCTMP/iface-bridge.$OCF_RESKEY_bridge_name" 2>&1)" + if [ "$?" 
!= "0" ]; then + ocf_log err "Unable to remove stale lock file for bridge $OCF_RESKEY_bridge_name: $error" + return $OCF_ERR_GENERIC + fi + fi + return $OCF_NOT_RUNNING + fi + + # we check if all slaves are still part of the bridge + for slave in $OCF_RESKEY_bridge_slaves; do + if [ ! -e "/sys/class/net/$OCF_RESKEY_bridge_name/brif/$slave" ]; then + ocf_log err "Interface $slave is not part of the bridge $OCF_RESKEY_bridge_name" + return $OCF_ERR_GENERIC + fi + if ! iface_is_up $slave; then + ocf_log err "Interface $slave of the bridge $OCF_RESKEY_bridge_name is administratively down" + return $OCF_ERR_GENERIC + fi + done + + # check if bridge is still "UP" + # is there a cleaner way? + + if ! iface_is_up $OCF_RESKEY_bridge_name; then + ocf_log err "Bridge $OCF_RESKEY_bridge_name is administratively down" + return $OCF_ERR_GENERIC + fi + + if ! iface_lower_is_up $OCF_RESKEY_bridge_name; then + ocf_log err "Bridge $OCF_RESKEY_bridge_name has no active link-layer slaves" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +# we need a simpler stop version to clean after us if start fails +# without involving any error checking +# rolling back in case of failure is otherwise complex + +bridge_force_stop() { + $IP2UTIL link set dev "$OCF_RESKEY_bridge_name" down 2>&1 + for slave in $OCF_RESKEY_bridge_slaves; do + $IP2UTIL link set dev "$slave" down 2>&1 + done + $BRCTL delbr "$OCF_RESKEY_bridge_name" 2>&1 + rm -f "$HA_RSCTMP/iface-bridge.$OCF_RESKEY_bridge_name" 2>&1 +} + +bridge_start() { + # check if the bridge already exists + bridge_check + ret=$? + if [ "$ret" != "$OCF_NOT_RUNNING" ]; then + return $ret + fi + + # create the bridge + error="$($BRCTL addbr "$OCF_RESKEY_bridge_name" 2>&1)" + if [ "$?" 
!= "0" ]; then + ocf_log err "Unable to create bridge $OCF_RESKEY_bridge_name: $error" + return $OCF_ERR_GENERIC + fi + + # add slaves if configured + for slave in $OCF_RESKEY_bridge_slaves; do + error="$($BRCTL addif "$OCF_RESKEY_bridge_name" "$slave" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to add interface $slave to bridge $OCF_RESKEY_bridge_name: $error" + return $OCF_ERR_GENERIC + fi + done + + # set haripin forward mode + for hairpin in $OCF_RESKEY_port_hairpin; do + error="$($BRCTL hairpin "$OCF_RESKEY_bridge_name" "$hairpin" on 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set hairpin on for interface $hairpin to bridge $OCF_RESKEY_bridge_name: $error" + return $OCF_ERR_GENERIC + fi + done + + # set bridge ageing + if [ -n "$OCF_RESKEY_bridge_ageing" ]; then + error="$($BRCTL setageing "$OCF_RESKEY_bridge_name" "$OCF_RESKEY_bridge_ageing" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set bridge $OCF_RESKEY_bridge_name ageing to $OCF_RESKEY_bridge_ageing: $error" + return $OCF_ERR_GENERIC + fi + fi + + # OCF_RESKEY_stp_fd needs special handling as it can be 0 or greater + # but only when configured before OCF_RESKEY_stp. + # if enabled after OCF_RESKEY_stp with stp=true the value range is + # different. It is not clear from the man page or brctl documentation + # what the range is ahead of time. + if [ -n "$OCF_RESKEY_stp_fd" ]; then + error="$($BRCTL setfd "$OCF_RESKEY_bridge_name" "$OCF_RESKEY_stp_fd" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set bridge forward delay $OCF_RESKEY_stp_fd on bridge $OCF_RESKEY_bridge_name : $error" + return $OCF_ERR_GENERIC + fi + fi + + # enable/disable spanning tree protocol + if ocf_is_true "$OCF_RESKEY_stp"; then + error="$($BRCTL stp "$OCF_RESKEY_bridge_name" on 2>&1)" + if [ "$?" 
!= "0" ]; then + ocf_log err "Unable to enable STP on bridge $OCF_RESKEY_bridge_name : $error" + return $OCF_ERR_GENERIC + fi + + # set bridge priority + if [ -n "$OCF_RESKEY_stp_bridgeprio" ]; then + error="$($BRCTL setbridgeprio "$OCF_RESKEY_bridge_name" "$OCF_RESKEY_stp_bridgeprio" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set bridge priority $OCF_RESKEY_stp_bridgeprio on bridge $OCF_RESKEY_bridge_name : $error" + return $OCF_ERR_GENERIC + fi + fi + + # set hello timer + if [ -n "$OCF_RESKEY_stp_hello" ]; then + error="$($BRCTL sethello "$OCF_RESKEY_bridge_name" "$OCF_RESKEY_stp_hello" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set bridge hello timer $OCF_RESKEY_stp_hello on bridge $OCF_RESKEY_bridge_name : $error" + return $OCF_ERR_GENERIC + fi + fi + + + # set max age + if [ -n "$OCF_RESKEY_stp_maxage" ]; then + error="$($BRCTL setmaxage "$OCF_RESKEY_bridge_name" "$OCF_RESKEY_stp_maxage" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set bridge max age $OCF_RESKEY_stp_maxage on bridge $OCF_RESKEY_bridge_name : $error" + return $OCF_ERR_GENERIC + fi + fi + + # set path cost per port + if [ -n "$OCF_RESKEY_stp_pathcost" ]; then + split_string $OCF_RESKEY_stp_pathcost | { while read iface cost; do + error="$($BRCTL setpathcost "$OCF_RESKEY_bridge_name" "$iface" "$cost" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set pathcost $cost for interface $iface on bridge $OCF_RESKEY_bridge_name : $error" + return $OCF_ERR_GENERIC + fi + done + } + fi + + # set port priority per port + if [ -n "$OCF_RESKEY_stp_portprio" ]; then + split_string $OCF_RESKEY_stp_portprio | { while read iface prio; do + error="$($BRCTL setportprio "$OCF_RESKEY_bridge_name" "$iface" "$prio" 2>&1)" + if [ "$?" 
!= "0" ]; then + ocf_log err "Unable to set portprio $prio for interface $iface on bridge $OCF_RESKEY_bridge_name : $error" + return $OCF_ERR_GENERIC + fi + done + } + fi + else + # stp off is default via brctl/kernel interface but it + # is best to force it since we don't know if default + # has changed across kernel releases + error="$($BRCTL stp "$OCF_RESKEY_bridge_name" off 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to disable STP on bridge $OCF_RESKEY_bridge_name : $error" + return $OCF_ERR_GENERIC + fi + fi + + # set slaves up + for slave in $OCF_RESKEY_bridge_slaves; do + error="$($IP2UTIL link set dev "$slave" up 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set slave $slave for bridge $OCF_RESKEY_bridge_name up: $error" + return $OCF_ERR_GENERIC + fi + done + + # set the bridge up + error="$($IP2UTIL link set dev "$OCF_RESKEY_bridge_name" up 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set bridge $OCF_RESKEY_bridge_name up: $error" + return $OCF_ERR_GENERIC + fi + + # multicast operations can only be executed after the bridge is up + # and configured. + + # enable/disable multicast router + if ocf_is_true "$OCF_RESKEY_multicast_router"; then + mcrouter=1 + else + mcrouter=0 + fi + if [ -e "/sys/class/net/$OCF_RESKEY_bridge_name/bridge/multicast_router" ]; then + error="$(echo $mcrouter > /sys/class/net/$OCF_RESKEY_bridge_name/bridge/multicast_router 2>&1)" + if [ "$?" 
!= "0" ]; then + ocf_log err "Unable to set OCF_RESKEY_multicast_router for bridge $OCF_RESKEY_bridge_name: $error" + return $OCF_ERR_GENERIC + fi + else + ocf_log warn "Unable to set multicast router on bridge $OCF_RESKEY_bridge_name because kernel does not support it" + fi + + # enable/disable multicast snoopint + if ocf_is_true "$OCF_RESKEY_multicast_snooping"; then + mcsnooping=1 + else + mcsnooping=0 + fi + if [ -e "/sys/class/net/$OCF_RESKEY_bridge_name/bridge/multicast_snooping" ]; then + error="$(echo $mcsnooping > /sys/class/net/$OCF_RESKEY_bridge_name/bridge/multicast_snooping 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set OCF_RESKEY_multicast_snooping for bridge $OCF_RESKEY_bridge_name: $error" + return $OCF_ERR_GENERIC + fi + else + ocf_log warn "Unable to set multicast snooping on bridge $OCF_RESKEY_bridge_name because kernel does not support it" + fi + + # enable/disable multicast querier + if ocf_is_true "$OCF_RESKEY_multicast_querier"; then + mcquerier=1 + else + mcquerier=0 + fi + if [ -e "/sys/class/net/$OCF_RESKEY_bridge_name/bridge/multicast_querier" ]; then + error="$(echo $mcquerier > /sys/class/net/$OCF_RESKEY_bridge_name/bridge/multicast_querier 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set OCF_RESKEY_multicast_querier for bridge $OCF_RESKEY_bridge_name: $error" + return $OCF_ERR_GENERIC + fi + else + ocf_log warn "Unable to set multicast querier on bridge $OCF_RESKEY_bridge_name because kernel does not support it" + fi + + # set multicast router per port + if [ -n "$OCF_RESKEY_multicast_port_router" ]; then + split_string $OCF_RESKEY_multicast_port_router | { while read iface mcport; do + if [ -e "/sys/class/net/$iface/brport/multicast_router" ]; then + error="$(echo $mcport > /sys/class/net/$iface/brport/multicast_router 2>&1)" + if [ "$?" 
!= "0" ]; then + ocf_log err "Unable to set OCF_RESKEY_multicast_port_router $mcport for interface $iface on bridge $OCF_RESKEY_bridge_name : $error" + return $OCF_ERR_GENERIC + fi + else + ocf_log warn "Unable to set multicast port router on bridge $OCF_RESKEY_bridge_name because kernel does not support it" + fi + done + } + fi + + error="$(touch "$HA_RSCTMP/iface-bridge.$OCF_RESKEY_bridge_name")" + if [ "$?" != "0" ]; then + ocf_log err "Unable to write lock file for bridge $OCF_RESKEY_bridge_name: $error" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +bridge_stop() { + bridge_check + ret=$? + if [ "$ret" = "$OCF_NOT_RUNNING" ]; then + return $OCF_SUCCESS + fi + if [ "$ret" != "$OCF_SUCCESS" ]; then + return $ret + fi + + # set bridge down + error="$($IP2UTIL link set dev "$OCF_RESKEY_bridge_name" down 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set bridge $OCF_RESKEY_bridge_name down: $error" + return $OCF_ERR_GENERIC + fi + + # set slaves down + for slave in $OCF_RESKEY_bridge_slaves; do + error="$($IP2UTIL link set dev "$slave" down 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set slave $slave for bridge $OCF_RESKEY_bridge_name down: $error" + return $OCF_ERR_GENERIC + fi + done + + # delete bridge + error="$($BRCTL delbr "$OCF_RESKEY_bridge_name" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to delete bridge $OCF_RESKEY_bridge_name: $error" + return $OCF_ERR_GENERIC + fi + + error="$(rm -f "$HA_RSCTMP/iface-bridge.$OCF_RESKEY_bridge_name" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to remove lock file for bridge $OCF_RESKEY_bridge_name: $error" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +case $__OCF_ACTION in + meta-data) + bridge_meta_data + exit $OCF_SUCCESS + ;; + usage|help) + bridge_usage + exit $OCF_SUCCESS + ;; +esac + +if [ ! 
-d "$HA_RSCTMP" ]; then + ocf_log debug "$HA_RSCTMP not found, we are probably being executed manually" + mkdir -p "$HA_RSCTMP" +fi + +if [ -n "$__OCF_ACTION" ] && ! bridge_validate; then + exit $OCF_ERR_CONFIGURED +fi + +case $__OCF_ACTION in + start|stop) + if ! ocf_is_root; then + ocf_log err "You must be root for $__OCF_ACTION operation." + exit $OCF_ERR_PERM + fi + ;; +esac + +case $__OCF_ACTION in + start) + bridge_start + ret=$? + if [ "$ret" != "$OCF_SUCCESS" ]; then + bridge_force_stop + fi + exit $ret + ;; + stop) + bridge_stop + exit $? + ;; + status|monitor) + bridge_check + exit $? + ;; + validate-all) + # bridge_validate above does the trick + ;; + *) + bridge_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +# vi:sw=4:ts=8: diff --git a/heartbeat/iface-macvlan b/heartbeat/iface-macvlan new file mode 100755 index 0000000..685f886 --- /dev/null +++ b/heartbeat/iface-macvlan @@ -0,0 +1,363 @@ +#!/bin/sh +# +# OCF Resource Agent compliant iface-macvlan script. +# +# Implements network MACVLAN interface management +# +# Resource script for MACVLAN dervice from Fabio M. Di Nitto iface-vlan +# script. +# +# Author: Ulrich Goettlich +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. 
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#

# TODO:
#
# OCF parameters are as below
# OCF_RESKEY_device
# OCF_RESKEY_name
# OCF_RESKEY_mode
# OCF_RESKEY_mac
#

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Defaults
OCF_RESKEY_mode_default="bridge"

: ${OCF_RESKEY_mode=${OCF_RESKEY_mode_default}}

#######################################################################

# Print the command line synopsis of the agent.

macvlan_usage() {
    cat <<END
usage: $0 {start|stop|status|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Print the OCF meta-data (XML) describing parameters and actions.

macvlan_meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="iface-macvlan" version="1.0">
  <version>1.0</version>

  <longdesc lang="en">
    This resource manages MACVLAN network interfaces.
    It can add, remove, configure MACVLANs.
  </longdesc>

  <shortdesc lang="en">
    Manages MACVLAN network interfaces.
  </shortdesc>

  <parameters>
    <parameter name="device" unique="0" required="1">
      <longdesc lang="en">
        Define the interface where MACVLAN should be attached.
      </longdesc>
      <shortdesc lang="en">
        Network interface.
      </shortdesc>
      <content type="string"/>
    </parameter>

    <parameter name="name" unique="1" required="1">
      <longdesc lang="en">
        Define the MACVLAN NAME. It has to be a valid interface name (max 15 characters).
      </longdesc>
      <shortdesc lang="en">
        Define the MACVLAN NAME.
      </shortdesc>
      <content type="string"/>
    </parameter>

    <parameter name="mode" unique="0">
      <longdesc lang="en">
        Define the name of the MACVLAN mode (currently only bridge is supported).
      </longdesc>
      <shortdesc lang="en">
        Mode of the macvlan.
      </shortdesc>
      <content type="string" default="${OCF_RESKEY_mode_default}" />
    </parameter>

    <parameter name="mac" unique="1">
      <longdesc lang="en">
        Set the interface MAC address explicitly.
      </longdesc>
      <shortdesc lang="en">MAC address of the macvlan</shortdesc>
      <content type="string" />
</parameter>
  </parameters>

  <actions>
    <action name="start" timeout="30s" />
    <action name="stop" timeout="20s" />
    <action name="status" timeout="20s" depth="0" interval="10s" />
    <action name="monitor" timeout="20s" depth="0" interval="10s" />
    <action name="meta-data" timeout="5s" />
    <action name="validate-all" timeout="20s" />
  </actions>
</resource-agent>
END
}

# check if the interface is admin up/down
# $1 = interface name; returns 0 when the UP flag is set, 1 otherwise.
# The name is quoted so that an empty argument makes the ip call fail
# instead of listing (and matching) every interface on the system.

iface_is_up() {
    # keep only the <FLAGS> section and drop LOWER_UP so that the
    # final grep can only match the administrative UP flag
    if ! $IP2UTIL -o link show "$1" | \
        sed -e 's#.*<##g' -e 's#>.*##' -e 's#LOWER_UP##g' | \
        grep -q UP; then
        return 1
    fi
    return 0
}

# check if the slaves have link layer up/down
# see kernel network documentation on meaning of LOWER_UP flag
# for more in depth explanation on how it works
# NOTE: this check is not reliable in virt environment
# since interfaces are always LOWER_UP. There is no way
# from the guest to know if the host has disconnected somehow
# $1 = interface name; returns 0 when LOWER_UP is reported, 1 otherwise.

iface_lower_is_up() {
    if ! $IP2UTIL -o link show "$1" | \
        grep -q LOWER_UP; then
        return 1
    fi
    return 0
}

# Validate the resource configuration.
# Returns 0 when the configuration is acceptable, otherwise one of the
# OCF_ERR_* codes after logging the reason. Note that the dispatch code
# below only tests for success and exits with OCF_ERR_CONFIGURED on any
# failure.

macvlan_validate() {
    check_binary $IP2UTIL

    if [ -z "$OCF_RESKEY_device" ]; then
        ocf_log err "Invalid device: value cannot be empty"
        return $OCF_ERR_CONFIGURED
    fi

    if [ -z "$OCF_RESKEY_name" ]; then
        ocf_log err "Invalid name: value cannot be empty"
        return $OCF_ERR_CONFIGURED
    fi

    # ${#var} expands to the length of the string
    #
    # /usr/include/linux/if.h:#define IFNAMSIZ 16
    # needs to include 0 byte end string

    if [ "${#OCF_RESKEY_name}" -gt 15 ]; then
        ocf_log err "Invalid name: name is too long"
        return $OCF_ERR_CONFIGURED
    fi

    if [ ! -d "/sys/class/net" ]; then
        ocf_log err "Unable to find sysfs network class in /sys"
        return $OCF_ERR_GENERIC
    fi

    if [ ! -e "/sys/class/net/$OCF_RESKEY_device" ]; then
        ocf_log err "Invalid device: $OCF_RESKEY_device does not exists"
        return $OCF_ERR_ARGS
    fi

    if [ "${OCF_RESKEY_mode}" != "bridge" ]; then
        ocf_log err "Invalid mode: only bridge mode is currently supported (because other modes are not tested)"
        return $OCF_ERR_CONFIGURED
    fi

    return 0
}

# Report the state of the MACVLAN interface.
# Returns OCF_NOT_RUNNING when the interface does not exist (a stale lock
# file is removed on the way), OCF_ERR_GENERIC when it exists without our
# lock file, is administratively down or has no LOWER_UP, and OCF_SUCCESS
# otherwise.

macvlan_check() {
    if [ -e "/sys/class/net/$OCF_RESKEY_name" ]; then
        if [ ! -e "$HA_RSCTMP/iface-macvlan.$OCF_RESKEY_name" ]; then
            return $OCF_ERR_GENERIC
        fi
    else
        if [ -e "$HA_RSCTMP/iface-macvlan.$OCF_RESKEY_name" ]; then
            error="$(rm -f "$HA_RSCTMP/iface-macvlan.$OCF_RESKEY_name" 2>&1)"
            if [ "$?" != "0" ]; then
                ocf_log err "Unable to remove stale lock file for macvlan $OCF_RESKEY_name: $error"
                return $OCF_ERR_INSTALLED
            fi
        fi
        return $OCF_NOT_RUNNING
    fi

    if ! iface_is_up $OCF_RESKEY_name; then
        ocf_log err "MACVLAN $OCF_RESKEY_name is administratively down"
        return $OCF_ERR_GENERIC
    fi

    if ! iface_lower_is_up $OCF_RESKEY_name; then
        ocf_log err "MACVLAN $OCF_RESKEY_name has no active link-layer"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# we need a simpler stop version to clean after us if start fails
# without involving any error checking
# rolling back in case of failure is otherwise complex

macvlan_force_stop() {
    $IP2UTIL link delete "$OCF_RESKEY_name" >/dev/null 2>&1
    rm -f "$HA_RSCTMP/iface-macvlan.$OCF_RESKEY_name" 2>&1
}

# Create the MACVLAN on top of OCF_RESKEY_device, bring both interfaces
# up and write the lock file.
# Returns the macvlan_check result when the interface is not in the
# "not running" state, OCF_ERR_GENERIC on the first failing step and
# OCF_SUCCESS otherwise.

macvlan_start() {
    # check if the macvlan already exists
    macvlan_check
    ret=$?
    if [ "$ret" != "$OCF_NOT_RUNNING" ]; then
        return $ret
    fi

    # create the MACVLAN
    # set_specific_mac is expanded unquoted on purpose: it is either
    # empty or the two words "address <mac>"
    set_specific_mac=""
    [ -n "${OCF_RESKEY_mac}" ] && set_specific_mac="address ${OCF_RESKEY_mac}"
    error="$($IP2UTIL link add link "$OCF_RESKEY_device" ${set_specific_mac} name "$OCF_RESKEY_name" type macvlan mode "$OCF_RESKEY_mode" 2>&1)"
    if [ "$?" != "0" ]; then
        ocf_log err "Unable to create MACVLAN $OCF_RESKEY_name: $error"
        return $OCF_ERR_GENERIC
    fi

    # set the interface up
    # (this is the parent device, not the MACVLAN itself, and the log
    # message says so)
    error="$($IP2UTIL link set dev "$OCF_RESKEY_device" up 2>&1)"
    if [ "$?" != "0" ]; then
        ocf_log err "Unable to set interface $OCF_RESKEY_device up: $error"
        return $OCF_ERR_GENERIC
    fi

    # set the macvlan up
    error="$($IP2UTIL link set dev "$OCF_RESKEY_name" up 2>&1)"
    if [ "$?" != "0" ]; then
        ocf_log err "Unable to set MACVLAN $OCF_RESKEY_name up: $error"
        return $OCF_ERR_GENERIC
    fi

    error="$(touch "$HA_RSCTMP/iface-macvlan.$OCF_RESKEY_name" 2>&1)"
    if [ "$?" != "0" ]; then
        ocf_log err "Unable to create lock file for MACVLAN $OCF_RESKEY_name: $error"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Bring the MACVLAN down, delete it and remove the lock file.
# Returns OCF_SUCCESS when the interface is already gone or was removed,
# the macvlan_check result when the interface is in an error state, and
# OCF_ERR_GENERIC when one of the teardown steps fails.

macvlan_stop() {
    macvlan_check
    ret=$?
    if [ "$ret" = "$OCF_NOT_RUNNING" ]; then
        return $OCF_SUCCESS
    fi
    if [ "$ret" != "$OCF_SUCCESS" ]; then
        return $ret
    fi

    # set macvlan down
    error="$($IP2UTIL link set dev "$OCF_RESKEY_name" down 2>&1)"
    if [ "$?" != "0" ]; then
        ocf_log err "Unable to set MACVLAN $OCF_RESKEY_name down: $error"
        return $OCF_ERR_GENERIC
    fi

    # delete macvlan
    error="$($IP2UTIL link delete "$OCF_RESKEY_name" 2>&1)"
    if [ "$?" != "0" ]; then
        ocf_log err "Unable to delete MACVLAN $OCF_RESKEY_name: $error"
        return $OCF_ERR_GENERIC
    fi

    error="$(rm -f "$HA_RSCTMP/iface-macvlan.$OCF_RESKEY_name" 2>&1)"
    if [ "$?" != "0" ]; then
        ocf_log err "Unable to remove lock file for MACVLAN $OCF_RESKEY_name: $error"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# meta-data and usage must work without a valid configuration, so they
# are served before any validation takes place
case $__OCF_ACTION in
    meta-data)
        macvlan_meta_data
        exit $OCF_SUCCESS
        ;;
    usage|help)
        macvlan_usage
        exit $OCF_SUCCESS
        ;;
esac

if [ ! -d "$HA_RSCTMP" ]; then
    ocf_log debug "$HA_RSCTMP not found, we are probably being executed manually"
    mkdir -p "$HA_RSCTMP"
fi

if [ -n "$__OCF_ACTION" ] && ! macvlan_validate; then
    exit $OCF_ERR_CONFIGURED
fi

case $__OCF_ACTION in
    start|stop)
        if ! ocf_is_root; then
            ocf_log err "You must be root for $__OCF_ACTION operation."
            exit $OCF_ERR_PERM
        fi
        ;;
esac

case $__OCF_ACTION in
    start)
        macvlan_start
        ret=$?
        # a failed start may leave a half configured interface behind
        if [ "$ret" != "$OCF_SUCCESS" ]; then
            macvlan_force_stop
        fi
        exit $ret
        ;;
    stop)
        macvlan_stop
        exit $?
        ;;
    status|monitor)
        macvlan_check
        exit $?
        ;;
    validate-all)
        # macvlan_validate above does the trick
        ;;
    *)
        macvlan_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
# vi:sw=4:ts=8:
diff --git a/heartbeat/iface-vlan b/heartbeat/iface-vlan
new file mode 100755
index 0000000..019c2e1
--- /dev/null
+++ b/heartbeat/iface-vlan
@@ -0,0 +1,475 @@
#!/bin/sh
#
# OCF Resource Agent compliant iface-vlan script.
#
# Implements network VLAN interface management
#
# Copyright (C) 2013 Red Hat, Inc. All rights reserved.
# Author: Fabio M. Di Nitto <fdinitto@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.
Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# + +# TODO: +# +# OCF parameters are as below +# OCF_RESKEY_vlan_interface +# OCF_RESKEY_vlan_id +# OCF_RESKEY_vlan_name +# OCF_RESKEY_vlan_reorder_hdr +# OCF_RESKEY_vlan_gvrp +# OCF_RESKEY_vlan_mvrp +# OCF_RESKEY_vlan_loose_binding +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_vlan_reorder_hdr_default=1 +OCF_RESKEY_vlan_gvrp_default=0 +OCF_RESKEY_vlan_mvrp_default=0 +OCF_RESKEY_vlan_loose_binding_default=0 +OCF_RESKEY_vlan_name_default=${OCF_RESKEY_vlan_interface}.${OCF_RESKEY_vlan_id} + +: ${OCF_RESKEY_vlan_name=${OCF_RESKEY_vlan_name_default}} +: ${OCF_RESKEY_vlan_reorder_hdr=${OCF_RESKEY_vlan_reorder_hdr_default}} +: ${OCF_RESKEY_vlan_gvrp=${OCF_RESKEY_vlan_gvrp_default}} + +# don't set defaults for mvrp or loose binding since both +# are rather new kernel features and they might not be supported +#: ${OCF_RESKEY_vlan_mvrp=${OCF_RESKEY_vlan_mvrp_default}} +#: ${OCF_RESKEY_vlan_loose_binding=${OCF_RESKEY_vlan_loose_binding_default}} + +####################################################################### + +vlan_usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. 
+END +} + +vlan_meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="iface-vlan" version="1.0"> + <version>1.0</version> + + <longdesc lang="en"> + This resource manages VLAN network interfaces. + It can add, remove, configure VLANs. + </longdesc> + + <shortdesc lang="en"> + Manages VLAN network interfaces. + </shortdesc> + + <parameters> + <parameter name="vlan_interface" unique="0" required="1"> + <longdesc lang="en"> + Define the interface where VLAN should be attached. + </longdesc> + <shortdesc lang="en"> + Network interface. + </shortdesc> + <content type="string"/> + </parameter> + + <parameter name="vlan_id" unique="0" required="1"> + <longdesc lang="en"> + Define the VLAN ID. It has to be a value between 0 and 4094. + </longdesc> + <shortdesc lang="en"> + Define the VLAN ID. + </shortdesc> + <content type="integer"/> + </parameter> + + <parameter name="vlan_name" unique="1"> + <longdesc lang="en"> + Define the name of the VLAN interface (max 15 charaters). + </longdesc> + <shortdesc lang="en"> + Name of the VLAN. + </shortdesc> + <content type="string" default="${OCF_RESKEY_vlan_name_default}" /> + </parameter> + + <parameter name="vlan_reorder_hdr" unique="0"> + <longdesc lang="en"> + Enable or disable header reordering. + </longdesc> + <shortdesc lang="en"> + Enable or disable header reordering. + </shortdesc> + <content type="boolean" default="${OCF_RESKEY_vlan_reorder_hdr_default}"/> + </parameter> + + <parameter name="vlan_gvrp" unique="0"> + <longdesc lang="en"> + Enable or disable GARP VLAN registration protocol. + </longdesc> + <shortdesc lang="en"> + Enable or disable gvrp. + </shortdesc> + <content type="boolean" default="${OCF_RESKEY_vlan_gvrp_default}"/> + </parameter> + + <parameter name="vlan_mvrp" unique="0"> + <longdesc lang="en"> + Enable or disable Multiple VLAN Registration Protocol. 
+ Please note that most distributions do not ship a version of iproute2 + that supports mvrp yet, even if the kernel has support for it. + Check output of $IPADDR2 link add type vlan --help in the FLAG + section to verify if mvrp support is available. + </longdesc> + <shortdesc lang="en"> + Enable or disable mvrp. + </shortdesc> + <content type="boolean" default="${OCF_RESKEY_vlan_mvrp_default}"/> + </parameter> + + <parameter name="vlan_loose_binding" unique="0"> + <longdesc lang="en"> + Enable or disable VLAN loose bind. By default the VLAN interface + admin status (UP/DOWN) follows the underneath interface status. + Enabling loose bind allows the VLAN to disconnect from the + interface status. Be very careful that enabling loose binding + could invalidate this agent monitor operations. + Please note that most distributions do not ship a version of iproute2 + that supports loose_binding yet, even if the kernel has support for it. + Check output of $IPADDR2 link add type vlan --help in the FLAG + section to verify if loose_binding support is available. + </longdesc> + <shortdesc lang="en"> + Enable or disable loose binding. + </shortdesc> + <content type="boolean" default="${OCF_RESKEY_vlan_loose_binding_default}"/> + </parameter> + </parameters> + + <actions> + <action name="start" timeout="30s" /> + <action name="stop" timeout="20s" /> + <action name="status" timeout="20s" depth="0" interval="10s" /> + <action name="monitor" timeout="20s" depth="0" interval="10s" /> + <action name="meta-data" timeout="5s" /> + <action name="validate-all" timeout="20s" /> + </actions> +</resource-agent> +END +} + +# check if the interface is admin up/down + +iface_is_up() { + if ! 
$IP2UTIL -o link show $1 | \ + sed -e 's#.*<##g' -e 's#>.*##' -e 's#LOWER_UP##g' | \ + grep -q UP; then + return 1 + fi + return 0 +} + +# check if the slaves have link layer up/down +# see kernel network documentation on meaning of LOWER_UP flag +# for more in depth explanation on how it works +# NOTE: this check is not reliable in virt environment +# since interfaces are always LOWER_UP. There is no way +# from the guest to know if the host has disconnected somehow + +iface_lower_is_up() { + if ! $IP2UTIL -o link show $1 | \ + grep -q LOWER_UP; then + return 1 + fi + return 0 +} + +vlan_validate() { + check_binary $IP2UTIL + + if [ -z "$OCF_RESKEY_vlan_interface" ]; then + ocf_log err "Invalid OCF_RESKEY_vlan_interface: value cannot be empty" + return 1 + fi + + # the echo .. is the equivalent of strlen in bash + # + # /usr/include/linux/if.h:#define IFNAMSIZ 16 + # needs to include 0 byte end string + + if [ "${#OCF_RESKEY_vlan_interface}" -gt 15 ]; then + ocf_log err "Invalid OCF_RESKEY_vlan_interface: name is too long" + return 1 + fi + + if [ ! -d "/sys/class/net" ]; then + ocf_log err "Unable to find sysfs network class in /sys" + return 1 + fi + + if [ ! -e "/sys/class/net/$OCF_RESKEY_vlan_interface" ]; then + ocf_log err "Invalid OCF_RESKEY_vlan_interface: $OCF_RESKEY_vlan_interface does not exists" + return 1 + fi + + if [ -z "$OCF_RESKEY_vlan_id" ]; then + ocf_log err "Invalid OCF_RESKEY_vlan_id: value cannot be empty" + return 1 + fi + if ! ocf_is_decimal "$OCF_RESKEY_vlan_id" || \ + [ "$OCF_RESKEY_vlan_id" -gt "4094" ]; then + ocf_log err "Invalid OCF_RESKEY_vlan_id: must be a decimal value (0 to 4094 included)" + return 1 + fi + + if [ "${#OCF_RESKEY_vlan_name}" -gt 15 ]; then + ocf_log err "Invalid OCF_RESKEY_vlan_name: name is too long" + return 1 + fi + + return 0 +} + +vlan_check() { + if [ -e "/sys/class/net/$OCF_RESKEY_vlan_name" ]; then + if [ ! 
-e "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" ]; then + return $OCF_ERR_GENERIC + fi + else + if [ -e "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" ]; then + error="$(rm -f "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to remove stale lock file for vlan $OCF_RESKEY_vlan_name: $error" + return $OCF_ERR_GENERIC + fi + fi + return $OCF_NOT_RUNNING + fi + + if ! iface_is_up $OCF_RESKEY_vlan_interface; then + if ocf_is_true "$OCF_RESKEY_vlan_loose_binding"; then + ocf_log warn "Interface $OCF_RESKEY_vlan_interface is administratively down" + else + ocf_log err "Interface $OCF_RESKEY_vlan_interface is administratively down" + return $OCF_ERR_GENERIC + fi + fi + + if ! iface_is_up $OCF_RESKEY_vlan_name; then + ocf_log err "VLAN $OCF_RESKEY_vlan_name is administratively down" + return $OCF_ERR_GENERIC + fi + + if ! iface_lower_is_up $OCF_RESKEY_vlan_name; then + ocf_log err "VLAN $OCF_RESKEY_vlan_name has no active link-layer" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +# we need a simpler stop version to clean after us if start fails +# without involving any error checking +# rolling back in case of failure is otherwise complex + +vlan_force_stop() { + $IP2UTIL link delete "$OCF_RESKEY_vlan_name" >/dev/null 2>&1 + rm -f "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1 +} + +vlan_start() { + # check if the vlan already exists + vlan_check + ret=$? + if [ "$ret" != "$OCF_NOT_RUNNING" ]; then + return $ret + fi + + # make sure kernel module is loaded + if [ ! -e /proc/net/vlan ]; then + error="$(modprobe 8021q 2>&1)" + if [ "$?" 
!= "0" ]; then + ocf_log err "Unable to load kernel 8021q driver: $error" + return $OCF_ERR_GENERIC + fi + fi + + # generate options + VLANOPTS="" + + if [ -n "$OCF_RESKEY_vlan_reorder_hdr" ]; then + if ocf_is_true "$OCF_RESKEY_vlan_reorder_hdr"; then + VLANOPTS="reorder_hdr on" + else + VLANOPTS="reorder_hdr off" + fi + fi + + if [ -n "$OCF_RESKEY_vlan_gvrp" ]; then + if ocf_is_true "$OCF_RESKEY_vlan_gvrp"; then + VLANOPTS="$VLANOPTS gvrp on" + else + VLANOPTS="$VLANOPTS gvrp off" + fi + fi + + if [ -n "$OCF_RESKEY_vlan_mvrp" ]; then + if ocf_is_true "$OCF_RESKEY_vlan_mvrp"; then + VLANOPTS="$VLANOPTS mvrp on" + else + VLANOPTS="$VLANOPTS mvrp off" + fi + fi + + if [ -n "$OCF_RESKEY_vlan_loose_binding" ]; then + if ocf_is_true "$OCF_RESKEY_vlan_loose_binding"; then + VLANOPTS="$VLANOPTS loose_binding on" + else + VLANOPTS="$VLANOPTS loose_binding off" + fi + fi + + # create the VLAN + error="$($IP2UTIL link add link "$OCF_RESKEY_vlan_interface" name "$OCF_RESKEY_vlan_name" type vlan id "$OCF_RESKEY_vlan_id" $VLANOPTS 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to create VLAN $OCF_RESKEY_vlan_name: $error" + return $OCF_ERR_GENERIC + fi + + # set the interface up + error="$($IP2UTIL link set dev "$OCF_RESKEY_vlan_interface" up 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set VLAN $OCF_RESKEY_vlan_interface up: $error" + return $OCF_ERR_GENERIC + fi + + # set the vlan up + error="$($IP2UTIL link set dev "$OCF_RESKEY_vlan_name" up 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set VLAN $OCF_RESKEY_vlan_name up: $error" + return $OCF_ERR_GENERIC + fi + + error="$(touch "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to create lock file for VLAN $OCF_RESKEY_vlan_name: $error" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +vlan_stop() { + vlan_check + ret=$? 
+ if [ "$ret" = "$OCF_NOT_RUNNING" ]; then + return $OCF_SUCCESS + fi + if [ "$ret" != "$OCF_SUCCESS" ]; then + return $ret + fi + + # set vlan down + error="$($IP2UTIL link set dev "$OCF_RESKEY_vlan_name" down 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to set VLAN $OCF_RESKEY_vlan_name down: $error" + return $OCF_ERR_GENERIC + fi + + # delete vlan + error="$($IP2UTIL link delete "$OCF_RESKEY_vlan_name" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to delete VLAN $OCF_RESKEY_vlan_name: $error" + return $OCF_ERR_GENERIC + fi + + error="$(rm -f "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1)" + if [ "$?" != "0" ]; then + ocf_log err "Unable to remove lock file for VLAN $OCF_RESKEY_vlan_name: $error" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS +} + +case $__OCF_ACTION in + meta-data) + vlan_meta_data + exit $OCF_SUCCESS + ;; + usage|help) + vlan_usage + exit $OCF_SUCCESS + ;; +esac + +if [ ! -d "$HA_RSCTMP" ]; then + ocf_log debug "$HA_RSCTMP not found, we are probably being executed manually" + mkdir -p "$HA_RSCTMP" +fi + +if [ -n "$__OCF_ACTION" ] && ! vlan_validate; then + exit $OCF_ERR_CONFIGURED +fi + +case $__OCF_ACTION in + start|stop) + if ! ocf_is_root; then + ocf_log err "You must be root for $__OCF_ACTION operation." + exit $OCF_ERR_PERM + fi + ;; +esac + +case $__OCF_ACTION in + start) + vlan_start + ret=$? + if [ "$ret" != "$OCF_SUCCESS" ]; then + vlan_force_stop + fi + exit $ret + ;; + stop) + vlan_stop + exit $? + ;; + status|monitor) + vlan_check + exit $? + ;; + validate-all) + # vlan_validate above does the trick + ;; + *) + vlan_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +# vi:sw=4:ts=8: diff --git a/heartbeat/ipsec b/heartbeat/ipsec new file mode 100755 index 0000000..6504815 --- /dev/null +++ b/heartbeat/ipsec @@ -0,0 +1,200 @@ +#!/bin/sh +# +# +# IPSEC OCF RA. Handles IPSEC tunnels associated with a VIP +# +# Copyright (c) 2017 Red Hat Inc. +# All Rights Reserved. 
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

#######################################################################

# Defaults

OCF_RESKEY_tunnel_default=""
OCF_RESKEY_vip_default=""
OCF_RESKEY_confdir_default="/etc/ipsec.d/"
OCF_RESKEY_fallbacktunnel_default=""

: ${OCF_RESKEY_tunnel=${OCF_RESKEY_tunnel_default}}
: ${OCF_RESKEY_vip=${OCF_RESKEY_vip_default}}
: ${OCF_RESKEY_confdir=${OCF_RESKEY_confdir_default}}
: ${OCF_RESKEY_fallbacktunnel=${OCF_RESKEY_fallbacktunnel_default}}

# Print the resource-agent metadata (XML) on stdout.
# The parameter defaults are interpolated from the *_default variables above.
meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ipsec" version="1.0">
<version>1.0</version>

<longdesc lang="en">
This is a Resource Agent to manage IPSEC tunnels associated with a Virtual IP
Address. It's meant to be collocated with a specific VIP, and will manage
setting up or down a specific tunnel.
</longdesc>
<shortdesc lang="en">Handles IPSEC tunnels for VIPs</shortdesc>

<parameters>
<parameter name="tunnel" unique="1" required="1">
<longdesc lang="en">
The name of the tunnel to be monitored.
</longdesc>
<shortdesc lang="en">Tunnel name</shortdesc>
<content type="string" default="${OCF_RESKEY_tunnel_default}" />
</parameter>
<parameter name="vip" unique="1" required="1">
<longdesc lang="en">
Virtual IP address that the tunnel is using.
</longdesc>
<shortdesc lang="en">VIP</shortdesc>
<content type="string" default="${OCF_RESKEY_vip_default}" />
</parameter>
<parameter name="confdir">
<longdesc lang="en">
The directory where the IPSEC tunnel configurations can be found.
</longdesc>
<shortdesc lang="en">Tunnel configuration directory</shortdesc>
<content type="string" default="${OCF_RESKEY_confdir_default}" />
</parameter>
<parameter name="fallbacktunnel" unique="1">
<longdesc lang="en">
The name of the tunnel to fall back to when the main tunnel is put down.
</longdesc>
<shortdesc lang="en">Tunnel name to fall back to</shortdesc>
<content type="string" default="${OCF_RESKEY_fallbacktunnel_default}" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
<action name="monitor" timeout="20s" interval="10s" depth="0" />
<action name="reload" timeout="20s" />
<action name="validate-all" timeout="20s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}

#######################################################################

# Print a short usage text on stdout.
ipsec_usage() {
	cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set. And
should have a collocation constraint with a VIP associated with the
tunnel.
END
}

# Load the tunnel definition and make the IKE daemon listen.
# Returns OCF_ERR_GENERIC when "ipsec whack --listen" exits with 1 or 10,
# OCF_SUCCESS otherwise.
ipsec_start() {
	# NOTE(review): the exit status of "ipsec auto --add" is not inspected;
	# only the status of "ipsec whack --listen" decides the result below.
	ipsec auto --add "${OCF_RESKEY_tunnel}"
	ipsec whack --listen
	local return_code=$?
	if [ $return_code -eq 1 -o $return_code -eq 10 ]; then
		ocf_log warn "${OCF_RESOURCE_INSTANCE} : Unable to add tunnel ${OCF_RESKEY_tunnel} with return code ${return_code}"
		return $OCF_ERR_GENERIC
	else
		return $OCF_SUCCESS
	fi
}

# Take the tunnel down and, if configured, kick off the fallback tunnel
# in the background. Always returns OCF_SUCCESS.
ipsec_stop() {
	ipsec auto --down "${OCF_RESKEY_tunnel}"
	local return_code=$?
	ocf_log info "${OCF_RESOURCE_INSTANCE} : Put down tunnel ${OCF_RESKEY_tunnel} with return code ${return_code}"
	ipsec whack --listen
	if [ -n "${OCF_RESKEY_fallbacktunnel}" ]; then
		# Run this in a subshell and let it run, This will end the stop
		# operation And the start of the tunnel will hopefully start on the
		# other node. Meanwhile, this will keep trying to put up the
		# fallback tunnel up, and will eventually succeed or timeout in the
		# background.
		(ipsec auto --up "${OCF_RESKEY_fallbacktunnel}") &
		# "disown" is not a POSIX builtin and this agent is run by /bin/sh;
		# only call it where the shell actually provides it, so that shells
		# without it do not emit a "not found" error on every stop.
		if command -v disown >/dev/null 2>&1; then
			disown
		fi
	fi
	return $OCF_SUCCESS
}

# Report the tunnel state.
#   OCF_SUCCESS      - "ipsec status" shows no "unoriented" line for the tunnel
#   OCF_ERR_GENERIC  - tunnel is unoriented while this node holds the VIP
#   OCF_NOT_RUNNING  - tunnel is unoriented and the VIP is not on this node
ipsec_monitor() {
	# Monitor _MUST!_ differentiate correctly between running
	# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
	# That is THREE states, not just yes/no.

	ipsec status | grep "$OCF_RESKEY_tunnel" | grep -q unoriented
	state=$?

	# TODO I think we need also to check if output of ipsec status is empty!
	# Then the tunnel is not running.

	if [ "$state" = "0" ]; then
		ip addr show | grep -q "${OCF_RESKEY_vip}"
		hosting_vip=$?
		if [ "$hosting_vip" = "0" ]; then
			ocf_log warn "${OCF_RESOURCE_INSTANCE} : tunnel ${OCF_RESKEY_tunnel} is unoriented"
			return $OCF_ERR_GENERIC
		else
			return $OCF_NOT_RUNNING
		fi
	else
		return $OCF_SUCCESS
	fi
}

# Check that the tunnel is named and defined in the configuration directory.
#   OCF_ERR_CONFIGURED - the tunnel parameter is empty
#   OCF_ERR_GENERIC    - no "conn <tunnel>" entry in ${OCF_RESKEY_confdir}/*.conf
#   OCF_SUCCESS        - definition found
ipsec_validate() {
	# An empty name would make the pattern below match every "conn " line.
	if [ -z "${OCF_RESKEY_tunnel}" ]; then
		ocf_log err "${OCF_RESOURCE_INSTANCE} : tunnel parameter not set"
		return $OCF_ERR_CONFIGURED
	fi

	# The tunnel needs to be defined in the configuration
	if grep -q "conn $OCF_RESKEY_tunnel" "${OCF_RESKEY_confdir}"/*.conf; then
		return $OCF_SUCCESS
	else
		return $OCF_ERR_GENERIC
	fi
}

case $__OCF_ACTION in
meta-data)	meta_data
		exit $OCF_SUCCESS
		;;
start)		ipsec_start;;
stop)		ipsec_stop;;
monitor)	ipsec_monitor;;
validate-all)	ipsec_validate;;
reload)		ocf_log info "Reloading ${OCF_RESOURCE_INSTANCE} ..."
		;;
usage|help)	ipsec_usage
		exit $OCF_SUCCESS
		;;
*)		ipsec_usage
		exit $OCF_ERR_UNIMPLEMENTED
		;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
# End of heartbeat/ipsec. The line below is the diff header and opening comment of the next file in the patch (heartbeat/iscsi), kept exactly as found.
diff --git a/heartbeat/iscsi b/heartbeat/iscsi new file mode 100755 index 0000000..d25aec2 --- /dev/null +++ b/heartbeat/iscsi @@ -0,0 +1,516 @@ +#!/bin/sh +# +# iSCSI OCF resource agent +# Description: manage iSCSI disks (add/remove) using open-iscsi +# +# Copyright Dejan Muhamedagic <dejan@suse.de> +# (C) 2007 Novell Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# See usage() and meta_data() below for more details...
#
# OCF instance parameters:
#	OCF_RESKEY_portal: the iSCSI portal address or host name (required)
#	OCF_RESKEY_target: the iSCSI target (required)
#	OCF_RESKEY_iscsiadm: iscsiadm program path (optional)
#	OCF_RESKEY_discovery_type: discovery type (optional; default: sendtargets)
#	OCF_RESKEY_udev: wait for udev to create the device on start (optional; default: yes)
#	OCF_RESKEY_try_recovery: wait for iSCSI recovery in monitor (optional; default: false)
#
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Defaults
OCF_RESKEY_udev_default="yes"
OCF_RESKEY_iscsiadm_default="iscsiadm"
OCF_RESKEY_discovery_type_default="sendtargets"
OCF_RESKEY_try_recovery_default="false"

: ${OCF_RESKEY_udev=${OCF_RESKEY_udev_default}}
: ${OCF_RESKEY_iscsiadm=${OCF_RESKEY_iscsiadm_default}}
: ${OCF_RESKEY_discovery_type=${OCF_RESKEY_discovery_type_default}}
# Apply the advertised default like every other parameter, so the value
# reported by meta-data and the value the agent actually uses agree.
: ${OCF_RESKEY_try_recovery=${OCF_RESKEY_try_recovery_default}}

# Print the usage text on stdout; the list of supported methods is taken
# from iscsi_methods and joined with "|".
usage() {
	methods=`iscsi_methods`
	methods=`echo $methods | tr ' ' '|'`
	cat <<EOF
	usage: $0 {$methods}

	$0 manages an iSCSI target

	The 'start' operation starts (adds) the iSCSI target.
	The 'stop' operation stops (removes) the iSCSI target.
	The 'status' operation reports whether the iSCSI target is connected
	The 'monitor' operation reports whether the iSCSI target is connected
	The 'validate-all' operation reports whether the parameters are valid
	The 'methods' operation reports on the methods $0 supports

EOF
}

# Print the resource-agent metadata (XML) on stdout.
meta_data() {
	cat <<EOF
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="iscsi" version="1.0">
<version>1.0</version>

<longdesc lang="en">
OCF Resource Agent for iSCSI. Add (start) or remove (stop) iSCSI
targets.
</longdesc>
<shortdesc lang="en">Manages a local iSCSI initiator and its connections to iSCSI targets</shortdesc>

<parameters>

<parameter name="portal" unique="0" required="1">
<longdesc lang="en">
The iSCSI portal address in the form: {ip_address|hostname}[":"port]
</longdesc>
<shortdesc lang="en">Portal address</shortdesc>
<content type="string" />
</parameter>

<parameter name="target" unique="1" required="1">
<longdesc lang="en">
The iSCSI target IQN.
</longdesc>
<shortdesc lang="en">Target IQN</shortdesc>
<content type="string" />
</parameter>

<parameter name="discovery_type" unique="0" required="0">
<longdesc lang="en">
Target discovery type. Check the open-iscsi documentation for
supported discovery types.
</longdesc>
<shortdesc lang="en">Target discovery type</shortdesc>
<content type="string" default="${OCF_RESKEY_discovery_type_default}" />
</parameter>

<parameter name="iscsiadm" unique="0" required="0">
<longdesc lang="en">
open-iscsi administration utility binary.
</longdesc>
<shortdesc lang="en">iscsiadm binary</shortdesc>
<content type="string" default="${OCF_RESKEY_iscsiadm_default}" />
</parameter>

<parameter name="udev" unique="0" required="0">
<longdesc lang="en">
If the next resource depends on the udev creating a device then
we wait until it is finished. On a normally loaded host this
should be done quickly, but you may be unlucky. If you are not
using udev set this to "no", otherwise we will spin in a loop
until a timeout occurs.
</longdesc>
<shortdesc lang="en">udev</shortdesc>
<content type="string" default="${OCF_RESKEY_udev_default}" />
</parameter>

<parameter name="try_recovery" unique="0" required="0">
<longdesc lang="en">
If the iSCSI session exists but is currently inactive/broken,
which is most probably due to network problems, the iSCSI layer
will try to recover. If this parameter is set to true, we'll wait
for the recovery to succeed. In that case the monitor operation
can only time out so you should set the monitor op timeout
attribute appropriately.
</longdesc>
<shortdesc lang="en">On error wait for iSCSI recovery in monitor</shortdesc>
<content type="boolean" default="${OCF_RESKEY_try_recovery_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="120s" />
<action name="stop" timeout="120s" />
<action name="status" timeout="30s" />
<action name="monitor" depth="0" timeout="30s" interval="120s" />
<action name="validate-all" timeout="5s" />
<action name="methods" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
EOF
}

# Print the supported methods, one per line.
iscsi_methods() {
	cat <<EOF
	start
	stop
	status
	monitor
	validate-all
	methods
	meta-data
	usage
EOF
}

#
# open-iscsi interface
#

# Succeeds when the process list contains a command matching "iscsid".
# The "[i]" keeps grep from matching its own command line.
is_iscsid_running() {
	ps -e -o cmd | grep -qs '[i]scsid'
}

# Bind the generic hooks (discovery, add_disk, remove_disk, disk_status)
# to their open-iscsi implementations and check the initiator is usable.
# Return codes:
#   0: iscsid is running
#   1: iscsid is not running, but iscsid.startup is set in iscsid.conf
#   2: iscsid is not running and no startup hook is configured
#   3: the iscsiadm binary was not found
open_iscsi_setup() {
	discovery=open_iscsi_discovery
	add_disk=open_iscsi_add
	remove_disk=open_iscsi_remove
	disk_status=open_iscsi_monitor
	iscsiadm=${OCF_RESKEY_iscsiadm}

	have_binary ${iscsiadm} ||
		return 3
	if is_iscsid_running; then
		return 0
	elif grep -qs '^iscsid.startup' /etc/iscsi/iscsid.conf; then
		# apparently on RedHat (perhaps elsewhere?), there is a
		# kind of iscsid autostart once root invokes some
		# open_iscsi command; the iscsid.startup hook should take
		# care of it; reported by m.richardson@ed.ac.uk (see also
		# the discussion at the linux-ha-dev ML)
		return 1
	else
		ocf_exit_reason "iscsid not running; please start open-iscsi utilities"
		return 2
	fi
}

#
# discovery return codes:
#   0: ok (variable PORTAL set)
#   1: target not found
#   2: target found but can't connect it unambiguously
#   3: iscsiadm returned error
#
# open-iscsi >= "2.0-872" changed discovery semantics
# see http://www.mail-archive.com/open-iscsi@googlegroups.com/msg04883.html
# there's a new discoverydb command which should be used instead of discovery

# Run target discovery against $OCF_RESKEY_portal and store the portal
# that serves $OCF_RESKEY_target in the global PORTAL.
# Reads the globals $iscsiadm and $discovery_type.
# Return codes: 0 ok, 1 target not found, 2 several portals and none equals
# $OCF_RESKEY_portal, 3 iscsiadm failed or printed nothing.
open_iscsi_discovery() {
	local output
	local discovery_variant="discovery"
	local options=""
	local cmd
	# Declared and assigned separately so the command output can never be
	# word-split into extra arguments of "local".
	local version
	version=`$iscsiadm --version | awk '{print $3}'`

	ocf_version_cmp "$version" "2.0-871"
	if [ $? -eq 2 ]; then # newer than 2.0-871?
		discovery_variant="discoverydb"
		[ "$discovery_type" = "sendtargets" ] &&
			options="-D"
	fi
	cmd="$iscsiadm -m $discovery_variant -p $OCF_RESKEY_portal -t $discovery_type $options"
	# $cmd is expanded unquoted on purpose: it is split into the command
	# and its arguments.
	output=`$cmd`
	if [ $? -ne 0 -o x = "x$output" ]; then
		[ x != "x$output" ] && {
			ocf_exit_reason "$cmd FAILED"
			echo "$output"
		}
		return 3
	fi
	# Pick the portal column of every record whose last field is the
	# target, and strip everything from the first comma on.
	PORTAL=`echo "$output" |
		awk -v target="$OCF_RESKEY_target" '
		$NF==target{
			if( NF==3 ) portal=$2; # sles compat mode
			else portal=$1;
			sub(",.*","",portal);
			print portal;
		}'`

	case `echo "$PORTAL" | wc -w` in
	0) #target not found
		echo "$output"
		ocf_exit_reason "target $OCF_RESKEY_target not found at portal $OCF_RESKEY_portal"
		return 1
	;;
	1) #we're ok
		return 0
	;;
	*) # handle multihome hosts reporting multiple portals
		for p in $PORTAL; do
			if [ "$OCF_RESKEY_portal" = "$p" ]; then
				PORTAL="$OCF_RESKEY_portal"
				return 0
			fi
		done
		echo "$output"
		ocf_exit_reason "sorry, can't handle multihomed hosts unless you specify the portal exactly"
		return 2
	;;
	esac
}

# Log in to target $2 at portal $1.
open_iscsi_add() {
	$iscsiadm -m node -p $1 -T $2 -l
}

# Print the session id of target $1 at portal $2, taken from the
# "iscsiadm -m session" listing; prints nothing when no line matches.
# NOTE(review): both arguments are used as extended regular expressions,
# so "." in an address or IQN matches any character.
open_iscsi_get_session_id() {
	local target="$1"
	local portal="$2"
	$iscsiadm -m session 2>/dev/null |
		grep -E "$target($|[[:space:]])" |
		grep -E "] $portal" |
		awk '{print $2}' | tr -d '[]'
}

# Log out of the session of target $1 at $OCF_RESKEY_portal.
# Returns 1 when no session id can be found.
open_iscsi_remove() {
	local target="$1"
	local session_id
	session_id=`open_iscsi_get_session_id "$target" "$OCF_RESKEY_portal"`
	if [ "$session_id" ]; then
		$iscsiadm -m session -r $session_id -u
	else
		ocf_exit_reason "cannot find session id for target $target"
		return 1
	fi
}
# open_iscsi_monitor return codes:
#   0: target running (logged in)
#   1: target not running and target record exists
#   2: iscsiadm -m session error (unexpected)
#   3: target record does not exist (discovery necessary)
#
# $1 is the target; the optional $2 overrides $OCF_RESKEY_try_recovery.
# When recovery waiting is enabled (and this is neither a stop nor a
# probe), the function keeps polling once per second until the session
# reports a usable connection state.
open_iscsi_monitor() {
	local target="$1"
	local session_id conn_state outp
	local prev_state
	local recov

	recov=${2:-$OCF_RESKEY_try_recovery}
	session_id=`open_iscsi_get_session_id "$target" "$OCF_RESKEY_portal"`
	prev_state=""
	if [ -z "$session_id" ]; then
		if $iscsiadm -m node -p $OCF_RESKEY_portal -T $target >/dev/null 2>&1; then
			return 1 # record found
		else
			return 3
		fi
	fi
	while :; do
		outp=`$iscsiadm -m session -r $session_id -P 1` ||
			return 2
		conn_state=`echo "$outp" | sed -n '/Connection State/s/.*: //p'`
		# some drivers don't return connection state, in that case
		# we'll assume that we're still connected
		#
		# prev_state is only non-empty after a "waiting for recovery"
		# warning has been logged below, so it doubles as the flag that
		# tells us to announce the recovery.
		case "$conn_state" in
			"LOGGED IN")
				[ -n "$prev_state" ] &&
					ocf_log info "connection state $conn_state. Session restored."
				return 0;;
			"Unknown"|"") # this is also probably OK
				[ -n "$prev_state" ] &&
					ocf_log info "connection state $conn_state. Session restored."
				return 0;;
			*) # failed
				if [ "$__OCF_ACTION" != stop ] && ! ocf_is_probe && ocf_is_true $recov; then
					# log each distinct failed state only once
					if [ "$conn_state" != "$prev_state" ]; then
						ocf_log warning "connection state $conn_state, waiting for recovery..."
						prev_state="$conn_state"
					fi
					sleep 1
				else
					ocf_exit_reason "iscsiadm output: $outp"
					return 2
				fi
			;;
		esac
	done
}

# Run the configured discovery hook and terminate the agent when it fails:
# OCF_ERR_GENERIC for discovery codes 1 and 2; for code 3 OCF_ERR_INSTALLED
# if iscsid is not running, OCF_ERR_GENERIC otherwise.
# Returns normally only when discovery succeeded.
disk_discovery() {
	discovery_type=${OCF_RESKEY_discovery_type}
	$discovery  # discover and setup the real portal string (address)
	case $? in
	0) ;;
	1|2) exit $OCF_ERR_GENERIC ;;
	3) if ! is_iscsid_running; then
			# setup_rc is the global set by the main script from the
			# setup hook's return code
			[ $setup_rc -eq 1 ] &&
				ocf_log warning "iscsid.startup probably not correctly set in /etc/iscsi/iscsid.conf"
			exit $OCF_ERR_INSTALLED
		fi
		exit $OCF_ERR_GENERIC
	;;
	esac
}

#
# NB: this is udev specific!
#
# Block until at least one path starting with the expected by-path device
# name exists, logging a warning and sleeping one second between checks.
# Sets the global "dev"; there is no timeout of its own.
wait_for_udev() {
	dev=/dev/disk/by-path/ip-$PORTAL-iscsi-$OCF_RESKEY_target
	until ls $dev* >/dev/null 2>&1; do
		ocf_log warning "waiting for udev to create $dev"
		sleep 1
	done
	return 0
}

# Translate the result of the disk status hook into an OCF return code.
# Any arguments are handed on to the hook after the target name.
iscsi_monitor() {
	local disk_rc
	$disk_status $OCF_RESKEY_target $*
	disk_rc=$?
	case $disk_rc in
		2) return $OCF_ERR_GENERIC;;
		1|3) return $OCF_NOT_RUNNING;;
		0) return $OCF_SUCCESS;;
	esac
}

# Bring the target up: discover it if no record exists yet, log in if it
# is not running, optionally wait for udev, then verify with a monitor
# call that is told to wait for recovery.
iscsi_start() {
	local rc
	$disk_status $OCF_RESKEY_target
	rc=$?
	if [ $rc -eq 3 ]; then
		# no record for the target yet: discover, then look again
		disk_discovery
		$disk_status $OCF_RESKEY_target
		rc=$?
	fi

	if [ $rc -eq 0 ]; then
		ocf_log info "iscsi $PORTAL $OCF_RESKEY_target already running"
		return $OCF_SUCCESS
	elif [ $rc -eq 1 ]; then
		if ! $add_disk $PORTAL $OCF_RESKEY_target; then
			return $OCF_ERR_GENERIC
		fi
		if [ "$OCF_RESKEY_udev" = yes ] || [ "$OCF_RESKEY_udev" = Yes ]; then
			wait_for_udev ||
				return $OCF_ERR_GENERIC
		fi
	else
		# the session exists, but it's broken
		ocf_log warning "iscsi $PORTAL $OCF_RESKEY_target in failed state"
	fi

	iscsi_monitor 1 # enforce wait
	[ $? -eq $OCF_SUCCESS ] ||
		return $OCF_ERR_GENERIC
	return $OCF_SUCCESS
}

# Take the target down unless the monitor already reports it as not
# running; succeed only if the monitor reports "not running" afterwards.
iscsi_stop() {
	iscsi_monitor
	if [ $? -eq $OCF_NOT_RUNNING ]; then
		ocf_log info "iscsi $OCF_RESKEY_target already stopped"
		return $OCF_SUCCESS
	fi

	$remove_disk $OCF_RESKEY_target ||
		return $OCF_ERR_GENERIC
	iscsi_monitor
	[ $? -eq $OCF_NOT_RUNNING ] ||
		return $OCF_ERR_GENERIC
	return $OCF_SUCCESS
}

#
# 'main' starts here...
+# + +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi + +# These operations don't require OCF instance parameters to be set +case "$1" in + meta-data) meta_data + exit $OCF_SUCCESS;; + usage) usage + exit $OCF_SUCCESS;; + methods) iscsi_methods + exit $OCF_SUCCESS;; +esac + +if [ x = "x$OCF_RESKEY_target" ]; then + ocf_exit_reason "target parameter not set" + exit $OCF_ERR_CONFIGURED +fi + +if [ x = "x$OCF_RESKEY_portal" ]; then + ocf_exit_reason "portal parameter not set" + exit $OCF_ERR_CONFIGURED +fi + +case `uname` in +Linux) setup=open_iscsi_setup +;; +*) ocf_log info "platform `uname` may not be supported" + setup=open_iscsi_setup +;; +esac + +PORTAL="$OCF_RESKEY_portal" # updated by discovery +LSB_STATUS_STOPPED=3 +$setup +setup_rc=$? +if [ $setup_rc -gt 1 ]; then + ocf_exit_reason "iscsi initiator utilities not installed or not setup" + case "$1" in + stop) exit $OCF_SUCCESS;; + monitor) exit $OCF_NOT_RUNNING;; + status) exit $LSB_STATUS_STOPPED;; + *) exit $OCF_ERR_INSTALLED;; + esac +fi + +if [ `id -u` != 0 ]; then + ocf_exit_reason "$0 must be run as root" + exit $OCF_ERR_PERM +fi + +# which method was invoked? +case "$1" in + start) + iscsi_start + ;; + stop) iscsi_stop + ;; + status) iscsi_monitor + rc=$? + case $rc in + $OCF_SUCCESS) + echo iscsi target $OCF_RESKEY_target running + ;; + $OCF_NOT_RUNNING) + echo iscsi target $OCF_RESKEY_target stopped + ;; + *) + echo iscsi target $OCF_RESKEY_target failed + ;; + esac + exit $rc + ;; + monitor) iscsi_monitor + ;; + validate-all) # everything already validated + # just exit successfully here. 
+ exit $OCF_SUCCESS;; + *) iscsi_methods + exit $OCF_ERR_UNIMPLEMENTED;; +esac + +# +# vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/heartbeat/jboss b/heartbeat/jboss new file mode 100755 index 0000000..948355a --- /dev/null +++ b/heartbeat/jboss @@ -0,0 +1,672 @@ +#!/bin/sh +# +# Description: Manages a Jboss Server as an OCF High-Availability +# resource under Heartbeat/LinuxHA control +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +# 02110-1301, USA. +# +# Copyright (c) 2009 Bauer Systems KG / Stefan Schluppeck +# +####################################################################### +# OCF parameters: +# OCF_RESKEY_resource_name - The name of the resource. Default is ${OCF_RESOURCE_INSTANCE} +# OCF_RESKEY_jboss_version - The version of JBoss. Default is 5. +# why not let the RA log through lrmd? +# 2009/09/09 Nakahira: +# jboss_console is used to record output of the "run.sh". +# The log of "Run.sh" should not be output to ha-log because it is so annoying. +# OCF_RESKEY_console - A destination of the log of jboss run and shutdown script. Default is /var/log/${OCF_RESKEY_resource_name}.log +# OCF_RESKEY_shutdown_timeout - Time-out at the time of the stop. Default is 5 +# OCF_RESKEY_kill_timeout - The re-try number of times awaiting a stop. 
Default is 10 +# OCF_RESKEY_user - A user name to start a JBoss. Default is root +# OCF_RESKEY_statusurl - URL for state confirmation. Default is ${OCF_RESKEY_statusurl_default} +# OCF_RESKEY_java_home - Home directory of the Java. Default is ${JAVA_HOME} +# OCF_RESKEY_java_opts - Options for Java. +# OCF_RESKEY_jboss_home - Home directory of Jboss. Default is None +# is it possible to devise this string from options? I'm afraid +# that allowing users to set this could be error prone. +# 2009/09/09 Nakahira: +# It is difficult to set it automatically because jboss_pstring +# greatly depends on the environment. At any rate, system architect +# should note that pstring doesn't influence other processes. +# OCF_RESKEY_jboss_base_dir - Base directory of JBoss. Default is ${OCF_RESKEY_jboss_base_dir_default} +# OCF_RESKEY_pstring - String Jboss will found in procceslist. Default is ${OCF_RESKEY_pstring_default} +# OCF_RESKEY_run_command - JBoss start command. Default is "${OCF_RESKEY_run_command_default}" +# OCF_RESKEY_run_opts - Options for jboss to run. Default is ${OCF_RESKEY_run_opts_default} +# OCF_RESKEY_shutdown_opts - Options for jboss to shutdonw. Default is "-s 127.0.0.1:1099" +# OCF_RESKEY_rotate_consolelog - Control console log logrotation flag. Default is false. +# OCF_RESKEY_rotate_value - console log logrotation value. Default is 86400 span(seconds). +# OCF_RESKEY_rotate_logsuffix - Control console log logrotation suffix. Default is .%F. +############################################################################### + + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +usage() +{ + cat <<-! +usage: $0 action + +action: + start start jboss + + stop stop the jboss + + status return the status of jboss, run or down + + monitor return TRUE if the jboss appears to be working. + You have to have installed $WGETNAME for this to work. 
+ + meta-data show meta data message + + validate-all validate the instance parameters +! + return $OCF_ERR_ARGS +} + +isrunning_jboss() +{ + local rc + if [ -z "$1" ];then + ocf_run -q -err wget -t 1 -O /dev/null $STATUSURL + else + # Retry message for restraint + wget -t 1 -O /dev/null $STATUSURL 2>/dev/null + fi + rc=$? + if [ $rc -eq 0 ]; then + return $OCF_SUCCESS + fi + # JBoss service error + return $OCF_ERR_GENERIC +} + +monitor_rotatelogs() +{ + pgrep -f "$ROTATELOGS.*$CONSOLE$ROTATELOG_SUFFIX" > /dev/null 2>&1 + if [ $? -ne 0 ]; then + ocf_log warn "A rotatelogs command for $CONSOLE is not running. Restarting it." + start_rotatelogs + if [ $? -eq 0 ]; then + ocf_log info "Restart rotatelogs process succeeded." + else + ocf_log warn "Restart rotatelogs process failed." + fi + fi +} + +monitor_jboss() +{ + if ! pgrep -f "$PSTRING" > /dev/null; then + return $OCF_NOT_RUNNING + fi + isrunning_jboss $1 + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + return $rc + fi + if ocf_is_true $ROTATELOG_FLG; then + # Monitor rotatelogs process and restart it if it is stopped. + # And never consider rotatelogs process failure to be a monitor failure + # as long as JBoss process works fine. + monitor_rotatelogs + fi + return $OCF_SUCCESS +} + +start_rotatelogs() +{ + su - -s /bin/sh $JBOSS_USER \ + -c "$ROTATELOGS -l \"$CONSOLE$ROTATELOG_SUFFIX\" $ROTATEVALUE" \ + < "$CONSOLE" > /dev/null 2>&1 & +} + +rotate_console() +{ + # Check $CONSOLE$ROTATELOG_SUFFIX is writable or not. + CURRENT_ROTATELOG_SUFFIX=`date +"$ROTATELOG_SUFFIX"` + su - -s /bin/sh $JBOSS_USER \ + -c "touch \"$CONSOLE$CURRENT_ROTATELOG_SUFFIX\"" > /dev/null 2>&1 + if [ $? -ne 0 ]; then + ocf_log err "$CONSOLE$CURRENT_ROTATELOG_SUFFIX is not writable." + return $OCF_ERR_GENERIC + fi + + # Clean up and set permissions on required files + if [ -p "$CONSOLE" ]; then + rm -rf "$CONSOLE" + elif [ -e "$CONSOLE" ]; then + DATE=`date +"%F-%H%M%S"` + ocf_log warn "$CONSOLE already exists. 
It is saved as $CONSOLE-$DATE" + mv "$CONSOLE" "$CONSOLE-$DATE" + fi + mkfifo -m700 "$CONSOLE" + chown --dereference "$JBOSS_USER" "$CONSOLE" || true + + start_rotatelogs +} + +start_jboss() +{ + monitor_jboss start + if [ $? -eq $OCF_SUCCESS ]; then + ocf_log info "JBoss already running." + return $OCF_SUCCESS + fi + + if ocf_is_true $ROTATELOG_FLG; then + rotate_console + if [ $? -eq 0 ]; then + ocf_log debug "Rotate console log succeeded." + else + ocf_log err "Rotate console log failed. Avoid starting jboss without console log rotation." + return $OCF_ERR_GENERIC + fi + fi + + ocf_log info "Starting JBoss[$RESOURCE_NAME]" + if [ "$JBOSS_USER" = root ]; then + "$RUN_COMMAND" $RUN_OPTS \ + >> "$CONSOLE" 2>&1 & + else + su - -s /bin/sh "$JBOSS_USER" \ + -c "export JAVA_HOME=\"${JAVA_HOME}\"; \ + export JAVA_OPTS=\"${JAVA_OPTS}\"; \ + export JBOSS_HOME=\"${JBOSS_HOME}\"; \ + export JBOSS_BASE_DIR=\"${JBOSS_BASE_DIR}\"; \ + \"$RUN_COMMAND\" $RUN_OPTS" \ + >> "$CONSOLE" 2>&1 & + fi + + while true; do + monitor_jboss start + if [ $? -eq $OCF_SUCCESS ]; then + break + fi + ocf_log info "start_jboss[$RESOURCE_NAME]: retry monitor_jboss" + sleep 3 + done + + ocf_log info "JBoss[$RESOURCE_NAME] is started." 
+ return $OCF_SUCCESS +} + +output_thread_dump() +{ + ocf_log info "stop_jboss[$RESOURCE_NAME]: output a JVM thread dump to $CONSOLE" + pkill -QUIT -f "$PSTRING" +} + +# arg1 : timeout +# arg2 : send specified signal +wait_process_exit() +{ + local lapse_sec=0 + local timeout=$1 + local signal=$2 + + while pgrep -f "$PSTRING" > /dev/null; do + sleep 1 + lapse_sec=`expr $lapse_sec + 1` + if [ -n "$signal" ]; then + ocf_log info "stop_jboss[$RESOURCE_NAME]: kill jboss by SIG$signal ($lapse_sec/$timeout)" + pkill -$signal -f "$PSTRING" + else + ocf_log info "stop_jboss[$RESOURCE_NAME]: stop NORM $lapse_sec/$timeout" + fi + if [ "$timeout" -ne 0 -a $lapse_sec -ge $timeout ]; then + return 1 + fi + done + return 0 +} + +stop_jboss5() +{ + if [ "$JBOSS_USER" = root ]; then + "$JBOSS_HOME/bin/shutdown.sh" $SHUTDOWN_OPTS -S \ + >> "$CONSOLE" 2>&1 & + else + su - -s /bin/sh "$JBOSS_USER" \ + -c "export JAVA_HOME=\"${JAVA_HOME}\"; \ + export JBOSS_HOME=\"${JBOSS_HOME}\"; \ + \"$JBOSS_HOME/bin/shutdown.sh\" $SHUTDOWN_OPTS -S" \ + >> "$CONSOLE" 2>&1 & + fi + if ! wait_process_exit $SHUTDOWN_TIMEOUT; then + output_thread_dump + if ! wait_process_exit $KILL_TIMEOUT TERM; then + return 1 + fi + fi + return 0 +} + +stop_jboss6() +{ + pkill -TERM -f "$PSTRING" + + if ! wait_process_exit $SHUTDOWN_TIMEOUT; then + output_thread_dump + return 1 + fi + return 0 +} + +stop_jboss() +{ + local rc + + if ! pgrep -f "$PSTRING" > /dev/null; then + ocf_log info "JBoss[$RESOURCE_NAME] is already stopped." + else + ocf_log info "Stopping JBoss[$RESOURCE_NAME]" + # JBoss5 : shutdonw.sh -> SIGQUIT(output thread dump) -> SIGTERM + # If the JBoss process hangs, JBoss RA waits $SHUTDOWN_TIMEOUT + # seconds and tries kill TERM and QUIT for $KILL_TIMEOUT seconds. + # JBoss6 : SIGTERM -> SIGQUIT(output thread dump) + # If the JBoss process hangs, JBoss RA waits $SHUTDOWN_TIMEOUT + # seconds and tries kill QUIT. + if [ "$JBOSS_VERSION" -le 5 ]; then + stop_jboss5 + rc=$? + else + stop_jboss6 + rc=$? 
+ fi + if [ $rc -ne 0 ]; then + # JBoss5 + # The stop timeout of RA should be + # longer than $SHUTDOWN_TIMEOUT + $KILL_TIMEOUT. + # JBoss6 + # The stop timeout of RA should be longer than $SHUTDOWN_TIMEOUT. + wait_process_exit 0 KILL + fi + ocf_log info "JBoss[$RESOURCE_NAME] is stopped." + fi + + if ocf_is_true $ROTATELOG_FLG; then + rm -f "${CONSOLE}" + fi + + return $OCF_SUCCESS +} + +status_jboss() +{ + if ! pgrep -f "$PSTRING" > /dev/null; then + echo "JBoss process[$RESOURCE_NAME] is not running." + return $OCF_NOT_RUNNING + fi + + if isrunning_jboss; then + echo "JBoss[$RESOURCE_NAME] is running." + return $OCF_SUCCESS + else + echo "JBoss process[$RESOURCE_NAME] is running." + echo "But, we can not access JBoss web service." + return $OCF_NOT_RUNNING + fi +} + + +metadata_jboss() +{ + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="jboss" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for Jboss. It manages a Jboss instance as an HA resource. +</longdesc> +<shortdesc lang="en">Manages a JBoss application server instance</shortdesc> + +<parameters> + +<parameter name="jboss_version" unique="0" required="0"> +<longdesc lang="en"> +The version of JBoss. Default is ${OCF_RESKEY_jboss_version_default}. +The usage of JBoss was greatly changed as of JBoss 6. +Specify "6" when you use JBoss 6. +</longdesc> +<shortdesc lang="en">The version of JBoss</shortdesc> +<content type="integer" default="${OCF_RESKEY_jboss_version_default}" /> +</parameter> + +<parameter name="resource_name" unique="1" required="0"> +<longdesc lang="en"> +The name of the resource. Defaults to the name of the resource +instance. 
+</longdesc> +<shortdesc lang="en">The name of the resource</shortdesc> +<content type="string" default="${OCF_RESOURCE_INSTANCE}" /> +</parameter> + +<parameter name="console" unique="1" required="0"> +<longdesc lang="en"> +A destination of the log of jboss run and shutdown script. +</longdesc> +<shortdesc lang="en">jboss log path</shortdesc> +<content type="string" default="${OCF_RESKEY_console_default}" /> +</parameter> + +<parameter name="shutdown_timeout" unique="0" required="0"> +<longdesc lang="en"> +Timeout for jboss bin/shutdown.sh. We wait for this timeout to +expire, then send the TERM and QUIT signals. Finally, the KILL +signal is used to terminate the jboss process. You should set the +timeout for the stop operation to a value bigger than the sum of +the timeout parameters. See also kill_timeout. +</longdesc> +<shortdesc lang="en">shutdown timeout</shortdesc> +<content type="integer" default="${OCF_RESKEY_shutdown_timeout_default}" /> +</parameter> + +<parameter name="kill_timeout" unique="0" required="0"> +<longdesc lang="en"> +If bin/shutdown.sh doesn't stop the jboss process, then we send +it TERM and QUIT signals, intermittently and once a second. After +this timeout expires, if the process is still live, we use the +KILL signal. See also shutdown_timeout. +</longdesc> +<shortdesc lang="en">stop by signal timeout</shortdesc> +<content type="integer" default="${OCF_RESKEY_kill_timeout_default}" /> +</parameter> + +<parameter name="user" unique="0" required="0"> +<longdesc lang="en"> +A user name to start a JBoss. +</longdesc> +<shortdesc lang="en">A user name to start a resource.</shortdesc> +<content type="string" default="${OCF_RESKEY_user_default}"/> +</parameter> + +<parameter name="statusurl" unique="0" required="0"> +<longdesc lang="en"> +URL to test in the monitor operation. 
+</longdesc> +<shortdesc lang="en">URL to test in the monitor operation.</shortdesc> +<content type="string" default="${OCF_RESKEY_statusurl_default}" /> +</parameter> + +<parameter name="java_home" unique="0" required="0"> +<longdesc lang="en"> +Home directory of Java. Defaults to the environment variable +JAVA_HOME. If it is not set, then define this parameter. +</longdesc> +<shortdesc lang="en">Home directory of Java.</shortdesc> +<content type="string" default="$JAVA_HOME"/> +</parameter> + +<parameter name="java_opts" unique="0" required="0"> +<longdesc lang="en"> +Java options. +</longdesc> +<shortdesc lang="en">Java options.</shortdesc> +<content type="string" default="${OCF_RESKEY_java_opts_default}"/> +</parameter> + +<parameter name="jboss_home" unique="0" required="1"> +<longdesc lang="en"> +Home directory of Jboss. +</longdesc> +<shortdesc lang="en">Home directory of Jboss.</shortdesc> +<content type="string" default="${OCF_RESKEY_jboss_home_default}"/> +</parameter> + +<parameter name="jboss_base_dir" unique="0" required="0"> +<longdesc lang="en"> +Base directory of JBoss. This parameter is not used in JBoss5. +</longdesc> +<shortdesc lang="en">Base directory of JBoss.</shortdesc> +<content type="string" default="${OCF_RESKEY_jboss_base_dir_default}" /> +</parameter> + +<parameter name="pstring" unique="0" required="0"> +<longdesc lang="en"> +With this string heartbeat matches for the right process to kill. +</longdesc> +<shortdesc lang="en">pkill/pgrep search string</shortdesc> +<content type="string" default="${OCF_RESKEY_pstring_default}" /> +</parameter> + +<parameter name="run_command" unique="0" required="0"> +<longdesc lang="en"> +JBoss start command. +</longdesc> +<shortdesc lang="en">JBoss start command.</shortdesc> +<content type="string" default="${OCF_RESKEY_run_command_default}" /> +</parameter> + +<parameter name="run_opts" unique="0" required="0"> +<longdesc lang="en"> +Start options to start Jboss with, defaults are from the Jboss-Doku. 
+</longdesc> +<shortdesc lang="en">options for jboss run.sh</shortdesc> +<content type="string" default="${OCF_RESKEY_run_opts_default}" /> +</parameter> + +<parameter name="shutdown_opts" unique="0" required="0"> +<longdesc lang="en"> +Stop options to stop Jboss with. +</longdesc> +<shortdesc lang="en">options for jboss shutdown.sh</shortdesc> +<content type="string" default="${OCF_RESKEY_shutdown_opts_default}" /> +</parameter> + +<parameter name="rotate_consolelog" unique="0"> +<longdesc lang="en"> +Rotate console log flag. +</longdesc> +<shortdesc lang="en">Rotate console log flag</shortdesc> +<content type="boolean" default="${OCF_RESKEY_rotate_consolelog_default}" /> +</parameter> + +<parameter name="rotate_value" unique="0"> +<longdesc lang="en"> +Console log rotation value (default is 86400 seconds). +</longdesc> +<shortdesc lang="en">Console log rotation value (default is 86400 seconds)</shortdesc> +<content type="integer" default="${OCF_RESKEY_rotate_value_default}" /> +</parameter> + +<parameter name="rotate_logsuffix" unique="0"> +<longdesc lang="en"> +Rotate console log suffix. +</longdesc> +<shortdesc lang="en">Rotate console log suffix</shortdesc> +<content type="integer" default="${OCF_RESKEY_rotate_logsuffix_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="60s" /> +<action name="stop" timeout="120s" /> +<action name="status" timeout="30s" /> +<action name="monitor" depth="0" timeout="30s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s"/> +</actions> +</resource-agent> +END + return $OCF_SUCCESS +} + +validate_all_jboss() +{ + if [ ! -d "$JAVA_HOME" ]; then + ocf_log err "JAVA_HOME does not exist." + return $OCF_ERR_INSTALLED + fi + + if [ ! -d "$JBOSS_HOME" ]; then + ocf_log err "JBOSS_HOME does not exist." + return $OCF_ERR_INSTALLED + fi + + if [ "$JBOSS_VERSION" -gt 5 ]; then + if [ ! 
-d "$JBOSS_BASE_DIR" ]; then + ocf_log err "JBOSS_BASE_DIR does not exist." + return $OCF_ERR_INSTALLED + fi + fi + + if [ ! -x "$JAVA" ]; then + ocf_log err "java command does not exist." + return $OCF_ERR_INSTALLED + fi + + if ocf_is_true $ROTATELOG_FLG; then + if [ ! -x "$ROTATELOGS" ]; then + ocf_log err "rotatelogs command does not exist." + return $OCF_ERR_INSTALLED + fi + fi + + return $OCF_SUCCESS +} + +# Parameter defaults + +OCF_RESKEY_jboss_version_default="5" +OCF_RESKEY_console_default="/var/log/${OCF_RESOURCE_INSTANCE}.log" +OCF_RESKEY_shutdown_timeout_default="5" +OCF_RESKEY_kill_timeout_default="10" +OCF_RESKEY_user_default="root" +OCF_RESKEY_java_opts_default="" +OCF_RESKEY_jboss_home_default="" +OCF_RESKEY_shutdown_opts_default="-s 127.0.0.1:1099" +OCF_RESKEY_rotate_consolelog_default="false" +OCF_RESKEY_rotate_value_default="86400" +OCF_RESKEY_rotate_logsuffix_default=".%F" + +COMMAND=$1 +JBOSS_VERSION="${OCF_RESKEY_jboss_version-${OCF_RESKEY_jboss_version_default}}" +if ! 
ocf_is_decimal $JBOSS_VERSION; then + ocf_log err "Invalid parameter value: jboss_version [$JBOSS_VERSION]" + return $OCF_ERR_ARGS +fi +# Setting of the default value +if [ "$JBOSS_VERSION" -le 5 ]; then + OCF_RESKEY_statusurl_default="http://127.0.0.1:8080" + OCF_RESKEY_pstring_default="java -Dprogram.name=run.sh" + OCF_RESKEY_run_command_default="${OCF_RESKEY_jboss_home}/bin/run.sh" + OCF_RESKEY_run_opts_default="-c default" +else + OCF_RESKEY_jboss_base_dir_default="${OCF_RESKEY_jboss_home}/standalone" + JBOSS_BASE_DIR="${OCF_RESKEY_jboss_base_dir-${OCF_RESKEY_jboss_base_dir_default}}" + OCF_RESKEY_statusurl_default="http://127.0.0.1:9990" + OCF_RESKEY_pstring_default="java.*-Djboss.server.base.dir=${JBOSS_BASE_DIR}( .*)?$" + OCF_RESKEY_run_command_default="${OCF_RESKEY_jboss_home}/bin/standalone.sh" + OCF_RESKEY_run_opts_default="" +fi +RESOURCE_NAME="${OCF_RESKEY_resource_name-${OCF_RESOURCE_INSTANCE}}" +CONSOLE="${OCF_RESKEY_console-/var/log/${RESOURCE_NAME}.log}" +SHUTDOWN_TIMEOUT="${OCF_RESKEY_shutdown_timeout-${OCF_RESKEY_shutdown_timeout_default}}" +KILL_TIMEOUT="${OCF_RESKEY_kill_timeout-${OCF_RESKEY_kill_timeout_default}}" +JBOSS_USER="${OCF_RESKEY_user-${OCF_RESKEY_user_default}}" +STATUSURL="${OCF_RESKEY_statusurl-${OCF_RESKEY_statusurl_default}}" +PSTRING="${OCF_RESKEY_pstring-${OCF_RESKEY_pstring_default}}" +RUN_OPTS="${OCF_RESKEY_run_opts-${OCF_RESKEY_run_opts_default}}" +SHUTDOWN_OPTS="${OCF_RESKEY_shutdown_opts-${OCF_RESKEY_shutdown_opts_default}}" +ROTATELOG_FLG="${OCF_RESKEY_rotate_consolelog-${OCF_RESKEY_rotate_consolelog_default}}" +ROTATEVALUE="${OCF_RESKEY_rotate_value-${OCF_RESKEY_rotate_value_default}}" +ROTATELOG_SUFFIX="${OCF_RESKEY_rotate_logsuffix-${OCF_RESKEY_rotate_logsuffix_default}}" + +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi + +if [ "$COMMAND" = "meta-data" ]; then + metadata_jboss + exit $OCF_SUCCESS +fi +if [ "$COMMAND" = "help" -o "$COMMAND" = "usage" ]; then + usage + exit $OCF_SUCCESS +fi + +# test if these 
two are set and if directories exist and if the +# required scripts/binaries exist; use OCF_ERR_INSTALLED +JAVA_HOME="${OCF_RESKEY_java_home-${JAVA_HOME}}" +JAVA_OPTS="${OCF_RESKEY_java_opts}" +JBOSS_HOME="${OCF_RESKEY_jboss_home}" +RUN_COMMAND="${OCF_RESKEY_run_command-${OCF_RESKEY_run_command_default}}" + +LSB_STATUS_STOPPED=3 + +export JAVA_HOME JAVA_OPTS JBOSS_HOME JBOSS_BASE_DIR + +JAVA=${JAVA_HOME}/bin/java + +ROTATELOGS="" +if ocf_is_true $ROTATELOG_FLG; then + # Look for rotatelogs/rotatelogs2 + if [ -x /usr/sbin/rotatelogs ]; then + ROTATELOGS=/usr/sbin/rotatelogs + elif [ -x /usr/sbin/rotatelogs2 ]; then + ROTATELOGS=/usr/sbin/rotatelogs2 + fi +fi + +validate_all_jboss +rc=$? + +[ "$COMMAND" = "validate-all" ] && exit $rc + +if [ $rc -ne 0 ]; then + case $COMMAND in + stop) exit $OCF_SUCCESS;; + monitor) exit $OCF_NOT_RUNNING;; + status) exit $LSB_STATUS_STOPPED;; + *) exit $rc;; + esac +fi + +case "$COMMAND" in + start) + start_jboss + func_status=$? + exit $func_status + ;; + stop) + stop_jboss + func_status=$? + exit $func_status + ;; + status) + status_jboss + exit $? + ;; + monitor) + monitor_jboss + func_status=$? + exit $func_status + ;; + validate-all) + validate_all_jboss + exit $? + ;; + *) + usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac + diff --git a/heartbeat/jira.in b/heartbeat/jira.in new file mode 100644 index 0000000..66a8e53 --- /dev/null +++ b/heartbeat/jira.in @@ -0,0 +1,291 @@ +#!@BASH_SHELL@ +# +#################################################################### +# Description: OCF Resource Agent to manage JIRA software. +# Author : Saleh A. (saleh.abbas.saber@gmail.com) +# +# License : WTFPL 2 +# +# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE +# Version 2, December 2004 +# +# Copyright (C) 2004 Sam Hocevar <sam@hocevar.net> +# +# Everyone is permitted to copy and distribute verbatim or modified +# copies of this license document, and changing it is allowed as long +# as the name is changed. 
+# +# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE +# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION +# +# 0. You just DO WHAT THE FUCK YOU WANT TO. +# +#################################################################### +# Parameters: +# OCF_RESKEY_statusurl : Status URL to monitor JIRA +# (default: http://localhost:8080/status) +# OCF_RESKEY_java_home : Java Home +# (default: /usr/lib/jvm/jre) +# OCF_RESKEY_jira_installation : Jira installtion directory +# OCF_RESKEY_jira_user : User running Jira software +# (by default: jira) +#################################################################### + +# Initialization +# Source ocf-shellfuncs +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_statusurl_default="http://localhost:8080/status" +OCF_RESKEY_java_home_default="/usr/lib/jvm/jre" +OCF_RESKEY_jira_user_default="jira" + +: ${OCF_RESKEY_statusurl=${OCF_RESKEY_statusurl_default}} +: ${OCF_RESKEY_java_home=${OCF_RESKEY_java_home_default}} +: ${OCF_RESKEY_jira_user=${OCF_RESKEY_jira_user_default}} + +# Usage +jira_usage() { + cat <<_EOF +Usage: $0 action + +Supported Actions: + start : start jira + stop : stop jira + monitor : show jira status + meta-data : show the meta-data + validate-all: validate the RA configuration +_EOF +} + +# Start +jira_start() { + # exit immediately if configuration is not valid + jira_validate_all || exit $? + + # if resource is already running, bail out early + if jira_monitor; then + ocf_log info "Resource is already running" + return $OCF_SUCCESS + fi + + # Starting Jira + waittime=300 + su -m $jira_user -c "$jira_installation/bin/startup.sh &> /dev/null" + while [[ $waittime -gt 0 ]]; do + if $(curl --connect-timeout 1 --max-time 3 -s ${statusurl} | grep '{"state":"RUNNING"}' > /dev/null); then + waittime=0 + else + sleep 1 + waittime=$(($waittime - 1)) + fi + done + + # Verify jira is running + jira_monitor + rc=$? + + return $? 
+} + +# Stop +jira_stop() { + local rc + + # exit immediately if configuration is not valid + jira_validate_all || exit $? + + jira_monitor + rc=$? + case "$rc" in + "$OCF_SUCCESS") + # Currently running. Normal, expected behavior. + ocf_log debug "Resource is currently running" + ;; + "$OCF_NOT_RUNNING") + # Currently not running. Nothing to do. + ocf_log info "Resource is already stopped" + return $OCF_SUCCESS + ;; + esac + + # Stopping Jira + waittime=300 + su -m $jira_user -c "$jira_installation/bin/shutdown.sh &> /dev/null" + while [[ $waittime -gt 0 ]]; do + if $(kill -0 $(cat ${jira_installation}/work/catalina.pid 2> /dev/null) 2> /dev/null) ; then + sleep 1 + waittime=$(($waittime - 1)) + else + waittime=0 + fi + done + + # Stop JIRA forcely if it failed + if $(kill -0 $(cat ${jira_installation}/work/catalina.pid 2> /dev/null) 2> /dev/null) ; then + kill -9 $(cat ${jira_installation}/work/catalina.pid) + sleep 1 + fi + + # Verify jira is stopped + jira_monitor + rc=$? + + return $rc + +} + +# Monitor +jira_monitor() { + local rc + + # exit immediately if configuration is not valid + jira_validate_all || exit $? + + if $(kill -0 $(cat ${jira_installation}/work/catalina.pid 2> /dev/null) 2> /dev/null) ; then + # Is jira working + if $(curl --connect-timeout 1 --max-time 3 -s ${statusurl} | grep '{"state":"RUNNING"}' > /dev/null) ; then + rc=0 + else + # Jira has a problem + rc=2 + fi + else + # Tomcat is stopped (and Jira) + rc=1 + fi + + case "$rc" in + 0) + rc=$OCF_SUCCESS + ocf_log debug "Resource is running" + ;; + 1) + rc=$OCF_NOT_RUNNING + ocf_log debug "Resource is not running" + ;; + *) + ocf_log err "Resource has failed" + exit $OCF_ERR_GENERIC + esac + + return $rc +} + +# Validat All +jira_validate_all() { + + # Check if java is installed + if ! [ -d $OCF_RESKEY_java_home ]; then + ocf_log err "$OCF_RESKEY_java_home does not exist. 
\ + Please ensure that Java is installed and configured correctly" + exit $OCF_ERR_INSTALLED + fi + + # Check if JIRA installation directory exists + if ! [ -d $OCF_RESKEY_jira_installation ]; then + ocf_log err "$OCF_RESKEY_jira_installation does not exist." + exit $OCF_ERR_INSTALLED + fi + + return $OCF_SUCCESS +} + +# Meta-data +jira_meta_data(){ + cat <<EOF +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="jira" version="0.1"> + <version>1.0</version> + <longdesc lang="en"> +OCF Resource Agent to manage JIRA software + </longdesc> + <shortdesc lang="en">JIRA OCF RA</shortdesc> + <parameters> + + <parameter name="statusurl" unique="0" required="0"> + <longdesc lang="en"> + Status URL for JIRA monitoring + </longdesc> + <shortdesc lang="en">JIRA status url</shortdesc> + <content type="string" default="${OCF_RESKEY_statusurl_default}"/> + </parameter> + + <parameter name="java_home" unique="0" required="0"> + <longdesc lang="en"> + Java Home in the Linux instance + </longdesc> + <shortdesc lang="en">Java Home</shortdesc> + <content type="string" default="${OCF_RESKEY_java_home_default}"/> + </parameter> + + + <parameter name="jira_installation" unique="0" required="1"> + <longdesc lang="en"> + JIRA installation directory (binaries, ... 
etc) + </longdesc> + <shortdesc lang="en">JIRA installation directory</shortdesc> + <content type="string"/> + </parameter> + + <parameter name="jira_user" unique="0" required="0"> + <longdesc lang="en"> + User to run Jira software with + </longdesc> + <shortdesc lang="en">Jira user</shortdesc> + <content type="string" default="${OCF_RESKEY_jira_user_default}"/> + </parameter> + + </parameters> + <actions> + <action name="start" timeout="300s" /> + <action name="stop" timeout="300s" /> + <action name="monitor" timeout="30s" + interval="10s" depth="0" /> + <action name="meta-data" timeout="5s" /> + <action name="validate-all" timeout="20s" /> + </actions> +</resource-agent> +EOF +} + +# Execution + +# Set vars from defined OCF env vars +statusurl=${OCF_RESKEY_statusurl-${OCF_RESKEY_statusurl_default}} +java_home=${OCF_RESKEY_java_home-${OCF_RESKEY_java_home_default}} +jira_installation=${OCF_RESKEY_jira_installation} +jira_user=${OCF_RESKEY_jira_user-${OCF_RESKEY_jira_user_default}} + +# Export JAVA_HOME env variable +export JAVA_HOME=${OCF_RESKEY_java_home} + +# Make sure meta-data and usage always succeed +case $__OCF_ACTION in +meta-data) jira_meta_data + exit $OCF_SUCCESS + ;; +usage|help) jira_usage + exit $OCF_SUCCESS + ;; +esac + +# Anything other than meta-data and usage must pass validation +jira_validate_all || exit $? + +# Translate each action into the appropriate function call +case $__OCF_ACTION in +start) jira_start;; +stop) jira_stop;; +status|monitor) jira_monitor;; +validate-all) ;; +*) jira_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? 
+ +exit $rc diff --git a/heartbeat/kamailio.in b/heartbeat/kamailio.in new file mode 100644 index 0000000..4f6af3d --- /dev/null +++ b/heartbeat/kamailio.in @@ -0,0 +1,741 @@ +#!@BASH_SHELL@ +# +# OCF resource agent for Kamailio for pacemaker +# + +# Copyright (c) 2013 FREQUENTIS AG, +# Authors: Stefan Wenk +# Rainer Brestan +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +# OCF input parameters: +# OCF_RESKEY_binary +# OCF_RESKEY_conffile +# OCF_RESKEY_pidfile +# OCF_RESKEY_monitoring_ip +# OCF_RESKEY_listen_address +# OCF_RESKEY_port +# OCF_RESKEY_proto +# OCF_RESKEY_sipsak +# OCF_RESKEY_kamctl +# OCF_RESKEY_kamctlrc +# OCF_RESKEY_kamuser +# OCF_RESKEY_kamgroup +# OCF_RESKEY_extra_options + +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### +# Defaults + +RESKEY_binary_default="/usr/sbin/kamailio" +RESKEY_conffile_default="/etc/kamailio/kamailio.cfg" +RESKEY_pidfile_default="/var/run/kamailio_${OCF_RESOURCE_INSTANCE}/kamailio.pid" +RESKEY_monitoring_ip_default=127.0.0.1 +RESKEY_port_default=5060 +RESKEY_proto_default="udptcp" +RESKEY_sipsak_default="/usr/bin/sipsak" +RESKEY_kamctl_default="/usr/bin/kamctl" +RESKEY_kamctlrc_default="/etc/kamailio/kamctlrc" +RESKEY_kamuser_default="" +RESKEY_kamgroup_default="" +RESKEY_extra_options_default="" + +####################################################################### +: ${OCF_RESKEY_binary=${RESKEY_binary_default}} +: ${OCF_RESKEY_conffile=${RESKEY_conffile_default}} +: ${OCF_RESKEY_pidfile=${RESKEY_pidfile_default}} +: ${OCF_RESKEY_monitoring_ip=${RESKEY_monitoring_ip_default}} +: ${OCF_RESKEY_port=${RESKEY_port_default}} +: ${OCF_RESKEY_proto=${RESKEY_proto_default}} +: ${OCF_RESKEY_sipsak=${RESKEY_sipsak_default}} +: ${OCF_RESKEY_kamctl=${RESKEY_kamctl_default}} +: ${OCF_RESKEY_kamctlrc=${RESKEY_kamctlrc_default}} +: ${OCF_RESKEY_kamuser=${RESKEY_kamuser_default}} +: ${OCF_RESKEY_kamgroup=${RESKEY_kamgroup_default}} +: ${OCF_RESKEY_extra_options=${RESKEY_extra_options_default}} + +####################################################################### +usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="kamailio" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> + Resource agent for the Kamailio SIP proxy/registrar. 
+ Multiple instances are possible when using following parameter combinations: + + Parameters for Kamailio instance 1: + listen_address=192.168.159.128 + monitoring_ip=192.168.159.128 + proto=udptcp + port=5060 + + Parameters for Kamailio instance 2: + listen_address=192.168.159.128 + monitoring_ip=192.168.159.128 + proto=udp + port=5070 + conffile=/etc/kamailio/kamailio2.cfg + kamctlrc="" + + Only one instance can be monitored via the command "kamctl monitor" + because the kamctl tool of kamailio 4.x is not designed for multiple + instances. Therefore, the provided kamctrlrc file path needs to be + empty for instance 2, 3 ... + +Parameters for a third Kamailio instance: + listen_address=192.168.159.128 + monitoring_ip=192.168.159.128 + proto=tcp + port=5080 + conffile=/etc/kamailio/kamailio3.cfg + kamctlrc="" +</longdesc> + +<shortdesc lang="en">Resource agent for Kamailio</shortdesc> + +<parameters> + <parameter name="binary" unique="0" required="0"> + <longdesc lang="en">The kamailio binary</longdesc> + <shortdesc lang="en">The kamailio binary</shortdesc> + <content type="string" default="${RESKEY_binary_default}" /> + </parameter> + + <parameter name="conffile" unique="0" required="0"> + <longdesc lang="en"> + The kamailio configuration file name with full path. + For example, "/etc/kamailio/kamailio.cfg" , which is the default value. + Make sure to use unique names in case of having multiple instances. + </longdesc> + <shortdesc lang="en">Configuration file name with full path</shortdesc> + <content type="string" default="${RESKEY_conffile_default}" /> + </parameter> + + <parameter name="pidfile" unique="0" required="0"> + <longdesc lang="en"> + The kamailio PID file. The directory used must be writable by kamailio + process user. Be sure to use unique name for running more than one + instance. Try to use absolute path names. 
+ If empty, resource agent create a unique directory from the resource + instance name for the PID file and assign it to the process user. + </longdesc> + <shortdesc lang="en">PID file</shortdesc> + <content type="string" default="${RESKEY_pidfile_default}" /> + </parameter> + + <parameter name="monitoring_ip" unique="0" required="0"> + <longdesc lang="en"> + SIP IP Address of the kamailio instance used for SIP OPTIONS polling monitoring. + Usually the same IP address value as for parameter listen_address should be + provided. + + In order to respond with a 200 OK response to the SIP OOPTION requests, + the kamailio.cfg file needs to contain following section: + Note: The following "kamailio.cfg" code snippet is part of an XML section. + Therefore it contains two & characters, which need to be replaced + with two ampersand characters within "kamailio.cfg": + + if (is_method("OPTIONS") && ($ru=~"sip:monitor@.*")) { + ## + ## If the method is an OPTIONS we are simply going to respond + ## with a 200 OK. + # xlog("L_INFO", "Method is an OPTIONS, probably just monitoring\n"); + sl_send_reply("200", "Kamailio is alive"); + exit; + } + + </longdesc> + <shortdesc lang="en">Monitoring IP address used for SIP OPTIONS polling.</shortdesc> + <content type="string" default="${RESKEY_monitoring_ip_default}" /> + </parameter> + + <parameter name="listen_address" unique="0" required="1"> + <longdesc lang="en"> + SIP IP address the kamailio will listen on. + </longdesc> + <shortdesc lang="en">Listening SIP address</shortdesc> + <content type="string" /> + </parameter> + + <parameter name="port" unique="0" required="0"> + <longdesc lang="en"> + SIP port for the kamailio instance. + </longdesc> + <shortdesc lang="en">SIP Port</shortdesc> + <content type="string" default="${RESKEY_port_default}" /> + </parameter> + + <parameter name="extra_options" unique="0" required="0"> + <longdesc lang="en"> + Extra options to add to kamailio start. 
+ </longdesc> + <shortdesc lang="en">extra_options</shortdesc> + <content type="string" default="${RESKEY_extra_options}" /> + </parameter> + + + <parameter name="proto" unique="0" required="0"> + <longdesc lang="en"> + The protocol used for SIP proto = udp|tcp|udptcp|conf_udp|conf_tcp|conf_udptcp. + Using the options "conf_*" does not add any "-l" parameters to the kamailio command, + the "listen" parameters from kamailio.conf are used instead. The sipsak checks are + performed depending what protocol is defined after the underscore. + </longdesc> + <shortdesc lang="en">protocol</shortdesc> + <content type="string" default="${RESKEY_proto_default}" /> + </parameter> + + <parameter name="sipsak" unique="0" required="0"> + <longdesc lang="en"> + The installation path of the sipsak tool, which is used + for monitoring Kamailio via SIP OPTIONS polling. + </longdesc> + <shortdesc lang="en">sipsak path</shortdesc> + <content type="string" default="${RESKEY_sipsak_default}" /> + </parameter> + + <parameter name="kamctl" unique="0" required="0"> + <longdesc lang="en"> + The installation path of the "kamctl" control tool. + </longdesc> + <shortdesc lang="en">kamctl path</shortdesc> + <content type="string" default="${RESKEY_kamctl_default}" /> + </parameter> + + <parameter name="kamctlrc" unique="0" required="0"> + <longdesc lang="en"> + The location of the "kamctlrc" file for the Kamailio instance. + The file "kamctlrc" is the Kamailio configuration file for its "kamctl" control tool. + + This parameter only needs to be provided in case of using multiple Kamailio server + instances on a single cluster node: + + In case that the parameter "kamctlrc" is not empty, this resource agent monitors + the health state of the Kamailio server via the command "kamctl monitor 1". This + setting is recommended in case of using a single Kamailio server instance. 
+ + In case that the parameter "kamctlrc" is empty, the resource agent does not + monitor the health state of the Kamailio server instance via the "kamctl" command. + + Please note that the "kamctl" control command of Kamailio 4.x does not support + running multiple Kamailio instances on one host. Nevertheless this resource agent + does allow multiple Kamailio instances per host. The result of the "kamctl" + limitation in terms of number of Kamailio server instances is that the health + check via "kamctl monitor 1" can be configured for a single Kamailio instance + only. + + Please refer to the long description of this resource agent for an example + of parameter combinations in case that multiple instances are to be + configured per cluster node. + + </longdesc> + <shortdesc lang="en">kamctlrc path</shortdesc> + <content type="string" default="${RESKEY_kamctlrc_default}" /> + </parameter> + + <parameter name="kamuser" unique="0" required="0"> + <longdesc lang="en"> + The user account for kamailio process to run with. + Uses the current user, if not specified or empty. + There is no check, if running kamailio with the specified user account is possible. + </longdesc> + <shortdesc lang="en">kamailio user</shortdesc> + <content type="string" default="${RESKEY_kamuser_default}" /> + </parameter> + + <parameter name="kamgroup" unique="0" required="0"> + <longdesc lang="en"> + The group for kamailio process to run with. + Uses the current group, if not specified or empty. 
###
# Render the outcome of a command as one log-friendly line.
#
# Parameter 1: Exit status.
# Parameter 2: Standard output (stdout); left out of the result if empty.
# Parameter 3: Error output (stderr); left out of the result if empty.
# Returns: "exit status: N[, value: ...][, error: ...]" on stdout,
#          followed by a newline.
###
kamailio_format_result() {
    local status="$1"
    local stdout_text="$2"
    local stderr_text="$3"
    local line="exit status: ${status}"

    # Append the optional parts only when they carry something.
    [ -z "$stdout_text" ] || line="${line}, value: ${stdout_text}"
    [ -z "$stderr_text" ] || line="${line}, error: ${stderr_text}"

    printf '%s\n' "$line"
}
###
# Put the command line, how the kamailio process is started according
# to the configured parameters, into the variable "kam_cmd".
#
# Reads:  OCF_RESKEY_proto, OCF_RESKEY_listen_address, OCF_RESKEY_port,
#         OCF_RESKEY_binary, OCF_RESKEY_pidfile, OCF_RESKEY_conffile,
#         OCF_RESKEY_kamuser, OCF_RESKEY_kamgroup, OCF_RESKEY_extra_options
# Writes: kam_cmd, and listen_param (the listen options derived from
#         the parameter "proto"; empty for the "conf_*" protocols).
###
kamailio_cmd()
{
    # Always start from an empty value. The "conf_*" protocols add no
    # listen options, so without this reset a listen_param inherited from
    # the environment or left over from an earlier call would end up in
    # the command line.
    listen_param=""

    case ${OCF_RESKEY_proto} in
        udp)    listen_param="-T -l udp:${OCF_RESKEY_listen_address}:${OCF_RESKEY_port} -l udp:127.0.0.1:${OCF_RESKEY_port}"
                ;;
        tcp)    listen_param="-l tcp:${OCF_RESKEY_listen_address}:${OCF_RESKEY_port} -l tcp:127.0.0.1:${OCF_RESKEY_port}"
                ;;
        udptcp) listen_param1="-l udp:${OCF_RESKEY_listen_address}:${OCF_RESKEY_port} -l udp:127.0.0.1:${OCF_RESKEY_port}"
                listen_param2="-l tcp:${OCF_RESKEY_listen_address}:${OCF_RESKEY_port} -l tcp:127.0.0.1:${OCF_RESKEY_port}"
                listen_param="${listen_param1} ${listen_param2}"
                ;;
        conf_*)
                # doing nothing, no listen_param set
                ;;
        *)      listen_param="-T"
                ;;
    esac

    kam_cmd="${OCF_RESKEY_binary} -P ${OCF_RESKEY_pidfile} -f ${OCF_RESKEY_conffile}"

    # Every optional part is appended only when it is configured, so that
    # the resulting string has no stray blanks.
    if [ -n "${listen_param}" ]; then
        kam_cmd="${kam_cmd} ${listen_param}"
    fi
    if [ -n "${OCF_RESKEY_kamuser}" ]; then
        kam_cmd="${kam_cmd} -u ${OCF_RESKEY_kamuser}"
    fi
    if [ -n "${OCF_RESKEY_kamgroup}" ]; then
        kam_cmd="${kam_cmd} -g ${OCF_RESKEY_kamgroup}"
    fi
    if [ -n "${OCF_RESKEY_extra_options}" ]; then
        kam_cmd="${kam_cmd} ${OCF_RESKEY_extra_options}"
    fi
}
###
# Probe kamailio with a request sent by sipsak over one transport.
#
# Parameter 1: Transport to use ("udp" or "tcp").
# Parameter 2: File to which the error output of sipsak is appended.
# Exit Status: The exit status of sipsak.
###
_kamailio_sipsak_probe() {
    $OCF_RESKEY_sipsak -s "sip:monitor@${OCF_RESKEY_monitoring_ip}:${OCF_RESKEY_port}" \
        -H localhost --transport "$1" >/dev/null 2>>"$2"
}

###
# Report the state of the kamailio instance.
#
# The checks run in this order: the PID file (via kamailio_get_pid), the
# liveness of that PID, "kamctl monitor 1" (only if the parameter
# kamctlrc is not empty) and finally a sipsak probe.
#
# Exit Status: OCF_SUCCESS      all checks passed
#              OCF_NOT_RUNNING  kamailio_get_pid returned 1, the PID is
#                               not alive, or the kamctl check failed
#              OCF_ERR_GENERIC  kamailio_get_pid returned another error,
#                               no temporary file could be created, or
#                               the sipsak probe failed
###
kamailio_status() {
    local not_running_log_level="warn"
    local errorfile error output probe_proto
    local RET rc result

    # While starting, "not running" is the expected state.
    if [ "$__OCF_ACTION" = "start" ]; then
        not_running_log_level="debug"
    fi

    kamailio_get_pid >/dev/null
    RET=$?
    if [ $RET -ne 0 ]; then
        if [ $RET -eq 2 ]; then
            ocf_log $not_running_log_level "PID file does not contain a PID of a ${OCF_RESKEY_binary} process!"
            return $OCF_ERR_GENERIC
        fi
        return $OCF_NOT_RUNNING
    fi

    PID=`head -n 1 "$OCF_RESKEY_pidfile"`
    if ! isRunning_PID "$PID"; then
        ocf_log $not_running_log_level "PID from $PID from ${OCF_RESKEY_pidfile} not running"
        rm -f "${OCF_RESKEY_pidfile}"
        return $OCF_NOT_RUNNING
    fi

    rc=0
    # In case that OCF_RESKEY_kamctlrc is set we perform a health check
    # via "kamctl monitor 1".
    if [ -n "${OCF_RESKEY_kamctlrc}" ]; then
        # PID is running now but it is not safe to check via kamctl without care, because
        # the implementation analysis in the case that we kill all running processes
        # shows that in case that the fifo cannot be read, then kamctl blocks. This needs
        # to be avoided.
        # In order to be on the safe side, we run this check therefore under "timeout" control:
        timeout 3 ${OCF_RESKEY_kamctl} monitor 1 | grep "since"
        rc=$?
    fi

    if [ $rc -ne 0 ]; then
        ocf_log $not_running_log_level "Kamailio is not up according to kamctl monitor!"
        return $OCF_NOT_RUNNING
    fi

    errorfile=`mktemp`
    if [ $? -ne 0 ] || [ -z "$errorfile" ]; then
        ocf_log err "Could not create a temporary file for the sipsak error output"
        return $OCF_ERR_GENERIC
    fi

    # The "conf_*" protocols only change where the listen addresses come
    # from. The probe has to use the transport named after the
    # underscore, e.g. "conf_tcp" is checked via tcp.
    probe_proto="${OCF_RESKEY_proto#conf_}"

    # The standard output of sipsak is discarded, only its error output
    # is reported.
    output=""
    case ${probe_proto} in
        tcp)    _kamailio_sipsak_probe tcp "$errorfile"
                result=$?
                ;;
        udptcp) _kamailio_sipsak_probe tcp "$errorfile"
                result=$?
                if [ $result -eq 0 ]; then
                    _kamailio_sipsak_probe udp "$errorfile"
                    result=$?
                fi
                ;;
        *)      _kamailio_sipsak_probe udp "$errorfile"
                result=$?
                ;;
    esac

    error=`cat "$errorfile"`
    rm -f "$errorfile"

    if [ $result -ne 0 ]; then
        ocf_log $not_running_log_level "Kamailio is running, but not functional as sipsak ${OCF_RESKEY_proto} failed with $(kamailio_format_result $result "$output" "$error")"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}
-d "$piddir" ]; then + mkdir -p "$piddir" + if [ "$OCF_RESKEY_kamuser" != "" ]; then + chown ${OCF_RESKEY_kamuser} "$piddir" + fi + fi + + kamailio_cmd + if [ "$OCF_RESKEY_kamuser" != "" ]; then + kam_cmd="su -s @BASH_SHELL@ $OCF_RESKEY_kamuser -c \"$kam_cmd\"" + fi + + ocf_log info "start kamailio with $kam_cmd." + errorfile=`mktemp` + output=$(eval ${kam_cmd} 2>>$errorfile) + result=$? + error=`cat $errorfile` + rm -f $errorfile + + if [ $result -eq 0 ]; then + result=1 + while [ $result -ne 0 ]; do + sleep 1 + kamailio_get_pid >/dev/null + result=$? + done + + ocf_log info "kamailio instance PID=$PID started." + # check with monitor operation if running correctly + result=$OCF_ERR_GENERIC + while [ $result -ne $OCF_SUCCESS ]; do + sleep 1 + kamailio_monitor + result=$? + ocf_log info "monitor in start returned $result" + done + ocf_log info "kamailio started successful." + + else + ocf_log err "kamailio instance could not be started, $(kamailio_format_result $result "$output" "$error")" + result=$OCF_ERR_GENERIC + fi + + return $result +} + +kamailio_stop() { + local piddir + local TRIES=0 + result=$OCF_SUCCESS + + kamailio_cmd + + ocf_log info "Stopping kamailio by sending SIGTERM to ${kam_cmd}" + pkill -SIGTERM -x -f "${kam_cmd}" + if [ $? -eq 1 ]; then + # already stopped. 
###
# Validate the environment and the resource parameters.
#
# Exit Status: OCF_SUCCESS         all checks passed
#              OCF_NOT_INSTALLED   the kamailio binary, the sipsak tool or
#                                  the configured kamctlrc file is missing
#              OCF_ERR_CONFIGURED  "<binary> -c" failed, the configuration
#                                  file is missing, or a parameter is
#                                  empty or has an unsupported value
###
kamailio_validate_all() {
    local out retcode

    # Check if kamailio configuration is valid before starting the server

    # The file tests are quoted on purpose: with an empty value an
    # unquoted "[ ! -f ]" is false and the missing file goes unnoticed.
    if [ ! -f "$OCF_RESKEY_binary" ]; then
        ocf_log err "File OCF_RESKEY_binary [${OCF_RESKEY_binary}] does not exist!"
        return $OCF_NOT_INSTALLED
    fi

    out=$($OCF_RESKEY_binary -c 2>&1 > /dev/null)
    retcode=$?
    if [ "$retcode" -ne '0' ]; then
        ocf_log info "Not starting kamailio: $OCF_RESKEY_binary does not start!"
        return $OCF_ERR_CONFIGURED
    fi

    case $OCF_RESKEY_monitoring_ip in
        "") ocf_log err "Required parameter OCF_RESKEY_monitoring_ip is missing!"
            return $OCF_ERR_CONFIGURED
            ;;
        [0-9]*.[0-9]*.[0-9]*.[0-9]*) : OK
            ;;
        *)  ocf_log err "Parameter OCF_RESKEY_monitoring_ip [$OCF_RESKEY_monitoring_ip] is not an IP address!"
            return $OCF_ERR_CONFIGURED
            ;;
    esac

    case $OCF_RESKEY_listen_address in
        # Name the parameter literally: its value is empty in this branch.
        "") ocf_log err "Required parameter OCF_RESKEY_listen_address is missing!"
            return $OCF_ERR_CONFIGURED
            ;;
        [0-9]*.[0-9]*.[0-9]*.[0-9]*) : OK
            ;;
        *)  ocf_log err "Parameter OCF_RESKEY_listen_address [$OCF_RESKEY_listen_address] not an IP address!"
            return $OCF_ERR_CONFIGURED
            ;;
    esac

    if [ ! -f "${OCF_RESKEY_sipsak}" ]; then
        ocf_log err "sipsak [${OCF_RESKEY_sipsak}] does not exist!"
        return $OCF_NOT_INSTALLED
    fi

    if [ -n "${OCF_RESKEY_kamctlrc}" ]; then
        if [ ! -f "${OCF_RESKEY_kamctlrc}" ]; then
            ocf_log err "kamctlrc file [${OCF_RESKEY_kamctlrc}] does not exist!"
            return $OCF_NOT_INSTALLED
        fi
    else
        ocf_log debug "No monitoring via kamctl monitor because the parameter [kamctlrc] is empty."
    fi

    if [ ! -f "${OCF_RESKEY_conffile}" ]; then
        ocf_log err "Kamailio configuration file provided in the parameter conffile [${OCF_RESKEY_conffile}] does not exist!"
        return $OCF_ERR_CONFIGURED
    fi

    case $OCF_RESKEY_proto in
        "") ocf_log err "Parameter OCF_RESKEY_proto is empty!"
            return $OCF_ERR_CONFIGURED
            ;;
        # The "conf_*" values are documented for the parameter "proto"
        # and handled by kamailio_cmd, so they have to pass validation.
        udp|tcp|udptcp|conf_udp|conf_tcp|conf_udptcp) : OK
            ;;
        *)  ocf_log err "Parameter value $OCF_RESKEY_proto for parameter [proto] not yet supported!"
            return $OCF_ERR_CONFIGURED
            ;;
    esac

    return $OCF_SUCCESS
}
#
# Hook run after the attempt to activate the volume group.
#
# Parameter 1: exit status of the activation command.
# Returns: 0 if the activation succeeded, either right away or through
#          retry_exclusive_start; OCF_ERR_GENERIC if the retry failed.
#
lvm_post_activate() {
    local rc=$1

    if [ $rc -eq 0 ]; then
        return 0
    fi

    # Failure to exclusively activate cluster vg.:
    # This could be caused by a remotely active LV, Attempt
    # to disable volume group cluster wide and try again.
    # Allow for some settling
    sleep 5
    if ! retry_exclusive_start; then
        return $OCF_ERR_GENERIC
    fi

    # The retry activated the volume group, so the status of the first
    # attempt no longer describes the outcome and is not passed on.
    return 0
}
##
# Warn if no initrd image in /boot is newer than /etc/lvm/lvm.conf.
#
# Only logs; always returns successfully when no initrd image for the
# running kernel is found.
##
check_initrd_warning()
{
    local img
    local have_initrd=0

    # First check to see if there is an initrd img we can safely
    # compare timestamps against. If not, don't even bother with
    # this check. This is known to work in rhel/fedora distros
    #
    # Only the command substitution is quoted: the pattern has to be
    # expanded by the shell, a fully quoted pattern is taken literally
    # and never matches any file.
    for img in /boot/*"$(uname -r)"*.img; do
        if [ -e "$img" ]; then
            have_initrd=1
            break
        fi
    done
    if [ $have_initrd -eq 0 ]; then
        return 0
    fi

    ##
    # Now check to see if the initrd has been updated.
    # If not, the machine could boot and activate the VG outside
    # the control of pacemaker
    ##
    # The pattern is quoted here so that find receives it unchanged,
    # whatever the current directory contains.
    if [ "$(find /boot -name '*.img' -newer /etc/lvm/lvm.conf)" = "" ]; then
        ocf_log warn "LVM: Improper setup detected"
        ocf_log warn "* initrd image needs to be newer than lvm.conf"

        # While dangerous if not done the first time, there are many
        # cases where we don't simply want to fail here. Instead,
        # keep warning until the user remakes the initrd - or has
        # it done for them by upgrading the kernel.
        #
        # initrd can be updated using this command.
        # dracut -H -f /boot/initramfs-$(uname -r).img $(uname -r)
        #
    fi
}
#
# Set the ownership tag and the vgchange options used for tagged
# activation:
#   OUR_TAG                      value of the parameter "tag", or
#                                "pacemaker" if that is unset or empty
#   vgchange_activate_options    local activation with the volume_list
#                                overridden to "@<tag>"
#   vgchange_deactivate_options  local deactivation
#
lvm_init() {
    OUR_TAG="${OCF_RESKEY_tag:-pacemaker}"
    vgchange_deactivate_options="-aln"
    vgchange_activate_options="-aly --config activation{volume_list=[\"@${OUR_TAG}\"]}"
}
+# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_pidfile_default="/run/lvmlockd.pid" +OCF_RESKEY_socket_path_default="/run/lvm/lvmlockd.socket" +OCF_RESKEY_syslog_priority_default="warning" +OCF_RESKEY_adopt_default="1" + +: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}} +: ${OCF_RESKEY_socket_path=${OCF_RESKEY_socket_path_default}} +: ${OCF_RESKEY_syslog_priority=${OCF_RESKEY_syslog_priority_default}} +: ${OCF_RESKEY_adopt=${OCF_RESKEY_adopt_default}} + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="lvmlockd" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This agent manages the lvmlockd daemon. "lvmlockd" is like "clvmd". Both +are used by LVM commands to coordinate access to shared storage, but with +different design and implementations. "lvmlockd" can use two lock managers: +dlm and sanlock. 
This agent only supports "dlm + lvmlockd". If dlm (or corosync) +are already being used by other cluster software, you are advised to select +dlm, then configure "controld" resource agent for dlm and this agent for "lvmlockd". +Otherwise, consider sanlock for "lvmlockd" if dlm/corosync is not required. + +Using lvmlockd requires the settings in LVM configuration file (/etc/lvm/lvm.conf): +"locking_type = 1" and "use_lvmlockd = 1". This RA will change the settings +respectively if needed. + +For more information, refer to manpage lvmlockd.8. +</longdesc> +<shortdesc lang="en">This agent manages the lvmlockd daemon</shortdesc> + +<parameters> +<parameter name="pidfile" unique="0"> +<longdesc lang="en">pid file</longdesc> +<shortdesc lang="en">pid file</shortdesc> +<content type="string" default="${OCF_RESKEY_pidfile_default}"/> +</parameter> + +<parameter name="socket_path" unique="0"> +<longdesc lang="en">Set the socket path to listen on.</longdesc> +<shortdesc lang="en">socket path</shortdesc> +<content type="string" default="${OCF_RESKEY_socket_path_default}"/> +</parameter> + +<parameter name="syslog_priority" unique="0"> +<longdesc lang="en">Write log messages from this level up to syslog.</longdesc> +<shortdesc lang="en">syslog priority</shortdesc> +<content type="string" default="${OCF_RESKEY_syslog_priority_default}"/> +</parameter> + +<parameter name="adopt" unique="0"> +<longdesc lang="en"> +Adopt locks from a previous instance of lvmlockd. 
# Check whether a process with the given PID exists.
#
# Parameter 1: PID
# Exit status: 0 if the process exists; non-zero if it does not or if
#              no PID was given.
daemon_is_running()
{
    local pid=$1

    # An empty PID must never count as running: the /proc test below
    # would look at "/proc/" itself, which always exists.
    if [ -z "$pid" ]; then
        return 1
    fi

    # Use /proc if it exists there
    if [ -d /proc ] && [ -d /proc/1 ] ; then
        [ -d /proc/"$pid" ]
    else
        kill -s 0 "$pid" >/dev/null 2>&1
    fi
}
# Make sure that the DLM control daemon is running.
#
# Exits with OCF_ERR_CONFIGURED if no running dlm_controld process is
# found, returns OCF_SUCCESS otherwise.
check_dlm_controld()
{
    local pid=""

    # dlm daemon should have only one instance, but for safe...
    pid=$(pgrep dlm_controld | head -n1)

    # Without a match pid stays empty. That already means "not running"
    # and must not be handed to daemon_is_running as a missing argument.
    if [ -z "$pid" ] || ! daemon_is_running "$pid" ; then
        ocf_exit_reason "DLM is not running. Is it configured?"
        exit $OCF_ERR_CONFIGURED
    fi

    return $OCF_SUCCESS
}
# Send SIGTERM to a process until it is gone or TIMEOUT_COUNT attempts
# have been made.
#
# Parameter 1: process name, used for the log message only.
# Parameter 2: PID to signal.
kill_stop()
{
    local name=$1
    local target=$2
    local attempt=0

    ocf_log info "Killing $name (pid=$target)"
    while daemon_is_running $target && [ $attempt -lt "$TIMEOUT_COUNT" ]
    do
        # pause between two attempts, but not ahead of the first one
        [ $attempt -eq 0 ] || sleep 0.5
        kill -s TERM $target >/dev/null 2>&1
        attempt=$((attempt + 1))
    done

}
# Monitor action: report whether the daemon is running.
#
# Returns: OCF_SUCCESS if silent_status succeeds, OCF_NOT_RUNNING (after
#          logging an info message) otherwise.
lvmlockd_monitor() {
    if ! silent_status ; then
        ocf_log info "${LOCKD} not running"
        return $OCF_NOT_RUNNING
    fi

    return $OCF_SUCCESS
}
+# +# Without limiting the rights of the original copyright holders +# This resource is licensed under GPL version 2 +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. + +# OCF instance parameters +# OCF_RESKEY_container +# OCF_RESKEY_config +# OCF_RESKEY_log +# OCF_RESKEY_use_screen + +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_container_default="" +OCF_RESKEY_config_default="" +OCF_RESKEY_log_default="${HA_RSCTMP}/${OCF_RESOURCE_INSTANCE}.log" +OCF_RESKEY_use_screen_default="false" + +: ${OCF_RESKEY_container=${OCF_RESKEY_container_default}} +: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} +: ${OCF_RESKEY_log=${OCF_RESKEY_log_default}} +: ${OCF_RESKEY_use_screen=${OCF_RESKEY_use_screen_default}} + +# Set default TRANS_RES_STATE (temporary file to "flag" if resource was stated but not stopped) +TRANS_RES_STATE="${HA_RSCTMP}/${OCF_RESOURCE_INSTANCE}.state" + +meta_data() { +cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="lxc" version="0.1"> +<version>1.0</version> +<longdesc lang="en">Allows LXC containers to be managed by the cluster. +Notes for lxc Versions before 1.0.0, where the Container is stopped using kill -PWR instead of lxc-stop: +It is 'assumed' that the 'init' system will do an orderly shudown if presented with a 'kill -PWR' signal. +On a 'sysvinit' this would require the container to have an inittab file containing "p0::powerfail:/sbin/init 0" +</longdesc> +<shortdesc lang="en">Manages LXC containers</shortdesc> + +<parameters> +<parameter name="container" required="1" unique="1"> +<longdesc lang="en">The unique name for this 'Container Instance' e.g. 'test1'.</longdesc> +<shortdesc lang="en">Container Name</shortdesc> +<content type="string" default="${OCF_RESKEY_container_default}"/> +</parameter> + +<parameter name="config" required="1" unique="0"> +<longdesc lang="en">Absolute path to the file holding the specific configuration for this container e.g. 
'/etc/lxc/test1/config'.</longdesc> +<shortdesc lang="en">The LXC config file.</shortdesc> +<content type="string" default="${OCF_RESKEY_config_default}"/> +</parameter> + +<parameter name="log" required="0" unique="0"> +<longdesc lang="en">Absolute path to the container log file</longdesc> +<shortdesc lang="en">Container log file</shortdesc> +<content type="string" default="${OCF_RESKEY_log_default}"/> +</parameter> + +<parameter name="use_screen" required="0" unique="0"> +<longdesc lang="en">Provides the option of capturing the 'root console' from the container and showing it on a separate screen. +To see the screen output run 'screen -r {container name}' +The default value is set to 'false', change to 'true' to activate this option</longdesc> +<shortdesc lang="en">Use 'screen' for container 'root console' output</shortdesc> +<content type="boolean" default="${OCF_RESKEY_use_screen_default}"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="10s" /> +<action name="stop" timeout="30s" /> +<action name="monitor" timeout="20s" interval="60s" depth="0"/> +<action name="validate-all" timeout="20s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + + +LXC_usage() { + cat <<END + usage: $0 {start|stop|monitor|validate-all|meta-data} + + Expects to have a fully populated OCF RA-compliant environment set. +END +} + +lxc_version() { + if have_binary lxc-version ; then + lxc-version | cut -d' ' -f 3 + else # since LXC 1.0.0 all commands knows about --version + lxc-info --version + fi +} + +cgroup_mounted() { +# test cgroup_mounted, mount if required + # Various possible overrides to cgroup mount point. + # If kernel supplies cgroup mount point, prefer it. + CGROUP_MOUNT_POINT=/var/run/lxc/cgroup + CGROUP_MOUNT_NAME=lxc + CGROUP_MOUNTED=false + [[ -d /sys/fs/cgroup ]] && CGROUP_MOUNT_POINT=/sys/fs/cgroup CGROUP_MOUNT_NAME=cgroup + # If cgroup already mounted, use it no matter where it is. 
+ # If multiple cgroup mounts, prefer the one named lxc if any. + # (The awk test must be "==": a single "=" assigns to $1, is always true, + # and makes awk stop at the first cgroup mount regardless of its name.) + eval `awk 'BEGIN{P="";N=""}END{print("cgmp="P" cgmn="N)}($3=="cgroup"){N=$1;P=$2;if($1=="lxc")exit}' /proc/mounts` + [[ "$cgmn" && "$cgmp" && -d "$cgmp" ]] && CGROUP_MOUNT_POINT=$cgmp CGROUP_MOUNT_NAME=$cgmn CGROUP_MOUNTED=true + $CGROUP_MOUNTED || { + [[ -d $CGROUP_MOUNT_POINT ]] || ocf_run mkdir -p $CGROUP_MOUNT_POINT + ocf_run mount -t cgroup $CGROUP_MOUNT_NAME $CGROUP_MOUNT_POINT + } + echo 1 >${CGROUP_MOUNT_POINT}/notify_on_release + return 0 +} + +# Start the container (inside a detached screen session when use_screen is +# true) and wait until LXC_status reports it running. There is no internal +# timeout: the cluster manager's operation timeout bounds the wait. The state +# file TRANS_RES_STATE is touched on success. +LXC_start() { + # put this here as it's so long it gets messy later!!! + if ocf_is_true $OCF_RESKEY_use_screen; then + STARTCMD="screen -dmS ${OCF_RESKEY_container} lxc-start -f ${OCF_RESKEY_config} -n ${OCF_RESKEY_container} -o ${OCF_RESKEY_log}" + else + STARTCMD="lxc-start -f ${OCF_RESKEY_config} -n ${OCF_RESKEY_container} -o ${OCF_RESKEY_log} -d" + fi + + LXC_status + if [ $? -eq $OCF_SUCCESS ]; then + ocf_log debug "Resource $OCF_RESOURCE_INSTANCE is already running" + ocf_run touch "${TRANS_RES_STATE}" || exit $OCF_ERR_GENERIC + return $OCF_SUCCESS + fi + + cgroup_mounted + if [ $? -ne 0 ]; then + ocf_log err "Unable to find cgroup mount" + exit $OCF_ERR_GENERIC + fi + + ocf_log info "Starting" ${OCF_RESKEY_container} + ocf_run ${STARTCMD} || exit $OCF_ERR_GENERIC + + # Spin on status, wait for the cluster manager to time us out if + # we fail + while ! LXC_status; do + ocf_log info "Container ${OCF_RESKEY_container} has not started, waiting" + sleep 1 + done + + ocf_run touch "${TRANS_RES_STATE}" || exit $OCF_ERR_GENERIC + return $OCF_SUCCESS +} + + + +# Stop the container. Succeeds immediately, removing the state file, when +# LXC_status already reports it as not running. +LXC_stop() { + + LXC_status + if [ $? -eq $OCF_NOT_RUNNING ]; then + ocf_log debug "Resource $OCF_RESOURCE_INSTANCE is already stopped" + ocf_run rm -f $TRANS_RES_STATE + return $OCF_SUCCESS + fi + + cgroup_mounted + if [ $? -ne 0 ]; then + ocf_log err "Unable to find cgroup mount" + exit $OCF_ERR_GENERIC + fi + + if ! 
ocf_version_cmp "`lxc_version`" 1.0.0 ; then + # Use lxc-stop if we are newer than 1.0.0 + timeout=$(( ($OCF_RESKEY_CRM_meta_timeout/1000) -5 )) + ocf_log info "Stopping Container ${OCF_RESKEY_container} using lxc-stop" + # lxc-stop will return failure even if it reached the timeout and successfully hard-stopped the + # Container so we check below if the Container is really stopped instead of using || exit $OCF_ERR_GENERIC + ocf_run lxc-stop -n "${OCF_RESKEY_container}" -t ${timeout} + + LXC_status + if [ $? -eq $OCF_SUCCESS ]; then + # Try to manually hard-stop if the Container is still running + ocf_run lxc-stop -n "${OCF_RESKEY_container}" -k || exit $OCF_ERR_GENERIC + fi + + else + # Use kill -PWR + # If the container is running "init" and is able to perform an orderly shutdown, then it should be done. + # It is 'assumed' that the 'init' system will do an orderly shutdown if presented with a 'kill -PWR' signal. + # On a 'sysvinit' this would require the container to have an inittab file containing "p0::powerfail:/sbin/init 0" + local shutdown_timeout + local now + declare -i PID=0 + declare CMD= + + # This should work for traditional 'sysvinit' and 'upstart' + lxc-ps --name "${OCF_RESKEY_container}" -- -C init -o pid,comm |while read CN PID CMD ;do + [ $PID -gt 1 ] || continue + [ "$CMD" = "init" ] || continue + ocf_log info "Sending \"OS shut down\" instruction to" ${OCF_RESKEY_container} "as it was found to be using \"sysV init\" or \"upstart\"" + kill -PWR $PID + done + # This should work for containers using 'systemd' instead of 'init' + lxc-ps --name "${OCF_RESKEY_container}" -- -C systemd -o pid,comm |while read CN PID CMD ;do + [ $PID -gt 1 ] || continue + [ "$CMD" = "systemd" ] || continue + ocf_log info "Sending \"OS shut down\" instruction to" ${OCF_RESKEY_container} "as it was found to be using \"systemd\"" + kill -PWR $PID + done + + # The "shutdown_timeout" we use here is the operation + # timeout specified in the CIB, minus 5 seconds + now=$(date 
+%s) + shutdown_timeout=$(( $now + ($OCF_RESKEY_CRM_meta_timeout/1000) -5 )) + # Loop on status until we reach $shutdown_timeout + while [ $now -lt $shutdown_timeout ]; do + LXC_status + status=$? + case $status in + "$OCF_NOT_RUNNING") + ocf_run rm -f $TRANS_RES_STATE + return $OCF_SUCCESS + ;; + "$OCF_SUCCESS") + # Container is still running, keep waiting (until + # shutdown_timeout expires) + sleep 1 + ;; + *) + # Something went wrong. Bail out and + # resort to forced stop (destroy). + break; + esac + now=$(date +%s) + done + + # If the container is still running, it will be stopped now. regardless of state! + ocf_run lxc-stop -n ${OCF_RESKEY_container} || exit $OCF_ERR_GENERIC + fi + + ocf_log info "Container" ${OCF_RESKEY_container} "stopped" + ocf_run rm -f $TRANS_RES_STATE + + return $OCF_SUCCESS +} + +LXC_status() { + # run lxc-info with -s option for LXC-0.7.5 or later + local lxc_info_opt="-s" + ocf_version_cmp "`lxc_version`" 0.7.5 && lxc_info_opt="" + S=`lxc-info $lxc_info_opt -n ${OCF_RESKEY_container}` + ocf_log debug "State of ${OCF_RESKEY_container}: $S" + if [[ "${S##* }" = "RUNNING" ]] ; then + return $OCF_SUCCESS + fi + return $OCF_NOT_RUNNING +} + +LXC_monitor() { + LXC_status && return $OCF_SUCCESS + if [ -f $TRANS_RES_STATE ]; then + ocf_log err "${OCF_RESKEY_container} is not running, but state file ${TRANS_RES_STATE} exists." + exit $OCF_ERR_GENERIC + fi + return $OCF_NOT_RUNNING +} + + +LXC_validate() { + # Quick check that all required attributes are set + if [ -z "${OCF_RESKEY_container}" ]; then + ocf_log err "LXC container name not set!" + exit $OCF_ERR_CONFIGURED + fi + if [ -z "${OCF_RESKEY_config}" ]; then + ocf_log err "LXC configuration filename name not set!" + exit $OCF_ERR_CONFIGURED + fi + + # Tests that apply only to non-probes + if ! ocf_is_probe; then + if ! [ -f "${OCF_RESKEY_config}" ]; then + ocf_log err "LXC configuration file \"${OCF_RESKEY_config}\" missing or not found!" 
+ exit $OCF_ERR_INSTALLED + fi + + if ocf_is_true $OCF_RESKEY_use_screen; then + check_binary screen + fi + + check_binary lxc-start + check_binary lxc-stop + if ocf_version_cmp "`lxc_version`" 1.0.0 ; then + check_binary lxc-ps + fi + check_binary lxc-info + fi + return $OCF_SUCCESS +} + +if [ $# -ne 1 ]; then + LXC_usage + exit $OCF_ERR_ARGS +fi + +case $__OCF_ACTION in + meta-data) meta_data + exit $OCF_SUCCESS + ;; + usage|help) LXC_usage + exit $OCF_SUCCESS + ;; +esac + +# Everything except usage and meta-data must pass the validate test +LXC_validate + +case $__OCF_ACTION in +start) LXC_start;; +stop) LXC_stop;; +status) LXC_status;; +monitor) LXC_monitor;; +validate-all) ;; +*) LXC_usage + ocf_log err "$0 was called with unsupported arguments: $*" + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/lxd-info.in b/heartbeat/lxd-info.in new file mode 100644 index 0000000..f9fb44a --- /dev/null +++ b/heartbeat/lxd-info.in @@ -0,0 +1,156 @@ +#!@BASH_SHELL@ +# +# +# LXD Registration Service OCF Resource Agent +# It records (in the CIB) various attributes of a node +# +# Copyright (c) 2017 Mathieu Grzybek +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. 
Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_pidfile_default="$HA_RSCTMP/LXDInfo-${OCF_RESOURCE_INSTANCE}" +OCF_RESKEY_delay_default="0s" +OCF_RESKEY_clone_default="0" + +: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}} +: ${OCF_RESKEY_delay=${OCF_RESKEY_delay_default}} +: ${OCF_RESKEY_clone=${OCF_RESKEY_clone_default}} + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="lxd-info" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This is a LXD Registration Service Resource Agent. +It records (in the CIB) attributes about the number of running LXD containers +running on the node. 
+Sample output: + lxd_containers: 5 +</longdesc> +<shortdesc lang="en">Records various node attributes in the CIB</shortdesc> + +<parameters> +<parameter name="pidfile" unique="0"> +<longdesc lang="en">PID file</longdesc> +<shortdesc lang="en">PID file</shortdesc> +<content type="string" default="${OCF_RESKEY_pidfile_default}" /> +</parameter> +<parameter name="delay" unique="0"> +<longdesc lang="en">Interval to allow values to stabilize</longdesc> +<shortdesc lang="en">Dampening Delay</shortdesc> +<content type="string" default="${OCF_RESKEY_delay_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" timeout="20s" interval="60s"/> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="20s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +LXDInfoStats() { + value=$(lxc list|grep -ci RUNNING) + echo -e "lxd_containers:\t$value" + ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -S status -n lxd_containers -v $value +} + +LXDInfo_usage() { + cat <<END +usage: $0 {start|stop|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. 
+END +} + +# Mark the resource as started by writing the clone number to the pidfile, +# publish the current container count, then exit with OCF_SUCCESS. +LXDInfo_start() { + echo $OCF_RESKEY_clone > $OCF_RESKEY_pidfile + LXDInfoStats + exit $OCF_SUCCESS +} + +# Remove the pidfile and delete the lxd_containers attribute. The delete must +# name the same "-S status" section that LXDInfoStats uses for the update. +LXDInfo_stop() { + rm -f $OCF_RESKEY_pidfile + ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -S status -n lxd_containers + exit $OCF_SUCCESS +} + +# The resource counts as running iff the pidfile exists; each successful +# monitor also refreshes the attribute. OCF defines no OCF_RUNNING code (an +# unset variable makes "exit" fall back to the last command's status), so +# success is reported explicitly with OCF_SUCCESS. +LXDInfo_monitor() { + if [ -f "$OCF_RESKEY_pidfile" ] ; then + LXDInfoStats + exit $OCF_SUCCESS + fi + exit $OCF_NOT_RUNNING +} + +# Nothing to validate: always succeeds. +LXDInfo_validate() { + return $OCF_SUCCESS +} + +if [ $# -ne 1 ]; then + LXDInfo_usage + exit $OCF_ERR_ARGS +fi + +# Turn a non-empty delay into the "-d <delay>" option passed to attrd_updater. +if [ x != x${OCF_RESKEY_delay} ]; then + OCF_RESKEY_delay="-d ${OCF_RESKEY_delay}" +fi + +case $__OCF_ACTION in +meta-data) meta_data + exit $OCF_SUCCESS + ;; +start) LXDInfo_start + ;; +stop) LXDInfo_stop + ;; +monitor) LXDInfo_monitor + ;; +validate-all) LXDInfo_validate + ;; +usage|help) LXDInfo_usage + exit $OCF_SUCCESS + ;; +*) LXDInfo_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + +exit $? diff --git a/heartbeat/machine-info.in b/heartbeat/machine-info.in new file mode 100644 index 0000000..bfa7ce5 --- /dev/null +++ b/heartbeat/machine-info.in @@ -0,0 +1,157 @@ +#!@BASH_SHELL@ +# +# +# Virtual Machine and Container Registration Service OCF Resource Agent +# It records (in the CIB) various attributes of a node +# +# Copyright (c) 2017 Mathieu Grzybek +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. 
Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_pidfile_default="$HA_RSCTMP/MachineInfo-${OCF_RESOURCE_INSTANCE}" +OCF_RESKEY_delay_default="0s" +OCF_RESKEY_clone_default="0" + +: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}} +: ${OCF_RESKEY_delay=${OCF_RESKEY_delay_default}} +: ${OCF_RESKEY_clone=${OCF_RESKEY_clone_default}} + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="machine-info" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This is a Virtual Machine and Container Registration Service Resource Agent. +It records (in the CIB) attributes about the number of running virtual machines +and containers running on the node. +It uses systemd machinectl. 
+Sample output: + machines: 5 +</longdesc> +<shortdesc lang="en">Records various node attributes in the CIB</shortdesc> + +<parameters> +<parameter name="pidfile" unique="0"> +<longdesc lang="en">PID file</longdesc> +<shortdesc lang="en">PID file</shortdesc> +<content type="string" default="${OCF_RESKEY_pidfile_default}" /> +</parameter> +<parameter name="delay" unique="0"> +<longdesc lang="en">Interval to allow values to stabilize</longdesc> +<shortdesc lang="en">Dampening Delay</shortdesc> +<content type="string" default="${OCF_RESKEY_delay_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" timeout="20s" interval="60s"/> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="20s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +MachineInfoStats() { + value=$(machinectl|awk '/machines listed/ {print $1}') + echo -e "machines:\t$value" + ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -S status -n machines -v $value +} + +MachineInfo_usage() { + cat <<END +usage: $0 {start|stop|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. 
+END +} + +# Mark the resource as started by writing the clone number to the pidfile, +# publish the current machine count, then exit with OCF_SUCCESS. +MachineInfo_start() { + echo $OCF_RESKEY_clone > $OCF_RESKEY_pidfile + MachineInfoStats + exit $OCF_SUCCESS +} + +# Remove the pidfile and delete the machines attribute. The delete must name +# the same "-S status" section that MachineInfoStats uses for the update. +MachineInfo_stop() { + rm -f $OCF_RESKEY_pidfile + ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -S status -n machines + exit $OCF_SUCCESS +} + +# The resource counts as running iff the pidfile exists; each successful +# monitor also refreshes the attribute. OCF defines no OCF_RUNNING code (an +# unset variable makes "exit" fall back to the last command's status), so +# success is reported explicitly with OCF_SUCCESS. +MachineInfo_monitor() { + if [ -f "$OCF_RESKEY_pidfile" ] ; then + MachineInfoStats + exit $OCF_SUCCESS + fi + exit $OCF_NOT_RUNNING +} + +# Nothing to validate: always succeeds. +MachineInfo_validate() { + return $OCF_SUCCESS +} + +if [ $# -ne 1 ]; then + MachineInfo_usage + exit $OCF_ERR_ARGS +fi + +# Turn a non-empty delay into the "-d <delay>" option passed to attrd_updater. +if [ x != x${OCF_RESKEY_delay} ]; then + OCF_RESKEY_delay="-d ${OCF_RESKEY_delay}" +fi + +case $__OCF_ACTION in +meta-data) meta_data + exit $OCF_SUCCESS + ;; +start) MachineInfo_start + ;; +stop) MachineInfo_stop + ;; +monitor) MachineInfo_monitor + ;; +validate-all) MachineInfo_validate + ;; +usage|help) MachineInfo_usage + exit $OCF_SUCCESS + ;; +*) MachineInfo_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + +exit $? diff --git a/heartbeat/mariadb.in b/heartbeat/mariadb.in new file mode 100644 index 0000000..e0f1f3c --- /dev/null +++ b/heartbeat/mariadb.in @@ -0,0 +1,1040 @@ +#!@BASH_SHELL@ +# +# +# MariaDB +# +# Description: Manages a MariaDB Promotable database as Linux-HA resource +# +# Authors: Alan Robertson: DB2 Script +# Jakub Janczak: rewrite as MySQL +# Andrew Beekhof: cleanup and import +# Sebastian Reitenbach: add OpenBSD defaults, more cleanup +# Narayan Newton: add Gentoo/Debian defaults +# Marian Marinov, Florian Haas: add replication capability +# Yves Trudeau, Baron Schwartz: add VIP support and improve replication +# Nils Carlson: add GTID support and semi-sync support +# +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# +# (c) 2002-2005 International Business Machines, Inc. +# 2005-2010 Linux-HA contributors +# +# See usage() function below for more details... 
+# +# OCF instance parameters: +# OCF_RESKEY_binary +# OCF_RESKEY_client_binary +# OCF_RESKEY_config +# OCF_RESKEY_datadir +# OCF_RESKEY_user +# OCF_RESKEY_group +# OCF_RESKEY_node_list +# OCF_RESKEY_test_table +# OCF_RESKEY_test_user +# OCF_RESKEY_test_passwd +# OCF_RESKEY_enable_creation +# OCF_RESKEY_additional_parameters +# OCF_RESKEY_log +# OCF_RESKEY_pid +# OCF_RESKEY_socket +# OCF_RESKEY_replication_user +# OCF_RESKEY_replication_passwd +# OCF_RESKEY_replication_port +####################################################################### +# Initialization: + +OCF_RESKEY_node_list_default="" + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +. ${OCF_FUNCTIONS_DIR}/mysql-common.sh +####################################################################### + +usage() { + cat <<UEND +usage: $0 (start|stop|validate-all|meta-data|monitor|promote|demote|notify) + +$0 manages a MariaDB Database as an HA resource. + +The 'start' operation starts the database. +The 'stop' operation stops the database. +The 'status' operation reports whether the database is running +The 'monitor' operation reports whether the database seems to be working +The 'promote' operation makes this mysql server run as promoted +The 'demote' operation makes this mysql server run as unpromoted +The 'validate-all' operation reports whether the parameters are valid + +UEND +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="mariadb" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for MariaDB. + +Manages a complete promotable replication setup with GTID, for simpler +uses look at the mysql resource agent which supports older replication +forms which mysql and mariadb have in common. + +The resource must be setup to use notifications. Set 'notify=true' in the metadata +attributes when defining a MariaDB promotable instance. 
+ +The default behavior is to use uname -n values in the change promoted to command. +Other IPs can be specified manually by adding a node attribute +\${INSTANCE_ATTR_NAME}_mysql_master_IP giving the IP to use for replication. +For example, if the mariadb primitive you are using is p_mariadb, the +attribute to set will be p_mariadb_mysql_master_IP. +</longdesc> +<shortdesc lang="en">Manages a MariaDB promotable instance</shortdesc> +<parameters> + +<parameter name="binary" unique="0" required="0"> +<longdesc lang="en"> +Location of the MariaDB server binary +</longdesc> +<shortdesc lang="en">MariaDB server binary</shortdesc> +<content type="string" default="${OCF_RESKEY_binary_default}" /> +</parameter> + +<parameter name="client_binary" unique="0" required="0"> +<longdesc lang="en"> +Location of the MariaDB client binary +</longdesc> +<shortdesc lang="en">MariaDB client binary</shortdesc> +<content type="string" default="${OCF_RESKEY_client_binary_default}" /> +</parameter> + +<parameter name="config" unique="0" required="0"> +<longdesc lang="en"> +Configuration file +</longdesc> +<shortdesc lang="en">MariaDB config</shortdesc> +<content type="string" default="${OCF_RESKEY_config_default}" /> +</parameter> + +<parameter name="datadir" unique="0" required="0"> +<longdesc lang="en"> +Directory containing databases +</longdesc> +<shortdesc lang="en">MariaDB datadir</shortdesc> +<content type="string" default="${OCF_RESKEY_datadir_default}" /> +</parameter> + +<parameter name="user" unique="0" required="0"> +<longdesc lang="en"> +User running MariaDB daemon +</longdesc> +<shortdesc lang="en">MariaDB user</shortdesc> +<content type="string" default="${OCF_RESKEY_user_default}" /> +</parameter> + +<parameter name="group" unique="0" required="0"> +<longdesc lang="en"> +Group running MariaDB daemon (for logfile and directory permissions) +</longdesc> +<shortdesc lang="en">MariaDB group</shortdesc> +<content type="string" default="${OCF_RESKEY_group_default}"/> 
+</parameter> + +<parameter name="log" unique="0" required="0"> +<longdesc lang="en"> +The logfile to be used for mysqld. +</longdesc> +<shortdesc lang="en">MariaDB log file</shortdesc> +<content type="string" default="${OCF_RESKEY_log_default}"/> +</parameter> + +<parameter name="node_list" unique="0" required="1"> +<longdesc lang="en"> +All node names of nodes that will execute mariadb. +Please separate each node name with a space. +This is required for the promoted selection to function. +</longdesc> +<shortdesc lang="en">node list</shortdesc> +<content type="string" default="${OCF_RESKEY_node_list_default}" /> +</parameter> + +<parameter name="pid" unique="0" required="0"> +<longdesc lang="en"> +The pidfile to be used for mysqld. +</longdesc> +<shortdesc lang="en">MariaDB pid file</shortdesc> +<content type="string" default="${OCF_RESKEY_pid_default}"/> +</parameter> + +<parameter name="socket" unique="0" required="0"> +<longdesc lang="en"> +The socket to be used for mysqld. +</longdesc> +<shortdesc lang="en">MariaDB socket</shortdesc> +<content type="string" default="${OCF_RESKEY_socket_default}"/> +</parameter> + +<parameter name="test_table" unique="0" required="0"> +<longdesc lang="en"> +Table to be tested in monitor statement (in database.table notation) +</longdesc> +<shortdesc lang="en">MariaDB test table</shortdesc> +<content type="string" default="${OCF_RESKEY_test_table_default}" /> +</parameter> + +<parameter name="test_user" unique="0" required="0"> +<longdesc lang="en"> +MariaDB test user, must have select privilege on test_table +</longdesc> +<shortdesc lang="en">MariaDB test user</shortdesc> +<content type="string" default="${OCF_RESKEY_test_user_default}" /> +</parameter> + +<parameter name="test_passwd" unique="0" required="0"> +<longdesc lang="en"> +MariaDB test user password +</longdesc> +<shortdesc lang="en">MariaDB test user password</shortdesc> +<content type="string" default="${OCF_RESKEY_test_passwd_default}" /> +</parameter> + 
+<parameter name="enable_creation" unique="0" required="0"> +<longdesc lang="en"> +If the MariaDB database does not exist, it will be created +</longdesc> +<shortdesc lang="en">Create the database if it does not exist</shortdesc> +<content type="boolean" default="${OCF_RESKEY_enable_creation_default}"/> +</parameter> + +<parameter name="additional_parameters" unique="0" required="0"> +<longdesc lang="en"> +Additional parameters which are passed to the mysqld on startup. +(e.g. --skip-external-locking or --skip-grant-tables) +</longdesc> +<shortdesc lang="en">Additional parameters to pass to mysqld</shortdesc> +<content type="string" default="${OCF_RESKEY_additional_parameters_default}"/> +</parameter> + +<parameter name="replication_user" unique="0" required="0"> +<longdesc lang="en"> +MariaDB replication user. This user is used for starting and stopping +MariaDB replication, for setting and resetting the promoted host, and for +setting and unsetting read-only mode. Because of that, this user must +have SUPER, REPLICATION SLAVE, REPLICATION CLIENT, PROCESS and RELOAD +privileges on all nodes within the cluster. Mandatory if you define a +promotable resource. +</longdesc> +<shortdesc lang="en">MariaDB replication user</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_user_default}" /> +</parameter> + +<parameter name="replication_passwd" unique="0" required="0"> +<longdesc lang="en"> +MariaDB replication password. Used for replication client and unpromoted. +Mandatory if you define a promotable resource. +</longdesc> +<shortdesc lang="en">MariaDB replication user password</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_passwd_default}" /> +</parameter> + +<parameter name="replication_port" unique="0" required="0"> +<longdesc lang="en"> +The port on which the Promoted MariaDB instance is listening. 
+</longdesc> +<shortdesc lang="en">MariaDB replication port</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_port_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="120s" /> +<action name="stop" timeout="120s" /> +<action name="status" timeout="60s" /> +<action name="monitor" depth="0" timeout="30s" interval="20s" /> +<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" /> +<action name="monitor" role="Unpromoted" depth="0" timeout="30s" interval="30s" /> +<action name="promote" timeout="120s" /> +<action name="demote" timeout="120s" /> +<action name="notify" timeout="90s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +# Convenience functions + +greater_than_equal_long() +{ + # there are values we need to compare in this script + # that are too large for shell -gt to process + # NOTE: despite the name this is a strict greater-than test: it returns 0 + # only when bc evaluates "$1 > $2" to 1, and 1 otherwise (equal values + # included). + local true=$(echo "$1 > $2" | bc) + if [ "$true" -eq "1" ]; then + return 0 + else + return 1 + fi +} + +greater_than_gtid() +{ + # Compare two GTIDs by their third dash-separated field (cut -d - -f 3), + # presumably the sequence number of a domain-server-sequence GTID (TODO + # confirm). Returns 0 when the first one is strictly greater. + local gtid1_transaction_id=$(echo $1 | cut -d - -f 3) + local gtid2_transaction_id=$(echo $2 | cut -d - -f 3) + + greater_than_equal_long $gtid1_transaction_id $gtid2_transaction_id + return $? +} + +set_gtid() { + # Sets the GTID in CIB using attrd_updater for this node. + + local gtid=$($MYSQL $MYSQL_OPTIONS_REPL \ + -s -N -e "show global variables like 'gtid_current_pos'" | cut -f 2) + + # Ensure that we got something like a valid GTID + if ! echo $gtid | grep -q '-'; then + ocf_exit_reason "Unable to read GTID from MariaDB" + ocf_log err "Unable to read GTID from MariaDB" + return $OCF_ERR_GENERIC + fi + + ${HA_SBIN_DIR}/attrd_updater -p -n ${OCF_RESOURCE_INSTANCE}-gtid -U $gtid +} + +read_gtid() { + local node=$1 + local query_result + local name + local host + local value + + # This produces output of the form 'name="var-name" host="node2" value="val"'. 
+ # This should be set at this point, because we have stored our own GTID previously. + if ! query_result=$(${HA_SBIN_DIR}/attrd_updater -p -N $node -n ${OCF_RESOURCE_INSTANCE}-gtid -Q); then + ocf_exit_reason "Unable to read GTID from attrd" + ocf_log err "Unable to read GTID from attrd" + echo "" + return + fi + + # Evaluate the query result to place the variables in the local scope. + eval ${query_result} + + echo ${value} +} + +# Delete the per-node GTID attribute for every node in OCF_RESKEY_node_list. +clear_all_gtid() { + for node in $OCF_RESKEY_node_list; do + ${HA_SBIN_DIR}/attrd_updater -n ${OCF_RESOURCE_INSTANCE}-gtid -N $node -D + done +} + +# Set the waiting-for-first-master attribute to "true". +set_waiting_for_first_master() { + ${HA_SBIN_DIR}/attrd_updater -p -n ${OCF_RESOURCE_INSTANCE}-waiting-for-first-master -U true +} + +# Returns 0 when the waiting-for-first-master attribute reads "true", and 1 +# when it has any other value or the attrd query fails. +waiting_for_first_master() { + local query_result + local name + local host + local value + + if ! query_result=$(${HA_SBIN_DIR}/attrd_updater -p -n ${OCF_RESOURCE_INSTANCE}-waiting-for-first-master -Q); then + ocf_exit_reason "Unable to read waiting-for-first-master from attrd" + ocf_log err "Unable to read waiting-for-first-master from attrd" + return 1 + fi + + # Evaluate the query result to place the variables in the local scope. + eval ${query_result} + + if [ "$value" = "true" ]; then + return 0 + else + return 1 + fi +} + +# Delete the waiting-for-first-master attribute. Call attrd_updater through +# HA_SBIN_DIR like every other invocation here, rather than relying on PATH. +clear_waiting_for_first_master() { + ${HA_SBIN_DIR}/attrd_updater -n ${OCF_RESOURCE_INSTANCE}-waiting-for-first-master -D +} + +have_master_with_priority() { + # Go through each node and validate that at least one has + # a set priority. Because we unset the priority on reboot + # a lack of priority indicates that we need to select a + # new master. + for node in $OCF_RESKEY_node_list; do + ocf_promotion_score -G -N $node >/dev/null 2>&1 + rc=$? 
+ if [ $rc -eq 0 ]; then + return 0 + fi + done + return 1 +} + +attempt_to_set_master() { + + ocf_log info "Attempting to set master" + + local expected_node_count + if waiting_for_first_master; then + # Wait for all nodes to come online + expected_node_count=$OCF_RESKEY_CRM_meta_clone_max + else + # We accept one node being down. This is not arbitrary, + # synchronous replication requires acknowledgement from + # at least one host, which means only two nodes must have + # the latest GTID. So a set of n - 1 ensures that we do + # not lose any writes. + expected_node_count=$(($OCF_RESKEY_CRM_meta_clone_max-1)) + fi + + # Set the gtid for this node, making it available to other nodes + set_gtid + + local node_count=0 + local highest_gtid=0 + local master_candidate="" + for node in $OCF_RESKEY_node_list; do + + local node_gtid=$(read_gtid $node) + if [ -z "$node_gtid" ]; then + continue + fi + + # Got a valid gtid, increment node count + node_count=$(($node_count+1)) + + # Check if this is a good master candidate + if greater_than_gtid $node_gtid $highest_gtid; then + master_candidate=$node + highest_gtid=$node_gtid + fi + done + + # If we managed to query a sufficient number of nodes + # then set a master + if [ $node_count -ge $expected_node_count ]; then + ocf_log info "Promoting $master_candidate to master, highest gtid $highest_gtid, queried $node_count nodes." + ocf_promotion_score -v 100 -N $master_candidate + else + ocf_log info "Not enough nodes ($node_count) contributed to select a master, need $expected_node_count nodes." + fi +} + +set_read_only() { + # Sets or unsets read-only mode. Accepts one boolean as its + # optional argument. If invoked without any arguments, defaults to + # enabling read only mode. Should only be set in master/slave + # setups. + # Returns $OCF_SUCCESS if the operation succeeds, or + # $OCF_ERR_GENERIC if it fails. 
+ local ro_val + if ocf_is_true $1; then + ro_val="on" + else + ro_val="off" + fi + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "SET GLOBAL read_only=${ro_val}" +} + +get_read_only() { + # Check if read-only is set + local read_only_state + + read_only_state=$($MYSQL $MYSQL_OPTIONS_REPL \ + -e "SHOW VARIABLES" | grep -w read_only | awk '{print $2}') + + if [ "$read_only_state" = "ON" ]; then + return 0 + else + return 1 + fi +} + +is_slave() { + # Determine whether the machine is currently running as a MariaDB + # slave, as determined per SHOW SLAVE STATUS. Returns 1 if SHOW + # SLAVE STATUS creates an empty result set, 0 otherwise. + local rc + + # Check whether this machine should be slave + if ! get_read_only; then + return 1 + fi + + if get_slave_info; then + # show slave status is not empty + # Is the slave sql thread running, then we are a slave! + if [ "$slave_sql" == 'Yes' ]; then + return 0 + else + return 1 + fi + else + # "SHOW SLAVE STATUS" returns an empty set if instance is not a + # replication slave + return 1 + fi +} + +parse_slave_info() { + # Extracts field $1 from result of "SHOW SLAVE STATUS\G" from file $2 + sed -ne "s/^.* $1: \(.*\)$/\1/p" < $2 +} + +get_slave_info() { + + if [ "$master_log_file" -a "$master_host" ]; then + # variables are already defined, get_slave_info has been run before + return $OCF_SUCCESS + else + local tmpfile=$(mktemp ${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX) + + $MYSQL $MYSQL_OPTIONS_REPL \ + -e 'SHOW SLAVE STATUS\G' > $tmpfile + + if [ -s $tmpfile ]; then + master_host=$(parse_slave_info Master_Host $tmpfile) + master_user=$(parse_slave_info Master_User $tmpfile) + master_port=$(parse_slave_info Master_Port $tmpfile) + master_using_gtid=$(parse_slave_info Using_Gtid $tmpfile) + master_log_file=$(parse_slave_info Master_Log_File $tmpfile) + slave_sql=$(parse_slave_info Slave_SQL_Running $tmpfile) + slave_io=$(parse_slave_info Slave_IO_Running $tmpfile) + last_errno=$(parse_slave_info Last_Errno 
$tmpfile) + last_error=$(parse_slave_info Last_Error $tmpfile) + secs_behind=$(parse_slave_info Seconds_Behind_Master $tmpfile) + last_io_errno=$(parse_slave_info Last_IO_Errno $tmpfile) + last_io_error=$(parse_slave_info Last_IO_Error $tmpfile) + ocf_log debug "MariaDB instance running as a replication slave" + rm "$tmpfile" + else + # Instance produced an empty "SHOW SLAVE STATUS" output -- + # instance is not a slave + rm "$tmpfile" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS + fi +} + +check_slave() { + # Checks slave status + local rc new_master + + get_slave_info + rc=$? + + if [ $rc -eq 0 ]; then + + # Check normal errors + if [ $last_errno -ne 0 ]; then + ocf_exit_reason "MariaDB slave replication has failed ($last_errno): $last_error" + + exit $OCF_ERR_GENERIC + fi + + # Check IO Errors, ignore 2003 which indicates a connection failure to the master + if [ $last_io_errno -ne 0 ] && [ $last_io_errno -ne 2003 ]; then + ocf_exit_reason "MariaDB slave io has failed ($last_io_errno): $last_io_error" + + exit $OCF_ERR_GENERIC + fi + + if [ $last_io_errno -eq 2003 ]; then + ocf_log warn "MariaDB master not reachable from slave" + fi + + if [ "$slave_io" != 'Yes' ]; then + # Not necessarily a bad thing. The master may have + # temporarily shut down, and the slave may just be + # reconnecting. A warning can't hurt, though. + ocf_log warn "MariaDB Slave IO threads currently not running." + + # Sanity check, are we at least on the right master + new_master=$($CRM_ATTR_REPL_INFO --query -q) + + if [ "$master_host" != "$new_master" ]; then + # Not pointing to the right master, not good, removing the VIPs + set_reader_attr 0 + + exit $OCF_SUCCESS + fi + + fi + + if [ "$slave_sql" != 'Yes' ]; then + # We don't have a replication SQL thread running. Not a + # good thing. Try to recoved by restarting the SQL thread + # and remove reader vip. Prevent MariaDB restart. + ocf_exit_reason "MariaDB Slave SQL threads currently not running." 
+ + # Remove reader vip + set_reader_attr 0 + + # try to restart slave + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "START SLAVE" + + # Return success to prevent a restart + exit $OCF_SUCCESS + fi + + ocf_log debug "MariaDB instance running as a replication slave" + else + # Instance produced an empty "SHOW SLAVE STATUS" output -- + # instance is not a slave + # TODO: Needs to handle when get_slave_info will return too many connections error + ocf_exit_reason "check_slave invoked on an instance that is not a replication slave." + exit $OCF_ERR_GENERIC + fi +} + +set_master() { + local new_master=$($CRM_ATTR_REPL_INFO --query -q) + + # Informs the MariaDB server of the master to replicate + # from. Accepts one mandatory argument which must contain the host + # name of the new master host. The master must either be unchanged + # from the laste master the slave replicated from, or freshly + # reset with RESET MASTER. + ocf_log info "Changing MariaDB configuration to replicate from $new_master." + + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "CHANGE MASTER TO MASTER_HOST='$new_master', \ + MASTER_PORT=$OCF_RESKEY_replication_port, \ + MASTER_USER='$OCF_RESKEY_replication_user', \ + MASTER_PASSWORD='$OCF_RESKEY_replication_passwd', \ + MASTER_USE_GTID=current_pos"; +} + +unset_master(){ + # Instructs the MariaDB server to stop replicating from a master + # host. + + # If we're currently not configured to be replicating from any + # host, then there's nothing to do. But we do log a warning as + # no-one but the CRM should be touching the MariaDB master/slave + # configuration. + if ! is_slave; then + ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave" + return $OCF_SUCCESS + fi + + # Stop the slave I/O thread and wait for relay log + # processing to complete + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "STOP SLAVE IO_THREAD" + if [ $? 
-gt 0 ]; then + ocf_exit_reason "Error stopping slave IO thread" + exit $OCF_ERR_GENERIC + fi + + local tmpfile=$(mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX) + while true; do + $MYSQL $MYSQL_OPTIONS_REPL \ + -e 'SHOW PROCESSLIST\G' > $tmpfile + if grep -i 'Has read all relay log' $tmpfile >/dev/null; then + ocf_log info "MariaDB slave has finished processing relay log" + break + fi + if ! grep -q 'system user' $tmpfile; then + ocf_log info "Slave not runnig - not waiting to finish" + break + fi + ocf_log info "Waiting for MariaDB slave to finish processing relay log" + sleep 1 + done + rm -f $tmpfile + + # Now, stop all slave activity and unset the master host + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "STOP SLAVE" + if [ $? -gt 0 ]; then + ocf_exit_reason "Error stopping rest slave threads" + exit $OCF_ERR_GENERIC + fi + + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "RESET SLAVE /*!50516 ALL */;" + if [ $? -gt 0 ]; then + ocf_exit_reason "Failed to reset slave" + exit $OCF_ERR_GENERIC + fi +} + +# Start replication as slave +start_slave() { + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "START SLAVE" +} + +# Set the attribute controlling the readers VIP +set_reader_attr() { + local curr_attr_value + + curr_attr_value=$(get_reader_attr) + + if [ "$curr_attr_value" -ne "$1" ]; then + $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $1 + fi + +} + +# get the attribute controlling the readers VIP +get_reader_attr() { + local attr_value + local rc + + attr_value=$($CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q) + rc=$? + if [ "$rc" -eq "0" ]; then + echo $attr_value + else + echo -1 + fi +} + +# Determines what IP address is attached to the current host. 
# The output of the crm_attribute command looks like this:
#    scope=nodes  name=IP value=10.2.2.161
# If the ${INSTANCE_ATTR_NAME}_mysql_master_IP node attribute is not defined,
# the fallback is the output of "uname -n".
# The value printed here is what mysql_promote stores as the cluster-wide
# replication master attribute.
#
# Prints: the attribute value, or the local node name when the query fails.
get_local_ip() {
    local IP

    # "local" is kept on its own line so the status tested is crm_attribute's
    if IP=$($CRM_ATTR -l forever -n ${INSTANCE_ATTR_NAME}_mysql_master_IP -q -G 2>/dev/null); then
        echo "$IP"
    else
        uname -n
    fi
}

#######################################################################

# Functions invoked by resource manager actions

# Monitor action.
#
# Returns:
#   the status of mysql_common_status when it is not $OCF_SUCCESS,
#   $OCF_ERR_GENERIC   when check_slave or the test-table query fails,
#   $OCF_RUNNING_MASTER when the server is not read-only,
#   $OCF_SUCCESS       otherwise.
mysql_monitor() {
    local rc
    local status_loglevel="err"

    # Set loglevel to info during probe
    if ocf_is_probe; then
        status_loglevel="info"
    fi

    mysql_common_status $status_loglevel
    rc=$?

    # If status returned an error, return that immediately
    if [ $rc -ne $OCF_SUCCESS ]; then
        return $rc
    fi

    # Check if this instance is configured as a slave, and if so
    # check slave status
    if is_slave; then
        if ! check_slave; then
            return $OCF_ERR_GENERIC
        fi
    fi

    if [ -n "$OCF_RESKEY_test_table" ]; then

        # Check for test table
        ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
            -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table"
        rc=$?

        if [ $rc -ne 0 ]; then
            # Report the parameter that was actually queried; the former
            # "$test_table" is not set anywhere in this agent.
            ocf_exit_reason "Failed to select from $OCF_RESKEY_test_table"
            return $OCF_ERR_GENERIC
        fi
    fi

    # Check if we are in read-only mode and there is no master
    # with priority then we attempt to select a master
    if get_read_only && ! have_master_with_priority; then
        attempt_to_set_master
    fi

    if ! get_read_only; then
        ocf_log debug "MariaDB monitor succeeded (master)"
        return $OCF_RUNNING_MASTER
    else
        ocf_log debug "MariaDB monitor succeeded"
        return $OCF_SUCCESS
    fi
}

# Start action.
#
# Starts the server with replication threads disabled, applies the
# semi-sync/durability settings, puts the instance in read-only mode and
# either attaches it to the master named in the notify environment or
# clears replication state and flags that a first master is awaited.
#
# Returns $OCF_SUCCESS, the status of mysql_common_start / the initial
# monitor on failure, or $OCF_ERR_GENERIC.
mysql_start() {
    local rc

    if ! ocf_is_ms; then
        ocf_exit_reason "Resource is not configured as master/slave"
        return $OCF_ERR_GENERIC
    fi

    # Initialize the ReaderVIP attribute, monitor will enable it
    set_reader_attr 0

    mysql_common_status info
    if [ $? = $OCF_SUCCESS ]; then
        ocf_log info "MariaDB already running"
        return $OCF_SUCCESS
    fi

    mysql_common_prepare_dirs

    mysql_common_start --skip-slave-start --log-slave-updates
    rc=$?
    if [ $rc != $OCF_SUCCESS ]; then
        return $rc
    fi

    # Enable semi-sync
    ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
        -e "SET GLOBAL rpl_semi_sync_slave_enabled='ON', \
            rpl_semi_sync_master_enabled='ON', \
            rpl_semi_sync_master_wait_no_slave='OFF', \
            rpl_semi_sync_master_wait_point='AFTER_SYNC', \
            gtid_strict_mode='ON', \
            sync_binlog=1, \
            sync_master_info=1, \
            innodb_flush_log_at_trx_commit=1;"
    rc=$?
    if [ $rc -ne 0 ]; then
        ocf_exit_reason "Failed to enable semi-sync and set variables"
        return $OCF_ERR_GENERIC
    fi

    # We're configured as a stateful resource. We must start as
    # slave by default. At this point we don't know if the CRM has
    # already promoted a master. So, we simply start in read only
    # mode and make sure our old score is invalidated.
    set_read_only on
    ocf_promotion_score -D

    # Now, let's see whether there is a master. We might be a new
    # node that is just joining the cluster, and the CRM may have
    # promoted a master before.
    new_master_host=$(echo $OCF_RESKEY_CRM_meta_notify_master_uname|tr -d " ")
    if [ -n "$new_master_host" ] && [ "$new_master_host" != "${NODENAME}" ]; then
        # Same error handling as the post-promote notification: do not
        # start replication threads if CHANGE MASTER did not succeed.
        set_master
        if [ $? -ne 0 ]; then
            ocf_exit_reason "Failed to set master"
            return $OCF_ERR_GENERIC
        fi

        start_slave
        if [ $? -ne 0 ]; then
            ocf_exit_reason "Failed to start slave"
            return $OCF_ERR_GENERIC
        fi
    else
        ocf_log info "No MariaDB master present - clearing replication state, setting gtid in attrd, waiting for first master"
        unset_master
        set_waiting_for_first_master
    fi

    # Initial monitor action
    if [ -n "$OCF_RESKEY_test_table" ] && [ -n "$OCF_RESKEY_test_user" ] && [ -n "$OCF_RESKEY_test_passwd" ]; then
        OCF_CHECK_LEVEL=10
    fi

    mysql_monitor
    rc=$?
    if [ $rc != $OCF_SUCCESS ] && [ $rc != $OCF_RUNNING_MASTER ]; then
        ocf_exit_reason "Failed initial monitor action"
        return $rc
    fi

    ocf_log info "MariaDB started"
    return $OCF_SUCCESS
}

# Stop action: drop the promotion score and the reader attribute, then
# delegate to mysql_common_stop (whose status is returned).
mysql_stop() {
    # clear preference for becoming master
    ocf_promotion_score -D

    # Remove VIP capability
    set_reader_attr 0

    mysql_common_stop
}

# Promote action.
#
# Returns $OCF_NOT_RUNNING when the server is down, $OCF_ERR_GENERIC when
# read-only mode cannot be switched off, $OCF_SUCCESS otherwise.
mysql_promote() {
    local master_info

    if ( ! mysql_common_status err ); then
        return $OCF_NOT_RUNNING
    fi
    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
        -e "STOP SLAVE"

    set_read_only off || return $OCF_ERR_GENERIC
    # Force the master to wait for timeout period on slave disconnect
    ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
        -e "SET GLOBAL rpl_semi_sync_master_wait_no_slave='ON';"

    # Set Master Info in CIB, cluster level attribute
    master_info="$(get_local_ip)"
    ${CRM_ATTR_REPL_INFO} -v "$master_info"

    # A master can accept reads
    set_reader_attr 1

    # Clear the gtids in attrd now that there is a master
    clear_all_gtid

    return $OCF_SUCCESS
}

# Demote action.
#
# Returns $OCF_NOT_RUNNING when the server is down; otherwise the status
# of the final "ocf_promotion_score -D" call.
mysql_demote() {
    if ! mysql_common_status err; then
        return $OCF_NOT_RUNNING
    fi

    # Return to default no wait setting.
    ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \
        -e "SET GLOBAL rpl_semi_sync_master_wait_no_slave='OFF';"

    # Return master preference to default, so the cluster manager gets
    # a chance to select a new master
    ocf_promotion_score -D
}

# Notify action. Dispatches on "<notify_type>-<notify_operation>" built
# from the OCF_RESKEY_CRM_meta_notify_* environment.
#
# Returns $OCF_SUCCESS, $OCF_ERR_GENERIC when re-pointing replication or
# setting read-only fails, or the status of the last helper called for
# the pre-promote and post-demote cases.
mysql_notify() {
    local type_op
    type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}"

    ocf_log debug "Received $type_op notification."

    case "$type_op" in
        'pre-promote')
            # A master is now being promoted, remove the waiting-for-first-master flag
            clear_waiting_for_first_master
        ;;
        'post-promote')
            # The master has completed its promotion. Now is a good
            # time to check whether our replication slave is working
            # correctly.
            new_master_host=$(echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " ")
            if [ "$new_master_host" = "${NODENAME}" ]; then
                ocf_log info "This will be the new master, ignoring post-promote notification."
            else
                ocf_log info "Resetting replication, uname of master: $new_master_host"
                unset_master
                if [ $? -ne 0 ]; then
                    return $OCF_ERR_GENERIC
                fi

                set_master
                if [ $? -ne 0 ]; then
                    return $OCF_ERR_GENERIC
                fi

                start_slave
                if [ $? -ne 0 ]; then
                    ocf_exit_reason "Failed to start slave"
                    return $OCF_ERR_GENERIC
                fi
            fi
            return $OCF_SUCCESS
        ;;
        'pre-demote')
            demote_host=$(echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " ")
            # Quoted so that an empty uname list compares as "not me"
            # instead of making [ fail with a syntax error.
            if [ "$demote_host" = "${NODENAME}" ]; then
                ocf_log info "pre-demote notification for $demote_host"
                set_read_only on
                if [ $? -ne 0 ]; then
                    ocf_exit_reason "Failed to set read-only"
                    return $OCF_ERR_GENERIC
                fi

                # Must kill all existing user threads because they are still Read/write
                # in order for the slaves to complete the read of binlogs
                local tmpfile=$(mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX)
                $MYSQL $MYSQL_OPTIONS_REPL -e "SHOW PROCESSLIST" > "$tmpfile"
                # Keep only rows that start with a numeric thread id and are
                # not replication/system threads or this very query.
                for thread in $(awk '$0 !~ /Binlog Dump|system user|event_scheduler|SHOW PROCESSLIST/ && $0 ~ /^[0-9]/ {print $1}' "$tmpfile")
                do
                    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
                        -e "KILL ${thread}"
                done
                rm -f "$tmpfile"
            else
                ocf_log info "Ignoring pre-demote notification except for my own demotion."
            fi
            return $OCF_SUCCESS
        ;;
        'post-demote')
            demote_host=$(echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " ")
            if [ "$demote_host" = "${NODENAME}" ]; then
                ocf_log info "Ignoring post-demote notification for my own demotion."
                return $OCF_SUCCESS
            fi
            ocf_log info "post-demote notification for $demote_host."
            # The former master has just been gracefully demoted.
            unset_master
        ;;
        *)
            return $OCF_SUCCESS
        ;;
    esac
}

# validate-all action: bc is needed for the GTID comparisons in this agent.
mysql_validate() {
    check_binary bc
}

#######################################################################

# Actions that must work even when the environment does not validate
case "$1" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS;;
    usage|help)
        usage
        exit $OCF_SUCCESS;;
esac

mysql_common_validate
rc=$?
LSB_STATUS_STOPPED=3
if [ $rc -ne 0 ]; then
    case "$1" in
        stop) ;;
        monitor)
            mysql_common_status "info"
            if [ $? -eq $OCF_SUCCESS ]; then
                # if validation fails and pid is active, always treat this as an error
                ocf_exit_reason "environment validation failed, active pid is in unknown state."
                exit $OCF_ERR_GENERIC
            fi
            # validation failed and pid is not active, it's safe to say this instance is inactive.
            exit $OCF_NOT_RUNNING;;

        status) exit $LSB_STATUS_STOPPED;;
        *) exit $rc;;
    esac
fi

# What kind of method was invoked?
+case "$1" in + start) mysql_start;; + stop) mysql_stop;; + status) mysql_common_status err;; + monitor) mysql_monitor;; + promote) mysql_promote;; + demote) mysql_demote;; + notify) mysql_notify;; + validate-all) mysql_validate;; + + *) usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac + +# vi:sw=4:ts=4:et: diff --git a/heartbeat/mdraid b/heartbeat/mdraid new file mode 100755 index 0000000..1e6a5d0 --- /dev/null +++ b/heartbeat/mdraid @@ -0,0 +1,584 @@ +#!/bin/sh +# +# License: GNU General Public License (GPL) +# Support: users@clusterlabs.org +# +# mdraid (inspired by the Raid1 upstream resource agent) +# +# Description: Manages a Linux software RAID device on a (shared) storage medium. +# Author: Heinz Mauelshagen (heinzm@redhat.com) +# Release: Mar 2020 +# +# usage: $0 {start|stop|monitor|validate-all|usage|meta-data} +# +# EXAMPLE config file /etc/mdadm.conf (for more info: mdadm.conf(5)) +# +# AUTO -all +# ARRAY /dev/md0 UUID=4a865b55:ba27ef8d:29cd5701:6fb42799 +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_mdadm_conf_default="" +OCF_RESKEY_md_dev_default="" +OCF_RESKEY_force_stop_default="false" +OCF_RESKEY_wait_for_udev_default="true" +OCF_RESKEY_force_clones_default="false" + +: ${OCF_RESKEY_mdadm_conf=${OCF_RESKEY_mdadm_conf_default}} +: ${OCF_RESKEY_md_dev=${OCF_RESKEY_md_dev_default}} +: ${OCF_RESKEY_force_stop=${OCF_RESKEY_force_stop_default}} +: ${OCF_RESKEY_wait_for_udev=${OCF_RESKEY_wait_for_udev_default}} +: ${OCF_RESKEY_force_clones=${OCF_RESKEY_force_clones_default}} + +####################################################################### + +usage() { + cat <<-EOT + usage: $0 {start|stop|monitor|validate-all|usage|meta-data} + EOT +} + +# +# Action: provide meta-data (parameter specifications and descriptive text) +# +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="mdraid" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This resource agent manages Linux software RAID (MD) devices on +a shared storage medium ensuring that non-clustered MD arrays +are prohibited from starting cloned (which would cause data corruption +(e.g., on raid6 arrays) unless forced (see force_clones parameter). +Clustered MD RAID layouts (see below) will be discovered and allowed +cloning by default; no need to set force_clones. + +It uses mdadm(8) to start, stop, and monitor the MD devices. + +Supported clustered (i.e., clonable active-active) arrays are linear, +raid0, and clustered raid1/raid10 (i.e. mdadm(8) created with +--bitmap=clustered). + +Option: OCF_CHECK_LEVEL + +When OCF_CHECK_LEVEL is set to any number greater than 0, the standard +monitor operation (including probe) will check the array and attempt +recovery sequence to re-add devices if any failed device exists. By +default, OCF_CHECK_LEVEL is unset, and this is disabled. 
+ +</longdesc> +<shortdesc lang="en">Manages Linux software RAID (MD) devices on shared +storage</shortdesc> + +<parameters> +<parameter name="mdadm_conf" unique="0" required="1"> +<longdesc lang="en"> +The MD RAID configuration file (e.g., /etc/mdadm.conf). +</longdesc> +<shortdesc lang="en">MD config file</shortdesc> +<content type="string" default="${OCF_RESKEY_mdadm_conf_default}" /> +</parameter> + +<parameter name="md_dev" unique="0" required="1"> +<longdesc lang="en"> +MD array block device to use (e.g., /dev/md0 or /dev/md/3). +With shared access to the array's storage, this should +preferably be a clustered raid1 or raid10 array created +with --bitmap=clustered, assuming its resource will +be cloned (i.e., active-active access). + +Be sure to disable auto-assembly for the resource-managed arrays! +</longdesc> +<shortdesc lang="en">MD block device</shortdesc> +<content type="string" default="${OCF_RESKEY_md_dev_default}" /> +</parameter> + +<parameter name="force_stop" unique="0" required="0"> +<longdesc lang="en"> +If processes or kernel threads are using the array, it cannot be +stopped. We will try to stop processes, first by sending TERM and +then, if that doesn't help in $PROC_CLEANUP_TIME seconds, using KILL. +The lsof(8) program is required to get the list of array users. +Of course, the kernel threads cannot be stopped this way. +If the processes are critical for data integrity, then set this +parameter to false. Note that in that case the stop operation +will fail and the node will be fenced. +</longdesc> +<shortdesc lang="en">force stop processes using the array</shortdesc> +<content type="boolean" default="${OCF_RESKEY_force_stop_default}" /> +</parameter> + +<parameter name="wait_for_udev" unique="0" required="0"> +<longdesc lang="en"> +Wait until udevd creates a device in the start operation. On a +normally loaded host this should happen quickly, but you may be +unlucky. If you are not using udev set this to "no". 
+</longdesc> +<shortdesc lang="en">wait_for_udev</shortdesc> +<content type="boolean" default="${OCF_RESKEY_wait_for_udev_default}" /> +</parameter> + +<parameter name="force_clones" unique="0" required="0"> +<longdesc lang="en"> +Activating the same, non-clustered MD RAID array (i.e. single-host +raid1/4/5/6/10) on multiple nodes at the same time will result in +data corruption and thus is forbidden by default. + +A safe example could be an (exotic) array that is only named identically +across all nodes, but is in fact based on distinct (non-shared) storage. + +Only set this to "true" if you know what you are doing! +</longdesc> +<shortdesc lang="en">force ability to run as a clone</shortdesc> +<content type="boolean" default="${OCF_RESKEY_force_clones_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +# ocf-shellfunc ocf_is_clone() fails with meta attribute clone-max < 2. +# Checking for defined meta_clone_max reskey is sufficient until fixed. +resource_is_cloned() { + [ -z "$OCF_RESKEY_CRM_meta_clone_max" ] && return 1 || return 0; +} + +raid_validate_all() { + if [ -z "$mdadm_conf" ] ; then + ocf_exit_reason "Please set OCF_RESKEY_mdadm_conf" + return $OCF_ERR_CONFIGURED + fi + if [ ! 
-r "$mdadm_conf" ] ; then + ocf_exit_reason "Configuration file [$mdadm_conf] does not exist, or cannot be opened" + return $OCF_ERR_ARGS + fi + if [ -z "$md_dev" ] ; then + ocf_exit_reason "Please set OCF_RESKEY_md_dev to the MD RAID array block device you want to control" + return $OCF_ERR_CONFIGURED + fi + case "$md_dev" in + /dev/*) ;; + *) ocf_exit_reason "Bogus MD RAID array block device name (\"$md_dev\")" + return $OCF_ERR_ARGS;; + esac + if ocf_is_true $wait_for_udev && ! have_binary udevadm && [ "$__OCF_ACTION" = "start" ]; then + ocf_exit_reason "either install udevadm or set udev to false" + return $OCF_ERR_INSTALLED + fi + if ocf_is_true $force_stop && ! have_binary lsof; then + ocf_exit_reason "Please install lsof(8) or set force_stop to false." + return $OCF_ERR_INSTALLED + fi + if ! have_binary $MDADM; then + ocf_exit_reason "Please install mdadm(8)!" + return $OCF_ERR_INSTALLED + fi + if ! have_binary blkid; then + ocf_exit_reason "Please install blkid(8). We need it to list MD array UUIDs!" + return $OCF_ERR_INSTALLED + fi + if [ `echo $md_dev|wc -w` -gt 1 ]; then + ocf_exit_reason "Only one MD array supported" + return $OCF_ERR_CONFIGURED + fi + + return $OCF_SUCCESS +} + +# Remove ':' or '-' from uuid string to be able to compare between MD and blkid format. +uuid_flat() { + echo $1|sed 's/[-:]//g' +} + +# Global variable for devices by MD uuid. +devs="" + +# Get array uuid from mdadm_conf based on $md_dev. +get_array_uuid_by_mddev() { + local array_uuid + + array_uuid="`grep $md_dev $mdadm_conf`" + if [ -z "$array_uuid" ] + then + ocf_exit_reason "Entry for $MMDEV does not exist in $mdadm_conf!" + return $OCF_ERR_CONFIGURED + fi + + array_uuid=$(echo $array_uuid | sed 's/^.*UUID=//;s/ .*$//') + if [ -z "$array_uuid" ] + then + ocf_exit_reason "Bogus entry for $MMDEV in $mdadm_conf!" + return $OCF_ERR_CONFIGURED + fi + + echo `uuid_flat $array_uuid` + + return $OCF_SUCCESS +} + +# Use blkid to get to the subset of raid members by array uuid. 
# Prints one component device path per line; reads $md_dev and $mdadm_conf
# through get_array_uuid_by_mddev. Returns that helper's status when the
# UUID lookup fails.
list_devices_for_mddev() {
    local array_uuid blkid_array_uuid dev line rc

    array_uuid=`get_array_uuid_by_mddev`
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ]; then
        ocf_exit_reason "Failed to get UUID of $md_dev from $mdadm_conf"
        return $rc
    fi

    blkid | grep linux_raid_member 2>/dev/null | while read line
    do
        # blkid line format: "<dev>: UUID="<array uuid>" ..."
        dev=`echo $line | sed 's/: .*//'`
        blkid_array_uuid=$(echo $line | sed 's/^.* UUID="//;s/" .*$//')
        [ "`uuid_flat $blkid_array_uuid`" = "$array_uuid" ] && echo $dev
    done
}

# Check for linear or raid0 array; presumes the global $devs list is defined.
# Returns 0 when every device in $devs reports raid0 or linear, 1 otherwise.
array_is_linear_or_raid0() {
    local c=0 d

    for d in $devs
    do
        $MDADM -E $d 2>&1 | $EGREP -i "raid level : (raid0|linear)" >/dev/null 2>&1
        [ $? -eq 0 ] && c=$((c+1))
    done

    [ $c -eq `echo $devs|wc -w` ] && return 0 || return 1
}

# Return true for clustered RAID relying on all component devices reporting clustered type;
# presumes defined global devs variable with the component devices of the array.
array_is_clustered_raid() {
    local c=0 d dev_count=`echo $devs|wc -w` s
    # Check based on specific "intent bitmap : clustered" output once
    # available in mdadm output or fall back to "Cluster Name" defined
    # presuming bitmap is clustered if so.
    local strs="clustered cluster.name"

    for d in $devs
    do
        for s in $strs
        do
            $MDADM -E $d 2>&1 | grep -i "$s" >/dev/null 2>&1
            if [ $? -eq 0 ]; then
                # count each device once, whichever string matched first
                c=$((c+1))
                break
            fi
        done
    done

    [ $c -eq $dev_count ] && return 0 || return 1
}

# Check for all clustered types (linear, raid0, and clustered raid1/raid10).
is_clustered_raid() {
    array_is_clustered_raid || array_is_linear_or_raid0
}

# Assemble $md_dev from $mdadm_conf and, when wait_for_udev is true, wait
# for udev to settle. Returns the mdadm exit status.
md_assemble() {
    local rc

    $MDADM --assemble $md_dev --config="$mdadm_conf"
    rc=$?
    [ $rc -eq 0 ] && ocf_is_true $wait_for_udev && udevadm settle --exit-if-exists=$md_dev

    return $rc
}

# Try setting an MD array to readonly. Returns the mdadm exit status.
mark_readonly() {
    local rc

    $MDADM --readonly $md_dev --config="$mdadm_conf"
    rc=$?
    [ $rc -ne 0 ] && ocf_exit_reason "Failed to set $md_dev readonly (rc=$rc)"

    return $rc
}

# Try stopping an MD array in case its block device is nonexistent for some reason.
# $1: device path; its trailing digits are used as the minor number.
# Returns 1 when no number can be extracted, otherwise the mdadm exit status.
mknod_raid_stop() {
    local rc n tmp_block_file

    # first create a block device file, then try to stop the array
    n=`echo $1 | sed 's/[^0-9]*//'`
    if ! ocf_is_decimal "$n"; then
        ocf_log warn "could not get the minor device number from $1"
        return 1
    fi
    tmp_block_file="$HA_RSCTMP/${OCF_RESOURCE_INSTANCE}-`basename $1`"
    rm -f $tmp_block_file
    ocf_log info "block device file $1 missing, creating one in order to stop the array"
    # major number 9 is used for the temporary node
    mknod $tmp_block_file b 9 $n
    $MDADM --stop $tmp_block_file --config="$mdadm_conf"
    rc=$?
    rm -f $tmp_block_file
    return $rc
}

# Stop an MD array.
# $1: device path. Returns 0 when one of the stop attempts succeeds.
raid_stop_one() {
    if [ -b "$1" ]; then
        $MDADM --stop $1 --config="$mdadm_conf" && return
    else
        # newer mdadm releases can stop arrays when given the
        # basename; try that first
        $MDADM --stop `basename $1` --config="$mdadm_conf" && return
        # otherwise create a block device file
        mknod_raid_stop $1
    fi
}

# Functions show/stop any resource holding processes.

# Prints the unique PIDs (one per line) that lsof reports for $md_dev.
# The pipeline is run directly: handing it to ocf_run as one quoted word
# made ocf_run look for a command with that literal name, so no PID was
# ever printed.
get_users_pids() {
    ocf_log debug "running lsof to list $md_dev users..."
    # tail skips the lsof header line; column 2 is the PID
    lsof $md_dev 2>/dev/null | tail -n +2 | $AWK '{print $2}' | sort -u
}

# Send TERM to every process using $md_dev.
# Returns 2 when lsof found no users, otherwise the status of
# ocf_stop_processes.
stop_raid_users() {
    local pids=`get_users_pids $md_dev`

    if [ -z "$pids" ]; then
        ocf_log warn "lsof reported no users holding arrays"
        return 2
    else
        # PROC_CLEANUP_TIME is not set in the visible part of this agent; an
        # empty expansion would shift the first PID into the wait-time
        # argument. NOTE(review): 3 seconds is an assumed default - confirm.
        ocf_stop_processes TERM "${PROC_CLEANUP_TIME:-3}" $pids
    fi
}

# Log who is holding $md_dev: lsof output plus the sysfs holders directory.
showusers() {
    local disk=`basename $md_dev`

    ocf_log info "running lsof to list $disk users..."
    ocf_run -warn lsof $md_dev

    if [ -d /sys/block/$disk/holders ]; then
        ocf_log info "ls -l /sys/block/$disk/holders"
        ocf_run -warn ls -l /sys/block/$disk/holders
    fi
}

#######################################################################

#
# Action: START up the MD RAID array.
#
# Exits with $OCF_ERR_CONFIGURED when a non-clustered array would be
# started as a clone without force_clones. Returns $OCF_SUCCESS when the
# array is (already) running and healthy, $OCF_ERR_GENERIC otherwise.
#
raid_start() {
    local rc

    if resource_is_cloned && ! is_clustered_raid; then
        if ocf_is_true "$OCF_RESKEY_force_clones"; then
            ocf_log warn "Forced cloned starting non-clustered $md_dev which may lead to data corruption!"
        else
            ocf_exit_reason "Rejecting start: non-clustered MD RAID array $md_dev is NOT safe to run cloned"
            exit $OCF_ERR_CONFIGURED
        fi
    fi

    raid_monitor
    rc=$?
    # md array already online, nothing to do.
    [ $rc -eq $OCF_SUCCESS ] && return $rc

    if [ $rc -ne $OCF_NOT_RUNNING ]
    then
        # If the array is in a broken state, this agent doesn't know how to repair that.
        ocf_exit_reason "MD RAID array $md_dev in a broken state; cannot start (rc=$rc)"
        return $OCF_ERR_GENERIC
    fi

    md_assemble
    rc=$?
    if [ $rc -ne 0 ]; then
        ocf_exit_reason "Failed to assemble MD RAID array $md_dev (rc=$rc, is $mdadm_conf up-to-date?)"
        return $OCF_ERR_GENERIC
    fi

    raid_monitor
    # keep the monitor status so the message below reports the real failure
    # (previously it printed the assemble status, which is always 0 here)
    rc=$?
    [ $rc -eq $OCF_SUCCESS ] && return $OCF_SUCCESS

    ocf_exit_reason "Couldn't start MD RAID array $md_dev (rc=$rc)"

    return $OCF_ERR_GENERIC
}

#
# Action: STOP the MD RAID array
#
# Returns $OCF_SUCCESS when the array is not running before or after the
# stop attempt, $OCF_ERR_GENERIC otherwise (the array is then marked
# read-only and its users are logged).
#
raid_stop() {
    local rc

    # See if the MD device is already cleanly stopped:
    raid_monitor
    [ $? -eq $OCF_NOT_RUNNING ] && return $OCF_SUCCESS

    # Turn off raid
    if ! raid_stop_one $md_dev; then
        if ocf_is_true $force_stop; then
            stop_raid_users
            case $? in
            2) false;;
            *) raid_stop_one $md_dev;;
            esac
        else
            false
        fi
    fi
    # Status of the whole "if" above: 0 when the first stop succeeded,
    # otherwise the status of the last command run in the failure branch.
    rc=$?

    if [ $rc -ne 0 ]; then
        ocf_log warn "Couldn't stop MD RAID array $md_dev (rc=$rc)"
        showusers $md_dev
        mark_readonly $md_dev
        return $OCF_ERR_GENERIC
    fi

    raid_monitor
    rc=$?
    [ $rc -eq $OCF_NOT_RUNNING ] && return $OCF_SUCCESS

    ocf_exit_reason "MD RAID array $md_dev still active after stop command (rc=$rc)"
    return $OCF_ERR_GENERIC
}

#
# Action: monitor the MD RAID array.
#
# Returns OCF_SUCCESS when the array is listed in /proc/mdstat, mdadm
# reports it usable and one block can be read from it; OCF_NOT_RUNNING
# when the device or its mdstat entry is missing; OCF_ERR_GENERIC for a
# failed array, an mdadm error or a read error.
#
raid_monitor() {
	local TRY_READD=0 md rc pbsize

	# check if the md device exists first
	# but not if we are in the stop operation
	# device existence is important only for the running arrays
	if [ "$__OCF_ACTION" != "stop" ]; then
		if [ -h "$md_dev" ]; then
			# symlink: the kernel name is the last path
			# component of the link target
			md=$(ls $md_dev -l | $AWK -F'/' '{print $NF}')
		elif [ -b "$md_dev" ]; then
			md=${md_dev#/dev/}
		else
			ocf_log info "$md_dev is not a block device"
			return $OCF_NOT_RUNNING
		fi
	fi

	# NOTE(review): during "stop" $md is never set, so the pattern below
	# is not anchored to this array's name - confirm that is intended.
	if ! grep -e "^$md[ \t:]" /proc/mdstat >/dev/null ; then
		ocf_log info "$md not found in /proc/mdstat"
		return $OCF_NOT_RUNNING
	fi

	$MDADM --detail --test $md_dev >/dev/null 2>&1
	rc=$?
	case $rc in
	0)	;;
	1)	ocf_log warn "$md_dev has at least one failed device."
		TRY_READD=1;;
	2)	ocf_exit_reason "$md_dev has failed."
		return $OCF_ERR_GENERIC;;
	4)
		if [ "$__OCF_ACTION" = "stop" ] ; then
			# There may be a transient invalid device after
			# we stop MD due to uevent processing, the
			# original device is stopped though.
			return $OCF_NOT_RUNNING
		else
			ocf_exit_reason "mdadm failed on $md_dev."
			return $OCF_ERR_GENERIC
		fi;;
	*)	ocf_exit_reason "mdadm returned an unknown result ($rc)."
		return $OCF_ERR_GENERIC;;
	esac

	# Re-add is attempted only for arrays that are not linear/raid0, on
	# a recurring monitor with OCF_CHECK_LEVEL above 0, after mdadm
	# reported a failed device.
	if ! array_is_linear_or_raid0; then
		if [ "$__OCF_ACTION" = "monitor" ] &&
		   [ "$OCF_RESKEY_CRM_meta_interval" != 0 ] &&
		   [ $TRY_READD -eq 1 ] && [ $OCF_CHECK_LEVEL -gt 0 ]; then
			ocf_log info "Attempting recovery sequence to re-add devices on MD RAID array $md_dev:"
			$MDADM $md_dev --fail detached
			$MDADM $md_dev --remove failed
			$MDADM $md_dev --re-add missing
			# TODO: At this stage, there's nothing to actually do
			# here. Either this worked or it did not.
		fi
	fi

	# Read a single block with O_DIRECT so the check reaches the device.
	pbsize=`(blockdev --getpbsz $md_dev || stat -c "%o" $md_dev) 2>/dev/null`
	if [ -z "$pbsize" ]; then
		ocf_log warn "both blockdev and stat were unable to get the block size (will use 4k)"
		pbsize=4096 # try with 4k
	fi
	if ! dd if=$md_dev count=1 bs=$pbsize of=/dev/null iflag=direct >/dev/null 2>&1; then
		ocf_exit_reason "$md_dev: I/O error on read"
		return $OCF_ERR_GENERIC
	fi

	[ "$__OCF_ACTION" = "monitor" ] && ocf_log info "monitoring...($md_dev)"

	return $OCF_SUCCESS
}

if [ $# -ne 1 ]; then
	usage
	exit $OCF_ERR_ARGS
fi

# Process actions which are independent from validation
case "$1" in
meta-data)	meta_data
		exit $OCF_SUCCESS;;
usage)		usage
		exit $OCF_SUCCESS;;
*)		;;
esac

# Define global variables used in ^ functions.
mdadm_conf="${OCF_RESKEY_mdadm_conf}"
md_dev="${OCF_RESKEY_md_dev}"
force_stop="${OCF_RESKEY_force_stop}"
wait_for_udev="${OCF_RESKEY_wait_for_udev}"

# Validate all parameters and check for mandatory binaries present
raid_validate_all
rc=$?
[ $rc -ne $OCF_SUCCESS ] && exit $rc
# raid_validate_all already processed and result checked.
# This is top-level script code, so the action must end with "exit":
# "return" is only valid inside a function or a sourced file.
[ "$1" = "validate-all" ] && exit ${OCF_SUCCESS}

# Required by start|stop|monitor processed below
devs="`list_devices_for_mddev`"
# Test the string itself: "echo | wc -l" reports one line even for
# empty input, so counting lines can never detect an empty list.
if [ -z "$devs" ]; then
	ocf_exit_reason "No component device(s) found for MD RAID array $md_dev"
	exit $OCF_ERR_GENERIC
fi

case "$1" in
start)		raid_start;;
stop)		raid_stop;;
monitor)	raid_monitor;;
*)		usage
		exit $OCF_ERR_UNIMPLEMENTED;;
esac
rc=$?
+ +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/metadata.rng b/heartbeat/metadata.rng new file mode 100644 index 0000000..909efc2 --- /dev/null +++ b/heartbeat/metadata.rng @@ -0,0 +1,93 @@ +<grammar xmlns="http://relaxng.org/ns/structure/1.0"> + +<start><element name="resource-agent"> + <attribute name="name" /> + <optional><attribute name="version" /></optional> + + <element name="version"> <text /> </element> + <element name="longdesc"> <attribute name="lang" /> <text /> </element> + <element name="shortdesc"> <attribute name="lang" /> <text /> </element> + + <element name="parameters"> <oneOrMore> + <element name="parameter"> + <attribute name="name" /> + <optional> + <attribute name="unique"> <ref name="boolean-values" /> </attribute> + </optional> + <optional> + <attribute name="required"> <ref name="boolean-values" /> </attribute> + </optional> + + <element name="longdesc"> + <attribute name="lang" /> + <text /> + </element> + + <element name="shortdesc"> + <attribute name="lang" /> + <text /> + </element> + + <element name="content"> + <choice> + <attribute name="type"> + <choice> + <value>boolean</value> + <value>string</value> + <value>second</value> + <value>integer</value> + </choice> + </attribute> + <group> + <attribute name="type"> + <value>select</value> + </attribute> + <zeroOrMore> + <element name="option"> + <attribute name="value" /> + </element> + </zeroOrMore> + </group> + </choice> + <optional> + <attribute name="default"> <text /> </attribute> + </optional> + </element> + </element> + </oneOrMore> </element> + + <element name="actions"> <oneOrMore> + <element name="action"> + <attribute name="name" /> + <optional> + <attribute name="depth" /> + </optional> + <attribute name="timeout" /> + <optional> + <attribute name="interval" /> + </optional> + <optional> + <attribute name="start-delay" /> + </optional> + <optional> + <attribute name="role"> <ref name="role-values" /> </attribute> + 
</optional> + </element> + </oneOrMore> </element> +</element></start> + +<define name="boolean-values"> + <choice> + <value>0</value> + <value>1</value> + </choice> +</define> + +<define name="role-values"> + <choice> + <value>Promoted</value> + <value>Unpromoted</value> + </choice> +</define> + +</grammar> diff --git a/heartbeat/minio b/heartbeat/minio new file mode 100755 index 0000000..16ceeed --- /dev/null +++ b/heartbeat/minio @@ -0,0 +1,289 @@ +#!/bin/sh +# +# Resource script for Minio +# +# Description: Manages Minio as an OCF resource in +# an Active-Passive High Availability setup. +# +# Author: Ricardo Branco <tsmgeek@gmail.com> : Initial script for minio server +# License: GNU General Public License (GPL) +# +# +# usage: $0 {start|stop|status|monitor|validate-all|meta-data} +# +# The "start" arg starts Minio. +# +# The "stop" arg stops it. +# +# OCF parameters: +# OCF_RESKEY_binary +# OCF_RESKEY_conffile +# OCF_RESKEY_pidfile +# OCF_RESKEY_address +# OCF_RESKEY_volumnpaths +# +########################################################################## +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_binary_default="/opt/minio/minio-server" +OCF_RESKEY_confdir_default="/etc/minio" +OCF_RESKEY_pidfile_default="/var/run/minio.pid" +OCF_RESKEY_address_default=":9000" +OCF_RESKEY_volumepaths_default="/home/shared" + +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_confdir=${OCF_RESKEY_confdir_default}} +: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}} +: ${OCF_RESKEY_address=${OCF_RESKEY_address_default}} +: ${OCF_RESKEY_volumepaths=${OCF_RESKEY_volumepaths_default}} + +USAGE="Usage: $0 {start|stop|status|monitor|validate-all|meta-data}"; + +########################################################################## + +usage() { + echo $USAGE >&2 +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="minio" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> +This script manages Minio in an Active-Passive setup +</longdesc> +<shortdesc lang="en">OCF Resource Agent compliant Minio server script.</shortdesc> + + +<parameters> + +<parameter name="binary"> +<longdesc lang="en">The Minio server binary</longdesc> +<shortdesc lang="en">The Minio server binary</shortdesc> +<content type="string" default="${OCF_RESKEY_binary_default}" /> +</parameter> + +<parameter name="confdir"> +<longdesc lang="en"> +The Minio configuration directory path. +For example, "/etc/minio" +</longdesc> +<shortdesc lang="en">Configuration directory path</shortdesc> +<content type="string" default="${OCF_RESKEY_confdir_default}" /> +</parameter> + +<parameter name="pidfile"> +<longdesc lang="en">The Minio PID file. 
The location of the PID file.</longdesc> +<shortdesc lang="en">PID file</shortdesc> +<content type="string" default="${OCF_RESKEY_pidfile_default}" /> +</parameter> + +<parameter name="address"> +<longdesc lang="en">Address to bind minio to.</longdesc> +<shortdesc lang="en">Bind address</shortdesc> +<content type="string" default="${OCF_RESKEY_address_default}" /> +</parameter> + +<parameter name="volumepaths"> +<longdesc lang="en">The storage volumes for minio to use.</longdesc> +<shortdesc lang="en">Storage Volumes</shortdesc> +<content type="string" default="${OCF_RESKEY_volumepaths_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="60s" /> +<action name="validate-all" timeout="2s" /> +<action name="meta-data" timeout="5s" /> +</actions> + +</resource-agent> +END + exit $OCF_SUCCESS +} + +isRunning() +{ + kill -0 "$1" > /dev/null 2>&1 +} + +minio_status() +{ + if [ -f "$OCF_RESKEY_pidfile" ] + then + # Minio is probably running + PID=`head -n 1 $OCF_RESKEY_pidfile` + if [ ! -z "$PID" ] ; then + isRunning "$PID" && `ps -p $PID | grep minio-server > /dev/null 2>&1` + return $? + fi + fi + + # Minio is not running + return $OCF_NOT_RUNNING; +} + +minio_start() +{ + # make a few checks and start Minio + if ocf_is_root ; then : ; else + ocf_log err "You must be root" + exit $OCF_ERR_PERM + fi + + # if Minio is running return success + if minio_status ; then + ocf_log info "Minio server is running already" + exit $OCF_SUCCESS + fi + + # starting Minio + cmd="su - root -c \"nohup ${OCF_RESKEY_binary} server --quiet --config-dir ${OCF_RESKEY_confdir} --address ${OCF_RESKEY_address} ${OCF_RESKEY_volumepaths} >/dev/null &\"'echo \$!' " + + ocf_log debug "Starting minio: $cmd" + + eval $cmd > ${OCF_RESKEY_pidfile} + + if [ "$?" -ne 0 ]; then + ocf_log err "Minio returned error" $? 
+ exit $OCF_ERR_GENERIC + fi + + exit $OCF_SUCCESS +} + + +minio_stop() +{ + if minio_status ; then + PID=`head -n 1 $OCF_RESKEY_pidfile` + if [ ! -z "$PID" ]; then + ocf_log info "Killing Minio PID $PID" + kill $PID > /dev/null 2>&1 + if [ "$?" -eq 0 ]; then + TRIES=0 + while isRunning "$PID" && [ "$TRIES" -lt 30 ] + do + sleep 1 + ocf_log info "Minio PID $PID is still running" + TRIES=`expr $TRIES + 1` + done + isRunning "$PID" + RET=$? + if [ "$RET" -eq 0 ]; then + ocf_log info "Killing Minio PID $PID with SIGKILL" + kill -9 $PID > /dev/null 2>&1 + while isRunning "$PID" + do + sleep 1 + ocf_log info "Minio PID $PID is still running" + done + fi + else + ocf_log err "Killing Minio PID $PID FAILED" + exit $OCF_ERR_GENERIC + fi + fi + fi + + exit $OCF_SUCCESS +} + +minio_monitor() +{ + minio_status + RET=$? + + if [ "$RET" -eq 0 ]; then + PID=`head -n 1 $OCF_RESKEY_pidfile` + ocf_log debug "Minio monitor on PID $PID succeeded" + return $OCF_SUCCESS + else + ocf_log debug "Minio monitor on PID $PID failed" + return $OCF_NOT_RUNNING + fi +} + +minio_validate_all() +{ + + # check that the minio binary exists + if [ ! -x "$OCF_RESKEY_binary" ]; then + ocf_log err "Minio server binary $OCF_RESKEY_binary does not exist" + exit $OCF_ERR_INSTALLED + fi + + # check that the Minioconfig file exists + if [ ! 
-d "$OCF_RESKEY_confdir" ]; then + ocf_log err "Minio config dir $OCF_RESKEY_confdir does not exist" + exit $OCF_ERR_CONFIGURED + fi + +} + +# +# Main +# + +if [ $# -ne 1 ] +then + usage + exit $OCF_ERR_ARGS +fi + +case $1 in + start) + minio_validate_all + minio_start + ;; + + stop) + minio_stop + ;; + + status) + if minio_status; then + ocf_log info "Minio is running" + exit $OCF_SUCCESS + else + ocf_log info "Minio is stopped" + exit $OCF_NOT_RUNNING + fi + ;; + + monitor) + minio_monitor + ;; + + validate-all) + minio_validate_all + exit $OCF_SUCCESS + ;; + + meta-data|metadata|meta_data) + meta_data + ;; + + usage) + usage + exit $OCF_SUCCESS + ;; + + *) + usage + ocf_log err "$0 was called with unsupported args: $*" + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/mpathpersist.in b/heartbeat/mpathpersist.in new file mode 100644 index 0000000..8a46b99 --- /dev/null +++ b/heartbeat/mpathpersist.in @@ -0,0 +1,686 @@ +#!@BASH_SHELL@ +# +# +# OCF Resource Agent compliant PERSISTENT SCSI RESERVATION on multipath devices resource script. +# Testversion for a mpathpersist implementation for demo purposes by Andreas Thomas +# +# Copyright (c) 2017 Evgeny Nifontov, lwang@suse.com, +# Andreas Tomas<Andreas.Tomas@suse.com>, +# Zhu Lingshan<lszhu@suse.com> +# All Rights Reserved. +# +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. 
Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# +# OCF instance parameters +# OCF_RESKEY_binary +# OCF_RESKEY_devs +# OCF_RESKEY_required_devs_no +# OCF_RESKEY_reservation_type +# OCF_RESKEY_master_score_base +# OCF_RESKEY_master_score_dev_factor +# OCF_RESKEY_master_score_delay +# +# TODO +# +# 1) PROBLEM: devices which were not accessible during 'start' action, will be never registered/reserved +# TODO: 'Master' and 'Slave' registers new devs in 'monitor' action +# TODO: 'Master' reserves new devs in 'monitor' action + +#Defaults +OCF_RESKEY_mpathpersist_binary_default="mpathpersist" +OCF_RESKEY_required_devs_no_default=1 +OCF_RESKEY_reservation_type_default=1 +OCF_RESKEY_master_score_base_default=0 +OCF_RESKEY_master_score_dev_factor_default=100 +OCF_RESKEY_master_score_delay_default=30 + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# set default values +: ${OCF_RESKEY_mpathpersist_binary=${OCF_RESKEY_mpathpersist_binary_default}} # binary name for the resource +: ${OCF_RESKEY_required_devs_no=${OCF_RESKEY_required_devs_no_default}} # number of required devices +: ${OCF_RESKEY_reservation_type=${OCF_RESKEY_reservation_type_default}} # reservation type +: ${OCF_RESKEY_master_score_base=${OCF_RESKEY_master_score_base_default}} # master score base +: ${OCF_RESKEY_master_score_dev_factor=${OCF_RESKEY_master_score_dev_factor_default}} # device factor for master score +: ${OCF_RESKEY_master_score_delay=${OCF_RESKEY_master_score_delay_default}} # delay for master score + +####################################################################### + + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="mpathpersist" version="1.1"> +<version>1.0</version> + +<longdesc lang="en"> +This resource agent manages SCSI persistent reservations on multipath devices. +"mpathpersist" from multipath-tools is used, please see its documentation. +Should be used as multistate (Promotable) resource +Unpromoted registers its node id ("crm_node -i") as reservation key ( --param-sark ) on each device in the params "devs" list. +Promoted reserves all devices from params "devs" list with reservation "--prout-type" value from "reservation_type" parameter. +Please see man sg_persist(8) and mpathpersist(8) for reservation_type details. +</longdesc> +<shortdesc lang="en">Manages SCSI persistent reservations on multipath devices</shortdesc> + +<parameters> +<parameter name="binary" unique="0"> +<longdesc lang="en"> +The name of the binary that manages the resource. +</longdesc> +<shortdesc lang="en">the binary name of the resource</shortdesc> +<content type="string" default="${OCF_RESKEY_mpathpersist_binary_default}"/> +</parameter> + +<parameter name="devs" unique="0" required="1"> +<longdesc lang="en"> +Device list. 
Multiple devices can be listed with blank space as separator. +Shell wildcards are allowed. +</longdesc> +<shortdesc lang="en">device list</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="required_devs_no" unique="0" required="0"> +<longdesc lang="en"> +Minimum number of "working" devices from device list + 1) existing + 2) "mpathpersist --in --read-keys <device>" works (Return code 0) +resource actions "start","monitor","promote" and "validate-all" return "OCF_ERR_INSTALLED" +if the actual number of "working" devices is less than "required_devs_no". +resource actions "stop" and "demote" tries to remove reservations and registration keys from +all working devices, but always return "OCF_SUCCESS" +</longdesc> +<shortdesc lang="en">minimum number of working devices</shortdesc> +<content type="string" default="${OCF_RESKEY_required_devs_no_default}"/> +</parameter> + +<parameter name="reservation_type" unique="0" required="0"> +<longdesc lang="en"> +reservation type +</longdesc> +<shortdesc lang="en">reservation type</shortdesc> +<content type="string" default="${OCF_RESKEY_reservation_type_default}" /> +</parameter> + +<parameter name="master_score_base" unique="0" required="0"> +<longdesc lang="en"> +master_score_base value +"master_score_base" value is used in "master_score" calculation: +master_score = master_score_base + master_score_dev_factor * working_devs +if set to bigger value in mpathpersist resource configuration on some node, this node will be "preferred" for promoted role. 
+</longdesc> +<shortdesc lang="en">base master_score value</shortdesc> +<content type="string" default="${OCF_RESKEY_master_score_base_default}" /> +</parameter> + +<parameter name="master_score_dev_factor" unique="0" required="0"> +<longdesc lang="en"> +Working device factor in promoted calculation +each "working" device provides additional value to "master_score", +so the node that sees more devices will be preferred for the "Promoted"-role +Setting it to 0 will disable this behavior. +</longdesc> +<shortdesc lang="en">working device factor in master_score calculation</shortdesc> +<content type="string" default="${OCF_RESKEY_master_score_dev_factor_default}" /> +</parameter> + +<parameter name="master_score_delay" unique="0" required="0"> +<longdesc lang="en"> +promoted/unpromoted decreases/increases its master_score after delay of "master_score_delay" seconds +so if some device gets inaccessible, the unpromoted decreases its promoted first and the resource will no be watched +and after this device reappears again the promoted increases its master_score first +this can work only if the master_score_delay is bigger then monitor interval on both promoted and unpromoted +Setting it to 0 will disable this behavior. +</longdesc> +<shortdesc lang="en">master_score decrease/increase delay time</shortdesc> +<content type="string" default="${OCF_RESKEY_master_score_delay_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="30s" /> +<action name="promote" timeout="30s" /> +<action name="demote" timeout="30s" /> +<action name="notify" timeout="30s" /> +<action name="stop" timeout="30s" /> +<action name="monitor" depth="0" timeout="20s" interval="29s" role="Unpromoted" /> +<action name="monitor" depth="0" timeout="20s" interval="60s" role="Promoted" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="30s" /> +</actions> +</resource-agent> +END + + exit $OCF_SUCCESS +} + +mpathpersist_init() { + + if ! 
ocf_is_root ; then + ocf_log err "You must be root to perform this operation." + exit $OCF_ERR_PERM + fi + + MPATHPERSIST="${OCF_RESKEY_mpathpersist_binary}" + check_binary $MPATHPERSIST + + ROLE=$OCF_RESKEY_CRM_meta_role + NOW=$(date +%s) + + RESOURCE="${OCF_RESOURCE_INSTANCE}" + MASTER_SCORE_VAR_NAME="master-${OCF_RESOURCE_INSTANCE}" + PENDING_VAR_NAME="pending-$MASTER_SCORE_VAR_NAME" + + #only works with corocync + CRM_NODE="${HA_SBIN_DIR}/crm_node" + NODE_ID_DEC=$($CRM_NODE -i) + + NODE=$($CRM_NODE -l | $GREP -w ^$NODE_ID_DEC) + NODE=${NODE#$NODE_ID_DEC } + NODE=${NODE% *} + + MASTER_SCORE_ATTRIBUTE="${HA_SBIN_DIR}/crm_attribute --lifetime=reboot --name=$MASTER_SCORE_VAR_NAME --node=$NODE" + PENDING_ATTRIBUTE="${HA_SBIN_DIR}/crm_attribute --lifetime=reboot --name=$PENDING_VAR_NAME --node=$NODE" + + NODE_ID_HEX=$(printf '0x%x' $NODE_ID_DEC) + + if [ -z "$NODE_ID_HEX" ]; then + ocf_log err "Couldn't get node id with \"$CRM_NODE\"" + exit $OCF_ERR_INSTALLED + fi + + ocf_log debug "$RESOURCE: NODE:$NODE, ROLE:$ROLE, NODE_ID DEC:$NODE_ID_DEC HEX:$NODE_ID_HEX" + + DEVS="${OCF_RESKEY_devs}" + REQUIRED_DEVS_NO="${OCF_RESKEY_required_devs_no}" + RESERVATION_TYPE="${OCF_RESKEY_reservation_type}" + MASTER_SCORE_BASE="${OCF_RESKEY_master_score_base}" + MASTER_SCORE_DEV_FACTOR="${OCF_RESKEY_master_score_dev_factor}" + MASTER_SCORE_DELAY="${OCF_RESKEY_master_score_delay}" + + ocf_log debug "$RESOURCE: DEVS=$DEVS" + ocf_log debug "$RESOURCE: REQUIRED_DEVS_NO=$REQUIRED_DEVS_NO" + ocf_log debug "$RESOURCE: RESERVATION_TYPE=$RESERVATION_TYPE" + ocf_log debug "$RESOURCE: MASTER_SCORE_BASE=$MASTER_SCORE_BASE" + ocf_log debug "$RESOURCE: MASTER_SCORE_DEV_FACTOR=$MASTER_SCORE_DEV_FACTOR" + ocf_log debug "$RESOURCE: MASTER_SCORE_DELAY=$MASTER_SCORE_DELAY" + + #expand path wildcards + DEVS=$(echo $DEVS) + + if [ -z "$DEVS" ]; then + ocf_log err "\"devs\" not defined" + exit $OCF_ERR_INSTALLED + fi + + mpathpersist_check_devs + mpathpersist_get_status +} + +mpathpersist_action_usage() 
{ + cat <<END + usage: $0 {start|stop|monitor|validate-all|promote|demote|notify|meta-data} + + Expects to have a fully populated OCF RA-compliant environment set. +END +} + +mpathpersist_get_status() { + + unset WORKING_DEVS[*] + + for dev in ${EXISTING_DEVS[*]} + do + READ_KEYS=`$MPATHPERSIST --in --read-keys $dev 2>&1` + if [ $? -eq 0 ]; then + WORKING_DEVS+=($dev) + echo "$READ_KEYS" | $GREP -w $NODE_ID_HEX\$ >/dev/null + if [ $? -eq 0 ]; then + REGISTERED_DEVS+=($dev) + + READ_RESERVATION=`$MPATHPERSIST --in --read-reservation $dev 2>&1` + if [ $? -eq 0 ]; then + echo "$READ_RESERVATION" | $GREP -w $NODE_ID_HEX\$ >/dev/null + if [ $? -eq 0 ]; then + RESERVED_DEVS+=($dev) + fi + + reservation_key=`echo $READ_RESERVATION | $GREP -o 'Key = 0x[0-9a-f]*' | $GREP -o '0x[0-9a-f]*'` + if [ -n "$reservation_key" ]; then + DEVS_WITH_RESERVATION+=($dev) + RESERVATION_KEYS+=($reservation_key) + fi + fi + fi + fi + done + + WORKING_DEVS_NO=${#WORKING_DEVS[*]} + + ocf_log debug "$RESOURCE: working devices: `mpathpersist_echo_array ${WORKING_DEVS[*]}`" + ocf_log debug "$RESOURCE: number of working devices: $WORKING_DEVS_NO" + + ocf_log debug "$RESOURCE: registered devices: `mpathpersist_echo_array ${REGISTERED_DEVS[*]}`" + ocf_log debug "$RESOURCE: reserved devices: `mpathpersist_echo_array ${RESERVED_DEVS[*]}`" + ocf_log debug "$RESOURCE: devices with reservation: `mpathpersist_echo_array ${DEVS_WITH_RESERVATION[*]}`" + ocf_log debug "$RESOURCE: reservation keys: `mpathpersist_echo_array ${RESERVATION_KEYS[*]}`" + + MASTER_SCORE=$(($MASTER_SCORE_BASE + $MASTER_SCORE_DEV_FACTOR*$WORKING_DEVS_NO)) + ocf_log debug "$RESOURCE: master_score: $MASTER_SCORE_BASE + $MASTER_SCORE_DEV_FACTOR*$WORKING_DEVS_NO = $MASTER_SCORE" + +} + +mpathpersist_check_devs() { + + for dev in $DEVS + do + if [ -e "$dev" ]; then + EXISTING_DEVS+=($dev) + fi + done + + EXISTING_DEVS_NO=${#EXISTING_DEVS[*]} + if [ $EXISTING_DEVS_NO -lt $REQUIRED_DEVS_NO ]; then + ocf_log err "Number of existing 
devices=$EXISTING_DEVS_NO less then required_devs_no=$REQUIRED_DEVS_NO" + exit $OCF_ERR_INSTALLED + fi + +} + +mpathpersist_is_registered() { + for registered_dev in ${REGISTERED_DEVS[*]} + do + if [ "$registered_dev" == "$1" ]; then + return 0 + fi + done + return 1 +} + +mpathpersist_get_reservation_key() { + for array_index in ${!DEVS_WITH_RESERVATION[*]} + do + if [ "${DEVS_WITH_RESERVATION[$array_index]}" == "$1" ]; then + echo ${RESERVATION_KEYS[$array_index]} + return 0 + fi + done + echo "" +} + +mpathpersist_echo_array() { + str_count=0 + arr_str="" + + for str in "$@" + do + arr_str="$arr_str[$str_count]:$str " + str_count=$(($str_count+1)) + done + echo $arr_str +} + +mpathpersist_parse_act_pending() { + + ACT_PENDING_TS=0 + ACT_PENDING_SCORE=0 + + if [ -n "$ACT_PENDING" ]; then + ACT_PENDING_TS=${ACT_PENDING%%_*} + ACT_PENDING_SCORE=${ACT_PENDING##*_} + fi +} + +mpathpersist_clear_pending() { + if [ -n "$ACT_PENDING" ]; then + DO_PENDING_UPDATE="YES" + NEW_PENDING="" + fi +} + +mpathpersist_new_master_score() { + DO_MASTER_SCORE_UPDATE="YES" + NEW_MASTER_SCORE=$1 +} + +mpathpersist_new_pending() { + DO_PENDING_UPDATE="YES" + NEW_PENDING=$1 +} + + +# Functions invoked by resource manager actions + +mpathpersist_action_start() { + + ocf_run $MASTER_SCORE_ATTRIBUTE --update=$MASTER_SCORE + ocf_run $PENDING_ATTRIBUTE --update="" + + if [ $WORKING_DEVS_NO -lt $REQUIRED_DEVS_NO ]; then + ocf_log err "$RESOURCE: Number of working devices=$WORKING_DEVS_NO less then required_devs_no=$REQUIRED_DEVS_NO" + exit $OCF_ERR_GENERIC + fi + + for dev in ${WORKING_DEVS[*]} + do + if mpathpersist_is_registered $dev ; then + : OK + else + ocf_run $MPATHPERSIST --out --register --param-sark=$NODE_ID_HEX $dev + if [ $? 
-ne $OCF_SUCCESS ] + then + return $OCF_ERR_GENERIC + fi + fi + done + + return $OCF_SUCCESS +} + +mpathpersist_action_stop() { + + if [ ${#REGISTERED_DEVS[*]} -eq 0 ]; then + ocf_log debug "$RESOURCE stop: already no registrations" + else + # Clear preference for becoming master + ocf_run $MASTER_SCORE_ATTRIBUTE --delete + ocf_run $PENDING_ATTRIBUTE --delete + + for dev in ${REGISTERED_DEVS[*]} + do + ocf_run $MPATHPERSIST --out --register --param-rk=$NODE_ID_HEX $dev + done + fi + + return $OCF_SUCCESS +} + +mpathpersist_action_monitor() { + + ACT_MASTER_SCORE=`$MASTER_SCORE_ATTRIBUTE --query --quiet 2>&1` + ocf_log debug "$RESOURCE monitor: ACT_MASTER_SCORE=$ACT_MASTER_SCORE" + + ACT_PENDING=`$PENDING_ATTRIBUTE --query --quiet 2>&1` + ocf_log debug "$RESOURCE monitor: ACT_PENDING=$ACT_PENDING" + + mpathpersist_parse_act_pending + ocf_log debug "$RESOURCE monitor: ACT_PENDING_TS=$ACT_PENDING_TS" + ocf_log debug "$RESOURCE monitor: ACT_PENDING_VAL=$ACT_PENDING_SCORE" + + ocf_log debug "$MASTER_SCORE, $ACT_MASTER_SCORE, $ROLE" + + DO_MASTER_SCORE_UPDATE="NO" + DO_PENDING_UPDATE="NO" + if [ -n "$ACT_MASTER_SCORE" ] + then + if [ $ACT_MASTER_SCORE -eq $MASTER_SCORE ]; then + mpathpersist_clear_pending + else + case $ROLE in + Master) + if [ $MASTER_SCORE -lt $ACT_MASTER_SCORE ]; then + if [ -n "$ACT_PENDING" ] + then + if [ $(($NOW-$ACT_PENDING_TS-$MASTER_SCORE_DELAY)) -ge 0 ]; then + mpathpersist_new_master_score $MASTER_SCORE + mpathpersist_clear_pending + fi + else + if [ $MASTER_SCORE_DELAY -eq 0 ]; then + mpathpersist_new_master_score $MASTER_SCORE + mpathpersist_clear_pending + else + mpathpersist_new_pending "${NOW}_${MASTER_SCORE}" + fi + fi + else + mpathpersist_new_master_score $MASTER_SCORE + mpathpersist_clear_pending + fi + ;; + + Slave) + if [ $MASTER_SCORE -gt $ACT_MASTER_SCORE ]; then + if [ -n "$ACT_PENDING" ]; then + if [ $(($NOW-$ACT_PENDING_TS-$MASTER_SCORE_DELAY)) -ge 0 ]; then + mpathpersist_new_master_score $MASTER_SCORE + 
mpathpersist_clear_pending + fi + else + if [ $MASTER_SCORE_DELAY -eq 0 ]; then + mpathpersist_new_master_score $MASTER_SCORE + mpathpersist_clear_pending + else + mpathpersist_new_pending "${NOW}_${MASTER_SCORE}" + fi + fi + else + mpathpersist_new_master_score $MASTER_SCORE + mpathpersist_clear_pending + fi + ;; + + *) + ;; + + esac + fi + fi + + if [ $DO_MASTER_SCORE_UPDATE == "YES" ]; then + ocf_run $MASTER_SCORE_ATTRIBUTE --update=$NEW_MASTER_SCORE + fi + + if [ $DO_PENDING_UPDATE == "YES" ]; then + ocf_run $PENDING_ATTRIBUTE --update=$NEW_PENDING + fi + + if [ ${#REGISTERED_DEVS[*]} -eq 0 ]; then + ocf_log debug "$RESOURCE monitor: no registrations" + return $OCF_NOT_RUNNING + fi + + if [ ${#RESERVED_DEVS[*]} -eq ${#WORKING_DEVS[*]} ]; then + return $OCF_RUNNING_MASTER + fi + + if [ ${#REGISTERED_DEVS[*]} -eq ${#WORKING_DEVS[*]} ]; then + if [ $RESERVATION_TYPE -eq 7 ] || [ $RESERVATION_TYPE -eq 8 ]; then + if [ ${#DEVS_WITH_RESERVATION[*]} -gt 0 ]; then + return $OCF_RUNNING_MASTER + else + return $OCF_SUCCESS + fi + else + return $OCF_SUCCESS + fi + fi + + ocf_log err "$RESOURCE monitor: unexpected state" + + return $OCF_ERR_GENERIC +} + +mpathpersist_action_promote() { + + if [ ${#RESERVED_DEVS[*]} -gt 0 ]; then + ocf_log info "$RESOURCE promote: already master" + return $OCF_SUCCESS + fi + + for dev in ${WORKING_DEVS[*]} + do + reservation_key=`mpathpersist_get_reservation_key $dev` + case $RESERVATION_TYPE in + 1|3|5|6) + if [ -z "$reservation_key" ]; then + ocf_run $MPATHPERSIST --out --reserve --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev + if [ $? -ne $OCF_SUCCESS ]; then + return $OCF_ERR_GENERIC + fi + else + ocf_run $MPATHPERSIST --out --preempt --param-sark=$reservation_key --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev + if [ $? 
-ne $OCF_SUCCESS ]; then + return $OCF_ERR_GENERIC + fi + fi + ;; + + 7|8) + if [ -z "$reservation_key" ]; then + ocf_run $MPATHPERSIST --out --reserve --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev + if [ $? -ne $OCF_SUCCESS ] + then + return $OCF_ERR_GENERIC + fi + else + ocf_log info "$RESOURCE promote: there already exist an reservation holder, all registrants become reservation holders" + return $OCF_SUCCESS + fi + ;; + + *) + return $OCF_ERR_ARGS + ;; + + esac + done + + return $OCF_SUCCESS +} + +mpathpersist_action_demote() { + case $RESERVATION_TYPE in + 1|3|5|6) + if [ ${#RESERVED_DEVS[*]} -eq 0 ]; then + ocf_log info "$RESOURCE demote: already slave" + return $OCF_SUCCESS + fi + + for dev in ${RESERVED_DEVS[*]} + do + ocf_run $MPATHPERSIST --out --release --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev + if [ $? -ne $OCF_SUCCESS ]; then + return $OCF_ERR_GENERIC + fi + done + ;; + + 7|8) #in case of 7/8, --release won't release the reservation unless unregister the key. + if [ ${#REGISTERED_DEVS[*]} -eq 0 ]; then + ocf_log info "$RESOURCE demote: already slave" + return $OCF_SUCCESS + fi + + for dev in ${REGISTERED_DEVS[*]} + do + ocf_run $MPATHPERSIST --out --register --param-rk=$NODE_ID_HEX --param-sark=0 $dev + if [ $? 
-ne $OCF_SUCCESS ]; then + return $OCF_ERR_GENERIC + fi + done + ;; + + *) + return $OCF_ERR_ARGS + ;; + esac + + return $OCF_SUCCESS +} + +mpathpersist_action_notify() { + local n_type="$OCF_RESKEY_CRM_meta_notify_type" + local n_op="$OCF_RESKEY_CRM_meta_notify_operation" + set -- $OCF_RESKEY_CRM_meta_notify_active_resource + local n_active="$#" + set -- $OCF_RESKEY_CRM_meta_notify_stop_resource + local n_stop="$#" + set -- $OCF_RESKEY_CRM_meta_notify_start_resource + local n_start="$#" + + ocf_log debug "$RESOURCE notify: $n_type for $n_op - counts: active $n_active - starting $n_start - stopping $n_stop" + + return $OCF_SUCCESS +} + +mpathpersist_action_validate_all () { + if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then + if [ "$OCF_RESKEY_CRM_meta_master_max" != "1" ] && [ "$RESERVATION_TYPE" != "7" ] && [ "$RESERVATION_TYPE" != "8" ]; then + ocf_log err "Master options misconfigured." + exit $OCF_ERR_CONFIGURED + fi + fi + + return $OCF_SUCCESS +} + +if [ $# -ne 1 ]; then + echo "Incorrect parameter count." + mpathpersist_action_usage + exit $OCF_ERR_ARGS +fi + +ACTION=$1 +case $ACTION in + meta-data) + meta_data + ;; + + validate-all) + mpathpersist_init + mpathpersist_action_validate_all + ;; + + start|promote|monitor|stop|demote) + ocf_log debug "$RESOURCE: starting action \"$ACTION\"" + mpathpersist_init + if [ "$__OCF_ACTION" = "start" ]; then + OCF_CHECK_LEVEL=10 + mpathpersist_action_validate_all + fi + mpathpersist_action_$ACTION + exit $? + ;; + + notify) + mpathpersist_action_notify + exit $? 
+ ;; + + usage|help) + mpathpersist_action_usage + exit $OCF_SUCCESS + ;; + + *) + mpathpersist_action_usage + exit $OCF_ERR_ARGS + ;; + + esac diff --git a/heartbeat/mysql b/heartbeat/mysql new file mode 100755 index 0000000..1df2fc0 --- /dev/null +++ b/heartbeat/mysql @@ -0,0 +1,1074 @@ +#!/bin/sh +# +# +# MySQL +# +# Description: Manages a MySQL database as Linux-HA resource +# +# Authors: Alan Robertson: DB2 Script +# Jakub Janczak: rewrite as MySQL +# Andrew Beekhof: cleanup and import +# Sebastian Reitenbach: add OpenBSD defaults, more cleanup +# Narayan Newton: add Gentoo/Debian defaults +# Marian Marinov, Florian Haas: add replication capability +# Yves Trudeau, Baron Schwartz: add VIP support and improve replication +# +# Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# +# (c) 2002-2005 International Business Machines, Inc. +# 2005-2010 Linux-HA contributors +# +# An example usage in /etc/ha.d/haresources: +# node1 10.0.0.170 mysql +# +# See usage() function below for more details... +# +# OCF instance parameters: +# OCF_RESKEY_binary +# OCF_RESKEY_client_binary +# OCF_RESKEY_config +# OCF_RESKEY_datadir +# OCF_RESKEY_user +# OCF_RESKEY_group +# OCF_RESKEY_test_table +# OCF_RESKEY_test_user +# OCF_RESKEY_test_passwd +# OCF_RESKEY_enable_creation +# OCF_RESKEY_additional_parameters +# OCF_RESKEY_log +# OCF_RESKEY_pid +# OCF_RESKEY_socket +# OCF_RESKEY_replication_user +# OCF_RESKEY_replication_passwd +# OCF_RESKEY_replication_port +# OCF_RESKEY_max_slave_lag +# OCF_RESKEY_evict_outdated_slaves +# OCF_RESKEY_reader_attribute + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +. 
${OCF_FUNCTIONS_DIR}/mysql-common.sh +####################################################################### + +usage() { + cat <<UEND +usage: $0 (start|stop|validate-all|meta-data|monitor|promote|demote|notify) + +$0 manages a MySQL Database as an HA resource. + +The 'start' operation starts the database. +The 'stop' operation stops the database. +The 'status' operation reports whether the database is running +The 'monitor' operation reports whether the database seems to be working +The 'promote' operation makes this mysql server run as master +The 'demote' operation makes this mysql server run as slave +The 'validate-all' operation reports whether the parameters are valid + +UEND +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="mysql" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for MySQL. +May manage a standalone MySQL database, a clone set with externally +managed replication, or a complete master/slave replication setup. +Note, when master/slave replication is in use, the resource must +be setup to use notifications. Set 'notify=true' in the metadata +attributes when defining a MySQL master/slave instance. + +While managing replication, the default behavior is to use uname -n +values in the change master to command. Other IPs can be specified +manually by adding a node attribute \${INSTANCE_ATTR_NAME}_mysql_master_IP +giving the IP to use for replication. For example, if the mysql primitive +you are using is p_mysql, the attribute to set will be +p_mysql_mysql_master_IP. 
+</longdesc> +<shortdesc lang="en">Manages a MySQL database instance</shortdesc> +<parameters> + +<parameter name="binary" unique="0" required="0"> +<longdesc lang="en"> +Location of the MySQL server binary +</longdesc> +<shortdesc lang="en">MySQL server binary</shortdesc> +<content type="string" default="${OCF_RESKEY_binary_default}" /> +</parameter> + +<parameter name="client_binary" unique="0" required="0"> +<longdesc lang="en"> +Location of the MySQL client binary +</longdesc> +<shortdesc lang="en">MySQL client binary</shortdesc> +<content type="string" default="${OCF_RESKEY_client_binary_default}" /> +</parameter> + +<parameter name="config" unique="0" required="0"> +<longdesc lang="en"> +Configuration file +</longdesc> +<shortdesc lang="en">MySQL config</shortdesc> +<content type="string" default="${OCF_RESKEY_config_default}" /> +</parameter> + +<parameter name="datadir" unique="0" required="0"> +<longdesc lang="en"> +Directory containing databases +</longdesc> +<shortdesc lang="en">MySQL datadir</shortdesc> +<content type="string" default="${OCF_RESKEY_datadir_default}" /> +</parameter> + +<parameter name="user" unique="0" required="0"> +<longdesc lang="en"> +User running MySQL daemon +</longdesc> +<shortdesc lang="en">MySQL user</shortdesc> +<content type="string" default="${OCF_RESKEY_user_default}" /> +</parameter> + +<parameter name="group" unique="0" required="0"> +<longdesc lang="en"> +Group running MySQL daemon (for logfile and directory permissions) +</longdesc> +<shortdesc lang="en">MySQL group</shortdesc> +<content type="string" default="${OCF_RESKEY_group_default}"/> +</parameter> + +<parameter name="log" unique="0" required="0"> +<longdesc lang="en"> +The logfile to be used for mysqld. +</longdesc> +<shortdesc lang="en">MySQL log file</shortdesc> +<content type="string" default="${OCF_RESKEY_log_default}"/> +</parameter> + +<parameter name="pid" unique="0" required="0"> +<longdesc lang="en"> +The pidfile to be used for mysqld. 
+</longdesc> +<shortdesc lang="en">MySQL pid file</shortdesc> +<content type="string" default="${OCF_RESKEY_pid_default}"/> +</parameter> + +<parameter name="socket" unique="0" required="0"> +<longdesc lang="en"> +The socket to be used for mysqld. +</longdesc> +<shortdesc lang="en">MySQL socket</shortdesc> +<content type="string" default="${OCF_RESKEY_socket_default}"/> +</parameter> + +<parameter name="test_table" unique="0" required="0"> +<longdesc lang="en"> +Table to be tested in monitor statement (in database.table notation) +</longdesc> +<shortdesc lang="en">MySQL test table</shortdesc> +<content type="string" default="${OCF_RESKEY_test_table_default}" /> +</parameter> + +<parameter name="test_user" unique="0" required="0"> +<longdesc lang="en"> +MySQL test user, must have select privilege on test_table +</longdesc> +<shortdesc lang="en">MySQL test user</shortdesc> +<content type="string" default="${OCF_RESKEY_test_user_default}" /> +</parameter> + +<parameter name="test_passwd" unique="0" required="0"> +<longdesc lang="en"> +MySQL test user password +</longdesc> +<shortdesc lang="en">MySQL test user password</shortdesc> +<content type="string" default="${OCF_RESKEY_test_passwd_default}" /> +</parameter> + +<parameter name="enable_creation" unique="0" required="0"> +<longdesc lang="en"> +If the MySQL database does not exist, it will be created +</longdesc> +<shortdesc lang="en">Create the database if it does not exist</shortdesc> +<content type="boolean" default="${OCF_RESKEY_enable_creation_default}"/> +</parameter> + +<parameter name="additional_parameters" unique="0" required="0"> +<longdesc lang="en"> +Additional parameters which are passed to the mysqld on startup. +(e.g. 
--skip-external-locking or --skip-grant-tables) +</longdesc> +<shortdesc lang="en">Additional parameters to pass to mysqld</shortdesc> +<content type="string" default="${OCF_RESKEY_additional_parameters_default}"/> +</parameter> + +<parameter name="replication_user" unique="0" required="0"> +<longdesc lang="en"> +MySQL replication user. This user is used for starting and stopping +MySQL replication, for setting and resetting the master host, and for +setting and unsetting read-only mode. Because of that, this user must +have SUPER, REPLICATION SLAVE, REPLICATION CLIENT, PROCESS and RELOAD +privileges on all nodes within the cluster. Mandatory if you define a +master-slave resource. +</longdesc> +<shortdesc lang="en">MySQL replication user</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_user_default}" /> +</parameter> + +<parameter name="replication_passwd" unique="0" required="0"> +<longdesc lang="en"> +MySQL replication password. Used for replication client and slave. +Mandatory if you define a master-slave resource. +</longdesc> +<shortdesc lang="en">MySQL replication user password</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_passwd_default}" /> +</parameter> + +<parameter name="replication_port" unique="0" required="0"> +<longdesc lang="en"> +The port on which the Master MySQL instance is listening. +</longdesc> +<shortdesc lang="en">MySQL replication port</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_port_default}" /> +</parameter> + +<parameter name="replication_require_ssl" unique="0" required="0"> +<longdesc lang="en"> +Enables SSL connection to local MySQL service for replication user. +i.e. if REQUIRE SSL for replication user in MySQL set, this should be set to "true". 
+</longdesc> +<shortdesc lang="en">MySQL replication require ssl</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_require_ssl_default}" /> +</parameter> + +<parameter name="replication_master_ssl_ca" unique="0" required="0"> +<longdesc lang="en"> +The SSL CA certificate to be used for replication over SSL. +</longdesc> +<shortdesc lang="en">MySQL replication SSL CA certificate</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_master_ssl_ca_default}" /> +</parameter> + +<parameter name="replication_master_ssl_cert" unique="0" required="0"> +<longdesc lang="en"> +The SSL CA certificate to be used for replication over SSL. +</longdesc> +<shortdesc lang="en">MySQL replication SSL certificate</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_master_ssl_cert_default}" /> +</parameter> + +<parameter name="replication_master_ssl_key" unique="0" required="0"> +<longdesc lang="en"> +The SSL certificate key to be used for replication over SSL. +</longdesc> +<shortdesc lang="en">MySQL replication SSL certificate key</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_master_ssl_key_default}" /> +</parameter> + +<parameter name="max_slave_lag" unique="0" required="0"> +<longdesc lang="en"> +The maximum number of seconds a replication slave is allowed to lag +behind its master. Do not set this to zero. What the cluster manager +does in case a slave exceeds this maximum lag is determined by the +evict_outdated_slaves parameter. +</longdesc> +<shortdesc lang="en">Maximum time (seconds) a MySQL slave is allowed +to lag behind a master</shortdesc> +<content type="integer" default="${OCF_RESKEY_max_slave_lag_default}"/> +</parameter> + +<parameter name="evict_outdated_slaves" unique="0" required="0"> +<longdesc lang="en"> +If set to true, any slave which is more than max_slave_lag seconds +behind the master has its MySQL instance shut down. 
If this parameter +is set to false in a primitive or clone resource, it is simply +ignored. If set to false in a master/slave resource, then exceeding +the maximum slave lag will merely push down the master preference so +the lagging slave is never promoted to the new master. +</longdesc> +<shortdesc lang="en">Determines whether to shut down badly lagging +slaves</shortdesc> +<content type="boolean" default="${OCF_RESKEY_evict_outdated_slaves_default}" /> +</parameter> + +<parameter name="reader_attribute" unique="1" required="0"> +<longdesc lang="en"> +An attribute that the RA can manage to specify whether a node +can be read from. This node attribute will be 1 if it's fine to +read from the node, and 0 otherwise (for example, when a slave +has lagged too far behind the master). + +A typical example for the use of this attribute would be to tie +a set of IP addresses to MySQL slaves that can be read from. + +This parameter is only meaningful in master/slave set configurations. +</longdesc> +<shortdesc lang="en">Sets the node attribute that determines +whether a node is usable for clients to read from.</shortdesc> +<content type="string" default="${OCF_RESKEY_reader_attribute_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="120s" /> +<action name="stop" timeout="120s" /> +<action name="status" timeout="60s" /> +<action name="monitor" depth="0" timeout="30s" interval="20s" /> +<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" /> +<action name="monitor" role="Unpromoted" depth="0" timeout="30s" interval="30s" /> +<action name="promote" timeout="120s" /> +<action name="demote" timeout="120s" /> +<action name="notify" timeout="90s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +# Convenience functions + +set_read_only() { + # Sets or unsets read-only mode. Accepts one boolean as its + # optional argument. 
If invoked without any arguments, defaults to + # enabling read only mode. Should only be set in master/slave + # setups. + # Returns $OCF_SUCCESS if the operation succeeds, or + # $OCF_ERR_GENERIC if it fails. + local ro_val + if ocf_is_true $1; then + ro_val="on" + else + ro_val="off" + fi + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "SET GLOBAL read_only=${ro_val}" +} + +get_read_only() { + # Check if read-only is set + local read_only_state + + read_only_state=`$MYSQL $MYSQL_OPTIONS_REPL \ + --skip-column-names -e "SHOW VARIABLES LIKE 'read_only'" | awk '{print $2}'` + + if [ "$read_only_state" = "ON" ]; then + return 0 + else + return 1 + fi +} + +is_slave() { + # Determine whether the machine is currently running as a MySQL + # slave, as determined per SHOW SLAVE STATUS. Returns 1 if SHOW + # SLAVE STATUS creates an empty result set, 0 otherwise. + local rc + local tmpfile + + # Check whether this machine should be slave + if ! ocf_is_ms || ! get_read_only; then + return 1 + fi + + get_slave_info + rc=$? + rm -f $tmpfile + + if [ $rc -eq 0 ]; then + # show slave status is not empty + # Is there a master_log_file defined? (master_log_file is deleted + # by reset slave + if [ "$master_log_file" ]; then + return 0 + else + return 1 + fi + else + # "SHOW SLAVE STATUS" returns an empty set if instance is not a + # replication slave + return 1 + fi + +} + +parse_slave_info() { + # Extracts field $1 from result of "SHOW SLAVE STATUS\G" from file $2 + sed -ne "s/^.* $1: \(.*\)$/\1/p" < $2 +} + +get_slave_info() { + # Warning: this sets $tmpfile and LEAVE this file! You must delete it after use! 
+ local mysql_options + + if [ "$master_log_file" -a "$master_host" ]; then + # variables are already defined, get_slave_info has been run before + return $OCF_SUCCESS + else + tmpfile=`mktemp ${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX` + + $MYSQL $MYSQL_OPTIONS_REPL \ + -e 'SHOW SLAVE STATUS\G' > $tmpfile + + if [ -s $tmpfile ]; then + master_host=`parse_slave_info Master_Host $tmpfile` + master_user=`parse_slave_info Master_User $tmpfile` + master_port=`parse_slave_info Master_Port $tmpfile` + master_log_file=`parse_slave_info Master_Log_File $tmpfile` + master_log_pos=`parse_slave_info Read_Master_Log_Pos $tmpfile` + slave_sql=`parse_slave_info Slave_SQL_Running $tmpfile` + slave_io=`parse_slave_info Slave_IO_Running $tmpfile` + last_errno=`parse_slave_info Last_Errno $tmpfile` + secs_behind=`parse_slave_info Seconds_Behind_Master $tmpfile` + ocf_log debug "MySQL instance running as a replication slave" + else + # Instance produced an empty "SHOW SLAVE STATUS" output -- + # instance is not a slave + ocf_exit_reason "check_slave invoked on an instance that is not a replication slave." + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS + fi +} + +check_slave() { + # Checks slave status + local rc new_master + + get_slave_info + rc=$? + + if [ $rc -eq 0 ]; then + # Did we receive an error other than max_connections? + if [ $last_errno -ne 0 -a $last_errno -ne "$MYSQL_TOO_MANY_CONN_ERR" ]; then + # Whoa. Replication ran into an error. This slave has + # diverged from its master. Make sure this resource + # doesn't restart in place. + ocf_exit_reason "MySQL instance configured for replication, but replication has failed." 
+ ocf_log err "See $tmpfile for details" + + # Just pull the reader VIP away, killing MySQL here would be pretty evil + # on a loaded server + + set_reader_attr 0 + exit $OCF_SUCCESS + + fi + + # If we got max_connections, let's remove the vip + if [ $last_errno -eq "$MYSQL_TOO_MANY_CONN_ERR" ]; then + set_reader_attr 0 + exit $OCF_SUCCESS + fi + + if [ "$slave_io" != 'Yes' ]; then + # Not necessarily a bad thing. The master may have + # temporarily shut down, and the slave may just be + # reconnecting. A warning can't hurt, though. + ocf_log warn "MySQL Slave IO threads currently not running." + + # Sanity check, are we at least on the right master + new_master=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f1` + + if [ "$master_host" != "$new_master" ]; then + # Not pointing to the right master, not good, removing the VIPs + set_reader_attr 0 + + exit $OCF_SUCCESS + fi + + fi + + if [ "$slave_sql" != 'Yes' ]; then + # We don't have a replication SQL thread running. Not a + # good thing. Try to recoved by restarting the SQL thread + # and remove reader vip. Prevent MySQL restart. + ocf_exit_reason "MySQL Slave SQL threads currently not running." + ocf_log err "See $tmpfile for details" + + # Remove reader vip + set_reader_attr 0 + + # try to restart slave + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "START SLAVE" + + # Return success to prevent a restart + exit $OCF_SUCCESS + fi + + if ocf_is_true $OCF_RESKEY_evict_outdated_slaves; then + # We're supposed to bail out if we lag too far + # behind. Let's check our lag. + if [ "$secs_behind" = "NULL" ] || [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then + ocf_exit_reason "MySQL Slave is $secs_behind seconds behind master (allowed maximum: $OCF_RESKEY_max_slave_lag)." 
+ ocf_log err "See $tmpfile for details" + + # Remove reader vip + set_reader_attr 0 + + exit $OCF_ERR_INSTALLED + fi + fi + + # is the slave ok to have a VIP on it + if [ "$secs_behind" = "NULL" ] || [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then + set_reader_attr 0 + else + set_reader_attr 1 + fi + + ocf_log debug "MySQL instance running as a replication slave" + rm -f $tmpfile + else + # Instance produced an empty "SHOW SLAVE STATUS" output -- + # instance is not a slave + # TODO: Needs to handle when get_slave_info will return too many connections error + rm -f $tmpfile + ocf_exit_reason "check_slave invoked on an instance that is not a replication slave." + exit $OCF_ERR_GENERIC + fi +} + +set_master() { + local new_master master_log_file master_log_pos + local master_params master_ssl_params + + new_master=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f1` + + # Keep replication position + get_slave_info + + if [ "$master_log_file" -a "$new_master" = "$master_host" ]; then + # master_params=", MASTER_LOG_FILE='$master_log_file', \ + # MASTER_LOG_POS=$master_log_pos" + ocf_log info "Kept master pos for $master_host : $master_log_file:$master_log_pos" + rm -f $tmpfile + return + else + master_log_file=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f2` + master_log_pos=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f3` + if [ -n "$master_log_file" -a -n "$master_log_pos" ]; then + master_params=", MASTER_LOG_FILE='$master_log_file', \ + MASTER_LOG_POS=$master_log_pos" + ocf_log info "Restored master pos for $new_master : $master_log_file:$master_log_pos" + fi + fi + + # Informs the MySQL server of the master to replicate + # from. Accepts one mandatory argument which must contain the host + # name of the new master host. The master must either be unchanged + # from the last master the slave replicated from, or freshly + # reset with RESET MASTER. 
+ if [ -n "$OCF_RESKEY_replication_master_ssl_ca" ] && [ -n "$OCF_RESKEY_replication_master_ssl_cert" ] && [ -n "$OCF_RESKEY_replication_master_ssl_key" ]; then + master_ssl_params=", MASTER_SSL=1, \ + MASTER_SSL_CA='$OCF_RESKEY_replication_master_ssl_ca', \ + MASTER_SSL_CERT='$OCF_RESKEY_replication_master_ssl_cert', \ + MASTER_SSL_KEY='$OCF_RESKEY_replication_master_ssl_key'" + fi + + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "CHANGE MASTER TO MASTER_HOST='$new_master', \ + MASTER_PORT=$OCF_RESKEY_replication_port, \ + MASTER_USER='$OCF_RESKEY_replication_user', \ + MASTER_PASSWORD='$OCF_RESKEY_replication_passwd' $master_params $master_ssl_params" + rm -f $tmpfile +} + +unset_master(){ + # Instructs the MySQL server to stop replicating from a master + # host. + + # If we're currently not configured to be replicating from any + # host, then there's nothing to do. But we do log a warning as + # no-one but the CRM should be touching the MySQL master/slave + # configuration. + if ! is_slave; then + ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave" + return $OCF_SUCCESS + fi + + local tmpfile + tmpfile=`mktemp ${HA_RSCTMP}/unset_master.${OCF_RESOURCE_INSTANCE}.XXXXXX` + + # At this point, the master is read only so there should not be much binlogs to transfer + # Let's wait for the last bits + while true; do + $MYSQL $MYSQL_OPTIONS_REPL \ + -e 'SHOW PROCESSLIST\G' > $tmpfile + if grep -i 'Waiting for master to send event' $tmpfile >/dev/null; then + ocf_log info "MySQL slave has finished reading master binary log" + break + fi + if grep -i 'Reconnecting after a failed master event read' $tmpfile >/dev/null; then + ocf_log info "Master is down, no more binary logs to come" + break + fi + if grep -i 'Connecting to master' $tmpfile >/dev/null; then + ocf_log info "Master is down, no more binary logs to come" + break + fi + if ! 
grep 'system user' $tmpfile >/dev/null; then + ocf_log info "Slave is not running - not waiting to finish" + break + fi + + sleep 1 + done + + # Now, stop the slave I/O thread and wait for relay log + # processing to complete + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "STOP SLAVE IO_THREAD" + if [ $? -gt 0 ]; then + ocf_exit_reason "Error stopping slave IO thread" + exit $OCF_ERR_GENERIC + fi + + while true; do + $MYSQL $MYSQL_OPTIONS_REPL \ + -e 'SHOW PROCESSLIST\G' > $tmpfile + if grep -i 'Has read all relay log' $tmpfile >/dev/null; then + ocf_log info "MySQL slave has finished processing relay log" + break + fi + if ! grep -q 'system user' $tmpfile; then + ocf_log info "Slave not runnig - not waiting to finish" + break + fi + ocf_log info "Waiting for MySQL slave to finish processing relay log" + sleep 1 + done + rm -f $tmpfile + + # Now, stop all slave activity and unset the master host + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "STOP SLAVE" + if [ $? -gt 0 ]; then + ocf_exit_reason "Error stopping rest slave threads" + exit $OCF_ERR_GENERIC + fi + + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "RESET SLAVE /*!50516 ALL */;" + if [ $? -gt 0 ]; then + ocf_exit_reason "Failed to reset slave" + exit $OCF_ERR_GENERIC + fi +} + +# Start replication as slave +start_slave() { + + ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ + -e "START SLAVE" +} + +# Set the attribute controlling the readers VIP +set_reader_attr() { + local curr_attr_value + + curr_attr_value=$(get_reader_attr) + + if [ "$curr_attr_value" -ne "$1" ]; then + $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $1 + fi + +} + +# get the attribute controlling the readers VIP +get_reader_attr() { + local attr_value + local rc + + attr_value=`$CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q` + rc=$? 
+ if [ "$rc" -eq "0" ]; then + echo $attr_value + else + echo -1 + fi + +} + +# Stores data for MASTER STATUS from MySQL +update_data_master_status() { + + master_status_file="${HA_RSCTMP}/master_status.${OCF_RESOURCE_INSTANCE}" + + $MYSQL $MYSQL_OPTIONS_REPL -e "SHOW MASTER STATUS\G" > $master_status_file +} + + +# Returns the specified value from the stored copy of SHOW MASTER STATUS. +# should be call after update_data_master_status for tmpfile +# Arguments: +# $1 The value to get. +get_master_status() { + awk -v var="$1" '$1 == var ":" {print substr($0, index($0, ":") + 2)}' "$master_status_file" +} + + +# Determines what IP address is attached to the current host. The output of the +# crm_attribute command looks like this: +# scope=nodes name=IP value=10.2.2.161 +# If the ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP node attribute is not defined, fallback is to uname -n +# The ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP is the IP address that will be used for the +# change master to command. +get_local_ip() { + local IP + IP=`$CRM_ATTR -l forever -n ${INSTANCE_ATTR_NAME}_mysql_master_IP -q -G` + if [ ! $? -eq 0 ]; then + uname -n + else + echo $IP + fi +} + +####################################################################### + +# Functions invoked by resource manager actions + +mysql_monitor() { + local rc + local status_loglevel="err" + + # Set loglevel to info during probe + if ocf_is_probe; then + status_loglevel="info" + fi + + if ocf_is_ms; then + OCF_CHECK_LEVEL=10 + fi + + mysql_common_status $status_loglevel + rc=$? + + # TODO: check max connections error + + # If status returned an error, return that immediately + if [ $rc -ne $OCF_SUCCESS ]; then + if ocf_is_ms ; then + # This is a master slave setup but monitored host returned some errors. 
+ # Immediately remove it from the pool of possible masters by erasing its master-mysql key + # When new mysql master election is started and node got no or negative master-mysql attribute the following is logged + # nodename.com pengine: debug: master_color: mysql:0 master score: -1 + # If there are NO nodes with positive vaule election of mysql master will fail with + # nodename.com pengine: info: master_color: ms_mysql: Promoted 0 instances of a possible 1 to master + ocf_promotion_score -D + fi + + return $rc + fi + + if [ $OCF_CHECK_LEVEL -eq 10 ]; then + if [ -z "$OCF_RESKEY_test_table" ]; then + ocf_exit_reason "test_table not set" + return $OCF_ERR_CONFIGURED + + fi + + # Check if this instance is configured as a slave, and if so + # check slave status + if is_slave; then + check_slave + fi + + # Check for test table + ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \ + -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table" + rc=$? + + if [ $rc -ne 0 ]; then + # We are master/slave and test failed. Delete master score for this node as it is considered unhealthy because of this particular failed check. + ocf_is_ms && ocf_promotion_score -D + ocf_exit_reason "Failed to select from $test_table"; + return $OCF_ERR_GENERIC; + fi + fi + + if ocf_is_ms && ! get_read_only; then + ocf_log debug "MySQL monitor succeeded (master)"; + # Always set master score for the master + ocf_promotion_score -v $((${OCF_RESKEY_max_slave_lag}+1)) + return $OCF_RUNNING_MASTER + else + ocf_log debug "MySQL monitor succeeded"; + ocf_is_ms && ocf_promotion_score -v 1 + return $OCF_SUCCESS + fi +} + +mysql_start() { + local rc + + if ocf_is_ms; then + # Initialize the ReaderVIP attribute, monitor will enable it + set_reader_attr 0 + fi + + mysql_common_status info + if [ $? 
= $OCF_SUCCESS ]; then + ocf_log info "MySQL already running" + return $OCF_SUCCESS + fi + + mysql_common_prepare_dirs + + # Uncomment to perform permission clensing + # - not convinced this should be enabled by default + # + #chmod 0755 $OCF_RESKEY_datadir + #chown -R $OCF_RESKEY_user $OCF_RESKEY_datadir + #chgrp -R $OCF_RESKEY_group $OCF_RESKEY_datadir + mysql_extra_params= + if ocf_is_ms; then + mysql_extra_params="--skip-slave-start" + fi + + mysql_common_start $mysql_extra_params + rc=$? + if [ $rc != $OCF_SUCCESS ]; then + return $rc + fi + + if ocf_is_ms; then + # We're configured as a stateful resource. We must start as + # slave by default. At this point we don't know if the CRM has + # already promoted a master. So, we simply start in read only + # mode. + set_read_only on + + # Now, let's see whether there is a master. We might be a new + # node that is just joining the cluster, and the CRM may have + # promoted a master before. + master_host=`echo $OCF_RESKEY_CRM_meta_notify_master_uname|tr -d " "` + if [ "$master_host" -a "$master_host" != ${NODENAME} ]; then + ocf_log info "Changing MySQL configuration to replicate from $master_host." + set_master + start_slave + if [ $? -ne 0 ]; then + ocf_exit_reason "Failed to start slave" + return $OCF_ERR_GENERIC + fi + else + ocf_log info "No MySQL master present - clearing replication state" + unset_master + fi + + # We also need to set a master preference, otherwise Pacemaker + # won't ever promote us in the absence of any explicit + # preference set by the administrator. We choose a low + # greater-than-zero preference. + ocf_promotion_score -v 1 + fi + + # Initial monitor action + if [ -n "$OCF_RESKEY_test_table" -a -n "$OCF_RESKEY_test_user" -a -n "$OCF_RESKEY_test_passwd" ]; then + OCF_CHECK_LEVEL=10 + fi + mysql_monitor + rc=$? 
+    if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then
+        ocf_exit_reason "Failed initial monitor action"
+        return $rc
+    fi
+
+    ocf_log info "MySQL started"
+    return $OCF_SUCCESS
+}
+
+# Stop action.
+# When ocf_is_ms reports master/slave mode, the promotion score is deleted
+# and the reader attribute is set to 0 first. The actual shutdown is
+# delegated to mysql_common_stop, whose exit status is returned.
+mysql_stop() {
+    if ocf_is_ms; then
+        # clear preference for becoming master
+        ocf_promotion_score -D
+
+        # Remove VIP capability
+        set_reader_attr 0
+    fi
+
+    mysql_common_stop
+}
+
+# Promote action.
+# Returns OCF_NOT_RUNNING when mysql_common_status fails, OCF_ERR_GENERIC
+# when read-only mode cannot be switched off, OCF_SUCCESS otherwise.
+# The exit status of "STOP SLAVE" and of the CIB update is not checked.
+mysql_promote() {
+    local master_info
+
+    if ( ! mysql_common_status err ); then
+        return $OCF_NOT_RUNNING
+    fi
+    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
+        -e "STOP SLAVE"
+
+    # Set Master Info in CIB, cluster level attribute
+    update_data_master_status
+    master_info="$(get_local_ip)|$(get_master_status File)|$(get_master_status Position)"
+    ${CRM_ATTR_REPL_INFO} -v "$master_info"
+    # NOTE(review): tmpfile is not assigned anywhere in this function;
+    # presumably it is set by one of the helpers above -- confirm.
+    # With tmpfile empty this rm is a no-op.
+    rm -f $tmpfile
+
+    set_read_only off || return $OCF_ERR_GENERIC
+
+    # Existing master gets a higher-than-default master preference, so
+    # the cluster manager does not shuffle the master role around
+    # unnecessarily
+    ocf_promotion_score -v $((${OCF_RESKEY_max_slave_lag}+1))
+
+    # A master can accept reads
+    set_reader_attr 1
+
+    return $OCF_SUCCESS
+}
+
+# Demote action.
+# Returns OCF_NOT_RUNNING when mysql_common_status fails; otherwise
+# returns the exit status of resetting the promotion score to 1.
+mysql_demote() {
+    if ! mysql_common_status err; then
+        return $OCF_NOT_RUNNING
+    fi
+
+    # Return master preference to default, so the cluster manager gets
+    # a chance to select a new master
+    ocf_promotion_score -v 1
+}
+
+# Notify action.
+# Dispatches on "<notify_type>-<notify_operation>" taken from the
+# OCF_RESKEY_CRM_meta_notify_* variables:
+#   pre-promote   nothing to do
+#   post-promote  on every node except the new master: unset_master,
+#                 set_master, start_slave
+#   pre-demote    on the node being demoted: switch to read-only and KILL
+#                 the threads listed by SHOW PROCESSLIST (replication and
+#                 system threads excluded)
+#   post-demote   on every node except the demoted one: unset_master
+# Returns OCF_SUCCESS, OCF_ERR_GENERIC on a failed step, or (post-demote)
+# the exit status of unset_master.
+mysql_notify() {
+    # If not configured as a Stateful resource, we make no sense of
+    # notifications.
+    if ! ocf_is_ms; then
+        ocf_log info "This agent makes no use of notifications unless running in master/slave mode."
+        return $OCF_SUCCESS
+    fi
+
+    local type_op
+    type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}"
+
+    ocf_log debug "Received $type_op notification."
+
+    case "$type_op" in
+        'pre-promote')
+            # Nothing to do now here, new replication info not yet published
+
+            ;;
+        'post-promote')
+            # The master has completed its promotion. Now is a good
+            # time to check whether our replication slave is working
+            # correctly.
+            master_host=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " "`
+            if [ "$master_host" = "${NODENAME}" ]; then
+                ocf_log info "This will be the new master, ignoring post-promote notification."
+            else
+                ocf_log info "Resetting replication"
+                unset_master
+                if [ $? -ne 0 ]; then
+                    return $OCF_ERR_GENERIC
+                fi
+
+                ocf_log info "Changing MySQL configuration to replicate from $master_host"
+                set_master
+                if [ $? -ne 0 ]; then
+                    return $OCF_ERR_GENERIC
+                fi
+
+                start_slave
+                if [ $? -ne 0 ]; then
+                    ocf_exit_reason "Failed to start slave"
+                    return $OCF_ERR_GENERIC
+                fi
+            fi
+            return $OCF_SUCCESS
+            ;;
+        'pre-demote')
+            demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "`
+            # Quoted: an empty notify variable must not break the test.
+            if [ "$demote_host" = "${NODENAME}" ]; then
+                ocf_log info "pre-demote notification for $demote_host"
+                set_read_only on
+                if [ $? -ne 0 ]; then
+                    ocf_exit_reason "Failed to set read-only"
+                    return $OCF_ERR_GENERIC
+                fi
+
+                # Must kill all existing user threads because they are still Read/write
+                # in order for the slaves to complete the read of binlogs
+                local tmpfile
+                tmpfile=`mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX`
+                $MYSQL $MYSQL_OPTIONS_REPL \
+                    -e "SHOW PROCESSLIST" > "$tmpfile"
+
+                for thread in `awk '$0 !~ /Binlog Dump|system user|event_scheduler|SHOW PROCESSLIST/ && $0 ~ /^[0-9]/ {print $1}' "$tmpfile"`
+                do
+                    ocf_run $MYSQL $MYSQL_OPTIONS_REPL \
+                        -e "KILL ${thread}"
+                done
+                # The thread list is no longer needed; do not leave one
+                # temporary file behind per demotion.
+                rm -f "$tmpfile"
+            else
+                ocf_log info "Ignoring pre-demote notification except for my own demotion."
+            fi
+            return $OCF_SUCCESS
+            ;;
+        'post-demote')
+            demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "`
+            if [ "$demote_host" = "${NODENAME}" ]; then
+                ocf_log info "Ignoring post-demote notification for my own demotion."
+                return $OCF_SUCCESS
+            fi
+            ocf_log info "post-demote notification for $demote_host."
+            # The former master has just been gracefully demoted.
+            unset_master
+            ;;
+        *)
+            return $OCF_SUCCESS
+            ;;
+    esac
+}
+
+#######################################################################
+
+# meta-data and usage must work even when the environment is not valid,
+# so they are served before mysql_common_validate runs.
+case "$1" in
+    meta-data)  meta_data
+                exit $OCF_SUCCESS;;
+    usage|help) usage
+                exit $OCF_SUCCESS;;
+esac
+
+mysql_common_validate
+rc=$?
+LSB_STATUS_STOPPED=3
+if [ $rc -ne 0 ]; then
+    case "$1" in
+        # stop falls through to the dispatch below despite the failed validation
+        stop) ;;
+        monitor)
+            mysql_common_status "info"
+            if [ $? -eq $OCF_SUCCESS ]; then
+                # if validation fails and pid is active, always treat this as an error
+                ocf_exit_reason "environment validation failed, active pid is in unknown state."
+                exit $OCF_ERR_GENERIC
+            fi
+            # validation failed and pid is not active, it's safe to say this instance is inactive.
+            exit $OCF_NOT_RUNNING;;
+
+        status) exit $LSB_STATUS_STOPPED;;
+        *) exit $rc;;
+    esac
+fi
+
+# What kind of method was invoked?
+case "$1" in
+    start)        mysql_start;;
+    stop)         mysql_stop;;
+    status)       mysql_common_status err;;
+    monitor)      mysql_monitor;;
+    promote)      mysql_promote;;
+    demote)       mysql_demote;;
+    notify)       mysql_notify;;
+    validate-all) exit $OCF_SUCCESS;;
+
+    *) usage
+       exit $OCF_ERR_UNIMPLEMENTED;;
+esac
+
+# vi:sw=4:ts=4:et:
diff --git a/heartbeat/mysql-common.sh b/heartbeat/mysql-common.sh
new file mode 100755
index 0000000..d6b4e3c
--- /dev/null
+++ b/heartbeat/mysql-common.sh
@@ -0,0 +1,332 @@
+#!/bin/sh
+
+#######################################################################
+
+# Use runuser if available for SELinux.
+# SU is the command used to switch to the MySQL user:
+# runuser when /sbin/runuser is executable, su otherwise.
+SU=su
+if [ -x /sbin/runuser ]; then
+    SU=runuser
+fi
+
+# Attempt to detect a default binary: mysqld_safe first, then safe_mysqld.
+OCF_RESKEY_binary_default=$(which mysqld_safe 2> /dev/null)
+if [ -z "$OCF_RESKEY_binary_default" ]; then
+    OCF_RESKEY_binary_default=$(which safe_mysqld 2> /dev/null)
+fi
+
+# Fill in some defaults if no values are specified.
+# The config file and log file defaults are the same on every platform;
+# the remaining paths and the account names depend on the host OS.
+OCF_RESKEY_config_default="/etc/my.cnf"
+OCF_RESKEY_log_default="/var/log/mysqld.log"
+HOSTOS=$(uname)
+case "$HOSTOS" in
+    OpenBSD)
+        # Fallback used only when neither binary was found in PATH.
+        : ${OCF_RESKEY_binary_default:=/usr/local/bin/mysqld_safe}
+        OCF_RESKEY_datadir_default="/var/mysql"
+        OCF_RESKEY_user_default="_mysql"
+        OCF_RESKEY_group_default="_mysql"
+        OCF_RESKEY_pid_default="/var/mysql/mysqld.pid"
+        OCF_RESKEY_socket_default="/var/run/mysql/mysql.sock"
+        ;;
+    *)
+        # Fallback used only when neither binary was found in PATH.
+        : ${OCF_RESKEY_binary_default:=/usr/bin/safe_mysqld}
+        OCF_RESKEY_datadir_default="/var/lib/mysql"
+        OCF_RESKEY_user_default="mysql"
+        OCF_RESKEY_group_default="mysql"
+        OCF_RESKEY_pid_default="/var/run/mysql/mysqld.pid"
+        OCF_RESKEY_socket_default="/var/lib/mysql/mysql.sock"
+        ;;
+esac
+
+# Platform-independent defaults.
+OCF_RESKEY_client_binary_default="mysql"
+OCF_RESKEY_test_user_default="root"
+OCF_RESKEY_test_table_default="mysql.user"
+OCF_RESKEY_test_passwd_default=""
+OCF_RESKEY_enable_creation_default=0
+OCF_RESKEY_additional_parameters_default=""
+OCF_RESKEY_replication_user_default="root"
+OCF_RESKEY_replication_passwd_default=""
+OCF_RESKEY_replication_port_default="3306"
+OCF_RESKEY_replication_require_ssl_default="false"
+OCF_RESKEY_replication_master_ssl_ca_default=""
+OCF_RESKEY_replication_master_ssl_cert_default=""
+OCF_RESKEY_replication_master_ssl_key_default=""
+OCF_RESKEY_max_slave_lag_default="3600"
+OCF_RESKEY_evict_outdated_slaves_default="false"
+OCF_RESKEY_reader_attribute_default="readable"
+
+# Apply a default to every parameter that is unset. The "=" form (not ":=")
+# leaves a parameter that was explicitly set to the empty string untouched.
+: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
+MYSQL_BINDIR=$(dirname ${OCF_RESKEY_binary})
+
+: ${OCF_RESKEY_client_binary=${OCF_RESKEY_client_binary_default}}
+
+: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
+: ${OCF_RESKEY_datadir=${OCF_RESKEY_datadir_default}}
+
+: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
+: ${OCF_RESKEY_group=${OCF_RESKEY_group_default}}
+
+: ${OCF_RESKEY_log=${OCF_RESKEY_log_default}}
+: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
+: ${OCF_RESKEY_socket=${OCF_RESKEY_socket_default}}
+
+: ${OCF_RESKEY_test_user=${OCF_RESKEY_test_user_default}}
+: ${OCF_RESKEY_test_table=${OCF_RESKEY_test_table_default}}
+: ${OCF_RESKEY_test_passwd=${OCF_RESKEY_test_passwd_default}}
+
+: ${OCF_RESKEY_enable_creation=${OCF_RESKEY_enable_creation_default}}
+: ${OCF_RESKEY_additional_parameters=${OCF_RESKEY_additional_parameters_default}}
+
+: ${OCF_RESKEY_replication_user=${OCF_RESKEY_replication_user_default}}
+: ${OCF_RESKEY_replication_passwd=${OCF_RESKEY_replication_passwd_default}}
+: ${OCF_RESKEY_replication_port=${OCF_RESKEY_replication_port_default}}
+: ${OCF_RESKEY_replication_require_ssl=${OCF_RESKEY_replication_require_ssl_default}}
+: ${OCF_RESKEY_replication_master_ssl_ca=${OCF_RESKEY_replication_master_ssl_ca_default}}
+: ${OCF_RESKEY_replication_master_ssl_cert=${OCF_RESKEY_replication_master_ssl_cert_default}}
+: ${OCF_RESKEY_replication_master_ssl_key=${OCF_RESKEY_replication_master_ssl_key_default}}
+
+: ${OCF_RESKEY_max_slave_lag=${OCF_RESKEY_max_slave_lag_default}}
+: ${OCF_RESKEY_evict_outdated_slaves=${OCF_RESKEY_evict_outdated_slaves_default}}
+
+: ${OCF_RESKEY_reader_attribute=${OCF_RESKEY_reader_attribute_default}}
+
+#######################################################################
+# Convenience variables
+
+MYSQL=$OCF_RESKEY_client_binary
+# Empty unless replication_require_ssl is true.
+MYSQL_OPTIONS_LOCAL_SSL_OPTIONS=""
+if ocf_is_true "$OCF_RESKEY_replication_require_ssl"; then
+    MYSQL_OPTIONS_LOCAL_SSL_OPTIONS="--ssl-mode=REQUIRED"
+fi
+MYSQL_OPTIONS_LOCAL="-S $OCF_RESKEY_socket"
+MYSQL_OPTIONS_REPL="$MYSQL_OPTIONS_LOCAL_SSL_OPTIONS $MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_replication_user --password=$OCF_RESKEY_replication_passwd"
+MYSQL_OPTIONS_TEST="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_test_user --password=$OCF_RESKEY_test_passwd"
+MYSQL_TOO_MANY_CONN_ERR=1040
+
+NODENAME=$(ocf_local_nodename)
+CRM_ATTR="${HA_SBIN_DIR}/crm_attribute -N $NODENAME "
+# Resource instance name up to the first ":".
+INSTANCE_ATTR_NAME=`echo ${OCF_RESOURCE_INSTANCE}| awk -F : '{print $1}'`
+CRM_ATTR_REPL_INFO="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_INFO -s mysql_replication"
+
+#######################################################################
+
+# Check that the server and client binaries, the config file, the data
+# directory and the configured user and group all exist.
+# Returns OCF_ERR_INSTALLED (after recording an exit reason) on the first
+# missing item, OCF_SUCCESS otherwise.
+mysql_common_validate()
+{
+
+    if ! have_binary "$OCF_RESKEY_binary"; then
+        ocf_exit_reason "Setup problem: couldn't find command: $OCF_RESKEY_binary"
+        return $OCF_ERR_INSTALLED;
+    fi
+
+    if ! have_binary "$OCF_RESKEY_client_binary"; then
+        ocf_exit_reason "Setup problem: couldn't find command: $OCF_RESKEY_client_binary"
+        return $OCF_ERR_INSTALLED;
+    fi
+
+    if [ ! -f $OCF_RESKEY_config ]; then
+        ocf_exit_reason "Config $OCF_RESKEY_config doesn't exist";
+        return $OCF_ERR_INSTALLED;
+    fi
+
+    if [ ! -d $OCF_RESKEY_datadir ]; then
+        ocf_exit_reason "Datadir $OCF_RESKEY_datadir doesn't exist";
+        return $OCF_ERR_INSTALLED;
+    fi
+
+    getent passwd $OCF_RESKEY_user >/dev/null 2>&1
+    if [ ! $? -eq 0 ]; then
+        ocf_exit_reason "User $OCF_RESKEY_user doesn't exist";
+        return $OCF_ERR_INSTALLED;
+    fi
+
+    getent group $OCF_RESKEY_group >/dev/null 2>&1
+    if [ ! $? -eq 0 ]; then
+        ocf_exit_reason "Group $OCF_RESKEY_group doesn't exist";
+        return $OCF_ERR_INSTALLED;
+    fi
+
+    return $OCF_SUCCESS
+}
+
+# Return 0 when process $1 exists.
+# Uses /proc/<pid> when /proc and /proc/1 are directories (an empty pid
+# then counts as "not running"); otherwise probes with signal 0.
+mysql_common_check_pid() {
+    local pid=$1
+
+    if [ -d /proc -a -d /proc/1 ]; then
+        [ "u$pid" != "u" -a -d /proc/$pid ]
+    else
+        kill -s 0 $pid >/dev/null 2>&1
+    fi
+    return $?
+}
+
+# Report whether MySQL is running.
+#   $1  log level used for the "not running" messages
+#   $2  optional pid; when empty the pid is read from $OCF_RESKEY_pid
+# Returns OCF_SUCCESS when the process exists, OCF_NOT_RUNNING otherwise.
+# Side effect: a PID file whose process is gone is removed.
+mysql_common_status() {
+    local loglevel=$1
+    local pid=$2
+    if [ -z "$pid" ]; then
+        if [ ! -e $OCF_RESKEY_pid ]; then
+            ocf_log $loglevel "MySQL is not running"
+            return $OCF_NOT_RUNNING;
+        fi
+
+        pid=`cat $OCF_RESKEY_pid`;
+    fi
+
+    mysql_common_check_pid $pid
+
+
+    if [ $? -eq 0 ]; then
+        return $OCF_SUCCESS;
+    else
+        if [ -e $OCF_RESKEY_pid ]; then
+            ocf_log $loglevel "MySQL not running: removing old PID file"
+            rm -f $OCF_RESKEY_pid
+        fi
+        return $OCF_NOT_RUNNING;
+    fi
+}
+
+# Prepare the log file and the PID, socket and data directories.
+# When enable_creation is true and <datadir>/mysql is missing, the data
+# directory is initialised with mysql_install_db.
+# Exits (does not return) with OCF_ERR_GENERIC when that initialisation
+# fails, and with OCF_ERR_PERM when a directory is not writable by
+# $OCF_RESKEY_user.
+mysql_common_prepare_dirs()
+{
+    local rc
+
+    touch $OCF_RESKEY_log
+    chown $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_log
+    chmod 0640 $OCF_RESKEY_log
+    [ -x /sbin/restorecon ] && /sbin/restorecon $OCF_RESKEY_log
+
+    if ocf_is_true "$OCF_RESKEY_enable_creation" && [ ! -d $OCF_RESKEY_datadir/mysql ] ; then
+        ocf_log info "Initializing MySQL database: "
+        $MYSQL_BINDIR/mysql_install_db --datadir=$OCF_RESKEY_datadir
+        rc=$?
+        if [ $rc -ne 0 ] ; then
+            ocf_exit_reason "Initialization failed: $rc";
+            exit $OCF_ERR_GENERIC
+        fi
+        chown -R $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_datadir
+    fi
+
+    pid_dir=`dirname $OCF_RESKEY_pid`
+    if [ ! -d $pid_dir ] ; then
+        ocf_log info "Creating PID dir: $pid_dir"
+        mkdir -p $pid_dir
+        chown $OCF_RESKEY_user:$OCF_RESKEY_group $pid_dir
+    fi
+
+    socket_dir=`dirname $OCF_RESKEY_socket`
+    if [ ! -d $socket_dir ] ; then
+        ocf_log info "Creating socket dir: $socket_dir"
+        mkdir -p $socket_dir
+        chown $OCF_RESKEY_user:$OCF_RESKEY_group $socket_dir
+    fi
+
+    # Regardless of whether we just created the directory or it
+    # already existed, check whether it is writable by the configured
+    # user
+    for dir in $pid_dir $socket_dir $OCF_RESKEY_datadir; do
+        if ! $SU -s /bin/sh - $OCF_RESKEY_user -c "test -w $dir"; then
+            ocf_exit_reason "Directory $dir is not writable by $OCF_RESKEY_user"
+            exit $OCF_ERR_PERM;
+        fi
+    done
+}
+
+# Launch the server as $OCF_RESKEY_user in the background and wait until
+# mysql_common_status reports it as running.
+#   $1  extra command-line parameters appended to the server invocation
+# Returns OCF_SUCCESS once running, OCF_ERR_GENERIC when the launcher
+# process has gone away first, or any other status reported by
+# mysql_common_status. There is no internal timeout.
+mysql_common_start()
+{
+    local mysql_extra_params="$1"
+    local pid
+
+    $SU - $OCF_RESKEY_user -s /bin/sh -c \
+        "${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \
+        --pid-file=$OCF_RESKEY_pid \
+        --socket=$OCF_RESKEY_socket \
+        --datadir=$OCF_RESKEY_datadir \
+        --log-error=$OCF_RESKEY_log \
+        $OCF_RESKEY_additional_parameters \
+        $mysql_extra_params >/dev/null 2>&1" &
+    pid=$!
+
+    # Spin waiting for the server to come up.
+    # Let the CRM/LRM time us out if required.
+    start_wait=1
+    while [ $start_wait = 1 ]; do
+        if ! ps $pid > /dev/null 2>&1; then
+            # Reap the launcher so that $? below carries its exit status.
+            wait $pid
+            ocf_exit_reason "MySQL server failed to start (pid=$pid) (rc=$?). Check $OCF_RESKEY_log for details"
+            return $OCF_ERR_GENERIC
+        fi
+        mysql_common_status info
+        rc=$?
+        if [ $rc = $OCF_SUCCESS ]; then
+            start_wait=0
+        elif [ $rc != $OCF_NOT_RUNNING ]; then
+            ocf_log info "MySQL start failed: $rc"
+            return $rc
+        fi
+        sleep 2
+    done
+
+    return $OCF_SUCCESS
+}
+
+# Stop the server whose pid is recorded in $OCF_RESKEY_pid.
+# Sends SIGTERM, polls once per second for up to shutdown_timeout seconds
+# (15, or the operation timeout minus 5 s when OCF_RESKEY_CRM_meta_timeout
+# is set), then falls back to SIGKILL.
+# Returns OCF_SUCCESS when the server is (or already was) stopped,
+# OCF_ERR_GENERIC when SIGTERM cannot be sent or the process is still
+# present right after SIGKILL.
+mysql_common_stop()
+{
+    local pid
+    local rc
+
+    if [ ! -f $OCF_RESKEY_pid ]; then
+        ocf_log info "MySQL is not running"
+        return $OCF_SUCCESS
+    fi
+
+    pid=`cat $OCF_RESKEY_pid 2> /dev/null `
+
+    mysql_common_check_pid $pid
+    if [ $? -ne 0 ]; then
+        rm -f $OCF_RESKEY_pid
+        ocf_log info "MySQL is already stopped"
+        return $OCF_SUCCESS;
+    fi
+
+    /bin/kill $pid > /dev/null
+    rc=$?
+    if [ $rc != 0 ]; then
+        ocf_exit_reason "MySQL couldn't be stopped"
+        return $OCF_ERR_GENERIC
+    fi
+    # stop waiting
+    shutdown_timeout=15
+    if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
+        # meta timeout is divided by 1000 (presumably ms -> s); keep 5 s spare
+        shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5))
+    fi
+    count=0
+    while [ $count -lt $shutdown_timeout ]
+    do
+        mysql_common_status info $pid
+        rc=$?
+        if [ $rc = $OCF_NOT_RUNNING ]; then
+            break
+        fi
+        count=`expr $count + 1`
+        sleep 1
+        ocf_log debug "MySQL still hasn't stopped yet. Waiting..."
+    done
+
+    mysql_common_status info $pid
+    if [ $? != $OCF_NOT_RUNNING ]; then
+        ocf_log info "MySQL failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL..."
+        /bin/kill -KILL $pid > /dev/null
+        # NOTE(review): the status is checked immediately after SIGKILL,
+        # with no grace period -- confirm this cannot report a false failure.
+        mysql_common_status info $pid
+        if [ $? != $OCF_NOT_RUNNING ]; then
+            return $OCF_ERR_GENERIC
+        fi
+    fi
+
+    ocf_log info "MySQL stopped";
+    rm -f /var/lock/subsys/mysqld
+    rm -f $OCF_RESKEY_socket
+    return $OCF_SUCCESS
+
+}
diff --git a/heartbeat/mysql-proxy b/heartbeat/mysql-proxy
new file mode 100755
index 0000000..013c5e4
--- /dev/null
+++ b/heartbeat/mysql-proxy
@@ -0,0 +1,741 @@
+#!/bin/sh
+#
+# Resource script for MySQL Proxy
+#
+# Description: Manages MySQL Proxy as an OCF resource in
+#              a high-availability setup.
+#
+#              Tested with MySQL Proxy 0.8.1 and 0.8.3 on Debian 6.0.
+#
+#              Based on the mysql and Pure-Ftpd OCF resource agents.
+#
+# Author:      Raoul Bhatia <r.bhatia@ipax.at> : Original Author
+# License:     GNU General Public License (GPL)
+#
+#
+# usage: $0 {start|stop|reload|status|monitor|validate-all|meta-data}
+#
+# The "start" arg starts a MySQL Proxy instance
+#
+# The "stop" arg stops it.
+#
+# TODO
+# * add in-depth monitoring by querying the mysql-proxy admin port
+#
+# Test via
+# (note: this did not work with MySQL Proxy 0.8.1 and ocf-tester from resource-agents 3.9.2 on Debian 6.0)
+#
+# * /usr/sbin/ocf-tester -n mp -o binary="/usr/sbin/mysql-proxy" -o defaults_file="" -o parameters="--proxy-skip-profiling" \
+#     -o admin_address="127.0.0.1:4041" -o admin_username="root" -o admin_password="la" -o admin_lua_script="/usr/lib/mysql-proxy/lua/admin.lua" \
+#     -o proxy_backend_addresses="192.168.100.200:42006" -o proxy_address="/var/run/mysqld/mysqld.sock" /usr/lib/ocf/resource.d/heartbeat/mysql-proxy
+#
+#
+# OCF parameters:
+#  OCF_RESKEY_binary
+#  OCF_RESKEY_client_binary
+#  OCF_RESKEY_defaults_file
+#  OCF_RESKEY_proxy_backend_addresses
+#  OCF_RESKEY_proxy_read_only_backend_addresses
+#  OCF_RESKEY_proxy_address
+#  OCF_RESKEY_log_level
+#  OCF_RESKEY_keepalive
+#  OCF_RESKEY_plugins
+#  OCF_RESKEY_admin_address
+#  OCF_RESKEY_admin_username
+#  OCF_RESKEY_admin_password
+#  OCF_RESKEY_admin_lua_script
+#  OCF_RESKEY_test_table
+#  OCF_RESKEY_test_user
+#  OCF_RESKEY_test_passwd
+#  OCF_RESKEY_parameters
+#  OCF_RESKEY_pidfile
+#
+##########################################################################
+
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# Parameter defaults
+
+OCF_RESKEY_binary_default="/usr/sbin/mysql-proxy"
+OCF_RESKEY_client_binary_default="mysql"
+OCF_RESKEY_defaults_file_default=""
+OCF_RESKEY_proxy_backend_addresses_default="127.0.0.1:3306"
+OCF_RESKEY_proxy_read_only_backend_addresses_default=""
+OCF_RESKEY_proxy_address_default=":4040"
+OCF_RESKEY_log_level_default=""
+OCF_RESKEY_keepalive_default=""
+OCF_RESKEY_plugins_default=""
+OCF_RESKEY_admin_address_default="127.0.0.1:4041"
+OCF_RESKEY_admin_username_default=""
+OCF_RESKEY_admin_password_default=""
+OCF_RESKEY_admin_lua_script_default=""
+OCF_RESKEY_test_table_default="mysql.user"
+OCF_RESKEY_test_user_default=""
+OCF_RESKEY_test_passwd_default=""
+OCF_RESKEY_parameters_default=""
+OCF_RESKEY_pidfile_default="${HA_RSCTMP}/mysql-proxy-${OCF_RESOURCE_INSTANCE}.pid"
+
+# Apply a default to every parameter that is unset. The "=" form (not ":=")
+# leaves a parameter explicitly set to the empty string untouched.
+: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
+: ${OCF_RESKEY_client_binary=${OCF_RESKEY_client_binary_default}}
+: ${OCF_RESKEY_defaults_file=${OCF_RESKEY_defaults_file_default}}
+: ${OCF_RESKEY_proxy_backend_addresses=${OCF_RESKEY_proxy_backend_addresses_default}}
+: ${OCF_RESKEY_proxy_read_only_backend_addresses=${OCF_RESKEY_proxy_read_only_backend_addresses_default}}
+: ${OCF_RESKEY_proxy_address=${OCF_RESKEY_proxy_address_default}}
+: ${OCF_RESKEY_log_level=${OCF_RESKEY_log_level_default}}
+: ${OCF_RESKEY_keepalive=${OCF_RESKEY_keepalive_default}}
+: ${OCF_RESKEY_plugins=${OCF_RESKEY_plugins_default}}
+: ${OCF_RESKEY_admin_address=${OCF_RESKEY_admin_address_default}}
+: ${OCF_RESKEY_admin_username=${OCF_RESKEY_admin_username_default}}
+: ${OCF_RESKEY_admin_password=${OCF_RESKEY_admin_password_default}}
+: ${OCF_RESKEY_admin_lua_script=${OCF_RESKEY_admin_lua_script_default}}
+: ${OCF_RESKEY_test_table=${OCF_RESKEY_test_table_default}}
+: ${OCF_RESKEY_test_user=${OCF_RESKEY_test_user_default}}
+: ${OCF_RESKEY_test_passwd=${OCF_RESKEY_test_passwd_default}}
+: ${OCF_RESKEY_parameters=${OCF_RESKEY_parameters_default}}
+: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}}
+
+USAGE="Usage: $0 {start|stop|reload|status|monitor|validate-all|meta-data}"
+
+##########################################################################
+
+# Print the usage line to stderr.
+usage() {
+    echo $USAGE >&2
+}
+
+# Print the resource agent metadata (XML) to stdout.
+# The heredoc delimiter is unquoted, so the ${..._default} references are
+# expanded to the defaults defined above.
+meta_data() {
+    cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="mysql-proxy" version="0.1">
+<version>1.0</version>
+<longdesc lang="en">
+This script manages MySQL Proxy as an OCF resource in a high-availability setup.
+
+The default monitor operation will verify that mysql-proxy is running.
+
+The level 10 monitor operation is left out intentionally for possible future enhancements in conjunction with the admin plugin.
+
+The level 20 monitor operation will perform a SELECT on a given table to verify that the connection to a back-end server is actually working.
+
+Tested with MySQL Proxy 0.8.1 and 0.8.3 on Debian 6.0.
+</longdesc>
+<shortdesc lang="en">Manages a MySQL Proxy instance</shortdesc>
+
+<parameters>
+
+<parameter name="binary" unique="0" required="0">
+<longdesc lang="en">
+Full path to the MySQL Proxy binary.
+For example, "/usr/sbin/mysql-proxy".
+</longdesc>
+<shortdesc lang="en">Full path to MySQL Proxy binary</shortdesc>
+<content type="string" default="${OCF_RESKEY_binary_default}" />
+</parameter>
+
+<parameter name="client_binary" unique="0" required="0">
+<longdesc lang="en">
+Location of the MySQL client binary.
+</longdesc>
+<shortdesc lang="en">MySQL client binary</shortdesc>
+<content type="string" default="${OCF_RESKEY_client_binary_default}" />
+</parameter>
+
+<parameter name="defaults_file" unique="0" required="0">
+<longdesc lang="en">
+Full path to a MySQL Proxy configuration file.
+For example, "/etc/mysql-proxy.conf".
+</longdesc>
+<shortdesc lang="en">Full path to configuration file</shortdesc>
+<content type="string" default="${OCF_RESKEY_defaults_file_default}" />
+</parameter>
+
+<parameter name="proxy_backend_addresses" unique="0" required="0">
+<longdesc lang="en">
+Address:port of the remote back-end servers (default: 127.0.0.1:3306).
+</longdesc>
+<shortdesc lang="en">MySQL Proxy back-end servers</shortdesc>
+<content type="string" default="${OCF_RESKEY_proxy_backend_addresses_default}" />
+</parameter>
+
+<parameter name="proxy_read_only_backend_addresses" unique="0" required="0">
+<longdesc lang="en">
+Address:port of the remote (read only) unpromoted-server (default: ).
+</longdesc>
+<shortdesc lang="en">MySql Proxy read only back-end servers</shortdesc>
+<content type="string" default="${OCF_RESKEY_proxy_read_only_backend_addresses_default}" />
+</parameter>
+
+<parameter name="proxy_address" unique="0" required="0">
+<longdesc lang="en">
+Listening address:port of the proxy server (default: :4040).
+You can also specify a socket like "/var/run/mysql-proxy.sock".
+</longdesc>
+<shortdesc lang="en">MySQL Proxy listening address</shortdesc>
+<content type="string" default="${OCF_RESKEY_proxy_address_default}" />
+</parameter>
+
+<parameter name="log_level" unique="0" required="0">
+<longdesc lang="en">
+Log all messages of level (error|warning|info|message|debug|) or higher.
+An empty value disables logging.
+</longdesc>
+<shortdesc lang="en">MySQL Proxy log level.</shortdesc>
+<content type="string" default="${OCF_RESKEY_log_level_default}" />
+</parameter>
+
+<parameter name="keepalive" unique="0" required="0">
+<longdesc lang="en">
+Try to restart the proxy if it crashed (default: ).
+Valid values: true or false. An empty value equals "false".
+</longdesc>
+<shortdesc lang="en">Use keepalive option</shortdesc>
+<content type="string" default="${OCF_RESKEY_keepalive_default}" />
+</parameter>
+
+<parameter name="plugins" unique="0" required="0">
+<longdesc lang="en">
+Whitespace separated list of plugins to load (default: ).
+Note: The admin plugin will be auto-loaded in case you specify an admin_* parameter.
+</longdesc>
+<shortdesc lang="en">MySQL Proxy plugins</shortdesc>
+<content type="string" default="${OCF_RESKEY_plugins_default}" />
+</parameter>
+
+<parameter name="admin_address" unique="0" required="0">
+<longdesc lang="en">
+Listening address:port of the admin plugin (default: 127.0.0.1:4041).
+Note: The admin plugin will be auto-loaded in case you specify an admin_* parameter.
+</longdesc>
+<shortdesc lang="en">MySQL Proxy admin plugin listening address</shortdesc>
+<content type="string" default="${OCF_RESKEY_admin_address_default}" />
+</parameter>
+
+<parameter name="admin_username" unique="0" required="0">
+<longdesc lang="en">
+Username for the admin plugin (default: ).
+Required since MySQL Proxy 0.8.1, if the admin plugin is loaded.
+Note: The admin plugin will be auto-loaded in case you specify an admin_* parameter.
+</longdesc>
+<shortdesc lang="en">MySQL Proxy admin plugin username</shortdesc>
+<content type="string" default="${OCF_RESKEY_admin_username_default}" />
+</parameter>
+
+<parameter name="admin_password" unique="0" required="0">
+<longdesc lang="en">
+Password for the admin plugin (default: ).
+Required since MySQL Proxy 0.8.1, if the admin plugin is loaded.
+Note: The admin plugin will be auto-loaded in case you specify an admin_* parameter.
+</longdesc>
+<shortdesc lang="en">MySQL Proxy admin plugin password</shortdesc>
+<content type="string" default="${OCF_RESKEY_admin_password_default}" />
+</parameter>
+
+<parameter name="admin_lua_script" unique="0" required="0">
+<longdesc lang="en">
+Script to execute by the admin plugin.
+Required since MySQL Proxy 0.8.1, if the admin plugin is loaded.
+Note: The admin plugin will be auto-loaded in case you specify an admin_* parameter.
+</longdesc>
+<shortdesc lang="en">MySQL Proxy admin plugin lua script</shortdesc>
+<content type="string" default="${OCF_RESKEY_admin_lua_script_default}" />
+</parameter>
+
+<parameter name="test_table" unique="0" required="0">
+<longdesc lang="en">
+Table to be tested in monitor statement (in database.table notation)
+</longdesc>
+<shortdesc lang="en">MySQL test table</shortdesc>
+<content type="string" default="${OCF_RESKEY_test_table_default}" />
+</parameter>
+
+<parameter name="test_user" unique="0" required="0">
+<longdesc lang="en">
+MySQL test user
+</longdesc>
+<shortdesc lang="en">MySQL test user</shortdesc>
+<content type="string" default="${OCF_RESKEY_test_user_default}" />
+</parameter>
+
+<parameter name="test_passwd" unique="0" required="0">
+<longdesc lang="en">
+MySQL test user password
+</longdesc>
+<shortdesc lang="en">MySQL test user password</shortdesc>
+<content type="string" default="${OCF_RESKEY_test_passwd_default}" />
+</parameter>
+
+<parameter name="parameters" unique="0" required="0">
+<longdesc lang="en">
+The MySQL Proxy daemon may be called with additional parameters.
+Specify any of them here.
+</longdesc>
+<shortdesc lang="en">MySQL Proxy additional parameters</shortdesc>
+<content type="string" default="${OCF_RESKEY_parameters_default}" />
+</parameter>
+
+<parameter name="pidfile" unique="1" required="0">
+<longdesc lang="en">PID file</longdesc>
+<shortdesc lang="en">PID file</shortdesc>
+<content type="string" default="${OCF_RESKEY_pidfile_default}" />
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start" timeout="30s" />
+<action name="stop" timeout="30s" />
+<action name="reload" timeout="30s" />
+<action name="monitor" depth="0" timeout="20s" interval="60s" />
+<action name="validate-all" timeout="30s" />
+<action name="meta-data" timeout="5s" />
+</actions>
+</resource-agent>
+END
+}
+
+# Return 0 when signal 0 can be sent to pid $1; errors are discarded.
+isRunning()
+{
+    kill -s 0 "$1" 2>/dev/null
+}
+
+# Return 0 when the pid on the first line of ${pidfile} passes isRunning.
+# Returns non-zero when the pidfile is missing, empty, or the process is gone.
+mysqlproxy_status()
+{
+    local PID
+
+    if [ -f "${pidfile}" ]; then
+        # MySQL Proxy is probably running
+        PID=`head -n 1 "${pidfile}"`
+        if [ ! -z "$PID" ] ; then
+            isRunning "$PID"
+            return $?
+        fi
+    fi
+
+    # MySQL Proxy is not running
+    false
+}
+
+# Start action.
+# Builds the command line from the configured parameters, creates the PID
+# and socket directories if missing, and runs "$binary --daemon".
+# Returns OCF_SUCCESS when already running or when the binary exits 0,
+# OCF_ERR_GENERIC otherwise. No monitor is run after the launch.
+mysqlproxy_start()
+{
+    local PARAM_PREFIX OPTIONS
+    local p pa pba proba
+    local pid_dir socket_dir
+
+    # if MySQL Proxy is running return success
+    if mysqlproxy_status ; then
+        ocf_log info "MySQL Proxy already running."
+        return $OCF_SUCCESS
+    fi
+
+    PARAM_PREFIX=''
+
+    # MySQL Proxy plugins to load
+    # @TODO check if the plugins are actually available?
+    if ocf_is_true $plugin_support; then
+        for p in $plugins; do
+            PARAM_PREFIX="$PARAM_PREFIX --plugins=$p"
+        done
+    fi
+
+    # check if the MySQL Proxy defaults-file exist
+    if [ -f "$defaults_file" ]; then
+        PARAM_PREFIX="$PARAM_PREFIX --defaults-file=$defaults_file"
+    fi
+
+    # set log-level
+    if [ ! -z "$log_level" ]; then
+        PARAM_PREFIX="$PARAM_PREFIX --log-level=$log_level"
+    fi
+
+    # set keepalive
+    if [ "$keepalive" = "true" ]; then
+        PARAM_PREFIX="$PARAM_PREFIX --keepalive"
+    fi
+
+    # honor admin_* options
+    if [ ! -z "$admin_username" ]; then
+        PARAM_PREFIX="$PARAM_PREFIX --admin-username=$admin_username"
+    fi
+    if [ ! -z "$admin_password" ]; then
+        PARAM_PREFIX="$PARAM_PREFIX --admin-password=$admin_password"
+    fi
+    if [ ! -z "$admin_lua_script" ]; then
+        PARAM_PREFIX="$PARAM_PREFIX --admin-lua-script=$admin_lua_script"
+    fi
+
+    # make sure that the pid directory exists
+    pid_dir=`dirname $pidfile`
+    if [ ! -d $pid_dir ] ; then
+        ocf_log info "Creating PID directory '$pid_dir'."
+        mkdir -p $pid_dir
+        #chown $OCF_RESKEY_user:$OCF_RESKEY_group $pid_dir # c/p from mysql ra; currently not needed
+    fi
+
+    # split multiple proxy-address options.
+    # currently unsupported but let us hope for the future ;)
+    for pa in $proxy_address; do
+        [ -z "$pa" ] && continue
+        OPTIONS=" $OPTIONS --proxy-address=$pa"
+
+        # if $pa contains a slash, we are dealing with a socket
+        # make sure that the socket directory exists
+        if echo "$pa" | grep -q '/' ; then
+            socket_dir=`dirname $pa`
+            if [ ! -d $socket_dir ] ; then
+                ocf_log info "Creating socket directory '$socket_dir'."
+                mkdir -p $socket_dir
+                #chown $OCF_RESKEY_user:$OCF_RESKEY_group $socket_dir # c/p from mysql ra; currently not needed
+            fi
+        fi
+    done
+
+    # split multiple proxy-backend-addresses options.
+    for pba in $proxy_backend_addresses; do
+        [ -z "$pba" ] && continue
+        OPTIONS=" $OPTIONS --proxy-backend-addresses=$pba"
+    done
+
+    # split multiple proxy-read-only-backend-addresses options.
+    for proba in $proxy_read_only_backend_addresses; do
+        [ -z "$proba" ] && continue
+        OPTIONS=" $OPTIONS --proxy-read-only-backend-addresses=$proba"
+    done
+
+    # build $OPTIONS and add admin-address and pidfile
+    OPTIONS="$PARAM_PREFIX $OPTIONS --admin-address=$admin_address --pid-file=${pidfile}"
+
+    # add additional parameters
+    if [ -n "$parameters" ]; then
+        OPTIONS="$OPTIONS $parameters"
+    fi
+
+    # start MySQL Proxy
+    #start-stop-daemon --start --quiet --pidfile $pidfile --make-pidfile --name mysql-proxy --startas $binary -b -- $OPTIONS
+    $binary --daemon $OPTIONS
+    ret=$?
+
+    if [ $ret -ne 0 ]; then
+        ocf_log err "MySQL Proxy returned error: " $ret
+        return $OCF_ERR_GENERIC
+    fi
+
+    # @TODO add an initial monitoring action?
+
+    return $OCF_SUCCESS
+}
+
+
+# Stop action.
+# Sends the default kill signal to the pid in ${pidfile}, waits one second
+# and re-checks; then removes leftover socket files and the pidfile.
+# Returns OCF_SUCCESS when stopped (or not running to begin with),
+# OCF_ERR_GENERIC when kill fails or the process is still there after 1 s.
+mysqlproxy_stop()
+{
+    local ret
+    local pa
+
+    if mysqlproxy_status ; then
+        #start-stop-daemon --stop --quiet --retry 3 --exec $binary --pidfile $pidfile
+        /bin/kill `cat "${pidfile}"`
+        ret=$?
+
+        if [ $ret -ne 0 ]; then
+            ocf_log err "MySQL Proxy returned an error while stopping: " $ret
+            return $OCF_ERR_GENERIC
+        fi
+
+        # grant some time for shutdown and recheck
+        sleep 1
+        if mysqlproxy_status ; then
+            ocf_log err "MySQL Proxy failed to stop."
+            return $OCF_ERR_GENERIC
+        fi
+
+        # remove dangling socketfile, if specified
+        for pa in $proxy_address; do
+            if [ -S "$pa" ]; then
+                ocf_log info "Removing dangling socket file '$pa'."
+                rm -f "$pa"
+            fi
+        done
+
+        # remove dangling pidfile
+        if [ -f "${pidfile}" ]; then
+            ocf_log info "Removing dangling pidfile '${pidfile}'."
+            rm -f "${pidfile}"
+        fi
+    fi
+
+    return $OCF_SUCCESS
+}
+
+# Reload action: send SIGHUP to the running proxy.
+# Does nothing when the proxy is not running; in that case the function
+# returns the non-zero status of the failed status check.
+mysqlproxy_reload()
+{
+    # @TODO check if pidfile is empty
+    # PID=`head -n 1 "${pidfile}"`
+    # if [ ! -z "$PID" ] ; then
+
+    if mysqlproxy_status; then
+        ocf_log info "Reloading MySQL Proxy."
+        kill -s HUP `cat ${pidfile}`
+    fi
+}
+
+# Monitor action.
+# On a probe (monitor interval 0 or unset) the configuration is validated
+# first. Returns OCF_NOT_RUNNING when the proxy is down; when
+# OCF_CHECK_LEVEL is 20 the in-depth check mysqlproxy_monitor_20 is run too.
+mysqlproxy_monitor()
+{
+    local rc
+
+    if [ "${OCF_RESKEY_CRM_meta_interval:-0}" -eq "0" ]; then
+        # in case of probe, monitor operation is surely treated as
+        # under suspension. This will call start operation.
+        # (c/p from ocf:heartbeat:sfex)
+        mysqlproxy_validate_all
+        rc=$?
+        [ $rc -ne 0 ] && return $rc
+    fi
+
+    if ! mysqlproxy_status ; then
+        return $OCF_NOT_RUNNING
+    fi
+
+    if [ $OCF_CHECK_LEVEL -eq 20 ]; then
+        mysqlproxy_monitor_20
+        rc=$?
+        [ $rc -ne 0 ] && return $rc
+    fi
+
+    return $OCF_SUCCESS
+}
+
+# In-depth monitor: run "SELECT COUNT(*)" on the test table through every
+# configured proxy address (socket or host:port).
+# Returns OCF_SUCCESS when every query succeeds or when the test
+# configuration is incomplete (a warning is logged), OCF_ERR_GENERIC on the
+# first failing query.
+mysqlproxy_monitor_20()
+{
+    local rc
+    local mysql_options pa
+    local mysql_server_parameter mysql_server_host mysql_server_port
+
+    # NOTE(review): in test(1) "-a" binds tighter than "-o", so this reads
+    # "table empty OR (user empty AND passwd empty)" -- confirm that a
+    # missing user with a password set is really meant to pass.
+    if [ -z "$OCF_RESKEY_test_table" -o -z "$OCF_RESKEY_test_user" -a -z "$OCF_RESKEY_test_passwd" ]; then
+        ocf_log warn "Missing proper configuration for OCF_CHECK_LEVEL=20 (test_table=[$OCF_RESKEY_test_table] test_user=[$OCF_RESKEY_test_user] test_password=[$OCF_RESKEY_test_passwd]). Not running in-depth monitoring."
+        return $OCF_SUCCESS
+    fi
+
+    mysql_options="--connect_timeout=10 --user=$OCF_RESKEY_test_user --password=$OCF_RESKEY_test_passwd"
+
+    # cycle each address
+    for pa in $proxy_address; do
+        # build correct connect parameter
+        if [ -S "$pa" ]; then
+            # we need to monitor a mysql socket
+            mysql_server_parameter="--socket=$pa"
+        else
+            # we need to monitor a host address
+            mysql_server_parameter=""
+
+            # split host:port
+            # @TODO correctly handle IPv6 address
+            # @TODO correctly handle 0.0.0.0 address
+            mysql_server_host=`echo $pa | cut -d : -f 1`
+            mysql_server_port=`echo $pa | cut -d : -f 2`
+
+            if [ -n "$mysql_server_host" ]; then
+                mysql_server_parameter="$mysql_server_parameter --host=$mysql_server_host"
+            fi
+            if [ -n "$mysql_server_port" ]; then
+                mysql_server_parameter="$mysql_server_parameter --port=$mysql_server_port"
+            fi
+        fi
+
+        # Check for test table
+        ocf_run $mysql $mysql_server_parameter $mysql_options \
+            -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table"
+        rc=$?
+
+        if [ $rc -ne 0 ]; then
+            ocf_log err "Failed to select from $OCF_RESKEY_test_table: " $rc
+            return $OCF_ERR_GENERIC
+        fi
+    done
+
+    return $OCF_SUCCESS
+}
+
+# Validate binaries and configuration.
+# Returns OCF_ERR_CONFIGURED for an invalid log level, OCF_SUCCESS
+# otherwise; exits (does not return) with OCF_ERR_CONFIGURED when a
+# required admin_* parameter is missing.
+# Side effect: for versions newer than 0.8.1 with any admin_* parameter
+# set, the global $plugins is overwritten with "proxy admin".
+mysqlproxy_validate_all()
+{
+    # local variables
+    local config_error=0
+
+    # check that the MySQL Proxy binary exists and can be executed
+    check_binary $binary
+
+    # check MySQL client binary only if in-depth monitoring is requested
+    # do not break backwards compatibility otherwise
+    if [ $OCF_CHECK_LEVEL -gt 0 ]; then
+        check_binary $mysql
+    fi
+
+    # check for valid log-level
+    echo $log_level | egrep -q "^(error|warning|info|message|debug|)$"
+    if [ $? -ne 0 ]; then
+        ocf_log err "MySQL Proxy log level '$log_level' not in valid range error|warning|info|message|debug"
+        return $OCF_ERR_CONFIGURED
+    fi
+
+
+    # if we're running MySQL Proxy > 0.8.1 and there is any admin parameter set,
+    # explicitly load the admin (and the proxy) plugin.
+    # (version 0.8.2 does not load the admin plugin by default anymore)
+    ocf_version_cmp "$version" "0.8.1"
+    ret=$?
+    if [ $ret -eq 2 ]; then
+        # simple check: concat all parameters and check if the string has non-zero length
+        if [ -n "$admin_username$admin_password$admin_lua_script$admin_address" ]; then
+            plugins="proxy admin"
+            has_plugin_admin=1
+        else
+            has_plugin_admin=0
+        fi
+    fi
+
+
+    # check for required admin_* parameters for 0.8.1 and 0.8.2 (with admin module)
+    # translated: if (version == 0.8.1 or (version > 0.8.1 and has_plugin_admin))
+    if [ $ret -eq 1 -o \( $ret -eq 2 -a $has_plugin_admin -eq 1 \) ]; then
+        if [ -z "$admin_username" ]; then
+            ocf_log err "Missing required parameter \"admin_username\""
+            config_error=1
+        fi
+        if [ -z "$admin_password" ]; then
+            ocf_log err "Missing required parameter \"admin_password\""
+            config_error=1
+        fi
+        if [ -z "$admin_lua_script" ]; then
+            ocf_log err "Missing required parameter \"admin_lua_script\""
+            config_error=1
+        fi
+
+        # check if the admin_lua_script, if specified, exists
+        # NOTE(review): unlike the checks above this one only logs; it does
+        # not set config_error -- confirm whether that is intentional.
+        if [ -n "$admin_lua_script" -a ! -e "$admin_lua_script" ]; then
+            ocf_log err "MySQL Proxy admin lua script '$admin_lua_script' does not exist or is not readable."
+        fi
+    fi
+
+    # issue a warning during start if the user wants to load a plugin
+    # but this version of MySQL Proxy does not support the plugin architecture.
+    if [ -n "$plugins" ] && ocf_is_false "$plugin_support" && [ $__OCF_ACTION = 'start' ]; then
+        ocf_log warn "You are running MySQL Proxy version '$version'. This version does not support the plugin architecture. Please use version 0.7.0 or later to load the plugins '$plugins'."
+    fi
+
+    # exit in case we have found relevant config errors
+    if [ $config_error -eq 1 ]; then
+        exit $OCF_ERR_CONFIGURED
+    fi
+
+    return $OCF_SUCCESS
+}
+
+#
+# Main
+#
+
+if [ $# -ne 1 ]; then
+    usage
+    exit $OCF_ERR_ARGS
+fi
+
+# Short global aliases for the OCF parameters; the functions above read these.
+pidfile=$OCF_RESKEY_pidfile
+binary=$OCF_RESKEY_binary
+defaults_file=$OCF_RESKEY_defaults_file
+proxy_backend_addresses=$OCF_RESKEY_proxy_backend_addresses
+proxy_read_only_backend_addresses=$OCF_RESKEY_proxy_read_only_backend_addresses
+admin_address=$OCF_RESKEY_admin_address
+admin_username=$OCF_RESKEY_admin_username
+admin_password=$OCF_RESKEY_admin_password
+admin_lua_script=$OCF_RESKEY_admin_lua_script
+proxy_address=$OCF_RESKEY_proxy_address
+log_level=$OCF_RESKEY_log_level
+keepalive=$OCF_RESKEY_keepalive
+# plugin list: one name per line, sorted, duplicates removed
+plugins=`echo $OCF_RESKEY_plugins | tr "[:space:]" "\n" | sort -u`
+mysql=$OCF_RESKEY_client_binary
+parameters=$OCF_RESKEY_parameters
+plugin_support=false
+has_plugin_admin=0 # 0 because this simplifies the if statements
+
+# debugging stuff
+#echo OCF_RESKEY_binary=$OCF_RESKEY_binary >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_defaults_file=$OCF_RESKEY_defaults_file >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_proxy_backend_addresses=$OCF_RESKEY_proxy_backend_addresses >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_proxy_read_only_backend_addresses=$OCF_RESKEY_proxy_read_only_backend_addresses >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_proxy_address=$OCF_RESKEY_proxy_address >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_log_level=$OCF_RESKEY_log_level >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_keepalive=$OCF_RESKEY_keepalive >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_admin_address=$OCF_RESKEY_admin_address >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_admin_username=$OCF_RESKEY_admin_username >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_admin_password=$OCF_RESKEY_admin_password >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_admin_lua_script=$OCF_RESKEY_admin_lua_script >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_parameters=$OCF_RESKEY_parameters >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+#echo OCF_RESKEY_pidfile=$OCF_RESKEY_pidfile >> /tmp/prox_conf_$OCF_RESOURCE_INSTANCE
+
+
+# handle some parameters before performing any additional checks
+case $1 in
+    meta-data)  meta_data
+                exit $?
+                ;;
+
+    usage)      usage
+                exit $OCF_SUCCESS
+                ;;
+esac
+
+
+# determine MySQL Proxy version
+check_binary $binary
+# last field of the "--version" output line that starts with "mysql-proxy"
+version=`$binary --version | grep ^mysql-proxy | awk '{print $NF}'`
+
+# version 0.7.0 (and later) support the plugin architecture and load the admin plugin by default
+# version 0.8.1 loads admin plugin by default and requires the admin parameters to be set
+# version 0.8.2 does not load the admin plugin by default anymore
+ocf_version_cmp "$version" "0.7.0"
+ret=$?
+if [ $ret -eq 1 -o $ret -eq 2 ]; then
+    plugin_support=true
+    has_plugin_admin=1
+fi
+
+
+# perform action
+case $1 in
+    start)      mysqlproxy_validate_all &&
+                mysqlproxy_start
+                exit $?
+                ;;
+
+    stop)       mysqlproxy_validate_all &&
+                mysqlproxy_stop
+                exit $?
+                ;;
+
+    reload)     mysqlproxy_reload
+                exit $?
+                ;;
+
+    status)     if mysqlproxy_status; then
+                    ocf_log info "MySQL Proxy is running."
+                    exit $OCF_SUCCESS
+                else
+                    ocf_log info "MySQL Proxy is stopped."
+                    exit $OCF_NOT_RUNNING
+                fi
+                ;;
+
+    monitor)    mysqlproxy_monitor
+                exit $?
+                ;;
+
+    validate-all)   mysqlproxy_validate_all
+                exit $?
+                ;;
+
+
+    *)          usage
+                exit $OCF_ERR_UNIMPLEMENTED
+                ;;
+esac
diff --git a/heartbeat/nagios b/heartbeat/nagios
new file mode 100755
index 0000000..a45d5b8
--- /dev/null
+++ b/heartbeat/nagios
@@ -0,0 +1,246 @@
+#!/bin/sh
+#
+# License: GNU General Public License (GPL)
+# (c) 2015 T.J. Yang, O.
Albrigtsen +# and Linux-HA contributors +# +# ----------------------------------------------------------------------------- +# O C F R E S O U R C E S C R I P T S P E C I F I C A T I O N +# ----------------------------------------------------------------------------- +# +# NAME +# nagios : OCF resource agent script for Nagios Server +# + +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_user_default="nagios" +OCF_RESKEY_group_default="nagios" +OCF_RESKEY_binary_default="/usr/sbin/nagios" +OCF_RESKEY_config_default="/etc/nagios/nagios.cfg" +OCF_RESKEY_log_default="/var/log/nagios/nagios.log" +OCF_RESKEY_retention_default="/var/log/nagios/retention.dat" +OCF_RESKEY_command_default="/var/log/nagios/rw/nagios.cmd" +OCF_RESKEY_pid_default="/var/run/nagios.pid" + +: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +: ${OCF_RESKEY_group=${OCF_RESKEY_group_default}} +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} +: ${OCF_RESKEY_log=${OCF_RESKEY_log_default}} +: ${OCF_RESKEY_retention=${OCF_RESKEY_retention_default}} +: ${OCF_RESKEY_command=${OCF_RESKEY_command_default}} +: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} + + +nagios_usage() { + cat <<END + usage: $0 (start|stop|validate-all|meta-data|help|usage|monitor) + $0 manages a Nagios instance as an OCF HA resource. + The 'start' operation starts the instance. + The 'stop' operation stops the instance. + The 'status' operation reports whether the instance is running + The 'monitor' operation reports whether the instance seems to be working + The 'validate-all' operation reports whether the parameters are valid +END +} + +nagios_meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="nagios" version="0.75"> +<version>1.0</version> + +<longdesc lang="en">OCF Resource script for Nagios 3.x or 4.x. 
It manages a Nagios instance as a HA resource.</longdesc> +<shortdesc lang="en">Nagios resource agent</shortdesc> + +<parameters> + +<parameter name="user"> + <longdesc lang="en">User running Nagios daemon (for file permissions)</longdesc> + <shortdesc lang="en">Nagios user</shortdesc> + <content type="string" default="${OCF_RESKEY_user_default}" /> +</parameter> + +<parameter name="group"> + <longdesc lang="en">Group running Nagios daemon (for file permissions)</longdesc> + <shortdesc lang="en">Nagios group</shortdesc> + <content type="string" default="${OCF_RESKEY_group_default}" /> +</parameter> + +<parameter name="binary"> + <longdesc lang="en">Location of the Nagios binary</longdesc> + <shortdesc lang="en">Nagios binary</shortdesc> + <content type="string" default="${OCF_RESKEY_binary_default}" /> +</parameter> + +<parameter name="config"> + <longdesc lang="en">Configuration file</longdesc> + <shortdesc lang="en">Nagios config</shortdesc> + <content type="string" default="${OCF_RESKEY_config_default}" /> +</parameter> + +<parameter name="log"> + <longdesc lang="en">Location of the Nagios log</longdesc> + <shortdesc lang="en">Nagios log</shortdesc> + <content type="string" default="${OCF_RESKEY_log_default}" /> +</parameter> + +<parameter name="retention"> + <longdesc lang="en">Location of the Nagios retention file</longdesc> + <shortdesc lang="en">Nagios retention file</shortdesc> + <content type="string" default="${OCF_RESKEY_retention_default}" /> +</parameter> + +<parameter name="command"> + <longdesc lang="en">Location of the Nagios external command file</longdesc> + <shortdesc lang="en">Nagios command file</shortdesc> + <content type="string" default="${OCF_RESKEY_command_default}" /> +</parameter> + +<parameter name="pid"> + <longdesc lang="en">Location of the Nagios pid/lock</longdesc> + <shortdesc lang="en">Nagios pid file</shortdesc> + <content type="string" default="${OCF_RESKEY_pid_default}" /> +</parameter> + +</parameters> + +<actions> +<action 
name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="status" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" start-delay="10s" /> +<action name="validate-all" timeout="20s" /> +<action name="meta-data" timeout="20s" /> +</actions> +</resource-agent> +END +} + + +nagios_start() { + nagios_validate_all + rc=$? + if [ $rc -ne 0 ]; then + return $rc + fi + + # if resource is already running,no need to continue code after this. + if nagios_monitor; then + ocf_log info "Nagios is already running" + return $OCF_SUCCESS + fi + + # Remove ${OCF_RESKEY_pid} if it exists + rm -f "${OCF_RESKEY_pid}" + + ocf_run -q touch ${OCF_RESKEY_log} ${OCF_RESKEY_retention} ${OCF_RESKEY_pid} + chown ${OCF_RESKEY_user}:${OCF_RESKEY_group} ${OCF_RESKEY_log} ${OCF_RESKEY_retention} ${OCF_RESKEY_pid} + rm -f "${OCF_RESKEY_command}" + + [ -x /sbin/restorecon ] && /sbin/restorecon ${OCF_RESKEY_pid} + ocf_run -q ${OCF_RESKEY_binary} -d ${OCF_RESKEY_config} + + while ! nagios_monitor; do + sleep 1 + done + + if [ $? -eq 0 ]; then + ocf_log info "Nagios started" + return ${OCF_SUCCESS} + fi + + return $OCF_SUCCESS +} + +nagios_stop() { + nagios_monitor + if [ $? -ne $OCF_SUCCESS ]; then + # Currently not running. Nothing to do. + ocf_log info "Resource is already stopped" + rm -f ${OCF_RESKEY_pid} + + return $OCF_SUCCESS + fi + + kill `cat ${OCF_RESKEY_pid}` + + # Wait for process to stop + while nagios_monitor; do + sleep 1 + done + + return $OCF_SUCCESS +} + +nagios_monitor(){ + ocf_pidfile_status ${OCF_RESKEY_pid} > /dev/null 2>&1 + case "$?" in + 0) + rc=$OCF_SUCCESS + ;; + 1|2) + rc=$OCF_NOT_RUNNING + ;; + *) + rc=$OCF_ERR_GENERIC + ;; + esac + return $rc +} + +nagios_validate_all(){ + check_binary "${OCF_RESKEY_binary}" + + if [ ! 
-f "${OCF_RESKEY_config}" ]; then
+        ocf_exit_reason "Configuration file ${OCF_RESKEY_config} not found"
+        return ${OCF_ERR_INSTALLED}
+    fi
+
+    # Run the binary with -v against the configuration file; a non-zero
+    # exit status is reported as a failed configuration check.
+    ${OCF_RESKEY_binary} -v ${OCF_RESKEY_config} >/dev/null 2>&1
+    if [ $? -ne 0 ]; then
+        ocf_exit_reason "Configuration check failed"
+        return ${OCF_ERR_INSTALLED}
+    fi
+    # No explicit return here: the function falls through with the status
+    # of the "if" statement above.
+}
+
+
+# **************************** MAIN SCRIPT ************************************
+
+# Make sure meta-data and usage always succeed
+# (they are handled before the root check below).
+case $__OCF_ACTION in
+meta-data)  nagios_meta_data
+            exit $OCF_SUCCESS
+            ;;
+usage|help) nagios_usage
+            exit $OCF_SUCCESS
+            ;;
+esac
+
+# This OCF agent script need to be run as root user.
+# The message goes to stdout and, at debug level, to the OCF log.
+if ! ocf_is_root; then
+    echo "$0 agent script need to be run as root user."
+    ocf_log debug "$0 agent script need to be run as root user."
+    exit $OCF_ERR_GENERIC
+fi
+
+# Translate each action into the appropriate function call
+# "status" and "monitor" share nagios_monitor; unknown actions print the
+# usage text and exit with OCF_ERR_UNIMPLEMENTED.
+case $__OCF_ACTION in
+start)          nagios_start;;
+stop)           nagios_stop;;
+status|monitor) nagios_monitor;;
+validate-all)   nagios_validate_all;;
+*)              nagios_usage
+                exit $OCF_ERR_UNIMPLEMENTED
+                ;;
+esac
+# Propagate the status of the function that handled the action.
+rc=$?
+
+exit $rc
+
+# End of this script
diff --git a/heartbeat/named b/heartbeat/named
new file mode 100755
index 0000000..f3a17e9
--- /dev/null
+++ b/heartbeat/named
@@ -0,0 +1,514 @@
+#!/bin/sh
+#
+# Description: Manages a named (Bind) server as an OCF High-Availability
+# resource
+#
+# Authors: Serge Dubrouski (sergeyfd@gmail.com)
+#
+# Copyright: 2011 Serge Dubrouski <sergeyfd@gmail.com>
+#
+# License: GNU General Public License (GPL)
+#
+###############################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +#Defaults +OCF_RESKEY_named_default="/usr/sbin/named" +OCF_RESKEY_rndc_default="/usr/sbin/rndc" +OCF_RESKEY_host_default="/usr/bin/host" +OCF_RESKEY_named_user_default=named +OCF_RESKEY_named_config_default="" +OCF_RESKEY_named_pidfile_default="/var/run/named/named.pid" +OCF_RESKEY_named_rootdir_default="" +OCF_RESKEY_named_options_default="" +OCF_RESKEY_named_keytab_file_default="" +OCF_RESKEY_rndc_options_default="" +OCF_RESKEY_host_options_default="" +OCF_RESKEY_monitor_request_default="localhost" +OCF_RESKEY_monitor_response_default="127.0.0.1" +OCF_RESKEY_monitor_ip_default="127.0.0.1" + +: ${OCF_RESKEY_named=${OCF_RESKEY_named_default}} +: ${OCF_RESKEY_rndc=${OCF_RESKEY_rndc_default}} +: ${OCF_RESKEY_host=${OCF_RESKEY_host_default}} +: ${OCF_RESKEY_named_user=${OCF_RESKEY_named_user_default}} +: ${OCF_RESKEY_named_config=${OCF_RESKEY_named_config_default}} +: ${OCF_RESKEY_named_pidfile=${OCF_RESKEY_named_pidfile_default}} +: ${OCF_RESKEY_named_rootdir=${OCF_RESKEY_named_rootdir_default}} +: ${OCF_RESKEY_named_options=${OCF_RESKEY_named_options_default}} +: ${OCF_RESKEY_named_keytab_file=${OCF_RESKEY_named_keytab_file_default}} +: ${OCF_RESKEY_rndc_options=${OCF_RESKEY_rndc_options_default}} +: ${OCF_RESKEY_host_options=${OCF_RESKEY_host_options_default}} +: ${OCF_RESKEY_monitor_request=${OCF_RESKEY_monitor_request_default}} +: ${OCF_RESKEY_monitor_response=${OCF_RESKEY_monitor_response_default}} +: ${OCF_RESKEY_monitor_ip=${OCF_RESKEY_monitor_ip_default}} + +usage() { + cat <<EOF + usage: $0 start|stop|reload|status|monitor|meta-data|validate-all|methods + + $0 manages named (Bind) server as an HA resource. + + The 'start' operation starts named server. + The 'stop' operation stops named server. + The 'reload' operation reload named configuration. + The 'status' operation reports whether named is up. + The 'monitor' operation reports whether named is running. 
+ The 'validate-all' operation reports whether parameters are valid. + The 'methods' operation reports on the methods $0 supports. +EOF + return $OCF_ERR_ARGS +} + +named_meta_data() { + cat <<EOF +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="named" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for named (Bind) server. It manages named as an HA resource. +</longdesc> +<shortdesc lang="en">Manages a named server</shortdesc> + +<parameters> +<parameter name="named" unique="0" required="0"> +<longdesc lang="en"> +Path to the named command. +</longdesc> +<shortdesc lang="en">named</shortdesc> +<content type="string" default="${OCF_RESKEY_named_default}" /> +</parameter> + +<parameter name="rndc" unique="0" required="0"> +<longdesc lang="en"> +Path to the rndc command. +</longdesc> +<shortdesc lang="en">rndc</shortdesc> +<content type="string" default="${OCF_RESKEY_rndc_default}" /> +</parameter> + +<parameter name="host" unique="0" required="0"> +<longdesc lang="en"> +Path to the host command. +</longdesc> +<shortdesc lang="en">host</shortdesc> +<content type="string" default="${OCF_RESKEY_host_default}" /> +</parameter> + +<parameter name="named_user" unique="0" required="0"> +<longdesc lang="en"> +User that should own named process. +</longdesc> +<shortdesc lang="en">named_user</shortdesc> +<content type="string" default="${OCF_RESKEY_named_user_default}" /> +</parameter> + +<parameter name="named_config" unique="1" required="0"> +<longdesc lang="en"> +Configuration file for named. +</longdesc> +<shortdesc lang="en">named_config</shortdesc> +<content type="string" default="${OCF_RESKEY_named_config_default}" /> +</parameter> + +<parameter name="named_pidfile" unique="1" required="0"> +<longdesc lang="en"> +PIDFILE file for named. 
+</longdesc> +<shortdesc lang="en">named_pidfile</shortdesc> +<content type="string" default="${OCF_RESKEY_named_pidfile_default}" /> +</parameter> + +<parameter name="named_rootdir" unique="1" required="0"> +<longdesc lang="en"> +Directory that named should use for chroot if any. +</longdesc> +<shortdesc lang="en">named_rootdir</shortdesc> +<content type="string" default="${OCF_RESKEY_named_rootdir_default}" /> +</parameter> + +<parameter name="named_options" unique="0" required="0"> +<longdesc lang="en"> +Options for named process if any. +</longdesc> +<shortdesc lang="en">named_options</shortdesc> +<content type="string" default="${OCF_RESKEY_named_options_default}" /> +</parameter> + +<parameter name="named_keytab_file" unique="0" required="0"> +<longdesc lang="en"> +named service keytab file (for GSS-TSIG). +</longdesc> +<shortdesc lang="en">named_keytab_file</shortdesc> +<content type="string" default="${OCF_RESKEY_named_keytab_file_default}" /> +</parameter> + +<parameter name="rndc_options" unique="0" required="0"> +<longdesc lang="en"> +Options for rndc process if any. +</longdesc> +<shortdesc lang="en">rndc_options</shortdesc> +<content type="string" default="${OCF_RESKEY_rndc_options_default}" /> +</parameter> + +<parameter name="host_options" unique="0" required="0"> +<longdesc lang="en"> +Options for host process if any. +</longdesc> +<shortdesc lang="en">host_options</shortdesc> +<content type="string" default="${OCF_RESKEY_host_options_default}" /> +</parameter> + +<parameter name="monitor_request" unique="0" required="0"> +<longdesc lang="en"> +Request that shall be sent to named for monitoring. Usually an A record in DNS. +</longdesc> +<shortdesc lang="en">monitor_request</shortdesc> +<content type="string" default="${OCF_RESKEY_monitor_request_default}" /> +</parameter> + +<parameter name="monitor_response" unique="0" required="0"> +<longdesc lang="en"> +Expected response from named server. 
+</longdesc> +<shortdesc lang="en">monitor_response</shortdesc> +<content type="string" default="${OCF_RESKEY_monitor_response_default}" /> +</parameter> + +<parameter name="monitor_ip" unique="0" required="0"> +<longdesc lang="en"> +IP Address where named listens. +</longdesc> +<shortdesc lang="en">monitor_ip</shortdesc> +<content type="string" default="${OCF_RESKEY_monitor_ip_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="60s" /> +<action name="stop" timeout="60s" /> +<action name="reload" timeout="60s" /> +<action name="status" timeout="10s" /> +<action name="monitor" depth="0" timeout="30s" interval="30s"/> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +<action name="methods" timeout="5s" /> +</actions> +</resource-agent> + +EOF +} + +# +# methods: What methods/operations do we support? +# + +named_methods() { + cat <<EOF + start + stop + status + monitor + methods + meta-data + validate-all +EOF +} + +# Validate most critical parameters +named_validate_all() { + check_binary $OCF_RESKEY_named + check_binary $OCF_RESKEY_rndc + check_binary $OCF_RESKEY_host + + if [ -n "$OCF_RESKEY_named_config" -a \ + ! -r "${OCF_RESKEY_named_rootdir}/${OCF_RESKEY_named_config}" ]; then + if ocf_is_probe; then + ocf_log info "Configuration file ${OCF_RESKEY_named_rootdir}/${OCF_RESKEY_named_config} not readable during probe." + else + ocf_exit_reason "Configuration file ${OCF_RESKEY_named_rootdir}/${OCF_RESKEY_named_config} doesn't exist" + return $OCF_ERR_INSTALLED + fi + fi + + getent passwd $OCF_RESKEY_named_user >/dev/null 2>&1 + if [ ! $? 
-eq 0 ]; then
+        ocf_exit_reason "User $OCF_RESKEY_named_user doesn't exist";
+        return $OCF_ERR_INSTALLED;
+    fi
+
+    if [ -z "$OCF_RESKEY_monitor_request" -o \
+         -z "$OCF_RESKEY_monitor_response" -o \
+         -z "$OCF_RESKEY_monitor_ip" ]; then
+        ocf_exit_reason "None of monitor_request, monitor_response, and monitor_ip can be empty"
+        return $OCF_ERR_CONFIGURED
+    fi
+
+    # make sure that the pidfile directory exists
+    ocf_mkstatedir $OCF_RESKEY_named_user 755 `dirname $OCF_RESKEY_named_pidfile` || return $OCF_ERR_INSTALLED
+
+    return $OCF_SUCCESS
+}
+
+##
+# Attempt to generate a /etc/rndc.key if one is not present
+#
+# Best effort: every failure is only logged at info level, because the
+# key may also be configured directly in named.conf.
+##
+rndc_key_generator()
+{
+    local rndc_options="-a -r /dev/urandom -u $OCF_RESKEY_named_user"
+
+    if [ -s /etc/rndc.key ]; then
+        # file already exists
+        return
+    fi
+
+    if ! have_binary "rndc-confgen"; then
+        # can't autogen key... Report this, but not as a warning or error.
+        # It is possible that the user configured the key in named.conf
+        ocf_log info "rndc-confgen tool not present, unable to autogen /etc/rndc.key."
+        return
+    fi
+
+    # Pass the chroot directory on to rndc-confgen. The parameter of this
+    # agent is "named_rootdir"; as in named_start, "/" means no chroot.
+    if [ -n "$OCF_RESKEY_named_rootdir" -a "x${OCF_RESKEY_named_rootdir}" != "x/" ]; then
+        rndc_options="$rndc_options -t $OCF_RESKEY_named_rootdir"
+    fi
+
+    # $rndc_options is deliberately unquoted so it splits into arguments.
+    rndc-confgen $rndc_options > /dev/null 2>&1;
+    if [ $? -eq 0 ]; then
+        if have_binary "restorecon"; then
+            restorecon /etc/rndc.key
+        fi
+    else
+        ocf_log info "failed to auto-generate /etc/rndc.key file."
+    fi
+}
+
+#
+# named_getpid. Get pid of named process with a given parameters.
+#
+# Builds a pgrep -f pattern from the named binary path plus the "-t" and
+# "-c" options when a chroot or configuration file is set, and prints the
+# matching pid(s). Note that it sets the global "pid".
+#
+
+named_getpid () {
+    local pattern="$OCF_RESKEY_named"
+
+    if [ -n "$OCF_RESKEY_named_rootdir" -a "x${OCF_RESKEY_named_rootdir}" != "x/" ]; then
+        pattern="$pattern.*-t $OCF_RESKEY_named_rootdir"
+    fi
+
+    if [ -n "$OCF_RESKEY_named_config" ]; then
+        pattern="$pattern.*-c $OCF_RESKEY_named_config"
+    fi
+
+    pid=`pgrep -f "$pattern"`
+    echo $pid
+}
+
+#
+# named_status. Simple check of the status of named process by pidfile.
+#
+# Returns the ocf_pidfile_status result unchanged.
+#
+
+named_status () {
+    ocf_pidfile_status ${OCF_RESKEY_named_pidfile} >/dev/null 2>&1
+}
+
+#
+# named_monitor. 
Send a request to named and check response.
+#
+# Returns OCF_NOT_RUNNING when named_status fails, OCF_ERR_GENERIC when
+# the host command fails or its output does not contain
+# "... has ...address <monitor_response>", and OCF_SUCCESS otherwise.
+#
+
+named_monitor() {
+    local output
+
+    if ! named_status
+    then
+        ocf_log info "named is down"
+        return $OCF_NOT_RUNNING
+    fi
+
+    # Query monitor_request against the server at monitor_ip.
+    output=`$OCF_RESKEY_host $OCF_RESKEY_host_options $OCF_RESKEY_monitor_request $OCF_RESKEY_monitor_ip`
+
+    # NOTE(review): monitor_response is used as an unanchored regular
+    # expression, so "127.0.0.1" also matches e.g. "127.0.0.10".
+    if [ $? -ne 0 ] || ! echo $output | grep -q '.* has .*address '"$OCF_RESKEY_monitor_response"
+    then
+        ocf_exit_reason "named didn't answer properly for $OCF_RESKEY_monitor_request."
+        ocf_log err "Expected: $OCF_RESKEY_monitor_response."
+        ocf_log err "Got: $output"
+        return $OCF_ERR_GENERIC
+    fi
+
+    return $OCF_SUCCESS
+}
+
+#
+# Reload
+#
+# Runs "rndc reload"; returns OCF_ERR_GENERIC when rndc fails.
+#
+
+named_reload() {
+    $OCF_RESKEY_rndc $OCF_RESKEY_rndc_options reload >/dev/null || return $OCF_ERR_GENERIC
+
+    return $OCF_SUCCESS
+}
+
+#
+# Start
+#
+# Starts named (optionally chrooted), writes the pidfile if named did not
+# create one, then waits until named_monitor succeeds.
+#
+
+named_start() {
+    local root_dir_opt
+    local pid
+
+    root_dir_opt=""
+    # Already running according to the pidfile: nothing to do.
+    named_status && return $OCF_SUCCESS
+
+    # Remove pidfile if exists
+    rm -f ${OCF_RESKEY_named_pidfile}
+
+    # A rootdir of "/" is treated the same as no chroot at all.
+    if [ -n "${OCF_RESKEY_named_rootdir}" -a "x${OCF_RESKEY_named_rootdir}" != "x/" ]
+    then
+        root_dir_opt="-t ${OCF_RESKEY_named_rootdir}"
+        # Copy the host's time zone file into the chroot.
+        [ -s /etc/localtime ] && cp -fp /etc/localtime ${OCF_RESKEY_named_rootdir}/etc/localtime
+    fi
+
+    # The configuration file is passed by prepending "-c <file>" to the
+    # user-supplied options.
+    if [ -n "$OCF_RESKEY_named_config" ]; then
+        OCF_RESKEY_named_options="-c $OCF_RESKEY_named_config $OCF_RESKEY_named_options"
+    fi
+
+    rndc_key_generator
+
+    if ! ${OCF_RESKEY_named} -u ${OCF_RESKEY_named_user} $root_dir_opt ${OCF_RESKEY_named_options}
+    then
+        ocf_exit_reason "named failed to start."
+        return $OCF_ERR_GENERIC
+    fi
+
+
+    # Look the process up by command line; no match is treated as a
+    # failed start.
+    pid=`named_getpid`
+
+    if [ -n "$pid" ]; then
+        # Only write the pidfile when named did not create it itself.
+        if [ ! -e ${OCF_RESKEY_named_pidfile} ]; then
+            echo $pid > ${OCF_RESKEY_named_pidfile}
+        fi
+    else
+        ocf_exit_reason "named failed to start. Probably error in configuration."
+        return $OCF_ERR_GENERIC
+    fi
+
+    # This loop has no upper bound of its own; presumably it is limited by
+    # the cluster manager's operation timeout - TODO confirm.
+    while :
+    do
+        named_monitor && break
+        sleep 1
+        ocf_log debug "named hasn't started yet."
+    done
+    ocf_log info "named has started."
+
+    return $OCF_SUCCESS
+}
+
+#
+# Stop
+#
+# Stops named via "rndc stop" (SIGTERM as fallback), waits for the
+# pidfile-based status check to fail, and sends SIGKILL when named is
+# still up after the wait. Always returns OCF_SUCCESS.
+#
+
+named_stop () {
+    local waited
+    local max_wait
+
+    # Nothing to do when the pidfile shows no live process.
+    if ! named_status; then
+        return $OCF_SUCCESS
+    fi
+
+    # Ask for an orderly shutdown first; fall back to a plain kill.
+    if ! $OCF_RESKEY_rndc $OCF_RESKEY_rndc_options stop >/dev/null; then
+        ocf_log info "rndc stop failed. Killing named."
+        kill `cat ${OCF_RESKEY_named_pidfile}`
+    fi
+
+    # Allow 2/3 of the action timeout for the orderly shutdown
+    # (The origin unit is ms, hence the conversion)
+    max_wait=20
+    if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
+        max_wait=$((OCF_RESKEY_CRM_meta_timeout/1500))
+    fi
+
+    # Sleep here for 1 sec to let rndc finish.
+    sleep 1
+    waited=0
+    while named_status && [ $waited -lt $max_wait ]; do
+        sleep 1
+        waited=$((waited + 1))
+        ocf_log debug "named appears to hung, waiting ..."
+    done
+
+    #If still up
+    if named_status 2>&1; then
+        ocf_exit_reason "named is still up! Killing"
+        kill -9 `cat ${OCF_RESKEY_named_pidfile}`
+    fi
+
+    rm -f ${OCF_RESKEY_named_pidfile}
+    return $OCF_SUCCESS
+}
+
+
+# Main part
+
+if [ $# -ne 1 ]; then
+    usage
+    exit $OCF_ERR_GENERIC
+fi
+
+# These two actions are answered before any validation takes place.
+case "$1" in
+    methods)    named_methods
+                exit $?;;
+
+    meta-data)  named_meta_data
+                exit $OCF_SUCCESS;;
+esac
+
+named_validate_all
+rc=$? 
+ +[ "$1" = "validate-all" ] && exit $rc + +if [ $rc -ne 0 ] +then + case "$1" in + stop) exit $OCF_SUCCESS;; + monitor) exit $OCF_NOT_RUNNING;; + status) exit $OCF_NOT_RUNNING;; + *) exit $rc;; + esac +fi + +if [ `id -u` -ne 0 ]; then + ocf_exit_reason "$0 must be run as root" + exit $OCF_ERR_GENERIC +fi + +case "$1" in + status) if named_status + then + ocf_log info "named is up" + exit $OCF_SUCCESS + else + ocf_log info "named is down" + exit $OCF_NOT_RUNNING + fi;; + + monitor) named_monitor + exit $?;; + + start) named_start + exit $?;; + + stop) named_stop + exit $?;; + reload) named_reload + exit $?;; + *) + exit $OCF_ERR_UNIMPLEMENTED;; +esac + +# vim:ts=4:sw=4:et: diff --git a/heartbeat/nfsnotify.in b/heartbeat/nfsnotify.in new file mode 100644 index 0000000..6e49535 --- /dev/null +++ b/heartbeat/nfsnotify.in @@ -0,0 +1,330 @@ +#!@BASH_SHELL@ +# +# Copyright (c) 2014 David Vossel <davidvossel@gmail.com> +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 
+# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +. ${OCF_FUNCTIONS_DIR}/ocf-directories + +# Parameter defaults + +OCF_RESKEY_source_host_default="" +OCF_RESKEY_notify_args_default="" + +: ${OCF_RESKEY_source_host=${OCF_RESKEY_source_host_default}} +: ${OCF_RESKEY_notify_args=${OCF_RESKEY_notify_args_default}} + +####################################################################### + +sbindir=$HA_SBIN_DIR +if [ -z "$sbindir" ]; then + sbindir=/usr/sbin +fi + +SELINUX_ENABLED=-1 + +NFSNOTIFY_TMP_DIR="${HA_RSCTMP}/nfsnotify_${OCF_RESOURCE_INSTANCE}/" +HA_STATD_PIDFILE="$NFSNOTIFY_TMP_DIR/rpc.statd_${OCF_RESOURCE_INSTANCE}.pid" +HA_STATD_PIDFILE_PREV="$NFSNOTIFY_TMP_DIR/rpc.statd_${OCF_RESOURCE_INSTANCE}.pid.prev" +STATD_PATH="/var/lib/nfs/statd" +SM_NOTIFY_BINARY="${sbindir}/sm-notify" +IS_RENOTIFY=0 + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="nfsnotify" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This agent sends NFSv3 reboot notifications to clients which informs clients to reclaim locks. +</longdesc> +<shortdesc lang="en">sm-notify reboot notifications</shortdesc> + +<parameters> + +<parameter name="source_host" unique="0" required="0"> +<longdesc lang="en"> +Comma separated list of floating IP addresses or host names that clients use +to access the nfs service. This will be used to set the source address and +mon_name of the SN_NOTIFY reboot notifications. +</longdesc> +<shortdesc lang="en">source IP addresses</shortdesc> +<content type="string" default="${OCF_RESKEY_source_host_default}" /> +</parameter> + +<parameter name="notify_args" unique="0" required="0"> +<longdesc lang="en"> +Additional arguments to send to the sm-notify command. By default +this agent will always set sm-notify's '-f' option. 
When the +source_host option is set, the '-v' option will be used automatically +to set the proper source address. Any additional sm-notify arguments +set with this option will be used in addition to the previous default +arguments. +</longdesc> +<shortdesc lang="en">sm-notify arguments</shortdesc> +<content type="string" default="${OCF_RESKEY_notify_args_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="90s" /> +<action name="stop" timeout="90s" /> +<action name="monitor" timeout="90s" interval="30s" depth="0" /> +<action name="reload" timeout="90s" /> +<action name="meta-data" timeout="10s" /> +<action name="validate-all" timeout="20s" /> +</actions> +</resource-agent> +END +} + +v3notify_usage() +{ + cat <<END +usage: $0 {start|stop|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +v3notify_validate() +{ + # check_binary will exit with OCF_ERR_INSTALLED when binary is missing + check_binary "$SM_NOTIFY_BINARY" + check_binary "pgrep" + check_binary "killall" + + return $OCF_SUCCESS +} + +killall_smnotify() +{ + # killall sm-notify + killall -TERM $SM_NOTIFY_BINARY > /dev/null 2>&1 + if [ $? -eq 0 ]; then + # it is useful to know if sm-notify processes were actually left around + # or not during the stop/start operation. Whether this condition is true + # or false does not indicate a failure. It does indicate that + # there are probably some unresponsive nfs clients out there that are keeping + # the sm-notify processes retrying. + ocf_log info "previous sm-notify processes terminated before $__OCF_ACTION action." + fi +} + +v3notify_stop() +{ + killall_smnotify + + rm -f $HA_STATD_PIDFILE_PREV > /dev/null 2>&1 + mv $HA_STATD_PIDFILE $HA_STATD_PIDFILE_PREV > /dev/null 2>&1 + + return $OCF_SUCCESS +} + +check_statd_pidfile() +{ + local binary="rpc.statd" + local pidfile="$HA_STATD_PIDFILE" + + ocf_log debug "Checking status for ${binary}." 
+    if [ -e "$pidfile" ]; then
+        # The recorded pid must still belong to a process whose command
+        # line contains the binary name.
+        cat /proc/$(cat $pidfile)/cmdline 2>/dev/null | grep -a "${binary}" > /dev/null 2>&1
+        if [ $? -eq 0 ]; then
+            return $OCF_SUCCESS
+        fi
+
+        ocf_exit_reason "$(cat $pidfile) for $binary is no longer running, sm-notify needs to re-notify clients"
+        return $OCF_ERR_GENERIC
+    fi
+
+    # if we don't have a pid file for rpc.statd, we have not yet sent the notifications
+    return $OCF_NOT_RUNNING
+}
+
+# write_statd_pid: record the pid reported by pgrep for rpc.statd in the
+# agent's pid file. Returns OCF_SUCCESS when a pid was written,
+# OCF_NOT_RUNNING when pgrep found no process, and OCF_ERR_GENERIC on any
+# other pgrep status; the pid file is removed in the latter two cases.
+write_statd_pid()
+{
+    local binary="rpc.statd"
+    local pidfile="$HA_STATD_PIDFILE"
+    local pid
+
+    pid=$(pgrep ${binary})
+    case $? in
+        0)
+            ocf_log info "PID file (pid:${pid} at $pidfile) created for ${binary}."
+            mkdir -p $(dirname $pidfile)
+            echo "$pid" > $pidfile
+            return $OCF_SUCCESS;;
+        1)
+            rm -f "$pidfile" > /dev/null 2>&1
+            ocf_log info "$binary is not running"
+            return $OCF_NOT_RUNNING;;
+        *)
+            rm -f "$pidfile" > /dev/null 2>&1
+            ocf_exit_reason "Error encountered detecting pid status of $binary"
+            return $OCF_ERR_GENERIC;;
+    esac
+}
+
+# copy_statd SRC DEST: copy the sm, sm.bak and state entries of the statd
+# directory SRC into DEST (created when missing) without overwriting
+# existing files, then align ownership and, when SELinux is enabled, the
+# security label of DEST.
+copy_statd()
+{
+    local src=$1
+    local dest=$2
+
+    if ! [ -d "$dest" ]; then
+        mkdir -p "$dest"
+    fi
+
+    # Best effort: missing source entries are silently ignored.
+    cp -rpfn "$src/sm" "$src/sm.bak" "$src/state" "$dest" > /dev/null 2>&1
+
+    # make sure folder ownership and selinux labels stay consistent
+    # When using nfsnotify resources on the debian system, the statd user replaces the rpcuser user
+    # "id" resolves accounts through NSS and matches the exact name, unlike
+    # a substring grep on /etc/passwd. The owner:group form is used because
+    # the "owner.group" separator is obsolescent.
+    if id -u rpcuser > /dev/null 2>&1; then
+        chown rpcuser:rpcuser "$dest"
+    elif id -u statd > /dev/null 2>&1; then
+        chown statd "$dest"
+    fi
+
+    [ $SELINUX_ENABLED -eq 0 ] && chcon -R "$SELINUX_LABEL" "$dest"
+}
+
+v3notify_start()
+{
+    local rc=$OCF_SUCCESS
+    local cur_statd
+    local statd_backup
+    local is_renotify=0
+
+    # monitor, see if we need to notify or not
+    v3notify_monitor
+    if [ $? -eq 0 ]; then
+        return $OCF_SUCCESS
+    fi
+
+    # kill off any other sm-notify processes that might already be running.
+    killall_smnotify
+
+    # record the pid of rpc.statd. 
if this pid ever changes, we have to re-notify + write_statd_pid + rc=$? + if [ $rc -ne 0 ]; then + return $rc + fi + + # if the last time we ran nfs-notify, it was with the same statd process, + # consider this a re-notification. During re-notifications we do not let the + # sm-notify binary have access to the real statd directory. + if [ "$(cat $HA_STATD_PIDFILE)" = "$(cat $HA_STATD_PIDFILE_PREV 2>/dev/null)" ]; then + ocf_log info "Renotifying clients" + is_renotify=1 + fi + + statd_backup="$STATD_PATH/nfsnotify.bu" + copy_statd "$STATD_PATH" "$statd_backup" + + if [ -z "$OCF_RESKEY_source_host" ]; then + if [ "$is_renotify" -eq 0 ]; then + cur_statd="$STATD_PATH" + else + cur_statd="$statd_backup" + fi + ocf_log info "sending notifications on default source address." + $SM_NOTIFY_BINARY -f $OCF_RESKEY_notify_args -P $cur_statd + if [ $? -ne 0 ]; then + ocf_exit_reason "sm-notify execution failed, view syslog for more information" + return $OCF_ERR_GENERIC + fi + + return $OCF_SUCCESS + fi + + # do sm-notify for each ip + for ip in `echo ${OCF_RESKEY_source_host} | sed 's/,/ /g'`; do + + # have the first sm-notify use the actual statd directory so the + # notify list can be managed properly. + if [ "$is_renotify" -eq 0 ]; then + cur_statd="$STATD_PATH" + # everything after the first notify we are considering a renotification + # which means we don't use the real statd directory. + is_renotify=1 + else + # use our copied statd directory for the remaining ip addresses + cur_statd="$STATD_PATH/nfsnotify_${OCF_RESOURCE_INSTANCE}_${ip}" + copy_statd "$statd_backup" "$cur_statd" + fi + + ocf_log info "sending notifications with source address $ip" + $SM_NOTIFY_BINARY -f $OCF_RESKEY_notify_args -v $ip -P "$cur_statd" + if [ $? -ne 0 ]; then + ocf_exit_reason "sm-notify with source host set to [ $ip ] failed. 
view syslog for more information" + return $OCF_ERR_GENERIC + fi + done + + return $OCF_SUCCESS +} + +v3notify_monitor() +{ + # verify rpc.statd is up, and that the rpc.statd pid is the same one we + # found during the start. otherwise rpc.statd recovered and we need to notify + # again. + check_statd_pidfile +} + +case $__OCF_ACTION in + meta-data) meta_data + exit $OCF_SUCCESS;; + usage|help) v3notify_usage + exit $OCF_SUCCESS;; + *) + ;; +esac + +which restorecon > /dev/null 2>&1 && selinuxenabled +SELINUX_ENABLED=$? +if [ $SELINUX_ENABLED -eq 0 ]; then + export SELINUX_LABEL="$(ls -dZ $STATD_PATH | grep -o '\S\+:\S\+:\S\+')" +fi + +case $__OCF_ACTION in + start) v3notify_start;; + stop) v3notify_stop;; + monitor) v3notify_monitor;; + validate-all) v3notify_validate;; + *) v3notify_usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac + +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc + diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver new file mode 100755 index 0000000..8838195 --- /dev/null +++ b/heartbeat/nfsserver @@ -0,0 +1,1068 @@ +#!/bin/sh +# nfsserver +# +# Description: Manages nfs server as OCF resource +# by hxinwei@gmail.com +# License: GNU General Public License v2 (GPLv2) and later + + +# I don't know for certain whether all services actuall _need_ this, +# I know that at least nfs-server needs it. +# The rgmanager resource agent in rgmanager/src/resources/nfsserver.sh.in +# did the unshare for gssd and idmapd as well, even though it seems unclear why. +# Let's start with just the nfs-server, and add others if/when we have clear +# indication they need it. 
+#NFSD_RELATED_SYSTEMD_SERVICE_FOR_UNSHARE_UTS_NAMESPACE="nfs-idmapd.service nfs-mountd.service nfs-server.service nfsdcld.service rpc-gssd.service rpc-statd.service rpc-statd-notify.service rpcbind.service" +NFSD_RELATED_SYSTEMD_SERVICE_FOR_UNSHARE_UTS_NAMESPACE="nfs-server.service" +SYSTEMD_ENVIRONMENT_FILE_NFS_SERVER_SCOPE=/run/sysconfig/nfs-server-scope +SYSTEMD_UNSHARE_UTS_DROPIN=51-resource-agents-unshare-uts.conf + +if [ -n "$OCF_DEBUG_LIBRARY" ]; then + . $OCF_DEBUG_LIBRARY +else + : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +fi + +if is_redhat_based; then + . ${OCF_FUNCTIONS_DIR}/nfsserver-redhat.sh +fi + +DEFAULT_INIT_SCRIPT_LIST="/etc/init.d/nfsserver /etc/init.d/nfs /etc/init.d/nfs-kernel-server" +DEFAULT_INIT_SCRIPT="/etc/init.d/nfsserver" +for script in $DEFAULT_INIT_SCRIPT_LIST +do + if [ -f $script -a -x $script ]; then + DEFAULT_INIT_SCRIPT=$script + break + fi +done + +DEFAULT_NOTIFY_CMD=`which sm-notify` +DEFAULT_NOTIFY_CMD=${DEFAULT_NOTIFY_CMD:-"/sbin/sm-notify"} +DEFAULT_NOTIFY_FOREGROUND="false" +DEFAULT_RPCPIPEFS_DIR="/var/lib/nfs/rpc_pipefs" +EXEC_MODE=0 +SELINUX_ENABLED=-1 +STATD_PATH="/var/lib/nfs" +STATD_DIR="" + +nfsserver_meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="nfsserver" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Nfsserver helps one to manage the Linux nfs server as a failover-able resource in Linux-HA. +It depends on Linux specific NFS implementation details, so is considered not portable to other platforms yet. +</longdesc> + +<shortdesc lang="en">Manages an NFS server</shortdesc> + +<parameters> + +<parameter name="nfs_init_script" unique="0" required="0"> +<longdesc lang="en"> +The default init script shipped with the Linux distro. 
+The nfsserver resource agent offloads the start/stop/monitor +work to the init script because the procedure to start/stop/monitor +nfsserver varies on different Linux distro. In the event that this +option is not set, this agent will attempt to use an init script at +this location, ${DEFAULT_INIT_SCRIPT}, or detect a systemd unit-file +to use in the event that no init script is detected. +</longdesc> +<shortdesc lang="en"> +Init script for nfsserver +</shortdesc> +<content type="string" default="auto detected" /> +</parameter> + +<parameter name="nfsv4_only" unique="0" required="0"> +<longdesc lang="en"> +Run in NFSv4 only mode (rpc-statd and rpcbind services masked). +</longdesc> +<shortdesc lang="en"> +NFSv4 only mode. +</shortdesc> +<content type="boolean" default="false" /> +</parameter> + +<parameter name="nfs_no_notify" unique="0" required="0"> +<longdesc lang="en"> +Do not send reboot notifications to NFSv3 clients during server startup. +</longdesc> +<shortdesc lang="en"> +Disable NFSv3 server reboot notifications +</shortdesc> +<content type="boolean" default="false" /> +</parameter> + +<parameter name="nfs_notify_foreground" unique="0" required="0"> +<longdesc lang="en"> +Keeps the sm-notify attached to its controlling terminal and running in the foreground. +</longdesc> +<shortdesc lang="en"> +Keeps the notify tool running in the foreground. +</shortdesc> +<content type="boolean" default="$DEFAULT_NOTIFY_FOREGROUND" /> +</parameter> + +<parameter name="nfs_smnotify_retry_time" unique="0" required="0"> +<longdesc lang="en"> +Specifies the length of sm-notify retry time, in minutes, to continue retrying notifications to unresponsive hosts. +If this option is not specified, sm-notify attempts to send notifications for 15 minutes. Specifying a value of 0 +causes sm-notify to continue sending notifications to unresponsive peers until it is manually killed. +</longdesc> +<shortdesc lang="en"> +Specifies the length of sm-notify retry time (minutes). 
+</shortdesc> +<content type="integer" default="" /> +</parameter> + +<parameter name="nfs_server_scope" unique="0" required="0"> +<longdesc lang="en"> +RFC8881, 8.4.2.1 State Reclaim: + +If the server scope is different, the client should not attempt to +reclaim locks. In this situation, no lock reclaim is possible. +Any attempt to re-obtain the locks with non-reclaim operations is +problematic since there is no guarantee that the existing +filehandles will be recognized by the new server, or that if +recognized, they denote the same objects. It is best to treat the +locks as having been revoked by the reconfiguration event. + +For lock reclaim to even be attempted, we have to define and set the same +server scope for NFSD on all cluster nodes in the NFS failover cluster. + +This agent won't "guess" a suitable server scope name for you, you need to +explicitly specify this. But without it, NFSv4 lock reclaim after failover +won't work properly. Suggested value: the failover "service IP". +</longdesc> +<shortdesc lang="en"> +RFC8881 NFS server scope for (lock) state reclaim after failover. +</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="nfs_ip" unique="0" required="0"> +<longdesc lang="en"> +Comma separated list of floating IP addresses used to access the nfs service +</longdesc> +<shortdesc lang="en"> +IP addresses. +</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="nfs_shared_infodir" unique="0" required="0"> +<longdesc lang="en"> +The nfsserver resource agent will save nfs related information in this specific directory. +And this directory must be able to fail-over before nfsserver itself. +</longdesc> +<shortdesc lang="en"> +Directory to store nfs server related information. +</shortdesc> +<content type="string" default="" /> +</parameter> + +<parameter name="rpcpipefs_dir" unique="0" required="0"> +<longdesc lang="en"> +The mount point for the sunrpc file system. Default is $DEFAULT_RPCPIPEFS_DIR. 
+This script will mount (bind) nfs_shared_infodir on /var/lib/nfs/ (cannot be changed), +and this script will mount the sunrpc file system on $DEFAULT_RPCPIPEFS_DIR (default, can be changed by this parameter). +If you want to move only rpc_pipefs/ (e.g. to keep rpc_pipefs/ local) from default, please set this value. +</longdesc> +<shortdesc lang="en"> +The mount point for the sunrpc file system. +</shortdesc> +<content type="string" default="$DEFAULT_RPCPIPEFS_DIR" /> +</parameter> + +$( +is_redhat_based && nfsserver_redhat_meta_data +) + +</parameters> + +<actions> +<action name="start" timeout="40s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="30s" /> +</actions> +</resource-agent> +END + +return $OCF_SUCCESS +} + +nfsserver_usage() { + cat <<END + usage: $0 {start|stop|monitor|status|validate-all|meta-data} +END +} + +if [ $# -ne 1 ]; then + nfsserver_usage + exit $OCF_ERR_ARGS +fi + +case $__OCF_ACTION in + meta-data) nfsserver_meta_data + exit $OCF_SUCCESS + ;; + usage|help) nfsserver_usage + exit $OCF_SUCCESS + ;; + *) + ;; +esac + +fp="$OCF_RESKEY_nfs_shared_infodir" +: ${OCF_RESKEY_nfs_notify_cmd="$DEFAULT_NOTIFY_CMD"} +: ${OCF_RESKEY_nfs_notify_foreground="$DEFAULT_NOTIFY_FOREGROUND"} +: ${OCF_RESKEY_rpcpipefs_dir="$DEFAULT_RPCPIPEFS_DIR"} +OCF_RESKEY_rpcpipefs_dir=${OCF_RESKEY_rpcpipefs_dir%/} + +# Use statd folder if it exists +if [ -d "/var/lib/nfs/statd" ]; then + STATD_DIR="statd" + STATD_PATH="/var/lib/nfs/statd" +fi + +# SELinux information. We are taking the permissions from +# the current statd dir and applying it to the HA one that is +# being mounted in its place. +which restorecon > /dev/null 2>&1 && selinuxenabled +SELINUX_ENABLED=$? 
+if [ $SELINUX_ENABLED -eq 0 ]; then + export SELINUX_LABEL="$(ls -dZ $STATD_PATH | grep -o '\S\+:\S\+:\S\+')" +fi + +## +# EXEC_MODE values +# 1 user init script or default init script +# 2 systemd (with nfs-lock.service) +# 3 systemd (with rpc-statd.service) +# +# On error, this function will terminate the process +# with error code $OCF_ERR_INSTALLED +## +set_exec_mode() +{ + + ## + # If EXEC_MODE is already set, we don't need to run this function again. + ## + if [ $EXEC_MODE -ne 0 ]; then + return 0; + fi + + ## + # If the user defined an init script, It must exist for us to continue + ## + if [ -n "$OCF_RESKEY_nfs_init_script" ]; then + # check_binary will exit the process if init script does not exist + check_binary ${OCF_RESKEY_nfs_init_script} + EXEC_MODE=1 + return 0 + fi + + ## + # Check to see if the default init script exists, if so we'll use that. + ## + if which $DEFAULT_INIT_SCRIPT > /dev/null 2>&1; then + OCF_RESKEY_nfs_init_script=$DEFAULT_INIT_SCRIPT + EXEC_MODE=1 + return 0 + fi + + if which systemctl > /dev/null 2>&1; then + if systemctl --no-legend list-unit-files 'nfs-*' | grep nfs-server > /dev/null; then + + ## + # Attempt systemd (with nfs-lock.service). + ## + if systemctl --no-legend list-unit-files 'nfs-*' | grep nfs-lock > /dev/null; then + EXEC_MODE=2 + # when using systemd, the nfs-lock service file handles nfsv3 locking daemons for us. + return 0 + fi + + ## + # Attempt systemd (with rpc-statd.service). + ## + if systemctl --no-legend list-unit-files 'rpc-*' | grep rpc-statd > /dev/null; then + EXEC_MODE=3 + return 0 + fi + fi + fi + + ocf_exit_reason "No init script or systemd unit file detected for nfs server" + exit $OCF_ERR_INSTALLED +} + +## +# wrapper for init script and systemd calls. 
+## +nfs_exec() +{ + local cmd=$1 + local svc=$2 + set_exec_mode + + case $EXEC_MODE in + 1) if [ -z "$OCF_RESKEY_nfs_server_scope" ] ; then + ${OCF_RESKEY_nfs_init_script} $cmd + else + unshare -u /bin/sh -c "hostname ${OCF_RESKEY_nfs_server_scope}; exec ${OCF_RESKEY_nfs_init_script} $cmd" + fi ;; + 2) if ! echo $svc | grep -q "\."; then + svc="${svc}.service" + fi + systemctl -n0 $cmd $svc + ;; + 3) if ! echo $svc | grep -q "\."; then + svc="${svc}.service" + fi + systemctl -n0 $cmd $svc + ;; + esac +} + +v3locking_exec() +{ + local cmd=$1 + set_exec_mode + + if [ $EXEC_MODE -eq 2 ]; then + nfs_exec $cmd nfs-lock.service + elif [ $EXEC_MODE -eq 3 ]; then + nfs_exec $cmd rpc-statd.service + else + case $cmd in + start) locking_start;; + stop) locking_stop;; + status) locking_status;; + esac + fi +} + +nfsserver_systemd_monitor() +{ + local threads_num + local rc + local fn + + if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then + ocf_log debug "Status: rpcbind" + rpcinfo > /dev/null 2>&1 + rc=$? + if [ "$rc" -ne "0" ]; then + ocf_exit_reason "rpcbind is not running" + return $OCF_NOT_RUNNING + fi + + ocf_log debug "Status: nfs-mountd" + ps axww | grep -q "[r]pc.mountd" + rc=$? + if [ "$rc" -ne "0" ]; then + ocf_exit_reason "nfs-mountd is not running" + return $OCF_NOT_RUNNING + fi + fi + + ocf_log debug "Status: nfs-idmapd" + fn=`mktemp` + nfs_exec status nfs-idmapd > $fn 2>&1 + rc=$? + ocf_log debug "$(cat $fn)" + rm -f $fn + if [ "$rc" -ne "0" ]; then + ocf_exit_reason "nfs-idmapd is not running" + return $OCF_NOT_RUNNING + fi + + if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then + ocf_log debug "Status: rpc-statd" + rpcinfo -t localhost 100024 > /dev/null 2>&1 + rc=$? + if [ "$rc" -ne "0" ]; then + ocf_exit_reason "rpc-statd is not running" + return $OCF_NOT_RUNNING + fi + fi + + nfs_exec is-active nfs-server + rc=$? + + # Now systemctl is-active can't detect the failure of kernel process like nfsd. 
+ # So, if the return value of systemctl is-active is 0, check the threads number + # to make sure the process is running really. + # /proc/fs/nfsd/threads has the numbers of the nfsd threads. + if [ $rc -eq 0 ]; then + threads_num=`cat /proc/fs/nfsd/threads 2>/dev/null` + if [ $? -eq 0 ]; then + if [ $threads_num -gt 0 ]; then + return $OCF_SUCCESS + else + return 3 + fi + else + return $OCF_ERR_GENERIC + fi + fi + + return $rc +} + +nfsserver_monitor () +{ + local fn + + set_exec_mode + fn=`mktemp` + case $EXEC_MODE in + 1) nfs_exec status nfs-server > $fn 2>&1;; + [23]) nfsserver_systemd_monitor > $fn 2>&1;; + esac + rc=$? + ocf_log debug "$(cat $fn)" + rm -f $fn + + #Adapte LSB status code to OCF return code + if [ $rc -eq 0 ]; then + # don't report success if nfs servers are up + # without locking daemons. + ocf_is_true "$OCF_RESKEY_nfsv4_only" || v3locking_exec "status" + rc=$? + if [ $rc -ne 0 ]; then + ocf_exit_reason "NFS server is up, but the locking daemons are down" + rc=$OCF_ERR_GENERIC + fi + return $rc + elif [ $rc -eq 3 ] || [ $rc -eq $OCF_NOT_RUNNING ]; then + return $OCF_NOT_RUNNING + else + return $OCF_ERR_GENERIC + fi +} + +prepare_directory () +{ + if [ -z "$fp" ]; then + fp="/var/lib/nfs" + fi + + [ -d "$OCF_RESKEY_rpcpipefs_dir" ] || mkdir -p $OCF_RESKEY_rpcpipefs_dir + [ -d "$fp/v4recovery" ] || mkdir -p $fp/v4recovery + + [ -d "$fp/$STATD_DIR" ] || mkdir -p "$fp/$STATD_DIR" + [ -d "$fp/$STATD_DIR/sm" ] || mkdir -p "$fp/$STATD_DIR/sm" + [ -d "$fp/$STATD_DIR/sm.ha" ] || mkdir -p "$fp/$STATD_DIR/sm.ha" + [ -d "$fp/$STATD_DIR/sm.bak" ] || mkdir -p "$fp/$STATD_DIR/sm.bak" + [ -n "`id -u rpcuser 2>/dev/null`" -a "`id -g rpcuser 2>/dev/null`" ] && + chown -R rpcuser.rpcuser "$fp/$STATD_DIR" + + [ -f "$fp/etab" ] || touch "$fp/etab" + [ -f "$fp/xtab" ] || touch "$fp/xtab" + [ -f "$fp/rmtab" ] || touch "$fp/rmtab" + + dd if=/dev/urandom of=$fp/$STATD_DIR/state bs=1 count=4 >/dev/null 2>&1 + [ -n "`id -u rpcuser 2>/dev/null`" -a "`id -g rpcuser 
2>/dev/null`" ] && chown rpcuser.rpcuser "$fp/$STATD_DIR/state" + [ $SELINUX_ENABLED -eq 0 ] && chcon -R "$SELINUX_LABEL" "$fp" +} + +is_bound () +{ + if mount | grep -q "on $1 type"; then + return 0 + fi + + return 1 +} + +bind_tree () +{ + if [ -z "$fp" ]; then + return + fi + + [ -d "$fp" ] || mkdir -p $fp + + if is_bound /var/lib/nfs; then + ocf_log debug "$fp is already bound to /var/lib/nfs" + return 0 + fi + + case $EXEC_MODE in + [23]) if nfs_exec status var-lib-nfs-rpc_pipefs.mount > /dev/null 2>&1; then + ocf_log debug "/var/lib/nfs/rpc_pipefs already mounted. Unmounting in preparation to bind mount nfs dir" + systemctl stop var-lib-nfs-rpc_pipefs.mount + fi + ;; + esac + + mount --bind $fp /var/lib/nfs + [ $SELINUX_ENABLED -eq 0 ] && restorecon /var/lib/nfs +} + +unbind_tree () +{ + local i=1 + while `mount | grep -q " on $OCF_RESKEY_rpcpipefs_dir "` && [ "$i" -le 10 ]; do + ocf_log info "Stop: umount ($i/10 attempts)" + umount -t rpc_pipefs $OCF_RESKEY_rpcpipefs_dir + sleep 1 + i=$((i + 1)) + done + + if mount | grep -q " on $OCF_RESKEY_rpcpipefs_dir "; then + ocf_log err "Failed to unmount $OCF_RESKEY_rpcpipefs_dir" + return $OCF_ERR_GENERIC + fi + + if is_bound /var/lib/nfs; then + if ! umount /var/lib/nfs; then + ocf_log err "Failed to unmount /var/lib/nfs" + return $OCF_ERR_GENERIC + fi + fi + + return $OCF_SUCCESS +} + +binary_status() +{ + local binary=$1 + local pid + + pid=$(pgrep ${binary}) + case $? in + 0) + echo "$pid" + return $OCF_SUCCESS;; + 1) + return $OCF_NOT_RUNNING;; + *) + return $OCF_ERR_GENERIC;; + esac +} + +locking_status() +{ + binary_status "rpc.statd" > /dev/null 2>&1 +} + +locking_start() +{ + local ret=$OCF_SUCCESS + + ocf_log info "Starting rpc.statd." + + rpc.statd $STATDARG + + ret=$? 
+ if [ $ret -ne 0 ]; then + ocf_log err "Failed to start rpc.statd" + return $ret + fi + [ -d /var/lock/subsys ] && touch /var/lock/subsys/nfslock + + return $ret +} + +terminate() +{ + local pids + local i=0 + + while : ; do + pids=$(binary_status $1) + [ -z "$pids" ] && return 0 + kill $pids + sleep 1 + i=$((i + 1)) + [ $i -gt 3 ] && return 1 + done +} + + +killkill() +{ + local pids + local i=0 + + while : ; do + pids=$(binary_status $1) + [ -z "$pids" ] && return 0 + kill -9 $pids + sleep 1 + i=$((i + 1)) + [ $i -gt 3 ] && return 1 + done +} + +stop_process() +{ + local process=$1 + + ocf_log info "Stopping $process" + if terminate $process; then + ocf_log debug "$process is stopped" + else + if killkill $process; then + ocf_log debug "$process is stopped" + else + ocf_log debug "Failed to stop $process" + return 1 + fi + fi + return 0 +} + +locking_stop() +{ + ret=0 + + # sm-notify can prevent umount of /var/lib/nfs/statd if + # it is still trying to notify unresponsive clients. + stop_process sm-notify + if [ $? -ne 0 ]; then + ret=$OCF_ERR_GENERIC + fi + + stop_process rpc.statd + if [ $? 
-ne 0 ]; then + ret=$OCF_ERR_GENERIC + fi + + return $ret +} + +notify_locks() +{ + if ocf_is_true "$OCF_RESKEY_nfs_no_notify"; then + # we've been asked not to notify clients + return; + fi + + # run in foreground, if requested + if ocf_is_true "$OCF_RESKEY_nfs_notify_foreground"; then + opts="-d" + fi + + if [ -n "$OCF_RESKEY_nfs_smnotify_retry_time" ]; then + opts="$opts -m $OCF_RESKEY_nfs_smnotify_retry_time" + fi + + if [ -n "$OCF_RESKEY_statd_outgoing_port" ]; then + opts="$opts -p $OCF_RESKEY_statd_outgoing_port" + fi + + # forces re-notificaiton regardless if notifies have already gone out + opts="$opts -f" + + ocf_log info "executing sm-notify" + if [ -n "$OCF_RESKEY_nfs_ip" ]; then + for ip in `echo ${OCF_RESKEY_nfs_ip} | sed 's/,/ /g'`; do + cp -rpfn $STATD_PATH/sm.ha/* $STATD_PATH/ > /dev/null 2>&1 + sm-notify $opts -v $ip + done + else + sm-notify $opts + fi +} + +# Problem: https://github.com/ClusterLabs/resource-agents/issues/1644 +# RFC8881, 8.4.2.1 State Reclaim: +# +# | If the server scope is different, the client should not attempt to +# | reclaim locks. In this situation, no lock reclaim is possible. +# | Any attempt to re-obtain the locks with non-reclaim operations is +# | problematic since there is no guarantee that the existing +# | filehandles will be recognized by the new server, or that if +# | recognized, they denote the same objects. It is best to treat the +# | locks as having been revoked by the reconfiguration event. +# +# That's why for lock reclaim to even be attempted, we have to define and set +# the same server scope for NFSD on all cluster nodes in the NFS failover +# cluster. And in linux, that is done by setting the uts nodename for the +# command that starts the nfsd kernel threads. 
+# +inject_unshare_uts_name_into_systemd_units () +{ + local END_TAG="# END OF DROP-IN FOR NFS SERVER SCOPE" + local services + services=$(systemctl list-unit-files --no-legend $NFSD_RELATED_SYSTEMD_SERVICE_FOR_UNSHARE_UTS_NAMESPACE | cut -d ' ' -f1) + + local svc dir dropin edited_exec_start do_reload=false + local old_umask=$(umask) + umask 0022 + for svc in $services ; do + dir=/run/systemd/system/$svc.d + dropin=$dir/$SYSTEMD_UNSHARE_UTS_DROPIN + grep -sqF "$END_TAG" "$dropin" && continue + + test -d "$dir" || mkdir -p "$dir" + test -e "$dropin" && rm -f "$dropin" + + # NOTE: additional ExecStart= might exist in the drop-in files, eg. openSUSE + edited_exec_start=$(systemctl cat $svc | sed -ne "s#^ExecStart=\\([-+:!@]*\\)\\(.\+\\)#ExecStart=\\1/usr/bin/unshare --uts /bin/sh -c 'hostname \${NFS_SERVER_SCOPE}; exec \"\$@\"' -- \\2#p" | tail -1) + + cat > "$dropin" <<___ +[Service] +EnvironmentFile=$SYSTEMD_ENVIRONMENT_FILE_NFS_SERVER_SCOPE +# reset list of exec start, then re-populate with unshared uts namespace +ExecStart= +$edited_exec_start +$END_TAG +___ + do_reload=true + ocf_log debug "injected unshare --uts into $dropin" + done + + mkdir -p "${SYSTEMD_ENVIRONMENT_FILE_NFS_SERVER_SCOPE%/*}" + echo "NFS_SERVER_SCOPE=$OCF_RESKEY_nfs_server_scope" > "$SYSTEMD_ENVIRONMENT_FILE_NFS_SERVER_SCOPE" + umask $old_umask + + $do_reload && systemctl daemon-reload +} + +remove_unshare_uts_dropins () +{ + local services + services=$(systemctl list-unit-files --no-legend $NFSD_RELATED_SYSTEMD_SERVICE_FOR_UNSHARE_UTS_NAMESPACE | cut -d ' ' -f1) + + local svc dir dropin do_reload=false + for svc in $services ; do + dir=/run/systemd/system/$svc.d + dropin=$dir/$SYSTEMD_UNSHARE_UTS_DROPIN + test -e "$dropin" || continue + rm -f "$dropin" + do_reload=true + ocf_log debug "removed unshare --uts from $svc" + done + rm -f "${SYSTEMD_ENVIRONMENT_FILE_NFS_SERVER_SCOPE}" + $do_reload && systemctl daemon-reload +} + +nfsserver_start () +{ + local rc; + local fn + + if 
nfsserver_monitor; then + ocf_log debug "NFS server is already started" + return $OCF_SUCCESS + fi + + is_redhat_based && set_env_args + bind_tree + prepare_directory + + # Debian (and other systems) may provide "init scripts", + # which will only redirect back to systemd. + # If we just unshare --uts the init script invocation, + # the uts namespace is useless in that case. + # If systemd is running, mangle the nfs-server.service unit, + # independent of the "EXEC_MODE" we detected. + if $systemd_is_running ; then + if [ -z "$OCF_RESKEY_nfs_server_scope" ] ; then + remove_unshare_uts_dropins + else + inject_unshare_uts_name_into_systemd_units + fi + fi + + if ! `mount | grep -q " on $OCF_RESKEY_rpcpipefs_dir "`; then + mount -t rpc_pipefs sunrpc $OCF_RESKEY_rpcpipefs_dir + fi + + # remove the sm-notify pid so sm-notify will be allowed to run again without requiring a reboot. + rm -f /var/run/sm-notify.pid + # + # Synchronize these before starting statd + # + cp -rpfn $STATD_PATH/sm.ha/* $STATD_PATH/ > /dev/null 2>&1 + rm -rf $STATD_PATH/sm.ha/* > /dev/null 2>&1 + cp -rpf $STATD_PATH/sm $STATD_PATH/sm.bak /var/lib/nfs/state $STATD_PATH/sm.ha > /dev/null 2>&1 + + ocf_log info "Starting NFS server ..." + + # mounts /proc/fs/nfsd for us + lsmod | grep -q nfsd + if [ $? -ne 0 ]; then + modprobe nfsd + fi + + # systemd + case $EXEC_MODE in + [23]) if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then + nfs_exec start rpcbind + local i=1 + while : ; do + ocf_log info "Start: rpcbind i: $i" + rpcinfo > /dev/null 2>&1 + rc=$? + if [ "$rc" -eq "0" ]; then + break + fi + sleep 1 + i=$((i + 1)) + done + fi + ;; + esac + + if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then + # check to see if we need to start rpc.statd + v3locking_exec "status" + if [ $? -ne $OCF_SUCCESS ]; then + v3locking_exec "start" + rc=$? 
+ if [ $rc -ne 0 ]; then + ocf_exit_reason "Failed to start NFS server locking daemons" + return $rc + fi + else + ocf_log info "rpc.statd already up" + fi + fi + + # systemd + case $EXEC_MODE in + [23]) if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then + nfs_exec start nfs-mountd + local i=1 + while : ; do + ocf_log info "Start: nfs-mountd i: $i" + ps axww | grep -q "[r]pc.mountd" + rc=$? + if [ "$rc" -eq "0" ]; then + break + fi + sleep 1 + i=$((i + 1)) + done + fi + + nfs_exec start nfs-idmapd + local i=1 + while : ; do + ocf_log info "Start: nfs-idmapd i: $i" + fn=`mktemp` + nfs_exec status nfs-idmapd > $fn 2>&1 + rc=$? + ocf_log debug "$(cat $fn)" + rm -f $fn + if [ "$rc" -eq "0" ]; then + break + fi + sleep 1 + i=$((i + 1)) + done + + if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then + nfs_exec start rpc-statd + local i=1 + while : ; do + ocf_log info "Start: rpc-statd i: $i" + rpcinfo -t localhost 100024 > /dev/null 2>&1 + rc=$? + if [ "$rc" -eq "0" ]; then + break + fi + sleep 1 + i=$((i + 1)) + done + fi + esac + + + fn=`mktemp` + nfs_exec start nfs-server > $fn 2>&1 + rc=$? + ocf_log debug "$(cat $fn)" + rm -f $fn + + if [ $rc -ne 0 ]; then + ocf_exit_reason "Failed to start NFS server" + return $rc + fi + + tfn="/proc/fs/nfsd/threads" + if [ ! -f "$tfn" ] || [ "$(cat $tfn)" -le "0" ]; then + ocf_exit_reason "Failed to start NFS server: /proc/fs/nfsd/threads" + return $OCF_ERR_GENERIC + fi + + notify_locks + + ocf_log info "NFS server started" + return $OCF_SUCCESS +} + +nfsserver_stop () +{ + local fn + + ocf_log info "Stopping NFS server ..." + + # backup the current sm state information to the ha folder before stopping. + # the ha folder will be synced after startup, restoring the statd client state + rm -rf $STATD_PATH/sm.ha/* > /dev/null 2>&1 + cp -rpf $STATD_PATH/sm $STATD_PATH/sm.bak /var/lib/nfs/state $STATD_PATH/sm.ha > /dev/null 2>&1 + + fn=`mktemp` + nfs_exec stop nfs-server > $fn 2>&1 + rc=$? 
+ ocf_log debug "$(cat $fn)" + rm -f $fn + + if [ $rc -ne 0 ]; then + ocf_exit_reason "Failed to stop NFS server" + return $rc + fi + + # systemd + case $EXEC_MODE in + [23]) ocf_log info "Stop: threads" + tfn="/proc/fs/nfsd/threads" + while [ -f "$tfn" ] && [ "$(cat $tfn)" -gt "0" ]; do + ocf_log err "NFS server failed to stop: /proc/fs/nfsd/threads" + sleep 1 + done + + if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then + nfs_exec stop rpc-statd > /dev/null 2>&1 + ocf_log info "Stop: rpc-statd" + rpcinfo -t localhost 100024 > /dev/null 2>&1 + rc=$? + if [ "$rc" -eq "0" ]; then + ocf_exit_reason "Failed to stop rpc-statd" + return $OCF_ERR_GENERIC + fi + fi + + nfs_exec stop nfs-idmapd > /dev/null 2>&1 + ocf_log info "Stop: nfs-idmapd" + fn=`mktemp` + nfs_exec status nfs-idmapd > $fn 2>&1 + rc=$? + ocf_log debug "$(cat $fn)" + rm -f $fn + if [ "$rc" -eq "0" ]; then + ocf_exit_reason "Failed to stop nfs-idmapd" + return $OCF_ERR_GENERIC + fi + + if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then + nfs_exec stop nfs-mountd > /dev/null 2>&1 + ocf_log info "Stop: nfs-mountd" + ps axww | grep -q "[r]pc.mountd" + rc=$? + if [ "$rc" -eq "0" ]; then + ocf_exit_reason "Failed to stop nfs-mountd" + return $OCF_ERR_GENERIC + fi + fi + + if systemctl --no-legend list-unit-files "nfsdcld*" | grep -q nfsdcld; then + nfs_exec stop nfsdcld > /dev/null 2>&1 + ocf_log info "Stop: nfsdcld" + fn=`mktemp` + nfs_exec status nfsdcld > $fn 2>&1 + rc=$? + ocf_log debug "$(cat $fn)" + rm -f $fn + if [ "$rc" -eq "0" ]; then + ocf_exit_reason "Failed to stop nfsdcld" + return $OCF_ERR_GENERIC + fi + fi + esac + + + if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then + v3locking_exec "stop" + if [ $? -ne 0 ]; then + ocf_exit_reason "Failed to stop NFS locking daemons" + rc=$OCF_ERR_GENERIC + fi + fi + + # systemd + case $EXEC_MODE in + [23]) nfs_exec stop rpc-gssd > /dev/null 2>&1 + ocf_log info "Stop: rpc-gssd" + esac + + unbind_tree + rc=$? 
+ if [ "$rc" -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Failed to unmount a bind mount" + else + ocf_log info "NFS server stopped" + fi + + if $systemd_is_running; then + remove_unshare_uts_dropins + fi + + return $rc +} + +nfsserver_validate () +{ + ## + # set_exec_mode will exit if nfs server is not installed + ## + set_exec_mode + check_binary ${OCF_RESKEY_nfs_notify_cmd} + + + if [ -n "$OCF_RESKEY_CRM_meta_clone" ] && [ -n "$OCF_RESKEY_nfs_shared_infodir" ]; then + ocf_exit_reason "This RA does not support clone mode when a shared info directory is in use." + exit $OCF_ERR_CONFIGURED + fi + + if [ -n "$OCF_RESKEY_nfs_smnotify_retry_time" ]; then + if ! ocf_is_decimal "$OCF_RESKEY_nfs_smnotify_retry_time"; then + ocf_exit_reason "Invalid nfs_smnotify_retry_time [$OCF_RESKEY_nfs_smnotify_retry_time]" + exit $OCF_ERR_CONFIGURED + fi + fi + + case ${OCF_RESKEY_nfs_notify_cmd##*/} in + sm-notify|rpc.statd) ;; + *) + ocf_exit_reason "Invalid nfs_notify_cmd [$OCF_RESKEY_nfs_notify_cmd]" + exit $OCF_ERR_CONFIGURED + ;; + esac + + return $OCF_SUCCESS +} + +nfsserver_validate +systemd_is_running && systemd_is_running=true || systemd_is_running=false + +case $__OCF_ACTION in + start) nfsserver_start + ;; + stop) nfsserver_stop + ;; + monitor) nfsserver_monitor + ;; + validate-all) exit $OCF_SUCCESS + ;; + *) nfsserver_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + diff --git a/heartbeat/nfsserver-redhat.sh b/heartbeat/nfsserver-redhat.sh new file mode 100644 index 0000000..aec0966 --- /dev/null +++ b/heartbeat/nfsserver-redhat.sh @@ -0,0 +1,177 @@ +NFS_SYSCONFIG="/etc/sysconfig/nfs" +NFS_SYSCONFIG_LOCAL_BACKUP="/etc/sysconfig/nfs.ha.bu" +NFS_SYSCONFIG_AUTOGEN_TAG="AUTOGENERATED by $0 high availability resource-agent" +NFSCONVERT="$HA_BIN/nfsconvert" + +nfsserver_redhat_meta_data() { +cat<<EOF +<parameter name="nfsd_args" unique="0" required="0"> +<longdesc lang="en"> +Specifies what arguments to pass to the nfs daemon on startup. 
View the rpc.nfsd man page for information on what arguments are available. +Note that setting this value will override all settings placed in the local /etc/sysconfig/nfs file. +</longdesc> +<shortdesc lang="en"> +rpc.nfsd options +</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="lockd_udp_port" unique="0" required="0"> +<longdesc lang="en"> +The udp port lockd should listen on. +Note that setting this value will override all settings placed in the local /etc/sysconfig/nfs file. +</longdesc> +<shortdesc lang="en"> +lockd udp port +</shortdesc> +<content type="integer" /> +</parameter> + +<parameter name="lockd_tcp_port" unique="0" required="0"> +<longdesc lang="en"> +The tcp port lockd should listen on. +Note that setting this value will override all settings placed in the local /etc/sysconfig/nfs file. +</longdesc> +<shortdesc lang="en"> +lockd tcp port +</shortdesc> +<content type="integer" /> +</parameter> + +<parameter name="statd_outgoing_port" unique="0" required="0"> +<longdesc lang="en"> +The source port number sm-notify uses when sending reboot notifications. +Note that setting this value will override all settings placed in the local /etc/sysconfig/nfs file. +</longdesc> +<shortdesc lang="en"> +sm-notify source port +</shortdesc> +<content type="integer" /> +</parameter> + +<parameter name="statd_port" unique="0" required="0"> +<longdesc lang="en"> +The port number used for RPC listener sockets. +Note that setting this value will override all settings placed in the local /etc/sysconfig/nfs file. +</longdesc> +<shortdesc lang="en"> +rpc.statd listener port +</shortdesc> +<content type="integer" /> +</parameter> + +<parameter name="mountd_port" unique="0" required="0"> +<longdesc lang="en"> +The port number used for rpc.mountd listener sockets. +Note that setting this value will override all settings placed in the local /etc/sysconfig/nfs file. 
+</longdesc> +<shortdesc lang="en"> +rpc.mountd listener port +</shortdesc> +<content type="integer" /> +</parameter> + +<parameter name="rquotad_port" unique="0" required="0"> +<longdesc lang="en"> +The port number used for rpc.rquotad. +Note that setting this value will override all settings placed in the local /etc/sysconfig/nfs file. +</longdesc> +<shortdesc lang="en"> +rpc.rquotad port +</shortdesc> +<content type="integer" /> +</parameter> +EOF +} + +set_arg() +{ + local key="$1" + local value="$2" + local file="$3" + local requires_sysconfig="$4" + + if [ -z "$value" ]; then + return + fi + + # only write to the tmp /etc/sysconfig/nfs if sysconfig exists. + # otherwise this distro does not support setting these options. + if [ -d "/etc/sysconfig" ]; then + # replace if the value exists, append otherwise + if grep "^\s*${key}=" $file ; then + sed -i "s/\s*${key}=.*$/${key}=\"${value}\"/" $file + else + echo "${key}=\"${value}\"" >> $file + fi + elif [ "$requires_sysconfig" = "true" ]; then + ocf_log warn "/etc/sysconfig/nfs not found, unable to set port and nfsd args." + fi + + export ${key}="${value}" +} + +set_env_args() +{ + local tmpconfig=$(mktemp ${HA_RSCTMP}/nfsserver-tmp-XXXXX) + local statd_args + + if [ -f "$NFS_SYSCONFIG" ]; then + ## Take the $NFS_SYSCONFIG file as our skeleton + cp $NFS_SYSCONFIG $tmpconfig + fi + + # nfsd args + set_arg "RPCNFSDARGS" "$OCF_RESKEY_nfsd_args" "$tmpconfig" "true" + + # mountd args + if [ -n "$OCF_RESKEY_mountd_port" ]; then + set_arg "RPCMOUNTDOPTS" "-p $OCF_RESKEY_mountd_port" "$tmpconfig" "true" + fi + + # statd args. 
we always want to perform the notify using sm-notify after + # both rpc.statd and the nfsd daemons are initialized + statd_args="--no-notify" + if [ -n "$OCF_RESKEY_statd_outgoing_port" ]; then + statd_args="$statd_args -o $OCF_RESKEY_statd_outgoing_port" + fi + if [ -n "$OCF_RESKEY_statd_port" ]; then + statd_args="$statd_args -p $OCF_RESKEY_statd_port" + fi + set_arg "STATDARG" "$statd_args" "$tmpconfig" "false" + + # lockd ports + set_arg "LOCKD_UDPPORT" "$OCF_RESKEY_lockd_udp_port" "$tmpconfig" "true" + set_arg "LOCKD_TCPPORT" "$OCF_RESKEY_lockd_tcp_port" "$tmpconfig" "true" + + # rquotad_port + if [ -n "$OCF_RESKEY_rquotad_port" ]; then + set_arg "RPCRQUOTADOPTS" "-p $OCF_RESKEY_rquotad_port" "$tmpconfig" "true" + fi + + # override local nfs config. preserve previous local config though. + if [ -s $tmpconfig ]; then + if [ -f "$NFS_SYSCONFIG" ]; then + cat $NFS_SYSCONFIG | grep -q -e "$NFS_SYSCONFIG_AUTOGEN_TAG" > /dev/null 2>&1 + if [ $? -ne 0 ]; then + # backup local nfs config if it doesn't have our HA autogen tag in it. + mv -f $NFS_SYSCONFIG $NFS_SYSCONFIG_LOCAL_BACKUP + fi + fi + + cat $tmpconfig | grep -q -e "$NFS_SYSCONFIG_AUTOGEN_TAG" > /dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "# $NFS_SYSCONFIG_AUTOGEN_TAG" > $NFS_SYSCONFIG + echo "# local config backup stored here, '$NFS_SYSCONFIG_LOCAL_BACKUP'" >> $NFS_SYSCONFIG + cat $tmpconfig >> $NFS_SYSCONFIG + else + cat $tmpconfig > $NFS_SYSCONFIG + fi + fi + rm -f $tmpconfig + + if [ -e "$NFSCONVERT" ]; then + ocf_log debug "Running $NFSCONVERT" + $NFSCONVERT + fi +} diff --git a/heartbeat/nginx b/heartbeat/nginx new file mode 100755 index 0000000..cb1c6ec --- /dev/null +++ b/heartbeat/nginx @@ -0,0 +1,956 @@ +#!/bin/sh +# +# High-Availability nginx OCF resource agent +# +# nginx +# +# Description: starts/stops nginx servers. 
#
# Author:	Alan Robertson
#		Dejan Muhamedagic
#		This code is based significantly on the apache resource agent
#
# Support:	users@clusterlabs.org
#
# License:	GNU General Public License (GPL)
#
# Copyright:	(C) 2002-2010 International Business Machines
#
#
# Our parsing of the nginx config files is very rudimentary.
# It'll work with lots of different configurations - but not every
# possible configuration.
#
# Patches are being accepted ;-)
#
# OCF parameters:
#  OCF_RESKEY_configfile
#  OCF_RESKEY_httpd
#  OCF_RESKEY_port
#  OCF_RESKEY_options
#  OCF_RESKEY_status10regex
#  OCF_RESKEY_status10url
#  OCF_RESKEY_client (also accepted as OCF_RESKEY_testclient)
#  OCF_RESKEY_test20url
#  OCF_RESKEY_test20regex
#  OCF_RESKEY_test20conffile
#  OCF_RESKEY_test20name
#  OCF_RESKEY_external_monitor30_cmd
#
#
# TO DO:
#	More extensive tests of extended monitor actions
#	Look at the --with-http_stub_status_module for validating
#		the configuration?  (or is that automatically done?)
#		Checking could certainly result in better error
#		messages.
#	Allow for the fact that the config file and so on might all be
#		on shared disks - this affects the validate-all option.


: ${OCF_FUNCTIONS_DIR=$OCF_ROOT/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_httpd_default="/usr/sbin/httpd"
OCF_RESKEY_status10regex_default="Reading: [0-9]+ Writing: [0-9]+ Waiting: [0-9]+"

: ${OCF_RESKEY_httpd=${OCF_RESKEY_httpd_default}}
: ${OCF_RESKEY_status10regex=${OCF_RESKEY_status10regex_default}}

HA_VARRUNDIR=${HA_VARRUN}

#######################################################################
#
#	Configuration options - usually you don't need to change these
#
#######################################################################
#
NGINXDLIST="/usr/sbin/nginx /usr/local/sbin/nginx"

# default options for http clients
# NB: We _always_ test a local resource, so it should be
# safe to connect from the local interface.
WGETOPTS="-O- -q -L --no-proxy --bind-address=127.0.0.1"
CURLOPTS="-o - -Ss -L --interface lo"

LOCALHOST="http://localhost"
NGINXDOPTS=""
#
#
#	End of Configuration options
#######################################################################

CMD=`basename $0`

#	The config-file-pathname is the pathname to the configuration
#	file for this web server.  Various appropriate defaults are
#	assumed if no config file is specified.
#
# usage EXITCODE - print the action summary and exit with EXITCODE.
usage() {
	cat <<-EOF
	usage: $0 action

	action:
	        start   start nginx

	        stop    stop nginx

	        reload  reload the nginx configuration

	        status  return the status of web server, running or stopped

	        monitor  return TRUE if the web server appears to be working.
	                For this to be supported you must configure mod_status
	                and give it a server-status URL - or configure what URL
	                you wish to be monitored.  You have to have installed
	                either curl or wget for this to work.

	        meta-data       show meta data message

	        validate-all    validate the instance parameters
	EOF
	exit $1
}

#
# run the http client
#
# curl_func URL - fetch URL with curl; credentials, when a test user is set,
# are passed on stdin via -K so they do not show up in the process list.
curl_func() {
	cl_opts="$CURLOPTS $test_httpclient_opts"
	if
		[ x != "x$test_user" ]
	then
		echo "-u $test_user:$test_password" |
		curl -K - $cl_opts "$1"
	else
		curl $cl_opts "$1"
	fi
}
# wget_func URL - fetch URL with wget, adding HTTP auth when a test user is set.
wget_func() {
	auth=""
	cl_opts="$WGETOPTS $test_httpclient_opts"
	[ x != "x$test_user" ] &&
		auth="--http-user=$test_user --http-passwd=$test_password"
	wget $auth $cl_opts "$1"
}
#
# rely on whatever the user provided
userdefined() {
	$test_httpclient $test_httpclient_opts "$1"
}

#
# find a good http client
#
# Prints the client name; returns 1 when neither $CLIENT is set nor
# curl/wget can be found.
findhttpclient() {
	# prefer curl if present...
	if
		[ "x$CLIENT" != x ]
	then
		echo "$CLIENT"
	elif
		which curl >/dev/null 2>&1
	then
		echo "curl"
	elif
		which wget >/dev/null 2>&1
	then
		echo "wget"
	else
		return 1
	fi
}
# Print the name of the function that runs the selected http client.
gethttpclient() {
	[ -z "$test_httpclient" ] &&
		test_httpclient=$ourhttpclient
	case "$test_httpclient" in
		curl|wget) echo ${test_httpclient}_func;;  #these are supported
		*) echo userdefined;;
	esac
}

# test configuration good?
is_testconf_sane() {
	if
		[ "x$test_regex" = x -o "x$test_url" = x ]
	then
		ocf_exit_reason "test regular expression or test url empty"
		return 1
	fi
	if
		[ "x$test_user$test_password" != x -a \( "x$test_user" = x -o "x$test_password" = x \) ]
	then
		ocf_exit_reason "bad user authentication for extended test"
		return 1
	fi
	return 0
}
#
# read the test definition from the config
#
# readtestconf [NAME] < conffile
# Sets test_url, test_regex, test_user, test_password, test_httpclient and
# test_httpclient_opts from the "test NAME" section read on stdin.
readtestconf() {
	test_name="$1" # we look for this one or the first one if empty
	lcnt=0
	readdef=""
	test_url="" test_regex=""
	test_user="" test_password=""
	test_httpclient="" test_httpclient_opts=""

	while
		read key value
	do
		lcnt=$((lcnt+1))
		if
			[ "$readdef" ]
		then
			case "$key" in
				"url") test_url="$value" ;;
				"user") test_user="$value" ;;
				"password") test_password="$value" ;;
				"client") test_httpclient="$value" ;;
				"client_opts") test_httpclient_opts="$value" ;;
				"match") test_regex="$value" ;;
				"end") break ;;
				"#"*|"") ;;
				*) ocf_exit_reason "$lcnt: $key: unknown keyword"; return 1 ;;
			esac
		else
			[ "$key" = "test" ] &&
			[ -z "$test_name" -o "$test_name" = "$value" ] &&
			readdef=1
		fi
	done
}

# nginxcat CONFIGFILE
# Print CONFIGFILE with include directives expanded in place (directories
# are read recursively, other specs are listed with ls), then strip
# comments, surrounding blanks and empty lines.
nginxcat() {
	awk '
	function procline() {
		split($0,a);
		if( a[1]~/^[Ii]nclude$/ ) {
			procinclude(a[2]);
		} else {
			if( a[1]=="root" ) {
				rootdir=a[2];
				gsub("\"","",rootdir);
			}
			print;
		}
	}
	function printfile(infile, a) {
		while( (getline<infile) > 0 ) {
			procline();
		}
		close(infile);
	}
	function allfiles(dir, cmd,f) {
		cmd="find -L "dir" -type f";
		while( ( cmd | getline f ) > 0 ) {
			printfile(f);
		}
		close(cmd);
	}
	function listfiles(pattern, cmd,f) {
		cmd="ls "pattern" 2>/dev/null";
		while( ( cmd | getline f ) > 0 ) {
			printfile(f);
		}
		close(cmd);
	}
	function procinclude(spec) {
		if( rootdir!="" && spec!~/^\// ) {
			spec=rootdir"/"spec;
		}
		if( isdir(spec) ) {
			allfiles(spec); # read all files in a directory (and subdirs)
		} else {
			listfiles(spec); # there could be jokers
		}
	}
	function isdir(s) {
		return !system("test -d \""s"\"");
	}
	{ procline(); }
	' $1 |
	sed 's/#.*//;s/[[:blank:]]*$//;s/^[[:blank:]]*//' |
	grep -v '^$'
}

#
# set parameters (as shell vars) from our nginx config file
#
# get_nginx_params CONFIGFILE VAR...
# For each VAR, the first directive of that name (case-insensitive) is
# turned into "VAR=<second field>" and eval'ed.
# NOTE(review): the eval runs text taken from the nginx configuration; the
# configuration file must be trusted.
get_nginx_params() {
	configfile=$1
	shift 1
	vars=`echo "$@" | sed 's/ /,/g'`

	eval `
	nginxcat $configfile | awk -v vars="$vars" '
	BEGIN{
		split(vars,v,",");
		for( i in v )
			vl[i]=tolower(v[i]);
	}
	{
		for( i in v )
			if( tolower($1)==vl[i] ) {
				print v[i]"="$2
				delete vl[i]
				break
			}
	}
	'`
}

#
#	Return the location(s) that are handled by the given handler
#
# FindLocationForHandler CONFIGFILE [HANDLER]
# Prints the most recently seen "location" for every stub_status directive.
# Both "stub_status on;" and the argument-less "stub_status;" are matched.
FindLocationForHandler() {
	PerlScript='while (<>) {
		/^\s*location\s+([^ \s{]+)\s*{/i && ($loc=$1);
		/^\s*stub_status(?:\s+on)?\s*;/i && print "$loc\n";
	}'
	nginxcat $1 | perl -e "$PerlScript"
}

#
#	Check if the port is valid
#
# CheckPort PORT|ADDR:PORT - succeeds when the port part is a decimal
# number in the range 1..65535.
CheckPort() {
	lclport="$1"
	case "$lclport" in
		*:[0-9]*)	lclport=`echo "$lclport" | sed 's%^[^:][^:]*:%%'`
	esac
	ocf_is_decimal "$lclport" && [ $lclport -gt 0 -a $lclport -le 65535 ]
}

# Print the base URL of the local server: http://$listen when a listen
# address is known, otherwise $LOCALHOST:$PORT.
buildlocalurl() {
	[ "x$listen" != "x" ] &&
		echo "http://${listen}" ||
		echo "${LOCALHOST}:${PORT}"
}
#
#	Get all the parameters we need from the Nginx config file
#
# Sets PidFile, PORT, listen and (when not configured) STATUSURL.
# Returns non-zero when the config file is missing or no pid file
# location could be determined.
GetParams() {
	ConfigFile=$1
	DEFAULT_PID=`echo "$NGINX_CONFIGURATION" | sed -e 's%.*--pid-path=%%' -e 's% *--.*%%'`
	if
		[ ! -f $ConfigFile ]
	then
		return 1
	fi

	get_nginx_params $ConfigFile root pid listen
	PidFile="$pid"
	case $PidFile in
		"")	PidFile=$DEFAULT_PID	;;
		*)				;;
	esac

	for p in "$PORT" "$listen" 80
	do
		if
			CheckPort "$p"
		then
			PORT="$p"
			break
		fi
	done

	# Listen could be just port spec. When the configuration has no listen
	# directive at all, leave it empty so that buildlocalurl falls back to
	# $LOCALHOST:$PORT instead of producing "localhost:" without a port.
	case "$listen" in
		""|*:*)	;;
		*)	listen="localhost:$listen" ;;
	esac

	#
	# It's difficult to figure out whether the server supports
	# the status operation.
	# (we start our server with -DSTATUS - just in case :-))
	#
	# Typically (but not necessarily) the status URL is /nginx_status
	#
	# For us to think status will work, we have to have the following things:
	#
	# - The server-status handler has to be mapped to some URL somewhere
	#
	# We assume that:
	#
	# - the "main" web server at $PORT will also support it if we can find it
	#	somewhere in the file
	# - it will be supported at the same URL as the one we find in the file
	#
	# If this doesn't work for you, then set the status10url attribute.
	#
	if
		[ "X$STATUSURL" = "X" ]
	then
		StatusURL=`FindLocationForHandler $1 nginx_status | tail -1`
		STATUSURL="`buildlocalurl`$StatusURL"
	fi
	test ! -z "$PidFile"
}

#
#	return TRUE if a process with given PID is running
#
# Also records the PID in NginxPID for the callers' log messages.
ProcessRunning() {
	NginxPID=$1
	# Use /proc if it looks like it's here...
	if
		[ -d /proc -a -d /proc/1 ]
	then
		[ -d /proc/$NginxPID ]
	else
		#  This assumes we're running as root...
		kill -0 "$NginxPID" >/dev/null 2>&1
	fi
}


# Succeeds when the pid file holds a decimal PID of a running process.
silent_status() {
	if
		[ -f $PidFile -a -s $PidFile ] && ocf_is_decimal "`cat $PidFile`"
	then
		ProcessRunning `cat $PidFile`
	else
		: No pid file
		false
	fi
}

# Start nginx after a configuration test, then poll monitor_nginx until it
# reports something other than "not running".
start_nginx() {
	if
		silent_status
	then
		ocf_log info "$CMD already running (pid $NginxPID)"
		return $OCF_SUCCESS
	fi
	if
		ocf_run $NGINXD $OPTIONS -t -c $CONFIGFILE
	then
		: Configuration file $CONFIGFILE looks OK
	else
		return $OCF_ERR_INSTALLED
	fi
	NGINX_VERSION=`$NGINXD -v 2>&1`
	ocf_log info "Starting $NGINXD - $NGINX_VERSION"
	ocf_log info "$NGINXD build configuration: $NGINX_CONFIGURATION"
	if
		ocf_run $NGINXD $NGINXDOPTS $OPTIONS -c $CONFIGFILE
	then
		: $NGINXD started without errors!
	else
		return $OCF_ERR_GENERIC
	fi
	tries=0
	# This looks like a potential infinite loop - but it's not in practice
	# The LRM will time us out and kill us if nginx never starts working.
	while
		monitor_nginx
		ec=$?
		if
			[ $ec -eq $OCF_NOT_RUNNING ]
		then
			tries=`expr $tries + 1`
			ocf_log info "Waiting for $NGINXD $OPTIONS -c $CONFIGFILE to come up (try $tries)"
			true
		else
			false
		fi
	do
		sleep 1
	done
	return $ec
}

# Stop nginx: signal the PID from the pid file (retrying for up to ten
# seconds), then signal any remaining processes matching binary + config.
stop_nginx() {
	if
		silent_status
	then
		if
			kill $NginxPID
		then
			tries=0
			while
				ProcessRunning $NginxPID && [ $tries -lt 10 ]
			do
				sleep 1
				kill $NginxPID >/dev/null
				ocf_log info "Killing nginx PID $NginxPID"
				tries=`expr $tries + 1`
			done
		else
			ocf_log warn "Killing nginx PID $NginxPID FAILED."
		fi
		if
			ProcessRunning $NginxPID
		then
			ocf_log info "$CMD still running ($NginxPID)."
			false
		else
			ocf_log info "$CMD stopped."
		fi
	else
		ocf_log info "$CMD is not running."
	fi

	#
	#	I'm not convinced this is a wonderful idea (AlanR)
	#
	for sig in SIGTERM SIGHUP SIGKILL
	do
		if
			pgrep -f "$NGINXD.*$CONFIGFILE" >/dev/null
		then
			# Quoted so the shell cannot glob-expand the pattern.
			pkill -$sig -f "$NGINXD.*$CONFIGFILE" >/dev/null
			ocf_log info "nginxd children were signalled ($sig)"
			sleep 1
		else
			break
		fi
	done
}

# Reload the configuration with SIGHUP, or start nginx when it is not running.
reload_nginx() {
	if
		silent_status
	then
		if
			kill -1 $NginxPID
		then
			: $NGINX reload signal to $NginxPID succeeded
			return $OCF_SUCCESS
		fi
		return $OCF_ERR_GENERIC
	fi
	start_nginx
}

# Log and return whether nginx is running, based on the pid file.
status_nginx() {
	silent_status
	rc=$?
	if
		[ $rc -eq 0 ]
	then
		ocf_log info "$CMD is running (pid $NginxPID)."
		return $OCF_SUCCESS
	else
		ocf_log info "$CMD is stopped."
		return $OCF_NOT_RUNNING
	fi
}

# Make test_url absolute when it does not start with "http".
fixtesturl() {
	echo $test_url | grep -qs "^http" && return
	test_url="`buildlocalurl`$test_url"
}

# Level 30 monitor: run the command string configured in
# external_monitor30_cmd, including its arguments.
monitor_nginx_external() {
	if
		[ -z "$EXTMONITOR" ]
	then
		ocf_exit_reason "External level 30 Monitor Command not configured."
		return $OCF_ERR_CONFIGURED
	fi
	# First word of the command string: the program that must exist.
	extbase=`echo $EXTMONITOR | sed 's% .*%%'`
	if
		case "$extbase" in
			/*)	test -f "$extbase" -a -x "$extbase";;
			*)	which "$extbase" >/dev/null 2>&1
		esac
	then
		: OK - $extbase seems to be there...
	else
		ocf_exit_reason "External monitor command [$extbase] is not installed."
		return $OCF_ERR_CONFIGURED
	fi
	if
		# Run the whole configured command string, not only its first word.
		$EXTMONITOR
	then
		: OK - $extbase succeeded
	else
		ocf_exit_reason "$extbase reported failure [rc=$?]"
		return $OCF_NOT_RUNNING
	fi
	return $OCF_SUCCESS
}


# Level 20 monitor: fetch the test URL and match it against the test regex.
# The test comes from the test20conffile section named by test20name (the
# first section when no name is given), or from test20url/test20regex.
monitor_nginx_extended() {
	if
		[ -f "$TESTCONFFILE" -a -r "$TESTCONFFILE" ]
	then
		readtestconf "$TESTNAME" < "$TESTCONFFILE"
	else
		test_url="$TESTURL"
		test_regex="$TESTREGEX20"
	fi
	whattorun=`gethttpclient`
	fixtesturl
	is_testconf_sane || return $OCF_ERR_CONFIGURED
	$whattorun "$test_url" | grep -Ei "$test_regex" > /dev/null
}

# Level 10 monitor: fetch $STATUSURL and match it against $TESTREGEX.
monitor_nginx_basic() {
	if
		[ -z "$STATUSURL" ]
	then
		ocf_exit_reason "status10url parameter empty"
		return $OCF_ERR_CONFIGURED
	elif
		[ -z "$ourhttpclient" ]
	then
		ocf_exit_reason "could not find a http client; make sure that either wget or curl is available"
		return $OCF_ERR_CONFIGURED
	fi
	${ourhttpclient}_func "$STATUSURL" | grep -Ei "$TESTREGEX" > /dev/null
}

# Monitor entry point: process check first, then the check selected by
# OCF_CHECK_LEVEL (10: status page, 20: extended test, 30: external command).
monitor_nginx() {
	silent_status
	if
		[ $? -ne 0 ]
	then
		ocf_log info "$CMD not running"
		return $OCF_NOT_RUNNING
	fi
	if
		[ -z "$OCF_CHECK_LEVEL" ] || [ "$OCF_CHECK_LEVEL" -lt 10 ]
	then
		return 0
	fi
	ourhttpclient=`findhttpclient`  # we'll need one
	if
		[ "$OCF_CHECK_LEVEL" -lt 20 ]
	then
		monitor_nginx_basic
	elif
		[ "$OCF_CHECK_LEVEL" -lt 30 ]
	then
		monitor_nginx_extended
	else
		monitor_nginx_external
	fi
}

# Print the OCF meta-data XML and exit.
metadata_nginx(){
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="nginx" version="1.0">
<version>1.0</version>

<longdesc lang="en">
This is the resource agent for the Nginx web/proxy server.
This resource agent does not monitor POP or IMAP servers, as
we don't know how to determine meaningful status for them.

The start operation ends with a loop in which monitor is
repeatedly called to make sure that the server started and that
it is operational. Hence, if the monitor operation does not
succeed within the start operation timeout, the nginx resource
will end with an error status.

The default monitor operation will verify that nginx is running.

The level 10 monitor operation by default will try and fetch the /nginx_status
page - which is commented out in sample nginx configurations.
Make sure that the /nginx_status page works and that the access
is restricted to localhost (address 127.0.0.1) plus whatever
places _outside the cluster_ you want to monitor the server from.
See the status10url and status10regex attributes for more details.

The level 20 monitor operation will perform a more complex set of tests
from a configuration file.

The level 30 monitor operation will run an external command to perform
an arbitrary monitoring operation.

</longdesc>
<shortdesc lang="en">Manages an Nginx web/proxy server instance</shortdesc>

<parameters>

<parameter name="configfile" required="0" unique="1">
<longdesc lang="en">
The full pathname of the Nginx configuration file.
This file is parsed to provide defaults for various other
resource agent parameters.
</longdesc>
<shortdesc lang="en">configuration file path</shortdesc>
<content type="string"/>
</parameter>

<parameter name="httpd">
<longdesc lang="en">
The full pathname of the httpd binary (optional).
</longdesc>
<shortdesc lang="en">httpd binary path</shortdesc>
<content type="string" default="${OCF_RESKEY_httpd_default}" />
</parameter>

<parameter name="port" >
<longdesc lang="en">
A port number that we can probe for status information
using the statusurl.
This will default to the port number found in the
configuration file, or 80, if none can be found
in the configuration file.

</longdesc>
<shortdesc lang="en">httpd port</shortdesc>
<content type="integer" />
</parameter>

<parameter name="status10url">
<longdesc lang="en">
The URL to monitor (the nginx server status page by default) when given a level 10 monitor operation.
If left unspecified, it will be inferred from
the nginx configuration file, or defaulted to /nginx_status.

If you set this, make sure that it succeeds *only* from the
localhost (127.0.0.1) and no other cluster nodes.
Otherwise, the cluster software may complain
about it being active on multiple nodes.
</longdesc>
<shortdesc lang="en">url name</shortdesc>
<content type="string" />
</parameter>

<parameter name="status10regex">
<longdesc lang="en">
Regular expression to match in the output of status10url.
Case insensitive.
</longdesc>
<shortdesc lang="en">monitor regular expression</shortdesc>
<content type="string" default="${OCF_RESKEY_status10regex_default}"/>
</parameter>

<parameter name="testclient">
<longdesc lang="en">
Client to use to query to Nginx for level 10 and level 20 tests.
If not specified, the RA will try to find one on the system.
Currently, wget and curl are supported, with curl being preferred.
For example, you can set this parameter to "wget" if you prefer that to curl.
</longdesc>
<shortdesc lang="en">http client</shortdesc>
<content type="string" />
</parameter>

<parameter name="test20url">
<longdesc lang="en">
URL to test. If it does not start with "http", then it's
considered to be relative to the document root address.
</longdesc>
<shortdesc lang="en">Level 20 monitor url</shortdesc>
<content type="string" />
</parameter>

<parameter name="test20regex">
<longdesc lang="en">
Regular expression to match in the output of test20url.
Case insensitive.
</longdesc>
<shortdesc lang="en">Level 20 monitor regular expression</shortdesc>
<content type="string" />
</parameter>

<parameter name="test20conffile">
<longdesc lang="en">
A file which contains a more complex test configuration. Could be useful if
you have to check more than one web application or in case sensitive
info should be passed as arguments (passwords). Furthermore,
using a config file is the only way to specify certain parameters.

Please see README.webapps for examples and file description.
</longdesc>
<shortdesc lang="en">Level 20 test configuration file</shortdesc>
<content type="string" />
</parameter>

<parameter name="test20name">
<longdesc lang="en">
Name of the test within the test configuration file.
</longdesc>
<shortdesc lang="en">Level 20 test name</shortdesc>
<content type="string" />
</parameter>

<parameter name="external_monitor30_cmd">
<longdesc lang="en">
Command string to run which implements level 30 monitoring.
</longdesc>
<shortdesc lang="en">Level 30 test string</shortdesc>
<content type="string" />
</parameter>

<parameter name="options">
<longdesc lang="en">
Extra options to apply when starting nginx.
</longdesc>
<shortdesc lang="en">nginx start options</shortdesc>
<content type="string" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="40s" />
<action name="stop" timeout="60s" />
<action name="reload" timeout="40s" />
<action name="status" timeout="30s" />
<action name="monitor" timeout="30s" depth="0" interval="10s" />
<action name="monitor" timeout="30s" depth="10" interval="30s" />
<action name="monitor" timeout="45s" depth="20" />
<action name="monitor" timeout="60s" depth="30" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="5s" />
</actions>
</resource-agent>
END

	exit $OCF_SUCCESS
}

# Check port, status URL, binary and configuration file, then let nginx
# test the configuration itself.
validate_all_nginx() {
	if
		CheckPort $PORT
		# We are sure to succeed here, since we forced $PORT to be valid in GetParams()
	then
		: OK
	else
		ocf_exit_reason "Port number $PORT is invalid!"
		exit $OCF_ERR_ARGS
	fi

	if
		[ -z "$STATUSURL" ]
	then
		: OK to be empty
	else
		case $STATUSURL in
			http://*/*) ;;
			*)	ocf_exit_reason "Invalid STATUSURL $STATUSURL"
				exit $OCF_ERR_ARGS ;;
		esac
	fi
	if
		[ ! -x $NGINXD ]
	then
		ocf_exit_reason "NGINXD $NGINXD not found or is not an executable!"
		exit $OCF_ERR_ARGS
	fi
	if
		[ ! -f $CONFIGFILE ]
	then
		# We are sure to succeed here, since we have parsed $CONFIGFILE before getting here
		ocf_exit_reason "Configuration file $CONFIGFILE not found!"
		exit $OCF_ERR_CONFIGURED
	fi
	if
		ocf_run $NGINXD $OPTIONS -t -c $CONFIGFILE
	then
		: Cool $NGINXD likes $CONFIGFILE
	else
		ocf_exit_reason "$NGINXD $OPTIONS -t -c $CONFIGFILE reported a configuration error."
		return $OCF_ERR_CONFIGURED
	fi
	return $OCF_SUCCESS
}

if
	[ $# -eq 1 ]
then
	COMMAND=$1
	NGINXD="$OCF_RESKEY_httpd"
	PORT="$OCF_RESKEY_port"
	STATUSURL="$OCF_RESKEY_status10url"
	CONFIGFILE="$OCF_RESKEY_configfile"
	OPTIONS="$OCF_RESKEY_options"
	# The meta-data advertises this parameter as "testclient"; "client" is
	# the name this script has always read. Accept either.
	CLIENT="${OCF_RESKEY_client:-$OCF_RESKEY_testclient}"
	TESTREGEX="${OCF_RESKEY_status10regex}"
	TESTURL="$OCF_RESKEY_test20url"
	TESTREGEX20=${OCF_RESKEY_test20regex}
	TESTCONFFILE="$OCF_RESKEY_test20conffile"
	TESTNAME="$OCF_RESKEY_test20name"
	EXTMONITOR="$OCF_RESKEY_external_monitor30_cmd"
else
	usage $OCF_ERR_ARGS
fi

LSB_STATUS_STOPPED=3
if
	[ "X$NGINXD" = X -o ! -f "$NGINXD" -o ! -x "$NGINXD" ]
then
	NGINXD=
	for h in $NGINXDLIST
	do
		if
			[ -f "$h" -a -x "$h" ]
		then
			NGINXD="$h"
			break
		fi
	done
# It is possible that we still do not have a valid httpd at this stage
	if
		[ -z "$NGINXD" ]
	then
		case $COMMAND in
			stop)		exit $OCF_SUCCESS;;
			monitor)	exit $OCF_NOT_RUNNING;;
			status)		exit $LSB_STATUS_STOPPED;;
			meta-data)	metadata_nginx;;
		esac
		ocf_exit_reason "nginx binary not found! Please verify you've installed it"
		exit $OCF_ERR_INSTALLED
	fi
	# Let the user know that the $NGINXD used is the one (s)he specified via $OCF_RESKEY_httpd
	if
		[ ! -z "$OCF_RESKEY_httpd" ]
	then
		ocf_log info "Using $NGINXD as nginx"
	fi
fi

httpd_basename=`basename $NGINXD`
case $httpd_basename in
	*-*)	httpd_basename=`echo "$httpd_basename" | sed -e 's%\-.*%%'`;;
esac
NGINX_CONFIGURATION=`$NGINXD -V 2>&1 |grep 'configure arguments:'`
DEFAULT_CONFIG=`echo "$NGINX_CONFIGURATION" | sed -e 's%.*--conf-path=%%' -e 's% *--.*%%'`

case "$CONFIGFILE" in
	"")	CONFIGFILE=$DEFAULT_CONFIG;;
	*)	;;
esac

if
	[ ! -f "$CONFIGFILE" ]
then
	case $COMMAND in
		stop)		ocf_log warn "$CONFIGFILE not found - nginx considered stopped"
				exit $OCF_SUCCESS;;
		monitor)	exit $OCF_NOT_RUNNING;;
		status)		exit $LSB_STATUS_STOPPED;;
	esac
fi

if
	[ "X$COMMAND" = Xmeta-data ] || GetParams $CONFIGFILE
then
	: OK
else
	ocf_exit_reason "Cannot parse config file [$CONFIGFILE]"
	exit $OCF_ERR_CONFIGURED
fi

case $COMMAND in
	start)		start_nginx;;
	stop)		stop_nginx;;
	reload)		reload_nginx;;
	status)		status_nginx;;
	monitor)	monitor_nginx;;
	meta-data)	metadata_nginx;;
	validate-all)	validate_all_nginx;;
	*)		usage $OCF_ERR_UNIMPLEMENTED;;
esac
diff --git a/heartbeat/nvmet-namespace b/heartbeat/nvmet-namespace new file mode 100755 index 0000000..fa9a9f8 --- /dev/null +++ b/heartbeat/nvmet-namespace @@ -0,0 +1,205 @@ +#!/bin/sh +# +# +# NVMe-oF (rdma, tcp, fc) OCF RA. Exports and manages NVMe targets. +# +# (c) 2021 LINBIT HA-Solutions GmbH, written by Philipp Reisner +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+# + +####################################################################### +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +OCF_RESKEY_namespace_id_default="1" +: ${OCF_RESKEY_namespace_id=${OCF_RESKEY_namespace_id_default}} + +OCF_RESKEY_nguid_default="" +: ${OCF_RESKEY_nguid=${OCF_RESKEY_nguid_default}} +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="nvmet-namespace" version="0.9"> +<version>1.0</version> + +<longdesc lang="en"> +Manages NVMe-oF namespaces. An NVMe-oF namespace is part of an NVMe-oF target. +A namespace must be created after the subsystem. It is similar in concept to +the LUN of an iSCSI target. +</longdesc> +<shortdesc lang="en">NVMe-oF target export agent</shortdesc> + +<parameters> +<parameter name="nqn" required="1" unique="1"> +<longdesc lang="en"> +The NVMe Qualified Name (NQN) is used to identify the remote NVMe +storage target. It is similar to an iSCSI Qualified Name +(IQN). While it is a free-form string, you should follow the convention: +nqn.2014-08.com.vendor:nvme:nvm-subsystem-sn-12345 +You need to refer here to the NQN of an NVMe Subsystem your created +with the NVMeSubsystem resource agent. +</longdesc> +<shortdesc lang="en">NVMe Qualified Name</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="namespace_id" required="0" unique="1"> +<longdesc lang="en"> +The NVMe namespace id (number). +</longdesc> +<shortdesc lang="en">namespace id</shortdesc> +<content type="integer" default="${OCF_RESKEY_namespace_id_default}"/> +</parameter> + +<parameter name="backing_path" required="1" unique="0"> +<longdesc lang="en"> +The full path to the block device or file that should be exposed through this +namespace. 
+</longdesc> +<shortdesc lang="en">block device full path</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="uuid" required="1" unique="1"> +<longdesc lang="en"> +The UUID that should be exposed by NVMe. Create it using uuidgen. This is +necessary for the initiators to accept the namespace from the new server +after failover. +</longdesc> +<shortdesc lang="en">UUID</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="nguid" required="0" unique="1"> +<longdesc lang="en"> +NGUID stands for Namespace Globally Unique Identifier. Seems to be optional, +looks like e.g. VMWare ESXi7 uses it to identify namespaces. +Use 'uuidgen' to create it. +</longdesc> +<shortdesc lang="en">NGUID</shortdesc> +<content type="string" default="${OCF_RESKEY_nguid_default}"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="10s" /> +<action name="stop" timeout="10s" /> +<action name="status" timeout="10s" interval="10s" depth="0" /> +<action name="monitor" timeout="10s" interval="10s" depth="0" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="10s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +nvmet_namespace_usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +nvmet_namespace_start() { + nvmet_namespace_monitor + if [ $? = $OCF_SUCCESS ]; then + return $OCF_SUCCESS + fi + + if [ ! -d ${subsys} ]; then + ocf_log err "$subsys does not exist -- Create it with NVMeSubsystem." 
+ exit $OCF_ERR_GENERIC + fi + + mkdir ${namespace} + echo "${OCF_RESKEY_backing_path}" > ${namespace}/device_path + echo "${OCF_RESKEY_uuid}" > ${namespace}/device_uuid + [ "${OCF_RESKEY_nguid}" ] && echo "${OCF_RESKEY_nguid}" > ${namespace}/device_nguid + echo 1 > ${namespace}/enable + + nvmet_namespace_monitor +} + +nvmet_namespace_stop() { + nvmet_namespace_monitor + if [ $? -eq $OCF_NOT_RUNNING ]; then + return $OCF_SUCCESS + fi + + rmdir ${namespace} + + return $OCF_SUCCESS +} + +nvmet_namespace_monitor() { + [ -d ${namespace} ] || return $OCF_NOT_RUNNING + return $OCF_SUCCESS +} + +nvmet_namespace_validate() { + if [ ! -d /sys/kernel/config/nvmet ]; then + ocf_log err "/sys/kernel/config/nvmet does not exist -- Load the nvmet.ko linux kernel module." + exit $OCF_ERR_INSTALLED + fi + subsys=/sys/kernel/config/nvmet/subsystems/${OCF_RESKEY_nqn} + namespace=${subsys}/namespaces/${OCF_RESKEY_namespace_id} + + return $OCF_SUCCESS +} + + +case $1 in + meta-data) + meta_data + exit $OCF_SUCCESS + ;; + usage|help) + nvmet_namespace_usage + exit $OCF_SUCCESS + ;; +esac + +# Everything except usage and meta-data must pass the validate test +nvmet_namespace_validate + +case $__OCF_ACTION in +start) nvmet_namespace_start;; +stop) nvmet_namespace_stop;; +monitor|status) nvmet_namespace_monitor;; +reload) ocf_log info "Reloading..." + nvmet_namespace_start + ;; +validate-all) ;; +*) nvmet_namespace_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/nvmet-port b/heartbeat/nvmet-port new file mode 100755 index 0000000..cbf1109 --- /dev/null +++ b/heartbeat/nvmet-port @@ -0,0 +1,238 @@ +#!/bin/sh +# +# +# NVMe-oF (rdma, tcp, fc) OCF RA. Exports and manages NVMe targets. 
#
# (c) 2021 LINBIT HA-Solutions GmbH, written by Philipp Reisner
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.  Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#

#######################################################################
# Initialization:
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

OCF_RESKEY_port_id_default="0"
: ${OCF_RESKEY_port_id=${OCF_RESKEY_port_id_default}}

OCF_RESKEY_type_default="tcp"
: ${OCF_RESKEY_type=${OCF_RESKEY_type_default}}

OCF_RESKEY_addr_fam_default="ipv4"
: ${OCF_RESKEY_addr_fam=${OCF_RESKEY_addr_fam_default}}

OCF_RESKEY_svcid_default="4420"
: ${OCF_RESKEY_svcid=${OCF_RESKEY_svcid_default}}
#######################################################################

# Print the OCF meta-data XML.
meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="nvmet-port" version="0.9">
<version>1.0</version>

<longdesc lang="en">
Manages NVMe-oF ports. An NVMe-oF port is part of an NVMe-oF target.
A port must be created after the subsystem. It exposes one or multiple
subsystem(s) including the enclosed namespace(s) to the network.
</longdesc>
<shortdesc lang="en">NVMe-oF target export agent</shortdesc>

<parameters>

<parameter name="port_id" required="0" unique="1">
<longdesc lang="en">
The NVMe port number is a 16 bit number.
</longdesc>
<shortdesc lang="en">NVMe port</shortdesc>
<content type="integer" default="${OCF_RESKEY_port_id_default}"/>
</parameter>

<parameter name="type" required="0" unique="0">
<longdesc lang="en">
The NVMe transport type. Should be one of 'tcp', 'rdma', 'fc' or 'loop'.
</longdesc>
<shortdesc lang="en">tcp or rdma</shortdesc>
<content type="string" default="${OCF_RESKEY_type_default}"/>
</parameter>

<parameter name="addr_fam" required="0" unique="0">
<longdesc lang="en">
The address family of the address. Should be 'ipv4', 'ipv6' or 'fc'.
</longdesc>
<shortdesc lang="en">ipv4 or ipv6</shortdesc>
<content type="string" default="${OCF_RESKEY_addr_fam_default}"/>
</parameter>

<parameter name="svcid" required="0" unique="0">
<longdesc lang="en">
The transport service identifier. The TCP/IP port number this target binds to,
or its RDMA protocol equivalent.
</longdesc>
<shortdesc lang="en">IP/RDMA port number</shortdesc>
<content type="integer" default="${OCF_RESKEY_svcid_default}"/>
</parameter>

<parameter name="addr" required="1" unique="0">
<longdesc lang="en">
The transport address. The TCP/IP address this targets binds to,
or its RDMA protocol equivalent.
</longdesc>
<shortdesc lang="en">IP/RDMA/FC address</shortdesc>
<content type="string"/>
</parameter>

<parameter name="nqns" required="1" unique="0">
<longdesc lang="en">
A space-separated list of NQNs that should be exported through
this NVMe-oF-Target port. This list needs to have at least one entry.
</longdesc>
<shortdesc lang="en">list of NQNs</shortdesc>
<content type="string"/>
</parameter>

</parameters>

<actions>
<action name="start"		timeout="10s" />
<action name="stop"		timeout="10s" />
<action name="status"		timeout="10s" interval="10s" depth="0" />
<action name="monitor"		timeout="10s" interval="10s" depth="0" />
<action name="meta-data"	timeout="5s" />
<action name="validate-all"	timeout="10s" />
</actions>
</resource-agent>
END
}

#######################################################################

# Print the command line synopsis.
nvmet_port_usage() {
	cat <<END
usage: $0 {start|stop|status|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Create the port directory, write its address attributes and link every
# NQN from OCF_RESKEY_nqns into it.
# Returns OCF_SUCCESS when the port already exists or was fully set up,
# OCF_ERR_GENERIC when the port or one of its subsystem links could not be
# created (the partially created port is torn down again).
nvmet_port_start() {
	nvmet_port_monitor
	if [ $? = $OCF_SUCCESS ]; then
		return $OCF_SUCCESS
	fi

	if ! mkdir "${portdir}"; then
		ocf_log err "Could not create ${portdir}."
		return $OCF_ERR_GENERIC
	fi
	echo ${OCF_RESKEY_addr} > "${portdir}/addr_traddr"
	echo ${OCF_RESKEY_type} > "${portdir}/addr_trtype"
	echo ${OCF_RESKEY_svcid} > "${portdir}/addr_trsvcid"
	echo ${OCF_RESKEY_addr_fam} > "${portdir}/addr_adrfam"

	for subsystem in ${OCF_RESKEY_nqns}; do
		# The monitor only looks for the port directory, so a link that
		# cannot be created has to fail the start explicitly.
		if ! ln -s "/sys/kernel/config/nvmet/subsystems/${subsystem}" \
		   "${portdir}/subsystems/${subsystem}"; then
			ocf_log err "Could not export ${subsystem} through ${portdir}."
			nvmet_port_stop
			return $OCF_ERR_GENERIC
		fi
	done

	nvmet_port_monitor
}

# Unlink all subsystems from the port and remove the port directory.
# Returns OCF_SUCCESS when the port is gone, OCF_ERR_GENERIC when the
# directory could not be removed.
nvmet_port_stop() {
	nvmet_port_monitor
	if [ $? -eq $OCF_NOT_RUNNING ]; then
		return $OCF_SUCCESS
	fi

	for subsystem in ${OCF_RESKEY_nqns}; do
		# -f: a link that was never created is not an error here.
		rm -f "${portdir}/subsystems/${subsystem}"
	done

	# A stop must not claim success while the port still exists.
	if ! rmdir "${portdir}"; then
		ocf_log err "Could not remove ${portdir}."
		return $OCF_ERR_GENERIC
	fi

	return $OCF_SUCCESS
}

# The port counts as running when its directory exists.
nvmet_port_monitor() {
	[ -d "${portdir}" ] || return $OCF_NOT_RUNNING
	return $OCF_SUCCESS
}

# Check the type, addr_fam and nqns parameters and the presence of the
# nvmet configfs tree, then derive the portdir path used by all actions.
# NOTE(review): the meta-data text also mentions 'loop' as a transport
# type, but it is rejected here - confirm which of the two is intended.
nvmet_port_validate() {
	case "${OCF_RESKEY_type}" in
		tcp|rdma|fc)
			;;
		*)
			ocf_log err "type must be tcp, rdma, or fc. OCF_RESKEY_type was set to $OCF_RESKEY_type"
			exit $OCF_ERR_ARGS
			;;
	esac

	case "${OCF_RESKEY_addr_fam}" in
		ipv4|ipv6|fc)
			;;
		*)
			ocf_log err "addr_fam must be ipv4, ipv6, or fc. OCF_RESKEY_addr_fam was set to $OCF_RESKEY_addr_fam"
			exit $OCF_ERR_ARGS
			;;
	esac

	if [ -z "${OCF_RESKEY_nqns}" ]; then
		ocf_log err "subsystems may not be empty. OCF_RESKEY_nqns was set to $OCF_RESKEY_nqns"
		exit $OCF_ERR_ARGS
	fi

	if [ ! -d /sys/kernel/config/nvmet ]; then
		ocf_log err "/sys/kernel/config/nvmet does not exist -- Load the nvmet.ko linux kernel module."
		exit $OCF_ERR_INSTALLED
	fi
	portdir=/sys/kernel/config/nvmet/ports/${OCF_RESKEY_port_id}

	return $OCF_SUCCESS
}


case $1 in
  meta-data)
	meta_data
	exit $OCF_SUCCESS
	;;
  usage|help)
	nvmet_port_usage
	exit $OCF_SUCCESS
	;;
esac

# Everything except usage and meta-data must pass the validate test
nvmet_port_validate

case $__OCF_ACTION in
start)		nvmet_port_start;;
stop)		nvmet_port_stop;;
monitor|status)	nvmet_port_monitor;;
reload)		ocf_log info "Reloading..."
		nvmet_port_start
		;;
validate-all)	;;
*)		nvmet_port_usage
		exit $OCF_ERR_UNIMPLEMENTED
		;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
diff --git a/heartbeat/nvmet-subsystem b/heartbeat/nvmet-subsystem new file mode 100755 index 0000000..a1d0cd2 --- /dev/null +++ b/heartbeat/nvmet-subsystem @@ -0,0 +1,188 @@ +#!/bin/sh +# +# +# NVMe-oF (rdma, tcp, fc) OCF RA. Exports and manages NVMe targets. +# +# (c) 2021 LINBIT HA-Solutions GmbH, written by Philipp Reisner +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like.
# Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#

#######################################################################
# Initialization:
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

OCF_RESKEY_allowed_initiators_default=""
: ${OCF_RESKEY_allowed_initiators=${OCF_RESKEY_allowed_initiators_default}}

OCF_RESKEY_serial_default=""
: ${OCF_RESKEY_serial=${OCF_RESKEY_serial_default}}
#######################################################################

# Print the OCF meta-data XML describing this agent and its parameters.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="nvmet-subsystem" version="0.9">
<version>1.0</version>

<longdesc lang="en">
Manages NVMe-oF subsystems. An NVMe-oF subsystem is part of an NVMe-oF target.
A subsystem must be created first, before namespace(s) and port(s).
</longdesc>
<shortdesc lang="en">NVMe-oF target export agent</shortdesc>

<parameters>

<parameter name="nqn" required="1" unique="1">
<longdesc lang="en">
The NVMe Qualified Name (NQN) is used to identify the remote NVMe
storage target. It is similar to an iSCSI Qualified Name
(IQN). While it is a free-form string, you should follow the convention:
nqn.2014-08.com.vendor:nvme:nvm-subsystem-sn-12345
</longdesc>
<shortdesc lang="en">NVMe Qualified Name</shortdesc>
<content type="string" />
</parameter>

<parameter name="allowed_initiators" required="0" unique="0">
<longdesc lang="en">
Allowed initiators. A space-separated list of initiators allowed to
connect to this target. Initiators are identified by their NQN.
If the list is empty, any initiator will be allowed to connect.
</longdesc>
<shortdesc lang="en">List of NVMe initiators allowed to connect
to this target</shortdesc>
<content type="string" default="${OCF_RESKEY_allowed_initiators_default}"/>
</parameter>

<parameter name="serial" required="0" unique="0">
<longdesc lang="en">
The serial of the subsystem. Set it to a random 16 character hex
value. Use hexdump -n 8 -e '4/4 "%08x" 1 "\n"' /dev/urandom
</longdesc>
<shortdesc lang="en">Serial number of the subsystem</shortdesc>
<content type="string" default="${OCF_RESKEY_serial_default}"/>
</parameter>

</parameters>

<actions>
<action name="start" timeout="10s" />
<action name="stop" timeout="10s" />
<action name="status" timeout="10s" interval="10s" depth="0" />
<action name="monitor" timeout="10s" interval="10s" depth="0" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="10s" />
</actions>
</resource-agent>
END
}

#######################################################################

# Print a one-line usage summary.
nvmet_subsystem_usage() {
    cat <<END
usage: $0 {start|stop|status|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Create the subsystem directory in configfs and apply the serial and
# host access settings.  Returns OCF_SUCCESS if the subsystem already
# exists, OCF_ERR_GENERIC if its directory cannot be created, otherwise
# the result of a final monitor call.
nvmet_subsystem_start() {
    nvmet_subsystem_monitor
    if [ $? = $OCF_SUCCESS ]; then
        return $OCF_SUCCESS
    fi

    local subsys="/sys/kernel/config/nvmet/subsystems/${OCF_RESKEY_nqn}"
    if ! mkdir "${subsys}"; then
        ocf_exit_reason "Could not create subsystem directory ${subsys}"
        return $OCF_ERR_GENERIC
    fi
    [ "${OCF_RESKEY_serial}" ] && echo "${OCF_RESKEY_serial}" > "${subsys}/attr_serial"

    if [ -z "${OCF_RESKEY_allowed_initiators}" ]; then
        # No explicit list: every initiator may connect.
        echo 1 > "${subsys}/attr_allow_any_host"
    else
        local hosts_dir=/sys/kernel/config/nvmet/hosts
        echo 0 > "${subsys}/attr_allow_any_host"
        # Intentionally unquoted: the parameter is a space-separated list.
        for hostnqn in ${OCF_RESKEY_allowed_initiators}; do
            mkdir -p "${hosts_dir}/${hostnqn}"
            ln -s "${hosts_dir}/${hostnqn}" "${subsys}/allowed_hosts/${hostnqn}"
        done
    fi

    nvmet_subsystem_monitor
}

# Remove the subsystem from configfs.  Returns OCF_SUCCESS when the
# subsystem is gone (or was never there) and OCF_ERR_GENERIC when the
# directory could not be removed, so that a failed stop is not reported
# as a success to the cluster manager.
nvmet_subsystem_stop() {
    nvmet_subsystem_monitor
    if [ $? -eq $OCF_NOT_RUNNING ]; then
        return $OCF_SUCCESS
    fi

    local subsys="/sys/kernel/config/nvmet/subsystems/${OCF_RESKEY_nqn}"
    local link

    # configfs refuses to rmdir an item while symlinks still exist below
    # it, so drop the allowed_hosts links that start created.  Walking the
    # directory (rather than the parameter) also covers links left behind
    # by an earlier, different allowed_initiators value.
    for link in "${subsys}"/allowed_hosts/*; do
        [ -L "${link}" ] && rm -f "${link}"
    done

    if ! rmdir "${subsys}"; then
        ocf_exit_reason "Could not remove subsystem directory ${subsys}"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# The subsystem counts as running when its configfs directory exists.
nvmet_subsystem_monitor() {
    [ -d "/sys/kernel/config/nvmet/subsystems/${OCF_RESKEY_nqn}" ] || return $OCF_NOT_RUNNING
    return $OCF_SUCCESS
}

# Check the configuration and environment; exits on failure.
nvmet_subsystem_validate() {
    if [ -z "${OCF_RESKEY_nqn}" ]; then
        # Without a name every path would resolve to the (always present)
        # parent "subsystems" directory and monitor would report success.
        if [ "$__OCF_ACTION" = "stop" ]; then
            # Nothing can have been started without a name; do not turn a
            # configuration mistake into a stop failure.
            ocf_log info "nqn is not set, resource considered stopped"
            exit $OCF_SUCCESS
        fi
        ocf_exit_reason "nqn: required parameter not set"
        exit $OCF_ERR_CONFIGURED
    fi

    if [ ! -d /sys/kernel/config/nvmet ]; then
        ocf_log err "/sys/kernel/config/nvmet does not exist -- Load the nvmet.ko linux kernel module."
        exit $OCF_ERR_INSTALLED
    fi
    return $OCF_SUCCESS
}


case $1 in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    usage|help)
        nvmet_subsystem_usage
        exit $OCF_SUCCESS
        ;;
esac

# Everything except usage and meta-data must pass the validate test
nvmet_subsystem_validate

case $__OCF_ACTION in
start)          nvmet_subsystem_start;;
stop)           nvmet_subsystem_stop;;
monitor|status) nvmet_subsystem_monitor;;
reload)         ocf_log info "Reloading..."
                nvmet_subsystem_start
                ;;
validate-all)   ;;
*)              nvmet_subsystem_usage
                exit $OCF_ERR_UNIMPLEMENTED
                ;;
esac
rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/ocf-binaries.in b/heartbeat/ocf-binaries.in new file mode 100644 index 0000000..e9bf95f --- /dev/null +++ b/heartbeat/ocf-binaries.in @@ -0,0 +1,75 @@ +# Make sure PATH contains all the usual suspects +PATH="$PATH:/sbin:/bin:/usr/sbin:/usr/bin" + +# Include /usr/ucb for finding whoami on Solaris +PATH="$PATH:/usr/ucb" + +export PATH + +# Binaries and binary options for use in Resource Agents +: ${AWK:=@AWK@} +: ${EGREP:="@EGREP@"} +: ${IFCONFIG_A_OPT:="@IFCONFIG_A_OPT@"} +: ${MAILCMD:=@MAILCMD@} +: ${PING:=@PING@} +: ${SH:=@SHELL@} +: ${TEST:=@TEST@} +: ${TESTPROG:=@TEST@} + +# Entries that should probably be removed +: ${BASENAME:=basename} +: ${BLOCKDEV:=blockdev} +: ${CAT:=cat} +: ${FSCK:=fsck} +: ${FUSER:=fuser} +: ${GETENT:=getent} +: ${GREP:=grep} +: ${IFCONFIG:=ifconfig} +: ${IPTABLES:=iptables} +## for cases that are known not to be serviceable with iptables-nft impl. +: ${IPTABLES_LEGACY:=iptables-legacy} +: ${IP2UTIL:=ip} +: ${MDADM:=mdadm} +: ${MODPROBE:=modprobe} +: ${MOUNT:=mount} +: ${MSGFMT:=msgfmt} +: ${NETSTAT:=netstat} +: ${PERL:=perl} +: ${RAIDSTART:=raidstart} +: ${RAIDSTOP:=raidstop} +: ${ROUTE:=route} +: ${UMOUNT:=umount} +: ${REBOOT:=reboot} +: ${POWEROFF_CMD:=poweroff} +: ${WGET:=wget} +: ${WHOAMI:=whoami} +: ${STRINGSCMD:=strings} +: ${SCP:=scp} +: ${SSH:=@SSH@} +: ${SWIG:=swig} +: ${GZIP_PROG:=gzip} +: ${TAR:=tar} +: ${MD5:=md5} +: ${DRBDADM:=drbdadm} +: ${DRBDSETUP:=drbdsetup} + +check_binary () { + if ! 
have_binary "$1"; then + if [ "$OCF_NOT_RUNNING" = 7 ]; then + # Chances are we have a fully setup OCF environment + ocf_exit_reason "Setup problem: couldn't find command: $1" + else + echo "Setup problem: couldn't find command: $1" + fi + exit $OCF_ERR_INSTALLED + fi +} + +have_binary () { + if [ "$OCF_TESTER_FAIL_HAVE_BINARY" = "1" ]; then + false + else + local bin=`echo $1 | sed -e 's/ -.*//'` + test -x "`which $bin 2>/dev/null`" + fi +} diff --git a/heartbeat/ocf-directories.in b/heartbeat/ocf-directories.in new file mode 100644 index 0000000..d8df035 --- /dev/null +++ b/heartbeat/ocf-directories.in @@ -0,0 +1,22 @@ +# Binaries and binary options for use in Resource Agents + +prefix=@prefix@ +exec_prefix=@exec_prefix@ +: ${INITDIR:=@INITDIR@} +: ${HA_DIR:=@sysconfdir@/ha.d} +: ${HA_RCDIR:=$HA_DIR/rc.d} +: ${HA_CONFDIR=$HA_DIR/conf} +: ${HA_CF:=$HA_DIR/ha.cf} +: ${HA_VARLIB:=@localstatedir@/lib/heartbeat} +: ${HA_RSCTMP:=@HA_RSCTMPDIR@} +: ${HA_RSCTMP_OLD:=@HA_VARRUNDIR@/heartbeat/rsctmp} +: ${HA_FIFO:=@localstatedir@/lib/heartbeat/fifo} +: ${HA_BIN:=@libexecdir@/heartbeat} +: ${HA_SBIN_DIR:=@sbindir@} +: ${HA_DATEFMT:="%b %d %T "} +: ${HA_DEBUGLOG:=/dev/null} +: ${HA_RESOURCEDIR:=$HA_DIR/resource.d} +: ${HA_DOCDIR:=@datadir@/doc/heartbeat} +: ${__SCRIPT_NAME:=`basename $0`} +: ${HA_VARRUN:=@localstatedir@/run} +: ${HA_VARLOCK:=@localstatedir@/lock/subsys} diff --git a/heartbeat/ocf-distro b/heartbeat/ocf-distro new file mode 100644 index 0000000..590c74e --- /dev/null +++ b/heartbeat/ocf-distro @@ -0,0 +1,209 @@ +# +# This is OCF Linux distribution query support +# +# Currently needed for the nfsserver RA which has some already +# released RH specific stuff (/etc/sysconfig/nfs editing) +# +# These functions are intended to be POSIX-compliant for portability. +# + +# systemd-based systems should all have an os-release file. 
_ETC_OS_RELEASE_FILE="/etc/os-release"
_USR_OS_RELEASE_FILE="/usr/lib/os-release"

# Legacy distro-specific files
_DEBIAN_VERSION_FILE="/etc/debian_version"
_REDHAT_RELEASE_FILE="/etc/redhat-release"
_SUSE_RELEASE_FILE="/etc/SuSE-release"


# Converts OS release ID to a standard form regardless of source.
#
# $1: raw distro string (an os-release ID, lsb_release output, or the
#     first line of a legacy *-release file).
# Prints the normalized ID; unknown distros are printed in their cleaned
# (lower-case, whitespace-free) form.
_process_os_release_id() {
    _os="$1"

    # Convert to lowercase, isolate distro name, remove whitespace.
    # The last expression needs the "g" flag: multi-word names such as
    # "SUSE Linux Enterprise Server" must collapse to a single word
    # ("suseenterprise") for the patterns below to match.
    _os=$(echo "$_os" \
        | tr "[:upper:]" "[:lower:]" \
        | sed -e "s|\(gnu/\)*linux||" -e "s/server//" \
              -e "s/release.*//" -e "s/[[:digit:]].*//" \
              -e "s/[[:blank:]]//g")

    # Normalize known distros to os-release names
    case "$_os" in
        *alma*)
            _os="almalinux"
            ;;
        *centos*)
            _os="centos"
            ;;
        *debian*)
            _os="debian"
            ;;
        *fedora*)
            _os="fedora"
            ;;
        ol|*oracle*)
            # Match the exact os-release ID or the spelled-out name; a bare
            # "*ol*" substring would also swallow IDs such as "solus".
            _os="ol"
            ;;
        *redhat*|*rhel*|*scientific*)
            _os="rhel"
            ;;
        *rocky*)
            _os="rocky"
            ;;
        *opensuse*)
            _os="opensuse"
            ;;
        *suseenterprise*)
            _os="sles"
            ;;
        *ubuntu*)
            _os="ubuntu"
            ;;
    esac

    echo "$_os"
}

# Converts OS version ID to a form that ocf_version_cmp() can handle.
# Strips any garbage.
#
# $1: raw version string.  Everything before the first digit and
# everything after the first run of version characters is dropped.
_process_os_version_id() {
    _ver="$1"
    _fmt="[[:digit:]][[:digit:].-]*[[:alnum:].\+-]*"

    echo "$_ver" | sed -e "s/[^[:digit:]]*\(${_fmt}\).*/\1/"
}

# Gets OS release ID (i.e., distro) or version ID from os-release file.
# $_ETC_OS_RELEASE_FILE takes precedence over $_USR_OS_RELEASE_FILE.
+_get_val_from_os_release_file() { + _key="" + _value="" + _func="" + + case "$1" in + id) + _key="ID" + _func="_process_os_release_id" + ;; + version_id) + _key="VERSION_ID" + _func="_process_os_version_id" + ;; + esac + + if [ -n "$_key" ]; then + if [ -f "$_ETC_OS_RELEASE_FILE" ]; then + _value=$(awk -F "=" -v k="$_key" '$1 == k {print $2}' \ + "$_ETC_OS_RELEASE_FILE" | tr -d \") + fi + + if [ -z "$_value" ] && [ -f "$_USR_OS_RELEASE_FILE" ]; then + _value=$(awk -F "=" -v k="$_key" '$1 == k {print $2}' \ + "$_USR_OS_RELEASE_FILE" | tr -d \") + fi + fi + + # Ensure the value is in the correct format + [ -n "$_func" ] && _value=$("$_func" "$_value") + + echo "$_value" +} + +# Gets OS release ID from lsb_release command or legacy *-release files +_get_os_from_legacy_source() { + _os="" + + if which lsb_release >/dev/null 2>&1; then + _os=$(lsb_release -si) + + elif [ -f "$_DEBIAN_VERSION_FILE" ]; then + _os="debian" + + elif [ -f "$_REDHAT_RELEASE_FILE" ]; then + _os=$(head -n 1 "$_REDHAT_RELEASE_FILE") + + elif [ -f "$_SUSE_RELEASE_FILE" ]; then + _os=$(head -n 1 "$_SUSE_RELEASE_FILE") + + else + _os=$(uname -s) + fi + + _process_os_release_id "$_os" +} + +# Gets OS version from lsb_release command or legacy *-release files +_get_version_from_legacy_source() { + _ver="" + + if which lsb_release >/dev/null 2>&1; then + _ver=$(lsb_release -sr) + + elif [ -f "$_DEBIAN_VERSION_FILE" ]; then + _ver=$(cat "$_DEBIAN_VERSION_FILE") + + elif [ -f "$_REDHAT_RELEASE_FILE" ]; then + _ver=$(head -1 "$_REDHAT_RELEASE_FILE") + + elif [ -f "$_SUSE_RELEASE_FILE" ]; then + _ver=$(awk '$1 == "VERSION" {print $3}' "$_SUSE_RELEASE_FILE") + _patchlevel=$(awk '$1 == "PATCHLEVEL" {print $3}' \ + "$_SUSE_RELEASE_FILE") + + [ -n "$_patchlevel" ] && _ver="${_ver}.${_patchlevel}" + + else + _ver=$(uname -r) + fi + + _process_os_version_id "$_ver" +} + +# Prints OS release ID (i.e., distro name) +get_release_id() { + _os=$(_get_val_from_os_release_file id) + + if [ -z "$_os" ]; then + 
_os=$(_get_os_from_legacy_source) + fi + + echo "$_os" +} + +# Prints OS version ID +get_os_version_id() { + _ver=$(_get_val_from_os_release_file version_id) + + if [ -z "$_ver" ] || [ "$(get_release_id)" = "debian" ]; then + # Debian only includes major release in os-release. + # $_DEBIAN_VERSION_FILE has ${major}.${minor}. + _ver=$(_get_version_from_legacy_source) + fi + + echo "$_ver" +} + +# Returns true if the OS is Debian-based, otherwise false +is_debian_based() { + get_release_id | grep -i -e "debian" -e "ubuntu" >/dev/null 2>&1 +} + +# Returns true if the OS is Red Hat-based, otherwise false +is_redhat_based() { + get_release_id | grep -i -e "almalinux" -e "centos" -e "fedora" -e "ol" \ + -e "redhat" -e "rhel" -e "rocky" -e "scientific" >/dev/null 2>&1 +} + +# Returns true if the OS is SUSE-based, otherwise false +is_suse_based() { + get_release_id | grep -i -e "sles" -e "suse" >/dev/null 2>&1 +} + +# Sets global variables OS and VER. +# get_os_ver() is currently unused upstream; maintained for backwards +# compatibility. +get_os_ver() { + OS=$(get_release_id) + VER=$(get_os_version_id) +} diff --git a/heartbeat/ocf-rarun b/heartbeat/ocf-rarun new file mode 100644 index 0000000..32bbab2 --- /dev/null +++ b/heartbeat/ocf-rarun @@ -0,0 +1,146 @@ +# +# This is the OCF RA driver. It should take care of all the +# boring details and leave only the parts which are really about +# the actual resource to the resource agent. +# +# The interface +# +# The RA needs to define functions for all supported actions and +# name them <RA>_<action>. For instance, apache_start or +# apache_meta_data. +# +# The following functions are required: +# +# - <RA>_methods +# - <RA>_usage +# - <RA>_meta_data +# - <RA>_start +# - <RA>_stop +# - <RA>_monitor +# +# The required parameters should all be listed in the +# OCF_REQUIRED_PARAMS variable. For example, "config user group". 
+# +# The OCF_REQUIRED_BINARIES variable should contain a list of all +# programs which are needed for the correct operation of the +# resource agent. +# +# <RA>_getconfig and <RA>_validate_all are optional. getconfig is +# where RA can read more configuration from the file system or do +# some other configuration processing. +# validate_all checks if the environment is OK. +# +# If it exists, the <RA>_probe function is invoked on probes +# (monitor with interval 0) instead of <RA>_monitor. +# +# NB: If the RA name contains a '-', it is going to be converted +# to '_' when generating function names. For syslog-ng for +# instance, the monitor function name would be syslog_ng_monitor + +is_function() { + test z"`command -v $1`" = z"$1" +} +run_function() { + is_function $1 && $1 +} +is_var_defined() { + test z != "z$(eval echo $`echo $1`)" +} +mk_action_func() { + ACTION_FUNC=`echo ${OCF_RESOURCE_TYPE}_$__OCF_ACTION | tr '-' '_'` +} +validate_args() { + is_function $ACTION_FUNC || { + ocf_exit_reason "$__OCF_ACTION: action not supported" + run_function ${OCF_RESOURCE_TYPE}_methods + exit $OCF_ERR_UNIMPLEMENTED + } +} +simple_actions() { + case $__OCF_ACTION in + meta-data|usage|methods) + $ACTION_FUNC + exit $OCF_SUCCESS + ;; + esac +} +run_probe() { + if is_function ${OCF_RESOURCE_TYPE}_probe; then + ${OCF_RESOURCE_TYPE}_probe + exit + fi +} +check_required_params() { + local v + for v in $OCF_REQUIRED_PARAMS; do + is_var_defined OCF_RESKEY_$v || { + ocf_exit_reason "$v: required parameter not set" + exit $OCF_ERR_CONFIGURED + } + done +} +# this function does an exit (end of the road) +handle_invalid_env() { + local rc msg + rc=$1 + msg=${2:-"environment is invalid, resource considered stopped"} + case "$__OCF_ACTION" in + stop) + ocf_log info $msg + exit $OCF_SUCCESS + ;; + monitor) + if ocf_is_probe; then + ocf_log info $msg + exit $OCF_NOT_RUNNING + else + # in recurring monitor, this amounts to error + ocf_exit_reason "$msg" + exit $OCF_ERR_GENERIC + fi + ;; + 
status) + ocf_log info $msg + exit $LSB_STATUS_STOPPED + ;; + *) + ocf_exit_reason "$msg" + exit $rc + ;; + esac +} +check_required_binaries() { + local v + for v in $OCF_REQUIRED_BINARIES; do + have_binary $v || { + handle_invalid_env $OCF_ERR_INSTALLED "$v: required binary not installed" + # unreachable + } + done +} +validate_env() { + check_required_binaries # all binaries present? + is_function ${OCF_RESOURCE_TYPE}_validate_all || + return + local rc + LSB_STATUS_STOPPED=3 + ${OCF_RESOURCE_TYPE}_validate_all # is environment ok? + rc=$? + if [ $rc -ne 0 ]; then + handle_invalid_env $rc + # unreachable + fi +} + +# ocf_rarun: the main function +ocf_rarun() { + mk_action_func # create action function name + validate_args # validate command line arguments + simple_actions # run meta-data (or similar) + check_required_params # all required parameters defined? + run_function ${OCF_RESOURCE_TYPE}_getconfig # get extra configuration + validate_env # is environment ok? + ocf_is_probe && run_probe # do probe + shift 1 # skip action + $ACTION_FUNC $* # run action +} diff --git a/heartbeat/ocf-returncodes b/heartbeat/ocf-returncodes new file mode 100644 index 0000000..dd5f017 --- /dev/null +++ b/heartbeat/ocf-returncodes @@ -0,0 +1,55 @@ +# +# Common varibales for the OCF Resource Agents supplied by +# heartbeat. +# +# Copyright (c) 2004 SUSE LINUX AG, Andrew Beekhof +# All Rights Reserved. +# +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +OCF_SUCCESS=0 +OCF_ERR_GENERIC=1 +OCF_ERR_ARGS=2 +OCF_ERR_UNIMPLEMENTED=3 +OCF_ERR_PERM=4 +OCF_ERR_INSTALLED=5 +OCF_ERR_CONFIGURED=6 +OCF_NOT_RUNNING=7 + +# Non-standard values. +# +# OCF does not include the concept of master/slave resources so we +# need to extend it so we can discover a resource's complete state. +# +# OCF_RUNNING_MASTER: +# The resource is in "master" mode and fully operational +# OCF_FAILED_MASTER: +# The resource is in "master" mode but in a failed state +# +# The extra two values should only be used during a probe. +# +# Probes are used to discover resources that were started outside of +# the CRM and/or left behind if the LRM fails. +# +# They can be identified in RA scripts by checking for: +# [ "${__OCF_ACTION}" = "monitor" -a "${OCF_RESKEY_CRM_meta_interval}" = "0" ] +# +# Failed "slaves" should continue to use: OCF_ERR_GENERIC +# Fully operational "slaves" should continue to use: OCF_SUCCESS +# +OCF_RUNNING_MASTER=8 +OCF_FAILED_MASTER=9 diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in new file mode 100644 index 0000000..6852163 --- /dev/null +++ b/heartbeat/ocf-shellfuncs.in @@ -0,0 +1,1070 @@ +# +# +# Common helper functions for the OCF Resource Agents supplied by +# heartbeat. +# +# Copyright (c) 2004 SUSE LINUX AG, Lars Marowsky-Brée +# All Rights Reserved. +# +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#

# Build version: 204f146196774ded1581aed1b6acd0d562b4f958

# TODO: Some of this should probably split out into a generic OCF
# library for shell scripts, but for the time being, we'll just use it
# ourselves...
#

# TODO wish-list:
# - Generic function for evaluating version numbers
# - Generic function(s) to extract stuff from our own meta-data
# - Logging function which automatically adds resource identifier etc
#   prefixes
# TODO: Move more common functionality for OCF RAs here.
#

# This was common throughout all legacy Heartbeat agents
unset LC_ALL; export LC_ALL
unset LANGUAGE; export LANGUAGE

: ${HA_SBIN_DIR:=@sbindir@}

__SCRIPT_NAME=`basename $0`

if [ -z "$OCF_ROOT" ]; then
    : ${OCF_ROOT=@OCF_ROOT_DIR@}
fi

if [ "$OCF_FUNCTIONS_DIR" = ${OCF_ROOT}/resource.d/heartbeat ]; then  # old
    unset OCF_FUNCTIONS_DIR
fi

: ${OCF_FUNCTIONS_DIR:=${OCF_ROOT}/lib/heartbeat}

. ${OCF_FUNCTIONS_DIR}/ocf-binaries
. ${OCF_FUNCTIONS_DIR}/ocf-returncodes
. ${OCF_FUNCTIONS_DIR}/ocf-directories
. ${OCF_FUNCTIONS_DIR}/ocf-rarun
. ${OCF_FUNCTIONS_DIR}/ocf-distro

# Define OCF_RESKEY_CRM_meta_interval in case it isn't already set,
# to make sure that ocf_is_probe() always works
: ${OCF_RESKEY_CRM_meta_interval=0}

# True when the effective user id is 0.
ocf_is_root() {
    if [ X`id -u` = X0 ]; then
        true
    else
        false
    fi
}

# Print a number for use as a random delay: four bytes from /dev/urandom
# as an unsigned decimal, or the fractional digits of awk's rand() seeded
# with the shell pid when /dev/urandom is not a character device.
ocf_maybe_random() {
    if test -c /dev/urandom; then
        od -An -N4 -tu4 /dev/urandom | tr -d '[:space:]'
    else
        awk -v pid=$$ 'BEGIN{srand(pid); print rand()}' | sed 's/^.*[.]//'
    fi
}

# Portability comments:
# o The following rely on Bourne "sh" pattern-matching, which is usually
#   that for filename generation (note: not regexp).
# o The "*) true ;;" clause is probably unnecessary, but is included
#   here for completeness.
# o The negation in the pattern uses "!".  This seems to be common
#   across many OSes (whereas the alternative "^" fails on some).
# o If an OS is encountered where this negation fails, then a possible
#   alternative would be to replace the function contents by (e.g.):
#	[ -z "`echo $1 | tr -d '[0-9]'`" ]
#
ocf_is_decimal() {
    case "$1" in
    ""|*[!0-9]*)	# empty, or at least one non-decimal
        false ;;
    *)
        true ;;
    esac
}

# True for the accepted spellings of a boolean "true" value.
ocf_is_true() {
    case "$1" in
    yes|true|1|YES|TRUE|True|ja|on|ON) true ;;
    *)	false ;;
    esac
}

ocf_is_hex() {
    case "$1" in
    ""|*[!0-9a-fA-F]*)	# empty, or at least one non-hex
        false ;;
    *)
        true ;;
    esac
}

ocf_is_octal() {
    case "$1" in
    ""|*[!0-7]*)	# empty, or at least one non-octal
        false ;;
    *)
        true ;;
    esac
}

# Record the requested action in __OCF_ACTION and fill in defaults for
# OCF_CHECK_LEVEL, OCF_RESOURCE_TYPE and OCF_RESOURCE_INSTANCE.  Exits
# when OCF_ROOT is not a directory or (when OCF_RA_VERSION_MAJOR is set)
# no resource instance name is available.
__ocf_set_defaults() {
    __OCF_ACTION="$1"

    # Return to sanity for the agents...
    unset LANG
    LC_ALL=C
    export LC_ALL

    # TODO: Review whether we really should source this. Or rewrite
    # to match some emerging helper function syntax...? This imports
    # things which no OCF RA should be using...

    # Strip the OCF_RESKEY_ prefix from this particular parameter
    if [ -z "$OCF_RESKEY_OCF_CHECK_LEVEL" ]; then
        : ${OCF_CHECK_LEVEL:=0}
    else
        : ${OCF_CHECK_LEVEL:=$OCF_RESKEY_OCF_CHECK_LEVEL}
    fi

    if [ ! -d "$OCF_ROOT" ]; then
        ha_log "ERROR: OCF_ROOT points to non-directory $OCF_ROOT."
        exit $OCF_ERR_GENERIC
    fi

    if [ -z "$OCF_RESOURCE_TYPE" ]; then
        : ${OCF_RESOURCE_TYPE:=$__SCRIPT_NAME}
    fi

    if [ "x$__OCF_ACTION" = "xmeta-data" ]; then
        : ${OCF_RESOURCE_INSTANCE:="RESOURCE_ID"}
    fi

    if [ -z "$OCF_RA_VERSION_MAJOR" ]; then
        : We are being invoked as an init script.
        : Fill in some things with reasonable values.
        : ${OCF_RESOURCE_INSTANCE:="default"}
        return 0
    fi

    if [ -z "$OCF_RESOURCE_INSTANCE" ]; then
        ha_log "ERROR: Need to tell us our resource instance name."
        exit $OCF_ERR_ARGS
    fi
}

# Print the current time formatted with HA_DATEFMT.
hadate() {
    date "+${HA_DATEFMT}"
}

# Set HA_LOGTAG to "<script>(<instance>)[<pid>]" unless already set.
set_logtag() {
    if [ -z "$HA_LOGTAG" ]; then
        if [ -n "$OCF_RESOURCE_INSTANCE" ]; then
            HA_LOGTAG="$__SCRIPT_NAME($OCF_RESOURCE_INSTANCE)[$$]"
        else
            HA_LOGTAG="$__SCRIPT_NAME[$$]"
        fi
    fi
}

# Core log writer.
# Usage: __ha_log [--ignore-stderr] <message...>
# On a tty the message goes to stderr only.  Otherwise it is handed to
# ha_logger (when HA_LOGD=yes), and failing that to syslog, HA_LOGFILE,
# stderr and HA_DEBUGLOG as configured.  --ignore-stderr suppresses the
# stderr copies for messages the caller has already printed there.
__ha_log() {
    local ignore_stderr=false
    local loglevel

    [ "x$1" = "x--ignore-stderr" ] && ignore_stderr=true && shift

    [ none = "$HA_LOGFACILITY" ] && HA_LOGFACILITY=""
    # if we're connected to a tty, then output to stderr
    if tty >/dev/null; then
        # NOTE(review): loglevel has not been assigned at this point, so
        # the first test below cannot currently succeed -- confirm intent.
        if [ "x$HA_debug" = "x0" -a "x$loglevel" = xdebug ] ; then
            return 0
        elif [ "$ignore_stderr" = "true" ]; then
            # something already printed this error to stderr, so ignore
            return 0
        fi
        if [ "$HA_LOGTAG" ]; then
            echo "$HA_LOGTAG: $*"
        else
            echo "$*"
        fi >&2
        return 0
    fi

    set_logtag

    if [ "x${HA_LOGD}" = "xyes" ] ; then
        ha_logger -t "${HA_LOGTAG}" "$@"
        if [ "$?" -eq "0" ] ; then
            return 0
        fi
    fi

    if [ -n "$HA_LOGFACILITY" ]; then
        : logging through syslog
        # loglevel is unknown, use 'notice' for now
        loglevel=notice
        case "${*}" in
        *ERROR*)        loglevel=err;;
        *WARN*)         loglevel=warning;;
        *INFO*|info)    loglevel=info;;
        esac
        logger -t "$HA_LOGTAG" -p ${HA_LOGFACILITY}.${loglevel} "${*}"
    fi
    if [ -n "$HA_LOGFILE" ]; then
        : appending to $HA_LOGFILE
        echo `hadate`" $HA_LOGTAG: ${*}" >> $HA_LOGFILE
    fi
    if [ -z "$HA_LOGFACILITY" -a -z "$HA_LOGFILE" ] && ! [ "$ignore_stderr" = "true" ]; then
        : appending to stderr
        echo `hadate`"${*}" >&2
    fi
    if [ -n "$HA_DEBUGLOG" ]; then
        : appending to $HA_DEBUGLOG
        if [ "$HA_LOGFILE"x != "$HA_DEBUGLOG"x ]; then
            echo "$HA_LOGTAG: "`hadate`"${*}" >> $HA_DEBUGLOG
        fi
    fi
}

ha_log()
{
    __ha_log "$@"
}

# Write a debug message; a no-op unless HA_debug is set and not "0".
ha_debug() {

    if [ "x${HA_debug}" = "x0" ] || [ -z "${HA_debug}" ] ; then
        return 0
    fi
    if tty >/dev/null; then
        if [ "$HA_LOGTAG" ]; then
            echo "$HA_LOGTAG: $*"
        else
            echo "$*"
        fi >&2
        return 0
    fi

    set_logtag

    if [ "x${HA_LOGD}" = "xyes" ] ; then
        ha_logger -t "${HA_LOGTAG}" -D "ha-debug" "$@"
        if [ "$?" -eq "0" ] ; then
            return 0
        fi
    fi

    [ none = "$HA_LOGFACILITY" ] && HA_LOGFACILITY=""

    if [ -n "$HA_LOGFACILITY" ]; then
        : logging through syslog
        logger -t "$HA_LOGTAG" -p "${HA_LOGFACILITY}.debug" "${*}"
    fi
    if [ -n "$HA_DEBUGLOG" ]; then
        : appending to $HA_DEBUGLOG
        echo "$HA_LOGTAG: "`hadate`"${*}" >> $HA_DEBUGLOG
    fi
    if [ -z "$HA_LOGFACILITY" -a -z "$HA_DEBUGLOG" ]; then
        : appending to stderr
        echo "$HA_LOGTAG: `hadate`${*}: ${HA_LOGFACILITY}" >&2
    fi
}

# Print the value of directive $1 from $HA_CF, with built-in fallbacks
# for "keepalive" (2) and "deadtime" (2 * keepalive + 1).
ha_parameter() {
    local VALUE
    # NOTE(review): the bracket expressions below are reproduced as they
    # appear in this view, where whitespace has been collapsed; the
    # original may have contained a literal tab next to the space --
    # confirm against the pristine file.
    VALUE=`sed -e 's%[ ][ ]*% %' -e 's%^ %%' -e 's%#.*%%' $HA_CF | grep -i "^$1 " | sed 's%[^ ]* %%'`
    if [ "X$VALUE" = X ]; then
        case $1 in
        keepalive)  VALUE=2;;
        deadtime)
            ka=`ha_parameter keepalive`
            VALUE=`expr $ka '*' 2 '+' 1`;;
        esac
    fi
    echo $VALUE
}

# Log a message at the given priority.
# Usage: ocf_log <crit|err|warn|info|debug|...> <message...>
# "debug" messages go through ha_debug, everything else through ha_log.
ocf_log() {
    # TODO: Revisit and implement internally.
    if [ $# -lt 2 ]; then
        ocf_log err "Not enough arguments [$#] to ocf_log."
    fi
    __OCF_PRIO="$1"
    shift
    __OCF_MSG="$*"

    case "${__OCF_PRIO}" in
    crit)   __OCF_PRIO="CRIT";;
    err)    __OCF_PRIO="ERROR";;
    warn)   __OCF_PRIO="WARNING";;
    info)   __OCF_PRIO="INFO";;
    debug)  __OCF_PRIO="DEBUG";;
    *)      __OCF_PRIO=`echo ${__OCF_PRIO}| tr '[a-z]' '[A-Z]'`;;
    esac

    if [ "${__OCF_PRIO}" = "DEBUG" ]; then
        ha_debug "${__OCF_PRIO}: $__OCF_MSG"
    else
        ha_log "${__OCF_PRIO}: $__OCF_MSG"
    fi
}

#
# ocf_exit_reason: print exit error string to stderr
# Usage:      Allows the OCF script to provide a string
#             describing why the exit code was returned.
# Arguments:  reason - required, The string that represents why the error
#                      occurred.
#
ocf_exit_reason()
{
    local cookie="$OCF_EXIT_REASON_PREFIX"
    local fmt
    local msg

    # No argument is likely not intentional.
    # Just one argument implies a printf format string of just "%s".
    # "Least surprise" in case some interpolated string from variable
    # expansion or other contains a percent sign.
    # More than one argument: first argument is going to be the format string.
    case $# in
    0)  ocf_log err "Not enough arguments to ocf_exit_reason." ;;
    1)  fmt="%s" ;;

    *)  fmt=$1
        shift
        case $fmt in
        *%*) : ;; # ok, does look like a format string
        *) ocf_log warn "Does not look like format string: [$fmt]" ;;
        esac ;;
    esac

    if [ -z "$cookie" ]; then
        # use a default prefix
        cookie="ocf-exit-reason:"
    fi

    msg=$(printf "${fmt}" "$@")
    printf >&2 "%s%s\n" "$cookie" "$msg"
    # The reason has just been written to stderr; keep __ha_log from
    # printing it there a second time.
    __ha_log --ignore-stderr "ERROR: $msg"
}

#
# ocf_deprecated: Log a deprecation warning
# Usage:      ocf_deprecated [param-name]
# Arguments:  param-name optional, name of a boolean resource
#                        parameter that can be used to suppress
#                        the warning (default
#                        "ignore_deprecation")
ocf_deprecated() {
    local param
    param=${1:-ignore_deprecation}
    # don't use ${!param} here, it's a bashism
    if ! ocf_is_true $(eval echo \$OCF_RESKEY_$param); then
        ocf_log warn "This resource agent is deprecated" \
            "and may be removed in a future release." \
            "See the man page for details." \
            "To suppress this warning, set the \"${param}\"" \
            "resource parameter to true."
    fi
}

#
# Ocf_run: Run a script, and log its output.
# Usage:   ocf_run [-q] [-info|-warn|-err] <command>
#	-q: don't log the output of the command if it succeeds
#	-info|-warn|-err: log the output of the command at given
#		severity if it fails (defaults to err)
# Returns the exit status of <command>.
#
ocf_run() {
    local rc
    local output
    local verbose=1
    local loglevel=err
    local var

    # At most two leading options are recognised, in either order.
    for var in 1 2
    do
        case "$1" in
        "-q")
            verbose=""
            shift 1;;
        "-info"|"-warn"|"-err")
            loglevel=`echo $1 | sed -e s/-//g`
            shift 1;;
        *)
            ;;
        esac
    done

    output=`"$@" 2>&1`
    rc=$?
    # Squeeze the captured output onto a single line for logging.
    [ -n "$output" ] && output="$(echo "$output" | tr -s ' \t\r\n' ' ')"
    if [ $rc -eq 0 ]; then
        if [ "$verbose" -a ! -z "$output" ]; then
            ocf_log info "$output"
        fi
    else
        if [ ! -z "$output" ]; then
            ocf_log $loglevel "$output"
        else
            ocf_log $loglevel "command failed: $*"
        fi
    fi

    return $rc
}

# Check the process named in a pid file.
# Returns 0 if kill -0 succeeds for the recorded pid, 1 if the file
# exists but the signal cannot be delivered (stale), 2 if the file does
# not exist.
ocf_pidfile_status() {
    local pid pidfile="$1"
    if [ ! -e "$pidfile" ]; then
        # Not exists
        return 2
    fi
    pid=$(cat "$pidfile")
    kill -0 "$pid" > /dev/null 2>&1
    if [ $? = 0 ]; then
        return 0
    fi

    # Stale
    return 1
}

# mkdir(1) based locking
# first the directory is created with the name given as $1
# then a file named "pid" is created within that directory with
# the process PID
# stale locks are handled carefully, the inode of a directory
# needs to match before and after test if the process is running
# empty directories are also handled appropriately
# we relax (sleep) occasionally to allow for other processes to
# finish managing the lock in case they are in the middle of the
# business

relax() { sleep 0.5; }

# Print an identifier for a stale lock in directory $1 and return 0, or
# print nothing and return non-zero when the lock is absent, held by a
# live process, or changed while being inspected.  The identifier is the
# recorded pid, or the "ls -di" line of the directory when it holds no
# pid.
ocf_get_stale_pid() {
    local piddir pid dir_inode

    piddir="$1"
    [ -z "$piddir" ] && return 2
    # Paths are quoted so that lock directories containing whitespace or
    # glob characters are handled as a single name.
    dir_inode="`ls -di "$piddir" 2>/dev/null`"
    [ -z "$dir_inode" ] && return 1
    pid=`cat "$piddir/pid" 2>/dev/null`
    if [ -z "$pid" ]; then
        # empty directory?
        relax
        if [ "$dir_inode" = "`ls -di "$piddir" 2>/dev/null`" ]; then
            echo $dir_inode
        else
            return 1
        fi
    elif kill -0 $pid >/dev/null 2>&1; then
        return 1
    elif relax && [ -e "$piddir/pid" ] && [ "$dir_inode" = "`ls -di "$piddir" 2>/dev/null`" ]; then
        echo $pid
    else
        return 1
    fi
}

# There is a race when the following two functions to manage the
# lock file (mk and rm) are invoked in parallel by different
# instances. It is up to the caller to reduce probability of that
# taking place (see ocf_take_lock() below).
# usage: ocf_mk_pid LOCKDIR
# Create the lock directory and record the PID of this shell in
# LOCKDIR/pid. Fails (non-zero) if the directory cannot be created,
# e.g. because it already exists.
ocf_mk_pid() {
	mkdir "$1" 2>/dev/null && echo $$ > "$1/pid"
}

# usage: ocf_rm_pid LOCKDIR
# Remove LOCKDIR/pid and then the (now empty) lock directory.
# Returns the exit code of rmdir, or 1 if no directory was given.
ocf_rm_pid() {
	# with an empty argument we would end up removing /pid
	[ -n "$1" ] || return 1
	rm -f "$1/pid"
	rmdir "$1" 2>/dev/null
}

# Testing and subsequently removing a stale lock (containing the
# process pid) is inherently difficult to do in such a way as to
# prevent a race between creating a pid file and removing it and
# its directory. We reduce the probability of that happening by
# checking if the stale lock persists over a random period of
# time.

# usage: ocf_take_lock LOCKDIR
# Blocks until the lock directory could be created by this process.
ocf_take_lock() {
	local lockdir="$1"
	local rnd
	local stale_pid

	# we don't want it too short, so strip leading zeros; if nothing
	# is left (empty or all-zero value) fall back to 1, otherwise the
	# "random period" below would have zero length
	rnd=$(ocf_maybe_random | sed 's/^0*//')
	rnd=${rnd:-1}
	stale_pid=$(ocf_get_stale_pid "$lockdir")
	if [ -n "$stale_pid" ]; then
		sleep "0.$rnd"
		# remove "stale pid" only if it persists
		[ "$stale_pid" = "$(ocf_get_stale_pid "$lockdir")" ] &&
			ocf_rm_pid "$lockdir"
	fi
	while ! ocf_mk_pid "$lockdir"; do
		ocf_log info "Sleeping until $lockdir is released..."
		sleep "0.$rnd"
	done
}

# usage: ocf_release_lock_on_exit LOCKDIR
# Installs an EXIT trap which removes the lock; the directory name is
# expanded now and quoted so that it survives until the trap runs.
ocf_release_lock_on_exit() {
	trap "ocf_rm_pid '$1'" EXIT
}

# returns true if the CRM is currently running a probe. A probe is
# defined as a monitor operation with a monitoring interval of zero.
ocf_is_probe() {
	[ "$__OCF_ACTION" = "monitor" ] &&
		[ "$OCF_RESKEY_CRM_meta_interval" = 0 ]
}

# returns true if the resource is configured as a clone. This is
# defined as a resource where the clone-max meta attribute is present.
ocf_is_clone() {
	[ -n "${OCF_RESKEY_CRM_meta_clone_max}" ]
}

# returns true if the resource is configured as a multistate
# (master/slave) resource. This is the case if the promotable meta
# attribute is "true", or if the master-max meta attribute is
# present and set to greater than zero.
ocf_is_ms() {
	[ "${OCF_RESKEY_CRM_meta_promotable}" = "true" ] && return 0
	[ -n "${OCF_RESKEY_CRM_meta_master_max}" ] || return 1
	# a non-numeric master-max is simply "not multistate"; don't let
	# test(1) complain about it on stderr
	[ "${OCF_RESKEY_CRM_meta_master_max}" -gt 0 ] 2>/dev/null
}

# version check functions
# allow .
# and - to delimit version numbers
# max version number is 999
#
# usage: ocf_is_ver VER
# returns true if VER starts with a digit and otherwise consists of
# digits, dots and dashes, optionally followed by a suffix made of
# letters, digits, dots, backslashes, pluses and dashes
ocf_is_ver() {
	# quote the argument: unquoted it would be subject to globbing
	printf '%s\n' "$1" | grep '^[0-9][0-9.-]*[0-9A-Za-z.\+-]*$' >/dev/null 2>&1
}

# usage: ocf_version_cmp VER1 VER2
#     version strings must pass ocf_is_ver
#     up to four dot separated fields are compared numerically
# returns:
#     0: VER1 smaller (older) than VER2
#     1: versions equal
#     2: VER1 greater (newer) than VER2
#     3: bad format
ocf_version_cmp() {
	ocf_is_ver "$1" || return 3
	ocf_is_ver "$2" || return 3
	local v1=$1
	local v2=$2
	# keep the scratch variable out of the caller's namespace
	local older

	older=$(printf '%s\n%s\n' "$v1" "$v2" |
		sort -t. -k 1,1n -k 2,2n -k 3,3n -k 4,4n | head -n 1)

	if [ "$v1" = "$v2" ]; then
		return 1
	elif [ "$v1" = "$older" ]; then
		return 0
	else
		return 2 # -1 would look funny in shell ;-)
	fi
}

# usage: ocf_local_nodename
# prints the name of the local cluster node on stdout
ocf_local_nodename() {
	local version

	# use crm_node -n for pacemaker > 1.1.8
	if which pacemakerd > /dev/null 2>&1; then
		version=$(pacemakerd -$ | grep "Pacemaker .*" | awk '{ print $2 }')
		# drop everything after the first dash
		version=$(echo $version | awk -F- '{ print $1 }')
		ocf_version_cmp "$version" "1.1.8"
		if [ $? -eq 2 ] && which crm_node > /dev/null 2>&1; then
			crm_node -n
			return
		fi
	fi

	# otherwise use uname -n
	uname -n
}

# usage: dirname DIR
# prints the directory part of DIR on stdout:
#     "." if DIR contains no slash (or is empty)
#     "/" if DIR is the root directory or lives directly below it
# returns 1 (and prints nothing) unless exactly one argument is given
dirname()
{
	local path
	local parent

	[ $# = 1 ] || return 1
	path="$1"
	# trailing slashes are not part of the last path component
	while [ "${path%/}" != "$path" ]; do
		path="${path%/}"
	done

	case "$path" in
	"")
		# nothing left: DIR was either empty or made of slashes only
		case "$1" in
		/*)	parent="/";;
		*)	parent=".";;
		esac
		;;
	*/*)
		parent="${path%/*}"
		# drop the separator(s) between parent and last component
		while [ "${parent%/}" != "$parent" ]; do
			parent="${parent%/}"
		done
		# the only slashes were leading ones: the parent is the root
		[ -n "$parent" ] || parent="/"
		;;
	*)
		parent="."
		;;
	esac

	echo "$parent"
	return 0
}

# usage: systemd_is_running
# returns:
#    0  PID 1 is systemd
#    1  otherwise
systemd_is_running()
{
	[ "$(cat /proc/1/comm 2>/dev/null)" = "systemd" ]
}

# usage: systemd_drop_in <name> <After|Before> <dependency.service>
# writes the drop-in <name>.conf for resource-agents-deps.target and
# reloads systemd
# returns 1 without touching anything if the argument count is wrong
systemd_drop_in()
{
	local conf_file
	local systemdrundir

	if [ $# -ne 3 ]; then
		ocf_log err "Incorrect number of arguments [$#] for systemd_drop_in."
		# don't write a drop-in built from missing arguments
		return 1
	fi

	systemdrundir="/run/systemd/system/resource-agents-deps.target.d"
	mkdir -p "$systemdrundir"
	conf_file="$systemdrundir/$1.conf"
	cat >"$conf_file" <<EOF
[Unit]
$2=$3
EOF
	# The information is accessible through systemd API and systemd would
	# complain about improper permissions.
	chmod o+r "$conf_file"
	systemctl daemon-reload
}

# move process to root cgroup if realtime scheduling is enabled
ocf_move_to_root_cgroup_if_rt_enabled()
{
	[ -e "/sys/fs/cgroup/cpu/cpu.rt_runtime_us" ] || return 0

	if ! echo $$ >> /sys/fs/cgroup/cpu/tasks; then
		ocf_log warn "Unable to move PID $$ to the root cgroup"
	fi
}

# usage: crm_mon_no_validation args...
# run crm_mon without any cib schema validation
# This is useful when an agent runs in a bundle to avoid potential
# schema validation errors when host and bundle are not perfectly aligned
# To be used, your shell must support process substitution (e.g. bash)
# NOTE: the arguments are passed on unquoted, i.e. they are subject
# to word splitting
# returns:
#    <crm_mon error codes>
crm_mon_no_validation()
{
	# The subshell prevents parsing error with incompatible shells
	"$SHELL" -c "CIB_file=<(${HA_SBIN_DIR}/cibadmin -Q | sed 's/validate-with=\"[^\"]*\"/validate-with=\"none\"/') \
		${HA_SBIN_DIR}/crm_mon \$*" -- $*
}

#
# pseudo_resource status tracking function...
#
# This allows pseudo resources to give correct status information. As we add
# resource monitoring, and better resource tracking in general, this will
# become essential.
#
# These scripts work because ${HA_RSCTMP} is cleaned on node reboot.
#
# We create "resource-string" tracking files under ${HA_RSCTMP} in a
# very simple way:
#
# Existence of "${HA_RSCTMP}/resource-string" means that we consider
# the resource named by "resource-string" to be running.
#
# Note that "resource-string" needs to be unique. Using the resource type
# plus the resource instance arguments to make up the resource string
# is probably sufficient...
#
# usage: ha_pseudo_resource resource-string op [tracking_file]
#	where op is {start|stop|monitor|status|restart|reload|print}
#	print is a special op which just prints the tracking file location
#	user can override our choice of the tracking file location by
#	specifying it as the third arg
#	Note that all operations are silent...
# returns:
#	start|restart|reload: exit code of touch
#	stop: exit code of rm -f
#	status|monitor: 0 if the tracking file exists, otherwise
#		3 (status) or 7 (monitor)
#	any other op: 3
#
ha_pseudo_resource()
{
	local ha_resource_tracking_file="${3:-${HA_RSCTMP}/$1}"

	case $2 in
	start|restart|reload)
		touch "$ha_resource_tracking_file";;
	stop)
		rm -f "$ha_resource_tracking_file";;
	status|monitor)
		[ -f "$ha_resource_tracking_file" ] && return 0
		# not running: the code depends on the flavour of the query
		[ "$2" = status ] && return 3
		return 7;;
	print)
		echo "$ha_resource_tracking_file";;
	*)
		return 3;;
	esac
}

# usage: rmtempdir TMPDIR
# removes the (empty) directory; a missing directory is not an error
rmtempdir()
{
	[ $# = 1 ] || return 1
	[ -e "$1" ] || return 0
	rmdir "$1" || return 1
	return 0
}

# usage: maketempfile [-d]
# creates a temporary file (or directory with -d) and prints its name
# returns:
#	0: created
#	1: bad usage
#	otherwise: exit code of the failed mktemp
maketempfile()
{
	case $# in
	0)
		;;
	1)
		[ "$1" = "-d" ] || return 1
		;;
	*)
		return 1
		;;
	esac

	# hand the mktemp status to the caller instead of claiming success
	# when nothing was created ("$@" is either empty or -d here)
	mktemp "$@"
}

# usage: rmtempfile TMPFILE
# removes the file; a missing file is not an error
rmtempfile ()
{
	[ $# = 1 ] || return 1
	[ -e "$1" ] || return 0
	rm "$1" || return 1
	return 0
}

# echo the first lower supported check level
# pass set of levels supported by the agent
# (in increasing order, 0 is optional)
# prints 0 if OCF_CHECK_LEVEL is not a decimal number
ocf_check_level()
{
	local lvl prev
	lvl=0
	prev=0
	if ocf_is_decimal "$OCF_CHECK_LEVEL"; then
		# the level list should be very short
		# (lvl walks over the positional parameters)
		for lvl; do
			if [ "$lvl" -eq "$OCF_CHECK_LEVEL" ]; then
				break
			elif [ "$lvl" -gt "$OCF_CHECK_LEVEL" ]; then
				lvl=$prev # the previous one
				break
			fi
			prev=$lvl
		done
	fi
	echo $lvl
}

# usage: ocf_stop_processes SIGNALS WAIT_TIME PIDS
#
# we send signals (use quotes for more than one!)
# in the order
# given; if one or more processes are still running we try KILL;
# the wait_time is the _total_ time we'll spend in this function
# this time may be slightly exceeded if the processes won't leave
#
# returns:
#     0: all processes left
#     1: some processes still running
#
# example:
#
# ocf_stop_processes TERM 5 $pids
#
ocf_stop_processes() {
	local signals="$1"
	local total_wait="$2"
	local nsig wait_time
	shift 2
	local pids="$*"
	local sig i

	test -z "$pids" &&
		return 0

	# the total time is split evenly between the given signals; with
	# an empty signal list only KILL is sent, so don't divide by zero
	nsig=`echo $signals | wc -w`
	[ "$nsig" -gt 0 ] || nsig=1
	wait_time=$((total_wait/nsig))

	for sig in $signals KILL; do
		kill -s $sig $pids 2>/dev/null
		# try to leave early, and yet leave processes time to exit
		sleep 0.2
		for i in `seq $wait_time`; do
			kill -s 0 $pids 2>/dev/null ||
				return 0
			sleep 1
		done
	done

	# look one last time: the loop above does not check after its
	# final sleep, and does not check at all if wait_time is 0
	kill -s 0 $pids 2>/dev/null ||
		return 0
	return 1
}

#
# create a given status directory
# if the directory path doesn't start with $HA_VARRUN, then
# we return with error (most of the calls would be with the user
# supplied configuration, hence we need to do necessary
# protection)
# used mostly for PID files
#
# usage: ocf_mkstatedir owner permissions path
#
# owner: user.group
# permissions: permissions
# path: directory path
#
# returns 0 if the directory already exists or could be set up,
# non-zero otherwise (also if we are not root)
#
# example:
#	ocf_mkstatedir named 755 `dirname $pidfile`
#
ocf_mkstatedir()
{
	local owner="$1"
	local perms="$2"
	local path="$3"

	# quoted on purpose: an unquoted empty path turns this into
	# "test -d", which is always true
	test -d "$path" && return 0
	[ "$(id -u)" = 0 ] || return 1

	case "$path" in
	"${HA_VARRUN%/}"/*) : this path is ok ;;
	*) ocf_log err "cannot create $path (does not start with $HA_VARRUN)"
		return 1
	;;
	esac

	mkdir -p "$path" &&
	chown "$owner" "$path" &&
	chmod "$perms" "$path"
}

#
# create a unique status directory in $HA_VARRUN
# used mostly for PID files
# the directory is by default set to
#   $HA_VARRUN/$OCF_RESOURCE_INSTANCE
# the directory name is printed to stdout
#
# usage: ocf_unique_rundir owner permissions name
#
# owner: user.group (default: "root")
# permissions: permissions (default: "755")
# name: some unique string (default:
"$OCF_RESOURCE_INSTANCE") +# +# to use the default either don't set the parameter or set it to +# empty string ("") +# example: +# +# STATEDIR=`ocf_unique_rundir named "" myownstatedir` +# +ocf_unique_rundir() +{ + local path + local owner + local perms + local name + + owner=${1:-"root"} + perms=${2:-"755"} + name=${3:-"$OCF_RESOURCE_INSTANCE"} + path=$HA_VARRUN/$name + if [ ! -d $path ]; then + [ $(id -u) = 0 ] || return 1 + mkdir -p $path && + chown $owner $path && + chmod $perms $path || return 1 + fi + echo $path +} + +# +# RA tracing may be turned on by setting OCF_TRACE_RA +# the trace output will be saved to OCF_TRACE_FILE, if set, or +# by default to +# $HA_VARLIB/trace_ra/<type>/<id>.<action>.<timestamp> +# e.g. $HA_VARLIB/trace_ra/oracle/db.start.2012-11-27.08:37:08 +# +# OCF_TRACE_FILE: +# - FD (small integer [3-9]) in that case it is up to the callers +# to capture output; the FD _must_ be open for writing +# - absolute path +# +# NB: FD 9 may be used for tracing with bash >= v4 in case +# OCF_TRACE_FILE is set to a path. +# +ocf_bash_has_xtracefd() { + [ -n "$BASH_VERSION" ] && [ ${BASH_VERSINFO[0]} -ge 4 ] +} +# for backwards compatibility +ocf_is_bash4() { + ocf_bash_has_xtracefd +} +ocf_trace_redirect_to_file() { + local dest=$1 + if ocf_bash_has_xtracefd; then + exec 9>$dest + BASH_XTRACEFD=9 + else + exec 2>$dest + fi +} +ocf_trace_redirect_to_fd() { + local fd=$1 + if ocf_bash_has_xtracefd; then + BASH_XTRACEFD=$fd + else + exec 2>&$fd + fi +} +__ocf_test_trc_dest() { + local dest=$1 + if ! 
touch $dest; then + ocf_log warn "$dest not writable, trace not going to happen" + __OCF_TRC_DEST="" + __OCF_TRC_MANAGE="" + return 1 + fi + return 0 +} +ocf_default_trace_dest() { + tty >/dev/null && return + if [ -n "$OCF_RESOURCE_TYPE" -a \ + -n "$OCF_RESOURCE_INSTANCE" -a -n "$__OCF_ACTION" ]; then + local ts=`date +%F.%T` + __OCF_TRC_DEST=${OCF_RESKEY_trace_dir}/${OCF_RESOURCE_TYPE}/${OCF_RESOURCE_INSTANCE}.${__OCF_ACTION}.$ts + __OCF_TRC_MANAGE="1" + fi +} + +ocf_start_trace() { + export __OCF_TRC_DEST="" __OCF_TRC_MANAGE="" + case "$OCF_TRACE_FILE" in + [3-9]) ocf_trace_redirect_to_fd "$OCF_TRACE_FILE" ;; + /*/*) __OCF_TRC_DEST=$OCF_TRACE_FILE ;; + "") ocf_default_trace_dest ;; + *) + ocf_log warn "OCF_TRACE_FILE must be set to either FD (open for writing) or absolute file path" + ocf_default_trace_dest + ;; + esac + if [ "$__OCF_TRC_DEST" ]; then + mkdir -p `dirname $__OCF_TRC_DEST` + __ocf_test_trc_dest $__OCF_TRC_DEST || + return + ocf_trace_redirect_to_file "$__OCF_TRC_DEST" + fi + if [ -n "$BASH_VERSION" ]; then + PS4='+ `date +"%T"`: ${FUNCNAME[0]:+${FUNCNAME[0]}:}${LINENO}: ' + fi + set -x + env=$( echo; printenv | sort ) +} +ocf_stop_trace() { + set +x +} + +# Helper functions to map from nodename/bundle-name and physical hostname +# list_index_for_word "node0 node1 node2 node3 node4 node5" node4 --> 5 +# list_word_at_index "NA host1 host2 host3 host4 host5" 3 --> host2 + +# list_index_for_word "node1 node2 node3 node4 node5" node7 --> "" +# list_word_at_index "host1 host2 host3 host4 host5" 8 --> "" + +# attribute_target node1 --> host1 +list_index_for_word() { + echo $1 | tr ' ' '\n' | awk -v x="$2" '$0~x {print NR}' +} + +list_word_at_index() { + echo $1 | tr ' ' '\n' | awk -v n="$2" 'n == NR' +} + +ocf_attribute_target() { + if [ x$1 = x ]; then + if [ x$OCF_RESKEY_CRM_meta_container_attribute_target = xhost -a x$OCF_RESKEY_CRM_meta_physical_host != x ]; then + echo $OCF_RESKEY_CRM_meta_physical_host + else + if [ x$OCF_RESKEY_CRM_meta_on_node != 
x ]; then + echo $OCF_RESKEY_CRM_meta_on_node + else + ocf_local_nodename + fi + fi + return + elif [ x"$OCF_RESKEY_CRM_meta_notify_all_uname" != x ]; then + index=$(list_index_for_word "$OCF_RESKEY_CRM_meta_notify_all_uname" $1) + mapping="" + if [ x$index != x ]; then + mapping=$(list_word_at_index "$OCF_RESKEY_CRM_meta_notify_all_hosts" $index) + fi + if [ x$mapping != x -a x$mapping != xNA ]; then + echo $mapping + return + fi + fi + echo $1 +} + +ocf_promotion_score() { + ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.10.0" + res=$? + if [ $res -eq 2 ] || [ $res -eq 1 ] || ! have_binary "crm_master"; then + ${HA_SBIN_DIR}/crm_attribute -p ${OCF_RESOURCE_INSTANCE} $@ + else + ${HA_SBIN_DIR}/crm_master -l reboot $@ + fi +} + +__ocf_set_defaults "$@" + +: ${OCF_TRACE_RA:=$OCF_RESKEY_trace_ra} +: ${OCF_RESKEY_trace_dir:="$HA_VARLIB/trace_ra"} +ocf_is_true "$OCF_TRACE_RA" && ocf_start_trace + +# pacemaker sets HA_use_logd, some others use HA_LOGD :/ +if ocf_is_true "$HA_use_logd"; then + : ${HA_LOGD:=yes} +fi diff --git a/heartbeat/ocf.py b/heartbeat/ocf.py new file mode 100644 index 0000000..dda2fed --- /dev/null +++ b/heartbeat/ocf.py @@ -0,0 +1,486 @@ +# +# Copyright (c) 2016 Red Hat, Inc, Oyvind Albrigtsen +# All Rights Reserved. +# +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +import sys, os, logging, syslog + +argv=sys.argv +env=os.environ + +# +# Common variables for the OCF Resource Agents supplied by +# heartbeat. +# + +OCF_SUCCESS=0 +OCF_ERR_GENERIC=1 +OCF_ERR_ARGS=2 +OCF_ERR_UNIMPLEMENTED=3 +OCF_ERR_PERM=4 +OCF_ERR_INSTALLED=5 +OCF_ERR_CONFIGURED=6 +OCF_NOT_RUNNING=7 + +# Non-standard values. +# +# OCF does not include the concept of master/slave resources so we +# need to extend it so we can discover a resource's complete state. +# +# OCF_RUNNING_MASTER: +# The resource is in "master" mode and fully operational +# OCF_FAILED_MASTER: +# The resource is in "master" mode but in a failed state +# +# The extra two values should only be used during a probe. +# +# Probes are used to discover resources that were started outside of +# the CRM and/or left behind if the LRM fails. 
+# +# They can be identified in RA scripts by checking for: +# [ "${__OCF_ACTION}" = "monitor" -a "${OCF_RESKEY_CRM_meta_interval}" = "0" ] +# +# Failed "slaves" should continue to use: OCF_ERR_GENERIC +# Fully operational "slaves" should continue to use: OCF_SUCCESS +# +OCF_RUNNING_MASTER=8 +OCF_FAILED_MASTER=9 + + +## Own logger handler that uses old-style syslog handler as otherwise +## everything is sourced from /dev/syslog +class SyslogLibHandler(logging.StreamHandler): + """ + A handler class that correctly push messages into syslog + """ + def emit(self, record): + syslog_level = { + logging.CRITICAL:syslog.LOG_CRIT, + logging.ERROR:syslog.LOG_ERR, + logging.WARNING:syslog.LOG_WARNING, + logging.INFO:syslog.LOG_INFO, + logging.DEBUG:syslog.LOG_DEBUG, + logging.NOTSET:syslog.LOG_DEBUG, + }[record.levelno] + + msg = self.format(record) + + # syslog.syslog can not have 0x00 character inside or exception + # is thrown + syslog.syslog(syslog_level, msg.replace("\x00","\n")) + return + + +OCF_RESOURCE_INSTANCE = env.get("OCF_RESOURCE_INSTANCE") + +OCF_ACTION = env.get("__OCF_ACTION") +if OCF_ACTION is None and len(argv) == 2: + OCF_ACTION = argv[1] + +HA_DEBUG = env.get("HA_debug", 0) +HA_DATEFMT = env.get("HA_DATEFMT", "%b %d %T ") +HA_LOGFACILITY = env.get("HA_LOGFACILITY") +HA_LOGFILE = env.get("HA_LOGFILE") +HA_DEBUGLOG = env.get("HA_DEBUGLOG") + +log = logging.getLogger(os.path.basename(argv[0])) +log.setLevel(logging.DEBUG) + +## add logging to stderr +if sys.stdout.isatty(): + seh = logging.StreamHandler(stream=sys.stderr) + if HA_DEBUG == 0: + seh.setLevel(logging.WARNING) + sehformatter = logging.Formatter('%(filename)s(%(OCF_RESOURCE_INSTANCE)s)[%(process)s]:\t%(asctime)s%(levelname)s: %(message)s', datefmt=HA_DATEFMT) + seh.setFormatter(sehformatter) + log.addHandler(seh) + +## add logging to syslog +if HA_LOGFACILITY: + slh = SyslogLibHandler() + if HA_DEBUG == 0: + slh.setLevel(logging.WARNING) + slhformatter = logging.Formatter('%(levelname)s: 
%(message)s') + slh.setFormatter(slhformatter) + log.addHandler(slh) + +## add logging to file +if HA_LOGFILE: + lfh = logging.FileHandler(HA_LOGFILE) + if HA_DEBUG == 0: + lfh.setLevel(logging.WARNING) + lfhformatter = logging.Formatter('%(filename)s(%(OCF_RESOURCE_INSTANCE)s)[%(process)s]:\t%(asctime)s%(levelname)s: %(message)s', datefmt=HA_DATEFMT) + lfh.setFormatter(lfhformatter) + log.addHandler(lfh) + +## add debug logging to file +if HA_DEBUGLOG and HA_LOGFILE != HA_DEBUGLOG: + dfh = logging.FileHandler(HA_DEBUGLOG) + if HA_DEBUG == 0: + dfh.setLevel(logging.WARNING) + dfhformatter = logging.Formatter('%(filename)s(%(OCF_RESOURCE_INSTANCE)s)[%(process)s]:\t%(asctime)s%(levelname)s: %(message)s', datefmt=HA_DATEFMT) + dfh.setFormatter(dfhformatter) + log.addHandler(dfh) + +logger = logging.LoggerAdapter(log, {'OCF_RESOURCE_INSTANCE': OCF_RESOURCE_INSTANCE}) + + +_exit_reason_set = False + +def ocf_exit_reason(msg): + """ + Print exit error string to stderr. + + Allows the OCF agent to provide a string describing + why the exit code was returned. + """ + global _exit_reason_set + cookie = env.get("OCF_EXIT_REASON_PREFIX", "ocf-exit-reason:") + sys.stderr.write("{}{}\n".format(cookie, msg)) + sys.stderr.flush() + logger.error(msg) + _exit_reason_set = True + + +def have_binary(name): + """ + True if binary exists, False otherwise. + """ + def _access_check(fn): + return (os.path.exists(fn) and + os.access(fn, os.F_OK | os.X_OK) and + not os.path.isdir(fn)) + if _access_check(name): + return True + path = env.get("PATH", os.defpath).split(os.pathsep) + seen = set() + for dir in path: + dir = os.path.normcase(dir) + if dir not in seen: + seen.add(dir) + name2 = os.path.join(dir, name) + if _access_check(name2): + return True + return False + + +def is_true(val): + """ + Convert an OCF truth value to a + Python boolean. 
+ """ + return val in ("yes", "true", "1", 1, "YES", "TRUE", "ja", "on", "ON", True) + + +def is_probe(): + """ + A probe is defined as a monitor operation + with an interval of zero. This is called + by Pacemaker to check the status of a possibly + not running resource. + """ + return (OCF_ACTION == "monitor" and + ( env.get("OCF_RESKEY_CRM_meta_interval", "") == "0" or + env.get("OCF_RESKEY_CRM_meta_interval", "") == "" )) + + +def get_parameter(name, default=None): + """ + Extract the parameter value from the environment + """ + return env.get("OCF_RESKEY_{}".format(name), default) + + +def distro(): + """ + Return name of distribution/platform. + + If possible, returns "name/version", else + just "name". + """ + import subprocess + import platform + try: + ret = subprocess.check_output(["lsb_release", "-si"]) + if type(ret) != str: + ret = ret.decode() + distro = ret.strip() + ret = subprocess.check_output(["lsb_release", "-sr"]) + if type(ret) != str: + ret = ret.decode() + version = ret.strip() + return "{}/{}".format(distro, version) + except Exception: + if os.path.exists("/etc/debian_version"): + return "Debian" + if os.path.exists("/etc/SuSE-release"): + return "SUSE" + if os.path.exists("/etc/redhat-release"): + return "Redhat" + return platform.system() + + +class Parameter(object): + def __init__(self, name, shortdesc, longdesc, content_type, unique, required, default): + self.name = name + self.shortdesc = shortdesc + self.longdesc = longdesc + self.content_type = content_type + self.unique = unique + self.required = required + self.default = default + + def __str__(self): + return self.to_xml() + + def to_xml(self): + ret = '<parameter name="' + self.name + '"' + if self.unique: + ret += ' unique="1"' + if self.required: + ret += ' required="1"' + ret += ">\n" + ret += '<longdesc lang="en">' + self.longdesc + '</longdesc>' + "\n" + ret += '<shortdesc lang="en">' + self.shortdesc + '</shortdesc>' + "\n" + ret += '<content type="' + self.content_type + 
'"' + if self.default is not None: + ret += ' default="{}"'.format(self.default) + ret += " />\n" + ret += "</parameter>\n" + return ret + + + +class Action(object): + def __init__(self, name, timeout, interval, depth, role): + self.name = name + self.timeout = timeout + self.interval = interval + self.depth = depth + self.role = role + + def __str__(self): + return self.to_xml() + + def to_xml(self): + def opt(s, name, var): + if var is not None: + if type(var) == int and name in ("timeout", "interval"): + var = "{}s".format(var) + return s + ' {}="{}"'.format(name, var) + return s + ret = '<action name="{}"'.format(self.name) + ret = opt(ret, "timeout", self.timeout) + ret = opt(ret, "interval", self.interval) + ret = opt(ret, "depth", self.depth) + ret = opt(ret, "role", self.role) + ret += " />\n" + return ret + + +class Agent(object): + """ + OCF Resource Agent metadata XML generator helper. + + Use add_parameter/add_action to define parameters + and actions for the agent. Then call run() to + start the agent main loop. + + See doc/dev-guides/writing-python-agents.md for an example + of how to use it. 
+ """ + + def __init__(self, name, shortdesc, longdesc, version=1.0, ocf_version=1.0): + self.name = name + self.shortdesc = shortdesc + self.longdesc = longdesc + self.version = version + self.ocf_version = ocf_version + self.parameters = [] + self.actions = [] + self._handlers = {} + + def add_parameter(self, name, shortdesc="", longdesc="", content_type="string", unique=False, required=False, default=None): + for param in self.parameters: + if param.name == name: + raise ValueError("Parameter {} defined twice in metadata".format(name)) + self.parameters.append(Parameter(name=name, + shortdesc=shortdesc, + longdesc=longdesc, + content_type=content_type, + unique=unique, + required=required, + default=default)) + return self + + def add_action(self, name, timeout=None, interval=None, depth=None, role=None, handler=None): + self.actions.append(Action(name=name, + timeout=timeout, + interval=interval, + depth=depth, + role=role)) + if handler is not None: + self._handlers[name] = handler + return self + + def __str__(self): + return self.to_xml() + + def to_xml(self): + return """<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="{name}" version="{version}"> +<version>{ocf_version}</version> +<longdesc lang="en"> +{longdesc} +</longdesc> +<shortdesc lang="en">{shortdesc}</shortdesc> + +<parameters> +{parameters} +</parameters> + +<actions> +{actions} +</actions> + +</resource-agent> +""".format(name=self.name, + version = self.version, + ocf_version = self.ocf_version, + longdesc=self.longdesc, + shortdesc=self.shortdesc, + parameters="".join(p.to_xml() for p in self.parameters), + actions="".join(a.to_xml() for a in self.actions)) + + def run(self): + run(self) + + +def run(agent, handlers=None): + """ + Main loop implementation for resource agents. + Does not return. + + Arguments: + + agent: Agent object. + + handlers: Dict of action name to handler function. 
+ + Handler functions can take parameters as arguments, + the run loop will read parameter values from the + environment and pass to the handler. + """ + import inspect + + agent._handlers.update(handlers or {}) + handlers = agent._handlers + + def check_required_params(): + for p in agent.parameters: + if p.required and get_parameter(p.name) is None: + ocf_exit_reason("{}: Required parameter not set".format(p.name)) + sys.exit(OCF_ERR_CONFIGURED) + + def call_handler(func): + if hasattr(inspect, 'signature'): + params = inspect.signature(func).parameters.keys() + else: + params = inspect.getargspec(func).args + if 'self' in params: params.remove('self') + def value_for_parameter(param): + val = get_parameter(param) + if val is not None: + return val + for p in agent.parameters: + if p.name == param: + return p.default + arglist = [value_for_parameter(p) for p in params] + try: + rc = func(*arglist) + if rc is None: + rc = OCF_SUCCESS + return rc + except Exception as err: + if not _exit_reason_set: + ocf_exit_reason(str(err)) + else: + logger.error(str(err)) + return OCF_ERR_GENERIC + + meta_data_action = False + for action in agent.actions: + if action.name == "meta-data": + meta_data_action = True + break + if not meta_data_action: + agent.add_action("meta-data", timeout=10) + + if len(sys.argv) == 2 and sys.argv[1] in ("-h", "--help"): + sys.stdout.write("usage: %s {%s}\n\n" % (sys.argv[0], "|".join(sorted(handlers.keys()))) + + "Expects to have a fully populated OCF RA compliant environment set.\n") + sys.exit(OCF_SUCCESS) + + if OCF_ACTION is None: + ocf_exit_reason("No action argument set") + sys.exit(OCF_ERR_UNIMPLEMENTED) + if OCF_ACTION in ('meta-data', 'usage', 'methods'): + sys.stdout.write(agent.to_xml() + "\n") + sys.exit(OCF_SUCCESS) + + check_required_params() + if OCF_ACTION in handlers: + rc = call_handler(handlers[OCF_ACTION]) + sys.exit(rc) + sys.exit(OCF_ERR_UNIMPLEMENTED) + + +if __name__ == "__main__": + import unittest + + class 
TestMetadata(unittest.TestCase): + def test_noparams_noactions(self): + m = Agent("foo", shortdesc="shortdesc", longdesc="longdesc") + self.assertEqual("""<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="foo"> +<version>1.0</version> +<longdesc lang="en"> +longdesc +</longdesc> +<shortdesc lang="en">shortdesc</shortdesc> + +<parameters> + +</parameters> + +<actions> + +</actions> + +</resource-agent> +""", str(m)) + + def test_params_actions(self): + m = Agent("foo", shortdesc="shortdesc", longdesc="longdesc") + m.add_parameter("testparam") + m.add_action("start") + self.assertEqual(str(m.actions[0]), '<action name="start" />\n') + + unittest.main() diff --git a/heartbeat/ocivip b/heartbeat/ocivip new file mode 100755 index 0000000..053646d --- /dev/null +++ b/heartbeat/ocivip @@ -0,0 +1,263 @@ +#!/bin/sh +# +# +# Manage Secondary Private IP in Oracle Cloud Infrastructure with Pacemaker +# +# +# Copyright 2016-2018 Lorenzo Garuti <garuti.lorenzo@gmail.com> +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#
#

#
# Prerequisites:
#
# - OCI CLI installed (https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/climanualinst.htm)
# - jq installed
# - dynamic group with a policy attached
# - the policy must have this statement:
#     allow dynamic-group <GROUP_NAME> to use virtual-network-family in compartment id <COMPARTMENT_ID>
# - a reserved secondary private IP address for Compute Instances high availability
#

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

#######################################################################

#
# Defaults
#
OCF_RESKEY_ocicli_default="/usr/local/bin/oci"
OCF_RESKEY_api_delay_default="3"
OCF_RESKEY_cidr_netmask_default="24"
OCF_RESKEY_interface_alias_default="0"
export OCI_CLI_AUTH=instance_principal

: ${OCF_RESKEY_ocicli=${OCF_RESKEY_ocicli_default}}
: ${OCF_RESKEY_api_delay=${OCF_RESKEY_api_delay_default}}
: ${OCF_RESKEY_cidr_netmask=${OCF_RESKEY_cidr_netmask_default}}
: ${OCF_RESKEY_interface_alias=${OCF_RESKEY_interface_alias_default}}

# Print the OCF meta-data XML describing this agent on stdout.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="ocivip">
<version>1.0</version>

<longdesc lang="en">
Resource Agent for OCI Compute instance Secondary Private IP Addresses.

It manages OCI Secondary Private IP Addresses for Compute instances with oci cli.

See https://docs.oracle.com/en-us/iaas/Content/API/Concepts/cliconcepts.htm for more information about oci cli.

Prerequisites:

- OCI CLI installed (https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/climanualinst.htm)
- jq installed
- dynamic group with a policy attacched
- the policy must have this statement: allow dynamic-group GROUP_NAME to use virtual-network-family in compartment id COMPARTMENT_ID
- a reserved secondary private IP address for Compute Instances high availability

</longdesc>
<shortdesc lang="en">OCI Secondary Private IP Address for Compute instances Resource Agent</shortdesc>

<parameters>

<parameter name="ocicli" unique="0">
<longdesc lang="en">
OCI Command line interface (CLI) tools
</longdesc>
<shortdesc lang="en">OCI cli tools</shortdesc>
<content type="string" default="${OCF_RESKEY_ocicli_default}" />
</parameter>

<parameter name="secondary_private_ip" unique="1" required="1">
<longdesc lang="en">
reserved secondary private ip for compute instance
</longdesc>
<shortdesc lang="en">reserved secondary private ip for compute instance</shortdesc>
<content type="string" default="" />
</parameter>

<parameter name="cidr_netmask" unique="0">
<longdesc lang="en">
netmask for the secondary_private_ip
</longdesc>
<shortdesc lang="en">netmask for the secondary_private_ip</shortdesc>
<content type="integer" default="${OCF_RESKEY_cidr_netmask_default}" />
</parameter>

<parameter name="interface_alias" unique="0">
<longdesc lang="en">
numeric alias for the interface
</longdesc>
<shortdesc lang="en">numeric alias for the interface</shortdesc>
<content type="integer" default="${OCF_RESKEY_interface_alias_default}" />
</parameter>

<parameter name="api_delay" unique="0">
<longdesc lang="en">
a short delay between API calls, to avoid sending API too quick
</longdesc>
<shortdesc lang="en">a short delay between API calls</shortdesc>
<content type="integer" default="${OCF_RESKEY_api_delay_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="30s" />
<action name="stop" timeout="30s" />
<action name="monitor" timeout="30s" interval="20s" depth="0" />
<action name="migrate_to" timeout="30s" />
<action name="migrate_from" timeout="30s" />
<action name="meta-data" timeout="5s" />
<action name="validate" timeout="10s" />
<action name="validate-all" timeout="10s" />
</actions>
</resource-agent>
END
}

#######################################################################

# Print a short usage message on stdout.
ocivip_usage() {
    cat <<END
usage: $0 {start|stop|monitor|migrate_to|migrate_from|validate|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Assign the secondary private IP to this instance's VNIC through the OCI
# CLI and add it to the primary interface under the configured alias label.
# Returns OCF_SUCCESS when the address is (already) up, OCF_ERR_GENERIC when
# either the API call or the local "ip addr add" failed.
ocivip_start() {
    ocivip_monitor && return $OCF_SUCCESS

    $OCICLI network vnic assign-private-ip --vnic-id "$VNIC_ID" \
        --unassign-if-already-assigned \
        --ip-address "${SECONDARY_PRIVATE_IP}"
    RETOCI=$?
    ip addr add "${SECONDARY_PRIVATE_IP}/${CIDR_NETMASK}" dev "${PRIMARY_IFACE}" label "${PRIMARY_IFACE}:${INTERFACE_ALIAS}"
    RETIP=$?

    # delay to avoid sending request too fast
    sleep ${OCF_RESKEY_api_delay}

    if [ $RETOCI -ne 0 ] || [ $RETIP -ne 0 ]; then
        # A failed start is an error, not a clean "not running" state.
        ocf_exit_reason "failed to bring up ${SECONDARY_PRIVATE_IP} (oci rc=$RETOCI, ip rc=$RETIP)"
        return $OCF_ERR_GENERIC
    fi

    ocf_log info "secondary_private_ip has been successfully brought up (${SECONDARY_PRIVATE_IP})"
    return $OCF_SUCCESS
}

# Unassign the secondary private IP from this instance's VNIC and remove it
# from the primary interface.
# Returns OCF_SUCCESS when the address is (already) down, OCF_ERR_GENERIC
# when either the API call or the local "ip addr del" failed.
ocivip_stop() {
    ocivip_monitor || return $OCF_SUCCESS

    $OCICLI network vnic unassign-private-ip --vnic-id "$VNIC_ID" \
        --ip-address "${SECONDARY_PRIVATE_IP}"
    RETOCI=$?
    # Address the real device and select the alias through "label", exactly
    # mirroring the "ip addr add" in ocivip_start.
    ip addr del "${SECONDARY_PRIVATE_IP}/${CIDR_NETMASK}" dev "${PRIMARY_IFACE}" label "${PRIMARY_IFACE}:${INTERFACE_ALIAS}"
    RETIP=$?

    # delay to avoid sending request too fast
    sleep ${OCF_RESKEY_api_delay}

    if [ $RETOCI -ne 0 ] || [ $RETIP -ne 0 ]; then
        # Reporting "not running" here would hide a failed stop.
        ocf_exit_reason "failed to bring down ${SECONDARY_PRIVATE_IP} (oci rc=$RETOCI, ip rc=$RETIP)"
        return $OCF_ERR_GENERIC
    fi

    ocf_log info "secondary_private_ip has been successfully brought down (${SECONDARY_PRIVATE_IP})"
    return $OCF_SUCCESS
}

# Check whether the secondary private IP is currently assigned to this
# instance's VNIC according to the OCI API.
# Returns OCF_SUCCESS when it is listed, OCF_NOT_RUNNING otherwise.
ocivip_monitor() {
    # -F: the address is a literal string, not a regex (dots must not match
    # any character); -w: whole-word match so 10.0.0.1 does not match 10.0.0.10.
    $OCICLI network private-ip list --vnic-id "$VNIC_ID" | grep -qwF -- "${SECONDARY_PRIVATE_IP}"
    RETOCI=$?

    if [ $RETOCI -ne 0 ]; then
        return $OCF_NOT_RUNNING
    fi
    return $OCF_SUCCESS
}

# Verify that the required tools and settings are available.
# Returns OCF_SUCCESS, OCF_ERR_CONFIGURED when secondary_private_ip is unset,
# or OCF_ERR_GENERIC when the VNIC id could not be determined.
ocivip_validate() {
    check_binary ${OCICLI}
    check_binary jq

    if [ -z "${SECONDARY_PRIVATE_IP}" ]; then
        ocf_exit_reason "secondary_private_ip is required"
        return $OCF_ERR_CONFIGURED
    fi

    if [ -z "${VNIC_ID}" ]; then
        ocf_exit_reason "vnic_id not found. Is this a Compute instance?"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# meta-data must work without touching the instance metadata service.
case $__OCF_ACTION in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
esac

OCICLI="${OCF_RESKEY_ocicli}"
SECONDARY_PRIVATE_IP="${OCF_RESKEY_secondary_private_ip}"
CIDR_NETMASK="${OCF_RESKEY_cidr_netmask}"
INTERFACE_ALIAS="${OCF_RESKEY_interface_alias}"
# "// empty" keeps VNIC_ID empty (instead of the literal string "null") when
# the metadata service returns no vnicId, so ocivip_validate can detect it.
VNIC_ID="$(curl -s -H "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/vnics/ | jq -r '.[0].vnicId // empty')"
# Use the first default route only; several default routes would otherwise
# yield a multi-line value.
PRIMARY_IFACE=$(ip -4 route ls | grep default | grep -Po '(?<=dev )(\S+)' | head -n 1)

case $__OCF_ACTION in
    start)
        ocivip_validate || exit $?
        ocivip_start
        ;;
    stop)
        ocivip_stop
        ;;
    monitor)
        ocivip_monitor
        ;;
    migrate_to)
        ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} to ${OCF_RESKEY_CRM_meta_migrate_target}."
        ocivip_stop
        ;;
    migrate_from)
        ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} from ${OCF_RESKEY_CRM_meta_migrate_source}."
        ocivip_start
        ;;
    reload)
        ocf_log info "Reloading ${OCF_RESOURCE_INSTANCE} ..."
        ;;
    validate|validate-all)
        ocivip_validate
        ;;
    usage|help)
        ocivip_usage
        exit $OCF_SUCCESS
        ;;
    *)
        ocivip_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac

rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
diff --git a/heartbeat/openstack-cinder-volume b/heartbeat/openstack-cinder-volume
new file mode 100755
index 0000000..116442c
--- /dev/null
+++ b/heartbeat/openstack-cinder-volume
@@ -0,0 +1,294 @@
#!/bin/sh
#
#
# OCF resource agent to attach a cinder volume to an instance.
#
# Copyright (c) 2018 Mathieu GRZYBEK
# Based on code of Markus Guertler
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#


#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

. ${OCF_FUNCTIONS_DIR}/openstack-common.sh

# Defaults
OCF_RESKEY_volume_local_check_default="true"

: ${OCF_RESKEY_volume_local_check=${OCF_RESKEY_volume_local_check_default}}

#######################################################################


USAGE="usage: $0 {start|stop|status|meta-data}";
###############################################################################


###############################################################################
#
# Functions
#
###############################################################################


# Print the OCF meta-data XML on stdout; the shared OpenStack parameters are
# emitted by common_meta_data between the two here-documents.
metadata() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="openstack-cinder-volume" version="2.0">
<version>1.0</version>
<longdesc lang="en">
Resource Agent to attach a cinder volume to an instance.
It relies on attributes given by openstack-info resource agent (openstack_id attribute).
</longdesc>
<shortdesc lang="en">Attach a cinder volume</shortdesc>

<parameters>
END

common_meta_data

cat <<END
<parameter name="volume_local_check">
<longdesc lang="en">
This option allows the cluster to monitor the cinder volume presence without
calling the API.
</longdesc>
<shortdesc lang="en">Monitor cinder volume locally</shortdesc>
<content type="boolean" default="${OCF_RESKEY_volume_local_check_default}" />
</parameter>

<parameter name="volume_id" required="1">
<longdesc lang="en">
Cinder volume identifier to use to attach the block storage.
</longdesc>
<shortdesc lang="en">Volume ID</shortdesc>
<content type="string" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="180s" />
<action name="stop" timeout="180s" />
<action name="monitor" depth="0" timeout="180s" interval="60s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}

# Print the OpenStack server id of the local node (the openstack_id node
# attribute) on stdout.
#
# The value is emitted by the grep below, which also validates that it is a
# UUID. On failure an exit reason is recorded and the function exits with
# OCF_ERR_CONFIGURED; callers use "$(_get_node_id)", so that exit only ends
# the command substitution and they must check its status themselves.
_get_node_id() {
    local node_name
    local node_id

    node_name=$(crm_node -n)
    node_id=$(${HA_SBIN_DIR}/attrd_updater --query -n openstack_id -N "$node_name" |
        awk -F= '{gsub("\"","");print $NF}')

    if ! echo "$node_id" | grep -P "^[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}$"; then
        ocf_exit_reason "openstack_id attribute must be set for node $node_name"
        exit $OCF_ERR_CONFIGURED
    fi
}

# Check the CLI binary, the credentials, that the configured volume exists
# and that the openstack_id attribute can be queried for this node.
# Returns OCF_SUCCESS, OCF_ERR_CONFIGURED or OCF_ERR_GENERIC.
osvol_validate() {
    local result
    local node_name

    if [ -z "$OCF_RESKEY_volume_id" ]; then
        ocf_exit_reason "volume_id is required"
        return $OCF_ERR_CONFIGURED
    fi

    check_binary "$OCF_RESKEY_openstackcli"

    get_config

    result=$(run_openstackcli "volume list")
    if ! echo "$result" | grep -q -- "$OCF_RESKEY_volume_id"; then
        ocf_exit_reason "volume-id $OCF_RESKEY_volume_id not found"
        return $OCF_ERR_CONFIGURED
    fi

    node_name=$(crm_node -n)
    ${HA_SBIN_DIR}/attrd_updater --query -n openstack_id -N "$node_name" > /dev/null 2>&1
    if [ $? -ne 0 ] ; then
        ocf_log warn "attr_updater failed to get openstack_id attribute of node $node_name"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Report whether the volume is attached to this instance.
#
# For the "monitor" action with volume_local_check enabled only the local
# block devices are inspected; otherwise the OpenStack API is queried and
# attached_server_id is exported with the id of the instance holding the
# volume (osvol_start relies on it).
# Returns OCF_SUCCESS or OCF_NOT_RUNNING.
osvol_monitor() {
    local result
    local node_id
    local short_volume_id

    # Best effort: an empty node_id simply never equals the attached server.
    node_id=$(_get_node_id)

    if [ "$__OCF_ACTION" = "monitor" ] && ocf_is_true $OCF_RESKEY_volume_local_check ; then
        #
        # Is the volume attached?
        # We check the local devices
        #
        # awk positions are 1-based: substr(s, 0, 20) returns only 19
        # characters on POSIX/gawk, so start at 1 to get the first 20.
        short_volume_id=$(echo $OCF_RESKEY_volume_id | awk '{print substr($0, 1, 20)}')
        if lsblk /dev/disk/by-id/virtio-$short_volume_id 1>/dev/null 2>&1; then
            return $OCF_SUCCESS
        else
            ocf_log warn "$OCF_RESKEY_volume_id is not attached to instance $node_id"
            return $OCF_NOT_RUNNING
        fi
    fi

    #
    # Is the volume attached?
    # We use the API
    #
    result=$(run_openstackcli "volume show \
        --column status \
        --column attachments \
        --format value \
        $OCF_RESKEY_volume_id")

    if echo "$result" | grep -q available; then
        ocf_log warn "$OCF_RESKEY_volume_id is not attached to any instance"
        return $OCF_NOT_RUNNING
    else
        export attached_server_id=$(echo "$result"|head -n1|
            grep -P -o "'server_id': '[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}'"|
            grep -P -o "[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}")
        ocf_log info "$OCF_RESKEY_volume_id is attached to instance $attached_server_id"

        # Compare node_id and the id of the node the volume is attached to

        if [ "$node_id" != "$attached_server_id" ] ; then
            ocf_log warn "$OCF_RESKEY_volume_id is not attached to this instance"
            return $OCF_NOT_RUNNING
        fi
    fi

    return $OCF_SUCCESS
}

# Detach the volume from this instance.
# Returns OCF_SUCCESS when it is (already) detached, OCF_ERR_GENERIC otherwise.
osvol_stop() {
    local node_id

    #
    # Is the volume already attached?
    #
    osvol_monitor
    if [ $? = $OCF_NOT_RUNNING ]; then
        ocf_log info "Volume $OCF_RESKEY_volume_id already available"
        return $OCF_SUCCESS
    fi

    # Without a server id the API call below would be malformed.
    node_id=$(_get_node_id) || return $OCF_ERR_GENERIC

    #
    # Detach the volume
    #
    if ! run_openstackcli "server remove volume $node_id $OCF_RESKEY_volume_id"; then
        ocf_log error "Couldn't remove volume $OCF_RESKEY_volume_id from instance $node_id"
        return $OCF_ERR_GENERIC
    fi

    ocf_log info "Successfully removed $OCF_RESKEY_volume_id from instance $node_id"
    return $OCF_SUCCESS
}

# Attach the volume to this instance, detaching it first from whichever
# instance osvol_monitor found it on.
# Returns OCF_SUCCESS or OCF_ERR_GENERIC.
osvol_start() {
    local node_id

    #
    # Is the volume already attached?
    #
    osvol_monitor
    if [ $? = $OCF_SUCCESS ]; then
        ocf_log info "$OCF_RESKEY_volume_id already attached"
        return $OCF_SUCCESS
    fi

    #
    # Detach it from another node
    # TODO: make it optional in case multi-attachment is allowed by Cinder
    #
    if [ -n "$attached_server_id" ] ; then
        if ! run_openstackcli "server remove volume $attached_server_id $OCF_RESKEY_volume_id"; then
            ocf_log error "Couldn't remove volume $OCF_RESKEY_volume_id from instance $attached_server_id"
            return $OCF_ERR_GENERIC
        fi
    fi

    export attached_server_id=""

    # Without a server id the API call below would be malformed.
    node_id=$(_get_node_id) || return $OCF_ERR_GENERIC

    #
    # Attach the volume
    #
    run_openstackcli "server add volume $node_id $OCF_RESKEY_volume_id"
    if [ $? != $OCF_SUCCESS ]; then
        ocf_log error "Couldn't add volume $OCF_RESKEY_volume_id to instance $node_id"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

###############################################################################
#
# MAIN
#
###############################################################################

# These actions need neither root nor a working configuration.
case $__OCF_ACTION in
    meta-data)
        metadata
        exit $OCF_SUCCESS
        ;;
    usage|help)
        echo $USAGE
        exit $OCF_SUCCESS
        ;;
esac

if ! ocf_is_root; then
    ocf_log err "You must be root for $__OCF_ACTION operation."
    exit $OCF_ERR_PERM
fi

case $__OCF_ACTION in
    start)
        osvol_validate || exit $?
        osvol_start;;
    stop)
        osvol_validate || exit $?
        osvol_stop;;
    monitor|status)
        osvol_validate || exit $?
        osvol_monitor;;
    validate-all)
        osvol_validate
        ;;
    *)
        echo $USAGE
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac

exit $?
diff --git a/heartbeat/openstack-common.sh b/heartbeat/openstack-common.sh
new file mode 100644
index 0000000..14d290b
--- /dev/null
+++ b/heartbeat/openstack-common.sh
@@ -0,0 +1,173 @@
OCF_RESKEY_user_domain_name_default="Default"
OCF_RESKEY_project_domain_name_default="Default"
OCF_RESKEY_openstackcli_default="/usr/bin/openstack"
OCF_RESKEY_insecure_default="false"

: ${OCF_RESKEY_user_domain_name=${OCF_RESKEY_user_domain_name_default}}
: ${OCF_RESKEY_project_domain_name=${OCF_RESKEY_project_domain_name_default}}
: ${OCF_RESKEY_openstackcli=${OCF_RESKEY_openstackcli_default}}
: ${OCF_RESKEY_insecure=${OCF_RESKEY_insecure_default}}

if ocf_is_true "${OCF_RESKEY_insecure}"; then
    OCF_RESKEY_openstackcli="${OCF_RESKEY_openstackcli} --insecure"
fi

# Print the <parameter> elements shared by all OpenStack agents on stdout.
# The caller is expected to emit the surrounding <parameters> element.
common_meta_data() {
    cat <<END

<parameter name="cloud" required="0">
<longdesc lang="en">
Openstack cloud (from ~/.config/openstack/clouds.yaml or /etc/openstack/clouds.yaml).
</longdesc>
<shortdesc lang="en">Cloud from clouds.yaml</shortdesc>
<content type="string" />
</parameter>

<parameter name="openrc" required="0">
<longdesc lang="en">
Openstack credentials as openrc file from api_access/openrc.
</longdesc>
<shortdesc lang="en">openrc file</shortdesc>
<content type="string" />
</parameter>

<parameter name="auth_url" required="0">
<longdesc lang="en">
Keystone Auth URL
</longdesc>
<shortdesc lang="en">Keystone Auth URL</shortdesc>
<content type="string" />
</parameter>

<parameter name="username" required="0">
<longdesc lang="en">
Username.
</longdesc>
<shortdesc lang="en">Username</shortdesc>
<content type="string" />
</parameter>

<parameter name="password" required="0">
<longdesc lang="en">
Password.
</longdesc>
<shortdesc lang="en">Password</shortdesc>
<content type="string" />
</parameter>

<parameter name="project_name" required="0">
<longdesc lang="en">
Keystone Project.
</longdesc>
<shortdesc lang="en">Keystone Project</shortdesc>
<content type="string" />
</parameter>

<parameter name="user_domain_name" required="0">
<longdesc lang="en">
Keystone User Domain Name.
</longdesc>
<shortdesc lang="en">Keystone User Domain Name</shortdesc>
<content type="string" default="${OCF_RESKEY_user_domain_name_default}" />
</parameter>

<parameter name="project_domain_name" required="0">
<longdesc lang="en">
Keystone Project Domain Name.
</longdesc>
<shortdesc lang="en">Keystone Project Domain Name</shortdesc>
<content type="string" default="${OCF_RESKEY_project_domain_name_default}" />
</parameter>

<parameter name="openstackcli">
<longdesc lang="en">
Path to command line tools for openstack.
</longdesc>
<shortdesc lang="en">Path to Openstack CLI tool</shortdesc>
<content type="string" default="${OCF_RESKEY_openstackcli_default}" />
</parameter>

<parameter name="insecure">
<longdesc lang="en">
Allow insecure connections
</longdesc>
<shortdesc lang="en">Allow insecure connections</shortdesc>
<content type="boolean" default="${OCF_RESKEY_insecure_default}" />
</parameter>
END
}

# Select the credential source, in order of precedence: the "cloud"
# parameter (clouds.yaml), the "openrc" file, or the individual auth
# parameters. The chosen options are appended to OCF_RESKEY_openstackcli,
# or the openrc file is sourced into the environment.
# Exits with OCF_ERR_CONFIGURED when the selected source is incomplete.
get_config() {
    if [ -n "$OCF_RESKEY_cloud" ]; then
        TILDE=$(echo ~)
        clouds_yaml="$TILDE/.config/openstack/clouds.yaml"
        if [ ! -f "$clouds_yaml" ]; then
            clouds_yaml="/etc/openstack/clouds.yaml"
        fi
        if [ ! -f "$clouds_yaml" ]; then
            ocf_exit_reason "~/.config/openstack/clouds.yaml and /etc/openstack/clouds.yaml does not exist"
            exit $OCF_ERR_CONFIGURED
        fi
        OCF_RESKEY_openstackcli="${OCF_RESKEY_openstackcli} --os-cloud $OCF_RESKEY_cloud"
    elif [ -n "$OCF_RESKEY_openrc" ]; then
        if [ ! -f "$OCF_RESKEY_openrc" ]; then
            ocf_exit_reason "$OCF_RESKEY_openrc does not exist"
            exit $OCF_ERR_CONFIGURED
        fi
        # Quoted so that a path containing whitespace is sourced as one file.
        . "$OCF_RESKEY_openrc"
    else
        if [ -z "$OCF_RESKEY_auth_url" ]; then
            ocf_exit_reason "auth_url not set"
            exit $OCF_ERR_CONFIGURED
        fi
        if [ -z "$OCF_RESKEY_username" ]; then
            ocf_exit_reason "username not set"
            exit $OCF_ERR_CONFIGURED
        fi
        if [ -z "$OCF_RESKEY_password" ]; then
            ocf_exit_reason "password not set"
            exit $OCF_ERR_CONFIGURED
        fi
        if [ -z "$OCF_RESKEY_project_name" ]; then
            ocf_exit_reason "project_name not set"
            exit $OCF_ERR_CONFIGURED
        fi
        if [ -z "$OCF_RESKEY_user_domain_name" ]; then
            ocf_exit_reason "user_domain_name not set"
            exit $OCF_ERR_CONFIGURED
        fi
        if [ -z "$OCF_RESKEY_project_domain_name" ]; then
            ocf_exit_reason "project_domain_name not set"
            exit $OCF_ERR_CONFIGURED
        fi

        # NOTE: run_openstackcli word-splits this string, so values that
        # contain whitespace are not supported.
        OCF_RESKEY_openstackcli="${OCF_RESKEY_openstackcli} --os-auth-url $OCF_RESKEY_auth_url"
        OCF_RESKEY_openstackcli="${OCF_RESKEY_openstackcli} --os-username $OCF_RESKEY_username"
        OCF_RESKEY_openstackcli="${OCF_RESKEY_openstackcli} --os-password $OCF_RESKEY_password"
        OCF_RESKEY_openstackcli="${OCF_RESKEY_openstackcli} --os-project-name $OCF_RESKEY_project_name"
        OCF_RESKEY_openstackcli="${OCF_RESKEY_openstackcli} --os-user-domain-name $OCF_RESKEY_user_domain_name"
        OCF_RESKEY_openstackcli="${OCF_RESKEY_openstackcli} --os-project-domain-name $OCF_RESKEY_project_domain_name"
    fi
}

# Run the OpenStack CLI with the sub-command and arguments given in $1.
# Prints the command's stdout, returns its exit status and logs a warning
# when the call took longer than 20 seconds.
run_openstackcli() {
    local cmd="${OCF_RESKEY_openstackcli} $1"
    local result
    local rc
    local start_time=$(date +%s)
    local end_time
    local elapsed_time

    result=$($cmd)
    rc=$?
    end_time=$(date +%s)
    elapsed_time=$((end_time - start_time))

    if [ $elapsed_time -gt 20 ]; then
        # Log the sub-command only: the full command line may carry
        # --os-password and must not end up in the cluster logs.
        ocf_log warn "openstack command '$1' took ${elapsed_time}s to complete"
    fi

    echo "$result"

    return $rc
}
diff --git a/heartbeat/openstack-floating-ip b/heartbeat/openstack-floating-ip
new file mode 100755
index 0000000..7317f19
--- /dev/null
+++ b/heartbeat/openstack-floating-ip
@@ -0,0 +1,257 @@
#!/bin/sh
#
#
# OCF resource agent to move a floating address in an Openstack tenant.
#
# Copyright (c) 2018 Mathieu GRZYBEK
# Based on code of Markus Guertler
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#


#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

. ${OCF_FUNCTIONS_DIR}/openstack-common.sh

# Defaults

#######################################################################


USAGE="usage: $0 {start|stop|status|meta-data}";
###############################################################################


###############################################################################
#
# Functions
#
###############################################################################


# Print the OCF meta-data XML on stdout; the shared OpenStack parameters are
# emitted by common_meta_data between the two here-documents.
metadata() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="openstack-floating-ip" version="2.0">
<version>1.0</version>
<longdesc lang="en">
Resource Agent to move a floating IP address from an instance to another one.
It relies on attributes given by openstack-info resource agent (openstack_ports, openstack_id attributes).
The attribute called "openstack_floating_ip" is updated.
</longdesc>
<shortdesc lang="en">Move a floating IP</shortdesc>

<parameters>
END

common_meta_data

cat <<END
<parameter name="ip_id" required="1">
<longdesc lang="en">
Floating IP Identifier.
</longdesc>
<shortdesc lang="en">IP ID</shortdesc>
<content type="string" />
</parameter>

<parameter name="subnet_id" required="1">
<longdesc lang="en">
Subnet Identifier to use to attach the address.
</longdesc>
<shortdesc lang="en">Subnet ID</shortdesc>
<content type="string" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="180s" />
<action name="stop" timeout="180s" />
<action name="monitor" depth="0" timeout="180s" interval="60s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}

# Check the CLI binary, the credentials, that the floating IP exists and
# (outside of probes) that the openstack_ports attribute can be queried.
# Returns OCF_SUCCESS, OCF_ERR_CONFIGURED or OCF_ERR_GENERIC.
osflip_validate() {
    local result
    local node_name

    if [ -z "$OCF_RESKEY_ip_id" ]; then
        ocf_exit_reason "ip_id is required"
        return $OCF_ERR_CONFIGURED
    fi

    check_binary "$OCF_RESKEY_openstackcli"

    get_config

    result=$(run_openstackcli "floating ip list")
    if ! echo "$result" | grep -q -- "$OCF_RESKEY_ip_id"; then
        ocf_exit_reason "ip-id $OCF_RESKEY_ip_id not found"
        return $OCF_ERR_CONFIGURED
    fi

    node_name=$(crm_node -n)
    ${HA_SBIN_DIR}/attrd_updater --query -n openstack_ports -N "$node_name" > /dev/null 2>&1
    if [ $? -ne 0 ] && ! ocf_is_probe; then
        ocf_log warn "attr_updater failed to get openstack_ports attribute of node $node_name"
        return $OCF_ERR_GENERIC
    fi

    return $OCF_SUCCESS
}

# Report whether the floating IP is bound to one of this node's ports and
# keep the openstack_floating_ip node attribute in sync.
# Returns OCF_SUCCESS or OCF_NOT_RUNNING.
osflip_monitor() {
    local result
    local floating_ip
    local node_port_ids
    local port

    # openstack_ports is "subnet_id:port_id,..."; keep only the port ids.
    node_port_ids=$(${HA_SBIN_DIR}/attrd_updater --query -n openstack_ports -N $(crm_node -n) \
        | awk -F= '{gsub("\"","");print $NF}' \
        | tr ',' ' ' \
        | awk '{gsub("[^ ]*:", "");print}')

    # Is the IP active and attached?
    result=$(run_openstackcli "floating ip show \
        --column port_id --column floating_ip_address \
        --format yaml \
        $OCF_RESKEY_ip_id")

    for port in $node_port_ids ; do
        if echo "$result" | grep -qF -- "$port" ; then
            floating_ip=$(echo "$result" | awk '/floating_ip_address/ {print $2}')
            ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -S status -n openstack_floating_ip -v "$floating_ip"

            return $OCF_SUCCESS
        fi
    done

    # NOTE(review): the update above passes "-S status" while this delete
    # passes "-S state" - confirm which value is intended.
    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -S state -n openstack_floating_ip
    ocf_log warn "$OCF_RESKEY_ip_id is not attached to any fixed address"
    return $OCF_NOT_RUNNING
}

# Unbind the floating IP from its port and verify that it is gone.
# Returns OCF_SUCCESS when it is (already) down, OCF_ERR_GENERIC otherwise.
osflip_stop() {
    ocf_log info "Bringing down IP address $OCF_RESKEY_ip_id"

    osflip_monitor
    if [ $? = $OCF_NOT_RUNNING ]; then
        ocf_log info "Address $OCF_RESKEY_ip_id already down"
        return $OCF_SUCCESS
    fi

    if ! run_openstackcli "floating ip unset --port $OCF_RESKEY_ip_id"; then
        return $OCF_ERR_GENERIC
    fi

    osflip_monitor
    if [ $? != $OCF_NOT_RUNNING ]; then
        ocf_log error "Couldn't unset IP address $OCF_RESKEY_ip_id."
        return $OCF_ERR_GENERIC
    fi

    ocf_log info "Successfully brought down $OCF_RESKEY_ip_id"
    return $OCF_SUCCESS
}

# Bind the floating IP to this node's port on the configured subnet and
# verify the result.
# Returns OCF_SUCCESS or OCF_ERR_GENERIC.
osflip_start() {
    local node_port_id
    local node_port_ids

    osflip_monitor
    if [ $? = $OCF_SUCCESS ]; then
        ocf_log info "$OCF_RESKEY_ip_id already started"
        return $OCF_SUCCESS
    fi

    # Get port_id from subnet_id
    node_port_ids=$(${HA_SBIN_DIR}/attrd_updater --query -n openstack_ports -N $(crm_node -n) \
        | awk '{gsub("value=","") ; gsub("\"","") ; print $NF}')

    # Tuples are "subnet_id:port_id": compare the subnet field exactly
    # instead of interpolating the parameter into an awk regex.
    node_port_id=$(echo $node_port_ids \
        | tr ',' '\n' \
        | awk -F: -v subnet="$OCF_RESKEY_subnet_id" '$1 == subnet {print $2}')

    if [ -z "$node_port_id" ]; then
        # Calling the API with an empty port id can only fail obscurely.
        ocf_exit_reason "no port found on subnet $OCF_RESKEY_subnet_id for this node"
        return $OCF_ERR_GENERIC
    fi

    ocf_log info "Moving IP address $OCF_RESKEY_ip_id to port ID $node_port_id"

    run_openstackcli "floating ip set --port $node_port_id $OCF_RESKEY_ip_id"
    if [ $? != $OCF_SUCCESS ]; then
        ocf_log error "$OCF_RESKEY_ip_id Cannot be set to port $node_port_id"
        return $OCF_ERR_GENERIC
    fi

    osflip_monitor
    if [ $? != $OCF_SUCCESS ]; then
        ocf_log error "$OCF_RESKEY_ip_id Cannot be set to port $node_port_id"
        return $OCF_ERR_GENERIC
    fi

    ocf_log info "Successfully brought up $OCF_RESKEY_ip_id"
    return $OCF_SUCCESS
}

###############################################################################
#
# MAIN
#
###############################################################################

# These actions need neither root nor a working configuration.
case $__OCF_ACTION in
    meta-data)
        metadata
        exit $OCF_SUCCESS
        ;;
    usage|help)
        echo $USAGE
        exit $OCF_SUCCESS
        ;;
esac

if ! ocf_is_root; then
    ocf_log err "You must be root for $__OCF_ACTION operation."
    exit $OCF_ERR_PERM
fi

case $__OCF_ACTION in
    start)
        osflip_validate || exit $?
        osflip_start;;
    stop)
        osflip_validate || exit $?
        osflip_stop;;
    # "status" is advertised in USAGE, so accept it as an alias of monitor.
    monitor|status)
        osflip_validate || exit $?
        osflip_monitor;;
    validate-all)
        osflip_validate
        ;;
    *)
        echo $USAGE
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac

exit $?
diff --git a/heartbeat/openstack-info.in b/heartbeat/openstack-info.in
new file mode 100755
index 0000000..6502f1d
--- /dev/null
+++ b/heartbeat/openstack-info.in
@@ -0,0 +1,270 @@
#!/bin/sh
#
#
# OCF resource agent to set attributes from Openstack instance details.
# It records (in the CIB) various attributes of a node
#
# Copyright (c) 2018 Mathieu Grzybek
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

. ${OCF_FUNCTIONS_DIR}/openstack-common.sh

# Defaults
OCF_RESKEY_pidfile_default="$HA_RSCTMP/OSInfo-${OCF_RESOURCE_HOSTNAME}"
OCF_RESKEY_delay_default="0"
OCF_RESKEY_clone_default="0"
OCF_RESKEY_curlcli_default="/usr/bin/curl"
OCF_RESKEY_pythoncli_default="@PYTHON@"

: ${OCF_RESKEY_curlcli=${OCF_RESKEY_curlcli_default}}
: ${OCF_RESKEY_pythoncli=${OCF_RESKEY_pythoncli_default}}
: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}}
: ${OCF_RESKEY_delay=${OCF_RESKEY_delay_default}}
: ${OCF_RESKEY_clone=${OCF_RESKEY_clone_default}}

#######################################################################

# Print the OCF meta-data XML on stdout; the shared OpenStack parameters are
# emitted by common_meta_data between the two here-documents.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="openstack-info" version="1.0">
<version>1.0</version>

<longdesc lang="en">
OCF resource agent to set attributes from Openstack instance details.
It records (in the CIB) various attributes of a node.
Sample output:
    openstack_az : nova
    openstack_flavor : c1.small
    openstack_id : 60ac4343-5828-49b1-8aac-7c69b1417f31
    openstack_ports : 7960d889-9750-4160-bf41-c69a41ad72d9:96530d18-57a3-4718-af32-30f2a74c22a2,b0e55a06-bd75-468d-8baa-22cfeb65799f:a55ae917-8016-4b1e-8ffa-04311b9dc7d6

The layout of openstack_ports is a comma-separated list of tuples "subnet_id:port_id".
</longdesc>
<shortdesc lang="en">Records various node attributes in the CIB</shortdesc>

<parameters>
END

common_meta_data

    cat <<END
<parameter name="pidfile" unique="0">
<longdesc lang="en">PID file</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="${OCF_RESKEY_pidfile_default}" />
</parameter>

<parameter name="delay" unique="0">
<longdesc lang="en">Interval to allow values to stabilize</longdesc>
<shortdesc lang="en">Dampening Delay</shortdesc>
<content type="string" default="${OCF_RESKEY_delay_default}" />
</parameter>

<parameter name="curlcli">
<longdesc lang="en">
Path to command line cURL binary.
</longdesc>
<shortdesc lang="en">Path to cURL binary</shortdesc>
<content type="string" default="${OCF_RESKEY_curlcli_default}" />
</parameter>

<parameter name="pythoncli">
<longdesc lang="en">
Path to command line Python interpreter.
</longdesc>
<shortdesc lang="en">Path to Python interpreter</shortdesc>
<content type="string" default="${OCF_RESKEY_pythoncli_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="180s" />
<action name="stop" timeout="180s" />
<action name="monitor" timeout="180s" interval="60s"/>
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="20s" />
</actions>
</resource-agent>
END
}

#######################################################################

# Collect the instance details and publish them as node attributes:
# server id (from the metadata service), flavor, availability zone, the
# "subnet_id:port_id" list and, when present in the environment, the
# region and tenant/project identifiers.
# Exits with OCF_ERR_GENERIC when the server id cannot be determined.
OSInfoStats() {
    local value
    local node_id

    get_config

    # Nova data: server ID
    node_id=$($OCF_RESKEY_curlcli \
        -s http://169.254.169.254/openstack/latest/meta_data.json |
        $OCF_RESKEY_pythoncli -m json.tool |
        grep -P '\"uuid\": \".*\",$' |
        grep -P -o '[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}')

    # $? is the status of the last grep: non-zero when no UUID was found.
    if [ $? -ne 0 ] ; then
        ocf_exit_reason "Cannot find server ID"
        exit $OCF_ERR_GENERIC
    fi

    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -n openstack_id -v "$node_id"

    # Nova data: flavor
    value=$(run_openstackcli "server show \
        --format value \
        --column flavor \
        $node_id")

    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -n openstack_flavor -v "$value"

    # Nova data: availability zone
    value=$(run_openstackcli "server show \
        --format value \
        --column OS-EXT-AZ:availability_zone \
        $node_id")

    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -n openstack_az -v "$value"

    # Network data: ports
    value=""
    for port_id in $(run_openstackcli "port list \
        --format value \
        --column id \
        --server $node_id"); do
        subnet_result=$(run_openstackcli "port show \
            --format json \
            --column fixed_ips \
            ${port_id}")
        subnet_id=$(echo "$subnet_result" |
            grep -P '\"subnet_id\": \".*\",$' |
            grep -P -o '[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}')
        value="${value}${subnet_id}:${port_id},"
    done
    # Drop the trailing comma left by the loop.
    value=${value%,}

    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -n openstack_ports -v "$value"

    if [ ! -z "$OS_REGION_NAME" ] ; then
        ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -n openstack_region -v "$OS_REGION_NAME"
    fi

    if [ ! -z "$OS_TENANT_ID" ] ; then
        ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -n openstack_tenant_id -v "$OS_TENANT_ID"

        if [ ! -z "$OS_TENANT_NAME" ] ; then
            ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -n openstack_tenant_name -v "$OS_TENANT_NAME"
        fi
    else
        ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -n openstack_project_id -v "$OS_PROJECT_ID"

        if [ ! -z "$OS_PROJECT_NAME" ] ; then
            ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -n openstack_project_name -v "$OS_PROJECT_NAME"
        fi
    fi

}

# Print a short usage message on stdout.
OSInfo_usage() {
    cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Create the state file, publish the attributes and exit with OCF_SUCCESS.
OSInfo_start() {
    echo $OCF_RESKEY_clone > "$OCF_RESKEY_pidfile"
    OSInfoStats
    exit $OCF_SUCCESS
}

# Remove the state file and every attribute this agent may have set, then
# exit with OCF_SUCCESS.
OSInfo_stop() {
    rm -f "$OCF_RESKEY_pidfile"
    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -n openstack_id
    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -n openstack_flavor
    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -n openstack_az
    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -n openstack_ports
    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -n openstack_region
    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -n openstack_tenant_id
    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -n openstack_tenant_name
    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -n openstack_project_id
    ${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -n openstack_project_name
    exit $OCF_SUCCESS
}

# When the state file exists, refresh the attributes and exit with
# OCF_SUCCESS; otherwise exit with OCF_NOT_RUNNING.
OSInfo_monitor() {
    if [ -f "$OCF_RESKEY_pidfile" ] ; then
        OSInfoStats
        # OCF_RUNNING is not an OCF return code; it expanded to nothing and
        # made the exit status depend on the last attrd_updater call.
        exit $OCF_SUCCESS
    fi
    exit $OCF_NOT_RUNNING
}

# Check that the external tools this agent needs are installed.
OSInfo_validate() {
    check_binary "$OCF_RESKEY_curlcli"
    check_binary "$OCF_RESKEY_openstackcli"
    check_binary "$OCF_RESKEY_pythoncli"

    return $OCF_SUCCESS
}

if [ $# -ne 1 ]; then
    OSInfo_usage
    exit $OCF_ERR_ARGS
fi

# Turn the delay value into the attrd_updater option used by every call.
if [ x != x${OCF_RESKEY_delay} ]; then
    OCF_RESKEY_delay="-d ${OCF_RESKEY_delay}"
fi

case $__OCF_ACTION in
meta-data)      meta_data
                exit $OCF_SUCCESS
                ;;
start)          OSInfo_validate || exit $?
                OSInfo_start
                ;;
stop)           OSInfo_stop
                ;;
monitor)        OSInfo_monitor
                ;;
validate-all)   OSInfo_validate
                ;;
usage|help)     OSInfo_usage
                exit $OCF_SUCCESS
                ;;
*)              OSInfo_usage
                exit $OCF_ERR_UNIMPLEMENTED
                ;;
esac

exit $?
diff --git a/heartbeat/openstack-virtual-ip b/heartbeat/openstack-virtual-ip
new file mode 100755
index 0000000..361357d
--- /dev/null
+++ b/heartbeat/openstack-virtual-ip
@@ -0,0 +1,258 @@
#!/bin/sh
#
#
# OCF resource agent to move a virtual address in an Openstack tenant.
#
# Copyright (c) 2018 Mathieu GRZYBEK
# Based on code of Markus Guertler
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#


#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

. ${OCF_FUNCTIONS_DIR}/openstack-common.sh

# Defaults

#######################################################################


USAGE="usage: $0 {start|stop|status|meta-data}";
###############################################################################


###############################################################################
#
# Functions
#
###############################################################################


# metadata: print the OCF meta-data XML of this agent on stdout.
# The parameter list is emitted in two parts with a call to
# common_meta_data in between (not defined in this file; presumably
# provided by openstack-common.sh -- confirm).
metadata() {
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="openstack-virtual-ip" version="2.0">
<version>1.0</version>
<longdesc lang="en">
Resource Agent to move a virtual IP address from an instance to another one
by adding an allowed-address pair associated with an instance port.
It relies on attributes given by openstack-info resource agent (openstack_ports, openstack_id attributes).
The attribute called "openstack_virtual_ip" is updated.
</longdesc>
<shortdesc lang="en">Move a virtual IP</shortdesc>

<parameters>
END

common_meta_data

cat <<END
<parameter name="ip" required="1">
<longdesc lang="en">
Virtual IP Address.
</longdesc>
<shortdesc lang="en">IP Address</shortdesc>
<content type="string" />
</parameter>

<parameter name="subnet_id" required="1">
<longdesc lang="en">
Subnet Identifier to use to attach the address.
</longdesc>
<shortdesc lang="en">Subnet ID</shortdesc>
<content type="string" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="180s" />
<action name="stop" timeout="180s" />
<action name="monitor" depth="0" timeout="180s" interval="60s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}

# osvip_port_id: print the port ID recorded for $OCF_RESKEY_subnet_id.
# Reads this node's "openstack_ports" attribute (a comma separated list
# of "<subnet_id>:<port_id>" entries), splits it into one entry per line
# and prints the second ':'-field of every entry matching the subnet ID.
# Side effect: sets the globals node_port_ids and node_port_id.
osvip_port_id() {
	# Get port_id from subnet_id
	node_port_ids=$(${HA_SBIN_DIR}/attrd_updater --query -n openstack_ports -N $(crm_node -n) \
		| awk '{gsub("value=","") ; gsub("\"","") ; print $NF}')

	node_port_id=$(echo $node_port_ids \
		| tr ',' '\n' \
		| awk -F: "/$OCF_RESKEY_subnet_id/ {print \$2}")

	echo ${node_port_id}
}

# osvip_validate: check the prerequisites of every action.
# Returns $OCF_ERR_GENERIC when the openstack_ports attribute cannot be
# queried and the current operation is not a probe, else $OCF_SUCCESS.
osvip_validate() {
	check_binary "$OCF_RESKEY_openstackcli"

	get_config

	${HA_SBIN_DIR}/attrd_updater --query -n openstack_ports -N $(crm_node -n) > /dev/null 2>&1
	if [ $? -ne 0 ] && ! ocf_is_probe; then
		# The tool queried above is attrd_updater; name it correctly in the log.
		ocf_log warn "attrd_updater failed to get openstack_ports attribute of node $OCF_RESOURCE_INSTANCE"
		return $OCF_ERR_GENERIC
	fi

	return $OCF_SUCCESS
}

# osvip_monitor: is $OCF_RESKEY_ip listed in the allowed address pairs
# of this node's port?
# Returns $OCF_SUCCESS and publishes the openstack_virtual_ip attribute
# when it is; otherwise deletes the attribute and returns
# $OCF_NOT_RUNNING.
osvip_monitor() {
	local result

	node_port_id=$(osvip_port_id)

	result=$(run_openstackcli "port show \
		--format value \
		--column allowed_address_pairs \
		${node_port_id}")
	if echo "$result" | grep -q "$OCF_RESKEY_ip"; then
		${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -S status -n openstack_virtual_ip -v "$OCF_RESKEY_ip"

		return $OCF_SUCCESS
	fi

	# Delete with the same -S argument that was used to set the attribute
	# ("status"); the original passed "state" here.
	# NOTE(review): -S is believed to select the CIB section -- confirm
	# against the attrd_updater version in use.
	${HA_SBIN_DIR}/attrd_updater ${OCF_RESKEY_delay} -D -S status -n openstack_virtual_ip
	ocf_log warn "$OCF_RESKEY_ip is not attached to any fixed address"
	return $OCF_NOT_RUNNING
}

# osvip_stop: remove $OCF_RESKEY_ip from the allowed address pairs of
# this node's port.
# Returns $OCF_SUCCESS when the address is (already) gone,
# $OCF_ERR_GENERIC when the port's MAC address looks invalid, the
# "port unset" call fails or the address is still present afterwards.
osvip_stop() {
	node_port_id=$(osvip_port_id)

	ocf_log info "Bringing down IP address $OCF_RESKEY_ip"

	osvip_monitor
	if [ $? = $OCF_NOT_RUNNING ]; then
		ocf_log info "Address $OCF_RESKEY_ip already down"
		return $OCF_SUCCESS
	fi

	# The pair is removed by "ip-address=...,mac-address=...", so the
	# port's MAC address has to be looked up (and sanity checked) first.
	mac_address=$(run_openstackcli "port show \
		--format value \
		--column mac_address \
		$node_port_id")
	echo "${mac_address}" | grep -q -P "^([0-9a-f]{2}:){5}[0-9a-f]{2}$"
	if [ $? -ne 0 ]; then
		ocf_log error "MAC address '${mac_address}' is not valid."
		return $OCF_ERR_GENERIC
	fi

	if ! run_openstackcli "port unset \
		--allowed-address \
		ip-address=$OCF_RESKEY_ip,mac-address=${mac_address} \
		$node_port_id"; then
		return $OCF_ERR_GENERIC
	fi

	osvip_monitor
	if [ $? != $OCF_NOT_RUNNING ]; then
		ocf_log error "Couldn't unset IP address $OCF_RESKEY_ip."
		return $OCF_ERR_GENERIC
	fi

	ocf_log info "Successfully brought down $OCF_RESKEY_ip"
	return $OCF_SUCCESS
}

# osvip_start: add $OCF_RESKEY_ip to the allowed address pairs of this
# node's port and verify the result with osvip_monitor.
# Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
osvip_start() {
	node_port_id=$(osvip_port_id)

	osvip_monitor
	if [ $? = $OCF_SUCCESS ]; then
		ocf_log info "$OCF_RESKEY_ip already started"
		return $OCF_SUCCESS
	fi

	ocf_log info "Moving IP address $OCF_RESKEY_ip to port ID $node_port_id"

	run_openstackcli "port set \
		--allowed-address ip-address=$OCF_RESKEY_ip \
		$node_port_id"
	if [ $? != $OCF_SUCCESS ]; then
		ocf_log error "$OCF_RESKEY_ip Cannot be set to port $node_port_id"
		return $OCF_ERR_GENERIC
	fi

	osvip_monitor
	if [ $? != $OCF_SUCCESS ]; then
		ocf_log error "$OCF_RESKEY_ip Cannot be set to port $node_port_id"
		return $OCF_ERR_GENERIC
	fi

	ocf_log info "Successfully brought up $OCF_RESKEY_ip"
	return $OCF_SUCCESS
}

###############################################################################
#
# MAIN
#
###############################################################################

# meta-data and usage are answered before the root check below.
case $__OCF_ACTION in
	meta-data)
		metadata
		exit $OCF_SUCCESS
		;;
	usage|help)
		echo $USAGE
		exit $OCF_SUCCESS
		;;
esac

if ! ocf_is_root; then
	ocf_log err "You must be root for $__OCF_ACTION operation."
	exit $OCF_ERR_PERM
fi

case $__OCF_ACTION in
	start)
		osvip_validate || exit $?
		osvip_start;;
	stop)
		osvip_validate || exit $?
		osvip_stop;;
	monitor)
		osvip_validate || exit $?
		osvip_monitor;;
	validate-all)
		osvip_validate
		;;
	*)
		echo $USAGE
		exit $OCF_ERR_UNIMPLEMENTED
		;;
esac

exit $?
diff --git a/heartbeat/ora-common.sh b/heartbeat/ora-common.sh new file mode 100644 index 0000000..6100a66 --- /dev/null +++ b/heartbeat/ora-common.sh @@ -0,0 +1,90 @@
# ora-common.sh
#
# Description: Common code for oracle and oralsnr resource agents
#
#
# Author: Dejan Muhamedagic
# Support: users@clusterlabs.org
# License: GNU General Public License (GPL)
# Copyright: (C) 2012 Dejan Muhamedagic, SUSE/Attachmate
#

# Gather up information about our oracle instance

# rmtmpfiles: remove every file listed in $TMPFILES (EXIT trap handler).
rmtmpfiles() {
	rm -f $TMPFILES
}

# ora_common_getconfig SID [HOME [OWNER [TNS_ADMIN]]]
# Derive and export the Oracle environment and dump it into a temporary
# file ($ORA_ENVF) which is removed again when the shell exits.
ora_common_getconfig() {
	ORACLE_SID=$1
	# optional, defaults to whatever is in oratab
	ORACLE_HOME=$2
	# optional, defaults to the owner of ORACLE_HOME
	ORACLE_OWNER=$3
	# optional, defaults to $ORACLE_HOME/network/admin
	# (only the oralsnr may provide and use this one)
	TNS_ADMIN=$4

	# get ORACLE_HOME from /etc/oratab if not set
	# Only consult the file when it exists, so that awk is not run
	# against a missing file. A missing /etc/oratab is still not treated
	# as an error here; ORACLE_HOME simply stays empty.
	if [ x = "x$ORACLE_HOME" ] && [ -f /etc/oratab ]; then
		ORACLE_HOME=`awk -F: "/^$ORACLE_SID:/"'{print $2}' /etc/oratab`
		if [ x = "x$ORACLE_HOME" ]; then
			handle_invalid_env $OCF_ERR_CONFIGURED "ORACLE_HOME could not be obtained from /etc/oratab. Please check the sid parameter."
		fi
	fi

	# there a better way to find out ORACLE_OWNER?
	[ x = "x$ORACLE_OWNER" ] &&
		ORACLE_OWNER=`ls -ld $ORACLE_HOME/. 2>/dev/null | awk 'NR==1{print $3}'`

	# There are use-cases where users want to be able to set a custom TNS_ADMIN path.
	# When TNS_ADMIN is not provided, use the default path.
	[ x = "x$TNS_ADMIN" ] &&
		TNS_ADMIN=$ORACLE_HOME/network/admin

	LD_LIBRARY_PATH=$ORACLE_HOME/lib
	LIBPATH=$ORACLE_HOME/lib
	PATH=$ORACLE_HOME/bin:$ORACLE_HOME/dbs:$PATH
	export ORACLE_SID ORACLE_HOME ORACLE_OWNER TNS_ADMIN
	export LD_LIBRARY_PATH LIBPATH

	# The environment file is made world readable (644) below.
	ORA_ENVF=`mktemp`
	dumporaenv > $ORA_ENVF
	chmod 644 $ORA_ENVF
	TMPFILES="$ORA_ENVF"
	trap "rmtmpfiles" EXIT
}

# ora_common_validate_all: checks shared by the Oracle agents.
# Returns $OCF_ERR_INSTALLED when ORACLE_OWNER is unknown,
# $OCF_ERR_PERM when the caller is neither root nor ORACLE_OWNER,
# 0 otherwise. Sets the global US to the current user name.
ora_common_validate_all() {
	# Let's make sure a few important things are set...
	if [ x = "x$ORACLE_OWNER" ]; then
		ocf_log info "ORACLE_OWNER not set"
		return $OCF_ERR_INSTALLED
	fi

	US=`id -u -n`
	# Operands are quoted so that an empty $US makes the check fail
	# instead of producing a "[" syntax error that lets it pass.
	if [ "$US" != root -a "$US" != "$ORACLE_OWNER" ]
	then
		ocf_exit_reason "$0 must be run as root or $ORACLE_OWNER"
		return $OCF_ERR_PERM
	fi
	return 0
}

# dumporaenv: print the Oracle environment as sourceable shell code.
dumporaenv() {
cat<<EOF
PATH=$ORACLE_HOME/bin:$ORACLE_HOME/dbs:$PATH
ORACLE_SID=$ORACLE_SID
ORACLE_HOME=$ORACLE_HOME
ORACLE_OWNER=$ORACLE_OWNER
LD_LIBRARY_PATH=$ORACLE_HOME/lib
LIBPATH=$ORACLE_HOME/lib
TNS_ADMIN=$TNS_ADMIN
export ORACLE_SID ORACLE_HOME ORACLE_OWNER TNS_ADMIN
export LD_LIBRARY_PATH LIBPATH
EOF
}

# vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0
diff --git a/heartbeat/oraasm b/heartbeat/oraasm new file mode 100755 index 0000000..34c8df0 --- /dev/null +++ b/heartbeat/oraasm @@ -0,0 +1,183 @@
#!/bin/sh
#
# License: GNU General Public License (GPL)
# (c) 2017 O. Albrigtsen
# and Linux-HA contributors
#
# -----------------------------------------------------------------------------
# O C F R E S O U R C E S C R I P T S P E C I F I C A T I O N
# -----------------------------------------------------------------------------
#
# NAME
# oraasm : OCF resource agent script for Oracle ASM
#

# Initialization:
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
# The file to source is the first word of the following line.
. \
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
# (the path above is the argument of the "." source command that ends
# the preceding line)

# Defaults
OCF_RESKEY_user_default="grid"
OCF_RESKEY_diskgroup_default=""
OCF_RESKEY_home_default=""

: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
: ${OCF_RESKEY_diskgroup=${OCF_RESKEY_diskgroup_default}}
: ${OCF_RESKEY_home=${OCF_RESKEY_home_default}}


# oraasm_usage: print a short usage text on stdout.
oraasm_usage() {
	cat <<END
	usage: $0 (start|stop|validate-all|meta-data|help|usage|monitor)
	$0 manages a Oracle ASM Disk Group as an OCF HA resource.
	The 'start' operation starts the instance.
	The 'stop' operation stops the instance.
	The 'status' operation reports whether the instance is running
	The 'monitor' operation reports whether the instance seems to be working
	The 'validate-all' operation reports whether the parameters are valid
END
}

# oraasm_meta_data: print the OCF meta-data XML of this agent on stdout.
oraasm_meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="oraasm" version="0.75">
<version>1.0</version>

<longdesc lang="en">OCF Resource script for Oracle ASM. It uses the ohasd init-script to manage a Oracle ASM Disk Group as a HA resource.</longdesc>
<shortdesc lang="en">Oracle ASM resource agent</shortdesc>

<parameters>

<parameter name="user">
    <longdesc lang="en">Oracle Grid user</longdesc>
    <shortdesc lang="en">Oracle Grid user</shortdesc>
    <content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>

<parameter name="diskgroup" required="1">
    <longdesc lang="en">
The name of the Oracle Disk Group.
If not specified, then the Disk Group along with its home should be listed in /etc/oratab.
    </longdesc>
    <shortdesc lang="en">Oracle Disk Group</shortdesc>
    <content type="string" default="${OCF_RESKEY_diskgroup_default}" />
</parameter>

<parameter name="home" unique="0">
<longdesc lang="en">The Oracle Grid home directory</longdesc>
<shortdesc lang="en">home</shortdesc>
<content type="string" default="${OCF_RESKEY_home_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="60s" />
<action name="stop" timeout="60s" />
<action name="status" timeout="30s" />
<action name="monitor" depth="0" timeout="30s" interval="10s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}

# oraasm_methods: print the supported actions, one per line.
# printf is used instead of a tab-indented "<<-" heredoc so the output
# does not depend on the whitespace used to indent this function.
oraasm_methods() {
	printf '%s\n' \
		start \
		stop \
		status \
		monitor \
		validate-all \
		methods \
		meta-data \
		usage
}

# oraasm_getconfig: derive the Grid home (from /etc/oratab when the
# "home" parameter is empty), prepend its bin directory to PATH and
# write that PATH into a temporary environment file ($ORA_ENVF), which
# is removed when the shell exits.
oraasm_getconfig() {
	# oratab entries for ASM disk groups are looked up as "+<diskgroup>:".
	# The "+" is written as a bracket expression because a bare "+"
	# directly after "^" has undefined meaning in an ERE.
	[ x = "x$OCF_RESKEY_home" ] &&
		OCF_RESKEY_home=`awk -F: "/^[+]$OCF_RESKEY_diskgroup:/"'{print $2}' /etc/oratab`
	PATH="$OCF_RESKEY_home/bin:$PATH"

	ORA_ENVF=`mktemp`
	cat << EOF > $ORA_ENVF
PATH="$OCF_RESKEY_home/bin:$PATH"
EOF
	chmod 644 $ORA_ENVF
	trap "rm -f $ORA_ENVF" EXIT
}

# oraasm_start: start ohasd and wait until oraasm_monitor succeeds.
# The wait loop has no bound of its own; ending a start that never
# succeeds is left to whatever invokes this function.
oraasm_start() {
	# if resource is already running, no need to continue code after this.
	if oraasm_monitor; then
		ocf_log info "Oracle ASM is already running"
		return $OCF_SUCCESS
	fi

	ocf_run -q /etc/init.d/ohasd start

	while ! oraasm_monitor; do
		sleep 1
	done

	return $OCF_SUCCESS
}

# oraasm_stop: stop ohasd and wait until oraasm_monitor stops succeeding.
# Any monitor result other than success counts as "already stopped".
oraasm_stop() {
	oraasm_monitor
	if [ $? -ne $OCF_SUCCESS ]; then
		# Currently not running. Nothing to do.
		ocf_log info "Oracle ASM is already stopped"

		return $OCF_SUCCESS
	fi

	ocf_run -q /etc/init.d/ohasd stop

	# Wait for process to stop
	while oraasm_monitor; do
		sleep 1
	done

	return $OCF_SUCCESS
}

# oraasm_monitor: run "crsctl check has" as $OCF_RESKEY_user and look
# for message CRS-4638 in its output.
# Returns $OCF_SUCCESS when found, $OCF_NOT_RUNNING when grep reports no
# match (exit 1), $OCF_ERR_GENERIC for any other exit status.
oraasm_monitor() {
	su - $OCF_RESKEY_user -c ". $ORA_ENVF; crsctl check has | grep -q \"CRS-4638\""
	case "$?" in
		0)
			rc=$OCF_SUCCESS
			;;
		1)
			rc=$OCF_NOT_RUNNING
			ocf_log info "Oracle ASM is not running"
			;;
		*)
			rc=$OCF_ERR_GENERIC
			;;
	esac
	return $rc
}

# oraasm_status: same result as oraasm_monitor.
# The exit status is taken from $? directly. The original captured the
# function's *stdout* with $(...) and returned that, which only worked
# as long as nothing (e.g. a login profile banner) was printed.
oraasm_status() {
	oraasm_monitor
	return $?
}

# oraasm_validate_all: the Grid home must be known by now.
# Returns $OCF_ERR_CONFIGURED when it is not, $OCF_SUCCESS otherwise.
oraasm_validate_all() {
	if [ x = "x$OCF_RESKEY_home" ]; then
		ocf_exit_reason "home not set"
		return $OCF_ERR_CONFIGURED
	fi
	return $OCF_SUCCESS
}


OCF_REQUIRED_PARAMS="user diskgroup"
OCF_REQUIRED_BINARIES="/etc/init.d/ohasd crsctl"
ocf_rarun $*

# vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0
diff --git a/heartbeat/oracle b/heartbeat/oracle new file mode 100755 index 0000000..8cf4e36 --- /dev/null +++ b/heartbeat/oracle @@ -0,0 +1,789 @@
#!/bin/sh
#
#
# oracle
#
# Description: Manages an Oracle Database as a High-Availability
# resource
#
#
# Author: Dejan Muhamedagic
# Support: users@clusterlabs.org
# License: GNU General Public License (GPL)
# Copyright: (C) 2006 International Business Machines, Inc.
#
# This code inspired by the DB2 resource script
# written by Alan Robertson
#
# An example usage in /etc/ha.d/haresources:
# node1 10.0.0.170 oracle::RK1::/oracle/10.2::orark1
#
# See oracle_usage() function below for more details...
#
# OCF instance parameters:
# OCF_RESKEY_sid
# OCF_RESKEY_home (optional; else read it from /etc/oratab)
# OCF_RESKEY_user (optional; figure it out by checking file ownership)
# OCF_RESKEY_ipcrm (optional; defaults to "instance")
# OCF_RESKEY_clear_backupmode (optional; default to "false")
# OCF_RESKEY_shutdown_method (optional; default to "checkpoint/abort")
# OCF_RESKEY_monuser (optional; defaults to "OCFMON")
# OCF_RESKEY_monpassword (optional; defaults to "OCFMON")
# OCF_RESKEY_monprofile (optional; defaults to "OCFMONPROFILE")
#
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
# The file to source is the first word of the following line.
. \
${OCF_FUNCTIONS_DIR}/ora-common.sh
# (the path above is the argument of the "." source command that ends
# the preceding line)

#######################################################################

# oracle_usage: print the usage text; the list of actions is taken from
# oracle_methods and joined with '|'.
oracle_usage() {
	methods=$(echo $(oracle_methods) | tr ' ' '|')
	# NOTE(review): the original used a "<<-" heredoc whose body is
	# assumed to have been tab-indented (so it printed at column 0).
	cat <<!
usage: $0 {$methods}

$0 manages an Oracle Database instance as an HA resource.

The 'start' operation starts the database.
The 'stop' operation stops the database.
The 'status' operation reports whether the database is running
The 'monitor' operation reports whether the database seems to be working
The 'dumpinstipc' operation prints IPC resources used by the instance
The 'cleanup' operation tries to clean up after Oracle was brutally stopped
The 'validate-all' operation reports whether the parameters are valid
The 'methods' operation reports on the methods $0 supports

!
}

# Defaults
OCF_RESKEY_sid_default=""
OCF_RESKEY_home_default=""
OCF_RESKEY_user_default=""
OCF_RESKEY_monuser_default="OCFMON"
OCF_RESKEY_monpassword_default="OCFMON"
OCF_RESKEY_monprofile_default="OCFMONPROFILE"
OCF_RESKEY_ipcrm_default="instance"
OCF_RESKEY_clear_backupmode_default="false"
OCF_RESKEY_shutdown_method_default="checkpoint/abort"

# oracle_meta_data: print the OCF meta-data XML of this agent on stdout;
# the default values above are interpolated into it.
oracle_meta_data() {
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="oracle" version="1.0">
<version>1.0</version>

<longdesc lang="en">
Resource script for oracle. Manages an Oracle Database instance
as an HA resource.
</longdesc>
<shortdesc lang="en">Manages an Oracle Database instance</shortdesc>

<parameters>

<parameter name="sid" unique="1" required="1">
<longdesc lang="en">
The Oracle SID (aka ORACLE_SID).
</longdesc>
<shortdesc lang="en">sid</shortdesc>
<content type="string" default="${OCF_RESKEY_sid_default}" />
</parameter>

<parameter name="home" unique="0">
<longdesc lang="en">
The Oracle home directory (aka ORACLE_HOME).
If not specified, then the SID along with its home should be listed in
/etc/oratab.
</longdesc>
<shortdesc lang="en">home</shortdesc>
<content type="string" default="${OCF_RESKEY_home_default}" />
</parameter>

<parameter name="user" unique="0">
<longdesc lang="en">
The Oracle owner (aka ORACLE_OWNER).
If not specified, then it is set to the owner of
file \$ORACLE_HOME/dbs/*\${ORACLE_SID}.ora.
If this does not work for you, just set it explicitely.
</longdesc>
<shortdesc lang="en">user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>

<parameter name="monuser" unique="0">
<longdesc lang="en">
Monitoring user name. Every connection as
sysdba is logged in an audit log. This can
result in a large number of new files created.
A new user is created (if it doesn't exist) in
the start action and subsequently used in monitor.
It should have very limited rights. Make sure
that the password for this user does not expire.
</longdesc>
<shortdesc lang="en">monuser</shortdesc>
<content type="string" default="$OCF_RESKEY_monuser_default" />
</parameter>

<parameter name="monpassword" unique="0">
<longdesc lang="en">
Password for the monitoring user. Make sure
that the password for this user does not expire.
Need to explicitly set a password to a new monitor
user for the security reason.
</longdesc>
<shortdesc lang="en">monpassword</shortdesc>
<content type="string" default="$OCF_RESKEY_monpassword_default" />
</parameter>

<parameter name="monprofile" unique="0">
<longdesc lang="en">
Profile used by the monitoring user. If the
profile does not exist, it will be created
with a non-expiring password.
</longdesc>
<shortdesc lang="en">monprofile</shortdesc>
<content type="string" default="$OCF_RESKEY_monprofile_default" />
</parameter>

<parameter name="ipcrm" unique="0">
<longdesc lang="en">
Sometimes IPC objects (shared memory segments and semaphores)
belonging to an Oracle instance might be left behind which
prevents the instance from starting. It is not easy to figure out
which shared segments belong to which instance, in particular when
more instances are running as same user.

What we use here is the "oradebug" feature and its "ipc" trace
utility. It is not optimal to parse the debugging information, but
I am not aware of any other way to find out about the IPC
information. In case the format or wording of the trace report
changes, parsing might fail. There are some precautions, however,
to prevent stepping on other peoples toes. There is also a
dumpinstipc option which will make us print the IPC objects which
belong to the instance. Use it to see if we parse the trace file
correctly.

Three settings are possible:

- none: don't mess with IPC and hope for the best (beware: you'll
  probably be out of luck, sooner or later)
- instance: try to figure out the IPC stuff which belongs to the
  instance and remove only those (default; should be safe)
- orauser: remove all IPC belonging to the user which runs the
  instance (don't use this if you run more than one instance as same
  user or if other apps running as this user use IPC)

The default setting "instance" should be safe to use, but in that
case we cannot guarantee that the instance will start. In case IPC
objects were already left around, because, for instance, someone
mercilessly killing Oracle processes, there is no way any more to
find out which IPC objects should be removed. In that case, human
intervention is necessary, and probably _all_ instances running as
same user will have to be stopped. The third setting, "orauser",
guarantees IPC objects removal, but it does that based only on IPC
objects ownership, so you should use that only if every instance
runs as separate user.

Please report any problems. Suggestions/fixes welcome.
</longdesc>
<shortdesc lang="en">ipcrm</shortdesc>
<content type="string" default="${OCF_RESKEY_ipcrm_default}" />
</parameter>

<parameter name="clear_backupmode" unique="0" required="0">
<longdesc lang="en">
The clear of the backup mode of ORACLE.
</longdesc>
<shortdesc lang="en">clear_backupmode</shortdesc>
<content type="boolean" default="${OCF_RESKEY_clear_backupmode_default}" />
</parameter>

<parameter name="shutdown_method" unique="0" required="0">
<longdesc lang="en">
How to stop Oracle is a matter of taste it seems. The default
method ("checkpoint/abort") is:

	alter system checkpoint;
	shutdown abort;

This should be the fastest safe way bring the instance down. If
you find "shutdown abort" distasteful, set this attribute to
"immediate" in which case we will

	shutdown immediate;

If you still think that there's even better way to shutdown an
Oracle instance we are willing to listen.
</longdesc>
<shortdesc lang="en">shutdown_method</shortdesc>
<content type="string" default="${OCF_RESKEY_shutdown_method_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="120s" />
<action name="stop" timeout="120s" />
<action name="status" timeout="5s" />
<action name="monitor" depth="0" timeout="30s" interval="120s" />
<action name="validate-all" timeout="5s" />
<action name="methods" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}


#
# methods: What methods/operations do we support?
#
# Printed one per line.
oracle_methods() {
	printf '%s\n' \
		start \
		stop \
		status \
		monitor \
		dumpinstipc \
		showdbstat \
		cleanup \
		validate-all \
		methods \
		meta-data \
		usage
}

#
# Run commands as the Oracle owner...
#
# sqlplus reads its commands from stdin. When the current user ($US) is
# not the Oracle owner, sqlplus is run through su after sourcing the
# environment file $ORA_ENVF.
execsql() {
	if [ "$US" != "$ORACLE_OWNER" ]; then
		su - $ORACLE_OWNER -s /bin/sh -c ". $ORA_ENVF; sqlplus -S /nolog"
		return
	fi
	sqlplus -S /nolog
}

#
# Run commands in the oracle admin sqlplus...
#
# common_sql_opts: print the sqlplus settings put in front of every
# script built by runsql.
common_sql_opts() {
	cat<<EOF
set feedback off
set heading off
set pagesize 0
EOF
}
# common_sql_filter: drop connection/environment chatter and password
# expiry warnings from sqlplus output (stdin -> stdout).
common_sql_filter() {
	grep -v '^Connected' |
	grep -v '^ENV MSG:' |
	grep -v 'Your password will expire in'
}
# runsql CONNECT_STMT GENERATOR...
# Build an SQL script from the connect statement, the common options and
# the output of every generator function named on the command line, run
# it through execsql and print the filtered result.
runsql() {
	local conn_s="$1"
	shift 1
	local func
	(
	echo "$conn_s"
	common_sql_opts
	for func; do $func; done
	) |
	execsql | common_sql_filter
}
# dbasql GENERATOR...: run the generators' SQL connected as sysdba.
dbasql() {
	runsql "connect / as sysdba" $*
}
# monsql GENERATOR...: run the generators' SQL as the monitoring user.
monsql() {
	runsql "connect \"$MONUSR\"/\"$MONPWD\"" $*
}
# use dbasql_one if the query should result in a single line output
# at times people stuff commands in oracle .profile
# which may produce extra output
dbasql_one() {
	dbasql $* | tail -1
}
monsql_one() {
	monsql $* | tail -1
}

#
# various interesting sql
#
# Each of the following functions only prints SQL or sqlplus commands;
# they are meant to be passed by name to dbasql/monsql.
dbstat() {
	echo 'select status from v$instance;'
}
dbmount() {
	echo 'alter database mount;'
}
dbopen() {
	echo 'alter database open;'
}
dbstop_immediate() {
	echo 'shutdown immediate'
}
dbstop_checkpoint_abort() {
	echo 'alter system checkpoint;'
	echo 'shutdown abort'
}
# dbstop: print the shutdown commands selected by $shutdown_method
# (nothing is printed for any other value).
dbstop() {
	case "${shutdown_method}" in
	"immediate")
		dbstop_immediate
		;;
	"checkpoint/abort")
		dbstop_checkpoint_abort
		;;
	esac
}
dbstart() {
	echo 'startup'
}
dbstart_mount() {
	echo 'startup mount'
}
dbendbackup() {
	echo 'alter database end backup;'
}
db_backup_mode() {
	echo "select 'COUNT'||count(*) from v\$backup where status='ACTIVE';"
}
# is_clear_backupmode_set: true when the clear_backupmode setting is on.
is_clear_backupmode_set(){
	ocf_is_true "${clear_backupmode}"
}
# is_instance_in_backup_mode: true unless the db_backup_mode query
# reports exactly 0 active entries.
is_instance_in_backup_mode() {
	local count
	count="`dbasql_one db_backup_mode | sed 's/COUNT//'`"
	[ x"$count" != x"0" ]
}
# clear_backup_mode: end the backup mode and log what sqlplus said.
clear_backup_mode() {
	local output
	output="`dbasql dbendbackup`"
	ocf_log info "Oracle instance $ORACLE_SID alter database end backup: $output"
}
getdumpdest() {
	#echo 'select value from v$parameter where name = \'user_dump_dest\';'
	echo "select value from v\$parameter where name = 'user_dump_dest';"
}
getipc() {
	echo "oradebug setmypid"
	echo "oradebug tracefile_name"
	echo "oradebug ipc"
}
show_mon_profile() {
	echo "select PROFILE from dba_profiles where PROFILE='$MONPROFILE';"
}
mk_mon_profile() {
	cat<<EOF
create profile "$MONPROFILE" limit FAILED_LOGIN_ATTEMPTS UNLIMITED PASSWORD_LIFE_TIME UNLIMITED;
EOF
}
show_mon_user() {
	echo "select USERNAME, ACCOUNT_STATUS from dba_users where USERNAME='$MONUSR';"
}
mk_mon_user() {
	cat<<EOF
create user "$MONUSR" identified by "$MONPWD" profile "$MONPROFILE";
grant create session to "$MONUSR";
grant select on v_\$instance to "$MONUSR";
EOF
}
show_mon_user_profile() {
	echo "select PROFILE from dba_users where USERNAME='$MONUSR';"
}
# set_mon_user_profile / reset_mon_user_password
# The user, profile and password are emitted inside SQL double quotes,
# exactly as mk_mon_user does when it creates the account. The original
# left the inner double quotes unescaped, which ended the shell string:
# the SQL then carried no quotes and the values were word-split.
set_mon_user_profile() {
	echo "alter user \"$MONUSR\" profile \"$MONPROFILE\";"
}
reset_mon_user_password() {
	echo "alter user \"$MONUSR\" identified by \"$MONPWD\";"
}
# check_mon_profile: make sure the monitoring profile exists, creating
# it when necessary.
# Returns 0 on success, $OCF_ERR_CONFIGURED when Oracle answers
# ORA-65140, 1 on any other failure.
check_mon_profile() {
	local output
	output=`dbasql show_mon_profile`
	if echo "$output" | grep -iw "^$MONPROFILE" >/dev/null; then
		return 0
	fi
	output=`dbasql mk_mon_profile show_mon_profile`
	if echo "$output" | grep -iw "^$MONPROFILE" >/dev/null; then
		return 0
	elif echo "$output" | grep ORA-65140 >/dev/null 2>&1; then
		ocf_exit_reason "monprofile must start with C## for container databases"
		return $OCF_ERR_CONFIGURED
	else
		ocf_exit_reason "could not create $MONPROFILE oracle profile"
		ocf_log err "sqlplus output: $output"
		return 1
	fi
}
# check_mon_user: make sure the monitoring user exists and uses the
# monitoring profile; an existing account reported as EXPIRED gets its
# password set again.
# Returns 0 on success, $OCF_ERR_CONFIGURED when Oracle answers
# ORA-65096, 1 on any other failure. Exits (not returns) with
# $OCF_ERR_CONFIGURED when the user has to be created but the
# monpassword parameter is empty.
check_mon_user() {
	local output
	local output2

	output=`dbasql show_mon_user`
	if echo "$output" | grep -iw "^$MONUSR" >/dev/null; then
		if echo "$output" | grep -w "EXPIRED" >/dev/null; then
			dbasql reset_mon_user_password
		fi
		output=`dbasql show_mon_user_profile`
		if echo "$output" | grep -iw "^$MONPROFILE" >/dev/null; then
			return 0
		else
			output=`dbasql set_mon_user_profile`
			output2=`dbasql show_mon_user_profile`
			if echo "$output2" | grep -iw "^$MONPROFILE" >/dev/null; then
				return 0
			fi
			ocf_exit_reason "could not set profile for $MONUSR oracle user"
			ocf_log err "sqlplus output: $output( $output2 )"
			return 1
		fi
	fi

	if [ -z "$OCF_RESKEY_monpassword" ]; then
		ocf_exit_reason "Resource param 'monpassword' not set. Please configure monpassword for $MONUSR oracle user"
		exit $OCF_ERR_CONFIGURED
	fi

	output=`dbasql mk_mon_user show_mon_user`
	if echo "$output" | grep -iw "^$MONUSR" >/dev/null; then
		return 0
	elif echo "$output" | grep ORA-65096 >/dev/null 2>&1; then
		ocf_exit_reason "monuser must start with C## for container databases"
		return $OCF_ERR_CONFIGURED
	else
		ocf_exit_reason "could not create $MONUSR oracle user"
		ocf_log err "sqlplus output: $output"
		return 1
	fi
}
#
# print the output of dbstat (for debugging)
#
showdbstat() {
	echo "Full output:"
	dbstat | execsql
	echo "Stripped output:"
	echo "<`dbasql dbstat`>"
}

#
# IPC stuff: not overly complex, but quite involved :-/
#

# Part 1: Oracle
# other_trace_junk FILE: print FILE with a trailing "trc" replaced by "trm".
other_trace_junk() {
	echo $1 | sed 's/trc$/trm/'
}
# dumpinstipc: have the instance write an IPC trace and print the trace
# file name; returns 1 (after a warning) when no name was obtained.
dumpinstipc() {
	local output tracef
	output=`dbasql getipc` # filename in the 2nd line
	tracef=`echo "$output" | awk 'NR==2' | grep '^/.*trc$'`
	if [ "$tracef" ]; then
		echo $tracef
	else
		ocf_log warn "'dbasql getipc' failed: $output"
		return 1
	fi
}
# parseipc TRACEFILE: print the IPC objects found in the trace, one per
# line, as "m:<id>" (shared memory) and "s:<id>" (semaphores).
# Returns 1 when TRACEFILE does not exist. The trace file and its "trm"
# sibling are appended to TMPFILES.
# NOTE(review): the callers visible in this file run parseipc inside
# $(...), i.e. in a subshell, so that TMPFILES update presumably never
# reaches the shell that removes the files -- confirm.
parseipc() {
	local inf=$1
	if [ ! -f "$1" ]; then
		ocf_log warn "$1: no such ipc trace file"
		return 1
	fi
	awk '
		$3 == "Shmid" {n=1;next}
		n {
			if( $3~/^[0-9]+$/ ) print $3;
			n=0
		}
	' $inf |
	sort -u | sed 's/^/m:/'
	awk '
		/Semaphore List/ {insems=1;next}
		insems {
			for( i=1; i<=NF; i++ )
				if( $i~/^[0-9]+$/ ) print $i;
		}
		/system semaphore information/ {exit}
	' $inf |
	sort -u | sed 's/^/s:/'
	TMPFILES="$TMPFILES $inf `other_trace_junk $inf`"
}

# Part 2: OS (ipcs,ipcrm)
# filteroraipc: from ipcs output on stdin, print the second column of
# the lines that mention the Oracle owner.
filteroraipc() { # this portable?
	grep -w $ORACLE_OWNER | awk '{print $2}'
}
# ipcdesc m|s|q: print a human readable name of the IPC object type.
ipcdesc() {
	local what=$1
	case $what in
	m) echo "shared memory segment";;
	s) echo "semaphore";;
	q) echo "message queue";;
	esac
}
# rmipc TYPE ID: remove the IPC object, but only if ipcs lists it for
# the Oracle owner.
rmipc() {
	local what=$1 id=$2
	ipcs -$what | filteroraipc | grep -iw $id >/dev/null 2>&1 ||
		return
	ocf_log info "Removing `ipcdesc $what` $id."
	ipcrm -$what $id
}
# ipcrm_orauser: remove every IPC object listed for the Oracle owner.
ipcrm_orauser() {
	local what id
	for what in m s q; do
		for id in `ipcs -$what | filteroraipc`; do
			rmipc $what $id
		done
	done
}
# ipcrm_instance TYPE:ID...: remove the given IPC objects (the format
# printed by parseipc).
ipcrm_instance() {
	local ipcobj
	for ipcobj; do
		rmipc `echo $ipcobj | sed 's/:/ /'`
	done
}

#
# oracle_status: is the Oracle instance running?
#
# quick check to see if the instance is up
is_proc_running() {
	ps -ef | grep -wiqs "[^ ]*[_]pmon_${ORACLE_SID}"
}
# instance in OPEN state?
# Asks as the monitoring user first and falls back to sysdba when that
# does not report OPEN.
instance_live() {
	local status=`monsql_one dbstat`
	[ "$status" = OPEN ] && return 0
	ocf_log warn "Unable to login as \"$MONUSR\", using \"sysdba\" user instead"
	status=`dbasql_one dbstat`
	if [ "$status" = OPEN ]; then
		return 0
	else
		ocf_log info "$ORACLE_SID instance state is not OPEN (dbstat output: $status)"
		return 1
	fi
}

# ora_cleanup [TYPE:ID...]: remove the instance's lk* file(s) and then
# IPC objects according to $IPCRM.
ora_cleanup() {
	#rm -fr /tmp/.oracle #???
	rm -f `ls $ORACLE_HOME/dbs/lk* | grep -i "$ORACLE_SID\$"`
	#return

	case $IPCRM in
	none)
		;;
	instance)
		ipcrm_instance $*
		;;
	orauser)
		ipcrm_orauser $*
		;;
	esac
}

# oracle_getconfig: set up the common Oracle environment and resolve the
# agent's own settings, applying the defaults.
oracle_getconfig() {
	ora_common_getconfig "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user"

	clear_backupmode=${OCF_RESKEY_clear_backupmode:-${OCF_RESKEY_clear_backupmode_default}}
	shutdown_method=${OCF_RESKEY_shutdown_method:-${OCF_RESKEY_shutdown_method_default}}
	IPCRM=${OCF_RESKEY_ipcrm:-${OCF_RESKEY_ipcrm_default}}
}

#
# oracle_start: Start the Oracle instance
#
# NOTE: We handle instance in the MOUNTED and STARTED states
# efficiently
# We *do not* handle instance in the restricted or read-only
# mode, i.e.
# it appears as running, but its availability is
# "not for general use"
#

# Bring the instance to the MOUNTED state, open the database, make sure
# the monitoring profile and user exist and verify that the instance is
# live. Returns $OCF_SUCCESS or $OCF_ERR_GENERIC.
oracle_start() {
	local status output
	local checker

	if ! is_proc_running; then
		output=$(dbasql dbstart_mount)
		# try to cleanup in case of
		# ORA-01081: cannot start already-running ORACLE - shut it down first
		if echo "$output" | grep ORA-01081 >/dev/null 2>&1; then
			ocf_log info "ORA-01081 error found, trying to cleanup oracle (dbstart_mount output: $output)"
			ora_cleanup
			output=$(dbasql dbstop_immediate)
			output=$(dbasql dbstart_mount)
		fi
	else
		status=$(dbasql_one dbstat)
		case "$status" in
		"OPEN")
			# nothing to be done, we can leave right now
			ocf_log info "Oracle instance $ORACLE_SID already running"
			return $OCF_SUCCESS
			;;
		"STARTED")
			output=$(dbasql dbmount)
			;;
		"MOUNTED")
			# we proceed if mounted
			;;
		*)
			# status unknown
			output=$(dbasql dbstop dbstart_mount)
			;;
		esac
	fi

	# oracle instance should be mounted.
	status=$(dbasql_one dbstat)
	if [ "$status" != "MOUNTED" ]; then
		# error!!
		ocf_exit_reason "oracle $ORACLE_SID can not be mounted (status: $status)"
		return $OCF_ERR_GENERIC
	fi

	# It is examined whether mode is "online backup mode",
	# and if it is true, makes clear the mode.
	# Afterwards, DB is opened.
	if is_clear_backupmode_set; then
		if is_instance_in_backup_mode; then
			clear_backup_mode
		fi
	fi
	output=$(dbasql dbopen)

	# check/create the monitor profile, then the monitor user
	for checker in check_mon_profile check_mon_user; do
		if ! $checker; then
			# dbopen was failed if there is any $output
			if [ -n "$output" ]; then
				ocf_exit_reason "oracle $ORACLE_SID can not be opened: $output"
			fi
			return $OCF_ERR_GENERIC
		fi
	done

	if ! is_proc_running; then
		ocf_exit_reason "oracle process not running: $output"
		return $OCF_ERR_GENERIC
	fi
	if ! instance_live; then
		ocf_exit_reason "oracle instance $ORACLE_SID not started: $output"
		return $OCF_ERR_GENERIC
	fi

	# cool, we are up and running
	ocf_log info "Oracle instance $ORACLE_SID started: $output"
	return $OCF_SUCCESS
}

#
# oracle_stop: Stop the Oracle instance
#
# The IPC objects are collected (IPCRM=instance only) before the
# shutdown is issued and handed to ora_cleanup once the processes are
# gone.
oracle_stop() {
	local status output ipc=""

	if ! is_proc_running; then
		ocf_log info "Oracle instance $ORACLE_SID already stopped"
		return $OCF_SUCCESS
	fi

	if [ "$IPCRM" = "instance" ]; then
		ipc=$(parseipc $(dumpinstipc))
	fi
	output=$(dbasql dbstop)

	# kill the procs if they hanged
	ocf_stop_processes TERM $PROCS_CLEANUP_TIME $(proc_pids)

	if is_proc_running; then
		ocf_exit_reason "Oracle instance $ORACLE_SID not stopped: $output"
		return $OCF_ERR_GENERIC
	fi

	ocf_log info "Oracle instance $ORACLE_SID stopped: $output"
	# give em a chance to cleanup
	sleep 1
	ocf_log info "Cleaning up for $ORACLE_SID"
	ora_cleanup "$ipc"
	return $OCF_SUCCESS
}

#
# oracle_monitor: Can the Oracle instance do anything useful?
#
oracle_monitor() {
	is_proc_running || {
		ocf_log info "oracle process not running"
		return $OCF_NOT_RUNNING
	}
	instance_live || {
		ocf_exit_reason "oracle instance $ORACLE_SID is down"
		return $OCF_ERR_GENERIC
	}
	#ocf_log info "Oracle instance $ORACLE_SID is alive"
	return $OCF_SUCCESS
}

# other supported actions
# oracle_status prints one line and terminates the shell with the
# matching exit code.
oracle_status() {
	if ! is_proc_running; then
		echo Oracle instance $ORACLE_SID is stopped
		exit $OCF_NOT_RUNNING
	fi
	echo Oracle instance $ORACLE_SID is running
	exit $OCF_SUCCESS
}
oracle_dumpinstipc() {
	is_proc_running && parseipc $(dumpinstipc)
}
oracle_showdbstat() {
	showdbstat
}
oracle_cleanup() {
	if [ "$IPCRM" != "instance" ]; then
		ora_cleanup
	else
		ora_cleanup $(parseipc $(dumpinstipc))
	fi
}
# oracle_validate_all: reject unknown shutdown_method / ipcrm values,
# then run the common checks.
oracle_validate_all() {
	case "${shutdown_method}" in
	"immediate"|"checkpoint/abort")
		;;
	*)
		ocf_exit_reason "unsupported shutdown_method, please read meta-data"
		return $OCF_ERR_CONFIGURED
		;;
	esac

	case "${IPCRM}" in
	"none"|"instance"|"orauser")
		;;
	*)
		ocf_exit_reason "unsupported ipcrm setting, please read meta-data"
		return $OCF_ERR_CONFIGURED
		;;
	esac

	ora_common_validate_all
}

# used in ora-common.sh
show_procs() {
	ps -e -o pid,args | grep -i "[o]ra[a-zA-Z0-9_]*$ORACLE_SID$"
}
proc_pids() {
	show_procs | awk '{print $1}'
}
PROCS_CLEANUP_TIME="30"

MONUSR=${OCF_RESKEY_monuser:-$OCF_RESKEY_monuser_default}
MONPWD=${OCF_RESKEY_monpassword:-$OCF_RESKEY_monpassword_default}
MONPROFILE=${OCF_RESKEY_monprofile:-$OCF_RESKEY_monprofile_default}

# User and profile names are upper-cased before use.
MONUSR=$(echo "$MONUSR" | awk '{print toupper($0)}')
MONPROFILE=$(echo "$MONPROFILE" | awk '{print toupper($0)}')
OCF_REQUIRED_PARAMS="sid"
OCF_REQUIRED_BINARIES="sqlplus"
ocf_rarun $*

#
# vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0
diff --git a/heartbeat/oralsnr b/heartbeat/oralsnr new file mode 100755 index 0000000..dd0df1c --- /dev/null +++ b/heartbeat/oralsnr @@ -0,0 +1,293 @@
#!/bin/sh
#
#
# oralsnr
#
# Description: Manages an Oracle Listener as a High-Availability
# resource
#
#
# Author: Dejan Muhamedagic
#
Support: users@clusterlabs.org +# License: GNU General Public License (GPL) +# Copyright: (C) 2006 International Business Machines, Inc. +# +# This code inspired by the DB2 resource script +# written by Alan Robertson +# +# An example usage in /etc/ha.d/haresources: +# node1 10.0.0.170 oralsnr::sid::home::user::listener +# +# See oralsnr_usage() function below for more details... +# +# OCF instance parameters: +# OCF_RESKEY_sid (mandatory; for the monitor op) +# OCF_RESKEY_home (optional; else read it from /etc/oratab) +# OCF_RESKEY_user (optional; user to run the listener) +# OCF_RESKEY_listener (optional; defaults to LISTENER) +# OCF_RESKEY_tns_admin (optional; directory containing tnsnames.ora) +# +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +. ${OCF_FUNCTIONS_DIR}/ora-common.sh + +# Parameter defaults + +OCF_RESKEY_sid_default="" +OCF_RESKEY_home_default="" +OCF_RESKEY_user_default="" +OCF_RESKEY_listener_default="LISTENER" + +: ${OCF_RESKEY_sid=${OCF_RESKEY_sid_default}} +: ${OCF_RESKEY_home=${OCF_RESKEY_home_default}} +: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +: ${OCF_RESKEY_listener=${OCF_RESKEY_listener_default}} + +####################################################################### + +SH=/bin/sh + +# +# oralsnr_usage: print the usage text; the list of actions is taken +# from oralsnr_methods and joined with '|' +# +oralsnr_usage() { + methods=`oralsnr_methods` + methods=`echo $methods | tr ' ' '|'` + cat <<-! + usage: $0 ($methods) + + $0 manages an Oracle Listener as an HA resource. + + The 'start' operation starts the listener. + The 'stop' operation stops the listener. + The 'status' operation reports whether the listener is running + The 'monitor' operation reports whether the listener seems to be working + The 'validate-all' operation reports whether the parameters are valid + The 'methods' operation reports on the methods $0 supports + + !
+} + +oralsnr_meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="oralsnr" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for Oracle Listener. It manages an +Oracle Listener instance as an HA resource. +</longdesc> +<shortdesc lang="en">Manages an Oracle TNS listener</shortdesc> + +<parameters> + +<parameter name="sid" unique="1" required="1"> +<longdesc lang="en"> +The Oracle SID (aka ORACLE_SID). Necessary for the monitor op, +i.e. to do tnsping SID. +</longdesc> +<shortdesc lang="en">sid</shortdesc> +<content type="string" default="${OCF_RESKEY_sid_default}" /> +</parameter> + +<parameter name="home" unique="0"> +<longdesc lang="en"> +The Oracle home directory (aka ORACLE_HOME). +If not specified, then the SID should be listed in /etc/oratab. +</longdesc> +<shortdesc lang="en">home</shortdesc> +<content type="string" default="${OCF_RESKEY_home_default}" /> +</parameter> + +<parameter name="user" unique="0"> +<longdesc lang="en"> +Run the listener as this user. +</longdesc> +<shortdesc lang="en">user</shortdesc> +<content type="string" default="${OCF_RESKEY_user_default}" /> +</parameter> + +<parameter name="listener" unique="1"> +<longdesc lang="en"> +Listener instance to be started (as defined in listener.ora). +Defaults to LISTENER. +</longdesc> +<shortdesc lang="en">listener</shortdesc> +<content type="string" default="${OCF_RESKEY_listener_default}" /> +</parameter> + +<parameter name="tns_admin" required="0" unique="0"> +<longdesc lang="en"> + Full path to the directory that contains the Oracle + listener tnsnames.ora configuration file. The shell + variable TNS_ADMIN is set to the value provided. 
+</longdesc> +<shortdesc lang="en"> + Full path to the directory containing tnsnames.ora +</shortdesc> +<content type="string"/> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="120s" /> +<action name="stop" timeout="120s" /> +<action name="status" timeout="60s" /> +<action name="monitor" depth="0" timeout="30s" interval="10s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +<action name="methods" timeout="5s" /> +</actions> +</resource-agent> +END +} + + +# +# methods: What methods/operations do we support? +# +oralsnr_methods() { + cat <<-! + start + stop + status + monitor + validate-all + methods + meta-data + usage + ! +} + +# +# Run commands as the Oracle owner... +# +runasdba() { + if [ "$US" = "$ORACLE_OWNER" ]; then + $SH + else + ( + echo ". $ORA_ENVF" + cat + ) | su -s $SH - $ORACLE_OWNER + fi +} + +# +# oralsnr_start: Start the Oracle listener instance +# + +oralsnr_start() { + if is_proc_running && test_tnsping; then + : nothing to be done, we can leave right now + ocf_log info "Listener $listener already running" + return $OCF_SUCCESS + fi + output=`echo lsnrctl start $listener | runasdba` + if test_tnsping; then + : cool, we are up and running + ocf_log info "Listener $listener running: $output" + return $OCF_SUCCESS + else + ocf_exit_reason "Listener $listener appears to have started, but is not running properly: $output" + ocf_log err "Probable Oracle configuration error" + return $OCF_ERR_GENERIC + fi +} + +# +# oralsnr_stop: Stop the Oracle instance +# +oralsnr_stop() { + if is_proc_running; then + output=`echo lsnrctl stop $listener | runasdba` + else + ocf_log info "Listener $listener already stopped" + return $OCF_SUCCESS + fi + ocf_stop_processes TERM $PROCS_CLEANUP_TIME `proc_pids` # kill the procs if they hanged + if is_proc_running; then + ocf_exit_reason "Listener $listener not stopped: $output" + return $OCF_ERR_GENERIC + else + ocf_log info "Listener $listener stopped: 
$output" + return $OCF_SUCCESS + fi +} + +# +# is_proc_running: is the listener running? +# +is_proc_running() { + show_procs | grep "." > /dev/null +} +# the following two should be run only if the process is running +test_listener() { + local output + output=`lsnrctl status $listener` + if echo "$output" | tail -1 | grep -qs 'completed successfully' + then + return $OCF_SUCCESS + else + ocf_exit_reason "$listener status failed: $output" + return $OCF_ERR_GENERIC + fi +} +# and does it work? +test_tnsping() { + local output + output=`tnsping $ORACLE_SID` + if echo "$output" | tail -1 | grep -qs '^OK'; then + return $OCF_SUCCESS + else + ocf_exit_reason "tnsping $ORACLE_SID failed: $output" + return $OCF_ERR_GENERIC + fi +} + +# +# oralsnr_monitor: Can we connect to the listener? +# +oralsnr_monitor() { + if is_proc_running; then + test_listener && test_tnsping + else + return $OCF_NOT_RUNNING + fi +} + +oralsnr_status() { + if is_proc_running + then + echo Listener $listener is running + exit $OCF_SUCCESS + else + echo Listener $listener is stopped + exit $OCF_NOT_RUNNING + fi +} + +oralsnr_getconfig() { + ora_common_getconfig "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user" "$OCF_RESKEY_tns_admin" + listener=${OCF_RESKEY_listener} +} + +oralsnr_validate_all() { + ora_common_validate_all +} + +# used in ora-common.sh +show_procs() { + ps -U "$ORACLE_OWNER" -o pid,user,args | + grep '[t]nslsnr' | grep -i -w "$listener" +} +proc_pids() { show_procs | awk '{print $1}'; } +PROCS_CLEANUP_TIME="10" + +OCF_REQUIRED_PARAMS="sid" +OCF_REQUIRED_BINARIES="lsnrctl tnsping" +ocf_rarun $* + +# +# vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/heartbeat/ovsmonitor b/heartbeat/ovsmonitor new file mode 100755 index 0000000..6765da4 --- /dev/null +++ b/heartbeat/ovsmonitor @@ -0,0 +1,469 @@ +#!/bin/sh +# +# OCF Resource Agent compliant script. +# Monitor the vitality of a local OpenVSwitch bond. +# +# Based on the work by Alexander Krauth. 
+# +# Transferred from ethmonitor into ovsmonitor by Mathieu Grzybek. +# +# Copyright (c) 2017 Robert Euhus, Alexander Krauth, Lars Marowsky-Bré +# Mathieu Grzybek +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# OCF parameters are as below +# +# OCF_RESKEY_bond +# OCF_RESKEY_bridge +# OCF_RESKEY_multiplier +# OCF_RESKEY_name +# OCF_RESKEY_repeat_count +# OCF_RESKEY_repeat_interval +# OCF_RESKEY_pktcnt_timeout +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +.
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_bond_default="" +OCF_RESKEY_bridge_default="" +OCF_RESKEY_name_default="" +OCF_RESKEY_multiplier_default="1" +OCF_RESKEY_repeat_count_default="5" +OCF_RESKEY_repeat_interval_default="10" +OCF_RESKEY_pktcnt_timeout_default="5" +OCF_RESKEY_link_status_only_default="false" + +: ${OCF_RESKEY_bond=${OCF_RESKEY_bond_default}} +: ${OCF_RESKEY_bridge=${OCF_RESKEY_bridge_default}} +: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}} +: ${OCF_RESKEY_multiplier=${OCF_RESKEY_multiplier_default}} +: ${OCF_RESKEY_repeat_count=${OCF_RESKEY_repeat_count_default}} +: ${OCF_RESKEY_repeat_interval=${OCF_RESKEY_repeat_interval_default}} +: ${OCF_RESKEY_pktcnt_timeout=${OCF_RESKEY_pktcnt_timeout_default}} +: ${OCF_RESKEY_link_status_only=${OCF_RESKEY_link_status_only_default}} + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="ovsmonitor" version="0.1"> +<version>1.0</version> + +<longdesc lang="en"> +Monitor the vitality of a local ovs bond. + +You may set up this RA as a clone resource to monitor the network bonds on different nodes, with the same bond name. +This is not related to the IP address or the network on which a bond is configured. +You may use this RA to move resources away from a node, which has a faulty bond or prevent moving resources to such a node. +This gives you independent control of the resources, without involving cluster intercommunication. But it requires your nodes to have more than one network bond. + +The resource configuration requires a monitor operation, because the monitor does the main part of the work. +In addition to the resource configuration, you need to configure some location constraints, based on a CIB attribute value. +The name of the attribute value is configured in the 'name' option of this RA. 
+ +Example constraint configuration using crmsh +location loc_connected_node my_resource_grp \ + rule $id="rule_loc_connected_node" -INF: ovsmonitor-bond-public eq 0 + +Example constraint configuration using pcs. Only allow 'my_resource' to run on nodes where eth0 ethernet device is available. +pcs constraint location my_resource rule score=-INFINITY ovsmonitor-bond-public ne 1 + +The ethmonitor works in 3 different modes to test the bond vitality. +1. call ovs-appctl to see if at least one of the bonding's link status is up (if link is down -> error) +2. call ovs-ofctl and watch the RX counter (if packages come around in a certain time -> success) +3. return error +</longdesc> +<shortdesc lang="en">Monitors ovs bonding bonds</shortdesc> + +<parameters> +<parameter name="bond" unique="1" required="1"> +<longdesc lang="en"> +The name of the network bond which should be monitored (e.g. bond-public). +</longdesc> +<shortdesc lang="en">Bond bond name</shortdesc> +<content type="string" default="${OCF_RESKEY_bond_default}"/> +</parameter> + +<parameter name="bridge" unique="1" required="1"> +<longdesc lang="en"> +The name of the ovs bridge that contains the bridge. +</longdesc> +<shortdesc lang="en">ovs bridge</shortdesc> +<content type="string" default="${OCF_RESKEY_bridge_default}"/> +</parameter> + +<parameter name="name" unique="1"> +<longdesc lang="en"> +The name of the CIB attribute to set. This is the name to be used in the constraints. Defaults to "ovsmonitor-'bond_name'". +</longdesc> +<shortdesc lang="en">Attribute name</shortdesc> +<content type="string" default="${OCF_RESKEY_name_default}"/> +</parameter> + +<parameter name="multiplier" unique="0" > +<longdesc lang="en"> +Multiplier for the value of the CIB attriobute specified in parameter name. 
+</longdesc> +<shortdesc lang="en">Multiplier for result variable</shortdesc> +<content type="integer" default="${OCF_RESKEY_multiplier_default}"/> +</parameter> + +<parameter name="repeat_count"> +<longdesc lang="en"> +Specify how often the bond will be monitored, before the status is set to failed. You need to set the timeout of the monitoring operation to at least repeat_count * repeat_interval +</longdesc> +<shortdesc lang="en">Monitor repeat count</shortdesc> +<content type="integer" default="${OCF_RESKEY_repeat_count_default}"/> +</parameter> + +<parameter name="repeat_interval"> +<longdesc lang="en"> +Specify how long to wait in seconds between the repeat_counts. +</longdesc> +<shortdesc lang="en">Monitor repeat interval in seconds</shortdesc> +<content type="integer" default="${OCF_RESKEY_repeat_interval_default}"/> +</parameter> + +<parameter name="pktcnt_timeout"> +<longdesc lang="en"> +Timeout for the RX packet counter. Stop listening for packet counter changes after the given number of seconds. +</longdesc> +<shortdesc lang="en">packet counter timeout</shortdesc> +<content type="integer" default="${OCF_RESKEY_pktcnt_timeout_default}"/> +</parameter> + +<parameter name="link_status_only"> +<longdesc lang="en"> +Only report success based on link status. Do not perform RX counter related connectivity tests. 
+</longdesc> +<shortdesc lang="en">link status check only</shortdesc> +<content type="boolean" default="${OCF_RESKEY_link_status_only_default}" /> +</parameter> + +</parameters> +<actions> +<action name="start" timeout="60s" /> +<action name="stop" timeout="20s" /> +<action name="status" depth="0" timeout="60s" interval="10s" /> +<action name="monitor" depth="0" timeout="60s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="20s" /> +</actions> +</resource-agent> +END + + exit $OCF_SUCCESS +} + +# +# Return true, if the bond exists +# +is_bond() { + # only the exit status is used; the bond/show output is discarded + ovs-appctl bond/show "$OCF_RESKEY_bond" 1>/dev/null 2>&1 +} + +# +# Return true, if the bridge exists +# +is_bridge() { + # br-exists tests the exact bridge name, whereas grepping the + # "ovs-vsctl show" output also accepted names that merely contain it + ovs-vsctl br-exists "$OCF_RESKEY_bridge" 1>/dev/null 2>&1 +} + + +# +# Validate the parameters and set BOND, BRIDGE, ATTRNAME, REP_COUNT and +# REP_INTERVAL_S from them. Exits with OCF_ERR_CONFIGURED on the first +# missing or invalid setting, returns OCF_SUCCESS otherwise. +# +if_init() { + if [ X"$OCF_RESKEY_bond" = "X" ]; then + ocf_exit_reason "Bond name (the bond parameter) is mandatory" + exit $OCF_ERR_CONFIGURED + fi + + if [ X"$OCF_RESKEY_bridge" = "X" ]; then + ocf_exit_reason "Bridge name (the bridge parameter) is mandatory" + exit $OCF_ERR_CONFIGURED + fi + + BOND="$OCF_RESKEY_bond" + BRIDGE="$OCF_RESKEY_bridge" + + if is_bond + then + if ! is_bridge + then + ocf_exit_reason "Bridge $OCF_RESKEY_bridge does not exist" + exit $OCF_ERR_CONFIGURED; + fi + else + ocf_exit_reason "Bond $OCF_RESKEY_bond does not exist" + exit $OCF_ERR_CONFIGURED; + fi + + if ! ocf_is_decimal "$OCF_RESKEY_multiplier"; then + ocf_exit_reason "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]" + exit $OCF_ERR_CONFIGURED + fi + + ATTRNAME=${OCF_RESKEY_name:-"ovsmonitor-$BOND"} + + REP_COUNT=${OCF_RESKEY_repeat_count} + # must be a decimal number and at least 1 + if ! 
ocf_is_decimal "$REP_COUNT" || [ "$REP_COUNT" -lt 1 ]; then + ocf_exit_reason "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]" + exit $OCF_ERR_CONFIGURED + fi + REP_INTERVAL_S=${OCF_RESKEY_repeat_interval} + if ! ocf_is_decimal "$REP_INTERVAL_S"; then + ocf_exit_reason "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]" + exit $OCF_ERR_CONFIGURED + fi + if ! ocf_is_decimal "$OCF_RESKEY_pktcnt_timeout"; then + ocf_exit_reason "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]" + exit $OCF_ERR_CONFIGURED + fi + return $OCF_SUCCESS +} + +# get the link status on $BOND +# asks ovs-appctl about the bond, prints the number of slave lines that report "enabled" +get_link_status () { + #$IP2UTIL -o link show up dev "$BOND" | grep -v 'NO-CARRIER' | grep -c "$BOND" + ovs-appctl bond/show "$BOND"|awk -F: '/^slave/ {print $2}'|grep -c enabled +} + +# returns the number of received rx packets on $BOND +get_rx_packets () { + ocf_log debug "bond $BOND - bridge $BRIDGE" + #$IP2UTIL -o -s link show dev "$BOND" \ + # | sed 's/.* RX: [^0-9]*[0-9]* *\([0-9]*\) .*/\1/' + local ovs_port + + for ovs_port in $(ovs-appctl bond/show $BOND|awk '/^slave/ {gsub(":","");print $2}') ; do + ovs-ofctl dump-ports $BRIDGE $ovs_port + done \ + | awk -F, 'BEGIN{total=0} /rx/ {gsub(".*pkts=","");total=total+int($1)} END{print total}' +} + +# watch for packet counter changes for max. OCF_RESKEY_pktcnt_timeout seconds +# returns immediately with return code 0 if any packets were received +# otherwise 1 is returned +watch_pkt_counter () { + local RX_PACKETS_NEW + local RX_PACKETS_OLD + RX_PACKETS_OLD="`get_rx_packets`" + for n in `seq $(( $OCF_RESKEY_pktcnt_timeout * 10 ))`; do + sleep 0.1 + RX_PACKETS_NEW="`get_rx_packets`" + ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW" + if [ "$RX_PACKETS_OLD" -ne "$RX_PACKETS_NEW" ]; then + ocf_log debug "we received some packets."
+ return 0 + fi + done + return 1 +} + +# +# Check the bond depending on the level given as parameter: $OCF_RESKEY_check_level +# +# 10: watch for packet counter changes +# +# +# 30: watch for packet counter changes in promiscios mode +# +# If unsuccessfull in levels 18 and above, +# the tests for higher check levels are run. +# +if_check () { + # always check link status first + link_status="`get_link_status`" + ocf_log debug "link_status: $link_status (up > 0, down = 0)" + + if [ $link_status -eq 0 ]; then + ocf_log notice "link_status: DOWN" + return $OCF_NOT_RUNNING + fi + + # if using link_status_only, skip RX count related test + if ocf_is_true "$OCF_RESKEY_link_status_only"; then + return $OCF_SUCCESS + fi + + # watch for packet counter changes + ocf_log debug "watch for packet counter changes" + watch_pkt_counter + if [ $? -eq 0 ]; then + return $OCF_SUCCESS + else + ocf_log debug "No packets received during packet watch timeout" + fi + + # watch for packet counter changes in promiscios mode +# ocf_log debug "watch for packet counter changes in promiscios mode" + # be sure switch off promiscios mode in any case + # TODO: check first, wether promisc is already on and leave it untouched. +# trap "$IP2UTIL link set dev $BOND promisc off; exit" INT TERM EXIT +# $IP2UTIL link set dev $BOND promisc on +# watch_pkt_counter && return $OCF_SUCCESS +# $IP2UTIL link set dev $BOND promisc off +# trap - INT TERM EXIT + + # looks like it's not working (for whatever reason) + return $OCF_NOT_RUNNING +} + +####################################################################### + +if_usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +set_cib_value() { + local score=`expr $1 \* $OCF_RESKEY_multiplier` + attrd_updater -n $ATTRNAME -v $score + local rc=$? 
+ case $rc in + 0) ocf_log debug "attrd_updater: Updated $ATTRNAME = $score" ;; + *) ocf_log warn "attrd_updater: Could not update $ATTRNAME = $score: rc=$rc";; + esac + return $rc +} + +if_monitor() { + ha_pseudo_resource $OCF_RESOURCE_INSTANCE monitor + local pseudo_status=$? + if [ $pseudo_status -ne $OCF_SUCCESS ]; then + exit $pseudo_status + fi + + local mon_rc=$OCF_NOT_RUNNING + local attr_rc=$OCF_NOT_RUNNING + local runs=0 + local start_time + local end_time + local sleep_time + while [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ] + do + start_time=`date +%s%N` + if_check + mon_rc=$? + REP_COUNT=$(( $REP_COUNT - 1 )) + if [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]; then + ocf_log warn "Monitoring of $OCF_RESOURCE_INSTANCE failed, $REP_COUNT retries left." + end_time=`date +%s%N` + sleep_time=`echo "scale=9; ( $start_time + ( $REP_INTERVAL_S * 1000000000 ) - $end_time ) / 1000000000" | bc -q 2> /dev/null` + sleep $sleep_time 2> /dev/null + runs=$(($runs + 1)) + fi + + if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then + ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error" + fi + done + + ocf_log debug "Monitoring return code: $mon_rc" + if [ $mon_rc -eq $OCF_SUCCESS ]; then + set_cib_value 1 + attr_rc=$? + else + ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed." + set_cib_value 0 + attr_rc=$? + fi + + ## The resource should not fail, if the bond is down. It should fail, if the update of the CIB variable has errors. + ## To react on the bond failure you must use constraints based on the CIB variable value, not on the resource itself. + exit $attr_rc +} + +if_stop() +{ + attrd_updater -D -n $ATTRNAME + ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop +} + +if_start() +{ + local rc + ha_pseudo_resource $OCF_RESOURCE_INSTANCE start + rc=$? 
+ if [ $rc -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Failure to create ovsmonitor state file" + return $rc + fi + + # perform the first monitor during the start operation + if_monitor + return $? +} + + +if_validate() { + check_binary ovs-vsctl + check_binary ovs-appctl + check_binary ovs-ofctl + check_binary bc + if_init +} + +case $__OCF_ACTION in +meta-data) meta_data + ;; +usage|help) if_usage + exit $OCF_SUCCESS + ;; +esac + +if_validate + +case $__OCF_ACTION in +start) if_start + exit $? + ;; +stop) if_stop + exit $? + ;; +monitor|status) if_monitor + exit $? + ;; +validate-all) exit $? + ;; +*) if_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac diff --git a/heartbeat/pgagent b/heartbeat/pgagent new file mode 100755 index 0000000..b1e61b3 --- /dev/null +++ b/heartbeat/pgagent @@ -0,0 +1,139 @@ +#!/bin/sh +# +# High-Availability pgagent OCF resource agent +# +# Description: Starts/stops pgagent +# Author: Oleg Selin +# License: GNU General Public License (GPL) +# +# OCF parameters: +# OCF_RESKEY_connection_string +# OCF_RESKEY_user +# OCF_RESKEY_options +# +####################################################################### +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +OCF_RESKEY_executable_default="`which pgagent`" +OCF_RESKEY_connection_string_default="user=postgres host=/var/run/postgresql" +OCF_RESKEY_user_default="postgres" +OCF_RESKEY_options_default="-r 1 -t 1" + +: ${OCF_RESKEY_executable="${OCF_RESKEY_executable_default}"} +: ${OCF_RESKEY_connection_string="${OCF_RESKEY_connection_string_default}"} +: ${OCF_RESKEY_user="${OCF_RESKEY_user_default}"} +: ${OCF_RESKEY_options="${OCF_RESKEY_options_default}"} + +pgagent_validate_all() { + check_binary pgagent + ocf_log debug "executable: '$OCF_RESKEY_executable'" + ocf_log debug "connection string: '$OCF_RESKEY_connection_string'" + ocf_log debug "user: '$OCF_RESKEY_user'" + ocf_log debug "options: '$OCF_RESKEY_options'" + if [ -z "$OCF_RESKEY_connection_string" ]; then + ocf_log err "Connection string is not configured!" + exit $OCF_ERR_CONFIGURED + fi + if [ -z "$OCF_RESKEY_user" ]; then + ocf_log err "User is not configured!" + exit $OCF_ERR_CONFIGURED + fi + getent passwd $OCF_RESKEY_user >/dev/null 2>&1 + if [ ! $? 
-eq 0 ]; then + ocf_log err "User $OCF_RESKEY_user doesn't exist"; + return $OCF_ERR_CONFIGURED; + fi + return $OCF_SUCCESS +} + +pgagent_start() { + pgagent_validate_all + nohup su - $OCF_RESKEY_user -c "'$OCF_RESKEY_executable' $OCF_RESKEY_options '$OCF_RESKEY_connection_string'" > /dev/null 2>&1 & + sleep 1 + if pgagent_monitor; then + return $OCF_SUCCESS + fi + return $OCF_ERR_GENERIC +} + +pgagent_stop() { + pgagent_validate_all + pid=`pgrep -f -x -U $OCF_RESKEY_user "$OCF_RESKEY_executable $OCF_RESKEY_options $OCF_RESKEY_connection_string"` + if [ -n "$pid" ]; then + ocf_run kill $pid || return $OCF_ERR_GENERIC + fi + return $OCF_SUCCESS +} + +pgagent_monitor() { + if [ -z "$OCF_RESKEY_executable" ]; then + return $OCF_ERR_INSTALLED + fi + ocf_run pgrep -f -x -U "$OCF_RESKEY_user" "$OCF_RESKEY_executable $OCF_RESKEY_options $OCF_RESKEY_connection_string" || return $OCF_NOT_RUNNING + return $OCF_SUCCESS +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="pgagent" version="1.0"> +<version>1.0</version> +<longdesc lang="en">This is a pgagent Resource Agent.</longdesc> +<shortdesc lang="en">Controls pgagent</shortdesc> +<parameters> +<parameter name="connection_string"> +<longdesc lang="en">Connection string for pgagent.</longdesc> +<shortdesc lang="en">pgagent connection string</shortdesc> +<content type="string" default="$OCF_RESKEY_connection_string_default" /> +</parameter> +<parameter name="user"> +<longdesc lang="en">User to run pgagent as.</longdesc> +<shortdesc lang="en">User to run pgagent</shortdesc> +<content type="string" default="$OCF_RESKEY_user_default" /> +</parameter> +<parameter name="options"> +<longdesc lang="en">Options for pgagent.</longdesc> +<shortdesc lang="en">pgagent run options, see pgagent --help for details</shortdesc> +<content type="string" default="$OCF_RESKEY_options_default" /> +</parameter> +</parameters> +<actions> +<action name="start" timeout="5s" /> 
+<action name="stop" timeout="5s" /> +<action name="monitor" timeout="20s" interval="10s" depth="0" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +</actions> +</resource-agent> +END +} + +pgagent_usage() { + cat <<END +usage: $0 {start|stop|monitor|meta-data|validate-all} + +Expects to have a fully populated OCF RA-compliant environment set. +END +} + +case $__OCF_ACTION in +start) pgagent_start;; +stop) pgagent_stop;; +monitor) pgagent_monitor;; +validate-all) pgagent_validate_all;; +usage|help) pgagent_usage + exit $OCF_SUCCESS + ;; +meta-data) meta_data + exit $OCF_SUCCESS + ;; +*) pgagent_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/pgsql b/heartbeat/pgsql new file mode 100755 index 0000000..532063a --- /dev/null +++ b/heartbeat/pgsql @@ -0,0 +1,2263 @@ +#!/bin/sh +# +# Description: Manages a PostgreSQL Server as an OCF High-Availability +# resource +# +# Authors: Serge Dubrouski (sergeyfd@gmail.com) -- original RA +# Florian Haas (florian@linbit.com) -- makeover +# Takatoshi MATSUO (matsuo.tak@gmail.com) -- support replication +# David Corlette (dcorlette@netiq.com) -- add support for non-standard library locations and non-standard port +# +# Copyright: 2006-2012 Serge Dubrouski <sergeyfd@gmail.com> +# and other Linux-HA contributors +# License: GNU General Public License (GPL) +# +############################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Use runuser if available for SELinux. 
+if [ -x /sbin/runuser ]; then + SU=runuser +else + SU=su +fi + +# +# Get PostgreSQL Configuration parameter +# +get_pgsql_param() { + local param_name + + param_name=$1 + perl_code="if (/^\s*$param_name[\s=]+\s*(.*)$/) { + \$dir=\$1; + \$dir =~ s/\s*\#.*//; + \$dir =~ s/^'(\S*)'/\$1/; + print \$dir;}" + + perl -ne "$perl_code" < $OCF_RESKEY_config +} + +# Defaults +OCF_RESKEY_pgctl_default=/usr/bin/pg_ctl +OCF_RESKEY_psql_default=/usr/bin/psql +OCF_RESKEY_pgdata_default=/var/lib/pgsql/data +OCF_RESKEY_pgdba_default=postgres +OCF_RESKEY_pghost_default="" +OCF_RESKEY_pgport_default=5432 +OCF_RESKEY_pglibs_default=/usr/lib +OCF_RESKEY_start_opt_default="" +OCF_RESKEY_ctl_opt_default="" +OCF_RESKEY_pgdb_default=template1 +OCF_RESKEY_logfile_default=/dev/null +OCF_RESKEY_socketdir_default="" +OCF_RESKEY_stop_escalate_default=90 +OCF_RESKEY_monitor_user_default="" +OCF_RESKEY_monitor_password_default="" +OCF_RESKEY_monitor_sql_default="select now();" +OCF_RESKEY_check_wal_receiver_default="false" +# Defaults for replication +OCF_RESKEY_rep_mode_default=none +OCF_RESKEY_node_list_default="" +OCF_RESKEY_restore_command_default="" +OCF_RESKEY_archive_cleanup_command_default="" +OCF_RESKEY_recovery_end_command_default="" +OCF_RESKEY_master_ip_default="" +OCF_RESKEY_repuser_default="postgres" +OCF_RESKEY_primary_conninfo_opt_default="" +OCF_RESKEY_restart_on_promote_default="false" +OCF_RESKEY_tmpdir_default="/var/lib/pgsql/tmp" +OCF_RESKEY_xlog_check_count_default="3" +OCF_RESKEY_crm_attr_timeout_default="5" +OCF_RESKEY_stop_escalate_in_slave_default=90 +OCF_RESKEY_replication_slot_name_default="" + +: ${OCF_RESKEY_pgctl=${OCF_RESKEY_pgctl_default}} +: ${OCF_RESKEY_psql=${OCF_RESKEY_psql_default}} +: ${OCF_RESKEY_pgdata=${OCF_RESKEY_pgdata_default}} +: ${OCF_RESKEY_pgdba=${OCF_RESKEY_pgdba_default}} +: ${OCF_RESKEY_pghost=${OCF_RESKEY_pghost_default}} +: ${OCF_RESKEY_pgport=${OCF_RESKEY_pgport_default}} +: ${OCF_RESKEY_pglibs=${OCF_RESKEY_pglibs_default}} +: 
${OCF_RESKEY_config=${OCF_RESKEY_pgdata}/postgresql.conf} +: ${OCF_RESKEY_start_opt=${OCF_RESKEY_start_opt_default}} +: ${OCF_RESKEY_ctl_opt=${OCF_RESKEY_ctl_opt_default}} +: ${OCF_RESKEY_pgdb=${OCF_RESKEY_pgdb_default}} +: ${OCF_RESKEY_logfile=${OCF_RESKEY_logfile_default}} +: ${OCF_RESKEY_socketdir=${OCF_RESKEY_socketdir_default}} +: ${OCF_RESKEY_stop_escalate=${OCF_RESKEY_stop_escalate_default}} +: ${OCF_RESKEY_monitor_user=${OCF_RESKEY_monitor_user_default}} +: ${OCF_RESKEY_monitor_password=${OCF_RESKEY_monitor_password_default}} +: ${OCF_RESKEY_monitor_sql=${OCF_RESKEY_monitor_sql_default}} +: ${OCF_RESKEY_check_wal_receiver=${OCF_RESKEY_check_wal_receiver_default}} + +# for replication +: ${OCF_RESKEY_rep_mode=${OCF_RESKEY_rep_mode_default}} +: ${OCF_RESKEY_node_list=${OCF_RESKEY_node_list_default}} +: ${OCF_RESKEY_restore_command=${OCF_RESKEY_restore_command_default}} +: ${OCF_RESKEY_archive_cleanup_command=${OCF_RESKEY_archive_cleanup_command_default}} +: ${OCF_RESKEY_recovery_end_command=${OCF_RESKEY_recovery_end_command_default}} +: ${OCF_RESKEY_master_ip=${OCF_RESKEY_master_ip_default}} +: ${OCF_RESKEY_repuser=${OCF_RESKEY_repuser_default}} +: ${OCF_RESKEY_primary_conninfo_opt=${OCF_RESKEY_primary_conninfo_opt_default}} +: ${OCF_RESKEY_restart_on_promote=${OCF_RESKEY_restart_on_promote_default}} +: ${OCF_RESKEY_tmpdir=${OCF_RESKEY_tmpdir_default}} +: ${OCF_RESKEY_xlog_check_count=${OCF_RESKEY_xlog_check_count_default}} +: ${OCF_RESKEY_crm_attr_timeout=${OCF_RESKEY_crm_attr_timeout_default}} +: ${OCF_RESKEY_stop_escalate_in_slave=${OCF_RESKEY_stop_escalate_in_slave_default}} +: ${OCF_RESKEY_replication_slot_name=${OCF_RESKEY_replication_slot_name_default}} + +usage() { + cat <<EOF + usage: $0 start|stop|status|monitor|promote|demote|notify|meta-data|validate-all|methods + + $0 manages a PostgreSQL Server as an HA resource. + + The 'start' operation starts the PostgreSQL server. + The 'stop' operation stops the PostgreSQL server. 
+ The 'status' operation reports whether the PostgreSQL is up. + The 'monitor' operation reports whether the PostgreSQL is running. + The 'promote' operation promotes the PostgreSQL server. + The 'demote' operation demotes the PostgreSQL server. + The 'validate-all' operation reports whether the parameters are valid. + The 'methods' operation reports on the methods $0 supports. +EOF + return $OCF_ERR_ARGS +} + +meta_data() { + cat <<EOF +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="pgsql" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for PostgreSQL. It manages a PostgreSQL as an HA resource. +</longdesc> +<shortdesc lang="en">Manages a PostgreSQL database instance</shortdesc> + +<parameters> +<parameter name="pgctl" unique="0" required="0"> +<longdesc lang="en"> +Path to pg_ctl command. +</longdesc> +<shortdesc lang="en">pgctl</shortdesc> +<content type="string" default="${OCF_RESKEY_pgctl_default}" /> +</parameter> + +<parameter name="start_opt" unique="0" required="0"> +<longdesc lang="en"> +Start options (-o start_opt in pg_ctl). "-i -p 5432" for example. +</longdesc> +<shortdesc lang="en">start_opt</shortdesc> +<content type="string" default="${OCF_RESKEY_start_opt_default}" /> + +</parameter> +<parameter name="ctl_opt" unique="0" required="0"> +<longdesc lang="en"> +Additional pg_ctl options (-w, -W etc..). +</longdesc> +<shortdesc lang="en">ctl_opt</shortdesc> +<content type="string" default="${OCF_RESKEY_ctl_opt_default}" /> +</parameter> + +<parameter name="psql" unique="0" required="0"> +<longdesc lang="en"> +Path to psql command. +</longdesc> +<shortdesc lang="en">psql</shortdesc> +<content type="string" default="${OCF_RESKEY_psql_default}" /> +</parameter> + +<parameter name="pgdata" unique="0" required="0"> +<longdesc lang="en"> +Path to PostgreSQL data directory. 
+</longdesc> +<shortdesc lang="en">pgdata</shortdesc> +<content type="string" default="${OCF_RESKEY_pgdata_default}" /> +</parameter> + +<parameter name="pgdba" unique="0" required="0"> +<longdesc lang="en"> +User that owns PostgreSQL. +</longdesc> +<shortdesc lang="en">pgdba</shortdesc> +<content type="string" default="${OCF_RESKEY_pgdba_default}" /> +</parameter> + +<parameter name="pghost" unique="0" required="0"> +<longdesc lang="en"> +Hostname/IP address where PostgreSQL is listening +</longdesc> +<shortdesc lang="en">pghost</shortdesc> +<content type="string" default="${OCF_RESKEY_pghost_default}" /> +</parameter> + +<parameter name="pgport" unique="0" required="0"> +<longdesc lang="en"> +Port where PostgreSQL is listening +</longdesc> +<shortdesc lang="en">pgport</shortdesc> +<content type="integer" default="${OCF_RESKEY_pgport_default}" /> +</parameter> + +<parameter name="pglibs" unique="0" required="0"> +<longdesc lang="en"> +Custom location of the Postgres libraries. If not set, the standard location +will be used. +</longdesc> +<shortdesc lang="en">pglibs</shortdesc> +<content type="string" default="${OCF_RESKEY_pglibs_default}" /> +</parameter> + +<parameter name="monitor_user" unique="0" required="0"> +<longdesc lang="en"> +PostgreSQL user that pgsql RA will user for monitor operations. If it's not set +pgdba user will be used. +</longdesc> +<shortdesc lang="en">monitor_user</shortdesc> +<content type="string" default="${OCF_RESKEY_monitor_user_default}" /> +</parameter> + +<parameter name="monitor_password" unique="0" required="0"> +<longdesc lang="en"> +Password for monitor user. +</longdesc> +<shortdesc lang="en">monitor_password</shortdesc> +<content type="string" default="${OCF_RESKEY_monitor_password_default}" /> +</parameter> + +<parameter name="monitor_sql" unique="0" required="0"> +<longdesc lang="en"> +SQL script that will be used for monitor operations. 
+</longdesc> +<shortdesc lang="en">monitor_sql</shortdesc> +<content type="string" default="${OCF_RESKEY_monitor_sql_default}" /> +</parameter> + +<parameter name="config" unique="0" required="0"> +<longdesc lang="en"> +Path to the PostgreSQL configuration file for the instance. +</longdesc> +<shortdesc lang="en">Configuration file</shortdesc> +<content type="string" default="${OCF_RESKEY_pgdata}/postgresql.conf" /> +</parameter> + +<parameter name="pgdb" unique="0" required="0"> +<longdesc lang="en"> +Database that will be used for monitoring. +</longdesc> +<shortdesc lang="en">pgdb</shortdesc> +<content type="string" default="${OCF_RESKEY_pgdb_default}" /> +</parameter> + +<parameter name="logfile" unique="0" required="0"> +<longdesc lang="en"> +Path to PostgreSQL server log output file. +</longdesc> +<shortdesc lang="en">logfile</shortdesc> +<content type="string" default="${OCF_RESKEY_logfile_default}" /> +</parameter> + +<parameter name="socketdir" unique="0" required="0"> +<longdesc lang="en"> +Unix socket directory for PostgreSQL. + +If you use PostgreSQL 9.3 or higher and define unix_socket_directories in the postgresql.conf, then you must set socketdir to determine which directory is used for psql command. +</longdesc> +<shortdesc lang="en">socketdir</shortdesc> +<content type="string" default="${OCF_RESKEY_socketdir_default}" /> +</parameter> + +<parameter name="stop_escalate" unique="0" required="0"> +<longdesc lang="en"> +Number of seconds to wait for stop (using -m fast) before resorting to -m immediate +</longdesc> +<shortdesc lang="en">stop escalation</shortdesc> +<content type="integer" default="${OCF_RESKEY_stop_escalate_default}" /> +</parameter> + +<parameter name="rep_mode" unique="0" required="0"> +<longdesc lang="en"> +Replication mode may be set to "async" or "sync" or "slave". +They require PostgreSQL 9.1 or later. 
+Once set, "async" and "sync" require node_list, master_ip, and +restore_command parameters,as well as configuring PostgreSQL +for replication (in postgresql.conf and pg_hba.conf). + +"slave" means that RA only makes recovery.conf before starting +to connect to primary which is running somewhere. +It doesn't need master/slave setting. +It requires master_ip restore_command parameters. +</longdesc> +<shortdesc lang="en">rep_mode</shortdesc> +<content type="string" default="${OCF_RESKEY_rep_mode_default}" /> +</parameter> + +<parameter name="node_list" unique="0" required="0"> +<longdesc lang="en"> +All node names. Please separate each node name with a space. +This is optional for replication. Defaults to all nodes in the cluster +</longdesc> +<shortdesc lang="en">node list</shortdesc> +<content type="string" default="${OCF_RESKEY_node_list_default}" /> +</parameter> + +<parameter name="restore_command" unique="0" required="0"> +<longdesc lang="en"> +restore_command for recovery.conf. +This is required for replication. +</longdesc> +<shortdesc lang="en">restore_command</shortdesc> +<content type="string" default="${OCF_RESKEY_restore_command_default}" /> +</parameter> + +<parameter name="archive_cleanup_command" unique="0" required="0"> +<longdesc lang="en"> +archive_cleanup_command for recovery.conf. +This is used for replication and is optional. +</longdesc> +<shortdesc lang="en">archive_cleanup_command</shortdesc> +<content type="string" default="${OCF_RESKEY_archive_cleanup_command_default}" /> +</parameter> + +<parameter name="recovery_end_command" unique="0" required="0"> +<longdesc lang="en"> +recovery_end_command for recovery.conf. +This is used for replication and is optional. 
+</longdesc> +<shortdesc lang="en">recovery_end_command</shortdesc> +<content type="string" default="${OCF_RESKEY_recovery_end_command_default}" /> +</parameter> + +<parameter name="master_ip" unique="0" required="0"> +<longdesc lang="en"> +Master's floating IP address to be connected from hot standby. +This parameter is used for "primary_conninfo" in recovery.conf. +This is required for replication. +</longdesc> +<shortdesc lang="en">master ip</shortdesc> +<content type="string" default="${OCF_RESKEY_master_ip_default}" /> +</parameter> + +<parameter name="repuser" unique="0" required="0"> +<longdesc lang="en"> +User used to connect to the master server. +This parameter is used for "primary_conninfo" in recovery.conf. +This is required for replication. +</longdesc> +<shortdesc lang="en">repuser</shortdesc> +<content type="string" default="${OCF_RESKEY_repuser_default}" /> +</parameter> + +<parameter name="primary_conninfo_opt" unique="0" required="0"> +<longdesc lang="en"> +primary_conninfo options of recovery.conf except host, port, user and application_name. +This is optional for replication. +</longdesc> +<shortdesc lang="en">primary_conninfo_opt</shortdesc> +<content type="string" default="${OCF_RESKEY_primary_conninfo_opt_default}" /> +</parameter> + +<parameter name="restart_on_promote" unique="0" required="0"> +<longdesc lang="en"> +If this is true, RA deletes recovery.conf and restarts PostgreSQL +on promote to keep Timeline ID. It probably makes fail-over slower. +It's recommended to set on-fail of promote up as fence. +This is optional for replication. +</longdesc> +<shortdesc lang="en">restart_on_promote</shortdesc> +<content type="boolean" default="${OCF_RESKEY_restart_on_promote_default}" /> +</parameter> + +<parameter name="replication_slot_name" unique="0" required="0"> +<longdesc lang="en"> +Set this option when using replication slots. +Can only use lower case letters, numbers and underscore for replication_slot_name. 
+ +The replication slots would be created for each node, with the name adding the node name as postfix. +For example, replication_slot_name is "sample" and 2 slaves which are "node1" and "node2" connect to +their slots, the slots names are "sample_node1" and "sample_node2". +If the node name contains a upper case letter, hyphen and dot, those characters will be converted to a lower case letter or an underscore. +For example, Node-1.example.com to node_1_example_com. + +pgsql RA doesn't monitor and delete the replication slot. +When the slave node has been disconnected in failure or the like, execute one of the following manually. +Otherwise it may eventually cause a disk full because the master node will continue to accumulate the unsent WAL. +1. recover and reconnect the slave node to the master node as soon as possible. +2. delete the slot on the master node by following psql command. +$ select pg_drop_replication_slot('replication_slot_name'); +</longdesc> +<shortdesc lang="en">replication_slot_name</shortdesc> +<content type="string" default="${OCF_RESKEY_replication_slot_name_default}" /> +</parameter> + +<parameter name="tmpdir" unique="0" required="0"> +<longdesc lang="en"> +Path to temporary directory. +This is optional for replication. +</longdesc> +<shortdesc lang="en">tmpdir</shortdesc> +<content type="string" default="${OCF_RESKEY_tmpdir_default}" /> +</parameter> + +<parameter name="xlog_check_count" unique="0" required="0"> +<longdesc lang="en"> +Number of checks of xlog on monitor before promote. +This is optional for replication. + +Note: For backward compatibility, the terms are unified with PostgreSQL 9. + If you are using PostgreSQL 10 or later, replace "xlog" with "wal". + Likewise, replacing "location" with "lsn". 
+</longdesc> +<shortdesc lang="en">xlog check count</shortdesc> +<content type="integer" default="${OCF_RESKEY_xlog_check_count_default}" /> +</parameter> + +<parameter name="crm_attr_timeout" unique="0" required="0"> +<longdesc lang="en"> +The timeout of crm_attribute forever update command. +Default value is 5 seconds. +This is optional for replication. +</longdesc> +<shortdesc lang="en">The timeout of crm_attribute forever update command.</shortdesc> +<content type="integer" default="${OCF_RESKEY_crm_attr_timeout_default}" /> +</parameter> + +<parameter name="stop_escalate_in_slave" unique="0" required="0"> +<longdesc lang="en"> +Number of seconds to wait for stop (using -m fast) before resorting to -m immediate +in slave state. +This is optional for replication. +</longdesc> +<shortdesc lang="en">stop escalation_in_slave</shortdesc> +<content type="integer" default="${OCF_RESKEY_stop_escalate_in_slave_default}" /> +</parameter> + +<parameter name="check_wal_receiver" unique="0" required="0"> +<longdesc lang="en"> +If this is true, RA checks wal_receiver process on monitor +and notifies its status using "(resource name)-receiver-status" attribute. +It's useful for checking whether PostgreSQL (hot standby) connects to primary. +The attribute shows status as "normal" or "normal (master)" or "ERROR". +Note that if you configure PostgreSQL as master/slave resource, then +wal receiver is not running in the master and the attribute shows status as +"normal (master)" consistently because it is normal status. 
+</longdesc> +<shortdesc lang="en">check_wal_receiver</shortdesc> +<content type="boolean" default="${OCF_RESKEY_check_wal_receiver_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="120s" /> +<action name="stop" timeout="120s" /> +<action name="status" timeout="60s" /> +<action name="monitor" depth="0" timeout="30s" interval="30s"/> +<action name="monitor" depth="0" timeout="30s" interval="29s" role="Promoted" /> +<action name="promote" timeout="120s" /> +<action name="demote" timeout="120s" /> +<action name="notify" timeout="90s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s" /> +<action name="methods" timeout="5s" /> +</actions> +</resource-agent> +EOF +} + + +# +# Run the given command in the Resource owner environment... +# +# Up to two leading options are consumed, in either order: "-q" (passed on to +# ocf_run) and a log level of info|warn|err (default err, passed as -<level>). +# The remaining arguments are run via $SU as $OCF_RESKEY_pgdba after changing +# directory to $OCF_RESKEY_pgdata. +# +runasowner() { + local quietrun="" + local loglevel="-err" + local var + + for var in 1 2 + do + case "$1" in + "-q") + quietrun="-q" + shift 1;; + "info"|"warn"|"err") + loglevel="-$1" + shift 1;; + *) + ;; + esac + done + + ocf_run $quietrun $loglevel $SU $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; $*" +} + +# +# Shell escape +# +# Replaces every single quote in the arguments with '\'' so the result can be +# embedded inside a single-quoted shell string. +# +escape_string() { + echo "$*" | sed -e "s/'/'\\\\''/g" +} + + +# +# methods: What methods/operations do we support? +# + +pgsql_methods() { + cat <<EOF + start + stop + status + monitor + promote + demote + notify + methods + meta-data + validate-all +EOF +} + + +# Execute SQL and return the result. +# $1 is the SQL text. It is run with $OCF_RESKEY_psql (-Atc) as +# $OCF_RESKEY_pgdba via $SU; the captured output is echoed unquoted, so +# whitespace in it is subject to word splitting, and the exit status of the +# $SU command is returned. +exec_sql() { + local sql="$1" + local output + local rc + + output=`$SU $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; \ + $OCF_RESKEY_psql $psql_options -U $OCF_RESKEY_pgdba \ + -Atc \"$sql\""` + rc=$? + + echo $output + return $rc +} + + +#pgsql_real_start: Starts PostgreSQL +pgsql_real_start() { + local pgctl_options + local postgres_options + local rc + + pgsql_real_monitor info + rc=$? + if [ $rc -eq $OCF_SUCCESS -o $rc -eq $OCF_RUNNING_MASTER ]; then + ocf_log info "PostgreSQL is already running.
PID=`cat $PIDFILE`" + if is_replication; then + return $OCF_ERR_GENERIC + else + return $OCF_SUCCESS + fi + fi + + # Remove postmaster.pid if it exists + rm -f $PIDFILE + + # Remove backup_label if it exists + if [ -f $BACKUPLABEL ] && ! is_replication; then + ocf_log info "Removing $BACKUPLABEL. The previous backup might have failed." + rm -f $BACKUPLABEL + fi + + # Check if we need to create a log file + if ! check_log_file $OCF_RESKEY_logfile + then + ocf_exit_reason "PostgreSQL can't write to the log file: $OCF_RESKEY_logfile" + return $OCF_ERR_PERM + fi + + # Check socket directory + if [ -n "$OCF_RESKEY_socketdir" ] + then + check_socket_dir + fi + + check_stat_temp_directory + + if [ "$OCF_RESKEY_rep_mode" = "slave" ]; then + rm -f $RECOVERY_CONF + make_recovery_conf || return $OCF_ERR_GENERIC + fi + + # Set options passed to pg_ctl + pgctl_options="$OCF_RESKEY_ctl_opt -D $OCF_RESKEY_pgdata -l $OCF_RESKEY_logfile" + + # Set options passed to the PostgreSQL server process + postgres_options="-c config_file=${OCF_RESKEY_config}" + + if [ -n "$OCF_RESKEY_pghost" ]; then + postgres_options="$postgres_options -h $OCF_RESKEY_pghost" + fi + if [ -n "$OCF_RESKEY_start_opt" ]; then + postgres_options="$postgres_options $OCF_RESKEY_start_opt" + fi + + # Tack pass-through options onto pg_ctl options + pgctl_options="$pgctl_options -o '$postgres_options'" + + # Invoke pg_ctl + runasowner "unset PGUSER; unset PGPASSWORD; $OCF_RESKEY_pgctl $pgctl_options -W start" + + if [ $? -eq 0 ]; then + # Probably started..... + ocf_log info "PostgreSQL start command sent." + else + ocf_exit_reason "Can't start PostgreSQL." + return $OCF_ERR_GENERIC + fi + + while : + do + pgsql_real_monitor warn + rc=$? + if [ $rc -eq $OCF_SUCCESS -o $rc -eq $OCF_RUNNING_MASTER ]; then + break; + fi + sleep 1 + ocf_log debug "PostgreSQL still hasn't started yet. Waiting..." + done + + # delete replication slots on all nodes. On master node will be created during promotion. 
+ if use_replication_slot; then + delete_replication_slots + if [ $? -eq $OCF_ERR_GENERIC ]; then + ocf_exit_reason "PostgreSQL can't clean up replication_slot." + return $OCF_ERR_GENERIC + fi + fi + + ocf_log info "PostgreSQL is started." + return $rc +} + +pgsql_replication_start() { + local rc + local synchronous_standby_names + + # initializing for replication + change_pgsql_status "$NODENAME" "STOP" + delete_master_baseline + exec_with_retry 0 ocf_promotion_score -v $CAN_NOT_PROMOTE + rm -f ${XLOG_NOTE_FILE}.* $REP_MODE_CONF $RECOVERY_CONF + if ! make_recovery_conf || ! delete_xlog_location || ! set_async_mode_all; then + return $OCF_ERR_GENERIC + fi + + if [ -f $PGSQL_LOCK ]; then + ocf_exit_reason "My data may be inconsistent. You have to remove $PGSQL_LOCK file to force start." + return $OCF_ERR_GENERIC + fi + + # start + pgsql_real_start + if [ $? -ne $OCF_SUCCESS ]; then + return $OCF_ERR_GENERIC + fi + + synchronous_standby_names=$(exec_sql "${CHECK_SYNCHRONOUS_STANDBY_NAMES_SQL}") + if [ -n "${synchronous_standby_names}" ]; then + ocf_exit_reason "Invalid synchronous_standby_names is set in postgresql.conf." + return $OCF_ERR_CONFIGURED + fi + + change_pgsql_status "$NODENAME" "HS:alone" + return $OCF_SUCCESS +} + +#pgsql_start: pgsql_real_start() wrapper for replication +pgsql_start() { + if ! is_replication; then + pgsql_real_start + return $? + else + pgsql_replication_start + return $? + fi +} + +#pgsql_promote: Promote PostgreSQL +pgsql_promote() { + local output + local target + local rc + + if ! is_replication; then + ocf_exit_reason "Not in a replication mode." + return $OCF_ERR_CONFIGURED + fi + + output=`exec_sql "${CHECK_MS_SQL}"` + if [ $? -ne 0 ]; then + report_psql_error $rc $loglevel "Can't get PostgreSQL recovery status on promote." + return $OCF_ERR_GENERIC + fi + + if [ "$output" = "f" ]; then + ocf_log info "PostgreSQL is already Master. Don't execute promote." 
+ return $OCF_SUCCESS + fi + + rm -f ${XLOG_NOTE_FILE}.* + + for target in $NODE_LIST; do + [ "$target" = "$NODENAME" ] && continue + change_data_status "$target" "DISCONNECT" + change_master_score "$target" "$CAN_NOT_PROMOTE" + done + + ocf_log info "Creating $PGSQL_LOCK." + touch $PGSQL_LOCK + show_master_baseline + + if ocf_is_true ${OCF_RESKEY_restart_on_promote}; then + ocf_log info "Restarting PostgreSQL instead of promote." + #stop : this function returns $OCF_SUCCESS only. + pgsql_real_stop slave + if "${USE_STANDBY_SIGNAL}"; then + rm -f ${OCF_RESKEY_pgdata}/standby.signal + else + rm -f $RECOVERY_CONF + fi + pgsql_real_start + rc=$? + if [ $rc -ne $OCF_RUNNING_MASTER ]; then + ocf_exit_reason "Can't start PostgreSQL as primary on promote." + if [ $rc -ne $OCF_SUCCESS ]; then + change_pgsql_status "$NODENAME" "STOP" + fi + return $OCF_ERR_GENERIC + fi + else + runasowner "$OCF_RESKEY_pgctl -D $OCF_RESKEY_pgdata -W promote" + if [ $? -eq 0 ]; then + ocf_log info "PostgreSQL promote command sent." + else + ocf_exit_reason "Can't promote PostgreSQL." + return $OCF_ERR_GENERIC + fi + + while : + do + pgsql_real_monitor warn + rc=$? + if [ $rc -eq $OCF_RUNNING_MASTER ]; then + break; + elif [ $rc -eq $OCF_ERR_GENERIC ]; then + ocf_exit_reason "Can't promote PostgreSQL." + return $rc + fi + sleep 1 + ocf_log debug "PostgreSQL still hasn't promoted yet. Waiting..." + done + ocf_log info "PostgreSQL is promoted." + fi + + # create replication slots on master after promotion + if use_replication_slot; then + create_replication_slots + if [ $? -eq $OCF_ERR_GENERIC ]; then + ocf_exit_reason "PostgreSQL can't create replication_slot." + return $OCF_ERR_GENERIC + fi + fi + + change_data_status "$NODENAME" "LATEST" + exec_with_retry 0 ocf_promotion_score -v $PROMOTE_ME + change_pgsql_status "$NODENAME" "PRI" + return $OCF_SUCCESS +} + +#pgsql_demote: Demote PostgreSQL +pgsql_demote() { + local rc + + if ! 
is_replication; then + ocf_exit_reason "Not in a replication mode." + return $OCF_ERR_CONFIGURED + fi + + exec_with_retry 0 ocf_promotion_score -v $CAN_NOT_PROMOTE + delete_master_baseline + + if ! pgsql_status; then + ocf_log info "PostgreSQL is already stopped on demote." + else + ocf_log info "Stopping PostgreSQL on demote." + pgsql_real_stop master + rc=$? + if [ "$rc" -ne "$OCF_SUCCESS" ]; then + change_pgsql_status "$NODENAME" "UNKNOWN" + return $rc + fi + fi + change_pgsql_status "$NODENAME" "STOP" + return $OCF_SUCCESS +} + +#pgsql_real_stop: Stop PostgreSQL +pgsql_real_stop() { + local rc + local count + local stop_escalate + + if ocf_is_true ${OCF_RESKEY_check_wal_receiver}; then + attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -D + fi + + if ! pgsql_status + then + #Already stopped + return $OCF_SUCCESS + fi + + stop_escalate=$OCF_RESKEY_stop_escalate + if [ "$1" = "slave" ]; then + stop_escalate="$OCF_RESKEY_stop_escalate_in_slave" + fi + # adjust stop_escalate time when it is longer than the timeout + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ] && \ + [ "$stop_escalate" -ge $((OCF_RESKEY_CRM_meta_timeout/1000)) ]; then + stop_escalate=$(((OCF_RESKEY_CRM_meta_timeout/1000) - 10)) + ocf_log info "stop_escalate(or stop_escalate_in_slave) time is adjusted to ${stop_escalate} based on the configured timeout." + fi + + # Stop PostgreSQL, do not wait for clients to disconnect + if [ $stop_escalate -gt 0 ]; then + runasowner "$OCF_RESKEY_pgctl -W -D $OCF_RESKEY_pgdata stop -m fast" + fi + + # stop waiting + count=0 + while [ $count -lt $stop_escalate ] + do + if ! pgsql_status + then + #PostgreSQL stopped + break; + fi + count=`expr $count + 1` + sleep 1 + done + + if pgsql_status + then + #PostgreSQL is still up. Use another shutdown mode. + ocf_log info "PostgreSQL failed to stop after ${stop_escalate}s using -m fast. Trying -m immediate..." 
+ runasowner "$OCF_RESKEY_pgctl -W -D $OCF_RESKEY_pgdata stop -m immediate" + fi + + while : + do + pgsql_real_monitor + rc=$? + if [ $rc -eq $OCF_NOT_RUNNING ]; then + # An unnecessary debug log is prevented. + break; + fi + sleep 1 + ocf_log debug "PostgreSQL still hasn't stopped yet. Waiting..." + done + + # Remove postmaster.pid if it exists + rm -f $PIDFILE + + if [ "$1" = "master" -a "$OCF_RESKEY_CRM_meta_notify_slave_uname" = " " ]; then + ocf_log info "Removing $PGSQL_LOCK." + rm -f $PGSQL_LOCK + fi + return $OCF_SUCCESS +} + +pgsql_replication_stop() { + local rc + + exec_with_retry 5 ocf_promotion_score -v $CAN_NOT_PROMOTE + delete_xlog_location + + if ! pgsql_status + then + ocf_log info "PostgreSQL is already stopped." + change_pgsql_status "$NODENAME" "STOP" + return $OCF_SUCCESS + fi + + pgsql_real_stop slave + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + change_pgsql_status "$NODENAME" "UNKNOWN" + return $rc + fi + + change_pgsql_status "$NODENAME" "STOP" + set_async_mode_all + delete_master_baseline + return $OCF_SUCCESS +} + +#pgsql_stop: pgsql_real_stop() wrapper for replication +pgsql_stop() { + if ! is_replication; then + pgsql_real_stop + return $? + else + pgsql_replication_stop + return $? + fi +} + +# +# pgsql_status: is PostgreSQL up? +# + +pgsql_status() { + if [ -f $PIDFILE ] + then + PID=`head -n 1 $PIDFILE` + runasowner "kill -s 0 $PID >/dev/null 2>&1" + return $? 
+ fi + + # No PID file + false +} + +pgsql_wal_receiver_status() { + local PID + local receiver_parent_pids + local pgsql_real_monitor_status=$1 + + PID=`head -n 1 $PIDFILE` + receiver_parent_pids=`ps -ef | tr -s " " | grep "[w]al\s*receiver" | cut -d " " -f 3` + + if echo "$receiver_parent_pids" | grep -q -w "$PID" ; then + attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "normal" + return 0 + fi + + if [ $pgsql_real_monitor_status -eq "$OCF_RUNNING_MASTER" ]; then + attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "normal (master)" + return 0 + fi + + attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "ERROR" + ocf_log warn "wal receiver process is not running" + return 1 +} + +# +# pgsql_real_monitor +# +# $1 (optional, default "err") is the log level used when reporting a psql +# failure. Returns $OCF_NOT_RUNNING when pgsql_status fails, +# $OCF_ERR_GENERIC when a psql call fails or the recovery status is +# unexpected, and otherwise $OCF_SUCCESS or $OCF_RUNNING_MASTER. +# + +pgsql_real_monitor() { + local loglevel + local rc + local output + local monitor_sql + + # Set the log level of the error message + loglevel=${1:-err} + + if ! pgsql_status + then + ocf_log info "PostgreSQL is down" + return $OCF_NOT_RUNNING + fi + + if is_replication; then + #Check replication state + output=`exec_sql "${CHECK_MS_SQL}"` + rc=$? + + if [ $rc -ne 0 ]; then + report_psql_error $rc $loglevel "Can't get PostgreSQL recovery status." + return $OCF_ERR_GENERIC + fi + + case "$output" in + f) ocf_log debug "PostgreSQL is running as a primary." + if [ "$OCF_RESKEY_monitor_sql" = "$OCF_RESKEY_monitor_sql_default" ]; then + if ocf_is_probe; then + # Set initial score for primary. + exec_with_retry 0 ocf_promotion_score -v $PROMOTE_ME + fi + return $OCF_RUNNING_MASTER + fi + ;; + + t) ocf_log debug "PostgreSQL is running as a hot standby." + if ocf_is_probe; then + # Set initial score for hot standby. + exec_with_retry 0 ocf_promotion_score -v $CAN_NOT_PROMOTE + fi + return $OCF_SUCCESS;; + + *) ocf_exit_reason "$CHECK_MS_SQL output is $output" + return $OCF_ERR_GENERIC;; + esac + fi + + # Escape into a local copy: this function is called repeatedly (start and + # promote poll it in a loop), and escaping the parameter in place would + # escape an already-escaped value again on every later call. + monitor_sql=`escape_string "$OCF_RESKEY_monitor_sql"` + runasowner -q $loglevel "$OCF_RESKEY_psql $psql_options \ + -c '$monitor_sql'" + rc=$? 
+ if [ $rc -ne 0 ]; then + report_psql_error $rc $loglevel "PostgreSQL $OCF_RESKEY_pgdb isn't running." + return $OCF_ERR_GENERIC + fi + + if is_replication; then + return $OCF_RUNNING_MASTER + fi + return $OCF_SUCCESS +} + +pgsql_replication_monitor() { + local rc + + rc=$1 + if [ $rc -ne $OCF_SUCCESS -a $rc -ne "$OCF_RUNNING_MASTER" ]; then + return $rc + fi + # If I am Master + if [ $rc -eq $OCF_RUNNING_MASTER ]; then + change_data_status "$NODENAME" "LATEST" + change_pgsql_status "$NODENAME" "PRI" + control_slave_status || return $OCF_ERR_GENERIC + if [ "$RE_CONTROL_SLAVE" = "true" ]; then + sleep 2 + ocf_log info "re-controlling slave status." + RE_CONTROL_SLAVE="none" + control_slave_status || return $OCF_ERR_GENERIC + fi + return $rc + fi + + # I can't get master node name from $OCF_RESKEY_CRM_meta_notify_master_uname on monitor, + # so I will get master node name using crm_mon -n + print_crm_mon | grep -q -i -E "<resource id=\"${RESOURCE_NAME}\" .* role=\"(Promoted|Master)\"" + if [ $? -ne 0 ] ; then + # If I am Slave and Master is not exist + ocf_log info "Master does not exist." + change_pgsql_status "$NODENAME" "HS:alone" + have_master_right + if [ $? -eq 0 ]; then + rm -f ${XLOG_NOTE_FILE}.* + fi + else + output=`exec_with_retry 0 $CRM_ATTR_FOREVER -N "$NODENAME" \ + -n "$PGSQL_DATA_STATUS_ATTR" -G -q` + if [ "$output" = "DISCONNECT" ]; then + change_pgsql_status "$NODENAME" "HS:alone" + fi + fi + return $rc +} + +#pgsql_monitor: pgsql_real_monitor() wrapper for replication +pgsql_monitor() { + local rc + + pgsql_real_monitor + rc=$? + + if ocf_is_true ${OCF_RESKEY_check_wal_receiver}; then + pgsql_wal_receiver_status $rc + fi + + if ! is_replication; then + return $rc + else + pgsql_replication_monitor $rc + return $? + fi +} + +# pgsql_post_demote +pgsql_post_demote() { + DEMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname | sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` + ocf_log debug "post-demote called. 
Demote uname is $DEMOTE_NODE" + if [ "$DEMOTE_NODE" != "$NODENAME" ]; then + if ! echo $OCF_RESKEY_CRM_meta_notify_master_uname | tr '[A-Z]' '[a-z]' | grep $NODENAME; then + show_master_baseline + change_pgsql_status "$NODENAME" "HS:alone" + fi + fi + return $OCF_SUCCESS +} + +pgsql_pre_promote() { + local master_baseline + local my_master_baseline + local cmp_location + local number_of_nodes + + # If my data is newer than new master's one, I fail my resource. + PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ + sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` + number_of_nodes=`echo $NODE_LIST | wc -w` + if [ $number_of_nodes -ge 3 -a \ + "$OCF_RESKEY_rep_mode" = "sync" -a \ + "$PROMOTE_NODE" != "$NODENAME" ]; then + master_baseline=`$CRM_ATTR_REBOOT -N "$PROMOTE_NODE" -n \ + "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` + if [ $? -eq 0 ]; then + my_master_baseline=`$CRM_ATTR_REBOOT -N "$NODENAME" -n \ + "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` + # get older location + cmp_location=`printf "$master_baseline\n$my_master_baseline\n" |\ + sort | head -1` + if [ "$cmp_location" != "$my_master_baseline" ]; then + # We used to set the failcount to INF for the resource here in + # order to move the master to the other node. However, setting + # the failcount should be done only by the CRM and so this use + # got deprecated in pacemaker version 1.1.17. Now we do the + # "ban resource from the node". + ocf_exit_reason "My data is newer than new master's one. New master's location : $master_baseline" + exec_with_retry 0 $CRM_RESOURCE -B -r $OCF_RESOURCE_INSTANCE -N $NODENAME -Q + return $OCF_ERR_GENERIC + fi + fi + fi + return $OCF_SUCCESS +} + +pgsql_notify() { + local type="${OCF_RESKEY_CRM_meta_notify_type}" + local op="${OCF_RESKEY_CRM_meta_notify_operation}" + local rc + + if ! is_replication; then + return $OCF_SUCCESS + fi + + ocf_log debug "notify: ${type} for ${op}" + case $type in + pre) + case $op in + promote) + pgsql_pre_promote + return $? 
+ ;; + esac + ;; + post) + case $op in + promote) + delete_xlog_location + PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ + sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` + if [ "$PROMOTE_NODE" != "$NODENAME" ]; then + delete_master_baseline + fi + return $OCF_SUCCESS + ;; + demote) + pgsql_post_demote + return $? + ;; + start|stop) + MASTER_NODE=`echo $OCF_RESKEY_CRM_meta_notify_master_uname | \ + sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` + if [ "$NODENAME" = "$MASTER_NODE" ]; then + control_slave_status + fi + return $OCF_SUCCESS + ;; + esac + ;; + esac + return $OCF_SUCCESS +} + +control_slave_status() { + local rc + local data_status + local target + local all_data_status + local tmp_data_status + local number_of_nodes + + all_data_status=`exec_sql "${CHECK_REPLICATION_STATE_SQL}"` + rc=$? + + if [ $rc -eq 0 ]; then + if [ -n "$all_data_status" ]; then + all_data_status=`echo $all_data_status | sed "s/\n/ /g"` + fi + else + report_psql_error $rc err "Can't get PostgreSQL replication status." + return 1 + fi + + number_of_nodes=`echo $NODE_LIST | wc -w` + for target in $NODE_LIST; do + if [ "$target" = "$NODENAME" ]; then + continue + fi + + data_status="DISCONNECT" + if [ -n "$all_data_status" ]; then + for tmp_data_status in $all_data_status; do + if ! 
echo $tmp_data_status | grep -q "^${target}|"; then + continue + fi + data_status=`echo $tmp_data_status | cut -d "|" -f 2,3` + ocf_log debug "node_name and data_status is $tmp_data_status" + break + done + fi + + case "$data_status" in + "STREAMING|SYNC") + change_data_status "$target" "$data_status" + change_master_score "$target" "$CAN_PROMOTE" + change_pgsql_status "$target" "HS:sync" + ;; + "STREAMING|ASYNC") + change_data_status "$target" "$data_status" + if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then + change_master_score "$target" "$CAN_NOT_PROMOTE" + set_sync_mode "$target" + else + if [ $number_of_nodes -le 2 ]; then + change_master_score "$target" "$CAN_PROMOTE" + else + # I can't determine which slave's data is newest in async mode. + change_master_score "$target" "$CAN_NOT_PROMOTE" + fi + fi + change_pgsql_status "$target" "HS:async" + ;; + "STREAMING|POTENTIAL") + change_data_status "$target" "$data_status" + change_master_score "$target" "$CAN_NOT_PROMOTE" + change_pgsql_status "$target" "HS:potential" + ;; + "DISCONNECT") + change_data_status "$target" "$data_status" + change_master_score "$target" "$CAN_NOT_PROMOTE" + if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then + set_async_mode "$target" + fi + ;; + *) + change_data_status "$target" "$data_status" + change_master_score "$target" "$CAN_NOT_PROMOTE" + if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then + set_async_mode "$target" + fi + change_pgsql_status "$target" "HS:connected" + ;; + esac + done + return 0 +} + +have_master_right() { + local old + local new + local output + local data_status + local node + local mylocation + local count + local newestXlog + local oldfile + local newfile + + ocf_log debug "Checking if I have a master right." 
+ + data_status=`$CRM_ATTR_FOREVER -N "$NODENAME" -n \ + "$PGSQL_DATA_STATUS_ATTR" -G -q 2>/dev/null` + if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then + if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \ + "$data_status" != "LATEST" ]; then + ocf_log warn "My data is out-of-date. status=$data_status" + return 1 + fi + else + if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \ + "$data_status" != "STREAMING|ASYNC" -a \ + "$data_status" != "LATEST" ]; then + ocf_log warn "My data is out-of-date. status=$data_status" + return 1 + fi + fi + ocf_log info "My data status=$data_status." + + show_xlog_location + if [ $? -ne 0 ]; then + ocf_exit_reason "Failed to show my xlog location." + exit $OCF_ERR_GENERIC + fi + + old=0 + for count in `seq $OCF_RESKEY_xlog_check_count`; do + if [ -f ${XLOG_NOTE_FILE}.$count ]; then + old=$count + continue + fi + break + done + new=`expr $old + 1` + + # get xlog locations of all nodes + for node in ${NODE_LIST}; do + output=`$CRM_ATTR_REBOOT -N "$node" -n \ + "$PGSQL_XLOG_LOC_NAME" -G -q 2>/dev/null` + if [ $? -ne 0 ]; then + ocf_log warn "Can't get $node xlog location." + continue + else + ocf_log info "$node xlog location : $output" + echo "$node $output" >> ${XLOG_NOTE_FILE}.${new} + if [ "$node" = "$NODENAME" ]; then + mylocation=$output + fi + fi + done + + oldfile=`cat ${XLOG_NOTE_FILE}.${old} 2>/dev/null` + newfile=`cat ${XLOG_NOTE_FILE}.${new} 2>/dev/null` + if [ "$oldfile" != "$newfile" ]; then + # reset counter + rm -f ${XLOG_NOTE_FILE}.* + printf "$newfile\n" > ${XLOG_NOTE_FILE}.0 + return 1 + fi + + if [ "$new" -ge "$OCF_RESKEY_xlog_check_count" ]; then + newestXlog=`printf "$newfile\n" | sort -t " " -k 2,3 -r | \ + head -1 | cut -d " " -f 2` + if [ "$newestXlog" = "$mylocation" ]; then + ocf_log info "I have a master right." + exec_with_retry 5 ocf_promotion_score -v $PROMOTE_ME + return 0 + fi + change_data_status "$NODENAME" "DISCONNECT" + ocf_log info "I don't have correct master data." 
+ # reset counter + rm -f ${XLOG_NOTE_FILE}.* + printf "$newfile\n" > ${XLOG_NOTE_FILE}.0 + fi + + return 1 +} + +is_replication() { + if [ "$OCF_RESKEY_rep_mode" != "none" -a "$OCF_RESKEY_rep_mode" != "slave" ]; then + return 0 + fi + return 1 +} + +use_replication_slot() { + if [ -n "$OCF_RESKEY_replication_slot_name" ]; then + return 0 + fi + + return 1 +} + +create_replication_slot_name() { + local number_of_nodes=0 + local target + local replication_slot_name + local replication_slot_name_list_tmp + local replication_slot_name_list + + if [ -n "$NODE_LIST" ]; then + number_of_nodes=`echo $NODE_LIST | wc -w` + fi + + if [ $number_of_nodes -le 0 ]; then + replication_slot_name_list="" + + # The Master node should have some slots equal to the number of Slaves, and + # the Slave nodes connect to their dedicated slot on the Master. + # To ensuring that the slots name are each unique, add postfix to $OCF_RESKEY_replication_slot. + # The postfix is "_$target". + else + for target in $NODE_LIST + do + if [ "$target" != "$NODENAME" ]; then + # The Uppercase, "-" and "." don't allow to use in slot_name. + # If the NODENAME contains them, convert upper case to lower case and "_" and "." to "_". + target=`echo "$target" | tr 'A-Z.-' 'a-z__'` + replication_slot_name="$OCF_RESKEY_replication_slot_name"_"$target" + replication_slot_name_list_tmp="$replication_slot_name_list" + replication_slot_name_list="$replication_slot_name_list_tmp $replication_slot_name" + fi + done + fi + + echo $replication_slot_name_list +} + +delete_replication_slot(){ + DELETE_REPLICATION_SLOT_sql="SELECT pg_drop_replication_slot('$1');" + output=`exec_sql "$DELETE_REPLICATION_SLOT_sql"` + return $? +} + +delete_replication_slots() { + local replication_slot_name_list + local replication_slot_name + + replication_slot_name_list=`create_replication_slot_name` + ocf_log debug "replication slot names are $replication_slot_name_list." 
+ + for replication_slot_name in $replication_slot_name_list + do + if [ `check_replication_slot $replication_slot_name` = "1" ]; then + delete_replication_slot $replication_slot_name + if [ $? -eq 0 ]; then + ocf_log info "PostgreSQL delete the replication slot($replication_slot_name)." + else + ocf_exit_reason "$output" + return $OCF_ERR_GENERIC + fi + fi + done +} + +create_replication_slots() { + local replication_slot_name + local replication_slot_name_list + local output + local rc + local CREATE_REPLICATION_SLOT_sql + local DELETE_REPLICATION_SLOT_sql + + replication_slot_name_list=`create_replication_slot_name` + ocf_log debug "replication slot names are $replication_slot_name_list." + + for replication_slot_name in $replication_slot_name_list + do + # If the same name slot is already exists, initialize(delete and create) the slot. + if [ `check_replication_slot $replication_slot_name` = "1" ]; then + delete_replication_slot $replication_slot_name + if [ $? -eq 0 ]; then + ocf_log info "PostgreSQL delete the replication slot($replication_slot_name)." + else + ocf_exit_reason "$output" + return $OCF_ERR_GENERIC + fi + fi + + CREATE_REPLICATION_SLOT_sql="SELECT pg_create_physical_replication_slot('$replication_slot_name');" + output=`exec_sql "$CREATE_REPLICATION_SLOT_sql"` + rc=$? + + if [ $rc -eq 0 ]; then + ocf_log info "PostgreSQL creates the replication slot($replication_slot_name)." + else + ocf_exit_reason "$output" + return $OCF_ERR_GENERIC + fi + done + + return 0 +} + +# This function check the replication slot does exists. +check_replication_slot(){ + local replication_slot_name=$1 + local output + local CHECK_REPLICATION_SLOT_sql="SELECT count(*) FROM pg_replication_slots WHERE slot_name = '$replication_slot_name'" + + output=`exec_sql "$CHECK_REPLICATION_SLOT_sql"` + echo "$output" +} + +# On postgreSQL 10 or later, "location" means "lsn". 
# Print this node's newest WAL position as a 16-digit, zero-padded hex string.
# Runs $CHECK_XLOG_LOC_SQL, which yields "replay|receive" positions of the
# form "X/Y"; each half is left-padded to 8 characters so that positions can
# be compared as plain strings. The larger of replay/receive is printed.
# Returns 1 (after logging) when the SQL cannot be executed.
get_my_location() {
    local rc
    local output
    local replay_loc
    local receive_loc
    local output1
    local output2
    local log1
    local log2
    local newer_location

    output=`exec_sql "$CHECK_XLOG_LOC_SQL"`
    rc=$?

    if [ $rc -ne 0 ]; then
        report_psql_error $rc err "Can't get my xlog location."
        return 1
    fi
    replay_loc=`echo $output | cut -d "|" -f 1`
    receive_loc=`echo $output | cut -d "|" -f 2`

    # printf pads with blanks to width 8; sed turns the blanks into zeros.
    output1=`echo "$replay_loc" | cut -d "/" -f 1`
    output2=`echo "$replay_loc" | cut -d "/" -f 2`
    log1=`printf "%08s\n" $output1 | sed "s/ /0/g"`
    log2=`printf "%08s\n" $output2 | sed "s/ /0/g"`
    replay_loc="${log1}${log2}"

    output1=`echo "$receive_loc" | cut -d "/" -f 1`
    output2=`echo "$receive_loc" | cut -d "/" -f 2`
    log1=`printf "%08s\n" $output1 | sed "s/ /0/g"`
    log2=`printf "%08s\n" $output2 | sed "s/ /0/g"`
    receive_loc="${log1}${log2}"

    # Pass the values as printf arguments, never as the format string.
    newer_location=`printf '%s\n%s' "$replay_loc" "$receive_loc" | sort -r | head -1`
    echo "$newer_location"
    return 0
}

# On postgreSQL 10 or later, "xlog_location" means "wal_lsn".
# Publish this node's location in the $PGSQL_XLOG_LOC_NAME node attribute.
# Returns 1 when the location cannot be determined.
show_xlog_location() {
    local location

    location=`get_my_location` || return 1
    exec_with_retry 0 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -v "$location"
}

# On postgreSQL 10 or later, "xlog_location" means "wal_lsn".
# Delete this node's $PGSQL_XLOG_LOC_NAME attribute (up to 5 attempts).
delete_xlog_location() {
    exec_with_retry 5 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -D
}

# Record this node's current location in the $PGSQL_MASTER_BASELINE attribute.
# NOTE(review): unlike show_xlog_location, a get_my_location failure is not
# checked here; behaviour is kept as-is.
show_master_baseline() {
    local location

    location=`get_my_location`
    ocf_log info "My master baseline : $location."
    exec_with_retry 0 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -v "$location"
}

# Delete this node's $PGSQL_MASTER_BASELINE attribute (up to 5 attempts).
delete_master_baseline() {
    exec_with_retry 5 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -D
}

# In rep_mode=sync, overwrite $REP_MODE_CONF with an empty
# synchronous_standby_names. No-op (returns 0) in any other rep_mode.
# Returns 1 when the file cannot be written.
set_async_mode_all() {
    [ "$OCF_RESKEY_rep_mode" = "sync" ] || return 0
    ocf_log info "Set all nodes into async mode."
    runasowner -q err "echo \"synchronous_standby_names = ''\" > \"$REP_MODE_CONF\""
    if [ $? -ne 0 ]; then
        ocf_exit_reason "Can't set all nodes into async mode."
        return 1
    fi
    return 0
}

# arg1: node name
# If the node is named in $REP_MODE_CONF, empty synchronous_standby_names
# and reload the configuration; otherwise do nothing.
set_async_mode() {
    cat $REP_MODE_CONF | grep -q -E "(\"$1\")|([,' ]$1[,' ])"
    if [ $? -eq 0 ]; then
        ocf_log info "Setup $1 into async mode."
        runasowner -q err "echo \"synchronous_standby_names = ''\" > \"$REP_MODE_CONF\""
    else
        ocf_log debug "$1 is already in async mode."
        return 0
    fi
    exec_with_retry 0 reload_conf
}

# arg1: node name
# If $REP_MODE_CONF names no synchronous standby yet, write arg1 into it,
# flag RE_CONTROL_SLAVE and reload the configuration.
set_sync_mode() {
    local sync_node_in_conf

    sync_node_in_conf=`cat $REP_MODE_CONF | cut -d "'" -f 2`
    if [ -n "$sync_node_in_conf" ]; then
        ocf_log debug "$sync_node_in_conf is already sync mode."
    else
        ocf_log info "Setup $1 into sync mode."
        runasowner -q err "echo \"synchronous_standby_names = '\\\"$1\\\"'\" > \"$REP_MODE_CONF\""
        [ "$RE_CONTROL_SLAVE" = "false" ] && RE_CONTROL_SLAVE="true"
        exec_with_retry 0 reload_conf
    fi
}

# Ask PostgreSQL to reload its configuration via "pg_ctl reload".
# Returns 1 when pg_ctl fails.
reload_conf() {
    # Invoke pg_ctl
    runasowner "$OCF_RESKEY_pgctl -D $OCF_RESKEY_pgdata reload"
    if [ $? -eq 0 ]; then
        ocf_log info "Reload configuration file."
    else
        ocf_exit_reason "Can't reload configuration file."
        return 1
    fi

    return 0
}

# Print the optional, user-driven recovery settings on stdout.
user_recovery_conf() {
    local nodename_tmp

    # put archive_cleanup_command and recovery_end_command only when defined by user
    if [ -n "$OCF_RESKEY_archive_cleanup_command" ]; then
        echo "archive_cleanup_command = '${OCF_RESKEY_archive_cleanup_command}'"
    fi
    if [ -n "$OCF_RESKEY_recovery_end_command" ]; then
        echo "recovery_end_command = '${OCF_RESKEY_recovery_end_command}'"
    fi

    if use_replication_slot; then
        # Same character mapping as create_replication_slot_name uses.
        nodename_tmp=`echo "$NODENAME" | tr 'A-Z.-' 'a-z__'`
        echo "primary_slot_name = '${OCF_RESKEY_replication_slot_name}_$nodename_tmp'"
    fi
}

# Write $RECOVERY_CONF. When USE_STANDBY_SIGNAL is true a standby.signal file
# is created in pgdata, otherwise "standby_mode = 'on'" is appended.
# Returns 1 when a file cannot be created.
make_recovery_conf() {
    # Create the file as the database owner first so that it owns it.
    runasowner "touch $RECOVERY_CONF"
    if [ $? -ne 0 ]; then
        ocf_exit_reason "Can't create recovery.conf."
        return 1
    fi

cat > "$RECOVERY_CONF" <<END
primary_conninfo = 'host=${OCF_RESKEY_master_ip} port=${OCF_RESKEY_pgport} user=${OCF_RESKEY_repuser} application_name=${NODENAME} ${OCF_RESKEY_primary_conninfo_opt}'
restore_command = '${OCF_RESKEY_restore_command}'
recovery_target_timeline = 'latest'
END

    if "${USE_STANDBY_SIGNAL}"; then
        # create a standby.signal to start standby server.
        runasowner "touch ${OCF_RESKEY_pgdata}/standby.signal"
        if [ $? -ne 0 ]; then
            ocf_exit_reason "Can't create ${OCF_RESKEY_pgdata}/standby.signal."
            return 1
        fi
    else
cat >> "$RECOVERY_CONF" <<END
standby_mode = 'on'
END
    fi

    user_recovery_conf >> "$RECOVERY_CONF"
    ocf_log debug "Created recovery.conf. host=${OCF_RESKEY_master_ip}, user=${OCF_RESKEY_repuser}"
    return 0
}

# change pgsql-status.
# arg1:node, arg2: value
# Does nothing for offline nodes. Always returns 0.
change_pgsql_status() {
    local output

    if ! is_node_online $1; then
        return 0
    fi

    output=`$CRM_ATTR_REBOOT -N "$1" -n "$PGSQL_STATUS_ATTR" -G -q 2>/dev/null`
    if [ "$output" != "$2" ]; then
        # If slave's disk is broken, RA cannot read PID file
        # and misjudges the PostgreSQL as down while it is running.
        # It causes overwriting of pgsql-status by Master because replication is still connected.
        if [ "$output" = "STOP" -o "$output" = "UNKNOWN" ]; then
            if [ "$1" != "$NODENAME" ]; then
                ocf_log warn "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2 by $NODENAME is prohibited."
                return 0
            fi
        fi
        ocf_log info "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2."
        exec_with_retry 0 $CRM_ATTR_REBOOT -N "$1" -n "$PGSQL_STATUS_ATTR" -v "$2"
    fi
    return 0
}

# change pgsql-data-status.
# arg1:node, arg2: value
# Does nothing for nodes unknown to crm_mon. Loops until the attribute
# reads back as arg2. Always returns 0.
change_data_status() {
    local output

    if ! node_exist $1; then
        return 0
    fi

    while :
    do
        output=`$CRM_ATTR_FOREVER -N "$1" -n "$PGSQL_DATA_STATUS_ATTR" -G -q 2>/dev/null`
        if [ "$output" != "$2" ]; then
            ocf_log info "Changing $PGSQL_DATA_STATUS_ATTR on $1 : $output->$2."
            # "$CRM_ATTR_FOREVER" is passed as one word here; exec_with_retry
            # re-splits it through its unquoted $*.
            exec_with_retry 0 exec_with_timeout 0 "$CRM_ATTR_FOREVER" -N $1 -n $PGSQL_DATA_STATUS_ATTR -v "$2"
        else
            break
        fi
    done
    return 0
}

# set master-score
# arg1:node, arg2: score, arg3: resource
# Only updates a score that already exists and differs. Always returns 0.
set_master_score() {
    local current_score

    current_score=`$CRM_ATTR_REBOOT -N "$1" -n "master-$3" -G -q 2>/dev/null`
    if [ -n "$current_score" -a "$current_score" != "$2" ]; then
        ocf_log info "Changing $3 master score on $1 : $current_score->$2."
        exec_with_retry 0 $CRM_ATTR_REBOOT -N "$1" -n "master-$3" -v "$2"
    fi
    return 0
}

# change master-score
# arg1:node, arg2: score
# Does nothing for offline nodes.
change_master_score() {
    local instance

    if ! is_node_online $1; then
        return 0
    fi

    if echo $OCF_RESOURCE_INSTANCE | grep -q ":"; then
        # If Pacemaker version is 1.0.x
        # Walk every clone instance except our own.
        instance=0
        while :
        do
            if [ "$instance" -ge "$OCF_RESKEY_CRM_meta_clone_max" ]; then
                break
            fi
            if [ "${RESOURCE_NAME}:${instance}" = "$OCF_RESOURCE_INSTANCE" ]; then
                instance=`expr $instance + 1`
                continue
            fi
            set_master_score $1 $2 "${RESOURCE_NAME}:${instance}" || return 1
            instance=`expr $instance + 1`
        done
    else
        # If globally-unique=false and Pacemaker version is 1.1.8 or higher
        # Master/Slave resource has no instance number
        set_master_score $1 $2 ${RESOURCE_NAME} || return 1
    fi
    return 0
}

# Log a psql failure.
# arg1: psql exit code, arg2: log level (default: err), arg3: message
report_psql_error()
{
    local rc
    local loglevel
    local message

    rc=$1
    loglevel=${2:-err}
    message="$3"

    ocf_log $loglevel "$message rc=$rc"
    if [ $rc -eq 1 ]; then
        ocf_exit_reason "Fatal error (out of memory, file not found, etc.) occurred while executing the psql command."
    elif [ $rc -eq 2 ]; then
        ocf_log $loglevel "Connection error (connection to the server went bad and the session was not interactive) occurred while executing the psql command."
    elif [ $rc -eq 3 ]; then
        ocf_exit_reason "Script error (the variable ON_ERROR_STOP was set) occurred while executing the psql command."
    fi
}

#
# timeout management function
# arg1 timeout >= 0 (if arg1 is 0, OCF_RESKEY_crm_attr_timeout is used.)
# arg2 : command
# arg3 : command's args
# Returns the command's exit status, or 1 after killing it on timeout.
exec_with_timeout() {
    local func_pid
    local count=$OCF_RESKEY_crm_attr_timeout

    if [ "$1" -ne 0 ]; then
        count=$1
    fi
    shift

    # Unquoted $* on purpose: callers pass command strings that must be
    # word-split again (see change_data_status).
    $* &
    func_pid=$!
    sleep .1

    while kill -s 0 $func_pid >/dev/null 2>&1; do
        sleep 1
        count=`expr $count - 1`
        if [ $count -le 0 ]; then
            ocf_exit_reason "\"$*\" (pid=$func_pid) timed out."
            kill -s 9 $func_pid >/dev/null 2>&1
            return 1
        fi
        ocf_log info "Waiting($count). \"$*\" (pid=$func_pid)."
    done
    wait $func_pid
}

# retry command when command doesn't return 0
# arg1       : count >= 0 (if arg1 is 0, it retries command in infinitum(1day))
# arg2..argN : command and args
# On success the command's stdout is printed and 0 is returned; after the
# last failed attempt the command's exit status is returned.
exec_with_retry() {
    local count="86400"
    local output
    local rc

    if [ "$1" -ne 0 ]; then
        count=$1
    fi
    shift

    while [ $count -gt 0 ]; do
        # Unquoted $* on purpose: some callers pass "cmd opts" as one word.
        output=`$*`
        rc=$?
        if [ $rc -ne 0 ]; then
            ocf_log warn "Retrying(remain $count). \"$*\" failed. rc=$rc. stdout=\"$output\"."
            count=`expr $count - 1`
            sleep 1
        else
            # Emit the output verbatim; using it as the printf format would
            # misinterpret any '%' or backslash it contains.
            printf '%s' "$output"
            return 0
        fi
    done

    ocf_exit_reason "giving up executing \"$*\""
    return $rc
}

# arg1: node name. True when crm_mon reports the node with online="true".
is_node_online() {
    print_crm_mon | grep -q -i "<node name=\"$1\" .* online=\"true\""
}

# arg1: node name. True when crm_mon lists the node at all.
node_exist() {
    print_crm_mon | grep -q -i "<node name=\"$1\" .* online"
}

# arg1: command. Returns 1 (with an exit reason) when it cannot be found.
check_binary2() {
    if ! have_binary "$1"; then
        ocf_exit_reason "Setup problem: couldn't find command: $1"
        return 1
    fi
    return 0
}

# arg1: configuration file.
# Returns 0 when it is a regular file, 1 when missing during a probe,
# 2 when missing otherwise.
check_config() {
    local rc=0

    if [ ! -f "$1" ]; then
        if ocf_is_probe; then
            ocf_log info "Unable to read $1 during probe."
            rc=1
        else
            ocf_exit_reason "Configuration file $1 doesn't exist"
            rc=2
        fi
    fi

    return $rc
}

# Full validation: binaries, configuration file, data directory permissions
# and the replication set-up. As a side effect it defines the globals used by
# the replication code (CRM_ATTR_*, PGSQL_*_ATTR, CHECK_*_SQL, NODE_LIST, ...)
# and may add/remove include directives in $OCF_RESKEY_config.
# Returns an OCF_* code.
validate_ocf_check_level_10() {
    local version
    local check_config_rc
    local rep_mode_string
    local recovery_conf_string
    local socket_directories
    local rc

    version=`cat $OCF_RESKEY_pgdata/PG_VERSION`

    if ! check_binary2 "$OCF_RESKEY_pgctl" ||
       ! check_binary2 "$OCF_RESKEY_psql"; then
        return $OCF_ERR_INSTALLED
    fi

    check_config "$OCF_RESKEY_config"
    check_config_rc=$?
    [ $check_config_rc -eq 2 ] && return $OCF_ERR_INSTALLED
    if [ $check_config_rc -eq 0 ]; then
        ocf_version_cmp "$version" "9.3"
        if [ $? -eq 0 ]; then
            : ${OCF_RESKEY_socketdir=`get_pgsql_param unix_socket_directory`}
        else
            # unix_socket_directories is used by PostgreSQL 9.3 or higher.
            socket_directories=`get_pgsql_param unix_socket_directories`
            if [ -n "$socket_directories" ]; then
                # unix_socket_directories may have multiple socket directories and the pgsql RA can not know which directory is used for psql command.
                # Therefore, the user must set OCF_RESKEY_socketdir explicitly.
                if [ -z "$OCF_RESKEY_socketdir" ]; then
                    ocf_exit_reason "In PostgreSQL 9.3 or higher, socketdir can't be empty if you define unix_socket_directories in the postgresql.conf."
                    return $OCF_ERR_CONFIGURED
                fi
            fi
        fi
    fi

    if ocf_is_probe; then
        ocf_log info "Don't check $OCF_RESKEY_pgdata during probe"
    else
        if ! runasowner "test -w $OCF_RESKEY_pgdata"; then
            ocf_exit_reason "Directory $OCF_RESKEY_pgdata is not writable by $OCF_RESKEY_pgdba"
            return $OCF_ERR_PERM;
        fi
    fi

    if is_replication || [ "$OCF_RESKEY_rep_mode" = "slave" ]; then
        # NOTE(review): left unquoted on purpose; quoting would change the
        # result when PG_VERSION could not be read (empty $version).
        if [ `printf "$version\n9.1" | sort -n | head -1` != "9.1" ]; then
            ocf_exit_reason "Replication mode needs PostgreSQL 9.1 or higher."
            return $OCF_ERR_INSTALLED
        fi
        ocf_version_cmp "$version" "12"
        rc=$?
        if [ $rc -eq 1 ]||[ $rc -eq 2 ]; then
            # change the standby method for PostgreSQL 12 or later.
            USE_STANDBY_SIGNAL=true
            # change the path to recovery.conf because it cause PostgreSQL start error.
            RECOVERY_CONF=${OCF_RESKEY_tmpdir}/recovery.conf
            if [ $check_config_rc -eq 0 ]; then
                # adding recovery parameters to postgresql.conf.
                recovery_conf_string="include '$RECOVERY_CONF' # added by pgsql RA"
                if ! grep -q "^[[:space:]]*$recovery_conf_string" $OCF_RESKEY_config; then
                    ocf_log info "adding include directive $recovery_conf_string into $OCF_RESKEY_config"
                    echo "$recovery_conf_string" >> $OCF_RESKEY_config
                fi
            fi
        fi
        if [ ! -n "$OCF_RESKEY_master_ip" ]; then
            ocf_exit_reason "master_ip can't be empty."
            return $OCF_ERR_CONFIGURED
        fi
    fi

    if is_replication; then
        REP_MODE_CONF=${OCF_RESKEY_tmpdir}/rep_mode.conf
        PGSQL_LOCK=${OCF_RESKEY_tmpdir}/PGSQL.lock
        XLOG_NOTE_FILE=${OCF_RESKEY_tmpdir}/xlog_note

        CRM_ATTR_REBOOT="${HA_SBIN_DIR}/crm_attribute -l reboot"
        CRM_ATTR_FOREVER="${HA_SBIN_DIR}/crm_attribute -l forever"
        CRM_RESOURCE="${HA_SBIN_DIR}/crm_resource"

        CAN_NOT_PROMOTE="-INFINITY"
        CAN_PROMOTE="100"
        PROMOTE_ME="1000"

        CHECK_MS_SQL="select pg_is_in_recovery()"
        CHECK_SYNCHRONOUS_STANDBY_NAMES_SQL="show synchronous_standby_names"
        ocf_version_cmp "$version" "10"
        rc=$?
        if [ $rc -eq 1 ]||[ $rc -eq 2 ]; then
            CHECK_XLOG_LOC_SQL="select pg_last_wal_replay_lsn(),pg_last_wal_receive_lsn()"
        else
            CHECK_XLOG_LOC_SQL="select pg_last_xlog_replay_location(),pg_last_xlog_receive_location()"
        fi
        CHECK_REPLICATION_STATE_SQL="select application_name,upper(state),upper(sync_state) from pg_stat_replication"

        PGSQL_STATUS_ATTR="${RESOURCE_NAME}-status"
        PGSQL_DATA_STATUS_ATTR="${RESOURCE_NAME}-data-status"
        PGSQL_XLOG_LOC_NAME="${RESOURCE_NAME}-xlog-loc"
        PGSQL_MASTER_BASELINE="${RESOURCE_NAME}-master-baseline"

        NODE_LIST=`echo $OCF_RESKEY_node_list | tr '[A-Z]' '[a-z]'`
        RE_CONTROL_SLAVE="false"

        if ! ocf_is_ms; then
            ocf_exit_reason "Replication(rep_mode=async or sync) requires Master/Slave configuration."
            return $OCF_ERR_CONFIGURED
        fi
        if [ ! "$OCF_RESKEY_rep_mode" = "sync" -a ! "$OCF_RESKEY_rep_mode" = "async" ]; then
            ocf_exit_reason "Invalid rep_mode : $OCF_RESKEY_rep_mode"
            return $OCF_ERR_CONFIGURED
        fi
        if [ ! -n "$NODE_LIST" ]; then
            ocf_exit_reason "node_list can't be empty."
            return $OCF_ERR_CONFIGURED
        fi
        if [ $check_config_rc -eq 0 ]; then
            rep_mode_string="include '$REP_MODE_CONF' # added by pgsql RA"
            if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then
                if ! grep -q "^[[:space:]]*$rep_mode_string" $OCF_RESKEY_config; then
                    ocf_log info "adding include directive into $OCF_RESKEY_config"
                    echo "$rep_mode_string" >> $OCF_RESKEY_config
                fi
            else
                if grep -q "$rep_mode_string" $OCF_RESKEY_config; then
                    ocf_log info "deleting include directive from $OCF_RESKEY_config"
                    # Escape "/" so the string can be used as a sed address.
                    rep_mode_string=`echo $rep_mode_string | sed -e 's|/|\\\\/|g'`
                    sed -i "/$rep_mode_string/d" $OCF_RESKEY_config
                fi
            fi
        fi
        if ! mkdir -p $OCF_RESKEY_tmpdir || ! chown $OCF_RESKEY_pgdba $OCF_RESKEY_tmpdir || ! chmod 700 $OCF_RESKEY_tmpdir; then
            ocf_exit_reason "Can't create directory $OCF_RESKEY_tmpdir or it is not readable by $OCF_RESKEY_pgdba"
            return $OCF_ERR_PERM
        fi
    fi

    if [ "$OCF_RESKEY_rep_mode" = "slave" ]; then
        if ocf_is_ms; then
            ocf_exit_reason "Replication(rep_mode=slave) does not support Master/Slave configuration."
            return $OCF_ERR_CONFIGURED
        fi
    fi

    if use_replication_slot; then
        ocf_version_cmp "$version" "9.4"
        rc=$?
        if [ $rc -eq 0 ]||[ $rc -eq 3 ]; then
            ocf_exit_reason "Replication slot needs PostgreSQL 9.4 or higher."
            return $OCF_ERR_CONFIGURED
        fi

        echo "$OCF_RESKEY_replication_slot_name" | grep -q -e '[^a-z0-9_]'
        if [ $? -eq 0 ]; then
            ocf_exit_reason "Invalid replication_slot_name($OCF_RESKEY_replication_slot_name). only use lower case letters, numbers, and the underscore character."
            return $OCF_ERR_CONFIGURED
        fi
    fi

    return $OCF_SUCCESS
}

# Validate most critical parameters
# Exits the script with the failing code when the level-10 checks fail.
pgsql_validate_all() {
    local rc

    getent passwd $OCF_RESKEY_pgdba >/dev/null 2>&1
    if [ ! $? -eq 0 ]; then
        ocf_exit_reason "User $OCF_RESKEY_pgdba doesn't exist";
        return $OCF_ERR_INSTALLED;
    fi

    if [ -n "$OCF_RESKEY_monitor_user" ] && [ -z "$OCF_RESKEY_monitor_password" ]; then
        ocf_exit_reason "monitor password can't be empty"
        return $OCF_ERR_CONFIGURED
    fi

    if [ -z "$OCF_RESKEY_monitor_user" ] && [ -n "$OCF_RESKEY_monitor_password" ]; then
        ocf_exit_reason "monitor_user has to be set if monitor_password is set"
        return $OCF_ERR_CONFIGURED
    fi

    if [ "$OCF_CHECK_LEVEL" -eq 10 ]; then
        validate_ocf_check_level_10
        rc=$?
        [ $rc -ne "$OCF_SUCCESS" ] && exit $rc
    fi

    return $OCF_SUCCESS
}


#
# Check if we need to create a log file
#
# arg1: log file path. Returns 1 when $OCF_RESKEY_pgdba cannot write to it.

check_log_file() {
    if [ ! -e "$1" ]
    then
        touch "$1" > /dev/null 2>&1
        chown $OCF_RESKEY_pgdba:`getent passwd $OCF_RESKEY_pgdba | cut -d ":" -f 4` "$1"
    fi

    #Check if $OCF_RESKEY_pgdba can write to the log file
    if ! runasowner "test -w $1"
    then
        return 1
    fi

    return 0
}

#
# Check if we need to create stats temp directory in tmpfs
#
# Exits with OCF_ERR_PERM when the directory cannot be prepared.

check_stat_temp_directory() {
    local stats_temp

    stats_temp=`get_pgsql_param stats_temp_directory`

    if [ -z "$stats_temp" ]; then
        return
    fi

    # A relative path is taken relative to the data directory.
    if [ "${stats_temp#/}" = "$stats_temp" ]; then
        stats_temp="$OCF_RESKEY_pgdata/$stats_temp"
    fi

    if [ -d "$stats_temp" ]; then
        return
    fi

    if ! mkdir -p "$stats_temp"; then
        ocf_exit_reason "Can't create directory $stats_temp"
        exit $OCF_ERR_PERM
    fi

    if ! chown $OCF_RESKEY_pgdba: "$stats_temp"; then
        ocf_exit_reason "Can't change ownership for $stats_temp"
        exit $OCF_ERR_PERM
    fi

    if ! chmod 700 "$stats_temp"; then
        ocf_exit_reason "Can't change permissions for $stats_temp"
        exit $OCF_ERR_PERM
    fi
}

#
# Check socket directory
#
# Creates the directory when missing, otherwise verifies that
# $OCF_RESKEY_pgdba can create a file in it. Exits with OCF_ERR_PERM on failure.
check_socket_dir() {
    if [ ! -d "$OCF_RESKEY_socketdir" ]; then
        if ! mkdir "$OCF_RESKEY_socketdir"; then
            ocf_exit_reason "Can't create directory $OCF_RESKEY_socketdir"
            exit $OCF_ERR_PERM
        fi

        if ! chown $OCF_RESKEY_pgdba:`getent passwd \
             $OCF_RESKEY_pgdba | cut -d ":" -f 4` "$OCF_RESKEY_socketdir"
        then
            ocf_exit_reason "Can't change ownership for $OCF_RESKEY_socketdir"
            exit $OCF_ERR_PERM
        fi

        if ! chmod 2775 "$OCF_RESKEY_socketdir"; then
            ocf_exit_reason "Can't change permissions for $OCF_RESKEY_socketdir"
            exit $OCF_ERR_PERM
        fi
    else
        if ! runasowner "touch $OCF_RESKEY_socketdir/test.$$"; then
            ocf_exit_reason "$OCF_RESKEY_pgdba can't create files in $OCF_RESKEY_socketdir"
            exit $OCF_ERR_PERM
        fi
        rm "$OCF_RESKEY_socketdir/test.$$"
    fi
}

# Print the crm_mon XML status, fetching it once and caching it in
# CRM_MON_OUTPUT. The XML option is chosen from OCF_RESKEY_crm_feature_set.
print_crm_mon() {
    if [ -z "$CRM_MON_OUTPUT" ]; then
        ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.1.0"
        res=$?
        if [ -z "$OCF_RESKEY_crm_feature_set" ] || [ $res -eq 2 ]; then
            XMLOPT="--output-as=xml"
            ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.2.0"
            if [ $? -eq 1 ]; then
                # Probe whether this crm_mon accepts the new option.
                crm_mon -1 $XMLOPT >/dev/null 2>&1
                if [ $? -ne 0 ]; then
                    XMLOPT="--as-xml"
                fi
            fi
        else
            XMLOPT="--as-xml"
        fi
        CRM_MON_OUTPUT=`exec_with_retry 0 crm_mon -1 $XMLOPT`
    fi
    # Print the XML verbatim; it must not be used as the printf format.
    printf '%s\n' "$CRM_MON_OUTPUT"
}

#
#   'main' starts here...
#


if [ $# -ne 1 ]
then
    usage
    exit $OCF_ERR_GENERIC
fi

PIDFILE=${OCF_RESKEY_pgdata}/postmaster.pid
BACKUPLABEL=${OCF_RESKEY_pgdata}/backup_label
RESOURCE_NAME=`echo $OCF_RESOURCE_INSTANCE | cut -d ":" -f 1`
PGSQL_WAL_RECEIVER_STATUS_ATTR="${RESOURCE_NAME}-receiver-status"
RECOVERY_CONF=${OCF_RESKEY_pgdata}/recovery.conf
NODENAME=$(ocf_local_nodename | tr '[A-Z]' '[a-z]')
USE_STANDBY_SIGNAL=false

# These two actions need no validation.
case "$1" in
    methods)    pgsql_methods
                exit $?;;

    meta-data)  meta_data
                exit $OCF_SUCCESS;;
esac

# Every action except validate-all forces the full (level 10) validation.
[ "$__OCF_ACTION" != "validate-all" ] && OCF_CHECK_LEVEL=10
pgsql_validate_all
rc=$?
+ +[ "$1" = "validate-all" ] && exit $rc + +if [ $rc -ne 0 ] +then + case "$1" in + stop) if is_replication; then + change_pgsql_status "$NODENAME" "UNKNOWN" + fi + exit $OCF_SUCCESS;; + monitor) exit $OCF_NOT_RUNNING;; + status) exit $OCF_NOT_RUNNING;; + *) exit $rc;; + esac +fi + +US=`id -u -n` + +if [ $US != root -a $US != $OCF_RESKEY_pgdba ] +then + ocf_exit_reason "$0 must be run as root or $OCF_RESKEY_pgdba" + exit $OCF_ERR_GENERIC +fi + +# make psql command options +if [ -n "$OCF_RESKEY_monitor_user" ]; then + PGUSER=$OCF_RESKEY_monitor_user; export PGUSER + PGPASSWORD=$OCF_RESKEY_monitor_password; export PGPASSWORD + psql_options="-p $OCF_RESKEY_pgport $OCF_RESKEY_pgdb" +else + psql_options="-p $OCF_RESKEY_pgport -U $OCF_RESKEY_pgdba $OCF_RESKEY_pgdb" +fi + +if [ -n "$OCF_RESKEY_pghost" ]; then + psql_options="$psql_options -h $OCF_RESKEY_pghost" +else + if [ -n "$OCF_RESKEY_socketdir" ]; then + psql_options="$psql_options -h $OCF_RESKEY_socketdir" + fi +fi + +if [ -n "$OCF_RESKEY_pgport" ]; then + export PGPORT=$OCF_RESKEY_pgport +fi + +if [ -n "$OCF_RESKEY_pglibs" ]; then + if [ -n "$LD_LIBRARY_PATH" ]; then + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$OCF_RESKEY_pglibs + else + export LD_LIBRARY_PATH=$OCF_RESKEY_pglibs + fi +fi + + +# What kind of method was invoked? 
+case "$1" in + status) if pgsql_status + then + ocf_log info "PostgreSQL is up" + exit $OCF_SUCCESS + else + ocf_log info "PostgreSQL is down" + exit $OCF_NOT_RUNNING + fi;; + + monitor) pgsql_monitor + exit $?;; + + start) pgsql_start + exit $?;; + + promote) pgsql_promote + exit $?;; + + demote) pgsql_demote + exit $?;; + + notify) pgsql_notify + exit $?;; + + stop) pgsql_stop + exit $?;; + *) + exit $OCF_ERR_UNIMPLEMENTED;; +esac diff --git a/heartbeat/pingd b/heartbeat/pingd new file mode 100755 index 0000000..e2d5c31 --- /dev/null +++ b/heartbeat/pingd @@ -0,0 +1,297 @@ +#!/bin/sh +# +# +# pingd OCF Resource Agent +# Records (in the CIB) the current number of ping nodes a +# cluster node can connect to. +# +# Copyright (c) 2006 Andrew Beekhof +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
# meta_data: print the OCF resource-agent metadata (XML) for the deprecated
# pingd agent on stdout.
#
# The defaults advertised in the XML are taken from the OCF_RESKEY_*_default
# variables set at the top of the script.
#
# Parameters whose values are free-form strings (dampen defaults to "1s",
# name defaults to "pingd", and set/section/host_list are names or lists)
# are declared as type="string"; only "multiplier" is a number.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="pingd" version="1.0">
<version>1.0</version>

<longdesc lang="en">
Deprecation warning: This agent is deprecated and may be removed from
a future release. See the ocf:pacemaker:pingd resource agent for a
supported alternative. --
This is a pingd Resource Agent.
It records (in the CIB) the current number of ping nodes a node can connect to.
</longdesc>
<shortdesc lang="en">Monitors connectivity to specific hosts or
IP addresses ("ping nodes") (deprecated)</shortdesc>

<parameters>

<parameter name="pidfile" unique="0">
<longdesc lang="en">PID file</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="${OCF_RESKEY_pidfile_default}" />
</parameter>


<parameter name="user" unique="0">
<longdesc lang="en">
The user we want to run pingd as
</longdesc>
<shortdesc lang="en">The user we want to run pingd as</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>

<parameter name="dampen" unique="0">
<longdesc lang="en">
The time to wait (dampening) further changes occur
</longdesc>
<shortdesc lang="en">Dampening interval</shortdesc>
<content type="string" default="${OCF_RESKEY_dampen_default}"/>
</parameter>

<parameter name="set" unique="0">
<longdesc lang="en">
The name of the instance_attributes set to place the value in. Rarely needs to be specified.
</longdesc>
<shortdesc lang="en">Set name</shortdesc>
<content type="string" default="${OCF_RESKEY_set_default}"/>
</parameter>

<parameter name="name" unique="0">
<longdesc lang="en">
The name of the attributes to set. This is the name to be used in the constraints.
</longdesc>
<shortdesc lang="en">Attribute name</shortdesc>
<content type="string" default="${OCF_RESKEY_name_default}"/>
</parameter>

<parameter name="section" unique="0">
<longdesc lang="en">
The section place the value in. Rarely needs to be specified.
</longdesc>
<shortdesc lang="en">Section name</shortdesc>
<content type="string" default="${OCF_RESKEY_section_default}"/>
</parameter>

<parameter name="multiplier" unique="0">
<longdesc lang="en">
The number by which to multiply the number of connected ping nodes by
</longdesc>
<shortdesc lang="en">Value multiplier</shortdesc>
<content type="integer" default="${OCF_RESKEY_multiplier_default}"/>
</parameter>

<parameter name="host_list" unique="0">
<longdesc lang="en">
The list of ping nodes to count. Defaults to all configured ping nodes. Rarely needs to be specified.
</longdesc>
<shortdesc lang="en">Host list</shortdesc>
<content type="string" default="${OCF_RESKEY_host_list_default}"/>
</parameter>

<parameter name="ignore_deprecation">
<longdesc lang="en">
If set to true, suppresses the deprecation warning for this agent.
</longdesc>
<shortdesc lang="en">Suppress deprecation warning</shortdesc>
<content type="boolean" default="${OCF_RESKEY_ignore_deprecation_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
<action name="monitor" depth="0" timeout="20s" interval="10s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="20s" />
</actions>
</resource-agent>
END
}
# pingd_validate: sanity-check the resource configuration (validate-all).
#
# Checks performed:
#   * the account named by OCF_RESKEY_user (when non-empty) must be known
#     to getent(1); otherwise the agent exits with OCF_ERR_ARGS;
#   * a pidfile that is not an absolute path only triggers a warning;
#   * OCF_RESKEY_update, when supplied, must be a positive integer,
#     otherwise the agent exits with OCF_ERR_ARGS.
#
# Prints "Validate OK" and returns OCF_SUCCESS when everything passes.
pingd_validate() {
    # Existence of the user
    if [ -n "$OCF_RESKEY_user" ]; then
        if ! getent passwd "$OCF_RESKEY_user" >/dev/null; then
            ocf_exit_reason "The user $OCF_RESKEY_user does not exist!"
            exit $OCF_ERR_ARGS
        fi
    fi

    # Pidfile better be an absolute path
    case $OCF_RESKEY_pidfile in
    /*) ;;
    *) ocf_log warn "You should have pidfile($OCF_RESKEY_pidfile) of absolute path!" ;;
    esac

    # Check the update interval.
    # "update" is neither given a default nor declared in this agent's
    # metadata, so it is normally unset.  Validating it unconditionally
    # made validate-all fail for every otherwise valid configuration;
    # only check the value when one has actually been provided.
    if [ -n "$OCF_RESKEY_update" ]; then
        if ocf_is_decimal "$OCF_RESKEY_update" && [ "$OCF_RESKEY_update" -gt 0 ]; then
            :
        else
            ocf_exit_reason "Invalid update interval $OCF_RESKEY_update. It should be positive integer!"
            exit $OCF_ERR_ARGS
        fi
    fi

    echo "Validate OK"
    return $OCF_SUCCESS
}
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_reuse_default="0" + +: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}} + +####################################################################### + +meta_data() +{ + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="podman" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +The podman HA resource agent creates and launches a podman container +based off a supplied podman image. Containers managed by this agent +are both created and removed upon the agent's start and stop actions. +</longdesc> +<shortdesc lang="en">Podman container resource agent.</shortdesc> + +<parameters> +<parameter name="image" required="1" unique="0"> +<longdesc lang="en"> +The podman image to base this container off of. +</longdesc> +<shortdesc lang="en">podman image</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="name" required="0" unique="0"> +<longdesc lang="en"> +The name to give the created container. By default this will +be that resource's instance name. +</longdesc> +<shortdesc lang="en">podman container name</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="allow_pull" unique="0"> +<longdesc lang="en"> +Allow the image to be pulled from the configured podman registry when +the image does not exist locally. NOTE, this can drastically increase +the time required to start the container if the image repository is +pulled over the network. 
+</longdesc> +<shortdesc lang="en">Allow pulling non-local images</shortdesc> +<content type="boolean"/> +</parameter> + +<parameter name="run_opts" required="0" unique="0"> +<longdesc lang="en"> +Add options to be appended to the 'podman run' command which is used +when creating the container during the start action. This option allows +users to do things such as setting a custom entry point and injecting +environment variables into the newly created container. Note the '-d' +option is supplied regardless of this value to force containers to run +in the background. + +NOTE: Do not explicitly specify the --name argument in the run_opts. This +agent will set --name using either the resource's instance or the name +provided in the 'name' argument of this agent. + +</longdesc> +<shortdesc lang="en">run options</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="run_cmd" required="0" unique="0"> +<longdesc lang="en"> +Specify a command to launch within the container once +it has initialized. +</longdesc> +<shortdesc lang="en">run command</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="mount_points" required="0" unique="0"> +<longdesc lang="en"> +A comma separated list of directories that the container is expecting to use. +The agent will ensure they exist by running 'mkdir -p' +</longdesc> +<shortdesc lang="en">Required mount points</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="monitor_cmd" required="0" unique="0"> +<longdesc lang="en"> +Specify the full path of a command to launch within the container to check +the health of the container. This command must return 0 to indicate that +the container is healthy. A non-zero return code will indicate that the +container has failed and should be recovered. 
+ +Note: Using this method for monitoring processes inside a container +is not recommended, as containerd tries to track processes running +inside the container and does not deal well with many short-lived +processes being spawned. Ensure that your container monitors its +own processes and terminates on fatal error rather than invoking +a command from the outside. +</longdesc> +<shortdesc lang="en">monitor command</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="force_kill" required="0" unique="0"> +<longdesc lang="en"> +Kill a container immediately rather than waiting for it to gracefully +shutdown +</longdesc> +<shortdesc lang="en">force kill</shortdesc> +<content type="boolean"/> +</parameter> + +<parameter name="reuse" required="0" unique="0"> +<longdesc lang="en"> +Allow the container to be reused once it is stopped. By default, +containers get removed once they are stopped. Enable this option +to have the particular one persist when this happens. +</longdesc> +<shortdesc lang="en">reuse container</shortdesc> +<content type="boolean" default="${OCF_RESKEY_reuse_default}"/> +</parameter> + +<parameter name="drop_in_dependency" required="0" unique="0"> +<longdesc lang="en"> +Use transient drop-in files to add extra dependencies to the systemd +scopes associated to the container. During reboot, this prevents systemd +to stop the container before pacemaker. 
# remove_container: delete the (stopped) container named $CONTAINER.
#
# Nothing is removed, and 0 is returned, when the "reuse" option is enabled
# or when container_exists reports that there is no such container.
# Otherwise "podman rm -v" is attempted; if that fails and the container
# still has exec sessions attached, the removal is retried with "rm -f".
#
# Returns 0 on success, or the exit code of the last podman rm attempt.
remove_container()
{
    local rc
    local execids

    if ocf_is_true "$OCF_RESKEY_reuse"; then
        # never remove the container if we have reuse enabled.
        return 0
    fi

    container_exists
    if [ $? -ne 0 ]; then
        # don't attempt to remove a container that doesn't exist
        return 0
    fi
    ocf_log notice "Cleaning up inactive container, ${CONTAINER}."
    ocf_run podman rm -v "$CONTAINER"
    rc=$?
    if [ $rc -ne 0 ]; then
        # due to a podman bug (rhbz#1841485), sometimes a stopped
        # container can still be associated with Exec sessions, in
        # which case the "podman rm" has to be forced
        execids=$(podman inspect "$CONTAINER" --format '{{len .ExecIDs}}')
        # If inspect failed its output is empty (or not a number); feeding
        # that to a numeric test would raise a shell error, so treat it as
        # "no lingering exec sessions" and keep the original rm failure.
        case "$execids" in
        ''|*[!0-9]*) execids=0 ;;
        esac
        if [ "$execids" -ne 0 ]; then
            ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-remove it."
            ocf_run podman rm -f "$CONTAINER"
            rc=$?
        fi
    fi
    return $rc
}
Not creating drop-in dependency" + return $OCF_ERR_GENERIC + fi + + ocf_log info "Creating drop-in dependency for \"$CONTAINER\" ($cid)" + for scope in "libpod-$cid.scope.d" "libpod-conmon-$cid.scope.d"; do + if [ $rc -eq $OCF_SUCCESS ] && [ ! -d /run/systemd/transient/"$scope" ]; then + mkdir -p /run/systemd/transient/"$scope" && \ + printf "[Unit]\nBefore=pacemaker.service" > /run/systemd/transient/"$scope"/dep.conf && \ + chmod ago+r /run/systemd/transient/"$scope" /run/systemd/transient/"$scope"/dep.conf + rc=$? + fi + done + + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_log error "Could not create drop-in dependency for \"$CONTAINER\" ($cid)" + else + systemctl daemon-reload + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_log error "Could not refresh service definition after creating drop-in for \"$CONTAINER\"" + fi + fi + + return $rc +} + + +run_new_container() +{ + local opts=$1 + local image=$2 + local cmd=$3 + local rc + + ocf_log info "running container $CONTAINER for the first time" + out=$(podman run $opts $image $cmd 2>&1) + rc=$? + + if [ -n "$out" ]; then + out="$(echo "$out" | tr -s ' \t\r\n' ' ')" + if [ $rc -eq 0 ]; then + ocf_log info "$out" + else + ocf_log err "$out" + fi + fi + + if [ $rc -eq 125 ]; then + # If an internal podman error occurred, it might be because + # the internal storage layer still references an old container + # with the same name, even though podman itself thinks there + # is no such container. If so, purge the storage layer to try + # to clean the corruption and try again. + if echo "$out" | grep -q "unknown.*flag"; then + ocf_exit_reason "$out" + return $rc + fi + + ocf_log warn "Internal podman error while creating new container $CONTAINER. Retrying." + ocf_run podman rm --storage $CONTAINER + ocf_run podman run $opts $image $cmd + rc=$? + elif [ $rc -eq 127 ]; then + # rhbz#1972209: podman 3.0.x seems to be hit by a race + # where the cgroup is not yet set up properly when the OCI + # runtime configures the container. 
# podman_start: start action for the podman resource agent.
#
# Sequence:
#   1. create the configured mount point directories;
#   2. return OCF_SUCCESS immediately if the container already passes the
#      simple status check;
#   3. pull the image when REQUIRE_IMAGE_PULL is set;
#   4. either restart an existing container (reuse enabled and the
#      container exists) or remove any leftover and run a new one;
#   5. optionally create the systemd drop-in dependency;
#   6. poll until monitor_cmd_exec succeeds.
#
# Returns OCF_SUCCESS once the monitor command passes, OCF_ERR_GENERIC on
# any failure.
podman_start()
{
    local cid
    local rc

    podman_create_mounts
    local run_opts="-d --name=${CONTAINER}"
    # check to see if the container has already started
    podman_simple_status
    if [ $? -eq $OCF_SUCCESS ]; then
        return $OCF_SUCCESS
    fi

    if [ -n "$OCF_RESKEY_run_opts" ]; then
        run_opts="$run_opts $OCF_RESKEY_run_opts"
    fi

    if [ $REQUIRE_IMAGE_PULL -eq 1 ]; then
        ocf_log notice "Beginning pull of image, ${OCF_RESKEY_image}"
        podman pull "${OCF_RESKEY_image}"
        if [ $? -ne 0 ]; then
            ocf_exit_reason "failed to pull image ${OCF_RESKEY_image}"
            return $OCF_ERR_GENERIC
        fi
    fi

    if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then
        ocf_log info "starting existing container $CONTAINER."
        ocf_run podman start $CONTAINER
        rc=$?
    else
        # make sure any previous container matching our container name is cleaned up first.
        # we already know at this point it wouldn't be running
        remove_container
        run_new_container "$run_opts" "$OCF_RESKEY_image" "$OCF_RESKEY_run_cmd"
        # Capture the status right away: reading $? after the "if" below
        # would yield the status of that "if" (0 when its condition is
        # false) and silently discard every failure code other than 125.
        rc=$?
        if [ $rc -eq 125 ]; then
            return $OCF_ERR_GENERIC
        fi
    fi

    # if the container was stopped or didn't exist before, systemd
    # removed the libpod* scopes. So always try to recreate the drop-ins
    if [ $rc -eq 0 ] && ocf_is_true "$OCF_RESKEY_drop_in_dependency"; then
        cid=$(podman_container_id)
        create_transient_drop_in_dependency "$cid"
        rc=$?
    fi

    if [ $rc -ne 0 ]; then
        ocf_exit_reason "podman failed to launch container (rc: $rc)"
        return $OCF_ERR_GENERIC
    fi


    # wait for monitor to pass before declaring that the container is started
    while true; do
        podman_simple_status
        if [ $? -ne $OCF_SUCCESS ]; then
            ocf_exit_reason "Newly created podman container exited after start"
            return $OCF_ERR_GENERIC
        fi

        monitor_cmd_exec
        if [ $? -eq $OCF_SUCCESS ]; then
            ocf_log notice "Container $CONTAINER started successfully"
            return $OCF_SUCCESS
        fi

        ocf_exit_reason "waiting on monitor_cmd to pass after start"
        sleep 1
    done
}
# image_exists: decide whether the configured image is usable.
#
# Returns 0 when "podman image exists" finds the image locally, or when it
# is missing but pulling is allowed -- in that case REQUIRE_IMAGE_PULL is
# set to 1 and a notice is logged.
# Returns 1 when the image is missing and pulling is not allowed.
image_exists()
{
    # A locally available image needs no further handling.
    if podman image exists "${OCF_RESKEY_image}"; then
        return 0
    fi

    # Not available locally and pulling is not permitted: image not found.
    if ! ocf_is_true "$OCF_RESKEY_allow_pull"; then
        return 1
    fi

    # Record that a pull is required.
    REQUIRE_IMAGE_PULL=1
    ocf_log notice "Image (${OCF_RESKEY_image}) does not exist locally but will be pulled during start"
    return 0
}
+ exit $OCF_ERR_CONFIGURED + fi + fi + : ${OCF_RESKEY_name=`echo ${OCF_RESOURCE_INSTANCE} | tr ':' '-'`} +else + : ${OCF_RESKEY_name=${OCF_RESOURCE_INSTANCE}} +fi + +CONTAINER=$OCF_RESKEY_name + +# Note: we currently monitor podman containers by with the "podman exec" +# command, so make sure that invocation is always valid by enforcing the +# exec command to be non-empty +: ${OCF_RESKEY_monitor_cmd:=/bin/true} + +# When OCF_RESKEY_drop_in_dependency is not populated, we +# look at another file-based way of enabling the option. +# Otherwise, consider it disabled. +if [ -z "$OCF_RESKEY_drop_in_dependency" ]; then + if [ -f "/etc/sysconfig/podman_drop_in" ] || \ + [ -f "/etc/default/podman_drop_in" ]; then + OCF_RESKEY_drop_in_dependency=yes + fi +fi + +case $__OCF_ACTION in +meta-data) meta_data + exit $OCF_SUCCESS;; +start) + podman_validate + podman_start;; +stop) podman_stop;; +monitor) podman_monitor;; +validate-all) podman_validate;; +usage|help) podman_usage + exit $OCF_SUCCESS + ;; +*) podman_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/portblock b/heartbeat/portblock new file mode 100755 index 0000000..06fcc19 --- /dev/null +++ b/heartbeat/portblock @@ -0,0 +1,666 @@ +#!/bin/sh +# +# portblock: iptables temporary portblocking control +# +# Author: Sun Jiang Dong (initial version) +# Philipp Reisner (per-IP filtering) +# +# License: GNU General Public License (GPL) +# +# Copyright: (C) 2005 International Business Machines +# +# OCF parameters are as below: +# OCF_RESKEY_protocol +# OCF_RESKEY_portno +# OCF_RESKEY_action +# OCF_RESKEY_ip +# OCF_RESKEY_tickle_dir +# OCF_RESKEY_sync_script +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_protocol_default="" +OCF_RESKEY_portno_default="" +OCF_RESKEY_direction_default="in" +OCF_RESKEY_action_default="" +OCF_RESKEY_ip_default="0.0.0.0/0" +OCF_RESKEY_reset_local_on_unblock_stop_default="false" +OCF_RESKEY_tickle_dir_default="" +OCF_RESKEY_sync_script_default="" + +: ${OCF_RESKEY_protocol=${OCF_RESKEY_protocol_default}} +: ${OCF_RESKEY_portno=${OCF_RESKEY_portno_default}} +: ${OCF_RESKEY_direction=${OCF_RESKEY_direction_default}} +: ${OCF_RESKEY_action=${OCF_RESKEY_action_default}} +: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}} +: ${OCF_RESKEY_reset_local_on_unblock_stop=${OCF_RESKEY_reset_local_on_unblock_stop_default}} +: ${OCF_RESKEY_tickle_dir=${OCF_RESKEY_tickle_dir_default}} +: ${OCF_RESKEY_sync_script=${OCF_RESKEY_sync_script_default}} +####################################################################### +CMD=`basename $0` +TICKLETCP=$HA_BIN/tickle_tcp + +usage() +{ + cat <<END >&2 + usage: $CMD {start|stop|status|monitor|meta-data|validate-all} + + $CMD is used to temporarily block ports using iptables. + + It can be used to blackhole a port before bringing + up an IP address, and enable it after a service is started. 
+ To do that for samba, the following can be used: + + crm configure <<EOF + primitive portblock-samba ocf:heartbeat:portblock \\ + params protocol=tcp portno=137,138 action=block + primitive portunblock-samba ocf:heartbeat:portblock \\ + params protocol=tcp portno=137,138 action=unblock + primitive samba-vip ocf:heartbeat:IPaddr2 \\ + params ip=10.10.10.20 + group g-samba \\ + portblock-samba samba-vip nmbd smbd portunblock-samba + EOF + + This will do the following things: + + - DROP all incoming packets for TCP ports 137 and 138 + - Bring up the IP alias 10.10.10.20 + - start the nmbd and smbd services + - Re-enable TCP ports 137 and 138 + (enable normal firewall rules on those ports) + + This prevents clients from getting TCP RST if they try to reconnect + to the service after the alias is enabled but before nmbd and smbd + are running. These packets will cause some clients to give up + attempting to reconnect to the server. + + Attempts to connect to UDP and other non-TCP ports which have nothing + listening can result in ICMP port unreachable responses, which can + have the same undesirable affect on some clients. + + NOTE: iptables is Linux-specific. + + An additional feature in the portblock RA is the tickle ACK function + enabled by specifying the tickle_dir parameter. The tickle ACK + triggers the clients to faster reconnect their TCP connections to the + fail-overed server. + + Please note that this feature is often used for the floating IP fail- + over scenario where the long-lived TCP connections need to be tickled. + It doesn't support the cluster alias IP scenario. + + When using the tickle ACK function, in addition to the normal usage + of portblock RA, the parameter tickle_dir must be specified in the + action=unblock instance of the portblock resources. 
+ For example, you may stack resources like below: + portblock action=block + services + portblock action=unblock tickle_dir=/tickle/state/dir + + If you want to tickle all the TCP connections which connected to _one_ + floating IP but different ports, no matter how many portblock resources + you have defined, you should enable tickles for _one_ portblock + resource(action=unblock) only. + + The tickle_dir is a location which stores the established TCP + connections. It can be a shared directory(which is cluster-visible to + all nodes) or a local directory. + If you use the shared directory, you needn't do any other things. + If you use the local directory, you must also specify the sync_script + paramater. We recommend you to use csync2 as the sync_script. + For example, if you use the local directory /tmp/tickle as tickle_dir, + you could setup the csync2 as the csync2 documentation says and + configure your /etc/csync2/csync2.cfg like: + group ticklegroup { + host node1; + host node2; + key /etc/csync2/ticklegroup.key; + include /etc/csync2/csync2.cfg; + include /tmp/tickle; + auto younger; + } + Then specify the parameter sync_script as "csync2 -xv". + +END +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="portblock" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for portblock. It is used to temporarily block ports +using iptables. In addition, it may allow for faster TCP reconnects +for clients on failover. Use that if there are long lived TCP +connections to an HA service. This feature is enabled by setting the +tickle_dir parameter and only in concert with action set to unblock. +Note that the tickle ACK function is new as of version 3.0.2 and +hasn't yet seen widespread use. 
+</longdesc> +<shortdesc lang="en">Block and unblocks access to TCP and UDP ports</shortdesc> + +<parameters> +<parameter name="protocol" unique="0" required="1"> +<longdesc lang="en"> +The protocol used to be blocked/unblocked. +</longdesc> +<shortdesc lang="en">protocol</shortdesc> +<content type="string" default="${OCF_RESKEY_protocol_default}" /> +</parameter> + +<parameter name="portno" unique="0" required="1"> +<longdesc lang="en"> +The port number used to be blocked/unblocked. +</longdesc> +<shortdesc lang="en">portno</shortdesc> +<content type="string" default="${OCF_RESKEY_portno_default}" /> +</parameter> + +<parameter name="action" unique="0" required="1"> +<longdesc lang="en"> +The action (block/unblock) to be done on the protocol::portno. +</longdesc> +<shortdesc lang="en">action</shortdesc> +<content type="string" default="${OCF_RESKEY_action_default}" /> +</parameter> + +<parameter name="reset_local_on_unblock_stop" unique="0" required="0"> +<longdesc lang="en"> +If for some reason the long lived server side TCP sessions won't be cleaned up +by a reconfiguration/flush/stop of whatever services this portblock protects, +they would linger in the connection table, even after the IP is gone +and services have been switched over to another node. + +An example would be the default NFS kernel server. + +These "known" connections may seriously confuse and delay a later switchback. + +Enabling this option will cause this agent to try to get rid of these connections +by injecting a temporary iptables rule to TCP-reset outgoing packets from the +blocked ports, and additionally tickle them locally, +just before it starts to DROP incoming packets on "unblock stop". 
+</longdesc> +<shortdesc lang="en">(try to) reset server TCP sessions when unblock stops</shortdesc> +<content type="boolean" default="${OCF_RESKEY_reset_local_on_unblock_stop_default}" /> +</parameter> + +<parameter name="ip" unique="0" required="0"> +<longdesc lang="en"> +The IP address used to be blocked/unblocked. +</longdesc> +<shortdesc lang="en">ip</shortdesc> +<content type="string" default="${OCF_RESKEY_ip_default}" /> +</parameter> + +<parameter name="tickle_dir" unique="0" required="0"> +<longdesc lang="en"> +The shared or local directory (_must_ be absolute path) which +stores the established TCP connections. +</longdesc> +<shortdesc lang="en">Tickle directory</shortdesc> +<content type="string" default="${OCF_RESKEY_tickle_dir_default}" /> +</parameter> + +<parameter name="sync_script" unique="0" required="0"> +<longdesc lang="en"> +If the tickle_dir is a local directory, then the TCP connection state +file has to be replicated to other nodes in the cluster. It can be +csync2 (default), some wrapper of rsync, or whatever. It takes the +file name as a single argument. For csync2, set it to "csync2 -xv". +</longdesc> +<shortdesc lang="en">Connection state file synchronization script</shortdesc> +<content type="string" default="${OCF_RESKEY_sync_script_default}" /> +</parameter> + +<parameter name="direction" unique="0" required="0"> +<longdesc lang="en"> +Whether to block incoming or outgoing traffic. Can be either "in", +"out", or "both". +If "in" is used, the incoming ports are blocked on the INPUT chain. +If "out" is used, the outgoing ports are blocked on the OUTPUT chain. +If "both" is used, both the incoming and outgoing ports are blocked. 
# active_grep_pat {udp|tcp} portno,portno ip {d|s}
#
# Print the grep pattern that matches the DROP rule for the given protocol,
# port list and IP address in an "iptables -n -L" listing.
#   d = the rule matches destination ports (ip is the destination address)
#   s = the rule matches source ports (ip is the source address)
# The opposite address column is always matched against 0.0.0.0/0.
active_grep_pat()
{
    # one or more blanks between the columns of the listing
    w="[ ][ ]*"
    any="0\\.0\\.0\\.0/0"

    # destination-port layout unless told otherwise
    src=$any
    dst=$3
    case "$4" in
    s)
        # source-port rule: the address columns trade places
        local src=$3
        local dst=$any
        ;;
    esac

    echo "^DROP${w}${1}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$"
}
#save_tcp_connections
# Records the currently established connections on $OCF_RESKEY_ip in
# $OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip.  Does nothing when no tickle
# directory is configured.
save_tcp_connections()
{
    [ -z "$OCF_RESKEY_tickle_dir" ] && return
    local statefile
    statefile=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip
    # If we have _no_ sync script, we probably have a shared
    # (or replicated) directory, and need to fsync, or we might
    # end up with the just truncated file after failover, exactly
    # when we need it.
    #
    # If we _do_ have a sync script, it is not that important whether
    # the local state file is fsync'ed or not, the sync script is
    # responsible to "atomically" communicate the state to the peer(s).
    if [ -z "$OCF_RESKEY_sync_script" ]; then
        get_established_tcp_connections |
            dd of="$statefile".new conv=fsync status=none &&
            mv "$statefile".new "$statefile"
    else
        get_established_tcp_connections > "$statefile"
        # sync_script is deliberately unquoted: it may carry options
        # (e.g. "csync2 -xv").  It runs in the background.
        $OCF_RESKEY_sync_script "$statefile" > /dev/null 2>&1 &
    fi
}

#tickle_remote
# Feeds the recorded connection list to $TICKLETCP (three rounds).
# Does nothing without a tickle directory or a readable state file.
tickle_remote()
{
    [ -z "$OCF_RESKEY_tickle_dir" ] && return
    # tcp_tw_recycle is absent on newer kernels; only touch it when the
    # sysctl file actually exists and is writable.
    if [ -w /proc/sys/net/ipv4/tcp_tw_recycle ]; then
        echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle
    fi
    local f
    f=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip
    [ -r "$f" ] || return
    $TICKLETCP -n 3 < "$f"
}

#tickle_local
# Tickles the local side of the recorded connections, then keeps tickling
# whatever is still listed for $OCF_RESKEY_ip with increasing pauses.
tickle_local()
{
    [ -z "$OCF_RESKEY_tickle_dir" ] && return
    local f
    f=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip
    [ -r "$f" ] || return

    # swap "local" and "remote" address,
    # so we tickle ourselves.
    # We set up a REJECT with tcp-reset before we do so, so we get rid of
    # the no longer wanted potentially long lived "ESTABLISHED" connection
    # entries on the IP we are going to delete in a sec.  These would get in
    # the way if we switch-over and then switch-back in quick succession.
    local i
    awk '{ print $2, $1; }' "$f" | $TICKLETCP
    $ss_or_netstat | grep -Fw "$OCF_RESKEY_ip" || return
    for i in 0.1 0.5 1 2 4 ; do
        sleep $i
        # now kill what is currently in the list,
        # not what was recorded during last monitor
        get_established_tcp_connections swap | $TICKLETCP
        $ss_or_netstat | grep -Fw "$OCF_RESKEY_ip" || break
    done
}

# Status messages; the arguments are echoed inside the brackets.
SayActive()
{
    echo "$CMD DROP rule [$*] is running (OK)"
}

SayConsideredActive()
{
    echo "$CMD DROP rule [$*] considered to be running (OK)"
}

SayInactive()
{
    echo "$CMD DROP rule [$*] is inactive"
}

#IptablesStatus {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
#
# Returns $OCF_SUCCESS or $OCF_NOT_RUNNING.  When the DROP rule is present,
# a "block" resource is running and anything else is not.  When the rule is
# absent, the pseudo-resource state decides; for a non-"block" resource
# that is active the connection list is saved as a side effect.
IptablesStatus() {
    local rc is_active r
    rc=$OCF_ERR_GENERIC
    is_active=0
    if [ "$4" = "in" ] || [ "$4" = "both" ]; then
        chain_isactive "$1" "$2" "$3" INPUT
        is_active=$?
    fi
    if [ "$4" = "out" ] || [ "$4" = "both" ]; then
        chain_isactive "$1" "$2" "$3" OUTPUT
        r=$?
        # keep the worst (largest) status of the two chains
        [ $r -gt $is_active ] && is_active=$r
    fi
    if [ $is_active -eq 0 ]; then
        case $5 in
        block)
            SayActive $*
            rc=$OCF_SUCCESS
            ;;
        *)
            SayInactive $*
            rc=$OCF_NOT_RUNNING
            ;;
        esac
    else
        case $5 in
        block)
            if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then
                SayConsideredActive $*
                rc=$OCF_SUCCESS
            else
                SayInactive $*
                rc=$OCF_NOT_RUNNING
            fi
            ;;
        *)
            if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then
                SayActive $*
                #This is only run on real monitor events.
                save_tcp_connections
                rc=$OCF_SUCCESS
            else
                SayInactive $*
                rc=$OCF_NOT_RUNNING
            fi
            ;;
        esac
    fi
    return $rc
}

#DoIptables {-I|-D} {udp|tcp} portno,portno ip chain
#
# Inserts (-I) or deletes (-D) the DROP rule unless the chain is already in
# the requested state.  Returns the iptables exit status, or 0 when nothing
# had to be done.
DoIptables()
{
    local op proto ports ip chain active want_active ds
    op=$1 proto=$2 ports=$3 ip=$4 chain=$5
    active=0; chain_isactive "$proto" "$ports" "$ip" "$chain" && active=1
    want_active=0; [ "$op" = "-I" ] && want_active=1
    echo "active: $active want_active: $want_active"
    if [ $active -eq $want_active ] ; then
        : Chain already in desired state
    else
        if [ "$chain" = "OUTPUT" ]; then
            ds="s"
        else
            ds="d"
        fi
        $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" -m multiport --${ds}ports "$ports" -j DROP
    fi
}

#IptablesBLOCK {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
#
# Installs the DROP rule(s).  While stopping a tcp "unblock" resource with
# reset_local_on_unblock_stop enabled, a temporary OUTPUT REJECT/tcp-reset
# rule is placed around the INPUT insertion and tickle_local is run.
# Returns the larger of the INPUT and OUTPUT statuses.
IptablesBLOCK()
{
    local rc_in=0
    local rc_out=0
    if [ "$4" = "in" ] || [ "$4" = "both" ]; then
        local try_reset=false
        if [ "$1/$5/$__OCF_ACTION" = tcp/unblock/stop ] &&
            ocf_is_true $reset_local_on_unblock_stop
        then
            try_reset=true
        fi
        if
            chain_isactive "$1" "$2" "$3" INPUT
        then
            : OK -- chain already active
        else
            if $try_reset ; then
                $IPTABLES $wait -I OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset
                tickle_local
            fi
            $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP
            rc_in=$?
            if $try_reset ; then
                $IPTABLES $wait -D OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset
            fi
        fi
    fi
    if [ "$4" = "out" ] || [ "$4" = "both" ]; then
        DoIptables -I "$1" "$2" "$3" OUTPUT
        rc_out=$?
    fi

    [ $rc_in -gt $rc_out ] && return $rc_in || return $rc_out
}

#IptablesUNBLOCK {udp|tcp} portno,portno ip {in|out|both}
#
# Removes the DROP rule(s).  Returns the larger of the INPUT and OUTPUT
# statuses, mirroring IptablesBLOCK, so a failed INPUT deletion is no
# longer masked by whatever happened to run last.
IptablesUNBLOCK()
{
    local rc_in=0
    local rc_out=0
    if [ "$4" = "in" ] || [ "$4" = "both" ]; then
        DoIptables -D "$1" "$2" "$3" INPUT
        rc_in=$?
    fi
    if [ "$4" = "out" ] || [ "$4" = "both" ]; then
        DoIptables -D "$1" "$2" "$3" OUTPUT
        rc_out=$?
    fi

    [ $rc_in -gt $rc_out ] && return $rc_in || return $rc_out
}

#IptablesStart {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
#
# Marks the pseudo-resource started, then blocks or unblocks according to
# the fifth argument.  After an unblock the remote peers are tickled; the
# tickle result does not influence the return code.
IptablesStart()
{
    ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start
    case $5 in
    block)
        IptablesBLOCK "$@"
        ;;
    unblock)
        IptablesUNBLOCK "$@"
        rc=$?
        tickle_remote
        #ignore run_tickle_tcp exit code!
        return $rc
        ;;
    *)
        usage
        return 1
        ;;
    esac

    return $?
}

#IptablesStop {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
#
# Marks the pseudo-resource stopped and reverses the start operation:
# a "block" resource unblocks, an "unblock" resource saves the connection
# list and blocks again.
IptablesStop()
{
    ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop
    case $5 in
    block)
        IptablesUNBLOCK "$@"
        ;;
    unblock)
        save_tcp_connections
        IptablesBLOCK "$@"
        ;;
    *)
        usage
        return 1
        ;;
    esac

    return $?
}

#
# CheckPort portspec
# Succeeds when portspec is a comma-separated list of decimal ports or
# lo:hi port ranges, e.g. "1080", "80,443", "2000:2010,8080".
# Only the syntax is checked, not the numeric range.
# Examples of invalid port: "1080bad", ""
#
CheckPort() {
    printf '%s\n' "$1" | grep -Eqx '[0-9]+(:[0-9]+)?(,[0-9]+(:[0-9]+)?)*'
}

#IptablesValidateAll
# Checks the configuration held in $protocol, $portno, $action,
# $OCF_RESKEY_tickle_dir and $reset_local_on_unblock_stop.  Exits with an
# OCF error code on the first problem, returns $OCF_SUCCESS otherwise.
IptablesValidateAll()
{
    check_binary $IPTABLES
    case $protocol in
    tcp|udp)
        ;;
    *)
        ocf_log err "Invalid protocol $protocol!"
        exit $OCF_ERR_CONFIGURED
        ;;
    esac

    if CheckPort "$portno"; then
        :
    else
        ocf_log err "Invalid port number $portno!"
        exit $OCF_ERR_CONFIGURED
    fi

    if [ -n "$OCF_RESKEY_tickle_dir" ]; then
        if [ x"$action" != x"unblock" ]; then
            ocf_log err "Tickles are only useful with action=unblock!"
            exit $OCF_ERR_CONFIGURED
        fi
        if [ ! -d "$OCF_RESKEY_tickle_dir" ]; then
            ocf_log err "The tickle dir doesn't exist!"
            exit $OCF_ERR_INSTALLED
        fi
    fi

    case $action in
    block|unblock)
        ;;
    *)
        ocf_log err "Invalid action $action!"
        exit $OCF_ERR_CONFIGURED
        ;;
    esac

    if ocf_is_true $reset_local_on_unblock_stop; then
        # quoted: an empty or multi-word value must not break the test
        if [ "$action" != unblock ] ; then
            ocf_log err "reset_local_on_unblock_stop is only relevant with action=unblock"
            exit $OCF_ERR_CONFIGURED
        fi
        if [ -z "$OCF_RESKEY_tickle_dir" ] ; then
            ocf_log warn "reset_local_on_unblock_stop works best with tickle_dir enabled as well"
        fi
    fi

    return $OCF_SUCCESS
}

# Exactly one argument (the action) is expected.
if
    ( [ $# -ne 1 ] )
then
    usage
    exit $OCF_ERR_ARGS
fi

# Actions that need no configuration.
case $1 in
    meta-data)  meta_data
                exit $OCF_SUCCESS
                ;;

    usage)      usage
                exit $OCF_SUCCESS
                ;;
    *)          ;;
esac

if [ -z "$OCF_RESKEY_protocol" ]; then
    ocf_log err "Please set OCF_RESKEY_protocol"
    exit $OCF_ERR_CONFIGURED
fi

if [ -z "$OCF_RESKEY_portno" ]; then
    ocf_log err "Please set OCF_RESKEY_portno"
    exit $OCF_ERR_CONFIGURED
fi

if [ -z "$OCF_RESKEY_action" ]; then
    ocf_log err "Please set OCF_RESKEY_action"
    exit $OCF_ERR_CONFIGURED
fi

# iptables v1.4.20+ is required to use -w (wait)
version=$(iptables -V | awk -F ' v' '{print $NF}')
ocf_version_cmp "$version" "1.4.19.1"
if [ "$?" -eq "2" ]; then
    wait="-w"
else
    wait=""
fi

protocol=$OCF_RESKEY_protocol
portno=$OCF_RESKEY_portno
direction=$OCF_RESKEY_direction
action=$OCF_RESKEY_action
ip=$OCF_RESKEY_ip
reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop


# If "tickle" is enabled, we need to record the list of currently established
# connections during monitor.  Use ss where available, and netstat otherwise.
if [ -n "$OCF_RESKEY_tickle_dir" ] ; then
    if have_binary ss ; then
        ss_or_netstat="ss -Htn"
    elif have_binary netstat ; then
        ss_or_netstat="netstat -tn"
    else
        ocf_log err "Neither ss nor netstat found, but needed to record estblished connections."
        exit $OCF_ERR_INSTALLED
    fi
fi

case $1 in
    start)
        IptablesStart $protocol $portno $ip $direction $action
        ;;

    stop)
        IptablesStop $protocol $portno $ip $direction $action
        ;;

    status|monitor)
        IptablesStatus $protocol $portno $ip $direction $action
        ;;

    validate-all)
        IptablesValidateAll
        ;;

    *)  usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac

exit $?
diff --git a/heartbeat/postfix b/heartbeat/postfix
new file mode 100755
index 0000000..6619360
--- /dev/null
+++ b/heartbeat/postfix
@@ -0,0 +1,429 @@
#!/bin/sh
#
# Resource script for Postfix
#
# Description:  Manages Postfix as an OCF resource in
#               an high-availability setup.
#
# Author:       Raoul Bhatia <r.bhatia@ipax.at> : Original Author
# License:      GNU General Public License (GPL)
# Note:         If you want to run multiple Postfix instances, please see
#               http://amd.co.at/adminwiki/Postfix#Adding_a_Second_Postfix_Instance_on_one_Server
#               http://www.postfix.org/postconf.5.html
#
#
#       usage: $0 {start|stop|reload|monitor|validate-all|meta-data}
#
#       The "start" arg starts a Postfix instance
#
#       The "stop" arg stops it.
#
#       OCF parameters:
#               OCF_RESKEY_binary
#               OCF_RESKEY_config_dir
#               OCF_RESKEY_parameters
#
##########################################################################

# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
# The path operand of the "." command is on the following line.
. \
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_binary_default="/usr/sbin/postfix" +OCF_RESKEY_config_dir_default="" +OCF_RESKEY_parameters_default="" + +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_config_dir=${OCF_RESKEY_config_dir_default}} +: ${OCF_RESKEY_parameters=${OCF_RESKEY_parameters_default}} + +USAGE="Usage: $0 {start|stop|reload|monitor|validate-all|meta-data}"; + +########################################################################## + +# Check availability of the runuser command, otherwise use su +if [ -x /sbin/runuser ]; then + SU=runuser +else + SU=su +fi + +usage() { + echo $USAGE >&2 +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="postfix" version="0.1"> +<version>1.0</version> +<longdesc lang="en"> +This script manages Postfix as an OCF resource in a high-availability setup. +</longdesc> +<shortdesc lang="en">Manages a highly available Postfix mail server instance</shortdesc> + +<parameters> + +<parameter name="binary" unique="0" required="0"> +<longdesc lang="en"> +Full path to the Postfix binary. +For example, "/usr/sbin/postfix". +</longdesc> +<shortdesc lang="en">Full path to Postfix binary</shortdesc> +<content type="string" default="${OCF_RESKEY_binary_default}" /> +</parameter> + +<parameter name="config_dir" unique="1" required="0"> +<longdesc lang="en"> +Full path to a Postfix configuration directory. +For example, "/etc/postfix". +</longdesc> +<shortdesc lang="en">Full path to configuration directory</shortdesc> +<content type="string" default="${OCF_RESKEY_config_dir_default}" /> +</parameter> + +<parameter name="parameters" unique="0" required="0"> +<longdesc lang="en"> +The Postfix daemon may be called with additional parameters. +Specify any of them here. 
+</longdesc> +<shortdesc lang="en"></shortdesc> +<content type="string" default="${OCF_RESKEY_parameters_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="reload" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="60s" /> +<action name="validate-all" timeout="20s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +postfix_running() { + local loglevel + loglevel=${1:-err} + + # run Postfix status if available + if ocf_is_true $status_support; then + $binary $OPTION_CONFIG_DIR status 2>&1 + ret=$? + if [ $ret -ne 0 ]; then + ocf_log $loglevel "Postfix status: " $ret + fi + return $ret + fi + + # manually check Postfix's pid + PIDFILE=${queue_dir}/pid/master.pid + if [ -f $PIDFILE ]; then + PID=`head -n 1 $PIDFILE` + kill -s 0 $PID >/dev/null 2>&1 && [ `ps -p $PID | grep master | wc -l` -eq 1 ] + return $? + fi + + # Postfix is not running + false +} + +postfix_start() +{ + # if Postfix is running return success + if postfix_running info; then + ocf_log info "Postfix already running." + return $OCF_SUCCESS + fi + + # start Postfix + $binary $OPTIONS start >/dev/null 2>&1 + ret=$? + + if [ $ret -ne 0 ]; then + ocf_exit_reason "Postfix returned error: $ret" + return $OCF_ERR_GENERIC + fi + + # grant some time for startup/forking the sub processes + # and loop initial monitoring until success or timeout + while true; do + sleep 1 + # break if postfix is up and running; log failure otherwise + postfix_running info && break + ocf_log info "Postfix failed initial monitor action: " $ret + done + + ocf_log info "Postfix started." + return $OCF_SUCCESS +} + + +postfix_stop() +{ + # if Postfix is not running return success + if ! postfix_running info; then + ocf_log info "Postfix already stopped." + return $OCF_SUCCESS + fi + + # stop Postfix + $binary $OPTIONS stop >/dev/null 2>&1 + ret=$? 
+ + if [ $ret -ne 0 ]; then + ocf_exit_reason "Postfix returned an error while stopping: $ret" + return $OCF_ERR_GENERIC + fi + + # grant some time for shutdown and recheck 5 times + for i in 1 2 3 4 5; do + if postfix_running info; then + sleep 1 + else + break + fi + done + + # escalate to abort if we did not stop by now + # @TODO shall we loop here too? + if postfix_running info; then + ocf_exit_reason "Postfix failed to stop. Escalating to 'abort'." + + $binary $OPTIONS abort >/dev/null 2>&1; ret=$? + sleep 5 + + # postfix abort did not succeed + if postfix_running; then + ocf_exit_reason "Postfix failed to abort." + return $OCF_ERR_GENERIC + fi + fi + + ocf_log info "Postfix stopped." + return $OCF_SUCCESS +} + +postfix_reload() +{ + if postfix_running; then + ocf_log info "Reloading Postfix." + $binary $OPTIONS reload + fi +} + +postfix_monitor() +{ + local status_loglevel="err" + + # Set loglevel to info during probe + if ocf_is_probe; then + status_loglevel="info" + fi + + if postfix_running $status_loglevel; then + return $OCF_SUCCESS + fi + + return $OCF_NOT_RUNNING +} + +postfix_validate_all() +{ + # check that the Postfix binaries exist and can be executed + check_binary "$binary" + check_binary "postconf" + + # if true, run in-depth directory checks + dir_check=true + + # check config_dir and alternate_config_directories parameter + if [ "x$config_dir" != "x" ]; then + if [ ! -d "$config_dir" ]; then + if ocf_is_probe; then + ocf_log info "Postfix configuration directory '$config_dir' not readable during probe." + # skip in-depth directory checks if config file isn't readable during probe + dir_check=false + else + ocf_exit_reason "Postfix configuration directory '$config_dir' does not exist or is not readable." 
+ return $OCF_ERR_INSTALLED + fi + fi + + alternate_config_directories=`postconf -h alternate_config_directories 2>/dev/null | grep "$config_dir/\?"` + if [ "x$alternate_config_directories" = "x" ]; then + ocf_exit_reason "Postfix main configuration must contain correct 'alternate_config_directories' parameter." + return $OCF_ERR_INSTALLED + fi + fi + + # check spool/queue and data directories (if applicable) + # this is required because "postfix check" does not catch all errors + if ocf_is_true $dir_check; then + if [ ! -d "$queue_dir" ]; then + if ocf_is_probe; then + ocf_log info "Postfix queue directory '$queue_dir' not readable during probe." + else + ocf_exit_reason "Postfix queue directory '$queue_dir' does not exist or is not readable." + return $OCF_ERR_INSTALLED + fi + fi + + if ocf_is_true $status_support; then + data_dir=`postconf $OPTION_CONFIG_DIR -h data_directory 2>/dev/null` + data_dir_count=`echo "$data_dir" | tr ',' ' ' | wc -w` + if [ $data_dir_count -gt 1 ]; then + ocf_exit_reason "Postfix data directory '$orig_data_dir' cannot be set to multiple directories." + return $OCF_ERR_INSTALLED + fi + if [ ! -d "$data_dir" ]; then + if ocf_is_probe; then + ocf_log info "Postfix data directory '$data_dir' not readable during probe." + else + ocf_exit_reason "Postfix data directory '$data_dir' does not exist or is not readable." + return $OCF_ERR_INSTALLED + fi + fi + fi + + # check directory permissions + if ocf_is_true $status_support; then + user=`postconf $OPTION_CONFIG_DIR -h mail_owner 2>/dev/null` + for dir in $data_dir; do + if ! $SU -s /bin/sh - $user -c "test -w $dir"; then + if ocf_is_probe; then + ocf_log info "Directory '$dir' is not writable by user '$user' during probe." + else + ocf_exit_reason "Directory '$dir' is not writable by user '$user'." + return $OCF_ERR_PERM; + fi + fi + done + fi + fi + + # run Postfix internal check, if not probing + if ! ocf_is_probe; then + $binary $OPTIONS check >/dev/null 2>&1 + ret=$? 
+ if [ $ret -ne 0 ]; then + ocf_exit_reason "Postfix 'check' failed: $ret" + return $OCF_ERR_GENERIC + fi + fi + + return $OCF_SUCCESS +} + +# +# Main +# + +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi + +binary=$OCF_RESKEY_binary +config_dir=$OCF_RESKEY_config_dir +parameters=$OCF_RESKEY_parameters + + +# handle parameters +case $1 in + meta-data) meta_data + exit $OCF_SUCCESS + ;; + + usage|help) usage + exit $OCF_SUCCESS + ;; +esac + +# build Postfix options string *outside* to access from each method +OPTIONS='' +OPTION_CONFIG_DIR='' + +# check for Postfix's postconf binary +check_binary "postconf" + +# check if the Postfix config_dir exist +if [ "x$config_dir" != "x" ]; then + # remove all trailing slashes to ease "postconf alternate_config_directories" match + config_dir=`echo $config_dir | sed 's/\/*$//'` + + # reset config_dir if it equals Postfix's default config_directory + postconf -h config_directory 2>/dev/null | grep -q "^$config_dir/\?$" + if [ $? -eq 0 ]; then + config_dir="" + fi + + # set OPTIONS if config_dir is still set + # save OPTION_CONFIG_DIR seperatly + if [ "x$config_dir" != "x" ]; then + OPTION_CONFIG_DIR="-c $config_dir" + OPTIONS=$OPTION_CONFIG_DIR + fi +fi + +# add all additional parameters to options string +if [ "x$parameters" != "x" ]; then + OPTIONS="$OPTIONS $parameters" +fi + +# important directories, used in different methods +queue_dir=`postconf $OPTION_CONFIG_DIR -h queue_directory 2>/dev/null` + +# check Postfix version and status support +status_support=false +postfix_version=`postconf -h mail_version 2>/dev/null` +ocf_version_cmp "$postfix_version" "2.5.0" +ret=$? + +# we need Postfix 2.5.0 or greater for status/data_directory support +if [ $ret -eq 1 -o $ret -eq 2 ]; then + status_support=true +fi + + +postfix_validate_all +ret=$? + +LSB_STATUS_STOPPED=3 +if [ $ret -ne $OCF_SUCCESS ]; then + case $1 in + stop) exit $OCF_SUCCESS ;; + *) exit $ret;; + esac +fi + +case $1 in + monitor) postfix_monitor + exit $? 
;;
    start)      postfix_start
                exit $?
                ;;

    stop)       postfix_stop
                exit $?
                ;;

    reload)     postfix_reload
                exit $?
                ;;

    validate-all)   exit $OCF_SUCCESS
                ;;

    *)          usage
                exit $OCF_ERR_UNIMPLEMENTED
                ;;
esac
diff --git a/heartbeat/pound b/heartbeat/pound
new file mode 100755
index 0000000..48aa221
--- /dev/null
+++ b/heartbeat/pound
@@ -0,0 +1,343 @@
#!/bin/sh
#
#
# Pound
#
# Description:  Manage pound instances as a HA resource
#
# Author:       Taro Matsuzawa <btm@tech.email.ne.jp>
#
# License:      GNU General Public License (GPL)
#
# See usage() for more details
#
# OCF instance parameters:
#   OCF_RESKEY_pid
#   OCF_RESKEY_binary
#   OCF_RESKEY_ctl_binary
#   OCF_RESKEY_socket_path
#   OCF_RESKEY_config
#   OCF_RESKEY_name
#   OCF_RESKEY_maxfiles
#
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

#######################################################################
# Set default parameter values

# Set the name first, as other defaults depend on it
OCF_RESKEY_name_default=${OCF_RESOURCE_INSTANCE}
: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}}

OCF_RESKEY_config_default=""
OCF_RESKEY_binary_default=pound
OCF_RESKEY_ctl_binary_default=poundctl
OCF_RESKEY_pid_default=/var/run/pound_${OCF_RESKEY_name}.pid
OCF_RESKEY_socket_path_default=/var/lib/pound/pound.cfg
OCF_RESKEY_maxfiles_default=""

: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
: ${OCF_RESKEY_ctl_binary=${OCF_RESKEY_ctl_binary_default}}
: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
: ${OCF_RESKEY_socket_path=${OCF_RESKEY_socket_path_default}}
: ${OCF_RESKEY_maxfiles=${OCF_RESKEY_maxfiles_default}}

# Print the OCF resource agent metadata (XML) on stdout.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="pound" version="1.0">
<version>1.0</version>

<longdesc lang="en">
The Pound Resource Agent can manage Pound instances.
</longdesc>
<shortdesc lang="en">Manage a Pound instance</shortdesc>

<parameters>

<parameter name="config" unique="1" required="1">
<longdesc lang="en">
The Pound configuration file that Pound should manage, for example
"/etc/pound.cfg".
</longdesc>
<shortdesc lang="en">Pound configuration file</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>

<parameter name="name" unique="1">
<longdesc lang="en">
Override the name of the instance that should be given to Pound
(defaults to the resource identifier).
</longdesc>
<shortdesc lang="en">Instance name</shortdesc>
<content type="string" default="${OCF_RESKEY_name_default}" />
</parameter>

<parameter name="pid" unique="1">
<longdesc lang="en">
Write the process's PID to the specified file.
The default will include the specified name, i.e.:
"/var/run/pound_production.pid". Unlike what this help message shows,
it is most likely not necessary to change this parameter.
</longdesc>
<shortdesc lang="en">Pidfile</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_default}" />
</parameter>

<parameter name="binary">
<longdesc lang="en">
This is used to start Pound server.
Normally use pound.
</longdesc>
<shortdesc lang="en"></shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>

<parameter name="ctl_binary">
<longdesc lang="en">
This is used to watch Pound status via Unix socket.
Normally use poundctl.
</longdesc>
<shortdesc lang="en"></shortdesc>
<content type="string" default="${OCF_RESKEY_ctl_binary_default}" />
</parameter>

<parameter name="socket_path">
<longdesc lang="en">
Write the process's Unix socket.
This parameter is same 'Control' parameter in configuration file, i.e.:
Control "/var/lib/pound/pound.cfg".
</longdesc>
<shortdesc lang="en"></shortdesc>
<content type="string" default="${OCF_RESKEY_socket_path_default}" />
</parameter>

<parameter name="maxfiles">
<longdesc lang="en">
Determines how many files pound is allowed to open at
a time. Helps to fix the 'Too many open files' error message.
</longdesc>
<shortdesc lang="en">Allowed number of open files.</shortdesc>
<content type="integer" default="${OCF_RESKEY_maxfiles_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
<action name="monitor" timeout="20s" interval="10s" depth="0" />
<action name="status" timeout="20s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="20s" />
</actions>
</resource-agent>
END
}

#######################################################################


# Print the usage text on stdout.
pound_usage() {
    cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Report the state of the Pound instance:
#   $OCF_SUCCESS      pidfile names a live process and poundctl succeeds
#   $OCF_ERR_GENERIC  pidfile exists but the process is gone, or poundctl fails
#   $OCF_NOT_RUNNING  no (or empty) pidfile
pound_status() {
    local pid
    local rc

    # FAILED = pidfile exist, but no running proc (or mismatch pid)
    # SUCCES = contents of pidfile == running process id
    # NOTRUN = no pidfile, no running process

    # check if pidfile exists and larger than 0 bytes
    if [ -s "$OCF_RESKEY_pid" ]; then
        # it does, now check if the pid exists
        pid=$(cat "$OCF_RESKEY_pid")
        ocf_run kill -s 0 $pid
        rc=$?
        if [ $rc -eq 0 ]; then
            ocf_log info "Pound is running"
            # check if the poundctl
            ocf_run $OCF_RESKEY_ctl_binary -c $OCF_RESKEY_socket_path
            p_rc=$?
            if [ "$p_rc" -eq 0 ]; then
                ocf_log info "poundctl reports success"
                return $OCF_SUCCESS
            else
                ocf_log err "poundctl reports error"
                return $OCF_ERR_GENERIC
            fi
        else
            ocf_log err "Pound PID file exists, but pound is not running"
            return $OCF_ERR_GENERIC
        fi
    fi

    return $OCF_NOT_RUNNING
}

# Start Pound after a configuration check and wait until pound_status
# reports success.  Returns $OCF_SUCCESS, $OCF_ERR_CONFIGURED or
# $OCF_ERR_GENERIC; exits with $OCF_ERR_GENERIC if the instance reaches a
# failed state while waiting.
pound_start() {
    local rc
    local backend_options

    pound_status
    rc=$?
    if [ $rc -eq $OCF_SUCCESS ]; then
        ocf_log info "Pound already running"
        return $OCF_SUCCESS
    fi

    # check configuration before start
    ocf_run $OCF_RESKEY_binary \
        -c -f $OCF_RESKEY_config
    c_rc=$?
    if [ "$c_rc" -ne 0 ]; then
        ocf_log err "Pound configuration file is not valid"
        return $OCF_ERR_CONFIGURED
    fi

    if [ -n "$OCF_RESKEY_maxfiles" ]; then
        ulimit -n $OCF_RESKEY_maxfiles
        u_rc=$?
        if [ "$u_rc" -ne 0 ]; then
            ocf_log warn "Could not set ulimit for open files for Pound to '$OCF_RESKEY_maxfiles'"
        fi
    fi

    ocf_run $OCF_RESKEY_binary \
        -f $OCF_RESKEY_config \
        -p $OCF_RESKEY_pid
    rc=$?
    if [ $rc -ne 0 ]; then
        ocf_log err "Pound failed to start"
        return $OCF_ERR_GENERIC
    fi

    # Spin waiting for pound to come up.
    # Let the CRM/LRM time us out if required
    while true; do
        pound_status
        rc=$?
        [ $rc -eq $OCF_SUCCESS ] && break
        if [ $rc -ne $OCF_NOT_RUNNING ]; then
            ocf_log err "Pound start failed"
            exit $OCF_ERR_GENERIC
        fi
        sleep 2
    done

    ocf_log info "Pound started succesfully"
    return $OCF_SUCCESS
}

# Stop Pound with SIGTERM, wait up to (action timeout - 5s), then fall back
# to SIGKILL.  Returns $OCF_SUCCESS, or $OCF_ERR_GENERIC when SIGTERM
# cannot be delivered.
pound_stop() {
    local rc
    local pid

    pound_status
    rc=$?
    if [ $rc -eq $OCF_NOT_RUNNING ]; then
        ocf_log info "Pound already stopped"
        return $OCF_SUCCESS
    fi

    # kill the pound process
    pid=$(cat "$OCF_RESKEY_pid")
    ocf_run kill -s 0 $pid
    rc=$?

    if [ $rc -ne 0 ]; then
        ocf_log warn "Pound pid is not a valid process. Assume it is already stopped"
        rm -f "$OCF_RESKEY_pid"
        return $OCF_SUCCESS
    fi

    ocf_run kill -s TERM $pid
    rc=$?

    if [ $rc -ne 0 ]; then
        ocf_log err "Pound failed to stop"
        return $OCF_ERR_GENERIC
    fi

    # stop waiting
    # Fall back to 20000 ms (the stop timeout advertised in the metadata)
    # when the cluster manager did not export a timeout, so the arithmetic
    # below always has an operand.
    shutdown_timeout=$(( (${OCF_RESKEY_CRM_meta_timeout:-20000} / 1000) - 5 ))
    count=0
    while [ $count -lt $shutdown_timeout ]; do
        # check if process still exists
        ocf_run kill -s 0 $pid
        rc=$?
        if [ $rc -ne 0 ]; then
            # Pound stopped succesfully, so let's delete the pidfile
            rm -f "$OCF_RESKEY_pid"
            break
        fi
        count=$((count + 1))
        sleep 1
        ocf_log info "Pound still hasn't stopped yet. Waiting..."
    done

    pound_status
    rc=$?
    if [ $rc -ne $OCF_NOT_RUNNING ]; then
        # Pound didn't quit on a SIGTERM, try SIGKILL
        ocf_log warn "Pound failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL..."
        ocf_run kill -s KILL $pid
        # delete the pidfile
        rm -f "$OCF_RESKEY_pid"
    fi

    ocf_log info "Pound stopped"
    return $OCF_SUCCESS
}


# Succeeds when the configured Pound configuration file exists.
# The operand is quoted: an empty "config" (the default) must fail here
# instead of turning the test into the always-true one-argument form.
pound_validate() {
    if [ -f "$OCF_RESKEY_config" ]; then
        return $OCF_SUCCESS
    else
        return $OCF_ERR_INSTALLED
    fi
}


case $__OCF_ACTION in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    start)
        pound_start
        ;;
    stop)
        pound_stop
        ;;
    monitor|status)
        pound_status
        ;;
    validate-all)
        pound_validate
        ;;
    usage|help)
        pound_usage
        exit $OCF_SUCCESS
        ;;
    *)
        pound_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc

diff --git a/heartbeat/proftpd b/heartbeat/proftpd
new file mode 100755
index 0000000..a9fc5ff
--- /dev/null
+++ b/heartbeat/proftpd
@@ -0,0 +1,311 @@
#!/bin/sh
#
# Resource script for Proftpd
#
# Description:  Manages Proftpd as an OCF resource in
#               an Active-Passive High Availability setup.
#
# Author:       Rajat Upadhyaya <urajat@novell.com> : Pure-FTPd script
# Author:       Achim Stumpf <hakim.news@googlemail.com> : Rewrite as Proftpd
# License:      GNU General Public License (GPL)
#
#
#       usage: $0 {start|stop|status|monitor|validate-all|meta-data}
#
#       The "start" arg starts Proftpd.
#
#       The "stop" arg stops it.
+# +# OCF parameters: +# OCF_RESKEY_binary +# OCF_RESKEY_conffile +# OCF_RESKEY_pidfile +# OCF_RESKEY_curl_binary +# OCF_RESKEY_curl_url +# OCF_RESKEY_test_user +# OCF_RESKEY_test_pass +# +########################################################################## +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_binary_default="/usr/sbin/proftpd" +OCF_RESKEY_conffile_default="/etc/proftpd.conf" +OCF_RESKEY_pidfile_default="/var/run/proftpd.pid" +OCF_RESKEY_curl_binary_default="/usr/bin/curl" +OCF_RESKEY_curl_url_default="ftp://localhost/" +OCF_RESKEY_test_user_default="test" +OCF_RESKEY_test_pass_default="" + +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_conffile=${OCF_RESKEY_conffile_default}} +: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}} +: ${OCF_RESKEY_curl_binary=${OCF_RESKEY_curl_binary_default}} +: ${OCF_RESKEY_curl_url=${OCF_RESKEY_curl_url_default}} +: ${OCF_RESKEY_test_user=${OCF_RESKEY_test_user_default}} +: ${OCF_RESKEY_test_pass=${OCF_RESKEY_test_pass_default}} + +USAGE="Usage: $0 {start|stop|status|monitor|validate-all|meta-data}"; + +########################################################################## + +usage() { + echo $USAGE >&2 +} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="proftpd" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> +This script manages Proftpd in an Active-Passive setup +</longdesc> +<shortdesc lang="en">OCF Resource Agent compliant FTP script.</shortdesc> + +<parameters> + +<parameter name="binary" unique="0" required="0"> +<longdesc lang="en">The Proftpd binary</longdesc> +<shortdesc lang="en">The Proftpd binary</shortdesc> +<content type="string" default="${OCF_RESKEY_binary_default}" /> +</parameter> + +<parameter name="conffile" unique="0" required="0"> +<longdesc lang="en"> +The Proftpd configuration 
file name with full path. +For example, "/etc/proftpd.conf" +</longdesc> +<shortdesc lang="en">Configuration file name with full path</shortdesc> +<content type="string" default="${OCF_RESKEY_conffile_default}" /> +</parameter> + +<parameter name="pidfile" unique="0" required="0"> +<longdesc lang="en">The Proftpd PID file. The location of the PID file is configured in the Proftpd configuration file.</longdesc> +<shortdesc lang="en">PID file</shortdesc> +<content type="string" default="${OCF_RESKEY_pidfile_default}" /> +</parameter> + +<parameter name="curl_binary" unique="0" required="0"> +<longdesc lang="en">The absolute path to the curl binary for monitoring with OCF_CHECK_LEVEL greater zero.</longdesc> +<shortdesc lang="en">The absolute path to the curl binary</shortdesc> +<content type="string" default="${OCF_RESKEY_curl_binary_default}" /> +</parameter> + +<parameter name="curl_url" unique="0" required="0"> +<longdesc lang="en">The URL which is checked by curl with OCF_CHECK_LEVEL greater zero.</longdesc> +<shortdesc lang="en">The URL which is checked by curl</shortdesc> +<content type="string" default="${OCF_RESKEY_curl_url_default}" /> +</parameter> + +<parameter name="test_user" unique="0" required="0"> +<longdesc lang="en">The name of the ftp user for monitoring with OCF_CHECK_LEVEL greater zero.</longdesc> +<shortdesc lang="en">The name of the ftp user</shortdesc> +<content type="string" default="${OCF_RESKEY_test_user_default}" /> +</parameter> + +<parameter name="test_pass" unique="0" required="0"> +<longdesc lang="en">The password of the ftp user for monitoring with OCF_CHECK_LEVEL greater zero.</longdesc> +<shortdesc lang="en">The password of the ftp user</shortdesc> +<content type="string" default="${OCF_RESKEY_test_pass_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="60s" /> +<action name="monitor" 
depth="10" timeout="20s" interval="120s" /> +<action name="validate-all" timeout="20s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END + exit $OCF_SUCCESS +} + +isRunning() +{ + kill -s 0 "$1" > /dev/null 2>&1 +} + +proftpd_status() +{ + if [ -f "$OCF_RESKEY_pidfile" ] + then + # Proftpd is probably running + PID=`head -n 1 $OCF_RESKEY_pidfile` + if [ ! -z "$PID" ] ; then + isRunning "$PID" && `ps -p $PID | grep proftpd > /dev/null 2>&1` + return $? + fi + fi + + # Proftpd is not running + return $OCF_NOT_RUNNING; +} + +proftpd_start() +{ + # make a few checks and start Proftpd + if ocf_is_root ; then : ; else + ocf_log err "You must be root" + exit $OCF_ERR_PERM + fi + + # if Proftpd is running return success + if proftpd_status ; then + ocf_log info "Proftpd is running already" + exit $OCF_SUCCESS + fi + + # starting Proftpd + ${OCF_RESKEY_binary} --config ${OCF_RESKEY_conffile} 2>/dev/null + + if [ "$?" -ne 0 ]; then + ocf_log err "Proftpd returned error" $? + exit $OCF_ERR_GENERIC + fi + + exit $OCF_SUCCESS +} + + +proftpd_stop() +{ + if proftpd_status ; then + PID=`head -n 1 $OCF_RESKEY_pidfile` + if [ ! -z "$PID" ]; then + ocf_log info "Killing Proftpd PID $PID" + kill $PID > /dev/null 2>&1 + if [ "$?" -eq 0 ]; then + TRIES=0 + while isRunning "$PID" && [ "$TRIES" -lt 30 ] + do + sleep 1 + ocf_log info "Proftpd PID $PID is still running" + TRIES=`expr $TRIES + 1` + done + isRunning "$PID" + RET=$? + if [ "$RET" -eq 0 ]; then + ocf_log info "Killing Proftpd PID $PID with SIGKILL" + kill -s 9 $PID > /dev/null 2>&1 + while isRunning "$PID" + do + sleep 1 + ocf_log info "Proftpd PID $PID is still running" + done + fi + else + ocf_log err "Killing Proftpd PID $PID FAILED" + exit $OCF_ERR_GENERIC + fi + fi + fi + + exit $OCF_SUCCESS +} + +proftpd_monitor() +{ + proftpd_status + RET=$? 
+ + # Shallow check: taken when the status check failed, or when no deeper + # check was requested. An unset OCF_CHECK_LEVEL is treated as level 0. + if [ "$RET" -ne 0 ] || [ "${OCF_CHECK_LEVEL:-0}" = 0 ]; then + if [ "$RET" -eq 0 ]; then + PID=`head -n 1 $OCF_RESKEY_pidfile` + ocf_log debug "Proftpd monitor on PID $PID succeeded" + return $OCF_SUCCESS + else + ocf_log debug "Proftpd monitor on PID $PID failed" + return $OCF_NOT_RUNNING + fi + else + # Deep check: log in with the test user through curl + ${OCF_RESKEY_curl_binary} -sS -u "${OCF_RESKEY_test_user}:${OCF_RESKEY_test_pass}" ${OCF_RESKEY_curl_url} > /dev/null 2>&1 + if [ "$?" -eq 0 ]; then + ocf_log debug "Proftpd monitor with curl on URL $OCF_RESKEY_curl_url succeeded" + return $OCF_SUCCESS + else + ocf_log err "Proftpd monitor with curl on URL $OCF_RESKEY_curl_url failed" + return $OCF_NOT_RUNNING + fi + fi +} + +# Check that the proftpd binary, the config file and the curl binary are +# present; exits with $OCF_ERR_INSTALLED or $OCF_ERR_CONFIGURED otherwise. +proftpd_validate_all() +{ + + # check that the proftpd binary exists + if [ ! -x "$OCF_RESKEY_binary" ]; then + ocf_log err "Proftpd binary $OCF_RESKEY_binary does not exist" + exit $OCF_ERR_INSTALLED + fi + + # check that the Proftpd config file exists + if [ ! -f "$OCF_RESKEY_conffile" ]; then + ocf_log err "Proftpd config file $OCF_RESKEY_conffile does not exist" + exit $OCF_ERR_CONFIGURED + fi + + # check that the curl binary exists + if [ ! -x "$OCF_RESKEY_curl_binary" ]; then + ocf_log err "$OCF_RESKEY_curl_binary does not exist" + exit $OCF_ERR_INSTALLED + fi +} + +# +# Main +# + +if [ $# -ne 1 ] +then + usage + exit $OCF_ERR_ARGS +fi + +case $1 in + start) proftpd_validate_all + proftpd_start + ;; + + stop) proftpd_stop + ;; + + status) if proftpd_status + then + ocf_log info "Proftpd is running" + exit $OCF_SUCCESS + else + ocf_log info "Proftpd is stopped" + exit $OCF_NOT_RUNNING + fi + ;; + + monitor) proftpd_monitor + exit $?
+ ;; + + validate-all) proftpd_validate_all + exit $OCF_SUCCESS + ;; + + meta-data) meta_data + ;; + + usage) usage + exit $OCF_SUCCESS + ;; + + *) usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + diff --git a/heartbeat/ra-api-1.dtd b/heartbeat/ra-api-1.dtd new file mode 100644 index 0000000..0a0a889 --- /dev/null +++ b/heartbeat/ra-api-1.dtd @@ -0,0 +1,40 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!ELEMENT resource-agent (version,longdesc,shortdesc,parameters?,actions) > +<!ATTLIST resource-agent + name CDATA #REQUIRED + version CDATA #IMPLIED> + +<!ELEMENT version (#PCDATA)> + +<!ELEMENT parameters (parameter*)> + +<!ELEMENT actions (action*)> + +<!ELEMENT parameter (longdesc+,shortdesc+,content)> +<!ATTLIST parameter + name CDATA #REQUIRED + required (1|0) "0" + unique (1|0) "0"> + +<!ELEMENT longdesc ANY> +<!ATTLIST longdesc + lang NMTOKEN #IMPLIED> + +<!ELEMENT shortdesc ANY> +<!ATTLIST shortdesc + lang NMTOKEN #IMPLIED> + +<!ELEMENT content EMPTY> +<!ATTLIST content + type (string|integer|boolean|time) #REQUIRED + default CDATA #IMPLIED> + +<!ELEMENT action EMPTY> +<!ATTLIST action + name (start|stop|recover|monitor|restart|migrate_to|migrate_from|promote|demote|notify|status|reload|meta-data|usage|methods|validate-all) #REQUIRED + timeout CDATA #REQUIRED + interval CDATA #IMPLIED + start-delay CDATA #IMPLIED + role CDATA #IMPLIED + depth CDATA #IMPLIED> diff --git a/heartbeat/rabbitmq-cluster.in b/heartbeat/rabbitmq-cluster.in new file mode 100755 index 0000000..0d36d95 --- /dev/null +++ b/heartbeat/rabbitmq-cluster.in @@ -0,0 +1,632 @@ +#!@BASH_SHELL@ +# +# Copyright (c) 2014 David Vossel <davidvossel@gmail.com> +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. 
+# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_set_policy_default="" + +: ${OCF_RESKEY_set_policy=${OCF_RESKEY_set_policy_default}} + +####################################################################### + +# This arbitrary value here is used by the rmq_start action to +# signify that the resource agent must retry the start process +# It might potentially conflict with OCF assigned error code +# in the future. +RMQ_TRY_RESTART_ERROR_CODE=126 + +RMQ_SERVER=/usr/sbin/rabbitmq-server +RMQ_CTL=/usr/sbin/rabbitmqctl +RMQ_EVAL="${RMQ_CTL} eval -q" +RMQ_DATA_DIR="/var/lib/rabbitmq/mnesia" +RMQ_PID_DIR="/var/run/rabbitmq" +RMQ_PID_FILE="/var/run/rabbitmq/rmq.pid" +RMQ_LOG_DIR="/var/log/rabbitmq" +if [ "$__OCF_ACTION" != "meta-data" ]; then + NODENAME=$(ocf_attribute_target) +fi + +# this attr represents the current active local rmq node name. 
+# when rmq stops or the node is fenced, this attr disappears +RMQ_CRM_ATTR_COOKIE="rmq-node-attr-${OCF_RESOURCE_INSTANCE}" +# this attr represents the last known active local rmq node name +# when rmq stops or the node is fenced, the attr stays forever so +# we can continue to map an offline pcmk node to its rmq node name +# equivalent. +RMQ_CRM_ATTR_COOKIE_LAST_KNOWN="rmq-node-attr-last-known-${OCF_RESOURCE_INSTANCE}" + +# Print the OCF meta-data XML of this agent on stdout. +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="rabbitmq-cluster" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Starts cloned rabbitmq cluster instance. NB: note that this RA +cannot be spawned across a mix of pacemaker and pacemaker-remote nodes. +Only on pacemaker *or* pacemaker-remote nodes exclusively. +</longdesc> +<shortdesc lang="en">rabbitmq clustered</shortdesc> + +<parameters> +<parameter name="set_policy" unique="1"> +<longdesc lang="en"> +Policy string to pass to 'rabbitmqctl set_policy' right after bootstrapping the first rabbitmq instance. +</longdesc> +<shortdesc lang="en">rabbitmqctl set_policy args</shortdesc> +<content type="string" default="${OCF_RESKEY_set_policy_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="100s" /> +<action name="stop" timeout="90s" /> +<action name="monitor" timeout="40s" interval="10s" depth="0" /> +<action name="meta-data" timeout="10s" /> +<action name="validate-all" timeout="20s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + +# Print a short usage message on stdout. +rmq_usage() { + cat <<END +usage: $0 {start|stop|monitor|notify|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set.
+END +} + +rmq_wipe_data() +{ + rm -rf $RMQ_DATA_DIR > /dev/null 2>&1 +} + +rmq_restore_users_perms_policies() +{ + # Restore users, user permissions, and policies (if any) + BaseDataDir=`dirname $RMQ_DATA_DIR` + $RMQ_EVAL " + %% Run only if Mnesia is ready. + lists:any(fun({mnesia,_,_}) -> true; ({_,_,_}) -> false end, application:which_applications()) andalso + begin + Restore = fun(Table, PostprocessFun, Filename) -> + case file:consult(Filename) of + {error, _} -> + ok; + {ok, [Result]} -> + lists:foreach(fun(X) -> mnesia:dirty_write(Table, PostprocessFun(X)) end, Result), + file:delete(Filename) + end + end, + + %% Restore users + + Upgrade = fun + ({internal_user, A, B, C}) -> {internal_user, A, B, C, rabbit_password_hashing_md5}; + ({internal_user, A, B, C, D}) -> {internal_user, A, B, C, D} + end, + + Downgrade = fun + ({internal_user, A, B, C}) -> {internal_user, A, B, C}; + ({internal_user, A, B, C, rabbit_password_hashing_md5}) -> {internal_user, A, B, C}; + %% Incompatible scheme, so we will loose user's password ('B' value) during conversion. + %% Unfortunately, this case will require manual intervention - user have to run: + %% rabbitmqctl change_password <A> <somenewpassword> + ({internal_user, A, B, C, _}) -> {internal_user, A, B, C} + end, + + %% Check db scheme first + [WildPattern] = ets:select(mnesia_gvar, [ { {{rabbit_user, wild_pattern}, '\\\$1'}, [], ['\\\$1'] } ]), + case WildPattern of + %% Version < 3.6.0 + {internal_user,'_','_','_'} -> + Restore(rabbit_user, Downgrade, \"$BaseDataDir/users.erl\"); + %% Version >= 3.6.0 + {internal_user,'_','_','_','_'} -> + Restore(rabbit_user, Upgrade, \"$BaseDataDir/users.erl\") + end, + + NoOp = fun(X) -> X end, + + %% Restore user permissions + Restore(rabbit_user_permission, NoOp, \"$BaseDataDir/users_perms.erl\"), + + %% Restore policies + Restore(rabbit_runtime_parameters, NoOp, \"$BaseDataDir/policies.erl\") + end. 
+ " +} + +# Print the local rabbitmq node name: taken from "rabbitmqctl status", +# falling back to RABBITMQ_NODENAME in /etc/rabbitmq/rabbitmq-env.conf. +rmq_local_node() +{ + + local node_name=$($RMQ_CTL status 2>&1 | sed -n -e "s/^.*[S|s]tatus of node \(.*\)\s.*$/\1/p" | tr -d "'") + + if [ -z "$node_name" ]; then + node_name=$(cat /etc/rabbitmq/rabbitmq-env.conf 2>/dev/null | grep "\s*RABBITMQ_NODENAME=" | awk -F= '{print $2}') + fi + + echo "$node_name" +} + +# Print the rabbitmq node names this instance may join: the names recorded +# in the CIB by online nodes or, if there are none, the recorded names that +# match a node crm_mon reports as online and not in standby. +rmq_join_list() +{ + local join_list=$(cibadmin -Q --xpath "//node_state[@crmd='online']//nvpair[@name='$RMQ_CRM_ATTR_COOKIE']" | grep "$RMQ_CRM_ATTR_COOKIE" | sed -n -e "s/^.*value=.\(.*\)\".*$/\1/p") + # If join_list is empty we want to check if there are any remote nodes + # where rabbitmq is allowed to run (i.e. nodes without the crmd=online selector) + if [ -z "$join_list" ]; then + # Get all the nodes written in the ATTR_COOKIE no matter if + # they are online or not. This will be one line per node like + # rabbit@overcloud-rabbit-0 + # rabbit@overcloud-rabbit-1 + # ... + local remote_join_list=$(cibadmin -Q --xpath "//node_state//nvpair[@name='$RMQ_CRM_ATTR_COOKIE']" | grep "$RMQ_CRM_ATTR_COOKIE" | sed -n -e "s/^.*value=.\(.*\)\".*$/\1/p") + # The following expression prepares a filter like '-e overcloud-rabbit-0 -e overcloud-rabbit-1 -e ...' + local filter=$(crm_mon_no_validation -r --as-xml | xmllint --format --xpath "//nodes//node[@online='true' and @standby='false']/@name" - | xargs -n1 echo | awk -F= '{print "-e "$2}') + # export the intersection which gives us only the nodes that + # a) wrote their name in the cib attrd + # b) run on nodes where pacemaker_remote is enabled + # Feed grep one node per line: an unquoted echo would fold the whole + # list onto a single line, so a single match would select every node. + # Skip grep entirely when no online node produced a filter. + if [ -n "$filter" ]; then + join_list="$(printf '%s\n' $remote_join_list | grep $filter)" + fi + fi + + echo $join_list +} + +rmq_write_nodename() +{ + local node_name=$(rmq_local_node) + + if [ -z "$node_name" ]; then + ocf_log err "Failed to determine rabbitmq node name, exiting" + exit $OCF_ERR_GENERIC + fi + + # store the pcmknode to rmq node mapping as a transient attribute. This allows + # us to retrieve the join list with a simple xpath.
+ ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "$RMQ_CRM_ATTR_COOKIE" -v "$node_name" + + # the pcmknode to rmq node mapping as a permanent attribute as well. this lets + # us continue to map offline nodes to their equivalent rmq node name + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l forever --name "$RMQ_CRM_ATTR_COOKIE_LAST_KNOWN" -v "$node_name" +} + +rmq_delete_nodename() +{ + # remove node-name + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "$RMQ_CRM_ATTR_COOKIE" -D +} + +prepare_dir () { + if [ ! -d ${1} ] ; then + mkdir -p ${1} + chown -R rabbitmq:rabbitmq ${1} + chmod 755 ${1} + fi +} + +remove_pid () { + rm -f ${RMQ_PID_FILE} > /dev/null 2>&1 +} + +rmq_app_running() { + if $RMQ_EVAL 'application:which_applications().' | grep -q '{rabbit,'; then + ocf_log debug "RabbitMQ application is running" + return $OCF_SUCCESS + else + ocf_log debug "RabbitMQ application is stopped" + rmq_delete_nodename + return $OCF_NOT_RUNNING + fi +} + +rmq_node_alive() { + if $RMQ_EVAL 'ok.'; then + ocf_log debug "RabbitMQ node is alive" + return $OCF_SUCCESS + else + ocf_log debug "RabbitMQ node is down" + rmq_delete_nodename + return $OCF_NOT_RUNNING + fi +} + +rmq_monitor() { + local rc + + status=$($RMQ_EVAL 'rabbit_mnesia:cluster_status_from_mnesia().' 
2>&1) + if echo "${status}" | grep -q '^{ok'; then + pcs_running=$(rmq_join_list | wc -w) + ocf_log debug "Pacemaker thinks ${pcs_running} RabbitMQ nodes are running" + rmq_running=$($RMQ_EVAL 'length(mnesia:system_info(running_db_nodes)).') + ocf_log debug "RabbitMQ thinks ${rmq_running} RabbitMQ nodes are running" + + if [ $(( $rmq_running * 2 )) -lt $pcs_running ]; then + ocf_log info "RabbitMQ is a minority partition, failing monitor" + rmq_delete_nodename + return $OCF_ERR_GENERIC + fi + + ocf_log debug "RabbitMQ server is running normally" + rmq_write_nodename + + return $OCF_SUCCESS + else + ocf_log info "RabbitMQ server could not get cluster status from mnesia" + ocf_log debug "${status}" + rmq_delete_nodename + return $OCF_NOT_RUNNING + fi +} + +rmq_init_and_wait() +{ + local rc + local wait_timeout + local timeout_string + + prepare_dir $RMQ_PID_DIR + prepare_dir $RMQ_LOG_DIR + remove_pid + + # the server startup script uses this environment variable + export RABBITMQ_PID_FILE="$RMQ_PID_FILE" + + setsid sh -c "$RMQ_SERVER > ${RMQ_LOG_DIR}/startup_log 2> ${RMQ_LOG_DIR}/startup_err" & + + ocf_log info "Waiting for server to start" + # We want to give the wait command almost the full startup timeout we are given + # So we use the start operation timeout (in ms), convert it and subtract 5 seconds + # In the silly case that it is less than 10 seconds we just skip setting the timeout + wait_timeout=`expr $OCF_RESKEY_CRM_meta_timeout / 1000 - 5` + if [ $wait_timeout -gt 10 ]; then + timeout_string="--timeout ${wait_timeout}" + else + timeout_string="" + fi + $RMQ_CTL $timeout_string wait $RMQ_PID_FILE + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + remove_pid + ocf_log info "rabbitmq-server start failed with a timeout of ($timeout_string): $rc" + return $OCF_ERR_GENERIC + fi + + rmq_app_running + return $? 
+} + +rmq_set_policy() +{ + $RMQ_CTL set_policy "$@" > /dev/null 2>&1 +} + +rmq_start_first() +{ + local rc + + ocf_log info "Bootstrapping rabbitmq cluster" + rmq_wipe_data + rmq_init_and_wait + rc=$? + + if [ $rc -eq 0 ]; then + rc=$OCF_SUCCESS + ocf_log info "cluster bootstrapped" + rmq_write_nodename + + if [ -n "$OCF_RESKEY_set_policy" ]; then + # do not quote set_policy, we are passing in arguments + rmq_set_policy $OCF_RESKEY_set_policy + if [ $? -ne 0 ]; then + ocf_log err "Failed to set policy: $OCF_RESKEY_set_policy" + rc=$OCF_ERR_GENERIC + else + ocf_log info "Policy set: $OCF_RESKEY_set_policy" + fi + fi + + else + ocf_log info "failed to bootstrap cluster. Check SELINUX policy" + rc=$OCF_ERR_GENERIC + fi + + return $rc +} + +rmq_is_clustered() +{ + $RMQ_EVAL 'rabbit_mnesia:is_clustered().' | grep -q true +} + +rmq_join_existing() +{ + local join_list="$1" + local rc=$OCF_ERR_GENERIC + + ocf_log info "Joining existing cluster with [ $(echo $join_list | tr '\n' ' ') ] nodes." + rmq_init_and_wait + if [ $? -ne 0 ]; then + return $OCF_ERR_GENERIC + fi + + if rmq_is_clustered; then + ocf_log info "Successfully re-joined existing rabbitmq cluster automatically" + return $OCF_SUCCESS + fi + + # unconditionally join the cluster + $RMQ_CTL stop_app > /dev/null 2>&1 + for node in $(echo "$join_list"); do + ocf_log info "Attempting to join cluster with target node $node" + $RMQ_CTL join_cluster $node + if [ $? -eq 0 ]; then + ocf_log info "Joined cluster by connecting to node $node, starting app" + $RMQ_CTL start_app + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "'$RMQ_CTL start_app' failed" + fi + break; + fi + done + + if [ "$rc" -ne 0 ]; then + ocf_log info "Join process incomplete, shutting down." 
+ return $OCF_ERR_GENERIC + fi + + ocf_log info "Successfully joined existing rabbitmq cluster" + return $OCF_SUCCESS +} + +rmq_forget_cluster_node_remotely() { + local running_cluster_nodes="$1" + local node_to_forget="$2" + + ocf_log info "Forgetting $node_to_forget via nodes [ $(echo $running_cluster_nodes | tr '\n' ' ') ]." + for running_cluster_node in $running_cluster_nodes; do + $RMQ_CTL -n $running_cluster_node forget_cluster_node $node_to_forget + if [ $? = 0 ]; then + ocf_log info "Succeeded forgetting $node_to_forget via $running_cluster_node." + return + else + ocf_log err "Failed to forget node $node_to_forget via $running_cluster_node." + fi + done +} + +rmq_notify() { + node_list="${OCF_RESKEY_CRM_meta_notify_stop_uname}" + mode="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" + + + # When notifications are on, this agent is going to "forget" nodes once they + # leave the cluster. This is thought to resolve some issues where rabbitmq + # blocks trying to sync with an offline node after a fencing action occurs. + if ! [ "${mode}" = "post-stop" ]; then + return $OCF_SUCCESS + fi + + rmq_monitor + if [ $? -ne $OCF_SUCCESS ]; then + # only run forget when we are for sure active + return $OCF_SUCCESS + fi + + # forget each stopped rmq instance in the provided pcmk node in the list. + for node in $(echo "$node_list"); do + local rmq_node="$(${HA_SBIN_DIR}/crm_attribute -N $(ocf_attribute_target $node) -l forever --query --name $RMQ_CRM_ATTR_COOKIE_LAST_KNOWN -q)" + if [ -z "$rmq_node" ]; then + ocf_log warn "Unable to map pcmk node $node to a known rmq node." + continue + fi + ocf_log notice "Forgetting stopped node $rmq_node" + $RMQ_CTL forget_cluster_node $rmq_node + if [ $? -ne 0 ]; then + ocf_log warn "Unable to forget offline node $rmq_node." + fi + done + return $OCF_SUCCESS +} + +rmq_try_start() { + local join_list="" + local rc + + rmq_monitor + if [ $? 
-eq $OCF_SUCCESS ]; then + return $OCF_SUCCESS + fi + + join_list=$(rmq_join_list) + + # No join list means no active instances are up. This instance + # is the first, so it needs to bootstrap the rest + if [ -z "$join_list" ]; then + rmq_start_first + rc=$? + rmq_restore_users_perms_policies + return $rc + fi + + # Try to join existing cluster + ocf_log info "wiping data directory before joining" + local local_rmq_node="$(${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l forever --query --name $RMQ_CRM_ATTR_COOKIE_LAST_KNOWN -q)" + + rmq_stop + rmq_wipe_data + if [ -z "$local_rmq_node" ]; then + ocf_log warn "Unable to forget the cluster node because local node name cannot be detected" + else + rmq_forget_cluster_node_remotely "$join_list" "$local_rmq_node" + fi + rmq_join_existing "$join_list" + rc=$? + + if [ $rc -ne 0 ]; then + # we could not join the rabbitmq cluster from any of the running nodes + # this might be due to a unexpected reset of those nodes. Give ourself + # a chance to start by retrying the entire start sequence. + + ocf_log warn "Failed to join the RabbitMQ cluster from nodes ${join_list}. Stopping local unclustered rabbitmq" + rmq_stop + + ocf_log warn "Re-detect available rabbitmq nodes and try to start again" + # return an unused OCF value to signify a "retry" condition + return $RMQ_TRY_RESTART_ERROR_CODE + fi + + rmq_restore_users_perms_policies + + return $OCF_SUCCESS +} + +rmq_start() { + local rc=$RMQ_TRY_RESTART_ERROR_CODE + while [ $rc -eq $RMQ_TRY_RESTART_ERROR_CODE ]; do + rmq_try_start + rc=$? + done + return $rc +} + +rmq_stop() { + # Backup users, user permissions, and policies + BaseDataDir=`dirname $RMQ_DATA_DIR` + $RMQ_EVAL " + %% Run only if Mnesia is still available. 
+ lists:any(fun({mnesia,_,_}) -> true; ({_,_,_}) -> false end, application:which_applications()) andalso + begin + Backup = fun(Table, SelectPattern, Filter, Filename) -> + Result = case catch mnesia:dirty_select(Table, [{SelectPattern, [Filter], ['\\\$_']}]) of + {'EXIT', _} -> []; + Any -> Any + end, + Result /= [] andalso file:write_file(Filename, io_lib:fwrite(\"~p.~n\", [Result])) + end, + + %% Backup users + %% Check db scheme first + [WildPattern] = ets:select(mnesia_gvar, [ { {{rabbit_user, wild_pattern}, '\\\$1'}, [], ['\\\$1'] } ]), + UsersSelectPattern = case WildPattern of + %% Version < 3.6.0 + {internal_user,'_','_','_'} -> {internal_user, '\\\$1', '_', '_'}; + %% Version >= 3.6.0 + {internal_user,'_','_','_','_'} -> {internal_user, '\\\$1', '_', '_', '_'} + end, + Backup(rabbit_user, UsersSelectPattern, {'/=', '\\\$1', <<\"guest\">>}, \"$BaseDataDir/users.erl\"), + + %% Backup user permissions + Backup(rabbit_user_permission, {'\\\$1', {'\\\$2', '\\\$3','\\\$4'}, '\\\$5'}, {'/=', '\\\$3', <<\"guest\">>}, \"$BaseDataDir/users_perms.erl\"), + + %% Backup policies + Backup(rabbit_runtime_parameters, {runtime_parameters, {'_', '\\\$1', '_'}, '_'}, {'==', '\\\$1', <<\"policy\">>}, \"$BaseDataDir/policies.erl\") + end. + " + + rmq_node_alive + if [ $? -eq $OCF_NOT_RUNNING ]; then + return $OCF_SUCCESS + fi + + $RMQ_CTL stop + rc=$? + + if [ $rc -ne 0 ]; then + ocf_log err "rabbitmq-server stop command failed: $RMQ_CTL stop, $rc" + rmq_delete_nodename + return $rc + fi + + #TODO add kill logic + stop_wait=1 + while [ $stop_wait = 1 ]; do + rmq_app_running + rc=$? 
+ if [ "$rc" -eq $OCF_NOT_RUNNING ]; then + stop_wait=0 + break + elif [ "$rc" -ne $OCF_SUCCESS ]; then + ocf_log info "rabbitmq-server stop failed: $rc" + rmq_delete_nodename + exit $OCF_ERR_GENERIC + fi + sleep 1 + done + + rmq_delete_nodename + remove_pid + return $OCF_SUCCESS +} + +rmq_validate() { + check_binary $RMQ_SERVER + check_binary $RMQ_CTL + + # This resource only makes sense as a clone right now. at some point + # we may want to verify the following. + #TODO verify cloned + #TODO verify ordered=true + + # Given that this resource does the cluster join explicitly, + # having a cluster_nodes list in the static config file will + # likely conflict with this agent. + #TODO verify no cluster list in rabbitmq conf + #cat /etc/rabbitmq/rabbitmq.config | grep "cluster_nodes" + + return $OCF_SUCCESS +} + +case $__OCF_ACTION in +meta-data) meta_data + exit $OCF_SUCCESS + ;; +start) rmq_start;; +stop) rmq_stop;; +monitor) rmq_monitor;; +validate-all) rmq_validate;; +notify) rmq_notify;; +usage|help) rmq_usage + exit $OCF_SUCCESS + ;; +*) rmq_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc + diff --git a/heartbeat/rabbitmq-server-ha b/heartbeat/rabbitmq-server-ha new file mode 100755 index 0000000..8b3cd9e --- /dev/null +++ b/heartbeat/rabbitmq-server-ha @@ -0,0 +1,2444 @@ +#!/bin/sh +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# See usage() function below for more details ... 
+# +# Note that the script uses an external file to setup RabbitMQ policies +# so make sure to create it from an example shipped with the package. +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + +# Fill in some defaults if no values are specified + +PATH=/sbin:/usr/sbin:/bin:/usr/bin + +OCF_RESKEY_binary_default="/usr/sbin/rabbitmq-server" +OCF_RESKEY_ctl_default="/usr/sbin/rabbitmqctl" +OCF_RESKEY_debug_default=false +OCF_RESKEY_username_default="rabbitmq" +OCF_RESKEY_groupname_default="rabbitmq" +OCF_RESKEY_admin_user_default="guest" +OCF_RESKEY_admin_password_default="guest" +OCF_RESKEY_definitions_dump_file_default="/etc/rabbitmq/definitions" +OCF_RESKEY_pid_file_default="/var/run/rabbitmq/pid" +OCF_RESKEY_log_dir_default="/var/log/rabbitmq" +OCF_RESKEY_mnesia_base_default="/var/lib/rabbitmq/mnesia" +OCF_RESKEY_mnesia_schema_base_default="/var/lib/rabbitmq" +OCF_RESKEY_host_ip_default="127.0.0.1" +OCF_RESKEY_node_port_default=5672 +OCF_RESKEY_default_vhost_default="/" +OCF_RESKEY_erlang_cookie_default=false +OCF_RESKEY_erlang_cookie_file_default="/var/lib/rabbitmq/.erlang.cookie" +OCF_RESKEY_use_fqdn_default=false +OCF_RESKEY_fqdn_prefix_default="" +OCF_RESKEY_max_rabbitmqctl_timeouts_default=3 +OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy" +OCF_RESKEY_rmq_feature_health_check_default=true +OCF_RESKEY_rmq_feature_local_list_queues_default=true +OCF_RESKEY_limit_nofile_default=65535 +OCF_RESKEY_avoid_using_iptables_default=false +OCF_RESKEY_allowed_cluster_nodes_default="" + +: ${HA_LOGTAG="lrmd"} +: ${HA_LOGFACILITY="daemon"} +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_ctl=${OCF_RESKEY_ctl_default}} +: ${OCF_RESKEY_debug=${OCF_RESKEY_debug_default}} +: 
${OCF_RESKEY_username=${OCF_RESKEY_username_default}} +: ${OCF_RESKEY_groupname=${OCF_RESKEY_groupname_default}} +: ${OCF_RESKEY_admin_user=${OCF_RESKEY_admin_user_default}} +: ${OCF_RESKEY_admin_password=${OCF_RESKEY_admin_password_default}} +: ${OCF_RESKEY_definitions_dump_file=${OCF_RESKEY_definitions_dump_file_default}} +: ${OCF_RESKEY_log_dir=${OCF_RESKEY_log_dir_default}} +: ${OCF_RESKEY_mnesia_base=${OCF_RESKEY_mnesia_base_default}} +: ${OCF_RESKEY_mnesia_schema_base=${OCF_RESKEY_mnesia_schema_base_default}} +: ${OCF_RESKEY_pid_file=${OCF_RESKEY_pid_file_default}} +: ${OCF_RESKEY_node_port=${OCF_RESKEY_node_port_default}} +: ${OCF_RESKEY_default_vhost=${OCF_RESKEY_default_vhost_default}} +: ${OCF_RESKEY_erlang_cookie=${OCF_RESKEY_erlang_cookie_default}} +: ${OCF_RESKEY_erlang_cookie_file=${OCF_RESKEY_erlang_cookie_file_default}} +: ${OCF_RESKEY_use_fqdn=${OCF_RESKEY_use_fqdn_default}} +: ${OCF_RESKEY_fqdn_prefix=${OCF_RESKEY_fqdn_prefix_default}} +: ${OCF_RESKEY_max_rabbitmqctl_timeouts=${OCF_RESKEY_max_rabbitmqctl_timeouts_default}} +: ${OCF_RESKEY_policy_file=${OCF_RESKEY_policy_file_default}} +: ${OCF_RESKEY_rmq_feature_health_check=${OCF_RESKEY_rmq_feature_health_check_default}} +: ${OCF_RESKEY_rmq_feature_local_list_queues=${OCF_RESKEY_rmq_feature_local_list_queues_default}} +: ${OCF_RESKEY_limit_nofile=${OCF_RESKEY_limit_nofile_default}} +: ${OCF_RESKEY_avoid_using_iptables=${OCF_RESKEY_avoid_using_iptables_default}} +: ${OCF_RESKEY_allowed_cluster_nodes=${OCF_RESKEY_allowed_cluster_nodes_default}} + +####################################################################### + +OCF_RESKEY_CRM_meta_timeout_default=30000 +: ${OCF_RESKEY_CRM_meta_timeout=${OCF_RESKEY_CRM_meta_timeout_default}} +OCF_RESKEY_start_time_default=$((OCF_RESKEY_CRM_meta_timeout / 6000 + 2)) +: ${OCF_RESKEY_start_time=${OCF_RESKEY_start_time_default}} +OCF_RESKEY_stop_time_default=${OCF_RESKEY_start_time_default} +: ${OCF_RESKEY_stop_time=${OCF_RESKEY_start_time_default}} 
OCF_RESKEY_command_timeout_default=""
: ${OCF_RESKEY_command_timeout=${OCF_RESKEY_command_timeout_default}}
# Duration handed to /usr/bin/timeout for every wrapped command; it is
# derived from the operation timeout found in OCF_RESKEY_CRM_meta_timeout.
TIMEOUT_ARG=$((OCF_RESKEY_CRM_meta_timeout / 6000 + 30))
# Default command prefix used by su_rabbit_cmd() and erl_eval().
COMMAND_TIMEOUT="/usr/bin/timeout ${OCF_RESKEY_command_timeout} ${TIMEOUT_ARG}"
# Resource name without the clone instance suffix (text before the first ':').
RESOURCE_NAME=`echo $OCF_RESOURCE_INSTANCE | cut -d ":" -f 1`

#######################################################################

# Print a short help text listing the actions this agent understands.
usage() {
    cat <<UEND
    usage: $0 (start|stop|promote|demote|notify|validate-all|meta-data|status|monitor)

    $0 manages an ${OCF_RESKEY_binary} process as an HA resource

    The 'start' operation starts the RabbitMQ server.
    The 'stop' operation stops the RabbitMQ server.
    The 'promote' operation promotes this node
    The 'demote' operation demotes this node
    The 'notify' operation handles cluster notifications
    The 'validate-all' operation reports whether the parameters are valid
    The 'meta-data' operation reports this RA's meta-data information
    The 'status' operation reports whether the RabbitMQ server is running
    The 'monitor' operation reports whether the RabbitMQ server seems to be working

UEND
}

# Print the OCF meta-data (XML) describing the parameters and actions of
# this resource agent.
#
# NOTE(review): "definitions_dump_file" and "allowed_cluster_nodes" print the
# current parameter value as their default instead of a *_default variable
# like every other parameter; the matching *_default variables are not
# visible from here, so this is left untouched -- confirm.
meta_data() {
    # The EXTENDED_OCF_PARAMS parameter below does not exist by default
    # and hence converted to an empty string unless overridden. It
    # could be used by an extension script to add new parameters. For
    # example see https://review.openstack.org/#/c/249180/10

    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="rabbitmq-server-ha">
<version>1.0</version>

<longdesc lang="en">
Resource agent for RabbitMQ promotes a node, then cluster nodes can join it
</longdesc>
<shortdesc lang="en">Resource agent for RabbitMQ HA cluster</shortdesc>
<parameters>

<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
RabbitMQ binary
</longdesc>
<shortdesc lang="en">RabbitMQ binary</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>

<parameter name="ctl" unique="0" required="0">
<longdesc lang="en">
rabbitmqctl binary
</longdesc>
<shortdesc lang="en">rabbitmqctl binary</shortdesc>
<content type="string" default="${OCF_RESKEY_ctl_default}" />
</parameter>

<parameter name="pid_file" unique="0" required="0">
<longdesc lang="en">
RabbitMQ PID file
</longdesc>
<shortdesc lang="en">RabbitMQ PID file</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_file_default}" />
</parameter>

<parameter name="log_dir" unique="0" required="0">
<longdesc lang="en">
RabbitMQ log directory
</longdesc>
<shortdesc lang="en">RabbitMQ log directory</shortdesc>
<content type="string" default="${OCF_RESKEY_log_dir_default}" />
</parameter>

<parameter name="username" unique="0" required="0">
<longdesc lang="en">
RabbitMQ user name
</longdesc>
<shortdesc lang="en">RabbitMQ user name</shortdesc>
<content type="string" default="${OCF_RESKEY_username_default}" />
</parameter>

<parameter name="groupname" unique="0" required="0">
<longdesc lang="en">
RabbitMQ group name
</longdesc>
<shortdesc lang="en">RabbitMQ group name</shortdesc>
<content type="string" default="${OCF_RESKEY_groupname_default}" />
</parameter>

<parameter name="admin_user" unique="0" required="0">
<longdesc lang="en">
RabbitMQ default admin user for API
</longdesc>
<shortdesc lang="en">RabbitMQ admin user</shortdesc>
<content type="string" default="${OCF_RESKEY_admin_user_default}" />
</parameter>

<parameter name="admin_password" unique="0" required="0">
<longdesc lang="en">
RabbitMQ default admin user password for API
</longdesc>
<shortdesc lang="en">RabbitMQ admin password</shortdesc>
<content type="string" default="${OCF_RESKEY_admin_password_default}" />
</parameter>

<parameter name="definitions_dump_file" unique="0" required="0">
<longdesc lang="en">
RabbitMQ default definitions dump file
</longdesc>
<shortdesc lang="en">RabbitMQ definitions dump file</shortdesc>
<content type="string" default="${OCF_RESKEY_definitions_dump_file}" />
</parameter>

<parameter name="command_timeout" unique="0" required="0">
<longdesc lang="en">
Timeout command arguments for issued commands termination (value is auto evaluated)
</longdesc>
<shortdesc lang="en">Arguments for timeout wrapping command</shortdesc>
<content type="string" default="${OCF_RESKEY_command_timeout_default}" />
</parameter>

<parameter name="start_time" unique="0" required="0">
<longdesc lang="en">
Timeout for start rabbitmq server
</longdesc>
<shortdesc lang="en">Timeout for start rabbitmq server</shortdesc>
<content type="string" default="${OCF_RESKEY_start_time_default}" />
</parameter>

<parameter name="stop_time" unique="0" required="0">
<longdesc lang="en">
Timeout for stopping rabbitmq server
</longdesc>
<shortdesc lang="en">Timeout for stopping rabbitmq server</shortdesc>
<content type="string" default="${OCF_RESKEY_stop_time_default}" />
</parameter>

<parameter name="debug" unique="0" required="0">
<longdesc lang="en">
The debug flag for agent (${OCF_RESKEY_binary}) instance.
In the /tmp/ directory will be created rmq-* files for log
some operations and ENV values inside OCF-script.
</longdesc>
<shortdesc lang="en">AMQP server (${OCF_RESKEY_binary}) debug flag</shortdesc>
<content type="boolean" default="${OCF_RESKEY_debug_default}" />
</parameter>

<parameter name="mnesia_base" unique="0" required="0">
<longdesc lang="en">
Base directory for storing Mnesia files
</longdesc>
<shortdesc lang="en">Base directory for storing Mnesia files</shortdesc>
<content type="string" default="${OCF_RESKEY_mnesia_base_default}" />
</parameter>

<parameter name="mnesia_schema_base" unique="0" required="0">
<longdesc lang="en">
Parent directory for Mnesia schema directory
</longdesc>
<shortdesc lang="en">Parent directory for Mnesia schema directory</shortdesc>
<content type="string" default="${OCF_RESKEY_mnesia_schema_base_default}" />
</parameter>

<parameter name="host_ip" unique="0" required="0">
<longdesc lang="en">
${OCF_RESKEY_binary} should listen on this IP address
</longdesc>
<shortdesc lang="en">${OCF_RESKEY_binary} should listen on this IP address</shortdesc>
<content type="string" default="${OCF_RESKEY_host_ip_default}" />
</parameter>

<parameter name="node_port" unique="0" required="0">
<longdesc lang="en">
${OCF_RESKEY_binary} should listen on this port
</longdesc>
<shortdesc lang="en">${OCF_RESKEY_binary} should listen on this port</shortdesc>
<content type="integer" default="${OCF_RESKEY_node_port_default}" />
</parameter>

<parameter name="default_vhost" unique="0" required="0">
<longdesc lang="en">
Default virtual host used for monitoring if a node is fully synchronized with
the rest of the cluster. In normal operation, the resource agent will wait for
queues from this virtual host on this node to be synchronized elsewhere before
stopping RabbitMQ. This also means queues in other virtual hosts may not be
fully synchronized on stop operations.
</longdesc>
<shortdesc lang="en">Default virtual host used for waiting for synchronization</shortdesc>
<content type="string" default="${OCF_RESKEY_default_vhost_default}" />
</parameter>

<parameter name="erlang_cookie" unique="0" required="0">
<longdesc lang="en">
Erlang cookie for clustering. If specified, will be updated at the mnesia reset
</longdesc>
<shortdesc lang="en">Erlang cookie</shortdesc>
<content type="string" default="${OCF_RESKEY_erlang_cookie_default}" />
</parameter>

<parameter name="erlang_cookie_file" unique="0" required="0">
<longdesc lang="en">
Erlang cookie file path where the cookie will be put, if requested
</longdesc>
<shortdesc lang="en">Erlang cookie file</shortdesc>
<content type="string" default="${OCF_RESKEY_erlang_cookie_file_default}" />
</parameter>

<parameter name="use_fqdn" unique="0" required="0">
<longdesc lang="en">
Either to use FQDN or a shortname for the rabbitmq node
</longdesc>
<shortdesc lang="en">Use FQDN</shortdesc>
<content type="boolean" default="${OCF_RESKEY_use_fqdn_default}" />
</parameter>

<parameter name="fqdn_prefix" unique="0" required="0">
<longdesc lang="en">
Optional FQDN prefix for RabbitMQ nodes in cluster.
FQDN prefix can be specified to host multiple RabbitMQ instances on a node or
in case of RabbitMQ running in dedicated network/interface.
</longdesc>
<shortdesc lang="en">FQDN prefix</shortdesc>
<content type="string" default="${OCF_RESKEY_fqdn_prefix_default}" />
</parameter>

<parameter name="max_rabbitmqctl_timeouts" unique="0" required="0">
<longdesc lang="en">
If during monitor call rabbitmqctl times out, the timeout is ignored
unless it is Nth timeout in a row. Here N is the value of the current parameter.
If too many timeouts happen in a row, the monitor call will return with error.
</longdesc>
<shortdesc lang="en">Fail only if that many rabbitmqctl timeouts in a row occurred</shortdesc>
<content type="string" default="${OCF_RESKEY_max_rabbitmqctl_timeouts_default}" />
</parameter>

<parameter name="policy_file" unique="0" required="0">
<longdesc lang="en">
A path to the shell script to setup RabbitMQ policies
</longdesc>
<shortdesc lang="en">A policy file path</shortdesc>
<content type="string" default="${OCF_RESKEY_policy_file_default}" />
</parameter>

<parameter name="rmq_feature_health_check" unique="0" required="0">
<longdesc lang="en">
Since rabbit 3.6.4 list_queues/list_channels-based monitoring should
be replaced with "node_health_check" command, as it creates no network
load at all.
</longdesc>
<shortdesc lang="en">Use node_health_check for monitoring</shortdesc>
<content type="boolean" default="${OCF_RESKEY_rmq_feature_health_check_default}" />
</parameter>

<parameter name="rmq_feature_local_list_queues" unique="0" required="0">
<longdesc lang="en">
For rabbit version that implements --local flag for list_queues, this
can greatly reduce network overhead in cases when node is
stopped/demoted.
</longdesc>
<shortdesc lang="en">Use --local option for list_queues</shortdesc>
<content type="boolean" default="${OCF_RESKEY_rmq_feature_local_list_queues_default}" />
</parameter>

<parameter name="limit_nofile" unique="0" required="0">
<longdesc lang="en">
Soft and hard limit for NOFILE
</longdesc>
<shortdesc lang="en">NOFILE limit</shortdesc>
<content type="string" default="${OCF_RESKEY_limit_nofile_default}" />
</parameter>

<parameter name="avoid_using_iptables" unique="0" required="0">
<longdesc lang="en">
When set to true the iptables calls to block client access become
noops. This is useful when we run inside containers.
</longdesc>
<shortdesc lang="en">Disable iptables use entirely</shortdesc>
<content type="boolean" default="${OCF_RESKEY_avoid_using_iptables_default}" />
</parameter>

<parameter name="allowed_cluster_nodes" unique="0" required="0">
<longdesc lang="en">
When set to anything other than the empty string it must contain the list of
cluster node names, separated by spaces, where the rabbitmq resource is allowed to run.
This is needed when rabbitmq is running on a subset of nodes part of a larger
cluster. The default ("") is to assume that all nodes part of the cluster will
run the rabbitmq resource.
</longdesc>
<shortdesc lang="en">List of cluster nodes where rabbitmq is allowed to run</shortdesc>
<content type="string" default="${OCF_RESKEY_allowed_cluster_nodes}" />
</parameter>

$EXTENDED_OCF_PARAMS

</parameters>

<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
<action name="status" timeout="20s" />
<action name="monitor" depth="0" timeout="30s" interval="5s" />
<action name="monitor" depth="0" timeout="30s" interval="3s" role="Promoted"/>
<action name="promote" timeout="30s" />
<action name="demote" timeout="30s" />
<action name="notify" timeout="20s" />
<action name="validate-all" timeout="5s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}


MIN_MASTER_SCORE=100
BEST_MASTER_SCORE=1000


#######################################################################
# Functions invoked by resource manager actions

#TODO(bogdando) move proc_kill, proc_stop to shared OCF functions
# to be shipped with HA cluster packages
###########################################################
# Attempts to kill a process with retries and checks procfs
# to make sure the process is stopped.
#
# Globals:
#   LL
# Arguments:
#   $1 - pid of the process to try and kill
#   $2 - service name used for logging and match-based kill, if the pid is "none"
#   $3 - signal to use, defaults to SIGTERM
#   $4 - number of retries, defaults to 5
#   $5 - time to sleep between retries, defaults to 2
# Returns:
#   0 - if successful
#   1 - if process is still running according to procfs
#   2 - if invalid parameters passed in
###########################################################
proc_kill()
{
    local pid="${1}"
    local service_name="${2}"
    local signal="${3:-SIGTERM}"
    local count="${4:-5}"
    local process_sleep="${5:-2}"
    local LH="${LL} proc_kill():"
    local pgrp

    # Enforce the documented contract: the pid must be either the literal
    # "none" or a decimal number. An empty or garbage pid would otherwise
    # end up as a bogus process group handed to pkill.
    case "${pid}" in
        none)
            ;;
        ''|*[!0-9]*)
            ocf_log err "${LH} invalid pid '${pid}' passed in"
            return 2
            ;;
    esac

    pgrp="$(ps -o pgid= ${pid} 2>/dev/null | tr -d '[[:space:]]')"

    if [ "${pgrp}" = "1" ] ; then
        ocf_log err "${LH} shall not kill by the bad pid 1 (init)!"
        return 2
    fi

    if [ "${pid}" = "none" ]; then
        # No pid known: match processes by their full command line.
        # The pattern is quoted so that the shell never glob-expands it.
        local matched
        matched="$(pgrep -fla "${service_name}")"
        if [ -z "${matched}" ] ; then
            ocf_log info "${LH} cannot find any processes matching the ${service_name}, considering target process to be already dead"
            return 0
        fi
        ocf_log debug "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}"
        while [ $count -gt 0 ]; do
            if [ -z "${matched}" ]; then
                break
            else
                matched="$(pgrep -fla "${service_name}")"
                ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
                ocf_run pkill -f -"${signal}" "${service_name}"
            fi
            sleep $process_sleep
            count=$(( count-1 ))
        done
        # Final verdict: is anything matching still alive?
        if pgrep -f "${service_name}" > /dev/null ; then
            ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
            return 1
        fi
        ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
        return 0
    fi

    # pid is not none: signal the whole process group of the pid until the
    # pid disappears from procfs or the retries are exhausted
    while [ $count -gt 0 ]; do
        if [ ! -d "/proc/${pid}" ]; then
            break
        else
            ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
            ocf_run pkill -"${signal}" -g "${pgrp}"
        fi
        sleep $process_sleep
        count=$(( count-1 ))
    done

    # Check if the process ended after the last sleep
    if [ ! -d "/proc/${pid}" ] ; then
        ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
        return 0
    fi

    ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
    return 1
}

###########################################################
# Attempts to kill a process with the given pid or pid file
# using proc_kill and will retry with sigkill if sigterm is
# unsuccessful.
#
# Globals:
#   OCF_ERR_GENERIC
#   OCF_SUCCESS
#   LL
# Arguments:
#   $1 - pidfile or pid or 'none', if stopping by the name matching
#   $2 - service name used for logging or for the failback stopping method
#   $3 - stop process timeout (in sec), used to determine how many times we try
#        SIGTERM and an upper limit on how long this function should try and
#        stop the process. Defaults to 15.
# Returns:
#   OCF_SUCCESS - if successful
#   OCF_ERR_GENERIC - if process is still running according to procfs
###########################################################
proc_stop()
{
    local pid_param="${1}"
    local service_name="${2}"
    local timeout="${3:-15}"
    local LH="${LL} proc_stop():"
    local i
    local pid
    local pidfile

    if [ "${pid_param}" = "none" ] ; then
        pid="none"
    elif echo "${pid_param}" | grep -Eq '^[0-9]+$' ; then
        # a bare number was passed in
        pid="${pid_param}"
    elif [ -e "${pid_param}" ]; then
        # a pid file was passed in; keep only the numeric entries so that
        # a corrupted file falls back to the name matching below
        pidfile="${pid_param}"
        pid=$(cat "${pidfile}" 2>/dev/null | tr -s " " "\n" | grep -E '^[0-9]+$' | sort -u)
    else
        ocf_log warn "${LH} pid param ${pid_param} is not a file or a number, try match by ${service_name}"
        pid="none"
    fi

    # number of times to try a SIGTERM is (timeout - 5 seconds) / 2 seconds
    local stop_count=$(( ($timeout-5)/2 ))

    # make sure we stop at least once
    if [ $stop_count -le 0 ]; then
        stop_count=1
    fi

    if [ -z "${pid}" ] ; then
        ocf_log warn "${LH} unable to get PID from ${pidfile}, try match by ${service_name}"
        pid="none"
    fi

    for i in ${pid} ; do
        ocf_log info "${LH} Stopping ${service_name} by PID ${i}"
        if ! proc_kill "${i}" "${service_name}" SIGTERM $stop_count ; then
            # SIGTERM failed, send a single SIGKILL
            if ! proc_kill "${i}" "${service_name}" SIGKILL 1 2 ; then
                ocf_log err "${LH} ERROR: could not stop ${service_name}"
                return "${OCF_ERR_GENERIC}"
            fi
        fi
    done

    # Remove the pid file here which will remove empty pid files as well
    if [ -n "${pidfile}" ]; then
        rm -f "${pidfile}"
    fi

    ocf_log info "${LH} Stopped ${service_name}"
    return "${OCF_SUCCESS}"
}

# Invokes the given command as a rabbitmq user and wrapped in the
# timeout command.
#
# Arguments:
#   -t <duration> - optional, overrides the default COMMAND_TIMEOUT duration
#   $1            - command line to run, defaults to "status"
# Returns:
#   the exit code of the su invocation
su_rabbit_cmd() {
    local timeout
    if [ "$1" = "-t" ]; then
        timeout="/usr/bin/timeout ${OCF_RESKEY_command_timeout} $2"
        shift 2
    else
        timeout=$COMMAND_TIMEOUT
    fi
    local cmd="${1:-status}"
    local LH="${LL} su_rabbit_cmd():"
    local rc=1
    local user=$OCF_RESKEY_username
    local mail=/var/spool/mail/rabbitmq
    local pwd=/var/lib/rabbitmq
    local home=/var/lib/rabbitmq

    ocf_log debug "${LH} invoking a command: ${cmd}"
    su $user -s /bin/sh -c "USER=${user} MAIL=${mail} PWD=${pwd} HOME=${home} LOGNAME=${user} \
      ${timeout} ${cmd}"
    rc=$?
    ocf_log info "${LH} the invoked command exited ${rc}: ${cmd}"
    return $rc
}

# Print the current time as seconds since the epoch (UTC).
now() {
    date -u +%s
}

# Raise the open files limit of this shell to OCF_RESKEY_limit_nofile when
# that value is set and greater than the limit seen by the rabbitmq user.
set_limits() {
    local current_limit
    current_limit=$(su $OCF_RESKEY_username -s /bin/sh -c "ulimit -n")

    # nothing requested
    [ -n "${OCF_RESKEY_limit_nofile}" ] || return 0
    # "unlimited" cannot be exceeded
    [ "${current_limit}" = "unlimited" ] && return 0

    # operands are quoted and errors silenced, so an empty or non-numeric
    # limit simply results in "do nothing" instead of a shell syntax error
    if [ "${OCF_RESKEY_limit_nofile}" -gt "${current_limit}" ] 2>/dev/null ; then
        ulimit -n "${OCF_RESKEY_limit_nofile}"
    fi
}

# Update the master score attribute of this node.
#
# Arguments:
#   $1 - the new score, defaults to 0
# Returns:
#   OCF_SUCCESS or OCF_ERR_GENERIC, if crm_master failed
master_score() {
    local LH="${LL} master_score():"
    local score="${1:-0}"

    ocf_log info "${LH} Updating master score attribute with ${score}"
    ocf_run crm_master -N $THIS_PCMK_NODE -l reboot -v $score || return $OCF_ERR_GENERIC
    return $OCF_SUCCESS
}

# Return either FQDN or shortname, depends on the OCF_RESKEY_use_fqdn.
get_hostname() {
    local os=$(uname -s)
    if ! ocf_is_true "${OCF_RESKEY_use_fqdn}"; then
        # SunOS hostname has no -s/-f switches
        if [ "$os" = "SunOS" ]; then
            echo "$(hostname | sed 's@\..*@@')"
        else
            echo "$(hostname -s)"
        fi
    else
        if [ "$os" = "SunOS" ]; then
            echo "$(hostname)"
        else
            echo "$(hostname -f)"
        fi
    fi
}

# Strip the FQDN to the shortname, if OCF_RESKEY_use_fqdn was set;
# Prepend prefix to the hostname
process_fqdn() {
    if ! ocf_is_true "${OCF_RESKEY_use_fqdn}"; then
        echo "${OCF_RESKEY_fqdn_prefix}$1" | awk -F. '{print $1}'
    else
        echo "${OCF_RESKEY_fqdn_prefix}$1"
    fi
}

# Return OCF_SUCCESS, if current host is in the list of given hosts.
# Otherwise, return 10
#
# Arguments:
#   $1 - whitespace separated list of host names to compare with this host
my_host() {
    local hostlist="$1"
    local hostname
    local hn
    local host
    local rc=10
    local LH="${LL} my_host():"

    hostname=$(process_fqdn $(get_hostname))
    ocf_log debug "${LH} hostlist is: $hostlist"
    for host in $hostlist ; do
        hn=$(process_fqdn "${host}")
        ocf_log debug "${LH} comparing '$hostname' with '$hn'"
        if [ "${hostname}" = "${hn}" ] ; then
            rc=$OCF_SUCCESS
            break
        fi
    done

    return $rc
}

# Print the value of a node attribute with reboot lifetime, or 0 when the
# attribute is missing, "(null)" or cannot be queried.
#
# Arguments:
#   $1 - node name
#   $2 - attribute name
get_integer_node_attr() {
    local value
    value=$(crm_attribute -N $1 -l reboot --name "$2" --query 2>/dev/null | awk '{ split($3, vals, "="); if (vals[2] != "(null)") print vals[2] }')
    if [ $? -ne 0 ] || [ -z "$value" ] ; then
        value=0
    fi
    echo $value
}

# Print the 'rabbit-start-time' attribute of the node $1 (0 if unset).
get_node_start_time() {
    get_integer_node_attr $1 'rabbit-start-time'
}

# Print the master score attribute of the node $1 (0 if unset).
get_node_master_score() {
    get_integer_node_attr $1 "master-${RESOURCE_NAME}"
}

# Return either rabbit node name as FQDN or shortname, depends on the OCF_RESKEY_use_fqdn.
rabbit_node_name() {
    echo "rabbit@$(process_fqdn $(ocf_attribute_target $1))"
}

# Prepare the environment for all other actions: export the RABBITMQ_*
# variables, set the MNESIA_FILES, RMQ_START_TIME, MASTER_FLAG_FILE,
# THIS_PCMK_NODE, TOTALVMEM and LL globals, make sure the PID, mnesia and
# log directories are writable by the rabbitmq user and update the cookie.
rmq_setup_env() {
    local H
    local dir
    local name
    local files
    H="$(get_hostname)"
    export RABBITMQ_NODENAME=$(rabbit_node_name $H)
    if [ "$OCF_RESKEY_node_port" != "$OCF_RESKEY_node_port_default" ]; then
        export RABBITMQ_NODE_PORT=$OCF_RESKEY_node_port
    fi
    export RABBITMQ_PID_FILE=$OCF_RESKEY_pid_file
    MNESIA_FILES="${OCF_RESKEY_mnesia_base}/$(rabbit_node_name $H)"
    if ! ocf_is_true "${OCF_RESKEY_use_fqdn}"; then
        name="-sname"
    else
        name="-name"
    fi
    export RABBITMQ_SERVER_START_ARGS="${RABBITMQ_SERVER_START_ARGS} -mnesia dir \"${MNESIA_FILES}\" ${name} $(rabbit_node_name $H)"
    RMQ_START_TIME="${MNESIA_FILES}/ocf_server_start_time.txt"
    MASTER_FLAG_FILE="${MNESIA_FILES}/ocf_master_for_${OCF_RESOURCE_INSTANCE}"
    THIS_PCMK_NODE=$(ocf_attribute_target)
    TOTALVMEM=`free -mt | awk '/Total:/ {print $2}'`
    # check and make PID file dir
    local PID_DIR=$( dirname $OCF_RESKEY_pid_file )
    if [ ! -d ${PID_DIR} ] ; then
        mkdir -p ${PID_DIR}
        chown -R ${OCF_RESKEY_username}:${OCF_RESKEY_groupname} ${PID_DIR}
        chmod 755 ${PID_DIR}
    fi

    # Regardless of whether we just created the directory or it
    # already existed, check whether it is writable by the configured
    # user
    for dir in ${PID_DIR} "${OCF_RESKEY_mnesia_base}" "${OCF_RESKEY_log_dir}"; do
        if test -e ${dir}; then
            files=$(su -s /bin/sh - $OCF_RESKEY_username -c "find ${dir} ! -writable")
            if [ "${files}" ]; then
                ocf_log warn "Directory ${dir} is not writable by ${OCF_RESKEY_username}, chowning."
                chown -R ${OCF_RESKEY_username}:${OCF_RESKEY_groupname} "${dir}"
            fi
        fi
    done

    export LL="${OCF_RESOURCE_INSTANCE}[$$]:"
    update_cookie
}

# Return a RabbitMQ node to its virgin state.
# For reset and force_reset to succeed the RabbitMQ application must have been stopped.
# If the app cannot be stopped, beam will be killed and mnesia files will be removed.
# Always returns OCF_SUCCESS.
reset_mnesia() {
    local LH="${LL} reset_mnesia():"
    local make_amnesia=false
    local rc=$OCF_ERR_GENERIC

    # check status of a beam process
    get_status
    rc=$?
    if [ $rc -eq 0 ] ; then
        # beam is running
        # check status of rabbit app and stop it, if it is running
        get_status rabbit
        rc=$?
        if [ $rc -eq 0 ] ; then
            # rabbit app is running, have to stop it
            ocf_log info "${LH} Stopping RMQ-app prior to reset the mnesia."
            stop_rmq_server_app
            rc=$?
            if [ $rc -ne 0 ] ; then
                ocf_log warn "${LH} RMQ-app can't be stopped."
                make_amnesia=true
            fi
        fi

        if ! $make_amnesia ; then
            # rabbit app is not running, reset mnesia
            ocf_log info "${LH} Execute reset with timeout: ${TIMEOUT_ARG}"
            su_rabbit_cmd "${OCF_RESKEY_ctl} reset"
            rc=$?
            if [ $rc -ne 0 ] ; then
                ocf_log info "${LH} Execute force_reset with timeout: ${TIMEOUT_ARG}"
                su_rabbit_cmd "${OCF_RESKEY_ctl} force_reset"
                rc=$?
                if [ $rc -ne 0 ] ; then
                    ocf_log warn "${LH} Mnesia couldn't cleaned, even by force-reset command."
                    make_amnesia=true
                fi
            fi
        fi
    else
        # there is no beam running
        make_amnesia=true
        ocf_log warn "${LH} There is no Beam process running."
    fi

    # remove mnesia files, if required
    if $make_amnesia ; then
        kill_rmq_and_remove_pid
        ocf_run rm -rf "${MNESIA_FILES}"
        mnesia_schema_location="${OCF_RESKEY_mnesia_schema_base}/Mnesia.$(rabbit_node_name $(get_hostname))"
        ocf_run rm -rf "$mnesia_schema_location"
        ocf_log warn "${LH} Mnesia files appear corrupted and have been removed from ${MNESIA_FILES} and $mnesia_schema_location"
    fi
    # always return OCF SUCCESS
    return $OCF_SUCCESS
}


# Insert a temporary iptables rule rejecting the client connections to the
# RabbitMQ port.
# Returns OCF_SUCCESS when the rule is in place (or iptables usage is
# disabled), OCF_ERR_GENERIC otherwise.
block_client_access()
{
    # When OCF_RESKEY_avoid_using_iptables is true iptables calls are noops
    if ocf_is_true "${OCF_RESKEY_avoid_using_iptables}"; then
        return $OCF_SUCCESS
    fi
    # do not add temporary RMQ blocking rule, if it is already exist
    # otherwise, try to add a blocking rule with max of 5 retries
    local tries=5
    until iptables -nvL --wait | grep -q 'temporary RMQ block' || [ $tries -eq 0 ]; do
        tries=$((tries-1))
        iptables --wait -I INPUT -p tcp -m tcp --dport ${OCF_RESKEY_node_port} -m state --state NEW,RELATED,ESTABLISHED \
        -m comment --comment 'temporary RMQ block' -j REJECT --reject-with tcp-reset
        sleep 1
    done
    # Judge by the presence of the rule and not by the retries counter:
    # the counter also reaches zero when the very last attempt succeeded.
    if iptables -nvL --wait | grep -q 'temporary RMQ block'; then
        return $OCF_SUCCESS
    else
        return $OCF_ERR_GENERIC
    fi
}

# Remove every temporary iptables rule added by block_client_access.
#
# Arguments:
#   $1 - optional log prefix of the caller, defaults to "none"
unblock_client_access()
{
    local lhtext="none"
    # use the prefix given by the caller, when there is one
    if [ -n "$1" ] ; then
        lhtext=$1
    fi
    # When OCF_RESKEY_avoid_using_iptables is true iptables calls are noops
    if ocf_is_true "${OCF_RESKEY_avoid_using_iptables}"; then
        return
    fi
    # remove all temporary RMQ blocking rules, if there are more than one exist
    for i in $(iptables -nvL --wait --line-numbers | awk '/temporary RMQ block/ {print $1}'); do
        iptables --wait -D INPUT -p tcp -m tcp --dport ${OCF_RESKEY_node_port} -m state --state NEW,RELATED,ESTABLISHED \
        -m comment --comment 'temporary RMQ block' -j REJECT --reject-with tcp-reset
    done
    ocf_log info "${lhtext} unblocked access to RMQ port"
}

# Print the space separated list of mnesia nodes known to the local node.
#
# Arguments:
#   $1 - 'nodes' for all db nodes, 'running' for the running db nodes
# Returns:
#   OCF_SUCCESS or OCF_ERR_GENERIC, if the rabbitmqctl eval failed
get_nodes__base(){
    local infotype=''
    local rc=$OCF_ERR_GENERIC
    local c_status

    if [ "$1" = 'nodes' ]
    then
        infotype='db_nodes'
    elif [ "$1" = 'running' ]
    then
        infotype='running_db_nodes'
    fi
    c_status=`${OCF_RESKEY_ctl} eval "mnesia:system_info(${infotype})." 2>/dev/null`
    rc=$?
    if [ $rc -ne 0 ] ; then
        echo ''
        return $OCF_ERR_GENERIC
    fi
    # translate line like '{running_nodes,['rabbit@node-1','rabbit@node-2','rabbit@node-3']},' to node_list
    echo $(echo "${c_status}" | awk -F, '{ for (i=1;i<=NF;i++) { if ($i ~ /@/) { gsub(/[\[\]}{]/,"",$i); print $i; } }}' | tr -d  "\'")
    return $OCF_SUCCESS
}

# Print all db nodes; propagates the return code of get_nodes__base.
get_nodes() {
    local nodes
    local rc
    nodes=$(get_nodes__base nodes)
    # capture the code before echo overwrites $?
    rc=$?
    echo $nodes
    return $rc
}

# Print the running db nodes; propagates the return code of get_nodes__base.
get_running_nodes() {
    local nodes
    local rc
    nodes=$(get_nodes__base running)
    # capture the code before echo overwrites $?
    rc=$?
    echo $nodes
    return $rc
}

# Get alive cluster nodes in visible partition, but the specified one
#
# Arguments:
#   $1 - optional name of the node to exclude from the list
get_alive_pacemaker_nodes_but()
{
    if [ -z "$1" ]; then
        tmp_pcmk_node_list=`crm_node -l -p | sed -e '/(null)/d'`
    else
        tmp_pcmk_node_list=`crm_node -l -p | sed -e "s/${1}//g" | sed -e '/(null)/d'`
    fi
    # If OCF_RESKEY_allowed_cluster_nodes is set then we only want the intersection
    # of the cluster node output and the allowed_cluster_nodes list
    if [ -z "${OCF_RESKEY_allowed_cluster_nodes}" ]; then
        pcmk_node_list=$tmp_pcmk_node_list
    else
        pcmk_node_list=`for i in $tmp_pcmk_node_list ${OCF_RESKEY_allowed_cluster_nodes}; do echo $i; done | sort | uniq -d`
    fi
    echo $pcmk_node_list
}

# Get current master.
If a parameter is provided, +# do not check node with that name +get_master_name_but() +{ + local node + for node in $(get_alive_pacemaker_nodes_but "$@") + do + ocf_log info "${LH} looking if $node is master" + + if is_master $node; then + ocf_log info "${LH} master is $node" + echo $node + break + fi + done +} + +# Evals some erlang code on current node +erl_eval() { + local fmt="${1:?}" + shift + + $COMMAND_TIMEOUT ${OCF_RESKEY_ctl} eval "$(printf "$fmt" "$@")" 2>/dev/null +} + +# Returns 0 if we are clustered with provideded node +is_clustered_with() +{ + local LH="${LH}: is_clustered_with: " + local node_name + local rc + node_name=$(rabbit_node_name $1) + + local seen_as_running + seen_as_running=$(erl_eval "lists:member('%s', rabbit_mnesia:cluster_nodes(running))." "$node_name") + rc=$? + if [ "$rc" -ne 0 ]; then + ocf_log err "${LH} Failed to check whether '$node_name' is considered running by us" + # We had a transient local error; that doesn't mean the remote node is + # not part of the cluster, so ignore this + elif [ "$seen_as_running" != true ]; then + ocf_log info "${LH} Node $node_name is not running, considering it not clustered with us" + return 1 + fi + + local seen_as_partitioned + seen_as_partitioned=$(erl_eval "lists:member('%s', rabbit_node_monitor:partitions())." "$node_name") + rc=$? + if [ "$rc" -ne 0 ]; then + ocf_log err "${LH} Failed to check whether '$node_name' is partitioned with us" + # We had a transient local error; that doesn't mean the remote node is + # partitioned with us, so ignore this + elif [ "$seen_as_partitioned" != false ]; then + ocf_log info "${LH} Node $node_name is partitioned from us" + return 1 + fi + + return $? 
+} + + +check_need_join_to() { + local join_to + local node + local running_nodes + local rc=$OCF_ERR_GENERIC + + rc=0 + join_to=$(rabbit_node_name $1) + running_nodes=$(get_running_nodes) + for node in $running_nodes ; do + if [ "${join_to}" = "${node}" ] ; then + rc=1 + break + fi + done + + return $rc +} + +# Update erlang cookie, if it has been specified +update_cookie() { + local cookie_file_content + if [ "${OCF_RESKEY_erlang_cookie}" != 'false' ] ; then + if [ -f "${OCF_RESKEY_erlang_cookie_file}" ]; then + # First line of cookie file without newline + cookie_file_content=$(head -n1 "${OCF_RESKEY_erlang_cookie_file}" | perl -pe chomp) + fi + # As there is a brief period of time when the file is empty + # (shell redirection has already opened and truncated file, + # and echo hasn't finished its job), we are doing this write + # only when cookie has changed. + if [ "${OCF_RESKEY_erlang_cookie}" != "${cookie_file_content}" ]; then + echo "${OCF_RESKEY_erlang_cookie}" > "${OCF_RESKEY_erlang_cookie_file}" + fi + # And this are idempotent operations, so we don't have to + # check any preconditions for running them. + chown ${OCF_RESKEY_username}:${OCF_RESKEY_groupname} "${OCF_RESKEY_erlang_cookie_file}" + chmod 600 "${OCF_RESKEY_erlang_cookie_file}" + fi + return $OCF_SUCCESS +} + +# Stop rmq beam process by pid and by rabbit node name match. Returns SUCCESS/ERROR +kill_rmq_and_remove_pid() { + local LH="${LL} kill_rmq_and_remove_pid():" + # Stop the rabbitmq-server by its pidfile, use the name matching as a fallback, + # and ignore the exit code + proc_stop "${OCF_RESKEY_pid_file}" "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}" + # Ensure the beam.smp stopped by the rabbit node name matching as well + proc_stop none "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}" + if [ $? 
-eq 0 ] ; then + return $OCF_SUCCESS + else + return $OCF_ERR_GENERIC + fi +} + +trim_var(){ + local string="$*" + echo ${string%% } +} + +action_validate() { + # todo(sv): validate some incoming parameters + OCF_RESKEY_CRM_meta_notify_post=$(trim_var $OCF_RESKEY_CRM_meta_notify_post) + OCF_RESKEY_CRM_meta_notify_pre=$(trim_var $OCF_RESKEY_CRM_meta_notify_pre) + OCF_RESKEY_CRM_meta_notify_start=$(trim_var $OCF_RESKEY_CRM_meta_notify_start) + OCF_RESKEY_CRM_meta_notify_stop=$(trim_var $OCF_RESKEY_CRM_meta_notify_stop) + OCF_RESKEY_CRM_meta_notify_start_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_start_resource) + OCF_RESKEY_CRM_meta_notify_stop_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_stop_resource) + OCF_RESKEY_CRM_meta_notify_active_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_active_resource) + OCF_RESKEY_CRM_meta_notify_inactive_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_inactive_resource) + OCF_RESKEY_CRM_meta_notify_start_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_start_uname) + OCF_RESKEY_CRM_meta_notify_stop_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_stop_uname) + OCF_RESKEY_CRM_meta_notify_active_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_active_uname) + OCF_RESKEY_CRM_meta_notify_master_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_master_resource) + OCF_RESKEY_CRM_meta_notify_master_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_master_uname) + OCF_RESKEY_CRM_meta_notify_demote_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_demote_resource) + OCF_RESKEY_CRM_meta_notify_demote_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_demote_uname) + OCF_RESKEY_CRM_meta_notify_slave_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_slave_resource) + OCF_RESKEY_CRM_meta_notify_slave_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_slave_uname) + OCF_RESKEY_CRM_meta_notify_promote_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_promote_resource) + OCF_RESKEY_CRM_meta_notify_promote_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_promote_uname) + return 
$OCF_SUCCESS +} + +update_rabbit_start_time_if_rc() { + local nowtime + local rc=$1 + if [ $rc -eq 0 ]; then + nowtime="$(now)" + ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with ${nowtime}" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}" + fi +} + +join_to_cluster() { + local node="$1" + local rmq_node + local rc=$OCF_ERR_GENERIC + local LH="${LL} join_to_cluster():" + + ocf_log info "${LH} start." + + rmq_node=$(rabbit_node_name $node) + ocf_log info "${LH} Joining to cluster by node '${rmq_node}'." + get_status rabbit + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log info "${LH} rabbitmq app will be stopped." + stop_rmq_server_app + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} Can't stop rabbitmq app by stop_app command. Stopping." + action_stop + return $OCF_ERR_GENERIC + fi + fi + ocf_log info "${LH} Execute join_cluster with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} join_cluster $rmq_node" + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} Can't join to cluster by node '${rmq_node}'. Stopping." + action_stop + return $OCF_ERR_GENERIC + fi + sleep 2 + try_to_start_rmq_app + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} Can't start RMQ app after join to cluster. Stopping." + action_stop + return $OCF_ERR_GENERIC + else + update_rabbit_start_time_if_rc 0 + ocf_log info "${LH} Joined to cluster succesfully." + fi + + ocf_log info "${LH} end." + return $rc +} + +unjoin_nodes_from_cluster() { + # node names of the nodes where the pcs resource is being stopped + local nodelist="$1" + local hostname + local nodename + local rc=$OCF_ERR_GENERIC + local rnode + # nodes in rabbit cluster db + local nodes_in_cluster + local LH="${LL} unjoin_nodes_from_cluster():" + + nodes_in_cluster=$(get_nodes) + rc=$? 
+ if [ $rc -ne 0 ] ; then + # no nodes in node list, nothing to do + return $OCF_SUCCESS + fi + + # unjoin all cluster nodes which are being stopped (i.e. recieved post-stop notify), except *this* node + # before to unjoin the nodes, make sure they were disconnected from *this* node + for hostname in $nodelist ; do + nodename=$(rabbit_node_name $hostname) + if [ "${nodename}" = "${RABBITMQ_NODENAME}" ] ; then + continue + fi + for rnode in $nodes_in_cluster ; do + if [ "${nodename}" = "${rnode}" ] ; then + # disconnect node being unjoined from this node + ocf_run ${OCF_RESKEY_ctl} eval "disconnect_node(list_to_atom(\"${nodename}\"))." 2>&1 + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log info "${LH} node '${nodename}' disconnected succesfully." + else + ocf_log info "${LH} disconnecting node '${nodename}' failed." + fi + + # unjoin node + # when the rabbit node went down, its status + # remains 'running' for a while, so few retries are required + local tries=0 + until [ $tries -eq 5 ]; do + tries=$((tries+1)) + if is_clustered_with $nodename; then + ocf_log info "${LH} the ${nodename} is alive and cannot be kicked from the cluster yet" + else + break + fi + sleep 10 + done + ocf_log info "${LH} Execute forget_cluster_node with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} forget_cluster_node ${nodename}" + rc=$? + if [ $rc -eq 0 ] ; then + ocf_log info "${LH} node '${nodename}' unjoined succesfully." + else + ocf_log warn "${LH} unjoining node '${nodename}' failed." + fi + fi + done + done + return $OCF_SUCCESS +} + +# Stop RMQ beam server process. Returns SUCCESS/ERROR +stop_server_process() { + local pid + local rc=$OCF_ERR_GENERIC + local LH="${LL} stop_server_process():" + + pid=$(cat ${OCF_RESKEY_pid_file}) + rc=$? + if [ $rc -ne 0 ] ; then + # Try to stop without known PID + ocf_log err "${LH} RMQ-server process PIDFILE was not found!" + su_rabbit_cmd "${OCF_RESKEY_ctl} stop >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1" + if [ $? 
-eq 0 ] ; then + ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found." + ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam" + sleep "${OCF_RESKEY_stop_time}" + else + kill_rmq_and_remove_pid + fi + elif [ "${pid}" ] ; then + # Try to stop gracefully by known PID + ocf_log info "${LH} Execute stop with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} stop ${OCF_RESKEY_pid_file} >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1" + [ $? -eq 0 ] && ocf_log info "${LH} RMQ-server process (PID=${pid}) stopped succesfully." + fi + + # Ensure there is no beam process and pidfile left + pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null + rc=$? + if [ -f ${OCF_RESKEY_pid_file} ] || [ $rc -eq 0 ] ; then + ocf_log warn "${LH} The pidfile or beam's still exist, forcing the RMQ-server cleanup" + kill_rmq_and_remove_pid + return $? + else + return $OCF_SUCCESS + fi +} + +# Stop RMQ-app. Return OCF_SUCCESS, if the app was stopped, +# otherwise return OCF_ERR_GENERIC +stop_rmq_server_app() { + local rc=$OCF_ERR_GENERIC + + # if the beam process isn't running, then rabbit app is stopped as well + get_status + rc=$? + if [ $rc -ne 0 ] ; then + return $OCF_SUCCESS + fi + + # stop the app + ocf_log info "${LH} Execute stop_app with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} stop_app >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1" + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} RMQ-server app cannot be stopped." + return $OCF_ERR_GENERIC + fi + + get_status rabbit + rc=$? + if [ $rc -ne $OCF_SUCCESS ] ; then + ocf_log info "${LH} RMQ-server app stopped succesfully." + rc=$OCF_SUCCESS + else + ocf_log err "${LH} RMQ-server app cannot be stopped." 
+ rc=$OCF_ERR_GENERIC + fi + + return $rc +} + +start_beam_process() { + local command + local rc=$OCF_ERR_GENERIC + local ts_end + local pf_end + local pid + local LH="${LL} start_beam_process():" + + # remove old PID-file if it exists + if [ -f "${OCF_RESKEY_pid_file}" ] ; then + ocf_log warn "${LH} found old PID-file '${OCF_RESKEY_pid_file}'." + pid=$(cat ${OCF_RESKEY_pid_file}) + if [ "${pid}" ] && [ -d "/proc/${pid}" ] ; then + ocf_run cat /proc/${pid}/cmdline | grep -c 'bin/beam' > /dev/null 2>&1 + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log warn "${LH} found beam process with PID=${pid}, killing...'." + ocf_run kill -TERM $pid + else + ocf_log err "${LH} found unknown process with PID=${pid} from '${OCF_RESKEY_pid_file}'." + return $OCF_ERR_GENERIC + fi + fi + ocf_run rm -f $OCF_RESKEY_pid_file + fi + + [ -f /etc/default/rabbitmq-server ] && . /etc/default/rabbitmq-server + + # RabbitMQ requires high soft and hard limits for NOFILE + set_limits + + # run beam process + command="${OCF_RESKEY_binary} >> \"${OCF_RESKEY_log_dir}/startup_log\" 2>/dev/null" + RABBITMQ_NODE_ONLY=1 su rabbitmq -s /bin/sh -c "${command}"& + ts_end=$(( $(now) + ${OCF_RESKEY_start_time} )) + sleep 3 # give it some time, before attempting to start_app + # PID-file is now created later, if the application started successfully + # So assume beam.smp is started, and defer errors handling for start_app + return $OCF_SUCCESS +} + +check_plugins() { + # Check if it's safe to load plugins and if we need to do so. Logic is: + # if (EnabledPlugins > 0) and (ActivePlugins == 0) ; then it's safe to load + # If we have at least one active plugin, then it's not safe to re-load them + # because plugins:setup() would remove existing dependency plugins in plugins_expand_dir. 
+ ${OCF_RESKEY_ctl} eval '{ok, EnabledFile} = application:get_env(rabbit, enabled_plugins_file), EnabledPlugins = rabbit_plugins:read_enabled(EnabledFile), ActivePlugins = rabbit_plugins:active(), if length(EnabledPlugins)>0 -> if length(ActivePlugins)==0 -> erlang:error("need_to_load_plugins"); true -> false end; true -> false end.' + return $? +} + +load_plugins() { + check_plugins + local rc=$? + if [ $rc -eq 0 ] ; then + return 0 + else + ${OCF_RESKEY_ctl} eval 'ToBeLoaded = rabbit_plugins:setup(), ok = app_utils:load_applications(ToBeLoaded), StartupApps = app_utils:app_dependency_order(ToBeLoaded,false), app_utils:start_applications(StartupApps).' + return $? + fi +} + +list_active_plugins() { + local list + list=`${OCF_RESKEY_ctl} eval 'rabbit_plugins:active().' 2>/dev/null` + echo "${list}" +} + +try_to_start_rmq_app() { + local startup_log="${1:-${OCF_RESKEY_log_dir}/startup_log}" + local rc=$OCF_ERR_GENERIC + local LH="${LL} try_to_start_rmq_app():" + + get_status + rc=$? + if [ $rc -ne $OCF_SUCCESS ] ; then + ocf_log info "${LH} RMQ-runtime (beam) not started, starting..." + start_beam_process + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + ocf_log err "${LH} Failed to start beam - returning from the function" + return $OCF_ERR_GENERIC + fi + fi + + + if [ -z "${startup_log}" ] ; then + startup_log="${OCF_RESKEY_log_dir}/startup_log" + fi + + ocf_log info "${LH} begin." + ocf_log info "${LH} Execute start_app with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} start_app >>${startup_log} 2>&1" + rc=$? + if [ $rc -eq 0 ] ; then + ocf_log info "${LH} start_app was successful." + ocf_log info "${LH} waiting for start to finish with timeout: ${TIMEOUT_ARG}" + su_rabbit_cmd "${OCF_RESKEY_ctl} wait ${OCF_RESKEY_pid_file}" + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} RMQ-server app failed to wait for start." + return $OCF_ERR_GENERIC + fi + rc=$OCF_SUCCESS + # Loading enabled modules + ocf_log info "${LH} start plugins." 
+ load_plugins + local mrc=$? + if [ $mrc -eq 0 ] ; then + local mlist + mlist=`list_active_plugins` + ocf_log info "${LH} Starting plugins: ${mlist}" + else + ocf_log info "${LH} Starting plugins: failed." + fi + else + ocf_log info "${LH} start_app failed." + rc=$OCF_ERR_GENERIC + fi + return $rc +} + +start_rmq_server_app() { + local rc=$OCF_ERR_GENERIC + local startup_log="${OCF_RESKEY_log_dir}/startup_log" + local startup_output + local LH="${LL} start_rmq_server_app():" + local a + + #We are performing initial start check. + #We are not ready to provide service. + #Clients should not have access. + + + ocf_log info "${LH} begin." + # Safe-unblock the rules, if there are any + unblock_client_access "${LH}" + # Apply the blocking rule + block_client_access + rc=$? + if [ $rc -eq $OCF_SUCCESS ]; then + ocf_log info "${LH} blocked access to RMQ port" + else + ocf_log err "${LH} cannot block access to RMQ port!" + return $OCF_ERR_GENERIC + fi + get_status + rc=$? + if [ $rc -ne $OCF_SUCCESS ] ; then + ocf_log info "${LH} RMQ-runtime (beam) not started, starting..." + start_beam_process + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + unblock_client_access "${LH}" + return $OCF_ERR_GENERIC + fi + fi + + ocf_log info "${LH} RMQ-server app not started, starting..." + try_to_start_rmq_app "$startup_log" + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + # rabbitmq-server started successfuly as master of cluster + master_score $MIN_MASTER_SCORE + stop_rmq_server_app + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} RMQ-server app can't be stopped. Beam will be killed." + kill_rmq_and_remove_pid + unblock_client_access "${LH}" + return $OCF_ERR_GENERIC + fi + else + # error at start RMQ-server + ocf_log warn "${LH} RMQ-server app can't start without Mnesia cleaning." + for a in $(seq 1 10) ; do + rc=$OCF_ERR_GENERIC + reset_mnesia || break + try_to_start_rmq_app "$startup_log" + rc=$? + if [ $rc -eq $OCF_SUCCESS ]; then + stop_rmq_server_app + rc=$? 
+ if [ $rc -eq $OCF_SUCCESS ]; then + ocf_log info "${LH} RMQ-server app Mnesia cleaned successfully." + rc=$OCF_SUCCESS + master_score $MIN_MASTER_SCORE + break + else + ocf_log err "${LH} RMQ-server app can't be stopped during Mnesia cleaning. Beam will be killed." + kill_rmq_and_remove_pid + unblock_client_access "${LH}" + return $OCF_ERR_GENERIC + fi + fi + done + fi + if [ $rc -eq $OCF_ERR_GENERIC ] ; then + ocf_log err "${LH} RMQ-server can't be started while many tries. Beam will be killed." + kill_rmq_and_remove_pid + fi + ocf_log info "${LH} end." + unblock_client_access "${LH}" + return $rc +} + +# check status of rabbit beam process or a rabbit app, if rabbit arg specified +# by default, test if the kernel app is running, otherwise consider it is "not running" +get_status() { + local what="${1:-kernel}" + local rc=$OCF_NOT_RUNNING + local LH="${LL} get_status():" + local body + local beam_running + + body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 ) + rc=$? + + pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null + beam_running=$? + # report not running only if the which_applications() reported an error AND the beam is not running + if [ $rc -ne 0 ] && [ $beam_running -ne 0 ] ; then + ocf_log info "${LH} failed with code ${rc}. Command output: ${body}" + return $OCF_NOT_RUNNING + # return a generic error, if there were errors and beam is found running + elif [ $rc -ne 0 ] ; then + ocf_log info "${LH} found the beam process running but failed with code ${rc}. 
Command output: ${body}" + return $OCF_ERR_GENERIC + fi + + # try to parse the which_applications() output only if it exited w/o errors + if [ "${what}" ] && [ $rc -eq 0 ] ; then + rc=$OCF_NOT_RUNNING + echo "$body" | grep "\{${what}," > /dev/null 2>&1 && rc=$OCF_SUCCESS + + if [ $rc -ne $OCF_SUCCESS ] ; then + ocf_log info "${LH} app ${what} was not found in command output: ${body}" + fi + fi + + [ $rc -ne $OCF_SUCCESS ] && rc=$OCF_NOT_RUNNING + return $rc +} + +action_status() { + local rc=$OCF_ERR_GENERIC + + get_status + rc=$? + return $rc +} + +# return 0, if given node has a master attribute in CIB, +# otherwise, return 1 +is_master() { + local result + result=`crm_attribute -N "${1}" -l reboot --name 'rabbit-master' --query 2>/dev/null |\ + awk '{print $3}' | awk -F "=" '{print $2}' | sed -e '/(null)/d'` + if [ "${result}" != 'true' ] ; then + return 1 + fi + return 0 +} + +# Verify if su_rabbit_cmd exited by timeout by checking its return code. +# If it did not, return 0. If it did AND it is +# $OCF_RESKEY_max_rabbitmqctl_timeouts'th timeout in a row, +# return 2 to signal get_monitor that it should +# exit with error. Otherwise return 1 to signal that there was a timeout, +# but it should be ignored. Timeouts for different operations are tracked +# separately. The second argument is used to distingush them. +check_timeouts() { + local op_rc=$1 + local timeouts_attr_name=$2 + local op_name=$3 + + # 75 is EX_TEMPFAIL from sysexits, and is used by rabbitmqctl to signal about + # timeout. + if [ $op_rc -ne 124 ] && [ $op_rc -ne 137 ] && [ $op_rc -ne 75 ]; then + ocf_update_private_attr $timeouts_attr_name 0 + return 0 + fi + + local count + count=$(ocf_get_private_attr $timeouts_attr_name 0) + + count=$((count+1)) + # There is a slight chance that this piece of code will be executed twice simultaneously. + # As a result, $timeouts_attr_name's value will be one less than it should be. But we don't need + # precise calculation here. 
+ ocf_update_private_attr $timeouts_attr_name $count + + if [ $count -lt $OCF_RESKEY_max_rabbitmqctl_timeouts ]; then + ocf_log warn "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row. Doing nothing for now." + return 1 + else + ocf_log err "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row and is not responding. The resource is failed." + return 2 + fi +} + +wait_sync() { + local wait_time=$1 + local queues + local opt_arg="" + + if ocf_is_true "$OCF_RESKEY_rmq_feature_local_list_queues"; then + opt_arg="--local" + fi + + queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} -p ${OCF_RESKEY_default_vhost} list_queues $opt_arg name state" + + su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \ + do sleep 2; done\"" + + return $? +} + +get_monitor() { + local rc=$OCF_ERR_GENERIC + local LH="${LL} get_monitor():" + local status_master=1 + local rabbit_running + local name + local node + local node_start_time + local nowtime + local partitions_report + local node_partitions + + ocf_log info "${LH} CHECK LEVEL IS: ${OCF_CHECK_LEVEL}" + get_status + rc=$? + if [ $rc -eq $OCF_NOT_RUNNING ] ; then + ocf_log info "${LH} get_status() returns ${rc}." + ocf_log info "${LH} ensuring this slave does not get promoted." + master_score 0 + return $OCF_NOT_RUNNING + elif [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log info "${LH} get_status() returns ${rc}." + ocf_log info "${LH} also checking if we are master." + get_status rabbit + rabbit_running=$? + is_master $THIS_PCMK_NODE + status_master=$? 
+ ocf_log info "${LH} master attribute is ${status_master}" + if [ $status_master -eq 0 ] && [ $rabbit_running -eq $OCF_SUCCESS ] + then + ocf_log info "${LH} We are the running master" + rc=$OCF_RUNNING_MASTER + elif [ $status_master -eq 0 ] && [ $rabbit_running -ne $OCF_SUCCESS ] ; then + ocf_log err "${LH} We are the master and RMQ-runtime (beam) is not running. this is a failure" + exit $OCF_FAILED_MASTER + fi + fi + get_status rabbit + rabbit_running=$? + ocf_log info "${LH} checking if rabbit app is running" + + if [ $rc -eq $OCF_RUNNING_MASTER ]; then + if [ $rabbit_running -eq $OCF_SUCCESS ]; then + ocf_log info "${LH} rabbit app is running and is master of cluster" + else + ocf_log err "${LH} we are the master and rabbit app is not running. This is a failure" + exit $OCF_FAILED_MASTER + fi + else + start_time=$((180 + $(ocf_get_private_attr 'rabbit-start-phase-1-time' 0))) + restart_order_time=$((60 + $(ocf_get_private_attr 'rabbit-ordered-to-restart' 0))) + nowtime=$(now) + + # If we started more than 3 minutes ago, and + # we got order to restart less than 1 minute ago + if [ $nowtime -lt $restart_order_time ]; then + if [ $nowtime -gt $start_time ]; then + ocf_log err "${LH} failing because we have received an order to restart from the master" + stop_server_process + rc=$OCF_ERR_GENERIC + else + ocf_log warn "${LH} received an order to restart from the master, ignoring it because we have just started" + fi + fi + fi + + if [ $rc -eq $OCF_ERR_GENERIC ]; then + ocf_log err "${LH} get_status() returns generic error ${rc}" + ocf_log info "${LH} ensuring this slave does not get promoted." 
+ master_score 0 + return $OCF_ERR_GENERIC + fi + + # Recounting our master score + ocf_log info "${LH} preparing to update master score for node" + local our_start_time + local new_score + local node_start_time + local node_score + + our_start_time=$(get_node_start_time $THIS_PCMK_NODE) + + if [ $our_start_time -eq 0 ]; then + new_score=$MIN_MASTER_SCORE + else + new_score=$BEST_MASTER_SCORE + for node in $(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE) + do + node_start_time=$(get_node_start_time $node) + node_score=$(get_node_master_score $node) + + ocf_log info "${LH} comparing us (start time: $our_start_time, score: $new_score) with $node (start time: $node_start_time, score: $node_score)" + if [ $node_start_time -ne 0 ] && [ $node_score -ne 0 ] && [ $node_start_time -lt $our_start_time ]; then + new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score )) + elif [ $node_start_time -ne 0 ] && [ $node_score -ne 0 ] && [ $node_start_time -eq $our_start_time ]; then + # Do not get promoted if the other node is already master and we have the same start time + if is_master $node; then + new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score )) + fi + fi + done + fi + + if [ "$new_score" -ne "$(get_node_master_score $THIS_PCMK_NODE)" ]; then + master_score $new_score + fi + ocf_log info "${LH} our start time is $our_start_time and score is $new_score" + + # Skip all other checks if rabbit app is not running + if [ $rabbit_running -ne $OCF_SUCCESS ]; then + ocf_log info "${LH} RabbitMQ is not running, get_monitor function ready to return ${rc}" + return $rc + fi + + # rc can be SUCCESS or RUNNING_MASTER, don't touch it unless there + # is some error uncovered by node_health_check + if ! node_health_check; then + rc=$OCF_ERR_GENERIC + fi + + if [ $rc -eq $OCF_RUNNING_MASTER ] ; then + # If we are the master and healthy, perform various + # connectivity checks for other nodes in the cluster. 
+ # Order a member to restart if something fishy happens with it. + # All cross-node checks MUST happen only here. + + partitions_report="$(partitions_report)" + + for node in $(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE); do + # Restart node if we don't consider ourselves clustered with it + if ! is_clustered_with $node; then + ocf_log warn "${LH} node $node is not connected with us" + order_node_restart "$node" + continue + fi + + # Restart node if it has any unresolved partitions + node_partitions=$(grep_partitions_report $node "$partitions_report") + if [ ! -z "$node_partitions" ]; then + ocf_log warn "${LH} Node $node thinks that it is partitoned with $node_partitions" + order_node_restart "$node" + continue + fi + done + fi + + ocf_log info "${LH} get_monitor function ready to return ${rc}" + return $rc +} + +order_node_restart() { + local node=${1:?} + ocf_log warn "${LH} Ordering node '$node' to restart" + ocf_update_private_attr 'rabbit-ordered-to-restart' "$(now)" "$node" +} + +# Checks whether node is mentioned somewhere in report returned by +# partitions_report() +grep_partitions_report() { + local node="${1:?}" + local report="${2:?}" + local rabbit_node + rabbit_node=$(rabbit_node_name "$node") + echo "$report" | grep "PARTITIONED $rabbit_node:" | sed -e 's/^[^:]\+: //' +} + +# Report partitions (if any) from viewpoint of every running node in cluster. +# It is parseable/grepable version of `rabbitmqctl cluster_status`. 
#
# If node sees partition, report will contain the line like:
#     PARTITIONED node-name: list-of-nodes, which-node-name-considers, itself-partitioned-with
partitions_report() {
    # The whole here-document is handed to 'rabbitmqctl eval' as one argument
    # (xargs -0 does not split on whitespace or newlines).
    $COMMAND_TIMEOUT xargs -0 ${OCF_RESKEY_ctl} eval <<EOF
RpcTimeout = 10,

Nodes = rabbit_mnesia:cluster_nodes(running),

{Replies, _BadNodes} = gen_server:multi_call(Nodes, rabbit_node_monitor, partitions, RpcTimeout * 1000),

lists:foreach(fun ({_, []}) -> ok;
                  ({Node, Partitions}) ->
                      PartitionsStr = string:join([atom_to_list(Part) || Part <- Partitions],
                                                  ", "),
                      io:format("PARTITIONED ~s: ~s~n",
                                [Node, PartitionsStr])
              end, Replies),

ok.
EOF
}

# Check if the rabbitmqctl control plane is alive.
# Dispatches to node_health_check_local when the rmq_feature_health_check
# parameter is true, and to node_health_check_legacy otherwise; returns the
# status of whichever check ran.
node_health_check() {
    local rc
    # ocf_is_true, as used for the other boolean parameters of this agent
    if ocf_is_true "$OCF_RESKEY_rmq_feature_health_check"; then
        node_health_check_local
        rc=$?
    else
        node_health_check_legacy
        rc=$?
    fi
    return $rc
}

# Health check based on 'rabbitmqctl node_health_check'.
# Returns OCF_ERR_GENERIC when the command failed or hit the timeout retry
# limit (the master score is zeroed in the latter case); a timeout below the
# limit is tolerated and reported as OCF_SUCCESS.
node_health_check_local() {
    local LH="${LH} node_health_check_local():"
    local rc
    local rc_timeouts

    # Give node_health_check some time to handle timeout by itself.
    # By using internal rabbitmqctl timeouts, we allow it to print
    # more useful diagnostics
    local timeout=$((TIMEOUT_ARG - 2))
    su_rabbit_cmd "${OCF_RESKEY_ctl} node_health_check -t $timeout"
    rc=$?

    check_timeouts $rc "rabbit_node_health_check_timeouts" "node_health_check"
    rc_timeouts=$?

    if [ "$rc_timeouts" -eq 2 ]; then
        master_score 0
        ocf_log info "${LH} node_health_check timed out, retry limit reached"
        return $OCF_ERR_GENERIC
    elif [ "$rc_timeouts" -eq 1 ]; then
        ocf_log info "${LH} node_health_check timed out, going to retry"
        return $OCF_SUCCESS
    fi

    if [ "$rc" -ne 0 ]; then
        ocf_log err "${LH} rabbitmqctl node_health_check exited with errors."
        return $OCF_ERR_GENERIC
    else
        return $OCF_SUCCESS
    fi
}

# Health check built from several rabbitmqctl calls: list_channels, the alarm
# list, cluster_status and list_queues.
# Returns OCF_ERR_GENERIC as soon as one of them hits the timeout retry limit
# (after zeroing the master score), or at the end if any call failed, a memory
# alarm is raised for this node, or the cluster status is not ok. Returns
# OCF_SUCCESS otherwise. Logs with the caller's ${LH} prefix.
node_health_check_legacy() {
    # Explicitly start from success: the result must not depend on whatever
    # 'rc' the caller happens to hold.
    local rc=$OCF_SUCCESS
    local rc_alive
    local timeout_alive
    local node
    local name
    su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels > /dev/null 2>&1"
    rc_alive=$?
    { [ $rc_alive -eq 137 ] || [ $rc_alive -eq 124 ] ; } && ocf_log err "${LH} 'rabbitmqctl list_channels' timed out, per-node explanation: $(enhanced_list_channels)"
    check_timeouts $rc_alive "rabbit_list_channels_timeouts" "list_channels"
    timeout_alive=$?

    if [ $timeout_alive -eq 2 ]; then
        master_score 0
        return $OCF_ERR_GENERIC
    elif [ $timeout_alive -eq 0 ]; then
        if [ $rc_alive -ne 0 ]; then
            ocf_log err "${LH} rabbitmqctl list_channels exited with errors."
            rc=$OCF_ERR_GENERIC
        fi
    fi

    # Check for memory alarms for this Master or Slave node.
    # If alert found, reset the alarm
    # and restart the resource as it likely means a dead end situation
    # when rabbitmq cluster is running with blocked publishing due
    # to high memory watermark exceeded.
    local alarms
    local rc_alarms
    local timeout_alarms
    alarms=`su_rabbit_cmd "${OCF_RESKEY_ctl} -q eval 'rabbit_alarm:get_alarms().'" 2>/dev/null`
    rc_alarms=$?
    check_timeouts $rc_alarms "rabbit_get_alarms_timeouts" "get_alarms"
    timeout_alarms=$?

    if [ $timeout_alarms -eq 2 ]; then
        master_score 0
        return $OCF_ERR_GENERIC

    elif [ $timeout_alarms -eq 0 ]; then
        if [ $rc_alarms -ne 0 ]; then
            ocf_log err "${LH} rabbitmqctl get_alarms exited with errors."
            rc=$OCF_ERR_GENERIC

        elif [ -n "${alarms}" ]; then
            for node in ${alarms}; do
                # Extract the quoted node name that follows "memory," in the alarm term
                name=`echo ${node} | perl -n -e "m/memory,'(?<n>\S+)+'/ && print \"$+{n}\n\""`
                if [ "${name}" = "${RABBITMQ_NODENAME}" ] ; then
                    ocf_log err "${LH} Found raised memory alarm. Erasing the alarm and restarting."
                    su_rabbit_cmd "${OCF_RESKEY_ctl} set_vm_memory_high_watermark 10 > /dev/null 2>&1"
                    rc=$OCF_ERR_GENERIC
                    break
                fi
            done
        fi
    fi

    if ! is_cluster_status_ok ; then
        rc=$OCF_ERR_GENERIC
    fi

    # Check if the list of all queues is available,
    # Also report some queues stats and total virtual memory.
    local queues
    local rc_queues
    local timeout_queues
    queues=`su_rabbit_cmd "${OCF_RESKEY_ctl} -q -p ${OCF_RESKEY_default_vhost} list_queues memory messages consumer_utilisation"`
    rc_queues=$?
    check_timeouts $rc_queues "rabbit_list_queues_timeouts" "list_queues"
    timeout_queues=$?

    if [ $timeout_queues -eq 2 ]; then
        master_score 0
        return $OCF_ERR_GENERIC

    elif [ $timeout_queues -eq 0 ]; then
        if [ $rc_queues -ne 0 ]; then
            ocf_log err "${LH} rabbitmqctl list_queues exited with errors."
            rc=$OCF_ERR_GENERIC

        elif [ -n "${queues}" ]; then
            # Columns of ${queues}: memory, messages, consumer_utilisation
            local q_c
            q_c=`printf %b "${queues}\n" | wc -l`
            local mem
            mem=`printf %b "${queues}\n" | awk -v sum=0 '{sum+=$1} END {print (sum/1048576)}'`
            local mes
            mes=`printf %b "${queues}\n" | awk -v sum=0 '{sum+=$2} END {print sum}'`
            local c_u
            c_u=`printf %b "${queues}\n" | awk -v sum=0 -v cnt=${q_c} '{sum+=$3} END {print (sum+1)/(cnt+1)}'`
            local status
            status=`echo $(su_rabbit_cmd "${OCF_RESKEY_ctl} -q status")`
            ocf_log info "${LH} RabbitMQ is running ${q_c} queues consuming ${mem}m of ${TOTALVMEM}m total, with ${mes} queued messages, average consumer utilization ${c_u}"
            ocf_log info "${LH} RabbitMQ status: ${status}"
        fi
    fi

    return $rc
}

# Print the value of a private attrd attribute.
#   $1 - attribute name (mandatory)
#   $2 - default value (mandatory), printed when the query fails or yields an
#        empty value
#   $3 - node name (default: $THIS_PCMK_NODE)
ocf_get_private_attr() {
    local attr_name="${1:?}"
    local attr_default_value="${2:?}"
    local nodename="${3:-$THIS_PCMK_NODE}"
    local count
    count=$(attrd_updater -p --name "$attr_name" --node "$nodename" --query)
    if [ $? -ne 0 ]; then
        echo $attr_default_value
    else
        # The value is taken from the third field, after stripping the double
        # quotes and splitting on '='
        echo "$count" | awk -vdef_val="$attr_default_value" '{ gsub(/"/, "", $3); split($3, vals, "="); if (vals[2] != "") print vals[2]; else print def_val }'
    fi
}

# Set a private attrd attribute.
#   $1 - attribute name (mandatory)
#   $2 - new value (mandatory)
#   $3 - node name (default: $THIS_PCMK_NODE)
ocf_update_private_attr() {
    local attr_name="${1:?}"
    local attr_value="${2:?}"
    local nodename="${3:-$THIS_PCMK_NODE}"
    ocf_run attrd_updater -p --name "$attr_name" --node "$nodename" --update "$attr_value"
}

# Run a rabbitmqctl command and fold the timeout bookkeeping into its result.
#   $1 - rabbitmqctl command line (mandatory)
#   $2 - name of the private attribute holding the timeout counter (mandatory)
# Returns the command's own status when it did not time out, 0 for a tolerated
# timeout and 1 once the timeout retry limit is reached.
rabbitmqctl_with_timeout_check() {
    local command="${1:?}"
    local timeout_attr_name="${2:?}"

    su_rabbit_cmd "${OCF_RESKEY_ctl} $command"
    local rc=$?

    check_timeouts $rc $timeout_attr_name "$command"
    local has_timed_out=$?

    case "$has_timed_out" in
        0)
            return $rc;;
        1)
            return 0;;
        2)
            return 1;;
    esac
}

# Succeeds when 'rabbitmqctl cluster_status' works (or merely timed out below
# the retry limit); all output is discarded.
is_cluster_status_ok() {
    local LH="${LH}: is_cluster_status_ok:"
    rabbitmqctl_with_timeout_check cluster_status rabbit_cluster_status_timeouts > /dev/null 2>&1
}

# Monitor action: thin wrapper around get_monitor with optional debug dumps.
# Returns the status of get_monitor.
action_monitor() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} monitor:"
    local d
    ocf_log debug "${LH} action start."
    if ocf_is_true "${OCF_RESKEY_debug}"; then
        # NOTE(review): debug mode appends the full environment to fixed,
        # predictable paths under /tmp - confirm this is acceptable.
        d=`date '+%Y%m%d %H:%M:%S'`
        echo $d >> /tmp/rmq-monitor.log
        env >> /tmp/rmq-monitor.log
        echo "$d  [monitor] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi
    get_monitor
    rc=$?
    ocf_log debug "${LH} role: ${OCF_RESKEY_CRM_meta_role}"
    ocf_log debug "${LH} result: $rc"
    ocf_log debug "${LH} action end."
    return $rc
}


# Start action.
# Returns OCF_SUCCESS right away when get_status already reports success.
# Otherwise resets the timeout counters, records the phase-1 start time,
# deletes the 'rabbit-start-time' and 'rabbit-master' attributes and returns
# the status of start_rmq_server_app.
action_start() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} start:"
    local nowtime
    local d

    if ocf_is_true "${OCF_RESKEY_debug}"; then
        d=`date '+%Y%m%d %H:%M:%S'`
        echo $d >> /tmp/rmq-start.log
        env >> /tmp/rmq-start.log
        echo "$d  [start]  start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    ocf_log info "${LH} action begin."

    get_status
    rc=$?
    if [ $rc -eq $OCF_SUCCESS ] ; then
        ocf_log warn "${LH} RMQ-runtime (beam) already started."
        return $OCF_SUCCESS
    fi

    local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts rabbit_node_health_check_timeouts"
    local attr_name_to_reset
    for attr_name_to_reset in $attrs_to_zero; do
        ocf_update_private_attr $attr_name_to_reset 0
    done

    nowtime=$(now)
    ocf_log info "${LH} Setting phase 1 one start time to $nowtime"
    ocf_update_private_attr 'rabbit-start-phase-1-time' "$nowtime"
    ocf_log info "${LH} Deleting start time attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
    ocf_log info "${LH} Deleting master attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete

    ocf_log info "${LH} RMQ going to start."
    start_rmq_server_app
    rc=$?
    if [ $rc -eq $OCF_SUCCESS ] ; then
        ocf_log info "${LH} RMQ prepared for start succesfully."
    fi

    ocf_log info "${LH} action end."
    return $rc
}


# Stop action.
# Drops the master and start-time attributes, zeroes the master score, gives
# queues up to half of stop_time to finish syncing and then stops the server
# process. Returns OCF_SUCCESS; exits the whole script with OCF_ERR_GENERIC
# when the server process could not be stopped.
action_stop() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} stop:"
    local d

    if ocf_is_true "${OCF_RESKEY_debug}"; then
        d=$(date '+%Y%m%d %H:%M:%S')
        echo $d >> /tmp/rmq-stop.log
        env >> /tmp/rmq-stop.log
        echo "$d  [stop]  start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    ocf_log info "${LH} action begin."

    ocf_log info "${LH} Deleting master attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
    master_score 0
    ocf_log info "${LH} Deleting start time attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete

    # Wait for synced state first
    ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
    wait_sync $((OCF_RESKEY_stop_time/2))

    ocf_log info "${LH} RMQ-runtime (beam) going to down."
    stop_server_process

    if [ $? -ne $OCF_SUCCESS ] ; then
        ocf_log err "RMQ-runtime (beam) couldn't be stopped and will likely became unmanaged. Take care of it manually!"
        ocf_log info "${LH} action end."
        exit $OCF_ERR_GENERIC
    fi

    ocf_log info "${LH} RMQ-runtime (beam) not running."
    ocf_log info "${LH} action end."
    return $OCF_SUCCESS
}

#######################################################################
# Enhanced list_channels:
# - nodes are processed in parallel
# - report contains information about which nodes timed out
#
# 'list_channels' is used as a health-check for current node, but it
# actually checks overall health of all nodes in cluster. And there were
# some bugs where only one (non-local) channel became stuck, but OCF
# script was wrongfully killing local node.
#
# Hopefully all such bugs are fixed, but if not - it will allow to
# detect such conditions.
+# +# Somewhat strange implementation is due to the following reasons: +# - ability to support older versions of RabbitMQ which have reached +# end-of-life with single version of the script +# - zero dependencies - for older versions this functionality could be +# implemented as a plugin, but it'll require this plugin installation +enhanced_list_channels() { + # One second less than timeout of su_rabbit_cmd + local timeout=$((${TIMEOUT_ARG:-5} - 1)) + + su_rabbit_cmd "xargs -0 ${OCF_RESKEY_ctl} eval" <<EOF +SecondsToCompletion = $timeout, + +%% Milliseconds since unix epoch +Now = fun() -> + {Mega, Secs, Micro} = os:timestamp(), + Mili = Micro div 1000, + Mili + 1000 * (Secs + 1000000 * Mega) + end, + +%% We shouldn't continue execution past this time +ShouldEndAt = Now() + SecondsToCompletion * 1000, + +%% How many milliseconds we still have +Timeout = fun() -> + case ShouldEndAt - Now() of + Past when Past =< 0 -> + 0; + Timeout -> + Timeout + end + end, + +%% Lambda combinator - for defining anonymous recursive functions +Y = fun(F) -> + (fun (X) -> F(fun(Y) -> (X(X))(Y) end) end)( + fun (X) -> F(fun(Y) -> (X(X))(Y) end) end) + end, + +Parent = self(), + +ListChannels = Y(fun(Rec) -> + fun (({Node, [], OkChannelsCount})) -> + Parent ! {Node, ok, OkChannelsCount}; + ({Node, [Chan|Rest], OkChannelsCount}) -> + case catch rpc:call(Node, rabbit_channel, info, [Chan], Timeout()) of + Infos when is_list(Infos) -> + Rec({Node, Rest, OkChannelsCount + 1}); + {badrpc, {'EXIT', {noproc, _}}} -> + %% Channel became dead before we could request it's status, don't care + Rec({Node, Rest, OkChannelsCount}); + Err -> + Parent ! {Node, Err, OkChannelsCount} + end + end + end), + +SingleNodeListing = fun(Node) -> + case catch rpc:call(Node, pg_local, get_members, [rabbit_channels], Timeout()) of + LocalChannels when is_list(LocalChannels) -> + ListChannels({Node, LocalChannels, 0}); + Err -> + Parent ! 
{Node, Err, 0} + end + end, + +AllNodes = rabbit_mnesia:cluster_nodes(running), +[ spawn(fun() -> SingleNodeListing(Node) end) || Node <- AllNodes ], + +WaitForNodes = Y(fun(Rec) -> + fun ({[], Acc}) -> + Acc; + ({RemainingNodes, Acc}) -> + receive + {Node, _Status, _ChannelCount} = Smth -> + RemainingNodes1 = lists:delete(Node, RemainingNodes), + Rec({RemainingNodes1, [Smth|Acc]}) + after Timeout() + 100 -> + Acc + end + end + end), + +Result = WaitForNodes({AllNodes, []}), + +ExpandedResult = [ case lists:keysearch(Node, 1, Result) of + {value, NodeResult} -> + NodeResult; + false -> + {Node, no_data_collected, 0} + end || Node <- AllNodes ], + +ExpandedResult. +EOF +} + +####################################################################### +# Join the cluster and return OCF_SUCCESS, if joined. +# Return 10, if node is trying to join to itself or empty destination. +# Return OCF_ERR_GENERIC, if cannot join. +jjj_join () { + local join_to="$1" + local rc=$OCF_ERR_GENERIC + local LH="${LL} jjj_join:" + + my_host ${join_to} + rc=$? + ocf_log debug "${LH} node='${join_to}' rc='${rc}'" + + # Check whether we are joining to ourselves + # or master host is not given + if [ $rc -ne 0 ] && [ "${join_to}" ] ; then + ocf_log info "${LH} Joining to cluster by node '${join_to}'" + join_to_cluster "${join_to}" + rc=$? + if [ $rc -ne $OCF_SUCCESS ] ; then + ocf_log err "${LH} Failed to join the cluster. The mnesia will be reset." 
+ reset_mnesia + rc=$OCF_ERR_GENERIC + fi + fi + return $rc +} + +action_notify() { + local rc_join=$OCF_SUCCESS + local rc=$OCF_ERR_GENERIC + local rc2=$OCF_ERR_GENERIC + local LH="${LL} notify:" + local nodelist + + if ocf_is_true "${OCF_RESKEY_debug}"; then + d=`date '+%Y%m%d %H:%M:%S'` + echo $d >> /tmp/rmq-notify.log + env >> /tmp/rmq-notify.log + echo "$d [notify] ${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation} promote='${OCF_RESKEY_CRM_meta_notify_promote_uname}' demote='${OCF_RESKEY_CRM_meta_notify_demote_uname}' master='${OCF_RESKEY_CRM_meta_notify_master_uname}' slave='${OCF_RESKEY_CRM_meta_notify_slave_uname}' start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log + fi + + if [ "${OCF_RESKEY_CRM_meta_notify_type}" = 'post' ] ; then + # POST- anything notify section + case "$OCF_RESKEY_CRM_meta_notify_operation" in + promote) + ocf_log info "${LH} post-promote begin." + + rc=$OCF_SUCCESS + + # Do nothing, if the list of nodes being promoted reported empty. + # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic + if [ -z "${OCF_RESKEY_CRM_meta_notify_promote_uname}" ] ; then + ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do." + + elif my_host "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then + ocf_log info "${LH} ignoring post-promote of self" + + elif is_clustered_with "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then + if get_status rabbit; then + ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. Nothing to do." + else + ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. We only need to start the app." + + try_to_start_rmq_app + rc2=$? 
+ update_rabbit_start_time_if_rc $rc2 + fi + + else + # Note, this should fail when the mnesia is inconsistent. + # For example, when the "old" master is processing the promotion of the new one. + # Later this ex-master node will rejoin the cluster at post-start. + jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}" + rc=$? + if [ $rc -eq $OCF_ERR_GENERIC ] ; then + ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted." + fi + fi + + ocf_log info "${LH} post-promote end." + return $rc + ;; + start) + ocf_log info "${LH} post-start begin." + local nodes_list="${OCF_RESKEY_CRM_meta_notify_start_uname} ${OCF_RESKEY_CRM_meta_notify_active_uname}" + # Do nothing, if the list of nodes being started or running reported empty + # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic + if [ -z "${OCF_RESKEY_CRM_meta_notify_start_uname}" ] && [ -z "${OCF_RESKEY_CRM_meta_notify_active_uname}" ] ; then + ocf_log warn "${LH} I'm a last man standing and I must survive!" + ocf_log info "${LH} post-start end." + return $OCF_SUCCESS + fi + # check did this event from this host + my_host "${nodes_list}" + rc=$? + # Do nothing, if there is no master reported + # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic + if [ -z "${OCF_RESKEY_CRM_meta_notify_master_uname}" ] ; then + ocf_log warn "${LH} there are no nodes to join to reported on post-start. Nothing to do." + ocf_log info "${LH} post-start end." + return $OCF_SUCCESS + fi + if [ $rc -eq $OCF_SUCCESS ] ; then + # Now we need to: + # a. join to the cluster if we are not joined yet + # b. start the RabbitMQ application, which is always + # stopped after start action finishes + check_need_join_to ${OCF_RESKEY_CRM_meta_notify_master_uname} + rc_join=$? 
+ if [ $rc_join -eq $OCF_SUCCESS ]; then + ocf_log warn "${LH} Going to join node ${OCF_RESKEY_CRM_meta_notify_master_uname}" + jjj_join "${OCF_RESKEY_CRM_meta_notify_master_uname}" + rc2=$? + else + ocf_log warn "${LH} We are already clustered with node ${OCF_RESKEY_CRM_meta_notify_master_uname}" + + try_to_start_rmq_app + rc2=$? + update_rabbit_start_time_if_rc $rc2 + fi + if [ -s "${OCF_RESKEY_definitions_dump_file}" ] ; then + ocf_log info "File ${OCF_RESKEY_definitions_dump_file} exists" + ocf_run curl --silent --show-error --request POST --user $OCF_RESKEY_admin_user:$OCF_RESKEY_admin_password $OCF_RESKEY_host_ip:15672/api/definitions --header "Content-Type:application/json" --data @$OCF_RESKEY_definitions_dump_file + rc=$? + if [ $rc -eq $OCF_SUCCESS ] ; then + ocf_log info "RMQ definitions have imported succesfully." + else + ocf_log err "RMQ definitions have not imported." + fi + fi + if [ $rc2 -eq $OCF_ERR_GENERIC ] ; then + ocf_log warn "${LH} Failed to join the cluster on post-start. The resource will be restarted." + ocf_log info "${LH} post-start end." + return $OCF_ERR_GENERIC + fi + fi + ocf_log info "${LH} post-start end." + ;; + stop) + # if rabbitmq-server stops on any another node, we should remove it from cluster (as ordinary operation) + ocf_log info "${LH} post-stop begin." + # Report not running, if there are no nodes being stopped reported + if [ -z "${OCF_RESKEY_CRM_meta_notify_stop_uname}" ] ; then + ocf_log warn "${LH} there are no nodes being stopped reported on post-stop. The resource will be restarted." + ocf_log info "${LH} post-stop end." + return $OCF_ERR_GENERIC + fi + my_host "${OCF_RESKEY_CRM_meta_notify_stop_uname}" + rc=$? 
+ if [ $rc -ne $OCF_SUCCESS ] ; then + # Wait for synced state first + ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync" + wait_sync $((OCF_RESKEY_stop_time/2)) + # On other nodes processing the post-stop, make sure the stopped node will be forgotten + unjoin_nodes_from_cluster "${OCF_RESKEY_CRM_meta_notify_stop_uname}" + else + # On the nodes being stopped, reset the master score + ocf_log info "${LH} resetting the master score." + master_score 0 + fi + # always returns OCF_SUCCESS + ocf_log info "${LH} post-stop end." + ;; + *) ;; + esac + fi + + return $OCF_SUCCESS +} + + +action_promote() { + local rc=$OCF_ERR_GENERIC + local LH="${LL} promote:" + + if ocf_is_true "${OCF_RESKEY_debug}"; then + d=$(date '+%Y%m%d %H:%M:%S') + echo $d >> /tmp/rmq-promote.log + env >> /tmp/rmq-promote.log + echo "$d [promote] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log + fi + + ocf_log info "${LH} action begin." + + get_monitor + rc=$? + ocf_log info "${LH} get_monitor returns ${rc}" + case "$rc" in + "$OCF_SUCCESS") + # Running as slave. Normal, expected behavior. + ocf_log info "${LH} Resource is currently running as Slave" + # rabbitmqctl start_app if need + get_status rabbit + rc=$? + ocf_log info "${LH} Updating cluster master attribute" + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --update 'true' + if [ $rc -ne $OCF_SUCCESS ] ; then + ocf_log info "${LH} RMQ app is not started. Starting..." + start_rmq_server_app + rc=$? + if [ $rc -eq 0 ] ; then + try_to_start_rmq_app + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "${LH} Can't start RMQ app. Master resource is failed." + ocf_log info "${LH} action end." + exit $OCF_FAILED_MASTER + fi + + [ -f "${OCF_RESKEY_policy_file}" ] && . 
"${OCF_RESKEY_policy_file}" + + update_rabbit_start_time_if_rc $rc + + ocf_log info "${LH} Checking master status" + get_monitor + rc=$? + ocf_log info "${LH} Master status is $rc" + if [ $rc = $OCF_RUNNING_MASTER ] + then + rc=$OCF_SUCCESS + else + ocf_log err "${LH} Master resource is failed." + ocf_log info "${LH} action end." + exit $OCF_FAILED_MASTER + fi + else + ocf_log err "${LH} Can't start RMQ-runtime." + rc=$OCF_ERR_GENERIC + fi + fi + return $rc + ;; + "$OCF_RUNNING_MASTER") + # Already a master. Unexpected, but not a problem. + ocf_log warn "${LH} Resource is already running as Master" + rc=$OCF_SUCCESS + ;; + + "$OCF_FAILED_MASTER") + # Master failed. + ocf_log err "${LH} Master resource is failed and not running" + ocf_log info "${LH} action end." + exit $OCF_FAILED_MASTER + ;; + + "$OCF_NOT_RUNNING") + # Currently not running. + ocf_log err "${LH} Resource is currently not running" + rc=$OCF_NOT_RUNNING + ;; + *) + # Failed resource. Let the cluster manager recover. + ocf_log err "${LH} Unexpected error, cannot promote" + ocf_log info "${LH} action end." + exit $rc + ;; + esac + + # transform slave RMQ-server to master + + ocf_log info "${LH} action end." + return $rc +} + + +action_demote() { + local LH="${LL} demote:" + ocf_log info "${LH} action begin." + ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete + ocf_log info "${LH} action end." + return $OCF_SUCCESS +} +####################################################################### + +case "$1" in + meta-data) meta_data + exit $OCF_SUCCESS;; + usage|help) usage + exit $OCF_SUCCESS;; +esac + +rmq_setup_env + +# Anything except meta-data and help must pass validation +action_validate || exit $? + +# What kind of method was invoked? 
+case "$1" in + start) action_start;; + stop) action_stop;; + status) action_status;; + monitor) action_monitor;; + validate) action_validate;; + promote) action_promote;; + demote) action_demote;; + notify) action_notify;; + validate-all) action_validate;; + *) usage;; +esac +### diff --git a/heartbeat/redis.in b/heartbeat/redis.in new file mode 100755 index 0000000..6429477 --- /dev/null +++ b/heartbeat/redis.in @@ -0,0 +1,783 @@ +#!@BASH_SHELL@ +# +# Resource agent script for redis server. +# +# Copyright (c) 2013 Patrick Hemmer <patrick.hemmer@gmail.com> +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_bin_default="/usr/bin/redis-server" +OCF_RESKEY_client_bin_default="/usr/bin/redis-cli" +if [ -f "/etc/redis.conf" ]; then + OCF_RESKEY_config_default="/etc/redis.conf" +else + OCF_RESKEY_config_default="/etc/redis/redis.conf" +fi +OCF_RESKEY_user_default="redis" +OCF_RESKEY_rundir_default="/var/run/redis" +OCF_RESKEY_pidfile_name_default="redis-server.pid" +OCF_RESKEY_socket_name_default="redis.sock" +OCF_RESKEY_port_default="6379" +OCF_RESKEY_tunnel_host_default="127.0.0.1" +OCF_RESKEY_tunnel_port_map_default="" +OCF_RESKEY_wait_last_known_master_default="false" + +: ${OCF_RESKEY_bin=${OCF_RESKEY_bin_default}} +: ${OCF_RESKEY_client_bin=${OCF_RESKEY_client_bin_default}} +: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} +: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +: ${OCF_RESKEY_rundir=${OCF_RESKEY_rundir_default}} +: ${OCF_RESKEY_pidfile_name=${OCF_RESKEY_pidfile_name_default}} +: ${OCF_RESKEY_socket_name=${OCF_RESKEY_socket_name_default}} +: ${OCF_RESKEY_port=${OCF_RESKEY_port_default}} +: ${OCF_RESKEY_tunnel_host=${OCF_RESKEY_tunnel_host_default}} +: ${OCF_RESKEY_tunnel_port_map=${OCF_RESKEY_tunnel_port_map_default}} +: ${OCF_RESKEY_wait_last_known_master=${OCF_RESKEY_wait_last_known_master_default}} + +CHECK_SLAVE_STATE=0 + +REDIS_CHECK_DUMP="/usr/bin/redis-check-dump" +REDIS_SERVER="$OCF_RESKEY_bin" +REDIS_CLIENT="$OCF_RESKEY_client_bin" +REDIS_CONFIG="$OCF_RESKEY_config" +REDIS_USER="$OCF_RESKEY_user" +REDIS_RUNDIR="$OCF_RESKEY_rundir" +REDIS_PIDFILE="$OCF_RESKEY_rundir/$OCF_RESKEY_pidfile_name" +REDIS_SOCKET="$OCF_RESKEY_rundir/$OCF_RESKEY_socket_name" +REDIS_REPLICATION_PORT="$OCF_RESKEY_port" + +if ! 
[ -f $REDIS_CHECK_DUMP ]; then + REDIS_CHECK_DUMP="$(which redis-check-dump 2>/dev/null)" +fi +if [ -z "$REDIS_CHECK_DUMP" ]; then + REDIS_CHECK_DUMP="$(which redis-check-rdb 2>/dev/null)" +fi + +if [ -r "$REDIS_CONFIG" ]; then + REDIS_DUMP_DIR="$(grep "^\s*dir\s" < "$REDIS_CONFIG" | awk '{ print $2 }' 2>/dev/null)" + REDIS_DUMP_FILE="$(grep "^\s*dbfilename\s" < "$REDIS_CONFIG" | awk '{ print $2 }' 2>/dev/null)" +fi +: ${REDIS_DUMP_DIR:=/var/lib/redis/} +: ${REDIS_DUMP_FILE:=dump.rdb} + +redis_meta_data() { + cat <<EOI +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="redis" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource agent script for redis server. + +This resource fully supports master/slave replication. The master preference of a node is determined by the 'slave_priority' parameter of the redis config. +When taking the resource from 'unmanaged' to 'managed', the currently active master will be given a priority of 1000 (plus 1 for each active connection). The default 'slave_priority' is 100, so the master will stay master. For a slave to become master after converting the resource to managed, set a slave_priority greater than 1000. 
+</longdesc> + +<shortdesc lang="en">Redis server</shortdesc> + +<parameters> +<parameter name="bin" unique="0" required="0"> +<longdesc lang="en"> +Path to \`redis-server\` +</longdesc> +<shortdesc lang="en">Path to \`redis-server\`</shortdesc> +<content type="string" default="${OCF_RESKEY_bin_default}" /> +</parameter> + +<parameter name="client_bin" unique="0" required="0"> +<longdesc lang="en"> +Path to \`redis-cli\` +</longdesc> +<shortdesc lang="en">Path to \`redis-cli\`</shortdesc> +<content type="string" default="${OCF_RESKEY_client_bin_default}" /> +</parameter> + +<parameter name="config" unique="1" required="0"> +<longdesc lang="en"> +Path to 'redis.conf' +</longdesc> +<shortdesc lang="en">Path to 'redis.conf'</shortdesc> +<content type="string" default="${OCF_RESKEY_config_default}" /> +</parameter> + +<parameter name="user" unique="0" required="0"> +<longdesc lang="en"> +User to run redis as +</longdesc> +<shortdesc lang="en">Redis user</shortdesc> +<content type="string" default="${OCF_RESKEY_user_default}" /> +</parameter> + +<parameter name="rundir" unique="1" required="0"> +<longdesc lang="en"> +Directory to store socket and pid file in +</longdesc> +<shortdesc lang="en">Redis var/run dir</shortdesc> +<content type="string" default="${OCF_RESKEY_rundir_default}"/> +</parameter> + +<parameter name="pidfile_name" unique="0" required="0"> +<longdesc lang="en"> +The filename to use for the pidfile. Will be created in the rundir. +Should only be a basename, not a full path. +</longdesc> +<shortdesc lang="en">Redis pidfile name</shortdesc> +<content type="string" default="${OCF_RESKEY_pidfile_name_default}"/> +</parameter> + +<parameter name="socket_name" unique="0" required="0"> +<longdesc lang="en"> +The filename to use for the socket. Will be crated in the rundir. +Should only be a basename, not a full path. 
+</longdesc> +<shortdesc lang="en">Redis socket name</shortdesc> +<content type="string" default="${OCF_RESKEY_socket_name_default}"/> +</parameter> + +<parameter name="port" unique="0" required="0"> +<longdesc lang="en"> +Port for replication client to connect to on remote server +</longdesc> +<shortdesc lang="en">Replication port</shortdesc> +<content type="string" default="${OCF_RESKEY_port_default}"/> +</parameter> + +<parameter name="tunnel_host" unique="0" required="0"> +<longdesc lang="en"> +When replication traffic is tunnelled, this is the host to target +to forward outgoing traffic to the redis master. The resource +agent configures the redis slave to target the master via +tunnel_host:tunnel_port. + +Note that in order to enable replication traffic tunneling, +parameter {tunnel_port_map} must be populated. +</longdesc> +<shortdesc lang="en">Tunnel host for replication traffic</shortdesc> +<content type="string" default="${OCF_RESKEY_tunnel_host_default}"/> +</parameter> + +<parameter name="tunnel_port_map" unique="0" required="0"> +<longdesc lang="en"> +A mapping of pacemaker node names to redis port number. + +To be used when redis servers need to tunnel replication traffic. +On every node where the redis resource is running, the redis server +listens to a different port. Each redis server can access its peers +for replication traffic via a tunnel accessible at {tunnel_host}:port. 
+ +The mapping the form of: +pcmk1-name:port-for-redis1;pcmk2-name:port-for-redis2;pcmk3-name:port-for-redis3 + +where the redis resource started on node pcmk1-name would listen on +port port-for-redis1 +</longdesc> +<shortdesc lang="en">Mapping of Redis server name to redis port</shortdesc> +<content type="string" default="${OCF_RESKEY_tunnel_port_map_default}"/> +</parameter> + +<parameter name="wait_last_known_master" unique="0" required="0"> +<longdesc lang="en"> +During redis cluster bootstrap, wait for the last known master to be +promoted before allowing any other instances in the cluster to be +promoted. This lessens the risk of data loss when persistent data +is in use. +</longdesc> +<shortdesc lang="en">Wait for last known master</shortdesc> +<content type="boolean" default="${OCF_RESKEY_wait_last_known_master_default}"/> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="120s" /> +<action name="stop" timeout="120s" /> +<action name="status" timeout="60s" /> +<action name="monitor" depth="0" timeout="60s" interval="45s" /> +<action name="monitor" role="Promoted" depth="0" timeout="60s" interval="20s" /> +<action name="monitor" role="Unpromoted" depth="0" timeout="60s" interval="60s" /> +<action name="promote" timeout="120s" /> +<action name="demote" timeout="120s" /> +<action name="notify" timeout="90s" /> +<action name="validate-all" timeout="5s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +EOI +} + +INSTANCE_ATTR_NAME=$(echo "${OCF_RESOURCE_INSTANCE}" | awk -F : '{print $1}') +CRM_ATTR_REPL_INFO="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_INFO -s redis_replication" +MASTER_HOST="" +MASTER_ACTIVE_CACHED="" +MASTER_ACTIVE="" +CLI_HAVE_AUTH_WARNING=0 +CLI_HAVE_ARG_NO_AUTH_WARNING=0 +CLI_HAVE_ENV_AUTH=0 + +redis_cli_features() +{ + + CLI_VER=$("$REDIS_CLIENT" -v | awk '{print $NF}') + # Starting with 4.0.10 there is a warning on stderr when using a pass + # Starting 
with 5.0.0 there is an argument to silence the warning: --no-auth-warning + # Starting with 5.0.3 there is an option to use REDISCLI_AUTH environment variable for password, no warning in this case + + ocf_version_cmp $CLI_VER 5.0.3 + res=$? + if [[ res -ge 1 ]]; then + CLI_HAVE_ENV_AUTH=1 + fi + + ocf_version_cmp $CLI_VER 5.0.0 + res=$? + if [[ res -ge 1 ]]; then + CLI_HAVE_ARG_NO_AUTH_WARNING=1 + fi + + ocf_version_cmp $CLI_VER 4.0.10 + res=$? + if [[ res -ge 1 ]]; then + CLI_HAVE_AUTH_WARNING=1 + fi +} + +master_is_active() +{ + if [ -z "$MASTER_ACTIVE_CACHED" ]; then + # determine if a master instance is already up and is healthy + ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.1.0" + res=$? + if [ -z "$OCF_RESKEY_crm_feature_set" ] || [ $res -eq 2 ]; then + XMLOPT="--output-as=xml" + ocf_version_cmp "$OCF_RESKEY_crm_feature_set" "3.2.0" + if [ $? -eq 1 ]; then + crm_mon_no_validation -1 $XMLOPT >/dev/null 2>&1 + if [ $? -ne 0 ]; then + XMLOPT="--as-xml" + fi + fi + else + XMLOPT="--as-xml" + fi + crm_mon_no_validation -1 $XMLOPT | grep -q -i -E "resource.*id=\"${OCF_RESOURCE_INSTANCE}\".* role=\"(Promoted|Master)\".* active=\"true\".* orphaned=\"false\".* failed=\"false\"" + MASTER_ACTIVE=$? + MASTER_ACTIVE_CACHED="true" + fi + return $MASTER_ACTIVE +} + +set_master() +{ + MASTER_HOST="$1" + ${CRM_ATTR_REPL_INFO} -v "$1" -q +} + +last_known_master() +{ + if [ -z "$MASTER_HOST" ]; then + MASTER_HOST="$(${CRM_ATTR_REPL_INFO} --query -q 2>/dev/null)" + fi + echo "$MASTER_HOST" +} + +crm_master_reboot() { + local node + node=$(ocf_attribute_target) + "${HA_SBIN_DIR}/crm_master" -N "$node" -l reboot "$@" +} + +calculate_score() +{ + perf_score="$1" + connected_clients="$2" + + if ocf_is_true "$OCF_RESKEY_wait_last_known_master"; then + # only set preferred score by slave_priority if + # we are not waiting for the last known master. Otherwise + # we want the agent to have complete control over the scoring. 
+ perf_score="" + connected_clients="0" + fi + + if [[ -z "$perf_score" ]]; then + if [[ "$(last_known_master)" == "$NODENAME" ]]; then + perf_score=1000 + else + perf_score=1 + fi + fi + perf_score=$(( perf_score + connected_clients )) + echo "$perf_score" +} + +set_score() +{ + local score + local last_master + + score="$1" + + if ocf_is_true "$OCF_RESKEY_wait_last_known_master" && ! master_is_active; then + last_master="$(last_known_master)" + if [ -n "$last_master" ] && [[ "$last_master" != "$NODENAME" ]]; then + ocf_log info "Postponing setting master score for ${NODENAME} until last known master instance [${last_master}] is promoted" + return + fi + fi + + ocf_log debug "monitor: Setting master score to '$score'" + crm_master_reboot -v "$score" +} + +redis_client() { + ocf_log debug "redis_client: '$REDIS_CLIENT' -s '$REDIS_SOCKET' $*" + if [ -n "$clientpasswd" ]; then + # Consider redis-cli features to choose optimal password passing method and warning filtering workaround + if [[ CLI_HAVE_ENV_AUTH -eq 1 ]]; then + REDISCLI_AUTH=$clientpasswd "$REDIS_CLIENT" -s "$REDIS_SOCKET" "$@" | sed 's/\r//' + elif [[ CLI_HAVE_ARG_NO_AUTH_WARNING -eq 1 ]]; then + "$REDIS_CLIENT" -s "$REDIS_SOCKET" --no-auth-warning -a "$clientpasswd" "$@" | sed 's/\r//' + elif [[ CLI_HAVE_AUTH_WARNING -eq 1 ]]; then + ("$REDIS_CLIENT" -s "$REDIS_SOCKET" -a "$clientpasswd" "$@" 2>&1 >&3 3>&- | grep -v "Using a password" >&2 3>&-) 3>&1 | sed 's/\r//' + else + "$REDIS_CLIENT" -s "$REDIS_SOCKET" -a "$clientpasswd" "$@" | sed 's/\r//' + fi + else + "$REDIS_CLIENT" -s "$REDIS_SOCKET" "$@" | sed 's/\r//' + fi +} + +simple_status() { + local pid + + if ! 
[ -f "$REDIS_PIDFILE" ]; then + return $OCF_NOT_RUNNING + fi + + pid="$(<"$REDIS_PIDFILE")" + pidof $(basename "$REDIS_SERVER") | grep -q "\<$pid\>" || return $OCF_NOT_RUNNING + + ocf_log debug "monitor: redis-server running under pid $pid" + + return $OCF_SUCCESS +} + +redis_monitor() { + local res + local master_name + local last_known_master_port + + simple_status + res=$? + if (( res != OCF_SUCCESS )); then + return $res + fi + + typeset -A info + while read line; do + [[ "$line" == "#"* ]] && continue + [[ "$line" != *":"* ]] && continue + IFS=':' read -r key value <<< "$line" + info[$key]="$value" + done < <(redis_client info) + if [[ -z "${info[role]}" ]]; then + ocf_log err "monitor: Could not get role from \`$REDIS_CLIENT -s $REDIS_SOCKET info\`" + return $OCF_ERR_GENERIC + fi + + if ocf_is_ms; then + # Here we see if a score has already been set. + # If score isn't set we use the redis setting 'slave_priority'. + # If that isn't set, we default to 1000 for a master, and 1 for slave. 
+ # We then add 1 for each connected client + score="$(crm_master_reboot -G --quiet 2>/dev/null)" + if [[ -z "$score" ]]; then + score=$(calculate_score "${info[slave_priority]}" "${info[connected_clients]}") + set_score "$score" + fi + + if [[ "${info[role]}" == "master" ]]; then + if ocf_is_probe; then + set_master "$NODENAME" + fi + return $OCF_RUNNING_MASTER + fi + + if [ "$CHECK_SLAVE_STATE" -eq 1 ]; then + if [[ "${info[master_link_status]}" != "up" ]]; then + ocf_log info "monitor: Slave mode link has not yet been established (link=${info[master_link_status]})" + return $OCF_ERR_GENERIC + fi + if [[ "${info[master_host]}" != "$(last_known_master)" ]]; then + if [ -n "${OCF_RESKEY_tunnel_port_map}" ]; then + master_name=$(port_to_redis_node ${info[master_port]}) + last_known_master_port=$(redis_node_to_port $(last_known_master)) + if [[ "${info[master_host]}" != "${OCF_RESKEY_tunnel_host}" ]] || + [[ "${info[master_port]}" != "${last_known_master_port}" ]]; then + ocf_log err "monitor: Slave mode current tunnelled connection to redis server does not match running master. tunnelled='${info[master_host]}:${info[master_port]} (${master_name})', running='$(last_known_master)'" + return $OCF_ERR_GENERIC + fi + else + ocf_log err "monitor: Slave mode current master does not match running master. 
current=${info[master_host]}, running=$(last_known_master)" + return $OCF_ERR_GENERIC + fi + fi + fi + fi + return $OCF_SUCCESS +} + +redis_node_to_port() +{ + local node=$1 + echo "$OCF_RESKEY_tunnel_port_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' '$1=="'"$node"'" {print $2;exit}' +} + +port_to_redis_node() +{ + local port=$1 + echo "$OCF_RESKEY_tunnel_port_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' '$2=="'"$port"'" {print $1;exit}' +} + +get_tunnel_port_from_master() +{ + local master_name=$1 + crm_attribute --node "$master_name" -l forever --name ${INSTANCE_ATTR_NAME}-tunnel-port --query -q 2>/dev/null +} + +get_master_from_tunnel_port() +{ + local master_name=$1 + crm_attribute --node "$master_name" -l forever --name ${INSTANCE_ATTR_NAME}-tunnel-port --query -q 2>/dev/null +} + +check_dump_file() +{ + if ! have_binary "$REDIS_CHECK_DUMP"; then + return 0 + fi + $REDIS_CHECK_DUMP ${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE} 2>&1 +} + +redis_start() { + local size + + redis_monitor + status=$? + + if (( status == OCF_SUCCESS )) || (( status == OCF_RUNNING_MASTER )); then + ocf_log info "start: redis is already running" + return $OCF_SUCCESS + fi + + [[ ! -d "$REDIS_RUNDIR" ]] && mkdir -p "$REDIS_RUNDIR" + chown -R "$REDIS_USER" "$REDIS_RUNDIR" + if have_binary "restorecon"; then + restorecon -Rv "$REDIS_RUNDIR" + fi + + + # check for 0 byte database dump file. This is an unrecoverable start + # condition that we can avoid by deleting the 0 byte database file. + if [ -f "${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE}" ]; then + size="$(stat --format "%s" ${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE})" + if [ "$?" -eq "0" ] && [ "$size" -eq "0" ]; then + ocf_log notice "Detected 0 byte ${REDIS_DUMP_FILE}, deleting zero length file to avoid start failure." 
+ rm -f "${REDIS_DUMP_DIR}/${REDIS_DUMP_FILE}" + fi + fi + + ocf_log info "start: $REDIS_SERVER --daemonize yes --unixsocket '$REDIS_SOCKET' --pidfile '$REDIS_PIDFILE'" + output="$(su "$REDIS_USER" -s /bin/sh -c "cd '$REDIS_RUNDIR'; exec '$REDIS_SERVER' '$REDIS_CONFIG' --daemonize yes --unixsocket '$REDIS_SOCKET' --pidfile '$REDIS_PIDFILE'" 2>&1)" + + while true; do + # wait for redis to start + typeset -A info + while read line; do + [[ "$line" == "#"* ]] && continue + [[ "$line" != *":"* ]] && continue + IFS=':' read -r key value <<< "$line" + info[$key]="$value" + done < <(redis_client info) + + if (( info[loading] == 0 )); then + break + elif (( info[loading] == 1 )); then + sleep "${info[loading_eta_seconds]}" + elif pidof $(basename "$REDIS_SERVER") >/dev/null; then + # unknown error, but the process still exists. + # This check is mainly because redis daemonizes before it starts listening, causing `redis-cli` to fail + # See https://github.com/antirez/redis/issues/2368 + # It's possible that the `pidof` will pick up a different redis, but in that case, the start operation will just time out + sleep 1 + else + check_output="$(check_dump_file)" + ocf_log err "start: Unknown error waiting for redis to start. redis-check-dump output=${check_output//$'\n'/; }" + return $OCF_ERR_GENERIC + fi + done + + while ! [ -s "$REDIS_PIDFILE" ]; do + ocf_log debug "start: Waiting for pid file '$REDIS_PIDFILE' to appear" + sleep 1 + done + + ocf_is_ms && redis_demote # pacemaker expects resources to start in slave mode + + redis_monitor + status=$? + if (( status == OCF_SUCCESS )) || (( status == OCF_RUNNING_MASTER )); then + return $OCF_SUCCESS + fi + + check_output="$(check_dump_file)" + ocf_log err "start: Unknown error starting redis. redis-server output=${output//$'\n'/; } redis-check-dump output=${check_output//$'\n'/; }" + return $status +} + +redis_stop() { + redis_monitor + status=$? 
+ + if (( status == OCF_NOT_RUNNING )); then + ocf_log info "stop: redis is already stopped" + crm_master_reboot -D + return $OCF_SUCCESS + fi + + pid="$(<"$REDIS_PIDFILE")" + kill -TERM "$pid" + + while true; do + simple_status + status=$? + if (( status == OCF_NOT_RUNNING )); then + crm_master_reboot -D + return $OCF_SUCCESS + fi + sleep 1 + done +} + +redis_promote() { + redis_monitor + status=$? + + if (( status == OCF_RUNNING_MASTER )); then + ocf_log info "promote: Already running as master" + set_master "$NODENAME" + return $OCF_SUCCESS + elif (( status != OCF_SUCCESS )); then + ocf_log err "promote: Node is not running as a slave" + return $OCF_ERR_GENERIC + fi + + redis_client slaveof no one + + redis_monitor + status=$? + if (( status == OCF_RUNNING_MASTER )); then + set_master "$NODENAME" + return $OCF_SUCCESS + fi + + ocf_log err "promote: Unknown error while promoting to master (status=$status)" + return $OCF_ERR_GENERIC +} + +redis_demote() { + local master_host + local master_port + local tunnel_port + + # client kill is only supported in Redis 2.8.12 or greater + version=$(redis_client -v | awk '{print $NF}') + ocf_version_cmp "$version" "2.8.11" + client_kill=$? + + CHECK_SLAVE_STATE=1 + redis_monitor + status=$? + + if (( status == OCF_SUCCESS )); then + ocf_log info "demote: Already running as slave" + return $OCF_SUCCESS + elif (( status == OCF_NOT_RUNNING )); then + ocf_log err "demote: Failed to demote, redis not running." + return $OCF_NOT_RUNNING + fi + + master_host="$(last_known_master)" + master_port="${REDIS_REPLICATION_PORT}" + + # The elected master has to remain a slave during startup. + # During this period a placeholder master host is assigned. + if [ -z "$master_host" ] || [[ "$master_host" == "$NODENAME" ]]; then + CHECK_SLAVE_STATE=0 + master_host="no-such-master" + elif ! master_is_active; then + # no master has been promoted yet. we'll be notified when the + # master starts. 
+ CHECK_SLAVE_STATE=0 + master_host="no-such-master" + fi + + if [ -n "${OCF_RESKEY_tunnel_port_map}" ]; then + # master_host can be the special marker "no-such-master" + # while a master is being selected. In this case, no + # tunnel port is returned, but this is not fatal. + tunnel_port=$(redis_node_to_port "$master_host") + if [ -n "$tunnel_port" ]; then + ocf_log info "demote: Setting master to '$master_host' via local tunnel '${OCF_RESKEY_tunnel_host}' on port '$tunnel_port'" + master_host="${OCF_RESKEY_tunnel_host}" + master_port="$tunnel_port" + fi + else + ocf_log info "demote: Setting master to '$master_host'" + fi + + redis_client slaveof "$master_host" "$master_port" + + # Wait forever for the slave to connect to the master and finish the + # sync. Timeout is controlled by Pacemaker "op start timeout=XX". + # + # hint: redis master_link_status will only come "up" when + # the SYNC with the master has completed. + # This can take an arbitrary time (data) and should + # only be parametrized by the start operation timeout + # by the administrator, not by this resource agent code + while true; do + # Wait infinite if replication is syncing + # Then start/demote operation timeout determines timeout + if [ "$client_kill" -eq 2 ]; then + redis_client CLIENT PAUSE 2000 + fi + redis_monitor + status=$? + if (( status == OCF_SUCCESS )); then + if [ "$client_kill" -eq 2 ]; then + redis_client CLIENT KILL type normal + fi + return $OCF_SUCCESS + fi + + sleep 1 + done + + ocf_log err "demote: Unexpected error setting slave mode (status=$status)" + return $OCF_ERR_GENERIC +} + +redis_notify() { + mode="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" + case "$mode" in + post-demote|post-promote) # change the master + redis_monitor + status=$? + if (( status == OCF_SUCCESS )); then # were a slave + # calling demote updates the slave's connection + # to the newly appointed Master instance. 
+ redis_demote + fi + ;; + esac + return $OCF_SUCCESS +} + +redis_validate() { + if [[ ! -x "$REDIS_SERVER" ]]; then + ocf_log err "validate: $REDIS_SERVER does not exist or is not executable" + return $OCF_ERR_INSTALLED + fi + if [[ ! -x "$REDIS_CLIENT" ]]; then + ocf_log err "validate: $REDIS_CLIENT does not exist or is not executable" + return $OCF_ERR_INSTALLED + fi + if [[ ! -f "$REDIS_CONFIG" ]]; then + ocf_log err "validate: $REDIS_CONFIG does not exist" + return $OCF_ERR_CONFIGURED + fi + if ! getent passwd "$REDIS_USER" &>/dev/null; then + ocf_log err "validate: $REDIS_USER is not a valid user" + return $OCF_ERR_CONFIGURED + fi +} + +if [ "$__OCF_ACTION" != "meta-data" ]; then + NODENAME=$(ocf_attribute_target) +fi +if [ -r "$REDIS_CONFIG" ]; then + clientpasswd="$(sed -n -e 's/^\s*requirepass\s*\(.*\)\s*$/\1/p' < $REDIS_CONFIG | tail -n 1)" +fi + +if [ "$__OCF_ACTION" = "start" ]; then + redis_validate || exit $? +fi + +redis_cli_features + +ocf_log debug "action=${1:-$__OCF_ACTION} notify_type=${OCF_RESKEY_CRM_meta_notify_type} notify_operation=${OCF_RESKEY_CRM_meta_notify_operation} master_host=${OCF_RESKEY_CRM_meta_notify_master_uname} slave_host=${OCF_RESKEY_CRM_meta_notify_slave_uname} promote_host=${OCF_RESKEY_CRM_meta_notify_promote_uname} demote_host=${OCF_RESKEY_CRM_meta_notify_demote_uname}; params: bin=${OCF_RESKEY_bin} client_bin=${OCF_RESKEY_client_bin} config=${OCF_RESKEY_config} user=${OCF_RESKEY_user} rundir=${OCF_RESKEY_rundir} port=${OCF_RESKEY_port}" + +case "${1:-$__OCF_ACTION}" in + status|monitor) + redis_monitor + ;; + start) + redis_start + ;; + stop) + redis_stop + ;; + restart) + redis_stop && redis_start + ;; + promote) + redis_promote + ;; + demote) + redis_demote + ;; + notify) + redis_notify + ;; + meta-data) + redis_meta_data + ;; + validate-all) + redis_validate + ;; + *) + echo "Usage: $0 {monitor|start|stop|restart|promote|demote|notify|validate-all|meta-data}" + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +status=$? 
+ocf_log debug "exit_status=$status" +exit $status diff --git a/heartbeat/rkt b/heartbeat/rkt new file mode 100755 index 0000000..724986f --- /dev/null +++ b/heartbeat/rkt @@ -0,0 +1,475 @@ +#!/bin/sh +# +# The rkt HA resource agent creates and launches a container based off +# a supplied image. Containers managed by this agent are both created +# and removed upon the agent's start and stop actions. +# +# Copyright (c) 2017 Valentin Vidic <Valentin.Vidic@CARNet.hr> +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + +meta_data() +{ + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="rkt" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +The rkt HA resource agent creates and launches a container +based off a supplied image. Containers managed by this agent +are both created and removed upon the agent's start and stop actions. +</longdesc> +<shortdesc lang="en">rkt container resource agent.</shortdesc> + +<parameters> +<parameter name="image" required="1" unique="0"> +<longdesc lang="en"> +The image to base this container off of. +</longdesc> +<shortdesc lang="en">image</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="name" required="0" unique="0"> +<longdesc lang="en"> +The name to give the created container. By default this will +be that resource's instance name. +</longdesc> +<shortdesc lang="en">container name</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="allow_pull" unique="0"> +<longdesc lang="en"> +Allow the image to be pulled from the configured registry when +the image does not exist locally. NOTE, this can drastically increase +the time required to start the container if the image repository is +pulled over the network. +</longdesc> +<shortdesc lang="en">Allow pulling non-local images</shortdesc> +<content type="boolean"/> +</parameter> + +<parameter name="run_opts" required="0" unique="0"> +<longdesc lang="en"> +Add options to be appended to the 'rkt run' command which is used +when creating the container during the start action. This option allows +users to do things such as setting a custom entry point and injecting +environment variables into the newly created container. + +NOTE: Do not explicitly specify the --name argument in the run_opts. 
This +agent will set --name using either the resource's instance or the name +provided in the 'name' argument of this agent. + +</longdesc> +<shortdesc lang="en">run options</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="run_cmd" required="0" unique="0"> +<longdesc lang="en"> +Specify a command to launch within the container once +it has initialized. +</longdesc> +<shortdesc lang="en">run command</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="mount_points" required="0" unique="0"> +<longdesc lang="en"> +A comma separated list of directories that the container is expecting to use. +The agent will ensure they exist by running 'mkdir -p' +</longdesc> +<shortdesc lang="en">Required mount points</shortdesc> +<content type="string"/> +</parameter> + +<parameter name="monitor_cmd" required="0" unique="0"> +<longdesc lang="en"> +Specify the full path of a command to launch within the container to check +the health of the container. This command must return 0 to indicate that +the container is healthy. A non-zero return code will indicate that the +container has failed and should be recovered. + +Note: Using this method for monitoring processes inside a container +is not recommended, as rkt tries to track processes running +inside the container and does not deal well with many short-lived +processes being spawned. Ensure that your container monitors its +own processes and terminates on fatal error rather than invoking +a command from the outside. 
#
# container_exists: is there a row in 'rkt list' whose second column
# equals ${CONTAINER}?
#
# Returns 0 when a matching row is found, 1 otherwise.
#
container_exists()
{
	# ENDFILE is a gawk-only special pattern.  Other awk implementations
	# parse it as an (empty) variable, so the "not found" exit never ran
	# and this function always reported success.  Record the match in a
	# flag and derive the exit status in END, which runs after `exit`.
	rkt list --no-legend | awk -v C="${CONTAINER}" '$2 == C {found = 1; exit} END {exit !found}'
}
#
# rkt_create_mounts: create every directory named in the comma separated
# OCF_RESKEY_mount_points list with 'mkdir -p'.
#
# Always returns 0; mkdir failures are not propagated, as before.
#
rkt_create_mounts() {
	# A function-local IFS goes back to its previous state (including
	# "unset") on return, and nothing leaks into the global scope.
	local IFS=","
	local directory

	for directory in $OCF_RESKEY_mount_points; do
		mkdir -p "$directory"
	done

	return 0
}
container_exists || [ "$(container_state)" = "preparing" ] ; do + ocf_log debug "waiting for container to start" + sleep 1 + done + + # wait for monitor to pass before declaring that the container is started + while true; do + rkt_simple_status + if [ $? -ne $OCF_SUCCESS ]; then + ocf_exit_reason "Newly created container exited after start" + return $OCF_ERR_GENERIC + fi + + monitor_cmd_exec + if [ $? -eq $OCF_SUCCESS ]; then + ocf_log notice "Container ${CONTAINER} started successfully as $(container_uuid)" + return $OCF_SUCCESS + fi + + ocf_exit_reason "waiting on monitor_cmd to pass after start" + sleep 1 + done +} + +rkt_stop() +{ + local timeout=60 + local uuid + + rkt_simple_status + if [ $? -eq $OCF_NOT_RUNNING ]; then + remove_container + return $OCF_SUCCESS + fi + + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + timeout=$(( $OCF_RESKEY_CRM_meta_timeout/1000 - 10 )) + if [ $timeout -lt 10 ]; then + timeout=10 + fi + fi + + uuid=$(container_uuid) + if ocf_is_true "$OCF_RESKEY_force_kill"; then + ocf_log info "Killing container, ${CONTAINER}." + ocf_run rkt stop --force $uuid + else + ocf_log info "Stopping container, ${CONTAINER}." + ocf_run rkt stop $uuid + fi + + if [ $? -ne 0 ]; then + ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." + return $OCF_ERR_GENERIC + fi + + while [ $timeout -gt 0 ]; do + rkt_simple_status + if [ $? -eq $OCF_NOT_RUNNING ]; then + break + fi + + ocf_log debug "waiting for container to stop" + timeout=$(( $timeout - 1 )) + sleep 1 + done + + rkt_simple_status + if [ $? -eq $OCF_SUCCESS ]; then + ocf_exit_reason "Failed to stop container, ${CONTAINER}." + return $OCF_ERR_GENERIC + fi + + remove_container + if [ $? -ne 0 ]; then + ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." 
#
# image_exists: check whether OCF_RESKEY_image can be used for a start.
#
# Returns 0 when the image is listed by 'rkt image list' (second column),
# or when it is missing but allow_pull is true; in the latter case
# REQUIRE_IMAGE_PULL is set to 1 so the start action fetches it.
# Returns 1 when the image is missing and pulling is not allowed.
#
image_exists()
{
	# ENDFILE only exists in gawk; with any other awk the lookup always
	# exited 0 and a missing image was reported as present.  Use a flag
	# and a portable END rule instead.
	rkt image list --no-legend | awk -v I="${OCF_RESKEY_image}" '$2 == I {found = 1; exit} END {exit !found}'
	if [ $? -eq 0 ]; then
		# image found
		return 0
	fi

	if ocf_is_true "$OCF_RESKEY_allow_pull"; then
		REQUIRE_IMAGE_PULL=1
		ocf_log notice "Image (${OCF_RESKEY_image}) does not exist locally but will be pulled during start"
		return 0
	fi

	# image not found.
	return 1
}
+ exit $OCF_ERR_CONFIGURED + fi + fi + : ${OCF_RESKEY_name=`echo ${OCF_RESOURCE_INSTANCE} | tr ':' '-'`} +else + : ${OCF_RESKEY_name=${OCF_RESOURCE_INSTANCE}} +fi + +CONTAINER=$OCF_RESKEY_name + +case $__OCF_ACTION in +meta-data) meta_data + exit $OCF_SUCCESS;; +start) + rkt_validate + rkt_start;; +stop) rkt_stop;; +monitor) rkt_monitor;; +validate-all) rkt_validate;; +usage|help) rkt_usage + exit $OCF_SUCCESS + ;; +*) rkt_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc diff --git a/heartbeat/rsyncd b/heartbeat/rsyncd new file mode 100755 index 0000000..d15aaf6 --- /dev/null +++ b/heartbeat/rsyncd @@ -0,0 +1,280 @@ +#!/bin/sh +# +# Resource script for rsync daemon +# +# Description: Manages rsync daemon as an OCF resource in +# an High Availability setup. +# +# Author: Dhairesh Oza <odhairesh@novell.com> +# License: GNU General Public License (GPL) +# +# +# usage: $0 {start|stop|status|monitor|validate-all|meta-data} +# +# The "start" arg starts rsyncd. +# +# The "stop" arg stops it. +# +# OCF parameters: +# OCF_RESKEY_binpath +# OCF_RESKEY_conffile +# OCF_RESKEY_bwlimit +# +# Note:This RA requires that the rsyncd config files has a "pid file" +# entry so that it is able to act on the correct process +########################################################################## +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_binpath_default="rsync" +OCF_RESKEY_conffile_default="/etc/rsyncd.conf" +OCF_RESKEY_bwlimit_default="" + +: ${OCF_RESKEY_binpath=${OCF_RESKEY_binpath_default}} +: ${OCF_RESKEY_conffile=${OCF_RESKEY_conffile_default}} +: ${OCF_RESKEY_bwlimit=${OCF_RESKEY_bwlimit_default}} + +USAGE="Usage: $0 {start|stop|status|monitor|validate-all|meta-data}"; + +########################################################################## + +usage() +{ + echo $USAGE >&2 +} + +meta_data() +{ +cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="rsyncd" version="1.0"> +<version>1.0</version> +<longdesc lang="en"> +This script manages rsync daemon +</longdesc> +<shortdesc lang="en">Manages an rsync daemon</shortdesc> + +<parameters> + +<parameter name="binpath"> +<longdesc lang="en"> +The rsync binary path. +For example, "/usr/bin/rsync" +</longdesc> +<shortdesc lang="en">Full path to the rsync binary</shortdesc> +<content type="string" default="${OCF_RESKEY_binpath_default}"/> +</parameter> + +<parameter name="conffile"> +<longdesc lang="en"> +The rsync daemon configuration file name with full path. +For example, "/etc/rsyncd.conf" +</longdesc> +<shortdesc lang="en">Configuration file name with full path</shortdesc> +<content type="string" default="${OCF_RESKEY_conffile_default}" /> +</parameter> + +<parameter name="bwlimit"> +<longdesc lang="en"> +This option allows you to specify a maximum transfer +rate in kilobytes per second. This option is +most effective when using rsync with large files +(several megabytes and up). Due to the nature of +rsync transfers, blocks of data are sent, then if +rsync determines the transfer was too fast, it will +wait before sending the next data block. The result +is an average transfer rate equaling the specified +limit. A value of zero specifies no limit. 
#
# rsyncd_start: start the rsync daemon unless it is already running.
#
# Builds the command line from OCF_RESKEY_binpath, OCF_RESKEY_conffile and
# OCF_RESKEY_bwlimit and relies on CONF_FILE having been set by
# get_pid_and_conf_file.  Exits with OCF_SUCCESS when the daemon is running
# or was started, exits with OCF_ERR_GENERIC on an unexpected status or a
# failed launch, and returns OCF_ERR_GENERIC when the config file carries
# no "pid file" entry.
#
rsyncd_start()
{
	# if rsyncd is running return success
	rsyncd_status
	retVal=$?
	if [ $retVal -eq $OCF_SUCCESS ]; then
		exit $OCF_SUCCESS
	elif [ $retVal -ne $OCF_NOT_RUNNING ]; then
		ocf_exit_reason "Error. Unknown status."
		exit $OCF_ERR_GENERIC
	fi

	if [ -n "$OCF_RESKEY_binpath" ]; then
		COMMAND="$OCF_RESKEY_binpath --daemon"
	else
		COMMAND="rsync --daemon"
	fi
	if [ -n "$OCF_RESKEY_conffile" ]; then
		COMMAND="$COMMAND --config $OCF_RESKEY_conffile"
	fi
	if [ -n "$OCF_RESKEY_bwlimit" ]; then
		COMMAND="$COMMAND --bwlimit $OCF_RESKEY_bwlimit"
	fi

	if grep -v "^#" "$CONF_FILE" | grep "pid file" > /dev/null ; then
		$COMMAND
		# Save the daemon's status right away: by the time the message
		# below is built, $? would hold the result of the [ ] test and
		# the log would always claim "error 0".
		retVal=$?
		if [ $retVal -ne 0 ]; then
			ocf_exit_reason "Error. rsync daemon returned error $retVal."
			exit $OCF_ERR_GENERIC
		fi
	else
		ocf_exit_reason "Error. \"pid file\" entry required in the rsyncd config file by rsyncd OCF RA."
		return $OCF_ERR_GENERIC
	fi

	ocf_log info "Started rsync daemon."
	exit $OCF_SUCCESS
}
+ return $OCF_ERR_GENERIC + fi + +#Not checking "$OCF_RESKEY_bwlimit" + + return $OCF_SUCCESS +} + + +# +# Main +# + +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi + +case $1 in + start) get_pid_and_conf_file + rsyncd_start + ;; + + stop) get_pid_and_conf_file + rsyncd_stop + ;; + + status) get_pid_and_conf_file + rsyncd_status + ;; + + monitor)get_pid_and_conf_file + rsyncd_monitor + ;; + + validate-all) get_pid_and_conf_file + rsyncd_validate_all + ;; + + meta-data) meta_data + ;; + + usage) usage + exit $OCF_SUCCESS + ;; + + *) usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + diff --git a/heartbeat/rsyslog.in b/heartbeat/rsyslog.in new file mode 100644 index 0000000..80d5c17 --- /dev/null +++ b/heartbeat/rsyslog.in @@ -0,0 +1,264 @@ +#!@BASH_SHELL@ +# +# Description: Manages a rsyslog instance, provided by NTT OSSC as an +# OCF High-Availability resource under Heartbeat/LinuxHA control +# +# Copyright (c) 2011 NIPPON TELEGRAPH AND TELEPHONE CORPORATION +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +############################################################################## +# OCF parameters: +# OCF_RESKEY_rsyslog_binary : Path to rsyslog binary. 
+# Default is "/sbin/rsyslogd" +# OCF_RESKEY_configfile : Configuration file +# OCF_RESKEY_start_opts : Startup options +# +# Only OCF_RESKEY_configfile must be specified. Each of the rests +# has its default value or refers OCF_RESKEY_configfile to make +# its value when no explicit value is given. +# +# Further infomation for setup: +# There are sample configurations at the end of this file. +# +############################################################################### + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_configfile_default="" +OCF_RESKEY_rsyslog_binary_default="/sbin/rsyslogd" +OCF_RESKEY_start_opts_default="" + +: ${OCF_RESKEY_configfile=${OCF_RESKEY_configfile_default}} +: ${OCF_RESKEY_rsyslog_binary=${OCF_RESKEY_rsyslog_binary_default}} +: ${OCF_RESKEY_start_opts=${OCF_RESKEY_start_opts_default}} + +usage() +{ + cat <<-! +usage: $0 action + +action: + start : start a new rsyslog instance + + stop : stop the running rsyslog instance + + status : return the status of rsyslog, run or down + + monitor : return TRUE if the rsyslog appears to be working. + + meta-data : show meta data message + + validate-all: validate the instance parameters +! + return $OCF_ERR_UNIMPLEMENTED +} + +metadata_rsyslog() +{ + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="rsyslog" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This script manages a rsyslog instance as an HA resource. +</longdesc> +<shortdesc lang="en">rsyslog resource agent</shortdesc> + +<parameters> + +<parameter name="configfile" unique="1" required="1"> +<longdesc lang="en"> +This parameter specifies a configuration file +for a rsyslog instance managed by this RA. 
#
# monitor_rsyslog: report whether an rsyslog process matching
# PROCESS_PATTERN exists.
#
# Returns OCF_NOT_RUNNING when pgrep finds nothing and OCF_SUCCESS when it
# finds one or more processes; more than one is logged as a warning only.
#
monitor_rsyslog()
{
	local -a pids

	# One array element per PID printed by pgrep.
	pids=( $(pgrep -f "$PROCESS_PATTERN" 2>/dev/null) )

	if (( ${#pids[@]} == 0 )); then
		ocf_log debug "No rsyslog process for $CONFIGFILE"
		return $OCF_NOT_RUNNING
	fi

	if (( ${#pids[@]} > 1 )); then
		ocf_log warn "Multiple rsyslog process for $CONFIGFILE"
	fi

	return $OCF_SUCCESS
}
#
# stop_rsyslog: stop every process matching PROCESS_PATTERN.
#
# Sends SIGTERM and polls once per second for a bounded grace period, then
# keeps sending SIGKILL until no matching process is left.
# Always returns OCF_SUCCESS.
#
stop_rsyslog()
{
	local lapse_sec=0
	# Grace period for SIGTERM, in seconds.  The default is the 60s stop
	# timeout advertised in the agent metadata minus a 10s reserve for
	# the SIGKILL phase.
	local term_wait_sec=50

	# OCF_RESKEY_CRM_meta_timeout is in milliseconds.  The old code
	# compared elapsed seconds against it directly, so the SIGKILL phase
	# could not be reached within the operation timeout, and an unset
	# value made the comparison fail and the wait unbounded.
	if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
		term_wait_sec=$(( OCF_RESKEY_CRM_meta_timeout / 1000 - 10 ))
		if [ $term_wait_sec -lt 10 ]; then
			term_wait_sec=10
		fi
	fi

	pkill -TERM -f "$PROCESS_PATTERN"

	while pgrep -f "$PROCESS_PATTERN" > /dev/null; do
		sleep 1
		lapse_sec=$(( lapse_sec + 1 ))
		ocf_log debug "stop_rsyslog[${OCF_RESOURCE_INSTANCE}]: stop NORM $lapse_sec/$term_wait_sec"
		if [ $lapse_sec -ge $term_wait_sec ]; then
			break
		fi
	done

	# Anything still alive is killed; this loop has no limit of its own.
	lapse_sec=0
	while pgrep -f "$PROCESS_PATTERN" > /dev/null; do
		pkill -KILL -f "$PROCESS_PATTERN"
		sleep 1
		lapse_sec=$(( lapse_sec + 1 ))
		ocf_log debug "stop_rsyslog[${OCF_RESOURCE_INSTANCE}]: suspend rsyslog by SIGKILL ($lapse_sec/@@@)"
	done

	return $OCF_SUCCESS
}
#
# listener_start: Start the given listener
#
# Pipes "lsnrctl start $NETSERVICENAME" into a login shell of the
# ora<sid> user (SID lower-cased) and logs the captured output.
# Returns OCF_SUCCESS when that pipeline succeeds, OCF_ERR_GENERIC otherwise.
#
listener_start() {
	local ora_user
	local result

	ora_user="ora$(echo $SID | tr '[:upper:]' '[:lower:]')"

	if result=$(echo "lsnrctl start $NETSERVICENAME" | su - $ora_user 2>&1); then
		ocf_log info "Oracle Listener $NETSERVICENAME started: $result"
		return $OCF_SUCCESS
	fi

	ocf_log err "Oracle Listener $NETSERVICENAME start failed: $result"
	return $OCF_ERR_GENERIC
}
#
# x_server_start: Start the given x_server
#
# Pipes "x_server start" into a login shell of $sidadm and logs the
# captured output.
# Returns OCF_SUCCESS when that pipeline succeeds, OCF_ERR_GENERIC otherwise.
#
x_server_start() {
	# The result variable used below is lrc.  It was declared as "rc",
	# which left lrc as a global that every call overwrote.
	local lrc=$OCF_SUCCESS
	local output
	output=`echo "x_server start" | su - $sidadm 2>&1`
	if [ $? -eq 0 ]
	then
		ocf_log info "MaxDB x_server start: $output"
		lrc=$OCF_SUCCESS
	else
		ocf_log err "MaxDB x_server start failed: $output"
		lrc=$OCF_ERR_GENERIC
	fi
	return $lrc
}
+ local cnt=`ps efo args --user $sdbuser | grep -c vserver` + if [ $cnt -ge 1 ] + then + lrc=$OCF_SUCCESS + else + ocf_log info "x_server process not running" + lrc=$OCF_ERR_GENERIC + fi + return $lrc +} + +# +# oracle_stop: Stop the Oracle database without any condition +# +oracle_stop() { +echo '#!/bin/sh +LOG=$HOME/stopdb.log +date > $LOG + +if [ -x "${ORACLE_HOME}/bin/sqlplus" ] +then + SRVMGRDBA_EXE="${ORACLE_HOME}/bin/sqlplus" +else + echo "Can not find executable sqlplus" >> $LOG + exit 1 +fi + +$SRVMGRDBA_EXE /NOLOG >> $LOG << ! +connect / as sysdba +shutdown immediate +exit +! +rc=$? +cat $LOG +exit $rc' > $TEMPFILE + +chmod 700 $TEMPFILE +chown $sidadm $TEMPFILE + +su - $sidadm -c $TEMPFILE +retcode=$? +rm -f $TEMPFILE + +if [ $retcode -eq 0 ]; then + sapdatabase_status + if [ $? -ne $OCF_NOT_RUNNING ]; then + retcode=1 + fi +fi + +return $retcode +} + +# +# maxdb_stop: Stop the MaxDB database without any condition +# +maxdb_stop() { + +# x_Server must be running to stop database +x_server_status +if [ $? -ne $OCF_SUCCESS ]; then x_server_start; fi + +if [ $DBJ2EE_ONLY -eq 1 ]; then + userkey=c_J2EE +else + userkey=c +fi + +echo "#!/bin/sh +LOG=\$HOME/stopdb.log +date > \$LOG +echo \"Stop database with xuserkey >$userkey<\" >> \$LOG +dbmcli -U ${userkey} db_offline >> \$LOG 2>&1 +exit \$?" > $TEMPFILE + +chmod 700 $TEMPFILE +chown $sidadm $TEMPFILE + +su - $sidadm -c $TEMPFILE +retcode=$? +rm -f $TEMPFILE + +if [ $retcode -eq 0 ]; then + sapdatabase_status + if [ $? -ne $OCF_NOT_RUNNING ]; then + retcode=1 + fi +fi + +return $retcode +} + +# +# db6udb_stop: Stop the DB2/UDB database without any condition +# +db6udb_stop() { +echo '#!/bin/sh +LOG=$HOME/stopdb.log +date > $LOG +echo "Shut down the database" >> $LOG +$INSTHOME/sqllib/bin/db2 deactivate database $DB2DBDFT |tee -a $LOG 2>&1 +$INSTHOME/sqllib/adm/db2stop force |tee -a $LOG 2>&1 +exit $?' > $TEMPFILE + +chmod 700 $TEMPFILE +chown $sidadm $TEMPFILE + +su - $sidadm -c $TEMPFILE +retcode=$? 
+rm -f $TEMPFILE + +if [ $retcode -eq 0 ]; then + sapdatabase_status + if [ $? -ne $OCF_NOT_RUNNING ]; then + retcode=1 + fi +fi + +return $retcode +} + +# +# oracle_recover: try to clean up oracle after a crash +# +oracle_recover() { +echo '#!/bin/sh +LOG=$HOME/recover.log +date > $LOG +echo "Logfile written by heartbeat SAPDatabase resource agent" >> $LOG + +if [ -x "${ORACLE_HOME}/bin/sqlplus" ] +then + SRVMGRDBA_EXE="${ORACLE_HOME}/bin/sqlplus" +else + echo "Can not find executable sqlplus" >> $LOG + exit 1 +fi + +$SRVMGRDBA_EXE /NOLOG >> $LOG << ! +connect / as sysdba +shutdown abort +startup mount +alter database end backup; +alter database open; +exit +! +rc=$? +cat $LOG +exit $rc' > $TEMPFILE + + chmod 700 $TEMPFILE + chown $sidadm $TEMPFILE + + su - $sidadm -c $TEMPFILE + retcode=$? + rm -f $TEMPFILE + + return $retcode +} + +# +# maxdb_recover: try to clean up MaxDB after a crash +# +maxdb_recover() { + # x_Server must be running to stop database + x_server_status + if [ $? -ne $OCF_SUCCESS ]; then x_server_start; fi + + if [ $DBJ2EE_ONLY -eq 1 ]; then + userkey=c_J2EE + else + userkey=c + fi + +echo "#!/bin/sh +LOG=\$HOME/recover.log +date > \$LOG +echo \"Logfile written by heartbeat SAPDatabase resource agent\" >> \$LOG +echo \"Cleanup database with xuserkey >$userkey<\" >> \$LOG +echo \"db_stop\" >> \$LOG 2>&1 +dbmcli -U ${userkey} db_stop >> \$LOG 2>&1 +echo \"db_clear\" >> \$LOG 2>&1 +dbmcli -U ${userkey} db_clear >> \$LOG 2>&1 +echo \"db_online\" >> \$LOG 2>&1 +dbmcli -U ${userkey} db_online >> \$LOG 2>&1 +rc=\$? +cat \$LOG +exit \$rc" > $TEMPFILE + + chmod 700 $TEMPFILE + chown $sidadm $TEMPFILE + + su - $sidadm -c $TEMPFILE + retcode=$? 
+ rm -f $TEMPFILE + + return $retcode +} + +# +# db6udb_recover: try to recover DB/2 after a crash +# +db6udb_recover() { + db2sid="db2`echo $SID | tr '[:upper:]' '[:lower:]'`" + +echo '#!/bin/sh +LOG=$HOME/recover.log +date > $LOG +echo "Logfile written by heartbeat SAPDatabase resource agent" >> $LOG +$INSTHOME/sqllib/bin/db2_kill >> $LOG 2>&1 +$INSTHOME/sqllib/adm/db2start >> $LOG 2>&1 +$INSTHOME/sqllib/bin/db2 activate database $DB2DBDFT >> $LOG 2>&1 +rc=$? +cat $LOG +exit $rc' > $TEMPFILE + + chmod 700 $TEMPFILE + chown $db2sid $TEMPFILE + + su - $db2sid -c $TEMPFILE + retcode=$? + rm -f $TEMPFILE + + return $retcode +} + + +# +# sapdatabase_start : Start the SAP database +# +sapdatabase_start() { + sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT" + + case $DBTYPE in + ADA) x_server_start + ;; + ORA) listener_start + ;; + esac + + output=`su - $sidadm -c $SAPSTARTDB` + rc=$? + + if [ $DBJ2EE_ONLY -eq 1 ] + then + sapdatabase_monitor 1 + rc=$? + fi + + if [ $rc -ne 0 -a $OCF_RESKEY_AUTOMATIC_RECOVER -eq 1 ] + then + ocf_log warn "SAP database $SID start failed: $output" + ocf_log warn "Try to recover database $SID" + + output='' + sapdatabase_recover + rc=$? + fi + + if [ $rc -eq 0 ] + then + ocf_log info "SAP database $SID started: $output" + rc=$OCF_SUCCESS + sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" + else + ocf_log err "SAP database $SID start failed: $output" + rc=$OCF_ERR_GENERIC + fi + + return $rc +} + +# +# sapdatabase_stop: Stop the SAP database +# +sapdatabase_stop() { + + sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" + + # use of the stopdb kernel script is not possible, because there are to may checks in that + # script. We want to stop the database regardless of anything. + #output=`su - $sidadm -c $SAPSTOPDB` + + case $DBTYPE in + ORA) output=`oracle_stop` + ;; + ADA) output=`maxdb_stop` + ;; + DB6) output=`db6udb_stop` + ;; + esac + + if [ $? 
-eq 0 ] + then + ocf_log info "SAP database $SID stopped: $output" + rc=$OCF_SUCCESS + else + ocf_log err "SAP database $SID stop failed: $output" + rc=$OCF_ERR_GENERIC + fi + + case $DBTYPE in + ORA) listener_stop + ;; + ADA) x_server_stop + ;; + esac + + sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" + + return $rc +} + + +# +# sapdatabase_monitor: Can the given database instance do anything useful? +# +sapdatabase_monitor() { + strict=$1 + + sapdatabase_status + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + return $rc + fi + + case $DBTYPE in + ADA) x_server_status + if [ $? -ne $OCF_SUCCESS ]; then x_server_start; fi + ;; + ORA) listener_status + if [ $? -ne $OCF_SUCCESS ]; then listener_start; fi + ;; + esac + + if [ $strict -eq 0 ] + then + return $rc + else + if [ $DBJ2EE_ONLY -eq 0 ] + then + output=`echo "$SAPDBCONNECT -d -w /dev/null" | su $sidadm 2>&1` + if [ $? -le 4 ] + then + rc=$OCF_SUCCESS + else + rc=$OCF_NOT_RUNNING + fi + else + MYCP="" + EXECMD="" + + # WebAS Java 6.40+7.00 + IAIK_JCE="$SECSTORE"/iaik_jce.jar + IAIK_JCE_EXPORT="$SECSTORE"/iaik_jce_export.jar + EXCEPTION="$BOOTSTRAP"/exception.jar + LOGGING="$BOOTSTRAP"/logging.jar + OPENSQLSTA="$BOOTSTRAP"/opensqlsta.jar + TC_SEC_SECSTOREFS="$BOOTSTRAP"/tc_sec_secstorefs.jar + JDDI="$BOOTSTRAP"/../server0/bin/ext/jdbdictionary/jddi.jar + ANTLR="$BOOTSTRAP"/../server0/bin/ext/antlr/antlr.jar + FRAME="$BOOTSTRAP"/../server0/bin/system/frame.jar + + # only start jdbcconnect when all jars available + if [ -f "$EXCEPTION" -a -f "$LOGGING" -a -f "$OPENSQLSTA" -a -f "$TC_SEC_SECSTOREFS" -a -f "$JDDI" -a -f "$ANTLR" -a -f "$FRAME" -a -f "$SAPDBCONNECT" ] + then + MYCP=".:$FRAME:$ANTLR:$JDDI:$IAIK_JCE_EXPORT:$IAIK_JCE:$EXCEPTION:$LOGGING:$OPENSQLSTA:$TC_SEC_SECSTOREFS:$DB_JARS:$SAPDBCONNECT" + EXECMD="com.sap.inst.jdbc.connect.JdbcCon -sec $SID:$SID" + else + # WebAS Java 7.10 + LAUNCHER=${BOOTSTRAP}/sap.com~tc~bl~offline_launcher~impl.jar + + if [ -f "$DB_JARS" -a -f "$SAPDBCONNECT" -a -f 
"$LAUNCHER" ] + then + MYCP="$LAUNCHER" + EXECMD="com.sap.engine.offline.OfflineToolStart com.sap.inst.jdbc.connect.JdbcCon ${SAPDBCONNECT}:${SECSTORE}:${DB_JARS}:${BOOTSTRAP} -sec $SID:$SID" + fi + fi + + if [ -n "$EXECMD" ] + then + output=`${JAVA_HOME}/bin/java -cp $MYCP $EXECMD 2> /dev/null` + if [ $? -le 0 ] + then + rc=$OCF_SUCCESS + else + rc=$OCF_NOT_RUNNING + fi + else + output="Cannot find all jar files needed for database monitoring." + rc=$OCF_ERR_GENERIC + fi + fi + fi + + if [ $rc -ne $OCF_SUCCESS ] + then + ocf_log err "The SAP database $SID is not running: $output" + fi + return $rc +} + + +# +# sapdatabase_status: Are there any database processes on this host ? +# +sapdatabase_status() { + case $DBTYPE in + ADA) SEARCH="$SID/db/pgm/kernel" + SUSER=`grep "^SdbOwner" /etc/opt/sdb | awk -F'=' '{print $2}'` + SNUM=2 + ;; + ORA) SEARCH="ora_[a-z][a-z][a-z][a-z]_" + SUSER="ora`echo $SID | tr '[:upper:]' '[:lower:]'`" + SNUM=4 + ;; + DB6) SEARCH="db2[a-z][a-z][a-z]" + SUSER="db2`echo $SID | tr '[:upper:]' '[:lower:]'`" + SNUM=2 + ;; + esac + + # Note: ps cuts off it's output at column $COLUMNS, so "ps -ef" can not be used here + # as the output might be to long. + cnt=`ps efo args --user $SUSER 2> /dev/null | grep -c "$SEARCH"` + if [ $cnt -ge $SNUM ] + then + rc=$OCF_SUCCESS + else + # ocf_log info "Database Instance $SID is not running on `hostname`" + rc=$OCF_NOT_RUNNING + fi + return $rc +} + + +# +# sapdatabase_recover: +# +sapdatabase_recover() { + + case $DBTYPE in + ORA) recoutput=`oracle_recover` + ;; + ADA) recoutput=`maxdb_recover` + ;; + DB6) recoutput=`db6udb_recover` + ;; + esac + + sapdatabase_monitor 1 + retcode=$? 
+ + if [ $retcode -eq $OCF_SUCCESS ] + then + ocf_log info "Recover of SAP database $SID was successful: $recoutput" + else + ocf_log err "Recover of SAP database $SID failed: $recoutput" + fi + + return $retcode +} + + +# +# sapdatabase_validate: Check the symantic of the input parameters +# +sapdatabase_validate() { + rc=$OCF_SUCCESS + if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] + then + ocf_log err "Parsing parameter SID: '$SID' is not a valid system ID!" + rc=$OCF_ERR_ARGS + fi + + case "$DBTYPE" in + ORA|ADA|DB6) ;; + *) ocf_log err "Parsing parameter DBTYPE: '$DBTYPE' is not a supported database type!" + rc=$OCF_ERR_ARGS ;; + esac + + return $rc +} + + +# +# sapdatabase_init: initialize global variables at the beginning +# +sapdatabase_init() { + +ocf_log warn "Usage of SAPDatabase resource agent without SAPHostAgent is deprecated. Please read documentation of SAPDatabase resource agent and follow SAP note 1031096 for the installation of SAPHostAgent." + +# optional OCF parameters, we try to guess which directories are correct +EXESTARTDB="startdb" +EXESTOPDB="stopdb" +EXEDBCONNECT="R3trans" +if [ -z "$OCF_RESKEY_DBJ2EE_ONLY" ]; then + DBJ2EE_ONLY=0 +else + case "$OCF_RESKEY_DBJ2EE_ONLY" in + 1|true|TRUE|yes|YES) DBJ2EE_ONLY=1 + EXESTARTDB="startj2eedb" + EXESTOPDB="stopj2eedb" + EXEDBCONNECT="jdbcconnect.jar" + ;; + 0|false|FALSE|no|NO) DBJ2EE_ONLY=0;; + *) ocf_log err "Parsing parameter DBJ2EE_ONLY: '$DBJ2EE_ONLY' is not a boolean value!" 
+ exit $OCF_ERR_ARGS ;; + esac +fi + +if [ -z "$OCF_RESKEY_NETSERVICENAME" ]; then + case "$DBTYPE" in + ORA|ora) NETSERVICENAME="LISTENER";; + *) NETSERVICENAME="";; + esac +else + NETSERVICENAME="$OCF_RESKEY_NETSERVICENAME" +fi + +if [ -z "$OCF_RESKEY_STRICT_MONITORING" ]; then + OCF_RESKEY_STRICT_MONITORING=0 +else + case "$OCF_RESKEY_STRICT_MONITORING" in + 1|true|TRUE|yes|YES) OCF_RESKEY_STRICT_MONITORING=1;; + 0|false|FALSE|no|NO) OCF_RESKEY_STRICT_MONITORING=0;; + *) ocf_log err "Parsing parameter STRICT_MONITORING: '$OCF_RESKEY_STRICT_MONITORING' is not a boolean value!" + exit $OCF_ERR_ARGS ;; + esac +fi + +PATHLIST=" +$OCF_RESKEY_DIR_EXECUTABLE +/usr/sap/$SID/*/exe +/usr/sap/$SID/SYS/exe/run +/sapmnt/$SID/exe +" +DIR_EXECUTABLE="" +for EXEPATH in $PATHLIST +do + if [ -x $EXEPATH/$EXESTARTDB -a -x $EXEPATH/$EXESTOPDB -a -x $EXEPATH/$EXEDBCONNECT ] + then + DIR_EXECUTABLE=$EXEPATH + SAPSTARTDB=$EXEPATH/$EXESTARTDB + SAPSTOPDB=$EXEPATH/$EXESTOPDB + SAPDBCONNECT=$EXEPATH/$EXEDBCONNECT + break + fi +done +if [ -z "$DIR_EXECUTABLE" ] +then + ocf_log warn "Cannot find $EXESTARTDB,$EXESTOPDB and $EXEDBCONNECT executable, please set DIR_EXECUTABLE parameter!" + exit $OCF_NOT_RUNNING +fi + +if [ $DBJ2EE_ONLY -eq 1 ] +then + if [ -n "$OCF_RESKEY_DIR_BOOTSTRAP" ] + then + BOOTSTRAP="$OCF_RESKEY_DIR_BOOTSTRAP" + else + BOOTSTRAP=`ls -1d /usr/sap/$SID/*/j2ee/cluster/bootstrap | head -1` + fi + + if [ -n "$OCF_RESKEY_DIR_SECSTORE" ] + then + SECSTORE="$OCF_RESKEY_DIR_SECSTORE" + else + SECSTORE=/usr/sap/$SID/SYS/global/security/lib/tools + fi + + if [ -n "$OCF_RESKEY_JAVA_HOME" ] + then + JAVA_HOME="$OCF_RESKEY_JAVA_HOME" + PATH=$JAVA_HOME/bin:$PATH + else + if [ -n "$JAVA_HOME" ] + then + PATH=$JAVA_HOME/bin:$PATH + else + ocf_log err "Cannot find JAVA_HOME directory, please set JAVA_HOME parameter!" 
+ exit $OCF_NOT_RUNNING + fi + fi + + if [ -n "$OCF_RESKEY_DB_JARS" ] + then + DB_JARS=$OCF_RESKEY_DB_JARS + else + if [ -f "$BOOTSTRAP"/bootstrap.properties ]; then + DB_JARS=`cat $BOOTSTRAP/bootstrap.properties | grep -i rdbms.driverLocation | sed -e 's/\\\:/:/g' | awk -F= '{print $2}'` + fi + fi +fi + +if [ -z "$OCF_RESKEY_AUTOMATIC_RECOVER" ] +then + OCF_RESKEY_AUTOMATIC_RECOVER=0 +else + case "$OCF_RESKEY_AUTOMATIC_RECOVER" in + 1|true|TRUE|yes|YES) OCF_RESKEY_AUTOMATIC_RECOVER=1;; + 0|false|FALSE|no|NO) OCF_RESKEY_AUTOMATIC_RECOVER=0;; + esac +fi + +# as root user we need the library path to the SAP kernel to be able to call executables +if [ `echo $LD_LIBRARY_PATH | grep -c "^$DIR_EXECUTABLE\>"` -eq 0 ]; then + LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH + export LD_LIBRARY_PATH +fi +sidadm="`echo $SID | tr '[:upper:]' '[:lower:]'`adm" +} + +# Set a tempfile and make sure to clean it up again +TEMPFILE="${HA_RSCTMP}/SAPDatabase.$$.tmp" +trap trap_handler INT TERM diff --git a/heartbeat/sapdb.sh b/heartbeat/sapdb.sh new file mode 100755 index 0000000..66e9854 --- /dev/null +++ b/heartbeat/sapdb.sh @@ -0,0 +1,367 @@ +# +# sapdb.sh - for systems having SAPHostAgent installed +# (sourced by SAPDatabase) +# +# Description: This code is separated from the SAPDatabase agent to +# introduce new functions for systems which having +# SAPHostAgent installed. +# Someday it might be merged back into SAPDatabase agein. +# +# Author: Alexander Krauth, September 2010 +# Support: linux@sap.com +# License: GNU General Public License (GPL) +# Copyright: (c) 2010, 2012 Alexander Krauth +# + + +# +# background_check_saphostexec : Run a request to saphostexec in a separate task, to be able to react to a hanging process +# +background_check_saphostexec() { + timeout=600 + count=0 + + $SAPHOSTCTRL -function ListDatabases >/dev/null 2>&1 & + pid=$! 
+ + while kill -0 $pid > /dev/null 2>&1 + do + sleep 0.1 + count=$(( $count + 1 )) + if [ $count -ge $timeout ]; then + kill -9 $pid >/dev/null 2>&1 + ocf_log warn "saphostexec did not respond to the method 'ListDatabases' within 60 seconds" + return $OCF_ERR_GENERIC # Timeout + fi + done + + # child has already finished, now evaluate its returncode + wait $pid +} + +# +# cleanup_saphostexec : make sure to cleanup the SAPHostAgent in case of any +# misbehavior +# +cleanup_saphostexec() { + pkill -9 -f "$SAPHOSTEXEC" + pkill -9 -f "$SAPHOSTSRV" + oscolpid=$(pgrep -f "$SAPHOSTOSCOL") # we check saposcol pid, because it + # might not run under control of + # saphostexec + + # cleanup saposcol shared memory, otherwise it will not start again + if [ -n "$oscolpid" ];then + kill -9 $oscolpid + oscolipc=$(ipcs -m | grep "4dbe " | awk '{print $2}') + if [ -n "$oscolipc" ]; then + ipcrm -m $oscolipc + fi + fi + + # removing the unix domain socket file as it might have wrong permissions or + # ownership - it will be recreated by saphostexec during next start + [ -r /tmp/.sapstream1128 ] && rm -f /tmp/.sapstream1128 +} + +# +# check_saphostexec : Before using saphostctrl we make sure that the +# saphostexec is running on the current node. +# +check_saphostexec() { + chkrc=$OCF_SUCCESS + running=$(pgrep -f "$SAPHOSTEXEC" | wc -l) + + if [ $running -gt 0 ]; then + if background_check_saphostexec; then + return $OCF_SUCCESS + else + ocf_log warn "saphostexec did not respond to the method 'ListDatabases' correctly (rc=$?), it will be killed now" + running=0 + fi + fi + + if [ $running -eq 0 ]; then + ocf_log warn "saphostexec is not running on node `hostname`, it will be started now" + cleanup_saphostexec + output=`$SAPHOSTEXEC -restart 2>&1` + + # now make sure the daemon has been started and is able to respond + srvrc=1 + while [ $srvrc -ne 0 ] && [ "$(pgrep -f "$SAPHOSTEXEC" | wc -l)" -gt 0 ] + do + sleep 1 + background_check_saphostexec + srvrc=$? 
+ done + + if [ $srvrc -eq 0 ] + then + ocf_log info "saphostexec on node $(hostname) was restarted !" + chkrc=$OCF_SUCCESS + else + ocf_log error "saphostexec on node $(hostname) could not be started! - $output" + chkrc=$OCF_ERR_GENERIC + fi + fi + + return $chkrc +} + + +# +# sapdatabase_start : Start the SAP database +# +sapdatabase_start() { + + check_saphostexec + rc=$? + + if [ $rc -eq $OCF_SUCCESS ] + then + sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT" + + DBINST="" + if [ -n "$OCF_RESKEY_DBINSTANCE" ] + then + DBINST="-dbinstance $OCF_RESKEY_DBINSTANCE " + fi + FORCE="" + if ocf_is_true $OCF_RESKEY_AUTOMATIC_RECOVER + then + FORCE="-force" + fi + DBOSUSER="" + if [ -n "$OCF_RESKEY_DBOSUSER" ] + then + DBOSUSER="-dbuser $OCF_RESKEY_DBOSUSER " + fi + output=`$SAPHOSTCTRL -function StartDatabase -dbname $SID -dbtype $DBTYPE $DBINST $DBOSUSER $FORCE -service` + + sapdatabase_monitor 1 + rc=$? + + if [ $rc -eq 0 ] + then + ocf_log info "SAP database $SID started: $output" + rc=$OCF_SUCCESS + + sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" + else + ocf_log err "SAP database $SID start failed: $output" + rc=$OCF_ERR_GENERIC + fi + fi + + return $rc +} + +# +# sapdatabase_stop: Stop the SAP database +# +sapdatabase_stop() { + + check_saphostexec + rc=$? + + if [ $rc -eq $OCF_SUCCESS ] + then + sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" + + DBINST="" + if [ -n "$OCF_RESKEY_DBINSTANCE" ] + then + DBINST="-dbinstance $OCF_RESKEY_DBINSTANCE " + fi + DBOSUSER="" + if [ -n "$OCF_RESKEY_DBOSUSER" ] + then + DBOSUSER="-dbuser $OCF_RESKEY_DBOSUSER " + fi + output=`$SAPHOSTCTRL -function StopDatabase -dbname $SID -dbtype $DBTYPE $DBINST $DBOSUSER -force -service` + + if [ $? 
-eq 0 ] + then + ocf_log info "SAP database $SID stopped: $output" + rc=$OCF_SUCCESS + else + ocf_log err "SAP database $SID stop failed: $output" + rc=$OCF_ERR_GENERIC + fi + fi + + sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" + + return $rc +} + + +# +# sapdatabase_monitor: Can the given database instance do anything useful? +# +sapdatabase_monitor() { + strict=$1 + rc=$OCF_SUCCESS + + if ! ocf_is_true $strict + then + sapdatabase_status + rc=$? + else + check_saphostexec + rc=$? + + if [ $rc -eq $OCF_SUCCESS ] + then + count=0 + + DBINST="" + if [ -n "$OCF_RESKEY_DBINSTANCE" ] + then + DBINST="-dbinstance $OCF_RESKEY_DBINSTANCE " + fi + if [ -n "$OCF_RESKEY_DBOSUSER" ] + then + DBOSUSER="-dbuser $OCF_RESKEY_DBOSUSER " + fi + output=`$SAPHOSTCTRL -function GetDatabaseStatus -dbname $SID -dbtype $DBTYPE $DBINST $DBOSUSER` + + # we have to parse the output, because the returncode doesn't tell anything about the instance status + for SERVICE in `echo "$output" | grep -i 'Component[ ]*Name *[:=] [A-Za-z][A-Za-z0-9_]* (' | sed 's/^.*Component[ ]*Name *[:=] *\([A-Za-z][A-Za-z0-9_]*\).*$/\1/i'` + do + COLOR=`echo "$output" | grep -i "Component[ ]*Name *[:=] *$SERVICE (" | sed 's/^.*Status *[:=] *\([A-Za-z][A-Za-z0-9_]*\).*$/\1/i' | uniq` + STATE=0 + + case $COLOR in + Running) STATE=$OCF_SUCCESS;; + *) STATE=$OCF_NOT_RUNNING;; + esac + + SEARCH=`echo "$OCF_RESKEY_MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'` + if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ] + then + if [ $STATE -eq $OCF_NOT_RUNNING ] + then + ocf_log err "SAP database service $SERVICE is not running with status $COLOR !" + rc=$STATE + fi + count=1 + fi + done + + if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ] + then + ocf_log err "The resource does not run any services which this RA could monitor!" 
+ rc=$OCF_ERR_ARGS + fi + + if [ $rc -ne $OCF_SUCCESS ] + then + ocf_log err "The SAP database $SID is not running: $output" + fi + fi + fi + return $rc +} + + +# +# sapdatabase_status: Are there any database processes on this host ? +# +sapdatabase_status() { + sid=`echo $SID | tr '[:upper:]' '[:lower:]'` + + SUSER=${OCF_RESKEY_DBOSUSER:-""} + + case $DBTYPE in + ADA) SEARCH="$SID/db/pgm/kernel" + [ -z "$SUSER" ] && SUSER=`grep "^SdbOwner" /etc/opt/sdb | awk -F'=' '{print $2}'` + SNUM=2 + ;; + ORA) DBINST=${OCF_RESKEY_DBINSTANCE} + DBINST=${OCF_RESKEY_DBINSTANCE:-${SID}} + SEARCH="ora_[a-z][a-z][a-z][a-z]_$DBINST" + + if [ -z "$SUSER" ]; then + id "oracle" > /dev/null 2> /dev/null && SUSER="oracle" + id "ora${sid}" > /dev/null 2> /dev/null && SUSER="${SUSER:+${SUSER},}ora${sid}" + fi + + SNUM=4 + ;; + DB6) SEARCH="db2[a-z][a-z][a-z]" + [ -z "$SUSER" ] && SUSER="db2${sid}" + SNUM=2 + ;; + SYB) SEARCH="dataserver" + [ -z "$SUSER" ] && SUSER="syb${sid}" + SNUM=1 + ;; + HDB) SEARCH="hdb[a-z]*server" + [ -z "$SUSER" ] && SUSER="${sid}adm" + SNUM=1 + ;; + esac + + [ -z "$SUSER" ] && return $OCF_ERR_INSTALLED + + cnt=`ps -u $SUSER -o args 2> /dev/null | grep -v grep | grep -c $SEARCH` + [ $cnt -ge $SNUM ] && return $OCF_SUCCESS + return $OCF_NOT_RUNNING +} + + +# +# sapdatabase_recover: +# +sapdatabase_recover() { + OCF_RESKEY_AUTOMATIC_RECOVER=1 + sapdatabase_stop + sapdatabase_start +} + + +# +# sapdatabase_validate: Check the semantics of the input parameters +# +sapdatabase_validate() { + rc=$OCF_SUCCESS + if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] + then + ocf_log err "Parsing parameter SID: '$SID' is not a valid system ID!" + rc=$OCF_ERR_ARGS + fi + + case "$DBTYPE" in + ORA|ADA|DB6|SYB|HDB) ;; + *) ocf_log err "Parsing parameter DBTYPE: '$DBTYPE' is not a supported database type!" 
+ rc=$OCF_ERR_ARGS ;; + esac + + return $rc +} + +# +# sapdatabase_init: initialize global variables at the beginning +# +sapdatabase_init() { +OCF_RESKEY_AUTOMATIC_RECOVER_default=0 +: ${OCF_RESKEY_AUTOMATIC_RECOVER=${OCF_RESKEY_AUTOMATIC_RECOVER_default}} + +if [ -z "$OCF_RESKEY_MONITOR_SERVICES" ] +then + case $DBTYPE in + ORA) export OCF_RESKEY_MONITOR_SERVICES="Instance|Database|Listener" + ;; + ADA) export OCF_RESKEY_MONITOR_SERVICES="Database" + ;; + DB6) db2sid="db2`echo $SID | tr '[:upper:]' '[:lower:]'`" + export OCF_RESKEY_MONITOR_SERVICES="${SID}|${db2sid}" + ;; + SYB) export OCF_RESKEY_MONITOR_SERVICES="Server" + ;; + HDB) export OCF_RESKEY_MONITOR_SERVICES="hdbindexserver|hdbnameserver" + ;; + esac +fi +} diff --git a/heartbeat/scsi2reservation b/heartbeat/scsi2reservation new file mode 100755 index 0000000..9b29ec4 --- /dev/null +++ b/heartbeat/scsi2reservation @@ -0,0 +1,176 @@ +#!/bin/sh +# by hxinwei@gmail.com +# License: GNU General Public License 2 (GPL2) + +if [ -n "$OCF_DEBUG_LIBRARY" ]; then + . $OCF_DEBUG_LIBRARY +else + : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs +fi + +# Parameter defaults + +OCF_RESKEY_scsi_reserve_default="/usr/sbin/scsi_reserve" +OCF_RESKEY_sharedisk_default="/dev/sdb" +OCF_RESKEY_start_loop_default="10" + +: ${OCF_RESKEY_scsi_reserve=${OCF_RESKEY_scsi_reserve_default}} +: ${OCF_RESKEY_sharedisk=${OCF_RESKEY_sharedisk_default}} +: ${OCF_RESKEY_start_loop=${OCF_RESKEY_start_loop_default}} + +scsi2reserve_meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="scsi2reservation" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +The scsi-2-reserve resource agent is a place holder for SCSI-2 reservation. +A healthy instance of scsi-2-reserve resource, indicates the own of the specified SCSI device. +This resource agent depends on the scsi_reserve from scsires package, which is Linux specific. 
+</longdesc> + +<shortdesc lang="en"> +scsi-2 reservation +</shortdesc> + +<parameters> + +<parameter name="scsi_reserve" unique="0" required="0"> +<longdesc lang="en"> +The scsi_reserve is a command from scsires package. +It helps one to issue SCSI-2 reservation on SCSI devices. +</longdesc> +<shortdesc lang="en">Manages exclusive access to shared storage media thrugh SCSI-2 reservations</shortdesc> +<content type="string" default="${OCF_RESKEY_scsi_reserve_default}" /> +</parameter> + + +<parameter name="sharedisk" unique="0" required="0"> +<longdesc lang="en"> +The shared disk that can be reserved. +</longdesc> +<shortdesc lang="en"> +Shared disk. +</shortdesc> +<content type="string" default="${OCF_RESKEY_sharedisk_default}" /> +</parameter> + +<parameter name="start_loop" unique="0" required="0"> +<longdesc lang="en"> +We are going to try several times before giving up. Start_loop indicates how many times we are going to re-try. +</longdesc> +<shortdesc lang="en"> +Times to re-try before giving up. +</shortdesc> +<content type="string" default="${OCF_RESKEY_start_loop_default}" /> +</parameter> + + +</parameters> + +<actions> +<action name="start" timeout="300s" /> +<action name="stop" timeout="100s" /> +<action name="monitor" depth="0" timeout="20s" interval="20s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="30s" /> +</actions> +</resource-agent> +END + + return $OCF_SUCCESS +} + +scsi2reserve_usage() { + cat <<END + usage: $0 {start|stop|monitor|validate-all|meta-data} +END +} + +case $__OCF_ACTION in + meta-data) scsi2reserve_meta_data + exit $OCF_SUCCESS + ;; + usage|help) scsi2reserve_usage + exit $OCF_SUCCESS + ;; + *) + ;; +esac + + +scsi2reserve_validate () +{ + if [ ! -x "${OCF_RESKEY_scsi_reserve}" ]; then + exit $OCF_ERR_INSTALLED + fi + if [ -z "${OCF_RESKEY_sharedisk}" ]; then + exit $OCF_ERR_GENERIC + fi + if [ ! 
-b "${OCF_RESKEY_sharedisk}" ]; then + exit $OCF_ERR_ARGS + fi +} + +scsi2reserve_validate + +scsi2reserve_start () +{ + ite=$OCF_RESKEY_start_loop + while [ $ite -ge 0 ]; do + /sbin/startproc ${OCF_RESKEY_scsi_reserve} -d ${OCF_RESKEY_sharedisk} --preempt --hold --stonith + sleep 15 + /sbin/checkproc ${OCF_RESKEY_scsi_reserve} + rc=$? + if [ $rc -eq 0 ]; then + exit $OCF_SUCCESS + fi + ite=`expr $ite - 1` + done + exit $OCF_ERR_GENERIC +} + +scsi2reserve_monitor () +{ + /sbin/checkproc ${OCF_RESKEY_scsi_reserve} + rc=$? + if [ $rc -eq 0 ]; then + exit $OCF_SUCCESS + else + exit $OCF_NOT_RUNNING + fi +} + +scsi2reserve_stop () +{ + /sbin/killproc ${OCF_RESKEY_scsi_reserve} + ${OCF_RESKEY_scsi_reserve} -d ${OCF_RESKEY_sharedisk} --release + exit $OCF_SUCCESS +} + +if [ $# -ne 1 ]; then + scsi2reserve_usage + exit $OCF_ERR_ARGS +fi + +case $__OCF_ACTION in + start) scsi2reserve_start + ;; + stop) scsi2reserve_stop + ;; + monitor) scsi2reserve_monitor + ;; + validate-all) scsi2reserve_validate + exit $OCF_SUCCESS + ;; + *) scsi2reserve_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + +exit $? + + diff --git a/heartbeat/send_ua.c b/heartbeat/send_ua.c new file mode 100644 index 0000000..bd65e2c --- /dev/null +++ b/heartbeat/send_ua.c @@ -0,0 +1,133 @@ + +/* + * This program manages IPv6 address with OCF Resource Agent standard. + * + * Author: Huang Zhen <zhenh@cn.ibm.com> + * Copyright (c) 2004 International Business Machines + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#include <IPv6addr.h> + +#include <stdio.h> +#include <stdlib.h> +#include <malloc.h> +#include <unistd.h> +#include <sys/socket.h> +#include <arpa/inet.h> /* for inet_pton */ +#include <net/if.h> /* for if_nametoindex */ +#include <sys/ioctl.h> +#include <fcntl.h> +#include <signal.h> +#include <errno.h> + +static void usage_send_ua(const char* self); +static void byebye(int nsig); + +int +main(int argc, char* argv[]) +{ + char* ipv6addr; + int count = UA_REPEAT_COUNT; + int interval = 1000; /* default 1000 msec */ + int ch; + int i; + char* cp; + char* prov_ifname = NULL; + struct in6_addr addr6; + struct sigaction act; + + /* Check binary name */ + if (argc < 4) { + usage_send_ua(argv[0]); + return OCF_ERR_ARGS; + } + while ((ch = getopt(argc, argv, "h?c:i:")) != EOF) { + switch(ch) { + case 'c': /* count option */ + count = atoi(optarg); + break; + case 'i': /* interval option */ + interval = atoi(optarg); + break; + case 'h': + case '?': + default: + usage_send_ua(argv[0]); + return OCF_ERR_ARGS; + } + } + + /* set termination signal */ + memset(&act, 0, sizeof(struct sigaction)); + act.sa_flags &= ~SA_RESTART; /* redundant - to stress syscalls should fail */ + act.sa_handler = byebye; + if ((sigemptyset(&act.sa_mask) < 0) || (sigaction(SIGTERM, &act, NULL) < 0)) { + printf("ERROR: Could not set handler for signal: %s", strerror(errno)); + return OCF_ERR_GENERIC; + } + + ipv6addr = argv[optind]; + + if (ipv6addr == NULL) { + printf("ERROR: Please set OCF_RESKEY_ipv6addr to the IPv6 address you want to manage."); + usage_send_ua(argv[0]); + return OCF_ERR_ARGS; + } + + /* legacy option */ + if ((cp = strchr(ipv6addr, '/'))) { + *cp=0; + } + + prov_ifname = argv[optind+2]; + + if (inet_pton(AF_INET6, ipv6addr, &addr6) <= 0) { + 
printf("ERROR: Invalid IPv6 address [%s]", ipv6addr); + usage_send_ua(argv[0]); + return OCF_ERR_ARGS; + } + + /* Check whether this system supports IPv6 */ + if (access(IF_INET6, R_OK)) { + printf("ERROR: No support for INET6 on this system."); + return OCF_ERR_GENERIC; + } + + /* Send unsolicited advertisement packet to neighbor */ + for (i = 0; i < count; i++) { + send_ua(&addr6, prov_ifname); + usleep(interval * 1000); + } + + return OCF_SUCCESS; +} + +static void usage_send_ua(const char* self) +{ + printf("usage: %s [-i[=Interval]] [-c[=Count]] [-h] IPv6-Address Prefix Interface\n",self); + return; +} + +/* Following code is copied from send_arp.c, linux-HA project. */ +void +byebye(int nsig) +{ + (void)nsig; + /* Avoid an "error exit" log message if we're killed */ + exit(0); +} + diff --git a/heartbeat/sfex b/heartbeat/sfex new file mode 100755 index 0000000..b079ca0 --- /dev/null +++ b/heartbeat/sfex @@ -0,0 +1,311 @@ +#!/bin/sh +# +# Shared Disk File EXclusiveness (SF-EX) OCF RA. +# prevent a destruction of data on shared disk file system +# due to Split-Brain. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +# 02110-1301, USA. +# +# Copyright (c) 2007 NIPPON TELEGRAPH AND TELEPHONE CORPORATION +# +# NOTE: +# As a prerequisite for running SF-EX, one device should be +# initialized as below. 
#
#	sfex_init [-n <numlocks>] <device>
#
# Example:
#
#	/usr/sbin/sfex_init -n 10 /dev/sdb1
#
# if further information is necessary, See README.
#
#######################################################################
# Initialization:

# switching ocf-shellfuncs path
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_device_default=""
OCF_RESKEY_index_default="1"
OCF_RESKEY_collision_timeout_default="1"
OCF_RESKEY_monitor_interval_default="10"
OCF_RESKEY_lock_timeout_default="100"

: ${OCF_RESKEY_device=${OCF_RESKEY_device_default}}
: ${OCF_RESKEY_index=${OCF_RESKEY_index_default}}
: ${OCF_RESKEY_collision_timeout=${OCF_RESKEY_collision_timeout_default}}
: ${OCF_RESKEY_monitor_interval=${OCF_RESKEY_monitor_interval_default}}
: ${OCF_RESKEY_lock_timeout=${OCF_RESKEY_lock_timeout_default}}

#######################################################################

SFEX_DAEMON=${HA_BIN}/sfex_daemon

#
# usage: print the list of supported actions to stdout.
#
usage() {
    cat <<END
    usage: $0 {start|stop|monitor|validate-all|meta-data}
END
}

#
# meta_data: print the OCF resource agent metadata (XML) to stdout.
#
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="sfex" version="1.3">
<version>1.0</version>

<longdesc lang="en">
Resource script for SF-EX. It manages a shared storage medium exclusively .
</longdesc>
<shortdesc lang="en">Manages exclusive access to shared storage using Shared Disk File EXclusiveness (SF-EX)</shortdesc>
<parameters>
<parameter name="device" unique="0" required="1">
<longdesc lang="en">
Block device path that stores exclusive control data.
</longdesc>
<shortdesc lang="en">block device</shortdesc>
<content type="string" default="${OCF_RESKEY_device_default}" />
</parameter>
<parameter name="index" unique="0" required="0">
<longdesc lang="en">
Location in block device where exclusive control data is stored. 1 or more is specified. Default is 1.
</longdesc>
<shortdesc lang="en">index</shortdesc>
<content type="integer" default="${OCF_RESKEY_index_default}" />
</parameter>
<parameter name="collision_timeout" unique="0" required="0">
<longdesc lang="en">
Waiting time when a collision of lock acquisition is detected. Default is 1 second.
</longdesc>
<shortdesc lang="en">waiting time for lock acquisition</shortdesc>
<content type="integer" default="${OCF_RESKEY_collision_timeout_default}" />
</parameter>
<parameter name="monitor_interval" unique="0" required="0">
<longdesc lang="en">
Monitor interval(sec). Default is ${OCF_RESKEY_monitor_interval_default} seconds
</longdesc>
<shortdesc lang="en">monitor interval</shortdesc>
<content type="integer" default="${OCF_RESKEY_monitor_interval_default}" />
</parameter>
<parameter name="lock_timeout" unique="0" required="0">
<longdesc lang="en">
Valid term of lock(sec). Default is ${OCF_RESKEY_lock_timeout_default} seconds.
The lock_timeout is calculated by the following formula.

  lock_timeout = monitor_interval + "The expiration time of the lock"

We suggest 90 seconds as a default value of the "The expiration time of the lock", but you should change it in consideration of access delay to the shared disk and the switch time of the multipath driver.

The lock timeout have an impact on start action timeout because start action timeout value is calculated by the following formula.

  start timeout = collision_timeout + lock_timeout + "safety margin"

The "safety margin" is decided within the range of about 10-20 seconds(It depends on your system requirement).
</longdesc>
<shortdesc lang="en">Valid term of lock</shortdesc>
<content type="integer" default="${OCF_RESKEY_lock_timeout_default}" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="120s" />
<action name="stop" timeout="20s" />
<action name="monitor" depth="0" timeout="10s" interval="10s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="5s" />
</actions>
</resource-agent>
END
}

#
# START: Exclusive control starts.
#
# It loops permanently until the lock can be acquired when locked with
# the other node. In this case, the reception of the stop signal by the
# timeout time passage set to CIB becomes the only stop opportunity.
#
# Returns OCF_SUCCESS once sfex_monitor sees the daemon, or
# OCF_ERR_GENERIC when the sfex_daemon command itself exits non-zero.
#
sfex_start() {
    ocf_log info "sfex_daemon: starting..."

    sfex_monitor
    if [ $? -eq $OCF_SUCCESS ]; then
        ocf_log info "sfex_daemon already started."
        return $OCF_SUCCESS
    fi

    $SFEX_DAEMON -i $INDEX -c $COLLISION_TIMEOUT -t $LOCK_TIMEOUT -m $MONITOR_INTERVAL -r ${OCF_RESOURCE_INSTANCE} $DEVICE

    rc=$?
    if [ $rc -ne 0 ]; then
        ocf_log err "sfex_daemon failed to start."
        return $OCF_ERR_GENERIC
    fi

    # Poll until the daemon shows up in the process table.  There is
    # deliberately no local timeout (see the header comment); the
    # previous error handling after this loop could never be reached
    # and has been dropped.
    until sfex_monitor
    do
        ocf_log debug "Waiting for the start-up of the sfex_daemon..."
        sleep 1
    done

    ocf_log info "sfex_daemon: started."
    return $OCF_SUCCESS
}

#
# STOP: stop exclusive control
#
# Sends SIGTERM to the daemon, waits up to (operation timeout - 5s) for
# it to exit, then escalates to SIGKILL and waits until it is gone.
# Returns OCF_SUCCESS when the daemon is not running (any more), or the
# exit code of kill when SIGTERM could not be delivered.
#
sfex_stop() {
    ocf_log info "sfex_daemon: stopping..."

    # Check the sfex daemon has already stopped.
    sfex_monitor
    if [ $? -eq $OCF_NOT_RUNNING ]; then
        ocf_log info "sfex_daemon already stopped."
        return $OCF_SUCCESS
    fi

    # Stop sfex daemon by sending SIGTERM signal.
    pid=`/usr/bin/pgrep -f "$SFEX_DAEMON .* ${OCF_RESOURCE_INSTANCE} "`
    if [ -z "$pid" ]; then
        # The daemon went away between the monitor check above and
        # pgrep; calling kill without a pid would fail and turn a clean
        # stop into a stop failure.
        ocf_log info "sfex_daemon already stopped."
        return $OCF_SUCCESS
    fi
    # $pid is intentionally unquoted: pgrep may report several pids.
    /bin/kill $pid
    rc=$?
    if [ $rc -ne 0 ]; then
        ocf_log err "sfex_daemon failed to stop"
        return $rc
    fi

    #sfex could be in state D if the device is gone, and then not terminate.
    #Wait and check again if the daemon is already properly shutdown.

    # OCF_RESKEY_CRM_meta_timeout (milliseconds) is only set when the
    # cluster manager invokes the agent.  Fall back to the advertised
    # stop timeout (20s) so the arithmetic stays valid for manual runs.
    shutdown_timeout=$(( (${OCF_RESKEY_CRM_meta_timeout:-20000} / 1000) - 5 ))
    count=0
    while [ $count -lt $shutdown_timeout ]
    do
        sfex_monitor
        if [ $? -eq $OCF_NOT_RUNNING ]; then
            ocf_log info "sfex_daemon: stopped."
            return $OCF_SUCCESS
        fi
        count=`expr $count + 1`
        ocf_log debug "waiting for sfex_daemon to exit ($count/$shutdown_timeout)"
        sleep 1
    done

    sfex_monitor
    if [ $? -ne $OCF_NOT_RUNNING ]; then
        ocf_log warn "regular shutdown of sfex_daemon timed out, using SIGKILL"
        /bin/kill -s KILL $pid
    fi

    # No local timeout here: the operation timeout of the cluster
    # manager is what ends this wait if SIGKILL does not help.
    while :
    do
        sfex_monitor
        if [ $? -eq $OCF_NOT_RUNNING ]; then
            break;
        fi
        ocf_log debug "waiting for sfex_daemon to exit after SIGKILL"
        sleep 1
    done

    ocf_log info "sfex_daemon: stopped."
    return $OCF_SUCCESS
}

#
# MONITOR: report whether a sfex_daemon for this resource instance runs.
#
# Returns OCF_SUCCESS when pgrep finds a matching process, otherwise
# OCF_NOT_RUNNING.
#
sfex_monitor() {
    ocf_log debug "sfex_monitor: started..."

    # Find a sfex_daemon process using daemon name and resource name.
    if /usr/bin/pgrep -f "$SFEX_DAEMON .* ${OCF_RESOURCE_INSTANCE} " > /dev/null 2>&1; then
        ocf_log debug "sfex_monitor: complete. sfex_daemon is running."
        return $OCF_SUCCESS
    fi

    ocf_log debug "sfex_monitor: complete. sfex_daemon is not running."
    return $OCF_NOT_RUNNING
}

#
# main process
#

# check arguments
if [ $# -ne 1 ]; then
    usage
    exit $OCF_ERR_ARGS
fi
OP=$1

# These operations do not require instance parameters
case $OP in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    usage)
        usage
        exit $OCF_SUCCESS
        ;;
esac

# check parameters
DEVICE=$OCF_RESKEY_device
INDEX=${OCF_RESKEY_index}
COLLISION_TIMEOUT=${OCF_RESKEY_collision_timeout}
LOCK_TIMEOUT=${OCF_RESKEY_lock_timeout}
MONITOR_INTERVAL=${OCF_RESKEY_monitor_interval}

#
# VALIDATE: exits with OCF_ERR_ARGS when no device is configured or the
# configured device is not writable; otherwise falls through.
#
sfex_validate () {
if [ -z "$DEVICE" ]; then
    ocf_log err "Please set OCF_RESKEY_device to device for sfex meta-data"
    exit $OCF_ERR_ARGS
fi
if [ ! -w "$DEVICE" ]; then
    ocf_log warn "Couldn't find device [$DEVICE]. Expected /dev/??? to exist"
    exit $OCF_ERR_ARGS
fi
}

if [ -n "$OCF_RESKEY_CRM_meta_clone" ]; then
    ocf_log err "THIS RA DO NOT SUPPORT CLONE MODE!"
    exit $OCF_ERR_CONFIGURED
fi

case $OP in
    start)
        sfex_start
        ;;
    stop)
        sfex_stop
        ;;
    monitor)
        sfex_monitor
        ;;
    validate-all)
        sfex_validate
        ;;
    *)
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
exit $?
diff --git a/heartbeat/sg_persist.in b/heartbeat/sg_persist.in
new file mode 100644
index 0000000..620c02f
--- /dev/null
+++ b/heartbeat/sg_persist.in
@@ -0,0 +1,699 @@
#!@BASH_SHELL@
#
#
# OCF Resource Agent compliant PERSISTENT SCSI RESERVATION resource script.
#
#
# Copyright (c) 2011 Evgeny Nifontov and lwang@suse.com All Rights Reserved.
#
# "Heartbeat drbd OCF Resource Agent: 2007, Lars Marowsky-Bree" was used
# as example of multistate OCF Resource Agent.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#
# OCF instance parameters
#    OCF_RESKEY_binary
#    OCF_RESKEY_devs
#    OCF_RESKEY_required_devs_nof
#    OCF_RESKEY_reservation_type
#    OCF_RESKEY_master_score_base
#    OCF_RESKEY_master_score_dev_factor
#    OCF_RESKEY_master_score_delay
#
# TODO
#
# 1) PROBLEM: devices which were not accessible during 'start' action, will be never registered/reserved
#    TODO: 'Master' and 'Slave' registers new devs in 'monitor' action
#    TODO: 'Master' reserves new devs in 'monitor' action

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_binary_default="sg_persist" # binary name for the resource
OCF_RESKEY_devs_default="" # device list
OCF_RESKEY_required_devs_nof_default="1" # number of required devices
OCF_RESKEY_reservation_type_default="1" # reservation type
OCF_RESKEY_master_score_base_default="0" # master score base
OCF_RESKEY_master_score_dev_factor_default="100" # device factor for master score
OCF_RESKEY_master_score_delay_default="30" # delay for master score

: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
: ${OCF_RESKEY_devs=${OCF_RESKEY_devs_default}}
: ${OCF_RESKEY_required_devs_nof=${OCF_RESKEY_required_devs_nof_default}}
: ${OCF_RESKEY_reservation_type=${OCF_RESKEY_reservation_type_default}}
: ${OCF_RESKEY_master_score_base=${OCF_RESKEY_master_score_base_default}}
: ${OCF_RESKEY_master_score_dev_factor=${OCF_RESKEY_master_score_dev_factor_default}}
: ${OCF_RESKEY_master_score_delay=${OCF_RESKEY_master_score_delay_default}}

#######################################################################


# Print the OCF resource agent metadata (XML) and exit with OCF_SUCCESS.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="sg_persist" version="1.1">
<version>1.0</version>

<longdesc lang="en">
This resource agent manages SCSI PERSISTENT RESERVATIONS.
"sg_persist" from sg3_utils is used, please see its documentation.
Should be used as multistate (Master/Slave) resource
Slave registers its node id ("crm_node -i") as reservation key ( --param-rk ) on each device in the "devs" list.
Master reserves all devices from "devs" list with reservation "--prout-type" value from "reservation_type" parameter.
</longdesc>
<shortdesc lang="en">Manages SCSI PERSISTENT RESERVATIONS</shortdesc>

<parameters>
<parameter name="binary" unique="0">
<longdesc lang="en">
The name of the binary that manages the resource.
</longdesc>
<shortdesc lang="en">the binary name of the resource</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}"/>
</parameter>

<parameter name="devs" unique="0" required="1">
<longdesc lang="en">
Device list. Multiple devices can be listed with blank space as separator.
Shell wildcards are allowed.
</longdesc>
<shortdesc lang="en">device list</shortdesc>
<content type="string"/>
</parameter>

<parameter name="required_devs_nof" unique="0" required="0">
<longdesc lang="en">
Minimum number of "working" devices from device list
    1) existing
    2) "sg_persist --read-keys \$device" works (Return code 0)
resource actions "start","monitor","promote" and "validate-all" return "\$OCF_ERR_INSTALLED"
if the actual number of "working" devices is less then "required_devs_nof".
resource actions "stop" and "demote" tries to remove reservations and registration keys from
all working devices, but always return "\$OCF_SUCCESS"
</longdesc>
<shortdesc lang="en">minimum number of working devices</shortdesc>
<content type="string" default="${OCF_RESKEY_required_devs_nof_default}"/>
</parameter>

<parameter name="reservation_type" unique="0" required="0">
<longdesc lang="en">
reservation type
</longdesc>
<shortdesc lang="en">reservation type</shortdesc>
<content type="string" default="${OCF_RESKEY_reservation_type_default}" />
</parameter>

<parameter name="master_score_base" unique="0" required="0">
<longdesc lang="en">
master_score_base value
"master_score_base" value is used in "master_score" calculation:
master_score = \$master_score_base + \$master_score_dev_factor * \$working_devs
if set to bigger value in sg_persist resource configuration on some node, this node will be "preferred" for master role.
</longdesc>
<shortdesc lang="en">base master_score value</shortdesc>
<content type="string" default="${OCF_RESKEY_master_score_base_default}" />
</parameter>

<parameter name="master_score_dev_factor" unique="0" required="0">
<longdesc lang="en">
Working device factor in master_score calculation
each "working" device provides additional value to "master_score",
so the node that sees more devices will be preferred for the "Master"-role
Setting it to 0 will disable this behavior.
</longdesc>
<shortdesc lang="en">working device factor in master_score calculation</shortdesc>
<content type="string" default="${OCF_RESKEY_master_score_dev_factor_default}" />
</parameter>

<parameter name="master_score_delay" unique="0" required="0">
<longdesc lang="en">
master/slave decreases/increases its master_score after delay of \$master_score_delay seconds
so if some device gets inaccessible, the slave decreases its master_score first and the resource will no be watched
and after this device reappears again the master increases its master_score first
this can work only if the master_score_delay is bigger then monitor interval on both master and slave
Setting it to 0 will disable this behavior.
</longdesc>
<shortdesc lang="en">master_score decrease/increase delay time</shortdesc>
<content type="string" default="${OCF_RESKEY_master_score_delay_default}" />
</parameter>
</parameters>

<actions>
<action name="start" timeout="30s" />
<action name="promote" timeout="30s" />
<action name="demote" timeout="30s" />
<action name="notify" timeout="30s" />
<action name="stop" timeout="30s" />
<action name="monitor" depth="0" timeout="20s" interval="29s" role="Unpromoted" />
<action name="monitor" depth="0" timeout="20s" interval="60s" role="Promoted" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="30s" />
</actions>
</resource-agent>
END

    exit $OCF_SUCCESS
}

# Common set-up for every action that touches devices: checks for root
# and the sg_persist binary, resolves the local node id and name via
# crm_node, builds the crm_attribute command lines, copies the instance
# parameters into working variables and finally collects the device
# state (sg_persist_check_devs, sg_persist_get_status).
# Exits with OCF_ERR_PERM / OCF_ERR_INSTALLED on fatal problems.
sg_persist_init() {

    if ! ocf_is_root ; then
        ocf_log err "You must be root to perform this operation."
        exit $OCF_ERR_PERM
    fi

    SG_PERSIST=${OCF_RESKEY_binary}
    check_binary $SG_PERSIST

    ROLE=$OCF_RESKEY_CRM_meta_role
    NOW=$(date +%s)

    RESOURCE="${OCF_RESOURCE_INSTANCE}"
    MASTER_SCORE_VAR_NAME="master-${OCF_RESOURCE_INSTANCE//:/-}"
    PENDING_VAR_NAME="pending-$MASTER_SCORE_VAR_NAME"

    #only works with corosync
    CRM_NODE="${HA_SBIN_DIR}/crm_node"
    NODE_ID_DEC=$($CRM_NODE -i)

    # Validate the decimal node id itself.  The former check tested
    # NODE_ID_HEX for emptiness, but printf '0x%x' prints "0x0" for an
    # empty argument, so a failing crm_node went unnoticed and key 0x0
    # would have been used as the reservation key.
    case "$NODE_ID_DEC" in
        ''|*[!0-9]*)
            ocf_log err "Couldn't get node id with \"$CRM_NODE\""
            exit $OCF_ERR_INSTALLED
            ;;
    esac

    # Look up the node name: take the "crm_node -l" line that starts
    # with our id, then strip the id in front and the trailing field.
    NODE=$($CRM_NODE -l | $GREP -w ^$NODE_ID_DEC)
    NODE=${NODE#$NODE_ID_DEC }
    NODE=${NODE% *}

    MASTER_SCORE_ATTRIBUTE="${HA_SBIN_DIR}/crm_attribute --lifetime=reboot --name=$MASTER_SCORE_VAR_NAME --node=$NODE"
    PENDING_ATTRIBUTE="${HA_SBIN_DIR}/crm_attribute --lifetime=reboot --name=$PENDING_VAR_NAME --node=$NODE"

    NODE_ID_HEX=$(printf '0x%x' $NODE_ID_DEC)

    ocf_log debug "$RESOURCE: NODE:$NODE, ROLE:$ROLE, NODE_ID DEC:$NODE_ID_DEC HEX:$NODE_ID_HEX"

    DEVS=${OCF_RESKEY_devs}
    REQUIRED_DEVS_NOF=${OCF_RESKEY_required_devs_nof}
    RESERVATION_TYPE=${OCF_RESKEY_reservation_type}
    MASTER_SCORE_BASE=${OCF_RESKEY_master_score_base}
    MASTER_SCORE_DEV_FACTOR=${OCF_RESKEY_master_score_dev_factor}
    MASTER_SCORE_DELAY=${OCF_RESKEY_master_score_delay}

    ocf_log debug "$RESOURCE: DEVS=$DEVS"
    ocf_log debug "$RESOURCE: REQUIRED_DEVS_NOF=$REQUIRED_DEVS_NOF"
    ocf_log debug "$RESOURCE: RESERVATION_TYPE=$RESERVATION_TYPE"
    ocf_log debug "$RESOURCE: MASTER_SCORE_BASE=$MASTER_SCORE_BASE"
    ocf_log debug "$RESOURCE: MASTER_SCORE_DEV_FACTOR=$MASTER_SCORE_DEV_FACTOR"
    ocf_log debug "$RESOURCE: MASTER_SCORE_DELAY=$MASTER_SCORE_DELAY"

    #expand path wildcards
    DEVS=$(echo $DEVS)

    if [ -z "$DEVS" ]; then
        ocf_log err "\"devs\" not defined"
        exit $OCF_ERR_INSTALLED
    fi

    sg_persist_check_devs
    sg_persist_get_status
}

# Print the list of supported actions to stdout.
sg_persist_action_usage() {
    cat <<END
    usage: $0 {start|stop|monitor|validate-all|promote|demote|notify|meta-data}

    Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Probe every existing device and fill the state arrays:
#   WORKING_DEVS           devices answering "--read-keys"
#   REGISTERED_DEVS        working devices carrying our key
#   RESERVED_DEVS          registered devices reserved with our key
#   DEVS_WITH_RESERVATION  registered devices with any reservation
#   RESERVATION_KEYS       holder key, index-aligned with the former
# Also derives WORKING_DEVS_NOF and MASTER_SCORE.
sg_persist_get_status() {

    # Start from an empty list (the former "unset WORKING_DEVS[*]" was
    # an unquoted glob pattern).
    WORKING_DEVS=()

    for dev in ${EXISTING_DEVS[*]}
    do
        READ_KEYS=`$SG_PERSIST --in --read-keys $dev 2>&1`
        [ $? -eq 0 ] || continue

        WORKING_DEVS+=($dev)

        echo "$READ_KEYS" | $GREP -qw $NODE_ID_HEX\$
        [ $? -eq 0 ] || continue

        REGISTERED_DEVS+=($dev)

        READ_RESERVATION=`$SG_PERSIST --in --read-reservation $dev 2>&1`
        [ $? -eq 0 ] || continue

        echo "$READ_RESERVATION" | $GREP -qw $NODE_ID_HEX\$
        if [ $? -eq 0 ]; then
            RESERVED_DEVS+=($dev)
        fi

        reservation_key=`echo $READ_RESERVATION | $GREP -o 'Key=0x[0-9a-f]*' | $GREP -o '0x[0-9a-f]*'`
        if [ -n "$reservation_key" ]; then
            DEVS_WITH_RESERVATION+=($dev)
            RESERVATION_KEYS+=($reservation_key)
        fi
    done

    WORKING_DEVS_NOF=${#WORKING_DEVS[*]}

    ocf_log debug "$RESOURCE: working devices: `sg_persist_echo_array ${WORKING_DEVS[*]}`"
    ocf_log debug "$RESOURCE: number of working devices: $WORKING_DEVS_NOF"

    ocf_log debug "$RESOURCE: registered devices: `sg_persist_echo_array ${REGISTERED_DEVS[*]}`"
    ocf_log debug "$RESOURCE: reserved devices: `sg_persist_echo_array ${RESERVED_DEVS[*]}`"
    ocf_log debug "$RESOURCE: devices with reservation: `sg_persist_echo_array ${DEVS_WITH_RESERVATION[*]}`"
    ocf_log debug "$RESOURCE: reservation keys: `sg_persist_echo_array ${RESERVATION_KEYS[*]}`"

    MASTER_SCORE=$(($MASTER_SCORE_BASE + $MASTER_SCORE_DEV_FACTOR*$WORKING_DEVS_NOF))
    ocf_log debug "$RESOURCE: master_score: $MASTER_SCORE_BASE + $MASTER_SCORE_DEV_FACTOR*$WORKING_DEVS_NOF = $MASTER_SCORE"

}

# Collect the configured devices that exist into EXISTING_DEVS and exit
# with OCF_ERR_INSTALLED when fewer than REQUIRED_DEVS_NOF are present.
sg_persist_check_devs() {

    for dev in $DEVS
    do
        if [ -e "$dev" ]; then
            EXISTING_DEVS+=($dev)
        fi
    done

    EXISTING_DEVS_NOF=${#EXISTING_DEVS[*]}
    if [ $EXISTING_DEVS_NOF -lt $REQUIRED_DEVS_NOF ]; then
        ocf_log err "Number of existing devices=$EXISTING_DEVS_NOF less then required_devs_nof=$REQUIRED_DEVS_NOF"
        exit $OCF_ERR_INSTALLED
    fi

}

# Return 0 when device $1 is listed in REGISTERED_DEVS, 1 otherwise.
sg_persist_is_registered() {
    for registered_dev in ${REGISTERED_DEVS[*]}
    do
        if [ "$registered_dev" == "$1" ]; then
            return 0
        fi
    done
    return 1
}

# Print the reservation holder key recorded for device $1, or an empty
# line when no reservation was recorded for it.
sg_persist_get_reservation_key() {
    for array_index in ${!DEVS_WITH_RESERVATION[*]}
    do
        if [ "${DEVS_WITH_RESERVATION[$array_index]}" == "$1" ]; then
            echo ${RESERVATION_KEYS[$array_index]}
            return 0
        fi
    done
    echo ""
}

# Print the arguments as "[0]:a [1]:b ..." (used for debug logging).
sg_persist_echo_array() {
    str_count=0
    arr_str=""

    for str in "$@"
    do
        arr_str="$arr_str[$str_count]:$str "
        str_count=$(($str_count+1))
    done
    echo $arr_str
}

# Split ACT_PENDING ("<timestamp>_<score>") into ACT_PENDING_TS and
# ACT_PENDING_SCORE; both are 0 when nothing is pending.
sg_persist_parse_act_pending() {

    ACT_PENDING_TS=0
    ACT_PENDING_SCORE=0

    if [ -n "$ACT_PENDING" ]; then
        ACT_PENDING_TS=${ACT_PENDING%%_*}
        ACT_PENDING_SCORE=${ACT_PENDING##*_}
    fi
}

# Schedule clearing of the pending attribute if one is currently set.
sg_persist_clear_pending() {
    if [ -n "$ACT_PENDING" ]; then
        DO_PENDING_UPDATE="YES"
        NEW_PENDING=""
    fi
}

# Schedule an update of the master score attribute to $1.
sg_persist_new_master_score() {
    DO_MASTER_SCORE_UPDATE="YES"
    NEW_MASTER_SCORE=$1
}

# Schedule an update of the pending attribute to $1.
sg_persist_new_pending() {
    DO_PENDING_UPDATE="YES"
    NEW_PENDING=$1
}

# Apply a master score change that is subject to master_score_delay:
#  - a change is already pending: publish it once the delay has elapsed
#  - nothing pending and delay is 0: publish immediately
#  - nothing pending otherwise: record "<now>_<score>" as pending
sg_persist_delayed_score_change() {
    if [ -n "$ACT_PENDING" ]; then
        if [ $(($NOW-$ACT_PENDING_TS-$MASTER_SCORE_DELAY)) -ge 0 ]; then
            sg_persist_new_master_score $MASTER_SCORE
            sg_persist_clear_pending
        fi
    elif [ $MASTER_SCORE_DELAY -eq 0 ]; then
        sg_persist_new_master_score $MASTER_SCORE
        sg_persist_clear_pending
    else
        sg_persist_new_pending "${NOW}_${MASTER_SCORE}"
    fi
}


# Functions invoked by resource manager actions

# start: publish the master score and register our key on every working
# device that does not carry it yet.
sg_persist_action_start() {

    ocf_run $MASTER_SCORE_ATTRIBUTE --update=$MASTER_SCORE
    ocf_run $PENDING_ATTRIBUTE --update=""

    if [ $WORKING_DEVS_NOF -lt $REQUIRED_DEVS_NOF ]; then
        ocf_log err "$RESOURCE: Number of working devices=$WORKING_DEVS_NOF less then required_devs_nof=$REQUIRED_DEVS_NOF"
        exit $OCF_ERR_GENERIC
    fi

    for dev in ${WORKING_DEVS[*]}
    do
        if sg_persist_is_registered $dev ; then
            : OK
        else
            ocf_run $SG_PERSIST --out --no-inquiry --register --param-rk=0 --param-sark=$NODE_ID_HEX $dev
            if [ $? -ne $OCF_SUCCESS ]
            then
                return $OCF_ERR_GENERIC
            fi
        fi
    done

    return $OCF_SUCCESS
}

# stop: drop the score attributes and unregister our key from every
# registered device. Always returns OCF_SUCCESS.
sg_persist_action_stop() {

    if [ ${#REGISTERED_DEVS[*]} -eq 0 ]; then
        ocf_log debug "$RESOURCE stop: already no registrations"
    else
        # Clear preference for becoming master
        ocf_run $MASTER_SCORE_ATTRIBUTE --delete
        ocf_run $PENDING_ATTRIBUTE --delete

        for dev in ${REGISTERED_DEVS[*]}
        do
            ocf_run $SG_PERSIST --out --no-inquiry --register --param-rk=$NODE_ID_HEX --param-sark=0 $dev
        done
    fi

    return $OCF_SUCCESS
}

# monitor: reconcile the published master score with the freshly
# computed one (honouring master_score_delay) and report the state:
#   OCF_NOT_RUNNING     no device carries our registration
#   OCF_RUNNING_MASTER  every working device is reserved by us, or all
#                       are registered and a type 7/8 reservation exists
#   OCF_SUCCESS         every working device is registered
#   OCF_ERR_GENERIC     anything else
sg_persist_action_monitor() {

    ACT_MASTER_SCORE=`$MASTER_SCORE_ATTRIBUTE --query --quiet 2>/dev/null`
    ocf_log debug "$RESOURCE monitor: ACT_MASTER_SCORE=$ACT_MASTER_SCORE"

    ACT_PENDING=`$PENDING_ATTRIBUTE --query --quiet 2>/dev/null`
    ocf_log debug "$RESOURCE monitor: ACT_PENDING=$ACT_PENDING"

    sg_persist_parse_act_pending
    ocf_log debug "$RESOURCE monitor: ACT_PENDING_TS=$ACT_PENDING_TS"
    ocf_log debug "$RESOURCE monitor: ACT_PENDING_VAL=$ACT_PENDING_SCORE"

    ocf_log debug "$MASTER_SCORE, $ACT_MASTER_SCORE, $ROLE"

    DO_MASTER_SCORE_UPDATE="NO"
    DO_PENDING_UPDATE="NO"
    if [ -n "$ACT_MASTER_SCORE" ]
    then
        if [ $ACT_MASTER_SCORE -eq $MASTER_SCORE ]; then
            sg_persist_clear_pending
        else
            # The promoted instance delays score decreases, the
            # unpromoted one delays increases; changes in the other
            # direction are published at once.  Both the legacy
            # (Master/Slave) and the OCF 1.1 (Promoted/Unpromoted) role
            # names are accepted, matching the roles advertised in the
            # metadata.
            case $ROLE in
            Master|Promoted)
                if [ $MASTER_SCORE -lt $ACT_MASTER_SCORE ]; then
                    sg_persist_delayed_score_change
                else
                    sg_persist_new_master_score $MASTER_SCORE
                    sg_persist_clear_pending
                fi
                ;;

            Slave|Unpromoted)
                if [ $MASTER_SCORE -gt $ACT_MASTER_SCORE ]; then
                    sg_persist_delayed_score_change
                else
                    sg_persist_new_master_score $MASTER_SCORE
                    sg_persist_clear_pending
                fi
                ;;

            *)
                ;;

            esac
        fi
    fi

    if [ $DO_MASTER_SCORE_UPDATE == "YES" ]; then
        ocf_run $MASTER_SCORE_ATTRIBUTE --update=$NEW_MASTER_SCORE
    fi

    if [ $DO_PENDING_UPDATE == "YES" ]; then
        ocf_run $PENDING_ATTRIBUTE --update=$NEW_PENDING
    fi

    if [ ${#REGISTERED_DEVS[*]} -eq 0 ]; then
        ocf_log debug "$RESOURCE monitor: no registrations"
        if [ -n "$ACT_MASTER_SCORE" ]; then
            ocf_run $MASTER_SCORE_ATTRIBUTE --delete
            ocf_run $PENDING_ATTRIBUTE --delete
        fi
        return $OCF_NOT_RUNNING
    fi

    if [ ${#RESERVED_DEVS[*]} -eq ${#WORKING_DEVS[*]} ]; then
        if [ -z "$ACT_MASTER_SCORE" ]; then
            ocf_run $MASTER_SCORE_ATTRIBUTE --update=$MASTER_SCORE
            ocf_run $PENDING_ATTRIBUTE --update=""
        fi
        return $OCF_RUNNING_MASTER
    fi

    if [ ${#REGISTERED_DEVS[*]} -eq ${#WORKING_DEVS[*]} ]; then
        if [ -z "$ACT_MASTER_SCORE" ]; then
            ocf_run $MASTER_SCORE_ATTRIBUTE --update=$MASTER_SCORE
            ocf_run $PENDING_ATTRIBUTE --update=""
        fi
        if [ $RESERVATION_TYPE -eq 7 ] || [ $RESERVATION_TYPE -eq 8 ]; then
            if [ ${#DEVS_WITH_RESERVATION[*]} -gt 0 ]; then
                return $OCF_RUNNING_MASTER
            else
                return $OCF_SUCCESS
            fi
        else
            return $OCF_SUCCESS
        fi
    fi

    ocf_log err "$RESOURCE monitor: unexpected state"

    return $OCF_ERR_GENERIC
}

# promote: reserve every working device with our key. For types 1/3/5/6
# an existing reservation is preempted; for types 7/8 an existing
# reservation means all registrants already hold it.
sg_persist_action_promote() {

    if [ ${#RESERVED_DEVS[*]} -gt 0 ]; then
        ocf_log info "$RESOURCE promote: already master"
        return $OCF_SUCCESS
    fi

    for dev in ${WORKING_DEVS[*]}
    do
        reservation_key=`sg_persist_get_reservation_key $dev`
        case $RESERVATION_TYPE in
        1|3|5|6)
            if [ -z "$reservation_key" ]; then
                ocf_run $SG_PERSIST --out --no-inquiry --reserve --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev
                if [ $? -ne $OCF_SUCCESS ]; then
                    return $OCF_ERR_GENERIC
                fi
            else
                ocf_run $SG_PERSIST --out --no-inquiry --preempt --param-sark=$reservation_key --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev
                if [ $? -ne $OCF_SUCCESS ]; then
                    return $OCF_ERR_GENERIC
                fi
            fi
            ;;

        7|8)
            if [ -z "$reservation_key" ]; then
                ocf_run $SG_PERSIST --out --no-inquiry --reserve --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev
                if [ $? -ne $OCF_SUCCESS ]
                then
                    return $OCF_ERR_GENERIC
                fi
            else
                ocf_log info "$RESOURCE promote: there already exist an reservation holder, all registrants become reservation holders"
                return $OCF_SUCCESS
            fi
            ;;

        *)
            return $OCF_ERR_ARGS
            ;;

        esac
    done

    return $OCF_SUCCESS
}

# demote: give up the reservation. Types 1/3/5/6 release it; types 7/8
# have to unregister the key instead.
sg_persist_action_demote() {
    case $RESERVATION_TYPE in
    1|3|5|6)
        if [ ${#RESERVED_DEVS[*]} -eq 0 ]; then
            ocf_log info "$RESOURCE demote: already slave"
            return $OCF_SUCCESS
        fi

        for dev in ${RESERVED_DEVS[*]}
        do
            ocf_run $SG_PERSIST --out --no-inquiry --release --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev
            if [ $? -ne $OCF_SUCCESS ]; then
                return $OCF_ERR_GENERIC
            fi
        done
        ;;

    7|8) #in case of 7/8, --release won't release the reservation unless unregister the key.
        if [ ${#REGISTERED_DEVS[*]} -eq 0 ]; then
            ocf_log info "$RESOURCE demote: already slave"
            return $OCF_SUCCESS
        fi

        for dev in ${REGISTERED_DEVS[*]}
        do
            ocf_run $SG_PERSIST --out --no-inquiry --register --param-rk=$NODE_ID_HEX --param-sark=0 $dev
            if [ $? -ne $OCF_SUCCESS ]; then
                return $OCF_ERR_GENERIC
            fi
        done
        ;;

    *)
        return $OCF_ERR_ARGS
        ;;
    esac

    return $OCF_SUCCESS
}

# notify: only logs the notification counts. Always returns OCF_SUCCESS.
sg_persist_action_notify() {
    local n_type="$OCF_RESKEY_CRM_meta_notify_type"
    local n_op="$OCF_RESKEY_CRM_meta_notify_operation"
    set -- $OCF_RESKEY_CRM_meta_notify_active_resource
    local n_active="$#"
    set -- $OCF_RESKEY_CRM_meta_notify_stop_resource
    local n_stop="$#"
    set -- $OCF_RESKEY_CRM_meta_notify_start_resource
    local n_start="$#"

    # notify runs without sg_persist_init, so RESOURCE may be unset;
    # fall back to the instance name for the log prefix.
    ocf_log debug "${RESOURCE:-$OCF_RESOURCE_INSTANCE} notify: $n_type for $n_op - counts: active $n_active - starting $n_start - stopping $n_stop"

    return $OCF_SUCCESS
}

# validate-all: at check level 10, refuse more than one promoted
# instance unless the reservation type is 7 or 8.
sg_persist_action_validate_all () {
    # OCF_CHECK_LEVEL may be unset; default to 0 so the numeric test
    # does not fail with "integer expression expected".
    if [ "${OCF_CHECK_LEVEL:-0}" -eq 10 ]; then
        if [ "$OCF_RESKEY_CRM_meta_master_max" != "1" ] && [ "$RESERVATION_TYPE" != "7" ] && [ "$RESERVATION_TYPE" != "8" ]; then
            ocf_log err "Master options misconfigured."
            exit $OCF_ERR_CONFIGURED
        fi
    fi

    return $OCF_SUCCESS
}

if [ $# -ne 1 ]; then
    echo "Incorrect parameter count."
    sg_persist_action_usage
    exit $OCF_ERR_ARGS
fi

ACTION=$1
case $ACTION in
    meta-data)
        meta_data
        ;;

    validate-all)
        sg_persist_init
        sg_persist_action_validate_all
        ;;

    start|promote|monitor|stop|demote)
        # RESOURCE is only assigned inside sg_persist_init, so use the
        # instance name directly for this first log line.
        ocf_log debug "${OCF_RESOURCE_INSTANCE}: starting action \"$ACTION\""
        sg_persist_init
        if [ "$__OCF_ACTION" = "start" ]; then
            OCF_CHECK_LEVEL=10
            sg_persist_action_validate_all
        fi
        sg_persist_action_$ACTION
        exit $?
        ;;

    notify)
        sg_persist_action_notify
        exit $?
        ;;

    usage|help)
        sg_persist_action_usage
        exit $OCF_SUCCESS
        ;;

    *)
        sg_persist_action_usage
        exit $OCF_ERR_ARGS
        ;;

    esac
diff --git a/heartbeat/shellfuncs.in b/heartbeat/shellfuncs.in
new file mode 100644
index 0000000..9991620
--- /dev/null
+++ b/heartbeat/shellfuncs.in
@@ -0,0 +1,96 @@
# Author: Alan Robertson
# Support: linux-ha-dev@lists.tummy.com
# License: GNU Lesser General Public License (LGPL)
#
# Set these variables if they're not already set...
#

: ${HA_SBIN_DIR:=@sbindir@}
: ${HA_NOARCHBIN:=@datadir@/heartbeat}
: ${OCF_AGENTS:=@OCF_RA_DIR@/heartbeat/}

export HA_DIR HA_RCDIR HA_FIFO HA_BIN
export HA_DEBUGLOG HA_LOGFILE HA_LOGFACILITY
export HA_DATEFMT HA_RESOURCEDIR HA_DOCDIR
export OCF_AGENTS

# Prepend the heartbeat directories to PATH, then drop empty and "."
# entries.  Runs of colons (an empty entry, e.g. when HA_BIN is unset)
# are collapsed to a single separator; deleting them outright would
# fuse the two neighbouring directories into one bogus path.
PATH=$HA_BIN:${HA_SBIN_DIR}:${HA_NOARCHBIN}:$PATH
PATH=`echo "$PATH" | sed -e 's%::*%:%g' -e 's%:\.:%:%' -e 's%^:%%' -e 's%^\.:%%'`
export PATH

# A suitable echo command
Echo() {
    echo "$@"
}

# copy stdin (text) to FIFO, with surrounding ">>>" and "<<<" marker lines.
# no args.; no result
# Notes:
#  o Using "cat -" rather than "cat" simply for clarity.
#  o The trailing "| cat -" tries to hold things together as a single
#    write (which is probably preferable behaviour in this context).
ha_clustermsg() {
    (echo ">>>"; cat -; echo "<<<") | cat - >> $HA_FIFO
}

# Print the value of directive $1 from the heartbeat config file $HA_CF.
# When the directive is absent, "keepalive" defaults to 2 and "deadtime"
# to 2 * keepalive + 1; any other directive prints an empty line.
ha_parameter() {
    # Normalise the first run of blanks (spaces or tabs) on each line to
    # one space, drop a leading blank and comments, then pick the line
    # for the requested directive and strip the directive name.
    VALUE=`sed -e 's%[[:space:]][[:space:]]*% %' -e 's%^ %%' -e 's%#.*%%' $HA_CF |
        grep -i "^$1 " | sed 's%[^ ]* %%'`
    if
        [ "X$VALUE" = X ]
    then

        case $1 in
            keepalive)  VALUE=2;;
            deadtime)
                ka=`ha_parameter keepalive`
                VALUE=`expr $ka '*' 2 '+' 1`;;
        esac
    fi
    Echo $VALUE
}

# BSD-style status check for the program given as $1 (path or name).
# Prints a one-line status and returns:
#   0  a process whose command line matches the base name is running
#   1  not running, but a pid file exists
#   2  not running, but a lock file exists
#   3  not running and no pid or lock file found
BSD_Status() {
    local base=${1##*/}

    # List the pids of matching processes.  The grep doing the matching
    # is filtered out (it carries the pattern on its own command line),
    # and leading blanks are stripped first because ps right-aligns the
    # pid column.
    ret_status=`/bin/ps -ao pid,command | grep -- "$base" | grep -v grep | sed -e 's/^ *//' -e 's/ .*//'`

    if
        [ "$ret_status" != "" ]
    then
        echo "${base} is running..."
        return 0
    fi

    if
        [ -f $HA_VARRUN/${base}.pid ]
    then
        echo "${base} dead but pid file exists"
        return 1
    fi

    if
        [ -f /var/run/${base}.pid ]
    then
        echo "${base} dead but pid file exists"
        return 1
    fi

    if
        [ -f $HA_VARLOCK/${base}.pid ]
    then
        echo "${base} dead but lock file exists"
        return 2
    fi

    if
        [ -f /var/spool/lock/${base} ]
    then
        echo "${base} dead but lock file exists"
        return 2
    fi

    # Nothing found: report "stopped" explicitly instead of falling off
    # the end with the same status 0 that means "running".
    echo "${base} is stopped"
    return 3
}

# Now get the good stuff
# (the path being sourced sits at the start of the following line)
. \
@OCF_LIB_DIR@/heartbeat/ocf-shellfuncs diff --git a/heartbeat/slapd.in b/heartbeat/slapd.in new file mode 100644 index 0000000..ffccd1d --- /dev/null +++ b/heartbeat/slapd.in @@ -0,0 +1,594 @@ +#!@BASH_SHELL@ +# +# Stand-alone LDAP Daemon (slapd) +# +# Description: Manages Stand-alone LDAP Daemon (slapd) as an OCF resource in +# a high-availability setup. +# +# Authors: Jeroen Koekkoek +# nozawat@gmail.com +# John Keith Hohm +# +# License: GNU General Public License (GPL) +# Copyright: (C) 2011 Pagelink B.V. +# +# The OCF code was inspired by the Postfix resource script written by +# Raoul Bhatia <r.bhatia@ipax.at>. +# +# The code for managing the slapd instance is based on the slapd init +# script found in Debian GNU/Linux 6.0. +# +# OCF parameters: +# OCF_RESKEY_slapd +# OCF_RESKEY_ldapsearch +# OCF_RESKEY_config +# OCF_RESKEY_pidfile +# OCF_RESKEY_user +# OCF_RESKEY_group +# OCF_RESKEY_services +# OCF_RESKEY_watch_suffix +# OCF_RESKEY_ignore_suffix +# OCF_RESKEY_bind_dn +# OCF_RESKEY_password +# OCF_RESKEY_parameters +# OCF_RESKEY_stop_escalate +# OCF_RESKEY_maxfiles +# +################################################################################ + +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_slapd_default="/usr/sbin/slapd" +OCF_RESKEY_ldapsearch_default="ldapsearch" +OCF_RESKEY_config_default="" +OCF_RESKEY_pidfile_default="" +OCF_RESKEY_user_default="" +OCF_RESKEY_group_default="" +OCF_RESKEY_services_default="ldap:///" +OCF_RESKEY_watch_suffix_default="" +OCF_RESKEY_ignore_suffix_default="" +OCF_RESKEY_bind_dn_default="" +OCF_RESKEY_password_default="" +OCF_RESKEY_parameters_default="" +OCF_RESKEY_stop_escalate_default="15" +OCF_RESKEY_maxfiles_default="" + +: ${OCF_RESKEY_slapd=${OCF_RESKEY_slapd_default}} +: ${OCF_RESKEY_ldapsearch=${OCF_RESKEY_ldapsearch_default}} +: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} +: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}} +: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +: ${OCF_RESKEY_group=${OCF_RESKEY_group_default}} +: ${OCF_RESKEY_services=${OCF_RESKEY_services_default}} +: ${OCF_RESKEY_watch_suffix=${OCF_RESKEY_watch_suffix_default}} +: ${OCF_RESKEY_ignore_suffix=${OCF_RESKEY_ignore_suffix_default}} +: ${OCF_RESKEY_bind_dn=${OCF_RESKEY_bind_dn_default}} +: ${OCF_RESKEY_password=${OCF_RESKEY_password_default}} +: ${OCF_RESKEY_parameters=${OCF_RESKEY_parameters_default}} +: ${OCF_RESKEY_stop_escalate=${OCF_RESKEY_stop_escalate_default}} +: ${OCF_RESKEY_maxfiles=${OCF_RESKEY_maxfiles_default}} + +USAGE="Usage: $0 {start|stop|status|monitor|validate-all|meta-data}" +ORIG_IFS=$IFS +NEWLINE=' +' + +################################################################################ + +usage() { + echo $USAGE >&2 +} + +meta_data() +{ + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="slapd" version="0.1"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for Stand-alone LDAP Daemon (slapd). It manages a slapd instance as an OCF resource. 
+</longdesc> +<shortdesc lang="en">Manages a Stand-alone LDAP Daemon (slapd) instance</shortdesc> + +<parameters> + +<parameter name="slapd" unique="0" required="0"> +<longdesc lang="en"> +Full path to the slapd binary. +For example, "/usr/sbin/slapd". +</longdesc> +<shortdesc lang="en">Full path to slapd binary</shortdesc> +<content type="string" default="${OCF_RESKEY_slapd_default}" /> +</parameter> + +<parameter name="ldapsearch" unique="0" required="0"> +<longdesc lang="en"> +Full path to the ldapsearch binary. +For example, "/usr/bin/ldapsearch". +</longdesc> +<shortdesc lang="en">Full path to ldapsearch binary</shortdesc> +<content type="string" default="${OCF_RESKEY_ldapsearch_default}" /> +</parameter> + +<parameter name="config" required="0" unique="1"> +<longdesc lang="en"> +Full path to a slapd configuration directory or a slapd configuration file. +For example, "/etc/ldap/slapd.d" or "/etc/ldap/slapd.conf". +</longdesc> +<shortdesc lang="en">Full path to configuration directory or file</shortdesc> +<content type="string" default="${OCF_RESKEY_config_default}"/> +</parameter> + +<parameter name="pidfile" required="0" unique="0"> +<longdesc lang="en"> +File to read the PID from; read from olcPidFile/pidfile in config if not set. +</longdesc> +<shortdesc lang="en">File to read PID from</shortdesc> +<content type="string" default="${OCF_RESKEY_pidfile_default}" /> +</parameter> + +<parameter name="user" unique="0" required="0"> +<longdesc lang="en"> +User name or id slapd will run with. The group id is also changed to this +user's gid, unless the group parameter is used to override. +</longdesc> +<shortdesc lang="en">User name or id slapd will run with</shortdesc> +<content type="string" default="${OCF_RESKEY_user_default}" /> +</parameter> + +<parameter name="group" unique="0" required="0"> +<longdesc lang="en"> +Group name or id slapd will run with. 
+</longdesc> +<shortdesc lang="en">Group name or id slapd will run with</shortdesc> +<content type="string" default="${OCF_RESKEY_group_default}" /> +</parameter> + +<parameter name="services" required="0" unique="1"> +<longdesc lang="en"> +LDAP (and other scheme) URLs slapd will serve. +For example, "ldap://127.0.0.1:389 ldaps:/// ldapi:///" +</longdesc> +<shortdesc lang="en">LDAP (and other scheme) URLs to serve</shortdesc> +<content type="string" default="${OCF_RESKEY_services_default}"/> +</parameter> + +<parameter name="watch_suffix" required="0" unique="0"> +<longdesc lang="en"> +Suffix (database backend) that will be monitored for availability. Multiple +suffixes can be specified by providing a space separated list. By providing one +or more suffixes here, the ignore_suffix parameter is discarded. All suffixes +will be monitored if left blank. +</longdesc> +<shortdesc lang="en">Suffix that will be monitored for availability.</shortdesc> +<content type="string" default="${OCF_RESKEY_watch_suffix_default}"/> +</parameter> + +<parameter name="ignore_suffix" required="0" unique="0"> +<longdesc lang="en"> +Suffix (database backend) that will not be monitored for availability. Multiple +suffixes can be specified by providing a space separated list. No suffix will +be excluded if left blank. +</longdesc> +<shortdesc lang="en">Suffix that will not be monitored for availability.</shortdesc> +<content type="string" default="${OCF_RESKEY_ignore_suffix_default}"/> +</parameter> + +<parameter name="bind_dn" required="0" unique="0"> +<longdesc lang="en"> +Distinguished Name used to bind to the LDAP directory for testing. Leave blank +to bind to the LDAP directory anonymously. 
+</longdesc> +<shortdesc lang="en">Distinguished Name used to bind to the LDAP directory for testing.</shortdesc> +<content type="string" default="${OCF_RESKEY_bind_dn_default}"/> +</parameter> + +<parameter name="password" required="0" unique="0"> +<longdesc lang="en"> +Password used to bind to the LDAP directory for testing. +</longdesc> +<shortdesc lang="en">Password used to bind to the LDAP directory for testing.</shortdesc> +<content type="string" default="${OCF_RESKEY_password_default}"/> +</parameter> + +<parameter name="parameters" unique="0" required="0"> +<longdesc lang="en"> +slapd may be called with additional parameters. +Specify any of them here. +</longdesc> +<shortdesc lang="en">Any additional parameters to slapd.</shortdesc> +<content type="string" default="${OCF_RESKEY_parameters_default}" /> +</parameter> + +<parameter name="stop_escalate" unique="0" required="0"> +<longdesc lang="en"> +Number of seconds to wait for shutdown (using SIGTERM) before resorting to +SIGKILL +</longdesc> +<shortdesc lang="en">Seconds before stop escalation to KILL</shortdesc> +<content type="integer" default="${OCF_RESKEY_stop_escalate_default}" /> +</parameter> + +<parameter name="maxfiles"> +<longdesc lang="en"> +Maximum number of open files (for ulimit -n) +</longdesc> +<shortdesc lang="en">Max open files</shortdesc> +<content type="string" default="${OCF_RESKEY_maxfiles_default}" /> +</parameter> +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="60s" /> +<action name="validate-all" timeout="20s" /> +<action name="meta-data" timeout="5s" /> +</actions> +</resource-agent> +END +} + +watch_suffix() +{ + local rc + + if [ -n "$OCF_RESKEY_watch_suffix" ]; then + if echo "'$OCF_RESKEY_watch_suffix'" | grep "'$1'" >/dev/null 2>&1; then + rc=0 + else + rc=1 + fi + else + if echo "'$OCF_RESKEY_ignore_suffix'" | grep "'$1'" >/dev/null 2>&1; then + rc=1 + else + 
rc=0 + fi + fi + + return $rc +} + +slapd_pid() +{ + local pid + + if [ -f "$pid_file" ]; then + pid=`head -n 1 "$pid_file" 2>/dev/null` + + if [ "X$pid" != "X" ]; then + echo "$pid" + return $OCF_SUCCESS + fi + + ocf_exit_reason "slapd pid file '$pid_file' empty." + return $OCF_ERR_GENERIC + fi + + ocf_log info "slapd pid file '$pid_file' does not exist." + return $OCF_NOT_RUNNING +} + +slapd_status() +{ + local pid=$1 + + if ! kill -0 $pid >/dev/null 2>&1; then + return $OCF_NOT_RUNNING + else + return $OCF_SUCCESS + fi +} + +slapd_start() +{ + local options + local reason + local rc + local state + + slapd_status `slapd_pid`; state=$? + + if [ $state -eq $OCF_SUCCESS ]; then + ocf_log info "slapd already running." + return $state + elif [ $state -eq $OCF_ERR_GENERIC ]; then + return $state + fi + + options="-u $user -g $group" + + if [ -d "$config" ]; then + options="$options -F $config" + elif [ -f "$config" ]; then + options="$options -f $config" + else + ocf_exit_reason "slapd configuration '$config' does not exist." + return $OCF_ERR_INSTALLED + fi + + if [ -n "$parameters" ]; then + options="$options $parameters" + fi + + if [ -n "$OCF_RESKEY_maxfiles" ]; then + ulimit -n $OCF_RESKEY_maxfiles + u_rc=$? + if [ "$u_rc" -ne 0 ]; then + ocf_log warn "Could not set ulimit for open files for slapd to '$OCF_RESKEY_maxfiles'" + fi + fi + + if [ -n "$services" ]; then + $slapd -h "$services" $options 2>&1; rc=$? + else + $slapd $options 2>&1; rc=$? + fi + + if [ $rc -ne 0 ]; then + ocf_exit_reason "slapd returned error." + + return $OCF_ERR_GENERIC + fi + + while true; do + slapd_monitor start + if [ $? = "$OCF_SUCCESS" ]; then + break + fi + sleep 1 + done + + ocf_log info "slapd started." + + return $OCF_SUCCESS +} + +slapd_stop() +{ + local pid + local rc + local state + + pid=`slapd_pid`; slapd_status $pid; state=$? + + if [ $state -eq $OCF_NOT_RUNNING ]; then + ocf_log info "slapd already stopped." 
+ return $OCF_SUCCESS + elif [ $state -eq $OCF_ERR_GENERIC ]; then + return $state + fi + + ocf_stop_processes TERM $OCF_RESKEY_stop_escalate $pid; rc=$? + if [ $rc -eq 1 ]; then + ocf_log err "cannot stop slapd." + return $OCF_ERR_GENERIC + fi + + if [ -f "$pid_file" ]; then + rm -f "$pid_file" >/dev/null 2>&1 + fi + + ocf_log info "slapd stopped." + return $OCF_SUCCESS +} + +slapd_monitor() +{ + local options + local rc + local state + local suffix + local suffixes + local err_option="-info" + + slapd_status `slapd_pid`; state=$? + if [ $state -eq $OCF_NOT_RUNNING ]; then + if [ -z "$1" ];then + if ! ocf_is_probe; then + ocf_exit_reason "slapd process not found." + fi + fi + return $state + elif [ $state -ne $OCF_SUCCESS ]; then + ocf_exit_reason "slapd returned error." + return $state + fi + + if [ -d "$config" ]; then + for suffix in `find "$config"/'cn=config' -type f -name olcDatabase* -exec \ + sed -ne 's/^[[:space:]]*olcSuffix:[[:space:]]\+\(.\+\)/\1/p' {} \;` + do + suffix=${suffix#\"*} + suffix=${suffix%\"*} + + if watch_suffix $suffix; then + suffixes="$suffixes $suffix" + fi + done + + elif [ -f "$config" ]; then + for suffix in `sed -ne 's/^[[:space:]]*suffix[[:space:]]\+\(.\+\)/\1/p' "$config"` + do + suffix=${suffix#\"*} + suffix=${suffix%\"*} + + if watch_suffix $suffix; then + suffixes="$suffixes $suffix" + fi + done + + else + if ocf_is_probe; then + ocf_log info "slapd configuration '$config' does not exist during probe." + else + ocf_exit_reason "slapd configuration '$config' does not exist." + return $OCF_ERR_INSTALLED + fi + fi + + options="-LLL -s base -x" + + if [ -n "$bind_dn" ]; then + options="$options -D $bind_dn -w $password" + fi + + [ -z "$1" ] && err_option="" + for suffix in $suffixes; do + ocf_run -q $err_option "$ldapsearch" -H "$services" -b "$suffix" $options >/dev/null 2>&1; rc=$? 
+ + case "$rc" in + "0") + ocf_log debug "slapd database with suffix '$suffix' reachable" + ;; + "49") + ocf_exit_reason "slapd database with suffix '$suffix' unreachable. Invalid credentials." + return $OCF_ERR_CONFIGURED + ;; + *) + if [ -z "$1" ] || [ -n "$1" -a $rc -ne 1 ]; then + ocf_exit_reason "slapd database with suffix '$suffix' unreachable. exit code ($rc)" + fi + state=$OCF_ERR_GENERIC + ;; + esac + done + + return $state +} + +slapd_validate_all() +{ + check_binary "$slapd" + check_binary "$ldapsearch" + + if [ -z "$pid_file" ]; then + if [ -d "$config" ]; then + pid_file=`sed -ne \ + 's/^olcPidFile:[[:space:]]\+\(.\+\)[[:space:]]*/\1/p' \ + "$config"/'cn=config.ldif' 2>/dev/null` + elif [ -f "$config" ]; then + pid_file=`sed -ne \ + 's/^pidfile[[:space:]]\+\(.\+\)/\1/p' \ + "$config" 2>/dev/null` + else + if ocf_is_probe; then + ocf_log info "slapd configuration '$config' does not exist during probe." + else + ocf_exit_reason "slapd configuration '$config' does not exist." + return $OCF_ERR_INSTALLED + fi + fi + fi + + if [ -z "$user" ]; then + user=`id -nu 2>/dev/null` + elif ! id "$user" >/dev/null 2>&1; then + ocf_exit_reason "slapd user '$user' does not exist" + return $OCF_ERR_INSTALLED + fi + + if [ -z "$group" ]; then + group=`id -ng 2>/dev/null` + elif ! grep "^$group:" /etc/group >/dev/null 2>&1; then + ocf_exit_reason "slapd group '$group' does not exist" + return $OCF_ERR_INSTALLED + fi + + pid_dir=`dirname "$pid_file"` + if [ ! 
-d "$pid_dir" ]; then + mkdir -p "$pid_dir" + chown -R "$user" "$pid_dir" + chgrp -R "$group" "$pid_dir" + fi + + return $OCF_SUCCESS +} + +# +# Main +# + +slapd=$OCF_RESKEY_slapd +ldapsearch=$OCF_RESKEY_ldapsearch +config=$OCF_RESKEY_config +user=$OCF_RESKEY_user +group=$OCF_RESKEY_group +services=$OCF_RESKEY_services +bind_dn=$OCF_RESKEY_bind_dn +password=$OCF_RESKEY_password +parameters=$OCF_RESKEY_parameters +pid_file=$OCF_RESKEY_pidfile + +if [ -z "$config" ]; then + config_dirname="/etc/ldap" + if [ -e "/etc/openldap" ]; then + config_dirname="/etc/openldap" + fi + + config="$config_dirname/slapd.conf" + if [ -e "$config_dirname/slapd.d" ]; then + config="$config_dirname/slapd.d" + fi +fi + +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi + +case $1 in + meta-data) + meta_data + exit $OCF_SUCCESS + ;; + usage|help) + usage + exit $OCF_SUCCESS + ;; +esac + +slapd_validate_all +rc=$? +[ $rc -eq $OCF_SUCCESS ] || exit $rc + +case $1 in + status) + slapd_status `slapd_pid`; state=$? + + if [ $state -eq $OCF_SUCCESS ]; then + ocf_log debug "slapd is running." + elif [ $state -eq $OCF_NOT_RUNNING ]; then + ocf_log debug "slapd is stopped." + fi + + exit $state + ;; + start) + slapd_start + exit $? + ;; + stop) + slapd_stop + exit $? + ;; + monitor) + slapd_monitor; state=$? + exit $state + ;; + validate-all) + exit $OCF_SUCCESS + ;; + *) + usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac diff --git a/heartbeat/smb-share.in b/heartbeat/smb-share.in new file mode 100755 index 0000000..8a1a0a8 --- /dev/null +++ b/heartbeat/smb-share.in @@ -0,0 +1,494 @@ +#!@BASH_SHELL@ +# +# OCF Resource Agent for create samba config snippets. +# +# +# +# Copyright (c) 2021 B1 Systems GmbH <info@b1-systems.de> +# Author: +# Tobias D. Oestreicher <oestreicher@b1-systems.de> +# All Rights Reserved. 
+# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# +# +# OCF parameters are as below: +# OCF_RESKEY_outfile +# OCF_RESKEY_includesfile +# OCF_RESKEY_confd +# OCF_RESKEY_share +# OCF_RESKEY_path +# OCF_RESKEY_hosts_allow +# OCF_RESKEY_browseable +# OCF_RESKEY_writeable +# OCF_RESKEY_read_only +# OCF_RESKEY_guest_ok +# OCF_RESKEY_directory_mask +# OCF_RESKEY_create_mask +# OCF_RESKEY_printable +# OCF_RESKEY_valid_users +# OCF_RESKEY_force_user +# OCF_RESKEY_force_group +# OCF_RESKEY_extraopt +# OCF_RESKEY_extraopt_list +# +####################################################################### + +####################################################################### +# +# Purpose: +# -------- +# This RA is used to control samba shares on the fly. +# For adding and removing samba shares no restart of the samba daemon +# is needed. This is the equivalent of the exportfs RA which is used +# for nfs shares. 
+# +# How to use: +# ----------- +# For this RA to work as expected you need a cloned samba daemon which +# has to be started beforehand. +# Because this RA manages config snippets placed in the filesystem, and +# these snippets would still be located there after the node has been +# fenced, you have to use a tmpfs mount for them. +# You also need a basic smb.conf file in which all global parameters and +# permanent shares should be placed. +# Within this smb.conf you must also put a line in the global section +# like this: +# +# include = /etc/samba/conf.d/pacemaker-includes.conf +# +# The filename can be changed by setting the parameter "includesfile". +# Every share created by this RA will create a new file located in: +# +# /etc/samba/conf.d/ +# +# This directory also can be changed by setting the RA parameter "confd". +# +####################################################################### + +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults + +OCF_RESKEY_outfile_default="" +OCF_RESKEY_includesfile_default="/etc/samba/conf.d/pacemaker-includes.conf" +OCF_RESKEY_confd_default="/etc/samba/conf.d" +OCF_RESKEY_share_default="" +OCF_RESKEY_path_default="" +OCF_RESKEY_hosts_allow_default="" +OCF_RESKEY_browseable_default="" +OCF_RESKEY_writeable_default="" +OCF_RESKEY_read_only_default="" +OCF_RESKEY_guest_ok_default="" +OCF_RESKEY_directory_mask_default="" +OCF_RESKEY_create_mask_default="" +OCF_RESKEY_printable_default="" +OCF_RESKEY_valid_users_default="" +OCF_RESKEY_force_user_default="" +OCF_RESKEY_force_group_default="" +OCF_RESKEY_extraopt_default="" +OCF_RESKEY_extraopt_list_default="" + + +: ${OCF_RESKEY_outfile=${OCF_RESKEY_outfile_default}} +: ${OCF_RESKEY_includesfile=${OCF_RESKEY_includesfile_default}} +: ${OCF_RESKEY_confd=${OCF_RESKEY_confd_default}} +: ${OCF_RESKEY_share=${OCF_RESKEY_share_default}} +: ${OCF_RESKEY_path=${OCF_RESKEY_path_default}} +: 
${OCF_RESKEY_hosts_allow=${OCF_RESKEY_hosts_allow_default}} +: ${OCF_RESKEY_browseable=${OCF_RESKEY_browseable_default}} +: ${OCF_RESKEY_writeable=${OCF_RESKEY_writeable_default}} +: ${OCF_RESKEY_read_only=${OCF_RESKEY_read_only_default}} +: ${OCF_RESKEY_guest_ok=${OCF_RESKEY_guest_ok_default}} +: ${OCF_RESKEY_directory_mask=${OCF_RESKEY_directory_mask_default}} +: ${OCF_RESKEY_create_mask=${OCF_RESKEY_create_mask_default}} +: ${OCF_RESKEY_printable=${OCF_RESKEY_printable_default}} +: ${OCF_RESKEY_valid_users=${OCF_RESKEY_valid_users_default}} +: ${OCF_RESKEY_force_user=${OCF_RESKEY_force_user_default}} +: ${OCF_RESKEY_force_group=${OCF_RESKEY_force_group_default}} +: ${OCF_RESKEY_extraopt=${OCF_RESKEY_extraopt_default}} +: ${OCF_RESKEY_extraopt_list=${OCF_RESKEY_extraopt_list_default}} + +####################################################################### + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="smb-share" version="1.0"> + <version>1.0</version> + <longdesc lang="en"> +This RA is used to control samba shares on the fly. +For adding and removing samba shares no restart of the samba daemon +is needed. This is the equivalent of the exportfs RA which is used +for nfs shares. + +For this RA to work as expected you need a cloned samba daemon which +have to be startet before. +After this RA manages config snippets placed in the filesystem and +after a fence of that node these snippets will still located there +you will have to use a tmpfs mount for this. +Also you need a basic smb.conf file in which all global parameters an +permanent shares should be placed. +Within this smb.conf also you must put a line in the global section +like this: + + include = /etc/samba/conf.d/pacemaker-includes.conf + +The filename can be changed by setting the parameter includesfile. 
+Every share created by this RA will create a new file located in: + + /etc/samba/conf.d/ + +This directory also can be changed by setting the RA parameter confd. + </longdesc> + <shortdesc lang="en">Manages samba shares on the fly</shortdesc> + + <parameters> + + <parameter name="share" unique="1" required="1"> + <longdesc lang="en"> + Set the name of a windows share which should be added to Samba + example name "myshare" resulting in [myshare]. + </longdesc> + <shortdesc lang="en">sharename</shortdesc> + <content type="string" default="${OCF_RESKEY_share_default}" /> + </parameter> + + <parameter name="path" unique="1" required="0"> + <longdesc lang="en"> + Set the path to share for cifs clients. + example path "/srv/data/myshare". + </longdesc> + <shortdesc lang="en">path to share</shortdesc> + <content type="string" default="${OCF_RESKEY_path_default}" /> + </parameter> + + <parameter name="hosts_allow" unique="0" required="0"> + <longdesc lang="en"> + This parameter is a comma, space, or tab delimited set of hosts which are permitted to access a service. + </longdesc> + <shortdesc lang="en">hosts allow parameter</shortdesc> + <content type="string" default="${OCF_RESKEY_hosts_allow_default}" /> + </parameter> + + <parameter name="browseable" unique="0" required="0"> + <longdesc lang="en"> + This controls whether this share is seen in the list of available shares in a net view and in the browse list. + </longdesc> + <shortdesc lang="en">browseable parameter</shortdesc> + <content type="string" default="${OCF_RESKEY_browseable_default}" /> + </parameter> + + <parameter name="writeable" unique="0" required="0"> + <longdesc lang="en"> + Inverted synonym for read only. 
+ </longdesc> + <shortdesc lang="en">writeable parameter</shortdesc> + <content type="string" default="${OCF_RESKEY_writeable_default}" /> + </parameter> + + <parameter name="read_only" unique="0" required="0"> + <longdesc lang="en"> + This option can be used to turn the writing backends tdb, tdb2, and ldap into read only mode. + This can be useful e.g. in cases where a pre-filled database exists that should not be extended automatically. + </longdesc> + <shortdesc lang="en">read only parameter</shortdesc> + <content type="string" default="${OCF_RESKEY_read_only_default}" /> + </parameter> + + <parameter name="guest_ok" unique="0" required="0"> + <longdesc lang="en"> + If this parameter is yes for a service, then no password is required to connect to the service. Privileges will be those of the guest account. + </longdesc> + <shortdesc lang="en">guest ok parameter</shortdesc> + <content type="string" default="${OCF_RESKEY_guest_ok_default}" /> + </parameter> + + <parameter name="directory_mask" unique="0" required="0"> + <longdesc lang="en"> + This parameter is the octal modes which are used when converting DOS modes to UNIX modes when creating UNIX directories. + </longdesc> + <shortdesc lang="en">directory mask parameter</shortdesc> + <content type="string" default="${OCF_RESKEY_directory_mask_default}" /> + </parameter> + + <parameter name="create_mask" unique="0" required="0"> + <longdesc lang="en"> + When a file is created, the necessary permissions are calculated according to the mapping from DOS modes to UNIX permissions, + and the resulting UNIX mode is then bit-wise ANDed with this parameter. This parameter may be thought of as a bit-wise MASK for the UNIX modes of a file. 
+ </longdesc> + <shortdesc lang="en">create mask parameter</shortdesc> + <content type="string" default="${OCF_RESKEY_create_mask_default}" /> + </parameter> + + <parameter name="printable" unique="0" required="0"> + <longdesc lang="en"> + If this parameter is yes, then clients may open, write to and submit spool files on the directory specified for the service. + </longdesc> + <shortdesc lang="en">printable parameter</shortdesc> + <content type="string" default="${OCF_RESKEY_printable_default}" /> + </parameter> + + <parameter name="valid_users" unique="0" required="0"> + <longdesc lang="en"> + This is a list of users that should be allowed to login to this service. Names starting with @, + and & are interpreted + using the same rules as described in the invalid users parameter. + </longdesc> + <shortdesc lang="en">valid users parameter</shortdesc> + <content type="string" default="${OCF_RESKEY_valid_users_default}" /> + </parameter> + + <parameter name="force_user" unique="0" required="0"> + <longdesc lang="en"> + This specifies a UNIX user name that will be assigned as the default user for all users connecting to this service. This is useful for sharing files. + </longdesc> + <shortdesc lang="en">force user parameter</shortdesc> + <content type="string" default="${OCF_RESKEY_force_user_default}" /> + </parameter> + + <parameter name="force_group" unique="0" required="0"> + <longdesc lang="en"> + This specifies a UNIX group name that will be assigned as the default primary group for all users connecting to this service. + This is useful for sharing files by ensuring that all access to files on service will use the named group for their permissions checking. + </longdesc> + <shortdesc lang="en">force group parameter</shortdesc> + <content type="string" default="${OCF_RESKEY_force_group_default}" /> + </parameter> + + <parameter name="extraopt" unique="0" required="0"> + <longdesc lang="en"> + This option can be used to define an additional key = value pair. 
+ In this parameter also a semicolon could be placed. + Need to set e.g somthinspecial = value + </longdesc> + <shortdesc lang="en">additional key value pair</shortdesc> + <content type="string" default="${OCF_RESKEY_extraopt_default}" /> + </parameter> + + <parameter name="extraopt_list" unique="0" required="0"> + <longdesc lang="en"> + This option can be used to define multiple additional key = value pairs. + Define the list of element semicolon separated. + e.g somethingspecial = value;one more = value2 + </longdesc> + <shortdesc lang="en">additional key value pairs as semicolon separated list</shortdesc> + <content type="string" default="${OCF_RESKEY_extraopt_list_default}" /> + </parameter> + + <parameter name="outfile" unique="1" required="1"> + <longdesc lang="en"> + Set the path and filename where the snipped should be written. + example "/etc/samba/conf.d/myshare.inc". + </longdesc> + <shortdesc lang="en">outputfile</shortdesc> + <content type="string" default="${OCF_RESKEY_outfile_default}" /> + </parameter> + + <parameter name="confd" unique="0" required="0"> + <longdesc lang="en"> + Set the path where the includes will be written. This folder have to be a tmpfs mount + This defaults to "/etc/samba/conf.d". + </longdesc> + <shortdesc lang="en">confd directory</shortdesc> + <content type="string" default="${OCF_RESKEY_confd_default}" /> + </parameter> + + <parameter name="includesfile" unique="0" required="0"> + <longdesc lang="en"> + Set the path and filename in which the include should be placed. + example includesfile "/etc/samba/conf.d/pacemaker-includes.conf". 
+ This option manages the include= parameter within this file + </longdesc> + <shortdesc lang="en">includesfile for smb.conf</shortdesc> + <content type="string" default="${OCF_RESKEY_includesfile_default}" /> + </parameter> + + </parameters> + + <actions> + <action name="start" timeout="20s" /> + <action name="stop" timeout="20s" /> + <action name="status" depth="0" timeout="20s" interval="10s" /> + <action name="monitor" depth="0" timeout="20s" interval="10s" /> + <action name="meta-data" timeout="5s" /> + <action name="validate-all" timeout="20s" /> + </actions> +</resource-agent> +END + + exit $OCF_SUCCESS +} + +smb_share_addinclude() { + if [ ! -e ${OCF_RESKEY_includesfile} ];then + echo '[global]' > ${OCF_RESKEY_includesfile} + fi + if [ $(grep -c "include = $OCF_RESKEY_outfile" ${OCF_RESKEY_includesfile}) -eq 0 ];then + echo "include = $OCF_RESKEY_outfile" >> ${OCF_RESKEY_includesfile} + fi +} + +smb_share_delinclude() { + ESCAPED=$(echo $OCF_RESKEY_outfile|sed 's,/,\\/,g') + sed -i /include.=.$ESCAPED/d ${OCF_RESKEY_includesfile} +} + +smb_share_create() { + + echo "[${OCF_RESKEY_share}]" > $OCF_RESKEY_outfile + if [ ! -z "$OCF_RESKEY_path" ];then echo " path = $OCF_RESKEY_path" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_hosts_allow" ];then echo " hosts allow = $OCF_RESKEY_hosts_allow" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_browseable" ];then echo " browseable = $OCF_RESKEY_browseable" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_writeable" ];then echo " writeable = $OCF_RESKEY_writeable" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_read_only" ];then echo " read only = $OCF_RESKEY_read_only" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_guest_ok" ];then echo " guest ok = $OCF_RESKEY_guest_ok" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_directory_mask" ];then echo " directory mask = $OCF_RESKEY_directory_mask" >> $OCF_RESKEY_outfile; fi + if [ ! 
-z "$OCF_RESKEY_create_mask" ];then echo " create mask = $OCF_RESKEY_create_mask" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_printable" ];then echo " printable = $OCF_RESKEY_printable" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_valid_users" ];then echo " valid users = $OCF_RESKEY_valid_users" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_force_user" ];then echo " force user = $OCF_RESKEY_force_user" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_force_group" ];then echo " force group = $OCF_RESKEY_force_group" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_extraopt" ];then echo " $OCF_RESKEY_extraopt" >> $OCF_RESKEY_outfile; fi + if [ ! -z "$OCF_RESKEY_extraopt_list" ];then + IFS=';' read -r -a array <<< "$OCF_RESKEY_extraopt_list" + for i in "${array[@]}";do + echo " $i" >> $OCF_RESKEY_outfile; + done + fi + smb_share_addinclude +} + +smb_share_delete() { + if [ -e $OCF_RESKEY_outfile ];then + rm -f $OCF_RESKEY_outfile 2>/dev/null + smb_share_delinclude + exit $OCF_SUCCESS + fi +} + +smb_share_reloadconfig() { + smbcontrol smbd reload-config 2>/dev/null + if [ $? -eq 0 ];then + exit $OCF_SUCCESS + else + ocf_log err "Seems you have an error in your samba configuration" + exit $OCF_ERR_CONFIGURED + fi +} + +smb_share_checktmpmount() { + + ISMOUNT=$(grep $OCF_RESKEY_confd /proc/mounts|grep -c tmpfs) + if [ $ISMOUNT -eq 0 ];then + ocf_log err "The directoy /etc/samba/conf.d need to be a tmpfs mount" + exit $OCF_ERR_CONFIGURED + fi +} + + +###################################################################### + +smb_share_usage() { + cat <<END +usage: $0 {start|stop|status|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. 
+END + return $OCF_SUCCESS +} + +smb_share_start() { + smb_share_create + smb_share_reloadconfig +} + +smb_share_stop() { + smbcontrol smbd close-share $OCF_RESKEY_share > /dev/null 2>&1 + smb_share_delete + smb_share_reloadconfig +} + +smb_share_monitor() { + RES=$(smbcontrol smbd ping > /dev/null 2>&1) + if [ $? -eq 0 ];then + if [ $(testparm -s 2>/dev/null| egrep -c \\[$OCF_RESKEY_share\\]) -eq 1 ];then + return $OCF_SUCCESS + else + return $OCF_NOT_RUNNING + fi + else + return $OCF_NOT_RUNNING + fi +} + +smb_share_state() { + smb_share_checktmpmount + RES=$(smbcontrol smbd ping > /dev/null 2>&1) + if [ $? -eq 0 ];then + if [ $(testparm -s 2>/dev/null| egrep -c \\[$OCF_RESKEY_share\\]) -eq 1 ];then + ocf_log info "Samba share $OCF_RESKEY_share is active" + return $OCF_SUCCESS + else + ocf_log info "Samba share $OCF_RESKEY_share is not active" + return $OCF_NOT_RUNNING + fi + else + ocf_log info "Samba share $OCF_RESKEY_share is not active" + return $OCF_NOT_RUNNING + fi +} + +smb_share_validate() { + return $OCF_SUCCESS +} + + +case $__OCF_ACTION in +meta-data) meta_data + ;; +usage|help) smb_share_usage + ;; +esac + + +case $__OCF_ACTION in +start) smb_share_start + ;; +stop) smb_share_stop + ;; +status) smb_share_state + ;; +monitor) smb_share_monitor + ;; +validate-all) smb_share_validate + ;; +*) smb_share_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +exit $? +# vi:sw=4:ts=8: diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in new file mode 100644 index 0000000..284dec3 --- /dev/null +++ b/heartbeat/storage-mon.in @@ -0,0 +1,399 @@ +#!@BASH_SHELL@ +# +# Copyright (C) 2021 Red Hat, Inc. All rights reserved. +# +# Authors: Christine Caulfield <ccaulfie@redhat.com> +# Fabio M. Di Nitto <fdinitto@redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. 
+# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +# +# Checks storage I/O status of all given drives and writes the #health-storage +# status into the CIB +# Implementation is heavily based on ocf:pacemaker:HealtSMART +# +# It sends a single block on IO to a radom location on the device and reports any errors returned. +# If the IO hangs, that will also be returned. (bear in mind tha tmay also hang the C app in some +# instances). +# +# It's worth making a note in the RA description that the smartmon RA is also recommended (this +# does not replace it), and that Pacemaker health checking should be configued. +# +# https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/singlehtml/index.html#tracking-node-health + +####################################################################### + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# +STORAGEMON=${HA_BIN}/storage_mon +ATTRDUP=${HA_SBIN_DIR}/attrd_updater +PIDFILE=${HA_VARRUN}/storage-mon-${OCF_RESOURCE_INSTANCE}.pid +ATTRNAME="#health-${OCF_RESOURCE_INSTANCE}" + +OCF_RESKEY_CRM_meta_interval_default="0" +OCF_RESKEY_io_timeout_default="10" +OCF_RESKEY_check_interval_default="30" +OCF_RESKEY_inject_errors_default="" +OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state" +OCF_RESKEY_daemonize_default="false" + +# Explicitly list all environment variables used, to make static analysis happy +: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}} +: ${OCF_RESKEY_drives:=""} +: ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}} +: ${OCF_RESKEY_check_interval:=${OCF_RESKEY_check_interval_default}} +: ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}} +: ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}} +: ${OCF_RESKEY_daemonize:=${OCF_RESKEY_daemonize_default}} + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="storage-mon" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +System health agent that checks the storage I/O status of the given drives and +updates the #health-storage attribute. Usage is highly recommended in combination +with the HealthSMART monitoring agent. The agent currently support a maximum of 25 +devices per instance. +</longdesc> +<shortdesc lang="en">storage I/O health status</shortdesc> + +<parameters> + +<parameter name="state_file" unique="1"> +<longdesc lang="en"> +Location to store the resource state in. 
+</longdesc> +<shortdesc lang="en">State file</shortdesc> +<content type="string" default="${OCF_RESKEY_state_file_default}" /> +</parameter> + +<parameter name="drives" unique="1" required="1"> +<longdesc lang="en"> +The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda". +</longdesc> +<shortdesc lang="en">Drives to check</shortdesc> +<content type="string" default="" /> +</parameter> + +<parameter name="io_timeout" unique="0"> +<longdesc lang="en"> +Specify disk I/O timeout in seconds. Minimum 1, recommended 10 (default). +</longdesc> +<shortdesc lang="en">Disk I/O timeout</shortdesc> +<content type="integer" default="${OCF_RESKEY_io_timeout_default}" /> +</parameter> + +<parameter name="check_interval" unique="0"> +<longdesc lang="en"> +Specify interval between I/O checks in seconds.(Only supported with the damonize option.) +</longdesc> +<shortdesc lang="en">I/O check interval</shortdesc> +<content type="integer" default="${OCF_RESKEY_check_interval_default}" /> +</parameter> + +<parameter name="inject_errors" unique="0"> +<longdesc lang="en"> +Used only for testing! Specify % of I/O errors to simulate drives failures. +</longdesc> +<shortdesc lang="en">Specify % of I/O errors to simulate drives failures</shortdesc> +<content type="integer" default="${OCF_RESKEY_inject_errors_default}" /> +</parameter> + +<parameter name="daemonize" unique="0"> +<longdesc lang="en"> +Specifies to start storage-mon as a daemon and check for devices. 
+</longdesc> +<shortdesc lang="en">start storage-mon with daemon</shortdesc> +<content type="boolean" default="${OCF_RESKEY_daemonize_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="10s" /> +<action name="stop" timeout="120s" /> +<action name="monitor" timeout="120s" interval="30s" start-delay="0s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="10s" /> +</actions> +</resource-agent> +END + return $OCF_SUCCESS +} + +####################################################################### + +storage-mon_usage() { + cat <<END +usage: $0 {start|stop|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. +END + return $1 +} + +storage-mon_init() { + #Test for presence of storage_mon helper + if [ ! -x "$STORAGEMON" ] ; then + ocf_log err "${STORAGEMON} not installed." + exit $OCF_ERR_INSTALLED + fi + + if [ ! -x "$ATTRDUP" ] ; then + ocf_log err "${ATTRDUP} not installed." + exit $OCF_ERR_INSTALLED + fi + + i=0 + for DRIVE in ${OCF_RESKEY_drives}; do + if [ ! -e "$DRIVE" ] ; then + ocf_log err "${DRIVE} not found on the system" + exit $OCF_ERR_INSTALLED + fi + i=$((i + 1)) + done + + if [ "$i" -gt "25" ]; then + ocf_log err "Too many drives ($i) configured for this agent. Max 25." + exit $OCF_ERR_CONFIGURED + fi + + if [ "${OCF_RESKEY_io_timeout}" -lt "1" ]; then + ocf_log err "Minimum timeout is 1. Recommended ${OCF_RESKEY_io_timeout_default} (default)." + exit $OCF_ERR_CONFIGURED + fi + + if [ "${OCF_RESKEY_check_interval}" -lt "1" ]; then + ocf_log err "Minimum interval to check is 1. default ${OCF_RESKEY_check_interval_default}." + exit $OCF_ERR_CONFIGURED + fi + + if [ -n "${OCF_RESKEY_inject_errors}" ]; then + if [ "${OCF_RESKEY_inject_errors}" -lt "1" ] || [ "${OCF_RESKEY_inject_errors}" -gt "100" ]; then + ocf_log err "Inject errors % has to be a value between 1 and 100." 
+ exit $OCF_ERR_CONFIGURED + fi + fi +} + +storage-mon_update_attribute() { + + while : + do + "$ATTRDUP" -n ${ATTRNAME} -U "$1" -d "5s" + rc=$? + if [ $rc -eq 0 ]; then + break + fi + + ocf_log debug "${1} attribute by attrd_updater failed" + if [ "$1" = "red" ]; then + # If the attrd_updater fails with the red attribute, return an error to let pacemaker handle the failure immediately. + return $OCF_ERR_GENERIC + fi + done + return $OCF_SUCCESS +} + +storage-mon_monitor() { + if ! ocf_is_true "$OCF_RESKEY_daemonize"; then + storage-mon_init + + # Monitor _MUST!_ differentiate correctly between running + # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING). + # That is THREE states, not just yes/no. + + if [ ! -f "${OCF_RESKEY_state_file}" ]; then + return $OCF_NOT_RUNNING + fi + + # generate command line + cmdline="" + for DRIVE in ${OCF_RESKEY_drives}; do + cmdline="$cmdline --device $DRIVE --score 1" + done + cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}" + if [ -n "${OCF_RESKEY_inject_errors}" ]; then + cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}" + fi + $STORAGEMON $cmdline + if [ $? -ne 0 ]; then + status="red" + else + status="green" + fi + + storage-mon_update_attribute $status + return "$?" + else + ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1 + case "$?" in + 0) rc=$OCF_SUCCESS;; + 1|2) rc=$OCF_NOT_RUNNING;; + *) rc=$OCF_ERR_GENERIC;; + esac + + if [ $rc -ne $OCF_SUCCESS ]; then + return "$rc" + fi + if [ "$1" = "pid_check_only" ]; then + return "$rc" + fi + + # generate client command line + cmdline="" + cmdline="$cmdline --client --attrname ${ATTRNAME}" + while : + do + # 0 : Normal. + # greater than 0 : monitoring error. + # 255(-1) : communication system error. + # 254(-2) : Not all checks completed for first device in daemon mode. + $STORAGEMON $cmdline + rc=$? 
+ case "$rc" in + 254|255) + # If there is a communication error or the initial check of all devices has not been completed, + # it will loop and try to reconnect. + # When everything ends with a communication error during monitor, a monitor timeout occurs. + ocf_log debug "client monitor error : $rc" + ;; + 0) + status="green" + break + ;; + *) + status="red" + break + ;; + esac + done + + storage-mon_update_attribute $status + return "$?" + fi +} + +storage-mon_start() { + if ! ocf_is_true "$OCF_RESKEY_daemonize"; then + storage-mon_monitor + if [ $? -eq $OCF_SUCCESS ]; then + return $OCF_SUCCESS + fi + touch "${OCF_RESKEY_state_file}" + else + storage-mon_init + # generate command line + cmdline="" + for DRIVE in ${OCF_RESKEY_drives}; do + cmdline="$cmdline --device $DRIVE --score 1" + done + cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME}" + if [ -n "${OCF_RESKEY_inject_errors}" ]; then + cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}" + fi + $STORAGEMON $cmdline + if [ "$?" -ne 0 ]; then + return $OCF_ERR_GENERIC + fi + fi +} + +storage-mon_stop() { + storage-mon_monitor + rc=$? + + if ! ocf_is_true "$OCF_RESKEY_daemonize"; then + if [ $rc -eq $OCF_SUCCESS ]; then + rm "${OCF_RESKEY_state_file}" + fi + else + case "$rc" in + $OCF_SUCCESS) + ;; + $OCF_NOT_RUNNING) + return "$OCF_SUCCESS";; + *) + return "$rc";; + esac + + kill -TERM $(cat "${PIDFILE}") + if [ "$?" -ne 0 ]; then + return $OCF_ERR_GENERIC + fi + + while true; do + storage-mon_monitor pid_check_only + rc="$?" + case "$rc" in + $OCF_SUCCESS) + ;; + $OCF_NOT_RUNNING) + return "$OCF_SUCCESS";; + *) + return "$rc";; + esac + sleep 1 + done + fi + return $OCF_SUCCESS +} + +storage-mon_validate() { + storage-mon_init + + if ! ocf_is_true "$OCF_RESKEY_daemonize"; then + # Is the state directory writable? 
+ state_dir=$(dirname "${OCF_RESKEY_state_file}") + touch "$state_dir/$$" + if [ $? -ne 0 ]; then + return $OCF_ERR_CONFIGURED + fi + rm "$state_dir/$$" + fi + + return $OCF_SUCCESS +} + +case "$__OCF_ACTION" in + start) storage-mon_start;; + stop) storage-mon_stop;; + monitor) storage-mon_monitor;; + validate-all) storage-mon_validate;; + meta-data) meta_data;; + usage|help) storage-mon_usage $OCF_SUCCESS;; + *) storage-mon_usage $OCF_ERR_UNIMPLEMENTED;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc +# vim: set filetype=sh: diff --git a/heartbeat/sybaseASE.in b/heartbeat/sybaseASE.in new file mode 100755 index 0000000..8b31502 --- /dev/null +++ b/heartbeat/sybaseASE.in @@ -0,0 +1,905 @@ +#!@BASH_SHELL@ +# +# Sybase Availability Agent for Red Hat Cluster v15.0.2 +# Copyright (C) - 2007 +# Sybase, Inc. All rights reserved. +# +# Sybase Availability Agent for Red Hat Cluster v15.0.2 is licensed +# under the GNU General Public License Version 2. +# +# Author(s): +# Jian-ping Hui <jphui@sybase.com> +# +# Description: Service script for starting/stopping/monitoring \ +# Sybase Adaptive Server on: \ +# Red Hat Enterprise Linux 7 ES \ +# Red Hat Enterprise Linux 7 AS +# +# NOTES: +# +# (1) Before running this script, we assume that user has installed +# Sybase ASE 15.0.2 or higher version on the machine. Please +# customize your configuration in /etc/cluster/cluster.conf according +# to your actual environment. 
We assume the following files exist before +# you start the service: +# /$sybase_home/SYBASE.sh +# /$sybase_home/$sybase_ase/install/RUN_$server_name +# +# (2) You can customize the interval value in the meta-data section if needed: +# <action name="start" timeout="300s" /> +# <action name="stop" timeout="300s" /> +# +# <!-- Checks to see if it''s mounted in the right place --> +# <action name="status" interval="30s" timeout="100s" /> +# <action name="monitor" interval="30s" timeout="100s" /> +# +# <!--Checks to see if we can read from the mountpoint --> +# <action name="status" depth="10" timeout="100s" interval="120s" /> +# <action name="monitor" depth="10" timeout="100s" interval="120s" /> +# +# <action name="meta-data" timeout="5s" /> +# <action name="validate-all" timeout="5s" /> +# The timeout value is not supported by Redhat in RHCS5.0. +# + +####################################################################### +# Initialization: + +if [ -f /etc/init.d/functions ]; then + . /etc/init.d/functions +fi +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + +# Default timeouts when we aren't using the rgmanager wrapper +if ! 
ocf_is_true "$OCF_RESKEY_is_rgmanager_wrapper"; then + if [ -z "$OCF_RESKEY_CRM_meta_timeout" ]; then + case $1 in + start|stop) OCF_RESKEY_CRM_meta_timeout=300000 ;; + *) OCF_RESKEY_CRM_meta_timeout=100000 ;; + esac + fi + default_timeout=$(((${OCF_RESKEY_CRM_meta_timeout}/1000) - 5)) + default_force_stop_timeout=$(((${OCF_RESKEY_CRM_meta_timeout}/1000) - 5)) + : ${OCF_RESKEY_shutdown_timeout=${default_force_stop_timeout}} + : ${OCF_RESKEY_deep_probe_timeout=${default_timeout}} + : ${OCF_RESKEY_start_timeout=${default_timeout}} +fi + +sybase_user_default="sybase" +sybase_home_default="detect" +ase_default="detect" +ocs_default="detect" + +: ${OCF_RESKEY_sybase_user=${sybase_user_default}} +: ${OCF_RESKEY_sybase_ase=${ase_default}} +: ${OCF_RESKEY_sybase_ocs=${ocs_default}} +: ${OCF_RESKEY_sybase_home=${sybase_home_default}} + +if [ "$__OCF_ACTION" != "meta-data" ]; then + if [ "$OCF_RESKEY_sybase_home" = "detect" ]; then + if [ -d "/opt/sap" ]; then + OCF_RESKEY_sybase_home="/opt/sap" + elif [ -d "/opt/sybase" ]; then + OCF_RESKEY_sybase_home="/opt/sybase" + else + ocf_log err "sybaseASE: Unable to detect 'sybase_home'." + exit $OCF_ERR_ARGS + fi + fi + + sybase_env="$OCF_RESKEY_sybase_home/SYBASE.env" + + if [ "$OCF_RESKEY_sybase_ase" = "detect" ]; then + if [ -f "$sybase_env" ]; then + OCF_RESKEY_sybase_ase=$(grep "SYBASE_ASE" "$sybase_env" | cut -d= -f2) + else + ocf_log err "sybaseASE: Unable to detect 'sybase_ase'." + exit $OCF_ERR_ARGS + fi + fi + + if [ "$OCF_RESKEY_sybase_ocs" = "detect" ]; then + if [ -f "$sybase_env" ]; then + OCF_RESKEY_sybase_ocs=$(grep "SYBASE_OCS" "$sybase_env" | cut -d= -f2) + else + ocf_log err "sybaseASE: Unable to detect 'sybase_ocs'." 
+ exit $OCF_ERR_ARGS + fi + fi +fi + + +interfaces_file_default="${OCF_RESKEY_sybase_home}/interfaces" +: ${OCF_RESKEY_interfaces_file=${interfaces_file_default}} + +if [ $__OCF_ACTION != "meta-data" ]; then + logfile_default="$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase/install/$OCF_RESKEY_server_name.log" +else + logfile_default="detect" +fi +: ${OCF_RESKEY_logfile=${logfile_default}} + +export LD_POINTER_GUARD=0 + +####################################################################################### +# Declare some variables we will use in the script. # +####################################################################################### +declare login_string="" +declare RUNSERVER_SCRIPT=$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase/install/RUN_$OCF_RESKEY_server_name +declare CONSOLE_LOG="$OCF_RESKEY_logfile" + +################################################################################################## +# This function will be called by Pacemaker to get the meta data of resource agent "sybaseASE". 
# +################################################################################################## +meta_data() +{ + cat <<EOT +<?xml version="1.0" ?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="sybaseASE" version="1.0"> + <version>1.0</version> + + <longdesc lang="en"> + Sybase ASE Failover Instance + </longdesc> + <shortdesc lang="en"> + Sybase ASE Failover Instance + </shortdesc> + + <parameters> + <parameter name="sybase_home"> + <longdesc lang="en"> + The home directory of sybase products + </longdesc> + <shortdesc lang="en"> + SYBASE home directory + </shortdesc> + <content type="string" default="${sybase_home_default}"/> + </parameter> + + <parameter name="sybase_ase"> + <longdesc lang="en"> + The directory name under sybase_home where ASE products are installed + </longdesc> + <shortdesc lang="en"> + SYBASE_ASE directory name + </shortdesc> + <content type="string" default="$ase_default" /> + </parameter> + + <parameter name="sybase_ocs"> + <longdesc lang="en"> + The directory name under sybase_home where OCS products are installed, i.e. 
ASE-15_0 + </longdesc> + <shortdesc lang="en"> + SYBASE_OCS directory name + </shortdesc> + <content type="string" default="${ocs_default}" /> + </parameter> + + <parameter name="server_name" unique="1" required="1"> + <longdesc lang="en"> + The ASE server name which is configured for the HA service + </longdesc> + <shortdesc lang="en"> + ASE server name + </shortdesc> + <content type="string" /> + </parameter> + + <parameter name="interfaces_file"> + <longdesc lang="en"> + The full path of interfaces file which is used to start/access the ASE server + </longdesc> + <shortdesc lang="en"> + Interfaces file + </shortdesc> + <content type="string" default="$interfaces_file_default"/> + </parameter> + + <parameter name="sybase_user"> + <longdesc lang="en"> + The user who can run ASE server + </longdesc> + <shortdesc lang="en"> + Sybase user + </shortdesc> + <content type="string" default="$sybase_user_default" /> + </parameter> + + <parameter name="db_user" required="1"> + <longdesc lang="en"> + The database user required to login to isql. + </longdesc> + <shortdesc lang="en"> + Sybase user + </shortdesc> + <content type="string"/> + </parameter> + + <parameter name="db_passwd"> + <longdesc lang="en"> + The database user's password required to login to isql. 
+ </longdesc> + <shortdesc lang="en"> + Sybase user + </shortdesc> + <content type="string"/> + </parameter> + + <parameter name="logfile"> + <longdesc lang="en"> + Logfile + </longdesc> + <shortdesc lang="en"> + Logfile + </shortdesc> + <content type="string" default="$logfile_default" /> + </parameter> + + </parameters> + + <actions> + <action name="start" timeout="300s" /> + <action name="stop" timeout="300s" /> + + <!-- Checks to see if it''s mounted in the right place --> + <action name="status" interval="30s" timeout="100s" /> + <action name="monitor" interval="30s" timeout="100s" /> + + <!--Checks to see if we can read from the mountpoint --> + <action name="status" depth="10" timeout="100s" interval="120s" /> + <action name="monitor" depth="10" timeout="100s" interval="120s" /> + + <action name="meta-data" timeout="5s" /> + <action name="validate-all" timeout="5s" /> + </actions> +</resource-agent> +EOT +} + +ase_engine0_process() +{ + sed -n -e '/engine 0/s/^.*os pid \([0-9]*\).*online$/\1/p' $CONSOLE_LOG +} + +ase_engine0_thread() +{ + sed -n -e 's/.*Thread.*LWP \([0-9]*\).*online as engine 0.*/\1/p' $CONSOLE_LOG +} + +ase_engine_threadpool_pid() +{ + sed -n -e 's/.*Adaptive Server is running as process id \([0-9]*\).*/\1/p' $CONSOLE_LOG +} + +ase_all_pids() +{ + local PIDS=$(sed -n -e '/engine /s/^.*os pid \([0-9]*\).*online$/\1/p' $CONSOLE_LOG) + if [ -z "$PIDS" ]; then + #engines are running in a threadpool + PIDS=$(ase_engine_threadpool_pid) + fi + echo $PIDS +} + +################################################################################################## +# Function Name: verify_all # +# Parameter: None # +# Return value: # +# 0 SUCCESS # +# OCF_ERR_ARGS Parameters are invalid # +# Description: Do some validation on the user-configurable stuff at the beginning of the script. 
#
##################################################################################################
verify_all()
{
	# Validates every user-configurable parameter before any action runs.
	# Returns $OCF_SUCCESS when all checks pass, otherwise logs the first
	# failing check and returns $OCF_ERR_ARGS.
	ocf_log debug "sybaseASE: Start 'verify_all'"

	check_binary "ksh"

	# Check if the parameter 'sybase_home' is set.
	if [[ -z "$OCF_RESKEY_sybase_home" ]]; then
		ocf_log err "sybaseASE: The parameter 'sybase_home' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'sybase_home' is a valid path.
	if [[ ! -d "$OCF_RESKEY_sybase_home" ]]; then
		ocf_log err "sybaseASE: The sybase_home '$OCF_RESKEY_sybase_home' doesn't exist."
		return $OCF_ERR_ARGS
	fi

	# Check if the script file SYBASE.sh exists
	if [[ ! -f "$OCF_RESKEY_sybase_home/SYBASE.sh" ]]; then
		ocf_log err "sybaseASE: The file $OCF_RESKEY_sybase_home/SYBASE.sh is required to run this script. Failed to run the script."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'sybase_ase' is set.
	if [[ -z "$OCF_RESKEY_sybase_ase" ]]; then
		ocf_log err "sybaseASE: The parameter 'sybase_ase' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the directory /$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase exists.
	if [[ ! -d "$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase" ]]; then
		ocf_log err "sybaseASE: The directory '$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ase' doesn't exist."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'sybase_ocs' is set.
	if [[ -z "$OCF_RESKEY_sybase_ocs" ]]; then
		ocf_log err "sybaseASE: The parameter 'sybase_ocs' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the directory /$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ocs exists.
	if [[ ! -d "$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ocs" ]]; then
		ocf_log err "sybaseASE: The directory '$OCF_RESKEY_sybase_home/$OCF_RESKEY_sybase_ocs' doesn't exist."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'server_name' is set.
	if [[ -z "$OCF_RESKEY_server_name" ]]; then
		ocf_log err "sybaseASE: The parameter 'server_name' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the Run_server file exists.
	if [[ ! -f "$RUNSERVER_SCRIPT" ]]; then
		ocf_log err "sybaseASE: The file $RUNSERVER_SCRIPT doesn't exist. The sybase directory may be incorrect."
		return $OCF_ERR_ARGS
	fi

	# Check if the user 'sybase_user' exist.
	# The argument is quoted so that an empty value is rejected instead of
	# silently resolving to the calling user, and the uid is not echoed.
	if ! id -u "$OCF_RESKEY_sybase_user" > /dev/null 2>&1; then
		ocf_log err "sybaseASE: The user '$OCF_RESKEY_sybase_user' doesn't exist in the system."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'interfaces_file' is set
	if [[ -z "$OCF_RESKEY_interfaces_file" ]]; then
		ocf_log err "sybaseASE: The parameter 'interfaces_file' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the file 'interfaces_file' exists
	if [[ ! -f "$OCF_RESKEY_interfaces_file" ]]; then
		ocf_log err "sybaseASE: The interfaces file '$OCF_RESKEY_interfaces_file' doesn't exist."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'db_user' is set
	if [[ -z "$OCF_RESKEY_db_user" ]]; then
		ocf_log err "sybaseASE: The parameter 'db_user' is not set."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'shutdown_timeout' is a valid value
	# (an unset value evaluates to 0 inside [[ ]] and is rejected as well).
	if [[ $OCF_RESKEY_shutdown_timeout -eq 0 ]]; then
		ocf_log err "sybaseASE: The parameter 'shutdown_timeout' is not set. Its value cannot be zero."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'start_timeout' is a valid value
	if [[ $OCF_RESKEY_start_timeout -eq 0 ]]; then
		ocf_log err "sybaseASE: The parameter 'start_timeout' is not set. Its value cannot be zero."
		return $OCF_ERR_ARGS
	fi

	# Check if the parameter 'deep_probe_timeout' is a valid value
	if [[ $OCF_RESKEY_deep_probe_timeout -eq 0 ]]; then
		ocf_log err "sybaseASE: The parameter 'deep_probe_timeout' is not set. Its value cannot be zero."
		return $OCF_ERR_ARGS
	fi

	ocf_log debug "sybaseASE: End 'verify_all' successfully."

	return $OCF_SUCCESS
}

# Builds the isql credential arguments from db_user/db_passwd and stores
# them in the global 'login_string'. Always returns 0.
set_login_string()
{
	tmpstring=""
	# Reset the real variable (the original cleared a misspelt 'login_sting').
	login_string=""

	# NOTE(review): the password ends up on the isql command line.
	login_string="-U$OCF_RESKEY_db_user -P$OCF_RESKEY_db_passwd"
	return 0
}

##############################################################################################
# Function name: ase_start
# Parameter: None
# Return value:
#   $OCF_SUCCESS      the server is (already) running
#   $OCF_ERR_GENERIC  the server did not come up within start_timeout
# Description: This function is used to start the ASE server in primary or secondary server.
##############################################################################################
ase_start()
{
	local t

	ocf_log debug "sybaseASE: Start 'ase_start'"

	# Check if the server is running. If yes, return SUCCESS directly. Otherwise, continue the start work.
	if ase_is_running; then
		# The server is running.
		ocf_log info "sybaseASE: Server is running. Start is success."
		return $OCF_SUCCESS
	fi

	# The server is not running. We need to start it.
	# If the log file existed, delete it.
	if [[ -f "$CONSOLE_LOG" ]]; then
		rm -f "$CONSOLE_LOG"
	fi

	ocf_log debug "sybaseASE: Starting '$OCF_RESKEY_server_name'..."

	# Run runserver script to start the server. Since this script will be run by root and ASE server
	# needs to be run by another user, we need to change the user to sybase_user first. Then, run
	# the script to start the server.
	su $OCF_RESKEY_sybase_user -c ksh << EOF
	# set required SYBASE environment by running SYBASE.sh.
	. $OCF_RESKEY_sybase_home/SYBASE.sh
	# Run the RUNSERVER_SCRIPT to start the server.
	. $RUNSERVER_SCRIPT > $CONSOLE_LOG 2>&1 &
EOF

	# Monitor every 1 seconds if the server has
	# recovered, until RECOVERY_TIMEOUT.
	t=0
	while [[ $t -le $OCF_RESKEY_start_timeout ]]
	do
		if grep -qs "Recovery complete." "$CONSOLE_LOG"; then
			# The server has completed the recovery.
			ocf_log info "sybaseASE: ASE server '$OCF_RESKEY_server_name' started successfully."
			break
		fi
		# The server has not completed the recovery. We need to continue to monitor the recovery
		# process.
		t=$((t + 1))
		sleep 1
	done

	# If $t is larger than start_timeout, it means the ASE server cannot start in given time. Otherwise, it
	# means the ASE server has started successfully.
	if [[ $t -gt $OCF_RESKEY_start_timeout ]]; then
		# The server cannot start in specified time. We think the start is failed.
		ocf_log err "sybaseASE: Failed to start ASE server '$OCF_RESKEY_server_name'. Please check the server error log $CONSOLE_LOG for possible problems."
		return $OCF_ERR_GENERIC
	fi

	if ! ase_is_running; then
		ocf_log err "sybaseASE: ase_start could not detect database initialized properly."

		return $OCF_ERR_GENERIC
	fi
	ocf_log debug "sybaseASE: End 'ase_start' successfully."
	return $OCF_SUCCESS
}

#############################################################################################
# Function name: ase_stop
# Parameter: None
# Return value:
#   $OCF_SUCCESS      the server is stopped (or was not running)
#   $OCF_ERR_GENERIC  no engine process id could be read from the console log
# Description: This function is used to stop the ASE server in primary or secondary server.
#############################################################################################
ase_stop()
{
	local t idx pid job

	ocf_log debug "sybaseASE: Start 'ase_stop'"

	# Check if the ASE server is still running.
	if ! ase_is_running; then
		# The ASE server is not running. We need not to shutdown it.
		ocf_log info "sybaseASE: The dataserver $OCF_RESKEY_server_name is not running."
		return $OCF_SUCCESS
	fi

	set_login_string

	# Just in case things are hung, start a process that will wait for the
	# timeout period, then kill any remaining processes. We'll need to
	# monitor this process (set -m), so we can terminate it later if it is
	# not needed.
	set -m
	kill_ase $OCF_RESKEY_shutdown_timeout &
	KILL_PID=$!	# If successful, we will also terminate watchdog process

	# Run "shutdown with nowait" from isql command line to shutdown the server
	su $OCF_RESKEY_sybase_user -c ksh << EOF
	# set required SYBASE environment by running SYBASE.sh.
	. $OCF_RESKEY_sybase_home/SYBASE.sh
	# Run "shutdown with nowait" to shutdown the server immediately.
	(echo "use master" ; echo go ; echo "shutdown with nowait"; echo go) | \
	\$SYBASE/\$SYBASE_OCS/bin/isql $login_string -S$OCF_RESKEY_server_name -I$OCF_RESKEY_interfaces_file &
EOF

	sleep 5

	# Check if the server has been shut down successfully
	t=0
	while [[ $t -lt $OCF_RESKEY_shutdown_timeout ]]
	do
		# Search "ueshutdown: exiting" in the server log. If found, it means the server has been shut down.
		# Otherwise, we need to wait.
		if tail "$CONSOLE_LOG" | grep "ueshutdown: exiting" > /dev/null 2>&1; then
			# The shutdown is success.
			ocf_log info "sybaseASE: ASE server '$OCF_RESKEY_server_name' shutdown with isql successfully."
			break
		fi
		# The shutdown is still in processing. Wait...
		sleep 2
		t=$((t + 2))
	done

	# If $t is larger than shutdown_timeout, it means the ASE server cannot be shut down in given time. We need
	# to wait for the background kill process to kill the OS processes directly.
	if [[ $t -ge $OCF_RESKEY_shutdown_timeout ]]; then
		ocf_log err "sybaseASE: Shutdown of '$OCF_RESKEY_server_name' from isql failed. Server is either down or unreachable."
	fi

	# Here, the ASE server has been shut down by isql command or killed by background process. We need to do
	# further check to make sure all processes have gone away before saying shutdown is complete. This stops the
	# other node from starting up the package before it has been stopped and the file system has been unmounted.

	# Get all processes ids from log file. The command substitution is
	# deliberately unquoted inside ( ) so that every pid becomes its own
	# array element and an empty result yields an empty array.
	declare -a ENGINE_ALL=( $(ase_all_pids) )

	# We cannot find any process id from log file. It may be because the log file is corrupted or be deleted.
	# In this case, we determine the shutdown is failed.
	if [[ ${#ENGINE_ALL[@]} -lt 1 ]]; then
		ocf_log err "sybaseASE: Unable to find the process id from $CONSOLE_LOG."
		ocf_log err "sybaseASE: Stop ASE server failed."
		return $OCF_ERR_GENERIC
	fi

	# Monitor the system processes to make sure all ASE related processes have gone away.
	while true
	do
		# To every engine process, search it in system processes list. If it is not in the
		# list, it means this process has gone away. Otherwise, we need to wait for it is
		# killed by background process.
		for idx in "${!ENGINE_ALL[@]}"
		do
			pid=${ENGINE_ALL[$idx]}
			# Compare the PID column exactly; a substring match would let
			# pid 123 be kept alive by an unrelated pid 1234.
			if ! ps -fu "$OCF_RESKEY_sybase_user" | awk -v pid="$pid" '$2 == pid { found = 1 } END { exit !found }'; then
				ocf_log debug "sybaseASE: $pid process has stopped."
				unset "ENGINE_ALL[$idx]"
			fi
		done

		# To here, all processes should have gone away.
		if [[ ${#ENGINE_ALL[@]} -lt 1 ]]; then
			#
			# Looks like shutdown was successful, so kill the
			# script to kill any hung processes, which we started earlier.
			# Check to see if the script is still running. If jobs
			# returns that the script is done, then we don't need to kill
			# it.
			#
			job=$(jobs | grep -v Done)
			if [[ ${job} != "" ]]; then
				ocf_log debug "sybaseASE: Killing the kill_ase script."

				kill -15 $KILL_PID > /dev/null 2>&1
			fi
			break
		fi
		sleep 5
	done

	ocf_log debug "sybaseASE: End 'ase_stop'."

	return $OCF_SUCCESS
}

####################################################################################
# Function name: ase_is_running #
# Parameter: None #
# Return value: #
# 0 ASE server is running #
# 1 ASE server is not running or there are errors #
# Description: This function is used to check if the ASE server is still running .
####################################################################################
# Function name: kill_ase                                                          #
# Parameter:                                                                       #
#   DELAY  The seconds to wait before killing the ASE processes. 0 (or no          #
#          argument) means kill the ASE processes immediately.                     #
# Return value:                                                                    #
#   $OCF_SUCCESS      A kill was attempted for every process id that was found     #
#   $OCF_ERR_GENERIC  ase_all_pids returned no process id                          #
# Description: Waits DELAY seconds and then sends SIGKILL to every process id      #
#              reported by ase_all_pids.                                           #
####################################################################################
kill_ase()
{
    local t=0
    local pid

    ocf_log debug "sybaseASE: Start 'kill_ase'."

    # DELAY stays a plain (non-local) assignment, exactly as before.
    DELAY=${1:-0}

    # Wait in one-second steps before sending the kill signal.
    # Shell arithmetic replaces `expr $t+1`, which (lacking spaces) produced the
    # string "0+1" and only worked because [[ -lt ]] evaluates it arithmetically.
    while [[ $t -lt $DELAY ]]
    do
        sleep 1
        t=$(( t + 1 ))
    done

    # Collect the process ids. The assignment form is kept untouched because its
    # parsing depends on the output format of ase_all_pids, which is defined
    # elsewhere -- NOTE(review): presumably it prints a "(pid ...)" list.
    declare -a ENGINE_ALL=$(ase_all_pids)

    # Without any process id there is nothing to kill.
    if [[ ${#ENGINE_ALL[@]} -lt 1 ]]
    then
        ocf_log err "sybaseASE: Unable to find the process id from $CONSOLE_LOG."
        return $OCF_ERR_GENERIC
    fi

    # Kill the dataserver process(es). Signal 9 is SIGKILL; the log message used
    # to claim SIGTERM.
    for pid in "${ENGINE_ALL[@]}"
    do
        if kill -9 $pid > /dev/null 2>&1
        then
            ocf_log info "sybaseASE: kill_ase function did find process $pid running.  Sent SIGKILL."
        else
            ocf_log info "sybaseASE: kill_ase function did NOT find process $pid running."
        fi
    done

    ocf_log debug "sybaseASE: End 'kill_ase'."
    return $OCF_SUCCESS
}
####################################################################################
# Function name: deep_probe                                                        #
# Parameter: None                                                                  #
# Return value:                                                                    #
#   0      ASE server is alive                                                     #
#   1      ASE server is down                                                      #
# Description: Runs "select 1" through isql as $OCF_RESKEY_sybase_user and waits   #
#              up to $OCF_RESKEY_deep_probe_timeout seconds for the expected       #
#              "(1 row affected)" output. On timeout the isql process is killed.   #
####################################################################################
deep_probe()
{
    local probe_pid
    declare -i t=0

    ocf_log debug "sybaseASE: Start 'deep_probe'."

    # Reserve two unique temporary file names for this probe:
    #   tmpfile1 receives the isql output, tmpfile2 the pid of the isql process.
    tmpfile1="$(mktemp /tmp/sybaseASE.1.XXXXXX)"
    tmpfile2="$(mktemp /tmp/sybaseASE.2.XXXXXX)"

    set_login_string

    # The files are removed again before the su'ed shell writes to them.
    # NOTE(review): presumably so that the sybase user can create them itself;
    # this leaves a window in which the names are unprotected -- confirm.
    rm -f "$tmpfile1" "$tmpfile2"

    # Run the isql command in the background as the sybase user.
    su $OCF_RESKEY_sybase_user -c ksh << EOF
    # set required SYBASE environment by running SYBASE.sh.
    . $OCF_RESKEY_sybase_home/SYBASE.sh
    # Run a very simple SQL statement to make sure the server is still ok. The output will be put to
    # tmpfile1.
    (echo "select 1"; echo "go") |
    \$SYBASE/\$SYBASE_OCS/bin/isql $login_string -S$OCF_RESKEY_server_name -I$OCF_RESKEY_interfaces_file -t $OCF_RESKEY_deep_probe_timeout -e -o$tmpfile1 &
    # Record the isql command process id to temporary file. If the isql is hung, we need this process id
    # to kill the hung process.
    echo \$! > $tmpfile2
EOF

    # Poll the output file once per second.
    while [[ $t -lt $OCF_RESKEY_deep_probe_timeout ]]
    do
        # A successful run produces:
        #   1> select 1
        #
        #    -----------
        #              1
        #
        #   (1 row affected)
        # so success is detected by the "(1 row affected)" line. grep runs quietly:
        # it must not echo the match, nor complain while the file does not exist yet.
        if grep -q "(1 row affected)" "$tmpfile1" 2> /dev/null
        then
            ocf_log debug "sybaseASE: Deep probe sucess."
            break
        fi
        sleep 1
        t=$(( t + 1 ))
    done

    # Reaching the timeout means isql did not answer in time: the probe failed
    # and the (possibly hung) isql process has to be killed by hand.
    if [[ $t -ge $OCF_RESKEY_deep_probe_timeout ]]
    then
        ocf_log err "sybaseASE: Deep probe fail. The dataserver has no response."

        # Read the process id of the isql process from tmpfile2 (may be missing).
        probe_pid=$(awk '{print $1}' "$tmpfile2" 2> /dev/null)

        rm -f "$tmpfile1" "$tmpfile2"

        # Only signal when a pid was actually recorded; a bare "kill -9" is an error.
        if [[ -n $probe_pid ]]
        then
            kill -9 $probe_pid > /dev/null 2>&1
        fi
        return 1
    fi

    rm -f "$tmpfile1" "$tmpfile2"

    ocf_log debug "sybaseASE: End 'deep_probe'."

    return 0
}
Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="symlink" version="1.1"> +<version>1.0</version> + +<longdesc lang="en"> +This resource agent that manages a symbolic link (symlink). + +It is primarily intended to manage configuration files which should be +enabled or disabled based on where the resource is running, such as +cron job definitions and the like. +</longdesc> +<shortdesc lang="en">Manages a symbolic link</shortdesc> +<parameters> +<parameter name="link" required="1"> +<longdesc lang="en"> +Full path of the symbolic link to be managed. This must obviously be +in a filesystem that supports symbolic links. +</longdesc> +<shortdesc lang="en">Full path of the symlink</shortdesc> +<content type="string"/> +</parameter> +<parameter name="target" required="1"> +<longdesc lang="en"> +Full path to the link target (the file or directory which the symlink points to). +</longdesc> +<shortdesc lang="en">Full path to the link target</shortdesc> +<content type="string" /> +</parameter> +<parameter name="backup_suffix"> +<longdesc lang="en"> +A suffix to append to any files that the resource agent moves out of +the way because they clash with "link". 
symlink_monitor() {
    # Determine the state of the managed symlink.
    #
    # * If $OCF_RESKEY_link does not exist, the resource is stopped
    #   ($OCF_NOT_RUNNING).
    #
    # * If $OCF_RESKEY_link is a symlink whose canonical path equals the
    #   canonical path of $OCF_RESKEY_target, the resource is started
    #   ($OCF_SUCCESS).
    #
    # * If $OCF_RESKEY_link exists but is anything else (a regular file, or
    #   a symlink pointing somewhere else), the outcome depends on
    #   $OCF_RESKEY_backup_suffix:
    #
    #   - set:   the resource is simply not running; the clashing file is
    #            moved to ${OCF_RESKEY_link}${OCF_RESKEY_backup_suffix}
    #            when the resource starts.
    #
    #   - unset: this is an error condition; the agent exits with
    #            $OCF_ERR_INSTALLED because it cannot start here.
    local link_path target_path

    rc=$OCF_ERR_GENERIC

    # Using ls here instead of "test -e", as "test -e" returns false
    # if the file does exist, but is a symlink to a file that doesn't.
    if ! ls "$OCF_RESKEY_link" >/dev/null 2>&1; then
        ocf_log debug "$OCF_RESKEY_link does not exist"
        rc=$OCF_NOT_RUNNING
        return $rc
    fi

    if [ ! -L "$OCF_RESKEY_link" ]; then
        if [ -z "$OCF_RESKEY_backup_suffix" ]; then
            ocf_exit_reason "$OCF_RESKEY_link exists but is not a symbolic link!"
            exit $OCF_ERR_INSTALLED
        fi
        ocf_log debug "$OCF_RESKEY_link exists but is not a symbolic link, will be moved to ${OCF_RESKEY_link}${OCF_RESKEY_backup_suffix} on start"
        rc=$OCF_NOT_RUNNING
        return $rc
    fi

    # Compare the canonical paths as plain strings. The previous egrep match
    # treated the target path as a regular expression ('.', '+', '[' ...) and
    # left it unquoted, so unusual path names could match or fail wrongly.
    link_path=$(readlink -m "$OCF_RESKEY_link")
    target_path=$(readlink -m "$OCF_RESKEY_target")
    if [ "$link_path" = "$target_path" ]; then
        ocf_log debug "$OCF_RESKEY_link exists and is a symbolic link to ${OCF_RESKEY_target}."
        rc=$OCF_SUCCESS
        return $rc
    fi

    if [ -z "$OCF_RESKEY_backup_suffix" ]; then
        ocf_exit_reason "$OCF_RESKEY_link does not point to ${OCF_RESKEY_target}!"
        exit $OCF_ERR_INSTALLED
    fi
    ocf_log debug "$OCF_RESKEY_link does not point to ${OCF_RESKEY_target}, will be moved to ${OCF_RESKEY_link}${OCF_RESKEY_backup_suffix} on start"
    rc=$OCF_NOT_RUNNING
    return $rc
}
# Validate the resource parameters.
# Exits with $OCF_ERR_CONFIGURED when "link" or "target" is unset; otherwise
# returns $OCF_SUCCESS.
symlink_validate_all() {
    if [ "x${OCF_RESKEY_link}" = "x" ]; then
        ocf_exit_reason "Mandatory parameter link is unset"
        exit $OCF_ERR_CONFIGURED
    fi
    if [ "x${OCF_RESKEY_target}" = "x" ]; then
        ocf_exit_reason "Mandatory parameter target is unset"
        exit $OCF_ERR_CONFIGURED
    fi

    # Having a non-existent target is technically not an error, as
    # symlinks are allowed to point to non-existent paths. But it
    # still doesn't hurt to warn people if the target does not exist.
    # The warning is skipped during probes, as the comment here always
    # intended but the code never enforced.
    if ! ocf_is_probe && [ ! -e "${OCF_RESKEY_target}" ]; then
        ocf_log warn "${OCF_RESKEY_target} does not exist!"
    fi

    # Explicit status, so the result never depends on the last command above.
    return $OCF_SUCCESS
}
+EOF +} + +if [ $# -ne 1 ]; then + symlink_usage + exit $OCF_ERR_ARGS +fi + +case $__OCF_ACTION in +meta-data) + meta_data + exit $OCF_SUCCESS + ;; +usage) + symlink_usage + exit $OCF_SUCCESS +esac + +# Everything except usage and meta-data must pass the validate test +symlink_validate_all || exit + +case $__OCF_ACTION in +start) + symlink_start + ;; +stop) + symlink_stop + ;; +status|monitor) + symlink_monitor + ;; +validate-all) + ;; +*) + symlink_usage + exit $OCF_ERR_UNIMPLEMENTED +esac +# exit code is the exit code (return code) of the last command (shell function) diff --git a/heartbeat/syslog-ng.in b/heartbeat/syslog-ng.in new file mode 100644 index 0000000..246db28 --- /dev/null +++ b/heartbeat/syslog-ng.in @@ -0,0 +1,467 @@ +#!@BASH_SHELL@ +# +# Description: Manages a syslog-ng instance, provided by NTT OSSC as an +# OCF High-Availability resource under Heartbeat/LinuxHA control +# +# Copyright (c) 2009 NIPPON TELEGRAPH AND TELEPHONE CORPORATION +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +############################################################################## +# OCF parameters: +# OCF_RESKEY_syslog_ng_binary : Path to syslog-ng binary. 
+# Default is "/sbin/syslog-ng" +# OCF_RESKEY_configfile : Configuration file +# OCF_RESKEY_start_opts : Startup options +# OCF_RESKEY_kill_term_timeout: Number of seconds to await to confirm a +# normal stop method +# +# Only OCF_RESKEY_configfile must be specified. Each of the rests +# has its default value or refers OCF_RESKEY_configfile to make +# its value when no explicit value is given. +# +# Further infomation for setup: +# There are sample configurations at the end of this file. +# +############################################################################### + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Parameter defaults + +OCF_RESKEY_configfile_default="" +OCF_RESKEY_syslog_ng_binary_default="/sbin/syslog-ng" +OCF_RESKEY_syslog_ng_ctl_default="/sbin/syslog-ng-ctl" +OCF_RESKEY_qdisk_dir_default="" +OCF_RESKEY_control_file_default="" +OCF_RESKEY_persist_file_default="" +OCF_RESKEY_pidfile_default="" +OCF_RESKEY_start_opts_default="" +OCF_RESKEY_kill_term_timeout_default="10" + +: ${OCF_RESKEY_configfile=${OCF_RESKEY_configfile_default}} +: ${OCF_RESKEY_syslog_ng_binary=${OCF_RESKEY_syslog_ng_binary_default}} +: ${OCF_RESKEY_syslog_ng_ctl=${OCF_RESKEY_syslog_ng_ctl_default}} +: ${OCF_RESKEY_qdisk_dir=${OCF_RESKEY_qdisk_dir_default}} +: ${OCF_RESKEY_control_file=${OCF_RESKEY_control_file_default}} +: ${OCF_RESKEY_persist_file=${OCF_RESKEY_persist_file_default}} +: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}} +: ${OCF_RESKEY_start_opts=${OCF_RESKEY_start_opts_default}} +: ${OCF_RESKEY_kill_term_timeout=${OCF_RESKEY_kill_term_timeout_default}} + +usage() +{ + cat <<-! +usage: $0 action + +action: + start : start a new syslog-ng instance + + stop : stop the running syslog-ng instance + + status : return the status of syslog-ng, run or down + + monitor : return TRUE if the syslog-ng appears to be working. + + meta-data : show meta data message + + validate-all: validate the instance parameters +! 
+ return $OCF_ERR_UNIMPLEMENTED +} + +metadata_syslog_ng() +{ + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="syslog-ng" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +This script manages a syslog-ng instance as an HA resource. + +For Premium Edition you should set the following parameters +(based on default path being "/opt/syslog-ng"): +syslog_ng_binary="/opt/syslog-ng/sbin/syslog-ng" +syslog_ng_ctl="/opt/syslog-ng/sbin/syslog-ng-ctl" +control_file="/opt/syslog-ng/var/run/syslog-ng.ctl" +persist_file="/opt/syslog-ng/var/syslog-ng.persist" +pidfile="/opt/syslog-ng/var/run/syslog-ng.pid" + +Additional parameter for Premium Edition 6 only: +qdisk_dir="/opt/syslog-ng/var/" +</longdesc> +<shortdesc lang="en">Syslog-ng resource agent</shortdesc> + +<parameters> + +<parameter name="configfile" unique="0" required="1"> +<longdesc lang="en"> +This parameter specifies a configuration file +for a syslog-ng instance managed by this RA. +</longdesc> +<shortdesc lang="en">Configuration file</shortdesc> +<content type="string" default="${OCF_RESKEY_configfile_default}"/> +</parameter> + +<parameter name="syslog_ng_binary" unique="0"> +<longdesc lang="en"> +This parameter specifies syslog-ng's executable file. +</longdesc> +<shortdesc lang="en">syslog-ng executable</shortdesc> +<content type="string" default="${OCF_RESKEY_syslog_ng_binary_default}"/> +</parameter> + +<parameter name="syslog_ng_ctl" unique="0"> +<longdesc lang="en"> +This parameter specifies the path of the syslog-ng-ctl executable file. +</longdesc> +<shortdesc lang="en">syslog-ng-ctl executable</shortdesc> +<content type="string" default="${OCF_RESKEY_syslog_ng_ctl_default}"/> +</parameter> + +<parameter name="qdisk_dir" unique="0"> +<longdesc lang="en"> +This parameter specifies the directory used for holding disk buffers of +syslog-ng (only supported in Premium Edition 6). 
+</longdesc> +<shortdesc lang="en">disk buffer directory (PE6 only)</shortdesc> +<content type="string" default="${OCF_RESKEY_qdisk_dir_default}"/> +</parameter> + +<parameter name="control_file" unique="0"> +<longdesc lang="en"> +This parameter specifies the path, where syslog-ng would place its control +socket, through which it can be controlled. +</longdesc> +<shortdesc lang="en">process control socket</shortdesc> +<content type="string" default="${OCF_RESKEY_control_file_default}"/> +</parameter> + +<parameter name="persist_file" unique="0"> +<longdesc lang="en"> +This parameter specifies the path for syslog-ng's persist file, which holds +persistent information about the mapping of destinations and disk buffers, +the internal state of sources, etc. +</longdesc> +<shortdesc lang="en">persist file path</shortdesc> +<content type="string" default="${OCF_RESKEY_persist_file_default}"/> +</parameter> + +<parameter name="pidfile" unique="0"> +<longdesc lang="en"> +This parameter specifies the path where the pid file of syslog-ng resides. +</longdesc> +<shortdesc lang="en">pidfile path</shortdesc> +<content type="string" default="${OCF_RESKEY_pidfile_default}"/> +</parameter> + +<parameter name="start_opts" unique="0"> +<longdesc lang="en"> +This parameter specifies startup options for a +syslog-ng instance managed by this RA. When no value is given, no startup +options is used. Don't use option '-F'. It causes a stuck of a start action. +</longdesc> +<shortdesc lang="en">Start options</shortdesc> +<content type="string" default="${OCF_RESKEY_start_opts_default}"/> +</parameter> + +<parameter name="kill_term_timeout" unique="0"> +<longdesc lang="en"> +On a stop action, a normal stop method(pkill -TERM) is firstly used. +And then the confirmation of its completion is waited for +the specified seconds by this parameter. +The default value is 10. 
# Stop the syslog-ng instance matched by $PROCESS_PATTERN.
#
# 1. Ask for a graceful stop: through syslog-ng-ctl when the tool is
#    executable and the configured control socket exists, otherwise (or
#    when syslog-ng-ctl fails) with SIGTERM.
# 2. Wait up to $KILL_TERM_TIMEOUT seconds for the process to disappear.
# 3. Send SIGKILL once per second until no matching process is left.
#
# Always returns $OCF_SUCCESS; if the process can never be removed the
# last loop does not end and the action runs into its timeout.
stop_syslog_ng()
{
	local ctl_stopped="no"
	local lapse_sec=0

	if [ -x "$SYSLOG_NG_CTL" ] \
		&& [ -n "${OCF_RESKEY_control_file}" ] \
		&& [ -S "${OCF_RESKEY_control_file}" ]; then
		# $CONTROL_FILE holds "--control <path>" as ONE string at this
		# point, so passing it quoted handed syslog-ng-ctl a single bogus
		# argument. Pass the option and the socket path as two words.
		if "$SYSLOG_NG_CTL" stop --control "${OCF_RESKEY_control_file}"; then
			ctl_stopped="yes"
		fi
	fi

	# No usable control socket, or syslog-ng-ctl failed: fall back to SIGTERM.
	if [ "$ctl_stopped" != "yes" ]; then
		pkill -TERM -f "$PROCESS_PATTERN"
	fi

	# Give the normal stop method $KILL_TERM_TIMEOUT seconds to complete.
	while pgrep -f "$PROCESS_PATTERN" > /dev/null; do
		sleep 1
		lapse_sec=$(( lapse_sec + 1 ))
		ocf_log debug "stop_syslog_ng[$SYSLOG_NG_NAME]: stop NORM $lapse_sec/$KILL_TERM_TIMEOUT"
		if [ $lapse_sec -ge $KILL_TERM_TIMEOUT ]; then
			break
		fi
	done

	# If the process can't be removed, the rest of this function is never
	# reached (the RA is killed on timeout) and the pidfile remains.
	# 2009/09/18 Nakahira
	# If the syslog-ng process hangs, the RA has already waited
	# $KILL_TERM_TIMEOUT seconds above, so the stop timeout of the RA
	# should be longer than $KILL_TERM_TIMEOUT.
	lapse_sec=0
	while pgrep -f "$PROCESS_PATTERN" > /dev/null; do
		pkill -KILL -f "$PROCESS_PATTERN"
		sleep 1
		lapse_sec=$(( lapse_sec + 1 ))
		ocf_log debug "stop_syslog_ng[$SYSLOG_NG_NAME]: suspend syslog_ng by SIGKILL ($lapse_sec/@@@)"
	done

	return $OCF_SUCCESS
}
-x "$SYSLOG_NG_EXE" ]]; then + ocf_log err "Invalid value:syslog_ng_binary:$SYSLOG_NG_EXE" + exit $OCF_ERR_CONFIGURED +fi + +SYSLOG_NG_CTL="${OCF_RESKEY_syslog_ng_ctl}" + +# actually, the pidfile has no function; the status is checked by +# testing for a running process only + +KILL_TERM_TIMEOUT="${OCF_RESKEY_kill_term_timeout}" +if ! ocf_is_decimal "$KILL_TERM_TIMEOUT"; then + ocf_log err "Invalid value:kill_term_timeout:$KILL_TERM_TIMEOUT" + exit $OCF_ERR_CONFIGURED +fi + +QDISK_DIR="${OCF_RESKEY_qdisk_dir}" +CONTROL_FILE="${OCF_RESKEY_control_file}" +PERSIST_FILE="${OCF_RESKEY_persist_file}" +PID_FILE="${OCF_RESKEY_pidfile}" +EXECUTABLE=$(basename "$SYSLOG_NG_EXE") +PROCESS_PATTERN="$EXECUTABLE -f $CONFIGFILE" + +COMMAND=$1 + +[ -n "$QDISK_DIR" ] && QDISK_DIR="--qdisk-dir $QDISK_DIR" +[ -n "$PERSIST_FILE" ] && PERSIST_FILE="--persist-file $PERSIST_FILE" +[ -n "$CONTROL_FILE" ] && CONTROL_FILE="--control $CONTROL_FILE" +[ -n "$PID_FILE" ] && PID_FILE="--pidfile $PID_FILE" + +START_OPTS="${OCF_RESKEY_start_opts} $QDISK_DIR $CONTROL_FILE $PERSIST_FILE $PID_FILE" + +case "$COMMAND" in + start) + ocf_log debug "[$SYSLOG_NG_NAME] Enter syslog_ng start" + start_syslog_ng + func_status=$? + ocf_log debug "[$SYSLOG_NG_NAME] Leave syslog_ng start $func_status" + exit $func_status + ;; + stop) + ocf_log debug "[$SYSLOG_NG_NAME] Enter syslog_ng stop" + stop_syslog_ng + func_status=$? + ocf_log debug "[$SYSLOG_NG_NAME] Leave syslog_ng stop $func_status" + exit $func_status + ;; + status) + status_syslog_ng + exit $? + ;; + monitor) + #ocf_log debug "[$SYSLOG_NG_NAME] Enter syslog_ng monitor" + monitor_syslog_ng + func_status=$? + #ocf_log debug "[$SYSLOG_NG_NAME] Leave syslog_ng monitor $func_status" + exit $func_status + ;; + validate-all) + validate_all_syslog_ng + exit $? 
+ ;; + *) + usage + ;; +esac + +# vim: set sw=4 ts=4 : + +### A sample snippet of cib.xml for a syslog-ng resource +## +# <primitive id="prmApSyslog-ng" class="ocf" type="syslog-ng" provider="heartbeat"> +# <instance_attributes id="prmDummyB_instance_attrs"> +# <attributes> +# <nvpair id="atr:Syslog-ng:syslog-ng:configfile" name="configfile" value="/etc/syslog-ng/syslog-ng-ext.conf"/> +# </attributes> +# </instance_attributes> +# <operations> +# <op id="op:prmSyslog-ng:start" name="start" timeout="60s" on_fail="restart"/> +# <op id="op:prmSyslog-ng:monitor" name="monitor" interval="10s" timeout="60s" on_fail="restart"/> +# <op id="op:prmSyslog-ng:stop" name="stop" timeout="60s" on_fail="block"/> +# </operations> +# </primitive> + +### A sample syslog-ng configuration file for a log collecting host +### +### This sample is for a log collecting host by syslog-ng. +### A syslog-ng process configurated by this sample accepts all messages +### from a certain network. Any message from the network is preserved into +### a file for security infomation. Restricting messages to "authpriv" from +### the network is done on log sending hosts. (See the sample below) +### Any internal message of the syslog-ng process is preserved into its +### dedicated file. And any "authpriv" internal message of the syslog-ng +### process is also preserved into the security infomation file. +### +### Change "f_incoming" to suit your enviroment. +### If you use it as a configuration file for the sample cib.xml above, +### save it into "/etc/syslog-ng/syslog-ng-ext.conf". 
+## +#options { +# sync (0); +# time_reopen (10); +# log_fifo_size (1000); +# long_hostnames (off); +# use_dns (yes); +# use_fqdn (no); +# create_dirs (no); +# keep_hostname (yes); }; +# +#source s_internal { internal(); }; +#source s_incoming { udp(port(514)); }; +#filter f_internal { facility(authpriv); }; +#filter f_incoming { netmask("172.20.0.0/255.255.192.0"); }; +# +#destination d_internal { file("/var/log/syslog-ng-ext.log" perm(0640));}; +#destination d_incoming { +# file("/var/log/secure-ext.log" create_dirs(yes) perm(0640)); }; +# +#log { source(s_internal); destination(d_internal); }; +#log { source(s_internal); filter(f_internal); destination(d_incoming); }; +#log { source(s_incoming); filter(f_incoming); destination(d_incoming); }; + +### A sample snippet of syslog-ng configuration file for a log sending host +### +### This sample is for a log sending host that uses syslog-ng. +### +### Replace "syslog-ng-ext" to the IP address or the hostname of your +### log collecting host and append it to "syslog-ng.conf" of each log sending +### host. See the install default syslog-ng.conf to know what "s_sys" and +### "f_auth" are. +## +#destination d_outgoing { udp("syslog-ng-ext" port(514)); }; +#log { source(s_sys); filter(f_auth); destination(d_outgoing); }; + +### A sample snippet of syslog configuration file for a log sending host +### +### This sample is for a log sending host that uses syslog. +### +### Replace "syslog-ng-ext" to the IP address or the hostname of your +### log collecting host and append it to "syslog.conf" of each log sending +### host. 
+## +# authpriv.* @syslog-ng-ext diff --git a/heartbeat/tomcat b/heartbeat/tomcat new file mode 100755 index 0000000..fa27151 --- /dev/null +++ b/heartbeat/tomcat @@ -0,0 +1,816 @@ +#!/bin/sh +# +# Description: Manages a Tomcat Server as an OCF High-Availability +# resource under Heartbeat/LinuxHA control +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +# 02110-1301, USA. +# +# Copyright (c) 2007 NIPPON TELEGRAPH AND TELEPHONE CORPORATION +# +####################################################################### +# OCF parameters: +# OCF_RESKEY_tomcat_name - The name of the resource. Default is tomcat +# OCF_RESKEY_script_log - A destination of the log of this script. Default /var/log/OCF_RESKEY_tomcat_name.log +# OCF_RESKEY_tomcat_stop_timeout - Time-out at the time of the stop. Default is 5. DEPRECATED +# OCF_RESKEY_tomcat_suspend_trialcount - The re-try number of times awaiting a stop. Default is 10. DEPRECATED +# OCF_RESKEY_tomcat_user - A user name to start a resource. +# OCF_RESKEY_statusurl - URL for state confirmation. Default is http://127.0.0.1:8080 +# OCF_RESKEY_max_stop_time - The max time it should take for proper shutdown. Restrictions, only Tomcat6. +# OCF_RESKEY_java_home - Home directory of Java. Default is none +# OCF_RESKEY_java_opts - Options to pass to Java JVM for start and stop. 
Default is none +# OCF_RESKEY_catalina_home - Home directory of Tomcat. Default is none +# OCF_RESKEY_catalina_base - Base directory of Tomcat. Default is OCF_RESKEY_catalina_home +# OCF_RESKEY_catalina_out - Log file name of Tomcat. Default is OCF_RESKEY_catalina_base/logs/catalina.out +# OCF_RESKEY_catalina_pid - A PID file name of Tomcat. Default is OCF_RESKEY_catalina_base/logs/catalina.pid +# OCF_RESKEY_tomcat_start_opts - Start options of Tomcat. Default is none. +# OCF_RESKEY_catalina_opts - CATALINA_OPTS environment variable. Default is none. +# OCF_RESKEY_catalina_tmpdir - CATALINA_TMPDIR environment variable. Default is none. +# OCF_RESKEY_catalina_rotate_log - Control catalina.out logrotation flag. Default is NO. +# OCF_RESKEY_catalina_rotatetime - catalina.out logrotation time span(seconds). Default is 86400. +# OCF_RESKEY_java_endorsed_dirs - JAVA_ENDORSED_DIRS environment variable. Default is none. +# OCF_RESKEY_logging_config - LOGGING_CONFIG environment variable. Default is none. +# OCF_RESKEY_logging_manager - LOGGING_MANAGER environment variable. Default is none. +############################################################################### + + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Use runuser if available for SELinux. +if [ -x /sbin/runuser ]; then + SU=runuser +else + SU=su +fi + +usage() +{ + cat <<-EOF +usage: $0 action + +action: + start start Tomcat + + stop stop Tomcat + + status return the status of Tomcat, up or down + + monitor return TRUE if Tomcat appears to be working. + You have to have installed $WGETNAME for this to work. + + meta-data show meta data message + + validate-all validate the instance parameters +EOF +} + +isrunning_tomcat() +{ + $WGET --tries=20 -O /dev/null $RESOURCE_STATUSURL >/dev/null 2>&1 +} + +isalive_tomcat() +{ + if ocf_is_true $SYSTEMD; then + systemctl is-active tomcat@${TOMCAT_NAME} > /dev/null 2>&1 + return $? 
+ fi + # As the server stops, the PID file disappears. To avoid race conditions, + # we will have remembered the PID of a running instance on script entry. + local pid=$rememberedPID + # If there is a PID file, attempt to use that + if [ -f $CATALINA_PID ]; then + local tmp + ocf_log debug "Reading pid from $CATALINA_PID" + tmp=`head -n 1 $CATALINA_PID` + if [ $? -eq 0 ]; then + pid=$tmp + fi + fi + if [ -n "$pid" ] && [ "$pid" -gt 0 ]; then + # Retry message for restraint + ocf_log debug "Sending noop signal to $pid" + kill -s 0 $pid >/dev/null 2>&1 + return $? + fi + # No PID file + false +} + +# Check rotatelogs process and restart if it is stopped +monitor_rotatelogs() +{ + pgrep -f "$ROTATELOGS.*$CATALINA_BASE/logs/catalina_%F.log" > /dev/null 2>&1 + if [ $? -ne 0 ]; then + ocf_log warn "A rotatelogs command for $CATALINA_BASE/logs/catalina_%F.log is not running. Restarting it." + start_rotatelogs + if [ $? -eq 0 ]; then + ocf_log info "Restart rotatelogs process succeeded." + else + ocf_log warn "Restart rotatelogs process failed." + fi + fi +} + +monitor_tomcat() +{ + isalive_tomcat || + return $OCF_NOT_RUNNING + isrunning_tomcat || + return $OCF_ERR_GENERIC + if ocf_is_true ${CATALINA_ROTATE_LOG}; then + # Monitor rotatelogs process and restart it if it is stopped. + # And never consider rotatelogs process failure to be a monitor failure + # as long as Tomcat process works fine. + monitor_rotatelogs + fi + return $OCF_SUCCESS +} + +start_rotatelogs() +{ + # -s is required because tomcat5.5's login shell is /bin/false + $SU - -s /bin/sh $RESOURCE_TOMCAT_USER \ + -c "$ROTATELOGS -l \"$CATALINA_BASE/logs/catalina_%F.log\" $CATALINA_ROTATETIME" \ + < "$CATALINA_OUT" > /dev/null 2>&1 & +} + +# Execute catalina.out log rotation +rotate_catalina_out() +{ + # Check catalina_%F.log is writable or not. 
+ CURRENT_ROTATELOG_SUFFIX=`date +"%F"` + $SU - -s /bin/sh $RESOURCE_TOMCAT_USER \ + -c "touch \"$CATALINA_BASE/logs/catalina_$CURRENT_ROTATELOG_SUFFIX.log\"" > /dev/null 2>&1 + if [ $? -ne 0 ]; then + ocf_exit_reason "$CATALINA_BASE/logs/catalina_$CURRENT_ROTATELOG_SUFFIX.log is not writable." + return $OCF_ERR_GENERIC + fi + + # Clean up and set permissions on required files + rm -rf "$CATALINA_BASE"/temp/* + if [ -p "$CATALINA_OUT" ]; then + rm -f "$CATALINA_OUT" + elif [ -e "$CATALINA_OUT" ]; then + DATE=`date +"%F-%H%M%S"` + ocf_log warn "$CATALINA_OUT already exists. It is saved as $CATALINA_OUT-$DATE" + mv "$CATALINA_OUT" "$CATALINA_OUT-$DATE" + fi + mkfifo -m700 "$CATALINA_OUT" + chown --dereference "$RESOURCE_TOMCAT_USER" "$CATALINA_OUT" || true + + start_rotatelogs +} + +create_systemd_config() +{ +cat<<-EOF > /etc/sysconfig/tomcat@${TOMCAT_NAME} +JAVA_HOME=${JAVA_HOME} +JAVA_OPTS="${JAVA_OPTS}" +CATALINA_HOME=${CATALINA_HOME} +CATALINA_BASE=${CATALINA_BASE} +CATALINA_OUT=${CATALINA_OUT} +CATALINA_OPTS="${CATALINA_OPTS}" +CATALINA_TMPDIR="${CATALINA_TMPDIR}" +JAVA_ENDORSED_DIRS="${JAVA_ENDORSED_DIRS}" +LOGGING_CONFIG="${LOGGING_CONFIG}" +LOGGING_MANAGER="${LOGGING_MANAGER}" +TOMCAT_CFG=${TOMCAT_CFG} +EOF +} + +# shellcheck disable=SC2068 +tomcatCommand() +{ + if ocf_is_true $SYSTEMD; then + systemctl $@ tomcat@${TOMCAT_NAME} + else +cat<<-END_TOMCAT_COMMAND + export JAVA_HOME=${JAVA_HOME} + export JAVA_OPTS="${JAVA_OPTS}" + export CATALINA_HOME=${CATALINA_HOME} + export CATALINA_BASE=${CATALINA_BASE} + export CATALINA_OUT=${CATALINA_OUT} + export CATALINA_PID=${CATALINA_PID} + export CATALINA_OPTS="${CATALINA_OPTS}" + export CATALINA_TMPDIR="${CATALINA_TMPDIR}" + export JAVA_ENDORSED_DIRS="${JAVA_ENDORSED_DIRS}" + export LOGGING_CONFIG="${LOGGING_CONFIG}" + export LOGGING_MANAGER="${LOGGING_MANAGER}" + export TOMCAT_CFG=${TOMCAT_CFG} + $TOMCAT_START_SCRIPT $@ +END_TOMCAT_COMMAND + fi +} + +# shellcheck disable=SC2068 +attemptTomcatCommand() +{ + if [ -n 
"$REDIRECT_DEFAULT_CONFIG" ]; then + TOMCAT_CFG=$(mktemp "${HA_RSCTMP}/tomcat-tmp-XXXXX.cfg") + export TOMCAT_CFG + fi + + if ocf_is_true $SYSTEMD; then + tomcatCommand $@ + elif [ "$RESOURCE_TOMCAT_USER" = root ]; then + "$TOMCAT_START_SCRIPT" $@ >> "$TOMCAT_CONSOLE" 2>&1 + else + tomcatCommand $@ | $SU - -s /bin/sh "$RESOURCE_TOMCAT_USER" >> "$TOMCAT_CONSOLE" 2>&1 + fi + + if [ -n "$REDIRECT_DEFAULT_CONFIG" ]; then + rm -f "$TOMCAT_CFG" + fi +} + +start_tomcat() +{ + if ocf_is_true $SYSTEMD; then + create_systemd_config + fi + cd "$CATALINA_HOME/bin" || return $OCF_ERR_GENERIC + + validate_all_tomcat || exit $? + + monitor_tomcat + if [ $? -eq $OCF_SUCCESS ]; then + return $OCF_SUCCESS + fi + + # Remove $CATALINA_PID if it exists + rm -f $CATALINA_PID + + #ocf_log debug "catalina.out rotation FLG = ${CATALINA_ROTATE_LOG}" + if ocf_is_true ${CATALINA_ROTATE_LOG}; then + rotate_catalina_out + if [ $? -eq 0 ]; then + ocf_log debug "Rotate catalina.out succeeded." + else + ocf_exit_reason "Rotate catalina.out failed. Avoid starting tomcat without catalina.out rotation." + return $OCF_ERR_GENERIC + fi + fi + + echo "`date "+%Y/%m/%d %T"`: start ===========================" >> "$TOMCAT_CONSOLE" + + ocf_log debug "CATALINA_OPTS value = ${CATALINA_OPTS}" + attemptTomcatCommand start ${TOMCAT_START_OPTS} & + + while true; do + monitor_tomcat + if [ $? -eq $OCF_SUCCESS ]; then + break + fi + ocf_log debug "start_tomcat[$TOMCAT_NAME]: retry monitor_tomcat" + sleep 3 + done + + return $OCF_SUCCESS +} + +stop_tomcat() +{ + local stop_time + local RA_TIMEOUT=20 + local TOMCAT_STOP_OPTS="" + + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + RA_TIMEOUT=$((OCF_RESKEY_CRM_meta_timeout/1000)) + fi + + STOP_TIMEOUT=$((RA_TIMEOUT-5)) + if [ -n "$MAX_STOP_TIME" ]; then + if [ $MAX_STOP_TIME -gt $RA_TIMEOUT ]; then + ocf_log warn "max_stop_timeout must be shorter than the timeout of stop operation." 
+ fi + if [ $MAX_STOP_TIME -eq 0 ]; then + STOP_TIMEOUT=$RA_TIMEOUT + else + STOP_TIMEOUT=$MAX_STOP_TIME + fi + fi + + cd "$CATALINA_HOME/bin" + + memorize_pid # This lets monitoring continue to work reliably + + echo "`date "+%Y/%m/%d %T"`: stop ###########################" >> "$TOMCAT_CONSOLE" + + if [ "$TOMCAT_START_SCRIPT" = "$CATALINA_HOME/bin/catalina.sh" ]; then + TOMCAT_STOP_OPTS="$STOP_TIMEOUT -force" + fi + stop_time=$(date +%s) + attemptTomcatCommand stop $TOMCAT_STOP_OPTS + + lapse_sec=0 + while isalive_tomcat; do + sleep 1 + lapse_sec=`expr $(date +%s) - $stop_time` + if [ $lapse_sec -ge $STOP_TIMEOUT ]; then + ocf_log debug "stop_tomcat[$TOMCAT_NAME]: stop failed, killing with SIGKILL ($lapse_sec)" + kill -s KILL $rememberedPID > /dev/null 2>&1 + fi + done + + if ocf_is_true ${CATALINA_ROTATE_LOG}; then + rm -f "$CATALINA_PID" "${CATALINA_OUT}" + else + rm -f "$CATALINA_PID" + fi + return $OCF_SUCCESS +} + +metadata_tomcat() +{ + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="tomcat" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +Resource script for Tomcat. It manages a Tomcat instance as a cluster resource. +</longdesc> +<shortdesc lang="en">Manages a Tomcat servlet environment instance</shortdesc> + +<parameters> + +<parameter name="tomcat_name" unique="1" > +<longdesc lang="en"> +The name of the resource, added as a Java parameter in JAVA_OPTS: +-Dname=<tomcat_name> to Tomcat process on start. Used to ensure +process is still running and must be unique. +</longdesc> +<shortdesc lang="en">The name of the resource</shortdesc> +<content type="string" default="${OCF_RESKEY_tomcat_name_default}" /> +</parameter> + +<parameter name="script_log" unique="1"> +<longdesc lang="en"> +Log file, used during start and stop operations. 
+</longdesc> +<shortdesc lang="en">Log file</shortdesc> +<content type="string" default="${OCF_RESKEY_script_log_default}" /> +</parameter> + +<parameter name="tomcat_stop_timeout" unique="0"> +<longdesc lang="en"> +Time-out for stop operation. DEPRECATED +</longdesc> +<shortdesc lang="en">Time-out for the stop operation. DEPRECATED</shortdesc> +<content type="integer" default="${OCF_RESKEY_tomcat_stop_timeout_default}" /> +</parameter> + +<parameter name="tomcat_suspend_trialcount" unique="0"> +<longdesc lang="en"> +Maximum number of times to retry stop operation before suspending +and killing Tomcat. DEPRECATED. Does not retry. +</longdesc> +<shortdesc lang="en">Max retry count for stop operation. DEPRECATED</shortdesc> +<content type="integer" default="${OCF_RESKEY_tomcat_suspend_trialcount_default}" /> +</parameter> + +<parameter name="tomcat_user" unique="0"> +<longdesc lang="en"> +The user who starts Tomcat. +</longdesc> +<shortdesc lang="en">The user who starts Tomcat</shortdesc> +<content type="string" default="${OCF_RESKEY_tomcat_user_default}" /> +</parameter> + +<parameter name="statusurl" unique="0"> +<longdesc lang="en"> +URL for state confirmation. +</longdesc> +<shortdesc lang="en">URL for state confirmation</shortdesc> +<content type="string" default="${OCF_RESKEY_statusurl_default}" /> +</parameter> + +<parameter name="max_stop_time" unique="0"> +<longdesc lang="en"> +Number of seconds to wait during a stop before drastic measures +(force kill) are used on the tomcat process. +This number MUST be less than your cluster stop timeout for the resource. +The default value is five seconds before the timeout value of stop operation. +When it is over this value, it stops a process in kill commands. +This parameter is only effective on Tomcat 6 or later. 
+</longdesc> +<shortdesc lang="en">The max time it should take for proper shutdown.</shortdesc> +<content type="integer" default="${OCF_RESKEY_max_stop_time_default}" /> +</parameter> + +<parameter name="java_home" unique="0" required="1"> +<longdesc lang="en"> +Home directory of Java. +</longdesc> +<shortdesc lang="en">Home directory of Java</shortdesc> +<content type="string" default="${OCF_RESKEY_java_home_default}" /> +</parameter> + +<parameter name="java_opts" unique="0"> +<longdesc lang="en"> +Java JVM options used on start and stop. +</longdesc> +<shortdesc lang="en">Java options parsed to JVM, used on start and stop.</shortdesc> +<content type="string" default="${OCF_RESKEY_java_opts_default}" /> +</parameter> + +<parameter name="catalina_home" unique="0" required="1"> +<longdesc lang="en"> +Home directory of Tomcat. +</longdesc> +<shortdesc lang="en">Home directory of Tomcat</shortdesc> +<content type="string" default="${OCF_RESKEY_catalina_home_default}" /> +</parameter> + +<parameter name="catalina_base" unique="1"> +<longdesc lang="en"> +Instance directory of Tomcat +</longdesc> +<shortdesc lang="en">Instance directory of Tomcat, defaults to catalina_home</shortdesc> +<content type="string" default="${OCF_RESKEY_catalina_base_default}" /> +</parameter> + +<parameter name="catalina_out" unique="1"> +<longdesc lang="en"> +Log file name of Tomcat +</longdesc> +<shortdesc lang="en">Log file name of Tomcat, defaults to catalina_base/logs/catalina.out</shortdesc> +<content type="string" default="${OCF_RESKEY_catalina_out_default}" /> +</parameter> + +<parameter name="catalina_pid" unique="1"> +<longdesc lang="en"> +A PID file name for Tomcat. +</longdesc> +<shortdesc lang="en">A PID file name for Tomcat</shortdesc> +<content type="string" default="${OCF_RESKEY_catalina_pid_default}" /> +</parameter> + +<parameter name="force_systemd" unique="0" required="0"> +<longdesc lang="en"> +Force use of systemd when available. 
+</longdesc> +<shortdesc lang="en">Force use of systemd when available</shortdesc> +<content type="string" default="${OCF_RESKEY_force_systemd_default}" /> +</parameter> + +<parameter name="tomcat_start_script" unique="0" required="0"> +<longdesc lang="en"> +Absolute path to the custom tomcat start script to use. +</longdesc> +<shortdesc lang="en">Tomcat start script location</shortdesc> +<content type="string" default="${OCF_RESKEY_tomcat_start_script_default}" /> +</parameter> + +<parameter name="tomcat_start_opts" unique="0"> +<longdesc lang="en"> +Tomcat start options. +</longdesc> +<shortdesc lang="en">Tomcat start options</shortdesc> +<content type="string" default="${OCF_RESKEY_tomcat_start_opts_default}" /> +</parameter> + +<parameter name="catalina_opts" unique="0"> +<longdesc lang="en"> +Catalina options, for the start operation only. +</longdesc> +<shortdesc lang="en">Catalina options</shortdesc> +<content type="string" default="${OCF_RESKEY_catalina_opts_default}" /> +</parameter> + +<parameter name="catalina_tmpdir" unique="1"> +<longdesc lang="en"> +Temporary directory of Tomcat +</longdesc> +<shortdesc lang="en">Temporary directory of Tomcat, defaults to none</shortdesc> +<content type="string" default="${OCF_RESKEY_catalina_tmpdir_default}" /> +</parameter> + +<parameter name="catalina_rotate_log" unique="0"> +<longdesc lang="en"> +Rotate catalina.out flag. +</longdesc> +<shortdesc lang="en">Rotate catalina.out flag</shortdesc> +<content type="boolean" default="${OCF_RESKEY_catalina_rotate_log_default}" /> +</parameter> + +<parameter name="catalina_rotatetime" unique="0"> +<longdesc lang="en"> +catalina.out rotation interval (seconds). 
+</longdesc> +<shortdesc lang="en">catalina.out rotation interval (seconds)</shortdesc> +<content type="integer" default="${OCF_RESKEY_catalina_rotatetime_default}" /> +</parameter> + +<parameter name="java_endorsed_dirs" unique="1"> +<longdesc lang="en"> +Java_endorsed_dirs of tomcat +</longdesc> +<shortdesc lang="en">Java_endorsed_dirs of Tomcat, defaults to none</shortdesc> +<content type="string" default="${OCF_RESKEY_java_endorsed_dirs_default}" /> +</parameter> + +<parameter name="logging_config" unique="1"> +<longdesc lang="en"> +Logging_config of tomcat +</longdesc> +<shortdesc lang="en">Logging_config of Tomcat, defaults to none</shortdesc> +<content type="string" default="${OCF_RESKEY_logging_config_default}" /> +</parameter> + +<parameter name="logging_manager" unique="1"> +<longdesc lang="en"> +Logging_manager of tomcat +</longdesc> +<shortdesc lang="en">Logging_manager of Tomcat, defaults to none.</shortdesc> +<content type="string" default="${OCF_RESKEY_logging_manager_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="60s" /> +<action name="stop" timeout="120s" /> +<action name="status" timeout="60s" /> +<action name="monitor" depth="0" timeout="30s" interval="10s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="5s"/> +</actions> +</resource-agent> +END + return $OCF_SUCCESS +} + +validate_all_tomcat() +{ + local port + local rc=$OCF_SUCCESS + ocf_log info "validate_all_tomcat[$TOMCAT_NAME]" + + check_binary $WGET + + if ! ocf_is_true $OCF_RESKEY_force_systemd && [ -z "${TOMCAT_START_SCRIPT}" ]; then + ocf_exit_reason "No default tomcat start script detected. Please specify start script location using the 'tomcat_start_script' option" + rc=$OCF_ERR_CONFIGURED + fi + + if [ -n "$MAX_STOP_TIME" ] && [ "$MAX_STOP_TIME" -lt 0 ]; then + ocf_exit_reason "max_stop_time must be set to a value greater than 0." 
+ rc=$OCF_ERR_CONFIGURED + fi + + if echo "$RESOURCE_STATUSURL" | grep -q ":[0-9][0-9]*" ; then + port=${RESOURCE_STATUSURL##*:} + port=${port%%/*} + ocf_log debug "Tomcat port is $port" + ocf_log debug "grep port=\"$port\" $CATALINA_BASE/conf/server.xml" + grep "port=\"$port\"" $CATALINA_BASE/conf/server.xml > /dev/null 2>&1 + if [ $? -ne 0 ]; then + ocf_exit_reason "Your configured status URL specifies a port ($port), but the server does not have a connector listening to that port in $CATALINA_BASE/conf/server.xml" + rc=$OCF_ERR_INSTALLED + fi + fi + + if ocf_is_true ${CATALINA_ROTATE_LOG}; then + if [ ! -x "$ROTATELOGS" ]; then + ocf_exit_reason "rotatelogs command does not exist." + rc=$OCF_ERR_INSTALLED + fi + fi + + return $rc +} + +# As we stop tomcat, it removes its own pid file...we still want to know what it was +memorize_pid() +{ + if [ -f $CATALINA_PID ]; then + rememberedPID=$(cat $CATALINA_PID) + fi +} + +# +### tomcat RA environment variables +# + +# Parameter defaults + +OCF_RESKEY_tomcat_name_default="tomcat" +OCF_RESKEY_catalina_home_default="" +OCF_RESKEY_catalina_base_default="" + +: ${OCF_RESKEY_tomcat_name=${OCF_RESKEY_tomcat_name_default}} +: ${OCF_RESKEY_catalina_home=${OCF_RESKEY_catalina_home_default}} +: ${OCF_RESKEY_catalina_base=${OCF_RESKEY_catalina_base_default}} + +# Only default to true for RedHat systems without catalina.sh +if [ -e "${OCF_RESKEY_catalina_home}/bin/catalina.sh" ] || ! 
is_redhat_based; then + OCF_RESKEY_force_systemd_default=0 +else + OCF_RESKEY_force_systemd_default=1 +fi + +: ${OCF_RESKEY_force_systemd=${OCF_RESKEY_force_systemd_default}} + +if [ -z "${OCF_RESKEY_tomcat_start_script}" ]; then + if ocf_is_true $OCF_RESKEY_force_systemd && \ + systemd_is_running; then + SYSTEMD=1 + elif [ -e "${OCF_RESKEY_catalina_home}/bin/catalina.sh" ]; then + TOMCAT_START_SCRIPT="${OCF_RESKEY_catalina_home}/bin/catalina.sh" + elif [ -e "/usr/sbin/tomcat" ]; then + REDIRECT_DEFAULT_CONFIG=1 + TOMCAT_START_SCRIPT="/usr/sbin/tomcat" + elif [ -e "/usr/sbin/tomcat6" ]; then + REDIRECT_DEFAULT_CONFIG=1 + TOMCAT_START_SCRIPT="/usr/sbin/tomcat6" + fi +fi + +OCF_RESKEY_script_log_default="/var/log/${OCF_RESKEY_tomcat_name}.log" +OCF_RESKEY_tomcat_stop_timeout_default="" +OCF_RESKEY_tomcat_suspend_trialcount_default="" +OCF_RESKEY_tomcat_user_default="root" +OCF_RESKEY_statusurl_default="http://127.0.0.1:8080" +OCF_RESKEY_max_stop_time_default="" +OCF_RESKEY_java_home_default="" +OCF_RESKEY_java_opts_default="" +OCF_RESKEY_catalina_out_default="${OCF_RESKEY_catalina_base-${OCF_RESKEY_catalina_home}}/logs/catalina.out" +OCF_RESKEY_catalina_pid_default="" +OCF_RESKEY_tomcat_start_script_default="${TOMCAT_START_SCRIPT}" +OCF_RESKEY_tomcat_start_opts_default="" +OCF_RESKEY_catalina_opts_default="" +OCF_RESKEY_catalina_tmpdir_default="" +OCF_RESKEY_catalina_rotate_log_default="NO" +OCF_RESKEY_catalina_rotatetime_default="86400" +OCF_RESKEY_java_endorsed_dirs_default="" +OCF_RESKEY_logging_config_default="" +OCF_RESKEY_logging_manager_default="" + +: ${OCF_RESKEY_script_log=${OCF_RESKEY_script_log_default}} +: ${OCF_RESKEY_tomcat_stop_timeout=${OCF_RESKEY_tomcat_stop_timeout_default}} +: ${OCF_RESKEY_tomcat_suspend_trialcount=${OCF_RESKEY_tomcat_suspend_trialcount_default}} +: ${OCF_RESKEY_tomcat_user=${OCF_RESKEY_tomcat_user_default}} +: ${OCF_RESKEY_statusurl=${OCF_RESKEY_statusurl_default}} +: 
${OCF_RESKEY_max_stop_time=${OCF_RESKEY_max_stop_time_default}} +: ${OCF_RESKEY_java_home=${OCF_RESKEY_java_home_default}} +: ${OCF_RESKEY_java_opts=${OCF_RESKEY_java_opts_default}} +: ${OCF_RESKEY_catalina_out=${OCF_RESKEY_catalina_out_default}} +: ${OCF_RESKEY_catalina_pid=${OCF_RESKEY_catalina_pid_default}} +: ${OCF_RESKEY_tomcat_start_script=${OCF_RESKEY_tomcat_start_script_default}} +: ${OCF_RESKEY_tomcat_start_opts=${OCF_RESKEY_tomcat_start_opts_default}} +: ${OCF_RESKEY_catalina_opts=${OCF_RESKEY_catalina_opts_default}} +: ${OCF_RESKEY_catalina_tmpdir=${OCF_RESKEY_catalina_tmpdir_default}} +: ${OCF_RESKEY_catalina_rotate_log=${OCF_RESKEY_catalina_rotate_log_default}} +: ${OCF_RESKEY_catalina_rotatetime=${OCF_RESKEY_catalina_rotatetime_default}} +: ${OCF_RESKEY_java_endorsed_dirs=${OCF_RESKEY_java_endorsed_dirs_default}} +: ${OCF_RESKEY_logging_config=${OCF_RESKEY_logging_config_default}} +: ${OCF_RESKEY_logging_manager=${OCF_RESKEY_logging_manager_default}} + +COMMAND=$1 +TOMCAT_NAME="${OCF_RESKEY_tomcat_name}" +TOMCAT_CONSOLE="${OCF_RESKEY_script_log}" +RESOURCE_TOMCAT_USER="${OCF_RESKEY_tomcat_user}" +RESOURCE_STATUSURL="${OCF_RESKEY_statusurl}" + +JAVA_HOME="${OCF_RESKEY_java_home}" +JAVA_OPTS="${OCF_RESKEY_java_opts}" +CATALINA_HOME="${OCF_RESKEY_catalina_home}" +CATALINA_BASE="${OCF_RESKEY_catalina_base-${OCF_RESKEY_catalina_home}}" +CATALINA_OUT="${OCF_RESKEY_catalina_out}" + +CATALINA_PID=$OCF_RESKEY_catalina_pid +if [ -z "$CATALINA_PID" ] && [ "$__OCF_ACTION" = "start" ]; then + mkdir -p "${HA_RSCTMP}/${TOMCAT_NAME}_tomcatstate/" + if [ "${RESOURCE_TOMCAT_USER}" != "root" ]; then + chown ${RESOURCE_TOMCAT_USER} "${HA_RSCTMP}/${TOMCAT_NAME}_tomcatstate/" + fi + CATALINA_PID="${HA_RSCTMP}/${TOMCAT_NAME}_tomcatstate/catalina.pid" +fi + +MAX_STOP_TIME="${OCF_RESKEY_max_stop_time}" + +TOMCAT_START_OPTS="${OCF_RESKEY_tomcat_start_opts}" +TOMCAT_START_SCRIPT="${OCF_RESKEY_tomcat_start_script}" +CATALINA_OPTS="-Dname=$TOMCAT_NAME ${OCF_RESKEY_catalina_opts}" 
+CATALINA_TMPDIR="${OCF_RESKEY_catalina_tmpdir}" +CATALINA_ROTATE_LOG="${OCF_RESKEY_catalina_rotate_log}" +CATALINA_ROTATETIME="${OCF_RESKEY_catalina_rotatetime}" +JAVA_ENDORSED_DIRS="${OCF_RESKEY_java_endorsed_dirs}" +LOGGING_CONFIG="${OCF_RESKEY_logging_config}" +LOGGING_MANAGER="${OCF_RESKEY_logging_manager}" + +LSB_STATUS_STOPPED=3 +if [ $# -ne 1 ]; then + usage + exit $OCF_ERR_ARGS +fi +case "$COMMAND" in + meta-data) metadata_tomcat; exit $OCF_SUCCESS;; + help|usage) usage; exit $OCF_SUCCESS;; +esac + +if [ ! -d "$JAVA_HOME" -o ! -d "$CATALINA_HOME" -o ! -d "$CATALINA_BASE" ]; then + case $COMMAND in + stop) exit $OCF_SUCCESS;; + monitor) exit $OCF_NOT_RUNNING;; + status) exit $LSB_STATUS_STOPPED;; + esac + ocf_exit_reason "JAVA_HOME or CATALINA_HOME or CATALINA_BASE does not exist." + exit $OCF_ERR_INSTALLED +fi + +export JAVA_HOME JAVA_OPTS CATALINA_HOME CATALINA_BASE CATALINA_OUT CATALINA_PID CATALINA_OPTS CATALINA_TMPDIR JAVA_ENDORSED_DIRS LOGGING_CONFIG LOGGING_MANAGER + +JAVA=${JAVA_HOME}/bin/java + +if [ ! -x "$JAVA" ]; then + case $COMMAND in + stop) exit $OCF_SUCCESS;; + monitor) exit $OCF_NOT_RUNNING;; + status) exit $LSB_STATUS_STOPPED;; + esac + ocf_exit_reason "java command does not exist." + exit $OCF_ERR_INSTALLED +fi + +ROTATELOGS="" +if ocf_is_true ${CATALINA_ROTATE_LOG}; then + # Look for rotatelogs/rotatelogs2 + if [ -x /usr/sbin/rotatelogs ]; then + ROTATELOGS=/usr/sbin/rotatelogs + elif [ -x /usr/sbin/rotatelogs2 ]; then + ROTATELOGS=/usr/sbin/rotatelogs2 + fi +fi + +# +# ------------------ +# the main script +# ------------------ +# +case "$COMMAND" in + start) + ocf_log debug "[$TOMCAT_NAME] Enter tomcat start" + start_tomcat + func_status=$? + ocf_log debug "[$TOMCAT_NAME] Leave tomcat start $func_status" + exit $func_status + ;; + stop) + ocf_log debug "[$TOMCAT_NAME] Enter tomcat stop" + stop_tomcat + func_status=$? 
+ ocf_log debug "[$TOMCAT_NAME] Leave tomcat stop $func_status" + exit $func_status + ;; + status) + if monitor_tomcat; then + echo tomcat instance $TOMCAT_NAME is running + exit $OCF_SUCCESS + else + echo tomcat instance $TOMCAT_NAME is stopped + exit $OCF_NOT_RUNNING + fi + exit $? + ;; + monitor) + #ocf_log debug "[$TOMCAT_NAME] Enter tomcat monitor" + monitor_tomcat + func_status=$? + #ocf_log debug "[$TOMCAT_NAME] Leave tomcat monitor $func_status" + exit $func_status + ;; + meta-data) + metadata_tomcat + exit $? + ;; + validate-all) + validate_all_tomcat + exit $? + ;; + usage|help) + usage + exit $OCF_SUCCESS + ;; + *) + usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac + diff --git a/heartbeat/varnish b/heartbeat/varnish new file mode 100755 index 0000000..5fbf35c --- /dev/null +++ b/heartbeat/varnish @@ -0,0 +1,504 @@ +#!/bin/sh +# +# +# Varnish +# +# Description: Manage varnish instances as a HA resource +# +# Author: Léon Keijser <keijser@stone-it.com> +# +# License: GNU General Public License (GPL) +# +# See usage() for more details +# +# OCF instance parameters: +# OCF_RESKEY_pid +# OCF_RESKEY_binary +# OCF_RESKEY_client_binary +# OCF_RESKEY_config +# OCF_RESKEY_name +# OCF_RESKEY_listen_address +# OCF_RESKEY_mgmt_address +# OCF_RESKEY_ttl +# OCF_RESKEY_varnish_user +# OCF_RESKEY_varnish_group +# OCF_RESKEY_backend_type +# OCF_RESKEY_backend_size +# OCF_RESKEY_backend_file +# OCF_RESKEY_thread_pools +# OCF_RESKEY_thread_pool_min +# OCF_RESKEY_thread_pool_max +# OCF_RESKEY_thread_pool_timeout +# OCF_RESKEY_secret +# +####################################################################### +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### +# Set default parameter values + +# Set these two first, as other defaults depend on them +OCF_RESKEY_name_default=${OCF_RESOURCE_INSTANCE} +: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}} + +OCF_RESKEY_config_default="" +OCF_RESKEY_binary_default=varnishd +OCF_RESKEY_client_binary_default=varnishadm +OCF_RESKEY_pid_default=/var/run/varnishd_${OCF_RESKEY_name}.pid +OCF_RESKEY_listen_address_default=0.0.0.0:80 +OCF_RESKEY_ttl_default=600 +OCF_RESKEY_varnish_user_default=varnish +OCF_RESKEY_varnish_group_default=varnish +OCF_RESKEY_backend_type_default=malloc +OCF_RESKEY_backend_size_default=1G +OCF_RESKEY_backend_file_default=/var/lib/varnish/${OCF_RESKEY_name}.bin +OCF_RESKEY_thread_pools_default=2 +OCF_RESKEY_thread_pool_min_default=100 +OCF_RESKEY_thread_pool_max_default=3000 +OCF_RESKEY_thread_pool_timeout_default=120 +OCF_RESKEY_maxfiles_default=131072 +OCF_RESKEY_max_locked_memory_default=82000 +OCF_RESKEY_secret_default=/etc/varnish/secret + +: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_client_binary=${OCF_RESKEY_client_binary_default}} +: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} +: ${OCF_RESKEY_listen_address=${OCF_RESKEY_listen_address_default}} +: ${OCF_RESKEY_ttl=${OCF_RESKEY_ttl_default}} +: ${OCF_RESKEY_varnish_user=${OCF_RESKEY_varnish_user_default}} +: ${OCF_RESKEY_varnish_group=${OCF_RESKEY_varnish_group_default}} +: ${OCF_RESKEY_backend_type=${OCF_RESKEY_backend_type_default}} +: ${OCF_RESKEY_backend_size=${OCF_RESKEY_backend_size_default}} +: ${OCF_RESKEY_backend_file=${OCF_RESKEY_backend_file_default}} +: ${OCF_RESKEY_thread_pools=${OCF_RESKEY_thread_pools_default}} +: ${OCF_RESKEY_thread_pool_min=${OCF_RESKEY_thread_pool_min_default}} +: ${OCF_RESKEY_thread_pool_max=${OCF_RESKEY_thread_pool_max_default}} +: 
${OCF_RESKEY_thread_pool_timeout=${OCF_RESKEY_thread_pool_timeout_default}} +: ${OCF_RESKEY_maxfiles=${OCF_RESKEY_maxfiles_default}} +: ${OCF_RESKEY_max_locked_memory=${OCF_RESKEY_max_locked_memory_default}} +: ${OCF_RESKEY_secret=${OCF_RESKEY_secret_default}} + +meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="varnish" version="1.0"> +<version>1.0</version> + +<longdesc lang="en"> +The Varnish Resource Agent can manage several varnishd +instances throughout the cluster. It does so by creating +a unique PID file and requires a unique listen address +and name for each instance. +</longdesc> +<shortdesc lang="en">Manage a Varnish instance</shortdesc> + +<parameters> + +<parameter name="config" unique="1" required="1"> +<longdesc lang="en"> +The VCL configuration file that Varnish should manage, for example +"/etc/varnish/default.vcl". +</longdesc> +<shortdesc lang="en">VCL file</shortdesc> +<content type="string" default="${OCF_RESKEY_config_default}" /> +</parameter> + +<parameter name="name" unique="1"> +<longdesc lang="en"> +Override the name of the instance that should be given to Varnish +(defaults to the resource identifier). +</longdesc> +<shortdesc lang="en">Instance name</shortdesc> +<content type="string" default="${OCF_RESKEY_name_default}" /> +</parameter> + +<parameter name="pid" unique="1"> +<longdesc lang="en"> +Write the process's PID to the specified file. +The default will include the specified name, i.e.: +"/var/run/varnish_production.pid". Unlike what this help message shows, +it is most likely not necessary to change this parameter. 
+</longdesc> +<shortdesc lang="en">Listen address</shortdesc> +<content type="string" default="${OCF_RESKEY_pid_default}" /> +</parameter> + +<parameter name="listen_address" unique="1"> +<longdesc lang="en"> +Listen on this address:port, for example "192.168.1.1:80" +</longdesc> +<shortdesc lang="en">Listen address</shortdesc> +<content type="string" default="${OCF_RESKEY_listen_address_default}" /> +</parameter> + +<parameter name="mgmt_address" unique="1" required="1"> +<longdesc lang="en"> +Provide a management interface, for example "127.0.0.1:2222" +</longdesc> +<shortdesc lang="en">Management interface</shortdesc> +<content type="string" /> +</parameter> + +<parameter name="ttl"> +<longdesc lang="en"> +Specify a hard minimum time to live for cached documents. +</longdesc> +<shortdesc lang="en">TTL</shortdesc> +<content type="integer" default="${OCF_RESKEY_ttl_default}" /> +</parameter> + +<parameter name="varnish_user"> +<longdesc lang="en"> +Specify the name of an unprivileged user to which the +child process should switch before it starts accepting +connections. +</longdesc> +<shortdesc lang="en">Unprivileged user</shortdesc> +<content type="string" default="${OCF_RESKEY_varnish_user_default}" /> +</parameter> + +<parameter name="varnish_group"> +<longdesc lang="en"> +Specify the name of an unprivileged group to which +the child process should switch before it starts accepting +connections. +</longdesc> +<shortdesc lang="en">Unprivileged group</shortdesc> +<content type="string" default="${OCF_RESKEY_varnish_group_default}" /> +</parameter> + +<parameter name="backend_type"> +<longdesc lang="en"> +Use the specified storage backend. Valid options are +'malloc' for memory and 'file' for a file backend. +</longdesc> +<shortdesc lang="en">Backend type</shortdesc> +<content type="string" default="${OCF_RESKEY_backend_type_default}" /> +</parameter> + +<parameter name="backend_size"> +<longdesc lang="en"> +Specify the size of the backend. For example "1G". 
+</longdesc> +<shortdesc lang="en">Backend size</shortdesc> +<content type="string" default="${OCF_RESKEY_backend_size_default}" /> +</parameter> + +<parameter name="backend_file" unique="1"> +<longdesc lang="en"> +Specify the backend filename if you use backend_type file. +For example /var/lib/varnish/mybackend.bin +</longdesc> +<shortdesc lang="en">Backend file</shortdesc> +<content type="string" default="${OCF_RESKEY_backend_file_default}" /> +</parameter> + +<parameter name="threads_pools"> +<longdesc lang="en"> +Number of worker thread pools. +Each pool has the minimum, maximum and timeout values configured in the +thread_pool_min, thread_pool_max and thread_pool_timeout parameters +</longdesc> +<shortdesc lang="en">Worker thread pools</shortdesc> +<content type="string" default="${OCF_RESKEY_thread_pools_default}" /> +</parameter> + +<parameter name="thread_pool_min"> +<longdesc lang="en"> +Start at least min but no more than max worker +threads with the specified idle timeout in each pool. +</longdesc> +<shortdesc lang="en">Minimum worker threads</shortdesc> +<content type="string" default="${OCF_RESKEY_thread_pool_min_default}" /> +</parameter> + +<parameter name="thread_pool_max"> +<longdesc lang="en"> +Start at least min but no more than max worker +threads with the specified idle timeout in each pool. +</longdesc> +<shortdesc lang="en">Maximum worker threads</shortdesc> +<content type="string" default="${OCF_RESKEY_thread_pool_max_default}" /> +</parameter> + +<parameter name="thread_pool_timeout"> +<longdesc lang="en"> +Start at least min but no more than max worker +threads with the specified idle timeout in each pool. +</longdesc> +<shortdesc lang="en">Worker threads timeout</shortdesc> +<content type="string" default="${OCF_RESKEY_thread_pool_timeout_default}" /> +</parameter> + +<parameter name="client_binary"> +<longdesc lang="en"> +This is used to control Varnish via a CLI. 
It's currently +only used to check the status of the running child process. +</longdesc> +<shortdesc lang="en">Varnish admin utility</shortdesc> +<content type="string" default="${OCF_RESKEY_client_binary_default}" /> +</parameter> + +<parameter name="maxfiles"> +<longdesc lang="en"> +Maximum number of open files (for ulimit -n) +</longdesc> +<shortdesc lang="en">Max open files</shortdesc> +<content type="string" default="${OCF_RESKEY_maxfiles_default}" /> +</parameter> + +<parameter name="max_locked_memory"> +<longdesc lang="en"> +Locked shared memory limit (for ulimit -l) +</longdesc> +<shortdesc lang="en">Max locked memory</shortdesc> +<content type="string" default="${OCF_RESKEY_max_locked_memory_default}" /> +</parameter> + +<parameter name="secret"> +<longdesc lang="en"> +Path to a file containing a secret used for authorizing access to the management port. +</longdesc> +<shortdesc lang="en">Path of the secret file</shortdesc> +<content type="string" default="${OCF_RESKEY_secret_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="20s" /> +<action name="stop" timeout="20s" /> +<action name="monitor" timeout="20s" interval="10s" depth="0" /> +<action name="status" timeout="20s" /> +<action name="meta-data" timeout="5s" /> +<action name="validate-all" timeout="20s" /> +</actions> +</resource-agent> +END +} + +####################################################################### + + +varnish_usage() { + cat <<END +usage: $0 {start|stop|monitor|validate-all|meta-data} + +Expects to have a fully populated OCF RA-compliant environment set. 
+END +} + +varnish_status() { + local pid + local rc + + # FAILED = pidfile exist, but no running proc (or mismatch pid) + # SUCCES = contents of pidfile == running process id + # NOTRUN = no pidfile, no running process + + # check if pidfile exists and larger than 0 bytes + if [ -s $OCF_RESKEY_pid ]; then + # it does, now check if the pid exists + pid=$(cat $OCF_RESKEY_pid) + ocf_run kill -s 0 $pid + rc=$? + if [ $rc -eq 0 ]; then + ocf_log info "Varnish is running" + # check if the child process is started and varnish is + # reporting child status as ok + ocf_run $OCF_RESKEY_client_binary -T $OCF_RESKEY_mgmt_address -S $OCF_RESKEY_secret status + v_rc=$? + if [ "$v_rc" -eq 0 ]; then + ocf_log info "Varnish child reported running" + return $OCF_SUCCESS + else + ocf_log err "Varnish child not running" + return $OCF_ERR_GENERIC + fi + else + ocf_log err "Varnish PID file exists, but varnishd is not running" + return $OCF_ERR_GENERIC + fi + fi + + return $OCF_NOT_RUNNING +} + +varnish_start() { + local rc + local backend_options + + varnish_status + rc=$? + if [ $rc -eq $OCF_SUCCESS ]; then + ocf_log info "Varnish already running" + return $OCF_SUCCESS + fi + + # check which backend is to be used + case "$OCF_RESKEY_backend_type" in + malloc) + backend_options="$OCF_RESKEY_backend_size" + ;; + file) + backend_options="$OCF_RESKEY_backend_file,$OCF_RESKEY_backend_size" + ;; + *) + # not implemented yet + return $OCF_ERR_CONFIGURED + ;; + esac + + # set maximum locked shared memory + if [ -n "$OCF_RESKEY_max_locked_memory" ]; then + ocf_log info "Setting max_locked_memory to ${OCF_RESKEY_max_locked_memory}" + ulimit -l $OCF_RESKEY_max_locked_memory + u_rc=$? + if [ "$u_rc" -ne 0 ]; then + ocf_log warn "Could not set ulimit for locked share memory for Varnish to '$OCF_RESKEY_max_locked_memory'" + fi + fi + + # set maximum number of open files + if [ -n "$OCF_RESKEY_maxfiles" ]; then + ulimit -n $OCF_RESKEY_maxfiles + u_rc=$? 
+ if [ "$u_rc" -ne 0 ]; then + ocf_log warn "Could not set ulimit for open files for Varnish to '$OCF_RESKEY_maxfiles'" + fi + fi + + ocf_run $OCF_RESKEY_binary \ + -P $OCF_RESKEY_pid \ + -a $OCF_RESKEY_listen_address \ + -f $OCF_RESKEY_config \ + -T $OCF_RESKEY_mgmt_address \ + -t $OCF_RESKEY_ttl \ + -u $OCF_RESKEY_varnish_user \ + -g $OCF_RESKEY_varnish_group \ + -p thread_pools=$OCF_RESKEY_thread_pools \ + -p thread_pool_min=$OCF_RESKEY_thread_pool_min \ + -p thread_pool_max=$OCF_RESKEY_thread_pool_max \ + -p thread_pool_timeout=$OCF_RESKEY_thread_pool_timeout \ + -s $OCF_RESKEY_backend_type,$backend_options \ + -S $OCF_RESKEY_secret \ + -n $OCF_RESKEY_name + rc=$? + if [ $rc -ne 0 ]; then + ocf_log err "Varnish failed to start" + return $OCF_ERR_GENERIC + fi + + # Spin waiting for varnishd to come up. + # Let the CRM/LRM time us out if required + while true; do + varnish_status + rc=$? + [ $rc -eq $OCF_SUCCESS ] && break + if [ $rc -ne $OCF_NOT_RUNNING ]; then + ocf_log err "Varnish start failed" + exit $OCF_ERR_GENERIC + fi + sleep 2 + done + + ocf_log info "Varnish started succesfully" + return $OCF_SUCCESS +} + +varnish_stop() { + local rc + local pid + + varnish_status + rc=$? + if [ $rc -eq $OCF_NOT_RUNNING ]; then + ocf_log info "Varnish already stopped" + return $OCF_SUCCESS + fi + + # kill the varnish process + pid=$(cat $OCF_RESKEY_pid) + ocf_run kill -s TERM $pid + rc=$? + + if [ $rc -ne 0 ]; then + ocf_log err "Varnish failed to stop" + return $OCF_ERR_GENERIC + fi + + # stop waiting + shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5)) + count=0 + while [ $count -lt $shutdown_timeout ]; do + # check if process still exists + ocf_run kill -s 0 $pid + rc=$? + if [ $rc -ne 0 ]; then + # Varnish stopped succesfully, so let's delete the pidfile + rm -f $OCF_RESKEY_pid + break + fi + count=$(expr $count + 1) + sleep 1 + ocf_log info "Varnish still hasn't stopped yet. Waiting..." + done + + varnish_status + rc=$? 
+ if [ $rc -ne $OCF_NOT_RUNNING ]; then + # varnish didn't quit on a SIGTERM, try SIGKILL + ocf_log warn "Varnish failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL..." + ocf_run kill -s KILL $pid + # delete the pidfile + rm -f $OCF_RESKEY_pid + fi + + ocf_log info "Varnish stopped" + return $OCF_SUCCESS +} + + +varnish_validate() { + if [ -f $OCF_RESKEY_config ]; then + return $OCF_SUCCESS + else + return $OCF_ERR_INSTALLED + fi +} + + +case $__OCF_ACTION in + meta-data) + meta_data + exit $OCF_SUCCESS + ;; + start) + varnish_start + ;; + stop) + varnish_stop + ;; + monitor|status) + varnish_status + ;; + validate-all) + varnish_validate + ;; + usage|help) + varnish_usage + exit $OCF_SUCCESS + ;; + *) + varnish_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc + diff --git a/heartbeat/vdo-vol b/heartbeat/vdo-vol new file mode 100755 index 0000000..29bd7b8 --- /dev/null +++ b/heartbeat/vdo-vol @@ -0,0 +1,240 @@ +#!/bin/sh +# +# License: GNU General Public License (GPL) +# (c) 2018 O. Albrigtsen +# and Linux-HA contributors +# +# ----------------------------------------------------------------------------- +# O C F R E S O U R C E S C R I P T S P E C I F I C A T I O N +# ----------------------------------------------------------------------------- +# +# NAME +# vdo-vol : OCF resource agent script for VDO (Virtual Data Optimizer) +# + +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +# Defaults +OCF_RESKEY_volume_default="" + +: ${OCF_RESKEY_volume=${OCF_RESKEY_volume_default}} + + +vdo_usage() { + cat <<END + usage: $0 (start|stop|validate-all|meta-data|help|usage|monitor) + $0 manages VDO (Virtual Data Optimizer) volume(s) as an OCF HA resource. + The 'start' operation starts the instance. + The 'stop' operation stops the instance. 
+ The 'status' operation reports whether the instance is running + The 'monitor' operation reports whether the instance seems to be working + The 'validate-all' operation reports whether the parameters are valid +END +} + +vdo_meta_data() { + cat <<END +<?xml version="1.0"?> +<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd"> +<resource-agent name="vdo-vol" version="0.75"> +<version>1.0</version> + +<longdesc lang="en"> +OCF Resource script for VDO (Virtual Data Optimizer) volume(s). It manages VDO volume(s) as a HA resource. + +The configuration file needs to be synced to all nodes, and the systemd vdo service must be disabled when +using this agent. +</longdesc> +<shortdesc lang="en">VDO resource agent</shortdesc> + +<parameters> + +<parameter name="config"> + <longdesc lang="en">Configuration file</longdesc> + <shortdesc lang="en">Config file</shortdesc> + <content type="string" default="${OCF_RESKEY_config_default}" /> +</parameter> + +<parameter name="volume"> + <longdesc lang="en">VDO Volume (leave empty for all)</longdesc> + <shortdesc lang="en">Volume (empty for all)</shortdesc> + <content type="string" default="${OCF_RESKEY_volume_default}" /> +</parameter> + +</parameters> + +<actions> +<action name="start" timeout="60s" /> +<action name="stop" timeout="20s" /> +<action name="status" timeout="20s" /> +<action name="monitor" depth="0" timeout="20s" interval="10s" start-delay="10s" /> +<action name="validate-all" timeout="20s" /> +<action name="meta-data" timeout="20s" /> +</actions> +</resource-agent> +END +} + + +rebuild() { + ocf_log warn "${OCF_RESKEY_volume} is in $MODE mode, starting in rebuild mode" + + vdo stop $OPTIONS + + while vdo_monitor skiprocheck; do + sleep 1 + done + + vdo start $OPTIONS --forceRebuild + + while ! vdo_monitor; do + sleep 1 + done + + return $? +} + +vdo_start() { + # if resource is already running,no need to continue code after this. 
+ if vdo_monitor; then + ocf_log info "VDO volume(s): ${OCF_RESKEY_volume} is already active" + return $OCF_SUCCESS + fi + + vdo activate $OPTIONS + vdo start $OPTIONS + + while ! vdo_monitor skiprocheck; do + sleep 1 + done + + MODE=$(vdostats --verbose ${OCF_RESKEY_volume} | grep "operating mode" | awk '{print $NF}') + if [ $(echo "$MODE" | grep -v "normal" | wc -l) -gt 0 ]; then + rebuild + fi + + if [ $? -eq $OCF_SUCCESS ]; then + ocf_log info "VDO volume(s): ${OCF_RESKEY_volume} activated" + return ${OCF_SUCCESS} + fi + + return $? +} + +vdo_stop() { + vdo_monitor skiprocheck + if [ $? -ne $OCF_SUCCESS ]; then + # Currently not running. Nothing to do. + ocf_log info "VDO volume(s): ${OCF_RESKEY_volume} already deactivated" + + return $OCF_SUCCESS + fi + + vdo stop $OPTIONS + vdo deactivate $OPTIONS + + # Wait for process to stop + while vdo_monitor skiprocheck; do + sleep 1 + done + + return $OCF_SUCCESS +} + +vdo_monitor(){ + status=$(vdo status $OPTIONS 2>&1) + MODE=$(vdostats --verbose ${OCF_RESKEY_volume} | grep "operating mode" | awk '{print $NF}') + + case "$status" in + *"ERROR - vdodumpconfig: Failed to make FileLayer from"*) + if ocf_is_probe; then + return $OCF_NOT_RUNNING + fi + return $OCF_ERR_GENERIC + ;; + *"Device mapper status: not available"*) + return $OCF_NOT_RUNNING + ;; + *"Device mapper status: "*online*) + if [ "$MODE" = "read-only" ] && [ "$1" != "skiprocheck" ]; then + ocf_log err "VDO volume(s): ${OCF_RESKEY_volume} is in $MODE mode." + return $OCF_ERR_GENERIC + else + return $OCF_SUCCESS + fi + ;; + *) + ocf_log err "VDO volume(s): ${OCF_RESKEY_volume} failed\n$status" + return $OCF_ERR_GENERIC;; + esac +} + +vdo_validate_all(){ + check_binary "vdo" + + if systemctl is-enabled vdo > /dev/null 2>&1; then + ocf_exit_reason "systemd service vdo needs to be disabled" + exit $OCF_ERR_CONFIGURED + fi + + if [ -n "${OCF_RESKEY_config}" ] && [ ! 
-f "${OCF_RESKEY_config}" ]; then + ocf_exit_reason "Configuration file: ${OCF_RESKEY_config} not found" + exit $OCF_ERR_CONFIGURED + fi + + return $OCF_SUCCESS +} + + +# **************************** MAIN SCRIPT ************************************ + +# Make sure meta-data and usage always succeed +case $__OCF_ACTION in + meta-data) + vdo_meta_data + exit $OCF_SUCCESS + ;; + usage|help) + vdo_usage + exit $OCF_SUCCESS + ;; +esac + +# This OCF agent script need to be run as root user. +if ! ocf_is_root; then + echo "$0 agent script need to be run as root user." + ocf_log debug "$0 agent script need to be run as root user." + exit $OCF_ERR_GENERIC +fi + +if [ -z "${OCF_RESKEY_volume}" ]; then + OPTIONS="-a" +else + OPTIONS="-n ${OCF_RESKEY_volume}" +fi + +if [ -n "${OCF_RESKEY_config}" ]; then + OPTIONS="$OPTIONS -f ${OCF_RESKEY_config}" +fi + +# Translate each action into the appropriate function call +case $__OCF_ACTION in + start) + vdo_validate_all + vdo_start;; + stop) + vdo_stop;; + status|monitor) + vdo_monitor;; + validate-all) + ;; + *) + vdo_usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac + +exit $? + +# End of this script diff --git a/heartbeat/vmware b/heartbeat/vmware new file mode 100755 index 0000000..f784fb1 --- /dev/null +++ b/heartbeat/vmware @@ -0,0 +1,393 @@ +#!/bin/sh +# +# VMware OCF resource agent +# +# Copyright (c) 2010 Apra Sistemi s.r.l. +# All Rights Reserved. +# +# Description: Manages VMware server 2.0 virtual machines +# as High-Availability resources +# +# +# Author: Cristian Mammoli <c.mammoli AT apra DOT it> +# License: GNU General Public License (GPL) +# Copyright: (C) 2010 Apra Sistemi s.r.l. +# +# See usage() function below for more details... 
#
# OCF instance parameters:
# * OCF_RESKEY_vmxpath (mandatory: full path to the virtual machine vmx file)
# * OCF_RESKEY_vimshbin (optional: full path to the vmware-vim-cmd executable,
# fallback to default location if not declared)
#
# Requirements/caveats:
# * vmware-server 2.0 installed and autostarted on all nodes
# * vmdk files must be in the same directory of the vmx file
# * vmx filenames must be unique, even if stored in different directories
# * The default value of operation timeout (20 sec) isn't enough if you are
# dealing with many virtual machines: raise it to something around 600 secs
# or use operation attributes with the proposed values
# * Moving a vm among nodes will cause its mac address to change: if you need
# to preserve the mac address set it manually in the nic options
# * The script should be able to deal with paths and filenames with spaces,
# anyway try to avoid it

# Initialization
#################################################################

# Source ocf shell functions
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Basic variables configuration
OCF_RESKEY_vimshbin_default="/usr/bin/vmware-vim-cmd"
: ${OCF_RESKEY_vimshbin=${OCF_RESKEY_vimshbin_default}}
#################################################################

# Path to the virtual machine configuration file
VMXPATH="$OCF_RESKEY_vmxpath"

# Path to the vmware-vim-cmd executable
VIMSHBIN="$OCF_RESKEY_vimshbin"

# Global variables
VMXDIR=
RELVMXPATH=
VMID=
VM=
VMAUTOMSG=

# vmware-vim-cmd functions
#################################################################

# Get virtual machine vid
#   $1  basename of the vmx file
# Prints the first field of every "vmsvc/getallvms" line that contains
# "/<basename>".  The name is handed to awk through the environment and
# matched literally with index(): splicing it into a /regex/ made names
# containing regex metacharacters (or a "/") mismatch or break the program.
vmware_get_vid() {
    $VIMSHBIN vmsvc/getallvms \
        | VMW_VMX_NAME="/$1" awk 'index($0, ENVIRON["VMW_VMX_NAME"]) {print $1}'
}

# Is the vm waiting for input after a migration?
vmware_uuid_alt() {
    $VIMSHBIN vmsvc/message $1 \
        | awk '/^msg.uuid.altered/'
}

# Get message id
vmware_get_msgid() {
    $VIMSHBIN vmsvc/message $1 \
        | awk '/^Virtual machine message/ {print $4}' \
        | awk -F : '{print $1}'
}

# Answers message
vmware_answer_msg() {
    $VIMSHBIN vmsvc/message $1 $2 $3 >/dev/null
}

# Register a virtual machine
# NOTE(review): $1 is wrapped in literal double quotes but expanded
# unquoted, so a path with spaces is split into several arguments by the
# shell -- confirm how vmware-vim-cmd reassembles them before changing this.
vmware_register_vm() {
    $VIMSHBIN solo/registervm '"'$1'"' >/dev/null
}

# Unregister a virtual machine
vmware_unregister_vm() {
    $VIMSHBIN vmsvc/unregister $1 >/dev/null
}

# Start a virtual machine
vmware_poweron_vm() {
    $VIMSHBIN vmsvc/power.on $1 >/dev/null
}

# Suspend a virtual machine
vmware_suspend_vm() {
    $VIMSHBIN vmsvc/power.suspend $1 >/dev/null
}

# Get virtual machine power state
vmware_get_status() {
    $VIMSHBIN vmsvc/power.getstate $1 \
        | awk '/^Powered on/ || /^Powered off/ || /^Suspended/'
}

# Get vid of missing virtual machines
# Prints the text between the first pair of single quotes on every output
# line starting with "Skipping".  The sub-command is "getallvms", the same
# listing vmware_get_vid uses; it was previously spelled "getallvm".
vmware_get_broken() {
    $VIMSHBIN vmsvc/getallvms 2>&1 \
        | awk -F \' '/^Skipping/ {print $2}'
}

# Variables depending on the above functions
#################################################################

vmware_set_env() {
    # Directory containing the virtual machine
    VMXDIR="`dirname "$VMXPATH"`"

    # Basename of the configuration file
    RELVMXPATH="`basename "$VMXPATH"`"

    # Vid of the virtual machine (can be empty if the vm is not registered)
    VMID=`vmware_get_vid "$RELVMXPATH"`

    # Virtual machine name
    VM="`awk -F '"' '/^displayName/ {print $2}' "$VMXPATH"`"

    # msg.autoAnswer value in config file
    VMAUTOMSG="`awk -F '"' '/^msg.autoAnswer/ {print toupper($2)}' "$VMXPATH"`"
}

# Main functions
#################################################################

# Print usage summary
vmware_usage() {
    cat <<END
usage: $0 {start|stop|status|monitor|meta-data|validate-all}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Check for mandatory files presence and consistency
vmware_validate() {
    if [ -z "`pidof vmware-hostd`" ]; then
        ocf_log err "vmware-hostd is not running"
        exit $OCF_ERR_GENERIC
    fi

    if [ ! -x "$VIMSHBIN" ]; then
        ocf_log err "vmware-vim-cmd executable missing or not in path ($VIMSHBIN)"
        exit $OCF_ERR_ARGS
    fi

    if [ ! -f "$VMXPATH" ]; then
        ocf_log err "Specified vmx file ($VMXPATH) does not exist"
        exit $OCF_ERR_ARGS
    fi

    # Now we can safely setup variables...
    vmware_set_env

    # ... and verify them
    if [ -z "$VM" ]; then
        ocf_log err "Could not find out virtual machine name"
        exit $OCF_ERR_ARGS
    fi

    if [ "$VMAUTOMSG" != "TRUE" ]; then
        ocf_log warn "Please set msg.autoAnswer = \"TRUE\" in your config file"
    fi

    # $VMID is allowed to be empty in case we are validating a
    # virtual machine which is not registered

    return $OCF_SUCCESS
}

# More relaxed checking in case of probes
vmware_validate_probe() {
    if [ ! -x "$VIMSHBIN" ]; then
        ocf_log warn "vmware-vim-cmd executable missing or not in path ($VIMSHBIN)"
        exit $OCF_NOT_RUNNING
    fi

    if [ ! -f "$VMXPATH" ]; then
        ocf_log warn "Specified vmx file ($VMXPATH) does not exist"
        exit $OCF_NOT_RUNNING
    fi

    # Now we can safely setup variables...
    vmware_set_env
}

# Start a virtual machine
vmware_start() {
    # Don't start a VM if it's already running
    if vmware_monitor; then
        ocf_log info "Virtual machine $VM is already running"
        return $OCF_SUCCESS
    else
        # Removes stale lockfiles and missing virtual machines
        # in case of a crash.
        # Do not use with a clustered filesystem or you could
        # end up starting the same VM in more than one node
        ocf_log info "Removing stale lockfiles"
        find "$VMXDIR" -name \*.lck -type f -exec rm "{}" \;
        for BVM in `vmware_get_broken`; do
            ocf_log info "Unregistering missing virtual machine $BVM"
            vmware_unregister_vm $BVM
        done
        if [ -z "$VMID" ]; then
            # VM is not registered, need to register
            ocf_log info "Virtual machine $VM is not registered"
            ocf_log info "Registering Virtual machine $VM"
            vmware_register_vm "$VMXPATH"
            VMID=`vmware_get_vid "$RELVMXPATH"`
            if [ -z "$VMID" ]; then
                ocf_log err "Could not register virtual machine $VM"
                exit $OCF_ERR_GENERIC
            fi
            ocf_log info "Virtual machine $VM registered with ID $VMID"
        fi
        ocf_log info "Powering on virtual machine $VM"
        vmware_poweron_vm $VMID
        # Give the VM some time to initialize
        sleep 10

        if [ "$VMAUTOMSG" != "TRUE" ]; then
            # msg.autoAnswer is not set: we try to deal with the
            # most common question: msg.uuid.altered
            ocf_log info "Checking msg.uuid.altered on VM $VM"
            if [ -n "`vmware_uuid_alt $VMID`" ]; then
                MSGID=`vmware_get_msgid $VMID`
                vmware_answer_msg $VMID $MSGID 2
            fi
        fi

        # Check if the VM is running. We don't bother
        # with timeouts: we rely on the CRM for that.
        while :; do
            vmware_monitor && break
            ocf_log info "Virtual machine $VM is still stopped: delaying 10 seconds"
            sleep 10
        done

        ocf_log info "Virtual machine $VM is running"
        return $OCF_SUCCESS
    fi
}

# Stop a virtual machine
vmware_stop() {
    # Don't stop a VM if it's not registered
    if [ -z "$VMID" ]; then
        ocf_log info "Virtual machine $VM is not registered"
        return $OCF_SUCCESS
    else
        # Don't stop a VM if it's already stopped
        if vmware_monitor; then
            # If the VM is running send a suspend signal and wait
            # until it is off. We don't bother with timeouts: we
            # rely on the CRM for that.
            ocf_log info "Virtual machine $VM is running: suspending it"
            vmware_suspend_vm $VMID
            sleep 5
            while vmware_monitor; do
                ocf_log info "Virtual machine $VM is still running: delaying 10 seconds"
                sleep 10
            done
        else
            ocf_log info "Virtual machine $VM is already stopped"
        fi
        # VMware randomly fails to unregister VMs,
        # so we loop until we have success or timeout
        ocf_log info "Unregistering virtual machine $VM"
        vmware_unregister_vm $VMID
        VMID=`vmware_get_vid "$RELVMXPATH"`
        while [ -n "$VMID" ]; do
            ocf_log warn "Could not unregister virtual machine $VM: retrying."
            sleep 10
            vmware_unregister_vm $VMID
            VMID=`vmware_get_vid "$RELVMXPATH"`
        done
        ocf_log info "Virtual machine $VM is stopped"
        return $OCF_SUCCESS
    fi
}

# Monitor a virtual machine
vmware_monitor() {
    if [ -n "$VMID" ] && [ "`vmware_get_status $VMID`" = "Powered on" ]; then
        ocf_log debug "Virtual machine $VM (ID $VMID) is running..."
        return $OCF_SUCCESS
    else
        ocf_log debug "Virtual machine $VM is stopped/suspended/not registered"
        return $OCF_NOT_RUNNING
    fi
}

# Print metadata informations
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="vmware" version="0.2">
<version>1.0</version>
<longdesc lang="en">
OCF compliant script to control vmware server 2.0 virtual machines.
</longdesc>
<shortdesc lang="en">Manages VMWare Server 2.0 virtual machines</shortdesc>

<parameters>
<parameter name="vmxpath" unique="0" required="1">
<longdesc lang="en">
VMX configuration file path
</longdesc>
<shortdesc lang="en">VMX file path</shortdesc>
<content type="string"/>
</parameter>

<parameter name="vimshbin" unique="0" required="0">
<longdesc lang="en">
vmware-vim-cmd executable path
</longdesc>
<shortdesc lang="en">vmware-vim-cmd path</shortdesc>
<content type="string" default="${OCF_RESKEY_vimshbin_default}"/>
</parameter>
</parameters>

<actions>
<action name="start" timeout="600s" />
<action name="stop" timeout="600s" />
<action name="monitor" timeout="30s" interval="300s" depth="0"/>
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}

# See how we were called
#################################################################

case $1 in
meta-data)
    meta_data
    exit $OCF_SUCCESS
    ;;

start)
    vmware_validate
    vmware_start
    ;;

stop)
    vmware_validate
    vmware_stop
    ;;

status|monitor)
    if ocf_is_probe; then
        vmware_validate_probe
    else
        vmware_validate
    fi
    vmware_monitor
    ;;

usage|help)
    vmware_usage
    exit $OCF_SUCCESS
    ;;

validate-all)
    vmware_validate
    ;;

*)
    vmware_usage
    exit $OCF_ERR_UNIMPLEMENTED
    ;;

esac

exit $?
diff --git a/heartbeat/vsftpd.in b/heartbeat/vsftpd.in new file mode 100644 index 0000000..3831c4f --- /dev/null +++ b/heartbeat/vsftpd.in @@ -0,0 +1,259 @@
#!@BASH_SHELL@
#
# Resource script for vsftpd
#
# Description: Manages vsftpd as an OCF resource in
# an Active-Passive High Availability setup.
#
# Author: Michel Rode <rode@b1-systems.de> : vsftpd script
# License: GNU General Public License (GPLv2)
#
#
# usage: $0 {start|stop|status|monitor|validate-all|meta-data}
#
# The "start" arg starts vsftpd.
#
# The "stop" arg stops it.
#
# OCF parameters:
# OCF_RESKEY_binpath
# OCF_RESKEY_conffile
# OCF_RESKEY_pidfile
#
##########################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Parameter defaults

OCF_RESKEY_binpath_default="/usr/sbin/vsftpd"
OCF_RESKEY_conffile_default="/etc/vsftpd/vsftpd.conf"
OCF_RESKEY_pidfile_default="/var/run/vsftpd.pid"

: ${OCF_RESKEY_binpath=${OCF_RESKEY_binpath_default}}
: ${OCF_RESKEY_conffile=${OCF_RESKEY_conffile_default}}
: ${OCF_RESKEY_pidfile=${OCF_RESKEY_pidfile_default}}

USAGE="Usage: $0 {start|stop|status|monitor|validate-all|meta-data}";

##########################################################################

# Print the usage line on stderr.
usage()
{
    echo $USAGE >&2
}

# Print the OCF meta-data XML and exit with OCF_SUCCESS.
meta_data()
{
cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="vsftpd" version="1.0">
<version>1.0</version>
<longdesc lang="en">
This script manages vsftpd
</longdesc>
<shortdesc lang="en">Manages an vsftpd</shortdesc>

<parameters>

<parameter name="binpath">
<longdesc lang="en">
The vsftpd binary path.
For example, "/usr/sbin/vsftpd"
</longdesc>
<shortdesc lang="en">Full path to the vsftpd binary</shortdesc>
<content type="string" default="${OCF_RESKEY_binpath_default}"/>
</parameter>

<parameter name="conffile">
<longdesc lang="en">
The vsftpd configuration file name with full path.
For example, "/etc/vsftpd/vsftpd.conf"
</longdesc>
<shortdesc lang="en">Configuration file name with full path</shortdesc>
<content type="string" default="${OCF_RESKEY_conffile_default}" />
</parameter>

<parameter name="pidfile">
<longdesc lang="en">
The vsftpd pidfile with full path.
For example, "/var/run/vsftpd.pid"
</longdesc>
<shortdesc lang="en">PID file with full path</shortdesc>
<content type="string" default="${OCF_RESKEY_pidfile_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="20s"/>
<action name="stop" timeout="20s"/>
<action name="monitor" depth="0" timeout="20s" interval="60s" />
<action name="validate-all" timeout="20s"/>
<action name="meta-data" timeout="5s"/>
</actions>
</resource-agent>
END
exit $OCF_SUCCESS
}

# Copy the configured pidfile path into the PIDFILE global.
get_pidfile()
{
    PIDFILE=$OCF_RESKEY_pidfile
}

# Report the daemon state from the pidfile.
#   OCF_SUCCESS      pidfile names a process whose "ps -p" line mentions vsftpd
#   OCF_ERR_GENERIC  pidfile exists but is empty or names no such process
#   OCF_NOT_RUNNING  no pidfile
vsftpd_status()
{
    if [ -n "$PIDFILE" ] && [ -f "$PIDFILE" ]; then
        # vsftpd is probably running
        PID=`cat "$PIDFILE"`
        if [ -n "$PID" ]; then
            if ps -p $PID | grep vsftpd >/dev/null ; then
                ocf_log info "vsftpd daemon running"
                return $OCF_SUCCESS
            else
                ocf_log info "vsftpd daemon is not running but pid file exists"
                return $OCF_ERR_GENERIC
            fi
        else
            ocf_log err "PID file empty!"
            return $OCF_ERR_GENERIC
        fi
    fi

    # vsftpd is not running
    ocf_log info "vsftpd daemon is not running"
    return $OCF_NOT_RUNNING
}


# Start vsftpd and record its PID in the pidfile.  Always exits the script.
vsftpd_start()
{
    # if vsftpd is running return success
    vsftpd_status
    retVal=$?
    if [ $retVal -eq $OCF_SUCCESS ]; then
        exit $OCF_SUCCESS
    elif [ $retVal -ne $OCF_NOT_RUNNING ]; then
        ocf_log err "Error. Unknown status."
        exit $OCF_ERR_GENERIC
    fi

    if [ -n "$OCF_RESKEY_binpath" ]; then
        COMMAND="$OCF_RESKEY_binpath"
    fi
    if [ -n "$OCF_RESKEY_conffile" ]; then
        COMMAND="$COMMAND $OCF_RESKEY_conffile"
    fi

    $COMMAND;
    # Save the exit status first: inside the "if" below "$?" would be the
    # status of the "[" test, so the message always reported 0.
    retVal=$?
    if [ $retVal -ne 0 ]; then
        ocf_log err "Error. vsftpd returned error $retVal."
        exit $OCF_ERR_GENERIC
    fi

    # NOTE(review): pgrep without -f matches the process name, not the
    # command line, so a full path in binpath may never match -- confirm
    # against the pgrep in use before relying on this lookup.
    PID=$( pgrep $OCF_RESKEY_binpath )
    case $? in
        0)
            ocf_log info "PID file (pid:${PID} at $PIDFILE) created for vsftpd."
            ocf_log info "Started vsftpd."
            echo $PID > "$PIDFILE"
            exit $OCF_SUCCESS
            ;;
        1)
            rm -f "$PIDFILE" > /dev/null 2>&1
            # was "$Error getting pid.": an undefined variable that
            # swallowed the first word of the message
            ocf_log err "Error getting pid."
            exit $OCF_ERR_GENERIC
            ;;
        *)
            rm -f "$PIDFILE" > /dev/null 2>&1
            ocf_exit_reason "Error encountered detecting pid of vsftpd."
            exit $OCF_ERR_GENERIC
            ;;
    esac

}


# Stop vsftpd: SIGTERM first, SIGKILL if sending SIGTERM fails.
# Exits with OCF_SUCCESS unless both signals fail.
vsftpd_stop()
{
    if vsftpd_status ; then
        PID=`cat "$PIDFILE"`
        if [ -n "$PID" ] ; then
            kill $PID
            if [ $? -ne 0 ]; then
                kill -s KILL $PID
                if [ $? -ne 0 ]; then
                    ocf_log err "Error. Could not stop vsftpd daemon."
                    return $OCF_ERR_GENERIC
                fi
            fi
            rm "$PIDFILE" 2>/dev/null
        fi
    fi
    ocf_log info "Stopped vsftpd daemon."
    exit $OCF_SUCCESS
}

# Monitor is the same check as status.
vsftpd_monitor()
{
    vsftpd_status
}

# Check that the binary is available and the config file (if set) exists.
vsftpd_validate_all()
{
    check_binary $OCF_RESKEY_binpath

    if [ -n "$OCF_RESKEY_conffile" ] && [ ! -f "$OCF_RESKEY_conffile" ]; then
        ocf_log err "Config file $OCF_RESKEY_conffile does not exist."
        exit $OCF_ERR_ARGS
    fi

    return $OCF_SUCCESS
}


#
# Main
#

if [ $# -ne 1 ]; then
    usage
    exit $OCF_ERR_ARGS
fi

case $1 in
    start)
        get_pidfile
        vsftpd_start
        ;;

    stop)
        get_pidfile
        vsftpd_stop
        ;;

    status)
        get_pidfile
        vsftpd_status
        ;;

    monitor)
        get_pidfile
        vsftpd_monitor
        ;;

    validate-all)
        vsftpd_validate_all
        ;;

    meta-data)
        meta_data
        ;;

    usage)
        usage
        exit $OCF_SUCCESS
        ;;

    *)
        usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac

diff --git a/heartbeat/zabbixserver b/heartbeat/zabbixserver new file mode 100755 index 0000000..cb9e023 --- /dev/null +++ b/heartbeat/zabbixserver @@ -0,0 +1,315 @@
#!/bin/sh
#
#
# zabbixserver OCF RA for zabbix_server daemon
#
# Copyright (c) 2012 Krzysztof Gajdemski <songo@debian.org.pl>
# All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#

#######################################################################
# Initialization:

: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"

#######################################################################

#
# Defaults
#
OCF_RESKEY_binary_default="zabbix_server"
OCF_RESKEY_pid_default="/var/run/zabbix-server/zabbix_server.pid"
OCF_RESKEY_config_default=""

: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}

# sleep interval when waiting for threads cleanup
sleepint=1

#
# Functions
#

#
# Print the resource agent metadata (XML) on stdout.
# The here-document is intentionally unquoted so that the
# OCF_RESKEY_*_default values are expanded into the output.
#
zabbixserver_meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="zabbixserver" version="0.0.1">
<version>1.0</version>

<longdesc lang="en">
This is a Zabbix server Resource Agent for zabbix_server monitoring
daemon. See: http://www.zabbix.com/
</longdesc>
<shortdesc lang="en">Zabbix server resource agent</shortdesc>

<parameters>

<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the zabbix_server binary.
</longdesc>
<shortdesc lang="en">Zabbix server binary</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>

<parameter name="pid" unique="1" required="0">
<longdesc lang="en">
Path to zabbix_server pidfile. As it's created by daemon itself
it must be the same as specified in the Zabbix configuration file
with parameter 'PidFile='.
</longdesc>
<shortdesc lang="en">Path to pidfile</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_default}" />
</parameter>

<parameter name="config" unique="1" required="0">
<longdesc lang="en">
Path to zabbix_server configuration file. Assumed server default
if not specified.
</longdesc>
<shortdesc lang="en">Path to configuration file</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="20s" />
<action name="stop" timeout="20s" />
<action name="monitor" timeout="20s" interval="10s" depth="0"/>
<action name="validate-all" timeout="20s" />
<action name="meta-data" timeout="5s" />
</actions>
</resource-agent>
END
}

#######################################################################


#
# Print a short usage message on stdout.
#
zabbixserver_usage() {
    cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

#
# Get an actual PID from a given pidfile.
#
# Arguments: $1 - path to the pidfile
# Output:    the PID (decimal digits only) on stdout
# Returns:   0 if a PID was found, 1 if the pidfile does not exist
#            or its first line does not hold a decimal number
#
getpid() {
    local pidfile
    local pid

    pidfile=$1

    # The argument must be quoted: an unquoted empty value turns the
    # test into "[ -f ]", which is true, and sed would then read stdin.
    [ -f "$pidfile" ] || return 1

    # Only the first line is considered; surrounding whitespace is
    # stripped so that callers can safely pass the result quoted.
    pid=`sed -n '1 { s/^[[:space:]]*\([0-9][0-9]*\)[[:space:]]*$/\1/p; }' "$pidfile"`
    [ -n "$pid" ] || return 1

    printf '%s\n' "$pid"
    return 0
}

#
# Check for the server configuration file
#
# Arguments: $1 - path to the configuration file (may be empty)
# Returns:   0 if no file was specified, if the file exists, or if it
#            is missing while ocf_is_probe succeeds; 1 otherwise
#
check_config() {
    # check only when it is specified by user
    if [ -n "$1" ] && [ ! -f "$1" ]; then
        if ocf_is_probe; then
            # a missing file is only logged during a probe
            ocf_log info "Can't read configuration file $1 during probe"
        else
            ocf_exit_reason "Can't read configuration file $1"
            return 1
        fi
    fi

    return 0
}

#
# Start Zabbix daemon
#
# Runs $OCF_RESKEY_binary through ocf_run, adding "--config <file>"
# when OCF_RESKEY_config is set. Returns the status of ocf_run.
#
startserver() {
    # Arguments are passed individually (not via a word-split command
    # string) so that a configuration path containing whitespace
    # reaches the daemon as a single argument.
    if [ -n "$OCF_RESKEY_config" ]; then
        ocf_log debug "Starting server using command: $OCF_RESKEY_binary --config $OCF_RESKEY_config"
        ocf_run "$OCF_RESKEY_binary" --config "$OCF_RESKEY_config"
    else
        ocf_log debug "Starting server using command: $OCF_RESKEY_binary"
        ocf_run "$OCF_RESKEY_binary"
    fi
}

#
# Check the process status (PID is given as an argument)
#
# Arguments: $1 - PID to probe
# Returns:   0 if "kill -s 0" on the PID succeeds, non-zero otherwise
#
process_status() {
    local pid

    pid=$1

    # without a PID there is nothing to probe
    [ -n "$pid" ] || return 1

    # check if parent process is running
    ocf_run -q kill -s 0 "$pid" > /dev/null 2>&1
}

#
# start the agent
#
# Returns $OCF_SUCCESS once zabbixserver_monitor reports the resource
# as running, $OCF_ERR_GENERIC if the daemon could not be launched.
#
zabbixserver_start() {
    local rc

    # check the resource status
    zabbixserver_monitor
    rc=$?
    case "$rc" in
        $OCF_SUCCESS)
            ocf_log info "Resource is already running"
            return $OCF_SUCCESS
            ;;
        $OCF_NOT_RUNNING)
            ;;
        *)
            exit $OCF_ERR_GENERIC
            ;;
    esac

    # remove stale pidfile if it exists
    if [ -f "$OCF_RESKEY_pid" ]; then
        ocf_log info "Removing stale pidfile"
        rm -f "$OCF_RESKEY_pid"
    fi

    startserver
    if [ $? -ne 0 ]; then
        ocf_exit_reason "Can't start Zabbix server"
        return $OCF_ERR_GENERIC
    fi

    # wait if it starts really
    # NOTE: this loop has no upper bound of its own; it ends only when
    # the monitor succeeds (or the caller interrupts the action).
    while ! zabbixserver_monitor; do
        ocf_log debug "Resource has not started yet, waiting"
        sleep $sleepint
    done

    return $OCF_SUCCESS
}

#
# stop the agent
#
# Returns $OCF_SUCCESS once the process is gone and the monitor reports
# the resource as not running, $OCF_ERR_GENERIC if the PID cannot be
# determined or the signal cannot be delivered.
#
zabbixserver_stop() {
    local pid
    local rc

    # check the resource status
    zabbixserver_monitor
    rc=$?
    case "$rc" in
        $OCF_SUCCESS)
            ;;
        $OCF_NOT_RUNNING)
            ocf_log info "Resource is already stopped"
            return $OCF_SUCCESS
            ;;
        *)
            exit $OCF_ERR_GENERIC
            ;;
    esac

    pid=`getpid "$OCF_RESKEY_pid"`
    if [ $? -ne 0 ]; then
        ocf_exit_reason "Can't find process PID"
        return $OCF_ERR_GENERIC
    fi

    # kill the process
    ocf_run -q kill "$pid"
    if [ $? -ne 0 ]; then
        ocf_exit_reason "Can't stop process (PID $pid)"
        return $OCF_ERR_GENERIC
    fi

    # Wait until the parent process terminates.
    # NOTE: The parent may be still waiting for its children. A regular monitor
    # function will not detect this condition because the pidfile may be
    # removed just now.
    while process_status "$pid"; do
        ocf_log debug "Waiting for process to terminate..."
        sleep $sleepint
    done

    # wait if it stops really
    while zabbixserver_monitor; do
        ocf_log debug "Resource has not stopped yet, waiting"
        sleep $sleepint
    done

    # remove stale pidfile if it exists
    if [ -f "$OCF_RESKEY_pid" ]; then
        ocf_log debug "Pidfile still exists, removing"
        rm -f "$OCF_RESKEY_pid"
    fi

    return $OCF_SUCCESS
}

#
# resource monitor
#
# Returns $OCF_SUCCESS when the pidfile yields a PID and that process
# answers "kill -s 0", $OCF_NOT_RUNNING otherwise.
#
zabbixserver_monitor() {
    local pid

    pid=`getpid "$OCF_RESKEY_pid"`
    if [ $? -eq 0 ]; then
        process_status "$pid"
        if [ $? -eq 0 ]; then
            ocf_log debug "Resource is running"
            return $OCF_SUCCESS
        fi
    fi

    ocf_log info "Resource is not running"
    return $OCF_NOT_RUNNING
}

#
# validate configuration
#
# Returns $OCF_SUCCESS when check_config accepts the configuration file
# and ocf_mkstatedir succeeds for the pidfile directory,
# $OCF_ERR_INSTALLED otherwise.
#
zabbixserver_validate_all() {
    check_config "$OCF_RESKEY_config" || return $OCF_ERR_INSTALLED
    ocf_mkstatedir root 755 "`dirname "$OCF_RESKEY_pid"`" || return $OCF_ERR_INSTALLED
    return $OCF_SUCCESS
}

#
# main
#
OCF_REQUIRED_PARAMS=""
OCF_REQUIRED_BINARIES="$OCF_RESKEY_binary"
# "$@" (not $*) keeps each original argument intact
ocf_rarun "$@"