Diffstat (limited to 'heartbeat/Raid1')
-rwxr-xr-x  heartbeat/Raid1  586
1 file changed, 586 insertions, 0 deletions
diff --git a/heartbeat/Raid1 b/heartbeat/Raid1
new file mode 100755
index 0000000..924d94c
--- /dev/null
+++ b/heartbeat/Raid1
@@ -0,0 +1,586 @@
+#!/bin/sh
+#
+#
+# License: GNU General Public License (GPL)
+# Support: users@clusterlabs.org
+#
+# Raid1
+# Description: Manages a Linux software RAID device on a shared storage medium.
+# Original Author: Eric Z. Ayers (eric.ayers@compgen.com)
+# Original Release: 25 Oct 2000
+# RAID patches: http://people.redhat.com/mingo/raid-patches/
+# Word to the Wise: http://lwn.net/2000/0810/a/raid-faq.php3
+# Sympathetic Ear: mailto:linux-raid@vger.kernel.org
+#
+# usage: $0 {start|stop|status|monitor|validate-all|usage|meta-data}
+#
+#
+# EXAMPLE config file /etc/raidtab.md0
+# This file must exist on both machines!
+#
+# raiddev /dev/md0
+# raid-level 1
+# nr-raid-disks 2
+# chunk-size 64k
+# persistent-superblock 1
+# #nr-spare-disks 0
+# device /dev/sda1
+# raid-disk 0
+# device /dev/sdb1
+# raid-disk 1
+#
+# EXAMPLE config file /etc/mdadm.conf (for more info: man mdadm.conf)
+#
+# DEVICE /dev/sdb1 /dev/sdc1
+# ARRAY /dev/md0 UUID=4a865b55:ba27ef8d:29cd5701:6fb42799
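+#
+# EXAMPLE cluster configuration (crm shell syntax; a minimal sketch only,
+# the resource id and parameter values are placeholders to adapt):
+#
+# primitive p_raid1 ocf:heartbeat:Raid1 \
+#     params raidconf="/etc/mdadm.conf" raiddev="/dev/md0" \
+#     op monitor interval=10s timeout=20s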
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# Parameter defaults
+
+OCF_RESKEY_raidconf_default=""
+OCF_RESKEY_raiddev_default=""
+OCF_RESKEY_homehost_default=""
+OCF_RESKEY_force_stop_default="true"
+OCF_RESKEY_udev_default="true"
+OCF_RESKEY_force_clones_default="false"
+
+: ${OCF_RESKEY_raidconf=${OCF_RESKEY_raidconf_default}}
+: ${OCF_RESKEY_raiddev=${OCF_RESKEY_raiddev_default}}
+: ${OCF_RESKEY_homehost=${OCF_RESKEY_homehost_default}}
+: ${OCF_RESKEY_force_stop=${OCF_RESKEY_force_stop_default}}
+: ${OCF_RESKEY_udev=${OCF_RESKEY_udev_default}}
+: ${OCF_RESKEY_force_clones=${OCF_RESKEY_force_clones_default}}
+
+# Tools used by this agent; mdadm is preferred, raidtools (raidstart,
+# raidstop) are a deprecated fallback.
+MDADM=${MDADM:-mdadm}
+RAIDSTART=${RAIDSTART:-raidstart}
+RAIDSTOP=${RAIDSTOP:-raidstop}
+
+#######################################################################
+
+usage() {
+ cat <<-EOT
+ usage: $0 {start|stop|status|monitor|validate-all|usage|meta-data}
+ EOT
+}
+
+meta_data() {
+ cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="Raid1" version="1.0">
+<version>1.0</version>
+
+<longdesc lang="en">
+This resource agent manages Linux software RAID (MD) devices on
+a shared storage medium. It uses mdadm(8) to start, stop, and
+monitor the MD devices. Raidtools are supported, but deprecated.
+See https://raid.wiki.kernel.org/index.php/Linux_Raid for more
+information.
+</longdesc>
+<shortdesc lang="en">Manages Linux software RAID (MD) devices on shared storage</shortdesc>
+
+<parameters>
+<parameter name="raidconf" unique="0" required="1">
+<longdesc lang="en">
+The RAID configuration file, e.g. /etc/mdadm.conf.
+</longdesc>
+<shortdesc lang="en">RAID config file</shortdesc>
+<content type="string" default="${OCF_RESKEY_raidconf_default}" />
+</parameter>
+
+<parameter name="raiddev" unique="0" required="1">
+<longdesc lang="en">
+One or more block devices to use, space separated. Alternatively,
+set to "auto" to manage all devices specified in raidconf.
+</longdesc>
+<shortdesc lang="en">block device</shortdesc>
+<content type="string" default="${OCF_RESKEY_raiddev_default}" />
+</parameter>
+
+<parameter name="homehost" unique="0" required="0">
+<longdesc lang="en">
+The value for the homehost directive; this is an mdadm feature to
+protect RAIDs against being activated by accident. It is recommended to
+create RAIDs managed by the cluster with "homehost" set to a special
+value, so they are not accidentally auto-assembled by nodes not
+supposed to own them.
+</longdesc>
+<shortdesc lang="en">Homehost for mdadm</shortdesc>
+<content type="string" default="${OCF_RESKEY_homehost_default}" />
+</parameter>
+
+<parameter name="force_stop" unique="0" required="0">
+<longdesc lang="en">
+If processes or kernel threads are using the array, it cannot be
+stopped. We will try to stop processes, first by sending TERM and
+then, if that doesn't help in $PROC_CLEANUP_TIME seconds, using KILL.
+The lsof(8) program is required to get the list of array users.
+Of course, the kernel threads cannot be stopped this way.
+If the processes are critical for data integrity, then set this
+parameter to false. Note that in that case the stop operation
+will fail and the node will be fenced.
+</longdesc>
+<shortdesc lang="en">force stop processes using the array</shortdesc>
+<content type="boolean" default="${OCF_RESKEY_force_stop_default}" />
+</parameter>
+
+<parameter name="udev" unique="0" required="0">
+<longdesc lang="en">
+Wait until udevd creates the device in the start operation. On a
+normally loaded host this should happen quickly, but you may be
+unlucky. If you are not using udev, set this to "false".
+</longdesc>
+<shortdesc lang="en">udev</shortdesc>
+<content type="boolean" default="${OCF_RESKEY_udev_default}" />
+</parameter>
+
+<parameter name="force_clones">
+<longdesc lang="en">
+Activating the same md RAID array on multiple nodes at the same time
+will result in data corruption and thus is forbidden by default.
+
+The only safe use is an array that merely has the same name on all
+nodes, but is in fact backed by distinct (non-shared) storage on
+each node.
+
+Only set this to "true" if you know what you are doing!
+</longdesc>
+<shortdesc lang="en">force ability to run as a clone</shortdesc>
+<content type="boolean" default="${OCF_RESKEY_force_clones_default}" />
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start" timeout="20s" />
+<action name="stop" timeout="20s" />
+<action name="status" depth="0" timeout="20s" interval="10s" />
+<action name="monitor" depth="0" timeout="20s" interval="10s" />
+<action name="validate-all" timeout="5s" />
+<action name="meta-data" timeout="5s" />
+</actions>
+</resource-agent>
+END
+}
+
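+# Wait until udevd has processed its event queue (a no-op when udev is
+# not in use); extra arguments are passed through to udevadm settle.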
+udev_settle() {
+ if ocf_is_true $WAIT_FOR_UDEV; then
+ udevadm settle $*
+ fi
+}
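+# Print the device name (second field) of every ARRAY line in
+# $RAIDCONF; bail out if the config file has disappeared.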
+list_conf_arrays() {
+ test -f $RAIDCONF || {
+ ocf_exit_reason "$RAIDCONF gone missing!"
+ exit $OCF_ERR_GENERIC
+ }
+ grep ^ARRAY $RAIDCONF | awk '{print $2}'
+}
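+# Run $1 once per device in $RAIDDEVS. With "all" as $2, every device
+# is visited and the exit codes are OR-ed together; otherwise return
+# at the first non-zero result.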
+forall() {
+ local func=$1
+ local checkall=$2
+ local mddev rc=0
+ for mddev in $RAIDDEVS; do
+ $func $mddev
+ rc=$(($rc | $?))
+ [ "$checkall" = all ] && continue
+ [ $rc -ne 0 ] && return $rc
+ done
+ return $rc
+}
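+# Succeed only if raid1_monitor_one reports OCF_NOT_RUNNING for every array.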
+are_arrays_stopped() {
+ local rc mddev
+ for mddev in $RAIDDEVS; do
+ raid1_monitor_one $mddev
+ rc=$?
+ [ $rc -ne $OCF_NOT_RUNNING ] && break
+ done
+ test $rc -eq $OCF_NOT_RUNNING
+}
+
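+# Assemble one array from $RAIDCONF and wait for udev to create its
+# device node.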
+md_assemble() {
+ local mddev=$1
+ $MDADM --assemble $mddev --config=$RAIDCONF $MDADM_HOMEHOST
+ udev_settle --exit-if-exists=$mddev
+}
+#
+# START: Start up the RAID device
+#
+raid1_start() {
+ local rc
+ raid1_monitor
+ rc=$?
+ if [ $rc -eq $OCF_SUCCESS ]; then
+ # md already online, nothing to do.
+ return $OCF_SUCCESS
+ fi
+ if [ $rc -ne $OCF_NOT_RUNNING ]; then
+ # If the array is in a broken state, this agent doesn't
+ # know how to repair that.
+ ocf_exit_reason "$RAIDDEVS in a broken state; cannot start (rc=$rc)"
+ return $OCF_ERR_GENERIC
+ fi
+
+ if [ $HAVE_RAIDTOOLS = "true" ]; then
+ # Run raidstart to start up the RAID array
+ $RAIDSTART --configfile $RAIDCONF $MDDEV
+ else
+ forall md_assemble all
+ fi
+
+ raid1_monitor
+ if [ $? -eq $OCF_SUCCESS ]; then
+ return $OCF_SUCCESS
+ else
+ ocf_exit_reason "Couldn't start RAID for $RAIDDEVS"
+ return $OCF_ERR_GENERIC
+ fi
+}
+
+#
+# STOP: stop the RAID device
+#
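+# If an array cannot be stopped, marking it read-only at least
+# prevents further writes from this node.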
+mark_readonly() {
+ local mddev=$1
+ local rc
+ ocf_log info "Attempting to mark array $mddev readonly"
+ $MDADM --readonly $mddev --config=$RAIDCONF
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ ocf_exit_reason "Failed to set $mddev readonly (rc=$rc)"
+ fi
+ return $rc
+}
+mknod_raid1_stop() {
+ # first create a block device file, then try to stop the
+ # array
+ local rc n tmp_block_file
+ n=`echo $1 | sed 's/[^0-9]*//'`
+ if ! ocf_is_decimal "$n"; then
+ ocf_log warn "could not get the minor device number from $1"
+ return 1
+ fi
+ tmp_block_file="$HA_RSCTMP/${OCF_RESOURCE_INSTANCE}-`basename $1`"
+ rm -f $tmp_block_file
+ ocf_log info "block device file $1 missing, creating one in order to stop the array"
+ mknod $tmp_block_file b 9 $n
+ $MDADM --stop $tmp_block_file --config=$RAIDCONF
+ rc=$?
+ rm -f $tmp_block_file
+ return $rc
+}
+raid1_stop_one() {
+ ocf_log info "Stopping array $1"
+ if [ -b "$1" ]; then
+ $MDADM --stop $1 --config=$RAIDCONF &&
+ return
+ else
+ # newer mdadm releases can stop arrays when given the
+ # basename; try that first
+ $MDADM --stop `basename $1` --config=$RAIDCONF &&
+ return
+ # otherwise create a block device file
+ mknod_raid1_stop $1
+ fi
+}
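+# Print the unique PIDs of processes holding $1 open (on stdout) and
+# log every lsof line as a warning.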
+get_users_pids() {
+ local mddev=$1
+ local outp l
+ ocf_log debug "running lsof to list $mddev users..."
+ outp=`lsof $mddev | tail -n +2`
+ echo "$outp" | awk '{print $2}' | sort -u
+ echo "$outp" | while read l; do
+ ocf_log warn "$l"
+ done
+}
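+# Stop the processes holding any array open: TERM first, then KILL
+# after $PROC_CLEANUP_TIME seconds. Returns 2 if lsof found no users.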
+stop_raid_users() {
+ local pids
+ pids=`forall get_users_pids all | sort -u`
+ if [ -z "$pids" ]; then
+ ocf_log warn "lsof reported no users holding arrays"
+ return 2
+ else
+ ocf_stop_processes TERM $PROC_CLEANUP_TIME $pids
+ fi
+}
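+# Stop all managed arrays with whichever tool set is available.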
+stop_arrays() {
+ if [ $HAVE_RAIDTOOLS = "true" ]; then
+ $RAIDSTOP --configfile $RAIDCONF $MDDEV
+ else
+ forall raid1_stop_one all
+ fi
+}
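+# Diagnostics after a failed stop: log the users of each device via
+# lsof and the sysfs holders directory.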
+showusers() {
+ local disk
+ for disk; do
+ if have_binary lsof; then
+ ocf_log info "running lsof to list $disk users..."
+ ocf_run -warn lsof $disk
+ fi
+ if [ -d /sys/block/$disk/holders ]; then
+ ocf_log info "ls -l /sys/block/$disk/holders"
+ ocf_run -warn ls -l /sys/block/$disk/holders
+ fi
+ done
+}
+raid1_stop() {
+ local rc
+ # See if the MD device is already cleanly stopped:
+ if are_arrays_stopped; then
+ return $OCF_SUCCESS
+ fi
+
+ # Turn off raid
+ if ! stop_arrays; then
+ if ocf_is_true $FORCESTOP; then
+ if have_binary lsof; then
+ stop_raid_users
+ case $? in
+ 2) false;;
+ *) stop_arrays;;
+ esac
+ else
+ ocf_log warn "install lsof(8) to list users holding the disk"
+ false
+ fi
+ else
+ false
+ fi
+ fi
+ rc=$?
+
+ if [ $rc -ne 0 ]; then
+ ocf_log warn "Couldn't stop RAID for $RAIDDEVS (rc=$rc)"
+ showusers $RAIDDEVS
+ if [ $HAVE_RAIDTOOLS != "true" ]; then
+ forall mark_readonly all
+ fi
+ return $OCF_ERR_GENERIC
+ fi
+
+ if are_arrays_stopped; then
+ return $OCF_SUCCESS
+ fi
+
+ ocf_exit_reason "RAID $RAIDDEVS still active after stop command!"
+ return $OCF_ERR_GENERIC
+}
+
+#
+# monitor: a less noisy status
+#
+raid1_monitor_one() {
+ local mddev=$1
+ local md=
+ local rc
+ local TRY_READD=0
+ local pbsize
+ # check if the md device exists first
+ # but not if we are in the stop operation
+ # device existence is important only for the running arrays
+ if [ "$__OCF_ACTION" != "stop" ]; then
+ if [ -h "$mddev" ]; then
+ md=$(ls -l $mddev | awk -F'/' '{print $NF}')
+ elif [ -b "$mddev" ]; then
+ md=$(echo $mddev | sed 's,/dev/,,')
+ else
+ ocf_log info "$mddev is not a block device"
+ return $OCF_NOT_RUNNING
+ fi
+ fi
+ if ! grep -e "^$md[ \t:]" /proc/mdstat >/dev/null ; then
+ ocf_log info "$md not found in /proc/mdstat"
+ return $OCF_NOT_RUNNING
+ fi
+ if [ $HAVE_RAIDTOOLS != "true" ]; then
+ $MDADM --detail --test $mddev >/dev/null 2>&1 ; rc=$?
+ case $rc in
+ 0) ;;
+ 1) ocf_log warn "$mddev has at least one failed device."
+ TRY_READD=1
+ ;;
+ 2) ocf_exit_reason "$mddev has failed."
+ return $OCF_ERR_GENERIC
+ ;;
+ 4)
+ if [ "$__OCF_ACTION" = "stop" ] ; then
+ # There may be a transient invalid device after
+ # we stop MD due to uevent processing, the
+ # original device is stopped though.
+ return $OCF_NOT_RUNNING
+ else
+ ocf_exit_reason "mdadm failed on $mddev."
+ return $OCF_ERR_GENERIC
+ fi
+ ;;
+ *) ocf_exit_reason "mdadm returned an unknown result ($rc)."
+ return $OCF_ERR_GENERIC
+ ;;
+ esac
+ fi
+ if [ "$__OCF_ACTION" = "monitor" -a "$OCF_RESKEY_CRM_meta_interval" != 0 \
+ -a $TRY_READD -eq 1 -a $OCF_CHECK_LEVEL -gt 0 ]; then
+ ocf_log info "Attempting recovery sequence to re-add devices on $mddev:"
+ $MDADM $mddev --fail detached
+ $MDADM $mddev --remove failed
+ $MDADM $mddev --re-add missing
+ # There is nothing more we can do at this stage: either
+ # the re-add worked or it did not.
+ fi
+
+ pbsize=`(blockdev --getpbsz $mddev || stat -c "%o" $mddev) 2>/dev/null`
+ if [ -z "$pbsize" ]; then
+ ocf_log warn "both blockdev and stat could not get the block size (will use 4k)"
+ pbsize=4096 # try with 4k
+ fi
+ if ! dd if=$mddev count=1 bs=$pbsize of=/dev/null \
+ iflag=direct >/dev/null 2>&1 ; then
+ ocf_exit_reason "$mddev: I/O error on read"
+ return $OCF_ERR_GENERIC
+ fi
+
+ return $OCF_SUCCESS
+}
+
+raid1_monitor() {
+ forall raid1_monitor_one
+}
+
+#
+# STATUS: is the raid device online or offline?
+#
+raid1_status() {
+ # See if the MD device is online
+ local rc
+ raid1_monitor
+ rc=$?
+ if [ $rc -ne $OCF_SUCCESS ]; then
+ echo "stopped"
+ else
+ echo "running"
+ fi
+ return $rc
+}
+
+raid1_validate_all() {
+ return $OCF_SUCCESS
+}
+
+PROC_CLEANUP_TIME=3
+
+if [ $# -ne 1 ]; then
+ usage
+ exit $OCF_ERR_ARGS
+fi
+
+case "$1" in
+ meta-data)
+ meta_data
+ exit $OCF_SUCCESS
+ ;;
+ usage)
+ usage
+ exit $OCF_SUCCESS
+ ;;
+ *)
+ ;;
+esac
+
+RAIDCONF="$OCF_RESKEY_raidconf"
+MDDEV="$OCF_RESKEY_raiddev"
+FORCESTOP="${OCF_RESKEY_force_stop}"
+WAIT_FOR_UDEV="${OCF_RESKEY_udev}"
+
+if [ -z "$RAIDCONF" ] ; then
+ ocf_exit_reason "Please set OCF_RESKEY_raidconf!"
+ exit $OCF_ERR_CONFIGURED
+fi
+
+if [ ! -r "$RAIDCONF" ] ; then
+ ocf_exit_reason "Configuration file [$RAIDCONF] does not exist, or can not be opened!"
+ exit $OCF_ERR_INSTALLED
+fi
+
+if [ -z "$MDDEV" ] ; then
+ ocf_exit_reason "Please set OCF_RESKEY_raiddev to the Raid device you want to control!"
+ exit $OCF_ERR_CONFIGURED
+fi
+
+if ocf_is_clone && ! ocf_is_true "$OCF_RESKEY_force_clones"; then
+ ocf_exit_reason "md RAID arrays are NOT safe to run as a clone!"
+ ocf_log err "Please read the comment on the force_clones parameter."
+ exit $OCF_ERR_CONFIGURED
+fi
+
+if ocf_is_true $WAIT_FOR_UDEV && ! have_binary udevadm; then
+ if [ "$__OCF_ACTION" = "start" ]; then
+ ocf_log warn "either install udevadm or set udev to false"
+ ocf_log info "setting udev to false!"
+ fi
+ WAIT_FOR_UDEV=0
+fi
+
+if ! ocf_is_true $WAIT_FOR_UDEV; then
+ export MDADM_NO_UDEV=1
+fi
+
+if ocf_is_true $FORCESTOP && ! have_binary lsof; then
+ ocf_log warn "Please install lsof(8), we may need it when stopping Raid device! Now continuing anyway ..."
+fi
+
+HAVE_RAIDTOOLS=false
+if have_binary $MDADM >/dev/null 2>&1 ; then
+ if [ -n "$OCF_RESKEY_homehost" ]; then
+ MDADM_HOMEHOST="--homehost=${OCF_RESKEY_homehost}"
+ else
+ MDADM_HOMEHOST=""
+ fi
+else
+ check_binary $RAIDSTART
+ HAVE_RAIDTOOLS=true
+fi
+
+if [ $HAVE_RAIDTOOLS = true ]; then
+ if [ "$MDDEV" = "auto" ]; then
+ ocf_exit_reason "autoconf supported only with mdadm!"
+ exit $OCF_ERR_INSTALLED
+ elif [ `echo $MDDEV|wc -w` -gt 1 ]; then
+ ocf_exit_reason "multiple devices supported only with mdadm!"
+ exit $OCF_ERR_INSTALLED
+ fi
+fi
+
+if [ "$MDDEV" = "auto" ]; then
+ RAIDDEVS=`list_conf_arrays`
+else
+ RAIDDEVS="$MDDEV"
+fi
+
+# At this stage,
+# [ $HAVE_RAIDTOOLS = false ] <=> we have $MDADM,
+# otherwise we have raidtools (raidstart and raidstop)
+
+# Look for how we are called
+case "$1" in
+ start)
+ raid1_start
+ ;;
+ stop)
+ raid1_stop
+ ;;
+ status)
+ raid1_status
+ ;;
+ monitor)
+ raid1_monitor
+ ;;
+ validate-all)
+ raid1_validate_all
+ ;;
+ *)
+ usage
+ exit $OCF_ERR_UNIMPLEMENTED
+ ;;
+esac
+
+exit $?