summaryrefslogtreecommitdiffstats
path: root/heartbeat/ovsmonitor
diff options
context:
space:
mode:
Diffstat (limited to 'heartbeat/ovsmonitor')
-rwxr-xr-xheartbeat/ovsmonitor469
1 files changed, 469 insertions, 0 deletions
diff --git a/heartbeat/ovsmonitor b/heartbeat/ovsmonitor
new file mode 100755
index 0000000..6765da4
--- /dev/null
+++ b/heartbeat/ovsmonitor
@@ -0,0 +1,469 @@
+#!/bin/sh
+#
+# OCF Resource Agent compliant script.
+# Monitor the vitality of a local OpenVSwitch bond.
+#
+# Based on the work by Alexander Krauth.
+#
+# Transfered from ethmonitor into ovsmonitor by Mathieu Grzybek.
+#
+# Copyright (c) 2017 Robert Euhus, Alexander Krauth, Lars Marowsky-Bré
+# Mathieu Grzybek
+# All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like. Any license provided herein, whether implied or
+# otherwise, applies only to this software file. Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# OCF parameters are as below
+#
+# OCF_RESKEY_bond
+# OCF_RESKEY_bridge
+# OCF_RESKEY_multiplicator
+# OCF_RESKEY_name
+# OCF_RESKEY_repeat_count
+# OCF_RESKEY_repeat_interval
+# OCF_RESKEY_pktcnt_timeout
+#
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# Parameter defaults
+
+OCF_RESKEY_bond_default=""
+OCF_RESKEY_bridge_default=""
+OCF_RESKEY_name_default=""
+OCF_RESKEY_multiplier_default="1"
+OCF_RESKEY_repeat_count_default="5"
+OCF_RESKEY_repeat_interval_default="10"
+OCF_RESKEY_pktcnt_timeout_default="5"
+OCF_RESKEY_link_status_only_default="false"
+
+: ${OCF_RESKEY_bond=${OCF_RESKEY_bond_default}}
+: ${OCF_RESKEY_bridge=${OCF_RESKEY_bridge_default}}
+: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}}
+: ${OCF_RESKEY_multiplier=${OCF_RESKEY_multiplier_default}}
+: ${OCF_RESKEY_repeat_count=${OCF_RESKEY_repeat_count_default}}
+: ${OCF_RESKEY_repeat_interval=${OCF_RESKEY_repeat_interval_default}}
+: ${OCF_RESKEY_pktcnt_timeout=${OCF_RESKEY_pktcnt_timeout_default}}
+: ${OCF_RESKEY_link_status_only=${OCF_RESKEY_link_status_only_default}}
+
+#######################################################################
+
+meta_data() {
+ cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="ovsmonitor" version="0.1">
+<version>1.0</version>
+
+<longdesc lang="en">
+Monitor the vitality of a local ovs bond.
+
+You may set up this RA as a clone resource to monitor the network bonds on different nodes, with the same bond name.
+This is not related to the IP address or the network on which a bond is configured.
+You may use this RA to move resources away from a node, which has a faulty bond or prevent moving resources to such a node.
+This gives you independent control of the resources, without involving cluster intercommunication. But it requires your nodes to have more than one network bond.
+
+The resource configuration requires a monitor operation, because the monitor does the main part of the work.
+In addition to the resource configuration, you need to configure some location constraints, based on a CIB attribute value.
+The name of the attribute value is configured in the 'name' option of this RA.
+
+Example constraint configuration using crmsh
+location loc_connected_node my_resource_grp \
+ rule $id="rule_loc_connected_node" -INF: ovsmonitor-bond-public eq 0
+
+Example constraint configuration using pcs. Only allow 'my_resource' to run on nodes where eth0 ethernet device is available.
+pcs constraint location my_resource rule score=-INFINITY ovsmonitor-bond-public ne 1
+
+The ethmonitor works in 3 different modes to test the bond vitality.
+1. call ovs-appctl to see if at least one of the bonding's link status is up (if link is down -> error)
+2. call ovs-ofctl and watch the RX counter (if packages come around in a certain time -> success)
+3. return error
+</longdesc>
+<shortdesc lang="en">Monitors ovs bonding bonds</shortdesc>
+
+<parameters>
+<parameter name="bond" unique="1" required="1">
+<longdesc lang="en">
+The name of the network bond which should be monitored (e.g. bond-public).
+</longdesc>
+<shortdesc lang="en">Bond bond name</shortdesc>
+<content type="string" default="${OCF_RESKEY_bond_default}"/>
+</parameter>
+
+<parameter name="bridge" unique="1" required="1">
+<longdesc lang="en">
+The name of the ovs bridge that contains the bridge.
+</longdesc>
+<shortdesc lang="en">ovs bridge</shortdesc>
+<content type="string" default="${OCF_RESKEY_bridge_default}"/>
+</parameter>
+
+<parameter name="name" unique="1">
+<longdesc lang="en">
+The name of the CIB attribute to set. This is the name to be used in the constraints. Defaults to "ovsmonitor-'bond_name'".
+</longdesc>
+<shortdesc lang="en">Attribute name</shortdesc>
+<content type="string" default="${OCF_RESKEY_name_default}"/>
+</parameter>
+
+<parameter name="multiplier" unique="0" >
+<longdesc lang="en">
+Multiplier for the value of the CIB attriobute specified in parameter name.
+</longdesc>
+<shortdesc lang="en">Multiplier for result variable</shortdesc>
+<content type="integer" default="${OCF_RESKEY_multiplier_default}"/>
+</parameter>
+
+<parameter name="repeat_count">
+<longdesc lang="en">
+Specify how often the bond will be monitored, before the status is set to failed. You need to set the timeout of the monitoring operation to at least repeat_count * repeat_interval
+</longdesc>
+<shortdesc lang="en">Monitor repeat count</shortdesc>
+<content type="integer" default="${OCF_RESKEY_repeat_count_default}"/>
+</parameter>
+
+<parameter name="repeat_interval">
+<longdesc lang="en">
+Specify how long to wait in seconds between the repeat_counts.
+</longdesc>
+<shortdesc lang="en">Monitor repeat interval in seconds</shortdesc>
+<content type="integer" default="${OCF_RESKEY_repeat_interval_default}"/>
+</parameter>
+
+<parameter name="pktcnt_timeout">
+<longdesc lang="en">
+Timeout for the RX packet counter. Stop listening for packet counter changes after the given number of seconds.
+</longdesc>
+<shortdesc lang="en">packet counter timeout</shortdesc>
+<content type="integer" default="${OCF_RESKEY_pktcnt_timeout_default}"/>
+</parameter>
+
+<parameter name="link_status_only">
+<longdesc lang="en">
+Only report success based on link status. Do not perform RX counter related connectivity tests.
+</longdesc>
+<shortdesc lang="en">link status check only</shortdesc>
+<content type="boolean" default="${OCF_RESKEY_link_status_only_default}" />
+</parameter>
+
+</parameters>
+<actions>
+<action name="start" timeout="60s" />
+<action name="stop" timeout="20s" />
+<action name="status" depth="0" timeout="60s" interval="10s" />
+<action name="monitor" depth="0" timeout="60s" interval="10s" />
+<action name="meta-data" timeout="5s" />
+<action name="validate-all" timeout="20s" />
+</actions>
+</resource-agent>
+END
+
+ exit $OCF_SUCCESS
+}
+
+#
+# Return true, if the bond exists
+#
+is_bond() {
+ #
+ # List bonds but exclude FreeS/WAN ipsecN virtual bonds
+ #
+ ovs-appctl bond/show $OCF_RESKEY_bond 1>/dev/null 2>&1
+}
+
+#
+# Return true, if the bridge exists
+#
+is_bridge() {
+ #
+ # List bonds but exclude FreeS/WAN ipsecN virtual bonds
+ #
+ #ovs-appctl bond/show $OCF_RESKEY_bond 1>/dev/null 2>&1
+ ovs-vsctl show|grep Bridge|grep -q $OCF_RESKEY_bridge
+}
+
+
+if_init() {
+ local rc
+
+ if [ X"$OCF_RESKEY_bond" = "X" ]; then
+ ocf_exit_reason "Bond name (the bond parameter) is mandatory"
+ exit $OCF_ERR_CONFIGURED
+ fi
+
+ if [ X"$OCF_RESKEY_bridge" = "X" ]; then
+ ocf_exit_reason "Bridge name (the bridge parameter) is mandatory"
+ exit $OCF_ERR_CONFIGURED
+ fi
+
+ BOND="$OCF_RESKEY_bond"
+ BRIDGE="$OCF_RESKEY_bridge"
+
+ if is_bond
+ then
+ if ! is_bridge
+ then
+ ocf_exit_reason "Bridge $OCF_RESKEY_bond does not exist"
+ exit $OCF_ERR_CONFIGURED;
+ fi
+ else
+ ocf_exit_reason "Bond $OCF_RESKEY_bond does not exist"
+ exit $OCF_ERR_CONFIGURED;
+ fi
+
+ if ! ocf_is_decimal "$OCF_RESKEY_multiplier"; then
+ ocf_exit_reason "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]"
+ exit $OCF_ERR_CONFIGURED
+ fi
+
+ ATTRNAME=${OCF_RESKEY_name:-"ovsmonitor-$BOND"}
+
+ REP_COUNT=${OCF_RESKEY_repeat_count}
+ if ! ocf_is_decimal "$REP_COUNT" -o [ $REP_COUNT -lt 1 ]; then
+ ocf_exit_reason "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]"
+ exit $OCF_ERR_CONFIGURED
+ fi
+ REP_INTERVAL_S=${OCF_RESKEY_repeat_interval}
+ if ! ocf_is_decimal "$REP_INTERVAL_S"; then
+ ocf_exit_reason "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]"
+ exit $OCF_ERR_CONFIGURED
+ fi
+ if ! ocf_is_decimal "$OCF_RESKEY_pktcnt_timeout"; then
+ ocf_exit_reason "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]"
+ exit $OCF_ERR_CONFIGURED
+ fi
+ return $OCF_SUCCESS
+}
+
+# get the link status on $BOND
+# asks ip about running (up) bonds, returns the number of matching bond names that are up
+get_link_status () {
+ #$IP2UTIL -o link show up dev "$BOND" | grep -v 'NO-CARRIER' | grep -c "$BOND"
+ ovs-appctl bond/show "$BOND"|awk -F: '/^slave/ {print $2}'|grep -c enabled
+}
+
+# returns the number of received rx packets on $BOND
+get_rx_packets () {
+ ocf_log debug "bond $BOND - bridge $BRIDGE"
+ #$IP2UTIL -o -s link show dev "$BOND" \
+ # | sed 's/.* RX: [^0-9]*[0-9]* *\([0-9]*\) .*/\1/'
+ local ovs_port
+
+ for ovs_port in $(ovs-appctl bond/show $BOND|awk '/^slave/ {gsub(":","");print $2}') ; do
+ ovs-ofctl dump-ports $BRIDGE $ovs_port
+ done \
+ | awk -F, 'BEGIN{total=0} /rx/ {gsub(".*pkts=","");total=total+int($1)} END{print total}'
+}
+
+# watch for packet counter changes for max. OCF_RESKEY_pktcnt_timeout seconds
+# returns immedeately with return code 0 if any packets were received
+# otherwise 1 is returned
+watch_pkt_counter () {
+ local RX_PACKETS_NEW
+ local RX_PACKETS_OLD
+ RX_PACKETS_OLD="`get_rx_packets`"
+ for n in `seq $(( $OCF_RESKEY_pktcnt_timeout * 10 ))`; do
+ sleep 0.1
+ RX_PACKETS_NEW="`get_rx_packets`"
+ ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW"
+ if [ "$RX_PACKETS_OLD" -ne "$RX_PACKETS_NEW" ]; then
+ ocf_log debug "we received some packets."
+ return 0
+ fi
+ done
+ return 1
+}
+
+#
+# Check the bond depending on the level given as parameter: $OCF_RESKEY_check_level
+#
+# 10: watch for packet counter changes
+#
+#
+# 30: watch for packet counter changes in promiscios mode
+#
+# If unsuccessfull in levels 18 and above,
+# the tests for higher check levels are run.
+#
+if_check () {
+ # always check link status first
+ link_status="`get_link_status`"
+ ocf_log debug "link_status: $link_status (up > 0, down = 0)"
+
+ if [ $link_status -eq 0 ]; then
+ ocf_log notice "link_status: DOWN"
+ return $OCF_NOT_RUNNING
+ fi
+
+ # if using link_status_only, skip RX count related test
+ if ocf_is_true "$OCF_RESKEY_link_status_only"; then
+ return $OCF_SUCCESS
+ fi
+
+ # watch for packet counter changes
+ ocf_log debug "watch for packet counter changes"
+ watch_pkt_counter
+ if [ $? -eq 0 ]; then
+ return $OCF_SUCCESS
+ else
+ ocf_log debug "No packets received during packet watch timeout"
+ fi
+
+ # watch for packet counter changes in promiscios mode
+# ocf_log debug "watch for packet counter changes in promiscios mode"
+ # be sure switch off promiscios mode in any case
+ # TODO: check first, wether promisc is already on and leave it untouched.
+# trap "$IP2UTIL link set dev $BOND promisc off; exit" INT TERM EXIT
+# $IP2UTIL link set dev $BOND promisc on
+# watch_pkt_counter && return $OCF_SUCCESS
+# $IP2UTIL link set dev $BOND promisc off
+# trap - INT TERM EXIT
+
+ # looks like it's not working (for whatever reason)
+ return $OCF_NOT_RUNNING
+}
+
+#######################################################################
+
+if_usage() {
+ cat <<END
+usage: $0 {start|stop|status|monitor|validate-all|meta-data}
+
+Expects to have a fully populated OCF RA-compliant environment set.
+END
+}
+
+set_cib_value() {
+ local score=`expr $1 \* $OCF_RESKEY_multiplier`
+ attrd_updater -n $ATTRNAME -v $score
+ local rc=$?
+ case $rc in
+ 0) ocf_log debug "attrd_updater: Updated $ATTRNAME = $score" ;;
+ *) ocf_log warn "attrd_updater: Could not update $ATTRNAME = $score: rc=$rc";;
+ esac
+ return $rc
+}
+
+if_monitor() {
+ ha_pseudo_resource $OCF_RESOURCE_INSTANCE monitor
+ local pseudo_status=$?
+ if [ $pseudo_status -ne $OCF_SUCCESS ]; then
+ exit $pseudo_status
+ fi
+
+ local mon_rc=$OCF_NOT_RUNNING
+ local attr_rc=$OCF_NOT_RUNNING
+ local runs=0
+ local start_time
+ local end_time
+ local sleep_time
+ while [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]
+ do
+ start_time=`date +%s%N`
+ if_check
+ mon_rc=$?
+ REP_COUNT=$(( $REP_COUNT - 1 ))
+ if [ $mon_rc -ne $OCF_SUCCESS -a $REP_COUNT -gt 0 ]; then
+ ocf_log warn "Monitoring of $OCF_RESOURCE_INSTANCE failed, $REP_COUNT retries left."
+ end_time=`date +%s%N`
+ sleep_time=`echo "scale=9; ( $start_time + ( $REP_INTERVAL_S * 1000000000 ) - $end_time ) / 1000000000" | bc -q 2> /dev/null`
+ sleep $sleep_time 2> /dev/null
+ runs=$(($runs + 1))
+ fi
+
+ if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then
+ ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error"
+ fi
+ done
+
+ ocf_log debug "Monitoring return code: $mon_rc"
+ if [ $mon_rc -eq $OCF_SUCCESS ]; then
+ set_cib_value 1
+ attr_rc=$?
+ else
+ ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed."
+ set_cib_value 0
+ attr_rc=$?
+ fi
+
+ ## The resource should not fail, if the bond is down. It should fail, if the update of the CIB variable has errors.
+ ## To react on the bond failure you must use constraints based on the CIB variable value, not on the resource itself.
+ exit $attr_rc
+}
+
+if_stop()
+{
+ attrd_updater -D -n $ATTRNAME
+ ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop
+}
+
+if_start()
+{
+ local rc
+ ha_pseudo_resource $OCF_RESOURCE_INSTANCE start
+ rc=$?
+ if [ $rc -ne $OCF_SUCCESS ]; then
+ ocf_exit_reason "Failure to create ovsmonitor state file"
+ return $rc
+ fi
+
+ # perform the first monitor during the start operation
+ if_monitor
+ return $?
+}
+
+
+if_validate() {
+ check_binary ovs-vsctl
+ check_binary ovs-appctl
+ check_binary ovs-ofctl
+ check_binary bc
+ if_init
+}
+
+case $__OCF_ACTION in
+meta-data) meta_data
+ ;;
+usage|help) if_usage
+ exit $OCF_SUCCESS
+ ;;
+esac
+
+if_validate
+
+case $__OCF_ACTION in
+start) if_start
+ exit $?
+ ;;
+stop) if_stop
+ exit $?
+ ;;
+monitor|status) if_monitor
+ exit $?
+ ;;
+validate-all) exit $?
+ ;;
+*) if_usage
+ exit $OCF_ERR_UNIMPLEMENTED
+ ;;
+esac