#!/bin/sh # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # See usage() function below for more details ... # # Note that the script uses an external file to setup RabbitMQ policies # so make sure to create it from an example shipped with the package. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### # Fill in some defaults if no values are specified PATH=/sbin:/usr/sbin:/bin:/usr/bin OCF_RESKEY_binary_default="/usr/sbin/rabbitmq-server" OCF_RESKEY_ctl_default="/usr/sbin/rabbitmqctl" OCF_RESKEY_debug_default=false OCF_RESKEY_username_default="rabbitmq" OCF_RESKEY_groupname_default="rabbitmq" OCF_RESKEY_admin_user_default="guest" OCF_RESKEY_admin_password_default="guest" OCF_RESKEY_definitions_dump_file_default="/etc/rabbitmq/definitions" OCF_RESKEY_pid_file_default="/var/run/rabbitmq/pid" OCF_RESKEY_log_dir_default="/var/log/rabbitmq" OCF_RESKEY_mnesia_base_default="/var/lib/rabbitmq/mnesia" OCF_RESKEY_mnesia_schema_base_default="/var/lib/rabbitmq" OCF_RESKEY_host_ip_default="127.0.0.1" OCF_RESKEY_node_port_default=5672 OCF_RESKEY_default_vhost_default="/" OCF_RESKEY_erlang_cookie_default=false OCF_RESKEY_erlang_cookie_file_default="/var/lib/rabbitmq/.erlang.cookie" OCF_RESKEY_use_fqdn_default=false OCF_RESKEY_fqdn_prefix_default="" OCF_RESKEY_max_rabbitmqctl_timeouts_default=3 
OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy"
OCF_RESKEY_rmq_feature_health_check_default=true
OCF_RESKEY_rmq_feature_local_list_queues_default=true
OCF_RESKEY_limit_nofile_default=65535
OCF_RESKEY_avoid_using_iptables_default=false
OCF_RESKEY_allowed_cluster_nodes_default=""

: ${HA_LOGTAG="lrmd"}
: ${HA_LOGFACILITY="daemon"}

# Apply the defaults above to any parameter the cluster manager did not set.
# NOTE(review): there is no `: ${OCF_RESKEY_host_ip=...}` line here although
# OCF_RESKEY_host_ip_default is declared above — confirm against upstream.
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
: ${OCF_RESKEY_ctl=${OCF_RESKEY_ctl_default}}
: ${OCF_RESKEY_debug=${OCF_RESKEY_debug_default}}
: ${OCF_RESKEY_username=${OCF_RESKEY_username_default}}
: ${OCF_RESKEY_groupname=${OCF_RESKEY_groupname_default}}
: ${OCF_RESKEY_admin_user=${OCF_RESKEY_admin_user_default}}
: ${OCF_RESKEY_admin_password=${OCF_RESKEY_admin_password_default}}
: ${OCF_RESKEY_definitions_dump_file=${OCF_RESKEY_definitions_dump_file_default}}
: ${OCF_RESKEY_log_dir=${OCF_RESKEY_log_dir_default}}
: ${OCF_RESKEY_mnesia_base=${OCF_RESKEY_mnesia_base_default}}
: ${OCF_RESKEY_mnesia_schema_base=${OCF_RESKEY_mnesia_schema_base_default}}
: ${OCF_RESKEY_pid_file=${OCF_RESKEY_pid_file_default}}
: ${OCF_RESKEY_node_port=${OCF_RESKEY_node_port_default}}
: ${OCF_RESKEY_default_vhost=${OCF_RESKEY_default_vhost_default}}
: ${OCF_RESKEY_erlang_cookie=${OCF_RESKEY_erlang_cookie_default}}
: ${OCF_RESKEY_erlang_cookie_file=${OCF_RESKEY_erlang_cookie_file_default}}
: ${OCF_RESKEY_use_fqdn=${OCF_RESKEY_use_fqdn_default}}
: ${OCF_RESKEY_fqdn_prefix=${OCF_RESKEY_fqdn_prefix_default}}
: ${OCF_RESKEY_max_rabbitmqctl_timeouts=${OCF_RESKEY_max_rabbitmqctl_timeouts_default}}
: ${OCF_RESKEY_policy_file=${OCF_RESKEY_policy_file_default}}
: ${OCF_RESKEY_rmq_feature_health_check=${OCF_RESKEY_rmq_feature_health_check_default}}
: ${OCF_RESKEY_rmq_feature_local_list_queues=${OCF_RESKEY_rmq_feature_local_list_queues_default}}
: ${OCF_RESKEY_limit_nofile=${OCF_RESKEY_limit_nofile_default}}
: ${OCF_RESKEY_avoid_using_iptables=${OCF_RESKEY_avoid_using_iptables_default}}
: ${OCF_RESKEY_allowed_cluster_nodes=${OCF_RESKEY_allowed_cluster_nodes_default}}

#######################################################################

# Derive per-operation timeouts from the cluster-supplied meta timeout
# (milliseconds). start/stop_time end up in seconds.
OCF_RESKEY_CRM_meta_timeout_default=30000
: ${OCF_RESKEY_CRM_meta_timeout=${OCF_RESKEY_CRM_meta_timeout_default}}

OCF_RESKEY_start_time_default=$((OCF_RESKEY_CRM_meta_timeout / 6000 + 2))
: ${OCF_RESKEY_start_time=${OCF_RESKEY_start_time_default}}
OCF_RESKEY_stop_time_default=${OCF_RESKEY_start_time_default}
# NOTE(review): stop_time falls back to the *start* time default — presumably
# intentional (same budget for both), but confirm.
: ${OCF_RESKEY_stop_time=${OCF_RESKEY_start_time_default}}
OCF_RESKEY_command_timeout_default=""
: ${OCF_RESKEY_command_timeout=${OCF_RESKEY_command_timeout_default}}
TIMEOUT_ARG=$((OCF_RESKEY_CRM_meta_timeout / 6000 + 30))
# Wrapper prefix used to bound every rabbitmqctl invocation.
COMMAND_TIMEOUT="/usr/bin/timeout ${OCF_RESKEY_command_timeout} ${TIMEOUT_ARG}"

# Strip the clone/master suffix (":<n>") from the instance name.
RESOURCE_NAME=`echo $OCF_RESOURCE_INSTANCE | cut -d ":" -f 1`

#######################################################################

# NOTE(review): the body of usage() below is a heredoc whose OCF meta-data
# XML markup appears to have been lost in extraction (all tags stripped;
# `cat <` is most likely `cat <<END`). The text is preserved verbatim —
# restore the full <resource-agent> XML from the upstream resource-agents
# package before shipping. Do NOT treat the prose below as executable code.
usage() {
cat < 1.0
Resource agent for RabbitMQ promotes a node, then cluster nodes can join it
Resource agent for RabbitMQ HA cluster
RabbitMQ binary RabbitMQ binary
rabbitctl binary rabbitctl binary binary
RabbitMQ PID file RabbitMQ PID file
RabbitMQ log directory RabbitMQ log directory
RabbitMQ user name RabbitMQ user name
RabbitMQ group name RabbitMQ group name
RabbitMQ default admin user for API RabbitMQ admin user
RabbitMQ default admin user password for API RabbitMQ admin password
RabbitMQ default definitions dump file RabbitMQ definitions dump file
Timeout command arguments for issued commands termination (value is auto evaluated)
Arguments for timeout wrapping command
Timeout for start rabbitmq server Timeout for start rabbitmq server
Timeout for stopping rabbitmq server Timeout for stopping rabbitmq server
The debug flag for agent (${OCF_RESKEY_binary}) instance.
In the /tmp/ directory will be created rmq-* files for log
some operations and ENV values inside OCF-script.
AMQP server (${OCF_RESKEY_binary}) debug flag
Base directory for storing Mnesia files Base directory for storing Mnesia files
Parent directory for Mnesia schema directory Parent directory for Mnesia schema directory
${OCF_RESKEY_binary} should listen on this IP address
${OCF_RESKEY_binary} should listen on this IP address
${OCF_RESKEY_binary} should listen on this port
${OCF_RESKEY_binary} should listen on this port
Default virtual host used for monitoring if a node is fully synchronized with
the rest of the cluster. In normal operation, the resource agent will wait for
queues from this virtual host on this node to be synchronized elsewhere before
stopping RabbitMQ. This also means queues in other virtual hosts may not be
fully synchronized on stop operations.
Default virtual host used for waiting for synchronization
Erlang cookie for clustering. If specified, will be updated at the mnesia reset
Erlang cookie
Erlang cookie file path where the cookie will be put, if requested
Erlang cookie file
Either to use FQDN or a shortname for the rabbitmq node
Use FQDN
Optional FQDN prefix for RabbitMQ nodes in cluster.
FQDN prefix can be specified to host multiple RabbitMQ instances on a node or
in case of RabbitMQ running in dedicated network/interface.
FQDN prefix
If during monitor call rabbitmqctl times out, the timeout is ignored unless it is Nth
timeout in a row. Here N is the value of the current parameter. If too many timeouts
happen in a raw, the monitor call will return with error.
Fail only if that many rabbitmqctl timeouts in a row occurred
A path to the shell script to setup RabbitMQ policies
A policy file path
Since rabbit 3.6.4 list_queues/list_channels-based monitoring should
be replaced with "node_health_check" command, as it creates no network
load at all.
Use node_health_check for monitoring
For rabbit version that implements --local flag for list_queues, this
can greatly reduce network overhead in cases when node is
stopped/demoted.
Use --local option for list_queues
Soft and hard limit for NOFILE
NOFILE limit
When set to true the iptables calls to block client access become
noops. This is useful when we run inside containers.
Disable iptables use entirely
When set to anything other than the empty string it must container the list
of cluster node names, separated by spaces, where the rabbitmq resource is
allowed to run. Tis is needed when rabbitmq is running on a subset of nodes
part of a larger cluster. The default ("") is to assume that all nodes part
of the cluster will run the rabbitmq resource.
List of cluster nodes where rabbitmq is allowed to run
$EXTENDED_OCF_PARAMS
END
}

# Promotion score bounds used when reporting master eligibility to Pacemaker.
MIN_MASTER_SCORE=100
BEST_MASTER_SCORE=1000


#######################################################################
# Functions invoked by resource manager actions

#TODO(bogdando) move proc_kill, proc_stop to shared OCF functions
# to be shipped with HA cluster packages
###########################################################
# Attempts to kill a process with retries and checks procfs
# to make sure the process is stopped.
#
# Globals:
#   LL
# Arguments:
#   $1 - pid of the process to try and kill
#   $2 - service name used for logging and match-based kill, if the pid is "none"
#   $3 - signal to use, defaults to SIGTERM
#   $4 - number of retries, defaults to 5
#   $5 - time to sleep between retries, defaults to 2
# Returns:
#   0 - if successful
#   1 - if process is still running according to procfs
#   2 - if invalid parameters passed in
###########################################################
proc_kill()
{
    local pid="${1}"
    local service_name="${2}"
    local signal="${3:-SIGTERM}"
    local count="${4:-5}"
    local process_sleep="${5:-2}"
    local LH="${LL} proc_kill():"
    # Process group of the target pid; used for group-wide pkill below.
    local pgrp="$(ps -o pgid= ${pid} 2>/dev/null | tr -d '[[:space:]]')"

    # Refuse to signal process group 1 — that would hit init.
    if [ "${pid}" ] && [ "${pgrp}" = "1" ] ; then
        ocf_log err "${LH} shall not kill by the bad pid 1 (init)!"
        return 2
    fi

    if [ "${pid}" = "none" ]; then
        # No pid known: fall back to name-pattern matching via pgrep/pkill.
        local matched
        matched="$(pgrep -fla ${service_name})"
        if [ -z "${matched}" ] ; then
            ocf_log info "${LH} cannot find any processes matching the ${service_name}, considering target process to be already dead"
            return 0
        fi
        ocf_log debug "${LH} no pid provided, will try the ${service_name}, matched list: ${matched}"
        while [ $count -gt 0 ]; do
            if [ -z "${matched}" ]; then
                break
            else
                matched="$(pgrep -fla ${service_name})"
                ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
                ocf_run pkill -f -"${signal}" "${service_name}"
            fi
            sleep $process_sleep
            count=$(( count-1 ))
        done
        # Final verdict by pattern match.
        pgrep -f "${service_name}" > /dev/null
        if [ $? -ne 0 ] ; then
            ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
            return 0
        else
            ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
            return 1
        fi
    else
        # pid is not none: signal the whole process group, poll procfs.
        while [ $count -gt 0 ]; do
            if [ ! -d "/proc/${pid}" ]; then
                break
            else
                ocf_log debug "${LH} Stopping ${service_name} with ${signal}..."
                ocf_run pkill -"${signal}" -g "${pgrp}"
            fi
            sleep $process_sleep
            count=$(( count-1 ))
        done

        # Check if the process ended after the last sleep
        if [ ! -d "/proc/${pid}" ] ; then
            ocf_log debug "${LH} Stopped ${service_name} with ${signal}"
            return 0
        fi

        ocf_log warn "${LH} Failed to stop ${service_name} with ${signal}"
        return 1
    fi
}

###########################################################
# Attempts to kill a process with the given pid or pid file
# using proc_kill and will retry with sigkill if sigterm is
# unsuccessful.
#
# Globals:
#   OCF_ERR_GENERIC
#   OCF_SUCCESS
#   LL
# Arguments:
#   $1 - pidfile or pid or 'none', if stopping by the name matching
#   $2 - service name used for logging or for the failback stopping method
#   $3 - stop process timeout (in sec), used to determine how many times we try
#        SIGTERM and an upper limit on how long this function should try and
#        stop the process. Defaults to 15.
# Returns:
#   OCF_SUCCESS - if successful
#   OCF_ERR_GENERIC - if process is still running according to procfs
###########################################################
proc_stop()
{
    local target="${1}"
    local svc="${2}"
    local stop_timeout="${3:-15}"
    local LH="${LL} proc_stop():"
    local p
    local pids
    local pid_path

    # Work out what we were given: the literal 'none', a bare pid,
    # a pidfile path, or something unusable (fall back to name match).
    if [ "${target}" = "none" ] ; then
        pids="none"
    elif echo "${target}" | grep -Eq '^[0-9]+$' ; then
        pids="${target}"
    elif [ -e "${target}" ] ; then
        pid_path="${target}"
        # A pidfile may hold several space-separated pids; one per line, deduped.
        pids=$(cat "${pid_path}" 2>/dev/null | tr -s " " "\n" | sort -u)
    else
        ocf_log warn "${LH} pid param ${target} is not a file or a number, try match by ${svc}"
        pids="none"
    fi

    # Budget (timeout - 5s) / 2s worth of SIGTERM attempts, at least one.
    local term_tries
    term_tries=$(( (stop_timeout - 5) / 2 ))
    if [ ${term_tries} -le 0 ] ; then
        term_tries=1
    fi

    # An empty/unreadable pidfile degrades to name-based matching too.
    if [ -z "${pids}" ] ; then
        ocf_log warn "${LH} unable to get PID from ${pid_path}, try match by ${svc}"
        pids="none"
    fi

    for p in ${pids} ; do
        [ "${p}" ] || break
        ocf_log info "${LH} Stopping ${svc} by PID ${p}"
        if ! proc_kill "${p}" "${svc}" SIGTERM ${term_tries} ; then
            # SIGTERM budget exhausted: escalate with a single SIGKILL pass.
            if ! proc_kill "${p}" "${svc}" SIGKILL 1 2 ; then
                ocf_log err "${LH} ERROR: could not stop ${svc}"
                return "${OCF_ERR_GENERIC}"
            fi
        fi
    done

    # Remove the pid file here which will remove empty pid files as well
    [ -z "${pid_path}" ] || rm -f "${pid_path}"

    ocf_log info "${LH} Stopped ${svc}"
    return "${OCF_SUCCESS}"
}

# Invokes the given command as a rabbitmq user and wrapped in the
# timeout command.
# Invoke the given command as the RabbitMQ service user, wrapped in
# the timeout(1) command.
# Arguments:
#   -t <sec> (optional) - override the default COMMAND_TIMEOUT wrapper
#   $1 - command string to run (defaults to "status")
# Returns: exit code of the invoked command.
su_rabbit_cmd() {
    local timeout
    if [ "$1" = "-t" ]; then
        timeout="/usr/bin/timeout ${OCF_RESKEY_command_timeout} $2"
        shift 2
    else
        timeout=$COMMAND_TIMEOUT
    fi
    local cmd="${1:-status}"
    local LH="${LL} su_rabbit_cmd():"
    local rc=1
    local user=$OCF_RESKEY_username
    local mail=/var/spool/mail/rabbitmq
    local pwd=/var/lib/rabbitmq
    local home=/var/lib/rabbitmq

    ocf_log debug "${LH} invoking a command: ${cmd}"
    # Quote the user name to survive word-splitting (SC2086).
    su "$user" -s /bin/sh -c "USER=${user} MAIL=${mail} PWD=${pwd} HOME=${home} LOGNAME=${user} \
${timeout} ${cmd}"
    rc=$?
    ocf_log info "${LH} the invoked command exited ${rc}: ${cmd}"
    return $rc
}

# Current time as seconds since the epoch (UTC).
now() {
    date -u +%s
}

# Raise the NOFILE soft/hard limit for the service user, if the configured
# limit is higher than the user's current one.
set_limits() {
    local current_limit
    # Split declaration from assignment so a failing 'su' is not masked.
    current_limit=$(su "$OCF_RESKEY_username" -s /bin/sh -c "ulimit -n")

    # Quote both operands: the original unquoted test breaks with empty
    # values and is flagged by ShellCheck (SC2086/SC2070).
    if [ -n "$OCF_RESKEY_limit_nofile" ] && [ "$OCF_RESKEY_limit_nofile" -gt "$current_limit" ] ; then
        ulimit -n "$OCF_RESKEY_limit_nofile"
    fi
}

# Publish the given promotion score (default 0) for this node via crm_master.
# Returns OCF_SUCCESS, or OCF_ERR_GENERIC if crm_master fails.
master_score() {
    local LH="${LL} master_score():"
    local score=$1

    # Quoted test: unquoted [ -z $score ] misbehaves on odd values.
    if [ -z "$score" ] ; then
        score=0
    fi

    ocf_log info "${LH} Updating master score attribute with ${score}"
    ocf_run crm_master -N "$THIS_PCMK_NODE" -l reboot -v "$score" || return $OCF_ERR_GENERIC
    return $OCF_SUCCESS
}

# Return either FQDN or shortname, depends on the OCF_RESKEY_use_fqdn.
get_hostname() {
    local os=$(uname -s)

    if ! ocf_is_true "${OCF_RESKEY_use_fqdn}"; then
        if [ "$os" = "SunOS" ]; then
            # SunOS hostname has no -s flag; strip the domain manually.
            echo "$(hostname | sed 's@\..*@@')"
        else
            echo "$(hostname -s)"
        fi
    else
        if [ "$os" = "SunOS" ]; then
            echo "$(hostname)"
        else
            echo "$(hostname -f)"
        fi
    fi
}

# Strip the FQDN to the shortname, if OCF_RESKEY_use_fqdn was set;
# Prepend prefix to the hostname
process_fqdn() {
    if ! ocf_is_true "${OCF_RESKEY_use_fqdn}"; then
        echo "${OCF_RESKEY_fqdn_prefix}$1" | awk -F. '{print $1}'
    else
        echo "${OCF_RESKEY_fqdn_prefix}$1"
    fi
}

# Return OCF_SUCCESS, if current host is in the list of given hosts.
# Otherwise, return 10
my_host() {
    local hostlist="$1"
    local hostname
    local hn
    local rc=10
    local LH="${LL} my_host():"

    hostname=$(process_fqdn $(get_hostname))
    ocf_log debug "${LH} hostlist is: $hostlist"
    for host in $hostlist ; do
        hn=$(process_fqdn "${host}")
        ocf_log debug "${LH} comparing '$hostname' with '$hn'"
        if [ "${hostname}" = "${hn}" ] ; then
            rc=$OCF_SUCCESS
            break
        fi
    done

    return $rc
}

# Fetch a reboot-lifetime integer node attribute from the CIB.
# $1 - node name, $2 - attribute name. Prints 0 when unset/unavailable.
get_integer_node_attr() {
    local value
    value=$(crm_attribute -N $1 -l reboot --name "$2" --query 2>/dev/null | awk '{ split($3, vals, "="); if (vals[2] != "(null)") print vals[2] }')
    if [ $? -ne 0 ] || [ -z "$value" ] ; then
        value=0
    fi
    echo $value
}

# Rabbit app start timestamp recorded for node $1 (0 if never started).
get_node_start_time() {
    get_integer_node_attr $1 'rabbit-start-time'
}

# Current promotion score recorded for node $1.
get_node_master_score() {
    get_integer_node_attr $1 "master-${RESOURCE_NAME}"
}

# Return either rabbit node name as FQDN or shortname, depends on the OCF_RESKEY_use_fqdn.
rabbit_node_name() {
    echo "rabbit@$(process_fqdn $(ocf_attribute_target $1))"
}

# Export the RABBITMQ_* environment and derive the per-node paths/globals
# (MNESIA_FILES, RMQ_START_TIME, MASTER_FLAG_FILE, THIS_PCMK_NODE, LL);
# also ensures the pid/mnesia/log directories exist and are writable by
# the service user, then syncs the Erlang cookie.
rmq_setup_env() {
    local H
    local dir
    local name
    H="$(get_hostname)"
    export RABBITMQ_NODENAME=$(rabbit_node_name $H)
    # Only override the port when it differs from the packaged default.
    if [ "$OCF_RESKEY_node_port" != "$OCF_RESKEY_node_port_default" ]; then
        export RABBITMQ_NODE_PORT=$OCF_RESKEY_node_port
    fi
    export RABBITMQ_PID_FILE=$OCF_RESKEY_pid_file
    MNESIA_FILES="${OCF_RESKEY_mnesia_base}/$(rabbit_node_name $H)"
    # -sname for short node names, -name for FQDN-based node names.
    if ! ocf_is_true "${OCF_RESKEY_use_fqdn}"; then
        name="-sname"
    else
        name="-name"
    fi
    export RABBITMQ_SERVER_START_ARGS="${RABBITMQ_SERVER_START_ARGS} -mnesia dir \"${MNESIA_FILES}\" ${name} $(rabbit_node_name $H)"
    RMQ_START_TIME="${MNESIA_FILES}/ocf_server_start_time.txt"
    MASTER_FLAG_FILE="${MNESIA_FILES}/ocf_master_for_${OCF_RESOURCE_INSTANCE}"
    THIS_PCMK_NODE=$(ocf_attribute_target)
    # NOTE(review): TOTALVMEM is computed here but not used in this excerpt.
    TOTALVMEM=`free -mt | awk '/Total:/ {print $2}'`

    # check and make PID file dir
    local PID_DIR=$( dirname $OCF_RESKEY_pid_file )
    if [ ! -d ${PID_DIR} ] ; then
        mkdir -p ${PID_DIR}
        chown -R ${OCF_RESKEY_username}:${OCF_RESKEY_groupname} ${PID_DIR}
        chmod 755 ${PID_DIR}
    fi

    # Regardless of whether we just created the directory or it
    # already existed, check whether it is writable by the configured
    # user
    for dir in ${PID_DIR} "${OCF_RESKEY_mnesia_base}" "${OCF_RESKEY_log_dir}"; do
        if test -e ${dir}; then
            local files
            files=$(su -s /bin/sh - $OCF_RESKEY_username -c "find ${dir} ! -writable")
            if [ "${files}" ]; then
                ocf_log warn "Directory ${dir} is not writable by ${OCF_RESKEY_username}, chowning."
                chown -R ${OCF_RESKEY_username}:${OCF_RESKEY_groupname} "${dir}"
            fi
        fi
    done

    # Log header prefix used by all other functions ($$ = agent pid).
    export LL="${OCF_RESOURCE_INSTANCE}[$$]:"
    update_cookie
}

# Return a RabbitMQ node to its virgin state.
# For reset and force_reset to succeed the RabbitMQ application must have been stopped.
# If the app cannot be stopped, beam will be killed and mnesia files will be removed.
reset_mnesia() {
    local LH="${LL} reset_mnesia():"
    local make_amnesia=false
    local rc=$OCF_ERR_GENERIC

    # check status of a beam process
    get_status
    rc=$?
    if [ $rc -eq 0 ] ; then
        # beam is running
        # check status of rabbit app and stop it, if it is running
        get_status rabbit
        rc=$?
        if [ $rc -eq 0 ] ; then
            # rabbit app is running, have to stop it
            ocf_log info "${LH} Stopping RMQ-app prior to reset the mnesia."
            stop_rmq_server_app
            rc=$?
            if [ $rc -ne 0 ] ; then
                ocf_log warn "${LH} RMQ-app can't be stopped."
                make_amnesia=true
            fi
        fi

        if ! $make_amnesia ; then
            # rabbit app is not running, reset mnesia
            ocf_log info "${LH} Execute reset with timeout: ${TIMEOUT_ARG}"
            su_rabbit_cmd "${OCF_RESKEY_ctl} reset"
            rc=$?
            if [ $rc -ne 0 ] ; then
                # plain reset failed, escalate to force_reset
                ocf_log info "${LH} Execute force_reset with timeout: ${TIMEOUT_ARG}"
                su_rabbit_cmd "${OCF_RESKEY_ctl} force_reset"
                rc=$?
                if [ $rc -ne 0 ] ; then
                    ocf_log warn "${LH} Mnesia couldn't cleaned, even by force-reset command."
                    make_amnesia=true
                fi
            fi
        fi
    else
        # there is no beam running
        make_amnesia=true
        ocf_log warn "${LH} There is no Beam process running."
    fi

    # remove mnesia files, if required
    if $make_amnesia ; then
        kill_rmq_and_remove_pid
        ocf_run rm -rf "${MNESIA_FILES}"
        mnesia_schema_location="${OCF_RESKEY_mnesia_schema_base}/Mnesia.$(rabbit_node_name $(get_hostname))"
        ocf_run rm -rf "$mnesia_schema_location"
        ocf_log warn "${LH} Mnesia files appear corrupted and have been removed from ${MNESIA_FILES} and $mnesia_schema_location"
    fi
    # always return OCF SUCCESS
    return $OCF_SUCCESS
}

# Insert a temporary iptables REJECT rule for the AMQP port so clients
# cannot reach this node; retried up to 5 times. No-op in containers.
block_client_access() {
    # When OCF_RESKEY_avoid_using_iptables is true iptables calls are noops
    if ocf_is_true "${OCF_RESKEY_avoid_using_iptables}"; then
        return $OCF_SUCCESS
    fi
    # do not add temporary RMQ blocking rule, if it is already exist
    # otherwise, try to add a blocking rule with max of 5 retries
    local tries=5
    until $(iptables -nvL --wait | grep -q 'temporary RMQ block') || [ $tries -eq 0 ]; do
        tries=$((tries-1))
        iptables --wait -I INPUT -p tcp -m tcp --dport ${OCF_RESKEY_node_port} -m state --state NEW,RELATED,ESTABLISHED \
        -m comment --comment 'temporary RMQ block' -j REJECT --reject-with tcp-reset
        sleep 1
    done
    if [ $tries -eq 0 ]; then
        return $OCF_ERR_GENERIC
    else
        return $OCF_SUCCESS
    fi
}

# Remove every temporary blocking rule added by block_client_access().
# $1 (optional) - log header prefix for the final message.
unblock_client_access() {
    local lhtext="none"
    # NOTE(review): this looks inverted — lhtext is assigned from $1 only
    # when $1 is *empty* ([ -z $1 ]); presumably [ -n "$1" ] was intended.
    # Confirm against upstream before changing.
    if [ -z $1 ] ; then
        lhtext=$1
    fi
    # When OCF_RESKEY_avoid_using_iptables is true iptables calls are noops
    if ocf_is_true "${OCF_RESKEY_avoid_using_iptables}"; then
        return
    fi
    # remove all temporary RMQ blocking rules, if there are more than one exist
    for i in $(iptables -nvL --wait --line-numbers | awk '/temporary RMQ block/ {print $1}'); do
        iptables --wait -D INPUT -p tcp -m tcp --dport ${OCF_RESKEY_node_port} -m state --state NEW,RELATED,ESTABLISHED \
        -m comment --comment 'temporary RMQ block' -j REJECT --reject-with tcp-reset
    done
    ocf_log info "${lhtext} unblocked access to RMQ port"
}

# Query mnesia for cluster membership. $1 is 'nodes' (all db nodes) or
# 'running' (currently running db nodes). Prints a space-separated list.
get_nodes__base(){
    local infotype=''
    local rc=$OCF_ERR_GENERIC
    local c_status

    if [ "$1" = 'nodes' ]
    then
        infotype='db_nodes'
    elif [ "$1" = 'running' ]
    then
        infotype='running_db_nodes'
    fi
    c_status=`${OCF_RESKEY_ctl} eval "mnesia:system_info(${infotype})." 2>/dev/null`
    rc=$?
    if [ $rc -ne 0 ] ; then
        echo ''
        return $OCF_ERR_GENERIC
    fi
    # translate line like '{running_nodes,['rabbit@node-1','rabbit@node-2','rabbit@node-3']},' to node_list
    echo $(echo "${c_status}" | awk -F, '{ for (i=1;i<=NF;i++) { if ($i ~ /@/) { gsub(/[\[\]}{]/,"",$i); print $i; } }}' | tr -d "\'")
    return $OCF_SUCCESS
}

# All nodes known to the mnesia cluster database.
get_nodes() {
    echo $(get_nodes__base nodes)
    return $?
}

# Only the currently running mnesia cluster nodes.
get_running_nodes() {
    echo $(get_nodes__base running)
    return $?
}

# Get alive cluster nodes in visible partition, but the specified one
get_alive_pacemaker_nodes_but() {
    if [ -z "$1" ]; then
        tmp_pcmk_node_list=`crm_node -l -p | sed -e '/(null)/d'`
    else
        tmp_pcmk_node_list=`crm_node -l -p | sed -e "s/${1}//g" | sed -e '/(null)/d'`
    fi
    # If OCF_RESKEY_allowed_cluster_nodes is set then we only want the intersection
    # of the cluster node output and the allowed_cluster_nodes list
    if [ -z "${OCF_RESKEY_allowed_cluster_nodes}" ]; then
        pcmk_node_list=$tmp_pcmk_node_list
    else
        # 'sort | uniq -d' keeps only names present in both lists.
        pcmk_node_list=`for i in $tmp_pcmk_node_list ${OCF_RESKEY_allowed_cluster_nodes}; do echo $i; done | sort | uniq -d`
    fi
    echo $pcmk_node_list
}

# Get current master.
If a parameter is provided, # do not check node with that name get_master_name_but() { local node for node in $(get_alive_pacemaker_nodes_but "$@") do ocf_log info "${LH} looking if $node is master" if is_master $node; then ocf_log info "${LH} master is $node" echo $node break fi done } # Evals some erlang code on current node erl_eval() { local fmt="${1:?}" shift $COMMAND_TIMEOUT ${OCF_RESKEY_ctl} eval "$(printf "$fmt" "$@")" 2>/dev/null } # Returns 0 if we are clustered with provideded node is_clustered_with() { local LH="${LH}: is_clustered_with: " local node_name local rc node_name=$(rabbit_node_name $1) local seen_as_running seen_as_running=$(erl_eval "lists:member('%s', rabbit_mnesia:cluster_nodes(running))." "$node_name") rc=$? if [ "$rc" -ne 0 ]; then ocf_log err "${LH} Failed to check whether '$node_name' is considered running by us" # We had a transient local error; that doesn't mean the remote node is # not part of the cluster, so ignore this elif [ "$seen_as_running" != true ]; then ocf_log info "${LH} Node $node_name is not running, considering it not clustered with us" return 1 fi local seen_as_partitioned seen_as_partitioned=$(erl_eval "lists:member('%s', rabbit_node_monitor:partitions())." "$node_name") rc=$? if [ "$rc" -ne 0 ]; then ocf_log err "${LH} Failed to check whether '$node_name' is partitioned with us" # We had a transient local error; that doesn't mean the remote node is # partitioned with us, so ignore this elif [ "$seen_as_partitioned" != false ]; then ocf_log info "${LH} Node $node_name is partitioned from us" return 1 fi return $? 
} check_need_join_to() { local join_to local node local running_nodes local rc=$OCF_ERR_GENERIC rc=0 join_to=$(rabbit_node_name $1) running_nodes=$(get_running_nodes) for node in $running_nodes ; do if [ "${join_to}" = "${node}" ] ; then rc=1 break fi done return $rc } # Update erlang cookie, if it has been specified update_cookie() { local cookie_file_content if [ "${OCF_RESKEY_erlang_cookie}" != 'false' ] ; then if [ -f "${OCF_RESKEY_erlang_cookie_file}" ]; then # First line of cookie file without newline cookie_file_content=$(head -n1 "${OCF_RESKEY_erlang_cookie_file}" | perl -pe chomp) fi # As there is a brief period of time when the file is empty # (shell redirection has already opened and truncated file, # and echo hasn't finished its job), we are doing this write # only when cookie has changed. if [ "${OCF_RESKEY_erlang_cookie}" != "${cookie_file_content}" ]; then echo "${OCF_RESKEY_erlang_cookie}" > "${OCF_RESKEY_erlang_cookie_file}" fi # And this are idempotent operations, so we don't have to # check any preconditions for running them. chown ${OCF_RESKEY_username}:${OCF_RESKEY_groupname} "${OCF_RESKEY_erlang_cookie_file}" chmod 600 "${OCF_RESKEY_erlang_cookie_file}" fi return $OCF_SUCCESS } # Stop rmq beam process by pid and by rabbit node name match. Returns SUCCESS/ERROR kill_rmq_and_remove_pid() { local LH="${LL} kill_rmq_and_remove_pid():" # Stop the rabbitmq-server by its pidfile, use the name matching as a fallback, # and ignore the exit code proc_stop "${OCF_RESKEY_pid_file}" "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}" # Ensure the beam.smp stopped by the rabbit node name matching as well proc_stop none "beam.*${RABBITMQ_NODENAME}" "${OCF_RESKEY_stop_time}" if [ $? 
-eq 0 ] ; then return $OCF_SUCCESS else return $OCF_ERR_GENERIC fi } trim_var(){ local string="$*" echo ${string%% } } action_validate() { # todo(sv): validate some incoming parameters OCF_RESKEY_CRM_meta_notify_post=$(trim_var $OCF_RESKEY_CRM_meta_notify_post) OCF_RESKEY_CRM_meta_notify_pre=$(trim_var $OCF_RESKEY_CRM_meta_notify_pre) OCF_RESKEY_CRM_meta_notify_start=$(trim_var $OCF_RESKEY_CRM_meta_notify_start) OCF_RESKEY_CRM_meta_notify_stop=$(trim_var $OCF_RESKEY_CRM_meta_notify_stop) OCF_RESKEY_CRM_meta_notify_start_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_start_resource) OCF_RESKEY_CRM_meta_notify_stop_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_stop_resource) OCF_RESKEY_CRM_meta_notify_active_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_active_resource) OCF_RESKEY_CRM_meta_notify_inactive_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_inactive_resource) OCF_RESKEY_CRM_meta_notify_start_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_start_uname) OCF_RESKEY_CRM_meta_notify_stop_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_stop_uname) OCF_RESKEY_CRM_meta_notify_active_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_active_uname) OCF_RESKEY_CRM_meta_notify_master_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_master_resource) OCF_RESKEY_CRM_meta_notify_master_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_master_uname) OCF_RESKEY_CRM_meta_notify_demote_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_demote_resource) OCF_RESKEY_CRM_meta_notify_demote_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_demote_uname) OCF_RESKEY_CRM_meta_notify_slave_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_slave_resource) OCF_RESKEY_CRM_meta_notify_slave_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_slave_uname) OCF_RESKEY_CRM_meta_notify_promote_resource=$(trim_var $OCF_RESKEY_CRM_meta_notify_promote_resource) OCF_RESKEY_CRM_meta_notify_promote_uname=$(trim_var $OCF_RESKEY_CRM_meta_notify_promote_uname) return $OCF_SUCCESS } update_rabbit_start_time_if_rc() { local nowtime 
local rc=$1 if [ $rc -eq 0 ]; then nowtime="$(now)" ocf_log info "${LH} Rabbit app started successfully. Updating start time attribute with ${nowtime}" ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --update "${nowtime}" fi } join_to_cluster() { local node="$1" local rmq_node local rc=$OCF_ERR_GENERIC local LH="${LL} join_to_cluster():" ocf_log info "${LH} start." rmq_node=$(rabbit_node_name $node) ocf_log info "${LH} Joining to cluster by node '${rmq_node}'." get_status rabbit rc=$? if [ $rc -eq $OCF_SUCCESS ] ; then ocf_log info "${LH} rabbitmq app will be stopped." stop_rmq_server_app rc=$? if [ $rc -ne 0 ] ; then ocf_log err "${LH} Can't stop rabbitmq app by stop_app command. Stopping." action_stop return $OCF_ERR_GENERIC fi fi ocf_log info "${LH} Execute join_cluster with timeout: ${TIMEOUT_ARG}" su_rabbit_cmd "${OCF_RESKEY_ctl} join_cluster $rmq_node" rc=$? if [ $rc -ne 0 ] ; then ocf_log err "${LH} Can't join to cluster by node '${rmq_node}'. Stopping." action_stop return $OCF_ERR_GENERIC fi sleep 2 try_to_start_rmq_app rc=$? if [ $rc -ne 0 ] ; then ocf_log err "${LH} Can't start RMQ app after join to cluster. Stopping." action_stop return $OCF_ERR_GENERIC else update_rabbit_start_time_if_rc 0 ocf_log info "${LH} Joined to cluster succesfully." fi ocf_log info "${LH} end." return $rc } unjoin_nodes_from_cluster() { # node names of the nodes where the pcs resource is being stopped local nodelist="$1" local hostname local nodename local rc=$OCF_ERR_GENERIC local rnode # nodes in rabbit cluster db local nodes_in_cluster local LH="${LL} unjoin_nodes_from_cluster():" nodes_in_cluster=$(get_nodes) rc=$? if [ $rc -ne 0 ] ; then # no nodes in node list, nothing to do return $OCF_SUCCESS fi # unjoin all cluster nodes which are being stopped (i.e. 
recieved post-stop notify), except *this* node # before to unjoin the nodes, make sure they were disconnected from *this* node for hostname in $nodelist ; do nodename=$(rabbit_node_name $hostname) if [ "${nodename}" = "${RABBITMQ_NODENAME}" ] ; then continue fi for rnode in $nodes_in_cluster ; do if [ "${nodename}" = "${rnode}" ] ; then # disconnect node being unjoined from this node ocf_run ${OCF_RESKEY_ctl} eval "disconnect_node(list_to_atom(\"${nodename}\"))." 2>&1 rc=$? if [ $rc -eq $OCF_SUCCESS ] ; then ocf_log info "${LH} node '${nodename}' disconnected succesfully." else ocf_log info "${LH} disconnecting node '${nodename}' failed." fi # unjoin node # when the rabbit node went down, its status # remains 'running' for a while, so few retries are required local tries=0 until [ $tries -eq 5 ]; do tries=$((tries+1)) if is_clustered_with $nodename; then ocf_log info "${LH} the ${nodename} is alive and cannot be kicked from the cluster yet" else break fi sleep 10 done ocf_log info "${LH} Execute forget_cluster_node with timeout: ${TIMEOUT_ARG}" su_rabbit_cmd "${OCF_RESKEY_ctl} forget_cluster_node ${nodename}" rc=$? if [ $rc -eq 0 ] ; then ocf_log info "${LH} node '${nodename}' unjoined succesfully." else ocf_log warn "${LH} unjoining node '${nodename}' failed." fi fi done done return $OCF_SUCCESS } # Stop RMQ beam server process. Returns SUCCESS/ERROR stop_server_process() { local pid local rc=$OCF_ERR_GENERIC local LH="${LL} stop_server_process():" pid=$(cat ${OCF_RESKEY_pid_file}) rc=$? if [ $rc -ne 0 ] ; then # Try to stop without known PID ocf_log err "${LH} RMQ-server process PIDFILE was not found!" su_rabbit_cmd "${OCF_RESKEY_ctl} stop >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1" if [ $? -eq 0 ] ; then ocf_log info "${LH} RMQ-server process stopped succesfully, although there was no PIDFILE found." 
ocf_log info "${LH} grant a graceful termintation window ${OCF_RESKEY_stop_time} to end its beam" sleep "${OCF_RESKEY_stop_time}" else kill_rmq_and_remove_pid fi elif [ "${pid}" ] ; then # Try to stop gracefully by known PID ocf_log info "${LH} Execute stop with timeout: ${TIMEOUT_ARG}" su_rabbit_cmd "${OCF_RESKEY_ctl} stop ${OCF_RESKEY_pid_file} >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1" [ $? -eq 0 ] && ocf_log info "${LH} RMQ-server process (PID=${pid}) stopped succesfully." fi # Ensure there is no beam process and pidfile left pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null rc=$? if [ -f ${OCF_RESKEY_pid_file} ] || [ $rc -eq 0 ] ; then ocf_log warn "${LH} The pidfile or beam's still exist, forcing the RMQ-server cleanup" kill_rmq_and_remove_pid return $? else return $OCF_SUCCESS fi } # Stop RMQ-app. Return OCF_SUCCESS, if the app was stopped, # otherwise return OCF_ERR_GENERIC stop_rmq_server_app() { local rc=$OCF_ERR_GENERIC # if the beam process isn't running, then rabbit app is stopped as well get_status rc=$? if [ $rc -ne 0 ] ; then return $OCF_SUCCESS fi # stop the app ocf_log info "${LH} Execute stop_app with timeout: ${TIMEOUT_ARG}" su_rabbit_cmd "${OCF_RESKEY_ctl} stop_app >> \"${OCF_RESKEY_log_dir}/shutdown_log\" 2>&1" rc=$? if [ $rc -ne 0 ] ; then ocf_log err "${LH} RMQ-server app cannot be stopped." return $OCF_ERR_GENERIC fi get_status rabbit rc=$? if [ $rc -ne $OCF_SUCCESS ] ; then ocf_log info "${LH} RMQ-server app stopped succesfully." rc=$OCF_SUCCESS else ocf_log err "${LH} RMQ-server app cannot be stopped." rc=$OCF_ERR_GENERIC fi return $rc } start_beam_process() { local command local rc=$OCF_ERR_GENERIC local ts_end local pf_end local pid local LH="${LL} start_beam_process():" # remove old PID-file if it exists if [ -f "${OCF_RESKEY_pid_file}" ] ; then ocf_log warn "${LH} found old PID-file '${OCF_RESKEY_pid_file}'." 
        pid=$(cat ${OCF_RESKEY_pid_file})
        if [ "${pid}" ] && [ -d "/proc/${pid}" ] ; then
            # only kill the stale PID if its cmdline really is a beam VM
            ocf_run cat /proc/${pid}/cmdline | grep -c 'bin/beam' > /dev/null 2>&1
            rc=$?
            if [ $rc -eq $OCF_SUCCESS ] ; then
                ocf_log warn "${LH} found beam process with PID=${pid}, killing...'."
                ocf_run kill -TERM $pid
            else
                ocf_log err "${LH} found unknown process with PID=${pid} from '${OCF_RESKEY_pid_file}'."
                return $OCF_ERR_GENERIC
            fi
        fi
        ocf_run rm -f $OCF_RESKEY_pid_file
    fi

    [ -f /etc/default/rabbitmq-server ] && . /etc/default/rabbitmq-server

    # RabbitMQ requires high soft and hard limits for NOFILE
    set_limits

    # run beam process
    # NOTE(review): the username 'rabbitmq' is hardcoded here although
    # OCF_RESKEY_username is configurable (default "rabbitmq") - confirm
    # whether ${OCF_RESKEY_username} should be used instead.
    # NOTE(review): RABBITMQ_NODE_ONLY=1 is set on the 'su' process; whether
    # it survives into the su'd shell depends on su's env handling - verify.
    command="${OCF_RESKEY_binary} >> \"${OCF_RESKEY_log_dir}/startup_log\" 2>/dev/null"
    RABBITMQ_NODE_ONLY=1 su rabbitmq -s /bin/sh -c "${command}"&
    ts_end=$(( $(now) + ${OCF_RESKEY_start_time} ))
    sleep 3 # give it some time, before attempting to start_app
    # PID-file is now created later, if the application started successfully
    # So assume beam.smp is started, and defer errors handling for start_app
    return $OCF_SUCCESS
}

check_plugins() {
    # Check if it's safe to load plugins and if we need to do so. Logic is:
    # if (EnabledPlugins > 0) and (ActivePlugins == 0) ; then it's safe to load
    # If we have at least one active plugin, then it's not safe to re-load them
    # because plugins:setup() would remove existing dependency plugins in plugins_expand_dir.
    # The Erlang snippet raises "need_to_load_plugins" (non-zero exit) exactly
    # when loading is both needed and safe.
    ${OCF_RESKEY_ctl} eval '{ok, EnabledFile} = application:get_env(rabbit, enabled_plugins_file), EnabledPlugins = rabbit_plugins:read_enabled(EnabledFile), ActivePlugins = rabbit_plugins:active(), if length(EnabledPlugins)>0 -> if length(ActivePlugins)==0 -> erlang:error("need_to_load_plugins"); true -> false end; true -> false end.'
    return $?
}

# Load and start enabled plugins, but only when check_plugins says it is
# needed (i.e. when it exits non-zero).
load_plugins() {
    check_plugins
    local rc=$?
    if [ $rc -eq 0 ] ; then
        # plugins already active (or none enabled) - nothing to load
        return 0
    else
        ${OCF_RESKEY_ctl} eval 'ToBeLoaded = rabbit_plugins:setup(), ok = app_utils:load_applications(ToBeLoaded), StartupApps = app_utils:app_dependency_order(ToBeLoaded,false), app_utils:start_applications(StartupApps).'
        return $?
    fi
}

# Print the list of currently active plugins reported by the broker.
list_active_plugins() {
    local list
    list=`${OCF_RESKEY_ctl} eval 'rabbit_plugins:active().' 2>/dev/null`
    echo "${list}"
}

# Start the rabbit app on an already-running (or freshly started) beam.
# $1 - optional startup log path (defaults to ${OCF_RESKEY_log_dir}/startup_log)
try_to_start_rmq_app() {
    local startup_log="${1:-${OCF_RESKEY_log_dir}/startup_log}"
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} try_to_start_rmq_app():"

    # make sure the beam is up first
    get_status
    rc=$?
    if [ $rc -ne $OCF_SUCCESS ] ; then
        ocf_log info "${LH} RMQ-runtime (beam) not started, starting..."
        start_beam_process
        rc=$?
        if [ $rc -ne $OCF_SUCCESS ]; then
            ocf_log err "${LH} Failed to start beam - returning from the function"
            return $OCF_ERR_GENERIC
        fi
    fi

    if [ -z "${startup_log}" ] ; then
        startup_log="${OCF_RESKEY_log_dir}/startup_log"
    fi

    ocf_log info "${LH} begin."
    ocf_log info "${LH} Execute start_app with timeout: ${TIMEOUT_ARG}"
    su_rabbit_cmd "${OCF_RESKEY_ctl} start_app >>${startup_log} 2>&1"
    rc=$?
    if [ $rc -eq 0 ] ; then
        ocf_log info "${LH} start_app was successful."
        ocf_log info "${LH} waiting for start to finish with timeout: ${TIMEOUT_ARG}"
        su_rabbit_cmd "${OCF_RESKEY_ctl} wait ${OCF_RESKEY_pid_file}"
        rc=$?
        if [ $rc -ne 0 ] ; then
            ocf_log err "${LH} RMQ-server app failed to wait for start."
            return $OCF_ERR_GENERIC
        fi
        rc=$OCF_SUCCESS
        # Loading enabled modules; plugin failure is logged but not fatal
        ocf_log info "${LH} start plugins."
        load_plugins
        local mrc=$?
        if [ $mrc -eq 0 ] ; then
            local mlist
            mlist=`list_active_plugins`
            ocf_log info "${LH} Starting plugins: ${mlist}"
        else
            ocf_log info "${LH} Starting plugins: failed."
        fi
    else
        ocf_log info "${LH} start_app failed."
        rc=$OCF_ERR_GENERIC
    fi
    return $rc
}

# Initial-start probe: block client access, verify that the app can start
# and stop cleanly (resetting Mnesia if needed), then unblock access.
start_rmq_server_app() {
    local rc=$OCF_ERR_GENERIC
    local startup_log="${OCF_RESKEY_log_dir}/startup_log"
    local startup_output
    local LH="${LL} start_rmq_server_app():"
    local a

    #We are performing initial start check.
#We are not ready to provide service. #Clients should not have access. ocf_log info "${LH} begin." # Safe-unblock the rules, if there are any unblock_client_access "${LH}" # Apply the blocking rule block_client_access rc=$? if [ $rc -eq $OCF_SUCCESS ]; then ocf_log info "${LH} blocked access to RMQ port" else ocf_log err "${LH} cannot block access to RMQ port!" return $OCF_ERR_GENERIC fi get_status rc=$? if [ $rc -ne $OCF_SUCCESS ] ; then ocf_log info "${LH} RMQ-runtime (beam) not started, starting..." start_beam_process rc=$? if [ $rc -ne $OCF_SUCCESS ]; then unblock_client_access "${LH}" return $OCF_ERR_GENERIC fi fi ocf_log info "${LH} RMQ-server app not started, starting..." try_to_start_rmq_app "$startup_log" rc=$? if [ $rc -eq $OCF_SUCCESS ] ; then # rabbitmq-server started successfuly as master of cluster master_score $MIN_MASTER_SCORE stop_rmq_server_app rc=$? if [ $rc -ne 0 ] ; then ocf_log err "${LH} RMQ-server app can't be stopped. Beam will be killed." kill_rmq_and_remove_pid unblock_client_access "${LH}" return $OCF_ERR_GENERIC fi else # error at start RMQ-server ocf_log warn "${LH} RMQ-server app can't start without Mnesia cleaning." for a in $(seq 1 10) ; do rc=$OCF_ERR_GENERIC reset_mnesia || break try_to_start_rmq_app "$startup_log" rc=$? if [ $rc -eq $OCF_SUCCESS ]; then stop_rmq_server_app rc=$? if [ $rc -eq $OCF_SUCCESS ]; then ocf_log info "${LH} RMQ-server app Mnesia cleaned successfully." rc=$OCF_SUCCESS master_score $MIN_MASTER_SCORE break else ocf_log err "${LH} RMQ-server app can't be stopped during Mnesia cleaning. Beam will be killed." kill_rmq_and_remove_pid unblock_client_access "${LH}" return $OCF_ERR_GENERIC fi fi done fi if [ $rc -eq $OCF_ERR_GENERIC ] ; then ocf_log err "${LH} RMQ-server can't be started while many tries. Beam will be killed." kill_rmq_and_remove_pid fi ocf_log info "${LH} end." 
unblock_client_access "${LH}" return $rc } # check status of rabbit beam process or a rabbit app, if rabbit arg specified # by default, test if the kernel app is running, otherwise consider it is "not running" get_status() { local what="${1:-kernel}" local rc=$OCF_NOT_RUNNING local LH="${LL} get_status():" local body local beam_running body=$( ${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} eval 'rabbit_misc:which_applications().' 2>&1 ) rc=$? pgrep -f "beam.*${RABBITMQ_NODENAME}" > /dev/null beam_running=$? # report not running only if the which_applications() reported an error AND the beam is not running if [ $rc -ne 0 ] && [ $beam_running -ne 0 ] ; then ocf_log info "${LH} failed with code ${rc}. Command output: ${body}" return $OCF_NOT_RUNNING # return a generic error, if there were errors and beam is found running elif [ $rc -ne 0 ] ; then ocf_log info "${LH} found the beam process running but failed with code ${rc}. Command output: ${body}" return $OCF_ERR_GENERIC fi # try to parse the which_applications() output only if it exited w/o errors if [ "${what}" ] && [ $rc -eq 0 ] ; then rc=$OCF_NOT_RUNNING echo "$body" | grep "\{${what}," > /dev/null 2>&1 && rc=$OCF_SUCCESS if [ $rc -ne $OCF_SUCCESS ] ; then ocf_log info "${LH} app ${what} was not found in command output: ${body}" fi fi [ $rc -ne $OCF_SUCCESS ] && rc=$OCF_NOT_RUNNING return $rc } action_status() { local rc=$OCF_ERR_GENERIC get_status rc=$? return $rc } # return 0, if given node has a master attribute in CIB, # otherwise, return 1 is_master() { local result result=`crm_attribute -N "${1}" -l reboot --name 'rabbit-master' --query 2>/dev/null |\ awk '{print $3}' | awk -F "=" '{print $2}' | sed -e '/(null)/d'` if [ "${result}" != 'true' ] ; then return 1 fi return 0 } # Verify if su_rabbit_cmd exited by timeout by checking its return code. # If it did not, return 0. If it did AND it is # $OCF_RESKEY_max_rabbitmqctl_timeouts'th timeout in a row, # return 2 to signal get_monitor that it should # exit with error. 
Otherwise return 1 to signal that there was a timeout, # but it should be ignored. Timeouts for different operations are tracked # separately. The second argument is used to distingush them. check_timeouts() { local op_rc=$1 local timeouts_attr_name=$2 local op_name=$3 # 75 is EX_TEMPFAIL from sysexits, and is used by rabbitmqctl to signal about # timeout. if [ $op_rc -ne 124 ] && [ $op_rc -ne 137 ] && [ $op_rc -ne 75 ]; then ocf_update_private_attr $timeouts_attr_name 0 return 0 fi local count count=$(ocf_get_private_attr $timeouts_attr_name 0) count=$((count+1)) # There is a slight chance that this piece of code will be executed twice simultaneously. # As a result, $timeouts_attr_name's value will be one less than it should be. But we don't need # precise calculation here. ocf_update_private_attr $timeouts_attr_name $count if [ $count -lt $OCF_RESKEY_max_rabbitmqctl_timeouts ]; then ocf_log warn "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row. Doing nothing for now." return 1 else ocf_log err "${LH} 'rabbitmqctl $op_name' timed out $count of max. $OCF_RESKEY_max_rabbitmqctl_timeouts time(s) in a row and is not responding. The resource is failed." return 2 fi } wait_sync() { local wait_time=$1 local queues local opt_arg="" if ocf_is_true "$OCF_RESKEY_rmq_feature_local_list_queues"; then opt_arg="--local" fi queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} -p ${OCF_RESKEY_default_vhost} list_queues $opt_arg name state" su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \ do sleep 2; done\"" return $? } get_monitor() { local rc=$OCF_ERR_GENERIC local LH="${LL} get_monitor():" local status_master=1 local rabbit_running local name local node local node_start_time local nowtime local partitions_report local node_partitions ocf_log info "${LH} CHECK LEVEL IS: ${OCF_CHECK_LEVEL}" get_status rc=$? if [ $rc -eq $OCF_NOT_RUNNING ] ; then ocf_log info "${LH} get_status() returns ${rc}." 
ocf_log info "${LH} ensuring this slave does not get promoted." master_score 0 return $OCF_NOT_RUNNING elif [ $rc -eq $OCF_SUCCESS ] ; then ocf_log info "${LH} get_status() returns ${rc}." ocf_log info "${LH} also checking if we are master." get_status rabbit rabbit_running=$? is_master $THIS_PCMK_NODE status_master=$? ocf_log info "${LH} master attribute is ${status_master}" if [ $status_master -eq 0 ] && [ $rabbit_running -eq $OCF_SUCCESS ] then ocf_log info "${LH} We are the running master" rc=$OCF_RUNNING_MASTER elif [ $status_master -eq 0 ] && [ $rabbit_running -ne $OCF_SUCCESS ] ; then ocf_log err "${LH} We are the master and RMQ-runtime (beam) is not running. this is a failure" exit $OCF_FAILED_MASTER fi fi get_status rabbit rabbit_running=$? ocf_log info "${LH} checking if rabbit app is running" if [ $rc -eq $OCF_RUNNING_MASTER ]; then if [ $rabbit_running -eq $OCF_SUCCESS ]; then ocf_log info "${LH} rabbit app is running and is master of cluster" else ocf_log err "${LH} we are the master and rabbit app is not running. This is a failure" exit $OCF_FAILED_MASTER fi else start_time=$((180 + $(ocf_get_private_attr 'rabbit-start-phase-1-time' 0))) restart_order_time=$((60 + $(ocf_get_private_attr 'rabbit-ordered-to-restart' 0))) nowtime=$(now) # If we started more than 3 minutes ago, and # we got order to restart less than 1 minute ago if [ $nowtime -lt $restart_order_time ]; then if [ $nowtime -gt $start_time ]; then ocf_log err "${LH} failing because we have received an order to restart from the master" stop_server_process rc=$OCF_ERR_GENERIC else ocf_log warn "${LH} received an order to restart from the master, ignoring it because we have just started" fi fi fi if [ $rc -eq $OCF_ERR_GENERIC ]; then ocf_log err "${LH} get_status() returns generic error ${rc}" ocf_log info "${LH} ensuring this slave does not get promoted." 
master_score 0 return $OCF_ERR_GENERIC fi # Recounting our master score ocf_log info "${LH} preparing to update master score for node" local our_start_time local new_score local node_start_time local node_score our_start_time=$(get_node_start_time $THIS_PCMK_NODE) if [ $our_start_time -eq 0 ]; then new_score=$MIN_MASTER_SCORE else new_score=$BEST_MASTER_SCORE for node in $(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE) do node_start_time=$(get_node_start_time $node) node_score=$(get_node_master_score $node) ocf_log info "${LH} comparing us (start time: $our_start_time, score: $new_score) with $node (start time: $node_start_time, score: $node_score)" if [ $node_start_time -ne 0 ] && [ $node_score -ne 0 ] && [ $node_start_time -lt $our_start_time ]; then new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score )) elif [ $node_start_time -ne 0 ] && [ $node_score -ne 0 ] && [ $node_start_time -eq $our_start_time ]; then # Do not get promoted if the other node is already master and we have the same start time if is_master $node; then new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score )) fi fi done fi if [ "$new_score" -ne "$(get_node_master_score $THIS_PCMK_NODE)" ]; then master_score $new_score fi ocf_log info "${LH} our start time is $our_start_time and score is $new_score" # Skip all other checks if rabbit app is not running if [ $rabbit_running -ne $OCF_SUCCESS ]; then ocf_log info "${LH} RabbitMQ is not running, get_monitor function ready to return ${rc}" return $rc fi # rc can be SUCCESS or RUNNING_MASTER, don't touch it unless there # is some error uncovered by node_health_check if ! node_health_check; then rc=$OCF_ERR_GENERIC fi if [ $rc -eq $OCF_RUNNING_MASTER ] ; then # If we are the master and healthy, perform various # connectivity checks for other nodes in the cluster. # Order a member to restart if something fishy happens with it. # All cross-node checks MUST happen only here. 
partitions_report="$(partitions_report)" for node in $(get_alive_pacemaker_nodes_but $THIS_PCMK_NODE); do # Restart node if we don't consider ourselves clustered with it if ! is_clustered_with $node; then ocf_log warn "${LH} node $node is not connected with us" order_node_restart "$node" continue fi # Restart node if it has any unresolved partitions node_partitions=$(grep_partitions_report $node "$partitions_report") if [ ! -z "$node_partitions" ]; then ocf_log warn "${LH} Node $node thinks that it is partitoned with $node_partitions" order_node_restart "$node" continue fi done fi ocf_log info "${LH} get_monitor function ready to return ${rc}" return $rc } order_node_restart() { local node=${1:?} ocf_log warn "${LH} Ordering node '$node' to restart" ocf_update_private_attr 'rabbit-ordered-to-restart' "$(now)" "$node" } # Checks whether node is mentioned somewhere in report returned by # partitions_report() grep_partitions_report() { local node="${1:?}" local report="${2:?}" local rabbit_node rabbit_node=$(rabbit_node_name "$node") echo "$report" | grep "PARTITIONED $rabbit_node:" | sed -e 's/^[^:]\+: //' } # Report partitions (if any) from viewpoint of every running node in cluster. # It is parseable/grepable version of `rabbitmqctl cluster_status`. # # If node sees partition, report will contain the line like: # PARTITIONED node-name: list-of-nodes, which-node-name-considers, itself-partitioned-with partitions_report() { $COMMAND_TIMEOUT xargs -0 ${OCF_RESKEY_ctl} eval < ok; ({Node, Partitions}) -> PartitionsStr = string:join([atom_to_list(Part) || Part <- Partitions], ", "), io:format("PARTITIONED ~s: ~s~n", [Node, PartitionsStr]) end, Replies), ok. EOF } # Check if the rabbitmqctl control plane is alive. node_health_check() { local rc if [ "$OCF_RESKEY_rmq_feature_health_check" = true ]; then node_health_check_local rc=$? else node_health_check_legacy rc=$? 
    fi
    return $rc
}

# Health check via `rabbitmqctl node_health_check`.
# Returns OCF_SUCCESS, or OCF_ERR_GENERIC on error / repeated timeouts.
node_health_check_local() {
    local LH="${LH} node_health_check_local():"
    local rc
    local rc_timeouts

    # Give node_health_check some time to handle timeout by itself.
    # By using internal rabbitmqctl timeouts, we allow it to print
    # more useful diagnostics
    local timeout=$((TIMEOUT_ARG - 2))
    su_rabbit_cmd "${OCF_RESKEY_ctl} node_health_check -t $timeout"
    rc=$?

    check_timeouts $rc "rabbit_node_health_check_timeouts" "node_health_check"
    rc_timeouts=$?

    if [ "$rc_timeouts" -eq 2 ]; then
        # too many timeouts in a row - demote and fail
        master_score 0
        ocf_log info "${LH} node_health_check timed out, retry limit reached"
        return $OCF_ERR_GENERIC
    elif [ "$rc_timeouts" -eq 1 ]; then
        # a tolerated timeout - treat as success for now
        ocf_log info "${LH} node_health_check timed out, going to retry"
        return $OCF_SUCCESS
    fi

    if [ "$rc" -ne 0 ]; then
        ocf_log err "${LH} rabbitmqctl node_health_check exited with errors."
        return $OCF_ERR_GENERIC
    else
        return $OCF_SUCCESS
    fi
}

# Legacy health check: list_channels liveness, memory alarms,
# cluster_status and list_queues statistics.
node_health_check_legacy() {
    local rc_alive
    local timeout_alive
    su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels > /dev/null 2>&1"
    rc_alive=$?
    # 124/137 are the 'timeout' utility's TERM/KILL exit codes
    { [ $rc_alive -eq 137 ] || [ $rc_alive -eq 124 ] ; } && ocf_log err "${LH} 'rabbitmqctl list_channels' timed out, per-node explanation: $(enhanced_list_channels)"
    check_timeouts $rc_alive "rabbit_list_channels_timeouts" "list_channels"
    timeout_alive=$?

    if [ $timeout_alive -eq 2 ]; then
        master_score 0
        return $OCF_ERR_GENERIC
    elif [ $timeout_alive -eq 0 ]; then
        if [ $rc_alive -ne 0 ]; then
            ocf_log err "${LH} rabbitmqctl list_channels exited with errors."
            rc=$OCF_ERR_GENERIC
        fi
    fi

    # Check for memory alarms for this Master or Slave node.
    # If alert found, reset the alarm
    # and restart the resource as it likely means a dead end situation
    # when rabbitmq cluster is running with blocked publishing due
    # to high memory watermark exceeded.
    local alarms
    local rc_alarms
    local timeout_alarms
    alarms=`su_rabbit_cmd "${OCF_RESKEY_ctl} -q eval 'rabbit_alarm:get_alarms().'" 2>/dev/null`
    rc_alarms=$?
check_timeouts $rc_alarms "rabbit_get_alarms_timeouts" "get_alarms" timeout_alarms=$? if [ $timeout_alarms -eq 2 ]; then master_score 0 return $OCF_ERR_GENERIC elif [ $timeout_alarms -eq 0 ]; then if [ $rc_alarms -ne 0 ]; then ocf_log err "${LH} rabbitmqctl get_alarms exited with errors." rc=$OCF_ERR_GENERIC elif [ -n "${alarms}" ]; then for node in ${alarms}; do name=`echo ${node} | perl -n -e "m/memory,'(?\S+)+'/ && print \"$+{n}\n\""` if [ "${name}" = "${RABBITMQ_NODENAME}" ] ; then ocf_log err "${LH} Found raised memory alarm. Erasing the alarm and restarting." su_rabbit_cmd "${OCF_RESKEY_ctl} set_vm_memory_high_watermark 10 > /dev/null 2>&1" rc=$OCF_ERR_GENERIC break fi done fi fi if ! is_cluster_status_ok ; then rc=$OCF_ERR_GENERIC fi # Check if the list of all queues is available, # Also report some queues stats and total virtual memory. local queues local rc_queues local timeout_queues queues=`su_rabbit_cmd "${OCF_RESKEY_ctl} -q -p ${OCF_RESKEY_default_vhost} list_queues memory messages consumer_utilisation"` rc_queues=$? check_timeouts $rc_queues "rabbit_list_queues_timeouts" "list_queues" timeout_queues=$? if [ $timeout_queues -eq 2 ]; then master_score 0 return $OCF_ERR_GENERIC elif [ $timeout_queues -eq 0 ]; then if [ $rc_queues -ne 0 ]; then ocf_log err "${LH} rabbitmqctl list_queues exited with errors." 
            rc=$OCF_ERR_GENERIC
        elif [ -n "${queues}" ]; then
            # aggregate queue stats purely for informational logging
            local q_c
            q_c=`printf %b "${queues}\n" | wc -l`
            local mem
            mem=`printf %b "${queues}\n" | awk -v sum=0 '{sum+=$1} END {print (sum/1048576)}'`
            local mes
            mes=`printf %b "${queues}\n" | awk -v sum=0 '{sum+=$2} END {print sum}'`
            local c_u
            c_u=`printf %b "${queues}\n" | awk -v sum=0 -v cnt=${q_c} '{sum+=$3} END {print (sum+1)/(cnt+1)}'`
            local status
            status=`echo $(su_rabbit_cmd "${OCF_RESKEY_ctl} -q status")`
            ocf_log info "${LH} RabbitMQ is running ${q_c} queues consuming ${mem}m of ${TOTALVMEM}m total, with ${mes} queued messages, average consumer utilization ${c_u}"
            ocf_log info "${LH} RabbitMQ status: ${status}"
        fi
    fi

    return $rc
}

# Read a reboot-lifetime private attribute via attrd_updater.
# $1 - attribute name, $2 - default value, $3 - node (default: this node)
ocf_get_private_attr() {
    local attr_name="${1:?}"
    local attr_default_value="${2:?}"
    local nodename="${3:-$THIS_PCMK_NODE}"

    local count
    count=$(attrd_updater -p --name "$attr_name" --node "$nodename" --query)
    if [ $? -ne 0 ]; then
        echo $attr_default_value
    else
        # attrd_updater prints 'name="..." host="..." value="..."';
        # strip quotes and take the value, falling back to the default
        echo "$count" | awk -vdef_val="$attr_default_value" '{ gsub(/"/, "", $3); split($3, vals, "="); if (vals[2] != "") print vals[2]; else print def_val }'
    fi
}

# Write a reboot-lifetime private attribute via attrd_updater.
# $1 - attribute name, $2 - value, $3 - node (default: this node)
ocf_update_private_attr() {
    local attr_name="${1:?}"
    local attr_value="${2:?}"
    local nodename="${3:-$THIS_PCMK_NODE}"

    ocf_run attrd_updater -p --name "$attr_name" --node "$nodename" --update "$attr_value"
}

# Run a rabbitmqctl command and fold its timeout streak handling in:
# returns the command's own rc, 0 for a tolerated timeout, 1 when the
# timeout retry limit was exceeded.
rabbitmqctl_with_timeout_check() {
    local command="${1:?}"
    local timeout_attr_name="${2:?}"

    su_rabbit_cmd "${OCF_RESKEY_ctl} $command"
    local rc=$?

    check_timeouts $rc $timeout_attr_name "$command"
    local has_timed_out=$?

    case "$has_timed_out" in
        0)
            return $rc;;
        1)
            return 0;;
        2)
            return 1;;
    esac
}

is_cluster_status_ok() {
    local LH="${LH}: is_cluster_status_ok:"
    rabbitmqctl_with_timeout_check cluster_status rabbit_cluster_status_timeouts > /dev/null 2>&1
}

# OCF 'monitor' action entry point
action_monitor() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} monitor:"
    ocf_log debug "${LH} action start."
    if ocf_is_true "${OCF_RESKEY_debug}"; then
        # debug tracing to /tmp; enabled only via the 'debug' parameter
        d=`date '+%Y%m%d %H:%M:%S'`
        echo $d >> /tmp/rmq-monitor.log
        env >> /tmp/rmq-monitor.log
        echo "$d [monitor] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi
    get_monitor
    rc=$?
    ocf_log debug "${LH} role: ${OCF_RESKEY_CRM_meta_role}"
    ocf_log debug "${LH} result: $rc"
    ocf_log debug "${LH} action end."
    return $rc
}

# OCF 'start' action: reset bookkeeping attributes and perform the
# initial start check (the rabbit app itself is started later via notify).
action_start() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} start:"
    local nowtime

    if ocf_is_true "${OCF_RESKEY_debug}"; then
        d=`date '+%Y%m%d %H:%M:%S'`
        echo $d >> /tmp/rmq-start.log
        env >> /tmp/rmq-start.log
        echo "$d [start] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    ocf_log info "${LH} action begin."

    get_status
    rc=$?
    if [ $rc -eq $OCF_SUCCESS ] ; then
        ocf_log warn "${LH} RMQ-runtime (beam) already started."
        return $OCF_SUCCESS
    fi

    # reset all rabbitmqctl timeout streak counters for a clean start
    local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts rabbit_node_health_check_timeouts"
    local attr_name_to_reset
    for attr_name_to_reset in $attrs_to_zero; do
        ocf_update_private_attr $attr_name_to_reset 0
    done

    nowtime=$(now)
    ocf_log info "${LH} Setting phase 1 one start time to $nowtime"
    ocf_update_private_attr 'rabbit-start-phase-1-time' "$nowtime"
    ocf_log info "${LH} Deleting start time attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete
    ocf_log info "${LH} Deleting master attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
    ocf_log info "${LH} RMQ going to start."
    start_rmq_server_app
    rc=$?
    if [ $rc -eq $OCF_SUCCESS ] ; then
        ocf_log info "${LH} RMQ prepared for start succesfully."
    fi

    ocf_log info "${LH} action end."
    return $rc
}

# OCF 'stop' action: drop master attributes, wait for queue sync,
# then stop the beam. Exits (rather than returns) on failure so the
# node becomes unmanaged only after loud logging.
action_stop() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} stop:"

    if ocf_is_true "${OCF_RESKEY_debug}"; then
        d=$(date '+%Y%m%d %H:%M:%S')
        echo $d >> /tmp/rmq-stop.log
        env >> /tmp/rmq-stop.log
        echo "$d [stop] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    ocf_log info "${LH} action begin."

    ocf_log info "${LH} Deleting master attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
    master_score 0
    ocf_log info "${LH} Deleting start time attribute"
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-start-time' --delete

    # Wait for synced state first
    ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
    wait_sync $((OCF_RESKEY_stop_time/2))

    ocf_log info "${LH} RMQ-runtime (beam) going to down."
    stop_server_process
    if [ $? -ne $OCF_SUCCESS ] ; then
        ocf_log err "RMQ-runtime (beam) couldn't be stopped and will likely became unmanaged. Take care of it manually!"
        ocf_log info "${LH} action end."
        exit $OCF_ERR_GENERIC
    fi

    ocf_log info "${LH} RMQ-runtime (beam) not running."
    ocf_log info "${LH} action end."
    return $OCF_SUCCESS
}

#######################################################################
# Enhanced list_channels:
# - nodes are processed in parallel
# - report contains information about which nodes timed out
#
# 'list_channels' is used as a health-check for current node, but it
# actually checks overall health of all node in cluster. And there were
# some bugs where only one (non-local) channel became stuck, but OCF
# script was wrongfully killing local node.
#
# Hopefully all such bugs are fixed, but if not - it will allow to
# detect such conditions.
# # Somewhat strange implementation is due to the following reasons: # - ability to support older versions of RabbitMQ which have reached # end-of-life with single version of the script # - zero dependencies - for older versions this functionality could be # implemented as a plugin, but it'll require this plugin installation enhanced_list_channels() { # One second less than timeout of su_rabbit_cmd local timeout=$((${TIMEOUT_ARG:-5} - 1)) su_rabbit_cmd "xargs -0 ${OCF_RESKEY_ctl} eval" < {Mega, Secs, Micro} = os:timestamp(), Mili = Micro div 1000, Mili + 1000 * (Secs + 1000000 * Mega) end, %% We shouldn't continue execution past this time ShouldEndAt = Now() + SecondsToCompletion * 1000, %% How many milliseconds we still have Timeout = fun() -> case ShouldEndAt - Now() of Past when Past =< 0 -> 0; Timeout -> Timeout end end, %% Lambda combinator - for defining anonymous recursive functions Y = fun(F) -> (fun (X) -> F(fun(Y) -> (X(X))(Y) end) end)( fun (X) -> F(fun(Y) -> (X(X))(Y) end) end) end, Parent = self(), ListChannels = Y(fun(Rec) -> fun (({Node, [], OkChannelsCount})) -> Parent ! {Node, ok, OkChannelsCount}; ({Node, [Chan|Rest], OkChannelsCount}) -> case catch rpc:call(Node, rabbit_channel, info, [Chan], Timeout()) of Infos when is_list(Infos) -> Rec({Node, Rest, OkChannelsCount + 1}); {badrpc, {'EXIT', {noproc, _}}} -> %% Channel became dead before we could request it's status, don't care Rec({Node, Rest, OkChannelsCount}); Err -> Parent ! {Node, Err, OkChannelsCount} end end end), SingleNodeListing = fun(Node) -> case catch rpc:call(Node, pg_local, get_members, [rabbit_channels], Timeout()) of LocalChannels when is_list(LocalChannels) -> ListChannels({Node, LocalChannels, 0}); Err -> Parent ! 
                        {Node, Err, 0}
            end
        end,
        AllNodes = rabbit_mnesia:cluster_nodes(running),
        [ spawn(fun() -> SingleNodeListing(Node) end) || Node <- AllNodes ],
        WaitForNodes = Y(fun(Rec) ->
            fun ({[], Acc}) ->
                    Acc;
                ({RemainingNodes, Acc}) ->
                    receive
                        {Node, _Status, _ChannelCount} = Smth ->
                            RemainingNodes1 = lists:delete(Node, RemainingNodes),
                            Rec({RemainingNodes1, [Smth|Acc]})
                    after Timeout() + 100 ->
                            Acc
                    end
            end
        end),
        Result = WaitForNodes({AllNodes, []}),
        ExpandedResult = [ case lists:keysearch(Node, 1, Result) of
                               {value, NodeResult} ->
                                   NodeResult;
                               false ->
                                   {Node, no_data_collected, 0}
                           end || Node <- AllNodes ],
        ExpandedResult.
EOF
}

#######################################################################

# Join the cluster and return OCF_SUCCESS, if joined.
# Return 10, if node is trying to join to itself or empty destination.
# Return OCF_ERR_GENERIC, if cannot join.
# $1 - node to join to
jjj_join () {
    local join_to="$1"
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} jjj_join:"

    my_host ${join_to}
    rc=$?
    ocf_log debug "${LH} node='${join_to}' rc='${rc}'"

    # Check whether we are joining to ourselves
    # or master host is not given
    if [ $rc -ne 0 ] && [ "${join_to}" ] ; then
        ocf_log info "${LH} Joining to cluster by node '${join_to}'"
        join_to_cluster "${join_to}"
        rc=$?
        if [ $rc -ne $OCF_SUCCESS ] ; then
            ocf_log err "${LH} Failed to join the cluster. The mnesia will be reset."
            reset_mnesia
            rc=$OCF_ERR_GENERIC
        fi
    fi
    return $rc
}

# OCF 'notify' action: reacts to post-promote / post-start / post-stop
# cluster events to (re)join nodes and clean up stopped members.
action_notify() {
    local rc_join=$OCF_SUCCESS
    local rc=$OCF_ERR_GENERIC
    local rc2=$OCF_ERR_GENERIC
    local LH="${LL} notify:"
    local nodelist

    if ocf_is_true "${OCF_RESKEY_debug}"; then
        d=`date '+%Y%m%d %H:%M:%S'`
        echo $d >> /tmp/rmq-notify.log
        env >> /tmp/rmq-notify.log
        echo "$d [notify] ${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation} promote='${OCF_RESKEY_CRM_meta_notify_promote_uname}' demote='${OCF_RESKEY_CRM_meta_notify_demote_uname}' master='${OCF_RESKEY_CRM_meta_notify_master_uname}' slave='${OCF_RESKEY_CRM_meta_notify_slave_uname}' start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    if [ "${OCF_RESKEY_CRM_meta_notify_type}" = 'post' ] ; then
        # POST- anything notify section
        case "$OCF_RESKEY_CRM_meta_notify_operation" in
            promote)
                ocf_log info "${LH} post-promote begin."
                rc=$OCF_SUCCESS
                # Do nothing, if the list of nodes being promoted reported empty.
                # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
                if [ -z "${OCF_RESKEY_CRM_meta_notify_promote_uname}" ] ; then
                    ocf_log warn "${LH} there are no nodes to join to reported on post-promote. Nothing to do."
                elif my_host "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
                    ocf_log info "${LH} ignoring post-promote of self"
                elif is_clustered_with "${OCF_RESKEY_CRM_meta_notify_promote_uname}"; then
                    if get_status rabbit; then
                        ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. Nothing to do."
                    else
                        ocf_log info "${LH} we are already clustered with master - ${OCF_RESKEY_CRM_meta_notify_promote_uname}. We only need to start the app."
                        try_to_start_rmq_app
                        rc2=$?
                        update_rabbit_start_time_if_rc $rc2
                    fi
                else
                    # Note, this should fail when the mnesia is inconsistent.
# For example, when the "old" master processing the promition of the new one. # Later this ex-master node will rejoin the cluster at post-start. jjj_join "${OCF_RESKEY_CRM_meta_notify_promote_uname}" rc=$? if [ $rc -eq $OCF_ERR_GENERIC ] ; then ocf_log err "${LH} Failed to join the cluster on post-promote. The resource will be restarted." fi fi ocf_log info "${LH} post-promote end." return $rc ;; start) ocf_log info "${LH} post-start begin." local nodes_list="${OCF_RESKEY_CRM_meta_notify_start_uname} ${OCF_RESKEY_CRM_meta_notify_active_uname}" # Do nothing, if the list of nodes being started or running reported empty # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic if [ -z "${OCF_RESKEY_CRM_meta_notify_start_uname}" ] && [ -z "${OCF_RESKEY_CRM_meta_notify_active_uname}" ] ; then ocf_log warn "${LH} I'm a last man standing and I must survive!" ocf_log info "${LH} post-start end." return $OCF_SUCCESS fi # check did this event from this host my_host "${nodes_list}" rc=$? # Do nothing, if there is no master reported # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic if [ -z "${OCF_RESKEY_CRM_meta_notify_master_uname}" ] ; then ocf_log warn "${LH} there are no nodes to join to reported on post-start. Nothing to do." ocf_log info "${LH} post-start end." return $OCF_SUCCESS fi if [ $rc -eq $OCF_SUCCESS ] ; then # Now we need to: # a. join to the cluster if we are not joined yet # b. start the RabbitMQ application, which is always # stopped after start action finishes check_need_join_to ${OCF_RESKEY_CRM_meta_notify_master_uname} rc_join=$? if [ $rc_join -eq $OCF_SUCCESS ]; then ocf_log warn "${LH} Going to join node ${OCF_RESKEY_CRM_meta_notify_master_uname}" jjj_join "${OCF_RESKEY_CRM_meta_notify_master_uname}" rc2=$? else ocf_log warn "${LH} We are already clustered with node ${OCF_RESKEY_CRM_meta_notify_master_uname}" try_to_start_rmq_app rc2=$? 
                        update_rabbit_start_time_if_rc $rc2
                    fi

                    # optionally import definitions via the management API
                    # NOTE(review): the management port 15672 is hardcoded here
                    # while AMQP's node_port is configurable - confirm intended.
                    if [ -s "${OCF_RESKEY_definitions_dump_file}" ] ; then
                        ocf_log info "File ${OCF_RESKEY_definitions_dump_file} exists"
                        ocf_run curl --silent --show-error --request POST --user $OCF_RESKEY_admin_user:$OCF_RESKEY_admin_password $OCF_RESKEY_host_ip:15672/api/definitions --header "Content-Type:application/json" --data @$OCF_RESKEY_definitions_dump_file
                        rc=$?
                        if [ $rc -eq $OCF_SUCCESS ] ; then
                            ocf_log info "RMQ definitions have imported succesfully."
                        else
                            ocf_log err "RMQ definitions have not imported."
                        fi
                    fi

                    if [ $rc2 -eq $OCF_ERR_GENERIC ] ; then
                        ocf_log warn "${LH} Failed to join the cluster on post-start. The resource will be restarted."
                        ocf_log info "${LH} post-start end."
                        return $OCF_ERR_GENERIC
                    fi
                fi
                ocf_log info "${LH} post-start end."
                ;;
            stop)
                # if rabbitmq-server stops on any another node, we should remove it from cluster (as ordinary operation)
                ocf_log info "${LH} post-stop begin."
                # Report not running, if there are no nodes being stopped reported
                if [ -z "${OCF_RESKEY_CRM_meta_notify_stop_uname}" ] ; then
                    ocf_log warn "${LH} there are no nodes being stopped reported on post-stop. The resource will be restarted."
                    ocf_log info "${LH} post-stop end."
                    return $OCF_ERR_GENERIC
                fi

                my_host "${OCF_RESKEY_CRM_meta_notify_stop_uname}"
                rc=$?
                if [ $rc -ne $OCF_SUCCESS ] ; then
                    # Wait for synced state first
                    ocf_log info "${LH} waiting $((OCF_RESKEY_stop_time/2)) to sync"
                    wait_sync $((OCF_RESKEY_stop_time/2))
                    # On other nodes processing the post-stop, make sure the stopped node will be forgotten
                    unjoin_nodes_from_cluster "${OCF_RESKEY_CRM_meta_notify_stop_uname}"
                else
                    # On the nodes being stopped, reset the master score
                    ocf_log info "${LH} resetting the master score."
                    master_score 0
                fi
                # always returns OCF_SUCCESS
                ocf_log info "${LH} post-stop end."
;;
*)
    ;;
esac
fi
return $OCF_SUCCESS
}

#######################################
# Promote action: ensure the RabbitMQ runtime and application are running
# on this node and record it as the cluster master via the 'rabbit-master'
# node attribute.
# Globals:   OCF_RESKEY_debug, OCF_RESKEY_policy_file, THIS_PCMK_NODE, LL
# Returns:   OCF_SUCCESS or OCF_NOT_RUNNING; exits the whole script with
#            OCF_FAILED_MASTER (or the unexpected monitor code) on fatal
#            promote failures, so 'action end.' is not logged on those paths.
#######################################
action_promote() {
    local rc=$OCF_ERR_GENERIC
    local LH="${LL} promote:"

    if ocf_is_true "${OCF_RESKEY_debug}"; then
        # NOTE(review): debug output goes to fixed, predictable files in /tmp
        # and 'env' may leak sensitive values there -- confirm this is
        # acceptable even for debug-only runs.
        d=$(date '+%Y%m%d %H:%M:%S')
        echo $d >> /tmp/rmq-promote.log
        env >> /tmp/rmq-promote.log
        echo "$d [promote] start='${OCF_RESKEY_CRM_meta_notify_start_uname}' stop='${OCF_RESKEY_CRM_meta_notify_stop_uname}' active='${OCF_RESKEY_CRM_meta_notify_active_uname}' inactive='${OCF_RESKEY_CRM_meta_notify_inactive_uname}'" >> /tmp/rmq-ocf.log
    fi

    ocf_log info "${LH} action begin."

    get_monitor
    rc=$?
    ocf_log info "${LH} get_monitor returns ${rc}"
    case "$rc" in
        "$OCF_SUCCESS")
            # Running as slave. Normal, expected behavior.
            ocf_log info "${LH} Resource is currently running as Slave"
            # rabbitmqctl start_app if need
            get_status rabbit
            # Save the app status now, before ocf_run below clobbers $?.
            rc=$?
            ocf_log info "${LH} Updating cluster master attribute"
            ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --update 'true'
            if [ $rc -ne $OCF_SUCCESS ] ; then
                ocf_log info "${LH} RMQ app is not started. Starting..."
                start_rmq_server_app
                rc=$?
                if [ $rc -eq 0 ] ; then
                    try_to_start_rmq_app
                    rc=$?
                    if [ $rc -ne 0 ] ; then
                        ocf_log err "${LH} Can't start RMQ app. Master resource is failed."
                        ocf_log info "${LH} action end."
                        # Fatal: report a failed master to Pacemaker.
                        exit $OCF_FAILED_MASTER
                    fi
                    # Apply site-specific HA policies, if a policy file is
                    # installed (sourced into the current shell).
                    [ -f "${OCF_RESKEY_policy_file}" ] && . "${OCF_RESKEY_policy_file}"
                    update_rabbit_start_time_if_rc $rc
                    ocf_log info "${LH} Checking master status"
                    get_monitor
                    rc=$?
                    ocf_log info "${LH} Master status is $rc"
                    # String comparison '=' is fine here: both sides are
                    # small integer codes from ocf-shellfuncs.
                    if [ $rc = $OCF_RUNNING_MASTER ]
                    then
                        rc=$OCF_SUCCESS
                    else
                        ocf_log err "${LH} Master resource is failed."
                        ocf_log info "${LH} action end."
                        exit $OCF_FAILED_MASTER
                    fi
                else
                    ocf_log err "${LH} Can't start RMQ-runtime."
                    rc=$OCF_ERR_GENERIC
                fi
            fi
            return $rc
            ;;
        "$OCF_RUNNING_MASTER")
            # Already a master. Unexpected, but not a problem.
            ocf_log warn "${LH} Resource is already running as Master"
            rc=$OCF_SUCCESS
            ;;
        "$OCF_FAILED_MASTER")
            # Master failed.
            ocf_log err "${LH} Master resource is failed and not running"
            ocf_log info "${LH} action end."
            exit $OCF_FAILED_MASTER
            ;;
        "$OCF_NOT_RUNNING")
            # Currently not running.
            ocf_log err "${LH} Resource is currently not running"
            rc=$OCF_NOT_RUNNING
            ;;
        *)
            # Failed resource. Let the cluster manager recover.
            ocf_log err "${LH} Unexpected error, cannot promote"
            ocf_log info "${LH} action end."
            exit $rc
            ;;
    esac

    # transform slave RMQ-server to master
    ocf_log info "${LH} action end."
    return $rc
}

#######################################
# Demote action: drop the 'rabbit-master' node attribute so this node is no
# longer considered the cluster master. Always returns OCF_SUCCESS.
#######################################
action_demote() {
    local LH="${LL} demote:"
    ocf_log info "${LH} action begin."
    ocf_run crm_attribute -N $THIS_PCMK_NODE -l reboot --name 'rabbit-master' --delete
    ocf_log info "${LH} action end."
    return $OCF_SUCCESS
}

#######################################################################
# Entry point: dispatch the requested OCF action.

# meta-data and help must work without environment setup or validation.
case "$1" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS;;
    usage|help)
        usage
        exit $OCF_SUCCESS;;
esac

rmq_setup_env

# Anything except meta-data and help must pass validation
action_validate || exit $?

# What kind of method was invoked?
# The script's exit status is the dispatched action's return status.
case "$1" in
    start)
        action_start;;
    stop)
        action_stop;;
    status)
        action_status;;
    monitor)
        action_monitor;;
    validate)
        action_validate;;
    promote)
        action_promote;;
    demote)
        action_demote;;
    notify)
        action_notify;;
    validate-all)
        action_validate;;
    *)
        usage;;
esac
###