summaryrefslogtreecommitdiffstats
path: root/daemons/fenced/fenced_remote.c
diff options
context:
space:
mode:
Diffstat (limited to 'daemons/fenced/fenced_remote.c')
-rw-r--r--daemons/fenced/fenced_remote.c459
1 files changed, 272 insertions, 187 deletions
diff --git a/daemons/fenced/fenced_remote.c b/daemons/fenced/fenced_remote.c
index 843b3d4..f87eeb6 100644
--- a/daemons/fenced/fenced_remote.c
+++ b/daemons/fenced/fenced_remote.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2009-2023 the Pacemaker project contributors
+ * Copyright 2009-2024 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -24,7 +24,6 @@
#include <regex.h>
#include <crm/crm.h>
-#include <crm/msg_xml.h>
#include <crm/common/ipc.h>
#include <crm/common/ipc_internal.h>
#include <crm/cluster/internal.h>
@@ -370,24 +369,25 @@ undo_op_remap(remote_fencing_op_t *op)
* \internal
* \brief Create notification data XML for a fencing operation result
*
- * \param[in] op Fencer operation that completed
+ * \param[in,out] parent Parent XML element for newly created element
+ * \param[in] op Fencer operation that completed
*
* \return Newly created XML to add as notification data
* \note The caller is responsible for freeing the result.
*/
static xmlNode *
-fencing_result2xml(const remote_fencing_op_t *op)
+fencing_result2xml(xmlNode *parent, const remote_fencing_op_t *op)
{
- xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
+ xmlNode *notify_data = pcmk__xe_create(parent, PCMK__XE_ST_NOTIFY_FENCE);
- crm_xml_add_int(notify_data, "state", op->state);
- crm_xml_add(notify_data, F_STONITH_TARGET, op->target);
- crm_xml_add(notify_data, F_STONITH_ACTION, op->action);
- crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate);
- crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id);
- crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator);
- crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id);
- crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name);
+ crm_xml_add_int(notify_data, PCMK_XA_STATE, op->state);
+ crm_xml_add(notify_data, PCMK__XA_ST_TARGET, op->target);
+ crm_xml_add(notify_data, PCMK__XA_ST_DEVICE_ACTION, op->action);
+ crm_xml_add(notify_data, PCMK__XA_ST_DELEGATE, op->delegate);
+ crm_xml_add(notify_data, PCMK__XA_ST_REMOTE_OP, op->id);
+ crm_xml_add(notify_data, PCMK__XA_ST_ORIGIN, op->originator);
+ crm_xml_add(notify_data, PCMK__XA_ST_CLIENTID, op->client_id);
+ crm_xml_add(notify_data, PCMK__XA_ST_CLIENTNAME, op->client_name);
return notify_data;
}
@@ -403,25 +403,26 @@ void
fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged)
{
static int count = 0;
- xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY);
- xmlNode *notify_data = fencing_result2xml(op);
+ xmlNode *bcast = pcmk__xe_create(NULL, PCMK__XE_ST_REPLY);
+ xmlNode *wrapper = NULL;
+ xmlNode *notify_data = NULL;
count++;
crm_trace("Broadcasting result to peers");
- crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY);
- crm_xml_add(bcast, F_SUBTYPE, "broadcast");
- crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY);
- crm_xml_add_int(bcast, "count", count);
+ crm_xml_add(bcast, PCMK__XA_T, PCMK__VALUE_ST_NOTIFY);
+ crm_xml_add(bcast, PCMK__XA_SUBT, PCMK__VALUE_BROADCAST);
+ crm_xml_add(bcast, PCMK__XA_ST_OP, STONITH_OP_NOTIFY);
+ crm_xml_add_int(bcast, PCMK_XA_COUNT, count);
if (op_merged) {
- pcmk__xe_set_bool_attr(bcast, F_STONITH_MERGED, true);
+ pcmk__xe_set_bool_attr(bcast, PCMK__XA_ST_OP_MERGED, true);
}
+ wrapper = pcmk__xe_create(bcast, PCMK__XE_ST_CALLDATA);
+ notify_data = fencing_result2xml(wrapper, op);
stonith__xe_set_result(notify_data, &op->result);
- add_message_xml(bcast, F_STONITH_CALLDATA, notify_data);
- send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE);
- free_xml(notify_data);
+ pcmk__cluster_send_message(NULL, crm_msg_stonith_ng, bcast);
free_xml(bcast);
return;
@@ -447,12 +448,12 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data)
}
/* Do notification with a clean data object */
- crm_xml_add_int(data, "state", op->state);
- crm_xml_add(data, F_STONITH_TARGET, op->target);
- crm_xml_add(data, F_STONITH_OPERATION, op->action);
+ crm_xml_add_int(data, PCMK_XA_STATE, op->state);
+ crm_xml_add(data, PCMK__XA_ST_TARGET, op->target);
+ crm_xml_add(data, PCMK__XA_ST_OP, op->action);
reply = fenced_construct_reply(op->request, data, &op->result);
- crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate);
+ crm_xml_add(reply, PCMK__XA_ST_DELEGATE, op->delegate);
/* Send fencing OP reply to local client that initiated fencing */
client = pcmk__find_client_by_id(op->client_id);
@@ -463,10 +464,11 @@ handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data)
}
/* bcast to all local clients that the fencing operation happend */
- notify_data = fencing_result2xml(op);
- fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data);
+ notify_data = fencing_result2xml(NULL, op);
+ fenced_send_notification(PCMK__VALUE_ST_NOTIFY_FENCE, &op->result,
+ notify_data);
free_xml(notify_data);
- fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
+ fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL);
/* mark this op as having notify's already sent */
op->notify_sent = TRUE;
@@ -509,12 +511,13 @@ finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data)
static char *
delegate_from_xml(xmlNode *xml)
{
- xmlNode *match = get_xpath_object("//@" F_STONITH_DELEGATE, xml, LOG_NEVER);
+ xmlNode *match = get_xpath_object("//@" PCMK__XA_ST_DELEGATE, xml,
+ LOG_NEVER);
if (match == NULL) {
- return crm_element_value_copy(xml, F_ORIG);
+ return crm_element_value_copy(xml, PCMK__XA_SRC);
} else {
- return crm_element_value_copy(match, F_STONITH_DELEGATE);
+ return crm_element_value_copy(match, PCMK__XA_ST_DELEGATE);
}
}
@@ -564,7 +567,7 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup)
undo_op_remap(op);
if (data == NULL) {
- data = create_xml_node(NULL, "remote-op");
+ data = pcmk__xe_create(NULL, "remote-op");
local_data = data;
} else if (op->delegate == NULL) {
@@ -584,15 +587,15 @@ finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup)
}
}
- if (dup || (crm_element_value(data, F_STONITH_MERGED) != NULL)) {
+ if (dup || (crm_element_value(data, PCMK__XA_ST_OP_MERGED) != NULL)) {
op_merged = true;
}
/* Tell everyone the operation is done, we will continue
* with doing the local notifications once we receive
* the broadcast back. */
- subt = crm_element_value(data, F_SUBTYPE);
- if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) {
+ subt = crm_element_value(data, PCMK__XA_SUBT);
+ if (!dup && !pcmk__str_eq(subt, PCMK__VALUE_BROADCAST, pcmk__str_none)) {
/* Defer notification until the bcast message arrives */
fenced_broadcast_op_result(op, op_merged);
free_xml(local_data);
@@ -800,7 +803,8 @@ add_required_device(remote_fencing_op_t *op, const char *device)
sort_strings);
if (!match) {
- op->automatic_list = g_list_prepend(op->automatic_list, strdup(device));
+ op->automatic_list = g_list_prepend(op->automatic_list,
+ pcmk__str_copy(device));
}
}
@@ -833,7 +837,10 @@ set_op_device_list(remote_fencing_op_t * op, GList *devices)
op->devices_list = NULL;
}
for (lpc = devices; lpc != NULL; lpc = lpc->next) {
- op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
+ const char *device = lpc->data;
+
+ op->devices_list = g_list_append(op->devices_list,
+ pcmk__str_copy(device));
}
op->devices = op->devices_list;
}
@@ -1001,6 +1008,7 @@ merge_duplicates(remote_fencing_op_t *op)
g_hash_table_iter_init(&iter, stonith_remote_op_list);
while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
const char *other_action = op_requested_action(other);
+ crm_node_t *node = NULL;
if (!strcmp(op->id, other->id)) {
continue; // Don't compare against self
@@ -1030,7 +1038,11 @@ merge_duplicates(remote_fencing_op_t *op)
op->id, other->id, other->target);
continue;
}
- if (!fencing_peer_active(crm_get_peer(0, other->originator))) {
+
+ node = pcmk__get_node(0, other->originator, NULL,
+ pcmk__node_search_cluster_member);
+
+ if (!fencing_peer_active(node)) {
crm_notice("Failing action '%s' targeting %s originating from "
"client %s@%s: Originator is dead " CRM_XS " id=%.8s",
other->action, other->target, other->client_name,
@@ -1042,8 +1054,8 @@ merge_duplicates(remote_fencing_op_t *op)
}
if ((other->total_timeout > 0)
&& (now > (other->total_timeout + other->created))) {
- crm_trace("%.8s not duplicate of %.8s: old (%ld vs. %ld + %d)",
- op->id, other->id, now, other->created,
+ crm_trace("%.8s not duplicate of %.8s: old (%lld vs. %lld + %ds)",
+ op->id, other->id, (long long)now, (long long)other->created,
other->total_timeout);
continue;
}
@@ -1055,7 +1067,7 @@ merge_duplicates(remote_fencing_op_t *op)
if (other->total_timeout == 0) {
other->total_timeout = op->total_timeout =
TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
- crm_trace("Best guess as to timeout used for %.8s: %d",
+ crm_trace("Best guess as to timeout used for %.8s: %ds",
other->id, other->total_timeout);
}
crm_notice("Merging fencing action '%s' targeting %s originating from "
@@ -1097,12 +1109,12 @@ int
fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg)
{
remote_fencing_op_t *op = NULL;
- xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
+ xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, msg, LOG_ERR);
CRM_CHECK(dev != NULL, return EPROTO);
crm_notice("Received manual confirmation that %s has been fenced",
- pcmk__s(crm_element_value(dev, F_STONITH_TARGET),
+ pcmk__s(crm_element_value(dev, PCMK__XA_ST_TARGET),
"unknown target"));
op = initiate_remote_stonith_op(client, msg, TRUE);
if (op == NULL) {
@@ -1110,7 +1122,7 @@ fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg)
}
op->state = st_done;
set_fencing_completed(op);
- op->delegate = strdup("a human");
+ op->delegate = pcmk__str_copy("a human");
// For the fencer's purposes, the fencing operation is done
pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
@@ -1137,7 +1149,8 @@ void *
create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
{
remote_fencing_op_t *op = NULL;
- xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_NEVER);
+ xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, request,
+ LOG_NEVER);
int call_options = 0;
const char *operation = NULL;
@@ -1146,7 +1159,7 @@ create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
/* If this operation is owned by another node, check to make
* sure we haven't already created this operation. */
if (peer && dev) {
- const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
+ const char *op_id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
CRM_CHECK(op_id != NULL, return NULL);
@@ -1158,15 +1171,14 @@ create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
}
}
- op = calloc(1, sizeof(remote_fencing_op_t));
- CRM_ASSERT(op != NULL);
+ op = pcmk__assert_alloc(1, sizeof(remote_fencing_op_t));
- crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout));
+ crm_element_value_int(request, PCMK__XA_ST_TIMEOUT, &(op->base_timeout));
// Value -1 means disable any static/random fencing delays
- crm_element_value_int(request, F_STONITH_DELAY, &(op->client_delay));
+ crm_element_value_int(request, PCMK__XA_ST_DELAY, &(op->client_delay));
if (peer && dev) {
- op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID);
+ op->id = crm_element_value_copy(dev, PCMK__XA_ST_REMOTE_OP);
} else {
op->id = crm_generate_uuid();
}
@@ -1175,41 +1187,49 @@ create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
op->state = st_query;
op->replies_expected = fencing_active_peers();
- op->action = crm_element_value_copy(dev, F_STONITH_ACTION);
- op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN);
- op->delegate = crm_element_value_copy(dev, F_STONITH_DELEGATE); /* May not be set */
- op->created = time(NULL);
+ op->action = crm_element_value_copy(dev, PCMK__XA_ST_DEVICE_ACTION);
+ /* The node initiating the stonith operation. If an operation is relayed,
+ * this is the last node the operation lands on. When in standalone mode,
+ * origin is the ID of the client that originated the operation.
+ *
+ * Or may be the name of the function that created the operation.
+ */
+ op->originator = crm_element_value_copy(dev, PCMK__XA_ST_ORIGIN);
if (op->originator == NULL) {
/* Local or relayed request */
- op->originator = strdup(stonith_our_uname);
+ op->originator = pcmk__str_copy(stonith_our_uname);
}
- CRM_LOG_ASSERT(client != NULL);
- if (client) {
- op->client_id = strdup(client);
- }
+ // Delegate may not be set
+ op->delegate = crm_element_value_copy(dev, PCMK__XA_ST_DELEGATE);
+ op->created = time(NULL);
+ CRM_LOG_ASSERT(client != NULL);
+ op->client_id = pcmk__str_copy(client);
/* For a RELAY operation, set fenced on the client. */
- operation = crm_element_value(request, F_STONITH_OPERATION);
+ operation = crm_element_value(request, PCMK__XA_ST_OP);
if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
(unsigned long) getpid());
} else {
- op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME);
+ op->client_name = crm_element_value_copy(request,
+ PCMK__XA_ST_CLIENTNAME);
}
- op->target = crm_element_value_copy(dev, F_STONITH_TARGET);
- op->request = copy_xml(request); /* TODO: Figure out how to avoid this */
- crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
+ op->target = crm_element_value_copy(dev, PCMK__XA_ST_TARGET);
+
+ // @TODO Figure out how to avoid copying XML here
+ op->request = pcmk__xml_copy(NULL, request);
+ crm_element_value_int(request, PCMK__XA_ST_CALLOPT, &call_options);
op->call_options = call_options;
- crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid));
+ crm_element_value_int(request, PCMK__XA_ST_CALLID, &(op->client_callid));
crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
- "base timeout %d, %u %s expected)",
+ "base timeout %ds, %u %s expected)",
(peer && dev)? "Recorded" : "Generated", op->id, op->action,
op->target, op->client_name, op->base_timeout,
op->replies_expected,
@@ -1220,14 +1240,15 @@ create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
crm_node_t *node;
pcmk__scan_min_int(op->target, &nodeid, 0);
- node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY);
+ node = pcmk__search_node_caches(nodeid, NULL,
+ pcmk__node_search_any
+ |pcmk__node_search_cluster_cib);
/* Ensure the conversion only happens once */
stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);
if (node && node->uname) {
- free(op->target);
- op->target = strdup(node->uname);
+ pcmk__str_update(&(op->target), node->uname);
} else {
crm_warn("Could not expand nodeid '%s' into a host name", op->target);
@@ -1239,7 +1260,7 @@ create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
if (op->state != st_duplicate) {
/* kick history readers */
- fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
+ fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL);
}
/* safe to trim as long as that doesn't touch pending ops */
@@ -1272,7 +1293,7 @@ initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request,
if (client) {
client_id = client->id;
} else {
- client_id = crm_element_value(request, F_STONITH_CLIENTID);
+ client_id = crm_element_value(request, PCMK__XA_ST_CLIENTID);
}
CRM_LOG_ASSERT(client_id != NULL);
@@ -1305,7 +1326,7 @@ initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request,
default:
crm_notice("Requesting peer fencing (%s) targeting %s "
- CRM_XS " id=%.8s state=%s base_timeout=%d",
+ CRM_XS " id=%.8s state=%s base_timeout=%ds",
op->action, op->target, op->id,
stonith_op_state_str(op->state), op->base_timeout);
}
@@ -1313,24 +1334,24 @@ initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request,
query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
NULL, op->call_options);
- crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
- crm_xml_add(query, F_STONITH_TARGET, op->target);
- crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
- crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
- crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
- crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
- crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);
+ crm_xml_add(query, PCMK__XA_ST_REMOTE_OP, op->id);
+ crm_xml_add(query, PCMK__XA_ST_TARGET, op->target);
+ crm_xml_add(query, PCMK__XA_ST_DEVICE_ACTION, op_requested_action(op));
+ crm_xml_add(query, PCMK__XA_ST_ORIGIN, op->originator);
+ crm_xml_add(query, PCMK__XA_ST_CLIENTID, op->client_id);
+ crm_xml_add(query, PCMK__XA_ST_CLIENTNAME, op->client_name);
+ crm_xml_add_int(query, PCMK__XA_ST_TIMEOUT, op->base_timeout);
/* In case of RELAY operation, RELAY information is added to the query to delete the original operation of RELAY. */
- operation = crm_element_value(request, F_STONITH_OPERATION);
+ operation = crm_element_value(request, PCMK__XA_ST_OP);
if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
- relay_op_id = crm_element_value(request, F_STONITH_REMOTE_OP_ID);
+ relay_op_id = crm_element_value(request, PCMK__XA_ST_REMOTE_OP);
if (relay_op_id) {
- crm_xml_add(query, F_STONITH_REMOTE_OP_ID_RELAY, relay_op_id);
+ crm_xml_add(query, PCMK__XA_ST_REMOTE_OP_RELAY, relay_op_id);
}
}
- send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
+ pcmk__cluster_send_message(NULL, crm_msg_stonith_ng, query);
free_xml(query);
query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
@@ -1348,6 +1369,16 @@ enum find_best_peer_options {
FIND_PEER_VERIFIED_ONLY = 0x0004,
};
+static bool
+is_watchdog_fencing(const remote_fencing_op_t *op, const char *device)
+{
+ return (stonith_watchdog_timeout_ms > 0
+ // Only an explicit mismatch is considered not a watchdog fencing.
+ && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_null_matches)
+ && pcmk__is_fencing_action(op->action)
+ && node_does_watchdog_fencing(op->target));
+}
+
static peer_device_info_t *
find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
{
@@ -1443,10 +1474,10 @@ stonith_choose_peer(remote_fencing_op_t * op)
&& pcmk_is_set(op->call_options, st_opt_topology)
&& (advance_topology_level(op, false) == pcmk_rc_ok));
- if ((stonith_watchdog_timeout_ms > 0)
- && pcmk__is_fencing_action(op->action)
- && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none)
- && node_does_watchdog_fencing(op->target)) {
+ /* With a simple watchdog fencing configuration without a topology,
+ * "device" is NULL here. Consider it should be done with watchdog fencing.
+ */
+ if (is_watchdog_fencing(op, device)) {
crm_info("Couldn't contact watchdog-fencing target-node (%s)",
op->target);
/* check_watchdog_fencing_and_wait will log additional info */
@@ -1458,32 +1489,69 @@ stonith_choose_peer(remote_fencing_op_t * op)
}
static int
+valid_fencing_timeout(int specified_timeout, bool action_specific,
+ const remote_fencing_op_t *op, const char *device)
+{
+ int timeout = specified_timeout;
+
+ if (!is_watchdog_fencing(op, device)) {
+ return timeout;
+ }
+
+ timeout = (int) QB_MIN(QB_MAX(specified_timeout,
+ stonith_watchdog_timeout_ms / 1000), INT_MAX);
+
+ if (timeout > specified_timeout) {
+ if (action_specific) {
+ crm_warn("pcmk_%s_timeout %ds for %s is too short (must be >= "
+ PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds "
+ "instead",
+ op->action, specified_timeout, device? device : "watchdog",
+ timeout, timeout);
+
+ } else {
+ crm_warn("Fencing timeout %ds is too short (must be >= "
+ PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds "
+ "instead",
+ specified_timeout, timeout, timeout);
+ }
+ }
+
+ return timeout;
+}
+
+static int
get_device_timeout(const remote_fencing_op_t *op,
const peer_device_info_t *peer, const char *device,
bool with_delay)
{
+ int timeout = op->base_timeout;
device_properties_t *props;
- int delay = 0;
+
+ timeout = valid_fencing_timeout(op->base_timeout, false, op, device);
if (!peer || !device) {
- return op->base_timeout;
+ return timeout;
}
props = g_hash_table_lookup(peer->devices, device);
if (!props) {
- return op->base_timeout;
+ return timeout;
+ }
+
+ if (props->custom_action_timeout[op->phase]) {
+ timeout = valid_fencing_timeout(props->custom_action_timeout[op->phase],
+ true, op, device);
}
// op->client_delay < 0 means disable any static/random fencing delays
if (with_delay && (op->client_delay >= 0)) {
// delay_base is eventually limited by delay_max
- delay = (props->delay_max[op->phase] > 0 ?
- props->delay_max[op->phase] : props->delay_base[op->phase]);
+ timeout += (props->delay_max[op->phase] > 0 ?
+ props->delay_max[op->phase] : props->delay_base[op->phase]);
}
- return (props->custom_action_timeout[op->phase]?
- props->custom_action_timeout[op->phase] : op->base_timeout)
- + delay;
+ return timeout;
}
struct timeout_data {
@@ -1532,7 +1600,7 @@ static int
get_op_total_timeout(const remote_fencing_op_t *op,
const peer_device_info_t *chosen_peer)
{
- int total_timeout = 0;
+ long long total_timeout = 0;
stonith_topology_t *tp = find_topology_for_host(op->target);
if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
@@ -1558,17 +1626,7 @@ get_op_total_timeout(const remote_fencing_op_t *op,
continue;
}
for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
- /* in case of watchdog-device we add the timeout to the budget
- regardless of if we got a reply or not
- */
- if ((stonith_watchdog_timeout_ms > 0)
- && pcmk__is_fencing_action(op->action)
- && pcmk__str_eq(device_list->data, STONITH_WATCHDOG_ID,
- pcmk__str_none)
- && node_does_watchdog_fencing(op->target)) {
- total_timeout += stonith_watchdog_timeout_ms / 1000;
- continue;
- }
+ bool found = false;
for (iter = op->query_results; iter != NULL; iter = iter->next) {
const peer_device_info_t *peer = iter->data;
@@ -1586,9 +1644,17 @@ get_op_total_timeout(const remote_fencing_op_t *op,
total_timeout += get_device_timeout(op, peer,
device_list->data,
true);
+ found = true;
break;
}
} /* End Loop3: match device with peer that owns device, find device's timeout period */
+
+ /* in case of watchdog-device we add the timeout to the budget
+ if didn't get a reply
+ */
+ if (!found && is_watchdog_fencing(op, device_list->data)) {
+ total_timeout += stonith_watchdog_timeout_ms / 1000;
+ }
} /* End Loop2: iterate through devices at a specific level */
} /*End Loop1: iterate through fencing levels */
@@ -1612,15 +1678,23 @@ get_op_total_timeout(const remote_fencing_op_t *op,
} else if (chosen_peer) {
total_timeout = get_peer_timeout(op, chosen_peer);
+
} else {
+ total_timeout = valid_fencing_timeout(op->base_timeout, false, op,
+ NULL);
+ }
+
+ if (total_timeout <= 0) {
total_timeout = op->base_timeout;
}
/* Take any requested fencing delay into account to prevent it from eating
* up the total timeout.
*/
- return ((total_timeout ? total_timeout : op->base_timeout)
- + ((op->client_delay > 0)? op->client_delay : 0));
+ if (op->client_delay > 0) {
+ total_timeout += op->client_delay;
+ }
+ return (int) QB_MIN(total_timeout, INT_MAX);
}
static void
@@ -1643,9 +1717,9 @@ report_timeout_period(remote_fencing_op_t * op, int op_timeout)
}
crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
- client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE);
- call_id = crm_element_value(op->request, F_STONITH_CALLID);
- client_id = crm_element_value(op->request, F_STONITH_CLIENTID);
+ client_node = crm_element_value(op->request, PCMK__XA_ST_CLIENTNODE);
+ call_id = crm_element_value(op->request, PCMK__XA_ST_CALLID);
+ client_id = crm_element_value(op->request, PCMK__XA_ST_CLIENTID);
if (!client_node || !call_id || !client_id) {
return;
}
@@ -1658,12 +1732,14 @@ report_timeout_period(remote_fencing_op_t * op, int op_timeout)
/* The client is connected to another node, relay this update to them */
update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
- crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id);
- crm_xml_add(update, F_STONITH_CLIENTID, client_id);
- crm_xml_add(update, F_STONITH_CALLID, call_id);
- crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout);
+ crm_xml_add(update, PCMK__XA_ST_REMOTE_OP, op->id);
+ crm_xml_add(update, PCMK__XA_ST_CLIENTID, client_id);
+ crm_xml_add(update, PCMK__XA_ST_CALLID, call_id);
+ crm_xml_add_int(update, PCMK__XA_ST_TIMEOUT, op_timeout);
- send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE);
+ pcmk__cluster_send_message(pcmk__get_node(0, client_node, NULL,
+ pcmk__node_search_cluster_member),
+ crm_msg_stonith_ng, update);
free_xml(update);
@@ -1742,17 +1818,18 @@ static gboolean
check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
{
if (node_does_watchdog_fencing(op->target)) {
+ guint timeout_ms = QB_MIN(stonith_watchdog_timeout_ms, UINT_MAX);
- crm_notice("Waiting %lds for %s to self-fence (%s) for "
+ crm_notice("Waiting %s for %s to self-fence (%s) for "
"client %s " CRM_XS " id=%.8s",
- (stonith_watchdog_timeout_ms / 1000),
- op->target, op->action, op->client_name, op->id);
+ pcmk__readable_interval(timeout_ms), op->target, op->action,
+ op->client_name, op->id);
if (op->op_timer_one) {
g_source_remove(op->op_timer_one);
}
- op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms,
- remote_op_watchdog_done, op);
+ op->op_timer_one = g_timeout_add(timeout_ms, remote_op_watchdog_done,
+ op);
return TRUE;
} else {
crm_debug("Skipping fallback to watchdog-fencing as %s is "
@@ -1819,7 +1896,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, peer);
op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
report_timeout_period(op, op->total_timeout);
- crm_info("Total timeout set to %d for peer's fencing targeting %s for %s"
+ crm_info("Total timeout set to %ds for peer's fencing targeting %s for %s"
CRM_XS "id=%.8s",
op->total_timeout, op->target, op->client_name, op->id);
}
@@ -1846,6 +1923,9 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
if (peer) {
int timeout_one = 0;
xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
+ const crm_node_t *peer_node =
+ pcmk__get_node(0, peer->host, NULL,
+ pcmk__node_search_cluster_member);
if (op->client_delay > 0) {
/* Take requested fencing delay into account to prevent it from
@@ -1854,15 +1934,15 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
timeout_one = TIMEOUT_MULTIPLY_FACTOR * op->client_delay;
}
- crm_xml_add(remote_op, F_STONITH_REMOTE_OP_ID, op->id);
- crm_xml_add(remote_op, F_STONITH_TARGET, op->target);
- crm_xml_add(remote_op, F_STONITH_ACTION, op->action);
- crm_xml_add(remote_op, F_STONITH_ORIGIN, op->originator);
- crm_xml_add(remote_op, F_STONITH_CLIENTID, op->client_id);
- crm_xml_add(remote_op, F_STONITH_CLIENTNAME, op->client_name);
- crm_xml_add_int(remote_op, F_STONITH_TIMEOUT, timeout);
- crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
- crm_xml_add_int(remote_op, F_STONITH_DELAY, op->client_delay);
+ crm_xml_add(remote_op, PCMK__XA_ST_REMOTE_OP, op->id);
+ crm_xml_add(remote_op, PCMK__XA_ST_TARGET, op->target);
+ crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ACTION, op->action);
+ crm_xml_add(remote_op, PCMK__XA_ST_ORIGIN, op->originator);
+ crm_xml_add(remote_op, PCMK__XA_ST_CLIENTID, op->client_id);
+ crm_xml_add(remote_op, PCMK__XA_ST_CLIENTNAME, op->client_name);
+ crm_xml_add_int(remote_op, PCMK__XA_ST_TIMEOUT, timeout);
+ crm_xml_add_int(remote_op, PCMK__XA_ST_CALLOPT, op->call_options);
+ crm_xml_add_int(remote_op, PCMK__XA_ST_DELAY, op->client_delay);
if (device) {
timeout_one += TIMEOUT_MULTIPLY_FACTOR *
@@ -1871,14 +1951,15 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
"using %s " CRM_XS " for client %s (%ds)",
peer->host, op->action, op->target, device,
op->client_name, timeout_one);
- crm_xml_add(remote_op, F_STONITH_DEVICE, device);
+ crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ID, device);
} else {
timeout_one += TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
crm_notice("Requesting that %s perform '%s' action targeting %s "
- CRM_XS " for client %s (%ds, %lds)",
+ CRM_XS " for client %s (%ds, %s)",
peer->host, op->action, op->target, op->client_name,
- timeout_one, stonith_watchdog_timeout_ms);
+ timeout_one,
+ pcmk__readable_interval(stonith_watchdog_timeout_ms));
}
op->state = st_exec;
@@ -1887,11 +1968,8 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
op->op_timer_one = 0;
}
- if (!((stonith_watchdog_timeout_ms > 0)
- && (pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none)
- || (pcmk__str_eq(peer->host, op->target, pcmk__str_casei)
- && pcmk__is_fencing_action(op->action)))
- && check_watchdog_fencing_and_wait(op))) {
+ if (!is_watchdog_fencing(op, device)
+ || !check_watchdog_fencing_and_wait(op)) {
/* Some thoughts about self-fencing cases reaching this point:
- Actually check in check_watchdog_fencing_and_wait
@@ -1907,8 +1985,8 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
enabled for a node but the watchdog-fencing-device isn't
explicitly chosen for suicide. Local pe-execution in sbd
may detect the node as unclean and lead to timely suicide.
- Otherwise the selection of stonith-watchdog-timeout at
- least is questionable.
+ Otherwise the selection of PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
+ at least is questionable.
*/
/* coming here we're not waiting for watchdog timeout -
@@ -1916,7 +1994,7 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
}
- send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE);
+ pcmk__cluster_send_message(peer_node, crm_msg_stonith_ng, remote_op);
peer->tried = TRUE;
free_xml(remote_op);
return;
@@ -1948,11 +2026,15 @@ request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
* but we have all the expected replies, then no devices
* are available to execute the fencing operation. */
- if(stonith_watchdog_timeout_ms > 0 && pcmk__str_eq(device,
- STONITH_WATCHDOG_ID, pcmk__str_null_matches)) {
- if (check_watchdog_fencing_and_wait(op)) {
- return;
- }
+ if (is_watchdog_fencing(op, device)
+ && check_watchdog_fencing_and_wait(op)) {
+ /* Consider a watchdog fencing targeting an offline node executing
+ * once it starts waiting for the target to self-fence. So that when
+ * the query timer pops, remote_op_query_timeout() considers the
+ * fencing already in progress.
+ */
+ op->state = st_exec;
+ return;
}
if (op->state == st_query) {
@@ -2078,24 +2160,25 @@ parse_action_specific(const xmlNode *xml, const char *peer, const char *device,
enum st_remap_phase phase, device_properties_t *props)
{
props->custom_action_timeout[phase] = 0;
- crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
+ crm_element_value_int(xml, PCMK__XA_ST_ACTION_TIMEOUT,
&props->custom_action_timeout[phase]);
if (props->custom_action_timeout[phase]) {
- crm_trace("Peer %s with device %s returned %s action timeout %d",
+ crm_trace("Peer %s with device %s returned %s action timeout %ds",
peer, device, action, props->custom_action_timeout[phase]);
}
props->delay_max[phase] = 0;
- crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
+ crm_element_value_int(xml, PCMK__XA_ST_DELAY_MAX, &props->delay_max[phase]);
if (props->delay_max[phase]) {
- crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
+ crm_trace("Peer %s with device %s returned maximum of random delay %ds for %s",
peer, device, props->delay_max[phase], action);
}
props->delay_base[phase] = 0;
- crm_element_value_int(xml, F_STONITH_DELAY_BASE, &props->delay_base[phase]);
+ crm_element_value_int(xml, PCMK__XA_ST_DELAY_BASE,
+ &props->delay_base[phase]);
if (props->delay_base[phase]) {
- crm_trace("Peer %s with device %s returned base delay %d for %s",
+ crm_trace("Peer %s with device %s returned base delay %ds for %s",
peer, device, props->delay_base[phase], action);
}
@@ -2103,7 +2186,7 @@ parse_action_specific(const xmlNode *xml, const char *peer, const char *device,
if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none)) {
int required = 0;
- crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
+ crm_element_value_int(xml, PCMK__XA_ST_REQUIRED, &required);
if (required) {
crm_trace("Peer %s requires device %s to execute for action %s",
peer, device, action);
@@ -2114,7 +2197,7 @@ parse_action_specific(const xmlNode *xml, const char *peer, const char *device,
/* If a reboot is remapped to off+on, it's possible that a node is allowed
* to perform one action but not another.
*/
- if (pcmk__xe_attr_is_true(xml, F_STONITH_ACTION_DISALLOWED)) {
+ if (pcmk__xe_attr_is_true(xml, PCMK__XA_ST_ACTION_DISALLOWED)) {
props->disallowed[phase] = TRUE;
crm_trace("Peer %s is disallowed from executing %s for device %s",
peer, action, device);
@@ -2136,37 +2219,39 @@ add_device_properties(const xmlNode *xml, remote_fencing_op_t *op,
{
xmlNode *child;
int verified = 0;
- device_properties_t *props = calloc(1, sizeof(device_properties_t));
+ device_properties_t *props =
+ pcmk__assert_alloc(1, sizeof(device_properties_t));
int flags = st_device_supports_on; /* Old nodes that don't set the flag assume they support the on action */
/* Add a new entry to this peer's devices list */
- CRM_ASSERT(props != NULL);
- g_hash_table_insert(peer->devices, strdup(device), props);
+ g_hash_table_insert(peer->devices, pcmk__str_copy(device), props);
/* Peers with verified (monitored) access will be preferred */
- crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
+ crm_element_value_int(xml, PCMK__XA_ST_MONITOR_VERIFIED, &verified);
if (verified) {
crm_trace("Peer %s has confirmed a verified device %s",
peer->host, device);
props->verified = TRUE;
}
- crm_element_value_int(xml, F_STONITH_DEVICE_SUPPORT_FLAGS, &flags);
+ crm_element_value_int(xml, PCMK__XA_ST_DEVICE_SUPPORT_FLAGS, &flags);
props->device_support_flags = flags;
/* Parse action-specific device properties */
parse_action_specific(xml, peer->host, device, op_requested_action(op),
op, st_phase_requested, props);
- for (child = pcmk__xml_first_child(xml); child != NULL;
- child = pcmk__xml_next(child)) {
+ for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL;
+ child = pcmk__xe_next(child)) {
/* Replies for "reboot" operations will include the action-specific
* values for "off" and "on" in child elements, just in case the reboot
* winds up getting remapped.
*/
- if (pcmk__str_eq(ID(child), PCMK_ACTION_OFF, pcmk__str_none)) {
+ if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_OFF, pcmk__str_none)) {
parse_action_specific(child, peer->host, device, PCMK_ACTION_OFF,
op, st_phase_off, props);
- } else if (pcmk__str_eq(ID(child), PCMK_ACTION_ON, pcmk__str_none)) {
+
+ } else if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_ON,
+ pcmk__str_none)) {
parse_action_specific(child, peer->host, device, PCMK_ACTION_ON,
op, st_phase_on, props);
}
@@ -2188,19 +2273,17 @@ static peer_device_info_t *
add_result(remote_fencing_op_t *op, const char *host, int ndevices,
const xmlNode *xml)
{
- peer_device_info_t *peer = calloc(1, sizeof(peer_device_info_t));
+ peer_device_info_t *peer = pcmk__assert_alloc(1,
+ sizeof(peer_device_info_t));
xmlNode *child;
- // cppcheck seems not to understand the abort logic in CRM_CHECK
- // cppcheck-suppress memleak
- CRM_CHECK(peer != NULL, return NULL);
- peer->host = strdup(host);
+ peer->host = pcmk__str_copy(host);
peer->devices = pcmk__strkey_table(free, free);
/* Each child element describes one capable device available to the peer */
- for (child = pcmk__xml_first_child(xml); child != NULL;
- child = pcmk__xml_next(child)) {
- const char *device = ID(child);
+ for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL;
+ child = pcmk__xe_next(child)) {
+ const char *device = pcmk__xe_id(child);
if (device) {
add_device_properties(child, op, peer, device);
@@ -2241,16 +2324,16 @@ process_remote_stonith_query(xmlNode *msg)
remote_fencing_op_t *op = NULL;
peer_device_info_t *peer = NULL;
uint32_t replies_expected;
- xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
+ xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_REMOTE_OP, msg, LOG_ERR);
CRM_CHECK(dev != NULL, return -EPROTO);
- id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
+ id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
CRM_CHECK(id != NULL, return -EPROTO);
- dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
+ dev = get_xpath_object("//@" PCMK__XA_ST_AVAILABLE_DEVICES, msg, LOG_ERR);
CRM_CHECK(dev != NULL, return -EPROTO);
- crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);
+ crm_element_value_int(dev, PCMK__XA_ST_AVAILABLE_DEVICES, &ndevices);
op = g_hash_table_lookup(stonith_remote_op_list, id);
if (op == NULL) {
@@ -2266,7 +2349,7 @@ process_remote_stonith_query(xmlNode *msg)
if ((++op->replies >= replies_expected) && (op->state == st_query)) {
have_all_replies = TRUE;
}
- host = crm_element_value(msg, F_ORIG);
+ host = crm_element_value(msg, PCMK__XA_SRC);
host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);
crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
@@ -2339,12 +2422,12 @@ fenced_process_fencing_reply(xmlNode *msg)
const char *id = NULL;
const char *device = NULL;
remote_fencing_op_t *op = NULL;
- xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
+ xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_REMOTE_OP, msg, LOG_ERR);
pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
CRM_CHECK(dev != NULL, return);
- id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
+ id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
CRM_CHECK(id != NULL, return);
dev = stonith__find_xe_with_result(msg);
@@ -2352,7 +2435,7 @@ fenced_process_fencing_reply(xmlNode *msg)
stonith__xe_get_result(dev, &result);
- device = crm_element_value(dev, F_STONITH_DEVICE);
+ device = crm_element_value(dev, PCMK__XA_ST_DEVICE_ID);
if (stonith_remote_op_list) {
op = g_hash_table_lookup(stonith_remote_op_list, id);
@@ -2360,7 +2443,7 @@ fenced_process_fencing_reply(xmlNode *msg)
if ((op == NULL) && pcmk__result_ok(&result)) {
/* Record successful fencing operations */
- const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);
+ const char *client_id = crm_element_value(dev, PCMK__XA_ST_CLIENTID);
op = create_remote_stonith_op(client_id, dev, TRUE);
}
@@ -2383,7 +2466,9 @@ fenced_process_fencing_reply(xmlNode *msg)
return;
}
- if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) {
+ if (pcmk__str_eq(crm_element_value(msg, PCMK__XA_SUBT),
+ PCMK__VALUE_BROADCAST, pcmk__str_none)) {
+
if (pcmk__result_ok(&op->result)) {
op->state = st_done;
} else {
@@ -2412,7 +2497,7 @@ fenced_process_fencing_reply(xmlNode *msg)
return;
}
- device = crm_element_value(msg, F_STONITH_DEVICE);
+ device = crm_element_value(msg, PCMK__XA_ST_DEVICE_ID);
if ((op->phase == 2) && !pcmk__result_ok(&op->result)) {
/* A remapped "on" failed, but the node was already turned off