diff options
Diffstat (limited to 'daemons/controld/controld_te_events.c')
-rw-r--r-- | daemons/controld/controld_te_events.c | 601 |
1 files changed, 601 insertions, 0 deletions
diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c new file mode 100644 index 0000000..d4e2b0f --- /dev/null +++ b/daemons/controld/controld_te_events.c @@ -0,0 +1,601 @@ +/* + * Copyright 2004-2023 the Pacemaker project contributors + * + * The version control history for this file may have further details. + * + * This source code is licensed under the GNU General Public License version 2 + * or later (GPLv2+) WITHOUT ANY WARRANTY. + */ + +#include <crm_internal.h> + +#include <sys/param.h> +#include <crm/crm.h> +#include <crm/cib.h> +#include <crm/msg_xml.h> +#include <crm/common/xml.h> + +#include <pacemaker-controld.h> + +#include <crm/common/attrd_internal.h> +#include <crm/common/ipc_attrd_internal.h> + +/*! + * \internal + * \brief Action numbers of outside events processed in current update diff + * + * This table is to be used as a set. It should be empty when the transitioner + * begins processing a CIB update diff. It ensures that if there are multiple + * events (for example, "_last_0" and "_last_failure_0") for the same action, + * only one of them updates the failcount. Events that originate outside the + * cluster can't be confirmed, since they're not in the transition graph. + */ +static GHashTable *outside_events = NULL; + +/*! + * \internal + * \brief Empty the hash table containing action numbers of outside events + */ +void +controld_remove_all_outside_events(void) +{ + if (outside_events != NULL) { + g_hash_table_remove_all(outside_events); + } +} + +/*! + * \internal + * \brief Destroy the hash table containing action numbers of outside events + */ +void +controld_destroy_outside_events_table(void) +{ + if (outside_events != NULL) { + g_hash_table_destroy(outside_events); + outside_events = NULL; + } +} + +/*! + * \internal + * \brief Add an outside event's action number to a set + * + * \return Standard Pacemaker return code. Specifically, \p pcmk_rc_ok if the + * event was not already in the set, or \p pcmk_rc_already otherwise. + */ +static int +record_outside_event(gint action_num) +{ + if (outside_events == NULL) { + outside_events = g_hash_table_new(NULL, NULL); + } + + if (g_hash_table_add(outside_events, GINT_TO_POINTER(action_num))) { + return pcmk_rc_ok; + } + return pcmk_rc_already; +} + +gboolean +fail_incompletable_actions(pcmk__graph_t *graph, const char *down_node) +{ + const char *target_uuid = NULL; + const char *router = NULL; + const char *router_uuid = NULL; + xmlNode *last_action = NULL; + + GList *gIter = NULL; + GList *gIter2 = NULL; + + if (graph == NULL || graph->complete) { + return FALSE; + } + + gIter = graph->synapses; + for (; gIter != NULL; gIter = gIter->next) { + pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) gIter->data; + + if (pcmk_any_flags_set(synapse->flags, pcmk__synapse_confirmed|pcmk__synapse_failed)) { + /* We've already been here */ + continue; + } + + gIter2 = synapse->actions; + for (; gIter2 != NULL; gIter2 = gIter2->next) { + pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data; + + if ((action->type == pcmk__pseudo_graph_action) + || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) { + continue; + } else if (action->type == pcmk__cluster_graph_action) { + const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); + + if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) { + continue; + } + } + + target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); + router = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); + if (router) { + crm_node_t *node = crm_get_peer(0, router); + if (node) { + router_uuid = node->uuid; + } + } + + if (pcmk__str_eq(target_uuid, down_node, pcmk__str_casei) || pcmk__str_eq(router_uuid, down_node, pcmk__str_casei)) { + pcmk__set_graph_action_flags(action, pcmk__graph_action_failed); + pcmk__set_synapse_flags(synapse, pcmk__synapse_failed); + last_action = action->xml; + stop_te_timer(action); + pcmk__update_graph(graph, action); + + if (pcmk_is_set(synapse->flags, pcmk__synapse_executed)) { + crm_notice("Action %d (%s) was pending on %s (offline)", + action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node); + } else { + crm_info("Action %d (%s) is scheduled for %s (offline)", + action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node); + } + } + } + } + + if (last_action != NULL) { + crm_info("Node %s shutdown resulted in un-runnable actions", down_node); + abort_transition(INFINITY, pcmk__graph_restart, "Node failure", + last_action); + return TRUE; + } + + return FALSE; +} + +/*! + * \internal + * \brief Update failure-related node attributes if warranted + * + * \param[in] event XML describing operation that (maybe) failed + * \param[in] event_node_uuid Node that event occurred on + * \param[in] rc Actual operation return code + * \param[in] target_rc Expected operation return code + * \param[in] do_update If TRUE, do update regardless of operation type + * \param[in] ignore_failures If TRUE, update last failure but not fail count + * + * \return TRUE if this was not a direct nack, success or lrm status refresh + */ +static gboolean +update_failcount(const xmlNode *event, const char *event_node_uuid, int rc, + int target_rc, gboolean do_update, gboolean ignore_failures) +{ + guint interval_ms = 0; + + char *task = NULL; + char *rsc_id = NULL; + + const char *value = NULL; + const char *id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); + const char *on_uname = crm_peer_uname(event_node_uuid); + const char *origin = crm_element_value(event, XML_ATTR_ORIGIN); + + // Nothing needs to be done for success or status refresh + if (rc == target_rc) { + return FALSE; + } else if (pcmk__str_eq(origin, "build_active_RAs", pcmk__str_casei)) { + crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh", + id, rc, on_uname); + return FALSE; + } + + /* Sanity check */ + CRM_CHECK(on_uname != NULL, return TRUE); + CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval_ms), + crm_err("Couldn't parse: %s", ID(event)); goto bail); + + /* Decide whether update is necessary and what value to use */ + if ((interval_ms > 0) + || pcmk__str_eq(task, CRMD_ACTION_PROMOTE, pcmk__str_none) + || pcmk__str_eq(task, CRMD_ACTION_DEMOTE, pcmk__str_none)) { + do_update = TRUE; + + } else if (pcmk__str_eq(task, CRMD_ACTION_START, pcmk__str_none)) { + do_update = TRUE; + value = pcmk__s(controld_globals.transition_graph->failed_start_offset, + CRM_INFINITY_S); + + } else if (pcmk__str_eq(task, CRMD_ACTION_STOP, pcmk__str_none)) { + do_update = TRUE; + value = pcmk__s(controld_globals.transition_graph->failed_stop_offset, + CRM_INFINITY_S); + } + + if (do_update) { + pcmk__attrd_query_pair_t *fail_pair = NULL; + pcmk__attrd_query_pair_t *last_pair = NULL; + char *fail_name = NULL; + char *last_name = NULL; + GList *attrs = NULL; + + uint32_t opts = pcmk__node_attr_none; + + char *now = pcmk__ttoa(time(NULL)); + + // Fail count will be either incremented or set to infinity + if (!pcmk_str_is_infinity(value)) { + value = XML_NVPAIR_ATTR_VALUE "++"; + } + + if (g_hash_table_lookup(crm_remote_peer_cache, event_node_uuid)) { + opts |= pcmk__node_attr_remote; + } + + crm_info("Updating %s for %s on %s after failed %s: rc=%d (update=%s, time=%s)", + (ignore_failures? "last failure" : "failcount"), + rsc_id, on_uname, task, rc, value, now); + + /* Update the fail count, if we're not ignoring failures */ + if (!ignore_failures) { + fail_pair = calloc(1, sizeof(pcmk__attrd_query_pair_t)); + CRM_ASSERT(fail_pair != NULL); + + fail_name = pcmk__failcount_name(rsc_id, task, interval_ms); + fail_pair->name = fail_name; + fail_pair->value = value; + fail_pair->node = on_uname; + + attrs = g_list_prepend(attrs, fail_pair); + } + + /* Update the last failure time (even if we're ignoring failures, + * so that failure can still be detected and shown, e.g. by crm_mon) + */ + last_pair = calloc(1, sizeof(pcmk__attrd_query_pair_t)); + CRM_ASSERT(last_pair != NULL); + + last_name = pcmk__lastfailure_name(rsc_id, task, interval_ms); + last_pair->name = last_name; + last_pair->value = now; + last_pair->node = on_uname; + + attrs = g_list_prepend(attrs, last_pair); + + update_attrd_list(attrs, opts); + + free(fail_name); + free(fail_pair); + + free(last_name); + free(last_pair); + g_list_free(attrs); + + free(now); + } + + bail: + free(rsc_id); + free(task); + return TRUE; +} + +pcmk__graph_action_t * +controld_get_action(int id) +{ + for (GList *item = controld_globals.transition_graph->synapses; + item != NULL; item = item->next) { + pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) item->data; + + for (GList *item2 = synapse->actions; item2; item2 = item2->next) { + pcmk__graph_action_t *action = (pcmk__graph_action_t *) item2->data; + + if (action->id == id) { + return action; + } + } + } + return NULL; +} + +pcmk__graph_action_t * +get_cancel_action(const char *id, const char *node) +{ + GList *gIter = NULL; + GList *gIter2 = NULL; + + gIter = controld_globals.transition_graph->synapses; + for (; gIter != NULL; gIter = gIter->next) { + pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) gIter->data; + + gIter2 = synapse->actions; + for (; gIter2 != NULL; gIter2 = gIter2->next) { + const char *task = NULL; + const char *target = NULL; + pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data; + + task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); + if (!pcmk__str_eq(CRMD_ACTION_CANCEL, task, pcmk__str_casei)) { + continue; + } + + task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); + if (!pcmk__str_eq(task, id, pcmk__str_casei)) { + crm_trace("Wrong key %s for %s on %s", task, id, node); + continue; + } + + target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); + if (node && !pcmk__str_eq(target, node, pcmk__str_casei)) { + crm_trace("Wrong node %s for %s on %s", target, id, node); + continue; + } + + crm_trace("Found %s on %s", id, node); + return action; + } + } + + return NULL; +} + +bool +confirm_cancel_action(const char *id, const char *node_id) +{ + const char *op_key = NULL; + const char *node_name = NULL; + pcmk__graph_action_t *cancel = get_cancel_action(id, node_id); + + if (cancel == NULL) { + return FALSE; + } + op_key = crm_element_value(cancel->xml, XML_LRM_ATTR_TASK_KEY); + node_name = crm_element_value(cancel->xml, XML_LRM_ATTR_TARGET); + + stop_te_timer(cancel); + te_action_confirmed(cancel, controld_globals.transition_graph); + + crm_info("Cancellation of %s on %s confirmed (action %d)", + op_key, node_name, cancel->id); + return TRUE; +} + +/* downed nodes are listed like: <downed> <node id="UUID1" /> ... </downed> */ +#define XPATH_DOWNED "//" XML_GRAPH_TAG_DOWNED \ + "/" XML_CIB_TAG_NODE "[@" XML_ATTR_ID "='%s']" + +/*! + * \brief Find a transition event that would have made a specified node down + * + * \param[in] target UUID of node to match + * + * \return Matching event if found, NULL otherwise + */ +pcmk__graph_action_t * +match_down_event(const char *target) +{ + pcmk__graph_action_t *match = NULL; + xmlXPathObjectPtr xpath_ret = NULL; + GList *gIter, *gIter2; + + char *xpath = crm_strdup_printf(XPATH_DOWNED, target); + + for (gIter = controld_globals.transition_graph->synapses; + gIter != NULL && match == NULL; + gIter = gIter->next) { + + for (gIter2 = ((pcmk__graph_synapse_t * ) gIter->data)->actions; + gIter2 != NULL && match == NULL; + gIter2 = gIter2->next) { + + match = (pcmk__graph_action_t *) gIter2->data; + if (pcmk_is_set(match->flags, pcmk__graph_action_executed)) { + xpath_ret = xpath_search(match->xml, xpath); + if (numXpathResults(xpath_ret) < 1) { + match = NULL; + } + freeXpathObject(xpath_ret); + } else { + // Only actions that were actually started can match + match = NULL; + } + } + } + + free(xpath); + + if (match != NULL) { + crm_debug("Shutdown action %d (%s) found for node %s", match->id, + crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY), target); + } else { + crm_debug("No reason to expect node %s to be down", target); + } + return match; +} + +void +process_graph_event(xmlNode *event, const char *event_node) +{ + int rc = -1; // Actual result + int target_rc = -1; // Expected result + int status = -1; // Executor status + int callid = -1; // Executor call ID + int transition_num = -1; // Transition number + int action_num = -1; // Action number within transition + char *update_te_uuid = NULL; + bool ignore_failures = FALSE; + const char *id = NULL; + const char *desc = NULL; + const char *magic = NULL; + const char *uname = NULL; + + CRM_ASSERT(event != NULL); + +/* +<lrm_rsc_op id="rsc_east-05_last_0" operation_key="rsc_east-05_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" transition-key="9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" transition-magic="0:7;9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" call-id="17" rc-code="7" op-status="0" interval="0" last-rc-change="1355361636" exec-time="128" queue-time="0" op-digest="c81f5f40b1c9e859c992e800b1aa6972"/> +*/ + + magic = crm_element_value(event, XML_ATTR_TRANSITION_KEY); + if (magic == NULL) { + /* non-change */ + return; + } + + crm_element_value_int(event, XML_LRM_ATTR_OPSTATUS, &status); + if (status == PCMK_EXEC_PENDING) { + return; + } + + id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY); + crm_element_value_int(event, XML_LRM_ATTR_RC, &rc); + crm_element_value_int(event, XML_LRM_ATTR_CALLID, &callid); + + rc = pcmk__effective_rc(rc); + + if (decode_transition_key(magic, &update_te_uuid, &transition_num, + &action_num, &target_rc) == FALSE) { + // decode_transition_key() already logged the bad key + crm_err("Can't process action %s result: Incompatible versions? " + CRM_XS " call-id=%d", id, callid); + abort_transition(INFINITY, pcmk__graph_restart, "Bad event", event); + return; + } + + if (transition_num == -1) { + // E.g. crm_resource --fail + if (record_outside_event(action_num) != pcmk_rc_ok) { + crm_debug("Outside event with transition key '%s' has already been " + "processed", magic); + goto bail; + } + desc = "initiated outside of the cluster"; + abort_transition(INFINITY, pcmk__graph_restart, "Unexpected event", + event); + + } else if ((action_num < 0) + || !pcmk__str_eq(update_te_uuid, controld_globals.te_uuid, + pcmk__str_none)) { + desc = "initiated by a different DC"; + abort_transition(INFINITY, pcmk__graph_restart, "Foreign event", event); + + } else if ((controld_globals.transition_graph->id != transition_num) + || controld_globals.transition_graph->complete) { + + // Action is not from currently active transition + + guint interval_ms = 0; + + if (parse_op_key(id, NULL, NULL, &interval_ms) + && (interval_ms != 0)) { + /* Recurring actions have the transition number they were first + * scheduled in. + */ + + if (status == PCMK_EXEC_CANCELLED) { + confirm_cancel_action(id, get_node_id(event)); + goto bail; + } + + desc = "arrived after initial scheduling"; + abort_transition(INFINITY, pcmk__graph_restart, + "Change in recurring result", event); + + } else if (controld_globals.transition_graph->id != transition_num) { + desc = "arrived really late"; + abort_transition(INFINITY, pcmk__graph_restart, "Old event", event); + } else { + desc = "arrived late"; + abort_transition(INFINITY, pcmk__graph_restart, "Inactive graph", + event); + } + + } else { + // Event is result of an action from currently active transition + pcmk__graph_action_t *action = controld_get_action(action_num); + + if (action == NULL) { + // Should never happen + desc = "unknown"; + abort_transition(INFINITY, pcmk__graph_restart, "Unknown event", + event); + + } else if (pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) { + /* Nothing further needs to be done if the action has already been + * confirmed. This can happen e.g. when processing both an + * "xxx_last_0" or "xxx_last_failure_0" record as well as the main + * history record, which would otherwise result in incorrectly + * bumping the fail count twice. + */ + crm_log_xml_debug(event, "Event already confirmed:"); + goto bail; + + } else { + /* An action result needs to be confirmed. + * (This is the only case where desc == NULL.) + */ + + if (pcmk__str_eq(crm_meta_value(action->params, XML_OP_ATTR_ON_FAIL), "ignore", pcmk__str_casei)) { + ignore_failures = TRUE; + + } else if (rc != target_rc) { + pcmk__set_graph_action_flags(action, pcmk__graph_action_failed); + } + + stop_te_timer(action); + te_action_confirmed(action, controld_globals.transition_graph); + + if (pcmk_is_set(action->flags, pcmk__graph_action_failed)) { + abort_transition(action->synapse->priority + 1, + pcmk__graph_restart, "Event failed", event); + } + } + } + + if (id == NULL) { + id = "unknown action"; + } + uname = crm_element_value(event, XML_LRM_ATTR_TARGET); + if (uname == NULL) { + uname = "unknown node"; + } + + if (status == PCMK_EXEC_INVALID) { + // We couldn't attempt the action + crm_info("Transition %d action %d (%s on %s): %s", + transition_num, action_num, id, uname, + pcmk_exec_status_str(status)); + + } else if (desc && update_failcount(event, event_node, rc, target_rc, + (transition_num == -1), FALSE)) { + crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' " + CRM_XS " target-rc=%d rc=%d call-id=%d event='%s'", + transition_num, action_num, id, uname, + services_ocf_exitcode_str(target_rc), + services_ocf_exitcode_str(rc), + target_rc, rc, callid, desc); + + } else if (desc) { + crm_info("Transition %d action %d (%s on %s): %s " + CRM_XS " rc=%d target-rc=%d call-id=%d", + transition_num, action_num, id, uname, + desc, rc, target_rc, callid); + + } else if (rc == target_rc) { + crm_info("Transition %d action %d (%s on %s) confirmed: %s " + CRM_XS " rc=%d call-id=%d", + transition_num, action_num, id, uname, + services_ocf_exitcode_str(rc), rc, callid); + + } else { + update_failcount(event, event_node, rc, target_rc, + (transition_num == -1), ignore_failures); + crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' " + CRM_XS " target-rc=%d rc=%d call-id=%d", + transition_num, action_num, id, uname, + services_ocf_exitcode_str(target_rc), + services_ocf_exitcode_str(rc), + target_rc, rc, callid); + } + + bail: + free(update_te_uuid); +} |