author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 07:45:40 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 07:45:40 +0000
commit    | 07d7f4cfa4b10de87a31b68191036ff446add675 (patch)
tree      | 7162524d8aaf1aef62d2f4fa51f595ed113981ff /daemons/controld
parent    | Adding upstream version 2.1.6. (diff)
download  | pacemaker-07d7f4cfa4b10de87a31b68191036ff446add675.tar.xz, pacemaker-07d7f4cfa4b10de87a31b68191036ff446add675.zip
Adding upstream version 2.1.7. (upstream/2.1.7)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'daemons/controld')
31 files changed, 796 insertions, 616 deletions
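One notable controller change in this diff is that do_dc_join_ack() now groups the removal of a joining node's stale resource history and the write of its current executor state into a single atomic CIB transaction, instead of issuing separate controld_delete_node_state() and controld_update_cib() requests. Below is a minimal sketch of that pattern, built only from the cib_api_operations_t calls that appear in the diff; the wrapper name commit_node_state() and the abbreviated error handling are illustrative and not part of the patch.

```c
/* Illustrative sketch (not taken verbatim from the patch) of the
 * transactional CIB update that do_dc_join_ack() switches to in this diff:
 * the delete of the joining node's old resource history and the write of
 * its current executor state are queued with cib_transaction and then
 * committed atomically.
 */
#include <crm/crm.h>
#include <crm/cib.h>        // cib_t, cib_scope_local, cib_transaction, ...
#include <crm/msg_xml.h>    // XML_CIB_TAG_STATUS

static int
commit_node_state(cib_t *cib, const char *xpath, xmlNode *state)
{
    int rc = cib->cmds->init_transaction(cib);

    if (rc != pcmk_ok) {
        return rc;
    }

    // Queue deletion of the node's stale history (matched by XPath)
    rc = cib->cmds->remove(cib, xpath, NULL,
                           cib_scope_local|cib_xpath|cib_multiple
                           |cib_transaction);
    if (rc != pcmk_ok) {
        return rc;
    }

    // Queue the write of the node's latest known executor state
    rc = cib->cmds->modify(cib, XML_CIB_TAG_STATUS, state,
                           cib_scope_local|cib_can_create|cib_transaction);
    if (rc != pcmk_ok) {
        return rc;
    }

    /* Commit both requests as one atomic change; a positive return value
     * is the call ID for the pending commit.
     */
    return cib->cmds->end_transaction(cib, true, cib_scope_local);
}
```

In the patch itself, the positive call ID returned by end_transaction() is passed to fsa_register_cib_callback() so that join_node_state_commit_callback() can log the result and, on failure, register an FSA error to restart the join.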
diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am index 08be1ff..1312090 100644 --- a/daemons/controld/Makefile.am +++ b/daemons/controld/Makefile.am @@ -14,34 +14,20 @@ halibdir = $(CRM_DAEMON_DIR) halib_PROGRAMS = pacemaker-controld -noinst_HEADERS = controld_alerts.h \ - controld_callbacks.h \ - controld_cib.h \ - controld_fencing.h \ - controld_fsa.h \ - controld_globals.h \ - controld_lrm.h \ - controld_membership.h \ - controld_messages.h \ - controld_metadata.h \ - controld_throttle.h \ - controld_timers.h \ - controld_transition.h \ - controld_utils.h \ - pacemaker-controld.h +noinst_HEADERS = $(wildcard *.h) pacemaker_controld_CFLAGS = $(CFLAGS_HARDENED_EXE) pacemaker_controld_LDFLAGS = $(LDFLAGS_HARDENED_EXE) -pacemaker_controld_LDADD = $(top_builddir)/lib/fencing/libstonithd.la \ - $(top_builddir)/lib/pacemaker/libpacemaker.la \ - $(top_builddir)/lib/pengine/libpe_rules.la \ - $(top_builddir)/lib/cib/libcib.la \ - $(top_builddir)/lib/cluster/libcrmcluster.la \ - $(top_builddir)/lib/common/libcrmcommon.la \ - $(top_builddir)/lib/services/libcrmservice.la \ - $(top_builddir)/lib/lrmd/liblrmd.la \ - $(CLUSTERLIBS) +pacemaker_controld_LDADD = $(top_builddir)/lib/pacemaker/libpacemaker.la +pacemaker_controld_LDADD += $(top_builddir)/lib/cib/libcib.la +pacemaker_controld_LDADD += $(top_builddir)/lib/pengine/libpe_rules.la +pacemaker_controld_LDADD += $(top_builddir)/lib/fencing/libstonithd.la +pacemaker_controld_LDADD += $(top_builddir)/lib/cluster/libcrmcluster.la +pacemaker_controld_LDADD += $(top_builddir)/lib/lrmd/liblrmd.la +pacemaker_controld_LDADD += $(top_builddir)/lib/services/libcrmservice.la +pacemaker_controld_LDADD += $(top_builddir)/lib/common/libcrmcommon.la +pacemaker_controld_LDADD += $(CLUSTERLIBS) pacemaker_controld_SOURCES = pacemaker-controld.c \ controld_alerts.c \ @@ -79,9 +65,11 @@ endif CLEANFILES = $(man7_MANS) if BUILD_LEGACY_LINKS +.PHONY: install-exec-hook install-exec-hook: cd $(DESTDIR)$(CRM_DAEMON_DIR) && rm -f crmd && $(LN_S) pacemaker-controld crmd +.PHONY: uninstall-hook uninstall-hook: cd $(DESTDIR)$(CRM_DAEMON_DIR) && rm -f crmd endif diff --git a/daemons/controld/controld_callbacks.c b/daemons/controld/controld_callbacks.c index d578adc..7078739 100644 --- a/daemons/controld/controld_callbacks.c +++ b/daemons/controld/controld_callbacks.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2022 the Pacemaker project contributors + * Copyright 2004-2023 the Pacemaker project contributors * * The version control history for this file may have further details. * @@ -107,6 +107,8 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d bool appeared = FALSE; bool is_remote = pcmk_is_set(node->flags, crm_remote_node); + controld_node_pending_timer(node); + /* The controller waits to receive some information from the membership * layer before declaring itself operational. If this is being called for a * cluster node, indicate that we have it. @@ -274,13 +276,14 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d if (down) { const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK); - if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) { + if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) { /* tengine_stonith_callback() confirms fence actions */ crm_trace("Updating CIB %s fencer reported fencing of %s complete", (pcmk_is_set(down->flags, pcmk__graph_action_confirmed)? 
"after" : "before"), node->uname); - } else if (!appeared && pcmk__str_eq(task, CRM_OP_SHUTDOWN, pcmk__str_casei)) { + } else if (!appeared && pcmk__str_eq(task, PCMK_ACTION_DO_SHUTDOWN, + pcmk__str_casei)) { // Shutdown actions are immediately confirmed (i.e. no_wait) if (!is_remote) { @@ -342,6 +345,17 @@ peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *d } } + if (!appeared && (type == crm_status_processes) + && (node->when_member > 1)) { + /* The node left CPG but is still a cluster member. Set its + * membership time to 1 to record it in the cluster state as a + * boolean, so we don't fence it due to node-pending-timeout. + */ + node->when_member = 1; + flags |= node_update_cluster; + controld_node_pending_timer(node); + } + /* Update the CIB node state */ update = create_node_state_update(node, flags, NULL, __func__); if (update == NULL) { diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c index 94b99dd..865e41f 100644 --- a/daemons/controld/controld_cib.c +++ b/daemons/controld/controld_cib.c @@ -22,90 +22,6 @@ // Call ID of the most recent in-progress CIB resource update (or 0 if none) static int pending_rsc_update = 0; -// Call IDs of requested CIB replacements that won't trigger a new election -// (used as a set of gint values) -static GHashTable *cib_replacements = NULL; - -/*! - * \internal - * \brief Store the call ID of a CIB replacement that the controller requested - * - * The \p do_cib_replaced() callback function will avoid triggering a new - * election when we're notified of one of these expected replacements. - * - * \param[in] call_id CIB call ID (or 0 for a synchronous call) - * - * \note This function should be called after making any asynchronous CIB - * request (or before making any synchronous CIB request) that may replace - * part of the nodes or status section. This may include CIB sync calls. - */ -void -controld_record_cib_replace_call(int call_id) -{ - CRM_CHECK(call_id >= 0, return); - - if (cib_replacements == NULL) { - cib_replacements = g_hash_table_new(NULL, NULL); - } - - /* If the call ID is already present in the table, then it's old. We may not - * be removing them properly, and we could improperly ignore replacement - * notifications if cib_t:call_id wraps around. - */ - CRM_LOG_ASSERT(g_hash_table_add(cib_replacements, - GINT_TO_POINTER((gint) call_id))); -} - -/*! - * \internal - * \brief Remove the call ID of a CIB replacement from the replacements table - * - * \param[in] call_id CIB call ID (or 0 for a synchronous call) - * - * \return \p true if \p call_id was found in the table, or \p false otherwise - * - * \note CIB notifications run before CIB callbacks. If this function is called - * from within a callback, \p do_cib_replaced() will have removed - * \p call_id from the table first if relevant changes triggered a - * notification. - */ -bool -controld_forget_cib_replace_call(int call_id) -{ - CRM_CHECK(call_id >= 0, return false); - - if (cib_replacements == NULL) { - return false; - } - return g_hash_table_remove(cib_replacements, - GINT_TO_POINTER((gint) call_id)); -} - -/*! - * \internal - * \brief Empty the hash table containing call IDs of CIB replacement requests - */ -void -controld_forget_all_cib_replace_calls(void) -{ - if (cib_replacements != NULL) { - g_hash_table_remove_all(cib_replacements); - } -} - -/*! 
- * \internal - * \brief Free the hash table containing call IDs of CIB replacement requests - */ -void -controld_destroy_cib_replacements_table(void) -{ - if (cib_replacements != NULL) { - g_hash_table_destroy(cib_replacements); - cib_replacements = NULL; - } -} - /*! * \internal * \brief Respond to a dropped CIB connection @@ -127,54 +43,54 @@ handle_cib_disconnect(gpointer user_data) controld_clear_fsa_input_flags(R_CIB_CONNECTED); } else { // Expected - crm_info("Connection to the CIB manager terminated"); + crm_info("Disconnected from the CIB manager"); } } static void do_cib_updated(const char *event, xmlNode * msg) { - if (pcmk__alert_in_patchset(msg, TRUE)) { - controld_trigger_config(); + const xmlNode *patchset = NULL; + const char *client_name = NULL; + + crm_debug("Received CIB diff notification: DC=%s", pcmk__btoa(AM_I_DC)); + + if (cib__get_notify_patchset(msg, &patchset) != pcmk_rc_ok) { + return; } -} -static void -do_cib_replaced(const char *event, xmlNode * msg) -{ - int call_id = 0; - const char *client_id = crm_element_value(msg, F_CIB_CLIENTID); - uint32_t change_section = cib_change_section_nodes - |cib_change_section_status; - long long value = 0; + if (cib__element_in_patchset(patchset, XML_CIB_TAG_ALERTS) + || cib__element_in_patchset(patchset, XML_CIB_TAG_CRMCONFIG)) { + + controld_trigger_config(); + } - crm_debug("Updating the CIB after a replace: DC=%s", pcmk__btoa(AM_I_DC)); if (!AM_I_DC) { + // We're not in control of the join sequence return; } - if ((crm_element_value_int(msg, F_CIB_CALLID, &call_id) == 0) - && pcmk__str_eq(client_id, controld_globals.cib_client_id, - pcmk__str_none) - && controld_forget_cib_replace_call(call_id)) { - // We requested this replace op. No need to restart the join. + client_name = crm_element_value(msg, F_CIB_CLIENTNAME); + if (!cib__client_triggers_refresh(client_name)) { + // The CIB is still accurate return; } - if ((crm_element_value_ll(msg, F_CIB_CHANGE_SECTION, &value) < 0) - || (value < 0) || (value > UINT32_MAX)) { + if (cib__element_in_patchset(patchset, XML_CIB_TAG_NODES) + || cib__element_in_patchset(patchset, XML_CIB_TAG_STATUS)) { - crm_trace("Couldn't parse '%s' from message", F_CIB_CHANGE_SECTION); - } else { - change_section = (uint32_t) value; - } - - if (pcmk_any_flags_set(change_section, cib_change_section_nodes - |cib_change_section_status)) { + /* An unsafe client modified the nodes or status section. Ensure the + * node list is up-to-date, and start the join process again so we get + * everyone's current resource history. 
+ */ + if (client_name == NULL) { + client_name = crm_element_value(msg, F_CIB_CLIENTID); + } + crm_notice("Populating nodes and starting an election after %s event " + "triggered by %s", + event, pcmk__s(client_name, "(unidentified client)")); - /* start the join process again so we get everyone's LRM status */ populate_cib_nodes(node_update_quick|node_update_all, __func__); - register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } } @@ -186,12 +102,10 @@ controld_disconnect_cib_manager(void) CRM_ASSERT(cib_conn != NULL); - crm_info("Disconnecting from the CIB manager"); + crm_debug("Disconnecting from the CIB manager"); controld_clear_fsa_input_flags(R_CIB_CONNECTED); - cib_conn->cmds->del_notify_callback(cib_conn, T_CIB_REPLACE_NOTIFY, - do_cib_replaced); cib_conn->cmds->del_notify_callback(cib_conn, T_CIB_DIFF_NOTIFY, do_cib_updated); cib_free_callbacks(cib_conn); @@ -201,8 +115,6 @@ controld_disconnect_cib_manager(void) cib_scope_local|cib_discard_reply); cib_conn->cmds->signoff(cib_conn); } - - crm_notice("Disconnected from the CIB manager"); } /* A_CIB_STOP, A_CIB_START, O_CIB_RESTART */ @@ -217,7 +129,6 @@ do_cib_control(long long action, cib_t *cib_conn = controld_globals.cib_conn; void (*dnotify_fn) (gpointer user_data) = handle_cib_disconnect; - void (*replace_cb) (const char *event, xmlNodePtr msg) = do_cib_replaced; void (*update_cb) (const char *event, xmlNodePtr msg) = do_cib_updated; int rc = pcmk_ok; @@ -264,11 +175,6 @@ do_cib_control(long long action, crm_err("Could not set dnotify callback"); } else if (cib_conn->cmds->add_notify_callback(cib_conn, - T_CIB_REPLACE_NOTIFY, - replace_cb) != pcmk_ok) { - crm_err("Could not set CIB notification callback (replace)"); - - } else if (cib_conn->cmds->add_notify_callback(cib_conn, T_CIB_DIFF_NOTIFY, update_cb) != pcmk_ok) { crm_err("Could not set CIB notification callback (update)"); @@ -276,8 +182,6 @@ do_cib_control(long long action, } else { controld_set_fsa_input_flags(R_CIB_CONNECTED); cib_retries = 0; - cib_conn->cmds->client_id(cib_conn, &controld_globals.cib_client_id, - NULL); } if (!pcmk_is_set(controld_globals.fsa_input_register, R_CIB_CONNECTED)) { @@ -310,11 +214,12 @@ do_cib_control(long long action, unsigned int cib_op_timeout(void) { + // @COMPAT: Drop env_timeout at 3.0.0 static int env_timeout = -1; unsigned int calculated_timeout = 0; if (env_timeout == -1) { - const char *env = getenv("PCMK_cib_timeout"); + const char *env = pcmk__env_option(PCMK__ENV_CIB_TIMEOUT); pcmk__scan_min_int(env, &env_timeout, MIN_CIB_OP_TIMEOUT); crm_trace("Minimum CIB op timeout: %ds (environment: %s)", @@ -401,67 +306,87 @@ cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, /*! 
* \internal - * \brief Delete subsection of a node's CIB node_state + * \brief Get the XPath and description of a node state section to be deleted * - * \param[in] uname Desired node - * \param[in] section Subsection of node_state to delete - * \param[in] options CIB call options to use + * \param[in] uname Desired node + * \param[in] section Subsection of node_state to be deleted + * \param[out] xpath Where to store XPath of \p section + * \param[out] desc If not \c NULL, where to store description of \p section */ void -controld_delete_node_state(const char *uname, enum controld_section_e section, - int options) +controld_node_state_deletion_strings(const char *uname, + enum controld_section_e section, + char **xpath, char **desc) { - cib_t *cib_conn = controld_globals.cib_conn; - - char *xpath = NULL; - char *desc = NULL; + const char *desc_pre = NULL; // Shutdown locks that started before this time are expired long long expire = (long long) time(NULL) - controld_globals.shutdown_lock_limit; - CRM_CHECK(uname != NULL, return); switch (section) { case controld_section_lrm: - xpath = crm_strdup_printf(XPATH_NODE_LRM, uname); - desc = crm_strdup_printf("resource history for node %s", uname); + *xpath = crm_strdup_printf(XPATH_NODE_LRM, uname); + desc_pre = "resource history"; break; case controld_section_lrm_unlocked: - xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED, - uname, uname, expire); - desc = crm_strdup_printf("resource history (other than shutdown " - "locks) for node %s", uname); + *xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED, + uname, uname, expire); + desc_pre = "resource history (other than shutdown locks)"; break; case controld_section_attrs: - xpath = crm_strdup_printf(XPATH_NODE_ATTRS, uname); - desc = crm_strdup_printf("transient attributes for node %s", uname); + *xpath = crm_strdup_printf(XPATH_NODE_ATTRS, uname); + desc_pre = "transient attributes"; break; case controld_section_all: - xpath = crm_strdup_printf(XPATH_NODE_ALL, uname); - desc = crm_strdup_printf("all state for node %s", uname); + *xpath = crm_strdup_printf(XPATH_NODE_ALL, uname); + desc_pre = "all state"; break; case controld_section_all_unlocked: - xpath = crm_strdup_printf(XPATH_NODE_ALL_UNLOCKED, - uname, uname, expire, uname); - desc = crm_strdup_printf("all state (other than shutdown locks) " - "for node %s", uname); + *xpath = crm_strdup_printf(XPATH_NODE_ALL_UNLOCKED, + uname, uname, expire, uname); + desc_pre = "all state (other than shutdown locks)"; + break; + default: + // We called this function incorrectly + CRM_ASSERT(false); break; } - if (cib_conn == NULL) { - crm_warn("Unable to delete %s: no CIB connection", desc); - free(desc); - } else { - int call_id; - - cib__set_call_options(options, "node state deletion", - cib_xpath|cib_multiple); - call_id = cib_conn->cmds->remove(cib_conn, xpath, NULL, options); - crm_info("Deleting %s (via CIB call %d) " CRM_XS " xpath=%s", - desc, call_id, xpath); - fsa_register_cib_callback(call_id, desc, cib_delete_callback); - // CIB library handles freeing desc + if (desc != NULL) { + *desc = crm_strdup_printf("%s for node %s", desc_pre, uname); } +} + +/*! 
+ * \internal + * \brief Delete subsection of a node's CIB node_state + * + * \param[in] uname Desired node + * \param[in] section Subsection of node_state to delete + * \param[in] options CIB call options to use + */ +void +controld_delete_node_state(const char *uname, enum controld_section_e section, + int options) +{ + cib_t *cib = controld_globals.cib_conn; + char *xpath = NULL; + char *desc = NULL; + int cib_rc = pcmk_ok; + + CRM_ASSERT((uname != NULL) && (cib != NULL)); + + controld_node_state_deletion_strings(uname, section, &xpath, &desc); + + cib__set_call_options(options, "node state deletion", + cib_xpath|cib_multiple); + cib_rc = cib->cmds->remove(cib, xpath, NULL, options); + fsa_register_cib_callback(cib_rc, desc, cib_delete_callback); + crm_info("Deleting %s (via CIB call %d) " CRM_XS " xpath=%s", + desc, cib_rc, xpath); + + // CIB library handles freeing desc free(xpath); } @@ -491,11 +416,12 @@ controld_delete_resource_history(const char *rsc_id, const char *node, char *desc = NULL; char *xpath = NULL; int rc = pcmk_rc_ok; + cib_t *cib = controld_globals.cib_conn; CRM_CHECK((rsc_id != NULL) && (node != NULL), return EINVAL); desc = crm_strdup_printf("resource history for %s on %s", rsc_id, node); - if (controld_globals.cib_conn == NULL) { + if (cib == NULL) { crm_err("Unable to clear %s: no CIB connection", desc); free(desc); return ENOTCONN; @@ -503,9 +429,10 @@ controld_delete_resource_history(const char *rsc_id, const char *node, // Ask CIB to delete the entry xpath = crm_strdup_printf(XPATH_RESOURCE_HISTORY, node, rsc_id); - rc = cib_internal_op(controld_globals.cib_conn, PCMK__CIB_REQUEST_DELETE, - NULL, xpath, NULL, NULL, call_options|cib_xpath, - user_name); + + cib->cmds->set_user(cib, user_name); + rc = cib->cmds->remove(cib, xpath, NULL, call_options|cib_xpath); + cib->cmds->set_user(cib, NULL); if (rc < 0) { rc = pcmk_legacy2rc(rc); @@ -841,10 +768,17 @@ cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *use case pcmk_ok: case -pcmk_err_diff_failed: case -pcmk_err_diff_resync: - crm_trace("Resource update %d complete: rc=%d", call_id, rc); + crm_trace("Resource history update completed (call=%d rc=%d)", + call_id, rc); break; default: - crm_warn("Resource update %d failed: (rc=%d) %s", call_id, rc, pcmk_strerror(rc)); + if (call_id > 0) { + crm_warn("Resource history update %d failed: %s " + CRM_XS " rc=%d", call_id, pcmk_strerror(rc), rc); + } else { + crm_warn("Resource history update failed: %s " CRM_XS " rc=%d", + pcmk_strerror(rc), rc); + } } if (call_id == pending_rsc_update) { @@ -863,10 +797,11 @@ should_preserve_lock(lrmd_event_data_t *op) if (!pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { return false; } - if (!strcmp(op->op_type, RSC_STOP) && (op->rc == PCMK_OCF_OK)) { + if (!strcmp(op->op_type, PCMK_ACTION_STOP) && (op->rc == PCMK_OCF_OK)) { return true; } - if (!strcmp(op->op_type, RSC_STATUS) && (op->rc == PCMK_OCF_NOT_RUNNING)) { + if (!strcmp(op->op_type, PCMK_ACTION_MONITOR) + && (op->rc == PCMK_OCF_NOT_RUNNING)) { return true; } return false; @@ -876,10 +811,10 @@ should_preserve_lock(lrmd_event_data_t *op) * \internal * \brief Request a CIB update * - * \param[in] section Section of CIB to update - * \param[in,out] data New XML of CIB section to update - * \param[in] options CIB call options - * \param[in] callback If not NULL, set this as the operation callback + * \param[in] section Section of CIB to update + * \param[in] data New XML of CIB section to update + * \param[in] options CIB call 
options + * \param[in] callback If not \c NULL, set this as the operation callback * * \return Standard Pacemaker return code * @@ -890,14 +825,13 @@ int controld_update_cib(const char *section, xmlNode *data, int options, void (*callback)(xmlNode *, int, int, xmlNode *, void *)) { + cib_t *cib = controld_globals.cib_conn; int cib_rc = -ENOTCONN; CRM_ASSERT(data != NULL); - if (controld_globals.cib_conn != NULL) { - cib_rc = cib_internal_op(controld_globals.cib_conn, - PCMK__CIB_REQUEST_MODIFY, NULL, section, - data, NULL, options, NULL); + if (cib != NULL) { + cib_rc = cib->cmds->modify(cib, section, data, options); if (cib_rc >= 0) { crm_debug("Submitted CIB update %d for %s section", cib_rc, section); @@ -1047,7 +981,6 @@ controld_delete_action_history(const lrmd_event_data_t *op) controld_globals.cib_conn->cmds->remove(controld_globals.cib_conn, XML_CIB_TAG_STATUS, xml_top, cib_none); - crm_log_xml_trace(xml_top, "op:cancel"); free_xml(xml_top); } @@ -1087,7 +1020,6 @@ controld_cib_delete_last_failure(const char *rsc_id, const char *node, { char *xpath = NULL; char *last_failure_key = NULL; - CRM_CHECK((rsc_id != NULL) && (node != NULL), return); // Generate XPath to match desired entry diff --git a/daemons/controld/controld_cib.h b/daemons/controld/controld_cib.h index bd9492a..dcc5a48 100644 --- a/daemons/controld/controld_cib.h +++ b/daemons/controld/controld_cib.h @@ -43,11 +43,6 @@ fsa_cib_anon_update_discard_reply(const char *section, xmlNode *data) { } } -void controld_record_cib_replace_call(int call_id); -bool controld_forget_cib_replace_call(int call_id); -void controld_forget_all_cib_replace_calls(void); -void controld_destroy_cib_replacements_table(void); - int controld_update_cib(const char *section, xmlNode *data, int options, void (*callback)(xmlNode *, int, int, xmlNode *, void *)); @@ -62,6 +57,9 @@ enum controld_section_e { controld_section_all_unlocked }; +void controld_node_state_deletion_strings(const char *uname, + enum controld_section_e section, + char **xpath, char **desc); void controld_delete_node_state(const char *uname, enum controld_section_e section, int options); int controld_delete_resource_history(const char *rsc_id, const char *node, @@ -118,8 +116,8 @@ int crmd_cib_smart_opt(void); static inline bool controld_action_is_recordable(const char *action) { - return !pcmk__str_any_of(action, CRMD_ACTION_CANCEL, CRMD_ACTION_DELETE, - CRMD_ACTION_NOTIFY, CRMD_ACTION_METADATA, NULL); + return !pcmk__str_any_of(action, PCMK_ACTION_CANCEL, PCMK_ACTION_DELETE, + PCMK_ACTION_NOTIFY, PCMK_ACTION_META_DATA, NULL); } #endif // PCMK__CONTROLD_CIB__H diff --git a/daemons/controld/controld_control.c b/daemons/controld/controld_control.c index ffc62a0..644d686 100644 --- a/daemons/controld/controld_control.c +++ b/daemons/controld/controld_control.c @@ -221,6 +221,7 @@ crmd_exit(crm_exit_t exit_code) g_list_free(controld_globals.fsa_message_queue); controld_globals.fsa_message_queue = NULL; + controld_free_node_pending_timers(); controld_election_fini(); /* Tear down the CIB manager connection, but don't free it yet -- it could @@ -265,7 +266,6 @@ crmd_exit(crm_exit_t exit_code) controld_globals.te_uuid = NULL; free_max_generation(); - controld_destroy_cib_replacements_table(); controld_destroy_failed_sync_table(); controld_destroy_outside_events_table(); @@ -323,20 +323,12 @@ do_exit(long long action, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_exit_t exit_code = CRM_EX_OK; - int log_level = LOG_INFO; - const char 
*exit_type = "gracefully"; - if (action & A_EXIT_1) { - log_level = LOG_ERR; - exit_type = "forcefully"; + if (pcmk_is_set(action, A_EXIT_1)) { exit_code = CRM_EX_ERROR; + crm_err("Exiting now due to errors"); } - verify_stopped(cur_state, LOG_ERR); - do_crm_log(log_level, "Performing %s - %s exiting the controller", - fsa_action2string(action), exit_type); - - crm_info("[%s] stopped (%d)", crm_system_name, exit_code); crmd_exit(exit_code); } @@ -504,7 +496,8 @@ do_started(long long action, } else { crm_notice("Pacemaker controller successfully started and accepting connections"); } - controld_trigger_fencer_connect(); + controld_set_fsa_input_flags(R_ST_REQUIRED); + controld_timer_fencer_connect(GINT_TO_POINTER(TRUE)); controld_clear_fsa_input_flags(R_STARTING); register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); @@ -684,6 +677,17 @@ static pcmk__cluster_option_t controller_options[] = { "passed since the shutdown was initiated, even if the node has not " "rejoined.") }, + { + XML_CONFIG_ATTR_NODE_PENDING_TIMEOUT, NULL, "time", NULL, + "0", pcmk__valid_interval_spec, + N_("How long to wait for a node that has joined the cluster to join " + "the controller process group"), + N_("Fence nodes that do not join the controller process group within " + "this much time after joining the cluster, to allow the cluster " + "to continue managing resources. A value of 0 means never fence " + "pending nodes. Setting the value to 2h means fence nodes after " + "2 hours.") + }, }; void @@ -722,9 +726,8 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void } crmconfig = output; - if ((crmconfig) && - (crm_element_name(crmconfig)) && - (strcmp(crm_element_name(crmconfig), XML_CIB_TAG_CRMCONFIG) != 0)) { + if ((crmconfig != NULL) + && !pcmk__xe_is(crmconfig, XML_CIB_TAG_CRMCONFIG)) { crmconfig = first_named_child(crmconfig, XML_CIB_TAG_CRMCONFIG); } if (!crmconfig) { @@ -761,6 +764,10 @@ config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void controld_globals.shutdown_lock_limit = crm_parse_interval_spec(value) / 1000; + value = g_hash_table_lookup(config_hash, + XML_CONFIG_ATTR_NODE_PENDING_TIMEOUT); + controld_globals.node_pending_timeout = crm_parse_interval_spec(value) / 1000; + value = g_hash_table_lookup(config_hash, "cluster-name"); pcmk__str_update(&(controld_globals.cluster_name), value); diff --git a/daemons/controld/controld_corosync.c b/daemons/controld/controld_corosync.c index 4378b30..b69e821 100644 --- a/daemons/controld/controld_corosync.c +++ b/daemons/controld/controld_corosync.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2022 the Pacemaker project contributors + * Copyright 2004-2023 the Pacemaker project contributors * * The version control history for this file may have further details. 
* @@ -81,9 +81,6 @@ crmd_cs_destroy(gpointer user_data) if (!pcmk_is_set(controld_globals.fsa_input_register, R_HA_DISCONNECTED)) { crm_crit("Lost connection to cluster layer, shutting down"); crmd_exit(CRM_EX_DISCONNECT); - - } else { - crm_info("Corosync connection closed"); } } @@ -122,7 +119,8 @@ cpg_membership_callback(cpg_handle_t handle, const struct cpg_name *cpg_name, if (controld_globals.dc_name != NULL) { crm_node_t *peer = NULL; - peer = pcmk__search_cluster_node_cache(0, controld_globals.dc_name); + peer = pcmk__search_cluster_node_cache(0, controld_globals.dc_name, + NULL); if (peer != NULL) { for (int i = 0; i < left_list_entries; ++i) { if (left_list[i].nodeid == peer->id) { diff --git a/daemons/controld/controld_election.c b/daemons/controld/controld_election.c index 5f33d5b..70ffecc 100644 --- a/daemons/controld/controld_election.c +++ b/daemons/controld/controld_election.c @@ -263,13 +263,6 @@ do_dc_release(long long action, } else if (action & A_DC_RELEASED) { crm_info("DC role released"); -#if 0 - if (are there errors) { - /* we can't stay up if not healthy */ - /* or perhaps I_ERROR and go to S_RECOVER? */ - result = I_SHUTDOWN; - } -#endif if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) { xmlNode *update = NULL; crm_node_t *node = crm_get_peer(0, controld_globals.our_nodename); diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c index 0de399c..480d37d 100644 --- a/daemons/controld/controld_execd.c +++ b/daemons/controld/controld_execd.c @@ -52,14 +52,10 @@ static void lrm_connection_destroy(void) { if (pcmk_is_set(controld_globals.fsa_input_register, R_LRM_CONNECTED)) { - crm_crit("Connection to executor failed"); + crm_crit("Lost connection to local executor"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); controld_clear_fsa_input_flags(R_LRM_CONNECTED); - - } else { - crm_info("Disconnected from executor"); } - } static char * @@ -171,7 +167,7 @@ update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_ return; } - if (pcmk__str_eq(op->op_type, RSC_NOTIFY, pcmk__str_casei)) { + if (pcmk__str_eq(op->op_type, PCMK_ACTION_NOTIFY, pcmk__str_casei)) { return; } @@ -222,10 +218,10 @@ update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_ } entry->last = lrmd_copy_event(op); - if (op->params && pcmk__strcase_any_of(op->op_type, CRMD_ACTION_START, - CRMD_ACTION_RELOAD, - CRMD_ACTION_RELOAD_AGENT, - CRMD_ACTION_STATUS, NULL)) { + if (op->params && pcmk__strcase_any_of(op->op_type, PCMK_ACTION_START, + PCMK_ACTION_RELOAD, + PCMK_ACTION_RELOAD_AGENT, + PCMK_ACTION_MONITOR, NULL)) { if (entry->stop_params) { g_hash_table_destroy(entry->stop_params); } @@ -243,7 +239,9 @@ update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_ op->rsc_id, op->op_type, op->interval_ms); entry->recurring_op_list = g_list_prepend(entry->recurring_op_list, lrmd_copy_event(op)); - } else if (entry->recurring_op_list && !pcmk__str_eq(op->op_type, RSC_STATUS, pcmk__str_casei)) { + } else if ((entry->recurring_op_list != NULL) + && !pcmk__str_eq(op->op_type, PCMK_ACTION_MONITOR, + pcmk__str_casei)) { crm_trace("Dropping %d recurring ops because of: " PCMK__OP_FMT, g_list_length(entry->recurring_op_list), op->rsc_id, op->op_type, op->interval_ms); @@ -376,10 +374,8 @@ do_lrm_control(long long action, } controld_clear_fsa_input_flags(R_LRM_CONNECTED); - crm_info("Disconnecting from the executor"); lrm_state_disconnect(lrm_state); lrm_state_reset_tables(lrm_state, FALSE); - 
crm_notice("Disconnected from the executor"); } if (action & A_LRM_CONNECT) { @@ -510,11 +506,14 @@ is_rsc_active(lrm_state_t * lrm_state, const char *rsc_id) crm_trace("Processing %s: %s.%d=%d", rsc_id, entry->last->op_type, entry->last->interval_ms, entry->last->rc); - if (entry->last->rc == PCMK_OCF_OK && pcmk__str_eq(entry->last->op_type, CRMD_ACTION_STOP, pcmk__str_casei)) { + if ((entry->last->rc == PCMK_OCF_OK) + && pcmk__str_eq(entry->last->op_type, PCMK_ACTION_STOP, + pcmk__str_casei)) { return FALSE; } else if (entry->last->rc == PCMK_OCF_OK - && pcmk__str_eq(entry->last->op_type, CRMD_ACTION_MIGRATE, pcmk__str_casei)) { + && pcmk__str_eq(entry->last->op_type, PCMK_ACTION_MIGRATE_TO, + pcmk__str_casei)) { // A stricter check is too complex ... leave that to the scheduler return FALSE; @@ -668,7 +667,7 @@ notify_deleted(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_ crm_info("Notifying %s on %s that %s was%s deleted", from_sys, (from_host? from_host : "localhost"), rsc_id, ((rc == pcmk_ok)? "" : " not")); - op = construct_op(lrm_state, input->xml, rsc_id, CRMD_ACTION_DELETE); + op = construct_op(lrm_state, input->xml, rsc_id, PCMK_ACTION_DELETE); controld_rc2event(op, pcmk_legacy2rc(rc)); controld_ack_event_directly(from_host, from_sys, NULL, op, rsc_id); lrmd_free_event(op); @@ -1117,7 +1116,8 @@ synthesize_lrmd_failure(lrm_state_t *lrm_state, const xmlNode *action, op = construct_op(lrm_state, action, ID(xml_rsc), operation); - if (pcmk__str_eq(operation, RSC_NOTIFY, pcmk__str_casei)) { // Notifications can't fail + if (pcmk__str_eq(operation, PCMK_ACTION_NOTIFY, pcmk__str_casei)) { + // Notifications can't fail fake_op_status(lrm_state, op, PCMK_EXEC_DONE, PCMK_OCF_OK, NULL); } else { fake_op_status(lrm_state, op, op_status, rc, exit_reason); @@ -1329,7 +1329,7 @@ do_lrm_delete(ha_msg_input_t *input, lrm_state_t *lrm_state, if (cib_rc != pcmk_rc_ok) { lrmd_event_data_t *op = NULL; - op = construct_op(lrm_state, input->xml, rsc->id, CRMD_ACTION_DELETE); + op = construct_op(lrm_state, input->xml, rsc->id, PCMK_ACTION_DELETE); /* These are resource clean-ups, not actions, so no exit reason is * needed. 
@@ -1394,7 +1394,9 @@ metadata_complete(int pid, const pcmk__action_result_t *result, void *user_data) md = controld_cache_metadata(lrm_state->metadata_cache, data->rsc, result->action_stdout); } - do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md); + if (!pcmk_is_set(controld_globals.fsa_input_register, R_HA_DISCONNECTED)) { + do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md); + } free_metadata_cb_data(data); } @@ -1438,11 +1440,11 @@ do_lrm_invoke(long long action, from_host = crm_element_value(input->msg, F_CRM_HOST_FROM); } - if (pcmk__str_eq(crm_op, CRM_OP_LRM_DELETE, pcmk__str_none)) { + if (pcmk__str_eq(crm_op, PCMK_ACTION_LRM_DELETE, pcmk__str_none)) { if (!pcmk__str_eq(from_sys, CRM_SYSTEM_TENGINE, pcmk__str_none)) { crm_rsc_delete = TRUE; // from crm_resource } - operation = CRMD_ACTION_DELETE; + operation = PCMK_ACTION_DELETE; } else if (input->xml != NULL) { operation = crm_element_value(input->xml, XML_LRM_ATTR_TASK); @@ -1486,7 +1488,7 @@ do_lrm_invoke(long long action, } else if (operation != NULL) { lrmd_rsc_info_t *rsc = NULL; xmlNode *xml_rsc = find_xml_node(input->xml, XML_CIB_TAG_RESOURCE, TRUE); - gboolean create_rsc = !pcmk__str_eq(operation, CRMD_ACTION_DELETE, + gboolean create_rsc = !pcmk__str_eq(operation, PCMK_ACTION_DELETE, pcmk__str_none); int rc; @@ -1534,12 +1536,13 @@ do_lrm_invoke(long long action, return; } - if (pcmk__str_eq(operation, CRMD_ACTION_CANCEL, pcmk__str_none)) { + if (pcmk__str_eq(operation, PCMK_ACTION_CANCEL, pcmk__str_none)) { if (!do_lrm_cancel(input, lrm_state, rsc, from_host, from_sys)) { crm_log_xml_warn(input->xml, "Bad command"); } - } else if (pcmk__str_eq(operation, CRMD_ACTION_DELETE, pcmk__str_none)) { + } else if (pcmk__str_eq(operation, PCMK_ACTION_DELETE, + pcmk__str_none)) { do_lrm_delete(input, lrm_state, rsc, from_sys, from_host, crm_rsc_delete, user_name); @@ -1554,7 +1557,7 @@ do_lrm_invoke(long long action, * changed (using something like inotify, or a hash or modification * time of the agent executable). 
*/ - if (strcmp(operation, CRMD_ACTION_START) != 0) { + if (strcmp(operation, PCMK_ACTION_START) != 0) { md = controld_get_rsc_metadata(lrm_state, rsc, controld_metadata_from_cache); } @@ -1619,7 +1622,8 @@ construct_op(const lrm_state_t *lrm_state, const xmlNode *rsc_op, lrmd__set_result(op, PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING, NULL); if (rsc_op == NULL) { - CRM_LOG_ASSERT(pcmk__str_eq(CRMD_ACTION_STOP, operation, pcmk__str_casei)); + CRM_LOG_ASSERT(pcmk__str_eq(operation, PCMK_ACTION_STOP, + pcmk__str_casei)); op->user_data = NULL; /* the stop_all_resources() case * by definition there is no DC (or they'd be shutting @@ -1654,7 +1658,7 @@ construct_op(const lrm_state_t *lrm_state, const xmlNode *rsc_op, class = crm_element_value(primitive, XML_AGENT_ATTR_CLASS); if (pcmk_is_set(pcmk_get_ra_caps(class), pcmk_ra_cap_fence_params) - && pcmk__str_eq(operation, CRMD_ACTION_STATUS, pcmk__str_casei) + && pcmk__str_eq(operation, PCMK_ACTION_MONITOR, pcmk__str_casei) && (op->interval_ms > 0)) { op_timeout = g_hash_table_lookup(params, "pcmk_monitor_timeout"); @@ -1663,7 +1667,7 @@ construct_op(const lrm_state_t *lrm_state, const xmlNode *rsc_op, } } - if (!pcmk__str_eq(operation, RSC_STOP, pcmk__str_casei)) { + if (!pcmk__str_eq(operation, PCMK_ACTION_STOP, pcmk__str_casei)) { op->params = params; } else { @@ -1703,7 +1707,8 @@ construct_op(const lrm_state_t *lrm_state, const xmlNode *rsc_op, op->user_data = strdup(transition); if (op->interval_ms != 0) { - if (pcmk__strcase_any_of(operation, CRMD_ACTION_START, CRMD_ACTION_STOP, NULL)) { + if (pcmk__strcase_any_of(operation, PCMK_ACTION_START, PCMK_ACTION_STOP, + NULL)) { crm_err("Start and Stop actions cannot have an interval: %u", op->interval_ms); op->interval_ms = 0; @@ -1849,7 +1854,7 @@ static bool should_cancel_recurring(const char *rsc_id, const char *action, guint interval_ms) { if (is_remote_lrmd_ra(NULL, NULL, rsc_id) && (interval_ms == 0) - && (strcmp(action, CRMD_ACTION_MIGRATE) == 0)) { + && (strcmp(action, PCMK_ACTION_MIGRATE_TO) == 0)) { /* Don't stop monitoring a migrating Pacemaker Remote connection * resource until the entire migration has completed. We must detect if * the connection is unexpectedly severed, even during a migration. @@ -1859,8 +1864,8 @@ should_cancel_recurring(const char *rsc_id, const char *action, guint interval_m // Cancel recurring actions before changing resource state return (interval_ms == 0) - && !pcmk__str_any_of(action, CRMD_ACTION_STATUS, CRMD_ACTION_NOTIFY, - NULL); + && !pcmk__str_any_of(action, PCMK_ACTION_MONITOR, + PCMK_ACTION_NOTIFY, NULL); } /*! 
@@ -1876,7 +1881,7 @@ static const char * should_nack_action(const char *action) { if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN) - && pcmk__str_eq(action, RSC_START, pcmk__str_none)) { + && pcmk__str_eq(action, PCMK_ACTION_START, pcmk__str_none)) { register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL); return "Not attempting start due to shutdown in progress"; @@ -1888,7 +1893,7 @@ should_nack_action(const char *action) case S_TRANSITION_ENGINE: break; default: - if (!pcmk__str_eq(action, CRMD_ACTION_STOP, pcmk__str_none)) { + if (!pcmk__str_eq(action, PCMK_ACTION_STOP, pcmk__str_none)) { return "Controller cannot attempt actions at this time"; } break; @@ -1930,8 +1935,8 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, xmlNode *msg, return; } - if (pcmk__str_any_of(operation, CRMD_ACTION_RELOAD, - CRMD_ACTION_RELOAD_AGENT, NULL)) { + if (pcmk__str_any_of(operation, PCMK_ACTION_RELOAD, + PCMK_ACTION_RELOAD_AGENT, NULL)) { /* Pre-2.1.0 DCs will schedule reload actions only, and 2.1.0+ DCs * will schedule reload-agent actions only. In either case, we need * to map that to whatever the resource agent actually supports. @@ -1939,9 +1944,9 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, xmlNode *msg, */ if ((md != NULL) && pcmk_is_set(md->ra_flags, ra_supports_legacy_reload)) { - operation = CRMD_ACTION_RELOAD; + operation = PCMK_ACTION_RELOAD; } else { - operation = CRMD_ACTION_RELOAD_AGENT; + operation = PCMK_ACTION_RELOAD_AGENT; } } @@ -1968,8 +1973,9 @@ do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, xmlNode *msg, /* now do the op */ crm_notice("Requesting local execution of %s operation for %s on %s " CRM_XS " transition_key=%s op_key=" PCMK__OP_FMT, - crm_action_str(op->op_type, op->interval_ms), rsc->id, lrm_state->node_name, - pcmk__s(transition, ""), rsc->id, operation, op->interval_ms); + pcmk__readable_action(op->op_type, op->interval_ms), rsc->id, + lrm_state->node_name, pcmk__s(transition, ""), rsc->id, + operation, op->interval_ms); nack_reason = should_nack_action(operation); if (nack_reason != NULL) { @@ -2131,7 +2137,8 @@ log_executor_event(const lrmd_event_data_t *op, const char *op_key, GString *str = g_string_sized_new(100); // reasonable starting size pcmk__g_strcat(str, - "Result of ", crm_action_str(op->op_type, op->interval_ms), + "Result of ", + pcmk__readable_action(op->op_type, op->interval_ms), " operation for ", op->rsc_id, NULL); if (node_name != NULL) { @@ -2401,7 +2408,8 @@ process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op, log_executor_event(op, op_key, node_name, removed); if (lrm_state) { - if (!pcmk__str_eq(op->op_type, RSC_METADATA, pcmk__str_casei)) { + if (!pcmk__str_eq(op->op_type, PCMK_ACTION_META_DATA, + pcmk__str_casei)) { crmd_alert_resource_op(lrm_state->node_name, op); } else if (rsc && (op->rc == PCMK_OCF_OK)) { char *metadata = unescape_newlines(op->output); diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c index 8c68bfc..b90cc5e 100644 --- a/daemons/controld/controld_execd_state.c +++ b/daemons/controld/controld_execd_state.c @@ -132,12 +132,6 @@ lrm_state_create(const char *node_name) return state; } -void -lrm_state_destroy(const char *node_name) -{ - g_hash_table_remove(lrm_state_table, node_name); -} - static gboolean remote_proxy_remove_by_node(gpointer key, gpointer value, gpointer user_data) { @@ -307,7 +301,7 @@ lrm_state_destroy_all(void) lrm_state_t * lrm_state_find(const char *node_name) { - if (!node_name) { + if 
((node_name == NULL) || (lrm_state_table == NULL)) { return NULL; } return g_hash_table_lookup(lrm_state_table, node_name); @@ -318,6 +312,8 @@ lrm_state_find_or_create(const char *node_name) { lrm_state_t *lrm_state; + CRM_CHECK(lrm_state_table != NULL, return NULL); + lrm_state = g_hash_table_lookup(lrm_state_table, node_name); if (!lrm_state) { lrm_state = lrm_state_create(node_name); @@ -329,6 +325,9 @@ lrm_state_find_or_create(const char *node_name) GList * lrm_state_get_list(void) { + if (lrm_state_table == NULL) { + return NULL; + } return g_hash_table_get_values(lrm_state_table); } @@ -799,7 +798,7 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state, } if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) { - lrm_state_destroy(rsc_id); + g_hash_table_remove(lrm_state_table, rsc_id); return pcmk_ok; } diff --git a/daemons/controld/controld_fencing.c b/daemons/controld/controld_fencing.c index 89cb61f..9557d9e 100644 --- a/daemons/controld/controld_fencing.c +++ b/daemons/controld/controld_fencing.c @@ -218,8 +218,11 @@ send_stonith_update(pcmk__graph_action_t *action, const char *target, CRM_CHECK(target != NULL, return); CRM_CHECK(uuid != NULL, return); - /* Make sure the membership and join caches are accurate */ - peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY); + /* Make sure the membership and join caches are accurate. + * Try getting any existing node cache entry also by node uuid in case it + * doesn't have an uname yet. + */ + peer = pcmk__get_peer_full(0, target, uuid, CRM_GET_PEER_ANY); CRM_CHECK(peer != NULL, return); @@ -391,7 +394,7 @@ execute_stonith_cleanup(void) */ static stonith_t *stonith_api = NULL; -static crm_trigger_t *stonith_reconnect = NULL; +static mainloop_timer_t *controld_fencer_connect_timer = NULL; static char *te_client_id = NULL; static gboolean @@ -422,7 +425,7 @@ fail_incompletable_stonith(pcmk__graph_t *graph) } task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); - if (task && pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) { + if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) { pcmk__set_graph_action_flags(action, pcmk__graph_action_failed); last_action = action->xml; pcmk__update_graph(graph, action); @@ -447,11 +450,12 @@ tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e) te_cleanup_stonith_history_sync(st, FALSE); if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) { - crm_crit("Fencing daemon connection failed"); - mainloop_set_trigger(stonith_reconnect); - + crm_err("Lost fencer connection (will attempt to reconnect)"); + if (!mainloop_timer_running(controld_fencer_connect_timer)) { + mainloop_timer_start(controld_fencer_connect_timer); + } } else { - crm_info("Fencing daemon disconnected"); + crm_info("Disconnected from fencer"); } if (stonith_api) { @@ -515,7 +519,7 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) crmd_alert_fencing_op(event); - if (pcmk__str_eq("on", event->action, pcmk__str_none)) { + if (pcmk__str_eq(PCMK_ACTION_ON, event->action, pcmk__str_none)) { // Unfencing doesn't need special handling, just a log message if (succeeded) { crm_notice("%s was unfenced by %s at the request of %s@%s", @@ -647,14 +651,14 @@ handle_fence_notification(stonith_t *st, stonith_event_t *event) /*! 
* \brief Connect to fencer * - * \param[in] user_data If NULL, retry failures now, otherwise retry in main loop + * \param[in] user_data If NULL, retry failures now, otherwise retry in mainloop timer * - * \return TRUE + * \return G_SOURCE_REMOVE on success, G_SOURCE_CONTINUE to retry * \note If user_data is NULL, this will wait 2s between attempts, for up to * 30 attempts, meaning the controller could be blocked as long as 58s. */ -static gboolean -te_connect_stonith(gpointer user_data) +gboolean +controld_timer_fencer_connect(gpointer user_data) { int rc = pcmk_ok; @@ -662,13 +666,13 @@ te_connect_stonith(gpointer user_data) stonith_api = stonith_api_new(); if (stonith_api == NULL) { crm_err("Could not connect to fencer: API memory allocation failed"); - return TRUE; + return G_SOURCE_REMOVE; } } if (stonith_api->state != stonith_disconnected) { crm_trace("Already connected to fencer, no need to retry"); - return TRUE; + return G_SOURCE_REMOVE; } if (user_data == NULL) { @@ -681,17 +685,30 @@ te_connect_stonith(gpointer user_data) } else { // Non-blocking (retry failures later in main loop) rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL); + + if (controld_fencer_connect_timer == NULL) { + controld_fencer_connect_timer = + mainloop_timer_add("controld_fencer_connect", 1000, + TRUE, controld_timer_fencer_connect, + GINT_TO_POINTER(TRUE)); + } + if (rc != pcmk_ok) { if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) { crm_notice("Fencer connection failed (will retry): %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); - mainloop_set_trigger(stonith_reconnect); + + if (!mainloop_timer_running(controld_fencer_connect_timer)) { + mainloop_timer_start(controld_fencer_connect_timer); + } + + return G_SOURCE_CONTINUE; } else { crm_info("Fencer connection failed (ignoring because no longer required): %s " CRM_XS " rc=%d", pcmk_strerror(rc), rc); } - return TRUE; + return G_SOURCE_REMOVE; } } @@ -709,23 +726,7 @@ te_connect_stonith(gpointer user_data) crm_notice("Fencer successfully connected"); } - return TRUE; -} - -/*! - \internal - \brief Schedule fencer connection attempt in main loop -*/ -void -controld_trigger_fencer_connect(void) -{ - if (stonith_reconnect == NULL) { - stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, - te_connect_stonith, - GINT_TO_POINTER(TRUE)); - } - controld_set_fsa_input_flags(R_ST_REQUIRED); - mainloop_set_trigger(stonith_reconnect); + return G_SOURCE_REMOVE; } void @@ -745,9 +746,9 @@ controld_disconnect_fencer(bool destroy) stonith_api->cmds->free(stonith_api); stonith_api = NULL; } - if (stonith_reconnect) { - mainloop_destroy_trigger(stonith_reconnect); - stonith_reconnect = NULL; + if (controld_fencer_connect_timer) { + mainloop_timer_del(controld_fencer_connect_timer); + controld_fencer_connect_timer = NULL; } if (te_client_id) { free(te_client_id); @@ -843,7 +844,7 @@ tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data) crm_info("Fence operation %d for %s succeeded", data->call_id, target); if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) { te_action_confirmed(action, NULL); - if (pcmk__str_eq("on", op, pcmk__str_casei)) { + if (pcmk__str_eq(PCMK_ACTION_ON, op, pcmk__str_casei)) { const char *value = NULL; char *now = pcmk__ttoa(time(NULL)); gboolean is_remote_node = FALSE; @@ -981,7 +982,7 @@ controld_execute_fence_action(pcmk__graph_t *graph, priority_delay ? priority_delay : ""); /* Passing NULL means block until we can connect... 
*/ - te_connect_stonith(NULL); + controld_timer_fencer_connect(NULL); pcmk__scan_min_int(priority_delay, &delay_i, 0); rc = fence_with_delay(target, type, delay_i); @@ -1000,12 +1001,14 @@ controld_execute_fence_action(pcmk__graph_t *graph, bool controld_verify_stonith_watchdog_timeout(const char *value) { + long st_timeout = value? crm_get_msec(value) : 0; const char *our_nodename = controld_globals.our_nodename; gboolean rv = TRUE; - if (stonith_api && (stonith_api->state != stonith_disconnected) && - stonith__watchdog_fencing_enabled_for_node_api(stonith_api, - our_nodename)) { + if (st_timeout == 0 + || (stonith_api && (stonith_api->state != stonith_disconnected) && + stonith__watchdog_fencing_enabled_for_node_api(stonith_api, + our_nodename))) { rv = pcmk__valid_sbd_timeout(value); } return rv; diff --git a/daemons/controld/controld_fencing.h b/daemons/controld/controld_fencing.h index 86a5050..76779c6 100644 --- a/daemons/controld/controld_fencing.h +++ b/daemons/controld/controld_fencing.h @@ -19,7 +19,7 @@ void controld_configure_fencing(GHashTable *options); void st_fail_count_reset(const char * target); // stonith API client -void controld_trigger_fencer_connect(void); +gboolean controld_timer_fencer_connect(gpointer user_data); void controld_disconnect_fencer(bool destroy); int controld_execute_fence_action(pcmk__graph_t *graph, pcmk__graph_action_t *action); diff --git a/daemons/controld/controld_fsa.c b/daemons/controld/controld_fsa.c index 622d1c8..06559b8 100644 --- a/daemons/controld/controld_fsa.c +++ b/daemons/controld/controld_fsa.c @@ -205,7 +205,6 @@ s_crmd_fsa(enum crmd_fsa_cause cause) fsa_data->data_type = fsa_dt_none; controld_globals.fsa_message_queue = g_list_append(controld_globals.fsa_message_queue, fsa_data); - fsa_data = NULL; } while ((controld_globals.fsa_message_queue != NULL) && !pcmk_is_set(controld_globals.flags, controld_fsa_is_stalled)) { @@ -275,7 +274,6 @@ s_crmd_fsa(enum crmd_fsa_cause cause) /* start doing things... */ s_crmd_fsa_actions(fsa_data); delete_fsa_input(fsa_data); - fsa_data = NULL; } if ((controld_globals.fsa_message_queue != NULL) @@ -620,11 +618,6 @@ do_state_transition(enum crmd_fsa_state cur_state, if (next_state != S_ELECTION && cur_state != S_RELEASE_DC) { controld_stop_current_election_timeout(); } -#if 0 - if ((controld_globals.fsa_input_register & R_SHUTDOWN)) { - controld_set_fsa_action_flags(A_DC_TIMER_STOP); - } -#endif if (next_state == S_INTEGRATION) { controld_set_fsa_action_flags(A_INTEGRATE_TIMER_START); } else { diff --git a/daemons/controld/controld_globals.h b/daemons/controld/controld_globals.h index eff1607..2ff8a57 100644 --- a/daemons/controld/controld_globals.h +++ b/daemons/controld/controld_globals.h @@ -45,9 +45,6 @@ typedef struct { //! Connection to the CIB cib_t *cib_conn; - //! CIB connection's client ID - const char *cib_client_id; - // Scheduler @@ -93,6 +90,9 @@ typedef struct { //! Max lifetime (in seconds) of a resource's shutdown lock to a node guint shutdown_lock_limit; + //! Node pending timeout + guint node_pending_timeout; + //! Main event loop GMainLoop *mainloop; } controld_globals_t; diff --git a/daemons/controld/controld_join_client.c b/daemons/controld/controld_join_client.c index da6a9d6..805ecbd 100644 --- a/daemons/controld/controld_join_client.c +++ b/daemons/controld/controld_join_client.c @@ -112,15 +112,6 @@ do_cl_join_offer_respond(long long action, CRM_CHECK(input != NULL, return); -#if 0 - if (we are sick) { - log error; - - /* save the request for later? 
*/ - return; - } -#endif - welcome_from = crm_element_value(input->msg, F_CRM_HOST_FROM); join_id = crm_element_value(input->msg, F_CRM_JOIN_ID); crm_trace("Accepting cluster join offer from node %s "CRM_XS" join-%s", @@ -195,32 +186,34 @@ join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void * free_xml(generation); } -static void -set_join_state(const char * start_state) +void +set_join_state(const char *start_state, const char *node_name, const char *node_uuid, + bool remote) { if (pcmk__str_eq(start_state, "standby", pcmk__str_casei)) { crm_notice("Forcing node %s to join in %s state per configured " - "environment", controld_globals.our_nodename, start_state); + "environment", node_name, start_state); cib__update_node_attr(controld_globals.logger_out, controld_globals.cib_conn, cib_sync_call, - XML_CIB_TAG_NODES, controld_globals.our_uuid, - NULL, NULL, NULL, "standby", "on", NULL, NULL); + XML_CIB_TAG_NODES, node_uuid, + NULL, NULL, NULL, "standby", "on", NULL, + remote ? "remote" : NULL); } else if (pcmk__str_eq(start_state, "online", pcmk__str_casei)) { crm_notice("Forcing node %s to join in %s state per configured " - "environment", controld_globals.our_nodename, start_state); + "environment", node_name, start_state); cib__update_node_attr(controld_globals.logger_out, controld_globals.cib_conn, cib_sync_call, - XML_CIB_TAG_NODES, controld_globals.our_uuid, - NULL, NULL, NULL, "standby", "off", NULL, NULL); + XML_CIB_TAG_NODES, node_uuid, + NULL, NULL, NULL, "standby", "off", NULL, + remote ? "remote" : NULL); } else if (pcmk__str_eq(start_state, "default", pcmk__str_casei)) { - crm_debug("Not forcing a starting state on node %s", - controld_globals.our_nodename); + crm_debug("Not forcing a starting state on node %s", node_name); } else { crm_warn("Unrecognized start state '%s', using 'default' (%s)", - start_state, controld_globals.our_nodename); + start_state, node_name); } } @@ -335,7 +328,8 @@ do_cl_join_finalize_respond(long long action, first_join = FALSE; if (start_state) { - set_join_state(start_state); + set_join_state(start_state, controld_globals.our_nodename, + controld_globals.our_uuid, false); } } diff --git a/daemons/controld/controld_join_dc.c b/daemons/controld/controld_join_dc.c index f82b132..2fe6710 100644 --- a/daemons/controld/controld_join_dc.c +++ b/daemons/controld/controld_join_dc.c @@ -172,7 +172,6 @@ start_join_round(void) max_generation_xml = NULL; } controld_clear_fsa_input_flags(R_HAVE_CIB); - controld_forget_all_cib_replace_calls(); } /*! 
@@ -607,10 +606,6 @@ do_dc_join_finalize(long long action, rc = controld_globals.cib_conn->cmds->sync_from(controld_globals.cib_conn, sync_from, NULL, cib_none); - - if (pcmk_is_set(controld_globals.fsa_input_register, R_HAVE_CIB)) { - controld_record_cib_replace_call(rc); - } fsa_register_cib_callback(rc, sync_from, finalize_sync_callback); } @@ -629,8 +624,6 @@ finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, voi { CRM_LOG_ASSERT(-EPERM != rc); - controld_forget_cib_replace_call(call_id); - if (rc != pcmk_ok) { const char *sync_from = (const char *) user_data; @@ -674,22 +667,25 @@ finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, voi } static void -join_update_complete_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) +join_node_state_commit_callback(xmlNode *msg, int call_id, int rc, + xmlNode *output, void *user_data) { - fsa_data_t *msg_data = NULL; + const char *node = user_data; - if (rc == pcmk_ok) { - crm_debug("join-%d node history update (via CIB call %d) complete", - current_join_id, call_id); - check_join_state(controld_globals.fsa_state, __func__); + if (rc != pcmk_ok) { + fsa_data_t *msg_data = NULL; // for register_fsa_error() macro - } else { - crm_err("join-%d node history update (via CIB call %d) failed: %s " - "(next transition may determine resource status incorrectly)", - current_join_id, call_id, pcmk_strerror(rc)); + crm_crit("join-%d node history update (via CIB call %d) for node %s " + "failed: %s", + current_join_id, call_id, node, pcmk_strerror(rc)); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } + + crm_debug("join-%d node history update (via CIB call %d) for node %s " + "complete", + current_join_id, call_id, node); + check_join_state(controld_globals.fsa_state, __func__); } /* A_DC_JOIN_PROCESS_ACK */ @@ -701,33 +697,39 @@ do_dc_join_ack(long long action, { int join_id = -1; ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg); - enum controld_section_e section = controld_section_lrm; - const int cib_opts = cib_scope_local|cib_can_create; const char *op = crm_element_value(join_ack->msg, F_CRM_TASK); - const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM); + char *join_from = crm_element_value_copy(join_ack->msg, F_CRM_HOST_FROM); crm_node_t *peer = NULL; + enum controld_section_e section = controld_section_lrm; + char *xpath = NULL; + xmlNode *state = join_ack->xml; + xmlNode *execd_state = NULL; + + cib_t *cib = controld_globals.cib_conn; + int rc = pcmk_ok; + // Sanity checks if (join_from == NULL) { crm_warn("Ignoring message received without node identification"); - return; + goto done; } if (op == NULL) { crm_warn("Ignoring message received from %s without task", join_from); - return; + goto done; } if (strcmp(op, CRM_OP_JOIN_CONFIRM)) { crm_debug("Ignoring '%s' message from %s while waiting for '%s'", op, join_from, CRM_OP_JOIN_CONFIRM); - return; + goto done; } if (crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id) != 0) { crm_warn("Ignoring join confirmation from %s without valid join ID", join_from); - return; + goto done; } peer = crm_get_peer(0, join_from); @@ -736,7 +738,7 @@ do_dc_join_ack(long long action, "(currently %s not %s)", join_id, join_from, crm_join_phase_str(peer->join), crm_join_phase_str(crm_join_finalized)); - return; + goto done; } if (join_id != current_join_id) { @@ -744,40 +746,85 @@ do_dc_join_ack(long long action, "because currently on join-%d", join_id, join_from, 
current_join_id); crm_update_peer_join(__func__, peer, crm_join_nack); - return; + goto done; } crm_update_peer_join(__func__, peer, crm_join_confirmed); /* Update CIB with node's current executor state. A new transition will be - * triggered later, when the CIB notifies us of the change. + * triggered later, when the CIB manager notifies us of the change. + * + * The delete and modify requests are part of an atomic transaction. */ + rc = cib->cmds->init_transaction(cib); + if (rc != pcmk_ok) { + goto done; + } + + // Delete relevant parts of node's current executor state from CIB if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) { section = controld_section_lrm_unlocked; } - controld_delete_node_state(join_from, section, cib_scope_local); + controld_node_state_deletion_strings(join_from, section, &xpath, NULL); + + rc = cib->cmds->remove(cib, xpath, NULL, + cib_scope_local + |cib_xpath + |cib_multiple + |cib_transaction); + if (rc != pcmk_ok) { + goto done; + } + + // Update CIB with node's latest known executor state if (pcmk__str_eq(join_from, controld_globals.our_nodename, pcmk__str_casei)) { - xmlNode *now_dc_lrmd_state = controld_query_executor_state(); - - if (now_dc_lrmd_state != NULL) { - crm_debug("Updating local node history for join-%d " - "from query result", join_id); - controld_update_cib(XML_CIB_TAG_STATUS, now_dc_lrmd_state, cib_opts, - join_update_complete_callback); - free_xml(now_dc_lrmd_state); + + // Use the latest possible state if processing our own join ack + execd_state = controld_query_executor_state(); + + if (execd_state != NULL) { + crm_debug("Updating local node history for join-%d from query " + "result", + current_join_id); + state = execd_state; + } else { crm_warn("Updating local node history from join-%d confirmation " - "because query failed", join_id); - controld_update_cib(XML_CIB_TAG_STATUS, join_ack->xml, cib_opts, - join_update_complete_callback); + "because query failed", + current_join_id); } + } else { crm_debug("Updating node history for %s from join-%d confirmation", - join_from, join_id); - controld_update_cib(XML_CIB_TAG_STATUS, join_ack->xml, cib_opts, - join_update_complete_callback); + join_from, current_join_id); + } + + rc = cib->cmds->modify(cib, XML_CIB_TAG_STATUS, state, + cib_scope_local|cib_can_create|cib_transaction); + free_xml(execd_state); + if (rc != pcmk_ok) { + goto done; + } + + // Commit the transaction + rc = cib->cmds->end_transaction(cib, true, cib_scope_local); + fsa_register_cib_callback(rc, join_from, join_node_state_commit_callback); + + if (rc > 0) { + // join_from will be freed after callback + join_from = NULL; + rc = pcmk_ok; + } + +done: + if (rc != pcmk_ok) { + crm_crit("join-%d node history update for node %s failed: %s", + current_join_id, join_from, pcmk_strerror(rc)); + register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } + free(join_from); + free(xpath); } void @@ -808,7 +855,7 @@ finalize_join_for(gpointer key, gpointer value, gpointer user_data) */ crm_trace("Updating node name and UUID in CIB for %s", join_to); tmp1 = create_xml_node(NULL, XML_CIB_TAG_NODE); - set_uuid(tmp1, XML_ATTR_ID, join_node); + crm_xml_add(tmp1, XML_ATTR_ID, crm_peer_uuid(join_node)); crm_xml_add(tmp1, XML_ATTR_UNAME, join_to); fsa_cib_anon_update(XML_CIB_TAG_NODES, tmp1); free_xml(tmp1); diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h index 25f3db3..c3113e4 100644 --- a/daemons/controld/controld_lrm.h +++ b/daemons/controld/controld_lrm.h @@ -109,11 +109,6 @@ gboolean 
lrm_state_init_local(void); void lrm_state_destroy_all(void); /*! - * \brief Destroy executor connection by node name - */ -void lrm_state_destroy(const char *node_name); - -/*! * \brief Find lrm_state data by node name */ lrm_state_t *lrm_state_find(const char *node_name); diff --git a/daemons/controld/controld_membership.c b/daemons/controld/controld_membership.c index 1f7e4c0..f25d1e9 100644 --- a/daemons/controld/controld_membership.c +++ b/daemons/controld/controld_membership.c @@ -138,10 +138,8 @@ create_node_state_update(crm_node_t *node, int flags, xmlNode *parent, pcmk__xe_set_bool_attr(node_state, XML_NODE_IS_REMOTE, true); } - set_uuid(node_state, XML_ATTR_ID, node); - - if (crm_element_value(node_state, XML_ATTR_ID) == NULL) { - crm_info("Node update for %s cancelled: no id", node->uname); + if (crm_xml_add(node_state, XML_ATTR_ID, crm_peer_uuid(node)) == NULL) { + crm_info("Node update for %s cancelled: no ID", node->uname); free_xml(node_state); return NULL; } @@ -149,17 +147,31 @@ create_node_state_update(crm_node_t *node, int flags, xmlNode *parent, crm_xml_add(node_state, XML_ATTR_UNAME, node->uname); if ((flags & node_update_cluster) && node->state) { - pcmk__xe_set_bool_attr(node_state, XML_NODE_IN_CLUSTER, - pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei)); + if (compare_version(controld_globals.dc_version, "3.18.0") >= 0) { + // A value 0 means the node is not a cluster member. + crm_xml_add_ll(node_state, PCMK__XA_IN_CCM, node->when_member); + + } else { + pcmk__xe_set_bool_attr(node_state, PCMK__XA_IN_CCM, + pcmk__str_eq(node->state, CRM_NODE_MEMBER, + pcmk__str_casei)); + } } if (!pcmk_is_set(node->flags, crm_remote_node)) { if (flags & node_update_peer) { - value = OFFLINESTATUS; - if (pcmk_is_set(node->processes, crm_get_cluster_proc())) { - value = ONLINESTATUS; + if (compare_version(controld_globals.dc_version, "3.18.0") >= 0) { + // A value 0 means the peer is offline in CPG. 
+ crm_xml_add_ll(node_state, PCMK__XA_CRMD, node->when_online); + + } else { + // @COMPAT DCs < 2.1.7 use online/offline rather than timestamp + value = OFFLINESTATUS; + if (pcmk_is_set(node->processes, crm_get_cluster_proc())) { + value = ONLINESTATUS; + } + crm_xml_add(node_state, PCMK__XA_CRMD, value); } - crm_xml_add(node_state, XML_NODE_IS_PEER, value); } if (flags & node_update_join) { @@ -168,11 +180,11 @@ create_node_state_update(crm_node_t *node, int flags, xmlNode *parent, } else { value = CRMD_JOINSTATE_MEMBER; } - crm_xml_add(node_state, XML_NODE_JOIN_STATE, value); + crm_xml_add(node_state, PCMK__XA_JOIN, value); } if (flags & node_update_expected) { - crm_xml_add(node_state, XML_NODE_EXPECTED, node->expected); + crm_xml_add(node_state, PCMK__XA_EXPECTED, node->expected); } } @@ -210,7 +222,7 @@ search_conflicting_node_callback(xmlNode * msg, int call_id, int rc, return; } - if (pcmk__str_eq(crm_element_name(output), XML_CIB_TAG_NODE, pcmk__str_casei)) { + if (pcmk__xe_is(output, XML_CIB_TAG_NODE)) { node_xml = output; } else { @@ -224,7 +236,7 @@ search_conflicting_node_callback(xmlNode * msg, int call_id, int rc, crm_node_t *node = NULL; gboolean known = FALSE; - if (!pcmk__str_eq(crm_element_name(node_xml), XML_CIB_TAG_NODE, pcmk__str_casei)) { + if (!pcmk__xe_is(node_xml, XML_CIB_TAG_NODE)) { continue; } diff --git a/daemons/controld/controld_messages.c b/daemons/controld/controld_messages.c index 54b27ec..39f3c7a 100644 --- a/daemons/controld/controld_messages.c +++ b/daemons/controld/controld_messages.c @@ -328,52 +328,80 @@ route_message(enum crmd_fsa_cause cause, xmlNode * input) gboolean relay_message(xmlNode * msg, gboolean originated_locally) { - int dest = 1; + enum crm_ais_msg_types dest = crm_msg_ais; bool is_for_dc = false; bool is_for_dcib = false; bool is_for_te = false; bool is_for_crm = false; bool is_for_cib = false; bool is_local = false; - const char *host_to = crm_element_value(msg, F_CRM_HOST_TO); - const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); - const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); - const char *type = crm_element_value(msg, F_TYPE); - const char *task = crm_element_value(msg, F_CRM_TASK); - const char *ref = crm_element_value(msg, XML_ATTR_REFERENCE); + bool broadcast = false; + const char *host_to = NULL; + const char *sys_to = NULL; + const char *sys_from = NULL; + const char *type = NULL; + const char *task = NULL; + const char *ref = NULL; + crm_node_t *node_to = NULL; + + CRM_CHECK(msg != NULL, return TRUE); + + host_to = crm_element_value(msg, F_CRM_HOST_TO); + sys_to = crm_element_value(msg, F_CRM_SYS_TO); + sys_from = crm_element_value(msg, F_CRM_SYS_FROM); + type = crm_element_value(msg, F_TYPE); + task = crm_element_value(msg, F_CRM_TASK); + ref = crm_element_value(msg, XML_ATTR_REFERENCE); + + broadcast = pcmk__str_empty(host_to); if (ref == NULL) { ref = "without reference ID"; } - if (msg == NULL) { - crm_warn("Cannot route empty message"); - return TRUE; - - } else if (pcmk__str_eq(task, CRM_OP_HELLO, pcmk__str_casei)) { - crm_trace("No routing needed for hello message %s", ref); + if (pcmk__str_eq(task, CRM_OP_HELLO, pcmk__str_casei)) { + crm_trace("Received hello %s from %s (no processing needed)", + ref, pcmk__s(sys_from, "unidentified source")); + crm_log_xml_trace(msg, "hello"); return TRUE; + } - } else if (!pcmk__str_eq(type, T_CRM, pcmk__str_casei)) { - crm_warn("Received invalid message %s: type '%s' not '" T_CRM "'", + // Require message type (set by create_request()) + if 
(!pcmk__str_eq(type, T_CRM, pcmk__str_casei)) { + crm_warn("Ignoring invalid message %s with type '%s' (not '" T_CRM "')", ref, pcmk__s(type, "")); - crm_log_xml_warn(msg, "[bad message type]"); + crm_log_xml_trace(msg, "ignored"); return TRUE; + } - } else if (sys_to == NULL) { - crm_warn("Received invalid message %s: no subsystem", ref); - crm_log_xml_warn(msg, "[no subsystem]"); + // Require a destination subsystem (also set by create_request()) + if (sys_to == NULL) { + crm_warn("Ignoring invalid message %s with no " F_CRM_SYS_TO, ref); + crm_log_xml_trace(msg, "ignored"); return TRUE; } + // Get the message type appropriate to the destination subsystem + if (is_corosync_cluster()) { + dest = text2msg_type(sys_to); + if ((dest < crm_msg_ais) || (dest > crm_msg_stonith_ng)) { + /* Unrecognized value, use a sane default + * + * @TODO Maybe we should bail instead + */ + dest = crm_msg_crmd; + } + } + is_for_dc = (strcasecmp(CRM_SYSTEM_DC, sys_to) == 0); is_for_dcib = (strcasecmp(CRM_SYSTEM_DCIB, sys_to) == 0); is_for_te = (strcasecmp(CRM_SYSTEM_TENGINE, sys_to) == 0); is_for_cib = (strcasecmp(CRM_SYSTEM_CIB, sys_to) == 0); is_for_crm = (strcasecmp(CRM_SYSTEM_CRMD, sys_to) == 0); + // Check whether message should be processed locally is_local = false; - if (pcmk__str_empty(host_to)) { + if (broadcast) { if (is_for_dc || is_for_te) { is_local = false; @@ -397,6 +425,7 @@ relay_message(xmlNode * msg, gboolean originated_locally) } else if (pcmk__str_eq(controld_globals.our_nodename, host_to, pcmk__str_casei)) { is_local = true; + } else if (is_for_crm && pcmk__str_eq(task, CRM_OP_LRM_DELETE, pcmk__str_casei)) { xmlNode *msg_data = get_message_xml(msg, F_CRM_DATA); const char *mode = crm_element_value(msg_data, PCMK__XA_MODE); @@ -407,69 +436,68 @@ relay_message(xmlNode * msg, gboolean originated_locally) } } - if (is_for_dc || is_for_dcib || is_for_te) { - if (AM_I_DC && is_for_te) { - crm_trace("Route message %s locally as transition request", ref); - send_msg_via_ipc(msg, sys_to); + // Check whether message should be relayed - } else if (AM_I_DC) { + if (is_for_dc || is_for_dcib || is_for_te) { + if (AM_I_DC) { + if (is_for_te) { + crm_trace("Route message %s locally as transition request", + ref); + crm_log_xml_trace(msg, sys_to); + send_msg_via_ipc(msg, sys_to); + return TRUE; // No further processing of message is needed + } crm_trace("Route message %s locally as DC request", ref); return FALSE; // More to be done by caller + } - } else if (originated_locally && !pcmk__strcase_any_of(sys_from, CRM_SYSTEM_PENGINE, - CRM_SYSTEM_TENGINE, NULL)) { - - if (is_corosync_cluster()) { - dest = text2msg_type(sys_to); + if (originated_locally + && !pcmk__strcase_any_of(sys_from, CRM_SYSTEM_PENGINE, + CRM_SYSTEM_TENGINE, NULL)) { + crm_trace("Relay message %s to DC (via %s)", + ref, pcmk__s(host_to, "broadcast")); + crm_log_xml_trace(msg, "relayed"); + if (!broadcast) { + node_to = crm_get_peer(0, host_to); } - crm_trace("Relay message %s to DC", ref); - send_cluster_message(host_to ? crm_get_peer(0, host_to) : NULL, dest, msg, TRUE); - - } else { - /* Neither the TE nor the scheduler should be sending messages - * to DCs on other nodes. By definition, if we are no longer the DC, - * then the scheduler's or TE's data should be discarded. 
- */ - crm_trace("Discard message %s because we are not DC", ref); + send_cluster_message(node_to, dest, msg, TRUE); + return TRUE; } - } else if (is_local && (is_for_crm || is_for_cib)) { - crm_trace("Route message %s locally as controller request", ref); - return FALSE; // More to be done by caller - - } else if (is_local) { - crm_trace("Relay message %s locally to %s", - ref, (sys_to? sys_to : "unknown client")); - crm_log_xml_trace(msg, "[IPC relay]"); - send_msg_via_ipc(msg, sys_to); - - } else { - crm_node_t *node_to = NULL; - - if (is_corosync_cluster()) { - dest = text2msg_type(sys_to); + /* Transition engine and scheduler messages are sent only to the DC on + * the same node. If we are no longer the DC, discard this message. + */ + crm_trace("Ignoring message %s because we are no longer DC", ref); + crm_log_xml_trace(msg, "ignored"); + return TRUE; // No further processing of message is needed + } - if (dest == crm_msg_none || dest > crm_msg_stonith_ng) { - dest = crm_msg_crmd; - } + if (is_local) { + if (is_for_crm || is_for_cib) { + crm_trace("Route message %s locally as controller request", ref); + return FALSE; // More to be done by caller } + crm_trace("Relay message %s locally to %s", ref, sys_to); + crm_log_xml_trace(msg, "IPC-relay"); + send_msg_via_ipc(msg, sys_to); + return TRUE; + } - if (host_to) { - node_to = pcmk__search_cluster_node_cache(0, host_to); - if (node_to == NULL) { - crm_warn("Cannot route message %s: Unknown node %s", - ref, host_to); - return TRUE; - } - crm_trace("Relay message %s to %s", - ref, (node_to->uname? node_to->uname : "peer")); - } else { - crm_trace("Broadcast message %s to all peers", ref); + if (!broadcast) { + node_to = pcmk__search_cluster_node_cache(0, host_to, NULL); + if (node_to == NULL) { + crm_warn("Ignoring message %s because node %s is unknown", + ref, host_to); + crm_log_xml_trace(msg, "ignored"); + return TRUE; } - send_cluster_message(host_to ? node_to : NULL, dest, msg, TRUE); } - return TRUE; // No further processing of message is needed + crm_trace("Relay message %s to %s", + ref, pcmk__s(host_to, "all peers")); + crm_log_xml_trace(msg, "relayed"); + send_cluster_message(node_to, dest, msg, TRUE); + return TRUE; } // Return true if field contains a positive integer @@ -546,6 +574,7 @@ controld_authorize_ipc_message(const xmlNode *client_msg, pcmk__client_t *curr_c } crm_trace("Validated IPC hello from client %s", client_name); + crm_log_xml_trace(client_msg, "hello"); if (curr_client) { curr_client->userdata = strdup(client_name); } @@ -553,6 +582,7 @@ controld_authorize_ipc_message(const xmlNode *client_msg, pcmk__client_t *curr_c return false; rejected: + crm_log_xml_trace(client_msg, "rejected"); if (curr_client) { qb_ipcs_disconnect(curr_client->ipcs); } @@ -575,7 +605,9 @@ handle_message(xmlNode *msg, enum crmd_fsa_cause cause) return I_NULL; } - crm_err("Unknown message type: %s", type); + crm_warn("Ignoring message with unknown " F_CRM_MSG_TYPE " '%s'", + pcmk__s(type, "")); + crm_log_xml_trace(msg, "bad"); return I_NULL; } @@ -701,7 +733,7 @@ handle_lrm_delete(xmlNode *stored_msg) crm_info("Notifying %s on %s that %s was%s deleted", from_sys, (from_host? from_host : "local node"), rsc_id, ((rc == pcmk_rc_ok)? "" : " not")); - op = lrmd_new_event(rsc_id, CRMD_ACTION_DELETE, 0); + op = lrmd_new_event(rsc_id, PCMK_ACTION_DELETE, 0); op->type = lrmd_event_exec_complete; op->user_data = strdup(transition? 
transition : FAKE_TE_ID); op->params = pcmk__strkey_table(free, free); @@ -732,7 +764,7 @@ handle_remote_state(const xmlNode *msg) bool remote_is_up = false; int rc = pcmk_rc_ok; - rc = pcmk__xe_get_bool_attr(msg, XML_NODE_IN_CLUSTER, &remote_is_up); + rc = pcmk__xe_get_bool_attr(msg, PCMK__XA_IN_CCM, &remote_is_up); CRM_CHECK(remote_uname && rc == pcmk_rc_ok, return I_NULL); @@ -818,7 +850,7 @@ handle_node_list(const xmlNode *request) crm_xml_add_ll(xml, XML_ATTR_ID, (long long) node->id); // uint32_t crm_xml_add(xml, XML_ATTR_UNAME, node->uname); - crm_xml_add(xml, XML_NODE_IN_CLUSTER, node->state); + crm_xml_add(xml, PCMK__XA_IN_CCM, node->state); } // Create and send reply @@ -875,7 +907,7 @@ handle_node_info_request(const xmlNode *msg) if (node) { crm_xml_add(reply_data, XML_ATTR_ID, node->uuid); crm_xml_add(reply_data, XML_ATTR_UNAME, node->uname); - crm_xml_add(reply_data, XML_NODE_IS_PEER, node->state); + crm_xml_add(reply_data, PCMK__XA_CRMD, node->state); pcmk__xe_set_bool_attr(reply_data, XML_NODE_IS_REMOTE, pcmk_is_set(node->flags, crm_remote_node)); } @@ -988,14 +1020,15 @@ handle_request(xmlNode *stored_msg, enum crmd_fsa_cause cause) /* Optimize this for the DC - it has the most to do */ + crm_log_xml_trace(stored_msg, "request"); if (op == NULL) { - crm_log_xml_warn(stored_msg, "[request without " F_CRM_TASK "]"); + crm_warn("Ignoring request without " F_CRM_TASK); return I_NULL; } if (strcmp(op, CRM_OP_SHUTDOWN_REQ) == 0) { const char *from = crm_element_value(stored_msg, F_CRM_HOST_FROM); - crm_node_t *node = pcmk__search_cluster_node_cache(0, from); + crm_node_t *node = pcmk__search_cluster_node_cache(0, from, NULL); pcmk__update_peer_expected(__func__, node, CRMD_JOINSTATE_DOWN); if(AM_I_DC == FALSE) { @@ -1062,11 +1095,6 @@ handle_request(xmlNode *stored_msg, enum crmd_fsa_cause cause) if (controld_globals.fsa_state == S_HALT) { crm_debug("Forcing an election from S_HALT"); return I_ELECTION; -#if 0 - } else if (AM_I_DC) { - /* This is the old way of doing things but what is gained? */ - return I_ELECTION; -#endif } } else if (strcmp(op, CRM_OP_JOIN_OFFER) == 0) { @@ -1157,8 +1185,9 @@ handle_response(xmlNode *stored_msg) { const char *op = crm_element_value(stored_msg, F_CRM_TASK); + crm_log_xml_trace(stored_msg, "reply"); if (op == NULL) { - crm_log_xml_err(stored_msg, "Bad message"); + crm_warn("Ignoring reply without " F_CRM_TASK); } else if (AM_I_DC && strcmp(op, CRM_OP_PECALC) == 0) { // Check whether scheduler answer been superseded by subsequent request @@ -1295,7 +1324,7 @@ broadcast_remote_state_message(const char *node_name, bool node_up) node_name, node_up? "coming up" : "going down"); crm_xml_add(msg, XML_ATTR_ID, node_name); - pcmk__xe_set_bool_attr(msg, XML_NODE_IN_CLUSTER, node_up); + pcmk__xe_set_bool_attr(msg, PCMK__XA_IN_CCM, node_up); if (node_up) { crm_xml_add(msg, PCMK__XA_CONN_HOST, controld_globals.our_nodename); diff --git a/daemons/controld/controld_metadata.c b/daemons/controld/controld_metadata.c index 240a978..c813ceb 100644 --- a/daemons/controld/controld_metadata.c +++ b/daemons/controld/controld_metadata.c @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 the Pacemaker project contributors + * Copyright 2017-2023 the Pacemaker project contributors * * The version control history for this file may have further details. 
* @@ -172,7 +172,7 @@ controld_cache_metadata(GHashTable *mdc, const lrmd_rsc_info_t *rsc, const char *action_name = crm_element_value(match, "name"); - if (pcmk__str_eq(action_name, CRMD_ACTION_RELOAD_AGENT, + if (pcmk__str_eq(action_name, PCMK_ACTION_RELOAD_AGENT, pcmk__str_none)) { if (ocf1_1) { controld_set_ra_flags(md, key, ra_supports_reload_agent); @@ -181,7 +181,7 @@ controld_cache_metadata(GHashTable *mdc, const lrmd_rsc_info_t *rsc, "because it does not support OCF 1.1 or later", key); } - } else if (!ocf1_1 && pcmk__str_eq(action_name, CRMD_ACTION_RELOAD, + } else if (!ocf1_1 && pcmk__str_eq(action_name, PCMK_ACTION_RELOAD, pcmk__str_casei)) { controld_set_ra_flags(md, key, ra_supports_legacy_reload); } diff --git a/daemons/controld/controld_remote_ra.c b/daemons/controld/controld_remote_ra.c index f24b755..d692ef6 100644 --- a/daemons/controld/controld_remote_ra.c +++ b/daemons/controld/controld_remote_ra.c @@ -280,6 +280,7 @@ remote_node_up(const char *node_name) int call_opt; xmlNode *update, *state; crm_node_t *node; + lrm_state_t *connection_rsc = NULL; CRM_CHECK(node_name != NULL, return); crm_info("Announcing Pacemaker Remote node %s", node_name); @@ -301,6 +302,20 @@ remote_node_up(const char *node_name) purge_remote_node_attrs(call_opt, node); pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0); + /* Apply any start state that we were given from the environment on the + * remote node. + */ + connection_rsc = lrm_state_find(node->uname); + + if (connection_rsc != NULL) { + lrmd_t *lrm = connection_rsc->conn; + const char *start_state = lrmd__node_start_state(lrm); + + if (start_state) { + set_join_state(start_state, node->uname, node->uuid, true); + } + } + /* pacemaker_remote nodes don't participate in the membership layer, * so cluster nodes don't automatically get notified when they come and go. * We send a cluster message to the DC, and update the CIB node state entry, @@ -392,10 +407,11 @@ check_remote_node_state(const remote_ra_cmd_t *cmd) return; } - if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) { + if (pcmk__str_eq(cmd->action, PCMK_ACTION_START, pcmk__str_casei)) { remote_node_up(cmd->rsc_id); - } else if (pcmk__str_eq(cmd->action, "migrate_from", pcmk__str_casei)) { + } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_MIGRATE_FROM, + pcmk__str_casei)) { /* After a successful migration, we don't need to do remote_node_up() * because the DC already knows the node is up, and we don't want to * clear LRM history etc. We do need to add the remote node to this @@ -408,7 +424,7 @@ check_remote_node_state(const remote_ra_cmd_t *cmd) CRM_CHECK(node != NULL, return); pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0); - } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_casei)) { lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id); remote_ra_data_t *ra_data = lrm_state? 
lrm_state->remote_ra_data : NULL; @@ -510,7 +526,8 @@ retry_start_cmd_cb(gpointer data) return FALSE; } cmd = ra_data->cur_cmd; - if (!pcmk__strcase_any_of(cmd->action, "start", "migrate_from", NULL)) { + if (!pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START, + PCMK_ACTION_MIGRATE_FROM, NULL)) { return FALSE; } update_remaining_timeout(cmd); @@ -681,7 +698,8 @@ remote_lrm_op_callback(lrmd_event_data_t * op) handle_remote_ra_stop(lrm_state, NULL); remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM); /* now fake the reply of a successful 'stop' */ - synthesize_lrmd_success(NULL, lrm_state->node_name, "stop"); + synthesize_lrmd_success(NULL, lrm_state->node_name, + PCMK_ACTION_STOP); } return; } @@ -695,8 +713,9 @@ remote_lrm_op_callback(lrmd_event_data_t * op) /* Start actions and migrate from actions complete after connection * comes back to us. */ - if (op->type == lrmd_event_connect && pcmk__strcase_any_of(cmd->action, "start", - "migrate_from", NULL)) { + if ((op->type == lrmd_event_connect) + && pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START, + PCMK_ACTION_MIGRATE_FROM, NULL)) { if (op->connection_rc < 0) { update_remaining_timeout(cmd); @@ -731,7 +750,9 @@ remote_lrm_op_callback(lrmd_event_data_t * op) report_remote_ra_result(cmd); cmd_handled = TRUE; - } else if (op->type == lrmd_event_poke && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + } else if ((op->type == lrmd_event_poke) + && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR, + pcmk__str_casei)) { if (cmd->monitor_timeout_id) { g_source_remove(cmd->monitor_timeout_id); @@ -758,7 +779,9 @@ remote_lrm_op_callback(lrmd_event_data_t * op) } cmd_handled = TRUE; - } else if (op->type == lrmd_event_disconnect && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + } else if ((op->type == lrmd_event_disconnect) + && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR, + pcmk__str_casei)) { if (pcmk_is_set(ra_data->status, remote_active) && !pcmk_is_set(cmd->status, cmd_cancel)) { pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, @@ -771,7 +794,9 @@ remote_lrm_op_callback(lrmd_event_data_t * op) } cmd_handled = TRUE; - } else if (op->type == lrmd_event_new_client && pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) { + } else if ((op->type == lrmd_event_new_client) + && pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, + pcmk__str_casei)) { handle_remote_ra_stop(lrm_state, cmd); cmd_handled = TRUE; @@ -882,7 +907,8 @@ handle_remote_ra_exec(gpointer user_data) ra_data->cmds = g_list_remove_link(ra_data->cmds, first); g_list_free_1(first); - if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) { + if (pcmk__str_any_of(cmd->action, PCMK_ACTION_START, + PCMK_ACTION_MIGRATE_FROM, NULL)) { lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete); if (handle_remote_ra_start(lrm_state, cmd, cmd->timeout) == pcmk_rc_ok) { @@ -894,7 +920,7 @@ handle_remote_ra_exec(gpointer user_data) } report_remote_ra_result(cmd); - } else if (!strcmp(cmd->action, "monitor")) { + } else if (!strcmp(cmd->action, PCMK_ACTION_MONITOR)) { if (lrm_state_is_connected(lrm_state) == TRUE) { rc = lrm_state_poke_connection(lrm_state); @@ -917,7 +943,7 @@ handle_remote_ra_exec(gpointer user_data) } report_remote_ra_result(cmd); - } else if (!strcmp(cmd->action, "stop")) { + } else if (!strcmp(cmd->action, PCMK_ACTION_STOP)) { if (pcmk_is_set(ra_data->status, expect_takeover)) { /* briefly wait on stop for the takeover event to occur. 
If the @@ -933,13 +959,14 @@ handle_remote_ra_exec(gpointer user_data) handle_remote_ra_stop(lrm_state, cmd); - } else if (!strcmp(cmd->action, "migrate_to")) { + } else if (strcmp(cmd->action, PCMK_ACTION_MIGRATE_TO) == 0) { lrm_remote_clear_flags(lrm_state, takeover_complete); lrm_remote_set_flags(lrm_state, expect_takeover); pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL); report_remote_ra_result(cmd); - } else if (pcmk__str_any_of(cmd->action, CRMD_ACTION_RELOAD, - CRMD_ACTION_RELOAD_AGENT, NULL)) { + + } else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_RELOAD, + PCMK_ACTION_RELOAD_AGENT, NULL)) { /* Currently the only reloadable parameter is reconnect_interval, * which is only used by the scheduler via the CIB, so reloads are a * no-op. @@ -1029,13 +1056,13 @@ static gboolean is_remote_ra_supported_action(const char *action) { return pcmk__str_any_of(action, - CRMD_ACTION_START, - CRMD_ACTION_STOP, - CRMD_ACTION_STATUS, - CRMD_ACTION_MIGRATE, - CRMD_ACTION_MIGRATED, - CRMD_ACTION_RELOAD_AGENT, - CRMD_ACTION_RELOAD, + PCMK_ACTION_START, + PCMK_ACTION_STOP, + PCMK_ACTION_MONITOR, + PCMK_ACTION_MIGRATE_TO, + PCMK_ACTION_MIGRATE_FROM, + PCMK_ACTION_RELOAD_AGENT, + PCMK_ACTION_RELOAD, NULL); } @@ -1048,7 +1075,9 @@ fail_all_monitor_cmds(GList * list) for (gIter = list; gIter != NULL; gIter = gIter->next) { cmd = gIter->data; - if ((cmd->interval_ms > 0) && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + if ((cmd->interval_ms > 0) + && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR, + pcmk__str_casei)) { rm_list = g_list_append(rm_list, cmd); } } @@ -1137,8 +1166,9 @@ handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms, if (ra_data->cur_cmd && !pcmk_is_set(ra_data->cur_cmd->status, cmd_cancel) && - (ra_data->cur_cmd->interval_ms == interval_ms) && - pcmk__str_eq(ra_data->cur_cmd->action, "monitor", pcmk__str_casei)) { + (ra_data->cur_cmd->interval_ms == interval_ms) + && pcmk__str_eq(ra_data->cur_cmd->action, PCMK_ACTION_MONITOR, + pcmk__str_casei)) { cmd = ra_data->cur_cmd; goto handle_dup; @@ -1147,7 +1177,8 @@ handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms, for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) { cmd = gIter->data; if ((cmd->interval_ms == interval_ms) - && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR, + pcmk__str_casei)) { goto handle_dup; } } @@ -1155,7 +1186,8 @@ handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms, for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) { cmd = gIter->data; if ((cmd->interval_ms == interval_ms) - && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) { + && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR, + pcmk__str_casei)) { goto handle_dup; } } @@ -1165,7 +1197,7 @@ handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms, handle_dup: crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT, - cmd->rsc_id, "monitor", interval_ms); + cmd->rsc_id, PCMK_ACTION_MONITOR, interval_ms); /* update the userdata */ if (userdata) { @@ -1385,7 +1417,7 @@ remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance) } #define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \ - "[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \ + "[@" XML_LRM_ATTR_TASK "='" PCMK_ACTION_MAINTENANCE_NODES "']/" \ XML_GRAPH_TAG_MAINTENANCE /*! 
@@ -1403,9 +1435,10 @@ remote_ra_process_maintenance_nodes(xmlNode *xml) xmlNode *node; int cnt = 0, cnt_remote = 0; - for (node = - first_named_child(getXpathResult(search, 0), XML_CIB_TAG_NODE); - node != NULL; node = pcmk__xml_next(node)) { + for (node = first_named_child(getXpathResult(search, 0), + XML_CIB_TAG_NODE); + node != NULL; node = crm_next_same_xml(node)) { + lrm_state_t *lrm_state = lrm_state_find(ID(node)); cnt++; diff --git a/daemons/controld/controld_schedulerd.c b/daemons/controld/controld_schedulerd.c index 912f9a5..8aca83f 100644 --- a/daemons/controld/controld_schedulerd.c +++ b/daemons/controld/controld_schedulerd.c @@ -45,11 +45,11 @@ controld_shutdown_schedulerd_ipc(void) * \internal * \brief Save CIB query result to file, raising FSA error * - * \param[in] msg Ignored - * \param[in] call_id Call ID of CIB query - * \param[in] rc Return code of CIB query - * \param[in,out] output Result of CIB query - * \param[in] user_data Unique identifier for filename + * \param[in] msg Ignored + * \param[in] call_id Call ID of CIB query + * \param[in] rc Return code of CIB query + * \param[in] output Result of CIB query + * \param[in] user_data Unique identifier for filename * * \note This is intended to be called after a scheduler connection fails. */ @@ -90,8 +90,9 @@ handle_disconnect(void) int rc = pcmk_ok; char *uuid_str = crm_generate_uuid(); - crm_crit("Connection to the scheduler failed " - CRM_XS " uuid=%s", uuid_str); + crm_crit("Lost connection to the scheduler " + CRM_XS " CIB will be saved to " PE_STATE_DIR "/pe-core-%s.bz2", + uuid_str); /* * The scheduler died... @@ -107,9 +108,6 @@ handle_disconnect(void) NULL, NULL, cib_scope_local); fsa_register_cib_callback(rc, uuid_str, save_cib_contents); - - } else { - crm_info("Connection to the scheduler released"); } controld_clear_fsa_input_flags(R_PE_CONNECTED); @@ -199,9 +197,10 @@ new_schedulerd_ipc_connection(void) pcmk_register_ipc_callback(schedulerd_api, scheduler_event_callback, NULL); - rc = pcmk_connect_ipc(schedulerd_api, pcmk_ipc_dispatch_main); + rc = pcmk__connect_ipc(schedulerd_api, pcmk_ipc_dispatch_main, 3); if (rc != pcmk_rc_ok) { - crm_err("Error connecting to the scheduler: %s", pcmk_rc_str(rc)); + crm_err("Error connecting to %s: %s", + pcmk_ipc_name(schedulerd_api, true), pcmk_rc_str(rc)); return false; } diff --git a/daemons/controld/controld_te_actions.c b/daemons/controld/controld_te_actions.c index d8cfcad..fe6b744 100644 --- a/daemons/controld/controld_te_actions.c +++ b/daemons/controld/controld_te_actions.c @@ -47,7 +47,7 @@ execute_pseudo_action(pcmk__graph_t *graph, pcmk__graph_action_t *pseudo) const char *task = crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK); /* send to peers as well? */ - if (pcmk__str_eq(task, CRM_OP_MAINTENANCE_NODES, pcmk__str_casei)) { + if (pcmk__str_eq(task, PCMK_ACTION_MAINTENANCE_NODES, pcmk__str_casei)) { GHashTableIter iter; crm_node_t *node = NULL; @@ -125,7 +125,7 @@ execute_cluster_action(pcmk__graph_t *graph, pcmk__graph_action_t *action) router_node = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); if (router_node == NULL) { router_node = on_node; - if (pcmk__str_eq(task, CRM_OP_LRM_DELETE, pcmk__str_none)) { + if (pcmk__str_eq(task, PCMK_ACTION_LRM_DELETE, pcmk__str_none)) { const char *mode = crm_element_value(action->xml, PCMK__XA_MODE); if (pcmk__str_eq(mode, XML_TAG_CIB, pcmk__str_none)) { @@ -148,7 +148,8 @@ execute_cluster_action(pcmk__graph_t *graph, pcmk__graph_action_t *action) id, task, on_node, (is_local? 
" locally" : ""), (no_wait? " without waiting" : "")); - if (is_local && pcmk__str_eq(task, CRM_OP_SHUTDOWN, pcmk__str_none)) { + if (is_local + && pcmk__str_eq(task, PCMK_ACTION_DO_SHUTDOWN, pcmk__str_none)) { /* defer until everything else completes */ crm_info("Controller request '%s' is a local shutdown", id); graph->completion_action = pcmk__graph_shutdown; @@ -156,7 +157,7 @@ execute_cluster_action(pcmk__graph_t *graph, pcmk__graph_action_t *action) te_action_confirmed(action, graph); return pcmk_rc_ok; - } else if (pcmk__str_eq(task, CRM_OP_SHUTDOWN, pcmk__str_none)) { + } else if (pcmk__str_eq(task, PCMK_ACTION_DO_SHUTDOWN, pcmk__str_none)) { crm_node_t *peer = crm_get_peer(0, router_node); pcmk__update_peer_expected(__func__, peer, CRMD_JOINSTATE_DOWN); @@ -318,7 +319,7 @@ controld_record_action_timeout(pcmk__graph_action_t *action) int target_rc = get_target_rc(action); crm_warn("%s %d: %s on %s timed out", - crm_element_name(action->xml), action->id, task_uuid, target); + action->xml->name, action->id, task_uuid, target); op = synthesize_timeout_event(action, target_rc); controld_record_action_event(action, op); @@ -528,9 +529,9 @@ te_update_job_count(pcmk__graph_action_t *action, int offset) * the connection resources */ target = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); - if ((target == NULL) && pcmk__strcase_any_of(task, CRMD_ACTION_MIGRATE, - CRMD_ACTION_MIGRATED, NULL)) { - + if ((target == NULL) + && pcmk__strcase_any_of(task, PCMK_ACTION_MIGRATE_TO, + PCMK_ACTION_MIGRATE_FROM, NULL)) { const char *t1 = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_SOURCE); const char *t2 = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_TARGET); @@ -586,7 +587,8 @@ allowed_on_node(const pcmk__graph_t *graph, const pcmk__graph_action_t *action, return false; } else if(graph->migration_limit > 0 && r->migrate_jobs >= graph->migration_limit) { - if (pcmk__strcase_any_of(task, CRMD_ACTION_MIGRATE, CRMD_ACTION_MIGRATED, NULL)) { + if (pcmk__strcase_any_of(task, PCMK_ACTION_MIGRATE_TO, + PCMK_ACTION_MIGRATE_FROM, NULL)) { crm_trace("Peer %s is over their migration job limit of %d (%d): deferring %s", target, graph->migration_limit, r->migrate_jobs, id); return false; @@ -624,8 +626,9 @@ graph_action_allowed(pcmk__graph_t *graph, pcmk__graph_action_t *action) * the connection resources */ target = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE); - if ((target == NULL) && pcmk__strcase_any_of(task, CRMD_ACTION_MIGRATE, - CRMD_ACTION_MIGRATED, NULL)) { + if ((target == NULL) + && pcmk__strcase_any_of(task, PCMK_ACTION_MIGRATE_TO, + PCMK_ACTION_MIGRATE_FROM, NULL)) { target = crm_meta_value(action->params, XML_LRM_ATTR_MIGRATE_SOURCE); if (!allowed_on_node(graph, action, target)) { return false; diff --git a/daemons/controld/controld_te_callbacks.c b/daemons/controld/controld_te_callbacks.c index cf9de83..c26e757 100644 --- a/daemons/controld/controld_te_callbacks.c +++ b/daemons/controld/controld_te_callbacks.c @@ -225,12 +225,12 @@ process_resource_updates(const char *node, xmlNode *xml, xmlNode *change, return; } - if (strcmp(TYPE(xml), XML_CIB_TAG_LRM) == 0) { + if (pcmk__xe_is(xml, XML_CIB_TAG_LRM)) { xml = first_named_child(xml, XML_LRM_TAG_RESOURCES); CRM_CHECK(xml != NULL, return); } - CRM_CHECK(strcmp(TYPE(xml), XML_LRM_TAG_RESOURCES) == 0, return); + CRM_CHECK(pcmk__xe_is(xml, XML_LRM_TAG_RESOURCES), return); /* * Updates by, or in response to, TE actions will never contain updates @@ -558,7 +558,7 @@ te_update_diff(const char *event, xmlNode * msg) 
p_del[0], p_del[1], p_del[2], p_add[0], p_add[1], p_add[2], fsa_state2string(controld_globals.fsa_state)); - crm_element_value_int(diff, "format", &format); + crm_element_value_int(diff, PCMK_XA_FORMAT, &format); switch (format) { case 1: te_update_diff_v1(event, diff); diff --git a/daemons/controld/controld_te_events.c b/daemons/controld/controld_te_events.c index d4e2b0f..28977c0 100644 --- a/daemons/controld/controld_te_events.c +++ b/daemons/controld/controld_te_events.c @@ -111,7 +111,7 @@ fail_incompletable_actions(pcmk__graph_t *graph, const char *down_node) } else if (action->type == pcmk__cluster_graph_action) { const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); - if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) { + if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) { continue; } } @@ -196,16 +196,16 @@ update_failcount(const xmlNode *event, const char *event_node_uuid, int rc, /* Decide whether update is necessary and what value to use */ if ((interval_ms > 0) - || pcmk__str_eq(task, CRMD_ACTION_PROMOTE, pcmk__str_none) - || pcmk__str_eq(task, CRMD_ACTION_DEMOTE, pcmk__str_none)) { + || pcmk__str_eq(task, PCMK_ACTION_PROMOTE, pcmk__str_none) + || pcmk__str_eq(task, PCMK_ACTION_DEMOTE, pcmk__str_none)) { do_update = TRUE; - } else if (pcmk__str_eq(task, CRMD_ACTION_START, pcmk__str_none)) { + } else if (pcmk__str_eq(task, PCMK_ACTION_START, pcmk__str_none)) { do_update = TRUE; value = pcmk__s(controld_globals.transition_graph->failed_start_offset, CRM_INFINITY_S); - } else if (pcmk__str_eq(task, CRMD_ACTION_STOP, pcmk__str_none)) { + } else if (pcmk__str_eq(task, PCMK_ACTION_STOP, pcmk__str_none)) { do_update = TRUE; value = pcmk__s(controld_globals.transition_graph->failed_stop_offset, CRM_INFINITY_S); @@ -314,7 +314,7 @@ get_cancel_action(const char *id, const char *node) pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); - if (!pcmk__str_eq(CRMD_ACTION_CANCEL, task, pcmk__str_casei)) { + if (!pcmk__str_eq(PCMK_ACTION_CANCEL, task, pcmk__str_casei)) { continue; } diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c index ecbc0b2..5a9f029 100644 --- a/daemons/controld/controld_te_utils.c +++ b/daemons/controld/controld_te_utils.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2022 the Pacemaker project contributors + * Copyright 2004-2023 the Pacemaker project contributors * * The version control history for this file may have further details. * @@ -17,6 +17,8 @@ //! 
Triggers transition graph processing static crm_trigger_t *transition_trigger = NULL; +static GHashTable *node_pending_timers = NULL; + gboolean stop_te_timer(pcmk__graph_action_t *action) { @@ -132,11 +134,13 @@ static struct abort_timer_s { static gboolean abort_timer_popped(gpointer data) { - if (AM_I_DC && (abort_timer.aborted == FALSE)) { - abort_transition(abort_timer.priority, abort_timer.action, - abort_timer.text, NULL); + struct abort_timer_s *abort_timer = (struct abort_timer_s *) data; + + if (AM_I_DC && (abort_timer->aborted == FALSE)) { + abort_transition(abort_timer->priority, abort_timer->action, + abort_timer->text, NULL); } - abort_timer.id = 0; + abort_timer->id = 0; return FALSE; // do not immediately reschedule timer } @@ -158,7 +162,143 @@ abort_after_delay(int abort_priority, enum pcmk__graph_next abort_action, abort_timer.priority = abort_priority; abort_timer.action = abort_action; abort_timer.text = abort_text; - abort_timer.id = g_timeout_add(delay_ms, abort_timer_popped, NULL); + abort_timer.id = g_timeout_add(delay_ms, abort_timer_popped, &abort_timer); +} + +static void +free_node_pending_timer(gpointer data) +{ + struct abort_timer_s *node_pending_timer = (struct abort_timer_s *) data; + + if (node_pending_timer->id != 0) { + g_source_remove(node_pending_timer->id); + node_pending_timer->id = 0; + } + + free(node_pending_timer); +} + +static gboolean +node_pending_timer_popped(gpointer key) +{ + struct abort_timer_s *node_pending_timer = NULL; + + if (node_pending_timers == NULL) { + return FALSE; + } + + node_pending_timer = g_hash_table_lookup(node_pending_timers, key); + if (node_pending_timer == NULL) { + return FALSE; + } + + crm_warn("Node with id '%s' pending timed out (%us) on joining the process " + "group", + (const char *) key, controld_globals.node_pending_timeout); + + if (controld_globals.node_pending_timeout > 0) { + abort_timer_popped(node_pending_timer); + } + + g_hash_table_remove(node_pending_timers, key); + + return FALSE; // do not reschedule timer +} + +static void +init_node_pending_timer(const crm_node_t *node, guint timeout) +{ + struct abort_timer_s *node_pending_timer = NULL; + char *key = NULL; + + if (node->uuid == NULL) { + return; + } + + if (node_pending_timers == NULL) { + node_pending_timers = pcmk__strikey_table(free, + free_node_pending_timer); + + // The timer is somehow already existing + } else if (g_hash_table_lookup(node_pending_timers, node->uuid) != NULL) { + return; + } + + crm_notice("Waiting for pending %s with id '%s' to join the process " + "group (timeout=%us)", + node->uname ? 
node->uname : "node", node->uuid, + controld_globals.node_pending_timeout); + + node_pending_timer = calloc(1, sizeof(struct abort_timer_s)); + CRM_ASSERT(node_pending_timer != NULL); + + node_pending_timer->aborted = FALSE; + node_pending_timer->priority = INFINITY; + node_pending_timer->action = pcmk__graph_restart; + node_pending_timer->text = "Node pending timed out"; + + key = strdup(node->uuid); + CRM_ASSERT(key != NULL); + + g_hash_table_replace(node_pending_timers, key, node_pending_timer); + + node_pending_timer->id = g_timeout_add_seconds(timeout, + node_pending_timer_popped, + key); + CRM_ASSERT(node_pending_timer->id != 0); +} + +static void +remove_node_pending_timer(const char *node_uuid) +{ + if (node_pending_timers == NULL) { + return; + } + + g_hash_table_remove(node_pending_timers, node_uuid); +} + +void +controld_node_pending_timer(const crm_node_t *node) +{ + long long remaining_timeout = 0; + + /* If the node is not an active cluster node, is leaving the cluster, or is + * already part of CPG, or node-pending-timeout is disabled, free any + * node pending timer for it. + */ + if (pcmk_is_set(node->flags, crm_remote_node) + || (node->when_member <= 1) || (node->when_online > 0) + || (controld_globals.node_pending_timeout == 0)) { + remove_node_pending_timer(node->uuid); + return; + } + + // Node is a cluster member but offline in CPG + + remaining_timeout = node->when_member - time(NULL) + + controld_globals.node_pending_timeout; + + /* It already passed node pending timeout somehow. + * Free any node pending timer of it. + */ + if (remaining_timeout <= 0) { + remove_node_pending_timer(node->uuid); + return; + } + + init_node_pending_timer(node, remaining_timeout); +} + +void +controld_free_node_pending_timers(void) +{ + if (node_pending_timers == NULL) { + return; + } + + g_hash_table_destroy(node_pending_timers); + node_pending_timers = NULL; } static const char * @@ -246,7 +386,7 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action, const xmlNode *search = NULL; for(search = reason; search; search = search->parent) { - if (pcmk__str_eq(XML_TAG_DIFF, TYPE(search), pcmk__str_casei)) { + if (pcmk__xe_is(search, XML_TAG_DIFF)) { diff = search; break; } @@ -255,7 +395,7 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action, if(diff) { xml_patch_versions(diff, add, del); for(search = reason; search; search = search->parent) { - if (pcmk__str_eq(XML_DIFF_CHANGE, TYPE(search), pcmk__str_casei)) { + if (pcmk__xe_is(search, XML_DIFF_CHANGE)) { change = search; break; } @@ -276,14 +416,13 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action, do_crm_log(level, "Transition %d aborted by %s.%s: %s " CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s", - controld_globals.transition_graph->id, TYPE(reason), + controld_globals.transition_graph->id, reason->name, ID(reason), abort_text, add[0], add[1], add[2], fn, line, (const char *) local_path->str, pcmk__btoa(controld_globals.transition_graph->complete)); g_string_free(local_path, TRUE); } else { - const char *kind = NULL; const char *op = crm_element_value(change, XML_DIFF_OP); const char *path = crm_element_value(change, XML_DIFF_PATH); @@ -297,9 +436,9 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action, reason = reason->children; } } + CRM_CHECK(reason != NULL, goto done); } - kind = TYPE(reason); if(strcmp(op, "delete") == 0) { const char *shortpath = strrchr(path, '/'); @@ -310,7 +449,7 @@ 
abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action, add[0], add[1], add[2], fn, line, path, pcmk__btoa(controld_globals.transition_graph->complete)); - } else if (pcmk__str_eq(XML_CIB_TAG_NVPAIR, kind, pcmk__str_none)) { + } else if (pcmk__xe_is(reason, XML_CIB_TAG_NVPAIR)) { do_crm_log(level, "Transition %d aborted by %s doing %s %s=%s: %s " CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s", controld_globals.transition_graph->id, @@ -320,7 +459,7 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action, abort_text, add[0], add[1], add[2], fn, line, path, pcmk__btoa(controld_globals.transition_graph->complete)); - } else if (pcmk__str_eq(XML_LRM_TAG_RSC_OP, kind, pcmk__str_none)) { + } else if (pcmk__xe_is(reason, XML_LRM_TAG_RSC_OP)) { const char *magic = crm_element_value(reason, XML_ATTR_TRANSITION_MAGIC); do_crm_log(level, "Transition %d aborted by operation %s '%s' on %s: %s " @@ -331,14 +470,15 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action, magic, add[0], add[1], add[2], fn, line, pcmk__btoa(controld_globals.transition_graph->complete)); - } else if (pcmk__str_any_of(kind, XML_CIB_TAG_STATE, XML_CIB_TAG_NODE, NULL)) { + } else if (pcmk__str_any_of((const char *) reason->name, + XML_CIB_TAG_STATE, XML_CIB_TAG_NODE, NULL)) { const char *uname = crm_peer_uname(ID(reason)); do_crm_log(level, "Transition %d aborted by %s '%s' on %s: %s " CRM_XS " cib=%d.%d.%d source=%s:%d complete=%s", controld_globals.transition_graph->id, - kind, op, (uname? uname : ID(reason)), abort_text, - add[0], add[1], add[2], fn, line, + reason->name, op, pcmk__s(uname, ID(reason)), + abort_text, add[0], add[1], add[2], fn, line, pcmk__btoa(controld_globals.transition_graph->complete)); } else { @@ -347,12 +487,13 @@ abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action, do_crm_log(level, "Transition %d aborted by %s.%s '%s': %s " CRM_XS " cib=%d.%d.%d source=%s:%d path=%s complete=%s", controld_globals.transition_graph->id, - TYPE(reason), (id? id : ""), (op? 
op : "change"), + reason->name, pcmk__s(id, ""), pcmk__s(op, "change"), abort_text, add[0], add[1], add[2], fn, line, path, pcmk__btoa(controld_globals.transition_graph->complete)); } } +done: if (controld_globals.transition_graph->complete) { if (controld_get_period_transition_timer() > 0) { controld_stop_transition_timer(); diff --git a/daemons/controld/controld_throttle.c b/daemons/controld/controld_throttle.c index 5b7f9c0..a4775e5 100644 --- a/daemons/controld/controld_throttle.c +++ b/daemons/controld/controld_throttle.c @@ -154,7 +154,7 @@ throttle_cib_load(float *load) if(stream == NULL) { int rc = errno; - crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc); + crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_rc_str(rc), rc); free(loadfile); loadfile = NULL; return FALSE; } @@ -220,7 +220,7 @@ throttle_load_avg(float *load) stream = fopen(loadfile, "r"); if(stream == NULL) { int rc = errno; - crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc); + crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_rc_str(rc), rc); return FALSE; } @@ -407,7 +407,7 @@ static void throttle_update_job_max(const char *preference) { long long max = 0LL; - const char *env_limit = getenv("PCMK_node_action_limit"); + const char *env_limit = pcmk__env_option(PCMK__ENV_NODE_ACTION_LIMIT); if (env_limit != NULL) { preference = env_limit; // Per-node override diff --git a/daemons/controld/controld_transition.c b/daemons/controld/controld_transition.c index c8a342c..897c6d3 100644 --- a/daemons/controld/controld_transition.c +++ b/daemons/controld/controld_transition.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2022 the Pacemaker project contributors + * Copyright 2004-2023 the Pacemaker project contributors * * The version control history for this file may have further details. 
* @@ -15,11 +15,6 @@ #include <pacemaker-controld.h> -static void -global_cib_callback(const xmlNode * msg, int callid, int rc, xmlNode * output) -{ -} - static pcmk__graph_t * create_blank_graph(void) { @@ -82,12 +77,6 @@ do_te_control(long long action, crm_err("Could not set CIB notification callback"); init_ok = FALSE; } - - if (cib_conn->cmds->set_op_callback(cib_conn, - global_cib_callback) != pcmk_ok) { - crm_err("Could not set CIB global callback"); - init_ok = FALSE; - } } if (init_ok) { diff --git a/daemons/controld/controld_transition.h b/daemons/controld/controld_transition.h index 2da4221..0655bd9 100644 --- a/daemons/controld/controld_transition.h +++ b/daemons/controld/controld_transition.h @@ -48,6 +48,8 @@ void controld_destroy_transition_trigger(void); void controld_trigger_graph_as(const char *fn, int line); void abort_after_delay(int abort_priority, enum pcmk__graph_next abort_action, const char *abort_text, guint delay_ms); +void controld_node_pending_timer(const crm_node_t *node); +void controld_free_node_pending_timers(void); void abort_transition_graph(int abort_priority, enum pcmk__graph_next abort_action, const char *abort_text, const xmlNode *reason, diff --git a/daemons/controld/controld_utils.c b/daemons/controld/controld_utils.c index 4ce09d9..9b306ee 100644 --- a/daemons/controld/controld_utils.c +++ b/daemons/controld/controld_utils.c @@ -828,7 +828,7 @@ get_node_id(xmlNode *lrm_rsc_op) { xmlNode *node = lrm_rsc_op; - while (node != NULL && !pcmk__str_eq(XML_CIB_TAG_STATE, TYPE(node), pcmk__str_casei)) { + while ((node != NULL) && !pcmk__xe_is(node, XML_CIB_TAG_STATE)) { node = node->parent; } diff --git a/daemons/controld/pacemaker-controld.c b/daemons/controld/pacemaker-controld.c index 5858898..e4a72c2 100644 --- a/daemons/controld/pacemaker-controld.c +++ b/daemons/controld/pacemaker-controld.c @@ -112,7 +112,7 @@ main(int argc, char **argv) goto done; } - if (crm_ipc_connect(old_instance)) { + if (pcmk__connect_generic_ipc(old_instance) == pcmk_rc_ok) { /* IPC end-point already up */ crm_ipc_close(old_instance); crm_ipc_destroy(old_instance); diff --git a/daemons/controld/pacemaker-controld.h b/daemons/controld/pacemaker-controld.h index 1484a00..2334cce 100644 --- a/daemons/controld/pacemaker-controld.h +++ b/daemons/controld/pacemaker-controld.h @@ -36,4 +36,7 @@ void controld_remove_voter(const char *uname); void controld_election_fini(void); void controld_stop_current_election_timeout(void); +void set_join_state(const char *start_state, const char *node_name, + const char *node_uuid, bool remote); + #endif |
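The join-acknowledgement rework in do_dc_join_ack() above replaces what used to be two independent CIB operations (delete the joining node's stale executor history, then write its current history) with a single atomic CIB transaction, so both requests are committed together and reported through one callback. Below is a condensed sketch of that pattern only, not the shipped function: it assumes the caller has already built the deletion XPath and the replacement status XML, it omits the join-ID validation and cleanup the real code performs, and the helper name update_node_history() is invented for this illustration (it would sit alongside join_node_state_commit_callback() shown above).

    /* Sketch only: transactional node-history update as used by do_dc_join_ack() */
    static int
    update_node_history(cib_t *cib, const char *xpath, xmlNode *state,
                        char *join_from)
    {
        // Start a transaction so the delete and modify are applied atomically
        int rc = cib->cmds->init_transaction(cib);

        if (rc != pcmk_ok) {
            return rc;
        }

        // Queue removal of the node's stale executor history (matched by XPath)
        rc = cib->cmds->remove(cib, xpath, NULL,
                               cib_scope_local|cib_xpath|cib_multiple
                               |cib_transaction);
        if (rc != pcmk_ok) {
            return rc;
        }

        // Queue the node's latest known executor state in the same transaction
        rc = cib->cmds->modify(cib, XML_CIB_TAG_STATUS, state,
                               cib_scope_local|cib_can_create|cib_transaction);
        if (rc != pcmk_ok) {
            return rc;
        }

        // Commit both requests; the callback logs success or raises an FSA error
        rc = cib->cmds->end_transaction(cib, true, cib_scope_local);
        fsa_register_cib_callback(rc, join_from, join_node_state_commit_callback);
        return rc;  // > 0 is the CIB call ID; the caller treats that as success
    }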
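The create_node_state_update() changes above also alter what the controller writes into node_state entries: when the DC supports feature set 3.18.0 or later, PCMK__XA_IN_CCM and PCMK__XA_CRMD carry the when_member/when_online timestamps (0 meaning not a member, or offline in CPG), while older DCs still receive the legacy boolean and online/offline strings. A small sketch of just that version gate for the membership attribute, with the function name add_membership_attr() invented for the example:

    /* Sketch only: DC-version-gated membership attribute, as in create_node_state_update() */
    static void
    add_membership_attr(xmlNode *node_state, const char *dc_version,
                        const crm_node_t *node)
    {
        if (compare_version(dc_version, "3.18.0") >= 0) {
            // Newer DCs get the timestamp of when the node became a member (0 = not a member)
            crm_xml_add_ll(node_state, PCMK__XA_IN_CCM, node->when_member);

        } else {
            // @COMPAT DCs older than 2.1.7 expect a boolean membership flag
            pcmk__xe_set_bool_attr(node_state, PCMK__XA_IN_CCM,
                                   pcmk__str_eq(node->state, CRM_NODE_MEMBER,
                                                pcmk__str_casei));
        }
    }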
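Finally, the node-pending timers added in controld_te_utils.c above exist only while a cluster node is a member at the membership layer but has not yet appeared in the controller's process group, and only when node-pending-timeout is non-zero; when one pops on the DC, the transition is aborted at INFINITY priority with pcmk__graph_restart so the scheduler re-evaluates the stalled join. A condensed sketch of the arm/disarm decision made by controld_node_pending_timer(), reusing the static helpers defined in the patch (the wrapper name below is invented for the example):

    /* Sketch only: when a node-pending timer is created or discarded */
    static void
    node_pending_timer_decision(const crm_node_t *node, guint pending_timeout_s)
    {
        long long remaining = 0;

        if (pcmk_is_set(node->flags, crm_remote_node)   // Pacemaker Remote nodes never pend
            || (node->when_member <= 1)                 // not (or no longer) a cluster member
            || (node->when_online > 0)                  // already seen in the process group
            || (pending_timeout_s == 0)) {              // node-pending-timeout disabled
            remove_node_pending_timer(node->uuid);      // drop any timer that exists
            return;
        }

        // Time left in the pending window, counted from when membership began
        remaining = node->when_member - time(NULL) + pending_timeout_s;

        if (remaining <= 0) {
            remove_node_pending_timer(node->uuid);      // window already elapsed
        } else {
            init_node_pending_timer(node, remaining);   // arm (or keep) the timer
        }
    }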